diff options
200 files changed, 66076 insertions, 10 deletions
diff --git a/Android.mk b/Android.mk index 0d5917ce5f2..6a3014c81bb 100644 --- a/Android.mk +++ b/Android.mk @@ -24,7 +24,7 @@ # BOARD_GPU_DRIVERS should be defined. The valid values are # # classic drivers: i915 i965 -# gallium drivers: swrast i915g nouveau r300g r600g vmwgfx +# gallium drivers: swrast i915g nouveau r300g r600g radeonsi vmwgfx # # The main target is libGLES_mesa. For each classic driver enabled, a DRI # module will also be built. DRI modules will be loaded by libGLES_mesa. @@ -37,7 +37,7 @@ DRM_TOP := external/drm DRM_GRALLOC_TOP := hardware/drm_gralloc classic_drivers := i915 i965 -gallium_drivers := swrast i915g nouveau r300g r600g vmwgfx +gallium_drivers := swrast i915g nouveau r300g r600g radeonsi vmwgfx MESA_GPU_DRIVERS := $(strip $(BOARD_GPU_DRIVERS)) diff --git a/configs/autoconf.in b/configs/autoconf.in index 95cca6f239e..ec3f3194e2d 100644 --- a/configs/autoconf.in +++ b/configs/autoconf.in @@ -32,9 +32,12 @@ INTEL_LIBS = @INTEL_LIBS@ INTEL_CFLAGS = @INTEL_CFLAGS@ X11_LIBS = @X11_LIBS@ X11_CFLAGS = @X11_CFLAGS@ +LLVM_BINDIR = @LLVM_BINDIR@ LLVM_CFLAGS = @LLVM_CFLAGS@ +LLVM_CXXFLAGS = @LLVM_CXXFLAGS@ LLVM_LDFLAGS = @LLVM_LDFLAGS@ LLVM_LIBS = @LLVM_LIBS@ +LLVM_INCLUDEDIR = @LLVM_INCLUDEDIR@ GLW_CFLAGS = @GLW_CFLAGS@ GLX_TLS = @GLX_TLS@ DRI_CFLAGS = @DRI_CFLAGS@ @@ -58,6 +61,9 @@ AWK = @AWK@ GREP = @GREP@ NM = @NM@ +# Perl +PERL = @PERL@ + # Python and flags (generally only needed by the developers) PYTHON2 = @PYTHON2@ PYTHON_FLAGS = -t -O -O diff --git a/configure.ac b/configure.ac index 65d358e0a8d..17564f12885 100644 --- a/configure.ac +++ b/configure.ac @@ -67,6 +67,8 @@ if test ! -f "$srcdir/src/glsl/glcpp/glcpp-parse.y"; then fi AC_PROG_LEX +AC_PATH_PROG([PERL], [perl]) + dnl Our fallback install-sh is a symlink to minstall. Use the existing dnl configuration in that case. 
AC_PROG_INSTALL @@ -1647,9 +1649,12 @@ if test "x$with_gallium_drivers" != x; then SRC_DIRS="$SRC_DIRS gallium gallium/winsys gallium/targets" fi +AC_SUBST([LLVM_BINDIR]) AC_SUBST([LLVM_CFLAGS]) +AC_SUBST([LLVM_CXXFLAGS]) AC_SUBST([LLVM_LIBS]) AC_SUBST([LLVM_LDFLAGS]) +AC_SUBST([LLVM_INCLUDEDIR]) AC_SUBST([LLVM_VERSION]) case "x$enable_opengl$enable_gles1$enable_gles2" in @@ -1795,6 +1800,9 @@ if test "x$enable_gallium_llvm" = xyes; then LLVM_LIBS="`$LLVM_CONFIG --libs engine bitwriter`" fi LLVM_LDFLAGS=`$LLVM_CONFIG --ldflags` + LLVM_BINDIR=`$LLVM_CONFIG --bindir` + LLVM_CXXFLAGS=`$LLVM_CONFIG --cxxflags` + LLVM_INCLUDEDIR=`$LLVM_CONFIG --includedir` DEFINES="$DEFINES -D__STDC_CONSTANT_MACROS" MESA_LLVM=1 else @@ -1898,6 +1906,14 @@ if test "x$with_gallium_drivers" != x; then GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS r600" gallium_check_st "radeon/drm" "dri-r600" "xorg-r600" "" "xvmc-r600" "vdpau-r600" "va-r600" ;; + xradeonsi) + GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS radeonsi" + if test "x$LLVM_VERSION" != "x3.1"; then + AC_MSG_ERROR([LLVM 3.1 is required to build the radeonsi driver.]) + fi + NEED_RADEON_GALLIUM=yes; + gallium_check_st "radeon/drm" "dri-radeonsi" "xorg-radeonsi" + ;; xnouveau) PKG_CHECK_MODULES([NOUVEAU], [libdrm_nouveau >= $LIBDRM_NOUVEAU_REQUIRED]) GALLIUM_DRIVERS_DIRS="$GALLIUM_DRIVERS_DIRS nouveau nvfx nv50 nvc0" @@ -1957,6 +1973,7 @@ done AM_CONDITIONAL(HAVE_GALAHAD_GALLIUM, test x$HAVE_GALAHAD_GALLIUM = xyes) AM_CONDITIONAL(HAVE_IDENTITY_GALLIUM, test x$HAVE_IDENTITY_GALLIUM = xyes) AM_CONDITIONAL(HAVE_NOOP_GALLIUM, test x$HAVE_NOOP_GALLIUM = xyes) +AM_CONDITIONAL(NEED_RADEON_GALLIUM, test x$NEED_RADEON_GALLIUM = xyes) AC_SUBST([GALLIUM_MAKE_DIRS]) dnl prepend CORE_DIRS to SRC_DIRS diff --git a/include/pci_ids/pci_id_driver_map.h b/include/pci_ids/pci_id_driver_map.h index 232359f6e0d..fce38af0fe0 100644 --- a/include/pci_ids/pci_id_driver_map.h +++ b/include/pci_ids/pci_id_driver_map.h @@ -45,6 +45,12 @@ static const int 
r600_chip_ids[] = { #undef CHIPSET }; +static const int radeonsi_chip_ids[] = { +#define CHIPSET(chip, name, family) chip, +#include "pci_ids/radeonsi_pci_ids.h" +#undef CHIPSET +}; + static const int vmwgfx_chip_ids[] = { #define CHIPSET(chip, name, family) chip, #include "pci_ids/vmwgfx_pci_ids.h" @@ -65,6 +71,7 @@ static const struct { #endif { 0x1002, "r300", r300_chip_ids, ARRAY_SIZE(r300_chip_ids) }, { 0x1002, "r600", r600_chip_ids, ARRAY_SIZE(r600_chip_ids) }, + { 0x1002, "radeonsi", radeonsi_chip_ids, ARRAY_SIZE(radeonsi_chip_ids) }, { 0x10de, "nouveau", NULL, -1 }, { 0x15ad, "vmwgfx", vmwgfx_chip_ids, ARRAY_SIZE(vmwgfx_chip_ids) }, { 0x0000, NULL, NULL, 0 }, diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h new file mode 100644 index 00000000000..55ade1247d7 --- /dev/null +++ b/include/pci_ids/radeonsi_pci_ids.h @@ -0,0 +1,40 @@ +CHIPSET(0x6780, TAHITI_6780, TAHITI) +CHIPSET(0x6784, TAHITI_6784, TAHITI) +CHIPSET(0x6788, TAHITI_6788, TAHITI) +CHIPSET(0x678A, TAHITI_678A, TAHITI) +CHIPSET(0x6790, TAHITI_6790, TAHITI) +CHIPSET(0x6798, TAHITI_6798, TAHITI) +CHIPSET(0x6799, TAHITI_6799, TAHITI) +CHIPSET(0x679A, TAHITI_679A, TAHITI) +CHIPSET(0x679E, TAHITI_679E, TAHITI) +CHIPSET(0x679F, TAHITI_679F, TAHITI) + +CHIPSET(0x6800, PITCAIRN_6800, PITCAIRN) +CHIPSET(0x6801, PITCAIRN_6801, PITCAIRN) +CHIPSET(0x6802, PITCAIRN_6802, PITCAIRN) +CHIPSET(0x6808, PITCAIRN_6808, PITCAIRN) +CHIPSET(0x6809, PITCAIRN_6809, PITCAIRN) +CHIPSET(0x6810, PITCAIRN_6810, PITCAIRN) +CHIPSET(0x6818, PITCAIRN_6818, PITCAIRN) +CHIPSET(0x6819, PITCAIRN_6819, PITCAIRN) +CHIPSET(0x684C, PITCAIRN_684C, PITCAIRN) + +CHIPSET(0x6820, VERDE_6820, VERDE) +CHIPSET(0x6821, VERDE_6821, VERDE) +CHIPSET(0x6823, VERDE_6823, VERDE) +CHIPSET(0x6824, VERDE_6824, VERDE) +CHIPSET(0x6825, VERDE_6825, VERDE) +CHIPSET(0x6826, VERDE_6826, VERDE) +CHIPSET(0x6827, VERDE_6827, VERDE) +CHIPSET(0x6828, VERDE_6828, VERDE) +CHIPSET(0x6829, VERDE_6829, VERDE) +CHIPSET(0x682D, VERDE_682D, 
VERDE) +CHIPSET(0x682F, VERDE_682F, VERDE) +CHIPSET(0x6830, VERDE_6830, VERDE) +CHIPSET(0x6831, VERDE_6831, VERDE) +CHIPSET(0x6837, VERDE_6837, VERDE) +CHIPSET(0x6838, VERDE_6838, VERDE) +CHIPSET(0x6839, VERDE_6839, VERDE) +CHIPSET(0x683B, VERDE_683B, VERDE) +CHIPSET(0x683D, VERDE_683D, VERDE) +CHIPSET(0x683F, VERDE_683F, VERDE) diff --git a/src/egl/main/Android.mk b/src/egl/main/Android.mk index d96da228aa7..a4a00f3bb35 100644 --- a/src/egl/main/Android.mk +++ b/src/egl/main/Android.mk @@ -107,8 +107,8 @@ gallium_DRIVERS += \ LOCAL_SHARED_LIBRARIES += libdrm_nouveau endif -# r300g/r600g -ifneq ($(filter r300g r600g, $(MESA_GPU_DRIVERS)),) +# r300g/r600g/radeonsi +ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),) gallium_DRIVERS += libmesa_winsys_radeon ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),) gallium_DRIVERS += libmesa_pipe_r300 @@ -116,6 +116,9 @@ endif ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),) gallium_DRIVERS += libmesa_pipe_r600 endif +ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),) +gallium_DRIVERS += libmesa_pipe_radeonsi +endif endif # vmwgfx diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk index 41c59b13c6f..1d002d05374 100644 --- a/src/gallium/Android.mk +++ b/src/gallium/Android.mk @@ -49,8 +49,8 @@ SUBDIRS += \ drivers/nvc0 endif -# r300g/r600g -ifneq ($(filter r300g r600g, $(MESA_GPU_DRIVERS)),) +# r300g/r600g/radeonsi +ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),) SUBDIRS += winsys/radeon/drm ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),) SUBDIRS += drivers/r300 @@ -58,6 +58,9 @@ endif ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),) SUBDIRS += drivers/r600 endif +ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),) +SUBDIRS += drivers/radeonsi +endif endif # vmwgfx diff --git a/src/gallium/SConscript b/src/gallium/SConscript index 4413bc8742b..da2e4dd5ded 100644 --- a/src/gallium/SConscript +++ b/src/gallium/SConscript @@ -33,6 +33,7 @@ if env['drm']: SConscript([ 'drivers/r300/SConscript', 
'drivers/r600/SConscript', + 'drivers/radeonsi/SConscript', ]) # XXX: nouveau drivers have a tight dependency on libdrm, so to enable # we need some version logic before we enable them. Also, ATM there is @@ -152,6 +153,7 @@ if not env['embedded']: SConscript([ 'targets/dri-r300/SConscript', 'targets/dri-r600/SConscript', + 'targets/dri-radeonsi/SConscript', ]) if env['xorg'] and env['drm']: diff --git a/src/gallium/drivers/Makefile.am b/src/gallium/drivers/Makefile.am index 0aa2653a0f1..97c5695fa15 100644 --- a/src/gallium/drivers/Makefile.am +++ b/src/gallium/drivers/Makefile.am @@ -10,6 +10,8 @@ AM_CPPFLAGS = \ noinst_LIBRARIES = +SUBDIRS = + ################################################################################ if HAVE_GALAHAD_GALLIUM @@ -52,7 +54,16 @@ noop_libnoop_a_SOURCES = \ endif ################################################################################ -SUBDIRS = $(GALLIUM_MAKE_DIRS) + +if NEED_RADEON_GALLIUM + +SUBDIRS+= radeon + +endif + +################################################################################ + +SUBDIRS+= $(GALLIUM_MAKE_DIRS) # FIXME: Remove when the rest of Gallium is converted to automake. default: all diff --git a/src/gallium/drivers/radeon/AMDGPU.h b/src/gallium/drivers/radeon/AMDGPU.h new file mode 100644 index 00000000000..5613dab4b35 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPU.h @@ -0,0 +1,47 @@ +//===-- AMDGPU.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_H +#define AMDGPU_H + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + class FunctionPass; + class AMDGPUTargetMachine; + + FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS); + FunctionPass *createR600LowerShaderInstructionsPass(TargetMachine &tm); + FunctionPass *createR600LowerInstructionsPass(TargetMachine &tm); + + FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); + FunctionPass *createSIConvertToISAPass(TargetMachine &tm); + FunctionPass *createSIInitMachineFunctionInfoPass(TargetMachine &tm); + FunctionPass *createSILowerShaderInstructionsPass(TargetMachine &tm); + FunctionPass *createSIPropagateImmReadsPass(TargetMachine &tm); + FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); + + FunctionPass *createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm); + + FunctionPass *createAMDGPULowerShaderInstructionsPass(TargetMachine &tm); + + FunctionPass *createAMDGPUDelimitInstGroupsPass(TargetMachine &tm); + + FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); + + FunctionPass *createAMDGPUFixRegClassesPass(TargetMachine &tm); + +} /* End namespace llvm */ +#endif /* AMDGPU_H */ diff --git a/src/gallium/drivers/radeon/AMDGPUConstants.pm b/src/gallium/drivers/radeon/AMDGPUConstants.pm new file mode 100644 index 00000000000..b64ff49c187 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUConstants.pm @@ -0,0 +1,44 @@ +#===-- AMDGPUConstants.pm - TODO: Add brief description -------===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +#===----------------------------------------------------------------------===# +# +# TODO: Add full description +# +#===----------------------------------------------------------------------===# + +package AMDGPUConstants; + +use base 'Exporter'; + +use constant CONST_REG_COUNT => 256; +use constant TEMP_REG_COUNT => 128; + +our @EXPORT = ('TEMP_REG_COUNT', 'CONST_REG_COUNT', 'get_hw_index', 'get_chan_str'); + +sub get_hw_index { + my ($index) = @_; + return int($index / 4); +} + +sub get_chan_str { + my ($index) = @_; + my $chan = $index % 4; + if ($chan == 0 ) { + return 'X'; + } elsif ($chan == 1) { + return 'Y'; + } elsif ($chan == 2) { + return 'Z'; + } elsif ($chan == 3) { + return 'W'; + } else { + die("Unknown chan value: $chan"); + } +} + +1; diff --git a/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp new file mode 100644 index 00000000000..ce947f8ff78 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp @@ -0,0 +1,65 @@ +//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers AMDIL machine instructions to the appropriate hardware +// instructions. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +namespace { + class AMDGPUConvertToISAPass : public MachineFunctionPass { + + private: + static char ID; + TargetMachine &TM; + + void lowerFLT(MachineInstr &MI); + + public: + AMDGPUConvertToISAPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + }; +} /* End anonymous namespace */ + +char AMDGPUConvertToISAPass::ID = 0; + +FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { + return new AMDGPUConvertToISAPass(tm); +} + +bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) +{ + const AMDGPUInstrInfo * TII = + static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo()); + bool changed = false; + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + /* Advance I before the current instruction can be erased; never step past end(). */ + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { + MachineBasicBlock::iterator Cur = I++; + MachineInstr * newInstr = TII->convertToISA(*Cur, MF, MBB.findDebugLoc(Cur)); + if (newInstr) { + MBB.insert(Cur, newInstr); + Cur->eraseFromParent(); + changed = true; /* report the modification to the pass manager */ + } + } + } + return changed; +} diff --git a/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl new file mode 100644 index 00000000000..1fd4fb04b3e --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl @@ -0,0 +1,126 @@ +#===-- AMDGPUGenInstrEnums.pl - TODO: Add brief description -------===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +#===----------------------------------------------------------------------===# +# +# TODO: Add full description +# +#===----------------------------------------------------------------------===# + +use warnings; +use strict; + +my @F32_MULTICLASSES = qw { + UnaryIntrinsicFloat + UnaryIntrinsicFloatScalar + BinaryIntrinsicFloat + TernaryIntrinsicFloat + BinaryOpMCFloat +}; + +my @I32_MULTICLASSES = qw { + BinaryOpMCInt + BinaryOpMCi32 + BinaryOpMCi32Const +}; + +my @GENERATION_ENUM = qw { + R600_CAYMAN + R600 + EG + EG_CAYMAN + CAYMAN + SI +}; + +my $FILE_TYPE = $ARGV[0]; + +open AMDIL, '<', 'AMDILInstructions.td' or die "Cannot open AMDILInstructions.td: $!"; + +my @INST_ENUMS = ('NONE', 'FEQ', 'FGE', 'FLT', 'FNE', 'MOVE_f32', 'MOVE_i32', 'FTOI', 'ITOF', 'CMOVLOG_f32', 'UGT', 'IGE', 'INE', 'UGE', 'IEQ'); + +while (<AMDIL>) { + if ($_ =~ /defm\s+([A-Z_]+)\s+:\s+([A-Za-z0-9]+)</) { + if (grep {$_ eq $2} @F32_MULTICLASSES) { + push @INST_ENUMS, "$1\_f32"; + + } elsif (grep {$_ eq $2} @I32_MULTICLASSES) { + push @INST_ENUMS, "$1\_i32"; + } + } elsif ($_ =~ /def\s+([A-Z_]+)(_[fi]32)/) { + push @INST_ENUMS, "$1$2"; + } +} + +if ($FILE_TYPE eq 'td') { + + print_td_enum('AMDILInst', 'AMDILInstEnums', 'field bits<16>', @INST_ENUMS); + + print_td_enum('AMDGPUGen', 'AMDGPUGenEnums', 'field bits<3>', @GENERATION_ENUM); + + my %constants = ( + 'PI' => '0x40490fdb', + 'TWO_PI' => '0x40c90fdb', + 'TWO_PI_INV' => '0x3e22f983' + ); + + print "class Constants {\n"; + foreach (keys(%constants)) { + print "int $_ = $constants{$_};\n"; + } + print "}\n"; + print "def CONST : Constants;\n"; + +} elsif ($FILE_TYPE eq 'h') { + + print "unsigned GetRealAMDILOpcode(unsigned internalOpcode) const;\n"; + + print_h_enum('AMDILTblgenOpcode', @INST_ENUMS); + + print_h_enum('AMDGPUGen', @GENERATION_ENUM); + +} elsif ($FILE_TYPE eq 'inc') { + print "unsigned AMDGPUInstrInfo::GetRealAMDILOpcode(unsigned internalOpcode) const\n{\n"; + print " switch(internalOpcode) {\n"; + #Start at 1 so we skip NONE + for (my $i = 1; $i < 
scalar(@INST_ENUMS); $i++) { + my $inst = $INST_ENUMS[$i]; + print " case AMDGPUInstrInfo::$inst: return AMDIL::$inst;\n"; + } + print " default: abort();\n"; + print " }\n}\n"; +} + + +sub print_td_enum { + my ($instance, $class, $field, @values) = @_; + + print "class $class {\n"; + + for (my $i = 0; $i < scalar(@values); $i++) { + print " $field $values[$i] = $i;\n"; + } + print "}\n"; + + print "def $instance : $class;\n"; +} + +sub print_h_enum { + + my ($enum, @list) = @_; + print "enum $enum {\n"; + + for (my $i = 0; $i < scalar(@list); $i++) { + print " $list[$i] = $i"; + if ($i != $#list) { + print ','; + } + print "\n"; + } + print "};\n"; +} + diff --git a/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl b/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl new file mode 100644 index 00000000000..60523a7b48f --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl @@ -0,0 +1,30 @@ +#===-- AMDGPUGenShaderPatterns.pl - TODO: Add brief description -------===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +#===----------------------------------------------------------------------===# +# +# TODO: Add full description +# +#===----------------------------------------------------------------------===# + +use strict; +use warnings; + +use AMDGPUConstants; + +my $reg_prefix = $ARGV[0]; + +for (my $i = 0; $i < CONST_REG_COUNT * 4; $i++) { + my $index = get_hw_index($i); + my $chan = get_chan_str($i); +print <<STRING; +def : Pat < + (int_AMDGPU_load_const $i), + (f32 (MOV (f32 $reg_prefix$index\_$chan))) +>; +STRING +} diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp new file mode 100644 index 00000000000..2c1052fd8ea --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp @@ -0,0 +1,31 @@ +//===-- AMDGPUISelLowering.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPUUtil.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : + AMDILTargetLowering(TM) +{ +} + +void AMDGPUTargetLowering::addLiveIn(MachineInstr * MI, + MachineFunction * MF, MachineRegisterInfo & MRI, + const struct TargetInstrInfo * TII, unsigned reg) const +{ + AMDGPU::utilAddLiveIn(MF, MRI, TII, reg, MI->getOperand(0).getReg()); +} + diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.h b/src/gallium/drivers/radeon/AMDGPUISelLowering.h new file mode 100644 index 00000000000..3c5beb1cdae --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.h @@ -0,0 +1,35 @@ +//===-- AMDGPUISelLowering.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUISELLOWERING_H +#define AMDGPUISELLOWERING_H + +#include "AMDILISelLowering.h" + +namespace llvm { + +class AMDGPUTargetLowering : public AMDILTargetLowering +{ +protected: + void addLiveIn(MachineInstr * MI, MachineFunction * MF, + MachineRegisterInfo & MRI, const struct TargetInstrInfo * TII, + unsigned reg) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM); + +}; + +} /* End namespace llvm */ + +#endif /* AMDGPUISELLOWERING_H */ diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp new file mode 100644 index 00000000000..4742283f688 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp @@ -0,0 +1,116 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the TargetInstrInfo class that is +// common to all AMD GPUs. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "AMDIL.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +AMDGPUInstrInfo::AMDGPUInstrInfo(AMDGPUTargetMachine &tm) + : AMDILInstrInfo(tm), TM(tm) +{ + const AMDILDevice * dev = TM.getSubtarget<AMDILSubtarget>().device(); + for (unsigned i = 0; i < AMDIL::INSTRUCTION_LIST_END; i++) { + const MCInstrDesc & instDesc = get(i); + uint32_t instGen = (instDesc.TSFlags >> 40) & 0x7; + uint32_t inst = (instDesc.TSFlags >> 48) & 0xffff; + if (inst == 0) { + continue; + } + switch (instGen) { + case AMDGPUInstrInfo::R600_CAYMAN: + if (dev->getGeneration() > AMDILDeviceInfo::HD6XXX) { + continue; + } + break; + case AMDGPUInstrInfo::R600: + if (dev->getGeneration() != AMDILDeviceInfo::HD4XXX) { + continue; + } + break; + case AMDGPUInstrInfo::EG_CAYMAN: + if (dev->getGeneration() < AMDILDeviceInfo::HD5XXX + || dev->getGeneration() > AMDILDeviceInfo::HD6XXX) { + continue; + } + break; + case AMDGPUInstrInfo::CAYMAN: + if (dev->getDeviceFlag() != OCL_DEVICE_CAYMAN) { + continue; + } + break; + case AMDGPUInstrInfo::SI: + if (dev->getGeneration() != AMDILDeviceInfo::HD7XXX) { + continue; + } + break; + default: + abort(); + break; + } + + unsigned amdilOpcode = GetRealAMDILOpcode(inst); + amdilToISA[amdilOpcode] = instDesc.Opcode; + } +} + +MachineInstr * AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const +{ + MachineInstrBuilder newInstr; + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AMDGPURegisterInfo & RI = getRegisterInfo(); + unsigned ISAOpcode = getISAOpcode(MI.getOpcode()); + + /* Create the new instruction */ + newInstr = BuildMI(MF, DL, TM.getInstrInfo()->get(ISAOpcode)); + + for (unsigned i = 0; i < MI.getNumOperands(); i++) { + MachineOperand &MO = MI.getOperand(i); + /* Convert dst regclass to one that 
is supported by the ISA */ + if (MO.isReg() && MO.isDef()) { + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); + const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); + + assert(newRegClass); + + MRI.setRegClass(MO.getReg(), newRegClass); + } + } + /* Add the operand to the new instruction */ + newInstr.addOperand(MO); + } + + return newInstr; +} + +unsigned AMDGPUInstrInfo::getISAOpcode(unsigned opcode) const +{ + if (amdilToISA.count(opcode) == 0) { + return opcode; + } else { + return amdilToISA.find(opcode)->second; + } +} + +bool AMDGPUInstrInfo::isRegPreload(const MachineInstr &MI) const +{ + return (get(MI.getOpcode()).TSFlags >> AMDGPU_TFLAG_SHIFTS::PRELOAD_REG) & 0x1; +} + +#include "AMDGPUInstrEnums.include" diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.h b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h new file mode 100644 index 00000000000..fa009bc6302 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h @@ -0,0 +1,59 @@ +//===-- AMDGPUInstrInfo.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Interface of the TargetInstrInfo class that is common to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUINSTRUCTIONINFO_H_ +#define AMDGPUINSTRUCTIONINFO_H_ + +#include "AMDGPURegisterInfo.h" +#include "AMDILInstrInfo.h" + +#include <map> + +namespace llvm { + + class AMDGPUTargetMachine; + class MachineFunction; + class MachineInstr; + class MachineInstrBuilder; + + class AMDGPUInstrInfo : public AMDILInstrInfo { + private: + AMDGPUTargetMachine & TM; + std::map<unsigned, unsigned> amdilToISA; + + public: + explicit AMDGPUInstrInfo(AMDGPUTargetMachine &tm); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + virtual unsigned getISAOpcode(unsigned AMDILopcode) const; + + virtual MachineInstr * convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const; + + bool isRegPreload(const MachineInstr &MI) const; + + #include "AMDGPUInstrEnums.h.include" + }; + +} // End llvm namespace + +/* AMDGPU target flags are stored in bits 32-39 */ +namespace AMDGPU_TFLAG_SHIFTS { + enum TFLAGS { + PRELOAD_REG = 32 + }; +} + + +#endif // AMDGPUINSTRUCTIONINFO_H_ diff --git a/src/gallium/drivers/radeon/AMDGPUInstructions.td b/src/gallium/drivers/radeon/AMDGPUInstructions.td new file mode 100644 index 00000000000..10eceb6ce53 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUInstructions.td @@ -0,0 +1,90 @@ +//===-- AMDGPUInstructions.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +include "AMDGPUInstrEnums.td" + +class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction { + field bits<16> AMDILOp = 0; + field bits<3> Gen = 0; + field bit PreloadReg = 0; + + let Namespace = "AMDIL"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let TSFlags{32} = PreloadReg; + let TSFlags{42-40} = Gen; + let TSFlags{63-48} = AMDILOp; +} + +class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> + : AMDGPUInst<outs, ins, asm, pattern> { + + field bits<32> Inst = 0xffffffff; + +} + +let isCodeGenOnly = 1 in { + + def EXPORT_REG : AMDGPUShaderInst < + (outs), + (ins GPRF32:$src), + "EXPORT_REG $src", + [(int_AMDGPU_export_reg GPRF32:$src)] + >; + + def LOAD_INPUT : AMDGPUShaderInst < + (outs GPRF32:$dst), + (ins i32imm:$src), + "LOAD_INPUT $dst, $src", + [] >{ + let PreloadReg = 1; + } + + def MASK_WRITE : AMDGPUShaderInst < + (outs), + (ins GPRF32:$src), + "MASK_WRITE $src", + [] + >; + + def RESERVE_REG : AMDGPUShaderInst < + (outs GPRF32:$dst), + (ins i32imm:$src), + "RESERVE_REG $dst, $src", + [(set GPRF32:$dst, (int_AMDGPU_reserve_reg imm:$src))]> { + let PreloadReg = 1; + } + + def STORE_OUTPUT: AMDGPUShaderInst < + (outs GPRF32:$dst), + (ins GPRF32:$src0, i32imm:$src1), + "STORE_OUTPUT $dst, $src0, $src1", + [(set GPRF32:$dst, (int_AMDGPU_store_output GPRF32:$src0, imm:$src1))] + >; +} + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul, + RegisterClass rc> : Pat < + (int_AMDGPU_pow rc:$src0, rc:$src1), + (exp_ieee (mul rc:$src1, (log_ieee rc:$src0))) +>; + +include "R600Instructions.td" + +include "SIInstrInfo.td" + diff --git 
a/src/gallium/drivers/radeon/AMDGPUIntrinsics.td b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td new file mode 100644 index 00000000000..d2cda0db936 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td @@ -0,0 +1,56 @@ +//===-- AMDGPUIntrinsics.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_export_reg : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>; + def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], []>; + def int_AMDGPU_reserve_reg : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>; + def int_AMDGPU_store_output : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], []>; + + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], []>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_cos : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], []>; + def int_AMDGPU_floor : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_kill : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, 
llvm_float_ty], []>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_sge : BinaryIntFloat; + def int_AMDGPU_sin : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_ssg : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[]>; +} + +include "SIIntrinsics.td" diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp new file mode 100644 index 00000000000..d33055ccb87 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp @@ -0,0 +1,38 @@ +//===-- AMDGPULowerShaderInstructions.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +#include "AMDGPULowerShaderInstructions.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +void AMDGPULowerShaderInstructionsPass::preloadRegister(MachineFunction * MF, + const TargetInstrInfo * TII, unsigned physReg, unsigned virtReg) const +{ + if (!MRI->isLiveIn(physReg)) { + MRI->addLiveIn(physReg, virtReg); + MachineBasicBlock &EntryMBB = MF->front(); + BuildMI(MF->front(), EntryMBB.begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + virtReg) + .addReg(physReg); + } else { + /* We can't mark the same register as preloaded twice, but we still must + * associate virtReg with the correct preloaded register. */ + unsigned newReg = MRI->getLiveInVirtReg(physReg); + MRI->replaceRegWith(virtReg, newReg); + } +} diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h new file mode 100644 index 00000000000..5ee77fafe2b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h @@ -0,0 +1,40 @@ +//===-- AMDGPULowerShaderInstructions.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
#ifndef AMDGPU_LOWER_SHADER_INSTRUCTIONS
#define AMDGPU_LOWER_SHADER_INSTRUCTIONS

namespace llvm {

class MachineFunction;
class MachineRegisterInfo;
class TargetInstrInfo;

/**
 * Helper base class that provides preloadRegister() to the shader lowering
 * passes.  The class never assigns MRI itself; it must be pointed at the
 * current function's MachineRegisterInfo before preloadRegister() is called.
 */
class AMDGPULowerShaderInstructionsPass {

  public:
    /* Start with a null MRI so that a forgotten assignment fails
     * deterministically instead of dereferencing an indeterminate pointer. */
    AMDGPULowerShaderInstructionsPass() : MRI(0) { }

  protected:
    MachineRegisterInfo * MRI;
    /**
     * Associates a virtual register with a preloaded physical register.
     *
     * @param MF      The function whose first block receives the copy.
     * @param TII     Instruction info used to build the COPY instruction.
     * @param physReg The physical register that will be preloaded.
     * @param virtReg The virtual register that currently holds the
     *                preloaded value.
     */
    void preloadRegister(MachineFunction * MF, const TargetInstrInfo * TII,
                         unsigned physReg, unsigned virtReg) const;
};

} // end namespace llvm


#endif // AMDGPU_LOWER_SHADER_INSTRUCTIONS
// ---- AMDGPURegisterInfo.cpp ----

#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"

using namespace llvm;

// Forwards both arguments to the AMDILRegisterInfo base and keeps references
// to them in TM and TII.
AMDGPURegisterInfo::AMDGPURegisterInfo(AMDGPUTargetMachine &tm,
    const TargetInstrInfo &tii)
: AMDILRegisterInfo(tm, tii),
  TM(tm),
  TII(tii)
  { }

// ---- AMDGPURegisterInfo.h ----

#ifndef AMDGPUREGISTERINFO_H_
#define AMDGPUREGISTERINFO_H_

#include "AMDILRegisterInfo.h"

namespace llvm {

  class AMDGPUTargetMachine;
  class TargetInstrInfo;

  // Abstract register-info base: both virtual functions below are pure, so
  // only subclasses that implement them can be instantiated.
  struct AMDGPURegisterInfo : public AMDILRegisterInfo
  {
    AMDGPUTargetMachine &TM;      // target machine passed to the constructor
    const TargetInstrInfo &TII;   // instruction info passed to the constructor

    AMDGPURegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);

    virtual BitVector getReservedRegs(const MachineFunction &MF) const = 0;

    // NOTE(review): presumably maps a register class to its ISA-specific
    // counterpart; the contract is defined by the subclasses, not visible here.
    virtual const TargetRegisterClass *
    getISARegClass(const TargetRegisterClass * rc) const = 0;
  };
} // End namespace llvm

#endif // AMDGPUREGISTERINFO_H_
//===-- AMDGPURegisterInfo.td - AMDGPU sub-register indices -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Defines the four sub-register indices sel_x .. sel_w in the AMDIL
// namespace, then includes the R600 and SI register descriptions.
//
//===----------------------------------------------------------------------===//

let Namespace = "AMDIL" in {
  def sel_x : SubRegIndex;
  def sel_y : SubRegIndex;
  def sel_z : SubRegIndex;
  def sel_w : SubRegIndex;
}

include "R600RegisterInfo.td"
include "SIRegisterInfo.td"
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDIL.h" +#include "AMDILInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Function.h" + +using namespace llvm; + +namespace { + class AMDGPUReorderPreloadInstructionsPass : public MachineFunctionPass { + + private: + static char ID; + TargetMachine &TM; + + public: + AMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "AMDGPU Reorder Preload Instructions"; } + }; +} /* End anonymous namespace */ + +char AMDGPUReorderPreloadInstructionsPass::ID = 0; + +FunctionPass *llvm::createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) { + return new AMDGPUReorderPreloadInstructionsPass(tm); +} + +/* This pass moves instructions that represent preloaded registers to the + * start of the program. 
*/ +bool AMDGPUReorderPreloadInstructionsPass::runOnMachineFunction(MachineFunction &MF) +{ + const AMDGPUInstrInfo * TII = + static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next, Next = llvm::next(I) ) { + MachineInstr &MI = *I; + if (TII->isRegPreload(MI)) { + MF.front().insert(MF.front().begin(), MI.removeFromParent()); + } + } + } + return false; +} diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp new file mode 100644 index 00000000000..4d6a1bd7e34 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp @@ -0,0 +1,180 @@ +//===-- AMDGPUTargetMachine.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILTargetMachine.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "R600KernelParameters.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/PassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OptLevel +) +: + AMDILTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + Subtarget(TT, CPU, FS), + mGM(new AMDILGlobalManager(0 /* Debug mode */)), + mKM(new AMDILKernelManager(this, mGM)), + mDump(false) + +{ + /* XXX: Add these two initializations to fix a segfault, not sure if this + * is correct. These are normally initialized in the AsmPrinter, but AMDGPU + * does not use the asm printer */ + Subtarget.setGlobalManager(mGM); + Subtarget.setKernelManager(mKM); + /* TLInfo uses InstrInfo so it must be initialized after. 
*/ + if (Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + InstrInfo = new R600InstrInfo(*this); + TLInfo = new R600TargetLowering(*this); + } else { + InstrInfo = new SIInstrInfo(*this); + TLInfo = new SITargetLowering(*this); + } +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() +{ + delete mGM; + delete mKM; +} + +bool AMDGPUTargetMachine::addPassesToEmitFile(PassManagerBase &PM, + formatted_raw_ostream &Out, + CodeGenFileType FileType, + bool DisableVerify) { + /* XXX: Hack here addPassesToEmitFile will fail, but this is Ok since we are + * only using it to access addPassesToGenerateCode() */ + bool fail = LLVMTargetMachine::addPassesToEmitFile(PM, Out, FileType, + DisableVerify); + assert(fail); + + const AMDILSubtarget &STM = getSubtarget<AMDILSubtarget>(); + std::string gpu = STM.getDeviceName(); + if (gpu == "SI") { + PM.add(createSICodeEmitterPass(Out)); + } else if (Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + PM.add(createR600CodeEmitterPass(Out)); + } else { + abort(); + return true; + } + PM.add(createGCInfoDeleter()); + + return false; +} + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM<AMDGPUTargetMachine>(); + } + + virtual bool addPreISel(); + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + virtual bool addPreSched2(); + virtual bool addPreEmitPass(); +}; +} // End of anonymous namespace + +TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { + return new AMDGPUPassConfig(this, PM); +} + +bool +AMDGPUPassConfig::addPreISel() +{ + const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>(); + if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + PM.add(createR600KernelParametersPass( + 
getAMDGPUTargetMachine().getTargetData())); + } + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + PM.add(createAMDILBarrierDetect(*TM)); + PM.add(createAMDILPrintfConvert(*TM)); + PM.add(createAMDILInlinePass(*TM)); + PM.add(createAMDILPeepholeOpt(*TM)); + PM.add(createAMDILISelDag(getAMDGPUTargetMachine())); + return false; +} + +bool AMDGPUPassConfig::addPreRegAlloc() { + const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>(); + + if (ST.device()->getGeneration() == AMDILDeviceInfo::HD7XXX) { + PM.add(createSIInitMachineFunctionInfoPass(*TM)); + } + + PM.add(createAMDGPUReorderPreloadInstructionsPass(*TM)); + if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + PM.add(createR600LowerShaderInstructionsPass(*TM)); + PM.add(createR600LowerInstructionsPass(*TM)); + } else { + PM.add(createSILowerShaderInstructionsPass(*TM)); + PM.add(createSIAssignInterpRegsPass(*TM)); + PM.add(createSIConvertToISAPass(*TM)); + } + PM.add(createAMDGPUConvertToISAPass(*TM)); + return false; +} + +bool AMDGPUPassConfig::addPostRegAlloc() { + return false; +} + +bool AMDGPUPassConfig::addPreSched2() { + return false; +} + +bool AMDGPUPassConfig::addPreEmitPass() { + const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>(); + PM.add(createAMDILCFGPreparationPass(*TM)); + PM.add(createAMDILCFGStructurizerPass(*TM)); + if (ST.device()->getGeneration() == AMDILDeviceInfo::HD7XXX) { + PM.add(createSIPropagateImmReadsPass(*TM)); + } + + PM.add(createAMDILIOExpansion(*TM)); + return false; +} + diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.h b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h new file mode 100644 index 00000000000..d4165b09e84 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h @@ -0,0 +1,62 @@ +//===-- AMDGPUTargetMachine.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_TARGET_MACHINE_H +#define AMDGPU_TARGET_MACHINE_H + +#include "AMDGPUInstrInfo.h" +#include "AMDILTargetMachine.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/Target/TargetData.h" + +namespace llvm { + +MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT); + +class AMDGPUTargetMachine : public AMDILTargetMachine { + AMDILSubtarget Subtarget; + const AMDGPUInstrInfo * InstrInfo; + AMDGPUTargetLowering * TLInfo; + AMDILGlobalManager *mGM; + AMDILKernelManager *mKM; + bool mDump; + +public: + AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS, + StringRef CPU, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;} + virtual const AMDILSubtarget *getSubtargetImpl() const {return &Subtarget; } + virtual const AMDGPURegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + virtual AMDGPUTargetLowering * getTargetLowering() const { + return TLInfo; + } + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + virtual bool addPassesToEmitFile(PassManagerBase &PM, + formatted_raw_ostream &Out, + CodeGenFileType FileType, + bool DisableVerify); +public: + void dumpCode() { mDump = true; } + bool shouldDumpCode() const { return mDump; } +}; + +} /* End namespace llvm */ + +#endif /* AMDGPU_TARGET_MACHINE_H */ diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.cpp b/src/gallium/drivers/radeon/AMDGPUUtil.cpp new file mode 100644 index 00000000000..d24b98070de --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUUtil.cpp @@ -0,0 +1,127 @@ +//===-- AMDGPUUtil.cpp - TODO: Add brief description -------===// 
+// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUUtil.h" +#include "AMDGPURegisterInfo.h" +#include "AMDIL.h" +#include "AMDILMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetRegisterInfo.h" + +using namespace llvm; + +/* Some instructions act as place holders to emulate operations that the GPU + * hardware does automatically. This function can be used to check if + * an opcode falls into this category. */ +bool llvm::isPlaceHolderOpcode(unsigned opcode) +{ + switch (opcode) { + default: return false; + case AMDIL::EXPORT_REG: + case AMDIL::RETURN: + case AMDIL::LOAD_INPUT: + case AMDIL::LAST: + case AMDIL::RESERVE_REG: + return true; + } +} + +bool llvm::isTransOp(unsigned opcode) +{ + switch(opcode) { + default: return false; + + case AMDIL::COS_f32: + case AMDIL::COS_r600: + case AMDIL::COS_eg: + case AMDIL::RSQ_f32: + case AMDIL::FTOI: + case AMDIL::ITOF: + case AMDIL::MULLIT: + case AMDIL::MUL_LIT_r600: + case AMDIL::MUL_LIT_eg: + case AMDIL::SHR_i32: + case AMDIL::SIN_f32: + case AMDIL::EXP_f32: + case AMDIL::EXP_IEEE_r600: + case AMDIL::EXP_IEEE_eg: + case AMDIL::LOG_CLAMPED_r600: + case AMDIL::LOG_IEEE_r600: + case AMDIL::LOG_CLAMPED_eg: + case AMDIL::LOG_IEEE_eg: + case AMDIL::LOG_f32: + return true; + } +} + +bool llvm::isTexOp(unsigned opcode) +{ + switch(opcode) { + default: return false; + case AMDIL::TEX_SAMPLE: + case AMDIL::TEX_SAMPLE_C: + case AMDIL::TEX_SAMPLE_L: + case AMDIL::TEX_SAMPLE_C_L: + case 
AMDIL::TEX_SAMPLE_LB: + case AMDIL::TEX_SAMPLE_C_LB: + case AMDIL::TEX_SAMPLE_G: + case AMDIL::TEX_SAMPLE_C_G: + return true; + } +} + +bool llvm::isReductionOp(unsigned opcode) +{ + switch(opcode) { + default: return false; + case AMDIL::DOT4_r600: + case AMDIL::DOT4_eg: + return true; + } +} + +bool llvm::isFCOp(unsigned opcode) +{ + switch(opcode) { + default: return false; + case AMDIL::BREAK_LOGICALZ_f32: + case AMDIL::BREAK_LOGICALNZ_i32: + case AMDIL::BREAK_LOGICALZ_i32: + case AMDIL::CONTINUE_LOGICALNZ_f32: + case AMDIL::IF_LOGICALNZ_i32: + case AMDIL::IF_LOGICALZ_f32: + case AMDIL::ELSE: + case AMDIL::ENDIF: + case AMDIL::ENDLOOP: + case AMDIL::IF_LOGICALNZ_f32: + case AMDIL::WHILELOOP: + return true; + } +} + +void AMDGPU::utilAddLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI, + const struct TargetInstrInfo * TII, unsigned physReg, unsigned virtReg) +{ + if (!MRI.isLiveIn(physReg)) { + MRI.addLiveIn(physReg, virtReg); + BuildMI(MF->front(), MF->front().begin(), DebugLoc(), + TII->get(TargetOpcode::COPY), virtReg) + .addReg(physReg); + } else { + MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg)); + } +} diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.h b/src/gallium/drivers/radeon/AMDGPUUtil.h new file mode 100644 index 00000000000..299146e1ba7 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDGPUUtil.h @@ -0,0 +1,49 @@ +//===-- AMDGPUUtil.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
#ifndef AMDGPU_UTIL_H
#define AMDGPU_UTIL_H

#include "AMDGPURegisterInfo.h"
#include "llvm/Support/DataTypes.h"

namespace llvm {

class AMDILMachineFunctionInfo;

class TargetMachine;
class TargetRegisterInfo;

/* True for opcodes that only act as place holders for operations the GPU
 * hardware does automatically. */
bool isPlaceHolderOpcode(unsigned opcode);

/* Opcode classification predicates; each one returns true when the opcode
 * belongs to the fixed list in its definition and false otherwise. */
bool isTransOp(unsigned opcode);
bool isTexOp(unsigned opcode);
bool isReductionOp(unsigned opcode);
bool isFCOp(unsigned opcode);

/* XXX: Move these to AMDGPUInstrInfo.h */
/* Single-bit flags occupying bits 0 to 3.  Being macros, they are not
 * scoped by the surrounding namespace. */
#define MO_FLAG_CLAMP (1 << 0)
#define MO_FLAG_NEG   (1 << 1)
#define MO_FLAG_ABS   (1 << 2)
#define MO_FLAG_MASK  (1 << 3)

} /* End namespace llvm */

namespace AMDGPU {

/* Registers physReg as a live-in tied to virtReg, or, when physReg is
 * already live-in, replaces virtReg with the register recorded for it. */
void utilAddLiveIn(llvm::MachineFunction * MF, llvm::MachineRegisterInfo & MRI,
    const struct llvm::TargetInstrInfo * TII, unsigned physReg, unsigned virtReg);

} // End namespace AMDGPU

#endif /* AMDGPU_UTIL_H */
#ifndef AMDIL_H_
#define AMDIL_H_

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetMachine.h"

// AMDIL version and resource limits.
#define AMDIL_MAJOR_VERSION  2
#define AMDIL_MINOR_VERSION  0
#define AMDIL_REVISION_NUMBER  74
#define ARENA_SEGMENT_RESERVED_UAVS 12
#define DEFAULT_ARENA_UAV_ID 8
#define DEFAULT_RAW_UAV_ID 7
#define GLOBAL_RETURN_RAW_UAV_ID 11
#define HW_MAX_NUM_CB 8
#define MAX_NUM_UNIQUE_UAVS 8
#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
#define OPENCL_MAX_READ_IMAGES 128
#define OPENCL_MAX_WRITE_IMAGES 8
#define OPENCL_MAX_SAMPLERS 16

// The default IDs below can never be zero, as zero is the ID that is
// used to assert against.
// NOTE(review): this comment used to say "the next two values", but three
// *_ID defines follow - confirm which of them the rule applies to.
#define DEFAULT_LDS_ID     1
#define DEFAULT_GDS_ID     1
#define DEFAULT_SCRATCH_ID 1
#define DEFAULT_VEC_SLOTS  8

// SC->CAL version matchings.
#define CAL_VERSION_SC_150 1700
#define CAL_VERSION_SC_149 1700
#define CAL_VERSION_SC_148 1525
#define CAL_VERSION_SC_147 1525
#define CAL_VERSION_SC_146 1525
#define CAL_VERSION_SC_145 1451
#define CAL_VERSION_SC_144 1451
#define CAL_VERSION_SC_143 1441
#define CAL_VERSION_SC_142 1441
#define CAL_VERSION_SC_141 1420
#define CAL_VERSION_SC_140 1400
#define CAL_VERSION_SC_139 1387
#define CAL_VERSION_SC_138 1387
#define CAL_APPEND_BUFFER_SUPPORT 1340
#define CAL_VERSION_SC_137 1331
#define CAL_VERSION_SC_136 982
#define CAL_VERSION_SC_135 950
#define CAL_VERSION_GLOBAL_RETURN_BUFFER 990

// One bit per device.  Note that OCL_DEVICE_ALL (0x3FFF) also covers bits
// 0x0800 - 0x2000, which have no named device in this list.
#define OCL_DEVICE_RV710   0x0001
#define OCL_DEVICE_RV730   0x0002
#define OCL_DEVICE_RV770   0x0004
#define OCL_DEVICE_CEDAR   0x0008
#define OCL_DEVICE_REDWOOD 0x0010
#define OCL_DEVICE_JUNIPER 0x0020
#define OCL_DEVICE_CYPRESS 0x0040
#define OCL_DEVICE_CAICOS  0x0080
#define OCL_DEVICE_TURKS   0x0100
#define OCL_DEVICE_BARTS   0x0200
#define OCL_DEVICE_CAYMAN  0x0400
#define OCL_DEVICE_ALL     0x3FFF

/// The number of function ID's that are reserved for
/// internal compiler usage.
const unsigned int RESERVED_FUNCS = 1024;

// These three macros expand to nothing, so every "TM AMDIL_OPT_LEVEL_DECL"
// parameter list below declares just the single TM parameter.
#define AMDIL_OPT_LEVEL_DECL
#define AMDIL_OPT_LEVEL_VAR
#define AMDIL_OPT_LEVEL_VAR_NO_COMMA

namespace llvm {
class AMDILInstrPrinter;
class AMDILTargetMachine;
class FunctionPass;
class MCAsmInfo;
class raw_ostream;
class Target;
class TargetMachine;

/// Instruction selection passes.
FunctionPass*
  createAMDILISelDag(AMDILTargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILBarrierDetect(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILPrintfConvert(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILInlinePass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILPeepholeOpt(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);

/// Pre regalloc passes.
FunctionPass*
  createAMDILPointerManager(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILMachinePeephole(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);

/// Pre emit passes.
FunctionPass*
  createAMDILCFGPreparationPass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILCFGStructurizerPass(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILLiteralManager(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);
FunctionPass*
  createAMDILIOExpansion(TargetMachine &TM AMDIL_OPT_LEVEL_DECL);

extern Target TheAMDILTarget;
extern Target TheAMDGPUTarget;
} // end namespace llvm;

// Pull in the TableGen-generated register and instruction enumerations.
#define GET_REGINFO_ENUM
#include "AMDILGenRegisterInfo.inc"
#define GET_INSTRINFO_ENUM
#include "AMDILGenInstrInfo.inc"

/// Include device information enumerations
#include "AMDILDeviceInfo.h"

namespace llvm {
/// OpenCL uses address spaces to differentiate between
/// various memory regions on the hardware. On the CPU
/// all of the address spaces point to the same memory,
/// however on the GPU, each address space points to
/// a separate piece of memory that is unique from other
/// memory locations.
namespace AMDILAS {
enum AddressSpaces {
  PRIVATE_ADDRESS  = 0, // Address space for private memory.
  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
  CONSTANT_ADDRESS = 2, // Address space for constant memory.
  LOCAL_ADDRESS    = 3, // Address space for local memory.
  REGION_ADDRESS   = 4, // Address space for region memory.
  ADDRESS_NONE     = 5, // Address space for unknown memory.
  PARAM_D_ADDRESS  = 6, // Address space for directly addressable parameter memory (CONST0)
  PARAM_I_ADDRESS  = 7, // Address space for indirectly addressable parameter memory (VTX1)
  LAST_ADDRESS     = 8
};

// We are piggybacking on the CommentFlag enum in MachineInstr.h to
// set bits in AsmPrinterFlags of the MachineInstruction. We will
// start at bit 16 and allocate down while LLVM will start at bit
// 1 and allocate up.

// This union/struct combination is an easy way to read out the
// exact bits that are needed.  The two #ifdef branches declare the same
// fields in opposite order, so the declaration order follows the byte order
// selected by __BIG_ENDIAN__.
typedef union ResourceRec {
  struct {
#ifdef __BIG_ENDIAN__
    unsigned short isImage       : 1;  // Reserved for future use/llvm.
    unsigned short ResourceID    : 10; // The resource ID for
                                       // the op.
    unsigned short HardwareInst  : 1;  // Flag to specify that this instruction
                                       // is a hardware instruction.
    unsigned short ConflictPtr   : 1;  // Flag to specify that the pointer has a
                                       // conflict.
    unsigned short ByteStore     : 1;  // Flag to specify if the op is a byte
                                       // store op.
    unsigned short PointerPath   : 1;  // Flag to specify if the op is on the
                                       // pointer path.
    unsigned short CacheableRead : 1;  // Flag to specify if the read is
                                       // cacheable.
#else
    unsigned short CacheableRead : 1;  // Flag to specify if the read is
                                       // cacheable.
    unsigned short PointerPath   : 1;  // Flag to specify if the op is on the
                                       // pointer path.
    unsigned short ByteStore     : 1;  // Flag to specify if the op is a byte
                                       // store op.
    unsigned short ConflictPtr   : 1;  // Flag to specify that the pointer has
                                       // a conflict.
    unsigned short HardwareInst  : 1;  // Flag to specify that this instruction
                                       // is a hardware instruction.
    unsigned short ResourceID    : 10; // The resource ID for
                                       // the op.
    unsigned short isImage       : 1;  // Reserved for future use.
#endif
  } bits;
  unsigned short u16all;               // All 16 bits viewed as one value.
} InstrResEnc;

} // namespace AMDILAS

// The OpSwizzle encodes a subset of all possible
// swizzle combinations into a number of bits using
// only the combinations utilized by the backend.
// The lower 128 are for source swizzles and the
// upper 128 are for destination swizzles.
// The valid mappings can be found in the
// getSrcSwizzle and getDstSwizzle functions of
// AMDILUtilityFunctions.cpp.
typedef union SwizzleRec {
  struct {
#ifdef __BIG_ENDIAN__
    unsigned char dst : 1;
    unsigned char swizzle : 7;
#else
    unsigned char swizzle : 7;
    unsigned char dst : 1;
#endif
  } bits;
  unsigned char u8all;                 // Both fields viewed as one byte.
} OpSwizzle;
// Enums corresponding to AMDIL condition codes for IL.  These
// values must be kept in sync with the ones in the .td file.
namespace AMDILCC {
enum CondCodes {
  // AMDIL specific condition codes. These correspond to the IL_CC_*
  // in AMDILInstrInfo.td and must be kept in the same order.
  IL_CC_D_EQ  =  0,   // DEQ instruction.
  IL_CC_D_GE  =  1,   // DGE instruction.
  IL_CC_D_LT  =  2,   // DLT instruction.
  IL_CC_D_NE  =  3,   // DNE instruction.
  IL_CC_F_EQ  =  4,   //  EQ instruction.
  IL_CC_F_GE  =  5,   //  GE instruction.
  IL_CC_F_LT  =  6,   //  LT instruction.
  IL_CC_F_NE  =  7,   //  NE instruction.
  IL_CC_I_EQ  =  8,   // IEQ instruction.
  IL_CC_I_GE  =  9,   // IGE instruction.
  IL_CC_I_LT  = 10,   // ILT instruction.
  IL_CC_I_NE  = 11,   // INE instruction.
  IL_CC_U_GE  = 12,   // UGE instruction.
  IL_CC_U_LT  = 13,   // ULT instruction.
  // Pseudo IL Comparison instructions here.
  IL_CC_F_GT  = 14,   //  GT instruction.
  IL_CC_U_GT  = 15,
  IL_CC_I_GT  = 16,
  IL_CC_D_GT  = 17,
  IL_CC_F_LE  = 18,   //  LE instruction
  IL_CC_U_LE  = 19,
  IL_CC_I_LE  = 20,
  IL_CC_D_LE  = 21,
  IL_CC_F_UNE = 22,
  IL_CC_F_UEQ = 23,
  IL_CC_F_ULT = 24,
  IL_CC_F_UGT = 25,
  IL_CC_F_ULE = 26,
  IL_CC_F_UGE = 27,
  IL_CC_F_ONE = 28,
  IL_CC_F_OEQ = 29,
  IL_CC_F_OLT = 30,
  IL_CC_F_OGT = 31,
  IL_CC_F_OLE = 32,
  IL_CC_F_OGE = 33,
  IL_CC_D_UNE = 34,
  IL_CC_D_UEQ = 35,
  IL_CC_D_ULT = 36,
  IL_CC_D_UGT = 37,
  IL_CC_D_ULE = 38,
  IL_CC_D_UGE = 39,
  IL_CC_D_ONE = 40,
  IL_CC_D_OEQ = 41,
  IL_CC_D_OLT = 42,
  IL_CC_D_OGT = 43,
  IL_CC_D_OLE = 44,
  IL_CC_D_OGE = 45,
  IL_CC_U_EQ  = 46,
  IL_CC_U_NE  = 47,
  IL_CC_F_O   = 48,
  IL_CC_D_O   = 49,
  IL_CC_F_UO  = 50,
  IL_CC_D_UO  = 51,
  IL_CC_L_LE  = 52,
  IL_CC_L_GE  = 53,
  IL_CC_L_EQ  = 54,
  IL_CC_L_NE  = 55,
  IL_CC_L_LT  = 56,
  IL_CC_L_GT  = 57,
  IL_CC_UL_LE = 58,
  IL_CC_UL_GE = 59,
  IL_CC_UL_EQ = 60,
  IL_CC_UL_NE = 61,
  IL_CC_UL_LT = 62,
  IL_CC_UL_GT = 63,
  COND_ERROR  = 64
};

} // end namespace AMDILCC
} // end namespace llvm
#endif // AMDIL_H_
//
// @file AMDIL789IOExpansion.cpp
// @details Implementation of the IO expansion class for 789 devices.
//
#include "AMDILCompilerErrors.h"
#include "AMDILCompilerWarnings.h"
#include "AMDILDevices.h"
#include "AMDILGlobalManager.h"
#include "AMDILIOExpansion.h"
#include "AMDILKernelManager.h"
#include "AMDILMachineFunctionInfo.h"
#include "AMDILTargetMachine.h"
#include "AMDILUtilityFunctions.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Support/DebugLoc.h"
#include "llvm/Value.h"

using namespace llvm;
// Forwards the target machine to the AMDILIOExpansion base class; the
// AMDIL_OPT_LEVEL_* macros expand to nothing.
AMDIL789IOExpansion::AMDIL789IOExpansion(TargetMachine &tm
    AMDIL_OPT_LEVEL_DECL)
: AMDILIOExpansion(tm  AMDIL_OPT_LEVEL_VAR)
{
}

AMDIL789IOExpansion::~AMDIL789IOExpansion() {
}

const char *AMDIL789IOExpansion::getPassName() const
{
  return "AMDIL 789 IO Expansion Pass";
}
// Emits, in front of MI, six instructions that reduce the vector in src to
// one component chosen by flag and write it to dst.  R1006 and R1007 are
// used as scratch registers.  The `before` parameter is not read.
//
// This code produces the following pseudo-IL:
// mov r1007, $src.y000
// cmov_logical r1007.x___, $flag.yyyy, r1007.xxxx, $src.xxxx
// mov r1006, $src.z000
// cmov_logical r1007.x___, $flag.zzzz, r1006.xxxx, r1007.xxxx
// mov r1006, $src.w000
// cmov_logical $dst.x___, $flag.wwww, r1006.xxxx, r1007.xxxx
void
AMDIL789IOExpansion::emitComponentExtract(MachineInstr *MI,
    unsigned flag, unsigned src, unsigned dst, bool before)
{
  MachineBasicBlock::iterator I = *MI;
  DebugLoc DL = MI->getDebugLoc();
  // Extract with immediate 2 (the .y component in the pseudo-IL above) ...
  BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007)
    .addReg(src)
    .addImm(2);
  // ... and select between it and src on the flag.
  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
    .addReg(flag)
    .addReg(AMDIL::R1007)
    .addReg(src);
  // Same for immediate 3 (.z), accumulating into R1007.
  BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1006)
    .addReg(src)
    .addImm(3);
  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1007)
    .addReg(flag)
    .addReg(AMDIL::R1006)
    .addReg(AMDIL::R1007);
  // Immediate 4 (.w); the final select writes the result to dst.
  BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1006)
    .addReg(src)
    .addImm(4);
  BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_W_i32), dst)
    .addReg(flag)
    .addReg(AMDIL::R1006)
    .addReg(AMDIL::R1007);

}
// We have a 128 bit load but a 8/16/32bit value, so we need to
// select the correct component and make sure that the correct
// bits are selected. For the 8 and 16 bit cases we need to
// extract from the component the correct bits and for 32 bits
// we just need to select the correct component.
+ void +AMDIL789IOExpansion::emitDataLoadSelect(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + DebugLoc DL = MI->getDebugLoc(); + emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1011, AMDIL::R1011, false); + if (getMemorySize(MI) == 1) { + // This produces the following pseudo-IL: + // iand r1006.x___, r1010.xxxx, l14.xxxx + // mov r1006, r1006.xxxx + // iadd r1006, r1006, {0, -1, 2, 3} + // ieq r1008, r1006, 0 + // mov r1011, r1011.xxxx + // ishr r1011, r1011, {0, 8, 16, 24} + // mov r1007, r1011.y000 + // cmov_logical r1007.x___, r1008.yyyy, r1007.xxxx, r1011.xxxx + // mov r1006, r1011.z000 + // cmov_logical r1007.x___, r1008.zzzz, r1006.xxxx, r1007.xxxx + // mov r1006, r1011.w000 + // cmov_logical r1011.x___, r1008.wwww, r1006.xxxx, r1007.xxxx + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1006) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1006) + .addReg(AMDIL::R1006); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1006) + .addReg(AMDIL::R1006) + .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, + (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32)))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1006) + .addImm(mMFI->addi32Literal(0)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHRVEC_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32))); + emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1011, AMDIL::R1011, false); + } else if (getMemorySize(MI) == 2) { + // This produces the following pseudo-IL: + // ishr r1007.x___, r1010.xxxx, 1 + // iand r1008.x___, r1007.xxxx, 1 + // ishr r1007.x___, r1011.xxxx, 16 + // cmov_logical r1011.x___, r1008.xxxx, r1007.xxxx, r1011.xxxx + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + 
.addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1011) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1007) + .addReg(AMDIL::R1011); + } +} +// This function does address calculations modifications to load from a vector +// register type instead of a dword addressed load. + void +AMDIL789IOExpansion::emitVectorAddressCalc(MachineInstr *MI, bool is32bit, bool needsSelect) +{ + MachineBasicBlock::iterator I = *MI; + DebugLoc DL = MI->getDebugLoc(); + // This produces the following pseudo-IL: + // ishr r1007.x___, r1010.xxxx, (is32bit) ? 2 : 3 + // iand r1008.x___, r1007.xxxx, (is32bit) ? 3 : 1 + // ishr r1007.x___, r1007.xxxx, (is32bit) ? 2 : 1 + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal((is32bit) ? 0x2 : 3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal((is32bit) ? 3 : 1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1007) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal((is32bit) ? 2 : 1)); + if (needsSelect) { + // If the component selection is required, the following + // pseudo-IL is produced. + // mov r1008, r1008.xxxx + // iadd r1008, r1008, (is32bit) ? {0, -1, -2, -3} : {0, 0, -1, -1} + // ieq r1008, r1008, 0 + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi128Literal((is32bit) ? 0xFFFFFFFFULL << 32 : 0ULL, + (is32bit) ? 
0xFFFFFFFEULL | (0xFFFFFFFDULL << 32) : + -1ULL)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(0)); + } +} +// This function emits a switch statement and writes 32bit/64bit +// value to a 128bit vector register type. + void +AMDIL789IOExpansion::emitVectorSwitchWrite(MachineInstr *MI, bool is32bit) +{ + MachineBasicBlock::iterator I = *MI; + uint32_t xID = getPointerID(MI); + assert(xID && "Found a scratch store that was incorrectly marked as zero ID!\n"); + // This section generates the following pseudo-IL: + // switch r1008.x + // default + // mov x1[r1007.x].(is32bit) ? x___ : xy__, r1011.x{y} + // break + // case 1 + // mov x1[r1007.x].(is32bit) ? _y__ : __zw, r1011.x{yxy} + // break + // if is32bit is true, case 2 and 3 are emitted. + // case 2 + // mov x1[r1007.x].__z_, r1011.x + // break + // case 3 + // mov x1[r1007.x].___w, r1011.x + // break + // endswitch + DebugLoc DL; + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::SWITCH)) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::DEFAULT)); + BuildMI(*mBB, I, DL, + mTII->get((is32bit) ? AMDIL::SCRATCHSTORE_X : AMDIL::SCRATCHSTORE_XY) + , AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(xID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(1); + BuildMI(*mBB, I, DL, + mTII->get((is32bit) ? 
AMDIL::SCRATCHSTORE_Y : AMDIL::SCRATCHSTORE_ZW), AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(xID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK)); + if (is32bit) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(2); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHSTORE_Z), AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(xID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CASE)).addImm(3); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHSTORE_W), AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(xID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BREAK)); + } + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ENDSWITCH)); + +} + void +AMDIL789IOExpansion::expandPrivateLoad(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool HWPrivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem); + if (!HWPrivate || mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) { + return expandGlobalLoad(MI); + } + if (!mMFI->usesMem(AMDILDevice::SCRATCH_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + uint32_t xID = getPointerID(MI); + assert(xID && "Found a scratch load that was incorrectly marked as zero ID!\n"); + if (!xID) { + xID = mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID); + mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]); + } + DebugLoc DL; + // These instructions go before the current MI. + expandLoadStartCode(MI); + switch (getMemorySize(MI)) { + default: + // Since the private register is a 128 bit aligned, we have to align the address + // first, since our source address is 32bit aligned and then load the data. 
+ // This produces the following pseudo-IL: + // ishr r1010.x___, r1010.xxxx, 4 + // mov r1011, x1[r1010.x] + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SHR_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(4)); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(xID); + break; + case 1: + case 2: + case 4: + emitVectorAddressCalc(MI, true, true); + // This produces the following pseudo-IL: + // mov r1011, x1[r1007.x] + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011) + .addReg(AMDIL::R1007) + .addImm(xID); + // These instructions go after the current MI. + emitDataLoadSelect(MI); + break; + case 8: + emitVectorAddressCalc(MI, false, true); + // This produces the following pseudo-IL: + // mov r1011, x1[r1007.x] + // mov r1007, r1011.zw00 + // cmov_logical r1011.xy__, r1008.xxxx, r1011.xy, r1007.zw + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1011) + .addReg(AMDIL::R1007) + .addImm(xID); + // These instructions go after the current MI. 
+ BuildMI(*mBB, I, DL, + mTII->get(AMDIL::VEXTRACT_v2i64), AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(2); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::CMOVLOG_i64), AMDIL::R1011) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1007); + break; + } + expandPackedData(MI); + expandExtendLoad(MI); + BuildMI(*mBB, I, MI->getDebugLoc(), + mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass)), + MI->getOperand(0).getReg()) + .addReg(AMDIL::R1011); +} + + + void +AMDIL789IOExpansion::expandConstantLoad(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + if (!isHardwareInst(MI) || MI->memoperands_empty()) { + return expandGlobalLoad(MI); + } + uint32_t cID = getPointerID(MI); + if (cID < 2) { + return expandGlobalLoad(MI); + } + if (!mMFI->usesMem(AMDILDevice::CONSTANT_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + + DebugLoc DL; + // These instructions go before the current MI. + expandLoadStartCode(MI); + switch (getMemorySize(MI)) { + default: + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SHR_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(4)); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::CBLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(cID); + break; + case 1: + case 2: + case 4: + emitVectorAddressCalc(MI, true, true); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::CBLOAD), AMDIL::R1011) + .addReg(AMDIL::R1007) + .addImm(cID); + // These instructions go after the current MI. + emitDataLoadSelect(MI); + break; + case 8: + emitVectorAddressCalc(MI, false, true); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::CBLOAD), AMDIL::R1011) + .addReg(AMDIL::R1007) + .addImm(cID); + // These instructions go after the current MI. 
+ BuildMI(*mBB, I, DL, + mTII->get(AMDIL::VEXTRACT_v2i64), AMDIL::R1007) + .addReg(AMDIL::R1011) + .addImm(2); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1008) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::CMOVLOG_i64), AMDIL::R1011) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1007); + break; + } + expandPackedData(MI); + expandExtendLoad(MI); + BuildMI(*mBB, I, MI->getDebugLoc(), + mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass)), + MI->getOperand(0).getReg()) + .addReg(AMDIL::R1011); + MI->getOperand(0).setReg(AMDIL::R1011); +} + + void +AMDIL789IOExpansion::expandConstantPoolLoad(MachineInstr *MI) +{ + if (!isStaticCPLoad(MI)) { + return expandConstantLoad(MI); + } else { + uint32_t idx = MI->getOperand(1).getIndex(); + const MachineConstantPool *MCP = MI->getParent()->getParent() + ->getConstantPool(); + const std::vector<MachineConstantPoolEntry> &consts + = MCP->getConstants(); + const Constant *C = consts[idx].Val.ConstVal; + emitCPInst(MI, C, mKM, 0, isExtendLoad(MI)); + } +} + + void +AMDIL789IOExpansion::expandPrivateStore(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool HWPrivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem); + if (!HWPrivate || mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) { + return expandGlobalStore(MI); + } + if (!mMFI->usesMem(AMDILDevice::SCRATCH_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + uint32_t xID = getPointerID(MI); + assert(xID && "Found a scratch store that was incorrectly marked as zero ID!\n"); + if (!xID) { + xID = mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID); + mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]); + } + DebugLoc DL; + // These instructions go before the current MI. 
+ expandStoreSetupCode(MI); + switch (getMemorySize(MI)) { + default: + // This section generates the following pseudo-IL: + // ishr r1010.x___, r1010.xxxx, 4 + // mov x1[r1010.x], r1011 + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SHR_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(4)); + BuildMI(*mBB, I, MI->getDebugLoc(), + mTII->get(AMDIL::SCRATCHSTORE), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(xID); + break; + case 1: + emitVectorAddressCalc(MI, true, true); + // This section generates the following pseudo-IL: + // mov r1002, x1[r1007.x] + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1002) + .addReg(AMDIL::R1007) + .addImm(xID); + emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1002, AMDIL::R1002, true); + // This section generates the following pseudo-IL: + // iand r1003.x, r1010.x, 3 + // mov r1003, r1003.xxxx + // iadd r1000, r1003, {0, -1, -2, -3} + // ieq r1000, r1000, 0 + // mov r1002, r1002.xxxx + // ishr r1002, r1002, {0, 8, 16, 24} + // mov r1011, r1011.xxxx + // cmov_logical r1002, r1000, r1011, r1002 + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1003) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1003) + .addReg(AMDIL::R1003); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1001) + .addReg(AMDIL::R1003) + .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, + (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32)))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1001) + .addReg(AMDIL::R1001) + .addImm(mMFI->addi32Literal(0)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1002) + .addReg(AMDIL::R1002); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHRVEC_v4i32), AMDIL::R1002) + .addReg(AMDIL::R1002) + .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, 
mTII->get(AMDIL::CMOVLOG_v4i32), AMDIL::R1002) + .addReg(AMDIL::R1001) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1002); + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // This section generates the following pseudo-IL: + // iand r1002, r1002, 0xFF + // ishl r1002, r1002, {0, 8, 16, 24} + // ior r1002.xy, r1002.xy, r1002.zw + // ior r1011.x, r1002.x, r1002.y + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1002) + .addReg(AMDIL::R1002) + .addImm(mMFI->addi32Literal(0xFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1002) + .addReg(AMDIL::R1002) + .addImm(mMFI->addi128Literal(8ULL << 32, 16ULL | (24ULL << 32))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i64), AMDIL::R1002) + .addReg(AMDIL::R1002).addReg(AMDIL::R1002); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1002).addReg(AMDIL::R1002); + } else { + // This section generates the following pseudo-IL: + // mov r1001.xy, r1002.yw + // mov r1002.xy, r1002.xz + // ubit_insert r1002.xy, 8, 8, r1001.xy, r1002.xy + // mov r1001.x, r1002.y + // ubit_insert r1011.x, 16, 16, r1002.y, r1002.x + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1001) + .addReg(AMDIL::R1002); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1002) + .addReg(AMDIL::R1002); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), AMDIL::R1002) + .addImm(mMFI->addi32Literal(8)) + .addImm(mMFI->addi32Literal(8)) + .addReg(AMDIL::R1001) + .addReg(AMDIL::R1002); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1001) + .addReg(AMDIL::R1002); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)) + .addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1001) + .addReg(AMDIL::R1002); + } + emitVectorAddressCalc(MI, true, false); + emitVectorSwitchWrite(MI, true); + break; + case 2: + emitVectorAddressCalc(MI, true, true); + // This section generates the following 
pseudo-IL: + // mov r1002, x1[r1007.x] + BuildMI(*mBB, I, DL, + mTII->get(AMDIL::SCRATCHLOAD), AMDIL::R1002) + .addReg(AMDIL::R1007) + .addImm(xID); + emitComponentExtract(MI, AMDIL::R1008, AMDIL::R1002, AMDIL::R1002, true); + // This section generates the following pseudo-IL: + // ishr r1003.x, r1010.x, 1 + // iand r1003.x, r1003.x, 1 + // ishr r1001.x, r1002.x, 16 + // cmov_logical r1002.x, r1003.x, r1002.x, r1011.x + // cmov_logical r1001.x, r1003.x, r1011.x, r1001.x + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1003) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1003) + .addReg(AMDIL::R1003) + .addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1001) + .addReg(AMDIL::R1002) + .addImm(mMFI->addi32Literal(16)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1002) + .addReg(AMDIL::R1003) + .addReg(AMDIL::R1002) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1001) + .addReg(AMDIL::R1003) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1001); + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // This section generates the following pseudo-IL: + // iand r1002.x, r1002.x, 0xFFFF + // iand r1001.x, r1001.x, 0xFFFF + // ishl r1001.x, r1002.x, 16 + // ior r1011.x, r1002.x, r1001.x + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1002) + .addReg(AMDIL::R1002) + .addImm(mMFI->addi32Literal(0xFFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1001) + .addReg(AMDIL::R1001) + .addImm(mMFI->addi32Literal(0xFFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1001) + .addReg(AMDIL::R1001) + .addImm(mMFI->addi32Literal(16)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_OR_i32), AMDIL::R1011) + .addReg(AMDIL::R1002).addReg(AMDIL::R1001); + } else { + // This section generates the following pseudo-IL: + // ubit_insert r1011.x, 16, 16, 
r1001.y, r1002.x + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)) + .addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1001) + .addReg(AMDIL::R1002); + } + emitVectorAddressCalc(MI, true, false); + emitVectorSwitchWrite(MI, true); + break; + case 4: + emitVectorAddressCalc(MI, true, false); + emitVectorSwitchWrite(MI, true); + break; + case 8: + emitVectorAddressCalc(MI, false, false); + emitVectorSwitchWrite(MI, false); + break; + }; +} + void +AMDIL789IOExpansion::expandStoreSetupCode(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + DebugLoc DL; + if (MI->getOperand(0).isUndef()) { + BuildMI(*mBB, I, DL, mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass)), AMDIL::R1011) + .addImm(mMFI->addi32Literal(0)); + } else { + BuildMI(*mBB, I, DL, mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass)), AMDIL::R1011) + .addReg(MI->getOperand(0).getReg()); + } + expandTruncData(MI); + if (MI->getOperand(2).isReg()) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_i32), AMDIL::R1010) + .addReg(MI->getOperand(1).getReg()) + .addReg(MI->getOperand(2).getReg()); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::MOVE_i32), AMDIL::R1010) + .addReg(MI->getOperand(1).getReg()); + } + expandAddressCalc(MI); + expandPackedData(MI); +} + + +void +AMDIL789IOExpansion::expandPackedData(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + if (!isPackedData(MI)) { + return; + } + DebugLoc DL; + // If we have packed data, then the shift size is no longer + // the same as the load size and we need to adjust accordingly + switch(getPackedID(MI)) { + default: + break; + case PACK_V2I8: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi64Literal(0xFFULL | (0xFFULL << 32))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011).addImm(mMFI->addi64Literal(8ULL << 32)); + BuildMI(*mBB, I, 
DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1011); + } + break; + case PACK_V4I8: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi128Literal(8ULL << 32, (16ULL | (24ULL << 32)))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1011); + } + break; + case PACK_V2I16: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi64Literal(16ULL << 32)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1011); + } + break; + case PACK_V4I16: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi64Literal(16ULL << 32)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::HILO_BITOR_v4i16), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1011); + } + break; + case UNPACK_V2I8: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_i32), AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1012); + break; + case UNPACK_V4I8: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i8), AMDIL::R1011) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_v4i8), AMDIL::R1011) + 
.addReg(AMDIL::R1011) + .addImm(mMFI->addi128Literal(8ULL << 32, (16ULL | (24ULL << 32)))); + } + break; + case UNPACK_V2I16: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_i32), AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1012); + } + break; + case UNPACK_V4I16: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::USHRVEC_v2i32), AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1012); + } + break; + }; +} diff --git a/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp new file mode 100644 index 00000000000..df81c44f288 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp @@ -0,0 +1,157 @@ +//===-- AMDIL7XXDevice.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +#include "AMDIL7XXDevice.h" +#ifdef UPSTREAM_LLVM +#include "AMDIL7XXAsmPrinter.h" +#endif +#include "AMDILDevice.h" +#include "AMDILIOExpansion.h" +#include "AMDILPointerManager.h" + +using namespace llvm; + +AMDIL7XXDevice::AMDIL7XXDevice(AMDILSubtarget *ST) : AMDILDevice(ST) +{ + setCaps(); + std::string name = mSTM->getDeviceName(); + if (name == "rv710") { + mDeviceFlag = OCL_DEVICE_RV710; + } else if (name == "rv730") { + mDeviceFlag = OCL_DEVICE_RV730; + } else { + mDeviceFlag = OCL_DEVICE_RV770; + } +} + +AMDIL7XXDevice::~AMDIL7XXDevice() +{ +} + +void AMDIL7XXDevice::setCaps() +{ + mSWBits.set(AMDILDeviceInfo::LocalMem); +} + +size_t AMDIL7XXDevice::getMaxLDSSize() const +{ + if (usesHardware(AMDILDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_700; + } + return 0; +} + +size_t AMDIL7XXDevice::getWavefrontSize() const +{ + return AMDILDevice::HalfWavefrontSize; +} + +uint32_t AMDIL7XXDevice::getGeneration() const +{ + return AMDILDeviceInfo::HD4XXX; +} + +uint32_t AMDIL7XXDevice::getResourceID(uint32_t DeviceID) const +{ + switch (DeviceID) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case GLOBAL_ID: + case CONSTANT_ID: + case RAW_UAV_ID: + case ARENA_UAV_ID: + break; + case LDS_ID: + if (usesHardware(AMDILDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } + break; + case SCRATCH_ID: + if (usesHardware(AMDILDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } + break; + case GDS_ID: + assert(0 && "GDS UAV ID is not supported on this chip"); + if (usesHardware(AMDILDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } + break; + }; + + return 0; +} + +uint32_t AMDIL7XXDevice::getMaxNumUAVs() const +{ + return 1; +} + +FunctionPass* +AMDIL7XXDevice::getIOExpansion( + TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const +{ + return new AMDIL7XXIOExpansion(TM AMDIL_OPT_LEVEL_VAR); +} + +AsmPrinter* +AMDIL7XXDevice::getAsmPrinter(TargetMachine& 
TM, MCStreamer &Streamer) const +{ +#ifdef UPSTREAM_LLVM + return new AMDIL7XXAsmPrinter(TM, Streamer); +#else + return NULL; +#endif +} + +FunctionPass* +AMDIL7XXDevice::getPointerManager( + TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const +{ + return new AMDILPointerManager(TM AMDIL_OPT_LEVEL_VAR); +} + +AMDIL770Device::AMDIL770Device(AMDILSubtarget *ST): AMDIL7XXDevice(ST) +{ + setCaps(); +} + +AMDIL770Device::~AMDIL770Device() +{ +} + +void AMDIL770Device::setCaps() +{ + if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) { + mSWBits.set(AMDILDeviceInfo::FMA); + mHWBits.set(AMDILDeviceInfo::DoubleOps); + } + mSWBits.set(AMDILDeviceInfo::BarrierDetect); + mHWBits.reset(AMDILDeviceInfo::LongOps); + mSWBits.set(AMDILDeviceInfo::LongOps); + mSWBits.set(AMDILDeviceInfo::LocalMem); +} + +size_t AMDIL770Device::getWavefrontSize() const +{ + return AMDILDevice::WavefrontSize; +} + +AMDIL710Device::AMDIL710Device(AMDILSubtarget *ST) : AMDIL7XXDevice(ST) +{ +} + +AMDIL710Device::~AMDIL710Device() +{ +} + +size_t AMDIL710Device::getWavefrontSize() const +{ + return AMDILDevice::QuarterWavefrontSize; +} diff --git a/src/gallium/drivers/radeon/AMDIL7XXDevice.h b/src/gallium/drivers/radeon/AMDIL7XXDevice.h new file mode 100644 index 00000000000..87238e96006 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDIL7XXDevice.h @@ -0,0 +1,77 @@ +//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface for the subtarget data classes. +// +//===----------------------------------------------------------------------===// +// This file will define the interface that each generation needs to +// implement in order to correctly answer queries on the capabilities of the +// specific hardware. 
+//===----------------------------------------------------------------------===// +#ifndef _AMDIL7XXDEVICEIMPL_H_ +#define _AMDIL7XXDEVICEIMPL_H_ +#include "AMDILDevice.h" +#include "AMDILSubtarget.h" + +namespace llvm { +class AMDILSubtarget; + +//===----------------------------------------------------------------------===// +// 7XX generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + +// The AMDIL7XXDevice class represents the generic 7XX device. All 7XX +// devices are derived from this class. The AMDIL7XX device will only +// support the minimal features that are required to be considered OpenCL 1.0 +// compliant and nothing more. +class AMDIL7XXDevice : public AMDILDevice { +public: + AMDIL7XXDevice(AMDILSubtarget *ST); + virtual ~AMDIL7XXDevice(); + virtual size_t getMaxLDSSize() const; + virtual size_t getWavefrontSize() const; + virtual uint32_t getGeneration() const; + virtual uint32_t getResourceID(uint32_t DeviceID) const; + virtual uint32_t getMaxNumUAVs() const; + FunctionPass* + getIOExpansion(TargetMachine& AMDIL_OPT_LEVEL_DECL) const; + AsmPrinter* + getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const; + FunctionPass* + getPointerManager(TargetMachine& AMDIL_OPT_LEVEL_DECL) const; + +protected: + virtual void setCaps(); +}; // AMDIL7XXDevice + +// The AMDIL770Device class represents the RV770 chip and it's +// derivative cards. The difference between this device and the base +// class is this device device adds support for double precision +// and has a larger wavefront size. 
+class AMDIL770Device : public AMDIL7XXDevice { +public: + AMDIL770Device(AMDILSubtarget *ST); + virtual ~AMDIL770Device(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; // AMDIL770Device + +// The AMDIL710Device class derives from the 7XX base class, but this +// class is a smaller derivative, so we need to overload some of the +// functions in order to correctly specify this information. +class AMDIL710Device : public AMDIL7XXDevice { +public: + AMDIL710Device(AMDILSubtarget *ST); + virtual ~AMDIL710Device(); + virtual size_t getWavefrontSize() const; +}; // AMDIL710Device + +} // namespace llvm +#endif // _AMDILDEVICEIMPL_H_ diff --git a/src/gallium/drivers/radeon/AMDIL7XXIOExpansion.cpp b/src/gallium/drivers/radeon/AMDIL7XXIOExpansion.cpp new file mode 100644 index 00000000000..cddde313e2b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDIL7XXIOExpansion.cpp @@ -0,0 +1,548 @@ +//===-- AMDIL7XXIOExpansion.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
//
//==-----------------------------------------------------------------------===//
// @file AMDIL7XXIOExpansion.cpp
// @details Implementation of the IO Printing class for 7XX devices
//
#include "AMDILCompilerErrors.h"
#include "AMDILCompilerWarnings.h"
#include "AMDILDevices.h"
#include "AMDILGlobalManager.h"
#include "AMDILIOExpansion.h"
#include "AMDILKernelManager.h"
#include "AMDILMachineFunctionInfo.h"
#include "AMDILTargetMachine.h"
#include "AMDILUtilityFunctions.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Support/DebugLoc.h"
#include "llvm/Value.h"

using namespace llvm;

// Constructs the 7XX IO expansion pass; all state is forwarded to the
// AMDIL789IOExpansion base class.
AMDIL7XXIOExpansion::AMDIL7XXIOExpansion(TargetMachine &tm
    AMDIL_OPT_LEVEL_DECL) : AMDIL789IOExpansion(tm AMDIL_OPT_LEVEL_VAR)
{
}

AMDIL7XXIOExpansion::~AMDIL7XXIOExpansion() {
}

// Returns the human-readable name of this pass.
const char *AMDIL7XXIOExpansion::getPassName() const
{
  return "AMDIL 7XX IO Expansion Pass";
}

// Expands a global-memory load MI into raw UAV load instructions.
//
// After expandLoadStartCode(MI), the load is selected on getMemorySize(MI):
//   4 -> UAVRAWLOAD_i32, 8 -> UAVRAWLOAD_v2i32, any size without its own
//   case -> UAVRAWLOAD_v4i32; 1 and 2 -> a dword load from the address with
//   its low two bits cleared, followed by a right shift of the loaded value.
// The result is produced in R1011. A move from R1011 into MI's original
// operand 0 is emitted and MI's operand 0 is then rewritten to R1011.
// R1010 is used as the address register throughout (presumably set up by
// expandLoadStartCode -- confirm against AMDILIOExpansion).
  void
AMDIL7XXIOExpansion::expandGlobalLoad(MachineInstr *MI)
{
  DebugLoc DL;
  // These instructions go before the current MI.
  expandLoadStartCode(MI);
  uint32_t ID = getPointerID(MI);
  mKM->setOutputInst();
  switch(getMemorySize(MI)) {
    default:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_v4i32), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(ID);
      break;
    case 4:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(ID);
      break;
    case 8:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_v2i32), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(ID);
      break;
    case 1:
      // R1008 = byte offset within the dword (address & 3).
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      // R1010 = dword-aligned address.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
      // The offset is turned into a vector, a per-component literal is added
      // and the result compared with zero; the CMOVLOG chain below then picks
      // one of the shift amounts 0/8/16/24 (presumably 8 * byte offset).
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32,
              (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1012)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(0));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
        .addReg(AMDIL::R1012)
        .addImm(mMFI->addi32Literal(0))
        .addImm(mMFI->addi32Literal(24));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1008)
        .addReg(AMDIL::R1012)
        .addImm(mMFI->addi32Literal(8))
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1008)
        .addReg(AMDIL::R1012)
        .addImm(mMFI->addi32Literal(16))
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(ID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i8), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      break;
    case 2:
      // R1008 = (address & 3) >> 1, then mapped to a shift of 16 or 0.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(1));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(16))
        .addImm(mMFI->addi32Literal(0));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(ID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i16), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      break;
  }
  // These instructions go after the current MI.
  // NOTE(review): the BuildMI below is given MI as its insertion point, the
  // same as every BuildMI above -- confirm whether "after" is still accurate.
  expandPackedData(MI);
  expandExtendLoad(MI);
  BuildMI(*mBB, MI, MI->getDebugLoc(),
      mTII->get(getMoveInstFromID(
          MI->getDesc().OpInfo[0].RegClass)))
    .addOperand(MI->getOperand(0))
    .addReg(AMDIL::R1011);
  MI->getOperand(0).setReg(AMDIL::R1011);
}

// Expands a region-memory (GDS) load MI.
//
// - If the device does not support RegionMem, records REGION_MEMORY_ERROR and
//   emits nothing.
// - If the device does not use hardware region memory, or MI is not a
//   hardware region access, defers to expandGlobalLoad.
// - Otherwise emits GDSLOAD instructions selected on getMemorySize(MI); the
//   result is produced in R1011 and moved into MI's original operand 0.
// A zero pointer ID trips the assert; when asserts are compiled out the
// device's GDS resource ID is used instead and RECOVERABLE_ERROR is recorded.
  void
AMDIL7XXIOExpansion::expandRegionLoad(MachineInstr *MI)
{
  bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
  if (!mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
    mMFI->addErrorMsg(
        amd::CompilerErrorMessage[REGION_MEMORY_ERROR]);
    return;
  }
  if (!HWRegion || !isHardwareRegion(MI)) {
    return expandGlobalLoad(MI);
  }
  if (!mMFI->usesMem(AMDILDevice::GDS_ID)
      && mKM->isKernel()) {
    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
  }
  uint32_t gID = getPointerID(MI);
  assert(gID && "Found a GDS load that was incorrectly marked as zero ID!\n");
  if (!gID) {
    gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID);
    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
  }

  DebugLoc DL;
  // These instructions go before the current MI.
  expandLoadStartCode(MI);
  switch (getMemorySize(MI)) {
    default:
      // Widen the address to a vector and add a per-component literal
      // (presumably offsets 0..3), then load each component separately.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
        .addReg(AMDIL::R1010);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32)));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_Z), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_W), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      break;
    case 1:
      // R1008 = (address & 3) * 8, i.e. the bit offset of the byte.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(8));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      // The instruction would normally fit in right here so everything created
      // after this point needs to go into the afterInst vector.
      // Shift the wanted byte down, then shift left/right by 24.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(24));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(24));
      break;
    case 2:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(8));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      // The instruction would normally fit in right here so everything created
      // after this point needs to go into the afterInst vector.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(16));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(16));
      break;
    case 4:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      break;
    case 8:
      // NOTE(review): the address is widened with VCREATE_v2i32 but the add
      // uses the v4i32 opcode with a 64-bit literal -- confirm ADD_v4i32 is
      // intended here rather than a v2i32 add.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
        .addReg(AMDIL::R1010);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi64Literal(1ULL << 32));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(gID);
      break;
  }

  // These instructions go after the current MI.
  expandPackedData(MI);
  expandExtendLoad(MI);
  BuildMI(*mBB, MI, MI->getDebugLoc(),
      mTII->get(getMoveInstFromID(
          MI->getDesc().OpInfo[0].RegClass)))
    .addOperand(MI->getOperand(0))
    .addReg(AMDIL::R1011);
  MI->getOperand(0).setReg(AMDIL::R1011);
}

// Expands a local-memory (LDS) load MI.
//
// Defers to expandGlobalLoad when the device does not use hardware local
// memory or MI is not a hardware local access. Otherwise emits LDSLOAD /
// LDSLOADVEC selected on getMemorySize(MI); 1- and 2-byte loads read the
// containing dword and extract the value with shifts. The result is produced
// in R1011 and moved into MI's original operand 0.
// A zero pointer ID is handled the same way as in expandRegionLoad, using
// the LDS resource ID.
  void
AMDIL7XXIOExpansion::expandLocalLoad(MachineInstr *MI)
{
  bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
  if (!HWLocal || !isHardwareLocal(MI)) {
    return expandGlobalLoad(MI);
  }
  if (!mMFI->usesMem(AMDILDevice::LDS_ID)
      && mKM->isKernel()) {
    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
  }
  uint32_t lID = getPointerID(MI);
  assert(lID && "Found a LDS load that was incorrectly marked as zero ID!\n");
  if (!lID) {
    lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID);
    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
  }
  DebugLoc DL;
  // These instructions go before the current MI.
  expandLoadStartCode(MI);
  switch (getMemorySize(MI)) {
    default:
    case 8:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOADVEC), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(lID);
      break;
    case 4:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(lID);
      break;
    case 1:
      // R1008 = (address & 3) * 8; load the aligned dword and extract.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(8));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(lID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(24));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(24));
      break;
    case 2:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(8));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(0xFFFFFFFC));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011)
        .addReg(AMDIL::R1010)
        .addImm(lID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(16));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(16));
      break;
  }

  // These instructions go after the current MI.
  expandPackedData(MI);
  expandExtendLoad(MI);
  BuildMI(*mBB, MI, MI->getDebugLoc(),
      mTII->get(getMoveInstFromID(
          MI->getDesc().OpInfo[0].RegClass)))
    .addOperand(MI->getOperand(0))
    .addReg(AMDIL::R1011);
  MI->getOperand(0).setReg(AMDIL::R1011);
}

// Expands a global-memory store MI into raw UAV store instructions.
//
// After expandStoreSetupCode(MI), a UAVRAWSTORE is selected on
// getMemorySize(MI), with R1010 and R1011 as register operands (presumably
// address and data -- confirm). 1- and 2-byte stores record BYTE_STORE_ERROR
// and are emitted as a full 32-bit store.
  void
AMDIL7XXIOExpansion::expandGlobalStore(MachineInstr *MI)
{
  uint32_t ID = getPointerID(MI);
  mKM->setOutputInst();
  DebugLoc DL = MI->getDebugLoc();
  // These instructions go before the current MI.
  expandStoreSetupCode(MI);
  switch (getMemorySize(MI)) {
    default:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_v4i32), AMDIL::MEM)
        .addReg(AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(ID);
      break;
    case 1:
      mMFI->addErrorMsg(
          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
        .addReg(AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(ID);
      break;
    case 2:
      mMFI->addErrorMsg(
          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
        .addReg(AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(ID);
      break;
    case 4:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM)
        .addReg(AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(ID);
      break;
    case 8:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UAVRAWSTORE_v2i32), AMDIL::MEM)
        .addReg(AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(ID);
      break;
  };
}

// Expands a region-memory (GDS) store MI.
//
// Mirrors expandRegionLoad: records REGION_MEMORY_ERROR and returns when the
// device lacks RegionMem support, defers to expandGlobalStore when the access
// is not a hardware region access, and otherwise emits GDSSTORE instructions
// selected on getMemorySize(MI). 1- and 2-byte stores record
// BYTE_STORE_ERROR before emitting their sequence.
  void
AMDIL7XXIOExpansion::expandRegionStore(MachineInstr *MI)
{
  bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem);
  if (!mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) {
    mMFI->addErrorMsg(
        amd::CompilerErrorMessage[REGION_MEMORY_ERROR]);
    return;
  }
  if (!HWRegion || !isHardwareRegion(MI)) {
    return expandGlobalStore(MI);
  }
  DebugLoc DL = MI->getDebugLoc();
  mKM->setOutputInst();
  if (!mMFI->usesMem(AMDILDevice::GDS_ID)
      && mKM->isKernel()) {
    mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]);
  }
  uint32_t gID = getPointerID(MI);
  assert(gID && "Found a GDS store that was incorrectly marked as zero ID!\n");
  if (!gID) {
    gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID);
    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
  }

  // These instructions go before the current MI.
  expandStoreSetupCode(MI);
  switch (getMemorySize(MI)) {
    default:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010)
        .addReg(AMDIL::R1010);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32)));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_Z), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_W), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      break;
    case 1:
      mMFI->addErrorMsg(
          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(0xFF));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1012)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      // NOTE(review): the byte offset was just written to R1012, yet the
      // vector below is built from R1008, which this function has not set.
      // Likewise R1006 is computed but never read here, and the CMOVLOG_Z
      // result goes to R1012 while the following SHL reads R1007. Confirm
      // the intended registers before relying on this sequence.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32,
              (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32))));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::UMUL_i32), AMDIL::R1006)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(8));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1007)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(0xFFFFFF00))
        .addImm(mMFI->addi32Literal(0x00FFFFFF));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007)
        .addReg(AMDIL::R1008)
        .addReg(AMDIL::R1007)
        .addImm(mMFI->addi32Literal(0xFF00FFFF));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1012)
        .addReg(AMDIL::R1008)
        .addReg(AMDIL::R1007)
        .addImm(mMFI->addi32Literal(0xFFFF00FF));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1007);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      break;
    case 2:
      mMFI->addErrorMsg(
          amd::CompilerErrorMessage[BYTE_STORE_ERROR]);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addImm(mMFI->addi32Literal(0x0000FFFF));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi32Literal(3));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(1));
      // NOTE(review): R1012 (a half-word mask) is computed but not read
      // again in this case -- confirm whether a masking step is missing.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1012)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(0x0000FFFF))
        .addImm(mMFI->addi32Literal(0xFFFF0000));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008)
        .addReg(AMDIL::R1008)
        .addImm(mMFI->addi32Literal(16))
        .addImm(mMFI->addi32Literal(0));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011)
        .addReg(AMDIL::R1011)
        .addReg(AMDIL::R1008);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      break;
    case 4:
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      break;
    case 8:
      // NOTE(review): VCREATE_v2i32 is paired with ADD_v4i32 and a 64-bit
      // literal, as in expandRegionLoad -- confirm the add opcode.
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010)
        .addReg(AMDIL::R1010);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010)
        .addReg(AMDIL::R1010)
        .addImm(mMFI->addi64Literal(1ULL << 32));
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      BuildMI(*mBB, MI, DL, mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010)
        .addReg(AMDIL::R1011)
        .addImm(gID);
      break;
  };
}

// Expands a local-memory (LDS) store MI.
//
// Defers to expandGlobalStore when the access is not a hardware local access;
// otherwise emits a single LDSSTOREVEC regardless of the access size.
// NOTE(review): unlike expandLocalLoad, there is no usesMem(LDS_ID) /
// MEMOP_NO_ALLOCATION check here -- confirm whether that is intentional.
  void
AMDIL7XXIOExpansion::expandLocalStore(MachineInstr *MI)
{
  bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem);
  if (!HWLocal || !isHardwareLocal(MI)) {
    return expandGlobalStore(MI);
  }
  uint32_t lID = getPointerID(MI);
  assert(lID && "Found a LDS store that was incorrectly marked as zero ID!\n");
  if (!lID) {
    lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID);
    mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]);
  }
  DebugLoc DL = MI->getDebugLoc();
  // These instructions go before the current MI.
  expandStoreSetupCode(MI);
  BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LDSSTOREVEC), AMDIL::MEM)
    .addReg(AMDIL::R1010)
    .addReg(AMDIL::R1011)
    .addImm(lID);
}
diff --git a/src/gallium/drivers/radeon/AMDILAlgorithms.tpp b/src/gallium/drivers/radeon/AMDILAlgorithms.tpp
new file mode 100644
index 00000000000..058475f0f98
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILAlgorithms.tpp
@@ -0,0 +1,93 @@
//===------ AMDILAlgorithms.tpp - AMDIL Template Algorithms Header --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file provides template algorithms that extend the STL algorithms, but
// are useful for the AMDIL backend
//
//===----------------------------------------------------------------------===//

// Calls F(*it, Second) for every position it in [First, Last) and returns F.
// Anything F returns is ignored. This is std::for_each with one extra
// argument, passed by reference so the callback can update it.
template<class InputIterator, class Function, typename Arg>
Function binaryForEach(InputIterator First, InputIterator Last, Function F,
                       Arg &Second)
{
  while (First != Last) {
    F(*First, Second);
    ++First;
  }
  return F;
}

// 'Safe' variant of binaryForEach. When F(*it, Second) returns true the
// iterator is stepped back once before the normal advance, so the same
// position is visited again on the next pass. It is meant for callbacks that
// report having invalidated the current element. The caller must make sure
// stepping back from the current position is valid.
template<class InputIterator, class Function, typename Arg>
Function safeBinaryForEach(InputIterator First, InputIterator Last, Function F,
                           Arg &Second)
{
  while (First != Last) {
    const bool Revisit = F(*First, Second);
    if (Revisit) {
      --First;
    }
    ++First;
  }
  return F;
}

// A template function that has two levels of looping before calling the
// function with the passed in argument.
See binaryForEach for further +// explanation +template<class InputIterator, class Function, typename Arg> +Function binaryNestedForEach(InputIterator First, InputIterator Last, + Function F, Arg &Second) +{ + for ( ; First != Last; ++First) { + binaryForEach(First->begin(), First->end(), F, Second); + } + return F; +} +template<class InputIterator, class Function, typename Arg> +Function safeBinaryNestedForEach(InputIterator First, InputIterator Last, + Function F, Arg &Second) +{ + for ( ; First != Last; ++First) { + safeBinaryForEach(First->begin(), First->end(), F, Second); + } + return F; +} + +// Unlike the STL, a pointer to the iterator itself is passed in with the 'safe' +// versions of these functions This allows the function to handle situations +// such as invalidated iterators +template<class InputIterator, class Function> +Function safeForEach(InputIterator First, InputIterator Last, Function F) +{ + for ( ; First!=Last; ++First ) F(&First) + ; // Do nothing. + return F; +} + +// A template function that has two levels of looping before calling the +// function with a pointer to the current iterator. See binaryForEach for +// further explanation +template<class InputIterator, class SecondIterator, class Function> +Function safeNestedForEach(InputIterator First, InputIterator Last, + SecondIterator S, Function F) +{ + for ( ; First != Last; ++First) { + SecondIterator sf, sl; + for (sf = First->begin(), sl = First->end(); + sf != sl; ) { + if (!F(&sf)) { + ++sf; + } + } + } + return F; +} diff --git a/src/gallium/drivers/radeon/AMDILAsmBackend.cpp b/src/gallium/drivers/radeon/AMDILAsmBackend.cpp new file mode 100644 index 00000000000..63b688d20fd --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILAsmBackend.cpp @@ -0,0 +1,82 @@ +//===------ AMDILAsmBackend.cpp - AMDIL Assembly Backend ---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
//
//==-----------------------------------------------------------------------===//
//
//
#include "AMDILAsmBackend.h"
#include "llvm/Support/TargetRegistry.h"

using namespace llvm;
namespace llvm {
  // Factory for the AMDIL assembly backend. The target-triple string TT is
  // accepted but not used. The caller receives a heap-allocated object
  // (ownership presumably passes to the caller -- confirm at the call site).
  ASM_BACKEND_CLASS* createAMDILAsmBackend(const ASM_BACKEND_CLASS &T,
      const std::string &TT)
  {
    return new AMDILAsmBackend(T);
  }
} // namespace llvm

//===--------------------- Default AMDIL Asm Backend ---------------------===//
// The argument T is ignored; the base class is default-constructed.
AMDILAsmBackend::AMDILAsmBackend(const ASM_BACKEND_CLASS &T)
  : ASM_BACKEND_CLASS()
{
}

// No object writer is provided; always returns a null pointer.
MCObjectWriter *
AMDILAsmBackend::createObjectWriter(raw_ostream &OS) const
{
  return 0;
}

// Always false.
bool
AMDILAsmBackend::doesSectionRequireSymbols(const MCSection &Section) const
{
  return false;
}

// Always true.
bool
AMDILAsmBackend::isSectionAtomizable(const MCSection &Section) const
{
  return true;
}

// Always false; the ELF-based check is left commented out below.
bool
AMDILAsmBackend::isVirtualSection(const MCSection &Section) const
{
  return false;
  //const MCSectionELF &SE = static_cast<const MCSectionELF&>(Section);
  //return SE.getType() == MCSectionELF::SHT_NOBITS;
}

// Intentionally a no-op.
void
AMDILAsmBackend::ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
    uint64_t Value) const
{
}

// Always false.
bool
AMDILAsmBackend::MayNeedRelaxation(const MCInst &Inst) const
{
  return false;
}

// Intentionally a no-op; Res is left untouched.
void
AMDILAsmBackend::RelaxInstruction(const MCInst &Inst,
    MCInst &Res) const
{
}

// Writes nothing and always returns false.
bool
AMDILAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const
{
  return false;
}

// This backend defines no target-specific fixup kinds.
unsigned
AMDILAsmBackend::getNumFixupKinds() const
{
  return 0;
}
diff --git a/src/gallium/drivers/radeon/AMDILAsmBackend.h b/src/gallium/drivers/radeon/AMDILAsmBackend.h
new file mode 100644
index 00000000000..ae027681b6f
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILAsmBackend.h
@@ -0,0 +1,49 @@
//===-- AMDILAsmBackend.h - TODO: Add brief description -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License.
// See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//
#ifndef _AMDIL_ASM_BACKEND_H_
#define _AMDIL_ASM_BACKEND_H_
#include "AMDIL.h"
#include "llvm/MC/MCAsmBackend.h"

// Base class of the AMDIL backend, spelled through a macro.
#define ASM_BACKEND_CLASS MCAsmBackend

// NOTE(review): a using-directive in a header leaks into every file that
// includes it.
using namespace llvm;
namespace llvm {
  // Assembly backend for AMDIL. Every member is a stub: the out-of-line ones
  // are defined in AMDILAsmBackend.cpp, the inline ones below do nothing and
  // return false where a value is required.
  class AMDILAsmBackend : public ASM_BACKEND_CLASS {
    public:
      AMDILAsmBackend(const ASM_BACKEND_CLASS &T);
      virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const;
      virtual bool doesSectionRequireSymbols(const MCSection &Section) const;
      virtual bool isSectionAtomizable(const MCSection &Section) const;
      virtual bool isVirtualSection(const MCSection &Section) const;
      // Capitalized spellings (ApplyFixup, MayNeedRelaxation, ...) and
      // lower-case spellings (applyFixup, mayNeedRelaxation, ...) are both
      // declared; presumably this covers two generations of the
      // MCAsmBackend interface -- confirm against the LLVM version in use.
      virtual void ApplyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
          uint64_t Value) const;
      virtual bool
        MayNeedRelaxation(const MCInst &Inst
        ) const;
      virtual void RelaxInstruction(const MCInst &Inst, MCInst &Res) const;
      virtual bool WriteNopData(uint64_t Count, MCObjectWriter *OW) const;
      unsigned getNumFixupKinds() const;

      virtual void applyFixup(const MCFixup &Fixup, char * Data, unsigned DataSize,
          uint64_t value) const { }
      virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
      virtual bool fixupNeedsRelaxation(const MCFixup &fixup, uint64_t value,
          const MCInstFragment *DF,
          const MCAsmLayout &Layout) const
      { return false; }
      virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const
      {}
      virtual bool writeNopData(uint64_t data, llvm::MCObjectWriter * writer) const
      { return false; }

  }; // class AMDILAsmBackend;
} // llvm namespace

#endif // _AMDIL_ASM_BACKEND_H_
diff --git a/src/gallium/drivers/radeon/AMDILAsmPrinter7XX.cpp b/src/gallium/drivers/radeon/AMDILAsmPrinter7XX.cpp
new file mode 100644
index 00000000000..1a739294cc1
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILAsmPrinter7XX.cpp
@@ -0,0 +1,149 @@
//===-- AMDILAsmPrinter7XX.cpp - TODO: Add brief description -------===//
//
//
The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#include "AMDIL7XXAsmPrinter.h" + +#include "AMDILAlgorithms.tpp" +#include "AMDIL7XXAsmPrinter.h" +#include "AMDILDevices.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Metadata.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Type.h" + +using namespace llvm; + +// TODO: Add support for verbose. + AMDIL7XXAsmPrinter::AMDIL7XXAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) +: AMDILAsmPrinter(TM, Streamer) +{ +} + +AMDIL7XXAsmPrinter::~AMDIL7XXAsmPrinter() +{ +} +/// +/// @param name +/// @brief strips KERNEL_PREFIX and KERNEL_SUFFIX from the name +/// and returns that name if both of the tokens are present. 
+/// + static +std::string Strip(const std::string &name) +{ + size_t start = name.find("__OpenCL_"); + size_t end = name.find("_kernel"); + if (start == std::string::npos + || end == std::string::npos + || (start == end)) { + return name; + } else { + return name.substr(9, name.length()-16); + } +} + void +AMDIL7XXAsmPrinter::emitMacroFunc(const MachineInstr *MI, + llvm::raw_ostream &O) +{ + const AMDILSubtarget *curTarget = mTM->getSubtargetImpl(); + const char *name = "unknown"; + llvm::StringRef nameRef; + if (MI->getOperand(0).isGlobal()) { + nameRef = MI->getOperand(0).getGlobal()->getName(); + name = nameRef.data(); + if (curTarget->device()->usesHardware( + AMDILDeviceInfo::DoubleOps) + && !::strncmp(name, "__sqrt_f64", 10) ) { + name = "__sqrt_f64_7xx"; + } + } + emitMCallInst(MI, O, name); +} + + bool +AMDIL7XXAsmPrinter::runOnMachineFunction(MachineFunction &lMF) +{ + this->MF = &lMF; + mMeta->setMF(&lMF); + mMFI = lMF.getInfo<AMDILMachineFunctionInfo>(); + SetupMachineFunction(lMF); + std::string kernelName = MF->getFunction()->getName(); + mName = Strip(kernelName); + + mKernelName = kernelName; + EmitFunctionHeader(); + EmitFunctionBody(); + return false; +} + + void +AMDIL7XXAsmPrinter::EmitInstruction(const MachineInstr *II) +{ + std::string FunStr; + raw_string_ostream OFunStr(FunStr); + formatted_raw_ostream O(OFunStr); + const AMDILSubtarget *curTarget = mTM->getSubtargetImpl(); + if (mDebugMode) { + O << ";" ; + II->print(O); + } + if (isMacroFunc(II)) { + emitMacroFunc(II, O); + O.flush(); + OutStreamer.EmitRawText(StringRef(FunStr)); + return; + } + if (isMacroCall(II)) { + const char *name; + name = mTM->getInstrInfo()->getName(II->getOpcode()) + 5; + int macronum = amd::MacroDBFindMacro(name); + O << "\t;"<< name<<"\n"; + O << "\tmcall("<<macronum<<")"; + if (curTarget->device()->isSupported( + AMDILDeviceInfo::MacroDB)) { + mMacroIDs.insert(macronum); + } else { + mMFI->addCalledIntr(macronum); + } + } + + // Print the assembly for the 
instruction. + // We want to make sure that we do HW constants + // before we do arena segment + if (mMeta->useCompilerWrite(II)) { + // TODO: This is a hack to get around some + // conformance failures. + O << "\tif_logicalz cb0[0].x\n"; + O << "\tuav_raw_store_id(" + << curTarget->device()->getResourceID(AMDILDevice::RAW_UAV_ID) + << ") "; + O << "mem0.x___, cb0[3].x, r0.0\n"; + O << "\tendif\n"; + mMFI->addMetadata(";memory:compilerwrite"); + } else { + printInstruction(II, O); + } + O.flush(); + OutStreamer.EmitRawText(StringRef(FunStr)); +} diff --git a/src/gallium/drivers/radeon/AMDILAsmPrinterEG.cpp b/src/gallium/drivers/radeon/AMDILAsmPrinterEG.cpp new file mode 100644 index 00000000000..4a9732a2b68 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILAsmPrinterEG.cpp @@ -0,0 +1,162 @@ +//===-- AMDILAsmPrinterEG.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +#include "AMDILEGAsmPrinter.h" + +#include "AMDILAlgorithms.tpp" +#include "AMDILDevices.h" +#include "AMDILEGAsmPrinter.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/DebugInfo.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Constants.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Metadata.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Type.h" + +using namespace llvm; + + +// TODO: Add support for verbose. +AMDILEGAsmPrinter::AMDILEGAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) +: AMDILAsmPrinter(TM, Streamer) +{ +} + +AMDILEGAsmPrinter::~AMDILEGAsmPrinter() +{ +} +// +// @param name +// @brief strips KERNEL_PREFIX and KERNEL_SUFFIX from the name +// and returns that name if both of the tokens are present. 
+// + static +std::string Strip(const std::string &name) +{ + size_t start = name.find("__OpenCL_"); + size_t end = name.find("_kernel"); + if (start == std::string::npos + || end == std::string::npos + || (start == end)) { + return name; + } else { + return name.substr(9, name.length()-16); + } +} +void +AMDILEGAsmPrinter::emitMacroFunc(const MachineInstr *MI, + llvm::raw_ostream &O) +{ + const AMDILSubtarget *curTarget = mTM->getSubtargetImpl(); + const char *name = "unknown"; + llvm::StringRef nameRef; + if (MI->getOperand(0).isGlobal()) { + nameRef = MI->getOperand(0).getGlobal()->getName(); + name = nameRef.data(); + } + if (!::strncmp(name, "__fma_f32", 9) && curTarget->device()->usesHardware( + AMDILDeviceInfo::FMA)) { + name = "__hwfma_f32"; + } + emitMCallInst(MI, O, name); +} + + bool +AMDILEGAsmPrinter::runOnMachineFunction(MachineFunction &lMF) +{ + this->MF = &lMF; + mMeta->setMF(&lMF); + mMFI = lMF.getInfo<AMDILMachineFunctionInfo>(); + SetupMachineFunction(lMF); + std::string kernelName = MF->getFunction()->getName(); + mName = Strip(kernelName); + + mKernelName = kernelName; + EmitFunctionHeader(); + EmitFunctionBody(); + return false; +} + void +AMDILEGAsmPrinter::EmitInstruction(const MachineInstr *II) +{ + std::string FunStr; + raw_string_ostream OFunStr(FunStr); + formatted_raw_ostream O(OFunStr); + const AMDILSubtarget *curTarget = mTM->getSubtargetImpl(); + if (mDebugMode) { + O << ";" ; + II->print(O); + } + if (isMacroFunc(II)) { + emitMacroFunc(II, O); + O.flush(); + OutStreamer.EmitRawText(StringRef(FunStr)); + return; + } + if (isMacroCall(II)) { + const char *name; + name = mTM->getInstrInfo()->getName(II->getOpcode()) + 5; + if (!::strncmp(name, "__fma_f32", 9) + && curTarget->device()->usesHardware( + AMDILDeviceInfo::FMA)) { + name = "__hwfma_f32"; + } + //assert(0 && + //"Found a macro that is still in use!"); + int macronum = amd::MacroDBFindMacro(name); + O << "\t;"<< name<<"\n"; + O << "\tmcall("<<macronum<<")"; + if 
(curTarget->device()->isSupported( + AMDILDeviceInfo::MacroDB)) { + mMacroIDs.insert(macronum); + } else { + mMFI->addCalledIntr(macronum); + } + } + + // Print the assembly for the instruction. + // We want to make sure that we do HW constants + // before we do arena segment + // TODO: This is a hack to get around some + // conformance failures. + if (mMeta->useCompilerWrite(II)) { + O << "\tif_logicalz cb0[0].x\n"; + if (mMFI->usesMem(AMDILDevice::RAW_UAV_ID)) { + O << "\tuav_raw_store_id(" + << curTarget->device()->getResourceID(AMDILDevice::RAW_UAV_ID) + << ") "; + O << "mem0.x___, cb0[3].x, r0.0\n"; + } else { + O << "\tuav_arena_store_id(" + << curTarget->device()->getResourceID(AMDILDevice::ARENA_UAV_ID) + << ")_size(dword) "; + O << "cb0[3].x, r0.0\n"; + } + O << "\tendif\n"; + mMFI->addMetadata(";memory:compilerwrite"); + } else { + printInstruction(II, O); + } + O.flush(); + OutStreamer.EmitRawText(StringRef(FunStr)); +} diff --git a/src/gallium/drivers/radeon/AMDILBarrierDetect.cpp b/src/gallium/drivers/radeon/AMDILBarrierDetect.cpp new file mode 100644 index 00000000000..1bc9651e7a4 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILBarrierDetect.cpp @@ -0,0 +1,254 @@ +//===----- AMDILBarrierDetect.cpp - Barrier Detect pass -*- C++ -*- ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "BarrierDetect" +#ifdef DEBUG +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME 0 +#endif +#include "AMDILAlgorithms.tpp" +#include "AMDILCompilerWarnings.h" +#include "AMDILDevices.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILSubtarget.h" +#include "AMDILTargetMachine.h" +#include "llvm/BasicBlock.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +// The barrier detect pass determines if a barrier has been duplicated in the +// source program which can cause undefined behaviour if more than a single +// wavefront is executed in a group. This is because LLVM does not have an +// execution barrier and if this barrier function gets duplicated, undefined +// behaviour can occur. In order to work around this, we detect the duplicated +// barrier and then make the work-group execute in a single wavefront mode, +// essentially making the barrier a no-op. + +namespace +{ + class LLVM_LIBRARY_VISIBILITY AMDILBarrierDetect : public FunctionPass + { + TargetMachine &TM; + static char ID; + public: + AMDILBarrierDetect(TargetMachine &TM AMDIL_OPT_LEVEL_DECL); + ~AMDILBarrierDetect(); + const char *getPassName() const; + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; + private: + bool detectBarrier(BasicBlock::iterator *BBI); + bool detectMemFence(BasicBlock::iterator *BBI); + bool mChanged; + SmallVector<int64_t, DEFAULT_VEC_SLOTS> bVecMap; + const AMDILSubtarget *mStm; + + // Constants used to define memory type. 
+ static const unsigned int LOCAL_MEM_FENCE = 1<<0; + static const unsigned int GLOBAL_MEM_FENCE = 1<<1; + static const unsigned int REGION_MEM_FENCE = 1<<2; + }; + char AMDILBarrierDetect::ID = 0; +} // anonymouse namespace + +namespace llvm +{ + FunctionPass * + createAMDILBarrierDetect(TargetMachine &TM AMDIL_OPT_LEVEL_DECL) + { + return new AMDILBarrierDetect(TM AMDIL_OPT_LEVEL_VAR); + } +} // llvm namespace + +AMDILBarrierDetect::AMDILBarrierDetect(TargetMachine &TM + AMDIL_OPT_LEVEL_DECL) + : + FunctionPass(ID), + TM(TM) +{ +} + +AMDILBarrierDetect::~AMDILBarrierDetect() +{ +} + +bool AMDILBarrierDetect::detectBarrier(BasicBlock::iterator *BBI) +{ + SmallVector<int64_t, DEFAULT_VEC_SLOTS>::iterator bIter; + int64_t bID; + Instruction *inst = (*BBI); + CallInst *CI = dyn_cast<CallInst>(inst); + + if (!CI || !CI->getNumOperands()) { + return false; + } + const Value *funcVal = CI->getOperand(CI->getNumOperands() - 1); + if (funcVal && strncmp(funcVal->getName().data(), "__amd_barrier", 13)) { + return false; + } + + if (inst->getNumOperands() >= 3) { + const Value *V = inst->getOperand(0); + const ConstantInt *Cint = dyn_cast<ConstantInt>(V); + bID = Cint->getSExtValue(); + bIter = std::find(bVecMap.begin(), bVecMap.end(), bID); + if (bIter == bVecMap.end()) { + bVecMap.push_back(bID); + } else { + if (mStm->device()->isSupported(AMDILDeviceInfo::BarrierDetect)) { + AMDILMachineFunctionInfo *MFI = + getAnalysis<MachineFunctionAnalysis>().getMF() + .getInfo<AMDILMachineFunctionInfo>(); + MFI->addMetadata(";limitgroupsize"); + MFI->addErrorMsg(amd::CompilerWarningMessage[BAD_BARRIER_OPT]); + } + } + } + if (mStm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + AMDILMachineFunctionInfo *MFI = + getAnalysis<MachineFunctionAnalysis>().getMF() + .getInfo<AMDILMachineFunctionInfo>(); + MFI->addErrorMsg(amd::CompilerWarningMessage[LIMIT_BARRIER]); + MFI->addMetadata(";limitgroupsize"); + MFI->setUsesLocal(); + } + const Value *V = 
inst->getOperand(inst->getNumOperands()-2); + const ConstantInt *Cint = dyn_cast<ConstantInt>(V); + Function *iF = dyn_cast<Function>(inst->getOperand(inst->getNumOperands()-1)); + Module *M = iF->getParent(); + bID = Cint->getSExtValue(); + if (bID > 0) { + const char *name = "barrier"; + if (bID == GLOBAL_MEM_FENCE) { + name = "barrierGlobal"; + } else if (bID == LOCAL_MEM_FENCE + && mStm->device()->usesHardware(AMDILDeviceInfo::LocalMem)) { + name = "barrierLocal"; + } else if (bID == REGION_MEM_FENCE + && mStm->device()->usesHardware(AMDILDeviceInfo::RegionMem)) { + name = "barrierRegion"; + } + Function *nF = + dyn_cast<Function>(M->getOrInsertFunction(name, iF->getFunctionType())); + inst->setOperand(inst->getNumOperands()-1, nF); + return false; + } + + return false; +} + +bool AMDILBarrierDetect::detectMemFence(BasicBlock::iterator *BBI) +{ + int64_t bID; + Instruction *inst = (*BBI); + CallInst *CI = dyn_cast<CallInst>(inst); + + if (!CI || CI->getNumOperands() != 2) { + return false; + } + + const Value *V = inst->getOperand(inst->getNumOperands()-2); + const ConstantInt *Cint = dyn_cast<ConstantInt>(V); + Function *iF = dyn_cast<Function>(inst->getOperand(inst->getNumOperands()-1)); + + const char *fence_local_name; + const char *fence_global_name; + const char *fence_region_name; + const char* fence_name = "mem_fence"; + if (!iF) { + return false; + } + + if (strncmp(iF->getName().data(), "mem_fence", 9) == 0) { + fence_local_name = "mem_fence_local"; + fence_global_name = "mem_fence_global"; + fence_region_name = "mem_fence_region"; + } else if (strncmp(iF->getName().data(), "read_mem_fence", 14) == 0) { + fence_local_name = "read_mem_fence_local"; + fence_global_name = "read_mem_fence_global"; + fence_region_name = "read_mem_fence_region"; + } else if (strncmp(iF->getName().data(), "write_mem_fence", 15) == 0) { + fence_local_name = "write_mem_fence_local"; + fence_global_name = "write_mem_fence_global"; + fence_region_name = "write_mem_fence_region"; 
+ } else { + return false; + } + + Module *M = iF->getParent(); + bID = Cint->getSExtValue(); + if (bID > 0) { + const char *name = fence_name; + if (bID == GLOBAL_MEM_FENCE) { + name = fence_global_name; + } else if (bID == LOCAL_MEM_FENCE + && mStm->device()->usesHardware(AMDILDeviceInfo::LocalMem)) { + name = fence_local_name; + } else if (bID == REGION_MEM_FENCE + && mStm->device()->usesHardware(AMDILDeviceInfo::RegionMem)) { + name = fence_region_name; + } + Function *nF = + dyn_cast<Function>(M->getOrInsertFunction(name, iF->getFunctionType())); + inst->setOperand(inst->getNumOperands()-1, nF); + return false; + } + + return false; + +} + +bool AMDILBarrierDetect::runOnFunction(Function &MF) +{ + mChanged = false; + bVecMap.clear(); + mStm = &TM.getSubtarget<AMDILSubtarget>(); + Function *F = &MF; + safeNestedForEach(F->begin(), F->end(), F->begin()->begin(), + std::bind1st( + std::mem_fun( + &AMDILBarrierDetect::detectBarrier), this)); + safeNestedForEach(F->begin(), F->end(), F->begin()->begin(), + std::bind1st( + std::mem_fun( + &AMDILBarrierDetect::detectMemFence), this)); + return mChanged; +} + +const char* AMDILBarrierDetect::getPassName() const +{ + return "AMDIL Barrier Detect Pass"; +} + +bool AMDILBarrierDetect::doInitialization(Module &M) +{ + return false; +} + +bool AMDILBarrierDetect::doFinalization(Module &M) +{ + return false; +} + +void AMDILBarrierDetect::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} diff --git a/src/gallium/drivers/radeon/AMDILBase.td b/src/gallium/drivers/radeon/AMDILBase.td new file mode 100644 index 00000000000..2706b211f2d --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILBase.td @@ -0,0 +1,104 @@ +//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// AMDIL Subtarget features.
+//===----------------------------------------------------------------------===//
+// Most features below name one AMDILDeviceInfo capability slot in
+// CapsOverride and the value "true"; the last string of each def is its
+// description.
+def FeatureFP64     : SubtargetFeature<"fp64",
+        "CapsOverride[AMDILDeviceInfo::DoubleOps]",
+        "true",
+        "Enable 64bit double precision operations">;
+def FeatureByteAddress    : SubtargetFeature<"byte_addressable_store",
+        "CapsOverride[AMDILDeviceInfo::ByteStores]",
+        "true",
+        "Enable byte addressable stores">;
+def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
+        "CapsOverride[AMDILDeviceInfo::BarrierDetect]",
+        "true",
+        "Enable duplicate barrier detection(HD5XXX or later).">;
+def FeatureImages : SubtargetFeature<"images",
+        "CapsOverride[AMDILDeviceInfo::Images]",
+        "true",
+        "Enable image functions">;
+def FeatureMultiUAV : SubtargetFeature<"multi_uav",
+        "CapsOverride[AMDILDeviceInfo::MultiUAV]",
+        "true",
+        "Generate multiple UAV code(HD5XXX family or later)">;
+def FeatureMacroDB : SubtargetFeature<"macrodb",
+        "CapsOverride[AMDILDeviceInfo::MacroDB]",
+        "true",
+        "Use internal macrodb, instead of macrodb in driver">;
+def FeatureNoAlias : SubtargetFeature<"noalias",
+        "CapsOverride[AMDILDeviceInfo::NoAlias]",
+        "true",
+        "assert that all kernel argument pointers are not aliased">;
+def FeatureNoInline : SubtargetFeature<"no-inline",
+        "CapsOverride[AMDILDeviceInfo::NoInline]",
+        "true",
+        "specify whether to not inline functions">;
+
+// NOTE(review): the two pointer-width features below carry the value "false"
+// rather than "true", unlike every capability feature above -- confirm this
+// is intentional.
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+        "mIs64bit",
+        "false",
+        "Specify if 64bit addressing should be used.">;
+
+// NOTE(review): the def name and field say "32on64" while the feature string
+// says "64on32" -- confirm which spelling users are expected to pass.
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
+        "mIs32on64bit",
+        "false",
+        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+def FeatureDebug : SubtargetFeature<"debug",
+        "CapsOverride[AMDILDeviceInfo::Debug]",
+        "true",
+        "Debug mode is enabled, so disable hardware accelerated address spaces.">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+
+include "AMDILRegisterInfo.td"
+include "AMDILCallingConv.td"
+include "AMDILInstrInfo.td"
+
+def AMDILInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// AMDIL processors supported.
+//===----------------------------------------------------------------------===//
+//include "Processors.td"
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+// Assembly writer and parser records referenced by the AMDIL target below.
+def AMDILAsmWriter : AsmWriter {
+    string AsmWriterClassName = "AsmPrinter";
+    int Variant = 0;
+}
+
+def AMDILAsmParser : AsmParser {
+    string AsmParserClassName = "AsmParser";
+    int Variant = 0;
+
+    string CommentDelimiter = ";";
+
+    string RegisterPrefix = "r";
+
+}
+
+
+def AMDIL : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AMDILInstrInfo;
+  let AssemblyWriters = [AMDILAsmWriter];
+  let AssemblyParsers = [AMDILAsmParser];
+}
diff --git a/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
new file mode 100644
index 00000000000..a7d39466bdf
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
@@ -0,0 +1,3257 @@
+//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "structcfg" +#ifdef DEBUG +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME 0 +#endif + +#include "AMDILCompilerErrors.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define FirstNonDebugInstr(A) A->begin() +using namespace llvm; + +// bixia TODO: move this out to analysis lib. Make this work for both target +// AMDIL and CBackend. +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break " + "pattern matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern " + "matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. 
+//
+//===----------------------------------------------------------------------===//
+namespace llvmCFGStruct
+{
+// Debug-only tracing helpers; they print to errs() when DEBUGME is true.
+// Arguments are parenthesised so expression arguments expand safely.
+
+// Print a newly created instruction.  Wrapped in do/while(0) so the macro
+// behaves as a single statement next to if/else; the trailing ';' was
+// already supplied by the caller before this change.
+#define SHOWNEWINSTR(i) \
+  do { \
+    if (DEBUGME) errs() << "New instr: " << *(i) << "\n"; \
+  } while (0)
+
+// Print the number and size of block b, preceded by msg.  The braced-if
+// shape is kept as it was: call sites are outside this view and may not
+// supply a trailing ';' -- TODO confirm before converting to do/while(0).
+#define SHOWNEWBLK(b, msg) \
+if (DEBUGME) { \
+  errs() << (msg) << "BB" << (b)->getNumber() << "size " << (b)->size(); \
+  errs() << "\n"; \
+}
+
+// Like SHOWNEWBLK, but also dumps the block contents and tolerates a null b.
+#define SHOWBLK_DETAIL(b, msg) \
+if (DEBUGME) { \
+  if (b) { \
+  errs() << (msg) << "BB" << (b)->getNumber() << "size " << (b)->size(); \
+  (b)->print(errs()); \
+  errs() << "\n"; \
+  } \
+}
+
+// Sentinel values; the negative constant is parenthesised so it survives
+// being pasted next to another operator.
+#define INVALIDSCCNUM (-1)
+#define INVALIDREGNUM 0
+
+// Print every top-level entry of LoopInfo to OS.
+template<class LoopinfoT>
+void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
+  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
+       iterEnd = LoopInfo.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->print(OS, 0);
+  }
+}
+
+// Reverse Src in place by swapping elements from both ends.
+template<class NodeT>
+void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+  size_t sz = Src.size();
+  for (size_t i = 0; i < sz/2; ++i) {
+    NodeT *t = Src[i];
+    Src[i] = Src[sz - i - 1];
+    Src[sz - i - 1] = t;
+  }
+}
+
+} //end namespace llvmCFGStruct
+
+
+//===----------------------------------------------------------------------===//
+//
+// MachinePostDominatorTree
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILCompilerErrors.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+namespace llvm {
+
+/// MachinePostDominatorTree - Machine-function pass that owns a
+/// DominatorTreeBase<MachineBasicBlock> built in post-dominator mode and
+/// forwards the usual dominator-tree queries to it.
+///
+struct MachinePostDominatorTree : public MachineFunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  DominatorTreeBase<MachineBasicBlock> *DT; // Owned; freed in the destructor.
+  MachinePostDominatorTree() : MachineFunctionPass(ID)
+  {
+    DT = new DominatorTreeBase<MachineBasicBlock>(true); //true indicate
+                                                         // postdominator
+  }
+
+  ~MachinePostDominatorTree();
+
+  /// Recompute the tree for MF; never modifies the function.
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  inline const std::vector<MachineBasicBlock *> &getRoots() const {
+    return DT->getRoots();
+  }
+
+  inline MachineDomTreeNode *getRootNode() const {
+    return DT->getRootNode();
+  }
+
+  inline MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+    return DT->getNode(BB);
+  }
+
+  inline bool dominates(MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool dominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->dominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(const MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline bool
+  properlyDominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+    return DT->properlyDominates(A, B);
+  }
+
+  inline MachineBasicBlock *
+  findNearestCommonDominator(MachineBasicBlock *A, MachineBasicBlock *B) {
+    return DT->findNearestCommonDominator(A, B);
+  }
+
+  virtual void print(llvm::raw_ostream &OS, const Module *M = 0) const {
+    DT->print(OS);
+  }
+};
+} //end of namespace llvm
+
+char MachinePostDominatorTree::ID = 0;
+static
RegisterPass<MachinePostDominatorTree> +machinePostDominatorTreePass("machinepostdomtree", + "MachinePostDominator Tree Construction", + true, true); + +//const PassInfo *const llvm::MachinePostDominatorsID +//= &machinePostDominatorTreePass; + +bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) { + DT->recalculate(F); + //DEBUG(DT->dump()); + return false; +} + +MachinePostDominatorTree::~MachinePostDominatorTree() { + delete DT; +} + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct +{ +template<class PassT> +struct CFGStructTraits { +}; + +template <class InstrT> +class BlockInformation { +public: + bool isRetired; + int sccNum; + //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr; + //Instructions defining the corresponding successor. + BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {} +}; + +template <class BlockT, class InstrT, class RegiT> +class LandInformation { +public: + BlockT *landBlk; + std::set<RegiT> breakInitRegs; //Registers that need to "reg = 0", before + //WHILELOOP(thisloop) init before entering + //thisloop. + std::set<RegiT> contInitRegs; //Registers that need to "reg = 0", after + //WHILELOOP(thisloop) init after entering + //thisloop. + std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop + //land block, branch cond on this reg. + std::set<RegiT> breakOnRegs; //registers that need to "if (reg) break + //endif" after ENDLOOP(thisloop) break + //outerLoopOf(thisLoop). + std::set<RegiT> contOnRegs; //registers that need to "if (reg) continue + //endif" after ENDLOOP(thisloop) continue on + //outerLoopOf(thisLoop). 
+ LandInformation() : landBlk(NULL) {} +}; + +} //end of namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct +{ +// bixia TODO: port it to BasicBlock, not just MachineBasicBlock. +template<class PassT> +class CFGStructurizer +{ +public: + typedef enum { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + } PathToKind; + +public: + typedef typename PassT::InstructionType InstrT; + typedef typename PassT::FunctionType FuncT; + typedef typename PassT::DominatortreeType DomTreeT; + typedef typename PassT::PostDominatortreeType PostDomTreeT; + typedef typename PassT::DomTreeNodeType DomTreeNodeT; + typedef typename PassT::LoopinfoType LoopInfoT; + + typedef GraphTraits<FuncT *> FuncGTraits; + //typedef FuncGTraits::nodes_iterator BlockIterator; + typedef typename FuncT::iterator BlockIterator; + + typedef typename FuncGTraits::NodeType BlockT; + typedef GraphTraits<BlockT *> BlockGTraits; + typedef GraphTraits<Inverse<BlockT *> > InvBlockGTraits; + //typedef BlockGTraits::succ_iterator InstructionIterator; + typedef typename BlockT::iterator InstrIterator; + + typedef CFGStructTraits<PassT> CFGTraits; + typedef BlockInformation<InstrT> BlockInfo; + typedef std::map<BlockT *, BlockInfo *> BlockInfoMap; + + typedef int RegiT; + typedef typename PassT::LoopType LoopT; + typedef LandInformation<BlockT, InstrT, RegiT> LoopLandInfo; + typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap; + //landing info for loop break + typedef SmallVector<BlockT *, 32> BlockTSmallerVector; + +public: + CFGStructurizer(); + ~CFGStructurizer(); + + /// Perform the CFG structurization + bool run(FuncT &Func, PassT &Pass); + + /// Perform the CFG preparation + bool prepare(FuncT &Func, PassT &Pass); + +private: + void orderBlocks(); + void printOrderedBlocks(llvm::raw_ostream 
&OS); + int patternMatch(BlockT *CurBlock); + int patternMatchGroup(BlockT *CurBlock); + + int serialPatternMatch(BlockT *CurBlock); + int ifPatternMatch(BlockT *CurBlock); + int switchPatternMatch(BlockT *CurBlock); + int loopendPatternMatch(BlockT *CurBlock); + int loopPatternMatch(BlockT *CurBlock); + + int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + //int loopWithoutBreak(BlockT *); + + void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop, + BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock); + void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop, + BlockT *ContBlock, LoopT *contLoop); + bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block); + int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT **LandBlockPtr); + void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT *LandBlock, + bool Detail = false); + PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock); + void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock); + + void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock, + BlockT *TrueBlock, BlockT *FalseBlock, + BlockT *LandBlock); + void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand); + void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock, + BlockT *ExitLandBlock, RegiT SetReg); + void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock, + RegiT SetReg); + BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep, + 
std::set<BlockT*> &ExitBlockSet, + BlockT *ExitLandBlk); + BlockT *addLoopEndbranchBlock(LoopT *LoopRep, + BlockTSmallerVector &ExitingBlocks, + BlockTSmallerVector &ExitBlocks); + BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep); + void removeUnconditionalBranch(BlockT *SrcBlock); + void removeRedundantConditionalBranch(BlockT *SrcBlock); + void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks); + + void removeSuccessor(BlockT *SrcBlock); + BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock); + BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock); + + void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock, + InstrIterator InsertPos); + + void recordSccnum(BlockT *SrcBlock, int SCCNum); + int getSCCNum(BlockT *srcBlk); + + void retireBlock(BlockT *DstBlock, BlockT *SrcBlock); + bool isRetiredBlock(BlockT *SrcBlock); + bool isActiveLoophead(BlockT *CurBlock); + bool needMigrateBlock(BlockT *Block); + + BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock, + BlockTSmallerVector &exitBlocks, + std::set<BlockT*> &ExitBlockSet); + void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL); + BlockT *getLoopLandBlock(LoopT *LoopRep); + LoopLandInfo *getLoopLandInfo(LoopT *LoopRep); + + void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum); + + bool hasBackEdge(BlockT *curBlock); + unsigned getLoopDepth (LoopT *LoopRep); + int countActiveBlock( + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart, + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd); + BlockT *findNearestCommonPostDom(std::set<BlockT *>&); + BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2); + +private: + DomTreeT *domTree; + PostDomTreeT 
*postDomTree; + LoopInfoT *loopInfo; + PassT *passRep; + FuncT *funcRep; + + BlockInfoMap blockInfoMap; + LoopLandInfoMap loopLandInfoMap; + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks; + +}; //template class CFGStructurizer + +template<class PassT> CFGStructurizer<PassT>::CFGStructurizer() + : domTree(NULL), postDomTree(NULL), loopInfo(NULL) { +} + +template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() { + for (typename BlockInfoMap::iterator I = blockInfoMap.begin(), + E = blockInfoMap.end(); I != E; ++I) { + delete I->second; + } +} + +template<class PassT> +bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass) { + passRep = &pass; + funcRep = &func; + + bool changed = false; + //func.RenumberBlocks(); + + //to do, if not reducible flow graph, make it so ??? + + if (DEBUGME) { + errs() << "AMDILCFGStructurizer::prepare\n"; + //func.viewCFG(); + //func.viewCFGOnly(); + //func.dump(); + } + + //FIXME: gcc complains on this. + //domTree = &pass.getAnalysis<DomTreeT>(); + //domTree = CFGTraits::getDominatorTree(pass); + //if (DEBUGME) { + // domTree->print(errs()); + //} + + //FIXME: gcc complains on this. + //domTree = &pass.getAnalysis<DomTreeT>(); + //postDomTree = CFGTraits::getPostDominatorTree(pass); + //if (DEBUGME) { + // postDomTree->print(errs()); + //} + + //FIXME: gcc complains on this. 
+ //loopInfo = &pass.getAnalysis<LoopInfoT>(); + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks; + + for (typename LoopInfoT::iterator iter = loopInfo->begin(), + iterEnd = loopInfo->end(); + iter != iterEnd; ++iter) { + LoopT* loopRep = (*iter); + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (exitingBlks.size() == 0) { + BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep); + if (dummyExitBlk != NULL) + retBlks.push_back(dummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end(); + iterBlk != iterEndBlk; + ++iterBlk) { + BlockT *curBlk = *iterBlk; + removeUnconditionalBranch(curBlk); + removeRedundantConditionalBranch(curBlk); + if (CFGTraits::isReturnBlock(curBlk)) { + retBlks.push_back(curBlk); + } + assert(curBlk->succ_size() <= 2); + //assert(curBlk->size() > 0); + //removeEmptyBlock(curBlk) ?? + } //for + + if (retBlks.size() >= 2) { + addDummyExitBlock(retBlks); + changed = true; + } + + return changed; +} //CFGStructurizer::prepare + +template<class PassT> +bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass) { + passRep = &pass; + funcRep = &func; + + //func.RenumberBlocks(); + + //Assume reducible CFG... + if (DEBUGME) { + errs() << "AMDILCFGStructurizer::run\n"; + //errs() << func.getFunction()->getNameStr() << "\n"; + func.viewCFG(); + //func.viewCFGOnly(); + //func.dump(); + } + +#if 1 + //FIXME: gcc complains on this. 
+ //domTree = &pass.getAnalysis<DomTreeT>(); + domTree = CFGTraits::getDominatorTree(pass); + if (DEBUGME) { + domTree->print(errs(), (const llvm::Module*)0); + } +#endif + + //FIXME: gcc complains on this. + //domTree = &pass.getAnalysis<DomTreeT>(); + postDomTree = CFGTraits::getPostDominatorTree(pass); + if (DEBUGME) { + postDomTree->print(errs()); + } + + //FIXME: gcc complains on this. + //loopInfo = &pass.getAnalysis<LoopInfoT>(); + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); +//#define STRESSTEST +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. + ReverseVector(orderedBlks); +#endif + + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + int numIter = 0; + bool finish = false; + BlockT *curBlk; + bool makeProgress = false; + int numRemainedBlk = countActiveBlock(orderedBlks.begin(), + orderedBlks.end()); + + do { + ++numIter; + if (DEBUGME) { + errs() << "numIter = " << numIter + << ", numRemaintedBlk = " << numRemainedBlk << "\n"; + } + + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(); + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlkEnd = orderedBlks.end(); + + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + sccBeginIter = iterBlk; + BlockT *sccBeginBlk = NULL; + int sccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int sccNumIter; // Number of iteration in this SCC. + + while (iterBlk != iterBlkEnd) { + curBlk = *iterBlk; + + if (sccBeginBlk == NULL) { + sccBeginIter = iterBlk; + sccBeginBlk = curBlk; + sccNumIter = 0; + sccNumBlk = numRemainedBlk; // Init to maximum possible number. 
+ if (DEBUGME) { + errs() << "start processing SCC" << getSCCNum(sccBeginBlk); + errs() << "\n"; + } + } + + if (!isRetiredBlock(curBlk)) { + patternMatch(curBlk); + } + + ++iterBlk; + + bool contNextScc = true; + if (iterBlk == iterBlkEnd + || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) { + // Just finish one scc. + ++sccNumIter; + int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) { + if (DEBUGME) { + errs() << "Can't reduce SCC " << getSCCNum(curBlk) + << ", sccNumIter = " << sccNumIter; + errs() << "doesn't make any progress\n"; + } + contNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) { + sccNumBlk = sccRemainedNumBlk; + iterBlk = sccBeginIter; + contNextScc = false; + if (DEBUGME) { + errs() << "repeat processing SCC" << getSCCNum(curBlk) + << "sccNumIter = " << sccNumIter << "\n"; + func.viewCFG(); + //func.viewCFGOnly(); + } + } else { + // Finish the current scc. + contNextScc = true; + } + } else { + // Continue on next component in the current scc. + contNextScc = false; + } + + if (contNextScc) { + sccBeginBlk = NULL; + } + } //while, "one iteration" over the function. + + BlockT *entryBlk = FuncGTraits::nodes_begin(&func); + if (entryBlk->succ_size() == 0) { + finish = true; + if (DEBUGME) { + errs() << "Reduce to one block\n"; + } + } else { + int newnumRemainedBlk + = countActiveBlock(orderedBlks.begin(), orderedBlks.end()); + // consider cloned blocks ?? + if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) { + makeProgress = true; + numRemainedBlk = newnumRemainedBlk; + } else { + makeProgress = false; + if (DEBUGME) { + errs() << "No progress\n"; + } + } + } + } while (!finish && makeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + CFGTraits::wrapup(FuncGTraits::nodes_begin(&func)); + + // Detach retired Block, release memory. 
+ for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(), + iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + if ((*iterMap).second && (*iterMap).second->isRetired) { + assert(((*iterMap).first)->getNumber() != -1); + if (DEBUGME) { + errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n"; + } + (*iterMap).first->eraseFromParent(); //Remove from the parent Function. + } + delete (*iterMap).second; + } + blockInfoMap.clear(); + + // clear loopLandInfoMap + for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(), + iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + delete (*iterMap).second; + } + loopLandInfoMap.clear(); + + if (DEBUGME) { + func.viewCFG(); + //func.dump(); + } + + if (!finish) { + MachineFunction *MF = &func; + AMDILMachineFunctionInfo *mMFI = + MF->getInfo<AMDILMachineFunctionInfo>(); + mMFI->addErrorMsg(amd::CompilerErrorMessage[IRREDUCIBLE_CF]); + } + + return true; +} //CFGStructurizer::run + +/// Print the ordered Blocks. 
+///
+/// Writes one "BB<number>(<sccNum>,<size>)" entry per element of orderedBlks
+/// to \p os. An entry is followed by a newline when its index is a non-zero
+/// multiple of 10, and by a single space otherwise.
+template<class PassT>
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
+  size_t i = 0;
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
+       iterBlk != iterBlkEnd;
+       ++iterBlk, ++i) {
+    os << "BB" << (*iterBlk)->getNumber();
+    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+    if (i != 0 && i % 10 == 0) {
+      os << "\n";
+    } else {
+      os << " ";
+    }
+  }
+} //printOrderedBlocks
+
+/// Compute the reversed DFS post order of Blocks
+///
+/// Appends the blocks of every SCC produced by scc_begin(funcRep) ..
+/// scc_end(funcRep) to orderedBlks, in iteration order, and records a running
+/// SCC index for each block with recordSccnum(). Afterwards every block of
+/// funcRep whose getSCCNum() is still INVALIDSCCNUM is reported on errs() as
+/// unreachable (it is only reported, nothing else is done with it here).
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
+  int sccNum = 0;
+  BlockT *bb;
+  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
+       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
+    std::vector<BlockT *> &sccNext = *sccIter;
+    for (typename std::vector<BlockT *>::const_iterator
+         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      bb = *blockIter;
+      orderedBlks.push_back(bb);
+      recordSccnum(bb, sccNum);
+    }
+  }
+
+  //walk through all the block in func to check for unreachable
+  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
+       blockEnd1 = FuncGTraits::nodes_end(funcRep);
+       blockIter1 != blockEnd1; ++blockIter1) {
+    // NOTE: this declaration shadows the function-scope 'bb' declared above.
+    BlockT *bb = &(*blockIter1);
+    sccNum = getSCCNum(bb);
+    if (sccNum == INVALIDSCCNUM) {
+      errs() << "unreachable block BB" << bb->getNumber() << "\n";
+    }
+  } //end of for
+} //orderBlocks
+
+/// Applies patternMatchGroup() to \p curBlk again and again until one round
+/// reports no match. Returns the sum of the match counts of all rounds.
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
+  int numMatch = 0;
+  int curMatch;
+
+  if (DEBUGME) {
+    errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+  }
+
+  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
+    numMatch += curMatch;
+  }
+
+  if (DEBUGME) {
+    errs() << "End patternMatch BB" << curBlk->getNumber()
+      << ", numMatch = " << numMatch << "\n";
+  }
+
+  return numMatch;
+} //patternMatch
+
+/// Runs the serial, if, loop-end and loop matchers on \p curBlk once, in that
+/// order, and returns the sum of their results. The switch matcher is
+/// deliberately left out (see the commented-out call).
+template<class PassT>
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
+  int numMatch = 0;
+  numMatch += serialPatternMatch(curBlk);
+  numMatch += ifPatternMatch(curBlk);
+  //numMatch += switchPatternMatch(curBlk);
+  numMatch += loopendPatternMatch(curBlk);
+  numMatch += loopPatternMatch(curBlk);
+  return numMatch;
+}//patternMatchGroup
+
+/// Merges the only successor of \p curBlk into curBlk when that successor has
+/// exactly one predecessor and is not an active loop head.
+/// Returns 1 when a merge happened, 0 otherwise.
+template<class PassT>
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 1) {
+    return 0;
+  }
+
+  BlockT *childBlk = *curBlk->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+    return 0;
+  }
+
+  mergeSerialBlock(curBlk, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+} //serialPatternMatch
+
+/// Tries to reduce the two-way branch that ends \p curBlk to a structured
+/// if/else.
+///
+/// Returns 0 when curBlk does not have exactly two successors, has a back
+/// edge, or has no normal-block branch instruction. When the two arms fit none
+/// of the simple shapes tested below, the result of handleJumpintoIf() is
+/// returned instead. Otherwise the arms are merged into curBlk with
+/// mergeIfthenelseBlock() and the function returns 1 plus the number of blocks
+/// that were cloned or created on the way.
+template<class PassT>
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+  //two edges
+  if (curBlk->succ_size() != 2) {
+    return 0;
+  }
+
+  if (hasBackEdge(curBlk)) {
+    return 0;
+  }
+
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
+  if (branchInstr == NULL) {
+    return 0;
+  }
+
+  assert(CFGTraits::isCondBranch(branchInstr));
+
+  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
+  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
+  BlockT *landBlk;
+  int cloned = 0;
+
+  // TODO: Simplify
+  // Shape 1: both arms have one successor and it is the same block.
+  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
+    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
+    landBlk = *trueBlk->succ_begin();
+  // Shape 2: neither arm has a successor, so there is no landing block.
+  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
+    landBlk = NULL;
+  // Shape 3: the true arm leads straight to the false target (no else arm).
+  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
+    landBlk = falseBlk;
+    falseBlk = NULL;
+  // Shape 4: mirror image of shape 3 (no then arm).
+  } else if (falseBlk->succ_size() == 1
+             && *falseBlk->succ_begin() == trueBlk) {
+    landBlk = trueBlk;
+    trueBlk = NULL;
+  // Shapes 5/6: one arm passes isSameloopDetachedContbreak() against the
+  // other; land on the successor of the arm that still has one.
+  } else if (falseBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
+    landBlk = *falseBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
+    landBlk = *trueBlk->succ_begin();
+  } else {
+    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+  }
+
+  // improveSimpleJumpintoIf can handle the case where landBlk == NULL but the
+  // new BB created for landBlk==NULL may introduce new challenge to the
+  // reduction process.
+  if (landBlk != NULL &&
+      ((trueBlk && trueBlk->pred_size() > 1)
+      || (falseBlk && falseBlk->pred_size() > 1))) {
+    cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  }
+
+  // An arm that still has more than one predecessor is cloned for curBlk; the
+  // clone is what gets merged below.
+  if (trueBlk && trueBlk->pred_size() > 1) {
+    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
+    ++cloned;
+  }
+
+  if (falseBlk && falseBlk->pred_size() > 1) {
+    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
+    ++cloned;
+  }
+
+  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += cloned;
+
+  return 1 + cloned;
+} //ifPatternMatch
+
+/// Placeholder for switch reduction: always returns 0 and leaves \p curBlk
+/// untouched.
+template<class PassT>
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
+  return 0;
+} //switchPatternMatch
+
+/// Handles the loop breaks/continues of every loop that contains \p curBlk.
+/// The loop of curBlk and all of its parent loops are first collected,
+/// innermost first; 0 is returned when curBlk is in no loop.
+template<class PassT>
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  typename std::vector<LoopT *> nestedLoops;
+  while (loopRep) {
+    nestedLoops.push_back(loopRep);
+    loopRep = loopRep->getParentLoop();
+  }
+
+  if (nestedLoops.size() == 0) {
+    return 0;
+  }
+
+  // Process nested loop outside->inside, so "continue" to a outside loop won't
+  // be mistaken as "break" of the current loop.
+  int num = 0;
+  // nestedLoops is walked back to front (reverse iterators).
+  for (typename std::vector<LoopT *>::reverse_iterator
+       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
+       iter != iterEnd; ++iter) {
+    loopRep = *iter;
+
+    // A loop that already has a land block recorded is skipped.
+    if (getLoopLandBlock(loopRep) != NULL) {
+      continue;
+    }
+
+    BlockT *loopHeader = loopRep->getHeader();
+
+    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
+
+    // -1 is loopbreakPatternMatch()'s failure value; the remaining loops are
+    // not attempted in this call.
+    if (numBreak == -1) {
+      break;
+    }
+
+    int numCont = loopcontPatternMatch(loopRep, loopHeader);
+    num += numBreak + numCont;
+  }
+
+  return num;
+} //loopendPatternMatch
+
+/// Wraps finished loops around \p curBlk.
+///
+/// Only acts when curBlk has no successors. For the loop of curBlk and each
+/// parent loop, as long as curBlk is that loop's header, the loop's
+/// LoopLandInfo (if any) is merged into curBlk with mergeLooplandBlock(),
+/// provided its land block has not been retired yet.
+/// Returns the number of merges performed.
+template<class PassT>
+int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 0) {
+    return 0;
+  }
+
+  int numLoop = 0;
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  while (loopRep && loopRep->getHeader() == curBlk) {
+    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
+    if (loopLand) {
+      BlockT *landBlk = loopLand->landBlk;
+      assert(landBlk);
+      if (!isRetiredBlock(landBlk)) {
+        mergeLooplandBlock(curBlk, loopLand);
+        ++numLoop;
+      }
+    }
+    loopRep = loopRep->getParentLoop();
+  }
+
+  numLoopPatternMatch += numLoop;
+
+  return numLoop;
+} //loopPatternMatch
+
+/// Folds the exits ("breaks") of \p loopRep into their exiting blocks.
+///
+/// Returns -1 when the exits cannot be brought into a supported shape, 0 when
+/// the loop has no exiting block at all (setLoopLandBlock(loopRep) is called
+/// in that case), and otherwise the number of exiting blocks plus the number
+/// of serial merges and cloned blocks it produced.
+/// \p loopHeader is not referenced by the body.
+template<class PassT>
+int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
+                                                  BlockT *loopHeader) {
+  BlockTSmallerVector exitingBlks;
+  loopRep->getExitingBlocks(exitingBlks);
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
+  }
+
+  if (exitingBlks.size() == 0) {
+    setLoopLandBlock(loopRep);
+    return 0;
+  }
+
+  // Compute the corresponding exitBlks and exit block set.
+  // exitBlks is filled in step with exitingBlks (entry i of one belongs to
+  // entry i of the other); exitBlkSet holds the distinct exit blocks.
+  BlockTSmallerVector exitBlks;
+  std::set<BlockT *> exitBlkSet;
+  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
+       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *exitingBlk = *iter;
+    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+    exitBlks.push_back(exitBlk);
+    exitBlkSet.insert(exitBlk);  //non-duplicate insert
+  }
+
+  assert(exitBlkSet.size() > 0);
+  assert(exitBlks.size() == exitingBlks.size());
+
+  if (DEBUGME) {
+    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+  }
+
+  // Find exitLandBlk.
+  BlockT *exitLandBlk = NULL;
+  int numCloned = 0;
+  int numSerial = 0;
+
+  // With a single distinct exit block, that block is the landing block.
+  if (exitBlkSet.size() == 1)
+  {
+    exitLandBlk = *exitBlkSet.begin();
+  } else {
+    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
+
+    if (exitLandBlk == NULL) {
+      return -1;
+    }
+
+    // All exit blocks must agree on how they reach exitLandBlk: either every
+    // one reports SinglePath_InPath or every one reports
+    // SinglePath_NotInPath. As soon as both flags are false we give up.
+    bool allInPath = true;
+    bool allNotInPath = true;
+    for (typename std::set<BlockT*>::const_iterator
+         iter = exitBlkSet.begin(),
+         iterEnd = exitBlkSet.end();
+         iter != iterEnd; ++iter) {
+      BlockT *exitBlk = *iter;
+
+      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
+      if (DEBUGME) {
+        errs() << "BB" << exitBlk->getNumber()
+               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
+               << pathKind << "\n";
+      }
+
+      allInPath = allInPath && (pathKind == SinglePath_InPath);
+      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
+
+      if (!allInPath && !allNotInPath) {
+        if (DEBUGME) {
+              errs() << "singlePath check fail\n";
+        }
+        return -1;
+      }
+    } // check all exit blocks
+
+    if (allNotInPath) {
+#if 1
+
+      // TODO: Simplify, maybe separate function?
+      //funcRep->viewCFG();
+      LoopT *parentLoopRep = loopRep->getParentLoop();
+      BlockT *parentLoopHeader = NULL;
+      if (parentLoopRep)
+        parentLoopHeader = parentLoopRep->getHeader();
+
+      // Both conditions assign exitLandBlk as a side effect. If
+      // relocateLoopcontBlock() is tried and returns NULL, exitLandBlk is
+      // NULL when control reaches the else-if, which then overwrites it with
+      // the result of addLoopEndbranchBlock().
+      if (exitLandBlk == parentLoopHeader &&
+          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
+                                               loopRep,
+                                               exitBlkSet,
+                                               exitLandBlk)) != NULL) {
+        if (DEBUGME) {
+          errs() << "relocateLoopcontBlock success\n";
+        }
+      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
+                                                      exitingBlks,
+                                                      exitBlks)) != NULL) {
+        if (DEBUGME) {
+          errs() << "insertEndbranchBlock success\n";
+        }
+      } else {
+        if (DEBUGME) {
+          errs() << "loop exit fail\n";
+        }
+        return -1;
+      }
+#else
+      return -1;
+#endif
+    }
+
+    // Handle side entry to exit path.
+    // exitBlks/exitBlkSet are rebuilt from exitingBlks because an exit block
+    // may be replaced by a clone in the loop below.
+    exitBlks.clear();
+    exitBlkSet.clear();
+    for (typename BlockTSmallerVector::iterator iterExiting =
+           exitingBlks.begin(),
+         iterExitingEnd = exitingBlks.end();
+         iterExiting != iterExitingEnd; ++iterExiting) {
+      BlockT *exitingBlk = *iterExiting;
+      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
+      BlockT *newExitBlk = exitBlk;
+
+      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
+        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
+        ++numCloned;
+      }
+
+      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
+
+      exitBlks.push_back(newExitBlk);
+      exitBlkSet.insert(newExitBlk);
+    }
+
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      numSerial += serialPatternMatch(exitBlk);
+    }
+
+    // Final shape check: an exit block with several predecessors must be
+    // exitLandBlk itself; any other exit block must have exitLandBlk as its
+    // one and only successor. Anything else is reported as failure (-1).
+    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
+         iterExitEnd = exitBlks.end();
+         iterExit != iterExitEnd; ++iterExit) {
+      BlockT *exitBlk = *iterExit;
+      if (exitBlk->pred_size() > 1) {
+        if (exitBlk != exitLandBlk) {
+          return -1;
+        }
+      } else {
+        if (exitBlk != exitLandBlk &&
+            (exitBlk->succ_size() != 1 ||
+            *exitBlk->succ_begin() != exitLandBlk)) {
+          return -1;
+        }
+      }
+    }
+  } // else
+ + // LoopT *exitLandLoop = loopInfo->getLoopFor(exitLandBlk); + exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet); + + // Fold break into the breaking block. Leverage across level breaks. + assert(exitingBlks.size() == exitBlks.size()); + for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(), + iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit, ++iterExiting) { + BlockT *exitBlk = *iterExit; + BlockT *exitingBlk = *iterExiting; + assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk); + LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk); + handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk); + } + + int numBreak = static_cast<int>(exitingBlks.size()); + numLoopbreakPatternMatch += numBreak; + numClonedBlock += numCloned; + return numBreak + numSerial + numCloned; +} //loopbreakPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep, + BlockT *loopHeader) { + int numCont = 0; + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk; + for (typename InvBlockGTraits::ChildIteratorType iter = + InvBlockGTraits::child_begin(loopHeader), + iterEnd = InvBlockGTraits::child_end(loopHeader); + iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + if (loopRep->contains(curBlk)) { + handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk), + loopHeader, loopRep); + contBlk.push_back(curBlk); + ++numCont; + } + } + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator + iter = contBlk.begin(), iterEnd = contBlk.end(); + iter != iterEnd; ++iter) { + (*iter)->removeSuccessor(loopHeader); + } + + numLoopcontPatternMatch += numCont; + + return numCont; +} //loopcontPatternMatch + + +template<class PassT> +bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk, + BlockT *src2Blk) { + // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the + // same loop 
with LoopLandInfo without explicitly keeping track of + // loopContBlks and loopBreakBlks, this is a method to get the information. + // + if (src1Blk->succ_size() == 0) { + LoopT *loopRep = loopInfo->getLoopFor(src1Blk); + if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + if (theEntry != NULL) { + if (DEBUGME) { + errs() << "isLoopContBreakBlock yes src1 = BB" + << src1Blk->getNumber() + << " src2 = BB" << src2Blk->getNumber() << "\n"; + } + return true; + } + } + } + return false; +} //isSameloopDetachedContbreak + +template<class PassT> +int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk); + if (num == 0) { + if (DEBUGME) { + errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + } + num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk); + } + return num; +} + +template<class PassT> +int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = 0; + BlockT *downBlk; + + //trueBlk could be the common post dominator + downBlk = trueBlk; + + if (DEBUGME) { + errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber() + << " true = BB" << trueBlk->getNumber() + << ", numSucc=" << trueBlk->succ_size() + << " false = BB" << falseBlk->getNumber() << "\n"; + } + + while (downBlk) { + if (DEBUGME) { + errs() << "check down = BB" << downBlk->getNumber(); + } + + if (//postDomTree->dominates(downBlk, falseBlk) && + singlePathTo(falseBlk, downBlk) == SinglePath_InPath) { + if (DEBUGME) { + errs() << " working\n"; + } + + num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk); + num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk); + + numClonedBlock += num; + num += serialPatternMatch(*headBlk->succ_begin()); + num += serialPatternMatch(*(++headBlk->succ_begin())); + num += ifPatternMatch(headBlk); + assert(num > 0); // + 
+
+      break;
+    }
+    if (DEBUGME) {
+      errs() << " not working\n";
+    }
+    // Step down only along a unique successor; otherwise stop the walk.
+    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
+
+  return num;
+} //handleJumpintoIf
+
+/// Debug helper: prints number and size of \p headBlk, and number, size and
+/// predecessor count of each of \p trueBlk, \p falseBlk and \p landBlk that is
+/// not NULL, to errs(). With \p detail set, the contents of each printed block
+/// are dumped as well.
+template<class PassT>
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
+                                                         BlockT *trueBlk,
+                                                         BlockT *falseBlk,
+                                                         BlockT *landBlk,
+                                                         bool detail) {
+  errs() << "head = BB" << headBlk->getNumber()
+         << " size = " << headBlk->size();
+  if (detail) {
+    errs() << "\n";
+    headBlk->print(errs());
+    errs() << "\n";
+  }
+
+  if (trueBlk) {
+    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
+           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      trueBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (falseBlk) {
+    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
+           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      falseBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (landBlk) {
+    errs() << ", land = BB" << landBlk->getNumber() << " size = "
+           << landBlk->size() << " numPred = " << landBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      landBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+
+  errs() << "\n";
+} //showImproveSimpleJumpintoIf
+
+/// Instead of cloning the arms of an if whose arm blocks have other
+/// predecessors, moves their instructions into the landing block, guarded by
+/// a fresh register that records which arm was taken.
+///
+/// \param headBlk   block ending in the conditional branch
+/// \param trueBlk   true arm, may be NULL
+/// \param falseBlk  false arm, may be NULL
+/// \param plandBlk  in: landing block (may point to NULL); out: the landing
+///                  block actually used, which is newly created when the
+///                  input was NULL
+/// \returns the number of blocks created (0 or 1); 0 is also returned without
+///          any change when both arms are the same block or when
+///          needMigrateBlock() rejects both arms.
+template<class PassT>
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
+                                                    BlockT *trueBlk,
+                                                    BlockT *falseBlk,
+                                                    BlockT **plandBlk) {
+  bool migrateTrue = false;
+  bool migrateFalse = false;
+
+  BlockT *landBlk = *plandBlk;
+
+  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
+         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+
+  if (trueBlk == falseBlk) {
+    return 0;
+  }
+
+#if 0
+  if (DEBUGME) {
+    errs() << "improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+#endif
+
+  // unsigned landPredSize = landBlk ? landBlk->pred_size() : 0;
+  // May consider the # landBlk->pred_size() as it represents the number of
+  // assignment initReg = .. needed to insert.
+  migrateTrue = needMigrateBlock(trueBlk);
+  migrateFalse = needMigrateBlock(falseBlk);
+
+  if (!migrateTrue && !migrateFalse) {
+    return 0;
+  }
+
+  // If we need to migrate either trueBlk and falseBlk, migrate the rest that
+  // have more than one predecessors. without doing this, its predecessor
+  // rather than headBlk will have undefined value in initReg.
+  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
+    migrateTrue = true;
+  }
+  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
+    migrateFalse = true;
+  }
+
+  if (DEBUGME) {
+    errs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the above if-else inside
+  //      if (initReg !=2) {...}
+  //
+  // add initReg = initVal to headBlk
+  unsigned initReg =
+    funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+  // Only needed when exactly one arm is migrated: headBlk then presets
+  // initReg to the value that stands for the arm that is NOT migrated.
+  if (!migrateTrue || !migrateFalse) {
+    int initVal = migrateTrue ? 0 : 1;
+    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  }
+
+  int numNewBlk = 0;
+
+  // No landing block yet: create one and hang it below both arms (or below
+  // headBlk for a missing arm).
+  if (landBlk == NULL) {
+    landBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(landBlk);  //insert to function
+
+    if (trueBlk) {
+      trueBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    if (falseBlk) {
+      falseBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    numNewBlk ++;
+  }
+
+  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+
+  //insert AMDIL::ENDIF to avoid special case "input landBlk == NULL"
+  // Everything below is inserted in front of this ENDIF (insertPos).
+  typename BlockT::iterator insertPos =
+    CFGTraits::getInstrPos
+    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDIL::ENDIF, passRep));
+
+  // Outer guard for paths that reach landBlk from elsewhere: those paths set
+  // initReg to 2 (see the predecessor loop near the end of the function).
+  if (landBlkHasOtherPred) {
+    unsigned immReg =
+      funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
+    unsigned cmpResReg =
+      funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass);
+
+    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
+                                        initReg, immReg);
+    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
+                                      AMDIL::IF_LOGICALZ_i32, passRep,
+                                      cmpResReg, DebugLoc());
+  }
+
+  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDIL::IF_LOGICALNZ_i32,
+                                    passRep, initReg, DebugLoc());
+
+  if (migrateTrue) {
+    migrateInstruction(trueBlk, landBlk, insertPos);
+    // need to unconditionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 1).
+    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+  }
+  CFGTraits::insertInstrBefore(insertPos, AMDIL::ELSE, passRep);
+
+  if (migrateFalse) {
+    migrateInstruction(falseBlk, landBlk, insertPos);
+    // need to unconditionally insert the assignment to ensure a path from its
+    // predecessor rather than headBlk has valid value in initReg if
+    // (initVal != 0)
+    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+  }
+  //CFGTraits::insertInstrBefore(insertPos, AMDIL::ENDIF, passRep);
+
+  if (landBlkHasOtherPred) {
+    // add endif
+    CFGTraits::insertInstrBefore(insertPos, AMDIL::ENDIF, passRep);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
+         ++predIter) {
+      BlockT *curBlk = *predIter;
+      if (curBlk != trueBlk && curBlk != falseBlk) {
+        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
+      }
+    } //for
+  }
+  if (DEBUGME) {
+    errs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+    //showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 1);
+  }
+
+  // update landBlk
+  *plandBlk = landBlk;
+
+  return numNewBlk;
+} //improveSimpleJumpintoIf
+
+/// Turns the exit edge exitingBlk -> exitBlk of \p exitLoop into a break.
+///
+/// When the exiting block lives in a loop other than \p exitLoop, a fresh
+/// register is created, registered as break-init register of exitLoop and as
+/// break-on register of \p exitingLoop and each of its parents below
+/// exitLoop. The rewrite itself is done by mergeLoopbreakBlock(), which
+/// receives that register, or INVALIDREGNUM when both loops are the same.
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
+                                              LoopT *exitingLoop,
+                                             BlockT *exitBlk,
+                                              LoopT *exitLoop,
+                                             BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
+           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  if (exitingLoop != exitLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    assert(initReg != INVALIDREGNUM);
+    addLoopBreakInitReg(exitLoop, initReg);
+    while (exitingLoop != exitLoop && exitingLoop) {
+      addLoopBreakOnReg(exitingLoop, initReg);
+      exitingLoop = exitingLoop->getParentLoop();
+    }
+    // Fails if exitLoop is not exitingLoop or one of its parents.
+    assert(exitingLoop == exitLoop);
+  }
+
+  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+
+} //handleLoopbreak
+
+/// Turns the edge contingBlk -> contBlk (the header of \p contLoop) into a
+/// continue.
+///
+/// When \p contingLoop differs from \p contLoop a fresh register is created
+/// and registered as cont-init register of contLoop; the loops from
+/// contingLoop up to, but excluding, the direct child of contLoop get it as
+/// break-on register and that direct child gets it as cont-on register. The
+/// rewrite itself is done by settleLoopcontBlock().
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
+                                                  LoopT *contingLoop,
+                                                 BlockT *contBlk,
+                                                  LoopT *contLoop) {
+  if (DEBUGME) {
+    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
+           << " header = BB" << contBlk->getNumber() << "\n";
+
+    errs() << "Trying to continue loop-depth = "
+           << getLoopDepth(contLoop)
+           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
+  }
+
+  RegiT initReg = INVALIDREGNUM;
+  if (contingLoop != contLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass));
+    assert(initReg != INVALIDREGNUM);
+    addLoopContInitReg(contLoop, initReg);
+    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
+      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
+      contingLoop = contingLoop->getParentLoop();
+    }
+    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
+    addLoopContOnReg(contingLoop, initReg);
+  }
+
+  settleLoopcontBlock(contingBlk, contBlk, initReg);
+  //contingBlk->removeSuccessor(loopHeader);
+} //handleLoopcontBlock
+
+/// Appends the instructions of \p srcBlk, from FirstNonDebugInstr(srcBlk) to
+/// its end, to \p dstBlk, replaces the edge dstBlk -> srcBlk by srcBlk's
+/// successor list, and retires srcBlk.
+template<class PassT>
+void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
+  if (DEBUGME) {
+    errs() << "serialPattern BB" << dstBlk->getNumber()
+           << " <= BB" << srcBlk->getNumber() << "\n";
+  }
+  //removeUnconditionalBranch(dstBlk);
+  dstBlk->splice(dstBlk->end(), srcBlk, FirstNonDebugInstr(srcBlk), srcBlk->end());
+
+  dstBlk->removeSuccessor(srcBlk);
+  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
+
+  removeSuccessor(srcBlk);
+  retireBlock(dstBlk, srcBlk);
+} //mergeSerialBlock
+
+/// Replaces the conditional branch \p branchInstr of \p curBlk by a
+/// structured "if / else / endif" sequence that contains the instructions of
+/// \p trueBlk and \p falseBlk (either may be NULL), and retires the merged
+/// arms. \p landBlk (may be NULL) is added as successor of curBlk only when
+/// both arms were present.
+template<class PassT>
+void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
+                                                  BlockT *curBlk,
+                                                  BlockT *trueBlk,
+                                                  BlockT *falseBlk,
+                                                  BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "ifPattern BB" << curBlk->getNumber();
+    errs() << "{ ";
+    if (trueBlk) {
+      errs() << "BB" << trueBlk->getNumber();
+    }
+    errs() << " } else ";
+    errs() << "{ ";
+    if (falseBlk) {
+      errs() << "BB" << falseBlk->getNumber();
+    }
+    errs() << " }\n ";
+    errs() << "landBlock: ";
+    if (landBlk == NULL) {
+      errs() << "NULL";
+    } else {
+      errs() << "BB" << landBlk->getNumber();
+    }
+    errs() << "\n";
+  }
+
+  // Both are read now because branchInstr is erased further down.
+  int oldOpcode = branchInstr->getOpcode();
+  DebugLoc branchDL = branchInstr->getDebugLoc();
+
+//    transform to
+//    if cond
+//       trueBlk
+//    else
+//       falseBlk
+//    endif
+//    landBlk
+
+  // All insertions below go in front of the old branch instruction, so they
+  // end up in program order.
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(curBlk, branchInstr);
+  CFGTraits::insertCondBranchBefore(branchInstrPos,
+                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
+                                    passRep,
+                                    branchDL);
+
+  if (trueBlk) {
+    curBlk->splice(branchInstrPos, trueBlk, FirstNonDebugInstr(trueBlk), trueBlk->end());
+    curBlk->removeSuccessor(trueBlk);
+    if (landBlk && trueBlk->succ_size()!=0) {
+      trueBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, trueBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ELSE, passRep);
+
+  if (falseBlk) {
+    curBlk->splice(branchInstrPos, falseBlk, FirstNonDebugInstr(falseBlk),
+                   falseBlk->end());
+    curBlk->removeSuccessor(falseBlk);
+    if (landBlk && falseBlk->succ_size() != 0) {
+      falseBlk->removeSuccessor(landBlk);
+    }
+    retireBlock(curBlk, falseBlk);
+  }
+  CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ENDIF, passRep);
+
+  //curBlk->remove(branchInstrPos);
+  branchInstr->eraseFromParent();
+
+  if (landBlk && trueBlk && falseBlk) {
+    curBlk->addSuccessor(landBlk);
+  }
+
+} //mergeIfthenelseBlock
+
+/// Wraps \p dstBlk into a WHILELOOP/ENDLOOP pair, emits the register
+/// initialisations and checks recorded in \p loopLand, then appends the
+/// loop's land block to dstBlk and retires it.
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
+                                                LoopLandInfo *loopLand) {
+  BlockT *landBlk = loopLand->landBlk;
+
+  if (DEBUGME) {
+    errs() << "loopPattern header = BB" << dstBlk->getNumber()
+           << " land = BB" << landBlk->getNumber() << "\n";
+  }
+
+  // Loop contInitRegs are init at the beginning of the loop.
+  // (These are emitted before WHILELOOP is inserted; presumably each
+  // block-level insert...Before call places its instruction at the start of
+  // dstBlk, which would put WHILELOOP in front of them - TODO confirm.)
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->contInitRegs.begin(),
+       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* we last inserted the DebugLoc in the
+   * BREAK_LOGICALZ_i32 or AMDIL::BREAK_LOGICALNZ statement in the current dstBlk.
+   * search for the DebugLoc in the that statement.
+   * if not found, we have to insert the empty/default DebugLoc */
+  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
+  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrBefore(dstBlk, AMDIL::WHILELOOP, passRep, DLBreak);
+  // Loop breakInitRegs are init before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakInitRegs.begin(),
+       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter)
+  {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+  // Loop endbranchInitRegs are init before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->endbranchInitRegs.begin(),
+       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* we last inserted the DebugLoc in the continue statement in the current dstBlk
+   * search for the DebugLoc in the continue statement.
+   * if not found, we have to insert the empty/default DebugLoc */
+  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
+  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrEnd(dstBlk, AMDIL::ENDLOOP, passRep, DLContinue);
+  // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this
+  // loop.
+ for (typename std::set<RegiT>::const_iterator iter = + loopLand->breakOnRegs.begin(), + iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertCondBranchEnd(dstBlk, AMDIL::BREAK_LOGICALNZ_i32, passRep, + *iter); + } + + // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this + // loop. + for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(), + iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) { + CFGTraits::insertCondBranchEnd(dstBlk, AMDIL::CONTINUE_LOGICALNZ_i32, + passRep, *iter); + } + + dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end()); + + for (typename BlockT::succ_iterator iter = landBlk->succ_begin(), + iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) { + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of. + } + + removeSuccessor(landBlk); + retireBlock(dstBlk, landBlk); +} //mergeLooplandBlock + +template<class PassT> +void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk, + BlockT *exitBlk, + BlockT *exitLandBlk, + RegiT setReg) { + if (DEBUGME) { + errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber() + << " exit = BB" << exitBlk->getNumber() + << " land = BB" << exitLandBlk->getNumber() << "\n"; + } + + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk); + assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); + + DebugLoc DL = branchInstr->getDebugLoc(); + + BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr); + int oldOpcode = branchInstr->getOpcode(); + + // transform exitingBlk to + // if ( ) { + // exitBlk (if exitBlk != exitLandBlk) + // setReg = 1 + // break + // }endif + // successor = {orgSuccessor(exitingBlk) - exitBlk} + + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(exitingBlk, branchInstr); + + if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) { + //break_logical + int newOpcode = + (trueBranch == exitBlk) ? 
CFGTraits::getBreakNzeroOpcode(oldOpcode) + : CFGTraits::getBreakZeroOpcode(oldOpcode); + CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL); + } else { + int newOpcode = + (trueBranch == exitBlk) ? CFGTraits::getBranchNzeroOpcode(oldOpcode) + : CFGTraits::getBranchZeroOpcode(oldOpcode); + CFGTraits::insertCondBranchBefore(branchInstrPos, newOpcode, passRep, DL); + if (exitBlk != exitLandBlk) { + //splice is insert-before ... + exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(), + exitBlk->end()); + } + if (setReg != INVALIDREGNUM) { + CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::BREAK, passRep); + CFGTraits::insertInstrBefore(branchInstrPos, AMDIL::ENDIF, passRep); + } //if_logical + + //now branchInst can be erase safely + //exitingBlk->eraseFromParent(branchInstr); + branchInstr->eraseFromParent(); + + //now take care of successors, retire blocks + exitingBlk->removeSuccessor(exitBlk); + if (exitBlk != exitLandBlk) { + //splice is insert-before ... 
+    exitBlk->removeSuccessor(exitLandBlk);
+    retireBlock(exitingBlk, exitBlk);
+  }
+
+} //mergeLoopbreakBlock
+
+/// Rewrites the edge contingBlk -> contBlk into structured continue code.
+///
+/// \param contingBlk  block that jumps back to the loop header
+/// \param contBlk     the loop header being continued to
+/// \param setReg      register set to 1 before leaving, in which case a BREAK
+///                    is emitted instead of a CONTINUE; INVALIDREGNUM when no
+///                    register is involved
+///
+/// When contingBlk still ends in a loop-end branch instruction, that branch
+/// is replaced (and erased); otherwise the code is simply added to the block.
+template<class PassT>
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
+                                                 BlockT *contBlk,
+                                                 RegiT setReg) {
+  if (DEBUGME) {
+    errs() << "settleLoopcontBlock conting = BB"
+           << contingBlk->getNumber()
+           << ", cont = BB" << contBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
+  if (branchInstr) {
+    assert(CFGTraits::isCondBranch(branchInstr));
+    typename BlockT::iterator branchInstrPos =
+      CFGTraits::getInstrPos(contingBlk, branchInstr);
+    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+    int oldOpcode = branchInstr->getOpcode();
+    DebugLoc DL = branchInstr->getDebugLoc();
+
+    //    transform contingBlk to
+    //     if () {
+    //          move instr after branchInstr
+    //          continue
+    //        or
+    //          setReg = 1
+    //          break
+    //     }endif
+    //     successor = {orgSuccessor(contingBlk) - loopHeader}
+
+    // A single conditional continue suffices only when no register has to be
+    // set and the branch is the very last instruction of the block.
+    bool useContinueLogical =
+      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
+
+    if (useContinueLogical == false)
+    {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+
+      if (setReg != INVALIDREGNUM) {
+        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDIL::BREAK, passRep, DL);
+      } else {
+        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDIL::CONTINUE, passRep, DL);
+      }
+
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::ENDIF, passRep, DL);
+    } else {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
+                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+    }
+
+    //contingBlk->eraseFromParent(branchInstr);
+    branchInstr->eraseFromParent();
+  } else {
+    /* if we've arrived here then we've already erased the branch instruction
+     * travel back up the basic block to see the last reference of our debug location
+     * we've just inserted that reference here so it should be representative */
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    } else {
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDIL::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    }
+  } //else
+
+} //settleLoopcontBlock
+
+// BBs in exitBlkSet are determined as in break-path for loopRep,
+// before we can put code for BBs as inside loop-body for loopRep
+// check whether those BBs are determined as cont-BB for parentLoopRep
+// earlier.
+// If so, generate a new BB newBlk +// (1) set newBlk common successor of BBs in exitBlkSet +// (2) change the continue-instr in BBs in exitBlkSet to break-instr +// (3) generate continue-instr in newBlk +// +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep, + LoopT *loopRep, + std::set<BlockT *> &exitBlkSet, + BlockT *exitLandBlk) { + std::set<BlockT *> endBlkSet; + +// BlockT *parentLoopHead = parentLoopRep->getHeader(); + + + for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(), + iterEnd = exitBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *exitBlk = *iter; + BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk); + + if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL) + return NULL; + + endBlkSet.insert(endBlk); + } + + BlockT *newBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newBlk); //insert to function + CFGTraits::insertInstrEnd(newBlk, AMDIL::CONTINUE, passRep); + SHOWNEWBLK(newBlk, "New continue block: "); + + for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(), + iterEnd = endBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *endBlk = *iter; + InstrT *contInstr = CFGTraits::getContinueInstr(endBlk); + if (contInstr) { + contInstr->eraseFromParent(); + } + endBlk->addSuccessor(newBlk); + if (DEBUGME) { + errs() << "Add new continue Block to BB" + << endBlk->getNumber() << " successors\n"; + } + } + + return newBlk; +} //relocateLoopcontBlock + + +// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as +// LoopLandBlock. This BB branch on the loop endBranchInit register to the +// pathes corresponding to the loop exiting branches. 
+ +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep, + BlockTSmallerVector &exitingBlks, + BlockTSmallerVector &exitBlks) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + + RegiT endBranchReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass)); + assert(endBranchReg >= 0); + + // reg = 0 before entering the loop + addLoopEndbranchInitReg(loopRep, endBranchReg); + + uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size()); + assert(numBlks >=2 && numBlks == exitBlks.size()); + + BlockT *preExitingBlk = exitingBlks[0]; + BlockT *preExitBlk = exitBlks[0]; + BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(preBranchBlk); //insert to function + SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: "); + + BlockT *newLandBlk = preBranchBlk; + + CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk, + newLandBlk); + preExitingBlk->removeSuccessor(preExitBlk); + preExitingBlk->addSuccessor(newLandBlk); + + //it is redundant to add reg = 0 to exitingBlks[0] + + // For 1..n th exiting path (the last iteration handles two pathes) create the + // branch to the previous path and the current path. + for (uint32_t i = 1; i < numBlks; ++i) { + BlockT *curExitingBlk = exitingBlks[i]; + BlockT *curExitBlk = exitBlks[i]; + BlockT *curBranchBlk; + + if (i == numBlks - 1) { + curBranchBlk = curExitBlk; + } else { + curBranchBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(curBranchBlk); //insert to function + SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: "); + } + + // Add reg = i to exitingBlks[i]. + CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep, + endBranchReg, i); + + // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge + // (exitingBlks[i], newLandBlk). 
+ CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk, + newLandBlk); + curExitingBlk->removeSuccessor(curExitBlk); + curExitingBlk->addSuccessor(newLandBlk); + + // add to preBranchBlk the branch instruction: + // if (endBranchReg == preVal) + // preExitBlk + // else + // curBranchBlk + // + // preValReg = i - 1 + + DebugLoc DL; + RegiT preValReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass)); + BuildMI(preBranchBlk, DL, tii->get(AMDIL::LOADCONST_i32), preValReg) + .addImm(i - 1); //preVal + + // condResReg = (endBranchReg == preValReg) + RegiT condResReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass)); + BuildMI(preBranchBlk, DL, tii->get(AMDIL::IEQ), condResReg) + .addReg(endBranchReg).addReg(preValReg); + + BuildMI(preBranchBlk, DL, tii->get(AMDIL::BRANCH_COND_i32)) + .addMBB(preExitBlk).addReg(condResReg); + + preBranchBlk->addSuccessor(preExitBlk); + preBranchBlk->addSuccessor(curBranchBlk); + + // Update preExitingBlk, preExitBlk, preBranchBlk. + preExitingBlk = curExitingBlk; + preExitBlk = curExitBlk; + preBranchBlk = curBranchBlk; + + } //end for 1 .. 
n blocks + + return newLandBlk; +} //addLoopEndbranchBlock + +template<class PassT> +typename CFGStructurizer<PassT>::PathToKind +CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk, + bool allowSideEntry) { + assert(dstBlk); + + if (srcBlk == dstBlk) { + return SinglePath_InPath; + } + + while (srcBlk && srcBlk->succ_size() == 1) { + srcBlk = *srcBlk->succ_begin(); + if (srcBlk == dstBlk) { + return SinglePath_InPath; + } + + if (!allowSideEntry && srcBlk->pred_size() > 1) { + return Not_SinglePath; + } + } + + if (srcBlk && srcBlk->succ_size()==0) { + return SinglePath_NotInPath; + } + + return Not_SinglePath; +} //singlePathTo + +// If there is a single path from srcBlk to dstBlk, return the last block before +// dstBlk If there is a single path from srcBlk->end without dstBlk, return the +// last block in the path Otherwise, return NULL +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk, + bool allowSideEntry) { + assert(dstBlk); + + if (srcBlk == dstBlk) { + return srcBlk; + } + + if (srcBlk->succ_size() == 0) { + return srcBlk; + } + + while (srcBlk && srcBlk->succ_size() == 1) { + BlockT *preBlk = srcBlk; + + srcBlk = *srcBlk->succ_begin(); + if (srcBlk == NULL) { + return preBlk; + } + + if (!allowSideEntry && srcBlk->pred_size() > 1) { + return NULL; + } + } + + if (srcBlk && srcBlk->succ_size()==0) { + return srcBlk; + } + + return NULL; + +} //singlePathEnd + +template<class PassT> +int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk, + BlockT *dstBlk) { + int cloned = 0; + assert(preBlk->isSuccessor(srcBlk)); + while (srcBlk && srcBlk != dstBlk) { + assert(srcBlk->succ_size() == 1); + if (srcBlk->pred_size() > 1) { + srcBlk = cloneBlockForPredecessor(srcBlk, preBlk); + ++cloned; + } + + preBlk = srcBlk; + srcBlk = *srcBlk->succ_begin(); + } + + return cloned; +} //cloneOnSideEntryTo + +template<class PassT> +typename 
CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk, + BlockT *predBlk) { + assert(predBlk->isSuccessor(curBlk) && + "succBlk is not a prececessor of curBlk"); + + BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions + CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk); + //srcBlk, oldBlk, newBlk + + predBlk->removeSuccessor(curBlk); + predBlk->addSuccessor(cloneBlk); + + // add all successor to cloneBlk + CFGTraits::cloneSuccessorList(cloneBlk, curBlk); + + numClonedInstr += curBlk->size(); + + if (DEBUGME) { + errs() << "Cloned block: " << "BB" + << curBlk->getNumber() << "size " << curBlk->size() << "\n"; + } + + SHOWNEWBLK(cloneBlk, "result of Cloned block: "); + + return cloneBlk; +} //cloneBlockForPredecessor + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep, + BlockT *exitingBlk) { + BlockT *exitBlk = NULL; + + for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(), + iterSuccEnd = exitingBlk->succ_end(); + iterSucc != iterSuccEnd; ++iterSucc) { + BlockT *curBlk = *iterSucc; + if (!loopRep->contains(curBlk)) { + assert(exitBlk == NULL); + exitBlk = curBlk; + } + } + + assert(exitBlk != NULL); + + return exitBlk; +} //exitingBlock2ExitBlock + +template<class PassT> +void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk, + BlockT *dstBlk, + InstrIterator insertPos) { + InstrIterator spliceEnd; + //look for the input branchinstr, not the AMDIL branchinstr + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + if (branchInstr == NULL) { + if (DEBUGME) { + errs() << "migrateInstruction don't see branch instr\n" ; + } + spliceEnd = srcBlk->end(); + } else { + if (DEBUGME) { + errs() << "migrateInstruction see branch instr\n" ; + branchInstr->dump(); + } + spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr); + } + if (DEBUGME) { + errs() << "migrateInstruction before 
splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } + + //splice insert before insertPos + dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd); + + if (DEBUGME) { + errs() << "migrateInstruction after splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } +} //migrateInstruction + +// normalizeInfiniteLoopExit change +// B1: +// uncond_br LoopHeader +// +// to +// B1: +// cond_br 1 LoopHeader dummyExit +// and return the newly added dummy exit block +// +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) { + BlockT *loopHeader; + BlockT *loopLatch; + loopHeader = LoopRep->getHeader(); + loopLatch = LoopRep->getLoopLatch(); + BlockT *dummyExitBlk = NULL; + if (loopHeader!=NULL && loopLatch!=NULL) { + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch); + if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) { + dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + + if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n"; + + typename BlockT::iterator insertPos = + CFGTraits::getInstrPos(loopLatch, branchInstr); + unsigned immReg = + funcRep->getRegInfo().createVirtualRegister(&AMDIL::GPRI32RegClass); + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1); + InstrT *newInstr = + CFGTraits::insertInstrBefore(insertPos, AMDIL::BRANCH_COND_i32, passRep); + MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false); + + SHOWNEWINSTR(newInstr); + + branchInstr->eraseFromParent(); + loopLatch->addSuccessor(dummyExitBlk); + } + } + + return dummyExitBlk; +} //normalizeInfiniteLoopExit + +template<class PassT> +void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) { + InstrT *branchInstr; + + // I saw two 
unconditional branch in one basic block in example + // test_fc_do_while_or.c need to fix the upstream on this to remove the loop. + while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk)) + && CFGTraits::isUncondBranch(branchInstr)) { + if (DEBUGME) { + errs() << "Removing unconditional branch instruction" ; + branchInstr->dump(); + } + branchInstr->eraseFromParent(); + } +} //removeUnconditionalBranch + +template<class PassT> +void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) { + if (srcBlk->succ_size() == 2) { + BlockT *blk1 = *srcBlk->succ_begin(); + BlockT *blk2 = *(++srcBlk->succ_begin()); + + if (blk1 == blk2) { + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + assert(branchInstr && CFGTraits::isCondBranch(branchInstr)); + if (DEBUGME) { + errs() << "Removing unneeded conditional branch instruction" ; + branchInstr->dump(); + } + branchInstr->eraseFromParent(); + SHOWNEWBLK(blk1, "Removing redundant successor"); + srcBlk->removeSuccessor(blk1); + } + } +} //removeRedundantConditionalBranch + +template<class PassT> +void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*, + DEFAULT_VEC_SLOTS> &retBlks) { + BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + CFGTraits::insertInstrEnd(dummyExitBlk, AMDIL::RETURN, passRep); + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter = + retBlks.begin(), + iterEnd = retBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + InstrT *curInstr = CFGTraits::getReturnInstr(curBlk); + if (curInstr) { + curInstr->eraseFromParent(); + } +#if 0 + if (curBlk->size()==0 && curBlk->pred_size() == 1) { + if (DEBUGME) { + errs() << "Replace empty block BB" << curBlk->getNumber() + << " with dummyExitBlock\n"; + } + BlockT *predb = *curBlk->pred_begin(); + predb->removeSuccessor(curBlk); + curBlk = predb; + } //handle empty curBlk +#endif + 
curBlk->addSuccessor(dummyExitBlk); + if (DEBUGME) { + errs() << "Add dummyExitBlock to BB" << curBlk->getNumber() + << " successors\n"; + } + } //for + + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: "); +} //addDummyExitBlock + +template<class PassT> +void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) { + while (srcBlk->succ_size()) { + srcBlk->removeSuccessor(*srcBlk->succ_begin()); + } +} + +template<class PassT> +void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) { + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->sccNum = sccNum; +} + +template<class PassT> +int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM; +} + +template<class PassT> +void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "Retiring BB" << srcBlk->getNumber() << "\n"; + } + + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->isRetired = true; + //int i = srcBlk->succ_size(); + //int j = srcBlk->pred_size(); + assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0 + && "can't retire block yet"); +} + +template<class PassT> +bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return (srcBlkInfo && srcBlkInfo->isRetired); +} + +template<class PassT> +bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + + if(loopLand == NULL) + return true; + + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + return true; + } + + loopRep = loopRep->getParentLoop(); + } + + return false; +} 
//isActiveLoophead + +template<class PassT> +bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) { + const unsigned blockSizeThreshold = 30; + const unsigned cloneInstrThreshold = 100; + + bool multiplePreds = blk && (blk->pred_size() > 1); + + if(!multiplePreds) + return false; + + unsigned blkSize = blk->size(); + return ((blkSize > blockSizeThreshold) + && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold)); +} //needMigrateBlock + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk, + BlockTSmallerVector &exitBlks, + std::set<BlockT *> &exitBlkSet) { + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks; //in exit path blocks + + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), + predIterEnd = landBlk->pred_end(); + predIter != predIterEnd; ++predIter) { + BlockT *curBlk = *predIter; + if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) { + inpathBlks.push_back(curBlk); + } + } //for + + //if landBlk has predecessors that are not in the given loop, + //create a new block + BlockT *newLandBlk = landBlk; + if (inpathBlks.size() != landBlk->pred_size()) { + newLandBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newLandBlk); //insert to function + newLandBlk->addSuccessor(landBlk); + for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter = + inpathBlks.begin(), + iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk); + //srcBlk, oldBlk, newBlk + curBlk->removeSuccessor(landBlk); + curBlk->addSuccessor(newLandBlk); + } + for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) { + if (exitBlks[i] == landBlk) { + exitBlks[i] = newLandBlk; + } + } + SHOWNEWBLK(newLandBlk, "NewLandingBlock: "); + } + + setLoopLandBlock(loopRep, newLandBlk); + + return newLandBlk; +} // recordLoopbreakLand + +template<class PassT> 
+void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + assert(theEntry->landBlk == NULL); + + if (blk == NULL) { + blk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(blk); //insert to function + SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: "); + } + + theEntry->landBlk = blk; + + if (DEBUGME) { + errs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << blk->getNumber() << "\n"; + } +} // setLoopLandBlock + +template<class PassT> +void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + + theEntry->breakOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakOnReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContOnReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->breakInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakInitReg + +template<class PassT> +void 
CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContInitReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep, + RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->endbranchInitRegs.insert(regNum); + + if (DEBUGME) + { + errs() << "addLoopEndbranchInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopEndbranchInitReg + +template<class PassT> +typename CFGStructurizer<PassT>::LoopLandInfo * +CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry; +} // getLoopLandInfo + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry ? theEntry->landBlk : NULL; +} // getLoopLandBlock + + +template<class PassT> +bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + if (loopRep == NULL) + return false; + + BlockT *loopHeader = loopRep->getHeader(); + + return curBlk->isSuccessor(loopHeader); + +} //hasBackEdge + +template<class PassT> +unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) { + return loopRep ? 
loopRep->getLoopDepth() : 0; +} //getLoopDepth + +template<class PassT> +int CFGStructurizer<PassT>::countActiveBlock +(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart, + typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) { + int count = 0; + while (iterStart != iterEnd) { + if (!isRetiredBlock(*iterStart)) { + ++count; + } + ++iterStart; + } + + return count; +} //countActiveBlock + +// This is work around solution for findNearestCommonDominator not avaiable to +// post dom a proper fix should go to Dominators.h. + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT* +CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) { + + if (postDomTree->dominates(blk1, blk2)) { + return blk1; + } + if (postDomTree->dominates(blk2, blk1)) { + return blk2; + } + + DomTreeNodeT *node1 = postDomTree->getNode(blk1); + DomTreeNodeT *node2 = postDomTree->getNode(blk2); + + // Handle newly cloned node. + if (node1 == NULL && blk1->succ_size() == 1) { + return findNearestCommonPostDom(*blk1->succ_begin(), blk2); + } + if (node2 == NULL && blk2->succ_size() == 1) { + return findNearestCommonPostDom(blk1, *blk2->succ_begin()); + } + + if (node1 == NULL || node2 == NULL) { + return NULL; + } + + node1 = node1->getIDom(); + while (node1) { + if (postDomTree->dominates(node1, node2)) { + return node1->getBlock(); + } + node1 = node1->getIDom(); + } + + return NULL; +} + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::findNearestCommonPostDom +(typename std::set<BlockT *> &blks) { + BlockT *commonDom; + typename std::set<BlockT *>::const_iterator iter = blks.begin(); + typename std::set<BlockT *>::const_iterator iterEnd = blks.end(); + for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) { + BlockT *curBlk = *iter; + if (curBlk != commonDom) { + commonDom = findNearestCommonPostDom(curBlk, commonDom); + } + } + + if (DEBUGME) { + errs() << 
"Common post dominator for exit blocks is "; + if (commonDom) { + errs() << "BB" << commonDom->getNumber() << "\n"; + } else { + errs() << "NULL\n"; + } + } + + return commonDom; +} //findNearestCommonPostDom + +} //end namespace llvm + +//todo: move-end + + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer for AMDIL +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm +{ +class AMDILCFGStructurizer : public MachineFunctionPass +{ +public: + typedef MachineInstr InstructionType; + typedef MachineFunction FunctionType; + typedef MachineBasicBlock BlockType; + typedef MachineLoopInfo LoopinfoType; + typedef MachineDominatorTree DominatortreeType; + typedef MachinePostDominatorTree PostDominatortreeType; + typedef MachineDomTreeNode DomTreeNodeType; + typedef MachineLoop LoopType; +//private: + TargetMachine &TM; + const TargetInstrInfo *TII; + +//public: +// static char ID; + +public: + AMDILCFGStructurizer(char &pid, TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + const TargetInstrInfo *getTargetInstrInfo() const; + //bool runOnMachineFunction(MachineFunction &F); + +private: + +}; //end of class AMDILCFGStructurizer + +//char AMDILCFGStructurizer::ID = 0; +} //end of namespace llvm +AMDILCFGStructurizer::AMDILCFGStructurizer(char &pid, TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) +: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()) { +} + +const TargetInstrInfo *AMDILCFGStructurizer::getTargetInstrInfo() const { + return TII; +} +//===----------------------------------------------------------------------===// +// +// CFGPrepare +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm +{ +class AMDILCFGPrepare : public AMDILCFGStructurizer +{ +public: + static char ID; + +public: + AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + + 
virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; //end of class AMDILCFGPrepare + +char AMDILCFGPrepare::ID = 0; +} //end of namespace llvm + +AMDILCFGPrepare::AMDILCFGPrepare(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + : AMDILCFGStructurizer(ID, tm AMDIL_OPT_LEVEL_VAR) +{ +} +const char *AMDILCFGPrepare::getPassName() const { + return "AMD IL Control Flow Graph Preparation Pass"; +} + +void AMDILCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); +} + +//===----------------------------------------------------------------------===// +// +// CFGPerform +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm +{ +class AMDILCFGPerform : public AMDILCFGStructurizer +{ +public: + static char ID; + +public: + AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; //end of class AMDILCFGPerform + +char AMDILCFGPerform::ID = 0; +} //end of namespace llvm + + AMDILCFGPerform::AMDILCFGPerform(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) +: AMDILCFGStructurizer(ID, tm AMDIL_OPT_LEVEL_VAR) +{ +} + +const char *AMDILCFGPerform::getPassName() const { + return "AMD IL Control Flow Graph structurizer Pass"; +} + +void AMDILCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); +} + 
+//===----------------------------------------------------------------------===// +// +// CFGStructTraits<AMDILCFGStructurizer> +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct +{ +// this class is tailor to the AMDIL backend +template<> +struct CFGStructTraits<AMDILCFGStructurizer> +{ + typedef int RegiT; + + static int getBreakNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::BREAK_LOGICALNZ); + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getBreakZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::BREAK_LOGICALZ); + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getBranchNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::IF_LOGICALNZ); + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getBranchZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::IF_LOGICALZ); + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getContinueNzeroOpcode(int oldOpcode) + { + switch(oldOpcode) { + ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::CONTINUE_LOGICALNZ); + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getContinueZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + ExpandCaseToAllScalarReturn(AMDIL::BRANCH_COND, AMDIL::CONTINUE_LOGICALZ); + default: + assert(0 && "internal error"); + }; + return -1; + } + +// the explicitly represented branch target is the true branch target +#define getExplicitBranch getTrueBranch +#define setExplicitBranch setTrueBranch + + static MachineBasicBlock *getTrueBranch(MachineInstr *instr) { + return instr->getOperand(0).getMBB(); + } + + static void setTrueBranch(MachineInstr *instr, 
MachineBasicBlock *blk) { + instr->getOperand(0).setMBB(blk); + } + + static MachineBasicBlock * + getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) { + assert(blk->succ_size() == 2); + MachineBasicBlock *trueBranch = getTrueBranch(instr); + MachineBasicBlock::succ_iterator iter = blk->succ_begin(); + MachineBasicBlock::succ_iterator iterNext = iter; + ++iterNext; + + return (*iter == trueBranch) ? *iterNext : *iter; + } + + static bool isCondBranch(MachineInstr *instr) { + switch (instr->getOpcode()) { + ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND); + break; + default: + return false; + } + return true; + } + + static bool isUncondBranch(MachineInstr *instr) { + switch (instr->getOpcode()) { + case AMDIL::BRANCH: + break; + default: + return false; + } + return true; + } + + static bool isPhimove(MachineInstr *instr) { + switch (instr->getOpcode()) { + ExpandCaseToAllTypes(AMDIL::MOVE); + break; + default: + return false; + } + return true; + } + + static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) { + //get DebugLoc from the first MachineBasicBlock instruction with debug info + DebugLoc DL; + for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) { + MachineInstr *instr = &(*iter); + if (instr->getDebugLoc().isUnknown() == false) { + DL = instr->getDebugLoc(); + } + } + return DL; + } + + static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + MachineInstr *instr = &*iter; + if (instr && (isCondBranch(instr) || isUncondBranch(instr))) { + return instr; + } + return NULL; + } + + // The correct naming for this is getPossibleLoopendBlockBranchInstr. + // + // BB with backward-edge could have move instructions after the branch + // instruction. Such move instruction "belong to" the loop backward-edge. 
+ // + static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) { + for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(), + iterEnd = blk->rend(); iter != iterEnd; ++iter) { + // FIXME: Simplify + MachineInstr *instr = &*iter; + if (instr) { + if (isCondBranch(instr) || isUncondBranch(instr)) { + return instr; + } else if (!isPhimove(instr)) { + break; + } + } + } + return NULL; + } + + static MachineInstr *getReturnInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDIL::RETURN) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getContinueInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDIL::CONTINUE) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) { + for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) { + MachineInstr *instr = &(*iter); + if ((instr->getOpcode() == AMDIL::BREAK_LOGICALNZ_i32) || (instr->getOpcode() == AMDIL::BREAK_LOGICALZ_i32)) { + return instr; + } + } + return NULL; + } + + static bool isReturnBlock(MachineBasicBlock *blk) { + MachineInstr *instr = getReturnInstr(blk); + bool isReturn = (blk->succ_size() == 0); + if (instr) { + assert(isReturn); + } else if (isReturn) { + if (DEBUGME) { + errs() << "BB" << blk->getNumber() + <<" is return block without RETURN instr\n"; + } + } + + return isReturn; + } + + static MachineBasicBlock::iterator + getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) { + assert(instr->getParent() == blk && "instruction doesn't belong to block"); + MachineBasicBlock::iterator iter = blk->begin(); + MachineBasicBlock::iterator iterEnd = blk->end(); + while (&(*iter) != instr && iter != iterEnd) { + ++iter; + } + 
+ assert(iter != iterEnd); + return iter; + }//getInstrPos + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDILCFGStructurizer *passRep) { + return insertInstrBefore(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrBefore + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDILCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); + + MachineBasicBlock::iterator res; + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + return newInstr; + } //insertInstrBefore + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDILCFGStructurizer *passRep) { + insertInstrEnd(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrEnd + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDILCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = blk->getParent() + ->CreateMachineInstr(tii->get(newOpcode), DL); + + blk->push_back(newInstr); + //assume the instruction doesn't take any reg operand ... + + SHOWNEWINSTR(newInstr); + } //insertInstrEnd + + static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos, + int newOpcode, + AMDILCFGStructurizer *passRep) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), + DebugLoc()); + + blk->insert(instrPos, newInstr); + //assume the instruction doesn't take any reg operand ... 
+ + SHOWNEWINSTR(newInstr); + return newInstr; + } //insertInstrBefore + + static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos, + int newOpcode, + AMDILCFGStructurizer *passRep, + DebugLoc DL) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), + DL); + + blk->insert(instrPos, newInstr); + MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(), + false); + + SHOWNEWINSTR(newInstr); + //erase later oldInstr->eraseFromParent(); + } //insertCondBranchBefore + + static void insertCondBranchBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator insertPos, + int newOpcode, + AMDILCFGStructurizer *passRep, + RegiT regNum, + DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); + + //insert before + blk->insert(insertPos, newInstr); + MachineInstrBuilder(newInstr).addReg(regNum, false); + + SHOWNEWINSTR(newInstr); + } //insertCondBranchBefore + + static void insertCondBranchEnd(MachineBasicBlock *blk, + int newOpcode, + AMDILCFGStructurizer *passRep, + RegiT regNum) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc()); + + blk->push_back(newInstr); + MachineInstrBuilder(newInstr).addReg(regNum, false); + + SHOWNEWINSTR(newInstr); + } //insertCondBranchEnd + + + static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos, + AMDILCFGStructurizer *passRep, + RegiT regNum, int regVal) { + MachineInstr *oldInstr = &(*instrPos); + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineBasicBlock *blk = oldInstr->getParent(); + MachineInstr *newInstr = + 
blk->getParent()->CreateMachineInstr(tii->get(AMDIL::LOADCONST_i32), + DebugLoc()); + MachineInstrBuilder(newInstr).addReg(regNum, RegState::Define); //set target + MachineInstrBuilder(newInstr).addImm(regVal); //set src value + + blk->insert(instrPos, newInstr); + + SHOWNEWINSTR(newInstr); + } //insertAssignInstrBefore + + static void insertAssignInstrBefore(MachineBasicBlock *blk, + AMDILCFGStructurizer *passRep, + RegiT regNum, int regVal) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(AMDIL::LOADCONST_i32), + DebugLoc()); + MachineInstrBuilder(newInstr).addReg(regNum, RegState::Define); //set target + MachineInstrBuilder(newInstr).addImm(regVal); //set src value + + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + } //insertInstrBefore + + static void insertCompareInstrBefore(MachineBasicBlock *blk, + MachineBasicBlock::iterator instrPos, + AMDILCFGStructurizer *passRep, + RegiT dstReg, RegiT src1Reg, + RegiT src2Reg) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(AMDIL::IEQ), DebugLoc()); + + MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target + MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value + MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value + + blk->insert(instrPos, newInstr); + SHOWNEWINSTR(newInstr); + + } //insertCompareInstrBefore + + static void cloneSuccessorList(MachineBasicBlock *dstBlk, + MachineBasicBlock *srcBlk) { + for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(), + iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) { + dstBlk->addSuccessor(*iter); // *iter's predecessor is also taken care of + } + } //cloneSuccessorList + + static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) { + 
MachineFunction *func = srcBlk->getParent(); + MachineBasicBlock *newBlk = func->CreateMachineBasicBlock(); + func->push_back(newBlk); //insert to function + //newBlk->setNumber(srcBlk->getNumber()); + for (MachineBasicBlock::iterator iter = srcBlk->begin(), + iterEnd = srcBlk->end(); + iter != iterEnd; ++iter) { + MachineInstr *instr = func->CloneMachineInstr(iter); + // This is a workaround for LLVM bugzilla 8420 because CloneMachineInstr + // does not clone the AsmPrinterFlags. + instr->setAsmPrinterFlag( + (llvm::MachineInstr::CommentFlag)iter->getAsmPrinterFlags()); + newBlk->push_back(instr); + } + return newBlk; + } + + //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because + //the AMDIL instruction is not recognized as terminator fix this and retire + //this routine + static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk, + MachineBasicBlock *oldBlk, + MachineBasicBlock *newBlk) { + MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk); + if (branchInstr && isCondBranch(branchInstr) && + getExplicitBranch(branchInstr) == oldBlk) { + setExplicitBranch(branchInstr, newBlk); + } + } + + static void wrapup(MachineBasicBlock *entryBlk) { + assert((!entryBlk->getParent()->getJumpTableInfo() + || entryBlk->getParent()->getJumpTableInfo()->isEmpty()) + && "found a jump table"); + + //collect continue right before endloop + SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr; + MachineBasicBlock::iterator pre = entryBlk->begin(); + MachineBasicBlock::iterator iterEnd = entryBlk->end(); + MachineBasicBlock::iterator iter = pre; + while (iter != iterEnd) { + if (pre->getOpcode() == AMDIL::CONTINUE + && iter->getOpcode() == AMDIL::ENDLOOP) { + contInstr.push_back(pre); + } + pre = iter; + ++iter; + } //end while + + //delete continue right before endloop + for (unsigned i = 0; i < contInstr.size(); ++i) { + contInstr[i]->eraseFromParent(); + } + + // TODO to fix up jump table so later phase won't be confused. 
if + // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but + // there isn't such an interface yet. alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + + } //wrapup + + static MachineDominatorTree *getDominatorTree(AMDILCFGStructurizer &pass) { + return &pass.getAnalysis<MachineDominatorTree>(); + } + + static MachinePostDominatorTree* + getPostDominatorTree(AMDILCFGStructurizer &pass) { + return &pass.getAnalysis<MachinePostDominatorTree>(); + } + + static MachineLoopInfo *getLoopInfo(AMDILCFGStructurizer &pass) { + return &pass.getAnalysis<MachineLoopInfo>(); + } +}; // template class CFGStructTraits +} //end of namespace llvm + +// createAMDILCFGPreparationPass- Returns a pass +FunctionPass *llvm::createAMDILCFGPreparationPass(TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) { + return new AMDILCFGPrepare(tm AMDIL_OPT_LEVEL_VAR); +} + +bool AMDILCFGPrepare::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().prepare(func, + *this); +} + +// createAMDILCFGStructurizerPass- Returns a pass +FunctionPass *llvm::createAMDILCFGStructurizerPass(TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) { + return new AMDILCFGPerform(tm AMDIL_OPT_LEVEL_VAR); +} + +bool AMDILCFGPerform::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer<AMDILCFGStructurizer>().run(func, + *this); +} + +//end of file newline goes below + diff --git a/src/gallium/drivers/radeon/AMDILCallingConv.td b/src/gallium/drivers/radeon/AMDILCallingConv.td new file mode 100644 index 00000000000..c37ff0a7e7c --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILCallingConv.td @@ -0,0 +1,75 @@ +//===- AMDILCallingConv.td - Calling Conventions AMDIL -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//==-----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the AMDIL architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// AMDIL 32-bit C return-value convention.
+//
+// The convention is a list of four CCIfType rules followed by a stack
+// fallback. Each rule hands the value types it names to CCAssignToReg with
+// the same register list, R1 through R767; the final CCAssignToStack<16, 16>
+// entry follows the last rule.
+//
+// NOTE(review): the comments below talk about values being "put on the
+// stack", while the rules themselves use CCAssignToReg. Presumably the R
+// registers model the emulated stack described in the first comment --
+// confirm against the register definitions.
+def RetCC_AMDIL32 : CallingConv<[
+ // Since IL has no return values, all values can be emulated on the stack
+ // The stack can then be mapped to a number of sequential virtual registers
+ // in IL
+
+ // Integer and FP scalar values get put on the stack at 16-byte alignment
+ // but with a size of 4 bytes
+ CCIfType<[i1, i8, i16, i32, f32, f64, i64], CCAssignToReg<
+ [
+ R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 2-element Short vector types get 16 byte alignment and size of 8 bytes
+ CCIfType<[v2i32, v2f32, v2i8, v4i8, v2i16, v4i16], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 4-element Short vector types get 16 byte alignment and size of 16 bytes
+ CCIfType<[v4i32, v4f32], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+ // 2-element 64-bit vector types get aligned to 16 bytes with a size of 16 bytes
+ CCIfType<[v2f64, v2i64], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >, CCAssignToStack<16, 16>
+]>;
+
+// AMDIL 32-bit C Calling convention.
+def CC_AMDIL32 : CallingConv<[
+  // Since IL has parameter values, all values can be emulated on the stack
+  // The stack can then be mapped to a number of sequential virtual registers
+  // in IL
+  // Integer and FP scalar values get put on the stack at 16-byte alignment
+  // but with a size of 4 bytes
+  // NOTE(review): despite the "stack" wording, each CCIfType rule below assigns
+  // from registers R1..R767; CCAssignToStack<16, 16> is only the last list entry.
+  CCIfType<[i1, i8, i16, i32, f32, f64, i64], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+  // 2-element Short vector types get 16 byte alignment and size of 8 bytes
+  CCIfType<[v2i32, v2f32, v2i8, v4i8, v2i16, v4i16], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+  // 4-element Short vector types get 16 byte alignment and size of 16 bytes
+  CCIfType<[v4i32, v4f32], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >,
+
+  // 2-element 64-bit vector types get aligned to 16 bytes with a size of 16 bytes
+  CCIfType<[v2f64, v2i64], CCAssignToReg<
+[R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, R80, R81, R82, R83, R84, R85, R86, R87, R88, R89, R90, R91, R92, R93, R94, R95, R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109, R110, R111, R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, R125, R126, R127, R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, R141, R142, R143, R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, R157, R158, R159, R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, R173, R174, R175, R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, R189, R190, R191, R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, R205, R206, R207, R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, R221, R222, R223, R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, R237, R238, R239, R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, R253, R254, R255, R256, R257, R258, R259, R260, R261, R262, R263, R264, R265, R266, R267, R268, R269, R270, R271, R272, R273, R274, R275, R276, R277, R278, R279, R280, R281, R282, R283, R284, R285, R286, R287, R288, R289, R290, R291, R292, R293, R294, R295, R296, R297, R298, R299, R300, R301, R302, R303, R304, R305, R306, R307, R308, R309, R310, R311, R312, R313, R314, R315, R316, R317, R318, R319, R320, R321, R322, R323, R324, R325, R326, R327, R328, R329, R330, R331, R332, R333, R334, R335, R336, R337, R338, R339, R340, R341, R342, R343, R344, R345, R346, R347, R348, R349, R350, R351, R352, R353, R354, R355, R356, R357, R358, R359, R360, R361, R362, R363, R364, R365, R366, R367, R368, R369, R370, R371, R372, R373, R374, R375, R376, R377, R378, R379, R380, R381, R382, R383, R384, R385, R386, R387, R388, R389, R390, R391, R392, R393, R394, R395, R396, R397, R398, R399, R400, R401, R402, R403, R404, R405, R406, R407, R408, R409, R410, R411, R412, R413, R414, R415, R416, R417, R418, R419, R420, R421, R422, R423, R424, R425, R426, R427, R428, R429, R430, R431, R432, R433, R434, R435, R436, R437, R438, R439, R440, R441, R442, R443, R444, R445, R446, R447, R448, R449, R450, R451, R452, R453, R454, R455, R456, R457, R458, R459, R460, R461, R462, R463, R464, R465, R466, R467, R468, R469, R470, R471, R472, R473, R474, R475, R476, R477, R478, R479, R480, R481, R482, R483, R484, R485, R486, R487, R488, R489, R490, R491, R492, R493, R494, R495, R496, R497, R498, R499, R500, R501, R502, R503, R504, R505, R506, R507, R508, R509, R510, R511, R512, R513, R514, R515, R516, R517, R518, R519, R520, R521, R522, R523, R524, R525, R526, R527, R528, R529, R530, R531, R532, R533, R534, R535, R536, R537, R538, R539, R540, R541, R542, R543, R544, R545, R546, R547, R548, R549, R550, R551, R552, R553, R554, R555, R556, R557, R558, R559, R560, R561, R562, R563, R564, R565, R566, R567, R568, R569, R570, R571, R572, R573, R574, R575, R576, R577, R578, R579, R580, R581, R582, R583, R584, R585, R586, R587, R588, R589, R590, R591, R592, R593, R594, R595, R596, R597, R598, R599, R600, R601, R602, R603, R604, R605, R606, R607, R608, R609, R610, R611, R612, R613, R614, R615, R616, R617, R618, R619, R620, R621, R622, R623, R624, R625, R626, R627, R628, R629, R630, R631, R632, R633, R634, R635, R636, R637, R638, R639, R640, R641, R642, R643, R644, R645, R646, R647, R648, R649, R650, R651, R652, R653, R654, R655, R656, R657, R658, R659, R660, R661, R662, R663, R664, R665, R666, R667, R668, R669, R670, R671, R672, R673, R674, R675, R676, R677, R678, R679, R680, R681, R682, R683, R684, R685, R686, R687, R688, R689, R690, R691, R692, R693, R694, R695, R696, R697, R698, R699, R700, R701, R702, R703, R704, R705, R706, R707, R708, R709, R710, R711, R712, R713, R714, R715, R716, R717, R718, R719, R720, R721, R722, R723, R724, R725, R726, R727, R728, R729, R730, R731, R732, R733, R734, R735, R736, R737, R738, R739, R740, R741, R742, R743, R744, R745, R746, R747, R748, R749, R750, R751, R752, R753, R754, R755, R756, R757, R758, R759, R760, R761, R762, R763, R764, R765, R766, R767
+]> >, CCAssignToStack<16, 16>
+]>;
+
diff --git a/src/gallium/drivers/radeon/AMDILCodeEmitter.h b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
new file mode 100644
index 00000000000..b0ea1455cf9
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
@@ -0,0 +1,46 @@
+// The LLVM Compiler Infrastructure
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===// +//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===// +//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// + +#ifndef AMDILCODEEMITTER_H +#define AMDILCODEEMITTER_H + +namespace llvm { + + /* XXX: Temp HACK to work around tablegen name generation */ + class AMDILCodeEmitter { + public: + uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; + virtual uint64_t getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const { return 0; } + virtual unsigned GPR4AlignEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual unsigned GPR2AlignEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual uint64_t VOPPostEncode(const MachineInstr &MI, + uint64_t Value) const { + return Value; + } + virtual uint64_t i32LiteralEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + }; + +} // End namespace llvm + +#endif // AMDILCODEEMITTER_H diff --git a/src/gallium/drivers/radeon/AMDILCompilerErrors.h b/src/gallium/drivers/radeon/AMDILCompilerErrors.h new file mode 100644 index 00000000000..7d935f5e782 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILCompilerErrors.h @@ -0,0 +1,75 @@ +//===-- AMDILCompilerErrors.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#ifndef _AMDIL_COMPILER_ERRORS_H_ +#define _AMDIL_COMPILER_ERRORS_H_ +// Compiler errors generated by the backend that will cause +// the runtime to abort compilation. These are mainly for +// device constraint violations or invalid code. 
+namespace amd {
+
+// Error codes. The values run 0..24 and line up, in order, with the entries
+// of CompilerErrorMessage below (code N <-> the message prefixed "ENNN").
+// These are preprocessor macros, so namespace amd does not scope them.
+#define INVALID_COMPUTE 0
+#define GENERIC_ERROR 1
+#define INTERNAL_ERROR 2
+#define MISSING_FUNCTION_CALL 3
+#define RESERVED_FUNCTION 4
+#define BYTE_STORE_ERROR 5
+#define UNKNOWN_TYPE_NAME 6
+#define NO_IMAGE_SUPPORT 7
+#define NO_ATOMIC_32 8
+#define NO_ATOMIC_64 9
+#define IRREDUCIBLE_CF 10
+#define INSUFFICIENT_RESOURCES 11
+#define INSUFFICIENT_LOCAL_RESOURCES 12
+#define INSUFFICIENT_PRIVATE_RESOURCES 13
+#define INSUFFICIENT_IMAGE_RESOURCES 14
+#define DOUBLE_NOT_SUPPORTED 15
+#define INVALID_CONSTANT_WRITE 16
+#define INSUFFICIENT_CONSTANT_RESOURCES 17
+#define INSUFFICIENT_COUNTER_RESOURCES 18
+#define INSUFFICIENT_REGION_RESOURCES 19
+#define REGION_MEMORY_ERROR 20
+#define MEMOP_NO_ALLOCATION 21
+#define RECURSIVE_FUNCTION 22
+#define INCORRECT_COUNTER_USAGE 23
+#define INVALID_INTRINSIC_USAGE 24
+#define NUM_ERROR_MESSAGES 25
+
+
+  // One message per error code above, in code order. The table is declared
+  // static in a header, so every translation unit that includes this file
+  // gets its own copy.
+  static const char *CompilerErrorMessage[NUM_ERROR_MESSAGES] =
+  {
+    "E000:Compute Shader Not Supported! ",
+    "E001:Generic Compiler Error Message! ",
+    "E002:Internal Compiler Error Message!",
+    "E003:Missing Function Call Detected! ",
+    "E004:Reserved Function Call Detected!",
+    "E005:Byte Addressable Stores Invalid!",
+    "E006:Kernel Arg Type Name Is Invalid!",
+    "E007:Image 1.0 Extension Unsupported!",
+    "E008:32bit Atomic Op are Unsupported!",
+    "E009:64bit Atomic Op are Unsupported!",
+    "E010:Irreducible ControlFlow Detected",
+    "E011:Insufficient Resources Detected!",
+    "E012:Insufficient Local Resources! ",
+    "E013:Insufficient Private Resources! ",
+    "E014:Images not currently supported! ",
+    "E015:Double precision not supported! ",
+    "E016:Invalid Constant Memory Write! ",
+    "E017:Max number Constant Ptr reached!",
+    "E018:Max number of Counters reached! ",
+    "E019:Insufficient Region Resources! ",
+    "E020:Region address space invalid! ",
+    "E021:MemOp with no memory allocated! ",
+    "E022:Recursive Function detected! ",
+    "E023:Illegal Inc+Dec to same counter!",
+    "E024:Illegal usage of intrinsic inst!"
+  };
+
+}
+
+#endif // _AMDIL_COMPILER_ERRORS_H_
diff --git a/src/gallium/drivers/radeon/AMDILCompilerWarnings.h b/src/gallium/drivers/radeon/AMDILCompilerWarnings.h
new file mode 100644
index 00000000000..c257980a1e4
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILCompilerWarnings.h
@@ -0,0 +1,31 @@
+//===-- AMDILCompilerWarnings.h - TODO: Add brief description -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDIL_COMPILER_WARNINGS_H_
+#define _AMDIL_COMPILER_WARNINGS_H_
+/// Compiler backend generated warnings that might cause
+/// issues with compilation. These warnings become errors if
+/// -Werror is specified on the command line.
+namespace amd {
+
+// Warning codes. The values run 0..2 and line up, in order, with the entries
+// of CompilerWarningMessage below (code N <-> the message prefixed "WNNN").
+// These are preprocessor macros, so namespace amd does not scope them.
+#define LIMIT_BARRIER 0
+#define BAD_BARRIER_OPT 1
+#define RECOVERABLE_ERROR 2
+#define NUM_WARN_MESSAGES 3
+  /// All warnings must be prefixed with the W token or they might be
+  /// treated as errors.
+  // Declared static in a header, so every translation unit that includes
+  // this file gets its own copy of the table.
+  static const char *CompilerWarningMessage[NUM_WARN_MESSAGES] =
+  {
+    "W000:Barrier caused limited groupsize",
+    "W001:Dangerous Barrier Opt Detected! ",
+    "W002:Recoverable BE Error Detected! "
+
+  };
+}
+
+#endif // _AMDIL_COMPILER_WARNINGS_H_
diff --git a/src/gallium/drivers/radeon/AMDILConversions.td b/src/gallium/drivers/radeon/AMDILConversions.td
new file mode 100644
index 00000000000..0db66ae8475
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILConversions.td
@@ -0,0 +1,1022 @@
+//===-- AMDILConversions.td - TODO: Add brief description -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//==-----------------------------------------------------------------------===// +def actos_i16:Pat < (i16 (anyext GPRI8:$src)), +(IL_ASSHORT_i32 + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))) >; + + +def uctos_i16:Pat < (i16 (zext GPRI8:$src)), +(IL_ASSHORT_i32 + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))) >; + + +def sctos_i16:Pat < (i16 (sext GPRI8:$src)), +(IL_ASSHORT_i32 + (SHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))) >; + + +def actoi_i32:Pat < (i32 (anyext GPRI8:$src)), +(IL_ASINT_i32 + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))) >; + + +def uctoi_i32:Pat < (i32 (zext GPRI8:$src)), +(IL_ASINT_i32 + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))) >; + + +def sctoi_i32:Pat < (i32 (sext GPRI8:$src)), +(IL_ASINT_i32 + (SHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))) >; + + +def actol_i64:Pat < (i64 (anyext GPRI8:$src)), +(LCREATE + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24)), + (LOADCONST_i32 0)) >; + + +def uctol_i64:Pat < (i64 (zext GPRI8:$src)), +(LCREATE + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24)), + (LOADCONST_i32 0)) >; + + +def sctol_i64:Pat < (i64 (sext GPRI8:$src)), +(LCREATE + (SHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24)), + (SHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 31))) >; + + +def astoi_i32:Pat < (i32 (anyext GPRI16:$src)), +(IL_ASINT_i32 + (USHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16))) >; + + +def ustoi_i32:Pat < (i32 (zext GPRI16:$src)), +(IL_ASINT_i32 + (USHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), 
+ (LOADCONST_i32 16))) >; + + +def sstoi_i32:Pat < (i32 (sext GPRI16:$src)), +(IL_ASINT_i32 + (SHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16))) >; + + +def astol_i64:Pat < (i64 (anyext GPRI16:$src)), +(LCREATE + (USHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16)), + (LOADCONST_i32 0)) >; + + +def ustol_i64:Pat < (i64 (zext GPRI16:$src)), +(LCREATE + (USHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16)), + (LOADCONST_i32 0)) >; + + +def sstol_i64:Pat < (i64 (sext GPRI16:$src)), +(LCREATE + (SHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16)), + (SHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 31))) >; + + +def aitol_i64:Pat < (i64 (anyext GPRI32:$src)), +(LCREATE + (USHR_i32 + (SHL_i32 +(IL_ASINT_i32 GPRI32:$src), + (LOADCONST_i32 0)), + (LOADCONST_i32 0)), + (LOADCONST_i32 0)) >; + + +def uitol_i64:Pat < (i64 (zext GPRI32:$src)), +(LCREATE + (USHR_i32 + (SHL_i32 +(IL_ASINT_i32 GPRI32:$src), + (LOADCONST_i32 0)), + (LOADCONST_i32 0)), + (LOADCONST_i32 0)) >; + + +def sitol_i64:Pat < (i64 (sext GPRI32:$src)), +(LCREATE + (SHR_i32 + (SHL_i32 +(IL_ASINT_i32 GPRI32:$src), + (LOADCONST_i32 0)), + (LOADCONST_i32 0)), + (SHR_i32 + (SHL_i32 +(IL_ASINT_i32 GPRI32:$src), + (LOADCONST_i32 0)), + (LOADCONST_i32 31))) >; + + + +def sctof_f32:Pat < (f32 (sint_to_fp GPRI8:$src)), +(f32 + (ITOF + (SHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24)))) >; + + +def uctof_f32:Pat < (f32 (uint_to_fp GPRI8:$src)), +(f32 + (UTOF + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24)))) >; + + +def ftosc_i8:Pat < (i8 (fp_to_sint GPRF32:$src)), +(i8 + (IL_ASCHAR_i32 + (BINARY_AND_i32 +(FTOI GPRF32:$src), + (LOADCONST_i32 0x000000FF)))) >; + + +def ftouc_i8:Pat < (i8 (fp_to_uint GPRF32:$src)), +(i8 + 
(IL_ASCHAR_i32 + (BINARY_AND_i32 +(FTOU GPRF32:$src), + (LOADCONST_i32 0x000000FF)))) >; + + +def sctod_f64:Pat < (f64 (sint_to_fp GPRI8:$src)), +(f64 (FTOD + (ITOF + (SHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))))) >; + + +def uctod_f64:Pat < (f64 (uint_to_fp GPRI8:$src)), +(f64 (FTOD + (UTOF + (USHR_i32 + (SHL_i32 +(IL_ASINT_i8 GPRI8:$src), + (LOADCONST_i32 24)), + (LOADCONST_i32 24))))) >; + + +def dtosc_i8:Pat < (i8 (fp_to_sint GPRF64:$src)), +(i8 + (IL_ASCHAR_i32 + (BINARY_AND_i32 +(FTOI (DTOF GPRF64:$src)), + (LOADCONST_i32 0x000000FF)))) >; + + +def dtouc_i8:Pat < (i8 (fp_to_uint GPRF64:$src)), +(i8 + (IL_ASCHAR_i32 + (BINARY_AND_i32 +(FTOU (DTOF GPRF64:$src)), + (LOADCONST_i32 0x000000FF)))) >; + + +def sstof_f32:Pat < (f32 (sint_to_fp GPRI16:$src)), +(f32 + (ITOF + (SHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16)))) >; + + +def ustof_f32:Pat < (f32 (uint_to_fp GPRI16:$src)), +(f32 + (UTOF + (USHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16)))) >; + + +def ftoss_i16:Pat < (i16 (fp_to_sint GPRF32:$src)), +(i16 + (IL_ASSHORT_i32 + (BINARY_AND_i32 +(FTOI GPRF32:$src), + (LOADCONST_i32 0x0000FFFF)))) >; + + +def ftous_i16:Pat < (i16 (fp_to_uint GPRF32:$src)), +(i16 + (IL_ASSHORT_i32 + (BINARY_AND_i32 +(FTOU GPRF32:$src), + (LOADCONST_i32 0x0000FFFF)))) >; + + +def sstod_f64:Pat < (f64 (sint_to_fp GPRI16:$src)), +(f64 (FTOD + (ITOF + (SHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16))))) >; + + +def ustod_f64:Pat < (f64 (uint_to_fp GPRI16:$src)), +(f64 (FTOD + (UTOF + (USHR_i32 + (SHL_i32 +(IL_ASINT_i16 GPRI16:$src), + (LOADCONST_i32 16)), + (LOADCONST_i32 16))))) >; + + +def dtoss_i16:Pat < (i16 (fp_to_sint GPRF64:$src)), +(i16 + (IL_ASSHORT_i32 + (BINARY_AND_i32 +(FTOI (DTOF GPRF64:$src)), + (LOADCONST_i32 0x0000FFFF)))) >; + + +def dtous_i16:Pat < (i16 (fp_to_uint GPRF64:$src)), +(i16 + 
(IL_ASSHORT_i32 + (BINARY_AND_i32 +(FTOU (DTOF GPRF64:$src)), + (LOADCONST_i32 0x0000FFFF)))) >; + + + + + +def stoc_i8:Pat < (i8 (trunc GPRI16:$src)), +(IL_ASCHAR_i32 + (IL_ASINT_i16 +(BINARY_AND_i16 GPRI16:$src, + (LOADCONST_i16 0x000000FF))) + ) >; + + +def itoc_i8:Pat < (i8 (trunc GPRI32:$src)), +(IL_ASCHAR_i32 + (IL_ASINT_i32 +(BINARY_AND_i32 GPRI32:$src, + (LOADCONST_i32 0x000000FF))) + ) >; + + +def itos_i16:Pat < (i16 (trunc GPRI32:$src)), +(IL_ASSHORT_i32 + (IL_ASINT_i32 +(BINARY_AND_i32 GPRI32:$src, + (LOADCONST_i32 0x0000FFFF))) + ) >; + + +def ltoc_i8:Pat < (i8 (trunc GPRI64:$src)), +(IL_ASCHAR_i32 + (BINARY_AND_i32 +(LLO GPRI64:$src), + (LOADCONST_i32 0x000000FF)) + ) >; + + +def ltos_i16:Pat < (i16 (trunc GPRI64:$src)), +(IL_ASSHORT_i32 + (BINARY_AND_i32 +(LLO GPRI64:$src), + (LOADCONST_i32 0x0000FFFF)) + ) >; + + +def ltoi_i32:Pat < (i32 (trunc GPRI64:$src)), +(IL_ASINT_i32 + (BINARY_AND_i32 +(LLO GPRI64:$src), + (LOADCONST_i32 0xFFFFFFFF)) + ) >; + + +def actos_v2i16:Pat < (v2i16 (anyext GPRV2I8:$src)), +(IL_ASV2SHORT_v2i32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))) >; + + +def uctos_v2i16:Pat < (v2i16 (zext GPRV2I8:$src)), +(IL_ASV2SHORT_v2i32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))) >; + + +def sctos_v2i16:Pat < (v2i16 (sext GPRV2I8:$src)), +(IL_ASV2SHORT_v2i32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))) >; + + +def actoi_v2i32:Pat < (v2i32 (anyext GPRV2I8:$src)), +(IL_ASV2INT_v2i32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))) >; + + +def uctoi_v2i32:Pat < (v2i32 (zext GPRV2I8:$src)), +(IL_ASV2INT_v2i32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 
GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))) >; + + +def sctoi_v2i32:Pat < (v2i32 (sext GPRV2I8:$src)), +(IL_ASV2INT_v2i32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))) >; + + +def actol_v2i64:Pat < (v2i64 (anyext GPRV2I8:$src)), +(LCREATE_v2i64 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 0))) >; + + +def uctol_v2i64:Pat < (v2i64 (zext GPRV2I8:$src)), +(LCREATE_v2i64 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 0))) >; + + +def sctol_v2i64:Pat < (v2i64 (sext GPRV2I8:$src)), +(LCREATE_v2i64 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 31)))) >; + + +def astoi_v2i32:Pat < (v2i32 (anyext GPRV2I16:$src)), +(IL_ASV2INT_v2i32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))) >; + + +def ustoi_v2i32:Pat < (v2i32 (zext GPRV2I16:$src)), +(IL_ASV2INT_v2i32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))) >; + + +def sstoi_v2i32:Pat < (v2i32 (sext GPRV2I16:$src)), +(IL_ASV2INT_v2i32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))) >; + + +def astol_v2i64:Pat < (v2i64 (anyext GPRV2I16:$src)), +(LCREATE_v2i64 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 
GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 0))) >; + + +def ustol_v2i64:Pat < (v2i64 (zext GPRV2I16:$src)), +(LCREATE_v2i64 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 0))) >; + + +def sstol_v2i64:Pat < (v2i64 (sext GPRV2I16:$src)), +(LCREATE_v2i64 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 31)))) >; + + +def aitol_v2i64:Pat < (v2i64 (anyext GPRV2I32:$src)), +(LCREATE_v2i64 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i32 GPRV2I32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (VCREATE_v2i32 (LOADCONST_i32 0))) >; + + +def uitol_v2i64:Pat < (v2i64 (zext GPRV2I32:$src)), +(LCREATE_v2i64 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i32 GPRV2I32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (VCREATE_v2i32 (LOADCONST_i32 0))) >; + + +def sitol_v2i64:Pat < (v2i64 (sext GPRV2I32:$src)), +(LCREATE_v2i64 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i32 GPRV2I32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i32 GPRV2I32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0))), + (VCREATE_v2i32 (LOADCONST_i32 31)))) >; + + + +def sctof_v2f32:Pat < (v2f32 (sint_to_fp GPRV2I8:$src)), +(v2f32 + (ITOF_v2f32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24))))) >; + + +def uctof_v2f32:Pat < (v2f32 (uint_to_fp GPRV2I8:$src)), +(v2f32 + (UTOF_v2f32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 
+(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24))))) >; + + +def ftosc_v2i8:Pat < (v2i8 (fp_to_sint GPRV2F32:$src)), +(v2i8 + (IL_ASV2CHAR_v2i32 + (BINARY_AND_v2i32 +(FTOI_v2i32 GPRV2F32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >; + + +def ftouc_v2i8:Pat < (v2i8 (fp_to_uint GPRV2F32:$src)), +(v2i8 + (IL_ASV2CHAR_v2i32 + (BINARY_AND_v2i32 +(FTOU_v2i32 GPRV2F32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >; + +def sctod_v2f64:Pat < (v2f64 (sint_to_fp GPRV2I8:$src)), +(v2f64 + (VINSERT_v2f64 + (VCREATE_v2f64 + (FTOD + (VEXTRACT_v2f32 + (ITOF_v2f32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))), + 1) + )), + (FTOD + (VEXTRACT_v2f32 + (ITOF_v2f32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))), + 2) + ), 1, 256) + ) >; + +def uctod_v2f64:Pat < (v2f64 (uint_to_fp GPRV2I8:$src)), +(v2f64 + (VINSERT_v2f64 + (VCREATE_v2f64 + (FTOD + (VEXTRACT_v2f32 + (UTOF_v2f32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))), + 1) + )), + (FTOD + (VEXTRACT_v2f32 + (UTOF_v2f32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i8 GPRV2I8:$src), + (VCREATE_v2i32 (LOADCONST_i32 24))), + (VCREATE_v2i32 (LOADCONST_i32 24)))), + 2) + ), 1, 256) + ) >; + + +def dtosc_v2i8:Pat < (v2i8 (fp_to_sint GPRV2F64:$src)), +(v2i8 + (IL_ASV2CHAR_v2i32 + (BINARY_AND_v2i32 +(FTOI_v2i32 (VINSERT_v2f32 + (VCREATE_v2f32 + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))), + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)), + (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >; + + +def dtouc_v2i8:Pat < (v2i8 (fp_to_uint GPRV2F64:$src)), +(v2i8 + (IL_ASV2CHAR_v2i32 + (BINARY_AND_v2i32 +(FTOU_v2i32 (VINSERT_v2f32 + (VCREATE_v2f32 + (DTOF (VEXTRACT_v2f64 
GPRV2F64:$src, 1))), + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)), + (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))))) >; + + +def sstof_v2f32:Pat < (v2f32 (sint_to_fp GPRV2I16:$src)), +(v2f32 + (ITOF_v2f32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16))))) >; + + +def ustof_v2f32:Pat < (v2f32 (uint_to_fp GPRV2I16:$src)), +(v2f32 + (UTOF_v2f32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16))))) >; + + +def ftoss_v2i16:Pat < (v2i16 (fp_to_sint GPRV2F32:$src)), +(v2i16 + (IL_ASV2SHORT_v2i32 + (BINARY_AND_v2i32 +(FTOI_v2i32 GPRV2F32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >; + + +def ftous_v2i16:Pat < (v2i16 (fp_to_uint GPRV2F32:$src)), +(v2i16 + (IL_ASV2SHORT_v2i32 + (BINARY_AND_v2i32 +(FTOU_v2i32 GPRV2F32:$src), + (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >; + + +def sstod_v2f64:Pat < (v2f64 (sint_to_fp GPRV2I16:$src)), +(v2f64 + (VINSERT_v2f64 + (VCREATE_v2f64 + (FTOD + (VEXTRACT_v2f32 + (ITOF_v2f32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))), + 1) + )), + (FTOD + (VEXTRACT_v2f32 + (ITOF_v2f32 + (SHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))), + 2) + ), 1, 256) + ) >; + +def ustod_v2f64:Pat < (v2f64 (uint_to_fp GPRV2I16:$src)), +(v2f64 + (VINSERT_v2f64 + (VCREATE_v2f64 + (FTOD + (VEXTRACT_v2f32 + (UTOF_v2f32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))), + 1) + )), + (FTOD + (VEXTRACT_v2f32 + (UTOF_v2f32 + (USHRVEC_v2i32 + (SHLVEC_v2i32 +(IL_ASV2INT_v2i16 GPRV2I16:$src), + (VCREATE_v2i32 (LOADCONST_i32 16))), + (VCREATE_v2i32 (LOADCONST_i32 16)))), + 2) + ), 1, 256) 
+ ) >; + + +def dtoss_v2i16:Pat < (v2i16 (fp_to_sint GPRV2F64:$src)), +(v2i16 + (IL_ASV2SHORT_v2i32 + (BINARY_AND_v2i32 +(FTOI_v2i32 (VINSERT_v2f32 + (VCREATE_v2f32 + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))), + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)), + (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >; + + +def dtous_v2i16:Pat < (v2i16 (fp_to_uint GPRV2F64:$src)), +(v2i16 + (IL_ASV2SHORT_v2i32 + (BINARY_AND_v2i32 +(FTOU_v2i32 (VINSERT_v2f32 + (VCREATE_v2f32 + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 1))), + (DTOF (VEXTRACT_v2f64 GPRV2F64:$src, 2)), 1, 256)), + (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))))) >; + +def stoc_v2i8:Pat < (v2i8 (trunc GPRV2I16:$src)), +(IL_ASV2CHAR_v2i32 + (IL_ASV2INT_v2i16 +(BINARY_AND_v2i16 GPRV2I16:$src, + (VCREATE_v2i16 (LOADCONST_i16 0x000000FF)))) + ) >; + + +def itoc_v2i8:Pat < (v2i8 (trunc GPRV2I32:$src)), +(IL_ASV2CHAR_v2i32 + (IL_ASV2INT_v2i32 +(BINARY_AND_v2i32 GPRV2I32:$src, + (VCREATE_v2i32 (LOADCONST_i32 0x000000FF)))) + ) >; + + +def itos_v2i16:Pat < (v2i16 (trunc GPRV2I32:$src)), +(IL_ASV2SHORT_v2i32 + (IL_ASV2INT_v2i32 +(BINARY_AND_v2i32 GPRV2I32:$src, + (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF)))) + ) >; + + +def ltoc_v2i8:Pat < (v2i8 (trunc GPRV2I64:$src)), +(IL_ASV2CHAR_v2i32 + (BINARY_AND_v2i32 +(LLO_v2i64 GPRV2I64:$src), + (VCREATE_v2i32 (LOADCONST_i32 0x000000FF))) + ) >; + + +def ltos_v2i16:Pat < (v2i16 (trunc GPRV2I64:$src)), +(IL_ASV2SHORT_v2i32 + (BINARY_AND_v2i32 +(LLO_v2i64 GPRV2I64:$src), + (VCREATE_v2i32 (LOADCONST_i32 0x0000FFFF))) + ) >; + + +def ltoi_v2i32:Pat < (v2i32 (trunc GPRV2I64:$src)), +(IL_ASV2INT_v2i32 + (BINARY_AND_v2i32 +(LLO_v2i64 GPRV2I64:$src), + (VCREATE_v2i32 (LOADCONST_i32 0xFFFFFFFF))) + ) >; + + + + +def actos_v4i16:Pat < (v4i16 (anyext GPRV4I8:$src)), +(IL_ASV4SHORT_v4i32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24)))) >; + + +def uctos_v4i16:Pat < (v4i16 (zext GPRV4I8:$src)), 
+(IL_ASV4SHORT_v4i32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24)))) >; + + +def sctos_v4i16:Pat < (v4i16 (sext GPRV4I8:$src)), +(IL_ASV4SHORT_v4i32 + (SHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24)))) >; + + +def actoi_v4i32:Pat < (v4i32 (anyext GPRV4I8:$src)), +(IL_ASV4INT_v4i32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24)))) >; + + +def uctoi_v4i32:Pat < (v4i32 (zext GPRV4I8:$src)), +(IL_ASV4INT_v4i32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24)))) >; + + +def sctoi_v4i32:Pat < (v4i32 (sext GPRV4I8:$src)), +(IL_ASV4INT_v4i32 + (SHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24)))) >; + + +def astoi_v4i32:Pat < (v4i32 (anyext GPRV4I16:$src)), +(IL_ASV4INT_v4i32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i16 GPRV4I16:$src), + (VCREATE_v4i32 (LOADCONST_i32 16))), + (VCREATE_v4i32 (LOADCONST_i32 16)))) >; + + +def ustoi_v4i32:Pat < (v4i32 (zext GPRV4I16:$src)), +(IL_ASV4INT_v4i32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i16 GPRV4I16:$src), + (VCREATE_v4i32 (LOADCONST_i32 16))), + (VCREATE_v4i32 (LOADCONST_i32 16)))) >; + + +def sstoi_v4i32:Pat < (v4i32 (sext GPRV4I16:$src)), +(IL_ASV4INT_v4i32 + (SHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i16 GPRV4I16:$src), + (VCREATE_v4i32 (LOADCONST_i32 16))), + (VCREATE_v4i32 (LOADCONST_i32 16)))) >; + + + +def sctof_v4f32:Pat < (v4f32 (sint_to_fp GPRV4I8:$src)), +(v4f32 + (ITOF_v4f32 + (SHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24))))) >; + + +def uctof_v4f32:Pat < 
(v4f32 (uint_to_fp GPRV4I8:$src)), +(v4f32 + (UTOF_v4f32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i8 GPRV4I8:$src), + (VCREATE_v4i32 (LOADCONST_i32 24))), + (VCREATE_v4i32 (LOADCONST_i32 24))))) >; + + +def ftosc_v4i8:Pat < (v4i8 (fp_to_sint GPRV4F32:$src)), +(v4i8 + (IL_ASV4CHAR_v4i32 + (BINARY_AND_v4i32 +(FTOI_v4i32 GPRV4F32:$src), + (VCREATE_v4i32 (LOADCONST_i32 0x000000FF))))) >; + + +def ftouc_v4i8:Pat < (v4i8 (fp_to_uint GPRV4F32:$src)), +(v4i8 + (IL_ASV4CHAR_v4i32 + (BINARY_AND_v4i32 +(FTOU_v4i32 GPRV4F32:$src), + (VCREATE_v4i32 (LOADCONST_i32 0x000000FF))))) >; + + +def sstof_v4f32:Pat < (v4f32 (sint_to_fp GPRV4I16:$src)), +(v4f32 + (ITOF_v4f32 + (SHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i16 GPRV4I16:$src), + (VCREATE_v4i32 (LOADCONST_i32 16))), + (VCREATE_v4i32 (LOADCONST_i32 16))))) >; + + +def ustof_v4f32:Pat < (v4f32 (uint_to_fp GPRV4I16:$src)), +(v4f32 + (UTOF_v4f32 + (USHRVEC_v4i32 + (SHLVEC_v4i32 +(IL_ASV4INT_v4i16 GPRV4I16:$src), + (VCREATE_v4i32 (LOADCONST_i32 16))), + (VCREATE_v4i32 (LOADCONST_i32 16))))) >; + + +def ftoss_v4i16:Pat < (v4i16 (fp_to_sint GPRV4F32:$src)), +(v4i16 + (IL_ASV4SHORT_v4i32 + (BINARY_AND_v4i32 +(FTOI_v4i32 GPRV4F32:$src), + (VCREATE_v4i32 (LOADCONST_i32 0x0000FFFF))))) >; + + +def ftous_v4i16:Pat < (v4i16 (fp_to_uint GPRV4F32:$src)), +(v4i16 + (IL_ASV4SHORT_v4i32 + (BINARY_AND_v4i32 +(FTOU_v4i32 GPRV4F32:$src), + (VCREATE_v4i32 (LOADCONST_i32 0x0000FFFF))))) >; + + + + + +def stoc_v4i8:Pat < (v4i8 (trunc GPRV4I16:$src)), +(IL_ASV4CHAR_v4i32 + (IL_ASV4INT_v4i16 +(BINARY_AND_v4i16 GPRV4I16:$src, + (VCREATE_v4i16 (LOADCONST_i16 0x000000FF)))) + ) >; + + +def itoc_v4i8:Pat < (v4i8 (trunc GPRV4I32:$src)), +(IL_ASV4CHAR_v4i32 + (IL_ASV4INT_v4i32 +(BINARY_AND_v4i32 GPRV4I32:$src, + (VCREATE_v4i32 (LOADCONST_i32 0x000000FF)))) + ) >; + + +def itos_v4i16:Pat < (v4i16 (trunc GPRV4I32:$src)), +(IL_ASV4SHORT_v4i32 + (IL_ASV4INT_v4i32 +(BINARY_AND_v4i32 GPRV4I32:$src, + (VCREATE_v4i32 (LOADCONST_i32 0x0000FFFF)))) + ) >; + + 
diff --git a/src/gallium/drivers/radeon/AMDILDevice.cpp b/src/gallium/drivers/radeon/AMDILDevice.cpp new file mode 100644 index 00000000000..aa6d8af7012 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILDevice.cpp @@ -0,0 +1,137 @@ +//===-- AMDILDevice.cpp - Base AMDIL device capability queries -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#include "AMDILDevice.h" +#include "AMDILSubtarget.h" + +using namespace llvm; +// Default implementation for all of the classes: sizes both capability bit vectors to MaxNumberCapabilities, fills them via setCaps() and tags the device with OCL_DEVICE_ALL. +AMDILDevice::AMDILDevice(AMDILSubtarget *ST) : mSTM(ST) +{ + mHWBits.resize(AMDILDeviceInfo::MaxNumberCapabilities); + mSWBits.resize(AMDILDeviceInfo::MaxNumberCapabilities); + setCaps(); // setCaps() is virtual, but a call made from a constructor always runs this class's version. + mDeviceFlag = OCL_DEVICE_ALL; +} + +AMDILDevice::~AMDILDevice() +{ + mHWBits.clear(); + mSWBits.clear(); +} + +size_t AMDILDevice::getMaxGDSSize() const +{ + return 0; +} + +uint32_t +AMDILDevice::getDeviceFlag() const +{ + return mDeviceFlag; +} + +size_t AMDILDevice::getMaxNumCBs() const +{ + if (usesHardware(AMDILDeviceInfo::ConstantMem)) { // constant buffers are only reported when constant memory runs in hardware + return HW_MAX_NUM_CB; + } + + return 0; +} + +size_t AMDILDevice::getMaxCBSize() const +{ + if (usesHardware(AMDILDeviceInfo::ConstantMem)) { + return MAX_CB_SIZE; + } + + return 0; +} + +size_t AMDILDevice::getMaxScratchSize() const +{ + return 65536; +} + +uint32_t AMDILDevice::getStackAlignment() const +{ + return 16; +} + +void AMDILDevice::setCaps() +{ // Marks each base capability as software (mSWBits) or hardware (mHWBits); subtarget overrides add NoInline, MacroDB and BarrierDetect. + mSWBits.set(AMDILDeviceInfo::HalfOps); + mSWBits.set(AMDILDeviceInfo::ByteOps); + mSWBits.set(AMDILDeviceInfo::ShortOps); + mSWBits.set(AMDILDeviceInfo::HW64BitDivMod); + if (mSTM->isOverride(AMDILDeviceInfo::NoInline)) { + mSWBits.set(AMDILDeviceInfo::NoInline); + } + if (mSTM->isOverride(AMDILDeviceInfo::MacroDB)) { + mSWBits.set(AMDILDeviceInfo::MacroDB); + } + if (mSTM->isOverride(AMDILDeviceInfo::Debug)) { // With the Debug override, constant memory is handled in software instead of hardware. + 
mSWBits.set(AMDILDeviceInfo::ConstantMem); + } else { + mHWBits.set(AMDILDeviceInfo::ConstantMem); + } + if (mSTM->isOverride(AMDILDeviceInfo::Debug)) { // Same Debug switch, applied to private memory. + mSWBits.set(AMDILDeviceInfo::PrivateMem); + } else { + mHWBits.set(AMDILDeviceInfo::PrivateMem); + } + if (mSTM->isOverride(AMDILDeviceInfo::BarrierDetect)) { + mSWBits.set(AMDILDeviceInfo::BarrierDetect); + } + mSWBits.set(AMDILDeviceInfo::ByteLDSOps); + mSWBits.set(AMDILDeviceInfo::LongOps); +} + +AMDILDeviceInfo::ExecutionMode +AMDILDevice::getExecutionMode(AMDILDeviceInfo::Caps Caps) const +{ // The hardware bit is checked first, then the software bit; neither bit set means Unsupported. + if (mHWBits[Caps]) { + assert(!mSWBits[Caps] && "Cannot set both SW and HW caps"); + return AMDILDeviceInfo::Hardware; + } + + if (mSWBits[Caps]) { + assert(!mHWBits[Caps] && "Cannot set both SW and HW caps"); // cannot fire: mHWBits[Caps] was already false above + return AMDILDeviceInfo::Software; + } + + return AMDILDeviceInfo::Unsupported; + +} + +bool AMDILDevice::isSupported(AMDILDeviceInfo::Caps Mode) const +{ + return getExecutionMode(Mode) != AMDILDeviceInfo::Unsupported; +} + +bool AMDILDevice::usesHardware(AMDILDeviceInfo::Caps Mode) const +{ + return getExecutionMode(Mode) == AMDILDeviceInfo::Hardware; +} + +bool AMDILDevice::usesSoftware(AMDILDeviceInfo::Caps Mode) const +{ + return getExecutionMode(Mode) == AMDILDeviceInfo::Software; +} + +std::string +AMDILDevice::getDataLayout() const +{ // LLVM data-layout string; the leading "e-p:32:32:32" selects little-endian with 32-bit pointers. + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048" + "-n8:16:32:64"); +} diff --git a/src/gallium/drivers/radeon/AMDILDevice.h b/src/gallium/drivers/radeon/AMDILDevice.h new file mode 100644 index 00000000000..338212101b4 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILDevice.h @@ -0,0 +1,132 @@ +//===---- AMDILDevice.h - Define Device Data for AMDIL -----*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed 
under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface for the subtarget data classes. +// +//===----------------------------------------------------------------------===// +// This file will define the interface that each generation needs to +// implement in order to correctly answer queries on the capabilities of the +// specific hardware. +//===----------------------------------------------------------------------===// +#ifndef _AMDILDEVICEIMPL_H_ +#define _AMDILDEVICEIMPL_H_ +#include "AMDIL.h" +#include "llvm/ADT/BitVector.h" + +namespace llvm { + class AMDILSubtarget; + class AMDILAsmPrinter; + class AMDILIOExpansion; + class AMDILPointerManager; + class AsmPrinter; + class MCStreamer; +//===----------------------------------------------------------------------===// +// Interface for data that is specific to a single device +//===----------------------------------------------------------------------===// +class AMDILDevice { +public: + AMDILDevice(AMDILSubtarget *ST); + virtual ~AMDILDevice(); + + // Enum values for the various memory types. + enum { + RAW_UAV_ID = 0, + ARENA_UAV_ID = 1, + LDS_ID = 2, + GDS_ID = 3, + SCRATCH_ID = 4, + CONSTANT_ID = 5, + GLOBAL_ID = 6, + MAX_IDS = 7 + } IO_TYPE_IDS; // NOTE(review): this syntax declares a data member named IO_TYPE_IDS of the unnamed enum type, not a type name; confirm that is intended. + + // Returns the max LDS size that the hardware supports. Size is in + // bytes. + virtual size_t getMaxLDSSize() const = 0; + + // Returns the max GDS size that the hardware supports if the GDS is + // supported by the hardware. Size is in bytes. + virtual size_t getMaxGDSSize() const; + + // Returns the max number of hardware constant address spaces that + // are supported by this device. + virtual size_t getMaxNumCBs() const; + + // Returns the max number of bytes a single hardware constant buffer + // can support. Size is in bytes. 
+ virtual size_t getMaxCBSize() const; + + // Returns the max number of bytes allowed by the hardware scratch + // buffer. Size is in bytes. + virtual size_t getMaxScratchSize() const; + + // Get the flag that corresponds to the device. + virtual uint32_t getDeviceFlag() const; + + // Returns the number of work-items that exist in a single hardware + // wavefront. + virtual size_t getWavefrontSize() const = 0; + + // Get the generational name of this specific device. + virtual uint32_t getGeneration() const = 0; + + // Get the stack alignment of this specific device. + virtual uint32_t getStackAlignment() const; + + // Get the resource ID for this specific device. + virtual uint32_t getResourceID(uint32_t DeviceID) const = 0; + + // Get the max number of UAV's for this device. + virtual uint32_t getMaxNumUAVs() const = 0; + + // Interface to get the IO Expansion pass for each device. + virtual FunctionPass* + getIOExpansion(TargetMachine& AMDIL_OPT_LEVEL_DECL) const = 0; + + // Interface to get the Asm printer for each device. + virtual AsmPrinter* + getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const = 0; + + // Interface to get the Pointer manager pass for each device. + virtual FunctionPass* + getPointerManager(TargetMachine& AMDIL_OPT_LEVEL_DECL) const = 0; + + + // API utilizing more detailed capabilities of each family of + // cards. If a capability is supported, then either usesHardware or + // usesSoftware returns true. If usesHardware returns true, then + // usesSoftware must return false for the same capability. Hardware + // execution means that the feature is done natively by the hardware + // and is not emulated by the software. Software execution means + // that the feature could be done in the hardware, but there is + // software that emulates it with possibly using the hardware for + // support since the hardware does not fully comply with OpenCL + // specs. 
+ bool isSupported(AMDILDeviceInfo::Caps Mode) const; + bool usesHardware(AMDILDeviceInfo::Caps Mode) const; + bool usesSoftware(AMDILDeviceInfo::Caps Mode) const; + virtual std::string getDataLayout() const; + static const unsigned int MAX_LDS_SIZE_700 = 16384; + static const unsigned int MAX_LDS_SIZE_800 = 32768; + static const unsigned int WavefrontSize = 64; + static const unsigned int HalfWavefrontSize = 32; + static const unsigned int QuarterWavefrontSize = 16; +protected: + virtual void setCaps(); // Fills mHWBits/mSWBits; the base-class constructor calls it. + llvm::BitVector mHWBits; // Capabilities reported as AMDILDeviceInfo::Hardware. + llvm::BitVector mSWBits; // Capabilities reported as AMDILDeviceInfo::Software. + AMDILSubtarget *mSTM; + uint32_t mDeviceFlag; +private: + AMDILDeviceInfo::ExecutionMode + getExecutionMode(AMDILDeviceInfo::Caps Caps) const; +}; // AMDILDevice + +} // namespace llvm +#endif // _AMDILDEVICEIMPL_H_ diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp new file mode 100644 index 00000000000..89b8312c294 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp @@ -0,0 +1,87 @@ +//===-- AMDILDeviceInfo.cpp - Create AMDIL devices by name -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +#include "AMDILDevices.h" +#include "AMDILSubtarget.h" + +using namespace llvm; +namespace llvm { + AMDILDevice* +getDeviceFromName(const std::string &deviceName, AMDILSubtarget *ptr, bool is64bit, bool is64on32bit) +{ // Maps a device name to a newly allocated device object. A name whose third character is '7' selects the 7XX family by its fourth character ('1' -> AMDIL710Device, '7' -> AMDIL770Device, otherwise AMDIL7XXDevice); the remaining names are matched exactly and anything unrecognised falls back to AMDIL7XXDevice. is64bit and is64on32bit are only inspected by the asserts compiled under DEBUG. + if (deviceName.size() > 2 && deviceName.c_str()[2] == '7') { // size check first: a name shorter than two characters used to be indexed past its terminator + switch (deviceName.c_str()[3]) { + case '1': + return new AMDIL710Device(ptr); + case '7': + return new AMDIL770Device(ptr); + default: + return new AMDIL7XXDevice(ptr); + }; + } else if (deviceName == "cypress") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDILCypressDevice(ptr); + } else if (deviceName == "juniper") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDILEvergreenDevice(ptr); + } else if (deviceName == "redwood") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDILRedwoodDevice(ptr); + } else if (deviceName == "cedar") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDILCedarDevice(ptr); + } else if (deviceName == "barts" + || deviceName == "turks") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " 
on 32bit pointers!"); +#endif + return new AMDILCaymanDevice(ptr); + } else if (deviceName == "caicos") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDILNIDevice(ptr); + } else if (deviceName == "SI") { + return new AMDILSIDevice(ptr); + } else { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDIL7XXDevice(ptr); + } +} +} diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.h b/src/gallium/drivers/radeon/AMDILDeviceInfo.h new file mode 100644 index 00000000000..c4acf9145ae --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILDeviceInfo.h @@ -0,0 +1,89 @@ +//===-- AMDILDeviceInfo.h - Device capability and generation enums -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#ifndef _AMDILDEVICEINFO_H_ +#define _AMDILDEVICEINFO_H_ + + +#include <string> + +namespace llvm +{ + class AMDILDevice; + class AMDILSubtarget; + namespace AMDILDeviceInfo + { + // Each capability can be executed using a hardware instruction, + // emulated with a sequence of software instructions, or not + // supported at all. + enum ExecutionMode { + Unsupported = 0, // Unsupported feature on the card(Default value) + Software, // This is the execution mode that is set if the + // feature is emulated in software + Hardware // This execution mode is set if the feature exists + // natively in hardware + }; + + // Any changes to this need to have a corresponding update to the + // twiki page GPUMetadataABI + enum Caps { + HalfOps = 0x1, // Half float is supported or not. 
+ DoubleOps = 0x2, // Double is supported or not. + ByteOps = 0x3, // Byte(char) is supported or not. + ShortOps = 0x4, // Short is supported or not. + LongOps = 0x5, // Long is supported or not. + Images = 0x6, // Images are supported or not. + ByteStores = 0x7, // ByteStores available(!HD4XXX). + ConstantMem = 0x8, // Constant/CB memory. + LocalMem = 0x9, // Local/LDS memory. + PrivateMem = 0xA, // Scratch/Private/Stack memory. + RegionMem = 0xB, // OCL GDS Memory Extension. + FMA = 0xC, // Use HW FMA or SW FMA. + ArenaSegment = 0xD, // Use for Arena UAV per pointer 12-1023. + MultiUAV = 0xE, // Use for UAV per Pointer 0-7. + Reserved0 = 0xF, // ReservedFlag + NoAlias = 0x10, // Cached loads. + Signed24BitOps = 0x11, // Peephole Optimization. + // Debug mode implies that no hardware features or optimizations + // are performed and that all memory accesses go through a single + // uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX). + Debug = 0x12, // Debug mode is enabled. + CachedMem = 0x13, // Cached mem is available or not. + BarrierDetect = 0x14, // Detect duplicate barriers. + Reserved1 = 0x15, // Reserved flag + ByteLDSOps = 0x16, // Flag to specify if byte LDS ops are available. + ArenaVectors = 0x17, // Flag to specify if vector loads from arena work. + TmrReg = 0x18, // Flag to specify if Tmr register is supported. + NoInline = 0x19, // Flag to specify that no inlining should occur. + MacroDB = 0x1A, // Flag to specify that backend handles macrodb. + HW64BitDivMod = 0x1B, // Flag for backend to generate 64bit div/mod. + ArenaUAV = 0x1C, // Flag to specify that arena uav is supported. + PrivateUAV = 0x1D, // Flag to specify that private memory uses uav's. + // If more capabilities are required, then + // this number needs to be increased. + // All capabilities must come before this + // number. + MaxNumberCapabilities = 0x20 + }; + // These have to be in order with the older generations + // having the lower number enumerations. 
+ enum Generation { + HD4XXX = 0, // 7XX based devices. + HD5XXX, // Evergreen based devices. + HD6XXX, // NI/Evergreen+ based devices. + HD7XXX, // NOTE(review): presumably SI based devices; confirm. + HDTEST, // Experimental feature testing device. + HDNUMGEN // Evaluates to the number of generations listed above; keep it last. + }; + + + } // namespace AMDILDeviceInfo + llvm::AMDILDevice* + getDeviceFromName(const std::string &name, llvm::AMDILSubtarget *ptr, bool is64bit = false, bool is64on32bit = false); +} // namespace llvm +#endif // _AMDILDEVICEINFO_H_ diff --git a/src/gallium/drivers/radeon/AMDILDevices.h b/src/gallium/drivers/radeon/AMDILDevices.h new file mode 100644 index 00000000000..3fc5fa05669 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILDevices.h @@ -0,0 +1,19 @@ +//===-- AMDILDevices.h - Umbrella include for the AMDIL device headers -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#ifndef __AMDIL_DEVICES_H_ +#define __AMDIL_DEVICES_H_ +// Include all of the device specific header files +// This file is for Internal use only! +#include "AMDIL7XXDevice.h" +#include "AMDILDevice.h" +#include "AMDILEvergreenDevice.h" +#include "AMDILNIDevice.h" +#include "AMDILSIDevice.h" + +#endif // __AMDIL_DEVICES_H_ diff --git a/src/gallium/drivers/radeon/AMDILEGIOExpansion.cpp b/src/gallium/drivers/radeon/AMDILEGIOExpansion.cpp new file mode 100644 index 00000000000..185fc70a00b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILEGIOExpansion.cpp @@ -0,0 +1,1093 @@ +//===-- AMDILEGIOExpansion.cpp - IO expansion for Evergreen and NI devices -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// @file AMDILEGIOExpansion.cpp +// @details Implementation of IO expansion class for evergreen and NI devices. +// +#include "AMDILCompilerErrors.h" +#include "AMDILCompilerWarnings.h" +#include "AMDILDevices.h" +#include "AMDILGlobalManager.h" +#include "AMDILIOExpansion.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Value.h" + +using namespace llvm; +AMDILEGIOExpansion::AMDILEGIOExpansion(TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) : AMDILImageExpansion(tm AMDIL_OPT_LEVEL_VAR) +{ +} + +AMDILEGIOExpansion::~AMDILEGIOExpansion() { +} +const char *AMDILEGIOExpansion::getPassName() const +{ + return "AMDIL EG/NI IO Expansion Pass"; +} + bool +AMDILEGIOExpansion::isImageIO(MachineInstr *MI) +{ + if (!MI->getOperand(0).isGlobal()) { + return false; + } + const llvm::StringRef& nameRef = MI->getOperand(0).getGlobal()->getName(); + const char *name = nameRef.data(); + if (nameRef.size() > 8 && !strncmp(name, "__amdil_", 8)) { + name += 8; + if (!strncmp(name, "sample_data", 11) + || !strncmp(name, "write_image", 11) + || !strncmp(name, "get_image2d_params", 18) + || !strncmp(name, "get_image3d_params", 18)) { + return true; + } + } + return false; +} +bool +AMDILEGIOExpansion::isIOInstruction(MachineInstr *MI) +{ + if (!MI) { + return false; + } + switch (MI->getOpcode()) { + default: + return AMDILIOExpansion::isIOInstruction(MI); + case AMDIL::IMAGE2D_READ: + case AMDIL::IMAGE2D_READ_UNNORM: + case AMDIL::IMAGE2D_WRITE: + case AMDIL::IMAGE2D_INFO0: + case AMDIL::IMAGE2D_INFO1: + case AMDIL::IMAGE3D_READ: + case AMDIL::IMAGE3D_READ_UNNORM: + 
case AMDIL::IMAGE3D_WRITE: + case AMDIL::IMAGE3D_INFO0: + case AMDIL::IMAGE3D_INFO1: + return true; + }; + return false; +} +void +AMDILEGIOExpansion::expandIOInstruction(MachineInstr *MI) +{ + assert(isIOInstruction(MI) && "Must be an IO instruction to " + "be passed to this function!"); + switch (MI->getOpcode()) { + default: + AMDILIOExpansion::expandIOInstruction(MI); + break; + case AMDIL::IMAGE2D_READ: + case AMDIL::IMAGE3D_READ: + case AMDIL::IMAGE2D_READ_UNNORM: + case AMDIL::IMAGE3D_READ_UNNORM: + expandImageLoad(mBB, MI); + break; + case AMDIL::IMAGE2D_WRITE: + case AMDIL::IMAGE3D_WRITE: + expandImageStore(mBB, MI); + break; + case AMDIL::IMAGE2D_INFO0: + case AMDIL::IMAGE2D_INFO1: + case AMDIL::IMAGE3D_INFO0: + case AMDIL::IMAGE3D_INFO1: + expandImageParam(mBB, MI); + break; + }; +} + bool +AMDILEGIOExpansion::isCacheableOp(MachineInstr *MI) +{ + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + // We only support caching on UAV11 - JeffG + if (curRes.bits.ResourceID == 11) { + return curRes.bits.CacheableRead; + } else { + return false; + } +} + bool +AMDILEGIOExpansion::isArenaOp(MachineInstr *MI) +{ + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + return curRes.bits.ResourceID + == mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID) + || curRes.bits.ResourceID >= ARENA_SEGMENT_RESERVED_UAVS; +} + void +AMDILEGIOExpansion::expandPackedData(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + if (!isPackedData(MI)) { + return; + } + // There is a bug in the CAL compiler that incorrectly + // errors when the UBIT_INSERT instruction is + if (mSTM->calVersion() < CAL_VERSION_SC_137) { + AMDIL789IOExpansion::expandPackedData(MI); + return; + } + DebugLoc DL; + // If we have packed data, then the shift size is no longer + // the same as the load size and we need to adjust accordingly + switch(getPackedID(MI)) { + default: + break; + case PACK_V2I8: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1012) 
+ .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(8)).addImm(mMFI->addi32Literal(8)) + .addReg(AMDIL::R1012).addReg(AMDIL::R1011); + } + break; + case PACK_V4I8: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1012) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), + AMDIL::R1011) + .addImm(mMFI->addi64Literal(8ULL | (8ULL << 32))) + .addImm(mMFI->addi64Literal(8ULL | (8ULL << 32))) + .addReg(AMDIL::R1012).addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1012) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)).addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1012).addReg(AMDIL::R1011); + } + break; + case PACK_V2I16: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI), AMDIL::R1012) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)).addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1012).addReg(AMDIL::R1011); + } + break; + case PACK_V4I16: + { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LHI_v2i64), AMDIL::R1012) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UBIT_INSERT_v2i32), AMDIL::R1011) + .addImm(mMFI->addi64Literal(16ULL | (16ULL << 32))) + .addImm(mMFI->addi64Literal(16ULL | (16ULL << 32))) + .addReg(AMDIL::R1012).addReg(AMDIL::R1011); + } + break; + case UNPACK_V2I8: + case UNPACK_V4I8: + case UNPACK_V2I16: + case UNPACK_V4I16: + AMDIL789IOExpansion::expandPackedData(MI); + break; + }; +} + + void +AMDILEGIOExpansion::expandGlobalLoad(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool usesArena = isArenaOp(MI); + bool cacheable = 
isCacheableOp(MI); + uint32_t ID = getPointerID(MI); + mKM->setOutputInst(); + if (!mMFI->usesMem(AMDILDevice::RAW_UAV_ID) + && !mMFI->usesMem(AMDILDevice::ARENA_UAV_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + // These instructions are generated before the current MI. + expandLoadStartCode(MI); + expandArenaSetup(MI); + DebugLoc DL; + if (getMemorySize(MI) == 1) { + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i8), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(0xFFFFFFFC)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, + (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32)))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IEQ_v4i32), AMDIL::R1012) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(0)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008) + .addReg(AMDIL::R1012) + .addImm(mMFI->addi32Literal(0)) + .addImm(mMFI->addi32Literal(24)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1008) + .addReg(AMDIL::R1012) + .addImm(mMFI->addi32Literal(8)) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1008) + .addReg(AMDIL::R1012) + .addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1008); + if (cacheable) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } + BuildMI(*mBB, I, DL, 
mTII->get(AMDIL::SHR_v4i8), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + } + } else if (getMemorySize(MI) == 2) { + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i16), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(0xFFFFFFFC)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(16)) + .addImm(mMFI->addi32Literal(0)); + if (cacheable) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i16), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + } + } else if (getMemorySize(MI) == 4) { + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + } else { + if (cacheable) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } + } + } else if (getMemorySize(MI) == 8) { + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_Y_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + } else { 
+ BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(2); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + } + } else { + if (cacheable) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_v2i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_v2i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } + } + } else { + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_Y_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_Z_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_W_i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(2); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(3); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(4); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENALOAD_i32), AMDIL::R1006) + .addReg(AMDIL::R1007) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE), 
AMDIL::R1008) + .addReg(AMDIL::R1006) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + } + } else { + if (cacheable) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOADCACHED_v4i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVRAWLOAD_v4i32), + AMDIL::R1011).addReg(AMDIL::R1010).addImm(ID); + } + } + } + // These instructions are generated after the current MI. + expandPackedData(MI); + expandExtendLoad(MI); + BuildMI(*mBB, I, MI->getDebugLoc(), + mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass))) + .addOperand(MI->getOperand(0)) + .addReg(AMDIL::R1011); + MI->getOperand(0).setReg(AMDIL::R1011); +} + + void +AMDILEGIOExpansion::expandRegionLoad(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem); + if (!mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) { + mMFI->addErrorMsg( + amd::CompilerErrorMessage[REGION_MEMORY_ERROR]); + return; + } + if (!HWRegion || !isHardwareRegion(MI)) { + return expandGlobalLoad(MI); + } + if (!mMFI->usesMem(AMDILDevice::GDS_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + DebugLoc DL; + unsigned mulOp = 0; + uint32_t gID = getPointerID(MI); + assert(gID && "Found a GDS load that was incorrectly marked as zero ID!\n"); + if (!gID) { + gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID); + mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]); + } + // These instructions are generated before the current MI. 
+ expandLoadStartCode(MI); + switch (getMemorySize(MI)) { + default: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_Z), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_W), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + break; + case 1: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::RegionMem)) + ? AMDIL::UMUL_i32 : AMDIL::UMUL24_i32; + BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(0xFFFFFFFC)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + // The instruction would normally fit in right here so everything created + // after this point needs to go into the afterInst vector. + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(8)) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1011); + break; + case 2: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::RegionMem)) + ? 
AMDIL::UMUL_i32 : AMDIL::UMUL24_i32; + BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(0xFFFFFFFC)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1011); + break; + case 4: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + break; + case 8: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010) + .addReg(AMDIL::R1010); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi64Literal(1ULL << 32)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSLOAD_Y), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(gID); + break; + }; + + // These instructions are generated after the current MI. 
+ expandPackedData(MI); + expandExtendLoad(MI); + BuildMI(*mBB, I, MI->getDebugLoc(), + mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass))) + .addOperand(MI->getOperand(0)) + .addReg(AMDIL::R1011); + MI->getOperand(0).setReg(AMDIL::R1011); +} + void +AMDILEGIOExpansion::expandLocalLoad(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem); + if (!HWLocal || !isHardwareLocal(MI)) { + return expandGlobalLoad(MI); + } + if (!mMFI->usesMem(AMDILDevice::LDS_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + uint32_t lID = getPointerID(MI); + assert(lID && "Found a LDS load that was incorrectly marked as zero ID!\n"); + if (!lID) { + lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID); + mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]); + } + DebugLoc DL; + unsigned mulOp = 0; + // These instructions are generated before the current MI. + expandLoadStartCode(MI); + switch (getMemorySize(MI)) { + default: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOADVEC_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + break; + case 8: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOADVEC_v2i32), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + break; + case 4: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + break; + case 1: + if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) + ? 
AMDIL::UMUL_i32 : AMDIL::UMUL24_i32; + BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(0xFFFFFFFC)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(8)) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1011); + } else { + if (isSWSExtLoadInst(MI)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_i8), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_u8), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + } + } + break; + case 2: + if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + mulOp = (mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) + ? 
AMDIL::UMUL_i32 : AMDIL::UMUL24_i32; + BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(0xFFFFFFFC)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::IBIT_EXTRACT_i32), AMDIL::R1011) + .addImm(mMFI->addi32Literal(16)) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1011); + } else { + if (isSWSExtLoadInst(MI)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_i16), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::LDSLOAD_u16), AMDIL::R1011) + .addReg(AMDIL::R1010) + .addImm(lID); + } + } + break; + } + + // These instructions are generated after the current MI. + expandPackedData(MI); + expandExtendLoad(MI); + BuildMI(*mBB, I, MI->getDebugLoc(), + mTII->get(getMoveInstFromID( + MI->getDesc().OpInfo[0].RegClass))) + .addOperand(MI->getOperand(0)) + .addReg(AMDIL::R1011); + MI->getOperand(0).setReg(AMDIL::R1011); +} + void +AMDILEGIOExpansion::expandGlobalStore(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool usesArena = isArenaOp(MI); + uint32_t ID = getPointerID(MI); + mKM->setOutputInst(); + if (!mMFI->usesMem(AMDILDevice::RAW_UAV_ID) + && !mMFI->usesMem(AMDILDevice::ARENA_UAV_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + DebugLoc DL; + // These instructions are expanded before the current MI. 
+ expandStoreSetupCode(MI); + expandArenaSetup(MI); + switch (getMemorySize(MI)) { + default: + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_Y_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_Z_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_W_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(2); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1011) + .addImm(2); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(3); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1011) + .addImm(3); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(ID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(4); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1011) + .addImm(4); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(ID); + } + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_v4i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } + break; + case 1: + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + 
.addImm(mMFI->addi32Literal(0xFF)); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i8), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } + break; + case 2: + if (usesArena) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFFFF)); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i16), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } + break; + case 4: + if (usesArena) { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } + break; + case 8: + if (usesArena) { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaVectors)) { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_Y_i32), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(ID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1007) + .addReg(AMDIL::R1010) + .addImm(2); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1011) + .addImm(2); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVARENASTORE_i32), AMDIL::R1007) + .addReg(AMDIL::R1008) + .addImm(ID); + } + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::UAVRAWSTORE_v2i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + 
.addImm(ID); + } + break; + }; +} + void +AMDILEGIOExpansion::expandRegionStore(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool HWRegion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem); + if (!HWRegion || !isHardwareRegion(MI)) { + return expandGlobalStore(MI); + } + mKM->setOutputInst(); + if (!mMFI->usesMem(AMDILDevice::GDS_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + uint32_t gID = getPointerID(MI); + assert(gID && "Found a GDS store that was incorrectly marked as zero ID!\n"); + if (!gID) { + gID = mSTM->device()->getResourceID(AMDILDevice::GDS_ID); + mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]); + } + DebugLoc DL; + unsigned mulOp = HWRegion ? AMDIL::UMUL24_i32 : AMDIL::UMUL24_i32; + // These instructions are expanded before the current MI. + expandStoreSetupCode(MI); + expandArenaSetup(MI); + switch (getMemorySize(MI)) { + default: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi128Literal(1ULL << 32, 2ULL | (3ULL << 32))); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE_Z), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::GDSSTORE_W), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + break; + case 1: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1012) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), 
AMDIL::R1008) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, + (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32)))); + BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1006) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1007) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(0xFFFFFF00)) + .addImm(mMFI->addi32Literal(0x00FFFFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal(0xFF00FFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1012) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal(0xFFFF00FF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1007); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_R_MSKOR), AMDIL::R1010) + .addReg(AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(gID); + break; + case 2: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0x0000FFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1012) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(0x0000FFFF)) + .addImm(mMFI->addi32Literal(0xFFFF0000)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(16)) + .addImm(mMFI->addi32Literal(0)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, 
MI->getDebugLoc(), mTII->get(AMDIL::ATOM_R_MSKOR), AMDIL::R1010) + .addReg(AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(gID); + break; + case 4: + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::GDSSTORE), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + break; + case 8: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010) + .addReg(AMDIL::R1010); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi64Literal(1ULL << 32)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::GDSSTORE), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::GDSSTORE_Y), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(gID); + break; + }; + +} + + void +AMDILEGIOExpansion::expandLocalStore(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + bool HWLocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem); + if (!HWLocal || !isHardwareLocal(MI)) { + return expandGlobalStore(MI); + } + DebugLoc DL; + if (!mMFI->usesMem(AMDILDevice::LDS_ID) + && mKM->isKernel()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[MEMOP_NO_ALLOCATION]); + } + uint32_t lID = getPointerID(MI); + assert(lID && "Found a LDS store that was incorrectly marked as zero ID!\n"); + if (!lID) { + lID = mSTM->device()->getResourceID(AMDILDevice::LDS_ID); + mMFI->addErrorMsg(amd::CompilerWarningMessage[RECOVERABLE_ERROR]); + } + unsigned mulOp = HWLocal ? AMDIL::UMUL24_i32 : AMDIL::UMUL24_i32; + // These instructions are expanded before the current MI. 
+ expandStoreSetupCode(MI); + switch (getMemorySize(MI)) { + default: + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTOREVEC_v4i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + break; + case 8: + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTOREVEC_v2i32), AMDIL::MEM) + .addReg(AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + break; + case 4: + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTORE), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + break; + case 1: + if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1012) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi128Literal(0xFFFFFFFFULL << 32, + (0xFFFFFFFEULL | (0xFFFFFFFDULL << 32)))); + BuildMI(*mBB, I, DL, mTII->get(mulOp), AMDIL::R1006) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(8)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1007) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(0xFFFFFF00)) + .addImm(mMFI->addi32Literal(0x00FFFFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Y_i32), AMDIL::R1007) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal(0xFF00FFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_Z_i32), AMDIL::R1012) + .addReg(AMDIL::R1008) + .addReg(AMDIL::R1007) + .addImm(mMFI->addi32Literal(0xFFFF00FF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1007); + if (mSTM->calVersion() >= CAL_VERSION_SC_137) { + BuildMI(*mBB, I, MI->getDebugLoc(), 
mTII->get(AMDIL::ATOM_L_MSKOR_NORET), + AMDIL::R1010) + .addReg(AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(lID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ATOM_L_ADD_NORET), + AMDIL::R1010) + .addReg(AMDIL::R1012) + .addImm(lID); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_OR_NORET), + AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + } + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTORE_i8), AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + } + break; + case 2: + if (!mSTM->device()->usesHardware(AMDILDeviceInfo::ByteLDSOps)) { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0x0000FFFF)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::BINARY_AND_i32), AMDIL::R1008) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi32Literal(3)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHR_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(1)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1012) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(0x0000FFFF)) + .addImm(mMFI->addi32Literal(0xFFFF0000)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CMOVLOG_i32), AMDIL::R1008) + .addReg(AMDIL::R1008) + .addImm(mMFI->addi32Literal(16)) + .addImm(mMFI->addi32Literal(0)); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::SHL_i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addReg(AMDIL::R1008); + if (mSTM->calVersion() >= CAL_VERSION_SC_137) { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_MSKOR_NORET), + AMDIL::R1010) + .addReg(AMDIL::R1012) + .addReg(AMDIL::R1011) + .addImm(lID); + } else { + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ATOM_L_ADD_NORET), + AMDIL::R1010) + .addReg(AMDIL::R1012) + .addImm(lID); + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::ATOM_L_OR_NORET), + AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + } + } else { + BuildMI(*mBB, I, MI->getDebugLoc(), mTII->get(AMDIL::LDSSTORE_i16), 
AMDIL::R1010) + .addReg(AMDIL::R1011) + .addImm(lID); + } + break; + } +} + + + void +AMDILEGIOExpansion::expandStoreSetupCode(MachineInstr *MI) +{ + AMDIL789IOExpansion::expandStoreSetupCode(MI); +} + void +AMDILEGIOExpansion::expandArenaSetup(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + if (!isArenaOp(MI)) { + return; + } + const MCInstrDesc &TID = (MI->getDesc()); + const MCOperandInfo &TOI = TID.OpInfo[0]; + unsigned short RegClass = TOI.RegClass; + DebugLoc DL; + switch (RegClass) { + case AMDIL::GPRV4I16RegClassID: + case AMDIL::GPRI64RegClassID: + case AMDIL::GPRF64RegClassID: + case AMDIL::GPRV2I32RegClassID: + case AMDIL::GPRV2F32RegClassID: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v2i32), AMDIL::R1010) + .addReg(AMDIL::R1010); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v2i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi64Literal(4ULL << 32)); + break; + default: + BuildMI(*mBB, I, DL, mTII->get(AMDIL::VCREATE_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::ADD_v4i32), AMDIL::R1010) + .addReg(AMDIL::R1010) + .addImm(mMFI->addi128Literal(4ULL << 32, 8ULL | (12ULL << 32))); + break; + case AMDIL::GPRI8RegClassID: + case AMDIL::GPRV2I8RegClassID: + case AMDIL::GPRI16RegClassID: + case AMDIL::GPRV2I16RegClassID: + case AMDIL::GPRV4I8RegClassID: + case AMDIL::GPRI32RegClassID: + case AMDIL::GPRF32RegClassID: + break; + }; +} + diff --git a/src/gallium/drivers/radeon/AMDILELFWriterInfo.cpp b/src/gallium/drivers/radeon/AMDILELFWriterInfo.cpp new file mode 100644 index 00000000000..84ae9a33413 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILELFWriterInfo.cpp @@ -0,0 +1,71 @@ +//===-- AMDILELFWriterInfo.cpp - Elf Writer Info for AMDIL ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements ELF writer information for the AMDIL backend. +// +//===----------------------------------------------------------------------===// + +#include "AMDILELFWriterInfo.h" +#include "AMDIL.h" +#include "llvm/Function.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetELFWriterInfo.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Implementation of the AMDILELFWriterInfo class +//===----------------------------------------------------------------------===// +AMDILELFWriterInfo::AMDILELFWriterInfo(bool is64bit, bool endian) + : TargetELFWriterInfo(is64bit, endian) +{ +} + +AMDILELFWriterInfo::~AMDILELFWriterInfo() { +} + +unsigned AMDILELFWriterInfo::getRelocationType(unsigned MachineRelTy) const { + assert(0 && "What do we do here? Lets assert an analyze"); + return 0; +} + +bool AMDILELFWriterInfo::hasRelocationAddend() const { + assert(0 && "What do we do here? Lets assert an analyze"); + return false; +} + +long int AMDILELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier) const { + assert(0 && "What do we do here? Lets assert an analyze"); + return 0; +} + +unsigned AMDILELFWriterInfo::getRelocationTySize(unsigned RelTy) const { + assert(0 && "What do we do here? Lets assert an analyze"); + return 0; +} + +bool AMDILELFWriterInfo::isPCRelativeRel(unsigned RelTy) const { + assert(0 && "What do we do here? Lets assert an analyze"); + return false; +} + +unsigned AMDILELFWriterInfo::getAbsoluteLabelMachineRelTy() const { + assert(0 && "What do we do here? Lets assert an analyze"); + return 0; +} + +long int AMDILELFWriterInfo::computeRelocation(unsigned SymOffset, + unsigned RelOffset, + unsigned RelTy) const { + assert(0 && "What do we do here?
Lets assert an analyze"); + return 0; +} diff --git a/src/gallium/drivers/radeon/AMDILELFWriterInfo.h b/src/gallium/drivers/radeon/AMDILELFWriterInfo.h new file mode 100644 index 00000000000..0bcffd27f59 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILELFWriterInfo.h @@ -0,0 +1,54 @@ +//===-- AMDILELFWriterInfo.h - Elf Writer Info for AMDIL ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===---------------------------------------------------------------------===// +// +// This file declares the ELF writer information class for the AMDIL backend. +// +//===---------------------------------------------------------------------===// +#ifndef _AMDIL_ELF_WRITER_INFO_H_ +#define _AMDIL_ELF_WRITER_INFO_H_ +#include "llvm/Target/TargetELFWriterInfo.h" + +namespace llvm { + class AMDILELFWriterInfo : public TargetELFWriterInfo { + public: + AMDILELFWriterInfo(bool is64Bit_, bool isLittleEndian_); + virtual ~AMDILELFWriterInfo(); + + /// getRelocationType - Returns the target specific ELF Relocation type. + /// 'MachineRelTy' contains the object code independent relocation type + virtual unsigned getRelocationType(unsigned MachineRelTy) const; + + /// hasRelocationAddend - True if the target uses an addend in the + /// ELF relocation entry. + virtual bool hasRelocationAddend() const; + + /// getDefaultAddendForRelTy - Gets the default addend value for a + /// relocation entry based on the target ELF relocation type.
+ virtual long int getDefaultAddendForRelTy(unsigned RelTy, + long int Modifier = 0) const; + + /// getRelocationTySize - Returns the size of the relocatable field in bits + virtual unsigned getRelocationTySize(unsigned RelTy) const; + + /// isPCRelativeRel - True if the relocation type is pc relative + virtual bool isPCRelativeRel(unsigned RelTy) const; + + /// getAbsoluteLabelMachineRelTy - Returns the machine relocation type used + /// to reference a jumptable. + virtual unsigned getAbsoluteLabelMachineRelTy() const; + + /// computeRelocation - Some relocatable fields could be relocated + /// directly, avoiding the relocation symbol emission, compute the + /// final relocation value for this symbol. + virtual long int computeRelocation(unsigned SymOffset, + unsigned RelOffset, + unsigned RelTy) const; + }; +} // namespace llvm +#endif // _AMDIL_ELF_WRITER_INFO_H_ diff --git a/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td new file mode 100644 index 00000000000..445fd608bbb --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td @@ -0,0 +1,522 @@ +//===-- AMDILEnumeratedTypes.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +//==-----------------------------------------------------------------------===// +// ILEnumeratedTypes.td - The IL Enumerated Types +//===--------------------------------------------------------------------===// + +// Section 5.1 IL Shader +class ILShader<bits<8> val> { + bits<8> Value = val; +} +// Table 5-1 +def IL_SHADER_PIXEL : ILShader<0>; +def IL_SHADER_COMPUTE : ILShader<1>; + +// Section 5.2 IL RegType +class ILRegType<bits<6> val> { + bits<6> Value = val; +} +// Table 5-2 +def IL_REGTYPE_TEMP : ILRegType<0>; +def IL_REGTYPE_WINCOORD : ILRegType<1>; +def IL_REGTYPE_CONST_BUF : ILRegType<2>; +def IL_REGTYPE_LITERAL : ILRegType<3>; +def IL_REGTYPE_ITEMP : ILRegType<4>; +def IL_REGTYPE_GLOBAL : ILRegType<5>; + +// Section 5.3 IL Component Select +class ILComponentSelect<bits<3> val, string text> { + bits<3> Value = val; + string Text = text; +} +// Table 5-3 +def IL_COMPSEL_X : ILComponentSelect<0, "x">; +def IL_COMPSEL_Y : ILComponentSelect<1, "y">; +def IL_COMPSEL_Z : ILComponentSelect<2, "z">; +def IL_COMPSEL_W : ILComponentSelect<3, "w">; +def IL_COMPSEL_0 : ILComponentSelect<4, "0">; +def IL_COMPSEL_1 : ILComponentSelect<5, "1">; + +// Section 5.4 IL Mod Dst Comp +class ILModDstComp<bits<2> val, string text> { + bits<2> Value = val; + string Text = text; +} +// Table 5-4 +def IL_MODCOMP_NOWRITE : ILModDstComp<0, "_">; +def IL_MODCOMP_WRITE_X : ILModDstComp<1, "x">; +def IL_MODCOMP_WRITE_y : ILModDstComp<1, "y">; +def IL_MODCOMP_WRITE_z : ILModDstComp<1, "z">; +def IL_MODCOMP_WRITE_w : ILModDstComp<1, "w">; +def IL_MODCOMP_0 : ILModDstComp<2, "0">; +def IL_MODCOMP_1 : ILModDstComp<3, "1">; + +// Section 5.5 IL Import Usage +class ILImportUsage<bits<1> val, string usage> { + bits<1> Value = val; + string Text = usage; +} +// Table 5-5 +def IL_IMPORTUSAGE_WINCOORD : ILImportUsage<0, "_usage(wincoord)">; + +// Section 5.6 IL Shift Scale +class ILShiftScale<bits<4> val, string scale> { + bits<4> Value = val; + string Text = scale; +} + +// Table 5-6 +def
IL_SHIFT_NONE : ILShiftScale<0, "">; +def IL_SHIFT_X2 : ILShiftScale<1, "_x2">; +def IL_SHIFT_X4 : ILShiftScale<2, "_x4">; +def IL_SHIFT_X8 : ILShiftScale<3, "_x8">; +def IL_SHIFT_D2 : ILShiftScale<4, "_d2">; +def IL_SHIFT_D4 : ILShiftScale<5, "_d4">; +def IL_SHIFT_D8 : ILShiftScale<6, "_d8">; + +// Section 5.7 IL Divide Component +class ILDivComp<bits<3> val, string divcomp> { + bits<3> Value = val; + string Text = divcomp; +} + +// Table 5-7 +def IL_DIVCOMP_NONE : ILDivComp<0, "_divcomp(none)">; +def IL_DIVCOMP_Y : ILDivComp<1, "_divcomp(y)">; +def IL_DIVCOMP_Z : ILDivComp<2, "_divcomp(z)">; +def IL_DIVCOMP_W : ILDivComp<3, "_divcomp(w)">; +//def IL_DIVCOMP_UNKNOWN : ILDivComp<4, "_divcomp(unknown)">; + +// Section 5.8 IL Relational Op +class ILRelOp<bits<3> val, string op> { + bits<3> Value = val; + string Text = op; +} + +// Table 5-8 +def IL_RELOP_EQ : ILRelOp<0, "_relop(eq)">; +def IL_RELOP_NE : ILRelOp<1, "_relop(ne)">; +def IL_RELOP_GT : ILRelOp<2, "_relop(gt)">; +def IL_RELOP_GE : ILRelOp<3, "_relop(ge)">; +def IL_RELOP_LT : ILRelOp<4, "_relop(lt)">; +def IL_RELOP_LE : ILRelOp<5, "_relop(le)">; + +// Section 5.9 IL Zero Op +class ILZeroOp<bits<3> val, string behavior> { + bits<3> Value = val; + string Text = behavior; +} + +// Table 5-9 +def IL_ZEROOP_FLTMAX : ILZeroOp<0, "_zeroop(fltmax)">; +def IL_ZEROOP_0 : ILZeroOp<1, "_zeroop(zero)">; +def IL_ZEROOP_INFINITY : ILZeroOp<2, "_zeroop(infinity)">; +def IL_ZEROOP_INF_ELSE_MAX : ILZeroOp<3, "_zeroop(inf_else_max)">; + +// Section 5.10 IL Cmp Value +class ILCmpValue<bits<3> val, string num> { + bits<3> Value = val; + string Text = num; +} + +// Table 5-10 +def IL_CMPVAL_0_0 : ILCmpValue<0, "0.0">; +def IL_CMPVAL_0_5 : ILCmpValue<1, "0.5">; +def IL_CMPVAL_1_0 : ILCmpValue<2, "1.0">; +def IL_CMPVAL_NEG_0_5 : ILCmpValue<3, "-0.5">; +def IL_CMPVAL_NEG_1_0 : ILCmpValue<4, "-1.0">; + +// Section 5.11 IL Addressing +class ILAddressing<bits<3> val> { + bits<3> Value = val; +} + +// Table 5-11 +def IL_ADDR_ABSOLUTE : 
ILAddressing<0>; +def IL_ADDR_RELATIVE : ILAddressing<1>; +def IL_ADDR_REG_RELATIVE : ILAddressing<2>; + +// Section 5.11 IL Element Format +class ILElementFormat<bits<5> val> { + bits<5> Value = val; +} + +// Table 5-11 +def IL_ELEMENTFORMAT_UNKNOWN : ILElementFormat<0>; +def IL_ELEMENTFORMAT_SNORM : ILElementFormat<1>; +def IL_ELEMENTFORMAT_UNORM : ILElementFormat<2>; +def IL_ELEMENTFORMAT_SINT : ILElementFormat<3>; +def IL_ELEMENTFORMAT_UINT : ILElementFormat<4>; +def IL_ELEMENTFORMAT_FLOAT : ILElementFormat<5>; +def IL_ELEMENTFORMAT_SRGB : ILElementFormat<6>; +def IL_ELEMENTFORMAT_MIXED : ILElementFormat<7>; +def IL_ELEMENTFORMAT_Last : ILElementFormat<8>; + +// Section 5.12 IL Op Code +class ILOpCode<bits<16> val = -1, string cmd> { + bits<16> Value = val; + string Text = cmd; +} + +// Table 5-12 +def IL_DCL_CONST_BUFFER : ILOpCode<0, "dcl_cb">; +def IL_DCL_INDEXED_TEMP_ARRAY : ILOpCode<1, "dcl_index_temp_array">; +def IL_DCL_INPUT : ILOpCode<2, "dcl_input">; +def IL_DCL_LITERAL : ILOpCode<3, "dcl_literal">; +def IL_DCL_OUTPUT : ILOpCode<4, "dcl_output">; +def IL_DCL_RESOURCE : ILOpCode<5, "dcl_resource">; +def IL_OP_ABS : ILOpCode<6, "abs">; +def IL_OP_ADD : ILOpCode<7, "add">; +def IL_OP_AND : ILOpCode<8, "iand">; +def IL_OP_BREAK : ILOpCode<9, "break">; +def IL_OP_BREAK_LOGICALNZ : ILOpCode<10, "break_logicalnz">; +def IL_OP_BREAK_LOGICALZ : ILOpCode<11, "break_logicalz">; +def IL_OP_BREAKC : ILOpCode<12, "breakc">; +def IL_OP_CALL : ILOpCode<13, "call">; +def IL_OP_CALL_LOGICALNZ : ILOpCode<14, "call_logicalnz">; +def IL_OP_CALL_LOGICALZ : ILOpCode<15, "call_logicalz">; +def IL_OP_CASE : ILOpCode<16, "case">; +def IL_OP_CLG : ILOpCode<17, "clg">; +def IL_OP_CMOV : ILOpCode<18, "cmov">; +def IL_OP_CMOV_LOGICAL : ILOpCode<19, "cmov_logical">; +def IL_OP_CMP : ILOpCode<20, "cmp">; +def IL_OP_CONTINUE : ILOpCode<21, "continue">; +def IL_OP_CONTINUE_LOGICALNZ : ILOpCode<22, "continue_logicalnz">; +def IL_OP_CONTINUE_LOGICALZ : ILOpCode<23, "continue_logicalz">; 
+def IL_OP_CONTINUEC : ILOpCode<24, "continuec">; +def IL_OP_COS : ILOpCode<25, "cos">; +def IL_OP_COS_VEC : ILOpCode<26, "cos_vec">; +def IL_OP_D_2_F : ILOpCode<27, "d2f">; +def IL_OP_D_ADD : ILOpCode<28, "dadd">; +def IL_OP_D_EQ : ILOpCode<29, "deq">; +def IL_OP_D_FRC : ILOpCode<30, "dfrac">; +def IL_OP_D_FREXP : ILOpCode<31, "dfrexp">; +def IL_OP_D_GE : ILOpCode<32, "dge">; +def IL_OP_D_LDEXP : ILOpCode<33, "dldexp">; +def IL_OP_D_LT : ILOpCode<34, "dlt">; +def IL_OP_D_MAD : ILOpCode<35, "dmad">; +def IL_OP_D_MUL : ILOpCode<36, "dmul">; +def IL_OP_D_NE : ILOpCode<37, "dne">; +def IL_OP_DEFAULT : ILOpCode<38, "default">; +def IL_OP_DISCARD_LOGICALNZ : ILOpCode<39, "discard_logicalnz">; +def IL_OP_DISCARD_LOGICALZ : ILOpCode<40, "discard_logicalz">; +def IL_OP_DIV : ILOpCode<41, "div_zeroop(infinity)">; +def IL_OP_DP2 : ILOpCode<42, "dp2">; +def IL_OP_DP3 : ILOpCode<43, "dp3">; +def IL_OP_DP4 : ILOpCode<44, "dp4">; +def IL_OP_ELSE : ILOpCode<45, "else">; +def IL_OP_END : ILOpCode<46, "end">; +def IL_OP_ENDFUNC : ILOpCode<47, "endfunc">; +def IL_OP_ENDIF : ILOpCode<48, "endif">; +def IL_OP_ENDLOOP : ILOpCode<49, "endloop">; +def IL_OP_ENDMAIN : ILOpCode<50, "endmain">; +def IL_OP_ENDSWITCH : ILOpCode<51, "endswitch">; +def IL_OP_EQ : ILOpCode<52, "eq">; +def IL_OP_EXP : ILOpCode<53, "exp">; +def IL_OP_EXP_VEC : ILOpCode<54, "exp_vec">; +def IL_OP_F_2_D : ILOpCode<55, "f2d">; +def IL_OP_FLR : ILOpCode<56, "flr">; +def IL_OP_FRC : ILOpCode<57, "frc">; +def IL_OP_FTOI : ILOpCode<58, "ftoi">; +def IL_OP_FTOU : ILOpCode<59, "ftou">; +def IL_OP_FUNC : ILOpCode<60, "func">; +def IL_OP_GE : ILOpCode<61, "ge">; +def IL_OP_I_ADD : ILOpCode<62, "iadd">; +def IL_OP_I_EQ : ILOpCode<63, "ieq">; +def IL_OP_I_GE : ILOpCode<64, "ige">; +def IL_OP_I_LT : ILOpCode<65, "ilt">; +def IL_OP_I_MAD : ILOpCode<66, "imad">; +def IL_OP_I_MAX : ILOpCode<67, "imax">; +def IL_OP_I_MIN : ILOpCode<68, "imin">; +def IL_OP_I_MUL : ILOpCode<69, "imul">; +def IL_OP_I_MUL_HIGH : ILOpCode<70, 
"imul_high">; +def IL_OP_I_NE : ILOpCode<71, "ine">; +def IL_OP_I_NEGATE : ILOpCode<72, "inegate">; +def IL_OP_I_NOT : ILOpCode<73, "inot">; +def IL_OP_I_OR : ILOpCode<74, "ior">; +def IL_OP_I_SHL : ILOpCode<75, "ishl">; +def IL_OP_I_SHR : ILOpCode<76, "ishr">; +def IL_OP_I_XOR : ILOpCode<77, "ixor">; +def IL_OP_IF_LOGICALNZ : ILOpCode<78, "if_logicalnz">; +def IL_OP_IF_LOGICALZ : ILOpCode<79, "if_logicalz">; +def IL_OP_IFC : ILOpCode<80, "ifc">; +def IL_OP_ITOF : ILOpCode<81, "itof">; +def IL_OP_LN : ILOpCode<82, "ln">; +def IL_OP_LOG : ILOpCode<83, "log">; +def IL_OP_LOG_VEC : ILOpCode<84, "log_vec">; +def IL_OP_LOOP : ILOpCode<85, "loop">; +def IL_OP_LT : ILOpCode<86, "lt">; +def IL_OP_MAD : ILOpCode<87, "mad_ieee">; +def IL_OP_MAX : ILOpCode<88, "max_ieee">; +def IL_OP_MIN : ILOpCode<89, "min_ieee">; +def IL_OP_MOD : ILOpCode<90, "mod_ieee">; +def IL_OP_MOV : ILOpCode<91, "mov">; +def IL_OP_MUL_IEEE : ILOpCode<92, "mul_ieee">; +def IL_OP_NE : ILOpCode<93, "ne">; +def IL_OP_NRM : ILOpCode<94, "nrm_nrm4_zeroop(zero)">; +def IL_OP_POW : ILOpCode<95, "pow">; +def IL_OP_RCP : ILOpCode<96, "rcp">; +def IL_OP_RET : ILOpCode<97, "ret">; +def IL_OP_RET_DYN : ILOpCode<98, "ret_dyn">; +def IL_OP_RET_LOGICALNZ : ILOpCode<99, "ret_logicalnz">; +def IL_OP_RET_LOGICALZ : ILOpCode<100, "ret_logicalz">; +def IL_OP_RND : ILOpCode<101, "rnd">; +def IL_OP_ROUND_NEAR : ILOpCode<102, "round_nearest">; +def IL_OP_ROUND_NEG_INF : ILOpCode<103, "round_neginf">; +def IL_OP_ROUND_POS_INF : ILOpCode<104, "round_plusinf">; +def IL_OP_ROUND_ZERO : ILOpCode<105, "round_z">; +def IL_OP_RSQ : ILOpCode<106, "rsq">; +def IL_OP_RSQ_VEC : ILOpCode<107, "rsq_vec">; +def IL_OP_SAMPLE : ILOpCode<108, "sample">; +def IL_OP_SAMPLE_L : ILOpCode<109, "sample_l">; +def IL_OP_SET : ILOpCode<110, "set">; +def IL_OP_SGN : ILOpCode<111, "sgn">; +def IL_OP_SIN : ILOpCode<112, "sin">; +def IL_OP_SIN_VEC : ILOpCode<113, "sin_vec">; +def IL_OP_SUB : ILOpCode<114, "sub">; +def IL_OP_SWITCH : ILOpCode<115, 
"switch">; +def IL_OP_TRC : ILOpCode<116, "trc">; +def IL_OP_U_DIV : ILOpCode<117, "udiv">; +def IL_OP_U_GE : ILOpCode<118, "uge">; +def IL_OP_U_LT : ILOpCode<119, "ult">; +def IL_OP_U_MAD : ILOpCode<120, "umad">; +def IL_OP_U_MAX : ILOpCode<121, "umax">; +def IL_OP_U_MIN : ILOpCode<122, "umin">; +def IL_OP_U_MOD : ILOpCode<123, "umod">; +def IL_OP_U_MUL : ILOpCode<124, "umul">; +def IL_OP_U_MUL_HIGH : ILOpCode<125, "umul_high">; +def IL_OP_U_SHR : ILOpCode<126, "ushr">; +def IL_OP_UTOF : ILOpCode<127, "utof">; +def IL_OP_WHILE : ILOpCode<128, "whileloop">; +// SC IL instructions that are not in CAL IL +def IL_OP_ACOS : ILOpCode<129, "acos">; +def IL_OP_ASIN : ILOpCode<130, "asin">; +def IL_OP_EXN : ILOpCode<131, "exn">; +def IL_OP_UBIT_REVERSE : ILOpCode<132, "ubit_reverse">; +def IL_OP_UBIT_EXTRACT : ILOpCode<133, "ubit_extract">; +def IL_OP_IBIT_EXTRACT : ILOpCode<134, "ibit_extract">; +def IL_OP_SQRT : ILOpCode<135, "sqrt">; +def IL_OP_SQRT_VEC : ILOpCode<136, "sqrt_vec">; +def IL_OP_ATAN : ILOpCode<137, "atan">; +def IL_OP_TAN : ILOpCode<137, "tan">; +def IL_OP_D_DIV : ILOpCode<138, "ddiv">; +def IL_OP_F_NEG : ILOpCode<139, "mov">; +def IL_OP_GT : ILOpCode<140, "gt">; +def IL_OP_LE : ILOpCode<141, "lt">; +def IL_OP_DIST : ILOpCode<142, "dist">; +def IL_OP_LEN : ILOpCode<143, "len">; +def IL_OP_MACRO : ILOpCode<144, "mcall">; +def IL_OP_INTR : ILOpCode<145, "call">; +def IL_OP_I_FFB_HI : ILOpCode<146, "ffb_hi">; +def IL_OP_I_FFB_LO : ILOpCode<147, "ffb_lo">; +def IL_OP_BARRIER : ILOpCode<148, "fence_threads_memory_lds">; +def IL_OP_BARRIER_LOCAL : ILOpCode<149, "fence_threads_lds">; +def IL_OP_BARRIER_GLOBAL : ILOpCode<150, "fence_threads_memory">; +def IL_OP_FENCE : ILOpCode<151, "fence_lds_memory">; +def IL_OP_FENCE_READ_ONLY : ILOpCode<152, "fence_lds_mem_read_only">; +def IL_OP_FENCE_WRITE_ONLY : ILOpCode<153, "fence_lds_mem_write_only">; +def IL_PSEUDO_INST : ILOpCode<154, ";Pseudo Op">; +def IL_OP_UNPACK_0 : ILOpCode<155, "unpack0">; +def IL_OP_UNPACK_1 : 
ILOpCode<156, "unpack1">; +def IL_OP_UNPACK_2 : ILOpCode<157, "unpack2">; +def IL_OP_UNPACK_3 : ILOpCode<158, "unpack3">; +def IL_OP_PI_REDUCE : ILOpCode<159, "pireduce">; +def IL_OP_IBIT_COUNT : ILOpCode<160, "icbits">; +def IL_OP_I_FFB_SGN : ILOpCode<161, "ffb_shi">; +def IL_OP_F2U4 : ILOpCode<162, "f_2_u4">; +def IL_OP_BIT_ALIGN : ILOpCode<163, "bitalign">; +def IL_OP_BYTE_ALIGN : ILOpCode<164, "bytealign">; +def IL_OP_U4_LERP : ILOpCode<165, "u4lerp">; +def IL_OP_SAD : ILOpCode<166, "sad">; +def IL_OP_SAD_HI : ILOpCode<167, "sadhi">; +def IL_OP_SAD4 : ILOpCode<168, "sad4">; +def IL_OP_UBIT_INSERT : ILOpCode<169, "ubit_insert">; +def IL_OP_I_CARRY : ILOpCode<170, "icarry">; +def IL_OP_I_BORROW : ILOpCode<171, "iborrow">; +def IL_OP_U_MAD24 : ILOpCode<172, "umad24">; +def IL_OP_U_MUL24 : ILOpCode<173, "umul24">; +def IL_OP_I_MAD24 : ILOpCode<174, "imad24">; +def IL_OP_I_MUL24 : ILOpCode<175, "imul24">; +def IL_OP_CLAMP : ILOpCode<176, "clamp">; +def IL_OP_LERP : ILOpCode<177, "lrp">; +def IL_OP_FMA : ILOpCode<178, "fma">; +def IL_OP_D_MIN : ILOpCode<179, "dmin">; +def IL_OP_D_MAX : ILOpCode<180, "dmax">; +def IL_OP_D_SQRT : ILOpCode<181, "dsqrt">; +def IL_OP_DP2_ADD : ILOpCode<182, "dp2add">; +def IL_OP_F16_TO_F32 : ILOpCode<183, "f162f">; +def IL_OP_F32_TO_F16 : ILOpCode<184, "f2f16">; +def IL_REG_LOCAL_ID_FLAT : ILOpCode<185, "vTidInGrpFlat">; +def IL_REG_LOCAL_ID : ILOpCode<186, "vTidInGrp">; +def IL_REG_GLOBAL_ID_FLAT : ILOpCode<187, "vAbsTidFlag">; +def IL_REG_GLOBAL_ID : ILOpCode<188, "vAbsTid">; +def IL_REG_GROUP_ID_FLAT : ILOpCode<189, "vThreadGrpIDFlat">; +def IL_REG_GROUP_ID : ILOpCode<190, "vThreadGrpID">; +def IL_OP_D_RCP : ILOpCode<191, "drcp_zeroop(infinity)">; +def IL_OP_D_RSQ : ILOpCode<192, "drsq_zeroop(infinity)">; +def IL_OP_D_MOV : ILOpCode<193, "dmov">; +def IL_OP_D_MOVC : ILOpCode<194, "dmovc">; +def IL_OP_NOP : ILOpCode<195, "nop">; +def IL_OP_UAV_ADD : ILOpCode<196, "uav_add">; +def IL_OP_UAV_AND : ILOpCode<197, "uav_and">; +def 
IL_OP_UAV_MAX : ILOpCode<198, "uav_max">; +def IL_OP_UAV_MIN : ILOpCode<199, "uav_min">; +def IL_OP_UAV_OR : ILOpCode<200, "uav_or">; +def IL_OP_UAV_RSUB : ILOpCode<201, "uav_rsub">; +def IL_OP_UAV_SUB : ILOpCode<202, "uav_sub">; +def IL_OP_UAV_UMAX : ILOpCode<203, "uav_umax">; +def IL_OP_UAV_UMIN : ILOpCode<204, "uav_umin">; +def IL_OP_UAV_XOR : ILOpCode<205, "uav_xor">; +def IL_OP_UAV_INC : ILOpCode<206, "uav_uinc">; +def IL_OP_UAV_DEC : ILOpCode<207, "uav_udec">; +def IL_OP_UAV_CMP : ILOpCode<208, "uav_cmp">; +def IL_OP_UAV_READ_ADD : ILOpCode<209, "uav_read_add">; +def IL_OP_UAV_READ_AND : ILOpCode<210, "uav_read_and">; +def IL_OP_UAV_READ_MAX : ILOpCode<211, "uav_read_max">; +def IL_OP_UAV_READ_MIN : ILOpCode<212, "uav_read_min">; +def IL_OP_UAV_READ_OR : ILOpCode<213, "uav_read_or">; +def IL_OP_UAV_READ_RSUB : ILOpCode<214, "uav_read_rsub">; +def IL_OP_UAV_READ_SUB : ILOpCode<215, "uav_read_sub">; +def IL_OP_UAV_READ_UMAX : ILOpCode<216, "uav_read_umax">; +def IL_OP_UAV_READ_UMIN : ILOpCode<217, "uav_read_umin">; +def IL_OP_UAV_READ_XOR : ILOpCode<218, "uav_read_xor">; +def IL_OP_UAV_READ_INC : ILOpCode<219, "uav_read_uinc">; +def IL_OP_UAV_READ_DEC : ILOpCode<220, "uav_read_udec">; +def IL_OP_UAV_READ_XCHG : ILOpCode<221, "uav_read_xchg">; +def IL_OP_UAV_READ_CMPXCHG : ILOpCode<222, "uav_read_cmp_xchg">; +def IL_OP_LDS_ADD : ILOpCode<223, "lds_add">; +def IL_OP_LDS_AND : ILOpCode<224, "lds_and">; +def IL_OP_LDS_MAX : ILOpCode<225, "lds_max">; +def IL_OP_LDS_MIN : ILOpCode<226, "lds_min">; +def IL_OP_LDS_OR : ILOpCode<227, "lds_or">; +def IL_OP_LDS_RSUB : ILOpCode<228, "lds_rsub">; +def IL_OP_LDS_SUB : ILOpCode<229, "lds_sub">; +def IL_OP_LDS_UMAX : ILOpCode<230, "lds_umax">; +def IL_OP_LDS_UMIN : ILOpCode<231, "lds_umin">; +def IL_OP_LDS_XOR : ILOpCode<232, "lds_xor">; +def IL_OP_LDS_INC : ILOpCode<233, "lds_inc">; +def IL_OP_LDS_DEC : ILOpCode<234, "lds_dec">; +def IL_OP_LDS_CMP : ILOpCode<235, "lds_cmp">; +def IL_OP_LDS_READ_ADD : ILOpCode<236, 
"lds_read_add">; +def IL_OP_LDS_READ_AND : ILOpCode<237, "lds_read_and">; +def IL_OP_LDS_READ_MAX : ILOpCode<238, "lds_read_max">; +def IL_OP_LDS_READ_MIN : ILOpCode<239, "lds_read_min">; +def IL_OP_LDS_READ_OR : ILOpCode<240, "lds_read_or">; +def IL_OP_LDS_READ_RSUB : ILOpCode<241, "lds_read_rsub">; +def IL_OP_LDS_READ_SUB : ILOpCode<242, "lds_read_sub">; +def IL_OP_LDS_READ_UMAX : ILOpCode<243, "lds_read_umax">; +def IL_OP_LDS_READ_UMIN : ILOpCode<244, "lds_read_umin">; +def IL_OP_LDS_READ_XOR : ILOpCode<245, "lds_read_xor">; +def IL_OP_LDS_READ_INC : ILOpCode<246, "lds_read_inc">; +def IL_OP_LDS_READ_DEC : ILOpCode<247, "lds_read_dec">; +def IL_OP_LDS_READ_XCHG : ILOpCode<248, "lds_read_xchg">; +def IL_OP_LDS_READ_CMPXCHG : ILOpCode<249, "lds_read_cmp_xchg">; +def IL_OP_GDS_ADD : ILOpCode<250, "gds_add">; +def IL_OP_GDS_AND : ILOpCode<251, "gds_and">; +def IL_OP_GDS_MAX : ILOpCode<252, "gds_max">; +def IL_OP_GDS_MIN : ILOpCode<253, "gds_min">; +def IL_OP_GDS_OR : ILOpCode<254, "gds_or">; +def IL_OP_GDS_RSUB : ILOpCode<255, "gds_rsub">; +def IL_OP_GDS_SUB : ILOpCode<256, "gds_sub">; +def IL_OP_GDS_UMAX : ILOpCode<257, "gds_umax">; +def IL_OP_GDS_UMIN : ILOpCode<258, "gds_umin">; +def IL_OP_GDS_MSKOR : ILOpCode<259, "gds_mskor">; +def IL_OP_GDS_XOR : ILOpCode<260, "gds_xor">; +def IL_OP_GDS_INC : ILOpCode<261, "gds_inc">; +def IL_OP_GDS_DEC : ILOpCode<262, "gds_dec">; +def IL_OP_GDS_CMP : ILOpCode<263, "gds_cmp">; +def IL_OP_GDS_READ_ADD : ILOpCode<264, "gds_read_add">; +def IL_OP_GDS_READ_AND : ILOpCode<265, "gds_read_and">; +def IL_OP_GDS_READ_MAX : ILOpCode<266, "gds_read_max">; +def IL_OP_GDS_READ_MIN : ILOpCode<267, "gds_read_min">; +def IL_OP_GDS_READ_OR : ILOpCode<268, "gds_read_or">; +def IL_OP_GDS_READ_RSUB : ILOpCode<269, "gds_read_rsub">; +def IL_OP_GDS_READ_SUB : ILOpCode<270, "gds_read_sub">; +def IL_OP_GDS_READ_UMAX : ILOpCode<271, "gds_read_umax">; +def IL_OP_GDS_READ_UMIN : ILOpCode<272, "gds_read_umin">; +def IL_OP_GDS_READ_MSKOR : ILOpCode<273, 
"gds_read_mskor">; +def IL_OP_GDS_READ_XOR : ILOpCode<274, "gds_read_xor">; +def IL_OP_GDS_READ_INC : ILOpCode<275, "gds_read_inc">; +def IL_OP_GDS_READ_DEC : ILOpCode<276, "gds_read_dec">; +def IL_OP_GDS_READ_XCHG : ILOpCode<277, "gds_read_xchg">; +def IL_OP_GDS_READ_CMPXCHG : ILOpCode<278, "gds_read_cmp_xchg">; +def IL_OP_APPEND_BUF_ALLOC : ILOpCode<279, "append_buf_alloc">; +def IL_OP_APPEND_BUF_CONSUME : ILOpCode<280, "append_buf_consume">; +def IL_OP_I64_ADD : ILOpCode<281, "i64add">; +def IL_OP_I64_MAX : ILOpCode<282, "i64max">; +def IL_OP_U64_MAX : ILOpCode<283, "u64max">; +def IL_OP_I64_MIN : ILOpCode<284, "i64min">; +def IL_OP_U64_MIN : ILOpCode<285, "u64min">; +def IL_OP_I64_NEGATE : ILOpCode<286, "i64negate">; +def IL_OP_I64_SHL : ILOpCode<287, "i64shl">; +def IL_OP_I64_SHR : ILOpCode<288, "i64shr">; +def IL_OP_U64_SHR : ILOpCode<289, "u64shr">; +def IL_OP_I64_EQ : ILOpCode<290, "i64eq">; +def IL_OP_I64_GE : ILOpCode<291, "i64ge">; +def IL_OP_U64_GE : ILOpCode<292, "u64ge">; +def IL_OP_I64_LT : ILOpCode<293, "i64lt">; +def IL_OP_U64_LT : ILOpCode<294, "u64lt">; +def IL_OP_I64_NE : ILOpCode<295, "i64ne">; +def IL_OP_U_MULHI24 : ILOpCode<296, "umul24_high">; +def IL_OP_I_MULHI24 : ILOpCode<297, "imul24_high">; +def IL_OP_GDS_LOAD : ILOpCode<298, "gds_load">; +def IL_OP_GDS_STORE : ILOpCode<299, "gds_store">; +def IL_OP_LDS_LOAD : ILOpCode<300, "lds_load">; +def IL_OP_LDS_LOAD_VEC : ILOpCode<301, "lds_load_vec">; +def IL_OP_LDS_LOAD_BYTE : ILOpCode<302, "lds_load_byte">; +def IL_OP_LDS_LOAD_UBYTE : ILOpCode<303, "lds_load_ubyte">; +def IL_OP_LDS_LOAD_SHORT : ILOpCode<304, "lds_load_short">; +def IL_OP_LDS_LOAD_USHORT : ILOpCode<305, "lds_load_ushort">; +def IL_OP_LDS_STORE : ILOpCode<306, "lds_store">; +def IL_OP_LDS_STORE_VEC : ILOpCode<307, "lds_store_vec">; +def IL_OP_LDS_STORE_BYTE : ILOpCode<308, "lds_store_byte">; +def IL_OP_LDS_STORE_SHORT : ILOpCode<309, "lds_store_short">; +def IL_OP_RAW_UAV_LOAD : ILOpCode<310, "uav_raw_load">; +def 
IL_OP_RAW_UAV_STORE : ILOpCode<311, "uav_raw_store">; +def IL_OP_ARENA_UAV_LOAD : ILOpCode<312, "uav_arena_load">; +def IL_OP_ARENA_UAV_STORE : ILOpCode<313, "uav_arena_store">; +def IL_OP_LDS_MSKOR : ILOpCode<314, "lds_mskor">; +def IL_OP_LDS_READ_MSKOR : ILOpCode<315, "lds_read_mskor">; +def IL_OP_UAV_BYTE_LOAD : ILOpCode<316, "uav_byte_load">; +def IL_OP_UAV_UBYTE_LOAD : ILOpCode<317, "uav_ubyte_load">; +def IL_OP_UAV_SHORT_LOAD : ILOpCode<318, "uav_short_load">; +def IL_OP_UAV_USHORT_LOAD : ILOpCode<319, "uav_ushort_load">; +def IL_OP_UAV_BYTE_STORE : ILOpCode<320, "uav_byte_store">; +def IL_OP_UAV_SHORT_STORE : ILOpCode<320, "uav_short_store">; +def IL_OP_UAV_STORE : ILOpCode<321, "uav_store">; +def IL_OP_UAV_LOAD : ILOpCode<322, "uav_load">; +def IL_OP_MUL : ILOpCode<323, "mul">; +def IL_OP_DIV_INF : ILOpCode<324, "div_zeroop(infinity)">; +def IL_OP_DIV_FLTMAX : ILOpCode<325, "div_zeroop(fltmax)">; +def IL_OP_DIV_ZERO : ILOpCode<326, "div_zeroop(zero)">; +def IL_OP_DIV_INFELSEMAX : ILOpCode<327, "div_zeroop(inf_else_max)">; +def IL_OP_FTOI_FLR : ILOpCode<328, "ftoi_flr">; +def IL_OP_FTOI_RPI : ILOpCode<329, "ftoi_rpi">; +def IL_OP_F32_TO_F16_NEAR : ILOpCode<330, "f2f16_near">; +def IL_OP_F32_TO_F16_NEG_INF : ILOpCode<331, "f2f16_neg_inf">; +def IL_OP_F32_TO_F16_PLUS_INF : ILOpCode<332, "f2f16_plus_inf">; +def IL_OP_I64_MUL : ILOpCode<333, "i64mul">; +def IL_OP_U64_MUL : ILOpCode<334, "u64mul">; +def IL_OP_CU_ID : ILOpCode<355, "cu_id">; +def IL_OP_WAVE_ID : ILOpCode<356, "wave_id">; +def IL_OP_I64_SUB : ILOpCode<357, "i64sub">; +def IL_OP_I64_DIV : ILOpCode<358, "i64div">; +def IL_OP_U64_DIV : ILOpCode<359, "u64div">; +def IL_OP_I64_MOD : ILOpCode<360, "i64mod">; +def IL_OP_U64_MOD : ILOpCode<361, "u64mod">; +def IL_DCL_GWS_THREAD_COUNT : ILOpCode<362, "dcl_gws_thread_count">; +def IL_DCL_SEMAPHORE : ILOpCode<363, "dcl_semaphore">; +def IL_OP_SEMAPHORE_INIT : ILOpCode<364, "init_semaphore">; +def IL_OP_SEMAPHORE_WAIT : ILOpCode<365, "semaphore_wait">; +def 
IL_OP_SEMAPHORE_SIGNAL : ILOpCode<366, "semaphore_signal">; +def IL_OP_BARRIER_REGION : ILOpCode<377, "fence_threads_gds">; +def IL_OP_BFI : ILOpCode<394, "bfi">; +def IL_OP_BFM : ILOpCode<395, "bfm">; +def IL_DBG_STRING : ILOpCode<396, "dbg_string">; +def IL_DBG_LINE : ILOpCode<397, "dbg_line">; +def IL_DBG_TEMPLOC : ILOpCode<398, "dbg_temploc">; diff --git a/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp new file mode 100644 index 00000000000..1af28063da6 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp @@ -0,0 +1,211 @@ +//===-- AMDILEvergreenDevice.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#include "AMDILEvergreenDevice.h" +#ifdef UPSTREAM_LLVM +#include "AMDILEGAsmPrinter.h" +#endif +#include "AMDILIOExpansion.h" +#include "AMDILPointerManager.h" + +using namespace llvm; + +AMDILEvergreenDevice::AMDILEvergreenDevice(AMDILSubtarget *ST) +: AMDILDevice(ST) { + setCaps(); + std::string name = ST->getDeviceName(); + if (name == "cedar") { + mDeviceFlag = OCL_DEVICE_CEDAR; + } else if (name == "redwood") { + mDeviceFlag = OCL_DEVICE_REDWOOD; + } else if (name == "cypress") { + mDeviceFlag = OCL_DEVICE_CYPRESS; + } else { + mDeviceFlag = OCL_DEVICE_JUNIPER; + } +} + +AMDILEvergreenDevice::~AMDILEvergreenDevice() { +} + +size_t AMDILEvergreenDevice::getMaxLDSSize() const { + if (usesHardware(AMDILDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +size_t AMDILEvergreenDevice::getMaxGDSSize() const { + if (usesHardware(AMDILDeviceInfo::RegionMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +uint32_t AMDILEvergreenDevice::getMaxNumUAVs() const { + return 12; +} + +uint32_t 
AMDILEvergreenDevice::getResourceID(uint32_t id) const { + switch(id) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case CONSTANT_ID: + case RAW_UAV_ID: + if (mSTM->calVersion() >= CAL_VERSION_GLOBAL_RETURN_BUFFER) { + return GLOBAL_RETURN_RAW_UAV_ID; + } else { + return DEFAULT_RAW_UAV_ID; + } + case GLOBAL_ID: + case ARENA_UAV_ID: + return DEFAULT_ARENA_UAV_ID; + case LDS_ID: + if (usesHardware(AMDILDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case GDS_ID: + if (usesHardware(AMDILDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case SCRATCH_ID: + if (usesHardware(AMDILDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + }; + return 0; +} + +size_t AMDILEvergreenDevice::getWavefrontSize() const { + return AMDILDevice::WavefrontSize; +} + +uint32_t AMDILEvergreenDevice::getGeneration() const { + return AMDILDeviceInfo::HD5XXX; +} + +void AMDILEvergreenDevice::setCaps() { + mSWBits.set(AMDILDeviceInfo::ArenaSegment); + mHWBits.set(AMDILDeviceInfo::ArenaUAV); + if (mSTM->calVersion() >= CAL_VERSION_SC_140) { + mHWBits.set(AMDILDeviceInfo::HW64BitDivMod); + mSWBits.reset(AMDILDeviceInfo::HW64BitDivMod); + } + mSWBits.set(AMDILDeviceInfo::Signed24BitOps); + if (mSTM->isOverride(AMDILDeviceInfo::ByteStores)) { + mHWBits.set(AMDILDeviceInfo::ByteStores); + } + if (mSTM->isOverride(AMDILDeviceInfo::Debug)) { + mSWBits.set(AMDILDeviceInfo::LocalMem); + mSWBits.set(AMDILDeviceInfo::RegionMem); + } else { + mHWBits.set(AMDILDeviceInfo::LocalMem); + mHWBits.set(AMDILDeviceInfo::RegionMem); + } + mHWBits.set(AMDILDeviceInfo::Images); + if (mSTM->isOverride(AMDILDeviceInfo::NoAlias)) { + mHWBits.set(AMDILDeviceInfo::NoAlias); + } + if (mSTM->calVersion() > CAL_VERSION_GLOBAL_RETURN_BUFFER) { + mHWBits.set(AMDILDeviceInfo::CachedMem); + } + if (mSTM->isOverride(AMDILDeviceInfo::MultiUAV)) { + 
mHWBits.set(AMDILDeviceInfo::MultiUAV); + } + if (mSTM->calVersion() > CAL_VERSION_SC_136) { + mHWBits.set(AMDILDeviceInfo::ByteLDSOps); + mSWBits.reset(AMDILDeviceInfo::ByteLDSOps); + mHWBits.set(AMDILDeviceInfo::ArenaVectors); + } else { + mSWBits.set(AMDILDeviceInfo::ArenaVectors); + } + if (mSTM->calVersion() > CAL_VERSION_SC_137) { + mHWBits.set(AMDILDeviceInfo::LongOps); + mSWBits.reset(AMDILDeviceInfo::LongOps); + } + mHWBits.set(AMDILDeviceInfo::TmrReg); +} +FunctionPass* +AMDILEvergreenDevice::getIOExpansion( + TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const +{ + return new AMDILEGIOExpansion(TM AMDIL_OPT_LEVEL_VAR); +} + +AsmPrinter* +AMDILEvergreenDevice::getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const +{ +#ifdef UPSTREAM_LLVM + return new AMDILEGAsmPrinter(TM, Streamer); +#else + return NULL; +#endif +} + +FunctionPass* +AMDILEvergreenDevice::getPointerManager( + TargetMachine& TM AMDIL_OPT_LEVEL_DECL) const +{ + return new AMDILEGPointerManager(TM AMDIL_OPT_LEVEL_VAR); +} + +AMDILCypressDevice::AMDILCypressDevice(AMDILSubtarget *ST) + : AMDILEvergreenDevice(ST) { + setCaps(); +} + +AMDILCypressDevice::~AMDILCypressDevice() { +} + +void AMDILCypressDevice::setCaps() { + if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) { + mHWBits.set(AMDILDeviceInfo::DoubleOps); + mHWBits.set(AMDILDeviceInfo::FMA); + } +} + + +AMDILCedarDevice::AMDILCedarDevice(AMDILSubtarget *ST) + : AMDILEvergreenDevice(ST) { + setCaps(); +} + +AMDILCedarDevice::~AMDILCedarDevice() { +} + +void AMDILCedarDevice::setCaps() { + mSWBits.set(AMDILDeviceInfo::FMA); +} + +size_t AMDILCedarDevice::getWavefrontSize() const { + return AMDILDevice::QuarterWavefrontSize; +} + +AMDILRedwoodDevice::AMDILRedwoodDevice(AMDILSubtarget *ST) + : AMDILEvergreenDevice(ST) { + setCaps(); +} + +AMDILRedwoodDevice::~AMDILRedwoodDevice() +{ +} + +void AMDILRedwoodDevice::setCaps() { + mSWBits.set(AMDILDeviceInfo::FMA); +} + +size_t AMDILRedwoodDevice::getWavefrontSize() const { + return 
AMDILDevice::HalfWavefrontSize; +} diff --git a/src/gallium/drivers/radeon/AMDILEvergreenDevice.h b/src/gallium/drivers/radeon/AMDILEvergreenDevice.h new file mode 100644 index 00000000000..726b479c7ea --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILEvergreenDevice.h @@ -0,0 +1,93 @@ +//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface for the subtarget data classes. +// +//===----------------------------------------------------------------------===// +// This file will define the interface that each generation needs to +// implement in order to correctly answer queries on the capabilities of the +// specific hardware. +//===----------------------------------------------------------------------===// +#ifndef _AMDILEVERGREENDEVICE_H_ +#define _AMDILEVERGREENDEVICE_H_ +#include "AMDILDevice.h" +#include "AMDILSubtarget.h" + +namespace llvm { + class AMDILSubtarget; +//===----------------------------------------------------------------------===// +// Evergreen generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + + +// The AMDILEvergreenDevice is the base device class for all of the Evergreen +// series of cards. This class contains information required to differentiate +// the Evergreen device from the generic AMDILDevice. This device represents +// the capabilities of the 'Juniper' cards, also known as the HD57XX.
+class AMDILEvergreenDevice : public AMDILDevice {
+public:
+  AMDILEvergreenDevice(AMDILSubtarget *ST);
+  virtual ~AMDILEvergreenDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getMaxGDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getMaxNumUAVs() const;
+  virtual uint32_t getResourceID(uint32_t) const;
+  // The three factory methods below hand back newly allocated objects.
+  virtual FunctionPass*
+    getIOExpansion(TargetMachine& AMDIL_OPT_LEVEL_DECL) const;
+  virtual AsmPrinter*
+    getAsmPrinter(TargetMachine& TM, MCStreamer &Streamer) const;
+  virtual FunctionPass*
+    getPointerManager(TargetMachine& AMDIL_OPT_LEVEL_DECL) const;
+protected:
+  // Sets the hardware/software capability bits for this device.
+  virtual void setCaps();
+}; // AMDILEvergreenDevice
+
+// The AMDILCypressDevice is similar to the AMDILEvergreenDevice, except it has
+// support for double precision operations. This device is used to represent
+// both the Cypress and Hemlock cards, which are commercially known as HD58XX
+// and HD59XX cards.
+class AMDILCypressDevice : public AMDILEvergreenDevice {
+public:
+  AMDILCypressDevice(AMDILSubtarget *ST);
+  virtual ~AMDILCypressDevice();
+private:
+  virtual void setCaps();
+}; // AMDILCypressDevice
+
+
+// The AMDILCedarDevice is the class that represents all of the 'Cedar' based
+// devices. This class differs from the base AMDILEvergreenDevice in that the
+// device is a ~quarter of the 'Juniper'. These are commercially known as the
+// HD54XX and HD53XX series of cards.
+class AMDILCedarDevice : public AMDILEvergreenDevice {
+public:
+  AMDILCedarDevice(AMDILSubtarget *ST);
+  virtual ~AMDILCedarDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDILCedarDevice
+
+// The AMDILRedwoodDevice is the class that represents all of the 'Redwood'
+// based devices. This class differs from the base class, in that these devices
+// are considered about half of a 'Juniper' device. These are commercially
+// known as the HD55XX and HD56XX series of cards.
+class AMDILRedwoodDevice : public AMDILEvergreenDevice {
+public:
+  AMDILRedwoodDevice(AMDILSubtarget *ST);
+  virtual ~AMDILRedwoodDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+}; // AMDILRedwoodDevice
+
+} // namespace llvm
+#endif // _AMDILEVERGREENDEVICE_H_
diff --git a/src/gallium/drivers/radeon/AMDILFormats.td b/src/gallium/drivers/radeon/AMDILFormats.td
new file mode 100644
index 00000000000..99489e7e92c
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILFormats.td
@@ -0,0 +1,450 @@
+//==- AMDILFormats.td - AMDIL Instruction Formats ----*- tablegen -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+//===--------------------------------------------------------------------===//
+include "AMDILTokenDesc.td"
+
+//===--------------------------------------------------------------------===//
+// The parent IL instruction class that inherits the Instruction class. This
+// class sets the corresponding namespace, the output and input dag lists, the
+// pattern to match, and the string to print out for the assembly printer.
+// Every instruction built from it is marked isPseudo, and a newline is
+// appended to the supplied assembly string.
+//===--------------------------------------------------------------------===//
+class ILFormat<ILOpCode op, dag outs, dag ins, string asmstr, list<dag> pattern>
+: Instruction {
+
+  let Namespace = "AMDIL";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  ILOpCode operation = op;
+  let Pattern = pattern;
+  let AsmString = !strconcat(asmstr, "\n");
+  let isPseudo = 1;
+  bit hasIEEEFlag = 0;
+  bit hasZeroOpFlag = 0;
+}
+
+//===--------------------------------------------------------------------===//
+// The base class for vector insert instructions.  It is a single dest, quad
+// source instruction where the last two source operands must be 32bit
+// immediate values that encode the swizzle of the source register.
+// The src2 and src3 operands must also be the inverse of each other such
+// that if src2 is 0x1000300(x0z0), src3 must be 0x20004(0y0w). The values
+// are encoded as 32bit integer with each 8 char representing a swizzle value.
+// The encoding is as follows for 32bit register types:
+// 0x00 -> '_'
+// 0x01 -> 'x'
+// 0x02 -> 'y'
+// 0x03 -> 'z'
+// 0x04 -> 'w'
+// 0x05 -> 'x'
+// 0x06 -> 'y'
+// 0x07 -> 'z'
+// 0x08 -> 'w'
+// 0x09 -> '0'
+// The encoding is as follows for 64bit register types:
+// 0x00 -> "__"
+// 0x01 -> "xy"
+// 0x02 -> "zw"
+// 0x03 -> "xy"
+// 0x04 -> "zw"
+// 0x05 -> "00"
+// Note that the assembly string prints only $dst, $src0 and $src1; the two
+// swizzle immediates appear in the operand list and pattern only.
+//===--------------------------------------------------------------------===//
+class InsertVectorClass<ILOpCode op, RegisterClass DReg, RegisterClass SReg,
+      SDNode OpNode, string asmstr> :
+    ILFormat<op, (outs DReg:$dst),
+      (ins DReg:$src0, SReg:$src1, i32imm:$src2, i32imm:$src3),
+      !strconcat(asmstr, " $dst, $src0, $src1"),
+      [(set DReg:$dst, (OpNode DReg:$src0, SReg:$src1,
+        timm:$src2, timm:$src3))]>;
+
+//===--------------------------------------------------------------------===//
+// Class that has one input parameter and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0" and
+// handles the unary math operators.
+// It sets the binary token ILSrc, ILSrcMod, ILRelAddr and ILSrc and ILSrcMod
+// if the addressing is register relative for input and output register 0.
+//===--------------------------------------------------------------------===//
+class OneInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ILFormat<op, outs, ins, asmstr, pattern>
+{
+  // Destination register tokens.
+  ILDst dst_reg;
+  ILDstMod dst_mod;
+  ILRelAddr dst_rel;
+  ILSrc dst_reg_rel;
+  ILSrcMod dst_reg_rel_mod;
+  // Source register 0 tokens.
+  ILSrc src0_reg;
+  ILSrcMod src0_mod;
+  ILRelAddr src0_rel;
+  ILSrc src0_reg_rel;
+  ILSrcMod src0_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// A simplified version of OneInOneOut class where the pattern is standard
+// and does not need special cases. This requires that the pattern has
+// a SDNode and takes a source and destination register that is of type
+// RegisterClass. This is the standard unary op class.
+// The assembly string is op.Text followed by " $dst, $src".
+//===--------------------------------------------------------------------===//
+class UnaryOp<ILOpCode op, SDNode OpNode,
+      RegisterClass dRegs, RegisterClass sRegs>
+      : OneInOneOut<op, (outs dRegs:$dst), (ins sRegs:$src),
+      !strconcat(op.Text, " $dst, $src"),
+      [(set dRegs:$dst, (OpNode sRegs:$src))]>;
+
+//===--------------------------------------------------------------------===//
+// This class is similar to the UnaryOp class, however, there is no
+// result value to assign. It derives directly from ILFormat and therefore
+// carries only the source register 0 tokens, no destination tokens.
+//===--------------------------------------------------------------------===//
+class UnaryOpNoRet<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ILFormat<op, outs, ins, asmstr, pattern>
+{
+  ILSrc src0_reg;
+  ILSrcMod src0_mod;
+  ILRelAddr src0_rel;
+  ILSrc src0_reg_rel;
+  ILSrcMod src0_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have two input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1" and
+// handles the binary math operators and comparison operations.
+// It sets the binary token ILSrc, ILSrcMod, ILRelAddr and ILSrc and ILSrcMod
+// if the addressing is register relative for input register 1.
+//===--------------------------------------------------------------------===//
+class TwoInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : OneInOneOut<op, outs, ins, asmstr, pattern>
+{
+  ILSrc src1_reg;
+  ILSrcMod src1_mod;
+  ILRelAddr src1_rel;
+  ILSrc src1_reg_rel;
+  ILSrcMod src1_reg_rel_mod;
+}
+//===--------------------------------------------------------------------===//
+// A simplification of the TwoInOneOut pattern for Binary Operations.
+// This class is a helper class that assumes the simple pattern of
+// $dst = op $src0 $src1.
+// Other types of matching patterns need to use the TwoInOneOut class.
+//===--------------------------------------------------------------------===//
+class BinaryOp<ILOpCode op, SDNode OpNode, RegisterClass dReg,
+      RegisterClass sReg0, RegisterClass sReg1>
+      : TwoInOneOut<op, (outs dReg:$dst), (ins sReg0:$src0, sReg1:$src1),
+      !strconcat(op.Text, " $dst, $src0, $src1"),
+      [(set dReg:$dst, (OpNode sReg0:$src0, sReg1:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// The base class for vector extract instructions. The vector extract
+// instructions take as an input value a source register and a 32bit integer
+// with the same encoding as specified in InsertVectorClass and produces
+// a result with only the swizzled component in the destination register.
+// The instruction is emitted as a "mov"; the swizzle immediate ($src1) is
+// part of the operand list and pattern but not of the assembly string.
+//===--------------------------------------------------------------------===//
+class ExtractVectorClass<RegisterClass DReg, RegisterClass SReg, SDNode OpNode>
+: TwoInOneOut<IL_OP_MOV, (outs DReg:$dst), (ins SReg:$src0, i32imm:$src1),
+      "mov $dst, $src0",
+      [(set DReg:$dst, (OpNode SReg:$src0, timm:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// The base class for vector concatenation. This class creates either a vec2
+// or a vec4 of 32bit data types or a vec2 of 64bit data types. This is done
+// by swizzling either the 'x' or 'xy' components of the source operands
+// into the destination register.
+// The instruction is emitted as an "iadd" of the two source operands.
+//===--------------------------------------------------------------------===//
+class VectorConcatClass<RegisterClass Dst, RegisterClass Src, SDNode OpNode>
+      : TwoInOneOut<IL_OP_I_ADD, (outs Dst:$dst), (ins Src:$src0, Src:$src1),
+      "iadd $dst, $src0, $src1",
+      [(set Dst:$dst, (OpNode Src:$src0, Src:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// Similar to the UnaryOpNoRet class, but takes as arguments two input
+// operands. Used mainly for barrier instructions on PC platform.
+//===--------------------------------------------------------------------===//
+class BinaryOpNoRet<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : UnaryOpNoRet<op, outs, ins, asmstr, pattern>
+{
+  ILSrc src1_reg;
+  ILSrcMod src1_mod;
+  ILRelAddr src1_rel;
+  ILSrc src1_reg_rel;
+  ILSrcMod src1_reg_rel_mod;
+}
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have three input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1, Src2" and
+// handles the mad and conditional mov instruction.
+// It sets the binary token ILSrc, ILSrcMod, ILRelAddr and ILSrc and ILSrcMod
+// if the addressing is register relative.
+// This class is the parent class of TernaryOp
+//===--------------------------------------------------------------------===//
+class ThreeInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : TwoInOneOut<op, outs, ins, asmstr, pattern> {
+  ILSrc src2_reg;
+  ILSrcMod src2_mod;
+  ILRelAddr src2_rel;
+  ILSrc src2_reg_rel;
+  ILSrcMod src2_reg_rel_mod;
+  }
+
+//===--------------------------------------------------------------------===//
+// The g version of the Three Input pattern uses a standard pattern
+// but allows specification of the register to further generalize the class.
+// This class is mainly used in the generic multiclasses in AMDILMultiClass.td
+//===--------------------------------------------------------------------===//
+class TernaryOp<ILOpCode op, SDNode OpNode,
+      RegisterClass dReg,
+      RegisterClass sReg0,
+      RegisterClass sReg1,
+      RegisterClass sReg2>
+      : ThreeInOneOut<op, (outs dReg:$dst),
+      (ins sReg0:$src0, sReg1:$src1, sReg2:$src2),
+      !strconcat(op.Text, " $dst, $src0, $src1, $src2"),
+      [(set dReg:$dst,
+        (OpNode sReg0:$src0, sReg1:$src1, sReg2:$src2))]>;
+
+//===--------------------------------------------------------------------===//
+// Set of classes that have four input parameters and one output parameter.
+// The basic pattern for this class is "Opcode Dst, Src0, Src1, Src2, Src3".
+// It sets the binary token ILSrc, ILSrcMod, ILRelAddr and ILSrc and ILSrcMod
+// if the addressing is register relative for input register 3.
+// This class extends ThreeInOneOut with the tokens for the fourth source.
+//===--------------------------------------------------------------------===//
+class FourInOneOut<ILOpCode op, dag outs, dag ins,
+      string asmstr, list<dag> pattern>
+      : ThreeInOneOut<op, outs, ins, asmstr, pattern> {
+  ILSrc src3_reg;
+  ILSrcMod src3_mod;
+  ILRelAddr src3_rel;
+  ILSrc src3_reg_rel;
+  ILSrcMod src3_reg_rel_mod;
+  }
+
+
+//===--------------------------------------------------------------------===//
+// The macro class that is an extension of OneInOneOut but is tailored for
+// macros only where all the register types are the same
+//===--------------------------------------------------------------------===//
+class UnaryMacro<RegisterClass Dst, RegisterClass Src0, SDNode OpNode>
+: OneInOneOut<IL_OP_MACRO, (outs Dst:$dst),
+      (ins Src0:$src0),
+      "($dst),($src0)",
+      [(set Dst:$dst, (OpNode Src0:$src0))]>;
+
+//===--------------------------------------------------------------------===//
+// The macro class is an extension of TwoInOneOut but is tailored for
+// macros only where all the register types are the same
+//===--------------------------------------------------------------------===//
+class BinaryMacro<RegisterClass Dst,
+      RegisterClass Src0,
+      RegisterClass Src1,
+      SDNode OpNode>
+      : TwoInOneOut<IL_OP_MACRO, (outs Dst:$dst),
+      (ins Src0: $src0, Src1:$src1),
+      "($dst),($src0, $src1)",
+      [(set Dst:$dst, (OpNode Src0:$src0, Src1:$src1))]>;
+
+//===--------------------------------------------------------------------===//
+// Classes for dealing with atomic instructions w/ 32bit pointers
+// Each class builds its assembly string as op.Text + idType + operand list.
+//===--------------------------------------------------------------------===//
+class Append<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class AppendNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR:$id))]>;
+
+// Atomic taking a 32bit memory operand plus an immediate $id; the result is
+// written to $dst and the assembly prints "$dst, $ptr".
+class UniAtom<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+// Same operands as UniAtom, but $dst is left out of the assembly string.
+class UniAtomNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI32:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, timm:$id))]>;
+
+// Atomic taking a 32bit memory operand, one register source and an
+// immediate $id.
+class BinAtom<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class BinAtomNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI32:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, timm:$id))]>;
+
+// Atomic taking a 32bit memory operand, two register sources and an
+// immediate $id; sources are printed in declaration order ($src, $src1).
+class TriAtom<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// Same operand list and pattern as TriAtom, but the assembly string prints
+// the two sources in the opposite order ($src1 before $src).
+class CmpXChg<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class TriAtomNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+// Prints the two sources in the opposite order ($src1 before $src).
+class CmpXChgNoRet<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI32:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+
+//===--------------------------------------------------------------------===//
+// Classes for dealing with atomic instructions w/ 64bit pointers
+// These mirror the 32bit-pointer classes, using MEMI64 operands and ADDR64
+// in the patterns; the result register is still GPRI32.
+//===--------------------------------------------------------------------===//
+class Append64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR64:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class AppendNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst")),
+      [(set GPRI32:$dst, (intr ADDR64:$id))]>;
+
+class UniAtom64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class UniAtomNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI64:$ptr, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, timm:$id))]>;
+
+// 64bit-pointer atomic with one register source and an immediate $id.
+class BinAtom64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, timm:$id))]>;
+
+
+// TODO: Need to get this working without dst...
+class BinAtomNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst), (ins MEMI64:$ptr, GPRI32:$src, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, timm:$id))]>;
+
+// 64bit-pointer atomic with two register sources, printed in declaration
+// order ($src, $src1).
+class TriAtom64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// Same operand list and pattern as TriAtom64, but the assembly string prints
+// the two sources in the opposite order ($src1 before $src).
+class CmpXChg64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $dst, $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class TriAtomNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src, $src1")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+// TODO: Need to get this working without dst...
+class CmpXChgNoRet64<ILOpCode op, string idType, SDNode intr>
+      : ILFormat<op, (outs GPRI32:$dst),
+      (ins MEMI64:$ptr, GPRI32:$src, GPRI32:$src1, i32imm:$id),
+      !strconcat(op.Text, !strconcat(idType," $ptr, $src1, $src")),
+      [(set GPRI32:$dst, (intr ADDR64:$ptr, GPRI32:$src, GPRI32:$src1, timm:$id))]>;
+
+//===--------------------------------------------------------------------===//
+// Intrinsic classes
+// Generic versions of the above classes but for Target specific intrinsics
+// instead of SDNode patterns.
+// Each class is Intrinsic<[result types], [parameter types], [properties]>.
+//===--------------------------------------------------------------------===//
+let TargetPrefix = "AMDIL", isTarget = 1 in {
+  // No-argument intrinsics returning i64 / i32.
+  class VoidIntLong :
+    Intrinsic<[llvm_i64_ty], [], []>;
+  class VoidIntInt :
+    Intrinsic<[llvm_i32_ty], [], []>;
+  // Declared with exactly the same signature as VoidIntInt.
+  class VoidIntBool :
+    Intrinsic<[llvm_i32_ty], [], []>;
+  // Overloaded intrinsics whose operands all match the result type.
+  class UnaryIntInt :
+    Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], []>;
+  class UnaryIntFloat :
+    Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], []>;
+  // Conversions between any float type and any integer type.
+  class ConvertIntFTOI :
+    Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], []>;
+  class ConvertIntITOF :
+    Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], []>;
+  class UnaryIntNoRetInt :
+    Intrinsic<[], [llvm_anyint_ty], []>;
+  class UnaryIntNoRetFloat :
+    Intrinsic<[], [llvm_anyfloat_ty], []>;
+  class BinaryIntInt :
+    Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], []>;
+  class BinaryIntFloat :
+    Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], []>;
+  class BinaryIntNoRetInt :
+    Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>;
+  class BinaryIntNoRetFloat :
+    Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>;
+  class TernaryIntInt :
+    Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+      LLVMMatchType<0>, LLVMMatchType<0>], []>;
+  class TernaryIntFloat :
+    Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>,
+      LLVMMatchType<0>, LLVMMatchType<0>], []>;
+  class QuaternaryIntInt :
+    Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>,
+      LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], []>;
+  // Atomic intrinsics: a pointer followed by i32 operands.
+  class UnaryAtomicInt :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+  class BinaryAtomicInt :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+  // NOTE(review): this is the only atomic class here declared without
+  // [IntrReadWriteArgMem]; confirm whether the omission is intentional.
+  class TernaryAtomicInt :
+    Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>;
+  class UnaryAtomicIntNoRet :
+    Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+  class BinaryAtomicIntNoRet :
+    Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+  class TernaryAtomicIntNoRet :
+    Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
diff --git a/src/gallium/drivers/radeon/AMDILFrameLowering.cpp b/src/gallium/drivers/radeon/AMDILFrameLowering.cpp
new file mode 100644
index 00000000000..87eca87e301
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILFrameLowering.cpp
@@ -0,0 +1,53 @@
+//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILFrameLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+using namespace llvm;
+// Forwards all four arguments unchanged to the TargetFrameLowering base.
+AMDILFrameLowering::AMDILFrameLowering(StackDirection D, unsigned StackAl,
+    int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl)
+{
+}
+
+AMDILFrameLowering::~AMDILFrameLowering()
+{
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AMDILFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                          int FI) const {
+  // The offset is taken directly from the function's MachineFrameInfo.
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getObjectOffset(FI);
+}
+
+// Reports that there are no callee-saved spill slots: NumEntries is set to 0
+// and a null pointer is returned.
+const TargetFrameLowering::SpillSlot *
+AMDILFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const
+{
+  NumEntries = 0;
+  return 0;
+}
+// Intentionally empty: no prologue code is emitted.
+void
+AMDILFrameLowering::emitPrologue(MachineFunction &MF) const
+{
+}
+// Intentionally empty: no epilogue code is emitted.
+void
+AMDILFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{
+}
+// Always reports that the function has no frame pointer.
+bool
+AMDILFrameLowering::hasFP(const MachineFunction &MF) const
+{
+  return false;
+}
diff --git a/src/gallium/drivers/radeon/AMDILFrameLowering.h b/src/gallium/drivers/radeon/AMDILFrameLowering.h
new file mode 100644
index 00000000000..b1d919ef524
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILFrameLowering.h
@@ -0,0 +1,46 @@
+//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to describe a layout of a stack frame on a AMDIL target machine
+//
+//===----------------------------------------------------------------------===//
+#ifndef _AMDILFRAME_LOWERING_H_
+#define _AMDILFRAME_LOWERING_H_
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+/// Information about the stack frame layout on the AMDIL targets. It holds
+/// the direction of the stack growth, the known stack alignment on entry to
+/// each function, and the offset to the locals area.
+/// See TargetFrameLowering for more comments.
+
+namespace llvm {
+  class AMDILFrameLowering : public TargetFrameLowering {
+    public:
+      // TransAl defaults to 1; all arguments are passed to the base class.
+      AMDILFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned
+          TransAl = 1);
+      virtual ~AMDILFrameLowering();
+      virtual int getFrameIndexOffset(const MachineFunction &MF,
+          int FI) const;
+      virtual const SpillSlot *
+        getCalleeSavedSpillSlots(unsigned &NumEntries) const;
+      virtual void emitPrologue(MachineFunction &MF) const;
+      virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+      virtual bool hasFP(const MachineFunction &MF) const;
+  }; // class AMDILFrameLowering
+} // namespace llvm
+#endif // _AMDILFRAME_LOWERING_H_
diff --git a/src/gallium/drivers/radeon/AMDILGlobalManager.cpp b/src/gallium/drivers/radeon/AMDILGlobalManager.cpp
new file mode 100644
index 00000000000..eafd36eaa4e
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILGlobalManager.cpp
@@ -0,0 +1,1353 @@
+//===-- AMDILGlobalManager.cpp - Module global/constant bookkeeping -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILGlobalManager.h"
+#include "AMDILDevices.h"
+#include "AMDILKernelManager.h"
+#include "AMDILSubtarget.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/FormattedStream.h"
+
+#include <cstdio>
+#include <cstring>
+
+using namespace llvm;
+
+// Puts the manager into a known empty state. mSTM is cleared as well so that
+// print()/dump() never stream an indeterminate pointer when they are called
+// before processModule() has assigned the subtarget.
+AMDILGlobalManager::AMDILGlobalManager(bool debugMode) {
+  mOffset = 0;
+  mReservedBuffs = 0;
+  symTab = NULL;
+  mSTM = NULL;
+  mCurrentCPOffset = 0;
+  mDebugMode = debugMode;
+}
+
+AMDILGlobalManager::~AMDILGlobalManager() {
+}
+
+// Writes a human-readable dump of the manager state to O. Does nothing unless
+// the manager was constructed with debugMode set.
+// NOTE(review): symTab is NULL until processModule() runs; confirm that
+// streaming it here is safe for its declared type.
+void AMDILGlobalManager::print(llvm::raw_ostream &O) {
+  if (!mDebugMode) {
+    return;
+  }
+  O << ";AMDIL Global Manager State Dump:\n";
+  O << ";\tSubtarget: " << mSTM << "\tSymbol Table: " << symTab
+    << "\n";
+  O << ";\tConstant Offset: " << mOffset << "\tCP Offset: "
+    << mCurrentCPOffset << "\tReserved Buffers: " << mReservedBuffs
+    << "\n";
+  if (!mImageNameMap.empty()) {
+    llvm::DenseMap<uint32_t, llvm::StringRef>::iterator imb, ime;
+    O << ";\tGlobal Image Mapping: \n";
+    for (imb = mImageNameMap.begin(), ime = mImageNameMap.end(); imb != ime;
+        ++imb) {
+      O << ";\t\tImage ID: " << imb->first << "\tName: "
+        << imb->second << "\n";
+    }
+  }
+  std::set<llvm::StringRef>::iterator sb, se;
+  if (!mByteStore.empty()) {
+    O << ";Byte Store Kernels: \n";
+    for (sb = mByteStore.begin(), se = mByteStore.end(); sb != se; ++sb) {
+      O << ";\t\t" << *sb << "\n";
+    }
+  }
+  if (!mIgnoreStr.empty()) {
+    O << ";\tIgnored Data Strings: \n";
+    for (sb = mIgnoreStr.begin(), se = mIgnoreStr.end(); sb != se; ++sb) {
+      O << ";\t\t" << *sb << "\n";
+    }
+  }
+}
+
+// Convenience wrapper: print() to the error stream.
+void AMDILGlobalManager::dump() {
+  print(errs());
+}
+
+// Linear search of krnl.constPtr for the entry whose name equals arg.
+// Returns a pointer to that entry, or NULL when no entry matches.
+static const constPtr *getConstPtr(const kernel &krnl, const std::string &arg) {
+  llvm::SmallVector<constPtr, DEFAULT_VEC_SLOTS>::const_iterator begin, end;
+  for (begin = krnl.constPtr.begin(), end = krnl.constPtr.end();
+      begin != end; ++begin) {
+    // Compare by value and length; the stored name's data() is not
+    // guaranteed to be NUL-terminated, so it must not be handed to strcmp.
+    if (begin->name == arg) {
+      return &(*begin);
+    }
+  }
+  return NULL;
+}
+#if 0
+static bool structContainsSub32bitType(const StructType *ST) {
+  StructType::element_iterator eib, eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    Type *ptr = *eib;
+    uint32_t size = (uint32_t)GET_SCALAR_SIZE(ptr);
+    if (!size) {
+      if (const StructType *ST = dyn_cast<StructType>(ptr)) {
+        if (structContainsSub32bitType(ST)) {
+          return true;
+        }
+      }
+    } else if (size < 32) {
+      return true;
+    }
+  }
+  return false;
+}
+#endif
+
+// Walks every global in M and dispatches on the global's name (and, for
+// unnamed cases, its address space and use list) to the matching parse
+// routine. Records the subtarget from mTM before doing so.
+// NOTE(review): the name is matched through GVName.data() with
+// strncmp/strstr, which assumes the name storage is NUL-terminated.
+void AMDILGlobalManager::processModule(const Module &M,
+    const AMDILTargetMachine *mTM)
+{
+  Module::const_global_iterator GI;
+  Module::const_global_iterator GE;
+  symTab = "NoSymTab";
+  mSTM = mTM->getSubtargetImpl();
+  for (GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) {
+    const GlobalValue *GV = GI;
+    if (mDebugMode) {
+      GV->dump();
+      errs() << "\n";
+    }
+    llvm::StringRef GVName = GV->getName();
+    const char *name = GVName.data();
+    if (!strncmp(name, "sgv", 3)) {
+      mKernelArgs[GVName] = parseSGV(GV);
+    } else if (!strncmp(name, "fgv", 3)) {
+      // we can ignore this since we don't care about the filename
+      // string
+    } else if (!strncmp(name, "lvgv", 4)) {
+      mLocalArgs[GVName] = parseLVGV(GV);
+    } else if (!strncmp(name, "llvm.image.annotations", 22)) {
+      if (strstr(name, "__OpenCL")
+          && strstr(name, "_kernel")) {
+        // we only want to parse the image information if the
+        // image is a kernel, we might have to parse out the
+        // information if a function is found that is not
+        // inlined.
+        parseImageAnnotate(GV);
+      }
+    } else if (!strncmp(name, "llvm.global.annotations", 23)) {
+      parseGlobalAnnotate(GV);
+    } else if (!strncmp(name, "llvm.constpointer.annotations", 29)) {
+      if (strstr(name, "__OpenCL")
+          && strstr(name, "_kernel")) {
+        // we only want to parse constant pointer information
+        // if it is a kernel
+        parseConstantPtrAnnotate(GV);
+      }
+    } else if (!strncmp(name, "llvm.readonlypointer.annotations", 32)) {
+      // These are skipped as we handle them later in AMDILPointerManager.cpp
+    } else if (GV->getType()->getAddressSpace() == 3) { // *** Match cl_kernel.h local AS #
+      parseAutoArray(GV, false);
+    } else if (strstr(name, "clregion")) {
+      parseAutoArray(GV, true);
+    } else if (!GV->use_empty()
+        && mIgnoreStr.find(GVName) == mIgnoreStr.end()) {
+      parseConstantPtr(GV);
+    }
+  }
+  allocateGlobalCB();
+
+  // Run checkConstPtrsUseHW on every function of the module.
+  safeForEach(M.begin(), M.end(),
+      std::bind1st(
+        std::mem_fun(&AMDILGlobalManager::checkConstPtrsUseHW),
+        this));
+}
+
+// Assigns a constant buffer number and byte offset to every entry of
+// mConstMems. When the device reports hardware constant memory and the entry
+// fits in one buffer, entries are packed into consecutive hardware buffers
+// (numbered from CB_BASE_OFFSET); everything else is placed in buffer 0 at a
+// running software offset. Offsets advance in multiples of 16 bytes.
+// Afterwards mReservedBuffs is set to curCB + 1 if any constants exist.
+void AMDILGlobalManager::allocateGlobalCB(void) {
+  uint32_t maxCBSize = mSTM->device()->getMaxCBSize();
+  uint32_t offset = 0;
+  uint32_t curCB = 0;
+  uint32_t swoffset = 0;
+  for (StringMap<constPtr>::iterator cpb = mConstMems.begin(),
+      cpe = mConstMems.end(); cpb != cpe; ++cpb) {
+    bool constHW = mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+    cpb->second.usesHardware = false;
+    if (constHW) {
+      // If we have a limit on the max CB Size, then we need to make sure that
+      // the constant sizes fall within the limits.
+      if (cpb->second.size <= maxCBSize) {
+        // Start a fresh buffer when this entry would overflow the current one.
+        if (offset + cpb->second.size > maxCBSize) {
+          offset = 0;
+          curCB++;
+        }
+        if (curCB < mSTM->device()->getMaxNumCBs()) {
+          cpb->second.cbNum = curCB + CB_BASE_OFFSET;
+          cpb->second.offset = offset;
+          // Round the consumed size up to the next multiple of 16.
+          offset += (cpb->second.size + 15) & (~15);
+          cpb->second.usesHardware = true;
+          continue;
+        }
+      }
+    }
+    // Software fallback: too large, out of hardware buffers, or no hardware
+    // constant memory.
+    cpb->second.cbNum = 0;
+    cpb->second.offset = swoffset;
+    swoffset += (cpb->second.size + 15) & (~15);
+  }
+  // NOTE(review): curCB + 1 is recorded even when every entry took the
+  // software path, and curCB is not clamped to getMaxNumCBs(); confirm the
+  // later "getMaxNumCBs() - mReservedBuffs" subtraction cannot underflow.
+  if (!mConstMems.empty()) {
+    mReservedBuffs = curCB + 1;
+  }
+}
+
+bool AMDILGlobalManager::checkConstPtrsUseHW(llvm::Module::const_iterator *FCI)
+{
+  Function::const_arg_iterator AI, AE;
+  const Function *func = *FCI;
+  std::string name = func->getName();
+  if (!strstr(name.c_str(), "__OpenCL")
+      || !strstr(name.c_str(), "_kernel")) {
+    return false;
+  }
+  kernel &krnl = mKernels[name];
+  if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) {
+    for (AI = func->arg_begin(), AE = func->arg_end();
+        AI != AE; ++AI) {
+      const Argument *Arg = &(*AI);
+      const PointerType *P = dyn_cast<PointerType>(Arg->getType());
+      if (!P) {
+        continue;
+      }
+      if (P->getAddressSpace() != AMDILAS::CONSTANT_ADDRESS) {
+        continue;
+      }
+      const constPtr *ptr = getConstPtr(krnl, Arg->getName());
+      if (ptr) {
+        continue;
+      }
+      constPtr constAttr;
+      constAttr.name = Arg->getName();
+      constAttr.size = this->mSTM->device()->getMaxCBSize();
+      constAttr.base = Arg;
+      constAttr.isArgument = true;
+      constAttr.isArray = false;
+      constAttr.offset = 0;
+      constAttr.usesHardware =
+        mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem);
+      if (constAttr.usesHardware) {
+        constAttr.cbNum = krnl.constPtr.size() + 2;
+      } else {
+        constAttr.cbNum = 0;
+      }
+      krnl.constPtr.push_back(constAttr);
+    }
+  }
+  // Now lets make sure that only the N largest buffers
+  // get allocated in hardware if we have too many buffers
+  uint32_t numPtrs = krnl.constPtr.size();
+  if (numPtrs > (this->mSTM->device()->getMaxNumCBs() - mReservedBuffs)) {
+    // TODO: Change
this routine so it sorts + // constPtr instead of pulling the sizes out + // and then grab the N largest and disable the rest + llvm::SmallVector<uint32_t, 16> sizes; + for (uint32_t x = 0; x < numPtrs; ++x) { + sizes.push_back(krnl.constPtr[x].size); + } + std::sort(sizes.begin(), sizes.end()); + uint32_t numToDisable = numPtrs - (mSTM->device()->getMaxNumCBs() - + mReservedBuffs); + uint32_t safeSize = sizes[numToDisable-1]; + for (uint32_t x = 0; x < numPtrs && numToDisable; ++x) { + if (krnl.constPtr[x].size <= safeSize) { + krnl.constPtr[x].usesHardware = false; + --numToDisable; + } + } + } + // Renumber all of the valid CB's so that + // they are linear increase + uint32_t CBid = 2 + mReservedBuffs; + for (uint32_t x = 0; x < numPtrs; ++x) { + if (krnl.constPtr[x].usesHardware) { + krnl.constPtr[x].cbNum = CBid++; + } + } + for (StringMap<constPtr>::iterator cpb = mConstMems.begin(), + cpe = mConstMems.end(); cpb != cpe; ++cpb) { + if (cpb->second.usesHardware) { + krnl.constPtr.push_back(cpb->second); + } + } + for (uint32_t x = 0; x < krnl.constPtr.size(); ++x) { + constPtr &c = krnl.constPtr[x]; + uint32_t cbNum = c.cbNum - CB_BASE_OFFSET; + if (cbNum < HW_MAX_NUM_CB && c.cbNum >= CB_BASE_OFFSET) { + if ((c.size + c.offset) > krnl.constSizes[cbNum]) { + krnl.constSizes[cbNum] = + ((c.size + c.offset) + 15) & ~15; + } + } else { + krnl.constPtr[x].usesHardware = false; + } + } + return false; +} + +int32_t AMDILGlobalManager::getArrayOffset(const llvm::StringRef &a) const { + StringMap<arraymem>::const_iterator iter = mArrayMems.find(a); + if (iter != mArrayMems.end()) { + return iter->second.offset; + } else { + return -1; + } +} + +int32_t AMDILGlobalManager::getConstOffset(const llvm::StringRef &a) const { + StringMap<constPtr>::const_iterator iter = mConstMems.find(a); + if (iter != mConstMems.end()) { + return iter->second.offset; + } else { + return -1; + } +} + +bool AMDILGlobalManager::getConstHWBit(const llvm::StringRef &name) const { + 
StringMap<constPtr>::const_iterator iter = mConstMems.find(name); + if (iter != mConstMems.end()) { + return iter->second.usesHardware; + } else { + return false; + } +} + +// As of right now we only care about the required group size +// so we can skip the variable encoding +kernelArg AMDILGlobalManager::parseSGV(const GlobalValue *G) { + kernelArg nArg; + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + memset(&nArg, 0, sizeof(nArg)); + for (int x = 0; x < 3; ++x) { + nArg.reqGroupSize[x] = mSTM->getDefaultSize(x); + nArg.reqRegionSize[x] = mSTM->getDefaultSize(x); + } + if (!GV || !GV->hasInitializer()) { + return nArg; + } + const Constant *CV = GV->getInitializer(); + const ConstantDataArray *CA =dyn_cast_or_null<ConstantDataArray>(CV); + + if (!CA || !CA->isString()) { + return nArg; + } + std::string init = CA->getAsString(); + size_t pos = init.find("RWG"); + if (pos != llvm::StringRef::npos) { + pos += 3; + std::string LWS = init.substr(pos, init.length() - pos); + const char *lws = LWS.c_str(); + sscanf(lws, "%u,%u,%u", &(nArg.reqGroupSize[0]), + &(nArg.reqGroupSize[1]), + &(nArg.reqGroupSize[2])); + nArg.mHasRWG = true; + } + pos = init.find("RWR"); + if (pos != llvm::StringRef::npos) { + pos += 3; + std::string LWS = init.substr(pos, init.length() - pos); + const char *lws = LWS.c_str(); + sscanf(lws, "%u,%u,%u", &(nArg.reqRegionSize[0]), + &(nArg.reqRegionSize[1]), + &(nArg.reqRegionSize[2])); + nArg.mHasRWR = true; + } + return nArg; +} + +localArg AMDILGlobalManager::parseLVGV(const GlobalValue *G) { + localArg nArg; + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + nArg.name = ""; + if (!GV || !GV->hasInitializer()) { + return nArg; + } + const ConstantArray *CA = + dyn_cast_or_null<ConstantArray>(GV->getInitializer()); + if (!CA) { + return nArg; + } + for (size_t x = 0, y = CA->getNumOperands(); x < y; ++x) { + const Value *local = CA->getOperand(x); + const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(local); + if (!CE || 
!CE->getNumOperands()) { + continue; + } + nArg.name = (*(CE->op_begin()))->getName(); + if (mArrayMems.find(nArg.name) != mArrayMems.end()) { + nArg.local.push_back(&(mArrayMems[nArg.name])); + } + } + return nArg; +} + +void AMDILGlobalManager::parseConstantPtrAnnotate(const GlobalValue *G) { + const GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(G); + const ConstantArray *CA = + dyn_cast_or_null<ConstantArray>(GV->getInitializer()); + if (!CA) { + return; + } + uint32_t numOps = CA->getNumOperands(); + for (uint32_t x = 0; x < numOps; ++x) { + const Value *V = CA->getOperand(x); + const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V); + if (!CS) { + continue; + } + assert(CS->getNumOperands() == 2 && "There can only be 2" + " fields, a name and size"); + const ConstantExpr *nameField = dyn_cast<ConstantExpr>(CS->getOperand(0)); + const ConstantInt *sizeField = dyn_cast<ConstantInt>(CS->getOperand(1)); + assert(nameField && "There must be a constant name field"); + assert(sizeField && "There must be a constant size field"); + const GlobalVariable *nameGV = + dyn_cast<GlobalVariable>(nameField->getOperand(0)); + const ConstantDataArray *nameArray = + dyn_cast<ConstantDataArray>(nameGV->getInitializer()); + // Lets add this string to the set of strings we should ignore processing + mIgnoreStr.insert(nameGV->getName()); + if (mConstMems.find(nameGV->getName()) + != mConstMems.end()) { + // If we already processesd this string as a constant, lets remove it from + // the list of known constants. This way we don't process unneeded data + // and don't generate code/metadata for strings that are never used. 
+ mConstMems.erase(mConstMems.find(nameGV->getName())); + } else { + mIgnoreStr.insert(CS->getOperand(0)->getName()); + } + constPtr constAttr; + constAttr.name = nameArray->getAsString(); + constAttr.size = (sizeField->getZExtValue() + 15) & ~15; + constAttr.base = CS; + constAttr.isArgument = true; + constAttr.isArray = false; + constAttr.cbNum = 0; + constAttr.offset = 0; + constAttr.usesHardware = (constAttr.size <= mSTM->device()->getMaxCBSize()); + // Now that we have all our constant information, + // lets update the kernel + llvm::StringRef kernelName = G->getName().data() + 30; + kernel k; + if (mKernels.find(kernelName) != mKernels.end()) { + k = mKernels[kernelName]; + } else { + k.curSize = 0; + k.curRSize = 0; + k.curHWSize = 0; + k.curHWRSize = 0; + k.constSize = 0; + k.lvgv = NULL; + k.sgv = NULL; + memset(k.constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB); + } + constAttr.cbNum = k.constPtr.size() + 2; + k.constPtr.push_back(constAttr); + mKernels[kernelName] = k; + } +} + +void AMDILGlobalManager::parseImageAnnotate(const GlobalValue *G) { + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + const ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer()); + if (!CA) { + return; + } + if (isa<GlobalValue>(CA)) { + return; + } + uint32_t e = CA->getNumOperands(); + if (!e) { + return; + } + kernel k; + llvm::StringRef name = G->getName().data() + 23; + if (mKernels.find(name) != mKernels.end()) { + k = mKernels[name]; + } else { + k.curSize = 0; + k.curRSize = 0; + k.curHWSize = 0; + k.curHWRSize = 0; + k.constSize = 0; + k.lvgv = NULL; + k.sgv = NULL; + memset(k.constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB); + } + for (uint32_t i = 0; i != e; ++i) { + const Value *V = CA->getOperand(i); + const Constant *C = dyn_cast<Constant>(V); + const ConstantStruct *CS = dyn_cast<ConstantStruct>(C); + if (CS && CS->getNumOperands() == 2) { + if (mConstMems.find(CS->getOperand(0)->getOperand(0)->getName()) != + mConstMems.end()) { + // If we 
already processesd this string as a constant, lets remove it + // from the list of known constants. This way we don't process unneeded + // data and don't generate code/metadata for strings that are never + // used. + mConstMems.erase( + mConstMems.find(CS->getOperand(0)->getOperand(0)->getName())); + } else { + mIgnoreStr.insert(CS->getOperand(0)->getOperand(0)->getName()); + } + const ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(1)); + uint32_t val = (uint32_t)CI->getZExtValue(); + if (val == 1) { + k.readOnly.insert(i); + } else if (val == 2) { + k.writeOnly.insert(i); + } else { + assert(!"Unknown image type value!"); + } + } + } + mKernels[name] = k; +} + +void AMDILGlobalManager::parseAutoArray(const GlobalValue *GV, bool isRegion) { + const GlobalVariable *G = dyn_cast<GlobalVariable>(GV); + Type *Ty = (G) ? G->getType() : NULL; + arraymem tmp; + tmp.isHW = true; + tmp.offset = 0; + tmp.vecSize = getTypeSize(Ty, true); + tmp.isRegion = isRegion; + mArrayMems[GV->getName()] = tmp; +} + +void AMDILGlobalManager::parseConstantPtr(const GlobalValue *GV) { + const GlobalVariable *G = dyn_cast<GlobalVariable>(GV); + Type *Ty = (G) ? 
G->getType() : NULL; + constPtr constAttr; + constAttr.name = G->getName(); + constAttr.size = getTypeSize(Ty, true); + constAttr.base = GV; + constAttr.isArgument = false; + constAttr.isArray = true; + constAttr.offset = 0; + constAttr.cbNum = 0; + constAttr.usesHardware = false; + mConstMems[GV->getName()] = constAttr; +} + +void AMDILGlobalManager::parseGlobalAnnotate(const GlobalValue *G) { + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + if (!GV->hasInitializer()) { + return; + } + const Constant *CT = GV->getInitializer(); + if (!CT || isa<GlobalValue>(CT)) { + return; + } + const ConstantArray *CA = dyn_cast<ConstantArray>(CT); + if (!CA) { + return; + } + + unsigned int nKernels = CA->getNumOperands(); + for (unsigned int i = 0, e = nKernels; i != e; ++i) { + parseKernelInformation(CA->getOperand(i)); + } +} + +void AMDILGlobalManager::parseKernelInformation(const Value *V) { + if (isa<GlobalValue>(V)) { + return; + } + const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V); + if (!CS) { + return; + } + uint32_t N = CS->getNumOperands(); + if (N != 5) { + return; + } + kernel tmp; + + tmp.curSize = 0; + tmp.curRSize = 0; + tmp.curHWSize = 0; + tmp.curHWRSize = 0; + // The first operand is always a pointer to the kernel. + const Constant *CV = dyn_cast<Constant>(CS->getOperand(0)); + llvm::StringRef kernelName = ""; + if (CV->getNumOperands()) { + kernelName = (*(CV->op_begin()))->getName(); + } + + // If we have images, then we have already created the kernel and we just need + // to get the kernel information. + if (mKernels.find(kernelName) != mKernels.end()) { + tmp = mKernels[kernelName]; + } else { + tmp.curSize = 0; + tmp.curRSize = 0; + tmp.curHWSize = 0; + tmp.curHWRSize = 0; + tmp.constSize = 0; + tmp.lvgv = NULL; + tmp.sgv = NULL; + memset(tmp.constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB); + } + + + // The second operand is SGV, there can only be one so we don't need to worry + // about parsing out multiple data points. 
+ CV = dyn_cast<Constant>(CS->getOperand(1)); + + llvm::StringRef sgvName; + if (CV->getNumOperands()) { + sgvName = (*(CV->op_begin()))->getName(); + } + + if (mKernelArgs.find(sgvName) != mKernelArgs.end()) { + tmp.sgv = &mKernelArgs[sgvName]; + } + // The third operand is FGV, which is skipped + // The fourth operand is LVGV + // There can be multiple local arrays, so we + // need to handle each one seperatly + CV = dyn_cast<Constant>(CS->getOperand(3)); + llvm::StringRef lvgvName = ""; + if (CV->getNumOperands()) { + lvgvName = (*(CV->op_begin()))->getName(); + } + if (mLocalArgs.find(lvgvName) != mLocalArgs.end()) { + localArg *ptr = &mLocalArgs[lvgvName]; + tmp.lvgv = ptr; + llvm::SmallVector<arraymem *, DEFAULT_VEC_SLOTS>::iterator ib, ie; + for (ib = ptr->local.begin(), ie = ptr->local.end(); ib != ie; ++ib) { + if ((*ib)->isRegion) { + if ((*ib)->isHW) { + (*ib)->offset = tmp.curHWRSize; + tmp.curHWRSize += ((*ib)->vecSize + 15) & ~15; + } else { + (*ib)->offset = tmp.curRSize; + tmp.curRSize += ((*ib)->vecSize + 15) & ~15; + } + } else { + if ((*ib)->isHW) { + (*ib)->offset = tmp.curHWSize; + tmp.curHWSize += ((*ib)->vecSize + 15) & ~15; + } else { + (*ib)->offset = tmp.curSize; + tmp.curSize += ((*ib)->vecSize + 15) & ~15; + } + } + } + } + + // The fifth operand is NULL + mKernels[kernelName] = tmp; +} + +const kernel &AMDILGlobalManager::getKernel(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + assert(isKernel(name) && "Must be a kernel to call getKernel"); + return iter->second; +} + +bool AMDILGlobalManager::isKernel(const llvm::StringRef &name) const { + return (mKernels.find(name) != mKernels.end()); +} + +bool AMDILGlobalManager::isWriteOnlyImage(const llvm::StringRef &name, + uint32_t iID) const { + const StringMap<kernel>::const_iterator kiter = mKernels.find(name); + if (kiter == mKernels.end()) { + return false; + } + return kiter->second.writeOnly.count(iID); +} + +uint32_t 
+AMDILGlobalManager::getNumWriteImages(const llvm::StringRef &name) const { + char *env = NULL; + env = getenv("GPU_DISABLE_RAW_UAV"); + if (env && env[0] == '1') { + return 8; + } + const StringMap<kernel>::const_iterator kiter = mKernels.find(name); + if (kiter == mKernels.end()) { + return 0; + } else { + return kiter->second.writeOnly.size(); + } +} + +bool AMDILGlobalManager::isReadOnlyImage(const llvm::StringRef &name, + uint32_t iID) const { + const StringMap<kernel>::const_iterator kiter = mKernels.find(name); + if (kiter == mKernels.end()) { + return false; + } + return kiter->second.readOnly.count(iID); +} + +bool AMDILGlobalManager::hasRWG(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + kernelArg *ptr = iter->second.sgv; + if (ptr) { + return ptr->mHasRWG; + } + } + return false; +} + +bool AMDILGlobalManager::hasRWR(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + kernelArg *ptr = iter->second.sgv; + if (ptr) { + return ptr->mHasRWR; + } + } + return false; +} + +uint32_t +AMDILGlobalManager::getMaxGroupSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + kernelArg *sgv = iter->second.sgv; + if (sgv) { + return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2]; + } + } + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); +} + +uint32_t +AMDILGlobalManager::getMaxRegionSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + kernelArg *sgv = iter->second.sgv; + if (sgv) { + return sgv->reqRegionSize[0] * + sgv->reqRegionSize[1] * + sgv->reqRegionSize[2]; + } + } + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); +} + +uint32_t 
AMDILGlobalManager::getRegionSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second.curRSize; + } else { + return 0; + } +} + +uint32_t AMDILGlobalManager::getLocalSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second.curSize; + } else { + return 0; + } +} + +uint32_t AMDILGlobalManager::getConstSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second.constSize; + } else { + return 0; + } +} + +uint32_t +AMDILGlobalManager::getHWRegionSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second.curHWRSize; + } else { + return 0; + } +} + +uint32_t AMDILGlobalManager::getHWLocalSize(const llvm::StringRef &name) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second.curHWSize; + } else { + return 0; + } +} + +int32_t AMDILGlobalManager::getArgID(const Argument *arg) { + DenseMap<const Argument *, int32_t>::iterator argiter = mArgIDMap.find(arg); + if (argiter != mArgIDMap.end()) { + return argiter->second; + } else { + return -1; + } +} + + +uint32_t +AMDILGlobalManager::getLocal(const llvm::StringRef &name, uint32_t dim) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end() && iter->second.sgv) { + kernelArg *sgv = iter->second.sgv; + switch (dim) { + default: break; + case 0: + case 1: + case 2: + return sgv->reqGroupSize[dim]; + break; + case 3: + return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2]; + }; + } + switch (dim) { + default: + return 1; + case 3: + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + 
mSTM->getDefaultSize(2); + case 2: + case 1: + case 0: + return mSTM->getDefaultSize(dim); + break; + }; + return 1; +} + +uint32_t +AMDILGlobalManager::getRegion(const llvm::StringRef &name, uint32_t dim) const { + StringMap<kernel>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end() && iter->second.sgv) { + kernelArg *sgv = iter->second.sgv; + switch (dim) { + default: break; + case 0: + case 1: + case 2: + return sgv->reqRegionSize[dim]; + break; + case 3: + return sgv->reqRegionSize[0] * + sgv->reqRegionSize[1] * + sgv->reqRegionSize[2]; + }; + } + switch (dim) { + default: + return 1; + case 3: + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); + case 2: + case 1: + case 0: + return mSTM->getDefaultSize(dim); + break; + }; + return 1; +} + +StringMap<constPtr>::iterator AMDILGlobalManager::consts_begin() { + return mConstMems.begin(); +} + + +StringMap<constPtr>::iterator AMDILGlobalManager::consts_end() { + return mConstMems.end(); +} + +bool AMDILGlobalManager::byteStoreExists(StringRef S) const { + return mByteStore.find(S) != mByteStore.end(); +} + +bool AMDILGlobalManager::usesHWConstant(const kernel &krnl, + const llvm::StringRef &arg) { + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->usesHardware; + } else { + return false; + } +} + +uint32_t AMDILGlobalManager::getConstPtrSize(const kernel &krnl, + const llvm::StringRef &arg) +{ + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->size; + } else { + return 0; + } +} + +uint32_t AMDILGlobalManager::getConstPtrOff(const kernel &krnl, + const llvm::StringRef &arg) +{ + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->offset; + } else { + return 0; + } +} + +uint32_t AMDILGlobalManager::getConstPtrCB(const kernel &krnl, + const llvm::StringRef &arg) +{ + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return 
curConst->cbNum; + } else { + return 0; + } +} + +void AMDILGlobalManager::calculateCPOffsets(const MachineFunction *MF, + kernel &krnl) +{ + const MachineConstantPool *MCP = MF->getConstantPool(); + if (!MCP) { + return; + } + const std::vector<MachineConstantPoolEntry> consts = MCP->getConstants(); + size_t numConsts = consts.size(); + for (size_t x = 0; x < numConsts; ++x) { + krnl.CPOffsets.push_back( + std::make_pair<uint32_t, const Constant*>( + mCurrentCPOffset, consts[x].Val.ConstVal)); + size_t curSize = getTypeSize(consts[x].Val.ConstVal->getType(), true); + // Align the size to the vector boundary + curSize = (curSize + 15) & (~15); + mCurrentCPOffset += curSize; + } +} + +bool AMDILGlobalManager::isConstPtrArray(const kernel &krnl, + const llvm::StringRef &arg) { + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->isArray; + } else { + return false; + } +} + +bool AMDILGlobalManager::isConstPtrArgument(const kernel &krnl, + const llvm::StringRef &arg) +{ + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->isArgument; + } else { + return false; + } +} + +const Value *AMDILGlobalManager::getConstPtrValue(const kernel &krnl, + const llvm::StringRef &arg) { + const constPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->base; + } else { + return NULL; + } +} + +static void +dumpZeroElements(const StructType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(const IntegerType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(const ArrayType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(const VectorType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(const Type * const T, llvm::raw_ostream &O, bool asBytes); + +void dumpZeroElements(const Type * const T, llvm::raw_ostream &O, bool asBytes) { + if (!T) { + return; + } + 
switch(T->getTypeID()) { + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + assert(0 && "These types are not supported by this backend"); + default: + case Type::DoubleTyID: + if (asBytes) { + O << ":0:0:0:0:0:0:0:0"; + } else { + O << ":0"; + } + break; + case Type::FloatTyID: + case Type::PointerTyID: + case Type::FunctionTyID: + if (asBytes) { + O << ":0:0:0:0"; + } else { + O << ":0"; + } + break; + case Type::IntegerTyID: + dumpZeroElements(dyn_cast<IntegerType>(T), O, asBytes); + break; + case Type::StructTyID: + { + const StructType *ST = cast<StructType>(T); + if (!ST->isOpaque()) { + dumpZeroElements(dyn_cast<StructType>(T), O, asBytes); + } else { // A pre-LLVM 3.0 opaque type + if (asBytes) { + O << ":0:0:0:0"; + } else { + O << ":0"; + } + } + } + break; + case Type::ArrayTyID: + dumpZeroElements(dyn_cast<ArrayType>(T), O, asBytes); + break; + case Type::VectorTyID: + dumpZeroElements(dyn_cast<VectorType>(T), O, asBytes); + break; + }; +} + +void +dumpZeroElements(const StructType * const ST, llvm::raw_ostream &O, bool asBytes) { + if (!ST) { + return; + } + Type *curType; + StructType::element_iterator eib = ST->element_begin(); + StructType::element_iterator eie = ST->element_end(); + for (;eib != eie; ++eib) { + curType = *eib; + dumpZeroElements(curType, O, asBytes); + } +} + +void +dumpZeroElements(const IntegerType * const IT, llvm::raw_ostream &O, bool asBytes) { + if (asBytes) { + unsigned byteWidth = (IT->getBitWidth() >> 3); + for (unsigned x = 0; x < byteWidth; ++x) { + O << ":0"; + } + } +} + +void +dumpZeroElements(const ArrayType * const AT, llvm::raw_ostream &O, bool asBytes) { + size_t size = AT->getNumElements(); + for (size_t x = 0; x < size; ++x) { + dumpZeroElements(AT->getElementType(), O, asBytes); + } +} + +void +dumpZeroElements(const VectorType * const VT, llvm::raw_ostream &O, bool asBytes) { + size_t size = VT->getNumElements(); + for (size_t x = 0; x < size; ++x) { + 
dumpZeroElements(VT->getElementType(), O, asBytes); + } +} + +void AMDILGlobalManager::printConstantValue(const Constant *CAval, + llvm::raw_ostream &O, bool asBytes) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CAval)) { + bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble; + if (isDouble) { + double val = CFP->getValueAPF().convertToDouble(); + union dtol_union { + double d; + uint64_t l; + char c[8]; + } conv; + conv.d = val; + if (!asBytes) { + O << ":"; + O.write_hex(conv.l); + } else { + for (int i = 0; i < 8; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + } else { + float val = CFP->getValueAPF().convertToFloat(); + union ftoi_union { + float f; + uint32_t u; + char c[4]; + } conv; + conv.f = val; + if (!asBytes) { + O << ":"; + O.write_hex(conv.u); + } else { + for (int i = 0; i < 4; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + } + } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CAval)) { + uint64_t zVal = CI->getValue().getZExtValue(); + if (!asBytes) { + O << ":"; + O.write_hex(zVal); + } else { + switch (CI->getBitWidth()) { + default: + { + union ltob_union { + uint64_t l; + char c[8]; + } conv; + conv.l = zVal; + for (int i = 0; i < 8; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + break; + case 8: + O << ":"; + O.write_hex(zVal & 0xFF); + break; + case 16: + { + union stob_union { + uint16_t s; + char c[2]; + } conv; + conv.s = (uint16_t)zVal; + O << ":"; + O.write_hex((unsigned)conv.c[0] & 0xFF); + O << ":"; + O.write_hex((unsigned)conv.c[1] & 0xFF); + } + break; + case 32: + { + union itob_union { + uint32_t i; + char c[4]; + } conv; + conv.i = (uint32_t)zVal; + for (int i = 0; i < 4; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + break; + } + } + } else if (const ConstantVector *CV = dyn_cast<ConstantVector>(CAval)) { + int y = CV->getNumOperands()-1; + int x = 0; + for (; x < y; ++x) { + 
printConstantValue(CV->getOperand(x), O, asBytes); + } + printConstantValue(CV->getOperand(x), O, asBytes); + } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CAval)) { + int y = CS->getNumOperands(); + int x = 0; + for (; x < y; ++x) { + printConstantValue(CS->getOperand(x), O, asBytes); + } + } else if (const ConstantAggregateZero *CAZ + = dyn_cast<ConstantAggregateZero>(CAval)) { + int y = CAZ->getNumOperands(); + if (y > 0) { + int x = 0; + for (; x < y; ++x) { + printConstantValue((llvm::Constant *)CAZ->getOperand(x), + O, asBytes); + } + } else { + if (asBytes) { + dumpZeroElements(CAval->getType(), O, asBytes); + } else { + int y = getNumElements(CAval->getType())-1; + for (int x = 0; x < y; ++x) { + O << ":0"; + } + O << ":0"; + } + } + } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CAval)) { + int y = CA->getNumOperands(); + int x = 0; + for (; x < y; ++x) { + printConstantValue(CA->getOperand(x), O, asBytes); + } + } else if (dyn_cast<ConstantPointerNull>(CAval)) { + O << ":0"; + //assert(0 && "Hit condition which was not expected"); + } else if (dyn_cast<ConstantExpr>(CAval)) { + O << ":0"; + //assert(0 && "Hit condition which was not expected"); + } else if (dyn_cast<UndefValue>(CAval)) { + O << ":0"; + //assert(0 && "Hit condition which was not expected"); + } else { + assert(0 && "Hit condition which was not expected"); + } +} + +static bool isStruct(Type * const T) +{ + if (!T) { + return false; + } + switch (T->getTypeID()) { + default: + return false; + case Type::PointerTyID: + return isStruct(T->getContainedType(0)); + case Type::StructTyID: + return true; + case Type::ArrayTyID: + case Type::VectorTyID: + return isStruct(dyn_cast<SequentialType>(T)->getElementType()); + }; + +} + +void AMDILGlobalManager::dumpDataToCB(llvm::raw_ostream &O, AMDILKernelManager *km, + uint32_t id) { + uint32_t size = 0; + for (StringMap<constPtr>::iterator cmb = consts_begin(), + cme = consts_end(); cmb != cme; ++cmb) { + if (id == 
cmb->second.cbNum) { + size += (cmb->second.size + 15) & (~15); + } + } + if (id == 0) { + O << ";#DATASTART:" << (size + mCurrentCPOffset) << "\n"; + if (mCurrentCPOffset) { + for (StringMap<kernel>::iterator kcpb = mKernels.begin(), + kcpe = mKernels.end(); kcpb != kcpe; ++kcpb) { + const kernel& k = kcpb->second; + size_t numConsts = k.CPOffsets.size(); + for (size_t x = 0; x < numConsts; ++x) { + size_t offset = k.CPOffsets[x].first; + const Constant *C = k.CPOffsets[x].second; + Type *Ty = C->getType(); + size_t size = (isStruct(Ty) ? getTypeSize(Ty, true) + : getNumElements(Ty)); + O << ";#" << km->getTypeName(Ty, symTab) << ":"; + O << offset << ":" << size ; + printConstantValue(C, O, isStruct(Ty)); + O << "\n"; + } + } + } + } else { + O << ";#DATASTART:" << id << ":" << size << "\n"; + } + + for (StringMap<constPtr>::iterator cmb = consts_begin(), cme = consts_end(); + cmb != cme; ++cmb) { + if (cmb->second.cbNum != id) { + continue; + } + const GlobalVariable *G = dyn_cast<GlobalVariable>(cmb->second.base); + Type *Ty = (G) ? G->getType() : NULL; + size_t offset = cmb->second.offset; + const Constant *C = G->getInitializer(); + size_t size = (isStruct(Ty) + ? 
getTypeSize(Ty, true) + : getNumElements(Ty)); + O << ";#" << km->getTypeName(Ty, symTab) << ":"; + if (!id) { + O << (offset + mCurrentCPOffset) << ":" << size; + } else { + O << offset << ":" << size; + } + if (C) { + printConstantValue(C, O, isStruct(Ty)); + } else { + assert(0 && "Cannot have a constant pointer" + " without an initializer!"); + } + O <<"\n"; + } + if (id == 0) { + O << ";#DATAEND\n"; + } else { + O << ";#DATAEND:" << id << "\n"; + } +} + +void +AMDILGlobalManager::dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km) { + if (mConstMems.empty() && !mCurrentCPOffset) { + return; + } else { + llvm::DenseSet<uint32_t> const_set; + for (StringMap<constPtr>::iterator cmb = consts_begin(), cme = consts_end(); + cmb != cme; ++cmb) { + const_set.insert(cmb->second.cbNum); + } + if (mCurrentCPOffset) { + const_set.insert(0); + } + for (llvm::DenseSet<uint32_t>::iterator setb = const_set.begin(), + sete = const_set.end(); setb != sete; ++setb) { + dumpDataToCB(O, km, *setb); + } + } +} + +/// Create a function ID if it is not known or return the known +/// function ID. 
+uint32_t AMDILGlobalManager::getOrCreateFunctionID(const GlobalValue* func) { + if (func->getName().size()) { + return getOrCreateFunctionID(func->getName()); + } + uint32_t id; + if (mFuncPtrNames.find(func) == mFuncPtrNames.end()) { + id = mFuncPtrNames.size() + RESERVED_FUNCS + mFuncNames.size(); + mFuncPtrNames[func] = id; + } else { + id = mFuncPtrNames[func]; + } + return id; +} +uint32_t AMDILGlobalManager::getOrCreateFunctionID(const std::string &func) { + uint32_t id; + if (mFuncNames.find(func) == mFuncNames.end()) { + id = mFuncNames.size() + RESERVED_FUNCS + mFuncPtrNames.size(); + mFuncNames[func] = id; + } else { + id = mFuncNames[func]; + } + return id; +} diff --git a/src/gallium/drivers/radeon/AMDILGlobalManager.h b/src/gallium/drivers/radeon/AMDILGlobalManager.h new file mode 100644 index 00000000000..1b0361e0174 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILGlobalManager.h @@ -0,0 +1,256 @@ +//===-- AMDILGlobalManager.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// ==-----------------------------------------------------------------------===// +// +// Class that handles parsing and storing global variables that are relevant to +// the compilation of the module. 
//
// ==-----------------------------------------------------------------------===//

#ifndef _AMDILGLOBALMANAGER_H_
#define _AMDILGLOBALMANAGER_H_

#include "AMDIL.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Module.h"
#include "llvm/Support/raw_ostream.h"

#include <set>
#include <string>

#define CB_BASE_OFFSET 2

namespace llvm {

class PointerType;
class AMDILKernelManager;
class AMDILSubtarget;
class TypeSymbolTable;
class Argument;
class GlobalValue;
class MachineFunction;

/// Structure that holds information for a single local/region address array.
typedef struct _arrayMemRec {
  uint32_t vecSize; // size of each vector
  uint32_t offset; // offset into the memory section
  bool isHW; // flag to specify if HW is used or SW is used
  bool isRegion; // flag to specify if GDS is used or not
} arraymem;

/// Structure that holds information for all local/region address
/// arrays in the kernel.
typedef struct _localArgRec {
  llvm::SmallVector<arraymem *, DEFAULT_VEC_SLOTS> local;
  std::string name; // Kernel Name
} localArg;

/// Structure that holds information about a constant address
/// space pointer that is a kernel argument.
typedef struct _constPtrRec {
  const Value *base;
  uint32_t size;
  uint32_t offset;
  uint32_t cbNum; // value of 0 means that it does not use hw CB
  bool isArray;
  bool isArgument;
  bool usesHardware;
  std::string name;
} constPtr;

/// Structure that holds information for each kernel argument.
typedef struct _kernelArgRec {
  uint32_t reqGroupSize[3];
  uint32_t reqRegionSize[3];
  llvm::SmallVector<uint32_t, DEFAULT_VEC_SLOTS> argInfo;
  bool mHasRWG;
  bool mHasRWR;
} kernelArg;

/// Structure that holds information for each kernel.
typedef struct _kernelRec {
  mutable uint32_t curSize;
  mutable uint32_t curRSize;
  mutable uint32_t curHWSize;
  mutable uint32_t curHWRSize;
  uint32_t constSize;
  kernelArg *sgv;
  localArg *lvgv;
  llvm::SmallVector<struct _constPtrRec, DEFAULT_VEC_SLOTS> constPtr;
  uint32_t constSizes[HW_MAX_NUM_CB];
  llvm::SmallSet<uint32_t, OPENCL_MAX_READ_IMAGES> readOnly;
  llvm::SmallSet<uint32_t, OPENCL_MAX_WRITE_IMAGES> writeOnly;
  llvm::SmallVector<std::pair<uint32_t, const Constant *>,
    DEFAULT_VEC_SLOTS> CPOffsets;
} kernel;

/// Parses and caches the global-variable metadata of a module and answers
/// per-kernel queries about it (see the file comment above).
class AMDILGlobalManager {
public:
  AMDILGlobalManager(bool debugMode = false);
  ~AMDILGlobalManager();

  /// Process the given module and parse out the global variable metadata passed
  /// down from the frontend-compiler.
  void processModule(const Module &MF, const AMDILTargetMachine* mTM);

  /// Returns whether the current name is the name of a kernel function or a
  /// normal function.
  bool isKernel(const llvm::StringRef &name) const;

  /// Returns true if the image ID corresponds to a read only image.
  bool isReadOnlyImage(const llvm::StringRef &name, uint32_t iID) const;

  /// Returns true if the image ID corresponds to a write only image.
  bool isWriteOnlyImage(const llvm::StringRef &name, uint32_t iID) const;

  /// Returns the number of write only images for the kernel.
  uint32_t getNumWriteImages(const llvm::StringRef &name) const;

  /// Gets the group size of the kernel for the given dimension.
  uint32_t getLocal(const llvm::StringRef &name, uint32_t dim) const;

  /// Gets the region size of the kernel for the given dimension.
  uint32_t getRegion(const llvm::StringRef &name, uint32_t dim) const;

  /// Get the region memory size in 1d for the given function/kernel.
  uint32_t getRegionSize(const llvm::StringRef &name) const;

  /// Get the local memory size in 1d for the given function/kernel.
  /// NOTE(review): the original comment said "region" here, word for word the
  /// same as on getRegionSize(); presumably a copy/paste slip - confirm
  /// against the implementation.
  uint32_t getLocalSize(const llvm::StringRef &name) const;

  /// Get the max group size in one 1D for the given function/kernel.
  uint32_t getMaxGroupSize(const llvm::StringRef &name) const;

  /// Get the max region size in one 1D for the given function/kernel.
  uint32_t getMaxRegionSize(const llvm::StringRef &name) const;

  /// Get the constant memory size in 1d for the given function/kernel.
  uint32_t getConstSize(const llvm::StringRef &name) const;

  /// Get the HW local size in 1d for the given function/kernel. We need to
  /// separate SW local and HW local for the case where some local memory is
  /// emulated in global and some is using the hardware features. The main
  /// problem is that in OpenCL 1.0/1.1 cl_khr_byte_addressable_store allows
  /// these actions to happen on all memory spaces, but the hardware can only
  /// write byte address stores to UAV and LDS, not GDS or Stack.
  uint32_t getHWLocalSize(const llvm::StringRef &name) const;
  uint32_t getHWRegionSize(const llvm::StringRef &name) const;

  /// Get the offset of the array for the kernel.
  int32_t getArrayOffset(const llvm::StringRef &name) const;

  /// Get the offset of the const memory for the kernel.
  int32_t getConstOffset(const llvm::StringRef &name) const;

  /// Get the boolean value if this particular constant uses HW or not.
  bool getConstHWBit(const llvm::StringRef &name) const;

  /// Get a reference to the kernel metadata information for the given function
  /// name.
  const kernel &getKernel(const llvm::StringRef &name) const;

  /// Returns whether a reqd_workgroup_size attribute has been used or not.
  bool hasRWG(const llvm::StringRef &name) const;

  /// Returns whether a reqd_workregion_size attribute has been used or not.
  bool hasRWR(const llvm::StringRef &name) const;


  /// Dump the data section to the output stream for the given kernel.
  void dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km);

  /// Iterate through the constants that are global to the compilation unit.
  StringMap<constPtr>::iterator consts_begin();
  StringMap<constPtr>::iterator consts_end();

  /// Query if the kernel has a byte store.
  bool byteStoreExists(llvm::StringRef S) const;

  /// Query if the kernel and argument uses hardware constant memory.
  bool usesHWConstant(const kernel &krnl, const llvm::StringRef &arg);

  /// Query if the constant pointer is an argument.
  bool isConstPtrArgument(const kernel &krnl, const llvm::StringRef &arg);

  /// Query if the constant pointer is an array that is globally scoped.
  bool isConstPtrArray(const kernel &krnl, const llvm::StringRef &arg);

  /// Query the size of the constant pointer.
  uint32_t getConstPtrSize(const kernel &krnl, const llvm::StringRef &arg);

  /// Query the offset of the constant pointer.
  uint32_t getConstPtrOff(const kernel &krnl, const llvm::StringRef &arg);

  /// Query the constant buffer number for a constant pointer.
  uint32_t getConstPtrCB(const kernel &krnl, const llvm::StringRef &arg);

  /// Query the Value* that the constant pointer originates from.
  const Value *getConstPtrValue(const kernel &krnl, const llvm::StringRef &arg);

  /// Get the ID of the argument.
  int32_t getArgID(const Argument *arg);

  /// Get the unique function ID for the specific function name and create a new
  /// unique ID if it is not found.
  uint32_t getOrCreateFunctionID(const GlobalValue* func);
  uint32_t getOrCreateFunctionID(const std::string& func);

  /// Calculate the offsets of the constant pool for the given kernel and
  /// machine function.
  void calculateCPOffsets(const MachineFunction *MF, kernel &krnl);

  /// Print the global manager to the output stream.
  void print(llvm::raw_ostream& O);

  /// Dump the global manager to the output stream - debug use.
  void dump();

private:
  /// Various functions that parse global value information and store them in
  /// the global manager. This approach is used instead of dynamic parsing as it
  /// might require more space, but should allow caching of data that gets
  /// requested multiple times.
  kernelArg parseSGV(const GlobalValue *GV);
  localArg parseLVGV(const GlobalValue *GV);
  void parseGlobalAnnotate(const GlobalValue *G);
  void parseImageAnnotate(const GlobalValue *G);
  void parseConstantPtrAnnotate(const GlobalValue *G);
  void printConstantValue(const Constant *CAval,
      llvm::raw_ostream& O,
      bool asByte);
  void parseKernelInformation(const Value *V);
  void parseAutoArray(const GlobalValue *G, bool isRegion);
  void parseConstantPtr(const GlobalValue *G);
  void allocateGlobalCB();
  void dumpDataToCB(llvm::raw_ostream &O, AMDILKernelManager *km, uint32_t id);
  bool checkConstPtrsUseHW(Module::const_iterator *F);

  llvm::StringMap<arraymem> mArrayMems;
  llvm::StringMap<localArg> mLocalArgs;
  llvm::StringMap<kernelArg> mKernelArgs;
  llvm::StringMap<kernel> mKernels;
  llvm::StringMap<constPtr> mConstMems;
  // IDs handed out by getOrCreateFunctionID(), keyed by function name.
  llvm::StringMap<uint32_t> mFuncNames;
  // IDs handed out by getOrCreateFunctionID() for values without a name,
  // keyed by pointer.
  llvm::DenseMap<const GlobalValue*, uint32_t> mFuncPtrNames;
  llvm::DenseMap<uint32_t, llvm::StringRef> mImageNameMap;
  std::set<llvm::StringRef> mByteStore;
  std::set<llvm::StringRef> mIgnoreStr;
  llvm::DenseMap<const Argument *, int32_t> mArgIDMap;
  const char *symTab;
  const AMDILSubtarget *mSTM;
  size_t mOffset;
  uint32_t mReservedBuffs;
  uint32_t mCurrentCPOffset;
  bool mDebugMode;
};
} // namespace llvm
#endif // _AMDILGLOBALMANAGER_H_
diff --git a/src/gallium/drivers/radeon/AMDILIOExpansion.cpp b/src/gallium/drivers/radeon/AMDILIOExpansion.cpp
new file mode 100644
index 00000000000..68d8eef344d
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILIOExpansion.cpp
@@ -0,0 +1,1160 @@
//===----------- AMDILIOExpansion.cpp - IO Expansion Pass -----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License.
See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// The AMDIL IO Expansion class expands pseudo IO instructions into a sequence +// of instructions that produces the correct results. These instructions are +// not expanded earlier in the pass because any pass before this can assume to +// be able to generate a load/store instruction. So this pass can only have +// passes that execute after it if no load/store instructions can be generated. +//===----------------------------------------------------------------------===// +#include "AMDILIOExpansion.h" +#include "AMDIL.h" +#include "AMDILDevices.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Value.h" + +using namespace llvm; + +char AMDILIOExpansion::ID = 0; +namespace llvm { + FunctionPass* + createAMDILIOExpansion(TargetMachine &TM AMDIL_OPT_LEVEL_DECL) + { + return TM.getSubtarget<AMDILSubtarget>() + .device()->getIOExpansion(TM AMDIL_OPT_LEVEL_VAR); + } +} + +AMDILIOExpansion::AMDILIOExpansion(TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) : + MachineFunctionPass(ID), TM(tm) +{ + mSTM = &tm.getSubtarget<AMDILSubtarget>(); + mDebug = DEBUGME; + mTII = tm.getInstrInfo(); + mKM = NULL; +} + +AMDILIOExpansion::~AMDILIOExpansion() +{ +} + bool +AMDILIOExpansion::runOnMachineFunction(MachineFunction &MF) +{ + mKM = const_cast<AMDILKernelManager*>(mSTM->getKernelManager()); + mMFI = MF.getInfo<AMDILMachineFunctionInfo>(); + for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); + MFI != MFE; ++MFI) { + MachineBasicBlock *MBB 
= MFI; + for (MachineBasicBlock::iterator MBI = MBB->begin(), MBE = MBB->end(); + MBI != MBE; ++MBI) { + MachineInstr *MI = MBI; + if (isIOInstruction(MI)) { + mBB = MBB; + saveInst = false; + expandIOInstruction(MI); + if (!saveInst) { + // erase returns the instruction after + // and we want the instruction before + MBI = MBB->erase(MI); + --MBI; + } + } + } + } + return false; +} +const char *AMDILIOExpansion::getPassName() const +{ + return "AMDIL Generic IO Expansion Pass"; +} + bool +AMDILIOExpansion::isIOInstruction(MachineInstr *MI) +{ + if (!MI) { + return false; + } + switch(MI->getOpcode()) { + default: + return false; + ExpandCaseToAllTypes(AMDIL::CPOOLLOAD) + ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CONSTANTLOAD) + ExpandCaseToAllTypes(AMDIL::CONSTANTSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CONSTANTZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CONSTANTAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATELOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATESTORE) + ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE) + ExpandCaseToAllTypes(AMDIL::REGIONSTORE) + ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE) + ExpandCaseToAllTypes(AMDIL::REGIONLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALSTORE) + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE) + ExpandCaseToAllTypes(AMDIL::LOCALLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::GLOBALLOAD) + ExpandCaseToAllTypes(AMDIL::GLOBALSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::GLOBALAEXTLOAD) + 
ExpandCaseToAllTypes(AMDIL::GLOBALZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::GLOBALSTORE) + ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE) + return true; + }; + return false; +} +void +AMDILIOExpansion::expandIOInstruction(MachineInstr *MI) +{ + assert(isIOInstruction(MI) && "Must be an IO instruction to " + "be passed to this function!"); + switch (MI->getOpcode()) { + default: + assert(0 && "Not an IO Instruction!"); + ExpandCaseToAllTypes(AMDIL::GLOBALLOAD); + ExpandCaseToAllTypes(AMDIL::GLOBALSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::GLOBALZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::GLOBALAEXTLOAD); + expandGlobalLoad(MI); + break; + ExpandCaseToAllTypes(AMDIL::REGIONLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD); + expandRegionLoad(MI); + break; + ExpandCaseToAllTypes(AMDIL::LOCALLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD); + expandLocalLoad(MI); + break; + ExpandCaseToAllTypes(AMDIL::CONSTANTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTAEXTLOAD); + expandConstantLoad(MI); + break; + ExpandCaseToAllTypes(AMDIL::PRIVATELOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD); + expandPrivateLoad(MI); + break; + ExpandCaseToAllTypes(AMDIL::CPOOLLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD); + expandConstantPoolLoad(MI); + break; + ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE) + ExpandCaseToAllTypes(AMDIL::GLOBALSTORE); + expandGlobalStore(MI); + break; + ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE); + ExpandCaseToAllTypes(AMDIL::PRIVATESTORE); + expandPrivateStore(MI); + 
break; + ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE); + ExpandCaseToAllTypes(AMDIL::REGIONSTORE); + expandRegionStore(MI); + break; + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE); + ExpandCaseToAllTypes(AMDIL::LOCALSTORE); + expandLocalStore(MI); + break; + } +} + bool +AMDILIOExpansion::isAddrCalcInstr(MachineInstr *MI) +{ + switch(MI->getOpcode()) { + ExpandCaseToAllTypes(AMDIL::PRIVATELOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD) + { + // This section of code is a workaround for the problem of + // globally scoped constant address variables. The problems + // comes that although they are declared in the constant + // address space, all variables must be allocated in the + // private address space. So when there is a load from + // the global address, it automatically goes into the private + // address space. However, the data section is placed in the + // constant address space so we need to check to see if our + // load base address is a global variable or not. Only if it + // is not a global variable can we do the address calculation + // into the private memory ring. 
+ + MachineMemOperand& memOp = (**MI->memoperands_begin()); + const Value *V = memOp.getValue(); + if (V) { + const GlobalValue *GV = dyn_cast<GlobalVariable>(V); + return mSTM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem) + && !(GV); + } else { + return false; + } + } + ExpandCaseToAllTypes(AMDIL::CPOOLLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD); + return MI->getOperand(1).isReg(); + ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE); + ExpandCaseToAllTypes(AMDIL::PRIVATESTORE); + return mSTM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem); + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE); + ExpandCaseToAllTypes(AMDIL::LOCALSTORE); + ExpandCaseToAllTypes(AMDIL::LOCALLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD); + return mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem); + }; + return false; + +} + bool +AMDILIOExpansion::isExtendLoad(MachineInstr *MI) +{ + return isSExtLoadInst(TM.getInstrInfo(), MI) || + isZExtLoadInst(TM.getInstrInfo(), MI) || + isAExtLoadInst(TM.getInstrInfo(), MI) + || isSWSExtLoadInst(MI); +} + + bool +AMDILIOExpansion::isHardwareRegion(MachineInstr *MI) +{ + switch(MI->getOpcode()) { + default: + return false; + break; + ExpandCaseToAllTypes(AMDIL::REGIONLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::REGIONSTORE) + ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE) + return mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem); + }; + return false; +} + bool +AMDILIOExpansion::isHardwareLocal(MachineInstr *MI) +{ + switch(MI->getOpcode()) { + default: + return false; + break; + ExpandCaseToAllTypes(AMDIL::LOCALLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD) + 
ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALSTORE) + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE) + return mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem); + }; + return false; +} + bool +AMDILIOExpansion::isPackedData(MachineInstr *MI) +{ + switch(MI->getOpcode()) { + default: + if (isTruncStoreInst(TM.getInstrInfo(), MI)) { + switch (MI->getDesc().OpInfo[0].RegClass) { + default: + break; + case AMDIL::GPRV2I64RegClassID: + case AMDIL::GPRV2I32RegClassID: + switch (getMemorySize(MI)) { + case 2: + case 4: + return true; + default: + break; + } + break; + case AMDIL::GPRV4I32RegClassID: + switch (getMemorySize(MI)) { + case 4: + case 8: + return true; + default: + break; + } + break; + } + } + break; + ExpandCaseToPackedTypes(AMDIL::CPOOLLOAD); + ExpandCaseToPackedTypes(AMDIL::CPOOLSEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::CPOOLZEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::CPOOLAEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::GLOBALLOAD); + ExpandCaseToPackedTypes(AMDIL::GLOBALSEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::GLOBALZEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::GLOBALAEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::LOCALLOAD); + ExpandCaseToPackedTypes(AMDIL::LOCALSEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::LOCALZEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::LOCALAEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::REGIONLOAD); + ExpandCaseToPackedTypes(AMDIL::REGIONSEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::REGIONZEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::REGIONAEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::PRIVATELOAD); + ExpandCaseToPackedTypes(AMDIL::PRIVATESEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::PRIVATEZEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::PRIVATEAEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::CONSTANTLOAD); + ExpandCaseToPackedTypes(AMDIL::CONSTANTSEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::CONSTANTAEXTLOAD); + ExpandCaseToPackedTypes(AMDIL::CONSTANTZEXTLOAD); + 
ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE) + ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE); + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE); + ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE); + ExpandCaseToPackedTypes(AMDIL::GLOBALSTORE); + ExpandCaseToPackedTypes(AMDIL::PRIVATESTORE); + ExpandCaseToPackedTypes(AMDIL::LOCALSTORE); + ExpandCaseToPackedTypes(AMDIL::REGIONSTORE); + return true; + } + return false; +} + + bool +AMDILIOExpansion::isStaticCPLoad(MachineInstr *MI) +{ + switch(MI->getOpcode()) { + ExpandCaseToAllTypes(AMDIL::CPOOLLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD); + { + uint32_t x = 0; + uint32_t num = MI->getNumOperands(); + for (x = 0; x < num; ++x) { + if (MI->getOperand(x).isCPI()) { + return true; + } + } + } + break; + default: + break; + } + return false; +} + + bool +AMDILIOExpansion::isNbitType(Type *mType, uint32_t nBits, bool isScalar) +{ + if (!mType) { + return false; + } + if (dyn_cast<PointerType>(mType)) { + PointerType *PT = dyn_cast<PointerType>(mType); + return isNbitType(PT->getElementType(), nBits); + } else if (dyn_cast<StructType>(mType)) { + return getTypeSize(mType) == nBits; + } else if (dyn_cast<VectorType>(mType)) { + VectorType *VT = dyn_cast<VectorType>(mType); + size_t size = VT->getScalarSizeInBits(); + return (isScalar ? + VT->getNumElements() * size == nBits : size == nBits); + } else if (dyn_cast<ArrayType>(mType)) { + ArrayType *AT = dyn_cast<ArrayType>(mType); + size_t size = AT->getScalarSizeInBits(); + return (isScalar ? 
+ AT->getNumElements() * size == nBits : size == nBits); + } else if (mType->isSized()) { + return mType->getScalarSizeInBits() == nBits; + } else { + assert(0 && "Found a type that we don't know how to handle!"); + return false; + } +} + + bool +AMDILIOExpansion::isHardwareInst(MachineInstr *MI) +{ + AMDILAS::InstrResEnc curRes; + curRes.u16all = MI->getAsmPrinterFlags(); + return curRes.bits.HardwareInst; +} + +REG_PACKED_TYPE +AMDILIOExpansion::getPackedID(MachineInstr *MI) +{ + switch (MI->getOpcode()) { + default: + break; + case AMDIL::GLOBALTRUNCSTORE_v2i64i8: + case AMDIL::REGIONTRUNCSTORE_v2i64i8: + case AMDIL::LOCALTRUNCSTORE_v2i64i8: + case AMDIL::PRIVATETRUNCSTORE_v2i64i8: + case AMDIL::GLOBALTRUNCSTORE_v2i32i8: + case AMDIL::REGIONTRUNCSTORE_v2i32i8: + case AMDIL::LOCALTRUNCSTORE_v2i32i8: + case AMDIL::PRIVATETRUNCSTORE_v2i32i8: + case AMDIL::GLOBALTRUNCSTORE_v2i16i8: + case AMDIL::REGIONTRUNCSTORE_v2i16i8: + case AMDIL::LOCALTRUNCSTORE_v2i16i8: + case AMDIL::PRIVATETRUNCSTORE_v2i16i8: + case AMDIL::GLOBALSTORE_v2i8: + case AMDIL::LOCALSTORE_v2i8: + case AMDIL::REGIONSTORE_v2i8: + case AMDIL::PRIVATESTORE_v2i8: + return PACK_V2I8; + case AMDIL::GLOBALTRUNCSTORE_v4i32i8: + case AMDIL::REGIONTRUNCSTORE_v4i32i8: + case AMDIL::LOCALTRUNCSTORE_v4i32i8: + case AMDIL::PRIVATETRUNCSTORE_v4i32i8: + case AMDIL::GLOBALTRUNCSTORE_v4i16i8: + case AMDIL::REGIONTRUNCSTORE_v4i16i8: + case AMDIL::LOCALTRUNCSTORE_v4i16i8: + case AMDIL::PRIVATETRUNCSTORE_v4i16i8: + case AMDIL::GLOBALSTORE_v4i8: + case AMDIL::LOCALSTORE_v4i8: + case AMDIL::REGIONSTORE_v4i8: + case AMDIL::PRIVATESTORE_v4i8: + return PACK_V4I8; + case AMDIL::GLOBALTRUNCSTORE_v2i64i16: + case AMDIL::REGIONTRUNCSTORE_v2i64i16: + case AMDIL::LOCALTRUNCSTORE_v2i64i16: + case AMDIL::PRIVATETRUNCSTORE_v2i64i16: + case AMDIL::GLOBALTRUNCSTORE_v2i32i16: + case AMDIL::REGIONTRUNCSTORE_v2i32i16: + case AMDIL::LOCALTRUNCSTORE_v2i32i16: + case AMDIL::PRIVATETRUNCSTORE_v2i32i16: + case AMDIL::GLOBALSTORE_v2i16: + case 
 AMDIL::LOCALSTORE_v2i16:
    case AMDIL::REGIONSTORE_v2i16:
    case AMDIL::PRIVATESTORE_v2i16:
      return PACK_V2I16;
    case AMDIL::GLOBALTRUNCSTORE_v4i32i16:
    case AMDIL::REGIONTRUNCSTORE_v4i32i16:
    case AMDIL::LOCALTRUNCSTORE_v4i32i16:
    case AMDIL::PRIVATETRUNCSTORE_v4i32i16:
    case AMDIL::GLOBALSTORE_v4i16:
    case AMDIL::LOCALSTORE_v4i16:
    case AMDIL::REGIONSTORE_v4i16:
    case AMDIL::PRIVATESTORE_v4i16:
      return PACK_V4I16;
    // Loads of the same vector shapes map to the matching UNPACK_* id.
    case AMDIL::GLOBALLOAD_v2i8:
    case AMDIL::GLOBALSEXTLOAD_v2i8:
    case AMDIL::GLOBALAEXTLOAD_v2i8:
    case AMDIL::GLOBALZEXTLOAD_v2i8:
    case AMDIL::LOCALLOAD_v2i8:
    case AMDIL::LOCALSEXTLOAD_v2i8:
    case AMDIL::LOCALAEXTLOAD_v2i8:
    case AMDIL::LOCALZEXTLOAD_v2i8:
    case AMDIL::REGIONLOAD_v2i8:
    case AMDIL::REGIONSEXTLOAD_v2i8:
    case AMDIL::REGIONAEXTLOAD_v2i8:
    case AMDIL::REGIONZEXTLOAD_v2i8:
    case AMDIL::PRIVATELOAD_v2i8:
    case AMDIL::PRIVATESEXTLOAD_v2i8:
    case AMDIL::PRIVATEAEXTLOAD_v2i8:
    case AMDIL::PRIVATEZEXTLOAD_v2i8:
    case AMDIL::CONSTANTLOAD_v2i8:
    case AMDIL::CONSTANTSEXTLOAD_v2i8:
    case AMDIL::CONSTANTAEXTLOAD_v2i8:
    case AMDIL::CONSTANTZEXTLOAD_v2i8:
      return UNPACK_V2I8;
    case AMDIL::GLOBALLOAD_v4i8:
    case AMDIL::GLOBALSEXTLOAD_v4i8:
    case AMDIL::GLOBALAEXTLOAD_v4i8:
    case AMDIL::GLOBALZEXTLOAD_v4i8:
    case AMDIL::LOCALLOAD_v4i8:
    case AMDIL::LOCALSEXTLOAD_v4i8:
    case AMDIL::LOCALAEXTLOAD_v4i8:
    case AMDIL::LOCALZEXTLOAD_v4i8:
    case AMDIL::REGIONLOAD_v4i8:
    case AMDIL::REGIONSEXTLOAD_v4i8:
    case AMDIL::REGIONAEXTLOAD_v4i8:
    case AMDIL::REGIONZEXTLOAD_v4i8:
    case AMDIL::PRIVATELOAD_v4i8:
    case AMDIL::PRIVATESEXTLOAD_v4i8:
    case AMDIL::PRIVATEAEXTLOAD_v4i8:
    case AMDIL::PRIVATEZEXTLOAD_v4i8:
    case AMDIL::CONSTANTLOAD_v4i8:
    case AMDIL::CONSTANTSEXTLOAD_v4i8:
    case AMDIL::CONSTANTAEXTLOAD_v4i8:
    case AMDIL::CONSTANTZEXTLOAD_v4i8:
      return UNPACK_V4I8;
    case AMDIL::GLOBALLOAD_v2i16:
    case AMDIL::GLOBALSEXTLOAD_v2i16:
    case AMDIL::GLOBALAEXTLOAD_v2i16:
    case AMDIL::GLOBALZEXTLOAD_v2i16:
    case AMDIL::LOCALLOAD_v2i16:
    case AMDIL::LOCALSEXTLOAD_v2i16:
    case AMDIL::LOCALAEXTLOAD_v2i16:
    case AMDIL::LOCALZEXTLOAD_v2i16:
    case AMDIL::REGIONLOAD_v2i16:
    case AMDIL::REGIONSEXTLOAD_v2i16:
    case AMDIL::REGIONAEXTLOAD_v2i16:
    case AMDIL::REGIONZEXTLOAD_v2i16:
    case AMDIL::PRIVATELOAD_v2i16:
    case AMDIL::PRIVATESEXTLOAD_v2i16:
    case AMDIL::PRIVATEAEXTLOAD_v2i16:
    case AMDIL::PRIVATEZEXTLOAD_v2i16:
    case AMDIL::CONSTANTLOAD_v2i16:
    case AMDIL::CONSTANTSEXTLOAD_v2i16:
    case AMDIL::CONSTANTAEXTLOAD_v2i16:
    case AMDIL::CONSTANTZEXTLOAD_v2i16:
      return UNPACK_V2I16;
    case AMDIL::GLOBALLOAD_v4i16:
    case AMDIL::GLOBALSEXTLOAD_v4i16:
    case AMDIL::GLOBALAEXTLOAD_v4i16:
    case AMDIL::GLOBALZEXTLOAD_v4i16:
    case AMDIL::LOCALLOAD_v4i16:
    case AMDIL::LOCALSEXTLOAD_v4i16:
    case AMDIL::LOCALAEXTLOAD_v4i16:
    case AMDIL::LOCALZEXTLOAD_v4i16:
    case AMDIL::REGIONLOAD_v4i16:
    case AMDIL::REGIONSEXTLOAD_v4i16:
    case AMDIL::REGIONAEXTLOAD_v4i16:
    case AMDIL::REGIONZEXTLOAD_v4i16:
    case AMDIL::PRIVATELOAD_v4i16:
    case AMDIL::PRIVATESEXTLOAD_v4i16:
    case AMDIL::PRIVATEAEXTLOAD_v4i16:
    case AMDIL::PRIVATEZEXTLOAD_v4i16:
    case AMDIL::CONSTANTLOAD_v4i16:
    case AMDIL::CONSTANTSEXTLOAD_v4i16:
    case AMDIL::CONSTANTAEXTLOAD_v4i16:
    case AMDIL::CONSTANTZEXTLOAD_v4i16:
      return UNPACK_V4I16;
  };
  // Any opcode not listed above.
  // NOTE(review): isPackedData() also lists the CPOOL*LOAD packed opcodes,
  // which have no entry here and so report NO_PACKING - confirm intended.
  return NO_PACKING;
}

// Returns the ResourceID field of the resource encoding that
// getAsmPrinterFlags() fills in for MI.
  uint32_t
AMDILIOExpansion::getPointerID(MachineInstr *MI)
{
  AMDILAS::InstrResEnc curInst;
  getAsmPrinterFlags(MI, curInst);
  return curInst.bits.ResourceID;
}

// Maps the packed id of MI to a shift size: 1 for the i8 pack/unpack ids,
// 2 for the i16 pack/unpack ids, 0 for everything else.
  uint32_t
AMDILIOExpansion::getShiftSize(MachineInstr *MI)
{
  switch(getPackedID(MI)) {
    default:
      return 0;
    case PACK_V2I8:
    case PACK_V4I8:
    case UNPACK_V2I8:
    case UNPACK_V4I8:
      return 1;
    case PACK_V2I16:
    case PACK_V4I16:
    case UNPACK_V2I16:
    case UNPACK_V4I16:
      return 2;
  }
  return 0;
}

// Size reported by MI's first memory operand; 4 when MI has no memory
// operand at all.
  uint32_t
AMDILIOExpansion::getMemorySize(MachineInstr *MI)
{
  if (MI->memoperands_empty()) {
    return 4;
  }
  return (uint32_t)((*MI->memoperands_begin())->getSize());
}

// Emits, in front of MI, the instructions that widen the value held in
// R1011 from `size` bits per component to a 64-bit-per-component value, for
// numComps == 1 or 2. signedShift selects the sign-extending sequence;
// otherwise the upper half is built from a zero literal.
// The 8- and 16-bit cases are delegated to expandLongExtendSub32().
  void
AMDILIOExpansion::expandLongExtend(MachineInstr *MI,
    uint32_t numComps, uint32_t size, bool signedShift)
{
  DebugLoc DL = MI->getDebugLoc();
  switch(size) {
    default:
      assert(0 && "Found a case we don't handle!");
      break;
    case 8:
      if (numComps == 1) {
        expandLongExtendSub32(MI, AMDIL::SHL_i8, AMDIL::SHRVEC_v2i32,
            AMDIL::USHRVEC_i8,
            24, (24ULL | (31ULL << 32)), 24, AMDIL::LCREATE, signedShift);
      } else if (numComps == 2) {
        expandLongExtendSub32(MI, AMDIL::SHL_v2i8, AMDIL::SHRVEC_v4i32,
            AMDIL::USHRVEC_v2i8,
            24, (24ULL | (31ULL << 32)), 24, AMDIL::LCREATE_v2i64, signedShift);
      } else {
        assert(0 && "Found a case we don't handle!");
      }
      break;
    case 16:
      if (numComps == 1) {
        expandLongExtendSub32(MI, AMDIL::SHL_i16, AMDIL::SHRVEC_v2i32,
            AMDIL::USHRVEC_i16,
            16, (16ULL | (31ULL << 32)), 16, AMDIL::LCREATE, signedShift);
      } else if (numComps == 2) {
        expandLongExtendSub32(MI, AMDIL::SHL_v2i16, AMDIL::SHRVEC_v4i32,
            AMDIL::USHRVEC_v2i16,
            16, (16ULL | (31ULL << 32)), 16, AMDIL::LCREATE_v2i64, signedShift);
      } else {
        assert(0 && "Found a case we don't handle!");
      }
      break;
    case 32:
      if (numComps == 1) {
        if (signedShift) {
          // Upper half = value shifted right by 31, placed in R1012.
          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHRVEC_i32), AMDIL::R1012)
            .addReg(AMDIL::R1011)
            .addImm(mMFI->addi32Literal(31));
          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
            .addReg(AMDIL::R1011).addReg(AMDIL::R1012);
        } else {
          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE), AMDIL::R1011)
            .addReg(AMDIL::R1011)
            .addImm(mMFI->addi32Literal(0));
        }
      } else if (numComps == 2) {
        if (signedShift) {
          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::SHRVEC_v2i32), AMDIL::R1012)
            .addReg(AMDIL::R1011)
            .addImm(mMFI->addi32Literal(31));
          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE_v2i64), AMDIL::R1011)
            .addReg(AMDIL::R1011)
            .addReg(AMDIL::R1012);
        } else {
          BuildMI(*mBB, MI, DL, mTII->get(AMDIL::LCREATE_v2i64),
AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0)); + } + } else { + assert(0 && "Found a case we don't handle!"); + } + }; +} + void +AMDILIOExpansion::expandLongExtendSub32(MachineInstr *MI, + unsigned SHLop, unsigned SHRop, unsigned USHRop, + unsigned SHLimm, uint64_t SHRimm, unsigned USHRimm, + unsigned LCRop, bool signedShift) +{ + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*mBB, MI, DL, mTII->get(SHLop), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(SHLimm)); + if (signedShift) { + BuildMI(*mBB, MI, DL, mTII->get(LCRop), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1011); + BuildMI(*mBB, MI, DL, mTII->get(SHRop), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi64Literal(SHRimm)); + } else { + BuildMI(*mBB, MI, DL, mTII->get(USHRop), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(USHRimm)); + BuildMI(*mBB, MI, MI->getDebugLoc(), mTII->get(LCRop), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0)); + } +} + + void +AMDILIOExpansion::expandIntegerExtend(MachineInstr *MI, unsigned SHLop, + unsigned SHRop, unsigned offset) +{ + DebugLoc DL = MI->getDebugLoc(); + offset = mMFI->addi32Literal(offset); + BuildMI(*mBB, MI, DL, + mTII->get(SHLop), AMDIL::R1011) + .addReg(AMDIL::R1011).addImm(offset); + BuildMI(*mBB, MI, DL, + mTII->get(SHRop), AMDIL::R1011) + .addReg(AMDIL::R1011).addImm(offset); +} + void +AMDILIOExpansion::expandExtendLoad(MachineInstr *MI) +{ + if (!isExtendLoad(MI)) { + return; + } + Type *mType = NULL; + if (!MI->memoperands_empty()) { + MachineMemOperand *memOp = (*MI->memoperands_begin()); + const Value *moVal = (memOp) ? memOp->getValue() : NULL; + mType = (moVal) ? 
moVal->getType() : NULL; + } + unsigned opcode = 0; + DebugLoc DL = MI->getDebugLoc(); + if (isZExtLoadInst(TM.getInstrInfo(), MI) || isAExtLoadInst(TM.getInstrInfo(), MI) || isSExtLoadInst(TM.getInstrInfo(), MI)) { + switch(MI->getDesc().OpInfo[0].RegClass) { + default: + assert(0 && "Found an extending load that we don't handle!"); + break; + case AMDIL::GPRI16RegClassID: + if (!isHardwareLocal(MI) + || mSTM->device()->usesSoftware(AMDILDeviceInfo::ByteLDSOps)) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_i16 : AMDIL::USHRVEC_i16; + expandIntegerExtend(MI, AMDIL::SHL_i16, opcode, 24); + } + break; + case AMDIL::GPRV2I16RegClassID: + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v2i16 : AMDIL::USHRVEC_v2i16; + expandIntegerExtend(MI, AMDIL::SHL_v2i16, opcode, 24); + break; + case AMDIL::GPRV4I8RegClassID: + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i8 : AMDIL::USHRVEC_v4i8; + expandIntegerExtend(MI, AMDIL::SHL_v4i8, opcode, 24); + break; + case AMDIL::GPRV4I16RegClassID: + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i16 : AMDIL::USHRVEC_v4i16; + expandIntegerExtend(MI, AMDIL::SHL_v4i16, opcode, 24); + break; + case AMDIL::GPRI32RegClassID: + // We can be a i8 or i16 bit sign extended value + if (isNbitType(mType, 8) || getMemorySize(MI) == 1) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_i32 : AMDIL::USHRVEC_i32; + expandIntegerExtend(MI, AMDIL::SHL_i32, opcode, 24); + } else if (isNbitType(mType, 16) || getMemorySize(MI) == 2) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_i32 : AMDIL::USHRVEC_i32; + expandIntegerExtend(MI, AMDIL::SHL_i32, opcode, 16); + } else { + assert(0 && "Found an extending load that we don't handle!"); + } + break; + case AMDIL::GPRV2I32RegClassID: + // We can be a v2i8 or v2i16 bit sign extended value + if (isNbitType(mType, 8, false) || getMemorySize(MI) == 2) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? 
AMDIL::SHRVEC_v2i32 : AMDIL::USHRVEC_v2i32; + expandIntegerExtend(MI, AMDIL::SHL_v2i32, opcode, 24); + } else if (isNbitType(mType, 16, false) || getMemorySize(MI) == 4) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v2i32 : AMDIL::USHRVEC_v2i32; + expandIntegerExtend(MI, AMDIL::SHL_v2i32, opcode, 16); + } else { + assert(0 && "Found an extending load that we don't handle!"); + } + break; + case AMDIL::GPRV4I32RegClassID: + // We can be a v4i8 or v4i16 bit sign extended value + if (isNbitType(mType, 8, false) || getMemorySize(MI) == 4) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i32 : AMDIL::USHRVEC_v4i32; + expandIntegerExtend(MI, AMDIL::SHL_v4i32, opcode, 24); + } else if (isNbitType(mType, 16, false) || getMemorySize(MI) == 8) { + opcode = isSExtLoadInst(TM.getInstrInfo(), MI) ? AMDIL::SHRVEC_v4i32 : AMDIL::USHRVEC_v4i32; + expandIntegerExtend(MI, AMDIL::SHL_v4i32, opcode, 16); + } else { + assert(0 && "Found an extending load that we don't handle!"); + } + break; + case AMDIL::GPRI64RegClassID: + // We can be a i8, i16 or i32 bit sign extended value + if (isNbitType(mType, 8) || getMemorySize(MI) == 1) { + expandLongExtend(MI, 1, 8, isSExtLoadInst(TM.getInstrInfo(), MI)); + } else if (isNbitType(mType, 16) || getMemorySize(MI) == 2) { + expandLongExtend(MI, 1, 16, isSExtLoadInst(TM.getInstrInfo(), MI)); + } else if (isNbitType(mType, 32) || getMemorySize(MI) == 4) { + expandLongExtend(MI, 1, 32, isSExtLoadInst(TM.getInstrInfo(), MI)); + } else { + assert(0 && "Found an extending load that we don't handle!"); + } + break; + case AMDIL::GPRV2I64RegClassID: + // We can be a v2i8, v2i16 or v2i32 bit sign extended value + if (isNbitType(mType, 8, false) || getMemorySize(MI) == 2) { + expandLongExtend(MI, 2, 8, isSExtLoadInst(TM.getInstrInfo(), MI)); + } else if (isNbitType(mType, 16, false) || getMemorySize(MI) == 4) { + expandLongExtend(MI, 2, 16, isSExtLoadInst(TM.getInstrInfo(), MI)); + } else if (isNbitType(mType, 32, 
false) || getMemorySize(MI) == 8) { + expandLongExtend(MI, 2, 32, isSExtLoadInst(TM.getInstrInfo(), MI)); + } else { + assert(0 && "Found an extending load that we don't handle!"); + } + break; + case AMDIL::GPRF32RegClassID: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::HTOF_f32), AMDIL::R1011) + .addReg(AMDIL::R1011); + break; + case AMDIL::GPRV2F32RegClassID: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::HTOF_v2f32), AMDIL::R1011) + .addReg(AMDIL::R1011); + break; + case AMDIL::GPRV4F32RegClassID: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::HTOF_v4f32), AMDIL::R1011) + .addReg(AMDIL::R1011); + break; + case AMDIL::GPRF64RegClassID: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::FTOD), AMDIL::R1011) + .addReg(AMDIL::R1011); + break; + case AMDIL::GPRV2F64RegClassID: + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VEXTRACT_v2f32), + AMDIL::R1012).addReg(AMDIL::R1011).addImm(2); + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::FTOD), AMDIL::R1011) + .addReg(AMDIL::R1011); + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::FTOD), AMDIL::R1012) + .addReg(AMDIL::R1012); + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::VINSERT_v2f64), AMDIL::R1011) + .addReg(AMDIL::R1011).addReg(AMDIL::R1012) + .addImm(1 << 8).addImm(1 << 8); + break; + }; + } else if (isSWSExtLoadInst(MI)) { + switch(MI->getDesc().OpInfo[0].RegClass) { + case AMDIL::GPRI8RegClassID: + if (!isHardwareLocal(MI) + || mSTM->device()->usesSoftware(AMDILDeviceInfo::ByteLDSOps)) { + expandIntegerExtend(MI, AMDIL::SHL_i8, AMDIL::SHRVEC_i8, 24); + } + break; + case AMDIL::GPRV2I8RegClassID: + expandIntegerExtend(MI, AMDIL::SHL_v2i8, AMDIL::SHRVEC_v2i8, 24); + break; + case AMDIL::GPRV4I8RegClassID: + expandIntegerExtend(MI, AMDIL::SHL_v4i8, AMDIL::SHRVEC_v4i8, 24); + break; + case AMDIL::GPRI16RegClassID: + if (!isHardwareLocal(MI) + || mSTM->device()->usesSoftware(AMDILDeviceInfo::ByteLDSOps)) { + expandIntegerExtend(MI, AMDIL::SHL_i16, AMDIL::SHRVEC_i16, 16); + } + break; + case AMDIL::GPRV2I16RegClassID: + expandIntegerExtend(MI, 
AMDIL::SHL_v2i16, AMDIL::SHRVEC_v2i16, 16); + break; + case AMDIL::GPRV4I16RegClassID: + expandIntegerExtend(MI, AMDIL::SHL_v4i16, AMDIL::SHRVEC_v4i16, 16); + break; + + }; + } +} + + void +AMDILIOExpansion::expandTruncData(MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + if (!isTruncStoreInst(TM.getInstrInfo(), MI)) { + return; + } + DebugLoc DL = MI->getDebugLoc(); + switch (MI->getOpcode()) { + default: + MI->dump(); + assert(!"Found a trunc store instructions we don't handle!"); + break; + case AMDIL::GLOBALTRUNCSTORE_i64i8: + case AMDIL::GLOBALTRUNCSTORE_v2i64i8: + case AMDIL::LOCALTRUNCSTORE_i64i8: + case AMDIL::LOCALTRUNCSTORE_v2i64i8: + case AMDIL::REGIONTRUNCSTORE_i64i8: + case AMDIL::REGIONTRUNCSTORE_v2i64i8: + case AMDIL::PRIVATETRUNCSTORE_i64i8: + case AMDIL::PRIVATETRUNCSTORE_v2i64i8: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011); + case AMDIL::GLOBALTRUNCSTORE_i16i8: + case AMDIL::GLOBALTRUNCSTORE_v2i16i8: + case AMDIL::GLOBALTRUNCSTORE_v4i16i8: + case AMDIL::LOCALTRUNCSTORE_i16i8: + case AMDIL::LOCALTRUNCSTORE_v2i16i8: + case AMDIL::LOCALTRUNCSTORE_v4i16i8: + case AMDIL::REGIONTRUNCSTORE_i16i8: + case AMDIL::REGIONTRUNCSTORE_v2i16i8: + case AMDIL::REGIONTRUNCSTORE_v4i16i8: + case AMDIL::PRIVATETRUNCSTORE_i16i8: + case AMDIL::PRIVATETRUNCSTORE_v2i16i8: + case AMDIL::PRIVATETRUNCSTORE_v4i16i8: + case AMDIL::GLOBALTRUNCSTORE_i32i8: + case AMDIL::GLOBALTRUNCSTORE_v2i32i8: + case AMDIL::GLOBALTRUNCSTORE_v4i32i8: + case AMDIL::LOCALTRUNCSTORE_i32i8: + case AMDIL::LOCALTRUNCSTORE_v2i32i8: + case AMDIL::LOCALTRUNCSTORE_v4i32i8: + case AMDIL::REGIONTRUNCSTORE_i32i8: + case AMDIL::REGIONTRUNCSTORE_v2i32i8: + case AMDIL::REGIONTRUNCSTORE_v4i32i8: + case AMDIL::PRIVATETRUNCSTORE_i32i8: + case AMDIL::PRIVATETRUNCSTORE_v2i32i8: + case AMDIL::PRIVATETRUNCSTORE_v4i32i8: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFF)); + 
break; + case AMDIL::GLOBALTRUNCSTORE_i64i16: + case AMDIL::GLOBALTRUNCSTORE_v2i64i16: + case AMDIL::LOCALTRUNCSTORE_i64i16: + case AMDIL::LOCALTRUNCSTORE_v2i64i16: + case AMDIL::REGIONTRUNCSTORE_i64i16: + case AMDIL::REGIONTRUNCSTORE_v2i64i16: + case AMDIL::PRIVATETRUNCSTORE_i64i16: + case AMDIL::PRIVATETRUNCSTORE_v2i64i16: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011); + case AMDIL::GLOBALTRUNCSTORE_i32i16: + case AMDIL::GLOBALTRUNCSTORE_v2i32i16: + case AMDIL::GLOBALTRUNCSTORE_v4i32i16: + case AMDIL::LOCALTRUNCSTORE_i32i16: + case AMDIL::LOCALTRUNCSTORE_v2i32i16: + case AMDIL::LOCALTRUNCSTORE_v4i32i16: + case AMDIL::REGIONTRUNCSTORE_i32i16: + case AMDIL::REGIONTRUNCSTORE_v2i32i16: + case AMDIL::REGIONTRUNCSTORE_v4i32i16: + case AMDIL::PRIVATETRUNCSTORE_i32i16: + case AMDIL::PRIVATETRUNCSTORE_v2i32i16: + case AMDIL::PRIVATETRUNCSTORE_v4i32i16: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::BINARY_AND_v4i32), AMDIL::R1011) + .addReg(AMDIL::R1011) + .addImm(mMFI->addi32Literal(0xFFFF)); + break; + case AMDIL::GLOBALTRUNCSTORE_i64i32: + case AMDIL::LOCALTRUNCSTORE_i64i32: + case AMDIL::REGIONTRUNCSTORE_i64i32: + case AMDIL::PRIVATETRUNCSTORE_i64i32: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::LLO), AMDIL::R1011) + .addReg(AMDIL::R1011); + break; + case AMDIL::GLOBALTRUNCSTORE_v2i64i32: + case AMDIL::LOCALTRUNCSTORE_v2i64i32: + case AMDIL::REGIONTRUNCSTORE_v2i64i32: + case AMDIL::PRIVATETRUNCSTORE_v2i64i32: + BuildMI(*mBB, MI, DL, + mTII->get(AMDIL::LLO_v2i64), AMDIL::R1011) + .addReg(AMDIL::R1011); + break; + case AMDIL::GLOBALTRUNCSTORE_f64f32: + case AMDIL::LOCALTRUNCSTORE_f64f32: + case AMDIL::REGIONTRUNCSTORE_f64f32: + case AMDIL::PRIVATETRUNCSTORE_f64f32: + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::DTOF), + AMDIL::R1011).addReg(AMDIL::R1011); + break; + case AMDIL::GLOBALTRUNCSTORE_v2f64f32: + case AMDIL::LOCALTRUNCSTORE_v2f64f32: + case AMDIL::REGIONTRUNCSTORE_v2f64f32: + case AMDIL::PRIVATETRUNCSTORE_v2f64f32: + 
BuildMI(*mBB, I, DL, mTII->get(AMDIL::VEXTRACT_v2f64), + AMDIL::R1012).addReg(AMDIL::R1011).addImm(2); + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::DTOF), + AMDIL::R1011).addReg(AMDIL::R1011); + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::DTOF), + AMDIL::R1012).addReg(AMDIL::R1012); + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VINSERT_v2f32), + AMDIL::R1011).addReg(AMDIL::R1011).addReg(AMDIL::R1012) + .addImm(1 << 8).addImm(1 << 8); + break; + } +} + void +AMDILIOExpansion::expandAddressCalc(MachineInstr *MI) +{ + if (!isAddrCalcInstr(MI)) { + return; + } + DebugLoc DL = MI->getDebugLoc(); + switch(MI->getOpcode()) { + ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE) + ExpandCaseToAllTypes(AMDIL::PRIVATESTORE) + ExpandCaseToAllTypes(AMDIL::PRIVATELOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD) + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), + AMDIL::R1010).addReg(AMDIL::R1010).addReg(AMDIL::T1); + break; + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE) + ExpandCaseToAllTypes(AMDIL::LOCALLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD) + ExpandCaseToAllTypes(AMDIL::LOCALSTORE) + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), + AMDIL::R1010).addReg(AMDIL::R1010).addReg(AMDIL::T2); + break; + ExpandCaseToAllTypes(AMDIL::CPOOLLOAD) + ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD) + ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD) + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), + AMDIL::R1010).addReg(AMDIL::R1010).addReg(AMDIL::SDP); + break; + default: + return; + } +} + void +AMDILIOExpansion::expandLoadStartCode(MachineInstr *MI) +{ + DebugLoc DL = MI->getDebugLoc(); + if (MI->getOperand(2).isReg()) { + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::ADD_i32), + AMDIL::R1010).addReg(MI->getOperand(1).getReg()) + .addReg(MI->getOperand(2).getReg()); + } 
else { + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::MOVE_i32), + AMDIL::R1010).addReg(MI->getOperand(1).getReg()); + } + MI->getOperand(1).setReg(AMDIL::R1010); + expandAddressCalc(MI); +} + void +AMDILIOExpansion::emitStaticCPLoad(MachineInstr* MI, int swizzle, + int id, bool ExtFPLoad) +{ + DebugLoc DL = MI->getDebugLoc(); + switch(swizzle) { + default: + BuildMI(*mBB, MI, DL, mTII->get(ExtFPLoad + ? AMDIL::DTOF : AMDIL::MOVE_i32), + MI->getOperand(0).getReg()) + .addImm(id); + break; + case 1: + case 2: + case 3: + BuildMI(*mBB, MI, DL, mTII->get(ExtFPLoad + ? AMDIL::DTOF : AMDIL::MOVE_i32), AMDIL::R1001) + .addImm(id); + BuildMI(*mBB, MI, DL, mTII->get(AMDIL::VINSERT_v4i32), + MI->getOperand(0).getReg()) + .addReg(MI->getOperand(0).getReg()) + .addReg(AMDIL::R1001) + .addImm(swizzle + 1); + break; + }; +} + void +AMDILIOExpansion::emitCPInst(MachineInstr* MI, + const Constant* C, AMDILKernelManager* KM, int swizzle, bool ExtFPLoad) +{ + if (const ConstantFP* CFP = dyn_cast<ConstantFP>(C)) { + if (CFP->getType()->isFloatTy()) { + uint32_t val = (uint32_t)(CFP->getValueAPF().bitcastToAPInt() + .getZExtValue()); + uint32_t id = mMFI->addi32Literal(val); + if (!id) { + const APFloat &APF = CFP->getValueAPF(); + union dtol_union { + double d; + uint64_t ul; + } conv; + if (&APF.getSemantics() + == (const llvm::fltSemantics*)&APFloat::IEEEsingle) { + float fval = APF.convertToFloat(); + conv.d = (double)fval; + } else { + conv.d = APF.convertToDouble(); + } + id = mMFI->addi64Literal(conv.ul); + } + emitStaticCPLoad(MI, swizzle, id, ExtFPLoad); + } else { + const APFloat &APF = CFP->getValueAPF(); + union ftol_union { + double d; + uint64_t ul; + } conv; + if (&APF.getSemantics() + == (const llvm::fltSemantics*)&APFloat::IEEEsingle) { + float fval = APF.convertToFloat(); + conv.d = (double)fval; + } else { + conv.d = APF.convertToDouble(); + } + uint32_t id = mMFI->getLongLits(conv.ul); + if (!id) { + id = mMFI->getIntLits((uint32_t)conv.ul); + } + emitStaticCPLoad(MI, 
swizzle, id, ExtFPLoad); + } + } else if (const ConstantInt* CI = dyn_cast<ConstantInt>(C)) { + int64_t val = 0; + if (CI) { + val = CI->getSExtValue(); + } + if (CI->getBitWidth() == 64) { + emitStaticCPLoad(MI, swizzle, mMFI->addi64Literal(val), ExtFPLoad); + } else { + emitStaticCPLoad(MI, swizzle, mMFI->addi32Literal(val), ExtFPLoad); + } + } else if (const ConstantArray* CA = dyn_cast<ConstantArray>(C)) { + uint32_t size = CA->getNumOperands(); + assert(size < 5 && "Cannot handle a constant array where size > 4"); + if (size > 4) { + size = 4; + } + for (uint32_t x = 0; x < size; ++x) { + emitCPInst(MI, CA->getOperand(0), KM, x, ExtFPLoad); + } + } else if (const ConstantAggregateZero* CAZ + = dyn_cast<ConstantAggregateZero>(C)) { + if (CAZ->isNullValue()) { + emitStaticCPLoad(MI, swizzle, mMFI->addi32Literal(0), ExtFPLoad); + } + } else if (const ConstantStruct* CS = dyn_cast<ConstantStruct>(C)) { + uint32_t size = CS->getNumOperands(); + assert(size < 5 && "Cannot handle a constant array where size > 4"); + if (size > 4) { + size = 4; + } + for (uint32_t x = 0; x < size; ++x) { + emitCPInst(MI, CS->getOperand(0), KM, x, ExtFPLoad); + } + } else if (const ConstantVector* CV = dyn_cast<ConstantVector>(C)) { + // TODO: Make this handle vectors natively up to the correct + // size + uint32_t size = CV->getNumOperands(); + assert(size < 5 && "Cannot handle a constant array where size > 4"); + if (size > 4) { + size = 4; + } + for (uint32_t x = 0; x < size; ++x) { + emitCPInst(MI, CV->getOperand(0), KM, x, ExtFPLoad); + } + } else { + // TODO: Do we really need to handle ConstantPointerNull? + // What about BlockAddress, ConstantExpr and Undef? + // How would these even be generated by a valid CL program? 
+ assert(0 && "Found a constant type that I don't know how to handle"); + } +} + diff --git a/src/gallium/drivers/radeon/AMDILIOExpansion.h b/src/gallium/drivers/radeon/AMDILIOExpansion.h new file mode 100644 index 00000000000..af4709a892c --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILIOExpansion.h @@ -0,0 +1,320 @@ +//===----------- AMDILIOExpansion.h - IO Expansion Pass -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// The AMDIL IO Expansion class expands pseudo IO instructions into a sequence +// of instructions that produces the correct results. These instructions are +// not expanded earlier in the backend because any pass before this can assume to +// be able to generate a load/store instruction. So this pass can only have +// passes that execute after it if no load/store instructions can be generated +// in those passes. 
+//===----------------------------------------------------------------------===// +#ifndef _AMDILIOEXPANSION_H_ +#define _AMDILIOEXPANSION_H_ +#undef DEBUG_TYPE +#undef DEBUGME +#define DEBUG_TYPE "IOExpansion" +#if !defined(NDEBUG) +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME (false) +#endif +#include "AMDIL.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + class MachineFunction; + class AMDILKernelManager; + class AMDILMachineFunctionInfo; + class AMDILSubtarget; + class MachineInstr; + class Constant; + class TargetInstrInfo; + class Type; + typedef enum { + NO_PACKING = 0, + PACK_V2I8, + PACK_V4I8, + PACK_V2I16, + PACK_V4I16, + UNPACK_V2I8, + UNPACK_V4I8, + UNPACK_V2I16, + UNPACK_V4I16, + UNPACK_LAST + } REG_PACKED_TYPE; + class AMDILIOExpansion : public MachineFunctionPass + { + public: + virtual ~AMDILIOExpansion(); + virtual const char* getPassName() const; + bool runOnMachineFunction(MachineFunction &MF); + static char ID; + protected: + AMDILIOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + TargetMachine &TM; + // + // @param MI Machine instruction to check. + // @brief checks to see if the machine instruction + // is an I/O instruction or not. + // + // @return true if I/O, false otherwise. + // + virtual bool + isIOInstruction(MachineInstr *MI); + // Wrapper function that calls the appropriate I/O + // expansion function based on the instruction type. 
+ virtual void + expandIOInstruction(MachineInstr *MI); + virtual void + expandGlobalStore(MachineInstr *MI) = 0; + virtual void + expandLocalStore(MachineInstr *MI) = 0; + virtual void + expandRegionStore(MachineInstr *MI) = 0; + virtual void + expandPrivateStore(MachineInstr *MI) = 0; + virtual void + expandGlobalLoad(MachineInstr *MI) = 0; + virtual void + expandRegionLoad(MachineInstr *MI) = 0; + virtual void + expandLocalLoad(MachineInstr *MI) = 0; + virtual void + expandPrivateLoad(MachineInstr *MI) = 0; + virtual void + expandConstantLoad(MachineInstr *MI) = 0; + virtual void + expandConstantPoolLoad(MachineInstr *MI) = 0; + bool + isAddrCalcInstr(MachineInstr *MI); + bool + isExtendLoad(MachineInstr *MI); + bool + isHardwareRegion(MachineInstr *MI); + bool + isHardwareLocal(MachineInstr *MI); + bool + isPackedData(MachineInstr *MI); + bool + isStaticCPLoad(MachineInstr *MI); + bool + isNbitType(Type *MI, uint32_t nBits, bool isScalar = true); + bool + isHardwareInst(MachineInstr *MI); + uint32_t + getMemorySize(MachineInstr *MI); + REG_PACKED_TYPE + getPackedID(MachineInstr *MI); + uint32_t + getShiftSize(MachineInstr *MI); + uint32_t + getPointerID(MachineInstr *MI); + void + expandTruncData(MachineInstr *MI); + void + expandLoadStartCode(MachineInstr *MI); + virtual void + expandStoreSetupCode(MachineInstr *MI) = 0; + void + expandAddressCalc(MachineInstr *MI); + void + expandLongExtend(MachineInstr *MI, + uint32_t numComponents, uint32_t size, bool signedShift); + void + expandLongExtendSub32(MachineInstr *MI, + unsigned SHLop, unsigned SHRop, unsigned USHRop, + unsigned SHLimm, uint64_t SHRimm, unsigned USHRimm, + unsigned LCRop, bool signedShift); + void + expandIntegerExtend(MachineInstr *MI, unsigned, unsigned, unsigned); + void + expandExtendLoad(MachineInstr *MI); + virtual void + expandPackedData(MachineInstr *MI) = 0; + void + emitCPInst(MachineInstr* MI, const Constant* C, + AMDILKernelManager* KM, int swizzle, bool ExtFPLoad); + + bool mDebug; 
+ const AMDILSubtarget *mSTM; + AMDILKernelManager *mKM; + MachineBasicBlock *mBB; + AMDILMachineFunctionInfo *mMFI; + const TargetInstrInfo *mTII; + bool saveInst; + private: + void + emitStaticCPLoad(MachineInstr* MI, int swizzle, int id, + bool ExtFPLoad); + }; // class AMDILIOExpansion + + // Intermediate class that holds I/O code expansion that is common to the + // 7XX, Evergreen and Northern Island family of chips. + class AMDIL789IOExpansion : public AMDILIOExpansion { + public: + virtual ~AMDIL789IOExpansion(); + virtual const char* getPassName() const; + protected: + AMDIL789IOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + virtual void + expandGlobalStore(MachineInstr *MI) = 0; + virtual void + expandLocalStore(MachineInstr *MI) = 0; + virtual void + expandRegionStore(MachineInstr *MI) = 0; + virtual void + expandGlobalLoad(MachineInstr *MI) = 0; + virtual void + expandRegionLoad(MachineInstr *MI) = 0; + virtual void + expandLocalLoad(MachineInstr *MI) = 0; + virtual void + expandPrivateStore(MachineInstr *MI); + virtual void + expandConstantLoad(MachineInstr *MI); + virtual void + expandPrivateLoad(MachineInstr *MI) ; + virtual void + expandConstantPoolLoad(MachineInstr *MI); + void + expandStoreSetupCode(MachineInstr *MI); + virtual void + expandPackedData(MachineInstr *MI); + private: + void emitVectorAddressCalc(MachineInstr *MI, bool is32bit, + bool needsSelect); + void emitVectorSwitchWrite(MachineInstr *MI, bool is32bit); + void emitComponentExtract(MachineInstr *MI, unsigned flag, unsigned src, + unsigned dst, bool beforeInst); + void emitDataLoadSelect(MachineInstr *MI); + }; // class AMDIL789IOExpansion + // Class that handles I/O emission for the 7XX family of devices. 
+ class AMDIL7XXIOExpansion : public AMDIL789IOExpansion { + public: + AMDIL7XXIOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + + ~AMDIL7XXIOExpansion(); + const char* getPassName() const; + protected: + void + expandGlobalStore(MachineInstr *MI); + void + expandLocalStore(MachineInstr *MI); + void + expandRegionStore(MachineInstr *MI); + void + expandGlobalLoad(MachineInstr *MI); + void + expandRegionLoad(MachineInstr *MI); + void + expandLocalLoad(MachineInstr *MI); + }; // class AMDIL7XXIOExpansion + + // Class that handles image functions to expand them into the + // correct set of I/O instructions. + class AMDILImageExpansion : public AMDIL789IOExpansion { + public: + AMDILImageExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + + virtual ~AMDILImageExpansion(); + protected: + // + // @param MI Instruction iterator that has the sample instruction + // that needs to be taken care of. + // @brief transforms the __amdil_sample_data function call into a + // sample instruction in IL. + // + // @warning This function only works correctly if all functions get + // inlined + // + virtual void + expandImageLoad(MachineBasicBlock *BB, MachineInstr *MI); + // + // @param MI Instruction iterator that has the write instruction that + // needs to be taken care of. + // @brief transforms the __amdil_write_data function call into a + // simple UAV write instruction in IL. 
+ // + // @warning This function only works correctly if all functions get + // inlined + // + virtual void + expandImageStore(MachineBasicBlock *BB, MachineInstr *MI); + // + // @param MI Instruction interator that has the image parameter + // instruction + // @brief transforms the __amdil_get_image_params function call into + // a copy of data from a specific constant buffer to the register + // + // @warning This function only works correctly if all functions get + // inlined + // + virtual void + expandImageParam(MachineBasicBlock *BB, MachineInstr *MI); + + // + // @param MI Insturction that points to the image + // @brief transforms __amdil_sample_data into a sequence of + // if/else that selects the correct sample instruction. + // + // @warning This function is inefficient and works with no + // inlining. + // + virtual void + expandInefficientImageLoad(MachineBasicBlock *BB, MachineInstr *MI); + private: + AMDILImageExpansion(); // Do not implement. + + }; // class AMDILImageExpansion + + // Class that expands IO instructions for Evergreen and Northern + // Island family of devices. 
+ class AMDILEGIOExpansion : public AMDILImageExpansion { + public: + AMDILEGIOExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + + virtual ~AMDILEGIOExpansion(); + const char* getPassName() const; + protected: + virtual bool + isIOInstruction(MachineInstr *MI); + virtual void + expandIOInstruction(MachineInstr *MI); + bool + isImageIO(MachineInstr *MI); + virtual void + expandGlobalStore(MachineInstr *MI); + void + expandLocalStore(MachineInstr *MI); + void + expandRegionStore(MachineInstr *MI); + virtual void + expandGlobalLoad(MachineInstr *MI); + void + expandRegionLoad(MachineInstr *MI); + void + expandLocalLoad(MachineInstr *MI); + virtual bool + isCacheableOp(MachineInstr *MI); + void + expandStoreSetupCode(MachineInstr *MI); + void + expandPackedData(MachineInstr *MI); + private: + bool + isArenaOp(MachineInstr *MI); + void + expandArenaSetup(MachineInstr *MI); + }; // class AMDILEGIOExpansion +} // namespace llvm +#endif // _AMDILIOEXPANSION_H_ diff --git a/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp new file mode 100644 index 00000000000..ff04d9d55bf --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp @@ -0,0 +1,457 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file defines an instruction selector for the AMDIL target. 
+// +//===----------------------------------------------------------------------===// +#include "AMDILDevices.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/Compiler.h" + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDILDAGToDAGISel - AMDIL specific code to select AMDIL machine instructions +// //for SelectionDAG operations. +// +namespace { +class AMDILDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDIL Subtarget around so that we can + // make the right decision when generating code for different targets. + const AMDILSubtarget &Subtarget; +public: + AMDILDAGToDAGISel(AMDILTargetMachine &TM AMDIL_OPT_LEVEL_DECL); + virtual ~AMDILDAGToDAGISel(); + inline SDValue getSmallIPtrImm(unsigned Imm); + + SDNode *Select(SDNode *N); + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + static bool isGlobalStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + static bool isCPLoad(const LoadSDNode *N); + static bool isConstantLoad(const LoadSDNode *N, int cbID); + static bool isGlobalLoad(const LoadSDNode *N); + static bool isPrivateLoad(const LoadSDNode *N); + static bool isLocalLoad(const LoadSDNode *N); + static bool isRegionLoad(const LoadSDNode *N); + + virtual const char *getPassName() const; +private: + SDNode *xformAtomicInst(SDNode *N); + + // 
Include the pieces autogenerated from the target description. +#include "AMDILGenDAGISel.inc" +}; +} // end anonymous namespace + +// createAMDILISelDag - This pass converts a legalized DAG into a AMDIL-specific +// DAG, ready for instruction scheduling. +// +FunctionPass *llvm::createAMDILISelDag(AMDILTargetMachine &TM + AMDIL_OPT_LEVEL_DECL) { + return new AMDILDAGToDAGISel(TM AMDIL_OPT_LEVEL_VAR); +} + +AMDILDAGToDAGISel::AMDILDAGToDAGISel(AMDILTargetMachine &TM + AMDIL_OPT_LEVEL_DECL) + : SelectionDAGISel(TM AMDIL_OPT_LEVEL_VAR), Subtarget(TM.getSubtarget<AMDILSubtarget>()) +{ +} + +AMDILDAGToDAGISel::~AMDILDAGToDAGISel() { +} + +SDValue AMDILDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); +} + +bool AMDILDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + return true; +} + +bool AMDILDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDILDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = 
CurDAG->getTargetConstant(0, MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + return true; +} + +SDNode *AMDILDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + return NULL; // Already selected. + } + switch (Opc) { + default: break; + case ISD::FrameIndex: + { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) { + unsigned int FI = FIN->getIndex(); + EVT OpVT = N->getValueType(0); + unsigned int NewOpc = AMDIL::MOVE_i32; + SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); + return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); + } + } + break; + } + // For all atomic instructions, we need to add a constant + // operand that stores the resource ID in the instruction + if (Opc > AMDILISD::ADDADDR && Opc < AMDILISD::APPEND_ALLOC) { + N = xformAtomicInst(N); + } + return SelectCode(N); +} + +bool AMDILDAGToDAGISel::isGlobalStore(const StoreSDNode *N) { + return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS); +} + +bool AMDILDAGToDAGISel::isPrivateStore(const StoreSDNode *N) { + return (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS)); +} + +bool AMDILDAGToDAGISel::isLocalStore(const StoreSDNode *N) { + return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS); +} + +bool AMDILDAGToDAGISel::isRegionStore(const StoreSDNode *N) { + return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS); +} + +bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) { + if (check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) { + return true; + } + MachineMemOperand *MMO = N->getMemOperand(); + const Value *V = MMO->getValue(); + const Value *BV = 
getBasePointerValue(V); + if (MMO + && MMO->getValue() + && ((V && dyn_cast<GlobalValue>(V)) + || (BV && dyn_cast<GlobalValue>( + getBasePointerValue(MMO->getValue()))))) { + return check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS); + } else { + return false; + } +} + +bool AMDILDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) { + return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS); +} + +bool AMDILDAGToDAGISel::isLocalLoad(const LoadSDNode *N) { + return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS); +} + +bool AMDILDAGToDAGISel::isRegionLoad(const LoadSDNode *N) { + return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS); +} + +bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) { + MachineMemOperand *MMO = N->getMemOperand(); + if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) { + if (MMO) { + const Value *V = MMO->getValue(); + const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V); + if (PSV && PSV == PseudoSourceValue::getConstantPool()) { + return true; + } + } + } + return false; +} + +bool AMDILDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) { + if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) { + // Check to make sure we are not a constant pool load or a constant load + // that is marked as a private load + if (isCPLoad(N) || isConstantLoad(N, -1)) { + return false; + } + } + if (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS) + && !check_type(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS)) + { + return true; + } + return false; +} + +const char *AMDILDAGToDAGISel::getPassName() const { + return "AMDIL DAG->DAG Pattern Instruction Selection"; +} + +SDNode* +AMDILDAGToDAGISel::xformAtomicInst(SDNode *N) +{ + uint32_t addVal = 1; + bool addOne = false; + // 
bool bitCastToInt = (N->getValueType(0) == MVT::f32); + unsigned opc = N->getOpcode(); + switch (opc) { + default: return N; + case AMDILISD::ATOM_G_ADD: + case AMDILISD::ATOM_G_AND: + case AMDILISD::ATOM_G_MAX: + case AMDILISD::ATOM_G_UMAX: + case AMDILISD::ATOM_G_MIN: + case AMDILISD::ATOM_G_UMIN: + case AMDILISD::ATOM_G_OR: + case AMDILISD::ATOM_G_SUB: + case AMDILISD::ATOM_G_RSUB: + case AMDILISD::ATOM_G_XCHG: + case AMDILISD::ATOM_G_XOR: + case AMDILISD::ATOM_G_ADD_NORET: + case AMDILISD::ATOM_G_AND_NORET: + case AMDILISD::ATOM_G_MAX_NORET: + case AMDILISD::ATOM_G_UMAX_NORET: + case AMDILISD::ATOM_G_MIN_NORET: + case AMDILISD::ATOM_G_UMIN_NORET: + case AMDILISD::ATOM_G_OR_NORET: + case AMDILISD::ATOM_G_SUB_NORET: + case AMDILISD::ATOM_G_RSUB_NORET: + case AMDILISD::ATOM_G_XCHG_NORET: + case AMDILISD::ATOM_G_XOR_NORET: + case AMDILISD::ATOM_L_ADD: + case AMDILISD::ATOM_L_AND: + case AMDILISD::ATOM_L_MAX: + case AMDILISD::ATOM_L_UMAX: + case AMDILISD::ATOM_L_MIN: + case AMDILISD::ATOM_L_UMIN: + case AMDILISD::ATOM_L_OR: + case AMDILISD::ATOM_L_SUB: + case AMDILISD::ATOM_L_RSUB: + case AMDILISD::ATOM_L_XCHG: + case AMDILISD::ATOM_L_XOR: + case AMDILISD::ATOM_L_ADD_NORET: + case AMDILISD::ATOM_L_AND_NORET: + case AMDILISD::ATOM_L_MAX_NORET: + case AMDILISD::ATOM_L_UMAX_NORET: + case AMDILISD::ATOM_L_MIN_NORET: + case AMDILISD::ATOM_L_UMIN_NORET: + case AMDILISD::ATOM_L_OR_NORET: + case AMDILISD::ATOM_L_SUB_NORET: + case AMDILISD::ATOM_L_RSUB_NORET: + case AMDILISD::ATOM_L_XCHG_NORET: + case AMDILISD::ATOM_L_XOR_NORET: + case AMDILISD::ATOM_R_ADD: + case AMDILISD::ATOM_R_AND: + case AMDILISD::ATOM_R_MAX: + case AMDILISD::ATOM_R_UMAX: + case AMDILISD::ATOM_R_MIN: + case AMDILISD::ATOM_R_UMIN: + case AMDILISD::ATOM_R_OR: + case AMDILISD::ATOM_R_SUB: + case AMDILISD::ATOM_R_RSUB: + case AMDILISD::ATOM_R_XCHG: + case AMDILISD::ATOM_R_XOR: + case AMDILISD::ATOM_R_ADD_NORET: + case AMDILISD::ATOM_R_AND_NORET: + case AMDILISD::ATOM_R_MAX_NORET: + case 
AMDILISD::ATOM_R_UMAX_NORET: + case AMDILISD::ATOM_R_MIN_NORET: + case AMDILISD::ATOM_R_UMIN_NORET: + case AMDILISD::ATOM_R_OR_NORET: + case AMDILISD::ATOM_R_SUB_NORET: + case AMDILISD::ATOM_R_RSUB_NORET: + case AMDILISD::ATOM_R_XCHG_NORET: + case AMDILISD::ATOM_R_XOR_NORET: + case AMDILISD::ATOM_G_CMPXCHG: + case AMDILISD::ATOM_G_CMPXCHG_NORET: + case AMDILISD::ATOM_L_CMPXCHG: + case AMDILISD::ATOM_L_CMPXCHG_NORET: + case AMDILISD::ATOM_R_CMPXCHG: + case AMDILISD::ATOM_R_CMPXCHG_NORET: + break; + case AMDILISD::ATOM_G_DEC: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_G_SUB; + } + break; + case AMDILISD::ATOM_G_INC: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_G_ADD; + } + break; + case AMDILISD::ATOM_G_DEC_NORET: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_G_SUB_NORET; + } + break; + case AMDILISD::ATOM_G_INC_NORET: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_G_ADD_NORET; + } + break; + case AMDILISD::ATOM_L_DEC: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_L_SUB; + } + break; + case AMDILISD::ATOM_L_INC: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_L_ADD; + } + break; + case AMDILISD::ATOM_L_DEC_NORET: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_L_SUB_NORET; + } + break; + case AMDILISD::ATOM_L_INC_NORET: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_L_ADD_NORET; + } + break; + case 
AMDILISD::ATOM_R_DEC: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_R_SUB; + } + break; + case AMDILISD::ATOM_R_INC: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_R_ADD; + } + break; + case AMDILISD::ATOM_R_DEC_NORET: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + // Keep the no-return flavour, as the G and L *_DEC_NORET cases do. + opc = AMDILISD::ATOM_R_SUB_NORET; + } + break; + case AMDILISD::ATOM_R_INC_NORET: + addOne = true; + if (Subtarget.calVersion() >= CAL_VERSION_SC_136) { + addVal = (uint32_t)-1; + } else { + opc = AMDILISD::ATOM_R_ADD_NORET; + } + break; + } + // The largest we can have is a cmpxchg w/ a return value and an output chain. + // The cmpxchg function has 3 inputs and a single output along with an + // output chain and a target constant, giving a total of 6. + SDValue Ops[12]; + unsigned x = 0; + unsigned y = N->getNumOperands(); + // Up to two operands are appended below (the optional add value and the + // trailing target constant), so the copied operands must leave room. + assert(y + 2 <= sizeof(Ops) / sizeof(Ops[0]) + && "Too many operands for an atomic node!"); + for (x = 0; x < y; ++x) { + Ops[x] = N->getOperand(x); + } + if (addOne) { + Ops[x++] = SDValue(SelectCode(CurDAG->getConstant(addVal, MVT::i32).getNode()), 0); + } + Ops[x++] = CurDAG->getTargetConstant(0, MVT::i32); + SDVTList Tys = N->getVTList(); + MemSDNode *MemNode = dyn_cast<MemSDNode>(N); + assert(MemNode && "Atomic should be of MemSDNode type!"); + N = CurDAG->getMemIntrinsicNode(opc, N->getDebugLoc(), Tys, Ops, x, + MemNode->getMemoryVT(), MemNode->getMemOperand()).getNode(); + return N; +} + +#ifdef DEBUGTMP +#undef INT64_C +#endif +#undef DEBUGTMP diff --git a/src/gallium/drivers/radeon/AMDILISelLowering.cpp b/src/gallium/drivers/radeon/AMDILISelLowering.cpp new file mode 100644 index 00000000000..6f78d15ad0b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILISelLowering.cpp @@ -0,0 +1,5612 @@ +//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed 
under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file implements the interfaces that AMDIL uses to lower LLVM code into a +// selection DAG. +// +//===----------------------------------------------------------------------===// + +#include "AMDILISelLowering.h" +#include "AMDILDevices.h" +#include "AMDILGlobalManager.h" +#include "AMDILIntrinsicInfo.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILSubtarget.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; +#define ISDBITCAST ISD::BITCAST +#define MVTGLUE MVT::Glue +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// +#include "AMDILGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// TargetLowering Implementation Help Functions Begin +//===----------------------------------------------------------------------===// + static SDValue +getConversionNode(SelectionDAG &DAG, SDValue& Src, SDValue& Dst, bool asType) +{ + DebugLoc DL = Src.getDebugLoc(); + EVT svt = Src.getValueType().getScalarType(); + EVT dvt = Dst.getValueType().getScalarType(); + if (svt.isFloatingPoint() && dvt.isFloatingPoint()) { + if (dvt.bitsGT(svt)) { + Src = DAG.getNode(ISD::FP_EXTEND, DL, 
dvt, Src); + } else if (dvt.bitsLT(svt)) { + // Destination is the narrower fp type: round the source down to it. + Src = DAG.getNode(ISD::FP_ROUND, DL, dvt, Src, + DAG.getConstant(1, MVT::i32)); + } + } else if (svt.isInteger() && dvt.isInteger()) { + if (!svt.bitsEq(dvt)) { + Src = DAG.getSExtOrTrunc(Src, DL, dvt); + } else { + Src = DAG.getNode(AMDILISD::MOVE, DL, dvt, Src); + } + } else if (svt.isInteger()) { + unsigned opcode = (asType) ? ISDBITCAST : ISD::SINT_TO_FP; + if (!svt.bitsEq(dvt)) { + if (dvt.getSimpleVT().SimpleTy == MVT::f32) { + Src = DAG.getSExtOrTrunc(Src, DL, MVT::i32); + } else if (dvt.getSimpleVT().SimpleTy == MVT::f64) { + Src = DAG.getSExtOrTrunc(Src, DL, MVT::i64); + } else { + assert(0 && "We only support 32 and 64bit fp types"); + } + } + Src = DAG.getNode(opcode, DL, dvt, Src); + } else if (dvt.isInteger()) { + unsigned opcode = (asType) ? ISDBITCAST : ISD::FP_TO_SINT; + if (svt.getSimpleVT().SimpleTy == MVT::f32) { + Src = DAG.getNode(opcode, DL, MVT::i32, Src); + } else if (svt.getSimpleVT().SimpleTy == MVT::f64) { + Src = DAG.getNode(opcode, DL, MVT::i64, Src); + } else { + assert(0 && "We only support 32 and 64bit fp types"); + } + Src = DAG.getSExtOrTrunc(Src, DL, dvt); + } + return Src; +} +// CondCCodeToCC - Convert a DAG condition code to a AMDIL CC +// condition. 
+ static AMDILCC::CondCodes +CondCCodeToCC(ISD::CondCode CC, const MVT::SimpleValueType& type) +{ + switch (CC) { + default: + { + errs()<<"Condition Code: "<< (unsigned int)CC<<"\n"; + assert(0 && "Unknown condition code!"); + } + case ISD::SETO: + switch(type) { + case MVT::f32: + return AMDILCC::IL_CC_F_O; + case MVT::f64: + return AMDILCC::IL_CC_D_O; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETUO: + switch(type) { + case MVT::f32: + return AMDILCC::IL_CC_F_UO; + case MVT::f64: + return AMDILCC::IL_CC_D_UO; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETGT: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_I_GT; + case MVT::f32: + return AMDILCC::IL_CC_F_GT; + case MVT::f64: + return AMDILCC::IL_CC_D_GT; + case MVT::i64: + return AMDILCC::IL_CC_L_GT; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETGE: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_I_GE; + case MVT::f32: + return AMDILCC::IL_CC_F_GE; + case MVT::f64: + return AMDILCC::IL_CC_D_GE; + case MVT::i64: + return AMDILCC::IL_CC_L_GE; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETLT: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_I_LT; + case MVT::f32: + return AMDILCC::IL_CC_F_LT; + case MVT::f64: + return AMDILCC::IL_CC_D_LT; + case MVT::i64: + return AMDILCC::IL_CC_L_LT; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETLE: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_I_LE; + case MVT::f32: 
+ return AMDILCC::IL_CC_F_LE; + case MVT::f64: + return AMDILCC::IL_CC_D_LE; + case MVT::i64: + return AMDILCC::IL_CC_L_LE; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETNE: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_I_NE; + case MVT::f32: + return AMDILCC::IL_CC_F_NE; + case MVT::f64: + return AMDILCC::IL_CC_D_NE; + case MVT::i64: + return AMDILCC::IL_CC_L_NE; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETEQ: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_I_EQ; + case MVT::f32: + return AMDILCC::IL_CC_F_EQ; + case MVT::f64: + return AMDILCC::IL_CC_D_EQ; + case MVT::i64: + return AMDILCC::IL_CC_L_EQ; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETUGT: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_U_GT; + case MVT::f32: + return AMDILCC::IL_CC_F_UGT; + case MVT::f64: + return AMDILCC::IL_CC_D_UGT; + case MVT::i64: + return AMDILCC::IL_CC_UL_GT; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETUGE: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_U_GE; + case MVT::f32: + return AMDILCC::IL_CC_F_UGE; + case MVT::f64: + return AMDILCC::IL_CC_D_UGE; + case MVT::i64: + return AMDILCC::IL_CC_UL_GE; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETULT: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_U_LT; + case MVT::f32: + return AMDILCC::IL_CC_F_ULT; + case MVT::f64: + return AMDILCC::IL_CC_D_ULT; + 
case MVT::i64: + return AMDILCC::IL_CC_UL_LT; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETULE: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_U_LE; + case MVT::f32: + return AMDILCC::IL_CC_F_ULE; + case MVT::f64: + return AMDILCC::IL_CC_D_ULE; + case MVT::i64: + return AMDILCC::IL_CC_UL_LE; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETUNE: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_U_NE; + case MVT::f32: + return AMDILCC::IL_CC_F_UNE; + case MVT::f64: + return AMDILCC::IL_CC_D_UNE; + case MVT::i64: + return AMDILCC::IL_CC_UL_NE; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETUEQ: + switch (type) { + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + return AMDILCC::IL_CC_U_EQ; + case MVT::f32: + return AMDILCC::IL_CC_F_UEQ; + case MVT::f64: + return AMDILCC::IL_CC_D_UEQ; + case MVT::i64: + return AMDILCC::IL_CC_UL_EQ; + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETOGT: + switch (type) { + case MVT::f32: + return AMDILCC::IL_CC_F_OGT; + case MVT::f64: + return AMDILCC::IL_CC_D_OGT; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETOGE: + switch (type) { + case MVT::f32: + return AMDILCC::IL_CC_F_OGE; + case MVT::f64: + return AMDILCC::IL_CC_D_OGE; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETOLT: + switch (type) { 
+ case MVT::f32: + return AMDILCC::IL_CC_F_OLT; + case MVT::f64: + return AMDILCC::IL_CC_D_OLT; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETOLE: + switch (type) { + case MVT::f32: + return AMDILCC::IL_CC_F_OLE; + case MVT::f64: + return AMDILCC::IL_CC_D_OLE; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETONE: + switch (type) { + case MVT::f32: + return AMDILCC::IL_CC_F_ONE; + case MVT::f64: + return AMDILCC::IL_CC_D_ONE; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + case ISD::SETOEQ: + switch (type) { + case MVT::f32: + return AMDILCC::IL_CC_F_OEQ; + case MVT::f64: + return AMDILCC::IL_CC_D_OEQ; + case MVT::i1: + case MVT::i8: + case MVT::i16: + case MVT::i32: + case MVT::i64: + default: + assert(0 && "Opcode combination not generated correctly!"); + return AMDILCC::COND_ERROR; + }; + }; +} + + static unsigned int +translateToOpcode(uint64_t CCCode, unsigned int regClass) +{ + switch (CCCode) { + case AMDILCC::IL_CC_D_EQ: + case AMDILCC::IL_CC_D_OEQ: + if (regClass == AMDIL::GPRV2F64RegClassID) { + return (unsigned int)AMDIL::DEQ_v2f64; + } else { + return (unsigned int)AMDIL::DEQ; + } + case AMDILCC::IL_CC_D_LE: + case AMDILCC::IL_CC_D_OLE: + case AMDILCC::IL_CC_D_ULE: + case AMDILCC::IL_CC_D_GE: + case AMDILCC::IL_CC_D_OGE: + case AMDILCC::IL_CC_D_UGE: + return (unsigned int)AMDIL::DGE; + case AMDILCC::IL_CC_D_LT: + case AMDILCC::IL_CC_D_OLT: + case AMDILCC::IL_CC_D_ULT: + case AMDILCC::IL_CC_D_GT: + case AMDILCC::IL_CC_D_OGT: + case AMDILCC::IL_CC_D_UGT: + return (unsigned int)AMDIL::DLT; + case 
AMDILCC::IL_CC_D_NE: + case AMDILCC::IL_CC_D_UNE: + return (unsigned int)AMDIL::DNE; + case AMDILCC::IL_CC_F_EQ: + case AMDILCC::IL_CC_F_OEQ: + return (unsigned int)AMDIL::FEQ; + case AMDILCC::IL_CC_F_LE: + case AMDILCC::IL_CC_F_ULE: + case AMDILCC::IL_CC_F_OLE: + case AMDILCC::IL_CC_F_GE: + case AMDILCC::IL_CC_F_UGE: + case AMDILCC::IL_CC_F_OGE: + return (unsigned int)AMDIL::FGE; + case AMDILCC::IL_CC_F_LT: + case AMDILCC::IL_CC_F_OLT: + case AMDILCC::IL_CC_F_ULT: + case AMDILCC::IL_CC_F_GT: + case AMDILCC::IL_CC_F_OGT: + case AMDILCC::IL_CC_F_UGT: + if (regClass == AMDIL::GPRV2F32RegClassID) { + return (unsigned int)AMDIL::FLT_v2f32; + } else if (regClass == AMDIL::GPRV4F32RegClassID) { + return (unsigned int)AMDIL::FLT_v4f32; + } else { + return (unsigned int)AMDIL::FLT; + } + case AMDILCC::IL_CC_F_NE: + case AMDILCC::IL_CC_F_UNE: + return (unsigned int)AMDIL::FNE; + case AMDILCC::IL_CC_I_EQ: + case AMDILCC::IL_CC_U_EQ: + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::IEQ; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::IEQ_v2i32; + } else if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::IEQ_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_L_EQ: + case AMDILCC::IL_CC_UL_EQ: + return (unsigned int)AMDIL::LEQ; + case AMDILCC::IL_CC_I_GE: + case AMDILCC::IL_CC_I_LE: + // In this and the following integer cases, each vector branch tests the + // matching vector i8/i16 register classes (as the IL_CC_I_EQ case does); + // the scalar classes are already consumed by the first branch. + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::IGE; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::IGE_v2i32; + } else 
if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::IGE_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_I_LT: + case AMDILCC::IL_CC_I_GT: + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::ILT; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::ILT_v2i32; + } else if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::ILT_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_L_GE: + return (unsigned int)AMDIL::LGE; + case AMDILCC::IL_CC_L_LE: + return (unsigned int)AMDIL::LLE; + case AMDILCC::IL_CC_L_LT: + return (unsigned int)AMDIL::LLT; + case AMDILCC::IL_CC_L_GT: + return (unsigned int)AMDIL::LGT; + case AMDILCC::IL_CC_I_NE: + case AMDILCC::IL_CC_U_NE: + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::INE; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::INE_v2i32; + } else if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::INE_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_U_GE: + case AMDILCC::IL_CC_U_LE: + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::UGE; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == 
AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::UGE_v2i32; + } else if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::UGE_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_L_NE: + case AMDILCC::IL_CC_UL_NE: + return (unsigned int)AMDIL::LNE; + case AMDILCC::IL_CC_UL_GE: + return (unsigned int)AMDIL::ULGE; + case AMDILCC::IL_CC_UL_LE: + return (unsigned int)AMDIL::ULLE; + case AMDILCC::IL_CC_U_LT: + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::ULT; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::ULT_v2i32; + } else if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::ULT_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_U_GT: + if (regClass == AMDIL::GPRI32RegClassID + || regClass == AMDIL::GPRI8RegClassID + || regClass == AMDIL::GPRI16RegClassID) { + return (unsigned int)AMDIL::UGT; + } else if (regClass == AMDIL::GPRV2I32RegClassID + || regClass == AMDIL::GPRV2I8RegClassID + || regClass == AMDIL::GPRV2I16RegClassID) { + return (unsigned int)AMDIL::UGT_v2i32; + } else if (regClass == AMDIL::GPRV4I32RegClassID + || regClass == AMDIL::GPRV4I8RegClassID + || regClass == AMDIL::GPRV4I16RegClassID) { + return (unsigned int)AMDIL::UGT_v4i32; + } else { + assert(!"Unknown reg class!"); + } + case AMDILCC::IL_CC_UL_LT: + return (unsigned int)AMDIL::ULLT; + case AMDILCC::IL_CC_UL_GT: + return (unsigned int)AMDIL::ULGT; + case AMDILCC::IL_CC_F_UEQ: + case AMDILCC::IL_CC_D_UEQ: + case AMDILCC::IL_CC_F_ONE: + case AMDILCC::IL_CC_D_ONE: + case 
AMDILCC::IL_CC_F_O: + case AMDILCC::IL_CC_F_UO: + case AMDILCC::IL_CC_D_O: + case AMDILCC::IL_CC_D_UO: + // we don't care + return 0; + + } + errs()<<"Opcode: "<<CCCode<<"\n"; + assert(0 && "Unknown opcode retrieved"); + return 0; +} +SDValue +AMDILTargetLowering::LowerMemArgument( + SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + MachineFrameInfo *MFI, + unsigned i) const +{ + // Create the nodes corresponding to a load from this parameter slot. + ISD::ArgFlagsTy Flags = Ins[i].Flags; + + bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && + getTargetMachine().Options.GuaranteedTailCallOpt; + bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); + + // FIXME: For now, all byval parameter objects are marked mutable. This can + // be changed with more analysis. + // In case of tail call optimization mark all arguments mutable. Since they + // could be overwritten by lowering of arguments in case of a tail call. 
+ int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, + VA.getLocMemOffset(), isImmutable); + SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); + + if (Flags.isByVal()) + return FIN; + return DAG.getLoad(VA.getValVT(), dl, Chain, FIN, + MachinePointerInfo::getFixedStack(FI), + false, false, false, 0); +} +//===----------------------------------------------------------------------===// +// TargetLowering Implementation Help Functions End +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// Instruction generation functions +//===----------------------------------------------------------------------===// +uint32_t +AMDILTargetLowering::addExtensionInstructions( + uint32_t reg, bool signedShift, + unsigned int simpleVT) const +{ + int shiftSize = 0; + uint32_t LShift, RShift; + switch(simpleVT) + { + default: + return reg; + case AMDIL::GPRI8RegClassID: + shiftSize = 24; + LShift = AMDIL::SHL_i8; + if (signedShift) { + RShift = AMDIL::SHR_i8; + } else { + RShift = AMDIL::USHR_i8; + } + break; + case AMDIL::GPRV2I8RegClassID: + shiftSize = 24; + LShift = AMDIL::SHL_v2i8; + if (signedShift) { + RShift = AMDIL::SHR_v2i8; + } else { + RShift = AMDIL::USHR_v2i8; + } + break; + case AMDIL::GPRV4I8RegClassID: + shiftSize = 24; + LShift = AMDIL::SHL_v4i8; + if (signedShift) { + RShift = AMDIL::SHR_v4i8; + } else { + RShift = AMDIL::USHR_v4i8; + } + break; + case AMDIL::GPRI16RegClassID: + shiftSize = 16; + LShift = AMDIL::SHL_i16; + if (signedShift) { + RShift = AMDIL::SHR_i16; + } else { + RShift = AMDIL::USHR_i16; + } + break; + case AMDIL::GPRV2I16RegClassID: + shiftSize = 16; + LShift = AMDIL::SHL_v2i16; + if (signedShift) { + RShift = AMDIL::SHR_v2i16; + } else { + RShift = AMDIL::USHR_v2i16; + } + break; + case AMDIL::GPRV4I16RegClassID: + shiftSize = 16; + LShift = AMDIL::SHL_v4i16; + if (signedShift) { + RShift = AMDIL::SHR_v4i16; + } else 
{ + RShift = AMDIL::USHR_v4i16; + } + break; + }; + uint32_t LoadReg = genVReg(simpleVT); + uint32_t tmp1 = genVReg(simpleVT); + uint32_t tmp2 = genVReg(simpleVT); + generateMachineInst(AMDIL::LOADCONST_i32, LoadReg).addImm(shiftSize); + generateMachineInst(LShift, tmp1, reg, LoadReg); + generateMachineInst(RShift, tmp2, tmp1, LoadReg); + return tmp2; +} + +MachineOperand +AMDILTargetLowering::convertToReg(MachineOperand op) const +{ + if (op.isReg()) { + return op; + } else if (op.isImm()) { + uint32_t loadReg + = genVReg(op.getParent()->getDesc().OpInfo[0].RegClass); + generateMachineInst(AMDIL::LOADCONST_i32, loadReg) + .addImm(op.getImm()); + op.ChangeToRegister(loadReg, false); + } else if (op.isFPImm()) { + uint32_t loadReg + = genVReg(op.getParent()->getDesc().OpInfo[0].RegClass); + generateMachineInst(AMDIL::LOADCONST_f32, loadReg) + .addFPImm(op.getFPImm()); + op.ChangeToRegister(loadReg, false); + } else if (op.isMBB()) { + op.ChangeToRegister(0, false); + } else if (op.isFI()) { + op.ChangeToRegister(0, false); + } else if (op.isCPI()) { + op.ChangeToRegister(0, false); + } else if (op.isJTI()) { + op.ChangeToRegister(0, false); + } else if (op.isGlobal()) { + op.ChangeToRegister(0, false); + } else if (op.isSymbol()) { + op.ChangeToRegister(0, false); + }/* else if (op.isMetadata()) { + op.ChangeToRegister(0, false); + }*/ + return op; +} + +void +AMDILTargetLowering::generateCMPInstr( + MachineInstr *MI, + MachineBasicBlock *BB, + const TargetInstrInfo& TII) +const +{ + MachineOperand DST = MI->getOperand(0); + MachineOperand CC = MI->getOperand(1); + MachineOperand LHS = MI->getOperand(2); + MachineOperand RHS = MI->getOperand(3); + int64_t ccCode = CC.getImm(); + unsigned int simpleVT = MI->getDesc().OpInfo[0].RegClass; + unsigned int opCode = translateToOpcode(ccCode, simpleVT); + DebugLoc DL = MI->getDebugLoc(); + MachineBasicBlock::iterator BBI = MI; + setPrivateData(BB, BBI, &DL, &TII); + if (!LHS.isReg()) { + LHS = convertToReg(LHS); + } + if 
(!RHS.isReg()) { + RHS = convertToReg(RHS); + } + switch (ccCode) { + case AMDILCC::IL_CC_I_EQ: + case AMDILCC::IL_CC_I_NE: + case AMDILCC::IL_CC_I_GE: + case AMDILCC::IL_CC_I_LT: + { + uint32_t lhsreg = addExtensionInstructions( + LHS.getReg(), true, simpleVT); + uint32_t rhsreg = addExtensionInstructions( + RHS.getReg(), true, simpleVT); + generateMachineInst(opCode, DST.getReg(), lhsreg, rhsreg); + } + break; + case AMDILCC::IL_CC_U_EQ: + case AMDILCC::IL_CC_U_NE: + case AMDILCC::IL_CC_U_GE: + case AMDILCC::IL_CC_U_LT: + case AMDILCC::IL_CC_D_EQ: + case AMDILCC::IL_CC_F_EQ: + case AMDILCC::IL_CC_F_OEQ: + case AMDILCC::IL_CC_D_OEQ: + case AMDILCC::IL_CC_D_NE: + case AMDILCC::IL_CC_F_NE: + case AMDILCC::IL_CC_F_UNE: + case AMDILCC::IL_CC_D_UNE: + case AMDILCC::IL_CC_D_GE: + case AMDILCC::IL_CC_F_GE: + case AMDILCC::IL_CC_D_OGE: + case AMDILCC::IL_CC_F_OGE: + case AMDILCC::IL_CC_D_LT: + case AMDILCC::IL_CC_F_LT: + case AMDILCC::IL_CC_F_OLT: + case AMDILCC::IL_CC_D_OLT: + generateMachineInst(opCode, DST.getReg(), + LHS.getReg(), RHS.getReg()); + break; + case AMDILCC::IL_CC_I_GT: + case AMDILCC::IL_CC_I_LE: + { + uint32_t lhsreg = addExtensionInstructions( + LHS.getReg(), true, simpleVT); + uint32_t rhsreg = addExtensionInstructions( + RHS.getReg(), true, simpleVT); + generateMachineInst(opCode, DST.getReg(), rhsreg, lhsreg); + } + break; + case AMDILCC::IL_CC_U_GT: + case AMDILCC::IL_CC_U_LE: + case AMDILCC::IL_CC_F_GT: + case AMDILCC::IL_CC_D_GT: + case AMDILCC::IL_CC_F_OGT: + case AMDILCC::IL_CC_D_OGT: + case AMDILCC::IL_CC_F_LE: + case AMDILCC::IL_CC_D_LE: + case AMDILCC::IL_CC_D_OLE: + case AMDILCC::IL_CC_F_OLE: + generateMachineInst(opCode, DST.getReg(), + RHS.getReg(), LHS.getReg()); + break; + case AMDILCC::IL_CC_F_UGT: + case AMDILCC::IL_CC_F_ULE: + { + uint32_t VReg[4] = { + genVReg(simpleVT), genVReg(simpleVT), + genVReg(simpleVT), genVReg(simpleVT) + }; + generateMachineInst(opCode, VReg[0], + RHS.getReg(), LHS.getReg()); + 
generateMachineInst(AMDIL::FNE, VReg[1], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FNE, VReg[2], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[2], VReg[3]); + } + break; + case AMDILCC::IL_CC_F_ULT: + case AMDILCC::IL_CC_F_UGE: + { + uint32_t VReg[4] = { + genVReg(simpleVT), genVReg(simpleVT), + genVReg(simpleVT), genVReg(simpleVT) + }; + generateMachineInst(opCode, VReg[0], + LHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FNE, VReg[1], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FNE, VReg[2], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[2], VReg[3]); + } + break; + case AMDILCC::IL_CC_D_UGT: + case AMDILCC::IL_CC_D_ULE: + { + uint32_t regID = AMDIL::GPRF64RegClassID; + uint32_t VReg[4] = { + genVReg(regID), genVReg(regID), + genVReg(regID), genVReg(regID) + }; + // The result of a double comparison is a 32bit result + generateMachineInst(opCode, VReg[0], + RHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[1], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[2], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[2], VReg[3]); + } + break; + case AMDILCC::IL_CC_D_UGE: + case AMDILCC::IL_CC_D_ULT: + { + uint32_t regID = AMDIL::GPRF64RegClassID; + uint32_t VReg[4] = { + genVReg(regID), genVReg(regID), + genVReg(regID), genVReg(regID) + }; + // The result of a double comparison is a 32bit result + generateMachineInst(opCode, VReg[0], + LHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[1], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[2], + LHS.getReg(), LHS.getReg()); + 
generateMachineInst(AMDIL::BINARY_OR_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[2], VReg[3]); + } + break; + case AMDILCC::IL_CC_F_UEQ: + { + uint32_t VReg[4] = { + genVReg(simpleVT), genVReg(simpleVT), + genVReg(simpleVT), genVReg(simpleVT) + }; + generateMachineInst(AMDIL::FEQ, VReg[0], + LHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FNE, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::FNE, VReg[2], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[2], VReg[3]); + } + break; + case AMDILCC::IL_CC_F_ONE: + { + uint32_t VReg[4] = { + genVReg(simpleVT), genVReg(simpleVT), + genVReg(simpleVT), genVReg(simpleVT) + }; + generateMachineInst(AMDIL::FNE, VReg[0], + LHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FEQ, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::FEQ, VReg[2], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::BINARY_AND_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_AND_f32, + DST.getReg(), VReg[2], VReg[3]); + } + break; + case AMDILCC::IL_CC_D_UEQ: + { + uint32_t regID = AMDIL::GPRF64RegClassID; + uint32_t VReg[4] = { + genVReg(regID), genVReg(regID), + genVReg(regID), genVReg(regID) + }; + // The result of a double comparison is a 32bit result + generateMachineInst(AMDIL::DEQ, VReg[0], + LHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[2], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[2], VReg[3]); + + } + break; + case AMDILCC::IL_CC_D_ONE: + { + uint32_t regID = AMDIL::GPRF64RegClassID; + uint32_t VReg[4] = { + genVReg(regID), genVReg(regID), + genVReg(regID), 
genVReg(regID) + }; + // The result of a double comparison is a 32bit result + generateMachineInst(AMDIL::DNE, VReg[0], + LHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DEQ, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::DEQ, VReg[2], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::BINARY_AND_f32, + VReg[3], VReg[0], VReg[1]); + generateMachineInst(AMDIL::BINARY_AND_f32, + DST.getReg(), VReg[2], VReg[3]); + + } + break; + case AMDILCC::IL_CC_F_O: + { + uint32_t VReg[2] = { genVReg(simpleVT), genVReg(simpleVT) }; + generateMachineInst(AMDIL::FEQ, VReg[0], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FEQ, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_AND_f32, + DST.getReg(), VReg[0], VReg[1]); + } + break; + case AMDILCC::IL_CC_D_O: + { + uint32_t regID = AMDIL::GPRF64RegClassID; + uint32_t VReg[2] = { genVReg(regID), genVReg(regID) }; + // The result of a double comparison is a 32bit result + generateMachineInst(AMDIL::DEQ, VReg[0], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DEQ, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_AND_f32, + DST.getReg(), VReg[0], VReg[1]); + } + break; + case AMDILCC::IL_CC_F_UO: + { + uint32_t VReg[2] = { genVReg(simpleVT), genVReg(simpleVT) }; + generateMachineInst(AMDIL::FNE, VReg[0], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::FNE, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[0], VReg[1]); + } + break; + case AMDILCC::IL_CC_D_UO: + { + uint32_t regID = AMDIL::GPRF64RegClassID; + uint32_t VReg[2] = { genVReg(regID), genVReg(regID) }; + // The result of a double comparison is a 32bit result + generateMachineInst(AMDIL::DNE, VReg[0], + RHS.getReg(), RHS.getReg()); + generateMachineInst(AMDIL::DNE, VReg[1], + LHS.getReg(), LHS.getReg()); + generateMachineInst(AMDIL::BINARY_OR_f32, + DST.getReg(), VReg[0], VReg[1]); + } + break; 
+ case AMDILCC::IL_CC_L_LE: + case AMDILCC::IL_CC_L_GE: + case AMDILCC::IL_CC_L_EQ: + case AMDILCC::IL_CC_L_NE: + case AMDILCC::IL_CC_L_LT: + case AMDILCC::IL_CC_L_GT: + case AMDILCC::IL_CC_UL_LE: + case AMDILCC::IL_CC_UL_GE: + case AMDILCC::IL_CC_UL_EQ: + case AMDILCC::IL_CC_UL_NE: + case AMDILCC::IL_CC_UL_LT: + case AMDILCC::IL_CC_UL_GT: + { + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + if (stm->device()->usesHardware(AMDILDeviceInfo::LongOps)) { + generateMachineInst(opCode, DST.getReg(), LHS.getReg(), RHS.getReg()); + } else { + generateLongRelational(MI, opCode); + } + } + break; + case AMDILCC::COND_ERROR: + assert(0 && "Invalid CC code"); + break; + }; +} + +//===----------------------------------------------------------------------===// +// TargetLowering Class Implementation Begins +//===----------------------------------------------------------------------===// + AMDILTargetLowering::AMDILTargetLowering(TargetMachine &TM) +: TargetLowering(TM, new TargetLoweringObjectFileELF()) +{ + int types[] = + { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::f32, + (int)MVT::f64, + (int)MVT::i64, + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + + int IntTypes[] = + { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::i64 + }; + + int FloatTypes[] = + { + (int)MVT::f32, + (int)MVT::f64 + }; + + int VectorTypes[] = + { + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + size_t numTypes = sizeof(types) / sizeof(*types); + size_t numFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes); + size_t numIntTypes = sizeof(IntTypes) / sizeof(*IntTypes); + size_t numVectorTypes = 
sizeof(VectorTypes) / sizeof(*VectorTypes); + + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + // These are the current register classes that are + // supported + + addRegisterClass(MVT::i32, AMDIL::GPRI32RegisterClass); + addRegisterClass(MVT::f32, AMDIL::GPRF32RegisterClass); + + if (stm->device()->isSupported(AMDILDeviceInfo::DoubleOps)) { + addRegisterClass(MVT::f64, AMDIL::GPRF64RegisterClass); + addRegisterClass(MVT::v2f64, AMDIL::GPRV2F64RegisterClass); + } + if (stm->device()->isSupported(AMDILDeviceInfo::ByteOps)) { + addRegisterClass(MVT::i8, AMDIL::GPRI8RegisterClass); + addRegisterClass(MVT::v2i8, AMDIL::GPRV2I8RegisterClass); + addRegisterClass(MVT::v4i8, AMDIL::GPRV4I8RegisterClass); + setOperationAction(ISD::Constant , MVT::i8 , Legal); + } + if (stm->device()->isSupported(AMDILDeviceInfo::ShortOps)) { + addRegisterClass(MVT::i16, AMDIL::GPRI16RegisterClass); + addRegisterClass(MVT::v2i16, AMDIL::GPRV2I16RegisterClass); + addRegisterClass(MVT::v4i16, AMDIL::GPRV4I16RegisterClass); + setOperationAction(ISD::Constant , MVT::i16 , Legal); + } + addRegisterClass(MVT::v2f32, AMDIL::GPRV2F32RegisterClass); + addRegisterClass(MVT::v4f32, AMDIL::GPRV4F32RegisterClass); + addRegisterClass(MVT::v2i32, AMDIL::GPRV2I32RegisterClass); + addRegisterClass(MVT::v4i32, AMDIL::GPRV4I32RegisterClass); + if (stm->device()->isSupported(AMDILDeviceInfo::LongOps)) { + addRegisterClass(MVT::i64, AMDIL::GPRI64RegisterClass); + addRegisterClass(MVT::v2i64, AMDIL::GPRV2I64RegisterClass); + } + + for (unsigned int x = 0; x < numTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x]; + + //FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types + // We cannot sextinreg, expand to shifts + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Expand); + 
setOperationAction(ISD::OR, VT, Custom); + setOperationAction(ISD::SUBE, VT, Expand); + setOperationAction(ISD::SUBC, VT, Expand); + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::ADDE, VT, Expand); + setOperationAction(ISD::ADDC, VT, Expand); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::BRCOND, VT, Custom); + setOperationAction(ISD::BR_CC, VT, Custom); + setOperationAction(ISD::BR_JT, VT, Expand); + setOperationAction(ISD::BRIND, VT, Expand); + // TODO: Implement custom UREM/SREM routines + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); + setOperationAction(ISD::SINT_TO_FP, VT, Custom); + setOperationAction(ISD::UINT_TO_FP, VT, Custom); + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISDBITCAST, VT, Custom); + setOperationAction(ISD::GlobalAddress, VT, Custom); + setOperationAction(ISD::JumpTable, VT, Custom); + setOperationAction(ISD::ConstantPool, VT, Custom); + setOperationAction(ISD::SELECT_CC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + if (VT != MVT::i64 && VT != MVT::v2i64) { + setOperationAction(ISD::SDIV, VT, Custom); + setOperationAction(ISD::UDIV, VT, Custom); + } + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + } + for (unsigned int x = 0; x < numFloatTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x]; + + // IL does not have these operations for floating point types + setOperationAction(ISD::FP_ROUND_INREG, VT, Expand); + setOperationAction(ISD::FP_ROUND, VT, Custom); + setOperationAction(ISD::SETOLT, VT, Expand); + setOperationAction(ISD::SETOGE, VT, Expand); + setOperationAction(ISD::SETOGT, VT, Expand); + setOperationAction(ISD::SETOLE, VT, Expand); + 
setOperationAction(ISD::SETULT, VT, Expand); + setOperationAction(ISD::SETUGE, VT, Expand); + setOperationAction(ISD::SETUGT, VT, Expand); + setOperationAction(ISD::SETULE, VT, Expand); + } + + for (unsigned int x = 0; x < numIntTypes; ++x) { + MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x]; + + // GPU also does not have divrem function for signed or unsigned + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::FP_ROUND, VT, Expand); + + // GPU does not have [S|U]MUL_LOHI functions as a single instruction + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + setOperationAction(ISD::UMUL_LOHI, VT, Expand); + + // GPU doesn't have a rotl, rotr, or byteswap instruction + setOperationAction(ISD::ROTR, VT, Expand); + setOperationAction(ISD::ROTL, VT, Expand); + setOperationAction(ISD::BSWAP, VT, Expand); + + // GPU doesn't have any counting operators + setOperationAction(ISD::CTPOP, VT, Expand); + setOperationAction(ISD::CTTZ, VT, Expand); + setOperationAction(ISD::CTLZ, VT, Expand); + } + + for ( unsigned int ii = 0; ii < numVectorTypes; ++ii ) + { + MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii]; + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Expand); + setOperationAction(ISD::SDIVREM, VT, Expand); + setOperationAction(ISD::UDIVREM, VT, Expand); + setOperationAction(ISD::SMUL_LOHI, VT, Expand); + // setOperationAction(ISD::VSETCC, VT, Expand); + setOperationAction(ISD::SETCC, VT, Expand); + setOperationAction(ISD::SELECT_CC, VT, Expand); + setOperationAction(ISD::SELECT, VT, Expand); + + } + setOperationAction(ISD::FP_ROUND, MVT::Other, Expand); + if (stm->device()->isSupported(AMDILDeviceInfo::LongOps)) 
{ + if (stm->calVersion() < CAL_VERSION_SC_139 + || stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + setOperationAction(ISD::MUL, MVT::i64, Custom); + } + setOperationAction(ISD::SUB, MVT::i64, Custom); + setOperationAction(ISD::ADD, MVT::i64, Custom); + setOperationAction(ISD::MULHU, MVT::i64, Expand); + setOperationAction(ISD::MULHU, MVT::v2i64, Expand); + setOperationAction(ISD::MULHS, MVT::i64, Expand); + setOperationAction(ISD::MULHS, MVT::v2i64, Expand); + setOperationAction(ISD::MUL, MVT::v2i64, Expand); + setOperationAction(ISD::SUB, MVT::v2i64, Expand); + setOperationAction(ISD::ADD, MVT::v2i64, Expand); + setOperationAction(ISD::SREM, MVT::v2i64, Expand); + setOperationAction(ISD::Constant , MVT::i64 , Legal); + setOperationAction(ISD::UDIV, MVT::v2i64, Expand); + setOperationAction(ISD::SDIV, MVT::v2i64, Expand); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand); + } + if (stm->device()->isSupported(AMDILDeviceInfo::DoubleOps)) { + // we support loading/storing v2f64 but not operations on the type + setOperationAction(ISD::FADD, MVT::v2f64, Expand); + setOperationAction(ISD::FSUB, MVT::v2f64, Expand); + setOperationAction(ISD::FMUL, MVT::v2f64, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v2f64, Expand); + setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand); + setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ConstantFP , MVT::f64 , Legal); + setOperationAction(ISD::FDIV, MVT::v2f64, Expand); + // We want to expand vector conversions into their scalar + // counterparts. 
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Expand); + setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Expand); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Expand); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand); + setOperationAction(ISD::FABS, MVT::f64, Expand); + setOperationAction(ISD::FABS, MVT::v2f64, Expand); + } + // TODO: Fix the UDIV24 algorithm so it works for these + // types correctly. This needs vector comparisons + // for this to work correctly. + setOperationAction(ISD::UDIV, MVT::v2i8, Expand); + setOperationAction(ISD::UDIV, MVT::v4i8, Expand); + setOperationAction(ISD::UDIV, MVT::v2i16, Expand); + setOperationAction(ISD::UDIV, MVT::v4i16, Expand); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom); + setOperationAction(ISD::SUBC, MVT::Other, Expand); + setOperationAction(ISD::ADDE, MVT::Other, Expand); + setOperationAction(ISD::ADDC, MVT::Other, Expand); + setOperationAction(ISD::BRCOND, MVT::Other, Custom); + setOperationAction(ISD::BR_CC, MVT::Other, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BRIND, MVT::Other, Expand); + setOperationAction(ISD::SETCC, MVT::Other, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand); + setOperationAction(ISD::FDIV, MVT::f32, Custom); + setOperationAction(ISD::FDIV, MVT::v2f32, Custom); + setOperationAction(ISD::FDIV, MVT::v4f32, Custom); + + setOperationAction(ISD::BUILD_VECTOR, MVT::Other, Custom); + // Use the default implementation. 
+ setOperationAction(ISD::VAARG , MVT::Other, Expand); + setOperationAction(ISD::VACOPY , MVT::Other, Expand); + setOperationAction(ISD::VAEND , MVT::Other, Expand); + setOperationAction(ISD::STACKSAVE , MVT::Other, Expand); + setOperationAction(ISD::STACKRESTORE , MVT::Other, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32 , Custom); + setOperationAction(ISD::ConstantFP , MVT::f32 , Legal); + setOperationAction(ISD::Constant , MVT::i32 , Legal); + setOperationAction(ISD::TRAP , MVT::Other , Legal); + + setStackPointerRegisterToSaveRestore(AMDIL::SP); + setSchedulingPreference(Sched::RegPressure); + setPow2DivIsCheap(false); + setPrefLoopAlignment(16); + setSelectIsExpensive(true); + setJumpIsExpensive(true); + computeRegisterProperties(); + + maxStoresPerMemcpy = 4096; + maxStoresPerMemmove = 4096; + maxStoresPerMemset = 4096; + +#undef numTypes +#undef numIntTypes +#undef numVectorTypes +#undef numFloatTypes +} + +const char * +AMDILTargetLowering::getTargetNodeName(unsigned Opcode) const +{ + switch (Opcode) { + default: return 0; + case AMDILISD::INTTOANY: return "AMDILISD::INTTOANY"; + case AMDILISD::DP_TO_FP: return "AMDILISD::DP_TO_FP"; + case AMDILISD::FP_TO_DP: return "AMDILISD::FP_TO_DP"; + case AMDILISD::BITCONV: return "AMDILISD::BITCONV"; + case AMDILISD::CMOV: return "AMDILISD::CMOV"; + case AMDILISD::CMOVLOG: return "AMDILISD::CMOVLOG"; + case AMDILISD::INEGATE: return "AMDILISD::INEGATE"; + case AMDILISD::MAD: return "AMDILISD::MAD"; + case AMDILISD::UMAD: return "AMDILISD::UMAD"; + case AMDILISD::CALL: return "AMDILISD::CALL"; + case AMDILISD::RET: return "AMDILISD::RET"; + case AMDILISD::IFFB_HI: return "AMDILISD::IFFB_HI"; + case AMDILISD::IFFB_LO: return "AMDILISD::IFFB_LO"; + case AMDILISD::ADD: return "AMDILISD::ADD"; + case AMDILISD::UMUL: return "AMDILISD::UMUL"; + case AMDILISD::AND: return "AMDILISD::AND"; + case AMDILISD::OR: return "AMDILISD::OR"; + case AMDILISD::NOT: return "AMDILISD::NOT"; + case AMDILISD::XOR: return 
"AMDILISD::XOR"; + case AMDILISD::DIV_INF: return "AMDILISD::DIV_INF"; + case AMDILISD::SMAX: return "AMDILISD::SMAX"; + case AMDILISD::PHIMOVE: return "AMDILISD::PHIMOVE"; + case AMDILISD::MOVE: return "AMDILISD::MOVE"; + case AMDILISD::VBUILD: return "AMDILISD::VBUILD"; + case AMDILISD::VEXTRACT: return "AMDILISD::VEXTRACT"; + case AMDILISD::VINSERT: return "AMDILISD::VINSERT"; + case AMDILISD::VCONCAT: return "AMDILISD::VCONCAT"; + case AMDILISD::LCREATE: return "AMDILISD::LCREATE"; + case AMDILISD::LCOMPHI: return "AMDILISD::LCOMPHI"; + case AMDILISD::LCOMPLO: return "AMDILISD::LCOMPLO"; + case AMDILISD::DCREATE: return "AMDILISD::DCREATE"; + case AMDILISD::DCOMPHI: return "AMDILISD::DCOMPHI"; + case AMDILISD::DCOMPLO: return "AMDILISD::DCOMPLO"; + case AMDILISD::LCREATE2: return "AMDILISD::LCREATE2"; + case AMDILISD::LCOMPHI2: return "AMDILISD::LCOMPHI2"; + case AMDILISD::LCOMPLO2: return "AMDILISD::LCOMPLO2"; + case AMDILISD::DCREATE2: return "AMDILISD::DCREATE2"; + case AMDILISD::DCOMPHI2: return "AMDILISD::DCOMPHI2"; + case AMDILISD::DCOMPLO2: return "AMDILISD::DCOMPLO2"; + case AMDILISD::CMP: return "AMDILISD::CMP"; + case AMDILISD::IL_CC_I_LT: return "AMDILISD::IL_CC_I_LT"; + case AMDILISD::IL_CC_I_LE: return "AMDILISD::IL_CC_I_LE"; + case AMDILISD::IL_CC_I_GT: return "AMDILISD::IL_CC_I_GT"; + case AMDILISD::IL_CC_I_GE: return "AMDILISD::IL_CC_I_GE"; + case AMDILISD::IL_CC_I_EQ: return "AMDILISD::IL_CC_I_EQ"; + case AMDILISD::IL_CC_I_NE: return "AMDILISD::IL_CC_I_NE"; + case AMDILISD::RET_FLAG: return "AMDILISD::RET_FLAG"; + case AMDILISD::BRANCH_COND: return "AMDILISD::BRANCH_COND"; + case AMDILISD::LOOP_NZERO: return "AMDILISD::LOOP_NZERO"; + case AMDILISD::LOOP_ZERO: return "AMDILISD::LOOP_ZERO"; + case AMDILISD::LOOP_CMP: return "AMDILISD::LOOP_CMP"; + case AMDILISD::ADDADDR: return "AMDILISD::ADDADDR"; + case AMDILISD::ATOM_G_ADD: return "AMDILISD::ATOM_G_ADD"; + case AMDILISD::ATOM_G_AND: return "AMDILISD::ATOM_G_AND"; + case 
AMDILISD::ATOM_G_CMPXCHG: return "AMDILISD::ATOM_G_CMPXCHG"; + case AMDILISD::ATOM_G_DEC: return "AMDILISD::ATOM_G_DEC"; + case AMDILISD::ATOM_G_INC: return "AMDILISD::ATOM_G_INC"; + case AMDILISD::ATOM_G_MAX: return "AMDILISD::ATOM_G_MAX"; + case AMDILISD::ATOM_G_UMAX: return "AMDILISD::ATOM_G_UMAX"; + case AMDILISD::ATOM_G_MIN: return "AMDILISD::ATOM_G_MIN"; + case AMDILISD::ATOM_G_UMIN: return "AMDILISD::ATOM_G_UMIN"; + case AMDILISD::ATOM_G_OR: return "AMDILISD::ATOM_G_OR"; + case AMDILISD::ATOM_G_SUB: return "AMDILISD::ATOM_G_SUB"; + case AMDILISD::ATOM_G_RSUB: return "AMDILISD::ATOM_G_RSUB"; + case AMDILISD::ATOM_G_XCHG: return "AMDILISD::ATOM_G_XCHG"; + case AMDILISD::ATOM_G_XOR: return "AMDILISD::ATOM_G_XOR"; + case AMDILISD::ATOM_G_ADD_NORET: return "AMDILISD::ATOM_G_ADD_NORET"; + case AMDILISD::ATOM_G_AND_NORET: return "AMDILISD::ATOM_G_AND_NORET"; + case AMDILISD::ATOM_G_CMPXCHG_NORET: return "AMDILISD::ATOM_G_CMPXCHG_NORET"; + case AMDILISD::ATOM_G_DEC_NORET: return "AMDILISD::ATOM_G_DEC_NORET"; + case AMDILISD::ATOM_G_INC_NORET: return "AMDILISD::ATOM_G_INC_NORET"; + case AMDILISD::ATOM_G_MAX_NORET: return "AMDILISD::ATOM_G_MAX_NORET"; + case AMDILISD::ATOM_G_UMAX_NORET: return "AMDILISD::ATOM_G_UMAX_NORET"; + case AMDILISD::ATOM_G_MIN_NORET: return "AMDILISD::ATOM_G_MIN_NORET"; + case AMDILISD::ATOM_G_UMIN_NORET: return "AMDILISD::ATOM_G_UMIN_NORET"; + case AMDILISD::ATOM_G_OR_NORET: return "AMDILISD::ATOM_G_OR_NORET"; + case AMDILISD::ATOM_G_SUB_NORET: return "AMDILISD::ATOM_G_SUB_NORET"; + case AMDILISD::ATOM_G_RSUB_NORET: return "AMDILISD::ATOM_G_RSUB_NORET"; + case AMDILISD::ATOM_G_XCHG_NORET: return "AMDILISD::ATOM_G_XCHG_NORET"; + case AMDILISD::ATOM_G_XOR_NORET: return "AMDILISD::ATOM_G_XOR_NORET"; + case AMDILISD::ATOM_L_ADD: return "AMDILISD::ATOM_L_ADD"; + case AMDILISD::ATOM_L_AND: return "AMDILISD::ATOM_L_AND"; + case AMDILISD::ATOM_L_CMPXCHG: return "AMDILISD::ATOM_L_CMPXCHG"; + case AMDILISD::ATOM_L_DEC: return "AMDILISD::ATOM_L_DEC"; + 
case AMDILISD::ATOM_L_INC: return "AMDILISD::ATOM_L_INC"; + case AMDILISD::ATOM_L_MAX: return "AMDILISD::ATOM_L_MAX"; + case AMDILISD::ATOM_L_UMAX: return "AMDILISD::ATOM_L_UMAX"; + case AMDILISD::ATOM_L_MIN: return "AMDILISD::ATOM_L_MIN"; + case AMDILISD::ATOM_L_UMIN: return "AMDILISD::ATOM_L_UMIN"; + case AMDILISD::ATOM_L_OR: return "AMDILISD::ATOM_L_OR"; + case AMDILISD::ATOM_L_SUB: return "AMDILISD::ATOM_L_SUB"; + case AMDILISD::ATOM_L_RSUB: return "AMDILISD::ATOM_L_RSUB"; + case AMDILISD::ATOM_L_XCHG: return "AMDILISD::ATOM_L_XCHG"; + case AMDILISD::ATOM_L_XOR: return "AMDILISD::ATOM_L_XOR"; + case AMDILISD::ATOM_L_ADD_NORET: return "AMDILISD::ATOM_L_ADD_NORET"; + case AMDILISD::ATOM_L_AND_NORET: return "AMDILISD::ATOM_L_AND_NORET"; + case AMDILISD::ATOM_L_CMPXCHG_NORET: return "AMDILISD::ATOM_L_CMPXCHG_NORET"; + case AMDILISD::ATOM_L_DEC_NORET: return "AMDILISD::ATOM_L_DEC_NORET"; + case AMDILISD::ATOM_L_INC_NORET: return "AMDILISD::ATOM_L_INC_NORET"; + case AMDILISD::ATOM_L_MAX_NORET: return "AMDILISD::ATOM_L_MAX_NORET"; + case AMDILISD::ATOM_L_UMAX_NORET: return "AMDILISD::ATOM_L_UMAX_NORET"; + case AMDILISD::ATOM_L_MIN_NORET: return "AMDILISD::ATOM_L_MIN_NORET"; + case AMDILISD::ATOM_L_UMIN_NORET: return "AMDILISD::ATOM_L_UMIN_NORET"; + case AMDILISD::ATOM_L_OR_NORET: return "AMDILISD::ATOM_L_OR_NORET"; + case AMDILISD::ATOM_L_SUB_NORET: return "AMDILISD::ATOM_L_SUB_NORET"; + case AMDILISD::ATOM_L_RSUB_NORET: return "AMDILISD::ATOM_L_RSUB_NORET"; + case AMDILISD::ATOM_L_XCHG_NORET: return "AMDILISD::ATOM_L_XCHG_NORET"; + case AMDILISD::ATOM_R_ADD: return "AMDILISD::ATOM_R_ADD"; + case AMDILISD::ATOM_R_AND: return "AMDILISD::ATOM_R_AND"; + case AMDILISD::ATOM_R_CMPXCHG: return "AMDILISD::ATOM_R_CMPXCHG"; + case AMDILISD::ATOM_R_DEC: return "AMDILISD::ATOM_R_DEC"; + case AMDILISD::ATOM_R_INC: return "AMDILISD::ATOM_R_INC"; + case AMDILISD::ATOM_R_MAX: return "AMDILISD::ATOM_R_MAX"; + case AMDILISD::ATOM_R_UMAX: return "AMDILISD::ATOM_R_UMAX"; + case 
AMDILISD::ATOM_R_MIN: return "AMDILISD::ATOM_R_MIN"; + case AMDILISD::ATOM_R_UMIN: return "AMDILISD::ATOM_R_UMIN"; + case AMDILISD::ATOM_R_OR: return "AMDILISD::ATOM_R_OR"; + case AMDILISD::ATOM_R_MSKOR: return "AMDILISD::ATOM_R_MSKOR"; + case AMDILISD::ATOM_R_SUB: return "AMDILISD::ATOM_R_SUB"; + case AMDILISD::ATOM_R_RSUB: return "AMDILISD::ATOM_R_RSUB"; + case AMDILISD::ATOM_R_XCHG: return "AMDILISD::ATOM_R_XCHG"; + case AMDILISD::ATOM_R_XOR: return "AMDILISD::ATOM_R_XOR"; + case AMDILISD::ATOM_R_ADD_NORET: return "AMDILISD::ATOM_R_ADD_NORET"; + case AMDILISD::ATOM_R_AND_NORET: return "AMDILISD::ATOM_R_AND_NORET"; + case AMDILISD::ATOM_R_CMPXCHG_NORET: return "AMDILISD::ATOM_R_CMPXCHG_NORET"; + case AMDILISD::ATOM_R_DEC_NORET: return "AMDILISD::ATOM_R_DEC_NORET"; + case AMDILISD::ATOM_R_INC_NORET: return "AMDILISD::ATOM_R_INC_NORET"; + case AMDILISD::ATOM_R_MAX_NORET: return "AMDILISD::ATOM_R_MAX_NORET"; + case AMDILISD::ATOM_R_UMAX_NORET: return "AMDILISD::ATOM_R_UMAX_NORET"; + case AMDILISD::ATOM_R_MIN_NORET: return "AMDILISD::ATOM_R_MIN_NORET"; + case AMDILISD::ATOM_R_UMIN_NORET: return "AMDILISD::ATOM_R_UMIN_NORET"; + case AMDILISD::ATOM_R_OR_NORET: return "AMDILISD::ATOM_R_OR_NORET"; + case AMDILISD::ATOM_R_MSKOR_NORET: return "AMDILISD::ATOM_R_MSKOR_NORET"; + case AMDILISD::ATOM_R_SUB_NORET: return "AMDILISD::ATOM_R_SUB_NORET"; + case AMDILISD::ATOM_R_RSUB_NORET: return "AMDILISD::ATOM_R_RSUB_NORET"; + case AMDILISD::ATOM_R_XCHG_NORET: return "AMDILISD::ATOM_R_XCHG_NORET"; + case AMDILISD::ATOM_R_XOR_NORET: return "AMDILISD::ATOM_R_XOR_NORET"; + case AMDILISD::APPEND_ALLOC: return "AMDILISD::APPEND_ALLOC"; + case AMDILISD::APPEND_ALLOC_NORET: return "AMDILISD::APPEND_ALLOC_NORET"; + case AMDILISD::APPEND_CONSUME: return "AMDILISD::APPEND_CONSUME"; + case AMDILISD::APPEND_CONSUME_NORET: return "AMDILISD::APPEND_CONSUME_NORET"; + case AMDILISD::IMAGE2D_READ: return "AMDILISD::IMAGE2D_READ"; + case AMDILISD::IMAGE2D_WRITE: return "AMDILISD::IMAGE2D_WRITE"; + 
case AMDILISD::IMAGE2D_INFO0: return "AMDILISD::IMAGE2D_INFO0"; + case AMDILISD::IMAGE2D_INFO1: return "AMDILISD::IMAGE2D_INFO1"; + case AMDILISD::IMAGE3D_READ: return "AMDILISD::IMAGE3D_READ"; + case AMDILISD::IMAGE3D_WRITE: return "AMDILISD::IMAGE3D_WRITE"; + case AMDILISD::IMAGE3D_INFO0: return "AMDILISD::IMAGE3D_INFO0"; + case AMDILISD::IMAGE3D_INFO1: return "AMDILISD::IMAGE3D_INFO1"; + + }; +} +bool +AMDILTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const +{ + if (Intrinsic <= AMDGPUIntrinsic::last_non_AMDIL_intrinsic + || Intrinsic > AMDGPUIntrinsic::num_AMDIL_intrinsics) { + return false; + } + bool bitCastToInt = false; + unsigned IntNo; + bool isRet = true; + const AMDILSubtarget *STM = &this->getTargetMachine() + .getSubtarget<AMDILSubtarget>(); + switch (Intrinsic) { + default: return false; // Don't custom lower most intrinsics. + case AMDGPUIntrinsic::AMDIL_atomic_add_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_add_gu32: + IntNo = AMDILISD::ATOM_G_ADD; break; + case AMDGPUIntrinsic::AMDIL_atomic_add_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_add_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_ADD_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_add_lu32: + case AMDGPUIntrinsic::AMDIL_atomic_add_li32: + IntNo = AMDILISD::ATOM_L_ADD; break; + case AMDGPUIntrinsic::AMDIL_atomic_add_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_add_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_ADD_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_add_ru32: + case AMDGPUIntrinsic::AMDIL_atomic_add_ri32: + IntNo = AMDILISD::ATOM_R_ADD; break; + case AMDGPUIntrinsic::AMDIL_atomic_add_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_add_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_ADD_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_and_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_and_gu32: + IntNo = AMDILISD::ATOM_G_AND; break; + case AMDGPUIntrinsic::AMDIL_atomic_and_gi32_noret: + 
case AMDGPUIntrinsic::AMDIL_atomic_and_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_AND_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_and_li32: + case AMDGPUIntrinsic::AMDIL_atomic_and_lu32: + IntNo = AMDILISD::ATOM_L_AND; break; + case AMDGPUIntrinsic::AMDIL_atomic_and_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_and_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_AND_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_and_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_and_ru32: + IntNo = AMDILISD::ATOM_R_AND; break; + case AMDGPUIntrinsic::AMDIL_atomic_and_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_and_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_AND_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gu32: + IntNo = AMDILISD::ATOM_G_CMPXCHG; break; + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_CMPXCHG_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_li32: + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_lu32: + IntNo = AMDILISD::ATOM_L_CMPXCHG; break; + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_CMPXCHG_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ru32: + IntNo = AMDILISD::ATOM_R_CMPXCHG; break; + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_cmpxchg_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_CMPXCHG_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_dec_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_dec_gu32: + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_G_DEC; + } else { + IntNo = AMDILISD::ATOM_G_SUB; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_dec_gi32_noret: + case 
AMDGPUIntrinsic::AMDIL_atomic_dec_gu32_noret: + isRet = false; + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_G_DEC_NORET; + } else { + IntNo = AMDILISD::ATOM_G_SUB_NORET; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_dec_li32: + case AMDGPUIntrinsic::AMDIL_atomic_dec_lu32: + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_L_DEC; + } else { + IntNo = AMDILISD::ATOM_L_SUB; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_dec_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_dec_lu32_noret: + isRet = false; + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_L_DEC_NORET; + } else { + IntNo = AMDILISD::ATOM_L_SUB_NORET; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_dec_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_dec_ru32: + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_R_DEC; + } else { + IntNo = AMDILISD::ATOM_R_SUB; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_dec_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_dec_ru32_noret: + isRet = false; + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_R_DEC_NORET; + } else { + IntNo = AMDILISD::ATOM_R_SUB_NORET; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_inc_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_inc_gu32: + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_G_INC; + } else { + IntNo = AMDILISD::ATOM_G_ADD; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_inc_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_inc_gu32_noret: + isRet = false; + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_G_INC_NORET; + } else { + IntNo = AMDILISD::ATOM_G_ADD_NORET; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_inc_li32: + case AMDGPUIntrinsic::AMDIL_atomic_inc_lu32: + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_L_INC; + } else { + IntNo = AMDILISD::ATOM_L_ADD; + } + break; + case 
AMDGPUIntrinsic::AMDIL_atomic_inc_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_inc_lu32_noret: + isRet = false; + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_L_INC_NORET; + } else { + IntNo = AMDILISD::ATOM_L_ADD_NORET; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_inc_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_inc_ru32: + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_R_INC; + } else { + IntNo = AMDILISD::ATOM_R_ADD; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_inc_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_inc_ru32_noret: + isRet = false; + if (STM->calVersion() >= CAL_VERSION_SC_136) { + IntNo = AMDILISD::ATOM_R_INC_NORET; + } else { + IntNo = AMDILISD::ATOM_R_ADD_NORET; + } + break; + case AMDGPUIntrinsic::AMDIL_atomic_max_gi32: + IntNo = AMDILISD::ATOM_G_MAX; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_gu32: + IntNo = AMDILISD::ATOM_G_UMAX; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_gi32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_MAX_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_UMAX_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_li32: + IntNo = AMDILISD::ATOM_L_MAX; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_lu32: + IntNo = AMDILISD::ATOM_L_UMAX; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_li32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_MAX_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_UMAX_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_ri32: + IntNo = AMDILISD::ATOM_R_MAX; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_ru32: + IntNo = AMDILISD::ATOM_R_UMAX; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_ri32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_MAX_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_max_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_UMAX_NORET; break; + case 
AMDGPUIntrinsic::AMDIL_atomic_min_gi32: + IntNo = AMDILISD::ATOM_G_MIN; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_gu32: + IntNo = AMDILISD::ATOM_G_UMIN; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_gi32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_MIN_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_UMIN_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_li32: + IntNo = AMDILISD::ATOM_L_MIN; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_lu32: + IntNo = AMDILISD::ATOM_L_UMIN; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_li32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_MIN_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_UMIN_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_ri32: + IntNo = AMDILISD::ATOM_R_MIN; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_ru32: + IntNo = AMDILISD::ATOM_R_UMIN; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_ri32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_MIN_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_min_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_UMIN_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_or_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_or_gu32: + IntNo = AMDILISD::ATOM_G_OR; break; + case AMDGPUIntrinsic::AMDIL_atomic_or_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_or_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_OR_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_or_li32: + case AMDGPUIntrinsic::AMDIL_atomic_or_lu32: + IntNo = AMDILISD::ATOM_L_OR; break; + case AMDGPUIntrinsic::AMDIL_atomic_or_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_or_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_OR_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_or_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_or_ru32: + IntNo = AMDILISD::ATOM_R_OR; break; + case AMDGPUIntrinsic::AMDIL_atomic_or_ri32_noret: + case 
AMDGPUIntrinsic::AMDIL_atomic_or_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_OR_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_sub_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_sub_gu32: + IntNo = AMDILISD::ATOM_G_SUB; break; + case AMDGPUIntrinsic::AMDIL_atomic_sub_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_sub_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_SUB_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_sub_li32: + case AMDGPUIntrinsic::AMDIL_atomic_sub_lu32: + IntNo = AMDILISD::ATOM_L_SUB; break; + case AMDGPUIntrinsic::AMDIL_atomic_sub_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_sub_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_SUB_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_sub_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_sub_ru32: + IntNo = AMDILISD::ATOM_R_SUB; break; + case AMDGPUIntrinsic::AMDIL_atomic_sub_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_sub_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_SUB_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_rsub_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_rsub_gu32: + IntNo = AMDILISD::ATOM_G_RSUB; break; + case AMDGPUIntrinsic::AMDIL_atomic_rsub_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_rsub_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_RSUB_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_rsub_li32: + case AMDGPUIntrinsic::AMDIL_atomic_rsub_lu32: + IntNo = AMDILISD::ATOM_L_RSUB; break; + case AMDGPUIntrinsic::AMDIL_atomic_rsub_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_rsub_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_RSUB_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_rsub_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_rsub_ru32: + IntNo = AMDILISD::ATOM_R_RSUB; break; + case AMDGPUIntrinsic::AMDIL_atomic_rsub_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_rsub_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_RSUB_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_gf32: + bitCastToInt = 
true; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_xchg_gu32: + IntNo = AMDILISD::ATOM_G_XCHG; break; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_gf32_noret: + bitCastToInt = true; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_xchg_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_XCHG_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_lf32: + bitCastToInt = true; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_li32: + case AMDGPUIntrinsic::AMDIL_atomic_xchg_lu32: + IntNo = AMDILISD::ATOM_L_XCHG; break; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_lf32_noret: + bitCastToInt = true; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_xchg_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_XCHG_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_rf32: + bitCastToInt = true; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_xchg_ru32: + IntNo = AMDILISD::ATOM_R_XCHG; break; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_rf32_noret: + bitCastToInt = true; + case AMDGPUIntrinsic::AMDIL_atomic_xchg_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_xchg_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_XCHG_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_xor_gi32: + case AMDGPUIntrinsic::AMDIL_atomic_xor_gu32: + IntNo = AMDILISD::ATOM_G_XOR; break; + case AMDGPUIntrinsic::AMDIL_atomic_xor_gi32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_xor_gu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_G_XOR_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_xor_li32: + case AMDGPUIntrinsic::AMDIL_atomic_xor_lu32: + IntNo = AMDILISD::ATOM_L_XOR; break; + case AMDGPUIntrinsic::AMDIL_atomic_xor_li32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_xor_lu32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_L_XOR_NORET; break; + case AMDGPUIntrinsic::AMDIL_atomic_xor_ri32: + case AMDGPUIntrinsic::AMDIL_atomic_xor_ru32: + 
IntNo = AMDILISD::ATOM_R_XOR; break; + case AMDGPUIntrinsic::AMDIL_atomic_xor_ri32_noret: + case AMDGPUIntrinsic::AMDIL_atomic_xor_ru32_noret: + isRet = false; + IntNo = AMDILISD::ATOM_R_XOR_NORET; break; + case AMDGPUIntrinsic::AMDIL_append_alloc_i32: + IntNo = AMDILISD::APPEND_ALLOC; break; + case AMDGPUIntrinsic::AMDIL_append_alloc_i32_noret: + isRet = false; + IntNo = AMDILISD::APPEND_ALLOC_NORET; break; + case AMDGPUIntrinsic::AMDIL_append_consume_i32: + IntNo = AMDILISD::APPEND_CONSUME; break; + case AMDGPUIntrinsic::AMDIL_append_consume_i32_noret: + isRet = false; + IntNo = AMDILISD::APPEND_CONSUME_NORET; break; + }; + const AMDILSubtarget *stm = &this->getTargetMachine() + .getSubtarget<AMDILSubtarget>(); + AMDILKernelManager *KM = const_cast<AMDILKernelManager*>( + stm->getKernelManager()); + KM->setOutputInst(); + + Info.opc = IntNo; + Info.memVT = (bitCastToInt) ? MVT::f32 : MVT::i32; + Info.ptrVal = I.getOperand(0); + Info.offset = 0; + Info.align = 4; + Info.vol = true; + Info.readMem = isRet; + Info.writeMem = true; + return true; +} +// The backend supports 32 and 64 bit floating point immediates +bool +AMDILTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const +{ + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { + return true; + } else { + return false; + } +} + +bool +AMDILTargetLowering::ShouldShrinkFPConstant(EVT VT) const +{ + if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32 + || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) { + return false; + } else { + return true; + } +} + + +// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to +// be zero. Op is expected to be a target specific node. Used by DAG +// combiner. 
+ +void +AMDILTargetLowering::computeMaskedBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth) const +{ + APInt KnownZero2; + APInt KnownOne2; + KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything + switch (Op.getOpcode()) { + default: break; + case AMDILISD::SELECT_CC: + DAG.ComputeMaskedBits( + Op.getOperand(1), + KnownZero, + KnownOne, + Depth + 1 + ); + DAG.ComputeMaskedBits( + Op.getOperand(0), + KnownZero2, + KnownOne2 + ); + assert((KnownZero & KnownOne) == 0 + && "Bits known to be one AND zero?"); + assert((KnownZero2 & KnownOne2) == 0 + && "Bits known to be one AND zero?"); + // Only known if known in both the LHS and RHS + KnownOne &= KnownOne2; + KnownZero &= KnownZero2; + break; + }; +} + +// This is the function that determines which calling convention should +// be used. Currently there is only one calling convention +CCAssignFn* +AMDILTargetLowering::CCAssignFnForNode(unsigned int Op) const +{ + //uint64_t CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); + return CC_AMDIL32; +} + +// LowerCallResult - Lower the result values of an ISD::CALL into the +// appropriate copies out of appropriate physical registers. This assumes that +// Chain/InFlag are the input chain/flag to use, and that TheCall is the call +// being lowered. The returns a SDNode with the same number of values as the +// ISD::CALL. 
+SDValue +AMDILTargetLowering::LowerCallResult( + SDValue Chain, + SDValue InFlag, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const +{ + // Assign locations to each value returned by this call + SmallVector<CCValAssign, 16> RVLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + CCInfo.AnalyzeCallResult(Ins, RetCC_AMDIL32); + + // Copy all of the result registers out of their specified physreg. + for (unsigned i = 0; i != RVLocs.size(); ++i) { + EVT CopyVT = RVLocs[i].getValVT(); + if (RVLocs[i].isRegLoc()) { + Chain = DAG.getCopyFromReg( + Chain, + dl, + RVLocs[i].getLocReg(), + CopyVT, + InFlag + ).getValue(1); + SDValue Val = Chain.getValue(0); + InFlag = Chain.getValue(2); + InVals.push_back(Val); + } + } + + return Chain; + +} + +//===----------------------------------------------------------------------===// +// Other Lowering Hooks +//===----------------------------------------------------------------------===// + +MachineBasicBlock * +AMDILTargetLowering::EmitInstrWithCustomInserter( + MachineInstr *MI, MachineBasicBlock *BB) const +{ + const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo(); + switch (MI->getOpcode()) { + ExpandCaseToAllTypes(AMDIL::CMP); + generateCMPInstr(MI, BB, TII); + MI->eraseFromParent(); + break; + default: + break; + } + return BB; +} + +// Recursively assign SDNodeOrdering to any unordered nodes +// This is necessary to maintain source ordering of instructions +// under -O0 to avoid odd-looking "skipping around" issues. 
+ static const SDValue +Ordered( SelectionDAG &DAG, unsigned order, const SDValue New ) +{ + if (order != 0 && DAG.GetOrdering( New.getNode() ) == 0) { + DAG.AssignOrdering( New.getNode(), order ); + for (unsigned i = 0, e = New.getNumOperands(); i < e; ++i) + Ordered( DAG, order, New.getOperand(i) ); + } + return New; +} + +#define LOWER(A) \ + case ISD:: A: \ +return Ordered( DAG, DAG.GetOrdering( Op.getNode() ), Lower##A(Op, DAG) ) + +SDValue +AMDILTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const +{ + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + assert(0 && "Custom lowering code for this" + "instruction is not implemented yet!"); + break; + LOWER(GlobalAddress); + LOWER(JumpTable); + LOWER(ConstantPool); + LOWER(ExternalSymbol); + LOWER(FP_TO_SINT); + LOWER(FP_TO_UINT); + LOWER(SINT_TO_FP); + LOWER(UINT_TO_FP); + LOWER(ADD); + LOWER(MUL); + LOWER(SUB); + LOWER(FDIV); + LOWER(SDIV); + LOWER(SREM); + LOWER(UDIV); + LOWER(UREM); + LOWER(BUILD_VECTOR); + LOWER(INSERT_VECTOR_ELT); + LOWER(EXTRACT_VECTOR_ELT); + LOWER(EXTRACT_SUBVECTOR); + LOWER(SCALAR_TO_VECTOR); + LOWER(CONCAT_VECTORS); + LOWER(AND); + LOWER(OR); + LOWER(SELECT); + LOWER(SELECT_CC); + LOWER(SETCC); + LOWER(SIGN_EXTEND_INREG); + LOWER(BITCAST); + LOWER(DYNAMIC_STACKALLOC); + LOWER(BRCOND); + LOWER(BR_CC); + LOWER(FP_ROUND); + } + return Op; +} + +int +AMDILTargetLowering::getVarArgsFrameOffset() const +{ + return VarArgsFrameOffset; +} +#undef LOWER + +SDValue +AMDILTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const +{ + SDValue DST = Op; + const GlobalAddressSDNode *GADN = cast<GlobalAddressSDNode>(Op); + const GlobalValue *G = GADN->getGlobal(); + const AMDILSubtarget *stm = &this->getTargetMachine() + .getSubtarget<AMDILSubtarget>(); + const AMDILGlobalManager *GM = stm->getGlobalManager(); + DebugLoc DL = Op.getDebugLoc(); + int64_t base_offset = GADN->getOffset(); + int32_t arrayoffset = GM->getArrayOffset(G->getName()); + int32_t 
constoffset = GM->getConstOffset(G->getName()); + if (arrayoffset != -1) { + DST = DAG.getConstant(arrayoffset, MVT::i32); + DST = DAG.getNode(ISD::ADD, DL, MVT::i32, + DST, DAG.getConstant(base_offset, MVT::i32)); + } else if (constoffset != -1) { + if (GM->getConstHWBit(G->getName())) { + DST = DAG.getConstant(constoffset, MVT::i32); + DST = DAG.getNode(ISD::ADD, DL, MVT::i32, + DST, DAG.getConstant(base_offset, MVT::i32)); + } else { + SDValue addr = DAG.getTargetGlobalAddress(G, DL, MVT::i32); + SDValue DPReg = DAG.getRegister(AMDIL::SDP, MVT::i32); + DPReg = DAG.getNode(ISD::ADD, DL, MVT::i32, DPReg, + DAG.getConstant(base_offset, MVT::i32)); + DST = DAG.getNode(AMDILISD::ADDADDR, DL, MVT::i32, addr, DPReg); + } + } else { + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + if (!GV) { + DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); + } else { + if (GV->hasInitializer()) { + const Constant *C = dyn_cast<Constant>(GV->getInitializer()); + if (const ConstantInt *CI = dyn_cast<ConstantInt>(C)) { + DST = DAG.getConstant(CI->getValue(), Op.getValueType()); + + } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(C)) { + DST = DAG.getConstantFP(CF->getValueAPF(), + Op.getValueType()); + } else if (dyn_cast<ConstantAggregateZero>(C)) { + EVT VT = Op.getValueType(); + if (VT.isInteger()) { + DST = DAG.getConstant(0, VT); + } else { + DST = DAG.getConstantFP(0, VT); + } + } else { + assert(!"lowering this type of Global Address " + "not implemented yet!"); + C->dump(); + DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); + } + } else { + DST = DAG.getTargetGlobalAddress(GV, DL, MVT::i32); + } + } + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const +{ + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), MVT::i32); + return Result; +} +SDValue +AMDILTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const +{ + ConstantPoolSDNode *CP 
= cast<ConstantPoolSDNode>(Op); + EVT PtrVT = Op.getValueType(); + SDValue Result; + if (CP->isMachineConstantPoolEntry()) { + Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, + CP->getAlignment(), CP->getOffset(), CP->getTargetFlags()); + } else { + Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, + CP->getAlignment(), CP->getOffset(), CP->getTargetFlags()); + } + return Result; +} + +SDValue +AMDILTargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const +{ + const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); + SDValue Result = DAG.getTargetExternalSymbol(Sym, MVT::i32); + return Result; +} +/// LowerFORMAL_ARGUMENTS - transform physical registers into +/// virtual registers and generate load operations for +/// arguments places on the stack. +/// TODO: isVarArg, hasStructRet, isMemReg + SDValue +AMDILTargetLowering::LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) +const +{ + + MachineFunction &MF = DAG.getMachineFunction(); + AMDILMachineFunctionInfo *FuncInfo + = MF.getInfo<AMDILMachineFunctionInfo>(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + //const Function *Fn = MF.getFunction(); + //MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + SmallVector<CCValAssign, 16> ArgLocs; + CallingConv::ID CC = MF.getFunction()->getCallingConv(); + //bool hasStructRet = MF.getFunction()->hasStructRetAttr(); + + CCState CCInfo(CC, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + + // When more calling conventions are added, they need to be chosen here + CCInfo.AnalyzeFormalArguments(Ins, CC_AMDIL32); + SDValue StackPtr; + + //unsigned int FirstStackArgLoc = 0; + + for (unsigned int i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (VA.isRegLoc()) { + EVT RegVT = VA.getLocVT(); + const 
TargetRegisterClass *RC = getRegClassFromType( + RegVT.getSimpleVT().SimpleTy); + + unsigned int Reg = MF.addLiveIn(VA.getLocReg(), RC); + SDValue ArgValue = DAG.getCopyFromReg( + Chain, + dl, + Reg, + RegVT); + // If this is an 8 or 16-bit value, it is really passed + // promoted to 32 bits. Insert an assert[sz]ext to capture + // this, then truncate to the right size. + + if (VA.getLocInfo() == CCValAssign::SExt) { + ArgValue = DAG.getNode( + ISD::AssertSext, + dl, + RegVT, + ArgValue, + DAG.getValueType(VA.getValVT())); + } else if (VA.getLocInfo() == CCValAssign::ZExt) { + ArgValue = DAG.getNode( + ISD::AssertZext, + dl, + RegVT, + ArgValue, + DAG.getValueType(VA.getValVT())); + } + if (VA.getLocInfo() != CCValAssign::Full) { + ArgValue = DAG.getNode( + ISD::TRUNCATE, + dl, + VA.getValVT(), + ArgValue); + } + // Add the value to the list of arguments + // to be passed in registers + InVals.push_back(ArgValue); + if (isVarArg) { + assert(0 && "Variable arguments are not yet supported"); + // See MipsISelLowering.cpp for ideas on how to implement + } + } else if(VA.isMemLoc()) { + InVals.push_back(LowerMemArgument(Chain, CallConv, Ins, + dl, DAG, VA, MFI, i)); + } else { + assert(0 && "found a Value Assign that is " + "neither a register or a memory location"); + } + } + /*if (hasStructRet) { + assert(0 && "Has struct return is not yet implemented"); + // See MipsISelLowering.cpp for ideas on how to implement + }*/ + + unsigned int StackSize = CCInfo.getNextStackOffset(); + if (isVarArg) { + assert(0 && "Variable arguments are not yet supported"); + // See X86/PPC/CellSPU ISelLowering.cpp for ideas on how to implement + } + // This needs to be changed to non-zero if the return function needs + // to pop bytes + FuncInfo->setBytesToPopOnReturn(StackSize); + return Chain; +} +/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified +/// by "Src" to address "Dst" with size and alignment information specified by +/// the specific parameter 
attribute. The copy will be passed as a byval +/// function parameter. +static SDValue +CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, + ISD::ArgFlagsTy Flags, SelectionDAG &DAG) { + assert(0 && "MemCopy does not exist yet"); + SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); + + return DAG.getMemcpy(Chain, + Src.getDebugLoc(), + Dst, Src, SizeNode, Flags.getByValAlign(), + /*IsVol=*/false, /*AlwaysInline=*/true, + MachinePointerInfo(), MachinePointerInfo()); +} + +SDValue +AMDILTargetLowering::LowerMemOpCallTo(SDValue Chain, + SDValue StackPtr, SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const +{ + unsigned int LocMemOffset = VA.getLocMemOffset(); + SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); + PtrOff = DAG.getNode(ISD::ADD, + dl, + getPointerTy(), StackPtr, PtrOff); + if (Flags.isByVal()) { + PtrOff = CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG); + } else { + PtrOff = DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo::getStack(LocMemOffset), + false, false, 0); + } + return PtrOff; +} +/// LowerCAL - functions arguments are copied from virtual +/// regs to (physical regs)/(stack frame), CALLSEQ_START and +/// CALLSEQ_END are emitted. +/// TODO: isVarArg, isTailCall, hasStructRet +SDValue +AMDILTargetLowering::LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, bool doesNotRet, + bool& isTailCall, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) +const +{ + isTailCall = false; + MachineFunction& MF = DAG.getMachineFunction(); + // FIXME: DO we need to handle fast calling conventions and tail call + // optimizations?? X86/PPC ISelLowering + /*bool hasStructRet = (TheCall->getNumArgs()) + ? 
TheCall->getArgFlags(0).device()->isSRet() + : false;*/ + + MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Analyze operands of the call, assigning locations to each operand + SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), ArgLocs, *DAG.getContext()); + // Analyize the calling operands, but need to change + // if we have more than one calling convetion + CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); + + unsigned int NumBytes = CCInfo.getNextStackOffset(); + if (isTailCall) { + assert(isTailCall && "Tail Call not handled yet!"); + // See X86/PPC ISelLowering + } + + Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); + + SmallVector<std::pair<unsigned int, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; + SDValue StackPtr; + //unsigned int FirstStacArgLoc = 0; + //int LastArgStackLoc = 0; + + // Walk the register/memloc assignments, insert copies/loads + for (unsigned int i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + //bool isByVal = Flags.isByVal(); // handle byval/bypointer registers + // Arguments start after the 5 first operands of ISD::CALL + SDValue Arg = OutVals[i]; + //Promote the value if needed + switch(VA.getLocInfo()) { + default: assert(0 && "Unknown loc info!"); + case CCValAssign::Full: + break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, + dl, + VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, + dl, + VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, + dl, + VA.getLocVT(), Arg); + break; + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else if (VA.isMemLoc()) { + // Create the frame index object for this incoming parameter + int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8, + VA.getLocMemOffset(), true); + SDValue PtrOff = 
DAG.getFrameIndex(FI,getPointerTy()); + + // emit ISD::STORE whichs stores the + // parameter value to a stack Location + MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, + MachinePointerInfo::getFixedStack(FI), + false, false, 0)); + } else { + assert(0 && "Not a Reg/Mem Loc, major error!"); + } + } + if (!MemOpChains.empty()) { + Chain = DAG.getNode(ISD::TokenFactor, + dl, + MVT::Other, + &MemOpChains[0], + MemOpChains.size()); + } + SDValue InFlag; + if (!isTailCall) { + for (unsigned int i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, + dl, + RegsToPass[i].first, + RegsToPass[i].second, + InFlag); + InFlag = Chain.getValue(1); + } + } + + // If the callee is a GlobalAddress/ExternalSymbol node (quite common, + // every direct call is) turn it into a TargetGlobalAddress/ + // TargetExternalSymbol + // node so that legalize doesn't hack it. + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl, getPointerTy()); + } + else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); + } + else if (isTailCall) { + assert(0 && "Tail calls are not handled yet"); + // see X86 ISelLowering for ideas on implementation: 1708 + } + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVTGLUE); + SmallVector<SDValue, 8> Ops; + + if (isTailCall) { + assert(0 && "Tail calls are not handled yet"); + // see X86 ISelLowering for ideas on implementation: 1721 + } + // If this is a direct call, pass the chain and the callee + if (Callee.getNode()) { + Ops.push_back(Chain); + Ops.push_back(Callee); + } + + if (isTailCall) { + assert(0 && "Tail calls are not handled yet"); + // see X86 ISelLowering for ideas on implementation: 1739 + } + + // Add argument registers to the end of the list so that they are known + // live into the call + for (unsigned int i = 0, e = RegsToPass.size(); i != e; 
++i) { + Ops.push_back(DAG.getRegister( + RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + } + if (InFlag.getNode()) { + Ops.push_back(InFlag); + } + + // Emit Tail Call + if (isTailCall) { + assert(0 && "Tail calls are not handled yet"); + // see X86 ISelLowering for ideas on implementation: 1762 + } + + Chain = DAG.getNode(AMDILISD::CALL, + dl, + NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Create the CALLSEQ_END node + Chain = DAG.getCALLSEQ_END( + Chain, + DAG.getIntPtrConstant(NumBytes, true), + DAG.getIntPtrConstant(0, true), + InFlag); + InFlag = Chain.getValue(1); + // Handle result values, copying them out of physregs into vregs that + // we return + return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, + InVals); +} +static void checkMADType( + SDValue Op, const AMDILSubtarget *STM, bool& is24bitMAD, bool& is32bitMAD) +{ + bool globalLoadStore = false; + is24bitMAD = false; + is32bitMAD = false; + return; + assert(Op.getOpcode() == ISD::ADD && "The opcode must be a add in order for " + "this to work correctly!"); + if (Op.getNode()->use_empty()) { + return; + } + for (SDNode::use_iterator nBegin = Op.getNode()->use_begin(), + nEnd = Op.getNode()->use_end(); nBegin != nEnd; ++nBegin) { + SDNode *ptr = *nBegin; + const LSBaseSDNode *lsNode = dyn_cast<LSBaseSDNode>(ptr); + // If we are not a LSBaseSDNode then we don't do this + // optimization. + // If we are a LSBaseSDNode, but the op is not the offset + // or base pointer, then we don't do this optimization + // (i.e. 
we are the value being stored) + if (!lsNode || + (lsNode->writeMem() && lsNode->getOperand(1) == Op)) { + return; + } + const PointerType *PT = + dyn_cast<PointerType>(lsNode->getSrcValue()->getType()); + unsigned as = PT->getAddressSpace(); + switch(as) { + default: + globalLoadStore = true; + case AMDILAS::PRIVATE_ADDRESS: + if (!STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) { + globalLoadStore = true; + } + break; + case AMDILAS::CONSTANT_ADDRESS: + if (!STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) { + globalLoadStore = true; + } + break; + case AMDILAS::LOCAL_ADDRESS: + if (!STM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) { + globalLoadStore = true; + } + break; + case AMDILAS::REGION_ADDRESS: + if (!STM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) { + globalLoadStore = true; + } + break; + } + } + if (globalLoadStore) { + is32bitMAD = true; + } else { + is24bitMAD = true; + } +} + +SDValue +AMDILTargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const +{ + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue DST; + const AMDILSubtarget *stm = &this->getTargetMachine() + .getSubtarget<AMDILSubtarget>(); + bool isVec = OVT.isVector(); + if (OVT.getScalarType() == MVT::i64) { + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i64) { + INTTY = MVT::v2i32; + } + if (stm->device()->usesHardware(AMDILDeviceInfo::LongOps) + && INTTY == MVT::i32) { + DST = DAG.getNode(AMDILISD::ADD, + DL, + OVT, + LHS, RHS); + } else { + SDValue LHSLO, LHSHI, RHSLO, RHSHI, INTLO, INTHI; + // TODO: need to turn this into a bitcast of i64/v2i64 to v2i32/v4i32 + LHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, LHS); + RHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, RHS); + LHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, LHS); + RHSHI = DAG.getNode((isVec) ? 
AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, RHS); + INTLO = DAG.getNode(ISD::ADD, DL, INTTY, LHSLO, RHSLO); + INTHI = DAG.getNode(ISD::ADD, DL, INTTY, LHSHI, RHSHI); + SDValue cmp; + cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32), + INTLO, RHSLO); + cmp = DAG.getNode(AMDILISD::INEGATE, DL, INTTY, cmp); + INTHI = DAG.getNode(ISD::ADD, DL, INTTY, INTHI, cmp); + DST = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, OVT, + INTLO, INTHI); + } + } else { + if (LHS.getOpcode() == ISD::FrameIndex || + RHS.getOpcode() == ISD::FrameIndex) { + DST = DAG.getNode(AMDILISD::ADDADDR, + DL, + OVT, + LHS, RHS); + } else { + if (stm->device()->usesHardware(AMDILDeviceInfo::LocalMem) + && LHS.getNumOperands() + && RHS.getNumOperands()) { + bool is24bitMAD = false; + bool is32bitMAD = false; + const ConstantSDNode *LHSConstOpCode = + dyn_cast<ConstantSDNode>(LHS.getOperand(LHS.getNumOperands()-1)); + const ConstantSDNode *RHSConstOpCode = + dyn_cast<ConstantSDNode>(RHS.getOperand(RHS.getNumOperands()-1)); + if ((LHS.getOpcode() == ISD::SHL && LHSConstOpCode) + || (RHS.getOpcode() == ISD::SHL && RHSConstOpCode) + || LHS.getOpcode() == ISD::MUL + || RHS.getOpcode() == ISD::MUL) { + SDValue Op1, Op2, Op3; + // FIXME: Fix this so that it works for unsigned 24bit ops. 
+ if (LHS.getOpcode() == ISD::MUL) { + Op1 = LHS.getOperand(0); + Op2 = LHS.getOperand(1); + Op3 = RHS; + } else if (RHS.getOpcode() == ISD::MUL) { + Op1 = RHS.getOperand(0); + Op2 = RHS.getOperand(1); + Op3 = LHS; + } else if (LHS.getOpcode() == ISD::SHL && LHSConstOpCode) { + Op1 = LHS.getOperand(0); + Op2 = DAG.getConstant( + 1 << LHSConstOpCode->getZExtValue(), MVT::i32); + Op3 = RHS; + } else if (RHS.getOpcode() == ISD::SHL && RHSConstOpCode) { + Op1 = RHS.getOperand(0); + Op2 = DAG.getConstant( + 1 << RHSConstOpCode->getZExtValue(), MVT::i32); + Op3 = LHS; + } + checkMADType(Op, stm, is24bitMAD, is32bitMAD); + // We can possibly do a MAD transform! + if (is24bitMAD && stm->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) { + uint32_t opcode = AMDGPUIntrinsic::AMDIL_mad24_i32; + SDVTList Tys = DAG.getVTList(OVT/*, MVT::Other*/); + DST = DAG.getNode(ISD::INTRINSIC_W_CHAIN, + DL, Tys, DAG.getEntryNode(), DAG.getConstant(opcode, MVT::i32), + Op1, Op2, Op3); + } else if(is32bitMAD) { + SDVTList Tys = DAG.getVTList(OVT/*, MVT::Other*/); + DST = DAG.getNode(ISD::INTRINSIC_W_CHAIN, + DL, Tys, DAG.getEntryNode(), + DAG.getConstant( + AMDGPUIntrinsic::AMDIL_mad_i32, MVT::i32), + Op1, Op2, Op3); + } + } + } + DST = DAG.getNode(AMDILISD::ADD, + DL, + OVT, + LHS, RHS); + } + } + return DST; +} +SDValue +AMDILTargetLowering::genCLZuN(SDValue Op, SelectionDAG &DAG, + uint32_t bits) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT INTTY = Op.getValueType(); + EVT FPTY; + if (INTTY.isVector()) { + FPTY = EVT(MVT::getVectorVT(MVT::f32, + INTTY.getVectorNumElements())); + } else { + FPTY = EVT(MVT::f32); + } + /* static inline uint + __clz_Nbit(uint x) + { + int xor = 0x3f800000U | x; + float tp = as_float(xor); + float t = tp + -1.0f; + uint tint = as_uint(t); + int cmp = (x != 0); + uint tsrc = tint >> 23; + uint tmask = tsrc & 0xffU; + uint cst = (103 + N)U - tmask; + return cmp ? 
cst : N; + } + */ + assert(INTTY.getScalarType().getSimpleVT().SimpleTy == MVT::i32 + && "genCLZu16 only works on 32bit types"); + // uint x = Op + SDValue x = Op; + // xornode = 0x3f800000 | x + SDValue xornode = DAG.getNode(ISD::OR, DL, INTTY, + DAG.getConstant(0x3f800000, INTTY), x); + // float tp = as_float(xornode) + SDValue tp = DAG.getNode(ISDBITCAST, DL, FPTY, xornode); + // float t = tp + -1.0f + SDValue t = DAG.getNode(ISD::FADD, DL, FPTY, tp, + DAG.getConstantFP(-1.0f, FPTY)); + // uint tint = as_uint(t) + SDValue tint = DAG.getNode(ISDBITCAST, DL, INTTY, t); + // int cmp = (x != 0) + SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETNE, MVT::i32), MVT::i32), x, + DAG.getConstant(0, INTTY)); + // uint tsrc = tint >> 23 + SDValue tsrc = DAG.getNode(ISD::SRL, DL, INTTY, tint, + DAG.getConstant(23, INTTY)); + // uint tmask = tsrc & 0xFF + SDValue tmask = DAG.getNode(ISD::AND, DL, INTTY, tsrc, + DAG.getConstant(0xFFU, INTTY)); + // uint cst = (103 + bits) - tmask + SDValue cst = DAG.getNode(ISD::SUB, DL, INTTY, + DAG.getConstant((103U + bits), INTTY), tmask); + // return cmp ? cst : N + cst = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp, cst, + DAG.getConstant(bits, INTTY)); + return cst; +} + +SDValue +AMDILTargetLowering::genCLZu32(SDValue Op, SelectionDAG &DAG) const +{ + SDValue DST = SDValue(); + DebugLoc DL = Op.getDebugLoc(); + EVT INTTY = Op.getValueType(); + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + if (stm->device()->getGeneration() >= AMDILDeviceInfo::HD5XXX) { + //__clz_32bit(uint u) + //{ + // int z = __amdil_ffb_hi(u) ; + // return z < 0 ? 
32 : z; + // } + // uint u = op + SDValue u = Op; + // int z = __amdil_ffb_hi(u) + SDValue z = DAG.getNode(AMDILISD::IFFB_HI, DL, INTTY, u); + // int cmp = z < 0 + SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32), + z, DAG.getConstant(0, INTTY)); + // return cmp ? 32 : z + DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp, + DAG.getConstant(32, INTTY), z); + } else if (stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // static inline uint + //__clz_32bit(uint x) + //{ + // uint zh = __clz_16bit(x >> 16); + // uint zl = __clz_16bit(x & 0xffffU); + // return zh == 16U ? 16U + zl : zh; + //} + // uint x = Op + SDValue x = Op; + // uint xs16 = x >> 16 + SDValue xs16 = DAG.getNode(ISD::SRL, DL, INTTY, x, + DAG.getConstant(16, INTTY)); + // uint zh = __clz_16bit(xs16) + SDValue zh = genCLZuN(xs16, DAG, 16); + // uint xa16 = x & 0xFFFF + SDValue xa16 = DAG.getNode(ISD::AND, DL, INTTY, x, + DAG.getConstant(0xFFFFU, INTTY)); + // uint zl = __clz_16bit(xa16) + SDValue zl = genCLZuN(xa16, DAG, 16); + // uint cmp = zh == 16U + SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + zh, DAG.getConstant(16U, INTTY)); + // uint zl16 = zl + 16 + SDValue zl16 = DAG.getNode(ISD::ADD, DL, INTTY, + DAG.getConstant(16, INTTY), zl); + // return cmp ? 
zl16 : zh + DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, + cmp, zl16, zh); + } else { + assert(0 && "Attempting to generate a CLZ function with an" + " unknown graphics card"); + } + return DST; +} +SDValue +AMDILTargetLowering::genCLZu64(SDValue Op, SelectionDAG &DAG) const +{ + SDValue DST = SDValue(); + DebugLoc DL = Op.getDebugLoc(); + EVT INTTY; + EVT LONGTY = Op.getValueType(); + bool isVec = LONGTY.isVector(); + if (isVec) { + INTTY = EVT(MVT::getVectorVT(MVT::i32, Op.getValueType() + .getVectorNumElements())); + } else { + INTTY = EVT(MVT::i32); + } + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + if (stm->device()->getGeneration() >= AMDILDeviceInfo::HD5XXX) { + // Evergreen: + // static inline uint + // __clz_u64(ulong x) + // { + //uint zhi = __clz_32bit((uint)(x >> 32)); + //uint zlo = __clz_32bit((uint)(x & 0xffffffffUL)); + //return zhi == 32U ? 32U + zlo : zhi; + //} + //ulong x = op + SDValue x = Op; + // uint xhi = x >> 32 + SDValue xlo = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, x); + // uint xlo = x & 0xFFFFFFFF + SDValue xhi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, x); + // uint zhi = __clz_32bit(xhi) + SDValue zhi = genCLZu32(xhi, DAG); + // uint zlo = __clz_32bit(xlo) + SDValue zlo = genCLZu32(xlo, DAG); + // uint cmp = zhi == 32 + SDValue cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + zhi, DAG.getConstant(32U, INTTY)); + // uint zlop32 = 32 + zlo + SDValue zlop32 = DAG.getNode(AMDILISD::ADD, DL, INTTY, + DAG.getConstant(32U, INTTY), zlo); + // return cmp ? 
zlop32: zhi + DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp, zlop32, zhi); + } else if (stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // HD4XXX: + // static inline uint + //__clz_64bit(ulong x) + //{ + //uint zh = __clz_23bit((uint)(x >> 46)) - 5U; + //uint zm = __clz_23bit((uint)(x >> 23) & 0x7fffffU); + //uint zl = __clz_23bit((uint)x & 0x7fffffU); + //uint r = zh == 18U ? 18U + zm : zh; + //return zh + zm == 41U ? 41U + zl : r; + //} + //ulong x = Op + SDValue x = Op; + // ulong xs46 = x >> 46 + SDValue xs46 = DAG.getNode(ISD::SRL, DL, LONGTY, x, + DAG.getConstant(46, LONGTY)); + // uint ixs46 = (uint)xs46 + SDValue ixs46 = DAG.getNode(ISD::TRUNCATE, DL, INTTY, xs46); + // ulong xs23 = x >> 23 + SDValue xs23 = DAG.getNode(ISD::SRL, DL, LONGTY, x, + DAG.getConstant(23, LONGTY)); + // uint ixs23 = (uint)xs23 + SDValue ixs23 = DAG.getNode(ISD::TRUNCATE, DL, INTTY, xs23); + // uint xs23m23 = ixs23 & 0x7FFFFF + SDValue xs23m23 = DAG.getNode(ISD::AND, DL, INTTY, ixs23, + DAG.getConstant(0x7fffffU, INTTY)); + // uint ix = (uint)x + SDValue ix = DAG.getNode((isVec) ? 
AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, x); + // uint xm23 = ix & 0x7FFFFF + SDValue xm23 = DAG.getNode(ISD::AND, DL, INTTY, ix, + DAG.getConstant(0x7fffffU, INTTY)); + // uint zh = __clz_23bit(ixs46) + SDValue zh = genCLZuN(ixs46, DAG, 23); + // uint zm = __clz_23bit(xs23m23) + SDValue zm = genCLZuN(xs23m23, DAG, 23); + // uint zl = __clz_23bit(xm23) + SDValue zl = genCLZuN(xm23, DAG, 23); + // uint zhm5 = zh - 5 + SDValue zhm5 = DAG.getNode(ISD::ADD, DL, INTTY, zh, + DAG.getConstant(-5U, INTTY)); + SDValue const18 = DAG.getConstant(18, INTTY); + SDValue const41 = DAG.getConstant(41, INTTY); + // uint cmp1 = zh = 18 + SDValue cmp1 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + zhm5, const18); + // uint zhm5zm = zhm5 + zh + SDValue zhm5zm = DAG.getNode(ISD::ADD, DL, INTTY, zhm5, zm); + // uint cmp2 = zhm5zm == 41 + SDValue cmp2 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + zhm5zm, const41); + // uint zmp18 = zhm5 + 18 + SDValue zmp18 = DAG.getNode(ISD::ADD, DL, INTTY, zm, const18); + // uint zlp41 = zl + 41 + SDValue zlp41 = DAG.getNode(ISD::ADD, DL, INTTY, zl, const41); + // uint r = cmp1 ? zmp18 : zh + SDValue r = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, + cmp1, zmp18, zhm5); + // return cmp2 ? 
zlp41 : r + DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, cmp2, zlp41, r); + } else { + assert(0 && "Attempting to generate a CLZ function with an" + " unknown graphics card"); + } + return DST; +} +SDValue +AMDILTargetLowering::genf64toi64(SDValue RHS, SelectionDAG &DAG, + bool includeSign) const +{ + EVT INTVT; + EVT LONGVT; + SDValue DST; + DebugLoc DL = RHS.getDebugLoc(); + EVT RHSVT = RHS.getValueType(); + bool isVec = RHSVT.isVector(); + if (isVec) { + LONGVT = EVT(MVT::getVectorVT(MVT::i64, RHSVT + .getVectorNumElements())); + INTVT = EVT(MVT::getVectorVT(MVT::i32, RHSVT + .getVectorNumElements())); + } else { + LONGVT = EVT(MVT::i64); + INTVT = EVT(MVT::i32); + } + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + // unsigned version: + // uint uhi = (uint)(d * 0x1.0p-32); + // uint ulo = (uint)(mad((double)uhi, -0x1.0p+32, d)); + // return as_ulong2((uint2)(ulo, uhi)); + // + // signed version: + // double ad = fabs(d); + // long l = unsigned_version(ad); + // long nl = -l; + // return d == ad ? l : nl; + SDValue d = RHS; + if (includeSign) { + d = DAG.getNode(ISD::FABS, DL, RHSVT, d); + } + // getConstantFP takes the numeric value, not an IEEE bit pattern: + // pass 2^-32 and -2^32 as doubles to match the pseudo-code above. + SDValue uhid = DAG.getNode(ISD::FMUL, DL, RHSVT, d, + DAG.getConstantFP(1.0 / 4294967296.0, RHSVT)); + SDValue uhi = DAG.getNode(ISD::FP_TO_UINT, DL, INTVT, uhid); + SDValue ulod = DAG.getNode(ISD::UINT_TO_FP, DL, RHSVT, uhi); + ulod = DAG.getNode(AMDILISD::MAD, DL, RHSVT, ulod, + DAG.getConstantFP(-4294967296.0, RHSVT), d); + SDValue ulo = DAG.getNode(ISD::FP_TO_UINT, DL, INTVT, ulod); + SDValue l = DAG.getNode((isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, ulo, uhi); + if (includeSign) { + SDValue nl = DAG.getNode(AMDILISD::INEGATE, DL, LONGVT, l); + SDValue c = DAG.getNode(AMDILISD::CMP, DL, RHSVT, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::f64), MVT::i32), + RHS, d); + l = DAG.getNode(AMDILISD::CMOVLOG, DL, LONGVT, c, l, nl); + } + DST = l; + } else { + /* + __attribute__((always_inline)) long + cast_f64_to_i64(double d) + { + // Convert d in to 32-bit components + long x = as_long(d); + xhi = LCOMPHI(x); + xlo = LCOMPLO(x); + + // Generate 'normalized' mantissa + mhi = xhi | 0x00100000; // hidden bit + mhi <<= 11; + temp = xlo >> (32 - 11); + mhi |= temp + mlo = xlo << 11; + + // Compute shift right count from exponent + e = (xhi >> (52-32)) & 0x7ff; + sr = 1023 + 63 - e; + srge64 = sr >= 64; + srge32 = sr >= 32; + + // Compute result for 0 <= sr < 32 + rhi0 = mhi >> (sr &31); + rlo0 = mlo >> (sr &31); + temp = mhi << (32 - sr); + temp |= rlo0; + rlo0 = sr ? temp : rlo0; + + // Compute result for 32 <= sr + rhi1 = 0; + rlo1 = srge64 ? 0 : rhi0; + + // Pick between the 2 results + rhi = srge32 ? rhi1 : rhi0; + rlo = srge32 ? rlo1 : rlo0; + + // Optional saturate on overflow + srlt0 = sr < 0; + rhi = srlt0 ? MAXVALUE : rhi; + rlo = srlt0 ? MAXVALUE : rlo; + + // Create long + res = LCREATE( rlo, rhi ); + + // Deal with sign bit (ignoring whether result is signed or unsigned value) + if (includeSign) { + sign = ((signed int) xhi) >> 31; fill with sign bit + sign = LCREATE( sign, sign ); + res += sign; + res ^= sign; + } + + return res; + } + */ + SDValue c11 = DAG.getConstant( 63 - 52, INTVT ); + SDValue c32 = DAG.getConstant( 32, INTVT ); + + // Convert d in to 32-bit components + SDValue d = RHS; + SDValue x = DAG.getNode(ISDBITCAST, DL, LONGVT, d); + SDValue xhi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x ); + SDValue xlo = DAG.getNode( (isVec) ? 
AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x ); + + // Generate 'normalized' mantissa + SDValue mhi = DAG.getNode( ISD::OR, DL, INTVT, + xhi, DAG.getConstant( 0x00100000, INTVT ) ); + mhi = DAG.getNode( ISD::SHL, DL, INTVT, mhi, c11 ); + SDValue temp = DAG.getNode( ISD::SRL, DL, INTVT, + xlo, DAG.getConstant( 32 - (63 - 52), INTVT ) ); + mhi = DAG.getNode( ISD::OR, DL, INTVT, mhi, temp ); + SDValue mlo = DAG.getNode( ISD::SHL, DL, INTVT, xlo, c11 ); + + // Compute shift right count from exponent + SDValue e = DAG.getNode( ISD::SRL, DL, INTVT, + xhi, DAG.getConstant( 52-32, INTVT ) ); + e = DAG.getNode( ISD::AND, DL, INTVT, + e, DAG.getConstant( 0x7ff, INTVT ) ); + SDValue sr = DAG.getNode( ISD::SUB, DL, INTVT, + DAG.getConstant( 1023 + 63, INTVT ), e ); + SDValue srge64 = DAG.getNode( AMDILISD::CMP, DL, INTVT, + DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32), + sr, DAG.getConstant(64, INTVT)); + SDValue srge32 = DAG.getNode( AMDILISD::CMP, DL, INTVT, + DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32), + sr, DAG.getConstant(32, INTVT)); + + // Compute result for 0 <= sr < 32 + SDValue rhi0 = DAG.getNode( ISD::SRL, DL, INTVT, mhi, sr ); + SDValue rlo0 = DAG.getNode( ISD::SRL, DL, INTVT, mlo, sr ); + temp = DAG.getNode( ISD::SUB, DL, INTVT, c32, sr ); + temp = DAG.getNode( ISD::SHL, DL, INTVT, mhi, temp ); + temp = DAG.getNode( ISD::OR, DL, INTVT, rlo0, temp ); + rlo0 = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, sr, temp, rlo0 ); + + // Compute result for 32 <= sr + SDValue rhi1 = DAG.getConstant( 0, INTVT ); + SDValue rlo1 = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + srge64, rhi1, rhi0 ); + + // Pick between the 2 results + SDValue rhi = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + srge32, rhi1, rhi0 ); + SDValue rlo = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + srge32, rlo1, rlo0 ); + + // Create long + SDValue res = DAG.getNode( (isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, rlo, rhi ); + + // Deal with sign bit + if (includeSign) { + SDValue sign = DAG.getNode( ISD::SRA, DL, INTVT, + xhi, DAG.getConstant( 31, INTVT ) ); + sign = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, sign, sign ); + res = DAG.getNode( ISD::ADD, DL, LONGVT, res, sign ); + res = DAG.getNode( ISD::XOR, DL, LONGVT, res, sign ); + } + DST = res; + } + return DST; +} +SDValue +AMDILTargetLowering::genf64toi32(SDValue RHS, SelectionDAG &DAG, + bool includeSign) const +{ + EVT INTVT; + EVT LONGVT; + DebugLoc DL = RHS.getDebugLoc(); + EVT RHSVT = RHS.getValueType(); + bool isVec = RHSVT.isVector(); + if (isVec) { + LONGVT = EVT(MVT::getVectorVT(MVT::i64, + RHSVT.getVectorNumElements())); + INTVT = EVT(MVT::getVectorVT(MVT::i32, + RHSVT.getVectorNumElements())); + } else { + LONGVT = EVT(MVT::i64); + INTVT = EVT(MVT::i32); + } + /* + __attribute__((always_inline)) int + cast_f64_to_[u|i]32(double d) + { + // Convert d in to 32-bit components + long x = as_long(d); + xhi = LCOMPHI(x); + xlo = LCOMPLO(x); + + // Generate 'normalized' mantissa + mhi = xhi | 0x00100000; // hidden bit + mhi <<= 11; + temp = xlo >> (32 - 11); + mhi |= temp + + // Compute shift right count from exponent + e = (xhi >> (52-32)) & 0x7ff; + sr = 1023 + 31 - e; + srge32 = sr >= 32; + + // Compute result for 0 <= sr < 32 + res = mhi >> (sr &31); + res = srge32 ? 0 : res; + + // Optional saturate on overflow + srlt0 = sr < 0; + res = srlt0 ? MAXVALUE : res; + + // Deal with sign bit (ignoring whether result is signed or unsigned value) + if (includeSign) { + sign = ((signed int) xhi) >> 31; fill with sign bit + res += sign; + res ^= sign; + } + + return res; + } + */ + SDValue c11 = DAG.getConstant( 63 - 52, INTVT ); + + // Convert d in to 32-bit components + SDValue d = RHS; + SDValue x = DAG.getNode(ISDBITCAST, DL, LONGVT, d); + SDValue xhi = DAG.getNode( (isVec) ? 
AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x ); + SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x ); + + // Generate 'normalized' mantissa + SDValue mhi = DAG.getNode( ISD::OR, DL, INTVT, + xhi, DAG.getConstant( 0x00100000, INTVT ) ); + mhi = DAG.getNode( ISD::SHL, DL, INTVT, mhi, c11 ); + SDValue temp = DAG.getNode( ISD::SRL, DL, INTVT, + xlo, DAG.getConstant( 32 - (63 - 52), INTVT ) ); + mhi = DAG.getNode( ISD::OR, DL, INTVT, mhi, temp ); + + // Compute shift right count from exponent + SDValue e = DAG.getNode( ISD::SRL, DL, INTVT, + xhi, DAG.getConstant( 52-32, INTVT ) ); + e = DAG.getNode( ISD::AND, DL, INTVT, + e, DAG.getConstant( 0x7ff, INTVT ) ); + SDValue sr = DAG.getNode( ISD::SUB, DL, INTVT, + DAG.getConstant( 1023 + 31, INTVT ), e ); + SDValue srge32 = DAG.getNode( AMDILISD::CMP, DL, INTVT, + DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32), + sr, DAG.getConstant(32, INTVT)); + + // Compute result for 0 <= sr < 32 + SDValue res = DAG.getNode( ISD::SRL, DL, INTVT, mhi, sr ); + res = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + srge32, DAG.getConstant(0,INTVT), res ); + + // Deal with sign bit + if (includeSign) { + SDValue sign = DAG.getNode( ISD::SRA, DL, INTVT, + xhi, DAG.getConstant( 31, INTVT ) ); + res = DAG.getNode( ISD::ADD, DL, INTVT, res, sign ); + res = DAG.getNode( ISD::XOR, DL, INTVT, res, sign ); + } + return res; +} +SDValue +AMDILTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const +{ + SDValue RHS = Op.getOperand(0); + EVT RHSVT = RHS.getValueType(); + MVT RST = RHSVT.getScalarType().getSimpleVT(); + EVT LHSVT = Op.getValueType(); + MVT LST = LHSVT.getScalarType().getSimpleVT(); + DebugLoc DL = Op.getDebugLoc(); + SDValue DST; + const AMDILTargetMachine* + amdtm = reinterpret_cast<const AMDILTargetMachine*> + (&this->getTargetMachine()); + const AMDILSubtarget* + stm = dynamic_cast<const AMDILSubtarget*>( + amdtm->getSubtargetImpl()); + if (RST == MVT::f64 && 
RHSVT.isVector() + && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + // We dont support vector 64bit floating point convertions. + for (unsigned x = 0, y = RHSVT.getVectorNumElements(); x < y; ++x) { + SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32)); + op = DAG.getNode(ISD::FP_TO_SINT, DL, LST, op); + if (!x) { + DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op); + } else { + DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT, + DST, op, DAG.getTargetConstant(x, MVT::i32)); + } + } + } else { + if (RST == MVT::f64 + && LST == MVT::i32) { + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + DST = SDValue(Op.getNode(), 0); + } else { + DST = genf64toi32(RHS, DAG, true); + } + } else if (RST == MVT::f64 + && LST == MVT::i64) { + DST = genf64toi64(RHS, DAG, true); + } else if (RST == MVT::f64 + && (LST == MVT::i8 || LST == MVT::i16)) { + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, SDValue(Op.getNode(), 0)); + } else { + SDValue ToInt = genf64toi32(RHS, DAG, true); + DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, ToInt); + } + + } else { + DST = SDValue(Op.getNode(), 0); + } + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const +{ + SDValue DST; + SDValue RHS = Op.getOperand(0); + EVT RHSVT = RHS.getValueType(); + MVT RST = RHSVT.getScalarType().getSimpleVT(); + EVT LHSVT = Op.getValueType(); + MVT LST = LHSVT.getScalarType().getSimpleVT(); + DebugLoc DL = Op.getDebugLoc(); + const AMDILTargetMachine* + amdtm = reinterpret_cast<const AMDILTargetMachine*> + (&this->getTargetMachine()); + const AMDILSubtarget* + stm = dynamic_cast<const AMDILSubtarget*>( + amdtm->getSubtargetImpl()); + if (RST == MVT::f64 && RHSVT.isVector() + && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + // We dont support vector 64bit floating point convertions. 
+ for (unsigned x = 0, y = RHSVT.getVectorNumElements(); x < y; ++x) { + SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32)); + // Unsigned lowering: convert each element with FP_TO_UINT so values + // above the signed range are not mangled by a signed conversion. + op = DAG.getNode(ISD::FP_TO_UINT, DL, LST, op); + if (!x) { + DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op); + } else { + DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT, + DST, op, DAG.getTargetConstant(x, MVT::i32)); + } + + } + } else { + if (RST == MVT::f64 + && LST == MVT::i32) { + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + DST = SDValue(Op.getNode(), 0); + } else { + DST = genf64toi32(RHS, DAG, false); + } + } else if (RST == MVT::f64 + && LST == MVT::i64) { + DST = genf64toi64(RHS, DAG, false); + } else if (RST == MVT::f64 + && (LST == MVT::i8 || LST == MVT::i16)) { + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, SDValue(Op.getNode(), 0)); + } else { + SDValue ToInt = genf64toi32(RHS, DAG, false); + DST = DAG.getNode(ISD::TRUNCATE, DL, LHSVT, ToInt); + } + + } else { + DST = SDValue(Op.getNode(), 0); + } + } + return DST; +} +SDValue +AMDILTargetLowering::genu32tof64(SDValue RHS, EVT LHSVT, + SelectionDAG &DAG) const +{ + EVT RHSVT = RHS.getValueType(); + DebugLoc DL = RHS.getDebugLoc(); + EVT INTVT; + EVT LONGVT; + bool isVec = RHSVT.isVector(); + if (isVec) { + LONGVT = EVT(MVT::getVectorVT(MVT::i64, + RHSVT.getVectorNumElements())); + INTVT = EVT(MVT::getVectorVT(MVT::i32, + RHSVT.getVectorNumElements())); + } else { + LONGVT = EVT(MVT::i64); + INTVT = EVT(MVT::i32); + } + SDValue x = RHS; + const AMDILTargetMachine* + amdtm = reinterpret_cast<const AMDILTargetMachine*> + (&this->getTargetMachine()); + const AMDILSubtarget* + stm = dynamic_cast<const AMDILSubtarget*>( + amdtm->getSubtargetImpl()); + if (stm->calVersion() >= CAL_VERSION_SC_135) { + // unsigned x = RHS; + // ulong xd = (ulong)(0x4330_0000 << 32) | x; + // double d = as_double( xd ); + // return d - 0x1.0p+52; // 0x1.0p+52 
== 0x4330_0000_0000_0000 + SDValue xd = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, x, + DAG.getConstant( 0x43300000, INTVT ) ); + SDValue d = DAG.getNode( ISDBITCAST, DL, LHSVT, xd ); + SDValue offsetd = DAG.getNode( ISDBITCAST, DL, LHSVT, + DAG.getConstant( 0x4330000000000000ULL, LONGVT ) ); + return DAG.getNode( ISD::FSUB, DL, LHSVT, d, offsetd ); + } else { + SDValue clz = genCLZu32(x, DAG); + + // Compute the exponent. 1023 is the bias, 31-clz the actual power of 2 + // Except for an input 0... which requires a 0 exponent + SDValue exp = DAG.getNode( ISD::SUB, DL, INTVT, + DAG.getConstant( (1023+31), INTVT), clz ); + exp = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, x, exp, x ); + + // Normalize frac + SDValue rhi = DAG.getNode( ISD::SHL, DL, INTVT, x, clz ); + + // Eliminate hidden bit + rhi = DAG.getNode( ISD::AND, DL, INTVT, + rhi, DAG.getConstant( 0x7fffffff, INTVT ) ); + + // Pack exponent and frac + SDValue rlo = DAG.getNode( ISD::SHL, DL, INTVT, + rhi, DAG.getConstant( (32 - 11), INTVT ) ); + rhi = DAG.getNode( ISD::SRL, DL, INTVT, + rhi, DAG.getConstant( 11, INTVT ) ); + exp = DAG.getNode( ISD::SHL, DL, INTVT, + exp, DAG.getConstant( 20, INTVT ) ); + rhi = DAG.getNode( ISD::OR, DL, INTVT, rhi, exp ); + + // Convert 2 x 32 in to 1 x 64, then to double precision float type + SDValue res = DAG.getNode( (isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, rlo, rhi ); + return DAG.getNode(ISDBITCAST, DL, LHSVT, res); + } +} +SDValue +AMDILTargetLowering::genu64tof64(SDValue RHS, EVT LHSVT, + SelectionDAG &DAG) const +{ + EVT RHSVT = RHS.getValueType(); + DebugLoc DL = RHS.getDebugLoc(); + EVT INTVT; + EVT LONGVT; + bool isVec = RHSVT.isVector(); + if (isVec) { + INTVT = EVT(MVT::getVectorVT(MVT::i32, + RHSVT.getVectorNumElements())); + } else { + INTVT = EVT(MVT::i32); + } + LONGVT = RHSVT; + SDValue x = RHS; + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + // double dhi = (double)(as_uint2(x).y); + // double dlo = (double)(as_uint2(x).x); + // return mad(dhi, 0x1.0p+32, dlo) + SDValue dhi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x); + dhi = DAG.getNode(ISD::UINT_TO_FP, DL, LHSVT, dhi); + SDValue dlo = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x); + dlo = DAG.getNode(ISD::UINT_TO_FP, DL, LHSVT, dlo); + // getConstantFP takes the numeric value, not an IEEE bit pattern: + // pass 2^32 as a double to match the pseudo-code above. + return DAG.getNode(AMDILISD::MAD, DL, LHSVT, dhi, + DAG.getConstantFP(4294967296.0, LHSVT), dlo); + } else if (stm->calVersion() >= CAL_VERSION_SC_135) { + // double lo = as_double( as_ulong( 0x1.0p+52) | (u & 0xffff_ffffUL)); + // double hi = as_double( as_ulong( 0x1.0p+84) | (u >> 32)); + // return (hi - (0x1.0p+84 + 0x1.0p+52)) + lo; + SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x ); // x & 0xffff_ffffUL + SDValue xd = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, xlo, DAG.getConstant( 0x43300000, INTVT ) ); + SDValue lo = DAG.getNode( ISDBITCAST, DL, LHSVT, xd ); + SDValue xhi = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x ); // x >> 32 + SDValue xe = DAG.getNode( (isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, xhi, DAG.getConstant( 0x45300000, INTVT ) ); + SDValue hi = DAG.getNode( ISDBITCAST, DL, LHSVT, xe ); + SDValue c = DAG.getNode( ISDBITCAST, DL, LHSVT, + DAG.getConstant( 0x4530000000100000ULL, LONGVT ) ); + hi = DAG.getNode( ISD::FSUB, DL, LHSVT, hi, c ); + return DAG.getNode( ISD::FADD, DL, LHSVT, hi, lo ); + + } else { + SDValue clz = genCLZu64(x, DAG); + SDValue xhi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, x ); + SDValue xlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, x ); + + // Compute the exponent. 1023 is the bias, 63-clz the actual power of 2 + SDValue exp = DAG.getNode( ISD::SUB, DL, INTVT, + DAG.getConstant( (1023+63), INTVT), clz ); + SDValue mash = DAG.getNode( ISD::OR, DL, INTVT, xhi, xlo ); + exp = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + mash, exp, mash ); // exp = exp, or 0 if input was 0 + + // Normalize frac + SDValue clz31 = DAG.getNode( ISD::AND, DL, INTVT, + clz, DAG.getConstant( 31, INTVT ) ); + SDValue rshift = DAG.getNode( ISD::SUB, DL, INTVT, + DAG.getConstant( 32, INTVT ), clz31 ); + SDValue t1 = DAG.getNode( ISD::SHL, DL, INTVT, xhi, clz31 ); + SDValue t2 = DAG.getNode( ISD::SRL, DL, INTVT, xlo, rshift ); + t2 = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, clz31, t2, t1 ); + SDValue rhi1 = DAG.getNode( ISD::OR, DL, INTVT, t1, t2 ); + SDValue rlo1 = DAG.getNode( ISD::SHL, DL, INTVT, xlo, clz31 ); + SDValue rhi2 = DAG.getNode( ISD::SHL, DL, INTVT, xlo, clz31 ); + SDValue rlo2 = DAG.getConstant( 0, INTVT ); + SDValue clz32 = DAG.getNode( ISD::AND, DL, INTVT, + clz, DAG.getConstant( 32, INTVT ) ); + SDValue rhi = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + clz32, rhi2, rhi1 ); + SDValue rlo = DAG.getNode( AMDILISD::CMOVLOG, DL, INTVT, + clz32, rlo2, rlo1 ); + + // Eliminate hidden bit + rhi = DAG.getNode( ISD::AND, DL, INTVT, + rhi, DAG.getConstant( 0x7fffffff, INTVT ) ); + + // Save bits needed to round properly + 
SDValue round = DAG.getNode( ISD::AND, DL, INTVT, + rlo, DAG.getConstant( 0x7ff, INTVT ) ); + + // Pack exponent and frac + rlo = DAG.getNode( ISD::SRL, DL, INTVT, + rlo, DAG.getConstant( 11, INTVT ) ); + SDValue temp = DAG.getNode( ISD::SHL, DL, INTVT, + rhi, DAG.getConstant( (32 - 11), INTVT ) ); + rlo = DAG.getNode( ISD::OR, DL, INTVT, rlo, temp ); + rhi = DAG.getNode( ISD::SRL, DL, INTVT, + rhi, DAG.getConstant( 11, INTVT ) ); + exp = DAG.getNode( ISD::SHL, DL, INTVT, + exp, DAG.getConstant( 20, INTVT ) ); + rhi = DAG.getNode( ISD::OR, DL, INTVT, rhi, exp ); + + // Compute rounding bit + SDValue even = DAG.getNode( ISD::AND, DL, INTVT, + rlo, DAG.getConstant( 1, INTVT ) ); + SDValue grs = DAG.getNode( ISD::AND, DL, INTVT, + round, DAG.getConstant( 0x3ff, INTVT ) ); + grs = DAG.getNode( AMDILISD::CMP, DL, INTVT, + DAG.getConstant( CondCCodeToCC( ISD::SETNE, MVT::i32), MVT::i32), + grs, DAG.getConstant( 0, INTVT ) ); // -1 if any GRS set, 0 if none + grs = DAG.getNode( ISD::OR, DL, INTVT, grs, even ); + round = DAG.getNode( ISD::SRL, DL, INTVT, + round, DAG.getConstant( 10, INTVT ) ); + round = DAG.getNode( ISD::AND, DL, INTVT, round, grs ); // 0 or 1 + + // Add rounding bit + SDValue lround = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, + round, DAG.getConstant( 0, INTVT ) ); + SDValue res = DAG.getNode( (isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, rlo, rhi ); + res = DAG.getNode( ISD::ADD, DL, LONGVT, res, lround ); + return DAG.getNode(ISDBITCAST, DL, LHSVT, res); + } +} +SDValue +AMDILTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const +{ + SDValue RHS = Op.getOperand(0); + EVT RHSVT = RHS.getValueType(); + MVT RST = RHSVT.getScalarType().getSimpleVT(); + EVT LHSVT = Op.getValueType(); + MVT LST = LHSVT.getScalarType().getSimpleVT(); + DebugLoc DL = Op.getDebugLoc(); + SDValue DST; + EVT INTVT; + EVT LONGVT; + const AMDILTargetMachine* + amdtm = reinterpret_cast<const AMDILTargetMachine*> + (&this->getTargetMachine()); + const AMDILSubtarget* + stm = dynamic_cast<const AMDILSubtarget*>( + amdtm->getSubtargetImpl()); + if (LST == MVT::f64 && LHSVT.isVector() + && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + // We dont support vector 64bit floating point convertions. + DST = Op; + for (unsigned x = 0, y = LHSVT.getVectorNumElements(); x < y; ++x) { + SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32)); + op = DAG.getNode(ISD::UINT_TO_FP, DL, LST, op); + if (!x) { + DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op); + } else { + DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT, DST, + op, DAG.getTargetConstant(x, MVT::i32)); + } + + } + } else { + + if (RST == MVT::i32 + && LST == MVT::f64) { + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + DST = SDValue(Op.getNode(), 0); + } else { + DST = genu32tof64(RHS, LHSVT, DAG); + } + } else if (RST == MVT::i64 + && LST == MVT::f64) { + DST = genu64tof64(RHS, LHSVT, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const +{ + SDValue RHS = Op.getOperand(0); + EVT RHSVT = RHS.getValueType(); + MVT RST = RHSVT.getScalarType().getSimpleVT(); + EVT INTVT; + EVT LONGVT; + SDValue DST; + bool isVec = 
RHSVT.isVector(); + DebugLoc DL = Op.getDebugLoc(); + EVT LHSVT = Op.getValueType(); + MVT LST = LHSVT.getScalarType().getSimpleVT(); + const AMDILTargetMachine* + amdtm = reinterpret_cast<const AMDILTargetMachine*> + (&this->getTargetMachine()); + const AMDILSubtarget* + stm = dynamic_cast<const AMDILSubtarget*>( + amdtm->getSubtargetImpl()); + if (LST == MVT::f64 && LHSVT.isVector() + && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + // We dont support vector 64bit floating point convertions. + for (unsigned x = 0, y = LHSVT.getVectorNumElements(); x < y; ++x) { + SDValue op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, RST, RHS, DAG.getTargetConstant(x, MVT::i32)); + // Signed lowering: convert each element with SINT_TO_FP so negative + // inputs are not reinterpreted as large unsigned values. + op = DAG.getNode(ISD::SINT_TO_FP, DL, LST, op); + if (!x) { + DST = DAG.getNode(AMDILISD::VBUILD, DL, LHSVT, op); + } else { + DST = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, LHSVT, DST, + op, DAG.getTargetConstant(x, MVT::i32)); + } + + } + } else { + + if (isVec) { + LONGVT = EVT(MVT::getVectorVT(MVT::i64, + RHSVT.getVectorNumElements())); + INTVT = EVT(MVT::getVectorVT(MVT::i32, + RHSVT.getVectorNumElements())); + } else { + LONGVT = EVT(MVT::i64); + INTVT = EVT(MVT::i32); + } + MVT RST = RHSVT.getScalarType().getSimpleVT(); + if ((RST == MVT::i32 || RST == MVT::i64) + && LST == MVT::f64) { + if (RST == MVT::i32) { + if (stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + DST = SDValue(Op.getNode(), 0); + return DST; + } + } + SDValue c31 = DAG.getConstant( 31, INTVT ); + SDValue cSbit = DAG.getConstant( 0x80000000, INTVT ); + + SDValue S; // Sign, as 0 or -1 + SDValue Sbit; // Sign bit, as one bit, MSB only. + if (RST == MVT::i32) { + Sbit = DAG.getNode( ISD::AND, DL, INTVT, RHS, cSbit ); + S = DAG.getNode(ISD::SRA, DL, RHSVT, RHS, c31 ); + } else { // 64-bit case... SRA of 64-bit values is slow + SDValue hi = DAG.getNode( (isVec) ? 
AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, RHS ); + Sbit = DAG.getNode( ISD::AND, DL, INTVT, hi, cSbit ); + SDValue temp = DAG.getNode( ISD::SRA, DL, INTVT, hi, c31 ); + S = DAG.getNode( (isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, RHSVT, temp, temp ); + } + + // get abs() of input value, given sign as S (0 or -1) + // SpI = RHS + S + SDValue SpI = DAG.getNode(ISD::ADD, DL, RHSVT, RHS, S); + // SpIxS = SpI ^ S + SDValue SpIxS = DAG.getNode(ISD::XOR, DL, RHSVT, SpI, S); + + // Convert unsigned value to double precision + SDValue R; + if (RST == MVT::i32) { + // r = cast_u32_to_f64(SpIxS) + R = genu32tof64(SpIxS, LHSVT, DAG); + } else { + // r = cast_u64_to_f64(SpIxS) + R = genu64tof64(SpIxS, LHSVT, DAG); + } + + // drop in the sign bit + SDValue t = DAG.getNode( AMDILISD::BITCONV, DL, LONGVT, R ); + SDValue thi = DAG.getNode( (isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTVT, t ); + SDValue tlo = DAG.getNode( (isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTVT, t ); + thi = DAG.getNode( ISD::OR, DL, INTVT, thi, Sbit ); + t = DAG.getNode( (isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, LONGVT, tlo, thi ); + DST = DAG.getNode( AMDILISD::BITCONV, DL, LHSVT, t ); + } else { + DST = SDValue(Op.getNode(), 0); + } + } + return DST; +} +SDValue +AMDILTargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const +{ + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue DST; + bool isVec = RHS.getValueType().isVector(); + if (OVT.getScalarType() == MVT::i64) { + /*const AMDILTargetMachine* + amdtm = reinterpret_cast<const AMDILTargetMachine*> + (&this->getTargetMachine()); + const AMDILSubtarget* + stm = dynamic_cast<const AMDILSubtarget*>( + amdtm->getSubtargetImpl());*/ + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i64) { + INTTY = MVT::v2i32; + } + SDValue LHSLO, LHSHI, RHSLO, RHSHI, INTLO, INTHI; + // TODO: need to turn this into a bitcast of i64/v2i64 to v2i32/v4i32 + LHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, LHS); + RHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, INTTY, RHS); + LHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, LHS); + RHSHI = DAG.getNode((isVec) ? 
AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, INTTY, RHS); + INTLO = DAG.getNode(ISD::SUB, DL, INTTY, LHSLO, RHSLO); + INTHI = DAG.getNode(ISD::SUB, DL, INTTY, LHSHI, RHSHI); + //TODO: need to use IBORROW on HD5XXX and later hardware + SDValue cmp; + if (OVT == MVT::i64) { + cmp = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32), + LHSLO, RHSLO); + } else { + SDValue cmplo; + SDValue cmphi; + SDValue LHSRLO = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, MVT::i32, LHSLO, DAG.getTargetConstant(0, MVT::i32)); + SDValue LHSRHI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, MVT::i32, LHSLO, DAG.getTargetConstant(1, MVT::i32)); + SDValue RHSRLO = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, MVT::i32, RHSLO, DAG.getTargetConstant(0, MVT::i32)); + SDValue RHSRHI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, MVT::i32, RHSLO, DAG.getTargetConstant(1, MVT::i32)); + cmplo = DAG.getNode(AMDILISD::CMP, DL, MVT::i32, + DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32), + LHSRLO, RHSRLO); + cmphi = DAG.getNode(AMDILISD::CMP, DL, MVT::i32, + DAG.getConstant(CondCCodeToCC(ISD::SETULT, MVT::i32), MVT::i32), + LHSRHI, RHSRHI); + cmp = DAG.getNode(AMDILISD::VBUILD, DL, MVT::v2i32, cmplo); + cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i32, + cmp, cmphi, DAG.getTargetConstant(1, MVT::i32)); + } + INTHI = DAG.getNode(ISD::ADD, DL, INTTY, INTHI, cmp); + DST = DAG.getNode((isVec) ? 
AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, OVT, + INTLO, INTHI); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} +SDValue +AMDILTargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const +{ + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::f64) { + DST = LowerFDIV64(Op, DAG); + } else if (OVT.getScalarType() == MVT::f32) { + DST = LowerFDIV32(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const +{ + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerSDIV64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerSDIV32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16 + || OVT.getScalarType() == MVT::i8) { + DST = LowerSDIV24(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerUDIV(SDValue Op, SelectionDAG &DAG) const +{ + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerUDIV64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerUDIV32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16 + || OVT.getScalarType() == MVT::i8) { + DST = LowerUDIV24(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const +{ + EVT OVT = Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerSREM64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerSREM32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16) { + DST = LowerSREM16(Op, DAG); + } else if (OVT.getScalarType() == MVT::i8) { + DST = LowerSREM8(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerUREM(SDValue Op, SelectionDAG &DAG) const +{ + EVT OVT = 
Op.getValueType(); + SDValue DST; + if (OVT.getScalarType() == MVT::i64) { + DST = LowerUREM64(Op, DAG); + } else if (OVT.getScalarType() == MVT::i32) { + DST = LowerUREM32(Op, DAG); + } else if (OVT.getScalarType() == MVT::i16) { + DST = LowerUREM16(Op, DAG); + } else if (OVT.getScalarType() == MVT::i8) { + DST = LowerUREM8(Op, DAG); + } else { + DST = SDValue(Op.getNode(), 0); + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue DST; + bool isVec = OVT.isVector(); + if (OVT.getScalarType() != MVT::i64) + { + DST = SDValue(Op.getNode(), 0); + } else { + assert(OVT.getScalarType() == MVT::i64 && "Only 64 bit mul should be lowered!"); + // TODO: This needs to be turned into a tablegen pattern + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i64) { + INTTY = MVT::v2i32; + } + // mul64(h1, l1, h0, l0) + SDValue LHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, + DL, + INTTY, LHS); + SDValue LHSHI = DAG.getNode((isVec) ? AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, + DL, + INTTY, LHS); + SDValue RHSLO = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, + DL, + INTTY, RHS); + SDValue RHSHI = DAG.getNode((isVec) ? 
AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, + DL, + INTTY, RHS); + // MULLO_UINT_1 r1, h0, l1 + SDValue RHILLO = DAG.getNode(AMDILISD::UMUL, + DL, + INTTY, RHSHI, LHSLO); + // MULLO_UINT_1 r2, h1, l0 + SDValue RLOHHI = DAG.getNode(AMDILISD::UMUL, + DL, + INTTY, RHSLO, LHSHI); + // ADD_INT hr, r1, r2 + SDValue ADDHI = DAG.getNode(ISD::ADD, + DL, + INTTY, RHILLO, RLOHHI); + // MULHI_UINT_1 r3, l1, l0 + SDValue RLOLLO = DAG.getNode(ISD::MULHU, + DL, + INTTY, RHSLO, LHSLO); + // ADD_INT hr, hr, r3 + SDValue HIGH = DAG.getNode(ISD::ADD, + DL, + INTTY, ADDHI, RLOLLO); + // MULLO_UINT_1 l3, l1, l0 + SDValue LOW = DAG.getNode(AMDILISD::UMUL, + DL, + INTTY, LHSLO, RHSLO); + DST = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, + DL, + OVT, LOW, HIGH); + } + return DST; +} +SDValue +AMDILTargetLowering::LowerBUILD_VECTOR( SDValue Op, SelectionDAG &DAG ) const +{ + EVT VT = Op.getValueType(); + //printSDValue(Op, 1); + SDValue Nodes1; + SDValue second; + SDValue third; + SDValue fourth; + DebugLoc DL = Op.getDebugLoc(); + Nodes1 = DAG.getNode(AMDILISD::VBUILD, + DL, + VT, Op.getOperand(0)); + bool allEqual = true; + for (unsigned x = 1, y = Op.getNumOperands(); x < y; ++x) { + if (Op.getOperand(0) != Op.getOperand(x)) { + allEqual = false; + break; + } + } + if (allEqual) { + return Nodes1; + } + switch(Op.getNumOperands()) { + default: + case 1: + break; + case 4: + fourth = Op.getOperand(3); + if (fourth.getOpcode() != ISD::UNDEF) { + Nodes1 = DAG.getNode( + ISD::INSERT_VECTOR_ELT, + DL, + Op.getValueType(), + Nodes1, + fourth, + DAG.getConstant(7, MVT::i32)); + } + case 3: + third = Op.getOperand(2); + if (third.getOpcode() != ISD::UNDEF) { + Nodes1 = DAG.getNode( + ISD::INSERT_VECTOR_ELT, + DL, + Op.getValueType(), + Nodes1, + third, + DAG.getConstant(6, MVT::i32)); + } + case 2: + second = Op.getOperand(1); + if (second.getOpcode() != ISD::UNDEF) { + Nodes1 = DAG.getNode( + ISD::INSERT_VECTOR_ELT, + DL, + Op.getValueType(), + Nodes1, + second, + 
DAG.getConstant(5, MVT::i32)); + } + break; + }; + return Nodes1; +} + +SDValue +AMDILTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + const SDValue *ptr = NULL; + const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(2)); + uint32_t swizzleNum = 0; + SDValue DST; + if (!VT.isVector()) { + SDValue Res = Op.getOperand(0); + return Res; + } + + if (Op.getOperand(1).getOpcode() != ISD::UNDEF) { + ptr = &Op.getOperand(1); + } else { + ptr = &Op.getOperand(0); + } + if (CSDN) { + swizzleNum = (uint32_t)CSDN->getZExtValue(); + uint32_t mask2 = 0x04030201 & ~(0xFF << (swizzleNum * 8)); + uint32_t mask3 = 0x01010101 & (0xFF << (swizzleNum * 8)); + DST = DAG.getNode(AMDILISD::VINSERT, + DL, + VT, + Op.getOperand(0), + *ptr, + DAG.getTargetConstant(mask2, MVT::i32), + DAG.getTargetConstant(mask3, MVT::i32)); + } else { + uint32_t mask2 = 0x04030201 & ~(0xFF << (swizzleNum * 8)); + uint32_t mask3 = 0x01010101 & (0xFF << (swizzleNum * 8)); + SDValue res = DAG.getNode(AMDILISD::VINSERT, + DL, VT, Op.getOperand(0), *ptr, + DAG.getTargetConstant(mask2, MVT::i32), + DAG.getTargetConstant(mask3, MVT::i32)); + for (uint32_t x = 1; x < VT.getVectorNumElements(); ++x) { + mask2 = 0x04030201 & ~(0xFF << (x * 8)); + mask3 = 0x01010101 & (0xFF << (x * 8)); + SDValue t = DAG.getNode(AMDILISD::VINSERT, + DL, VT, Op.getOperand(0), *ptr, + DAG.getTargetConstant(mask2, MVT::i32), + DAG.getTargetConstant(mask3, MVT::i32)); + SDValue c = DAG.getNode(AMDILISD::CMP, DL, ptr->getValueType(), + DAG.getConstant(AMDILCC::IL_CC_I_EQ, MVT::i32), + Op.getOperand(2), DAG.getConstant(x, MVT::i32)); + c = DAG.getNode(AMDILISD::VBUILD, DL, Op.getValueType(), c); + res = DAG.getNode(AMDILISD::CMOVLOG, DL, VT, c, t, res); + } + DST = res; + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const +{ + EVT VT = Op.getValueType(); + 
//printSDValue(Op, 1); + const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + uint64_t swizzleNum = 0; + DebugLoc DL = Op.getDebugLoc(); + SDValue Res; + if (!Op.getOperand(0).getValueType().isVector()) { + Res = Op.getOperand(0); + return Res; + } + if (CSDN) { + // Static vector extraction + swizzleNum = CSDN->getZExtValue() + 1; + Res = DAG.getNode(AMDILISD::VEXTRACT, + DL, VT, + Op.getOperand(0), + DAG.getTargetConstant(swizzleNum, MVT::i32)); + } else { + SDValue Op1 = Op.getOperand(1); + uint32_t vecSize = 4; + SDValue Op0 = Op.getOperand(0); + SDValue res = DAG.getNode(AMDILISD::VEXTRACT, + DL, VT, Op0, + DAG.getTargetConstant(1, MVT::i32)); + if (Op0.getValueType().isVector()) { + vecSize = Op0.getValueType().getVectorNumElements(); + } + for (uint32_t x = 2; x <= vecSize; ++x) { + SDValue t = DAG.getNode(AMDILISD::VEXTRACT, + DL, VT, Op0, + DAG.getTargetConstant(x, MVT::i32)); + SDValue c = DAG.getNode(AMDILISD::CMP, + DL, Op1.getValueType(), + DAG.getConstant(AMDILCC::IL_CC_I_EQ, MVT::i32), + Op1, DAG.getConstant(x, MVT::i32)); + res = DAG.getNode(AMDILISD::CMOVLOG, DL, + VT, c, t, res); + + } + Res = res; + } + return Res; +} + +SDValue +AMDILTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const +{ + uint32_t vecSize = Op.getValueType().getVectorNumElements(); + SDValue src = Op.getOperand(0); + const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + uint64_t offset = 0; + EVT vecType = Op.getValueType().getVectorElementType(); + DebugLoc DL = Op.getDebugLoc(); + SDValue Result; + if (CSDN) { + offset = CSDN->getZExtValue(); + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL,vecType, src, DAG.getConstant(offset, MVT::i32)); + Result = DAG.getNode(AMDILISD::VBUILD, DL, + Op.getValueType(), Result); + for (uint32_t x = 1; x < vecSize; ++x) { + SDValue elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, vecType, + src, DAG.getConstant(offset + x, MVT::i32)); + if (elt.getOpcode() != ISD::UNDEF) { 
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, + Op.getValueType(), Result, elt, + DAG.getConstant(x, MVT::i32)); + } + } + } else { + SDValue idx = Op.getOperand(1); + Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, vecType, src, idx); + Result = DAG.getNode(AMDILISD::VBUILD, DL, + Op.getValueType(), Result); + for (uint32_t x = 1; x < vecSize; ++x) { + idx = DAG.getNode(ISD::ADD, DL, vecType, + idx, DAG.getConstant(1, MVT::i32)); + SDValue elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, vecType, + src, idx); + if (elt.getOpcode() != ISD::UNDEF) { + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, + Op.getValueType(), Result, elt, idx); + } + } + } + return Result; +} +SDValue +AMDILTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, + SelectionDAG &DAG) const +{ + SDValue Res = DAG.getNode(AMDILISD::VBUILD, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0)); + return Res; +} +SDValue +AMDILTargetLowering::LowerAND(SDValue Op, SelectionDAG &DAG) const +{ + SDValue andOp; + andOp = DAG.getNode( + AMDILISD::AND, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0), + Op.getOperand(1)); + return andOp; +} +SDValue +AMDILTargetLowering::LowerOR(SDValue Op, SelectionDAG &DAG) const +{ + SDValue orOp; + orOp = DAG.getNode(AMDILISD::OR, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0), + Op.getOperand(1)); + return orOp; +} +SDValue +AMDILTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Cond = Op.getOperand(0); + SDValue LHS = Op.getOperand(1); + SDValue RHS = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + Cond = getConversionNode(DAG, Cond, Op, true); + Cond = DAG.getNode(AMDILISD::CMOVLOG, + DL, + Op.getValueType(), Cond, LHS, RHS); + return Cond; +} +SDValue +AMDILTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Cond; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue TRUE = Op.getOperand(2); + SDValue FALSE = Op.getOperand(3); + SDValue CC = 
Op.getOperand(4); + DebugLoc DL = Op.getDebugLoc(); + bool skipCMov = false; + bool genINot = false; + EVT OVT = Op.getValueType(); + + // Check for possible elimination of cmov + if (TRUE.getValueType().getSimpleVT().SimpleTy == MVT::i32) { + const ConstantSDNode *trueConst + = dyn_cast<ConstantSDNode>( TRUE.getNode() ); + const ConstantSDNode *falseConst + = dyn_cast<ConstantSDNode>( FALSE.getNode() ); + if (trueConst && falseConst) { + // both possible result values are constants + if (trueConst->isAllOnesValue() + && falseConst->isNullValue()) { // and convenient constants + skipCMov = true; + } + else if (trueConst->isNullValue() + && falseConst->isAllOnesValue()) { // less convenient + skipCMov = true; + genINot = true; + } + } + } + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + unsigned int AMDILCC = CondCCodeToCC( + SetCCOpcode, + LHS.getValueType().getSimpleVT().SimpleTy); + assert((AMDILCC != AMDILCC::COND_ERROR) && "Invalid SetCC!"); + Cond = DAG.getNode( + AMDILISD::CMP, + DL, + LHS.getValueType(), + DAG.getConstant(AMDILCC, MVT::i32), + LHS, + RHS); + Cond = getConversionNode(DAG, Cond, Op, true); + if (genINot) { + Cond = DAG.getNode(AMDILISD::NOT, DL, OVT, Cond); + } + if (!skipCMov) { + Cond = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, Cond, TRUE, FALSE); + } + return Cond; +} +SDValue +AMDILTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Cond; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); + unsigned int AMDILCC = CondCCodeToCC( + SetCCOpcode, + LHS.getValueType().getSimpleVT().SimpleTy); + assert((AMDILCC != AMDILCC::COND_ERROR) && "Invalid SetCC!"); + Cond = DAG.getNode( + AMDILISD::CMP, + DL, + LHS.getValueType(), + DAG.getConstant(AMDILCC, MVT::i32), + LHS, + RHS); + Cond = getConversionNode(DAG, Cond, Op, true); + Cond = DAG.getNode( + ISD::AND, + DL, + 
Cond.getValueType(), + DAG.getConstant(1, Cond.getValueType()), + Cond); + return Cond; +} + +SDValue +AMDILTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Data = Op.getOperand(0); + VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1)); + DebugLoc DL = Op.getDebugLoc(); + EVT DVT = Data.getValueType(); + EVT BVT = BaseType->getVT(); + unsigned baseBits = BVT.getScalarType().getSizeInBits(); + unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1; + unsigned shiftBits = srcBits - baseBits; + if (srcBits < 32) { + // If the op is less than 32 bits, then it needs to extend to 32bits + // so it can properly keep the upper bits valid. + EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1); + Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data); + shiftBits = 32 - baseBits; + DVT = IVT; + } + SDValue Shift = DAG.getConstant(shiftBits, DVT); + // Shift left by 'Shift' bits. + Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift); + // Signed shift Right by 'Shift' bits. + Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift); + if (srcBits < 32) { + // Once the sign extension is done, the op needs to be converted to + // its original type. + Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType()); + } + return Data; +} +EVT +AMDILTargetLowering::genIntType(uint32_t size, uint32_t numEle) const +{ + int iSize = (size * numEle); + int vEle = (iSize >> ((size == 64) ? 
6 : 5)); + if (!vEle) { + vEle = 1; + } + if (size == 64) { + if (vEle == 1) { + return EVT(MVT::i64); + } else { + return EVT(MVT::getVectorVT(MVT::i64, vEle)); + } + } else { + if (vEle == 1) { + return EVT(MVT::i32); + } else { + return EVT(MVT::getVectorVT(MVT::i32, vEle)); + } + } +} + +SDValue +AMDILTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Src = Op.getOperand(0); + SDValue Dst = Op; + SDValue Res; + DebugLoc DL = Op.getDebugLoc(); + EVT SrcVT = Src.getValueType(); + EVT DstVT = Dst.getValueType(); + // Lets bitcast the floating point types to an + // equivalent integer type before converting to vectors. + if (SrcVT.getScalarType().isFloatingPoint()) { + Src = DAG.getNode(AMDILISD::BITCONV, DL, genIntType( + SrcVT.getScalarType().getSimpleVT().getSizeInBits(), + SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1), + Src); + SrcVT = Src.getValueType(); + } + uint32_t ScalarSrcSize = SrcVT.getScalarType() + .getSimpleVT().getSizeInBits(); + uint32_t ScalarDstSize = DstVT.getScalarType() + .getSimpleVT().getSizeInBits(); + uint32_t SrcNumEle = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1; + uint32_t DstNumEle = DstVT.isVector() ? DstVT.getVectorNumElements() : 1; + bool isVec = SrcVT.isVector(); + if (DstVT.getScalarType().isInteger() && + (SrcVT.getScalarType().isInteger() + || SrcVT.getScalarType().isFloatingPoint())) { + if ((ScalarDstSize == 64 && SrcNumEle == 4 && ScalarSrcSize == 16) + || (ScalarSrcSize == 64 + && DstNumEle == 4 + && ScalarDstSize == 16)) { + // This is the problematic case when bitcasting i64 <-> <4 x i16> + // This approach is a little different as we cannot generate a + // <4 x i64> vector + // as that is illegal in our backend and we are already past + // the DAG legalizer. + // So, in this case, we will do the following conversion. 
+ // Case 1: + // %dst = <4 x i16> %src bitconvert i64 ==> + // %tmp = <4 x i16> %src convert <4 x i32> + // %tmp = <4 x i32> %tmp and 0xFFFF + // %tmp = <4 x i32> %tmp shift_left <0, 16, 0, 16> + // %tmp = <4 x i32> %tmp or %tmp.xz %tmp.yw + // %dst = <2 x i32> %tmp bitcast i64 + // case 2: + // %dst = i64 %src bitconvert <4 x i16> ==> + // %tmp = i64 %src bitcast <2 x i32> + // %tmp = <4 x i32> %tmp vinsert %tmp.xxyy + // %tmp = <4 x i32> %tmp shift_right <0, 16, 0, 16> + // %tmp = <4 x i32> %tmp and 0xFFFF + // %dst = <4 x i16> %tmp bitcast <4 x i32> + SDValue mask = DAG.getNode(AMDILISD::VBUILD, DL, MVT::v4i32, + DAG.getConstant(0xFFFF, MVT::i32)); + SDValue const16 = DAG.getConstant(16, MVT::i32); + if (ScalarDstSize == 64) { + // case 1 + Op = DAG.getSExtOrTrunc(Src, DL, MVT::v4i32); + Op = DAG.getNode(ISD::AND, DL, Op.getValueType(), Op, mask); + SDValue x = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Op, DAG.getConstant(0, MVT::i32)); + SDValue y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Op, DAG.getConstant(1, MVT::i32)); + y = DAG.getNode(ISD::SHL, DL, MVT::i32, y, const16); + SDValue z = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Op, DAG.getConstant(2, MVT::i32)); + SDValue w = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + Op, DAG.getConstant(3, MVT::i32)); + w = DAG.getNode(ISD::SHL, DL, MVT::i32, w, const16); + x = DAG.getNode(ISD::OR, DL, MVT::i32, x, y); + y = DAG.getNode(ISD::OR, DL, MVT::i32, z, w); + Res = DAG.getNode((isVec) ? AMDILISD::LCREATE2 : AMDILISD::LCREATE, DL, MVT::i64, x, y); + return Res; + } else { + // case 2 + SDValue lo = DAG.getNode((isVec) ? AMDILISD::LCOMPLO2 : AMDILISD::LCOMPLO, DL, MVT::i32, Src); + SDValue lor16 + = DAG.getNode(ISD::SRL, DL, MVT::i32, lo, const16); + SDValue hi = DAG.getNode((isVec) ? 
AMDILISD::LCOMPHI2 : AMDILISD::LCOMPHI, DL, MVT::i32, Src); + SDValue hir16 + = DAG.getNode(ISD::SRL, DL, MVT::i32, hi, const16); + SDValue resVec = DAG.getNode(AMDILISD::VBUILD, DL, + MVT::v4i32, lo); + SDValue idxVal = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(1, MVT::i32)); + resVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + resVec, lor16, idxVal); + idxVal = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(2, MVT::i32)); + resVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + resVec, hi, idxVal); + idxVal = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(3, MVT::i32)); + resVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, + resVec, hir16, idxVal); + resVec = DAG.getNode(ISD::AND, DL, MVT::v4i32, resVec, mask); + Res = DAG.getSExtOrTrunc(resVec, DL, MVT::v4i16); + return Res; + } + } else { + // There are four cases we need to worry about for bitcasts + // where the size of all + // source, intermediates and result is <= 128 bits, unlike + // the above case + // 1) Sub32bit bitcast 32bitAlign + // %dst = <4 x i8> bitcast i32 + // (also <[2|4] x i16> to <[2|4] x i32>) + // 2) 32bitAlign bitcast Sub32bit + // %dst = i32 bitcast <4 x i8> + // 3) Sub32bit bitcast LargerSub32bit + // %dst = <2 x i8> bitcast i16 + // (also <4 x i8> to <2 x i16>) + // 4) Sub32bit bitcast SmallerSub32bit + // %dst = i16 bitcast <2 x i8> + // (also <2 x i16> to <4 x i8>) + // This also only handles types that are powers of two + if ((ScalarDstSize & (ScalarDstSize - 1)) + || (ScalarSrcSize & (ScalarSrcSize - 1))) { + } else if (ScalarDstSize >= 32 && ScalarSrcSize < 32) { + // case 1: + EVT IntTy = genIntType(ScalarDstSize, SrcNumEle); +#if 0 // TODO: LLVM does not like this for some reason, cannot SignExt vectors + SDValue res = DAG.getSExtOrTrunc(Src, DL, IntTy); +#else + SDValue res = DAG.getNode(AMDILISD::VBUILD, DL, IntTy, + DAG.getUNDEF(IntTy.getScalarType())); + for (uint32_t x = 0; x < 
SrcNumEle; ++x) { + SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(x, MVT::i32)); + SDValue temp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + SrcVT.getScalarType(), Src, + DAG.getConstant(x, MVT::i32)); + temp = DAG.getSExtOrTrunc(temp, DL, IntTy.getScalarType()); + res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntTy, + res, temp, idx); + } +#endif + SDValue mask = DAG.getNode(AMDILISD::VBUILD, DL, IntTy, + DAG.getConstant((1 << ScalarSrcSize) - 1, MVT::i32)); + SDValue *newEle = new SDValue[SrcNumEle]; + res = DAG.getNode(ISD::AND, DL, IntTy, res, mask); + for (uint32_t x = 0; x < SrcNumEle; ++x) { + newEle[x] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + IntTy.getScalarType(), res, + DAG.getConstant(x, MVT::i32)); + } + uint32_t Ratio = SrcNumEle / DstNumEle; + for (uint32_t x = 0; x < SrcNumEle; ++x) { + if (x % Ratio) { + newEle[x] = DAG.getNode(ISD::SHL, DL, + IntTy.getScalarType(), newEle[x], + DAG.getConstant(ScalarSrcSize * (x % Ratio), + MVT::i32)); + } + } + for (uint32_t x = 0; x < SrcNumEle; x += 2) { + newEle[x] = DAG.getNode(ISD::OR, DL, + IntTy.getScalarType(), newEle[x], newEle[x + 1]); + } + if (ScalarSrcSize == 8) { + for (uint32_t x = 0; x < SrcNumEle; x += 4) { + newEle[x] = DAG.getNode(ISD::OR, DL, + IntTy.getScalarType(), newEle[x], newEle[x + 2]); + } + if (DstNumEle == 1) { + Dst = newEle[0]; + } else { + Dst = DAG.getNode(AMDILISD::VBUILD, DL, DstVT, + newEle[0]); + for (uint32_t x = 1; x < DstNumEle; ++x) { + SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(x, MVT::i32)); + Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, + DstVT, Dst, newEle[x * 4], idx); + } + } + } else { + if (DstNumEle == 1) { + Dst = newEle[0]; + } else { + Dst = DAG.getNode(AMDILISD::VBUILD, DL, DstVT, + newEle[0]); + for (uint32_t x = 1; x < DstNumEle; ++x) { + SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(x, MVT::i32)); + Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, 
DL, + DstVT, Dst, newEle[x * 2], idx); + } + } + } + delete [] newEle; + return Dst; + } else if (ScalarDstSize < 32 && ScalarSrcSize >= 32) { + // case 2: + EVT IntTy = genIntType(ScalarSrcSize, DstNumEle); + SDValue vec = DAG.getNode(AMDILISD::VBUILD, DL, IntTy, + DAG.getUNDEF(IntTy.getScalarType())); + uint32_t mult = (ScalarDstSize == 8) ? 4 : 2; + for (uint32_t x = 0; x < SrcNumEle; ++x) { + for (uint32_t y = 0; y < mult; ++y) { + SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), + DAG.getConstant(x * mult + y, MVT::i32)); + SDValue t; + if (SrcNumEle > 1) { + t = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, + DL, SrcVT.getScalarType(), Src, + DAG.getConstant(x, MVT::i32)); + } else { + t = Src; + } + if (y != 0) { + t = DAG.getNode(ISD::SRL, DL, t.getValueType(), + t, DAG.getConstant(y * ScalarDstSize, + MVT::i32)); + } + vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, + DL, IntTy, vec, t, idx); + } + } + Dst = DAG.getSExtOrTrunc(vec, DL, DstVT); + return Dst; + } else if (ScalarDstSize == 16 && ScalarSrcSize == 8) { + // case 3: + SDValue *numEle = new SDValue[SrcNumEle]; + for (uint32_t x = 0; x < SrcNumEle; ++x) { + numEle[x] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + MVT::i8, Src, DAG.getConstant(x, MVT::i32)); + numEle[x] = DAG.getSExtOrTrunc(numEle[x], DL, MVT::i16); + numEle[x] = DAG.getNode(ISD::AND, DL, MVT::i16, numEle[x], + DAG.getConstant(0xFF, MVT::i16)); + } + for (uint32_t x = 1; x < SrcNumEle; x += 2) { + numEle[x] = DAG.getNode(ISD::SHL, DL, MVT::i16, numEle[x], + DAG.getConstant(8, MVT::i16)); + numEle[x - 1] = DAG.getNode(ISD::OR, DL, MVT::i16, + numEle[x-1], numEle[x]); + } + if (DstNumEle > 1) { + // If we are not a scalar i16, the only other case is a + // v2i16 since we can't have v8i8 at this point, v4i16 + // cannot be generated + Dst = DAG.getNode(AMDILISD::VBUILD, DL, MVT::v2i16, + numEle[0]); + SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(1, MVT::i32)); + Dst = 
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i16, + Dst, numEle[2], idx); + } else { + Dst = numEle[0]; + } + delete [] numEle; + return Dst; + } else if (ScalarDstSize == 8 && ScalarSrcSize == 16) { + // case 4: + SDValue *numEle = new SDValue[DstNumEle]; + for (uint32_t x = 0; x < SrcNumEle; ++x) { + numEle[x * 2] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + MVT::i16, Src, DAG.getConstant(x, MVT::i32)); + numEle[x * 2 + 1] = DAG.getNode(ISD::SRL, DL, MVT::i16, + numEle[x * 2], DAG.getConstant(8, MVT::i16)); + } + MVT ty = (SrcNumEle == 1) ? MVT::v2i16 : MVT::v4i16; + Dst = DAG.getNode(AMDILISD::VBUILD, DL, ty, numEle[0]); + for (uint32_t x = 1; x < DstNumEle; ++x) { + SDValue idx = DAG.getNode(ISD::ZERO_EXTEND, DL, + getPointerTy(), DAG.getConstant(x, MVT::i32)); + Dst = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ty, + Dst, numEle[x], idx); + } + delete [] numEle; + ty = (SrcNumEle == 1) ? MVT::v2i8 : MVT::v4i8; + Res = DAG.getSExtOrTrunc(Dst, DL, ty); + return Res; + } + } + } + Res = DAG.getNode(AMDILISD::BITCONV, + Dst.getDebugLoc(), + Dst.getValueType(), Src); + return Res; +} + +SDValue +AMDILTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const +{ + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + unsigned int SPReg = AMDIL::SP; + DebugLoc DL = Op.getDebugLoc(); + SDValue SP = DAG.getCopyFromReg(Chain, + DL, + SPReg, MVT::i32); + SDValue NewSP = DAG.getNode(ISD::ADD, + DL, + MVT::i32, SP, Size); + Chain = DAG.getCopyToReg(SP.getValue(1), + DL, + SPReg, NewSP); + SDValue Ops[2] = {NewSP, Chain}; + Chain = DAG.getMergeValues(Ops, 2 ,DL); + return Chain; +} +SDValue +AMDILTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + SDValue Result; + Result = DAG.getNode( + AMDILISD::BRANCH_COND, + Op.getDebugLoc(), + Op.getValueType(), + Chain, Jump, Cond); + return Result; +} + +SDValue 
+AMDILTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Chain = Op.getOperand(0); + CondCodeSDNode *CCNode = cast<CondCodeSDNode>(Op.getOperand(1)); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue JumpT = Op.getOperand(4); + SDValue CmpValue; + ISD::CondCode CC = CCNode->get(); + SDValue Result; + unsigned int cmpOpcode = CondCCodeToCC( + CC, + LHS.getValueType().getSimpleVT().SimpleTy); + CmpValue = DAG.getNode( + AMDILISD::CMP, + Op.getDebugLoc(), + LHS.getValueType(), + DAG.getConstant(cmpOpcode, MVT::i32), + LHS, RHS); + Result = DAG.getNode( + AMDILISD::BRANCH_COND, + CmpValue.getDebugLoc(), + MVT::Other, Chain, + JumpT, CmpValue); + return Result; +} + +SDValue +AMDILTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Result = DAG.getNode( + AMDILISD::DP_TO_FP, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0), + Op.getOperand(1)); + return Result; +} + +SDValue +AMDILTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const +{ + SDValue Result = DAG.getNode( + AMDILISD::VCONCAT, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0), + Op.getOperand(1)); + return Result; +} +// LowerRET - Lower an ISD::RET node. 
+SDValue +AMDILTargetLowering::LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) +const +{ + //MachineFunction& MF = DAG.getMachineFunction(); + // CCValAssign - represent the assignment of the return value + // to a location + SmallVector<CCValAssign, 16> RVLocs; + + // CCState - Info about the registers and stack slot + CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), + getTargetMachine(), RVLocs, *DAG.getContext()); + + // Analyze return values of ISD::RET + CCInfo.AnalyzeReturn(Outs, RetCC_AMDIL32); + // If this is the first return lowered for this function, add + // the regs to the liveout set for the function + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + for (unsigned int i = 0, e = RVLocs.size(); i != e; ++i) { + if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) { + MRI.addLiveOut(RVLocs[i].getLocReg()); + } + } + // FIXME: implement this when tail call is implemented + // Chain = GetPossiblePreceedingTailCall(Chain, AMDILISD::TAILCALL); + // both x86 and ppc implement this in ISelLowering + + // Regular return here + SDValue Flag; + SmallVector<SDValue, 6> RetOps; + RetOps.push_back(Chain); + RetOps.push_back(DAG.getConstant(0/*getBytesToPopOnReturn()*/, MVT::i32)); + for (unsigned int i = 0, e = RVLocs.size(); i != e; ++i) { + CCValAssign &VA = RVLocs[i]; + SDValue ValToCopy = OutVals[i]; + assert(VA.isRegLoc() && "Can only return in registers!"); + // ISD::Ret => ret chain, (regnum1, val1), ... 
+ // So i * 2 + 1 index only the regnums + Chain = DAG.getCopyToReg(Chain, + dl, + VA.getLocReg(), + ValToCopy, + Flag); + // guarantee that all emitted copies are stuck together + // avoiding something bad + Flag = Chain.getValue(1); + } + /*if (MF.getFunction()->hasStructRetAttr()) { + assert(0 && "Struct returns are not yet implemented!"); + // Both MIPS and X86 have this + }*/ + RetOps[0] = Chain; + if (Flag.getNode()) + RetOps.push_back(Flag); + + Flag = DAG.getNode(AMDILISD::RET_FLAG, + dl, + MVT::Other, &RetOps[0], RetOps.size()); + return Flag; +} +void +AMDILTargetLowering::generateLongRelational(MachineInstr *MI, + unsigned int opCode) const +{ + MachineOperand DST = MI->getOperand(0); + MachineOperand LHS = MI->getOperand(2); + MachineOperand RHS = MI->getOperand(3); + unsigned int opi32Code = 0, si32Code = 0; + unsigned int simpleVT = MI->getDesc().OpInfo[0].RegClass; + uint32_t REGS[12]; + // All the relationals can be generated with with 6 temp registers + for (int x = 0; x < 12; ++x) { + REGS[x] = genVReg(simpleVT); + } + // Pull out the high and low components of each 64 bit register + generateMachineInst(AMDIL::LHI, REGS[0], LHS.getReg()); + generateMachineInst(AMDIL::LLO, REGS[1], LHS.getReg()); + generateMachineInst(AMDIL::LHI, REGS[2], RHS.getReg()); + generateMachineInst(AMDIL::LLO, REGS[3], RHS.getReg()); + // Determine the correct opcode that we should use + switch(opCode) { + default: + assert(!"comparison case not handled!"); + break; + case AMDIL::LEQ: + si32Code = opi32Code = AMDIL::IEQ; + break; + case AMDIL::LNE: + si32Code = opi32Code = AMDIL::INE; + break; + case AMDIL::LLE: + case AMDIL::ULLE: + case AMDIL::LGE: + case AMDIL::ULGE: + if (opCode == AMDIL::LGE || opCode == AMDIL::ULGE) { + std::swap(REGS[0], REGS[2]); + } else { + std::swap(REGS[1], REGS[3]); + } + if (opCode == AMDIL::LLE || opCode == AMDIL::LGE) { + opi32Code = AMDIL::ILT; + } else { + opi32Code = AMDIL::ULT; + } + si32Code = AMDIL::UGE; + break; + case AMDIL::LGT: + 
case AMDIL::ULGT: + std::swap(REGS[0], REGS[2]); + std::swap(REGS[1], REGS[3]); + case AMDIL::LLT: + case AMDIL::ULLT: + if (opCode == AMDIL::LGT || opCode == AMDIL::LLT) { + opi32Code = AMDIL::ILT; + } else { + opi32Code = AMDIL::ULT; + } + si32Code = AMDIL::ULT; + break; + }; + // Do the initial opcode on the high and low components. + // This leaves the following: + // REGS[4] = L_HI OP R_HI + // REGS[5] = L_LO OP R_LO + generateMachineInst(opi32Code, REGS[4], REGS[0], REGS[2]); + generateMachineInst(si32Code, REGS[5], REGS[1], REGS[3]); + switch(opi32Code) { + case AMDIL::IEQ: + case AMDIL::INE: + { + // combine the results with an and or or depending on if + // we are eq or ne + uint32_t combineOp = (opi32Code == AMDIL::IEQ) + ? AMDIL::BINARY_AND_i32 : AMDIL::BINARY_OR_i32; + generateMachineInst(combineOp, REGS[11], REGS[4], REGS[5]); + } + break; + default: + // this finishes codegen for the following pattern + // REGS[4] || (REGS[5] && (L_HI == R_HI)) + generateMachineInst(AMDIL::IEQ, REGS[9], REGS[0], REGS[2]); + generateMachineInst(AMDIL::BINARY_AND_i32, REGS[10], REGS[5], + REGS[9]); + generateMachineInst(AMDIL::BINARY_OR_i32, REGS[11], REGS[4], + REGS[10]); + break; + } + generateMachineInst(AMDIL::LCREATE, DST.getReg(), REGS[11], REGS[11]); +} + +unsigned int +AMDILTargetLowering::getFunctionAlignment(const Function *) const +{ + return 0; +} + +void +AMDILTargetLowering::setPrivateData(MachineBasicBlock *BB, + MachineBasicBlock::iterator &BBI, + DebugLoc *DL, const TargetInstrInfo *TII) const +{ + mBB = BB; + mBBI = BBI; + mDL = DL; + mTII = TII; +} +uint32_t +AMDILTargetLowering::genVReg(uint32_t regType) const +{ + return mBB->getParent()->getRegInfo().createVirtualRegister( + getRegClassFromID(regType)); +} + +MachineInstrBuilder +AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst) const +{ + return BuildMI(*mBB, mBBI, *mDL, mTII->get(opcode), dst); +} + +MachineInstrBuilder +AMDILTargetLowering::generateMachineInst(uint32_t 
opcode, uint32_t dst, + uint32_t src1) const +{ + return generateMachineInst(opcode, dst).addReg(src1); +} + +MachineInstrBuilder +AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst, + uint32_t src1, uint32_t src2) const +{ + return generateMachineInst(opcode, dst, src1).addReg(src2); +} + +MachineInstrBuilder +AMDILTargetLowering::generateMachineInst(uint32_t opcode, uint32_t dst, + uint32_t src1, uint32_t src2, uint32_t src3) const +{ + return generateMachineInst(opcode, dst, src1, src2).addReg(src3); +} + + +SDValue +AMDILTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } /* NOTE(review): other vector widths leave INTTY/FLTTY default-constructed */ + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib = (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(AMDILISD::DIV_INF, DL, FLTTY, fa, fb); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg =
DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(AMDILISD::MAD, DL, FLTTY, fqneg, fb, fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue +AMDILTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL. 
+ // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r0, r0, r1 + // ixor r10, r10, r11 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getNode(AMDILISD::CMP, DL, OVT, + DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32), + r0, DAG.getConstant(0, OVT)); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getNode(AMDILISD::CMP, DL, OVT, + DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32), + r1, DAG.getConstant(0, OVT)); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r0, r0, r1 + r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // ixor r10, r10, r11 + r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue +AMDILTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const +{ + return SDValue(Op.getNode(), 0); +} + +SDValue +AMDILTargetLowering::LowerUDIV24(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + + // The LowerUDIV24 function implements the following CL. 
+ // int ia = (int)LHS + // float fa = (float)ia + // int ib = (int)RHS + // float fb = (float)ib + // float fq = native_divide(fa, fb) + // fq = trunc(fq) + // float t = mad(fq, fb, fb) + // int iq = (int)fq - (t <= fa) + // return (type)iq + + // int ia = (int)LHS + SDValue ia = DAG.getZExtOrTrunc(LHS, DL, INTTY); + + // float fa = (float)ia + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // int ib = (int)RHS + SDValue ib = DAG.getZExtOrTrunc(RHS, DL, INTTY); + + // float fb = (float)ib + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb) + SDValue fq = DAG.getNode(AMDILISD::DIV_INF, DL, FLTTY, fa, fb); + + // fq = trunc(fq) + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float t = mad(fq, fb, fb) + SDValue t = DAG.getNode(AMDILISD::MAD, DL, FLTTY, fq, fb, fb); + + // int iq = (int)fq - (t <= fa) // This is sub and not add because GPU returns 0, -1 + // NOTE(review): the node built below is ISD::ADD, not a subtract as the comment above says; confirm which is intended for the boolean encoding getSetCC produces here + SDValue iq; + fq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + if (INTTY == MVT::i32) { + iq = DAG.getSetCC(DL, INTTY, t, fa, ISD::SETOLE); + } else { + iq = DAG.getSetCC(DL, INTTY, t, fa, ISD::SETOLE); + } + iq = DAG.getNode(ISD::ADD, DL, INTTY, fq, iq); + + + // return (type)iq + iq = DAG.getZExtOrTrunc(iq, DL, OVT); + return iq; + +} + +SDValue +AMDILTargetLowering::LowerUDIV32(SDValue Op, SelectionDAG &DAG) const +{ + return SDValue(Op.getNode(), 0); +} + +SDValue +AMDILTargetLowering::LowerUDIV64(SDValue Op, SelectionDAG &DAG) const +{ + return SDValue(Op.getNode(), 0); +} +SDValue +AMDILTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i8) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i8) { + INTTY = MVT::v4i32; + } + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); + LHS =
DAG.getSExtOrTrunc(LHS, DL, OVT); + return LHS; +} + +SDValue +AMDILTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i16) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i16) { + INTTY = MVT::v4i32; + } + SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY); + SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY); + LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS); + LHS = DAG.getSExtOrTrunc(LHS, DL, OVT); + return LHS; +} + +// Lower a 32 bit signed remainder: strip the operand signs, compute r0 - (r0 / r1) * r1 unsigned, then re-apply the LHS sign mask (r10). +SDValue +AMDILTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSREM32 function generates equivalent to the following IL. + // mov r0, LHS + // mov r1, RHS + // ilt r10, r0, 0 + // ilt r11, r1, 0 + // iadd r0, r0, r10 + // iadd r1, r1, r11 + // ixor r0, r0, r10 + // ixor r1, r1, r11 + // udiv r20, r0, r1 + // umul r20, r20, r1 + // sub r0, r0, r20 + // iadd r0, r0, r10 + // ixor DST, r0, r10 + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // ilt r10, r0, 0 + SDValue r10 = DAG.getNode(AMDILISD::CMP, DL, OVT, + DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32), + r0, DAG.getConstant(0, OVT)); + + // ilt r11, r1, 0 + SDValue r11 = DAG.getNode(AMDILISD::CMP, DL, OVT, + DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::i32), MVT::i32), + r1, DAG.getConstant(0, OVT)); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // iadd r1, r1, r11 + r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11); + + // ixor r0, r0, r10 + r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + + // ixor r1, r1, r11 + r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11); + + // udiv r20, r0, r1 (must be the quotient: the umul/sub that follow rebuild the remainder from it) + SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1); + + // umul r20, r20, r1 + r20 = DAG.getNode(AMDILISD::UMUL, DL, OVT,
r20, r1); + + // sub r0, r0, r20 + r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20); + + // iadd r0, r0, r10 + r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10); + + // ixor DST, r0, r10 + SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10); + return DST; +} + +SDValue +AMDILTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const +{ + return SDValue(Op.getNode(), 0); +} + +// Lower an 8 bit unsigned remainder in 32 bit arithmetic as r10 - (r10 / r3) * r11, following the IL listing in the body. +SDValue +AMDILTargetLowering::LowerUREM8(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i8) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i8) { + INTTY = MVT::v4i32; + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerUREM8 function generates equivalent to the following IL. + // mov r0, as_u32(LHS) + // mov r1, as_u32(RHS) + // and r10, r0, 0xFF + // and r11, r1, 0xFF + // cmov_logical r3, r11, r11, 0x1 + // udiv r3, r10, r3 + // cmov_logical r3, r11, r3, 0 + // umul r3, r3, r11 + // sub r3, r10, r3 + // and as_u8(DST), r3, 0xFF + + // mov r0, as_u32(LHS) + SDValue r0 = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // mov r1, as_u32(RHS) + SDValue r1 = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // and r10, r0, 0xFF + SDValue r10 = DAG.getNode(ISD::AND, DL, INTTY, r0, + DAG.getConstant(0xFF, INTTY)); + + // and r11, r1, 0xFF + SDValue r11 = DAG.getNode(ISD::AND, DL, INTTY, r1, + DAG.getConstant(0xFF, INTTY)); + + // cmov_logical r3, r11, r11, 0x1 + SDValue r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, r11, r11, + DAG.getConstant(0x01, INTTY)); + + // udiv r3, r10, r3 (must be the quotient: the umul/sub that follow rebuild the remainder from it) + r3 = DAG.getNode(ISD::UDIV, DL, INTTY, r10, r3); + + // cmov_logical r3, r11, r3, 0 + r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, r11, r3, + DAG.getConstant(0, INTTY)); + + // umul r3, r3, r11 + r3 = DAG.getNode(AMDILISD::UMUL, DL, INTTY, r3, r11); + + // sub r3, r10, r3 + r3 = DAG.getNode(ISD::SUB, DL, INTTY, r10, r3); + + // and as_u8(DST), r3, 0xFF + SDValue DST = DAG.getNode(ISD::AND, DL,
INTTY, r3, + DAG.getConstant(0xFF, INTTY)); + DST = DAG.getZExtOrTrunc(DST, DL, OVT); + return DST; +} + +// Lower a 16 bit unsigned remainder as r10 - (r10 / r3) * r11; only the divide is widened to 32 bits, following the IL listing in the body. +SDValue +AMDILTargetLowering::LowerUREM16(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2i16) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4i16) { + INTTY = MVT::v4i32; + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerUREM16 function generates equivalent to the following IL. + // mov r0, LHS + // mov r1, RHS + // DIV = LowerUDIV16(LHS, RHS) + // and r10, r0, 0xFFFF + // and r11, r1, 0xFFFF + // cmov_logical r3, r11, r11, 0x1 + // udiv as_u16(r3), as_u32(r10), as_u32(r3) + // and r3, r3, 0xFFFF + // cmov_logical r3, r11, r3, 0 + // umul r3, r3, r11 + // sub r3, r10, r3 + // and DST, r3, 0xFFFF + + // mov r0, LHS + SDValue r0 = LHS; + + // mov r1, RHS + SDValue r1 = RHS; + + // and r10, r0, 0xFFFF + SDValue r10 = DAG.getNode(ISD::AND, DL, OVT, r0, + DAG.getConstant(0xFFFF, OVT)); + + // and r11, r1, 0xFFFF + SDValue r11 = DAG.getNode(ISD::AND, DL, OVT, r1, + DAG.getConstant(0xFFFF, OVT)); + + // cmov_logical r3, r11, r11, 0x1 + SDValue r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, r11, r11, + DAG.getConstant(0x01, OVT)); + + // udiv as_u16(r3), as_u32(r10), as_u32(r3) (must be the quotient: the umul/sub that follow rebuild the remainder from it) + r10 = DAG.getZExtOrTrunc(r10, DL, INTTY); + r3 = DAG.getZExtOrTrunc(r3, DL, INTTY); + r3 = DAG.getNode(ISD::UDIV, DL, INTTY, r10, r3); + r3 = DAG.getZExtOrTrunc(r3, DL, OVT); + r10 = DAG.getZExtOrTrunc(r10, DL, OVT); + + // and r3, r3, 0xFFFF + r3 = DAG.getNode(ISD::AND, DL, OVT, r3, + DAG.getConstant(0xFFFF, OVT)); + + // cmov_logical r3, r11, r3, 0 + r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, r11, r3, + DAG.getConstant(0, OVT)); + // umul r3, r3, r11 + r3 = DAG.getNode(AMDILISD::UMUL, DL, OVT, r3, r11); + + // sub r3, r10, r3 + r3 = DAG.getNode(ISD::SUB, DL, OVT, r10, r3); + + // and DST, r3, 0xFFFF + SDValue DST = DAG.getNode(ISD::AND, DL, OVT, r3, +
DAG.getConstant(0xFFFF, OVT)); + return DST; +} + +SDValue +AMDILTargetLowering::LowerUREM32(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerUREM32 function generates equivalent to the following IL. + // udiv r20, LHS, RHS + // umul r20, r20, RHS + // sub DST, LHS, r20 + + // udiv r20, LHS, RHS + SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, LHS, RHS); + + // umul r20, r20, RHS + r20 = DAG.getNode(AMDILISD::UMUL, DL, OVT, r20, RHS); + + // sub DST, LHS, r20 + SDValue DST = DAG.getNode(ISD::SUB, DL, OVT, LHS, r20); + return DST; +} + +SDValue +AMDILTargetLowering::LowerUREM64(SDValue Op, SelectionDAG &DAG) const +{ + return SDValue(Op.getNode(), 0); +} + + +SDValue +AMDILTargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const +{ + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + MVT INTTY = MVT::i32; + if (OVT == MVT::v2f32) { + INTTY = MVT::v2i32; + } else if (OVT == MVT::v4f32) { + INTTY = MVT::v4i32; + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue DST; + const AMDILSubtarget *stm = reinterpret_cast<const AMDILTargetMachine*>( + &this->getTargetMachine())->getSubtargetImpl(); + if (stm->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // TODO: This doesn't work for vector types yet + // The LowerFDIV32 function generates equivalent to the following + // IL: + // mov r20, as_int(LHS) + // mov r21, as_int(RHS) + // and r30, r20, 0x7f800000 + // and r31, r21, 0x7f800000 + // and r32, r20, 0x807FFFFF + // and r33, r21, 0x807FFFFF + // ieq r40, r30, 0x7F800000 + // ieq r41, r31, 0x7F800000 + // ieq r42, r30, 0 + // ieq r43, r31, 0 + // and r50, r20, 0x80000000 + // and r51, r21, 0x80000000 + // ior r32, r32, 0x3f800000 + // ior r33, r33, 0x3f800000 + // cmov_logical r32, r42, r50, r32 + // cmov_logical r33, r43, r51, r33 + // cmov_logical r32, r40, r20, r32 + //
cmov_logical r33, r41, r21, r33 + // ior r50, r40, r41 + // ior r51, r42, r43 + // ior r50, r50, r51 + // inegate r52, r31 + // iadd r30, r30, r52 + // cmov_logical r30, r50, 0, r30 + // div_zeroop(infinity) r21, 1.0, r33 + // mul_ieee r20, r32, r21 + // and r22, r20, 0x7FFFFFFF + // and r23, r20, 0x80000000 + // ishr r60, r22, 0x00000017 + // ishr r61, r30, 0x00000017 + // iadd r20, r20, r30 + // iadd r21, r22, r30 + // iadd r60, r60, r61 + // ige r42, 0, R60 + // ior r41, r23, 0x7F800000 + // ige r40, r60, 0x000000FF + // cmov_logical r40, r50, 0, r40 + // cmov_logical r20, r42, r23, r20 + // cmov_logical DST, r40, r41, r20 + // as_float(DST) + + // mov r20, as_int(LHS) + SDValue R20 = DAG.getNode(ISDBITCAST, DL, INTTY, LHS); + + // mov r21, as_int(RHS) + SDValue R21 = DAG.getNode(ISDBITCAST, DL, INTTY, RHS); + + // and r30, r20, 0x7f800000 + SDValue R30 = DAG.getNode(ISD::AND, DL, INTTY, R20, + DAG.getConstant(0x7F800000, INTTY)); + + // and r31, r21, 0x7f800000 + SDValue R31 = DAG.getNode(ISD::AND, DL, INTTY, R21, + DAG.getConstant(0x7f800000, INTTY)); + + // and r32, r20, 0x807FFFFF + SDValue R32 = DAG.getNode(ISD::AND, DL, INTTY, R20, + DAG.getConstant(0x807FFFFF, INTTY)); + + // and r33, r21, 0x807FFFFF + SDValue R33 = DAG.getNode(ISD::AND, DL, INTTY, R21, + DAG.getConstant(0x807FFFFF, INTTY)); + + // ieq r40, r30, 0x7F800000 + SDValue R40 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + R30, DAG.getConstant(0x7F800000, INTTY)); + + // ieq r41, r31, 0x7F800000 + SDValue R41 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + R31, DAG.getConstant(0x7F800000, INTTY)); + + // ieq r42, r30, 0 + SDValue R42 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + R30, DAG.getConstant(0, INTTY)); + + // ieq r43, r31, 0 + SDValue R43 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + 
DAG.getConstant(CondCCodeToCC(ISD::SETEQ, MVT::i32), MVT::i32), + R31, DAG.getConstant(0, INTTY)); + + // and r50, r20, 0x80000000 + SDValue R50 = DAG.getNode(ISD::AND, DL, INTTY, R20, + DAG.getConstant(0x80000000, INTTY)); + + // and r51, r21, 0x80000000 + SDValue R51 = DAG.getNode(ISD::AND, DL, INTTY, R21, + DAG.getConstant(0x80000000, INTTY)); + + // ior r32, r32, 0x3f800000 + R32 = DAG.getNode(ISD::OR, DL, INTTY, R32, + DAG.getConstant(0x3F800000, INTTY)); + + // ior r33, r33, 0x3f800000 + R33 = DAG.getNode(ISD::OR, DL, INTTY, R33, + DAG.getConstant(0x3F800000, INTTY)); + + // cmov_logical r32, r42, r50, r32 + R32 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R42, R50, R32); + + // cmov_logical r33, r43, r51, r33 + R33 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R43, R51, R33); + + // cmov_logical r32, r40, r20, r32 + R32 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R40, R20, R32); + + // cmov_logical r33, r41, r21, r33 + R33 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R41, R21, R33); + + // ior r50, r40, r41 + R50 = DAG.getNode(ISD::OR, DL, INTTY, R40, R41); + + // ior r51, r42, r43 + R51 = DAG.getNode(ISD::OR, DL, INTTY, R42, R43); + + // ior r50, r50, r51 + R50 = DAG.getNode(ISD::OR, DL, INTTY, R50, R51); + + // inegate r52, r31 + SDValue R52 = DAG.getNode(AMDILISD::INEGATE, DL, INTTY, R31); + + // iadd r30, r30, r52 + R30 = DAG.getNode(ISD::ADD, DL, INTTY, R30, R52); + + // cmov_logical r30, r50, 0, r30 + R30 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R50, + DAG.getConstant(0, INTTY), R30); + + // div_zeroop(infinity) r21, 1.0, as_float(r33) + R33 = DAG.getNode(ISDBITCAST, DL, OVT, R33); + R21 = DAG.getNode(AMDILISD::DIV_INF, DL, OVT, + DAG.getConstantFP(1.0f, OVT), R33); + + // mul_ieee as_int(r20), as_float(r32), r21 + R32 = DAG.getNode(ISDBITCAST, DL, OVT, R32); + R20 = DAG.getNode(ISD::FMUL, DL, OVT, R32, R21); + R20 = DAG.getNode(ISDBITCAST, DL, INTTY, R20); +
+ // and r22, r20, 0x7FFFFFFF + SDValue R22 = DAG.getNode(ISD::AND, DL, INTTY, R20, + DAG.getConstant(0x7FFFFFFF, INTTY)); + + // and r23, r20, 0x80000000 + SDValue R23 = DAG.getNode(ISD::AND, DL, INTTY, R20, + DAG.getConstant(0x80000000, INTTY)); + + // ishr r60, r22, 0x00000017 + SDValue R60 = DAG.getNode(ISD::SRA, DL, INTTY, R22, + DAG.getConstant(0x00000017, INTTY)); + + // ishr r61, r30, 0x00000017 + SDValue R61 = DAG.getNode(ISD::SRA, DL, INTTY, R30, + DAG.getConstant(0x00000017, INTTY)); + + // iadd r20, r20, r30 + R20 = DAG.getNode(ISD::ADD, DL, INTTY, R20, R30); + + // iadd r21, r22, r30 + R21 = DAG.getNode(ISD::ADD, DL, INTTY, R22, R30); + + // iadd r60, r60, r61 + R60 = DAG.getNode(ISD::ADD, DL, INTTY, R60, R61); + + // ige r42, 0, R60 + R42 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32), + DAG.getConstant(0, INTTY), + R60); + + // ior r41, r23, 0x7F800000 + R41 = DAG.getNode(ISD::OR, DL, INTTY, R23, + DAG.getConstant(0x7F800000, INTTY)); + + // ige r40, r60, 0x000000FF + R40 = DAG.getNode(AMDILISD::CMP, DL, INTTY, + DAG.getConstant(CondCCodeToCC(ISD::SETGE, MVT::i32), MVT::i32), + R60, + DAG.getConstant(0x0000000FF, INTTY)); + + // cmov_logical r40, r50, 0, r40 + R40 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R50, + DAG.getConstant(0, INTTY), + R40); + + // cmov_logical r20, r42, r23, r20 + R20 = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R42, R23, R20); + + // cmov_logical DST, r40, r41, r20 + DST = DAG.getNode(AMDILISD::CMOVLOG, DL, INTTY, R40, R41, R20); + + // as_float(DST) + DST = DAG.getNode(ISDBITCAST, DL, OVT, DST); + } else { + // The following sequence of DAG nodes produce the following IL: + // fabs
r1, RHS + // lt r2, 0x1.0p+96f, r1 + // cmov_logical r3, r2, 0x1.0p-23f, 1.0f + // mul_ieee r1, RHS, r3 + // div_zeroop(infinity) r0, LHS, r1 + // mul_ieee DST, r0, r3 + + // fabs r1, RHS + SDValue r1 = DAG.getNode(ISD::FABS, DL, OVT, RHS); + // lt r2, 0x1.0p+96f, r1 + SDValue r2 = DAG.getNode(AMDILISD::CMP, DL, OVT, + DAG.getConstant(CondCCodeToCC(ISD::SETLT, MVT::f32), MVT::i32), + DAG.getConstant(0x6f800000, INTTY), r1); + // cmov_logical r3, r2, 0x1.0p-23f, 1.0f + SDValue r3 = DAG.getNode(AMDILISD::CMOVLOG, DL, OVT, r2, + DAG.getConstant(0x2f800000, INTTY), + DAG.getConstant(0x3f800000, INTTY)); + // mul_ieee r1, RHS, r3 + r1 = DAG.getNode(ISD::FMUL, DL, OVT, RHS, r3); + // div_zeroop(infinity) r0, LHS, r1 + SDValue r0 = DAG.getNode(AMDILISD::DIV_INF, DL, OVT, LHS, r1); + // mul_ieee DST, r0, r3 + DST = DAG.getNode(ISD::FMUL, DL, OVT, r0, r3); + } + return DST; +} + +SDValue +AMDILTargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const +{ + return SDValue(Op.getNode(), 0); +} diff --git a/src/gallium/drivers/radeon/AMDILISelLowering.h b/src/gallium/drivers/radeon/AMDILISelLowering.h new file mode 100644 index 00000000000..302f0cb6909 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILISelLowering.h @@ -0,0 +1,527 @@ +//===-- AMDILISelLowering.h - AMDIL DAG Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file defines the interfaces that AMDIL uses to lower LLVM code into a +// selection DAG. 
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDIL_ISELLOWERING_H_ +#define AMDIL_ISELLOWERING_H_ +#include "AMDIL.h" +#include "llvm/CodeGen/CallingConvLower.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/Target/TargetLowering.h" + +namespace llvm +{ + namespace AMDILISD + { + enum + { + FIRST_NUMBER = ISD::BUILTIN_OP_END, + INTTOANY, // Dummy instruction that takes an int and goes to + // any type converts the SDNode to an int + DP_TO_FP, // Conversion from 64bit FP to 32bit FP + FP_TO_DP, // Conversion from 32bit FP to 64bit FP + BITCONV, // instruction that converts from any type to any type + CMOV, // 32bit FP Conditional move instruction + CMOVLOG, // 32bit FP Conditional move logical instruction + SELECT, // 32bit FP Conditional move logical instruction + SETCC, // 32bit FP Conditional move logical instruction + ISGN, // 32bit Int Sign instruction + INEGATE, // 32bit Int Negation instruction + MAD, // 32bit Fused Multiply Add instruction + ADD, // 32/64 bit pseudo instruction + AND, // 128 bit and instruction + OR, // 128 bit or instruction + NOT, // 128 bit not instruction + XOR, // 128 bit xor instruction + MOVE, // generic mov instruction + PHIMOVE, // generic phi-node mov instruction + VBUILD, // scalar to vector mov instruction + VEXTRACT, // extract vector components + VINSERT, // insert vector components + VCONCAT, // concat a single vector to another vector + UMAD, // 32bit UInt Fused Multiply Add instruction + CALL, // Function call based on a single integer + RET, // Return from a function call + SELECT_CC, // Select the correct conditional instruction + BRCC, // Select the correct branch instruction + CMPCC, // Compare to GPR operands + CMPICC, // Compare two GPR operands, set icc. + CMPFCC, // Compare two FP operands, set fcc. 
+ BRICC, // Branch to dest on icc condition + BRFCC, // Branch to dest on fcc condition + SELECT_ICC, // Select between two values using the current ICC + //flags. + SELECT_FCC, // Select between two values using the current FCC + //flags. + LCREATE, // Create a 64bit integer from two 32 bit integers + LCOMPHI, // Get the hi 32 bits from a 64 bit integer + LCOMPLO, // Get the lo 32 bits from a 64 bit integer + DCREATE, // Create a 64bit float from two 32 bit integers + DCOMPHI, // Get the hi 32 bits from a 64 bit float + DCOMPLO, // Get the lo 32 bits from a 64 bit float + LCREATE2, // Create a 64bit integer from two 32 bit integers + LCOMPHI2, // Get the hi 32 bits from a 64 bit integer + LCOMPLO2, // Get the lo 32 bits from a 64 bit integer + DCREATE2, // Create a 64bit float from two 32 bit integers + DCOMPHI2, // Get the hi 32 bits from a 64 bit float + DCOMPLO2, // Get the lo 32 bits from a 64 bit float + UMUL, // 32bit unsigned multiplication + IFFB_HI, // 32bit find first hi bit instruction + IFFB_LO, // 32bit find first low bit instruction + DIV_INF, // Divide with infinity returned on zero divisor + SMAX, // Signed integer max + CMP, + IL_CC_I_GT, + IL_CC_I_LT, + IL_CC_I_GE, + IL_CC_I_LE, + IL_CC_I_EQ, + IL_CC_I_NE, + RET_FLAG, + BRANCH_COND, + LOOP_NZERO, + LOOP_ZERO, + LOOP_CMP, + ADDADDR, + // ATOMIC Operations + // Global Memory + ATOM_G_ADD = ISD::FIRST_TARGET_MEMORY_OPCODE, + ATOM_G_AND, + ATOM_G_CMPXCHG, + ATOM_G_DEC, + ATOM_G_INC, + ATOM_G_MAX, + ATOM_G_UMAX, + ATOM_G_MIN, + ATOM_G_UMIN, + ATOM_G_OR, + ATOM_G_SUB, + ATOM_G_RSUB, + ATOM_G_XCHG, + ATOM_G_XOR, + ATOM_G_ADD_NORET, + ATOM_G_AND_NORET, + ATOM_G_CMPXCHG_NORET, + ATOM_G_DEC_NORET, + ATOM_G_INC_NORET, + ATOM_G_MAX_NORET, + ATOM_G_UMAX_NORET, + ATOM_G_MIN_NORET, + ATOM_G_UMIN_NORET, + ATOM_G_OR_NORET, + ATOM_G_SUB_NORET, + ATOM_G_RSUB_NORET, + ATOM_G_XCHG_NORET, + ATOM_G_XOR_NORET, + // Local Memory + ATOM_L_ADD, + ATOM_L_AND, + ATOM_L_CMPXCHG, + ATOM_L_DEC, + ATOM_L_INC, + ATOM_L_MAX, + 
ATOM_L_UMAX, + ATOM_L_MIN, + ATOM_L_UMIN, + ATOM_L_OR, + ATOM_L_MSKOR, + ATOM_L_SUB, + ATOM_L_RSUB, + ATOM_L_XCHG, + ATOM_L_XOR, + ATOM_L_ADD_NORET, + ATOM_L_AND_NORET, + ATOM_L_CMPXCHG_NORET, + ATOM_L_DEC_NORET, + ATOM_L_INC_NORET, + ATOM_L_MAX_NORET, + ATOM_L_UMAX_NORET, + ATOM_L_MIN_NORET, + ATOM_L_UMIN_NORET, + ATOM_L_OR_NORET, + ATOM_L_MSKOR_NORET, + ATOM_L_SUB_NORET, + ATOM_L_RSUB_NORET, + ATOM_L_XCHG_NORET, + ATOM_L_XOR_NORET, + // Region Memory + ATOM_R_ADD, + ATOM_R_AND, + ATOM_R_CMPXCHG, + ATOM_R_DEC, + ATOM_R_INC, + ATOM_R_MAX, + ATOM_R_UMAX, + ATOM_R_MIN, + ATOM_R_UMIN, + ATOM_R_OR, + ATOM_R_MSKOR, + ATOM_R_SUB, + ATOM_R_RSUB, + ATOM_R_XCHG, + ATOM_R_XOR, + ATOM_R_ADD_NORET, + ATOM_R_AND_NORET, + ATOM_R_CMPXCHG_NORET, + ATOM_R_DEC_NORET, + ATOM_R_INC_NORET, + ATOM_R_MAX_NORET, + ATOM_R_UMAX_NORET, + ATOM_R_MIN_NORET, + ATOM_R_UMIN_NORET, + ATOM_R_OR_NORET, + ATOM_R_MSKOR_NORET, + ATOM_R_SUB_NORET, + ATOM_R_RSUB_NORET, + ATOM_R_XCHG_NORET, + ATOM_R_XOR_NORET, + // Append buffer + APPEND_ALLOC, + APPEND_ALLOC_NORET, + APPEND_CONSUME, + APPEND_CONSUME_NORET, + // 2D Images + IMAGE2D_READ, + IMAGE2D_WRITE, + IMAGE2D_INFO0, + IMAGE2D_INFO1, + // 3D Images + IMAGE3D_READ, + IMAGE3D_WRITE, + IMAGE3D_INFO0, + IMAGE3D_INFO1, + + LAST_ISD_NUMBER + }; + } // AMDILISD + + class MachineBasicBlock; + class MachineInstr; + class DebugLoc; + class TargetInstrInfo; + + class AMDILTargetLowering : public TargetLowering + { + private: + int VarArgsFrameOffset; // Frame offset to start of varargs area. + public: + AMDILTargetLowering(TargetMachine &TM); + + virtual SDValue + LowerOperation(SDValue Op, SelectionDAG &DAG) const; + + int + getVarArgsFrameOffset() const; + + /// computeMaskedBitsForTargetNode - Determine which of + /// the bits specified + /// in Mask are known to be either zero or one and return them in + /// the + /// KnownZero/KnownOne bitsets. 
+ virtual void + computeMaskedBitsForTargetNode( + const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0 + ) const; + + virtual MachineBasicBlock* + EmitInstrWithCustomInserter( + MachineInstr *MI, + MachineBasicBlock *MBB) const; + + virtual bool + getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const; + virtual const char* + getTargetNodeName( + unsigned Opcode + ) const; + // We want to mark f32/f64 floating point values as + // legal + bool + isFPImmLegal(const APFloat &Imm, EVT VT) const; + // We don't want to shrink f64/f32 constants because + // they both take up the same amount of space and + // we don't want to use a f2d instruction. + bool ShouldShrinkFPConstant(EVT VT) const; + + /// getFunctionAlignment - Return the Log2 alignment of this + /// function. + virtual unsigned int + getFunctionAlignment(const Function *F) const; + + private: + CCAssignFn* + CCAssignFnForNode(unsigned int CC) const; + + SDValue LowerCallResult(SDValue Chain, + SDValue InFlag, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, + SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + SDValue LowerMemArgument(SDValue Chain, + CallingConv::ID CallConv, + const SmallVectorImpl<ISD::InputArg> &ArgInfo, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, MachineFrameInfo *MFI, + unsigned i) const; + + SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, + SDValue Arg, + DebugLoc dl, SelectionDAG &DAG, + const CCValAssign &VA, + ISD::ArgFlagsTy Flags) const; + + virtual SDValue + LowerFormalArguments(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerCall(SDValue Chain, SDValue Callee, + CallingConv::ID CallConv, bool isVarArg, bool doesNotRet, + bool &isTailCall, + const 
SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc dl, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue + LowerReturn(SDValue Chain, + CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc dl, SelectionDAG &DAG) const; + + //+++--- Function dealing with conversions between floating point and + //integer types ---+++// + SDValue + genCLZu64(SDValue Op, SelectionDAG &DAG) const; + SDValue + genCLZuN(SDValue Op, SelectionDAG &DAG, uint32_t bits) const; + SDValue + genCLZu32(SDValue Op, SelectionDAG &DAG) const; + SDValue + genf64toi32(SDValue Op, SelectionDAG &DAG, + bool includeSign) const; + + SDValue + genf64toi64(SDValue Op, SelectionDAG &DAG, + bool includeSign) const; + + SDValue + genu32tof64(SDValue Op, EVT dblvt, SelectionDAG &DAG) const; + + SDValue + genu64tof64(SDValue Op, EVT dblvt, SelectionDAG &DAG) const; + + SDValue + LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG& DAG) const; + + SDValue + LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG& DAG) const; + + SDValue + LowerINTRINSIC_VOID(SDValue Op, SelectionDAG& DAG) const; + + SDValue + LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerADD(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSUB(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSREM(SDValue Op, SelectionDAG &DAG) const; + 
SDValue + LowerSREM8(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerSREM16(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerSREM64(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerUREM(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUREM8(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUREM16(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUREM64(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerUDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerUDIV64(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerFDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerFDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerFDIV64(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerMUL(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerAND(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerOR(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + + 
SDValue + LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + + EVT + genIntType(uint32_t size = 32, uint32_t numEle = 1) const; + + SDValue + LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + + SDValue + LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue + LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; + void + generateCMPInstr(MachineInstr*, MachineBasicBlock*, + const TargetInstrInfo&) const; + MachineOperand + convertToReg(MachineOperand) const; + + // private members used by the set of instruction generation + // functions, these are marked mutable as they are cached so + // that they don't have to constantly be looked up when using the + // generateMachineInst/genVReg instructions. This is to simplify + // the code + // and to make it cleaner. The object itself doesn't change as + // only these functions use these three data types. 
+ mutable MachineBasicBlock *mBB; + mutable DebugLoc *mDL; + mutable const TargetInstrInfo *mTII; + mutable MachineBasicBlock::iterator mBBI; + void + setPrivateData(MachineBasicBlock *BB, + MachineBasicBlock::iterator &BBI, + DebugLoc *DL, + const TargetInstrInfo *TII) const; + uint32_t genVReg(uint32_t regType) const; + MachineInstrBuilder + generateMachineInst(uint32_t opcode, + uint32_t dst) const; + MachineInstrBuilder + generateMachineInst(uint32_t opcode, + uint32_t dst, uint32_t src1) const; + MachineInstrBuilder + generateMachineInst(uint32_t opcode, + uint32_t dst, uint32_t src1, uint32_t src2) const; + MachineInstrBuilder + generateMachineInst(uint32_t opcode, + uint32_t dst, uint32_t src1, uint32_t src2, + uint32_t src3) const; + uint32_t + addExtensionInstructions( + uint32_t reg, bool signedShift, + unsigned int simpleVT) const; + void + generateLongRelational(MachineInstr *MI, + unsigned int opCode) const; + + }; // AMDILTargetLowering +} // end namespace llvm + +#endif // AMDIL_ISELLOWERING_H_ diff --git a/src/gallium/drivers/radeon/AMDILImageExpansion.cpp b/src/gallium/drivers/radeon/AMDILImageExpansion.cpp new file mode 100644 index 00000000000..e6fe37a6b99 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILImageExpansion.cpp @@ -0,0 +1,171 @@ +//===-- AMDILImageExpansion.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// @file AMDILImageExpansion.cpp +// @details Implementatino of the Image expansion class for image capable devices +// +#include "AMDILIOExpansion.h" +#include "AMDILKernelManager.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Value.h" + +using namespace llvm; + +AMDILImageExpansion::AMDILImageExpansion(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + : AMDIL789IOExpansion(tm AMDIL_OPT_LEVEL_VAR) +{ +} + +AMDILImageExpansion::~AMDILImageExpansion() +{ +} +void AMDILImageExpansion::expandInefficientImageLoad( + MachineBasicBlock *mBB, MachineInstr *MI) +{ +#if 0 + const llvm::StringRef &name = MI->getOperand(0).getGlobal()->getName(); + const char *tReg1, *tReg2, *tReg3, *tReg4; + tReg1 = mASM->getRegisterName(MI->getOperand(1).getReg()); + if (MI->getOperand(2).isReg()) { + tReg2 = mASM->getRegisterName(MI->getOperand(2).getReg()); + } else { + tReg2 = mASM->getRegisterName(AMDIL::R1); + O << "\tmov " << tReg2 << ", l" << MI->getOperand(2).getImm() << "\n"; + } + if (MI->getOperand(3).isReg()) { + tReg3 = mASM->getRegisterName(MI->getOperand(3).getReg()); + } else { + tReg3 = mASM->getRegisterName(AMDIL::R2); + O << "\tmov " << tReg3 << ", l" << MI->getOperand(3).getImm() << "\n"; + } + if (MI->getOperand(4).isReg()) { + tReg4 = mASM->getRegisterName(MI->getOperand(4).getReg()); + } else { + tReg4 = mASM->getRegisterName(AMDIL::R3); + O << "\tmov " << tReg2 << ", l" << MI->getOperand(4).getImm() << "\n"; + } + bool internalSampler = false; + //bool linear = true; + unsigned ImageCount = 3; // OPENCL_MAX_READ_IMAGES + unsigned SamplerCount = 3; // OPENCL_MAX_SAMPLERS + if (ImageCount - 1) { + O << "\tswitch " << 
mASM->getRegisterName(MI->getOperand(1).getReg()) + << "\n"; + } + for (unsigned rID = 0; rID < ImageCount; ++rID) { + if (ImageCount - 1) { + if (!rID) { + O << "\tdefault\n"; + } else { + O << "\tcase " << rID << "\n" ; + } + O << "\tswitch " << mASM->getRegisterName(MI->getOperand(2).getReg()) + << "\n"; + } + for (unsigned sID = 0; sID < SamplerCount; ++sID) { + if (SamplerCount - 1) { + if (!sID) { + O << "\tdefault\n"; + } else { + O << "\tcase " << sID << "\n" ; + } + } + if (internalSampler) { + // Check if sampler has normalized setting. + O << "\tand r0.x, " << tReg2 << ".x, l0.y\n" + << "\tif_logicalz r0.x\n" + << "\tflr " << tReg3 << ", " << tReg3 << "\n" + << "\tsample_resource(" << rID << ")_sampler(" + << sID << ")_coordtype(unnormalized) " + << tReg1 << ", " << tReg3 << " ; " << name.data() << "\n" + << "\telse\n" + << "\tiadd " << tReg1 << ".y, " << tReg1 << ".x, l0.y\n" + << "\titof " << tReg2 << ", cb1[" << tReg1 << ".x].xyz\n" + << "\tmul " << tReg3 << ", " << tReg3 << ", " << tReg2 << "\n" + << "\tflr " << tReg3 << ", " << tReg3 << "\n" + << "\tmul " << tReg3 << ", " << tReg3 << ", cb1[" + << tReg1 << ".y].xyz\n" + << "\tsample_resource(" << rID << ")_sampler(" + << sID << ")_coordtype(normalized) " + << tReg1 << ", " << tReg3 << " ; " << name.data() << "\n" + << "\tendif\n"; + } else { + O << "\tiadd " << tReg1 << ".y, " << tReg1 << ".x, l0.y\n" + // Check if sampler has normalized setting. + << "\tand r0, " << tReg2 << ".x, l0.y\n" + // Convert image dimensions to float. + << "\titof " << tReg4 << ", cb1[" << tReg1 << ".x].xyz\n" + // Move into R0 1 if unnormalized or dimensions if normalized. + << "\tcmov_logical r0, r0, " << tReg4 << ", r1.1111\n" + // Make coordinates unnormalized. + << "\tmul " << tReg3 << ", r0, " << tReg3 << "\n" + // Get linear filtering if set. + << "\tand " << tReg4 << ", " << tReg2 << ".x, l6.x\n" + // Save unnormalized coordinates in R0. 
+ << "\tmov r0, " << tReg3 << "\n" + // Floor the coordinates due to HW incompatibility with precision + // requirements. + << "\tflr " << tReg3 << ", " << tReg3 << "\n" + // get Origianl coordinates (without floor) if linear filtering + << "\tcmov_logical " << tReg3 << ", " << tReg4 + << ".xxxx, r0, " << tReg3 << "\n" + // Normalize the coordinates with multiplying by 1/dimensions + << "\tmul " << tReg3 << ", " << tReg3 << ", cb1[" + << tReg1 << ".y].xyz\n" + << "\tsample_resource(" << rID << ")_sampler(" + << sID << ")_coordtype(normalized) " + << tReg1 << ", " << tReg3 << " ; " << name.data() << "\n"; + } + if (SamplerCount - 1) { + O << "\tbreak\n"; + } + } + if (SamplerCount - 1) { + O << "\tendswitch\n"; + } + if (ImageCount - 1) { + O << "\tbreak\n"; + } + } + if (ImageCount - 1) { + O << "\tendswitch\n"; + } +#endif +} + void +AMDILImageExpansion::expandImageLoad(MachineBasicBlock *mBB, MachineInstr *MI) +{ + uint32_t imageID = getPointerID(MI); + MI->getOperand(1).ChangeToImmediate(imageID); + saveInst = true; +} + void +AMDILImageExpansion::expandImageStore(MachineBasicBlock *mBB, MachineInstr *MI) +{ + uint32_t imageID = getPointerID(MI); + mKM->setOutputInst(); + MI->getOperand(0).ChangeToImmediate(imageID); + saveInst = true; +} + void +AMDILImageExpansion::expandImageParam(MachineBasicBlock *mBB, MachineInstr *MI) +{ + MachineBasicBlock::iterator I = *MI; + uint32_t ID = getPointerID(MI); + DebugLoc DL = MI->getDebugLoc(); + BuildMI(*mBB, I, DL, mTII->get(AMDIL::CBLOAD), + MI->getOperand(0).getReg()) + .addImm(ID) + .addImm(1); +} diff --git a/src/gallium/drivers/radeon/AMDILInliner.cpp b/src/gallium/drivers/radeon/AMDILInliner.cpp new file mode 100644 index 00000000000..9dad6add97b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILInliner.cpp @@ -0,0 +1,271 @@ +//===-- AMDILInliner.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "amdilinline" +#include "AMDIL.h" +#include "AMDILCompilerErrors.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILSubtarget.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Support/CallSite.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +namespace +{ + class LLVM_LIBRARY_VISIBILITY AMDILInlinePass: public FunctionPass + + { + public: + TargetMachine &TM; + static char ID; + AMDILInlinePass(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + ~AMDILInlinePass(); + virtual const char* getPassName() const; + virtual bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + private: + typedef DenseMap<const ArrayType*, SmallVector<AllocaInst*, + DEFAULT_VEC_SLOTS> > InlinedArrayAllocasTy; + bool + AMDILInlineCallIfPossible(CallSite CS, + const TargetData *TD, + InlinedArrayAllocasTy &InlinedArrayAllocas); + + CodeGenOpt::Level OptLevel; + }; + char AMDILInlinePass::ID = 0; +} // anonymouse namespace + + +namespace llvm +{ + FunctionPass* + createAMDILInlinePass(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + { + return new AMDILInlinePass(tm AMDIL_OPT_LEVEL_VAR); + } +} // llvm namespace + + AMDILInlinePass::AMDILInlinePass(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) +: FunctionPass(ID), TM(tm) +{ + OptLevel = tm.getOptLevel(); +} 
+AMDILInlinePass::~AMDILInlinePass() +{ +} + + +bool +AMDILInlinePass::AMDILInlineCallIfPossible(CallSite CS, + const TargetData *TD, InlinedArrayAllocasTy &InlinedArrayAllocas) { + Function *Callee = CS.getCalledFunction(); + Function *Caller = CS.getCaller(); + + // Try to inline the function. Get the list of static allocas that were + // inlined. + SmallVector<AllocaInst*, 16> StaticAllocas; + InlineFunctionInfo IFI; + if (!InlineFunction(CS, IFI)) + return false; + DEBUG(errs() << "<amdilinline> function " << Caller->getName() + << ": inlined call to "<< Callee->getName() << "\n"); + + // If the inlined function had a higher stack protection level than the + // calling function, then bump up the caller's stack protection level. + if (Callee->hasFnAttr(Attribute::StackProtectReq)) + Caller->addFnAttr(Attribute::StackProtectReq); + else if (Callee->hasFnAttr(Attribute::StackProtect) && + !Caller->hasFnAttr(Attribute::StackProtectReq)) + Caller->addFnAttr(Attribute::StackProtect); + + + // Look at all of the allocas that we inlined through this call site. If we + // have already inlined other allocas through other calls into this function, + // then we know that they have disjoint lifetimes and that we can merge them. + // + // There are many heuristics possible for merging these allocas, and the + // different options have different tradeoffs. One thing that we *really* + // don't want to hurt is SRoA: once inlining happens, often allocas are no + // longer address taken and so they can be promoted. + // + // Our "solution" for that is to only merge allocas whose outermost type is an + // array type. These are usually not promoted because someone is using a + // variable index into them. These are also often the most important ones to + // merge. + // + // A better solution would be to have real memory lifetime markers in the IR + // and not have the inliner do any merging of allocas at all. 
This would + // allow the backend to do proper stack slot coloring of all allocas that + // *actually make it to the backend*, which is really what we want. + // + // Because we don't have this information, we do this simple and useful hack. + // + SmallPtrSet<AllocaInst*, 16> UsedAllocas; + + // Loop over all the allocas we have so far and see if they can be merged with + // a previously inlined alloca. If not, remember that we had it. + + for (unsigned AllocaNo = 0, + e = IFI.StaticAllocas.size(); + AllocaNo != e; ++AllocaNo) { + + AllocaInst *AI = IFI.StaticAllocas[AllocaNo]; + + // Don't bother trying to merge array allocations (they will usually be + // canonicalized to be an allocation *of* an array), or allocations whose + // type is not itself an array (because we're afraid of pessimizing SRoA). + const ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType()); + if (ATy == 0 || AI->isArrayAllocation()) + continue; + + // Get the list of all available allocas for this array type. + SmallVector<AllocaInst*, DEFAULT_VEC_SLOTS> &AllocasForType + = InlinedArrayAllocas[ATy]; + + // Loop over the allocas in AllocasForType to see if we can reuse one. Note + // that we have to be careful not to reuse the same "available" alloca for + // multiple different allocas that we just inlined, we use the 'UsedAllocas' + // set to keep track of which "available" allocas are being used by this + // function. Also, AllocasForType can be empty of course! + bool MergedAwayAlloca = false; + for (unsigned i = 0, e = AllocasForType.size(); i != e; ++i) { + AllocaInst *AvailableAlloca = AllocasForType[i]; + + // The available alloca has to be in the right function, not in some other + // function in this SCC. + if (AvailableAlloca->getParent() != AI->getParent()) + continue; + + // If the inlined function already uses this alloca then we can't reuse + // it. 
+ if (!UsedAllocas.insert(AvailableAlloca)) + continue; + + // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare + // success! + DEBUG(errs() << " ***MERGED ALLOCA: " << *AI); + + AI->replaceAllUsesWith(AvailableAlloca); + AI->eraseFromParent(); + MergedAwayAlloca = true; + break; + } + + // If we already nuked the alloca, we're done with it. + if (MergedAwayAlloca) + continue; + + // If we were unable to merge away the alloca either because there are no + // allocas of the right type available or because we reused them all + // already, remember that this alloca came from an inlined function and mark + // it used so we don't reuse it for other allocas from this inline + // operation. + AllocasForType.push_back(AI); + UsedAllocas.insert(AI); + } + + return true; +} + + bool +AMDILInlinePass::runOnFunction(Function &MF) +{ + Function *F = &MF; + const AMDILSubtarget &STM = TM.getSubtarget<AMDILSubtarget>(); + if (STM.device()->isSupported(AMDILDeviceInfo::NoInline)) { + return false; + } + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + SmallVector<CallSite, 16> CallSites; + for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + CallSite CS = CallSite(cast<Value>(I)); + // If this isn't a call, or it is a call to an intrinsic, it can + // never be inlined. + if (CS.getInstruction() == 0 || isa<IntrinsicInst>(I)) + continue; + + // If this is a direct call to an external function, we can never inline + // it. If it is an indirect call, inlining may resolve it to be a + // direct call, so we keep it. + if (CS.getCalledFunction() && CS.getCalledFunction()->isDeclaration()) + continue; + + // We don't want to inline if we are recursive. 
+ if (CS.getCalledFunction() && CS.getCalledFunction()->getName() == MF.getName()) { + AMDILMachineFunctionInfo *MFI = + getAnalysis<MachineFunctionAnalysis>().getMF() + .getInfo<AMDILMachineFunctionInfo>(); + MFI->addErrorMsg(amd::CompilerErrorMessage[RECURSIVE_FUNCTION]); + continue; + } + + CallSites.push_back(CS); + } + } + + InlinedArrayAllocasTy InlinedArrayAllocas; + bool Changed = false; + for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) { + CallSite CS = CallSites[CSi]; + + Function *Callee = CS.getCalledFunction(); + + // We can only inline direct calls to non-declarations. + if (Callee == 0 || Callee->isDeclaration()) continue; + + // Attempt to inline the function... + if (!AMDILInlineCallIfPossible(CS, TD, InlinedArrayAllocas)) + continue; + Changed = true; + } + return Changed; +} + +const char* +AMDILInlinePass::getPassName() const +{ + return "AMDIL Inline Function Pass"; +} + bool +AMDILInlinePass::doInitialization(Module &M) +{ + return false; +} + + bool +AMDILInlinePass::doFinalization(Module &M) +{ + return false; +} + +void +AMDILInlinePass::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.cpp b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp new file mode 100644 index 00000000000..fbc3e45b357 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp @@ -0,0 +1,709 @@ +//===- AMDILInstrInfo.cpp - AMDIL Instruction Information -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file contains the AMDIL implementation of the TargetInstrInfo class. 
+// +//===----------------------------------------------------------------------===// +#include "AMDILInstrInfo.h" +#include "AMDILUtilityFunctions.h" + +#define GET_INSTRINFO_CTOR +#include "AMDILGenInstrInfo.inc" + +#include "AMDILInstrInfo.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/Instructions.h" + +using namespace llvm; + +AMDILInstrInfo::AMDILInstrInfo(AMDILTargetMachine &tm) + : AMDILGenInstrInfo(AMDIL::ADJCALLSTACKDOWN, AMDIL::ADJCALLSTACKUP), + RI(tm, *this), + TM(tm) { +} + +const AMDILRegisterInfo &AMDILInstrInfo::getRegisterInfo() const { + return RI; +} + +/// Return true if the instruction is a register to register move and leave the +/// source and dest operands in the passed parameters. +bool AMDILInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg, + unsigned int &DstReg, unsigned int &SrcSubIdx, + unsigned int &DstSubIdx) const { + // FIXME: we should look for: + // add with 0 + //assert(0 && "is Move Instruction has not been implemented yet!"); + //return true; + if (!isMove(MI.getOpcode())) { + return false; + } + if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg()) { + return false; + } + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + DstSubIdx = 0; + SrcSubIdx = 0; + return true; +} + +bool AMDILInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDILInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDILInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool 
AMDILInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDILInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDILInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDILInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +#if 0 +void +AMDILInstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo *TRI) const { +// TODO: Implement this function +} + +MachineInst AMDILInstrInfo::duplicate(MachineInstr *Orig, + MachineFunction &MF) const { +// TODO: Implement this function + return NULL; +} +#endif +MachineInstr * +AMDILInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return NULL; +} +#if 0 +MachineInst AMDILInstrInfo::commuteInstruction(MachineInstr *MI, + bool NewMI = false) const { +// TODO: Implement this function + return NULL; +} +bool +AMDILInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const +{ +// TODO: Implement this function +} +bool +AMDILInstrInfo::produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1) const +{ +// TODO: Implement this function +} +#endif +bool AMDILInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const { + while (iter != MBB.end()) { + switch (iter->getOpcode()) { + default: + break; + ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND); + case AMDIL::BRANCH: + return true; + }; + 
++iter; + } + return false; +} + +bool AMDILInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + bool retVal = true; + return retVal; + MachineBasicBlock::iterator iter = MBB.begin(); + if (!getNextBranchInstr(iter, MBB)) { + retVal = false; + } else { + MachineInstr *firstBranch = iter; + if (!getNextBranchInstr(++iter, MBB)) { + if (firstBranch->getOpcode() == AMDIL::BRANCH) { + TBB = firstBranch->getOperand(0).getMBB(); + firstBranch->eraseFromParent(); + retVal = false; + } else { + TBB = firstBranch->getOperand(0).getMBB(); + FBB = *(++MBB.succ_begin()); + if (FBB == TBB) { + FBB = *(MBB.succ_begin()); + } + Cond.push_back(firstBranch->getOperand(1)); + retVal = false; + } + } else { + MachineInstr *secondBranch = iter; + if (!getNextBranchInstr(++iter, MBB)) { + if (secondBranch->getOpcode() == AMDIL::BRANCH) { + TBB = firstBranch->getOperand(0).getMBB(); + Cond.push_back(firstBranch->getOperand(1)); + FBB = secondBranch->getOperand(0).getMBB(); + secondBranch->eraseFromParent(); + retVal = false; + } else { + assert(0 && "Should not have two consecutive conditional branches"); + } + } else { + MBB.getParent()->viewCFG(); + assert(0 && "Should not have three branch instructions in" + " a single basic block"); + retVal = false; + } + } + } + return retVal; +} + +unsigned int AMDILInstrInfo::getBranchInstr(const MachineOperand &op) const { + const MachineInstr *MI = op.getParent(); + + switch (MI->getDesc().OpInfo->RegClass) { + default: // FIXME: fallthrough?? 
+ case AMDIL::GPRI8RegClassID: return AMDIL::BRANCH_COND_i8; + case AMDIL::GPRI16RegClassID: return AMDIL::BRANCH_COND_i16; + case AMDIL::GPRI32RegClassID: return AMDIL::BRANCH_COND_i32; + case AMDIL::GPRI64RegClassID: return AMDIL::BRANCH_COND_i64; + case AMDIL::GPRF32RegClassID: return AMDIL::BRANCH_COND_f32; + case AMDIL::GPRF64RegClassID: return AMDIL::BRANCH_COND_f64; + }; +} + +unsigned int +AMDILInstrInfo::InsertBranch(MachineBasicBlock &MBB, + MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const +{ + assert(TBB && "InsertBranch must not be told to insert a fallthrough"); + for (unsigned int x = 0; x < Cond.size(); ++x) { + Cond[x].getParent()->dump(); + } + if (FBB == 0) { + if (Cond.empty()) { + BuildMI(&MBB, DL, get(AMDIL::BRANCH)).addMBB(TBB); + } else { + BuildMI(&MBB, DL, get(getBranchInstr(Cond[0]))) + .addMBB(TBB).addReg(Cond[0].getReg()); + } + return 1; + } else { + BuildMI(&MBB, DL, get(getBranchInstr(Cond[0]))) + .addMBB(TBB).addReg(Cond[0].getReg()); + BuildMI(&MBB, DL, get(AMDIL::BRANCH)).addMBB(FBB); + } + assert(0 && "Inserting two branches not supported"); + return 0; +} + +unsigned int AMDILInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) { + return 0; + } + --I; + switch (I->getOpcode()) { + default: + return 0; + ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND); + case AMDIL::BRANCH: + I->eraseFromParent(); + break; + } + I = MBB.end(); + + if (I == MBB.begin()) { + return 1; + } + --I; + switch (I->getOpcode()) { + // FIXME: only one case?? 
+ default: + return 1; + ExpandCaseToAllScalarTypes(AMDIL::BRANCH_COND); + I->eraseFromParent(); + break; + } + return 2; +} + +MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator tmp = MBB->end(); + if (!MBB->size()) { + return MBB->end(); + } + while (--tmp) { + if (tmp->getOpcode() == AMDIL::ENDLOOP + || tmp->getOpcode() == AMDIL::ENDIF + || tmp->getOpcode() == AMDIL::ELSE) { + if (tmp == MBB->begin()) { + return tmp; + } else { + continue; + } + } else { + return ++tmp; + } + } + return MBB->end(); +} + +bool +AMDILInstrInfo::copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const { + // If we are adding to the end of a basic block we can safely assume that the + // move is caused by a PHI node since all move instructions that are non-PHI + // have already been inserted into the basic blocks Therefor we call the skip + // flow control instruction to move the iterator before the flow control + // instructions and put the move instruction there. + bool phi = (DestReg < 1025) || (SrcReg < 1025); + int movInst = phi ? getMoveInstFromID(DestRC->getID()) + : getPHIMoveInstFromID(DestRC->getID()); + + MachineBasicBlock::iterator iTemp = (I == MBB.end()) ? skipFlowControl(&MBB) + : I; + if (DestRC != SrcRC) { + //int convInst; + size_t dSize = DestRC->getSize(); + size_t sSize = SrcRC->getSize(); + if (dSize > sSize) { + // Elements are going to get duplicated. + BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg); + } else if (dSize == sSize) { + // Direct copy, conversions are not handled. + BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg); + } else if (dSize < sSize) { + // Elements are going to get dropped. 
+ BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg); + } + } else { + BuildMI( MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg); + } + return true; +} +void +AMDILInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const +{ + BuildMI(MBB, MI, DL, get(AMDIL::MOVE_v4i32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); + return; +#if 0 + DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) + << " to " << RI.getName(DestReg) << '\n'); + abort(); +#endif +} +void +AMDILInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + unsigned int Opc = 0; + // MachineInstr *curMI = MI; + MachineFunction &MF = *(MBB.getParent()); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + + DebugLoc DL; + switch (RC->getID()) { + default: + Opc = AMDIL::PRIVATESTORE_v4i32; + break; + case AMDIL::GPRF32RegClassID: + Opc = AMDIL::PRIVATESTORE_f32; + break; + case AMDIL::GPRF64RegClassID: + Opc = AMDIL::PRIVATESTORE_f64; + break; + case AMDIL::GPRI16RegClassID: + Opc = AMDIL::PRIVATESTORE_i16; + break; + case AMDIL::GPRI32RegClassID: + Opc = AMDIL::PRIVATESTORE_i32; + break; + case AMDIL::GPRI8RegClassID: + Opc = AMDIL::PRIVATESTORE_i8; + break; + case AMDIL::GPRI64RegClassID: + Opc = AMDIL::PRIVATESTORE_i64; + break; + case AMDIL::GPRV2F32RegClassID: + Opc = AMDIL::PRIVATESTORE_v2f32; + break; + case AMDIL::GPRV2F64RegClassID: + Opc = AMDIL::PRIVATESTORE_v2f64; + break; + case AMDIL::GPRV2I16RegClassID: + Opc = AMDIL::PRIVATESTORE_v2i16; + break; + case AMDIL::GPRV2I32RegClassID: + Opc = AMDIL::PRIVATESTORE_v2i32; + break; + case AMDIL::GPRV2I8RegClassID: + Opc = AMDIL::PRIVATESTORE_v2i8; + break; + case AMDIL::GPRV2I64RegClassID: + Opc = AMDIL::PRIVATESTORE_v2i64; + break; + case AMDIL::GPRV4F32RegClassID: + Opc = 
AMDIL::PRIVATESTORE_v4f32; + break; + case AMDIL::GPRV4I16RegClassID: + Opc = AMDIL::PRIVATESTORE_v4i16; + break; + case AMDIL::GPRV4I32RegClassID: + Opc = AMDIL::PRIVATESTORE_v4i32; + break; + case AMDIL::GPRV4I8RegClassID: + Opc = AMDIL::PRIVATESTORE_v4i8; + break; + } + if (MI != MBB.end()) DL = MI->getDebugLoc(); + MachineMemOperand *MMO = + new MachineMemOperand( + MachinePointerInfo::getFixedStack(FrameIndex), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); + if (MI != MBB.end()) { + DL = MI->getDebugLoc(); + } + MachineInstr *nMI = BuildMI(MBB, MI, DL, get(Opc)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FrameIndex) + .addMemOperand(MMO) + .addImm(0); + AMDILAS::InstrResEnc curRes; + curRes.bits.ResourceID + = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID); + setAsmPrinterFlags(nMI, curRes); +} + +void +AMDILInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + unsigned int Opc = 0; + MachineFunction &MF = *(MBB.getParent()); + MachineFrameInfo &MFI = *MF.getFrameInfo(); + DebugLoc DL; + switch (RC->getID()) { + default: + Opc = AMDIL::PRIVATELOAD_v4i32; + break; + case AMDIL::GPRF32RegClassID: + Opc = AMDIL::PRIVATELOAD_f32; + break; + case AMDIL::GPRF64RegClassID: + Opc = AMDIL::PRIVATELOAD_f64; + break; + case AMDIL::GPRI16RegClassID: + Opc = AMDIL::PRIVATELOAD_i16; + break; + case AMDIL::GPRI32RegClassID: + Opc = AMDIL::PRIVATELOAD_i32; + break; + case AMDIL::GPRI8RegClassID: + Opc = AMDIL::PRIVATELOAD_i8; + break; + case AMDIL::GPRI64RegClassID: + Opc = AMDIL::PRIVATELOAD_i64; + break; + case AMDIL::GPRV2F32RegClassID: + Opc = AMDIL::PRIVATELOAD_v2f32; + break; + case AMDIL::GPRV2F64RegClassID: + Opc = AMDIL::PRIVATELOAD_v2f64; + break; + case AMDIL::GPRV2I16RegClassID: + Opc = AMDIL::PRIVATELOAD_v2i16; + break; + 
case AMDIL::GPRV2I32RegClassID: + Opc = AMDIL::PRIVATELOAD_v2i32; + break; + case AMDIL::GPRV2I8RegClassID: + Opc = AMDIL::PRIVATELOAD_v2i8; + break; + case AMDIL::GPRV2I64RegClassID: + Opc = AMDIL::PRIVATELOAD_v2i64; + break; + case AMDIL::GPRV4F32RegClassID: + Opc = AMDIL::PRIVATELOAD_v4f32; + break; + case AMDIL::GPRV4I16RegClassID: + Opc = AMDIL::PRIVATELOAD_v4i16; + break; + case AMDIL::GPRV4I32RegClassID: + Opc = AMDIL::PRIVATELOAD_v4i32; + break; + case AMDIL::GPRV4I8RegClassID: + Opc = AMDIL::PRIVATELOAD_v4i8; + break; + } + + MachineMemOperand *MMO = + new MachineMemOperand( + MachinePointerInfo::getFixedStack(FrameIndex), + MachineMemOperand::MOLoad, + MFI.getObjectSize(FrameIndex), + MFI.getObjectAlignment(FrameIndex)); + if (MI != MBB.end()) { + DL = MI->getDebugLoc(); + } + MachineInstr* nMI = BuildMI(MBB, MI, DL, get(Opc)) + .addReg(DestReg, RegState::Define) + .addFrameIndex(FrameIndex) + .addMemOperand(MMO) + .addImm(0); + AMDILAS::InstrResEnc curRes; + curRes.bits.ResourceID + = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID); + setAsmPrinterFlags(nMI, curRes); + +} +MachineInstr * +AMDILInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const { +// TODO: Implement this function + return 0; +} +MachineInstr* +AMDILInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const { + // TODO: Implement this function + return 0; +} +bool +AMDILInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const +{ + // TODO: Implement this function + return false; +} +bool +AMDILInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool 
+AMDILInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDILInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool +AMDILInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, + int64_t &Offset2) const { + return false; // NOTE(review): unconditional return; everything below in this function is unreachable - confirm the analysis is meant to stay disabled + if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) { + return false; + } + const MachineSDNode *mload1 = dyn_cast<MachineSDNode>(Load1); + const MachineSDNode *mload2 = dyn_cast<MachineSDNode>(Load2); + if (!mload1 || !mload2) { + return false; + } + if (mload1->memoperands_empty() || + mload2->memoperands_empty()) { + return false; + } + MachineMemOperand *memOp1 = (*mload1->memoperands_begin()); + MachineMemOperand *memOp2 = (*mload2->memoperands_begin()); + const Value *mv1 = memOp1->getValue(); + const Value *mv2 = memOp2->getValue(); + if (!memOp1->isLoad() || !memOp2->isLoad()) { + return false; + } + if (getBasePointerValue(mv1) == getBasePointerValue(mv2)) { + if (isa<GetElementPtrInst>(mv1) && isa<GetElementPtrInst>(mv2)) { + const GetElementPtrInst *gep1 = dyn_cast<GetElementPtrInst>(mv1); + const GetElementPtrInst *gep2 = dyn_cast<GetElementPtrInst>(mv2); + if (!gep1 || !gep2) { + return false; + } + if (gep1->getNumOperands() != gep2->getNumOperands()) { + return false; + } + for (unsigned i = 0, e = gep1->getNumOperands() - 1; i < e; ++i) { + const Value *op1 = gep1->getOperand(i); + const Value *op2 = gep2->getOperand(i); + if (op1 != op2) { + // If any operand except the last one is different, return false. 
+ return false; + } + } + unsigned size = gep1->getNumOperands()-1; + if (!isa<ConstantInt>(gep1->getOperand(size)) + || !isa<ConstantInt>(gep2->getOperand(size))) { + return false; + } + Offset1 = dyn_cast<ConstantInt>(gep1->getOperand(size))->getSExtValue(); + Offset2 = dyn_cast<ConstantInt>(gep2->getOperand(size))->getSExtValue(); + return true; + } else if (isa<Argument>(mv1) && isa<Argument>(mv2)) { + return false; + } else if (isa<GlobalValue>(mv1) && isa<GlobalValue>(mv2)) { + return false; + } + } + return false; +} + +bool AMDILInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1 + && "Second offset should be larger than first offset!"); + // If we have fewer than 16 loads in a row, and the offsets are less than 16 + // apart, then schedule together. + // TODO: Make the loads schedule near if they fit in a cacheline + return (NumLoads < 16 && (Offset2 - Offset1) < 16); +} + +bool +AMDILInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDILInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDILInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} +#if 0 +bool AMDILInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { + // TODO: Implement this function +} + +bool AMDILInstrInfo::PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + // TODO: Implement this function +} +#endif +bool +AMDILInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) + const { + // TODO: Implement this function + return false; +} + +bool AMDILInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // TODO: 
Implement this function + return false; +} + +bool AMDILInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDILInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +unsigned AMDILInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const { + // TODO: Implement this function + return 0; +} + +#if 0 +unsigned +AMDILInstrInfo::GetFunctionSizeInBytes(const MachineFunction &MF) const { + // TODO: Implement this function + return 0; +} + +unsigned AMDILInstrInfo::getInlineAsmLength(const char *Str, + const MCAsmInfo &MAI) const { + // TODO: Implement this function + return 0; +} +#endif diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.h b/src/gallium/drivers/radeon/AMDILInstrInfo.h new file mode 100644 index 00000000000..88dd4e9441a --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILInstrInfo.h @@ -0,0 +1,175 @@ +//===- AMDILInstrInfo.h - AMDIL Instruction Information ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file contains the AMDIL implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDILINSTRUCTIONINFO_H_ +#define AMDILINSTRUCTIONINFO_H_ + +#include "AMDILRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#define GET_INSTRINFO_HEADER +#include "AMDILGenInstrInfo.inc" + +namespace llvm { + // AMDIL - This namespace holds all of the target specific flags that + // instruction info tracks. 
+ // + //class AMDILTargetMachine; +class AMDILInstrInfo : public AMDILGenInstrInfo { +private: + const AMDILRegisterInfo RI; + AMDILTargetMachine &TM; + bool getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const; + unsigned int getBranchInstr(const MachineOperand &op) const; +public: + explicit AMDILInstrInfo(AMDILTargetMachine &tm); + + // getRegisterInfo - Instruction info is a superset of register info. As + // such, whenever a client has an instance of instruction info, it should + // always be able to get register info as well (through this method). + const AMDILRegisterInfo &getRegisterInfo() const; + + // Return true if the instruction is a register to register move and leave the + // source and dest operands in the passed parameters. + bool isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg, + unsigned int &DstReg, unsigned int &SrcSubIdx, + unsigned int &DstSubIdx) const; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + +#if 0 + void reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, unsigned SubIdx, + const MachineInstr *Orig, + const TargetRegisterInfo *TRI) const; + MachineInstr *duplicate(MachineInstr *Orig, + MachineFunction &MF) const; +#endif + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, 
+ LiveVariables *LV) const; +#if 0 + MachineInstr *commuteInstruction(MachineInstr *MI, + bool NewMI = false) const; + bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, + unsigned &SrcOpIdx2) const; + bool produceSameValue(const MachineInstr *MI0, + const MachineInstr *MI1) const; + +#endif + + bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const; + + unsigned RemoveBranch(MachineBasicBlock &MBB) const; + + unsigned + InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, + MachineBasicBlock *FBB, + const SmallVectorImpl<MachineOperand> &Cond, + DebugLoc DL) const; + + bool copyRegToReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned DestReg, unsigned SrcReg, + const TargetRegisterClass *DestRC, + const TargetRegisterClass *SrcRC, + DebugLoc DL) const; + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const; +public: + bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const; + bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool 
UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode *> &NewNodes) const; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = 0) const; + bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, + int64_t &Offset1, int64_t &Offset2) const; + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + bool isPredicated(const MachineInstr *MI) const; +#if 0 + bool isUnpredicatedTerminator(const MachineInstr *MI) const; + bool PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const; +#endif + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + bool isPredicable(MachineInstr *MI) const; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const; +#if 0 + unsigned GetFunctionSizeInBytes(const MachineFunction &MF) const; + unsigned getInlineAsmLength(const char *Str, + const MCAsmInfo &MAI) const; +#endif + }; + +} // end namespace llvm + +#endif // AMDILINSTRUCTIONINFO_H_ diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.td b/src/gallium/drivers/radeon/AMDILInstrInfo.td new file mode 100644 index 00000000000..7086e53a0c0 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILInstrInfo.td @@ -0,0 +1,115 @@ +//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +// This file describes the AMDIL instructions in TableGen format. +// +//===----------------------------------------------------------------------===// +// AMDIL Instruction Predicate Definitions +// Predicate that is set to true if the hardware supports double precision +// divide +def HasHWDDiv : Predicate<"Subtarget.device()" + "->getGeneration() > AMDILDeviceInfo::HD4XXX && " + "Subtarget.device()->usesHardware(AMDILDeviceInfo::DoubleOps)">; + +// Predicate that is set to true if the hardware supports double, but not double +// precision divide in hardware +def HasSWDDiv : Predicate<"Subtarget.device()" + "->getGeneration() == AMDILDeviceInfo::HD4XXX &&" + "Subtarget.device()->usesHardware(AMDILDeviceInfo::DoubleOps)">; + +// Predicate that is set to true if the hardware supports 24bit signed +// math ops. Otherwise a software expansion to 32bit math ops is used instead. +def HasHWSign24Bit : Predicate<"Subtarget.device()" + "->getGeneration() > AMDILDeviceInfo::HD5XXX">; + +// Predicates for whether 64bit operations use hardware or software support +def HasHW64Bit : Predicate<"Subtarget.device()" + "->usesHardware(AMDILDeviceInfo::LongOps)">; +def HasSW64Bit : Predicate<"Subtarget.device()" + "->usesSoftware(AMDILDeviceInfo::LongOps)">; + +// Predicate that is set to true if the timer register is supported +def HasTmrRegister : Predicate<"Subtarget.device()" + "->isSupported(AMDILDeviceInfo::TmrReg)">; +// Predicate that is true if we are at least evergreen series +def HasDeviceIDInst : Predicate<"Subtarget.device()" + "->getGeneration() >= AMDILDeviceInfo::HD5XXX">; + +// Predicate that is true if we have region address space. +def hasRegionAS : Predicate<"Subtarget.device()" + "->usesHardware(AMDILDeviceInfo::RegionMem)">; + +// Predicate that is true if we don't have region address space. 
+def noRegionAS : Predicate<"!Subtarget.device()" + "->isSupported(AMDILDeviceInfo::RegionMem)">; + + +// Predicate that is set to true if 64bit Mul is supported in the IL or not +def HasHW64Mul : Predicate<"Subtarget.calVersion()" + ">= CAL_VERSION_SC_139" + "&& Subtarget.device()" + "->getGeneration() >=" + "AMDILDeviceInfo::HD5XXX">; +def HasSW64Mul : Predicate<"Subtarget.calVersion()" + "< CAL_VERSION_SC_139">; +// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not +def HasHW64DivMod : Predicate<"Subtarget.device()" + "->usesHardware(AMDILDeviceInfo::HW64BitDivMod)">; +def HasSW64DivMod : Predicate<"Subtarget.device()" + "->usesSoftware(AMDILDeviceInfo::HW64BitDivMod)">; + +// Predicate that is set to true if 64bit pointers are used. +def Has64BitPtr : Predicate<"Subtarget.is64bit()">; +def Has32BitPtr : Predicate<"!Subtarget.is64bit()">; +//===--------------------------------------------------------------------===// +// Custom Operands +//===--------------------------------------------------------------------===// +include "AMDILOperands.td" + +//===--------------------------------------------------------------------===// +// Custom Selection DAG Type Profiles +//===--------------------------------------------------------------------===// +include "AMDILProfiles.td" + +//===--------------------------------------------------------------------===// +// Custom Selection DAG Nodes +//===--------------------------------------------------------------------===// +include "AMDILNodes.td" + +//===--------------------------------------------------------------------===// +// Custom Pattern DAG Nodes +//===--------------------------------------------------------------------===// +include "AMDILPatterns.td" + +//===----------------------------------------------------------------------===// +// Instruction format classes +//===----------------------------------------------------------------------===// +include "AMDILFormats.td" + 
+//===--------------------------------------------------------------------===// +// Multiclass Instruction formats +//===--------------------------------------------------------------------===// +include "AMDILMultiClass.td" + +//===--------------------------------------------------------------------===// +// Intrinsics support +//===--------------------------------------------------------------------===// +include "AMDILIntrinsics.td" + +//===--------------------------------------------------------------------===// +// Instructions support +//===--------------------------------------------------------------------===// +include "AMDILInstructions.td" + +//===--------------------------------------------------------------------===// +// Instruction Pattern support - This Must be the last include in the file +// as it requires items defined in other files +//===--------------------------------------------------------------------===// +include "AMDILInstrPatterns.td" + diff --git a/src/gallium/drivers/radeon/AMDILInstrPatterns.td b/src/gallium/drivers/radeon/AMDILInstrPatterns.td new file mode 100644 index 00000000000..51181b2a5dc --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILInstrPatterns.td @@ -0,0 +1,66 @@ +//===- AMDILInstrPatterns.td - AMDIL Target ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +//===--------------------------------------------------------------------===// +// This file holds all the custom patterns that are used by the amdil backend +// +//===--------------------------------------------------------------------===// +//===--------------------------------------------------------------------===// +// Custom patterns for conversion operations +//===--------------------------------------------------------------------===//// +// Pattern to remap integer or to IL_or +def : Pat<(i32 (or GPRI32:$src0, GPRI32:$src1)), + (i32 (BINARY_OR_i32 GPRI32:$src0, GPRI32:$src1))>; +// float ==> long patterns +// unsigned: f32 -> i64 +def FTOUL : Pat<(i64 (fp_to_uint GPRF32:$src)), + (LCREATE (FTOU GPRF32:$src), (LOADCONST_i32 0))>; +// signed: f32 -> i64 +def FTOL : Pat<(i64 (fp_to_sint GPRF32:$src)), + (LCREATE (FTOI GPRF32:$src), (LOADCONST_i32 0))>; +// unsigned: i64 -> f32 +def ULTOF : Pat<(f32 (uint_to_fp GPRI64:$src)), + (UTOF (LLO GPRI64:$src))>; +// signed: i64 -> f32 +def LTOF : Pat<(f32 (sint_to_fp GPRI64:$src)), + (ITOF (LLO GPRI64:$src))>; + +// integer subtraction +// a - b ==> a + (-b) +def SUB_i8 : Pat<(sub GPRI8:$src0, GPRI8:$src1), + (ADD_i8 GPRI8:$src0, (NEGATE_i8 GPRI8:$src1))>; +def SUB_v2i8 : Pat<(sub GPRV2I8:$src0, GPRV2I8:$src1), + (ADD_v2i8 GPRV2I8:$src0, (NEGATE_v2i8 GPRV2I8:$src1))>; +def SUB_v4i8 : Pat<(sub GPRV4I8:$src0, GPRV4I8:$src1), + (ADD_v4i8 GPRV4I8:$src0, (NEGATE_v4i8 GPRV4I8:$src1))>; +def SUB_i16 : Pat<(sub GPRI16:$src0, GPRI16:$src1), + (ADD_i16 GPRI16:$src0, (NEGATE_i16 GPRI16:$src1))>; +def SUB_v2i16 : Pat<(sub GPRV2I16:$src0, GPRV2I16:$src1), + (ADD_v2i16 GPRV2I16:$src0, (NEGATE_v2i16 GPRV2I16:$src1))>; +def SUB_v4i16 : Pat<(sub GPRV4I16:$src0, GPRV4I16:$src1), + (ADD_v4i16 GPRV4I16:$src0, (NEGATE_v4i16 GPRV4I16:$src1))>; +def SUB_i32 : Pat<(sub GPRI32:$src0, GPRI32:$src1), + (ADD_i32 GPRI32:$src0, (NEGATE_i32 GPRI32:$src1))>; +def 
SUB_v2i32 : Pat<(sub GPRV2I32:$src0, GPRV2I32:$src1), + (ADD_v2i32 GPRV2I32:$src0, (NEGATE_v2i32 GPRV2I32:$src1))>; +def SUB_v4i32 : Pat<(sub GPRV4I32:$src0, GPRV4I32:$src1), + (ADD_v4i32 GPRV4I32:$src0, (NEGATE_v4i32 GPRV4I32:$src1))>; +// LLVM isn't lowering this correctly, so writing a pattern that +// matches it instead. +def : Pat<(build_vector (i32 imm:$src)), + (VCREATE_v4i32 (LOADCONST_i32 imm:$src))>; + +// Calls: +def : Pat<(IL_call tglobaladdr:$dst), + (CALL tglobaladdr:$dst)>; +def : Pat<(IL_call texternalsym:$dst), + (CALL texternalsym:$dst)>; +def : Pat<(IL_call tconstpool:$dst), + (CALL tconstpool:$dst)>; + +include "AMDILConversions.td" diff --git a/src/gallium/drivers/radeon/AMDILInstructions.td b/src/gallium/drivers/radeon/AMDILInstructions.td new file mode 100644 index 00000000000..f824a67d7ad --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILInstructions.td @@ -0,0 +1,2436 @@ +//===-- AMDILInstructions.td - AMDIL instruction definitions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +// Operations in this file are generic to all data types +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { + defm LOADCONST : ILConstant<"mov $dst, $val">; + defm MOVE : UnaryOpMC<IL_OP_MOV, IL_mov>; + defm PHIMOVE : UnaryOpMC<IL_OP_MOV, IL_phimov>; +} +defm BINARY_NOT : UnaryOpMC<IL_OP_I_NOT, IL_not>; +defm BINARY_OR : BinaryOpMC<IL_OP_I_OR, IL_or>; +defm BINARY_AND : BinaryOpMC<IL_OP_AND, IL_and>; +defm BINARY_XOR : BinaryOpMC<IL_OP_I_XOR, IL_xor>; +defm AND : BinaryOpMCInt<IL_OP_AND, and>; +defm CMOV : BinaryOpMC<IL_OP_CMOV, IL_cmov>; +defm DIV_INF : BinaryOpMC<IL_OP_DIV_INF, IL_div_inf>; +defm SMAX : BinaryOpMCInt<IL_OP_I_MAX, IL_smax>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder for 64bit +// instructions +defm CMOVLOG : TernaryOpMC<IL_OP_CMOV_LOGICAL, IL_cmov_logical>; +// This opcode has a custom swizzle pattern in the Swizzle Encoder and +// should never be selected in ISel. It should only be generated in the +// I/O expansion code. These are different from the CMOVLOG instruction +// in that the src0 argument uses a custom swizzle for the Y/Z/W +// vector channel respectively instead of the default channel. 
+def CMOVLOG_Y_i32 : ThreeInOneOut<IL_OP_CMOV_LOGICAL, (outs GPRI32:$dst), + (ins GPRI32:$src0, GPRI32:$src1, GPRI32:$src2), + !strconcat(IL_OP_CMOV_LOGICAL.Text, " $dst, $src0, $src1, $src2"), + []>; +def CMOVLOG_Z_i32 : ThreeInOneOut<IL_OP_CMOV_LOGICAL, (outs GPRI32:$dst), + (ins GPRI32:$src0, GPRI32:$src1, GPRI32:$src2), + !strconcat(IL_OP_CMOV_LOGICAL.Text, " $dst, $src0, $src1, $src2"), + []>; +def CMOVLOG_W_i32 : ThreeInOneOut<IL_OP_CMOV_LOGICAL, (outs GPRI32:$dst), + (ins GPRI32:$src0, GPRI32:$src1 ,GPRI32:$src2), + !strconcat(IL_OP_CMOV_LOGICAL.Text, " $dst, $src0, $src1, $src2"), + []>; +defm SELECTBIN : TernaryOpMCScalar<IL_OP_CMOV_LOGICAL, select>; +//===---------------------------------------------------------------------===// +// Signed 8bit integer math instructions start here +//===---------------------------------------------------------------------===// +def INTTOANY_i8 : OneInOneOut<IL_OP_MOV, (outs GPRI8:$dst), (ins GPRI32:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRI8:$dst, (IL_inttoany GPRI32:$src0))]>; +//===---------------------------------------------------------------------===// +// Signed 16bit integer math instructions start here +//===---------------------------------------------------------------------===// +def INTTOANY_i16: OneInOneOut<IL_OP_MOV, (outs GPRI16:$dst), (ins GPRI32:$src0), + !strconcat(IL_OP_MOV.Text," $dst, $src0"), + [(set GPRI16:$dst, (IL_inttoany GPRI32:$src0))]>; +//===---------------------------------------------------------------------===// +// Signed 32bit integer math instructions start here +//===---------------------------------------------------------------------===// +defm NEGATE : UnaryOpMCi32<IL_OP_I_NEGATE, IL_inegate>; +defm SMUL : BinaryOpMCi32<IL_OP_I_MUL, mul>; +defm SMULHI : BinaryOpMCi32<IL_OP_I_MUL_HIGH, mulhs>; +defm SHL : BinaryOpMCi32Const<IL_OP_I_SHL, shl>; +defm SHR : BinaryOpMCi32Const<IL_OP_I_SHR, sra>; +defm SHLVEC : BinaryOpMCi32<IL_OP_I_SHL, shl>; +defm SHRVEC : 
BinaryOpMCi32<IL_OP_I_SHR, sra>; +defm ADD : BinaryOpMCi32<IL_OP_I_ADD, add>; +defm CUSTOM_XOR : BinaryOpMCInt<IL_OP_I_XOR, xor>; +// get rid of the addri via the tablegen instead of custom lowered instruction +defm CUSTOM_ADD : BinaryOpMCi32<IL_OP_I_ADD, IL_add>; +defm EADD : BinaryOpMCi32<IL_OP_I_ADD, adde>; +def INTTOANY_i32: OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), (ins GPRI32:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRI32:$dst, (IL_inttoany GPRI32:$src0))]>; +// Integer offsets for addressing +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def ADDir : TwoInOneOut<IL_OP_I_ADD, (outs GPRI32:$dst), + (ins MEMI32:$ptr, GPRI32:$offset), + !strconcat(IL_OP_I_ADD.Text, " $dst, $ptr, $offset"), + [(set GPRI32:$dst, + (IL_addaddrri ADDR:$ptr, + (i32 GPRI32:$offset)))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def ADDri : TwoInOneOut<IL_OP_I_ADD, (outs GPRI32:$dst), + (ins GPRI32:$offset, MEMI32:$ptr), + !strconcat(IL_OP_I_ADD.Text, " $dst, $offset, $ptr"), + [(set GPRI32:$dst, + (IL_addaddrir + (i32 GPRI32:$offset), ADDR:$ptr))]>; + +defm IFFB_HI : UnaryOpMCi32<IL_OP_I_FFB_HI, IL_ffb_hi>; +defm IFFB_LO : UnaryOpMCi32<IL_OP_I_FFB_LO, IL_ffb_lo>; +let mayLoad = 0, mayStore = 0 in { +defm ABS : UnaryIntrinsicInt<IL_OP_ABS, int_AMDIL_abs>; +defm BITCOUNT : UnaryIntrinsicInt<IL_OP_IBIT_COUNT, int_AMDIL_bit_count_i32>; +defm FFB_LO : UnaryIntrinsicInt<IL_OP_I_FFB_LO, int_AMDIL_bit_find_first_lo>; +defm FFB_HI : UnaryIntrinsicInt<IL_OP_I_FFB_HI, int_AMDIL_bit_find_first_hi>; +defm FFB_SGN : UnaryIntrinsicInt<IL_OP_I_FFB_SGN, + int_AMDIL_bit_find_first_sgn>; +defm IMULHI : BinaryIntrinsicInt<IL_OP_I_MUL_HIGH, int_AMDIL_mulhi_i32>; +let Predicates = [HasHWSign24Bit] in { +defm IMUL24 : BinaryIntrinsicInt<IL_OP_I_MUL24, int_AMDIL_mul24_i32>; +defm IMULHI24 : BinaryIntrinsicInt<IL_OP_I_MULHI24, int_AMDIL_mulhi24_i32>; +defm IMAD24 : TernaryIntrinsicInt<IL_OP_I_MAD24, int_AMDIL_mad24_i32>; +} +defm CARRY 
: BinaryIntrinsicInt<IL_OP_I_CARRY, int_AMDIL_carry_i32>; +defm BORROW : BinaryIntrinsicInt<IL_OP_I_BORROW, int_AMDIL_borrow_i32>; +defm IMIN : BinaryIntrinsicInt<IL_OP_I_MIN, int_AMDIL_min_i32>; +defm IMAX : BinaryIntrinsicInt<IL_OP_I_MAX, int_AMDIL_max_i32>; +defm CMOV_LOG : TernaryIntrinsicInt<IL_OP_CMOV_LOGICAL, + int_AMDIL_cmov_logical>; +defm IBIT_EXTRACT : TernaryIntrinsicInt<IL_OP_IBIT_EXTRACT, + int_AMDIL_bit_extract_i32>; +defm IMAD : TernaryIntrinsicInt<IL_OP_I_MAD, int_AMDIL_mad_i32>; +defm SAD : TernaryIntrinsicInt<IL_OP_SAD, int_AMDIL_media_sad>; +defm SADHI : TernaryIntrinsicInt<IL_OP_SAD_HI, + int_AMDIL_media_sad_hi>; +} +def SAD4_i32 : ThreeInOneOut<IL_OP_SAD4, (outs GPRI32:$dst), + (ins GPRV4I32:$src, GPRV4I32:$src1, GPRI32:$src2), + !strconcat(IL_OP_SAD4.Text, " $dst, $src, $src1, $src2"), + [(set GPRI32:$dst, + (int_AMDIL_media_sad4 GPRV4I32:$src, GPRV4I32:$src1, + GPRI32:$src2))]>; +def FTOV4U8_i32 : OneInOneOut<IL_OP_F2U4, (outs GPRI32:$dst), + (ins GPRV4F32:$src), + !strconcat(IL_OP_F2U4.Text, " $dst, $src"), + [(set GPRI32:$dst, + (int_AMDIL_media_convert_f2v4u8 GPRV4F32:$src))]>; +//===---------------------------------------------------------------------===// +// Unsigned 32bit integer math instructions start here +//===---------------------------------------------------------------------===// +defm UMUL : BinaryOpMCi32<IL_OP_U_MUL, IL_umul>; +defm UMULHI : BinaryOpMCi32<IL_OP_U_MUL_HIGH, mulhu>; +defm USHR : BinaryOpMCi32Const<IL_OP_U_SHR, srl>; +defm USHRVEC : BinaryOpMCi32<IL_OP_U_SHR, srl>; +defm UDIV : BinaryOpMCi32<IL_OP_U_DIV, udiv>; +defm NATIVE_UDIV : BinaryIntrinsicInt<IL_OP_U_DIV, int_AMDIL_udiv>; +let mayLoad=0, mayStore=0 in { +defm UBIT_REVERSE : UnaryIntrinsicInt<IL_OP_UBIT_REVERSE, + int_AMDIL_bit_reverse_u32>; +defm UMULHI_INT : BinaryIntrinsicInt<IL_OP_U_MUL_HIGH, int_AMDIL_mulhi_u32>; +defm UMULHI24 : BinaryIntrinsicInt<IL_OP_U_MULHI24, int_AMDIL_mulhi24_u32>; +defm UMUL24 : BinaryIntrinsicInt<IL_OP_U_MUL24, 
int_AMDIL_mul24_u32>; +defm UMIN : BinaryIntrinsicInt<IL_OP_U_MIN, int_AMDIL_min_u32>; +defm UMAX : BinaryIntrinsicInt<IL_OP_U_MAX, int_AMDIL_max_u32>; +defm UBIT_EXTRACT : TernaryIntrinsicInt<IL_OP_UBIT_EXTRACT, + int_AMDIL_bit_extract_u32>; +defm UBIT_INSERT : QuaternaryIntrinsicInt<IL_OP_UBIT_INSERT, + int_AMDIL_bit_insert_u32>; +defm BFI : TernaryIntrinsicInt<IL_OP_BFI, int_AMDIL_bfi>; +defm BFM : BinaryIntrinsicInt<IL_OP_BFM, int_AMDIL_bfm>; +defm UMAD : TernaryIntrinsicInt<IL_OP_U_MAD, int_AMDIL_mad_u32>; +defm UMAD24 : TernaryIntrinsicInt<IL_OP_U_MAD24, int_AMDIL_mad24_u32>; +defm U4LERP : TernaryIntrinsicInt<IL_OP_U4_LERP, + int_AMDIL_media_lerp_u4>; +defm BITALIGN : TernaryIntrinsicInt<IL_OP_BIT_ALIGN, int_AMDIL_media_bitalign>; +defm BYTEALIGN : TernaryIntrinsicInt<IL_OP_BYTE_ALIGN, int_AMDIL_media_bytealign>; +} +//===---------------------------------------------------------------------===// +// Signed 64bit integer math instructions start here +//===---------------------------------------------------------------------===// +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LNEGATE : OneInOneOut<IL_OP_MOV, (outs GPRI64:$dst), (ins GPRI64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRI64:$dst, (IL_inegate GPRI64:$src))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LNEGATE_v2i64: OneInOneOut<IL_OP_MOV, (outs GPRV2I64:$dst), + (ins GPRV2I64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRV2I64:$dst, (IL_inegate GPRV2I64:$src))]>; +let Predicates = [HasHW64Bit] in { +def LADD : TwoInOneOut<IL_OP_I64_ADD, (outs GPRI64:$dst), + (ins GPRI64:$src1, GPRI64:$src2), + !strconcat(IL_OP_I64_ADD.Text, " $dst, $src1, $src2"), + [(set GPRI64:$dst, (IL_add GPRI64:$src1, GPRI64:$src2))]>; +defm IMIN64 : BinaryIntrinsicLong<IL_OP_I64_MIN, int_AMDIL_min_i32>; +defm UMIN64 : BinaryIntrinsicLong<IL_OP_U64_MIN, int_AMDIL_min_u32>; +defm IMAX64 : BinaryIntrinsicLong<IL_OP_I64_MAX, 
int_AMDIL_max_i32>; +defm UMAX64 : BinaryIntrinsicLong<IL_OP_U64_MAX, int_AMDIL_max_u32>; +} +let Predicates = [HasHW64Bit] in { +def LSHR : TwoInOneOut<IL_OP_I64_SHR, (outs GPRI64:$dst), + (ins GPRI64:$src1, GPRI32:$src2), + !strconcat(IL_OP_I64_SHR.Text, " $dst, $src1, $src2"), + [(set GPRI64:$dst, (sra GPRI64:$src1, GPRI32:$src2))]>; +def LSHL : TwoInOneOut<IL_OP_I64_SHL, (outs GPRI64:$dst), + (ins GPRI64:$src1, GPRI32:$src2), + !strconcat(IL_OP_I64_SHL.Text, " $dst, $src1, $src2"), + [(set GPRI64:$dst, (shl GPRI64:$src1, GPRI32:$src2))]>; +} + + +//===---------------------------------------------------------------------===// +// Unsigned 64bit integer math instructions start here +//===---------------------------------------------------------------------===// +let Predicates = [HasTmrRegister] in { + def Tmr : ILFormat<IL_OP_MOV, (outs GPRI64:$tmr), + (ins), !strconcat(IL_OP_MOV.Text, " $tmr, Tmr"), + [(set GPRI64:$tmr, (int_AMDIL_get_cycle_count))]>; +} +let Predicates = [HasDeviceIDInst] in { +def CU_ID : ILFormat<IL_OP_CU_ID, (outs GPRI32:$id), (ins), + !strconcat(IL_OP_CU_ID.Text, " $id"), + [(set GPRI32:$id, (int_AMDIL_compute_unit_id))]>; +def WAVE_ID : ILFormat<IL_OP_WAVE_ID, (outs GPRI32:$id), (ins), + !strconcat(IL_OP_WAVE_ID.Text, " $id"), + [(set GPRI32:$id, (int_AMDIL_wavefront_id))]>; +} +let Predicates = [HasHW64Bit] in { +def LUSHR : TwoInOneOut<IL_OP_U64_SHR, (outs GPRI64:$dst), + (ins GPRI64:$src1, GPRI32:$src2), + !strconcat(IL_OP_U64_SHR.Text, " $dst, $src1, $src2"), + [(set GPRI64:$dst, (srl GPRI64:$src1, GPRI32:$src2))]>; +} + + +//===---------------------------------------------------------------------===// +// Generic Float Instructions +//===---------------------------------------------------------------------===// +let hasIEEEFlag = 1 in { +defm MUL_IEEE : BinaryOpMCFloat<IL_OP_MUL_IEEE, IL_OP_D_MUL, fmul>; +} +defm ADD : BinaryOpMCFloat<IL_OP_ADD, IL_OP_D_ADD, fadd>; 
+//===---------------------------------------------------------------------===// +// float math instructions start here +//===---------------------------------------------------------------------===// +let mayLoad=0, mayStore=0 in { +defm ABS : UnaryIntrinsicFloat<IL_OP_ABS, int_AMDIL_fabs>; +defm FRAC : UnaryIntrinsicFloat<IL_OP_FRC, int_AMDIL_fraction>; +defm PIREDUCE : UnaryIntrinsicFloat<IL_OP_PI_REDUCE, int_AMDIL_pireduce>; +defm ROUND_NEAREST : UnaryIntrinsicFloat<IL_OP_ROUND_NEAR, + int_AMDIL_round_nearest>; +defm ROUND_NEGINF : UnaryIntrinsicFloat<IL_OP_ROUND_NEG_INF, + int_AMDIL_round_neginf>; +defm ROUND_POSINF : UnaryIntrinsicFloat<IL_OP_ROUND_POS_INF, + int_AMDIL_round_posinf>; +defm ROUND_ZERO : UnaryIntrinsicFloat<IL_OP_ROUND_ZERO, + int_AMDIL_round_zero>; +defm ACOS : UnaryIntrinsicFloatScalar<IL_OP_ACOS, int_AMDIL_acos>; +defm ATAN : UnaryIntrinsicFloatScalar<IL_OP_ATAN, int_AMDIL_atan>; +defm ASIN : UnaryIntrinsicFloatScalar<IL_OP_ASIN, int_AMDIL_asin>; +defm TAN : UnaryIntrinsicFloatScalar<IL_OP_TAN, int_AMDIL_tan>; +defm SIN : UnaryIntrinsicFloatScalar<IL_OP_SIN, int_AMDIL_sin>; +defm COS : UnaryIntrinsicFloatScalar<IL_OP_COS, int_AMDIL_cos>; +defm SQRT : UnaryIntrinsicFloatScalar<IL_OP_SQRT, int_AMDIL_sqrt>; +defm EXP : UnaryIntrinsicFloatScalar<IL_OP_EXP, int_AMDIL_exp>; +defm EXPVEC : UnaryIntrinsicFloat<IL_OP_EXP_VEC, int_AMDIL_exp_vec>; +defm SQRTVEC : UnaryIntrinsicFloat<IL_OP_SQRT_VEC, int_AMDIL_sqrt_vec>; +defm COSVEC : UnaryIntrinsicFloat<IL_OP_COS_VEC, int_AMDIL_cos_vec>; +defm SINVEC : UnaryIntrinsicFloat<IL_OP_SIN_VEC, int_AMDIL_sin_vec>; +defm LOGVEC : UnaryIntrinsicFloat<IL_OP_LOG_VEC, int_AMDIL_log_vec>; +defm RSQVEC : UnaryIntrinsicFloat<IL_OP_RSQ_VEC, int_AMDIL_rsq_vec>; +defm EXN : UnaryIntrinsicFloatScalar<IL_OP_EXN, int_AMDIL_exn>; +defm SIGN : UnaryIntrinsicFloat<IL_OP_SGN, int_AMDIL_sign>; +defm LENGTH : UnaryIntrinsicFloat<IL_OP_LEN, int_AMDIL_length>; +defm POW : BinaryIntrinsicFloat<IL_OP_POW, int_AMDIL_pow>; +} + +let 
hasIEEEFlag = 1 in { + let mayLoad = 0, mayStore=0 in { +defm MIN : BinaryIntrinsicFloat<IL_OP_MIN, int_AMDIL_min>; +defm MAX : BinaryIntrinsicFloat<IL_OP_MAX, int_AMDIL_max>; +defm MAD : TernaryIntrinsicFloat<IL_OP_MAD, int_AMDIL_mad>; + } +defm MOD : BinaryOpMCf32<IL_OP_MOD, frem>; +} +let hasZeroOpFlag = 1 in { + let mayLoad = 0, mayStore=0 in { +defm LN : UnaryIntrinsicFloatScalar<IL_OP_LN, int_AMDIL_ln>; +defm LOG : UnaryIntrinsicFloatScalar<IL_OP_LOG, int_AMDIL_log>; +defm RSQ : UnaryIntrinsicFloatScalar<IL_OP_RSQ, int_AMDIL_rsq>; +defm DIV : BinaryIntrinsicFloat<IL_OP_DIV, int_AMDIL_div>; + } +} + let mayLoad = 0, mayStore=0 in { +defm CLAMP : TernaryIntrinsicFloat<IL_OP_CLAMP, int_AMDIL_clamp>; +defm FMA : TernaryIntrinsicFloat<IL_OP_FMA, int_AMDIL_fma>; +defm LERP : TernaryIntrinsicFloat<IL_OP_LERP, int_AMDIL_lerp>; + } +defm SUB : BinaryOpMCf32<IL_OP_SUB, fsub>; +defm FABS : UnaryOpMCf32<IL_OP_ABS, fabs>; +defm FMAD : TernaryOpMCf32<IL_OP_MAD, IL_mad>; +defm NEAR : UnaryOpMCf32<IL_OP_ROUND_NEAR, fnearbyint>; +defm RND_Z : UnaryOpMCf32<IL_OP_ROUND_ZERO, ftrunc>; + +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def NEG_f32 : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst), + (ins GPRF32:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRF32:$dst, (fneg GPRF32:$src0))]>; +def INTTOANY_f32 : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst), + (ins GPRI32:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRF32:$dst, (IL_inttoany GPRI32:$src0))]>; +let hasIEEEFlag = 1 in { +def DP2ADD_f32 : ThreeInOneOut<IL_OP_DP2_ADD, (outs GPRF32:$dst), + (ins GPRV2F32:$src0, GPRV2F32:$src1, GPRF32:$src2), + !strconcat(IL_OP_DP2_ADD.Text, " $dst, $src0, $src1, $src2"), + [(set GPRF32:$dst, + (int_AMDIL_dp2_add GPRV2F32:$src0, + GPRV2F32:$src1, GPRF32:$src2))]>; +def DP2_f32 : TwoInOneOut<IL_OP_DP2, (outs GPRF32:$dst), + (ins GPRV2F32:$src0, GPRV2F32:$src1), + !strconcat(IL_OP_DP2.Text, " $dst, $src0, $src1"), + [(set GPRF32:$dst, + 
(int_AMDIL_dp2 GPRV2F32:$src0, GPRV2F32:$src1))]>; +def DP3_f32 : TwoInOneOut<IL_OP_DP3, (outs GPRF32:$dst), + (ins GPRV4F32:$src0, GPRV4F32:$src1), + !strconcat(IL_OP_DP3.Text, " $dst, $src0, $src1"), + [(set GPRF32:$dst, + (int_AMDIL_dp3 GPRV4F32:$src0, GPRV4F32:$src1))]>; +def DP4_f32 : TwoInOneOut<IL_OP_DP4, (outs GPRF32:$dst), + (ins GPRV4F32:$src0, GPRV4F32:$src1), + !strconcat(IL_OP_DP4.Text, " $dst, $src0, $src1"), + [(set GPRF32:$dst, + (int_AMDIL_dp4 GPRV4F32:$src0, GPRV4F32:$src1))]>; +} +defm UNPACK_B0 : IntrConvertI32TOF32<IL_OP_UNPACK_0, int_AMDIL_media_unpack_byte_0>; +defm UNPACK_B1 : IntrConvertI32TOF32<IL_OP_UNPACK_1, int_AMDIL_media_unpack_byte_1>; +defm UNPACK_B2 : IntrConvertI32TOF32<IL_OP_UNPACK_2, int_AMDIL_media_unpack_byte_2>; +defm UNPACK_B3 : IntrConvertI32TOF32<IL_OP_UNPACK_3, int_AMDIL_media_unpack_byte_3>; +defm FTOI_FLR : IntrConvertF32TOI32<IL_OP_FTOI_FLR, int_AMDIL_convert_f32_i32_flr>; +defm FTOI_RPI : IntrConvertF32TOI32<IL_OP_FTOI_RPI, int_AMDIL_convert_f32_i32_rpi>; +defm HTOF : IntrConvertF16TOF32<IL_OP_F16_TO_F32, int_AMDIL_convert_f16_f32>; +defm FTOH : IntrConvertF32TOF16<IL_OP_F32_TO_F16, int_AMDIL_convert_f32_f16>; +defm FTOH_NEAR : IntrConvertF32TOF16<IL_OP_F32_TO_F16_NEAR, int_AMDIL_convert_f32_f16_near>; +defm FTOH_NEG_INF : IntrConvertF32TOF16<IL_OP_F32_TO_F16_NEG_INF, int_AMDIL_convert_f32_f16_neg_inf>; +defm FTOH_PLUS_INF : IntrConvertF32TOF16<IL_OP_F32_TO_F16_PLUS_INF, int_AMDIL_convert_f32_f16_plus_inf>; +//===---------------------------------------------------------------------===// +// float math instructions end here +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// float2 math instructions start here +//===---------------------------------------------------------------------===// +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def NEG_v2f32 : OneInOneOut<IL_OP_MOV, (outs 
GPRV2F32:$dst), + (ins GPRV2F32:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRV2F32:$dst, (fneg GPRV2F32:$src0))]>; +//===---------------------------------------------------------------------===// +// float2 math instructions end here +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// float4 math instructions start here +//===---------------------------------------------------------------------===// +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def NEG_v4f32 : OneInOneOut<IL_OP_MOV, (outs GPRV4F32:$dst), + (ins GPRV4F32:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRV4F32:$dst, (fneg GPRV4F32:$src0))]>; +//===---------------------------------------------------------------------===// +// float4 math instructions end here +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// double math instructions start here +//===---------------------------------------------------------------------===// +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def SUB_f64 : TwoInOneOut<IL_OP_D_ADD, (outs GPRF64:$dst), + (ins GPRF64:$src0, GPRF64:$src1), + !strconcat(IL_OP_D_ADD.Text, " $dst, $src0, $src1"), + [(set GPRF64:$dst, (fsub GPRF64:$src0, GPRF64:$src1))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def SUB_v2f64 : TwoInOneOut<IL_OP_D_ADD, (outs GPRV2F64:$dst), + (ins GPRV2F64:$src0, GPRV2F64:$src1), + !strconcat(IL_OP_D_ADD.Text, " $dst, $src0, $src1"), + [(set GPRV2F64:$dst, (fsub GPRV2F64:$src0, GPRV2F64:$src1))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def NEG_f64 : OneInOneOut<IL_OP_MOV, (outs GPRF64:$dst), + (ins GPRF64:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRF64:$dst, (fneg 
GPRF64:$src0))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def NEG_v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2F64:$dst), + (ins GPRV2F64:$src0), + !strconcat(IL_OP_MOV.Text, " $dst, $src0"), + [(set GPRV2F64:$dst, (fneg GPRV2F64:$src0))]>; + let mayLoad = 0, mayStore=0 in { +defm MIN : BinaryIntrinsicDouble<IL_OP_D_MIN, int_AMDIL_min>; +defm MAX : BinaryIntrinsicDouble<IL_OP_D_MAX, int_AMDIL_max>; +defm DIV : BinaryIntrinsicDouble<IL_OP_D_DIV, int_AMDIL_div>; +defm MAD : TernaryIntrinsicDouble<IL_OP_D_MAD, int_AMDIL_mad>; +defm DFMA : TernaryIntrinsicDouble<IL_OP_D_MAD, int_AMDIL_fma>; +defm FRAC : UnaryIntrinsicDouble<IL_OP_D_FRC, int_AMDIL_fraction>; +defm SQRT : UnaryIntrinsicDouble<IL_OP_D_SQRT, int_AMDIL_sqrt>; +defm RSQ : UnaryIntrinsicDoubleScalar<IL_OP_D_RSQ, int_AMDIL_rsq>; +defm RCP : UnaryIntrinsicDoubleScalar<IL_OP_D_RCP, int_AMDIL_drcp>; +defm DMAD : TernaryOpMCf64<IL_OP_D_MAD, IL_mad>; + } +def FREXP_f64 : OneInOneOut<IL_OP_D_FREXP, (outs GPRV2I64:$dst), + (ins GPRF64:$src), + !strconcat(IL_OP_D_FREXP.Text," $dst, $src"), + [(set GPRV2I64:$dst, + (int_AMDIL_frexp_f64 GPRF64:$src))]>; +def LDEXP_f64 : TwoInOneOut<IL_OP_D_LDEXP, (outs GPRF64:$dst), + (ins GPRF64:$src, GPRI32:$src1), + !strconcat(IL_OP_D_LDEXP.Text, " $dst, $src, $src1"), + [(set GPRF64:$dst, + (int_AMDIL_ldexp GPRF64:$src, GPRI32:$src1))]>; +def LDEXP_v2f64 : TwoInOneOut<IL_OP_D_LDEXP, (outs GPRV2F64:$dst), + (ins GPRV2F64:$src, GPRV2I32:$src1), + !strconcat(IL_OP_D_LDEXP.Text, " $dst, $src, $src1"), + [(set GPRV2F64:$dst, + (int_AMDIL_ldexp GPRV2F64:$src, GPRV2I32:$src1))]>; +//===---------------------------------------------------------------------===// +// double math instructions end here +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +// Various Macros +//===---------------------------------------------------------------------===// +def 
MACRO__sdiv_i8 : BinaryMacro< GPRI8, GPRI8, GPRI8, sdiv>; +def MACRO__sdiv_i16 : BinaryMacro<GPRI16, GPRI16, GPRI16, sdiv>; +def MACRO__sdiv_i32 : BinaryMacro<GPRI32, GPRI32, GPRI32, sdiv>; +def MACRO__udiv_i8 : BinaryMacro< GPRI8, GPRI8, GPRI8, udiv>; +def MACRO__udiv_i16 : BinaryMacro<GPRI16, GPRI16, GPRI16, udiv>; +def MACRO__udiv_i32 : BinaryMacro<GPRI32, GPRI32, GPRI32, udiv>; +def MACRO__smod_i8 : BinaryMacro< GPRI8, GPRI8, GPRI8, srem>; +def MACRO__smod_i16 : BinaryMacro<GPRI16, GPRI16, GPRI16, srem>; +def MACRO__smod_i32 : BinaryMacro<GPRI32, GPRI32, GPRI32, srem>; +def MACRO__umod_i8 : BinaryMacro< GPRI8, GPRI8, GPRI8, urem>; +def MACRO__umod_i16 : BinaryMacro<GPRI16, GPRI16, GPRI16, urem>; +def MACRO__umod_i32 : BinaryMacro<GPRI32, GPRI32, GPRI32, urem>; +let Predicates = [HasSWDDiv] in { + def MACRO__ddiv_f64: BinaryMacro<GPRF64, GPRF64, GPRF64, fdiv>; +} +let Predicates = [HasHWDDiv] in { + def MACRO__ddiv_f64_fma: BinaryMacro<GPRF64, GPRF64, GPRF64, fdiv>; +} +def MACRO__ftol_i64 : UnaryMacro<GPRI64, GPRF32, fp_to_sint>; +def MACRO__ftoul_i64 : UnaryMacro<GPRI64, GPRF32, fp_to_uint>; +def MACRO__ultof_f32 : UnaryMacro<GPRF32, GPRI64, uint_to_fp>; +def MACRO__ltof_f32 : UnaryMacro<GPRF32, GPRI64, sint_to_fp>; +let Predicates = [HasSW64Mul] in { +def MACRO__mul_i64 : BinaryMacro<GPRI64, GPRI64, GPRI64, mul>; +def MACRO__mul_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I64, mul>; +} +let Predicates = [HasSW64DivMod] in { +def MACRO__sdiv_i64 : BinaryMacro<GPRI64, GPRI64, GPRI64, sdiv>; +def MACRO__udiv_i64 : BinaryMacro<GPRI64, GPRI64, GPRI64, udiv>; +def MACRO__smod_i64 : BinaryMacro<GPRI64, GPRI64, GPRI64, srem>; +def MACRO__umod_i64 : BinaryMacro<GPRI64, GPRI64, GPRI64, urem>; +} +let Predicates = [HasHW64DivMod] in { + defm SDIV : BinaryOpMCi64<IL_OP_I64_DIV, sdiv>; + defm UDIV : BinaryOpMCi64<IL_OP_U64_DIV, udiv>; + defm SMOD : BinaryOpMCi64<IL_OP_I64_MOD, srem>; + defm UMOD : BinaryOpMCi64<IL_OP_U64_MOD, urem>; +} +let Predicates = [HasHW64Mul] in { 
+ defm SMUL : BinaryOpMCi64<IL_OP_I64_MUL, mul>; + defm UMUL : BinaryOpMCi64<IL_OP_U64_MUL, IL_umul>; +} +def MACRO__shr_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I32, srl>; +def MACRO__shl_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I32, shl>; +def MACRO__sra_v2i64 : BinaryMacro<GPRV2I64, GPRV2I64, GPRV2I32, sra>; + +let Predicates = [HasSW64Bit] in { +def MACRO__shr_i64 : BinaryMacro<GPRI64, GPRI64, GPRI32, srl>; +def MACRO__shl_i64 : BinaryMacro<GPRI64, GPRI64, GPRI32, shl>; +def MACRO__sra_i64 : BinaryMacro<GPRI64, GPRI64, GPRI32, sra>; +} +//===---------------------------------------------------------------------===// +// Comparison Instructions +//===---------------------------------------------------------------------===// +let usesCustomInserter = 1 in { + defm CMP : Compare<"Pseudo comparison instr">; +} +//===---------------------------------------------------------------------===// +// 32-bit floating point operations +//===---------------------------------------------------------------------===// +def FEQ : TwoInOneOut<IL_OP_EQ, (outs GPRF32:$dst), + (ins GPRF32:$lhs, GPRF32:$rhs), + !strconcat(IL_OP_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def FGE : TwoInOneOut<IL_OP_GE, (outs GPRF32:$dst), + (ins GPRF32:$lhs, GPRF32:$rhs), + !strconcat(IL_OP_GE.Text, " $dst, $lhs, $rhs") + , []>; +def FLT : TwoInOneOut<IL_OP_LT, (outs GPRF32:$dst), + (ins GPRF32:$lhs, GPRF32:$rhs), + !strconcat(IL_OP_LT.Text, " $dst, $lhs, $rhs") + , []>; +def FLT_v2f32 : TwoInOneOut<IL_OP_LT, (outs GPRV2F32:$dst), + (ins GPRV2F32:$lhs, GPRV2F32:$rhs), + !strconcat(IL_OP_LT.Text, " $dst, $lhs, $rhs") + , []>; +def FLT_v4f32 : TwoInOneOut<IL_OP_LT, (outs GPRV4F32:$dst), + (ins GPRV4F32:$lhs, GPRV4F32:$rhs), + !strconcat(IL_OP_LT.Text, " $dst, $lhs, $rhs") + , []>; +def FNE : TwoInOneOut<IL_OP_NE, (outs GPRF32:$dst), + (ins GPRF32:$lhs, GPRF32:$rhs), + !strconcat(IL_OP_NE.Text, " $dst, $lhs, $rhs") + , []>; + 
+//===---------------------------------------------------------------------===// +//TODO: need to correctly define comparison instructions +//===---------------------------------------------------------------------===// +def DEQ : TwoInOneOut<IL_OP_D_EQ, (outs GPRF64:$dst), + (ins GPRF64:$lhs, GPRF64:$rhs), + !strconcat(IL_OP_D_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def DEQ_v2f64 : TwoInOneOut<IL_OP_D_EQ, (outs GPRV2F64:$dst), + (ins GPRV2F64:$lhs, GPRV2F64:$rhs), + !strconcat(IL_OP_D_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def DGE : TwoInOneOut<IL_OP_D_GE, (outs GPRF64:$dst), + (ins GPRF64:$lhs, GPRF64:$rhs), + !strconcat(IL_OP_D_GE.Text, " $dst, $lhs, $rhs") + , []>; +def DLT : TwoInOneOut<IL_OP_D_LT, (outs GPRF64:$dst), + (ins GPRF64:$lhs, GPRF64:$rhs), + !strconcat(IL_OP_D_LT.Text, " $dst, $lhs, $rhs") + , []>; +def DNE : TwoInOneOut<IL_OP_D_NE, (outs GPRF64:$dst), + (ins GPRF64:$lhs, GPRF64:$rhs), + !strconcat(IL_OP_D_NE.Text, " $dst, $lhs, $rhs") + , []>; + +//===---------------------------------------------------------------------===// +//TODO: need to correctly define comparison instructions +//===---------------------------------------------------------------------===// +def IEQ : TwoInOneOut<IL_OP_I_EQ, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def IEQ_v2i32 : TwoInOneOut<IL_OP_I_EQ, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def IEQ_v4i32 : TwoInOneOut<IL_OP_I_EQ, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def IGE : TwoInOneOut<IL_OP_I_GE, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_I_GE.Text, " $dst, $lhs, $rhs") + , []>; +def IGE_v2i32 : TwoInOneOut<IL_OP_I_GE, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_I_GE.Text, " $dst, $lhs, $rhs") + , []>; +def IGE_v4i32 : 
TwoInOneOut<IL_OP_I_GE, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_I_GE.Text, " $dst, $lhs, $rhs") + , []>; +def ILT : TwoInOneOut<IL_OP_I_LT, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_I_LT.Text, " $dst, $lhs, $rhs") + , []>; +def ILT_v2i32 : TwoInOneOut<IL_OP_I_LT, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_I_LT.Text, " $dst, $lhs, $rhs") + , []>; +def ILT_v4i32 : TwoInOneOut<IL_OP_I_LT, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_I_LT.Text, " $dst, $lhs, $rhs") + , []>; +def INE : TwoInOneOut<IL_OP_I_NE, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs") + , []>; +def INE_v2i32 : TwoInOneOut<IL_OP_I_NE, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs") + , []>; +def INE_v4i32 : TwoInOneOut<IL_OP_I_NE, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs") + , []>; +let Predicates = [HasHW64Bit] in { +def LEQ : TwoInOneOut<IL_OP_I64_EQ, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_I64_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def LGE : TwoInOneOut<IL_OP_I64_GE, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_I64_GE.Text, " $dst, $lhs, $rhs") + , []>; +def LLE : TwoInOneOut<IL_OP_I64_GE, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_I64_GE.Text, " $dst, $rhs, $lhs") + , []>; +def LGT : TwoInOneOut<IL_OP_I64_LT, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_I64_LT.Text, " $dst, $rhs, $lhs") + , []>; +def LLT : TwoInOneOut<IL_OP_I64_LT, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_I64_LT.Text, " $dst, $lhs, $rhs") + , []>; +def LNE : TwoInOneOut<IL_OP_I64_NE, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + 
!strconcat(IL_OP_I64_NE.Text, " $dst, $lhs, $rhs") + , []>; +} + +//===---------------------------------------------------------------------===// +// Unsigned Integer Operations +//===---------------------------------------------------------------------===// + +//===---------------------------------------------------------------------===// +//TODO: need to correctly define comparison instructions +//===---------------------------------------------------------------------===// +def UEQ : TwoInOneOut<IL_OP_I_EQ, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def UEQ_v2i32 : TwoInOneOut<IL_OP_I_EQ, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def UEQ_v4i32 : TwoInOneOut<IL_OP_I_EQ, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_I_EQ.Text, " $dst, $lhs, $rhs") + , []>; +def ULE : TwoInOneOut<IL_OP_U_GE, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs") + , []>; +def ULE_v2i32 : TwoInOneOut<IL_OP_U_GE, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs") + , []>; +def ULE_v4i32 : TwoInOneOut<IL_OP_U_GE, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs") + , []>; +def UGT : TwoInOneOut<IL_OP_U_LT, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs") + , []>; +def UGT_v2i32 : TwoInOneOut<IL_OP_U_LT, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs") + , []>; +def UGT_v4i32 : TwoInOneOut<IL_OP_U_LT, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs") + , []>; +def UGE : TwoInOneOut<IL_OP_U_GE, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + 
!strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs") + , []>; +def UGE_v2i32 : TwoInOneOut<IL_OP_U_GE, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs") + , []>; +def UGE_v4i32 : TwoInOneOut<IL_OP_U_GE, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_U_GE.Text, " $dst, $lhs, $rhs") + , []>; +def ULT : TwoInOneOut<IL_OP_U_LT, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs") + , []>; +def ULT_v2i32 : TwoInOneOut<IL_OP_U_LT, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs") + , []>; +def ULT_v4i32 : TwoInOneOut<IL_OP_U_LT, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_U_LT.Text, " $dst, $lhs, $rhs") + , []>; +def UNE : TwoInOneOut<IL_OP_I_NE, (outs GPRI32:$dst), + (ins GPRI32:$lhs, GPRI32:$rhs), + !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs") + , []>; +def UNE_v2i32 : TwoInOneOut<IL_OP_I_NE, (outs GPRV2I32:$dst), + (ins GPRV2I32:$lhs, GPRV2I32:$rhs), + !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs") + , []>; +def UNE_v4i32 : TwoInOneOut<IL_OP_I_NE, (outs GPRV4I32:$dst), + (ins GPRV4I32:$lhs, GPRV4I32:$rhs), + !strconcat(IL_OP_I_NE.Text, " $dst, $lhs, $rhs") + , []>; +let Predicates = [HasHW64Bit] in { +def ULLE : TwoInOneOut<IL_OP_U64_GE, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_U64_GE.Text, " $dst, $rhs, $lhs") + , []>; +def ULGT : TwoInOneOut<IL_OP_U64_LT, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_U64_LT.Text, " $dst, $rhs, $lhs") + , []>; +def ULGE : TwoInOneOut<IL_OP_U64_GE, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_U64_GE.Text, " $dst, $lhs, $rhs") + , []>; +def ULLT : TwoInOneOut<IL_OP_U64_LT, (outs GPRI64:$dst), + (ins GPRI64:$lhs, GPRI64:$rhs), + !strconcat(IL_OP_U64_LT.Text, " $dst, $lhs, $rhs") + , []>; +} 
+//===---------------------------------------------------------------------===// +// Scalar ==> Scalar conversion functions +//===---------------------------------------------------------------------===// +// f32 ==> f64 +def FTOD : UnaryOp<IL_OP_F_2_D, fextend, GPRF64, GPRF32>; +// f64 ==> f32 +def DTOF : UnaryOp<IL_OP_D_2_F, IL_d2f, GPRF32, GPRF64>; +// f32 ==> i32 signed +def FTOI : UnaryOp<IL_OP_FTOI, fp_to_sint, GPRI32, GPRF32>; +def FTOI_v2i32 : UnaryOp<IL_OP_FTOI, fp_to_sint, GPRV2I32, GPRV2F32>; +def FTOI_v4i32 : UnaryOp<IL_OP_FTOI, fp_to_sint, GPRV4I32, GPRV4F32>; +// i32 ==> f32 signed +def ITOF : UnaryOp<IL_OP_ITOF, sint_to_fp, GPRF32, GPRI32>; +def ITOF_v2f32 : UnaryOp<IL_OP_ITOF, sint_to_fp, GPRV2F32, GPRV2I32>; +def ITOF_v4f32 : UnaryOp<IL_OP_ITOF, sint_to_fp, GPRV4F32, GPRV4I32>; +// f32 ==> i32 unsigned +def FTOU : UnaryOp<IL_OP_FTOU, fp_to_uint, GPRI32, GPRF32>; +def FTOU_v2i32 : UnaryOp<IL_OP_FTOU, fp_to_uint, GPRV2I32, GPRV2F32>; +def FTOU_v4i32 : UnaryOp<IL_OP_FTOU, fp_to_uint, GPRV4I32, GPRV4F32>; +// i32 ==> f32 unsigned +def UTOF : UnaryOp<IL_OP_UTOF, uint_to_fp, GPRF32, GPRI32>; +def UTOF_v2f32 : UnaryOp<IL_OP_UTOF, uint_to_fp, GPRV2F32, GPRV2I32>; +def UTOF_v4f32 : UnaryOp<IL_OP_UTOF, uint_to_fp, GPRV4F32, GPRV4I32>; +// Get upper 32 bits of f64 +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def DHI : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), + (ins GPRF64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRI32:$dst, (IL_dcomphi GPRF64:$src))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def DHI_v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), + (ins GPRV2F64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (IL_dcomphi2 GPRV2F64:$src))]>; +// Get lower 32 bits of f64 +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def DLO : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), + (ins GPRF64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, 
$src"), + [(set GPRI32:$dst, (IL_dcomplo GPRF64:$src))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def DLO_v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), + (ins GPRV2F64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (IL_dcomplo2 GPRV2F64:$src))]>; +// Convert two 32 bit integers into a f64 +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def DCREATE : TwoInOneOut<IL_OP_I_ADD, (outs GPRF64:$dst), + (ins GPRI32:$src0, GPRI32:$src1), + !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"), + [(set GPRF64:$dst, (IL_dcreate GPRI32:$src0, GPRI32:$src1))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def DCREATE_v2f64 : TwoInOneOut<IL_OP_I_ADD, (outs GPRV2F64:$dst), + (ins GPRV2I32:$src0, GPRV2I32:$src1), + !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"), + [(set GPRV2F64:$dst, + (IL_dcreate2 GPRV2I32:$src0, GPRV2I32:$src1))]>; +// Get upper 32 bits of i64 +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LHI : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), + (ins GPRI64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRI32:$dst, (IL_lcomphi GPRI64:$src))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LHI_v2i64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), + (ins GPRV2I64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (IL_lcomphi2 GPRV2I64:$src))]>; +// Get lower 32 bits of i64 +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LLO : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), + (ins GPRI64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRI32:$dst, (IL_lcomplo GPRI64:$src))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LLO_v2i64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), + (ins GPRV2I64:$src), + !strconcat(IL_OP_MOV.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (IL_lcomplo2 GPRV2I64:$src))]>; 
+// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def HILO_BITOR_v4i16 : TwoInOneOut<IL_OP_I_OR, (outs GPRI32:$dst), + (ins GPRI32:$src, GPRI32:$src2), + !strconcat(IL_OP_I_OR.Text, " $dst, $src, $src2"), []>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def HILO_BITOR_v2i32 : TwoInOneOut<IL_OP_I_OR, (outs GPRI32:$dst), + (ins GPRI32:$src, GPRI32:$src2), + !strconcat(IL_OP_I_OR.Text, " $dst, $src, $src2"), []>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def HILO_BITOR_v2i64 : TwoInOneOut<IL_OP_I_OR, (outs GPRI64:$dst), + (ins GPRI64:$src, GPRI64:$src2), + !strconcat(IL_OP_I_OR.Text, " $dst, $src, $src2"), []>; +// Convert two 32 bit integers into a i64 +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LCREATE : TwoInOneOut<IL_OP_I_ADD, (outs GPRI64:$dst), + (ins GPRI32:$src0, GPRI32:$src1), + !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"), + [(set GPRI64:$dst, (IL_lcreate GPRI32:$src0, GPRI32:$src1))]>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +def LCREATE_v2i64 : TwoInOneOut<IL_OP_I_ADD, (outs GPRV2I64:$dst), + (ins GPRV2I32:$src0, GPRV2I32:$src1), + !strconcat(IL_OP_I_ADD.Text, " $dst, $src0, $src1"), + [(set GPRV2I64:$dst, + (IL_lcreate2 GPRV2I32:$src0, GPRV2I32:$src1))]>; +//===---------------------------------------------------------------------===// +// Scalar ==> Vector conversion functions +//===---------------------------------------------------------------------===// +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +defm VCREATE : UnaryOpMCVec<IL_OP_MOV, IL_vbuild>; + +//===---------------------------------------------------------------------===// +// Vector ==> Scalar conversion functions +//===---------------------------------------------------------------------===// + +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +defm VEXTRACT : VectorExtract<IL_vextract>; + 
+//===---------------------------------------------------------------------===// +// Vector ==> Vector conversion functions +//===---------------------------------------------------------------------===// +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +defm VINSERT : VectorInsert<IL_vinsert>; +// This opcode has custom swizzle pattern encoded in Swizzle Encoder +defm VCONCAT : VectorConcat<IL_vconcat>; + +//===---------------------------------------------------------------------===// +// Bit conversion functions +//===---------------------------------------------------------------------===// +defm IL_ASCHAR : BitConversion<IL_OP_MOV, GPRI8, IL_bitconv>; +defm IL_ASSHORT : BitConversion<IL_OP_MOV, GPRI16, IL_bitconv>; +defm IL_ASINT : BitConversion<IL_OP_MOV, GPRI32, IL_bitconv>; +defm IL_ASFLOAT : BitConversion<IL_OP_MOV, GPRF32, IL_bitconv>; +defm IL_ASDOUBLE : BitConversion<IL_OP_MOV, GPRF64, IL_bitconv>; +defm IL_ASLONG : BitConversion<IL_OP_MOV, GPRI64, IL_bitconv>; +defm IL_ASV2CHAR : BitConversion<IL_OP_MOV, GPRV2I8, IL_bitconv>; +defm IL_ASV2SHORT : BitConversion<IL_OP_MOV, GPRV2I16, IL_bitconv>; +defm IL_ASV2INT : BitConversion<IL_OP_MOV, GPRV2I32, IL_bitconv>; +defm IL_ASV2FLOAT : BitConversion<IL_OP_MOV, GPRV2F32, IL_bitconv>; +defm IL_ASV2DOUBLE : BitConversion<IL_OP_MOV, GPRV2F64, IL_bitconv>; +defm IL_ASV2LONG : BitConversion<IL_OP_MOV, GPRV2I64, IL_bitconv>; +defm IL_ASV4CHAR : BitConversion<IL_OP_MOV, GPRV4I8, IL_bitconv>; +defm IL_ASV4SHORT : BitConversion<IL_OP_MOV, GPRV4I16, IL_bitconv>; +defm IL_ASV4INT : BitConversion<IL_OP_MOV, GPRV4I32, IL_bitconv>; +defm IL_ASV4FLOAT : BitConversion<IL_OP_MOV, GPRV4F32, IL_bitconv>; + +let Predicates = [Has32BitPtr] in { + let isCodeGenOnly=1 in { + //===----------------------------------------------------------------------===// + // Store Memory Operations + //===----------------------------------------------------------------------===// + defm GLOBALTRUNCSTORE : GTRUNCSTORE<"!global 
trunc store">; + defm GLOBALSTORE : STORE<"!global store" , global_store>; + defm LOCALTRUNCSTORE : LTRUNCSTORE<"!local trunc store">; + defm LOCALSTORE : STORE<"!local store" , local_store>; + defm PRIVATETRUNCSTORE : PTRUNCSTORE<"!private trunc store">; + defm PRIVATESTORE : STORE<"!private store" , private_store>; + defm REGIONTRUNCSTORE : RTRUNCSTORE<"!region trunc store">; + defm REGIONSTORE : STORE<"!region hw store" , region_store>; + + + //===---------------------------------------------------------------------===// + // Load Memory Operations + //===---------------------------------------------------------------------===// + defm GLOBALLOAD : LOAD<"!global load" , global_load>; + defm GLOBALZEXTLOAD : LOAD<"!global zext load" , global_zext_load>; + defm GLOBALSEXTLOAD : LOAD<"!global sext load" , global_sext_load>; + defm GLOBALAEXTLOAD : LOAD<"!global aext load" , global_aext_load>; + defm PRIVATELOAD : LOAD<"!private load" , private_load>; + defm PRIVATEZEXTLOAD : LOAD<"!private zext load" , private_zext_load>; + defm PRIVATESEXTLOAD : LOAD<"!private sext load" , private_sext_load>; + defm PRIVATEAEXTLOAD : LOAD<"!private aext load" , private_aext_load>; + defm CPOOLLOAD : LOAD<"!constant pool load" , cp_load>; + defm CPOOLZEXTLOAD : LOAD<"!constant pool zext load", cp_zext_load>; + defm CPOOLSEXTLOAD : LOAD<"!constant pool sext load", cp_sext_load>; + defm CPOOLAEXTLOAD : LOAD<"!constant aext pool load", cp_aext_load>; + defm CONSTANTLOAD : LOAD<"!constant load" , constant_load>; + defm CONSTANTZEXTLOAD : LOAD<"!constant zext load" , constant_zext_load>; + defm CONSTANTSEXTLOAD : LOAD<"!constant sext load" , constant_sext_load>; + defm CONSTANTAEXTLOAD : LOAD<"!constant aext load" , constant_aext_load>; + defm LOCALLOAD : LOAD<"!local load" , local_load>; + defm LOCALZEXTLOAD : LOAD<"!local zext load" , local_zext_load>; + defm LOCALSEXTLOAD : LOAD<"!local sext load" , local_sext_load>; + defm LOCALAEXTLOAD : LOAD<"!local aext load" , local_aext_load>; 
+ defm REGIONLOAD : LOAD<"!region load" , region_load>; + defm REGIONZEXTLOAD : LOAD<"!region zext load" , region_zext_load>; + defm REGIONSEXTLOAD : LOAD<"!region sext load" , region_sext_load>; + defm REGIONAEXTLOAD : LOAD<"!region aext load" , region_aext_load>; + } + + + //===---------------------------------------------------------------------===// + // IO Expansion Load/Store Instructions + //===---------------------------------------------------------------------===// + let mayLoad = 1 in { + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHLOAD : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " $dst, x$id[$addy]"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def CBLOAD : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " $dst, cb$id[$addy]"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSLOAD : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSLOAD_Y : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSLOAD_Z : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSLOAD_W : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def LDSLOADVEC : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOADVEC_v2i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV2I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOADVEC_v4i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD_i8 : TwoInOneOut<IL_OP_LDS_LOAD_BYTE, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_BYTE.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD_u8 : TwoInOneOut<IL_OP_LDS_LOAD_UBYTE, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_UBYTE.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD_i16 : TwoInOneOut<IL_OP_LDS_LOAD_SHORT, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_SHORT.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def LDSLOAD_u16 : TwoInOneOut<IL_OP_LDS_LOAD_USHORT, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD_USHORT.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD_Y : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD_Z : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD_W : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENALOAD_i8 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(byte) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENALOAD_i16 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(short) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENALOAD_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def UAVARENALOAD_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENALOAD_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENALOAD_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWLOAD_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWLOAD_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWLOAD_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWLOADCACHED_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def UAVRAWLOADCACHED_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWLOADCACHED_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>; + } + let mayStore = 1 in { + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHSTORE : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRV4I32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy], $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHSTORE_X : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].x___, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHSTORE_Y : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy]._y__, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHSTORE_Z : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].__z_, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHSTORE_W : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].___w, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHSTORE_XY : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRV2I32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].xy__, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def SCRATCHSTORE_ZW : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRV2I32:$data, i32imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].__zw, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSSTORE : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSSTORE_Y : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSSTORE_Z : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSSTORE_W : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTOREVEC : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRI32:$mem), + (ins GPRI32:$addy, GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTOREVEC_v2i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV2I32:$mem), + (ins GPRI32:$addy, GPRV2I32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def LDSSTOREVEC_v4i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV4I32:$mem), + (ins GPRI32:$addy, GPRV4I32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE_i8 : TwoInOneOut<IL_OP_LDS_STORE_BYTE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE_BYTE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE_i16 : TwoInOneOut<IL_OP_LDS_STORE_SHORT, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE_SHORT.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE_Y : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE_Z : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE_W : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def UAVARENASTORE_i8 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI8:$src, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(byte) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENASTORE_i16 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI16:$src, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(short) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENASTORE_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENASTORE_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENASTORE_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVARENASTORE_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWSTORE_i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRI32:$mem), + (ins GPRI32:$addy, GPRI32:$src, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. 
+ def UAVRAWSTORE_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV2I32:$mem), + (ins GPRI32:$addy, GPRV2I32:$src, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWSTORE_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV4I32:$mem), + (ins GPRI32:$addy, GPRV4I32:$src, i32imm:$id), + !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>; + } +} +let Predicates = [Has64BitPtr] in { + let isCodeGenOnly=1 in { + //===----------------------------------------------------------------------===// + // Store Memory Operations (guarded by the Has64BitPtr predicate, isCodeGenOnly) + //===----------------------------------------------------------------------===// + defm GLOBALTRUNCSTORE64 : GTRUNCSTORE64<"!global trunc store">; + defm GLOBALSTORE64 : STORE64<"!global store" , global_store>; + defm LOCALTRUNCSTORE64 : LTRUNCSTORE64<"!local trunc store">; + defm LOCALSTORE64 : STORE64<"!local store" , local_store>; + defm PRIVATETRUNCSTORE64 : PTRUNCSTORE64<"!private trunc store">; + defm PRIVATESTORE64 : STORE64<"!private store" , private_store>; + defm REGIONTRUNCSTORE64 : RTRUNCSTORE64<"!region trunc store">; + defm REGIONSTORE64 : STORE64<"!region hw store" , region_store>; + + + //===---------------------------------------------------------------------===// + // Load Memory Operations (guarded by Has64BitPtr; every entry instantiates LOAD64). NOTE(review): CPOOLAEXTLOAD64 is labelled "!constant aext pool load" while its siblings read "!constant pool ... load"; confirm whether the word order matters. + //===---------------------------------------------------------------------===// + defm GLOBALLOAD64 : LOAD64<"!global load" , global_load>; + defm GLOBALZEXTLOAD64 : LOAD64<"!global zext load" , global_zext_load>; + defm GLOBALSEXTLOAD64 : LOAD64<"!global sext load" , global_sext_load>; + defm GLOBALAEXTLOAD64 : LOAD64<"!global aext load" , global_aext_load>; + defm PRIVATELOAD64 : LOAD64<"!private load" , private_load>; + defm PRIVATEZEXTLOAD64 : LOAD64<"!private zext load" , private_zext_load>; + defm PRIVATESEXTLOAD64 : LOAD64<"!private sext load" , private_sext_load>; + defm PRIVATEAEXTLOAD64 :
LOAD64<"!private aext load" , private_aext_load>; + defm CPOOLLOAD64 : LOAD64<"!constant pool load" , cp_load>; + defm CPOOLZEXTLOAD64 : LOAD64<"!constant pool zext load", cp_zext_load>; + defm CPOOLSEXTLOAD64 : LOAD64<"!constant pool sext load", cp_sext_load>; + defm CPOOLAEXTLOAD64 : LOAD64<"!constant aext pool load", cp_aext_load>; + defm CONSTANTLOAD64 : LOAD64<"!constant load" , constant_load>; + defm CONSTANTZEXTLOAD64 : LOAD64<"!constant zext load" , constant_zext_load>; + defm CONSTANTSEXTLOAD64 : LOAD64<"!constant sext load" , constant_sext_load>; + defm CONSTANTAEXTLOAD64 : LOAD64<"!constant aext load" , constant_aext_load>; + defm LOCALLOAD64 : LOAD64<"!local load" , local_load>; + defm LOCALZEXTLOAD64 : LOAD64<"!local zext load" , local_zext_load>; + defm LOCALSEXTLOAD64 : LOAD64<"!local sext load" , local_sext_load>; + defm LOCALAEXTLOAD64 : LOAD64<"!local aext load" , local_aext_load>; + defm REGIONLOAD64 : LOAD64<"!region load" , region_load>; + defm REGIONZEXTLOAD64 : LOAD64<"!region zext load" , region_zext_load>; + defm REGIONSEXTLOAD64 : LOAD64<"!region sext load" , region_sext_load>; + defm REGIONAEXTLOAD64 : LOAD64<"!region aext load" , region_aext_load>; + } + + + //===---------------------------------------------------------------------===// + // IO Expansion Load/Store Instructions (Has64BitPtr variants: $id is an i64imm, and every def has an empty selection-pattern list) + //===---------------------------------------------------------------------===// + let mayLoad = 1 in { + // This opcode has custom swizzle patterns for some of the arguments. + def SCRATCHLOAD64 : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " $dst, x$id[$addy]"), []>; + // This opcode has custom swizzle patterns for some of the arguments. The asm string indexes cb$id where SCRATCHLOAD64 indexes x$id. + def CBLOAD64 : TwoInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " $dst, cb$id[$addy]"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def GDSLOAD64 : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to GDSLOAD64. + def GDSLOAD64_Y : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to GDSLOAD64. + def GDSLOAD64_Z : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to GDSLOAD64. + def GDSLOAD64_W : TwoInOneOut<IL_OP_GDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_GDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. The asm string prints $addy twice. + def LDSLOADVEC64 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As LDSLOADVEC64, with a GPRV2I32 destination. + def LDSLOADVEC64_v2i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV2I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As LDSLOADVEC64, with a GPRV4I32 destination. + def LDSLOADVEC64_v4i32 : TwoInOneOut<IL_OP_LDS_LOAD_VEC, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_VEC.Text, "_id($id) $dst, $addy, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD64 : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def LDSLOAD64_i8 : TwoInOneOut<IL_OP_LDS_LOAD_BYTE, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_BYTE.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Uses IL_OP_LDS_LOAD_UBYTE where LDSLOAD64_i8 uses IL_OP_LDS_LOAD_BYTE. + def LDSLOAD64_u8 : TwoInOneOut<IL_OP_LDS_LOAD_UBYTE, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_UBYTE.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSLOAD64_i16 : TwoInOneOut<IL_OP_LDS_LOAD_SHORT, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_SHORT.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Uses IL_OP_LDS_LOAD_USHORT where LDSLOAD64_i16 uses IL_OP_LDS_LOAD_SHORT. + def LDSLOAD64_u16 : TwoInOneOut<IL_OP_LDS_LOAD_USHORT, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD_USHORT.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to LDSLOAD64. + def LDSLOAD64_Y : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to LDSLOAD64. + def LDSLOAD64_Z : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to LDSLOAD64. + def LDSLOAD64_W : TwoInOneOut<IL_OP_LDS_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_LDS_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. The access size is spelled in the asm string as _size(byte). + def UAVARENALOAD64_i8 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(byte) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def UAVARENALOAD64_i16 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(short) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. The asm string uses _size(dword). + def UAVARENALOAD64_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to UAVARENALOAD64_i32. + def UAVARENALOAD64_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to UAVARENALOAD64_i32. + def UAVARENALOAD64_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to UAVARENALOAD64_i32. + def UAVARENALOAD64_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_LOAD.Text, "_id($id)_size(dword) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def UAVRAWLOAD64_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As UAVRAWLOAD64_i32, with a GPRV2I32 destination. + def UAVRAWLOAD64_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def UAVRAWLOAD64_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id) $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Same opcode and operands as UAVRAWLOAD64_i32; the asm string adds a _cached suffix after _id($id). + def UAVRAWLOADCACHED64_i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRI32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As UAVRAWLOADCACHED64_i32, with a GPRV2I32 destination. + def UAVRAWLOADCACHED64_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV2I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As UAVRAWLOADCACHED64_i32, with a GPRV4I32 destination. + def UAVRAWLOADCACHED64_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_LOAD, (outs GPRV4I32:$dst), + (ins GPRI32:$addy, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_LOAD.Text, "_id($id)_cached $dst, $addy"), []>; + } + let mayStore = 1 in { + // This opcode has custom swizzle patterns for some of the arguments. Note that the address register $addy is declared in the outs list and the stored value $data in the ins list. + def SCRATCHSTORE64 : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRV4I32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy], $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $data is a GPRI32 and the asm string appends .x___ to the x$id[$addy] operand. + def SCRATCHSTORE64_X : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].x___, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $data is a GPRI32 and the asm string appends ._y__ to the x$id[$addy] operand. + def SCRATCHSTORE64_Y : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy]._y__, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def SCRATCHSTORE64_Z : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].__z_, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $data is a GPRI32 and the asm string appends .___w to the x$id[$addy] operand. + def SCRATCHSTORE64_W : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRI32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].___w, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $data is a GPRV2I32 and the asm string appends .xy__ to the x$id[$addy] operand. + def SCRATCHSTORE64_XY : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRV2I32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].xy__, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $data is a GPRV2I32 and the asm string appends .__zw to the x$id[$addy] operand. + def SCRATCHSTORE64_ZW : TwoInOneOut<IL_OP_MOV, (outs GPRI32:$addy), + (ins GPRV2I32:$data, i64imm:$id), + !strconcat(IL_OP_MOV.Text, " x$id[$addy].__zw, $data"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def GDSSTORE64 : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to GDSSTORE64. + def GDSSTORE64_Y : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to GDSSTORE64. + def GDSSTORE64_Z : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to GDSSTORE64. + def GDSSTORE64_W : TwoInOneOut<IL_OP_GDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_GDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def LDSSTOREVEC64 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRI32:$mem), + (ins GPRI32:$addy, GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As LDSSTOREVEC64, with GPRV2I32 $mem and $src; the asm string prints $src twice. + def LDSSTOREVEC64_v2i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV2I32:$mem), + (ins GPRI32:$addy, GPRV2I32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As LDSSTOREVEC64, with GPRV4I32 $mem and $src; the asm string prints $src twice. + def LDSSTOREVEC64_v4i32 : ThreeInOneOut<IL_OP_LDS_STORE_VEC, (outs GPRV4I32:$mem), + (ins GPRI32:$addy, GPRV4I32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE_VEC.Text, "_id($id) $mem, $addy, $src, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. + def LDSSTORE64 : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Uses IL_OP_LDS_STORE_BYTE; operands match LDSSTORE64. + def LDSSTORE64_i8 : TwoInOneOut<IL_OP_LDS_STORE_BYTE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE_BYTE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Uses IL_OP_LDS_STORE_SHORT; operands match LDSSTORE64. + def LDSSTORE64_i16 : TwoInOneOut<IL_OP_LDS_STORE_SHORT, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE_SHORT.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to LDSSTORE64. + def LDSSTORE64_Y : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def LDSSTORE64_Z : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to LDSSTORE64_Z. + def LDSSTORE64_W : TwoInOneOut<IL_OP_LDS_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_LDS_STORE.Text, "_id($id) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $src is a GPRI8 and the asm string uses _size(byte). + def UAVARENASTORE64_i8 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI8:$src, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(byte) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $src is a GPRI16 and the asm string uses _size(short). + def UAVARENASTORE64_i16 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI16:$src, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(short) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. $src is a GPRI32 and the asm string uses _size(dword). + def UAVARENASTORE64_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to UAVARENASTORE64_i32. + def UAVARENASTORE64_Y_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. Operands and asm string are identical to UAVARENASTORE64_i32. + def UAVARENASTORE64_Z_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments.
+ def UAVARENASTORE64_W_i32 : TwoInOneOut<IL_OP_ARENA_UAV_STORE, (outs GPRI32:$addy), + (ins GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_ARENA_UAV_STORE.Text, + "_id($id)_size(dword) $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. NOTE(review): three input operands but declared with TwoInOneOut, unlike LDSSTOREVEC64 which uses ThreeInOneOut; confirm this is intended. + def UAVRAWSTORE64_i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRI32:$mem), + (ins GPRI32:$addy, GPRI32:$src, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As UAVRAWSTORE64_i32, with GPRV2I32 $mem and $src. + def UAVRAWSTORE64_v2i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV2I32:$mem), + (ins GPRI32:$addy, GPRV2I32:$src, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>; + // This opcode has custom swizzle patterns for some of the arguments. As UAVRAWSTORE64_i32, with GPRV4I32 $mem and $src. + def UAVRAWSTORE64_v4i32 : TwoInOneOut<IL_OP_RAW_UAV_STORE, (outs GPRV4I32:$mem), + (ins GPRI32:$addy, GPRV4I32:$src, i64imm:$id), + !strconcat(IL_OP_RAW_UAV_STORE.Text, "_id($id) $mem, $addy, $src"), []>; + } +} +//===---------------------------------------------------------------------===// +// Custom Inserter for Branches and returns, this eventually will be a +// separate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1 in { + def BRANCH : ILFormat<IL_PSEUDO_INST, (outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional<IL_brcond>; +} +//===---------------------------------------------------------------------===// +// return instructions (RETURN is flagged isTerminator, isReturn, isBarrier and hasCtrlDep) +//===---------------------------------------------------------------------===// +let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in { + def RETURN : ILFormat<IL_OP_RET,(outs), (ins variable_ops), + IL_OP_RET.Text, [(IL_retflag)]>; +} +//===---------------------------------------------------------------------===// +// Lower and raise the stack x amount
+//===---------------------------------------------------------------------===// +def ADJCALLSTACKDOWN : ILFormat<IL_PSEUDO_INST, (outs), (ins i32imm:$amt), + "; begin of call sequence $amt", + [(IL_callseq_start timm:$amt)]>; +def ADJCALLSTACKUP : ILFormat<IL_PSEUDO_INST, (outs), (ins i32imm:$amt1, + i32imm:$amt2), + "; end of call sequence $amt1 $amt2", + [(IL_callseq_end timm:$amt1, timm:$amt2)]>; + +//===---------------------------------------------------------------------===// +// Handle a function call. CALL lists R110 through R255 in Defs and R1 through R109 in Uses, and has no selection pattern. +//===---------------------------------------------------------------------===// +let isCall = 1, + Defs = [ + R110, R111, + R112, R113, R114, R115, R116, R117, R118, R119, R120, R121, R122, R123, R124, + R125, R126, R127, + R128, R129, R130, R131, R132, R133, R134, R135, R136, R137, R138, R139, R140, + R141, R142, R143, + R144, R145, R146, R147, R148, R149, R150, R151, R152, R153, R154, R155, R156, + R157, R158, R159, + R160, R161, R162, R163, R164, R165, R166, R167, R168, R169, R170, R171, R172, + R173, R174, R175, + R176, R177, R178, R179, R180, R181, R182, R183, R184, R185, R186, R187, R188, + R189, R190, R191, + R192, R193, R194, R195, R196, R197, R198, R199, R200, R201, R202, R203, R204, + R205, R206, R207, + R208, R209, R210, R211, R212, R213, R214, R215, R216, R217, R218, R219, R220, + R221, R222, R223, + R224, R225, R226, R227, R228, R229, R230, R231, R232, R233, R234, R235, R236, + R237, R238, R239, + R240, R241, R242, R243, R244, R245, R246, R247, R248, R249, R250, R251, R252, + R253, R254, R255 + ] + , + Uses = [ + R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, + R16, R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31, + R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46, R47, + R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61, R62, R63, + R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76, R77, R78, R79, + R80, R81, R82, R83, R84, R85, R86, R87, R88,
R89, R90, R91, R92, R93, R94, R95, + R96, R97, R98, R99, R100, R101, R102, R103, R104, R105, R106, R107, R108, R109 + ] + in { + def CALL : UnaryOpNoRet<IL_OP_CALL, (outs), + (ins calltarget:$dst, variable_ops), + !strconcat(IL_OP_CALL.Text, " $dst"), []>; + } + + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions (all defined with isTerminator=1; the plain defs carry no selection pattern) +//===---------------------------------------------------------------------===// +let isTerminator=1 in { + def SWITCH : ILFormat<IL_OP_SWITCH, (outs), (ins GPRI32:$src), + !strconcat(IL_OP_SWITCH.Text, " $src"), []>; + def CASE : ILFormat<IL_OP_CASE, (outs), (ins GPRI32:$src), + !strconcat(IL_OP_CASE.Text, " $src"), []>; + def BREAK : ILFormat<IL_OP_BREAK, (outs), (ins), + IL_OP_BREAK.Text, []>; + def CONTINUE : ILFormat<IL_OP_CONTINUE, (outs), (ins), + IL_OP_CONTINUE.Text, []>; + def DEFAULT : ILFormat<IL_OP_DEFAULT, (outs), (ins), + IL_OP_DEFAULT.Text, []>; + def ELSE : ILFormat<IL_OP_ELSE, (outs), (ins), + IL_OP_ELSE.Text, []>; + def ENDSWITCH : ILFormat<IL_OP_ENDSWITCH, (outs), (ins), + IL_OP_ENDSWITCH.Text, []>; + def ENDMAIN : ILFormat<IL_OP_ENDMAIN, (outs), (ins), + IL_OP_ENDMAIN.Text, []>; + def END : ILFormat<IL_OP_END, (outs), (ins), + IL_OP_END.Text, []>; + def ENDFUNC : ILFormat<IL_OP_ENDFUNC, (outs), (ins), + IL_OP_ENDFUNC.Text, []>; + def ENDIF : ILFormat<IL_OP_ENDIF, (outs), (ins), + IL_OP_ENDIF.Text, []>; + def WHILELOOP : ILFormat<IL_OP_WHILE, (outs), (ins), + IL_OP_WHILE.Text, []>; + def ENDLOOP : ILFormat<IL_OP_ENDLOOP, (outs), (ins), + IL_OP_ENDLOOP.Text, []>; + def FUNC : ILFormat<IL_OP_FUNC, (outs), (ins), + IL_OP_FUNC.Text, []>; + def RETDYN : ILFormat<IL_OP_RET_DYN, (outs), (ins), + IL_OP_RET_DYN.Text, []>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<IL_OP_IF_LOGICALNZ>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<IL_OP_IF_LOGICALZ>; + //
This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<IL_OP_BREAK_LOGICALNZ>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : BranchInstr<IL_OP_BREAK_LOGICALZ>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<IL_OP_CONTINUE_LOGICALNZ>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<IL_OP_CONTINUE_LOGICALZ>; + defm IFC : BranchInstr2<IL_OP_IFC>; + defm BREAKC : BranchInstr2<IL_OP_BREAKC>; + defm CONTINUEC : BranchInstr2<IL_OP_CONTINUEC>; +} +let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in { + def TRAP : ILFormat<IL_OP_NOP, (outs), (ins), + IL_OP_NOP.Text, [(trap)]>; +} + +//===---------------------------------------------------------------------===// +//----------------- Work Item Functions - OpenCL 6.11.1 ---------------------// +//===---------------------------------------------------------------------===// +let isCall = 1, isAsCheapAsAMove = 1 in { + def GET_WORK_DIM : ILFormat<IL_OP_MOV, (outs GPRI32:$dst), (ins), + !strconcat(IL_OP_MOV.Text, " $dst, cb0[0].w"), + [(set GPRI32:$dst, (int_AMDIL_get_work_dim))]>; + + def GET_GLOBAL_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1021.xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_global_id))]>; + + def GET_LOCAL_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1022.xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_local_id))]>; + + def GET_GROUP_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1023.xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_group_id))]>; + + def GET_GLOBAL_SIZE : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[0].xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_global_size))]>; + + def GET_LOCAL_SIZE : 
ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[1].xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_local_size))]>; + + def GET_NUM_GROUPS : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[2].xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_num_groups))]>; + + def GET_GLOBAL_OFFSET : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[9].xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_global_offset))]>; + + let Predicates = [Has64BitPtr] in { + def GET_PRINTF_OFFSET_i64: ILFormat<IL_OP_MOV, (outs GPRI32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[8].zw"), + [(set GPRI32:$dst, (int_AMDIL_get_printf_offset))]>; + def GET_PRINTF_SIZE_i64 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[9].x0"), + [(set GPRI32:$dst, (int_AMDIL_get_printf_size))]>; + } + let Predicates = [Has32BitPtr] in { + def GET_PRINTF_OFFSET_i32 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[8].y0"), + [(set GPRI32:$dst, (int_AMDIL_get_printf_offset))]>; + def GET_PRINTF_SIZE_i32 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[8].z0"), + [(set GPRI32:$dst, (int_AMDIL_get_printf_size))]>; + } +} +//===---------------------------------------------------------------------===// +//------------- Synchronization Functions - OpenCL 6.11.9 -------------------// +//===---------------------------------------------------------------------===// +let isCall=1 in { + + def FENCE : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag), + "fence_lds_memory_gds", + [(int_AMDIL_fence GPRI32:$flag)]>; + + def FENCE_LOCAL : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag), + "fence_lds", + [(int_AMDIL_fence_local GPRI32:$flag)]>; + + def FENCE_GLOBAL : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag), + "fence_memory", + [(int_AMDIL_fence_global GPRI32:$flag)]>; 
+ + def FENCE_REGION : BinaryOpNoRet<IL_OP_FENCE, (outs), (ins GPRI32:$flag), + "fence_gds", + [(int_AMDIL_fence_region GPRI32:$flag)]>; + + def FENCE_READ_ONLY : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs), + (ins GPRI32:$flag), + "fence_lds_gds_memory_mem_read_only", + [(int_AMDIL_fence_read_only GPRI32:$flag)]>; + + def FENCE_READ_ONLY_LOCAL : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs), + (ins GPRI32:$flag), + "fence_lds_mem_read_only", + [(int_AMDIL_fence_read_only_local GPRI32:$flag)]>; + + def FENCE_READ_ONLY_GLOBAL : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs), + (ins GPRI32:$flag), + "fence_mem_read_only", + [(int_AMDIL_fence_read_only_global GPRI32:$flag)]>; + + def FENCE_READ_ONLY_REGION : BinaryOpNoRet<IL_OP_FENCE_READ_ONLY, (outs), + (ins GPRI32:$flag), + "fence_gds_mem_read_only", + [(int_AMDIL_fence_read_only_region GPRI32:$flag)]>; + + def FENCE_WRITE_ONLY : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs), + (ins GPRI32:$flag), + "fence_lds_gds_memory_mem_write_only", + [(int_AMDIL_fence_write_only GPRI32:$flag)]>; + + def FENCE_WRITE_ONLY_LOCAL : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs), + (ins GPRI32:$flag), + "fence_lds_mem_write_only", + [(int_AMDIL_fence_write_only_local GPRI32:$flag)]>; + + def FENCE_WRITE_ONLY_GLOBAL : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs), + (ins GPRI32:$flag), + "fence_mem_write_only", + [(int_AMDIL_fence_write_only_global GPRI32:$flag)]>; + + def FENCE_WRITE_ONLY_REGION : BinaryOpNoRet<IL_OP_FENCE_WRITE_ONLY, (outs), + (ins GPRI32:$flag), + "fence_gds_mem_write_only", + [(int_AMDIL_fence_write_only_region GPRI32:$flag)]>; +} +let isReturn = 1 in { + def EARLY_EXIT : UnaryOpNoRet<IL_OP_RET_LOGICALNZ, (outs), + (ins GPRI32:$flag), + !strconcat(IL_OP_RET_LOGICALNZ.Text, " $flag"), + [(int_AMDIL_early_exit GPRI32:$flag)]>; +} +def MEDIA_UNPACK_0 : OneInOneOut<IL_OP_UNPACK_0, (outs GPRV4F32:$dst), + (ins GPRV4I32:$src), + !strconcat(IL_OP_UNPACK_0.Text, " $dst, $src"), + [(set GPRV4F32:$dst, + (v4f32 
(int_AMDIL_media_unpack_byte_0 GPRV4I32:$src)))]>; +def MEDIA_UNPACK_1 : OneInOneOut<IL_OP_UNPACK_1, (outs GPRV4F32:$dst), + (ins GPRV4I32:$src), + !strconcat(IL_OP_UNPACK_1.Text, " $dst, $src"), + [(set GPRV4F32:$dst, + (v4f32 (int_AMDIL_media_unpack_byte_1 GPRV4I32:$src)))]>; +def MEDIA_UNPACK_2 : OneInOneOut<IL_OP_UNPACK_2, (outs GPRV4F32:$dst), + (ins GPRV4I32:$src), + !strconcat(IL_OP_UNPACK_2.Text, " $dst, $src"), + [(set GPRV4F32:$dst, + (v4f32 (int_AMDIL_media_unpack_byte_2 GPRV4I32:$src)))]>; +def MEDIA_UNPACK_3 : OneInOneOut<IL_OP_UNPACK_3, (outs GPRV4F32:$dst), + (ins GPRV4I32:$src), + !strconcat(IL_OP_UNPACK_3.Text, " $dst, $src"), + [(set GPRV4F32:$dst, + (v4f32 (int_AMDIL_media_unpack_byte_3 GPRV4I32:$src)))]>; +let Predicates = [Has32BitPtr] in { +// All of the image functions +def IMAGE1D_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1DA_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_array_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2D_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2DA_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + 
"_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_array_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE3D_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image3d_read_norm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1D_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1DA_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_array_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2D_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2DA_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_array_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE3D_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr, GPRI32:$sampler, GPRV4F32:$addy), 
+ !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image3d_read_unnorm ADDR:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1D_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_info0 ADDR:$ptr))]>; +def IMAGE1D_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_info1 ADDR:$ptr))]>; +def IMAGE1DA_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info0 ADDR:$ptr))]>; +def IMAGE1DA_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info1 ADDR:$ptr))]>; +def IMAGE2D_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_info0 ADDR:$ptr))]>; +def IMAGE2D_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_info1 ADDR:$ptr))]>; +def IMAGE2DA_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info0 ADDR:$ptr))]>; +def IMAGE2DA_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info1 ADDR:$ptr))]>; +def IMAGE3D_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image3d_info0 ADDR:$ptr))]>; +def IMAGE3D_INFO1 : 
ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI32:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image3d_info1 ADDR:$ptr))]>; +def IMAGE1D_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image1d_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE1DA_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image1d_array_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE2D_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image2d_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE2DA_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI32:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image2d_array_write ADDR:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE3D_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI32:$ptr, GPRV4I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image3d_write ADDR:$ptr, GPRV4I32:$addy, GPRV4I32:$data)]>; +let hasSideEffects = 1, isNotDuplicable = 1 in { + // All of the noret atomic functions + def ATOM_G_ADD_NORET : BinAtomNoRet<IL_OP_UAV_ADD, + "_id($id)", atom_g_add_noret>; + def ATOM_G_AND_NORET : BinAtomNoRet<IL_OP_UAV_AND, + "_id($id)", atom_g_and_noret>; + def ATOM_G_MAX_NORET : BinAtomNoRet<IL_OP_UAV_MAX, + "_id($id)", atom_g_max_noret>; + def ATOM_G_MIN_NORET : BinAtomNoRet<IL_OP_UAV_MIN, + "_id($id)", atom_g_min_noret>; + def ATOM_G_UMAX_NORET : BinAtomNoRet<IL_OP_UAV_UMAX, + "_id($id)", atom_g_umax_noret>; + def ATOM_G_UMIN_NORET : BinAtomNoRet<IL_OP_UAV_UMIN, + 
"_id($id)", atom_g_umin_noret>; + def ATOM_G_OR_NORET : BinAtomNoRet<IL_OP_UAV_OR, + "_id($id)", atom_g_or_noret>; + def ATOM_G_RSUB_NORET : BinAtomNoRet<IL_OP_UAV_RSUB, + "_id($id)", atom_g_rsub_noret>; + def ATOM_G_SUB_NORET : BinAtomNoRet<IL_OP_UAV_SUB, + "_id($id)", atom_g_sub_noret>; + def ATOM_G_XOR_NORET : BinAtomNoRet<IL_OP_UAV_XOR, + "_id($id)", atom_g_xor_noret>; + def ATOM_G_INC_NORET : BinAtomNoRet<IL_OP_UAV_INC, + "_id($id)", atom_g_inc_noret>; + def ATOM_G_DEC_NORET : BinAtomNoRet<IL_OP_UAV_DEC, + "_id($id)", atom_g_dec_noret>; + def ATOM_G_CMPXCHG_NORET : CmpXChgNoRet<IL_OP_UAV_CMP, + "_id($id)", atom_g_cmpxchg_noret>; + def ATOM_A_ADD_NORET : BinAtomNoRet<IL_OP_UAV_ADD, + "_id($id)_arena", atom_g_add_noret>; + def ATOM_A_AND_NORET : BinAtomNoRet<IL_OP_UAV_AND, + "_id($id)_arena", atom_g_and_noret>; + def ATOM_A_MAX_NORET : BinAtomNoRet<IL_OP_UAV_MAX, + "_id($id)_arena", atom_g_max_noret>; + def ATOM_A_MIN_NORET : BinAtomNoRet<IL_OP_UAV_MIN, + "_id($id)_arena", atom_g_min_noret>; + def ATOM_A_UMAX_NORET : BinAtomNoRet<IL_OP_UAV_UMAX, + "_id($id)_arena", atom_g_umax_noret>; + def ATOM_A_UMIN_NORET : BinAtomNoRet<IL_OP_UAV_UMIN, + "_id($id)_arena", atom_g_umin_noret>; + def ATOM_A_OR_NORET : BinAtomNoRet<IL_OP_UAV_OR, + "_id($id)_arena", atom_g_or_noret>; + def ATOM_A_RSUB_NORET : BinAtomNoRet<IL_OP_UAV_RSUB, + "_id($id)_arena", atom_g_rsub_noret>; + def ATOM_A_SUB_NORET : BinAtomNoRet<IL_OP_UAV_SUB, + "_id($id)_arena", atom_g_sub_noret>; + def ATOM_A_XOR_NORET : BinAtomNoRet<IL_OP_UAV_XOR, + "_id($id)_arena", atom_g_xor_noret>; + def ATOM_A_INC_NORET : BinAtomNoRet<IL_OP_UAV_INC, + "_id($id)_arena", atom_g_inc_noret>; + def ATOM_A_DEC_NORET : BinAtomNoRet<IL_OP_UAV_DEC, + "_id($id)_arena", atom_g_dec_noret>; + def ATOM_A_CMPXCHG_NORET : CmpXChgNoRet<IL_OP_UAV_CMP, + "_id($id)_arena", atom_g_cmpxchg_noret>; + def ATOM_L_ADD_NORET : BinAtomNoRet<IL_OP_LDS_ADD, + "_resource($id)", atom_l_add_noret>; + def ATOM_L_AND_NORET : BinAtomNoRet<IL_OP_LDS_AND, + 
"_resource($id)", atom_l_and_noret>; + def ATOM_L_MAX_NORET : BinAtomNoRet<IL_OP_LDS_MAX, + "_resource($id)", atom_l_max_noret>; + def ATOM_L_MIN_NORET : BinAtomNoRet<IL_OP_LDS_MIN, + "_resource($id)", atom_l_min_noret>; + def ATOM_L_UMAX_NORET : BinAtomNoRet<IL_OP_LDS_UMAX, + "_resource($id)", atom_l_umax_noret>; + def ATOM_L_UMIN_NORET : BinAtomNoRet<IL_OP_LDS_UMIN, + "_resource($id)", atom_l_umin_noret>; + def ATOM_L_MSKOR_NORET : TriAtomNoRet<IL_OP_LDS_MSKOR, + "_resource($id)", atom_l_mskor_noret>; + def ATOM_L_OR_NORET : BinAtomNoRet<IL_OP_LDS_OR, + "_resource($id)", atom_l_or_noret>; + def ATOM_L_RSUB_NORET : BinAtomNoRet<IL_OP_LDS_RSUB, + "_resource($id)", atom_l_rsub_noret>; + def ATOM_L_SUB_NORET : BinAtomNoRet<IL_OP_LDS_SUB, + "_resource($id)", atom_l_sub_noret>; + def ATOM_L_XOR_NORET : BinAtomNoRet<IL_OP_LDS_XOR, + "_resource($id)", atom_l_xor_noret>; + def ATOM_L_INC_NORET : BinAtomNoRet<IL_OP_LDS_INC, + "_resource($id)", atom_l_inc_noret>; + def ATOM_L_DEC_NORET : BinAtomNoRet<IL_OP_LDS_DEC, + "_resource($id)", atom_l_dec_noret>; + def ATOM_L_CMPXCHG_NORET : TriAtomNoRet<IL_OP_LDS_CMP, + "_resource($id)", atom_l_cmpxchg_noret>; + def ATOM_R_ADD_NORET : BinAtomNoRet<IL_OP_GDS_ADD, + "_resource($id)", atom_r_add_noret>; + def ATOM_R_AND_NORET : BinAtomNoRet<IL_OP_GDS_AND, + "_resource($id)", atom_r_and_noret>; + def ATOM_R_MAX_NORET : BinAtomNoRet<IL_OP_GDS_MAX, + "_resource($id)", atom_r_max_noret>; + def ATOM_R_MIN_NORET : BinAtomNoRet<IL_OP_GDS_MIN, + "_resource($id)", atom_r_min_noret>; + def ATOM_R_UMAX_NORET : BinAtomNoRet<IL_OP_GDS_UMAX, + "_resource($id)", atom_r_umax_noret>; + def ATOM_R_UMIN_NORET : BinAtomNoRet<IL_OP_GDS_UMIN, + "_resource($id)", atom_r_umin_noret>; + def ATOM_R_MSKOR_NORET : TriAtomNoRet<IL_OP_GDS_MSKOR, + "_resource($id)", atom_r_mskor_noret>; + def ATOM_R_OR_NORET : BinAtomNoRet<IL_OP_GDS_OR, + "_resource($id)", atom_r_or_noret>; + def ATOM_R_RSUB_NORET : BinAtomNoRet<IL_OP_GDS_RSUB, + "_resource($id)", 
atom_r_rsub_noret>; + def ATOM_R_SUB_NORET : BinAtomNoRet<IL_OP_GDS_SUB, + "_resource($id)", atom_r_sub_noret>; + def ATOM_R_XOR_NORET : BinAtomNoRet<IL_OP_GDS_XOR, + "_resource($id)", atom_r_xor_noret>; + def ATOM_R_INC_NORET : BinAtomNoRet<IL_OP_GDS_INC, + "_resource($id)", atom_r_inc_noret>; + def ATOM_R_DEC_NORET : BinAtomNoRet<IL_OP_GDS_DEC, + "_resource($id)", atom_r_dec_noret>; + def ATOM_R_CMPXCHG_NORET : CmpXChgNoRet<IL_OP_GDS_CMP, + "_resource($id)", atom_r_cmpxchg_noret>; + def APPEND_ALLOC_NORET : AppendNoRet<IL_OP_APPEND_BUF_ALLOC, + "_id($id)", append_alloc_noret>; + def APPEND_CONSUME_NORET : AppendNoRet<IL_OP_APPEND_BUF_CONSUME, + "_id($id)", append_consume_noret>; + // All of the atomic functions that return + def ATOM_G_ADD : BinAtom<IL_OP_UAV_READ_ADD, + "_id($id)", atom_g_add>; + def ATOM_G_AND : BinAtom<IL_OP_UAV_READ_AND, + "_id($id)", atom_g_and>; + def ATOM_G_MAX : BinAtom<IL_OP_UAV_READ_MAX, + "_id($id)", atom_g_max>; + def ATOM_G_MIN : BinAtom<IL_OP_UAV_READ_MIN, + "_id($id)", atom_g_min>; + def ATOM_G_UMAX : BinAtom<IL_OP_UAV_READ_UMAX, + "_id($id)", atom_g_umax>; + def ATOM_G_UMIN : BinAtom<IL_OP_UAV_READ_UMIN, + "_id($id)", atom_g_umin>; + def ATOM_G_OR : BinAtom<IL_OP_UAV_READ_OR, + "_id($id)", atom_g_or>; + def ATOM_G_RSUB : BinAtom<IL_OP_UAV_READ_RSUB, + "_id($id)", atom_g_rsub>; + def ATOM_G_SUB : BinAtom<IL_OP_UAV_READ_SUB, + "_id($id)", atom_g_sub>; + def ATOM_G_XOR : BinAtom<IL_OP_UAV_READ_XOR, + "_id($id)", atom_g_xor>; + def ATOM_G_INC : BinAtom<IL_OP_UAV_READ_INC, + "_id($id)", atom_g_inc>; + def ATOM_G_DEC : BinAtom<IL_OP_UAV_READ_DEC, + "_id($id)", atom_g_dec>; + def ATOM_G_XCHG : BinAtom<IL_OP_UAV_READ_XCHG, + "_id($id)", atom_g_xchg>; + def ATOM_G_CMPXCHG : CmpXChg<IL_OP_UAV_READ_CMPXCHG, + "_id($id)", atom_g_cmpxchg>; + // Arena atomic accesses + def ATOM_A_ADD : BinAtom<IL_OP_UAV_READ_ADD, + "_id($id)_arena", atom_g_add>; + def ATOM_A_AND : BinAtom<IL_OP_UAV_READ_AND, + "_id($id)_arena", atom_g_and>; + def ATOM_A_MAX : 
BinAtom<IL_OP_UAV_READ_MAX, + "_id($id)_arena", atom_g_max>; + def ATOM_A_MIN : BinAtom<IL_OP_UAV_READ_MIN, + "_id($id)_arena", atom_g_min>; + def ATOM_A_UMAX : BinAtom<IL_OP_UAV_READ_UMAX, + "_id($id)_arena", atom_g_umax>; + def ATOM_A_UMIN : BinAtom<IL_OP_UAV_READ_UMIN, + "_id($id)_arena", atom_g_umin>; + def ATOM_A_OR : BinAtom<IL_OP_UAV_READ_OR, + "_id($id)_arena", atom_g_or>; + def ATOM_A_RSUB : BinAtom<IL_OP_UAV_READ_RSUB, + "_id($id)_arena", atom_g_rsub>; + def ATOM_A_SUB : BinAtom<IL_OP_UAV_READ_SUB, + "_id($id)_arena", atom_g_sub>; + def ATOM_A_XOR : BinAtom<IL_OP_UAV_READ_XOR, + "_id($id)_arena", atom_g_xor>; + def ATOM_A_INC : BinAtom<IL_OP_UAV_READ_INC, + "_id($id)_arena", atom_g_inc>; + def ATOM_A_DEC : BinAtom<IL_OP_UAV_READ_DEC, + "_id($id)_arena", atom_g_dec>; + def ATOM_A_XCHG : BinAtom<IL_OP_UAV_READ_XCHG, + "_id($id)_arena", atom_g_xchg>; + def ATOM_A_CMPXCHG : CmpXChg<IL_OP_UAV_READ_CMPXCHG, + "_id($id)_arena", atom_g_cmpxchg>; + def ATOM_L_ADD : BinAtom<IL_OP_LDS_READ_ADD, + "_resource($id)", atom_l_add>; + def ATOM_L_AND : BinAtom<IL_OP_LDS_READ_AND, + "_resource($id)", atom_l_and>; + def ATOM_L_MAX : BinAtom<IL_OP_LDS_READ_MAX, + "_resource($id)", atom_l_max>; + def ATOM_L_MIN : BinAtom<IL_OP_LDS_READ_MIN, + "_resource($id)", atom_l_min>; + def ATOM_L_UMAX : BinAtom<IL_OP_LDS_READ_UMAX, + "_resource($id)", atom_l_umax>; + def ATOM_L_UMIN : BinAtom<IL_OP_LDS_READ_UMIN, + "_resource($id)", atom_l_umin>; + def ATOM_L_OR : BinAtom<IL_OP_LDS_READ_OR, + "_resource($id)", atom_l_or>; + def ATOM_L_MSKOR : TriAtom<IL_OP_LDS_READ_MSKOR, + "_resource($id)", atom_l_mskor>; + def ATOM_L_RSUB : BinAtom<IL_OP_LDS_READ_RSUB, + "_resource($id)", atom_l_rsub>; + def ATOM_L_SUB : BinAtom<IL_OP_LDS_READ_SUB, + "_resource($id)", atom_l_sub>; + def ATOM_L_XOR : BinAtom<IL_OP_LDS_READ_XOR, + "_resource($id)", atom_l_xor>; + def ATOM_L_INC : BinAtom<IL_OP_LDS_READ_INC, + "_resource($id)", atom_l_inc>; + def ATOM_L_DEC : BinAtom<IL_OP_LDS_READ_DEC, + 
"_resource($id)", atom_l_dec>; + def ATOM_L_XCHG : BinAtom<IL_OP_LDS_READ_XCHG, + "_resource($id)", atom_l_xchg>; + def ATOM_L_CMPXCHG : TriAtom<IL_OP_LDS_READ_CMPXCHG, + "_resource($id)", atom_l_cmpxchg>; + def ATOM_R_ADD : BinAtom<IL_OP_GDS_READ_ADD, + "_resource($id)", atom_r_add>; + def ATOM_R_AND : BinAtom<IL_OP_GDS_READ_AND, + "_resource($id)", atom_r_and>; + def ATOM_R_MAX : BinAtom<IL_OP_GDS_READ_MAX, + "_resource($id)", atom_r_max>; + def ATOM_R_MIN : BinAtom<IL_OP_GDS_READ_MIN, + "_resource($id)", atom_r_min>; + def ATOM_R_UMAX : BinAtom<IL_OP_GDS_READ_UMAX, + "_resource($id)", atom_r_umax>; + def ATOM_R_UMIN : BinAtom<IL_OP_GDS_READ_UMIN, + "_resource($id)", atom_r_umin>; + def ATOM_R_OR : BinAtom<IL_OP_GDS_READ_OR, + "_resource($id)", atom_r_or>; + def ATOM_R_MSKOR : TriAtom<IL_OP_GDS_READ_MSKOR, + "_resource($id)", atom_r_mskor>; + def ATOM_R_RSUB : BinAtom<IL_OP_GDS_READ_RSUB, + "_resource($id)", atom_r_rsub>; + def ATOM_R_SUB : BinAtom<IL_OP_GDS_READ_SUB, + "_resource($id)", atom_r_sub>; + def ATOM_R_XOR : BinAtom<IL_OP_GDS_READ_XOR, + "_resource($id)", atom_r_xor>; + def ATOM_R_INC : BinAtom<IL_OP_GDS_READ_INC, + "_resource($id)", atom_r_inc>; + def ATOM_R_DEC : BinAtom<IL_OP_GDS_READ_DEC, + "_resource($id)", atom_r_dec>; + def ATOM_R_XCHG : BinAtom<IL_OP_GDS_READ_XCHG, + "_resource($id)", atom_r_xchg>; + def ATOM_R_CMPXCHG : CmpXChg<IL_OP_GDS_READ_CMPXCHG, + "_resource($id)", atom_r_cmpxchg>; + def APPEND_ALLOC : Append<IL_OP_APPEND_BUF_ALLOC, + "_id($id)", append_alloc>; + def APPEND_CONSUME : Append<IL_OP_APPEND_BUF_CONSUME, + "_id($id)", append_consume>; +} +} +let Predicates = [Has64BitPtr] in { +// All of the image functions +def IMAGE1D64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_read_norm ADDR64:$ptr, GPRI32:$sampler, 
GPRV4F32:$addy))]>; +def IMAGE1DA64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_array_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2D64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2DA64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_array_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE3D64_READ : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(normalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image3d_read_norm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1D64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1DA64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image1d_array_read_unnorm 
ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2D64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE2DA64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image2d_array_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE3D64_READ_UNNORM : ILFormat<IL_OP_SAMPLE, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr, GPRI32:$sampler, GPRV4F32:$addy), + !strconcat(IL_OP_SAMPLE.Text, + "_resource($ptr)_sampler($sampler)_coordtype(unnormalized) $dst, $addy"), + [(set GPRV4I32:$dst, + (int_AMDIL_image3d_read_unnorm ADDR64:$ptr, GPRI32:$sampler, GPRV4F32:$addy))]>; +def IMAGE1D64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_info0 ADDR64:$ptr))]>; +def IMAGE1D64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_info1 ADDR64:$ptr))]>; +def IMAGE1DA64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info0 ADDR64:$ptr))]>; +def IMAGE1DA64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image1d_array_info1 ADDR64:$ptr))]>; +def IMAGE2DA64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " 
$dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info0 ADDR64:$ptr))]>; +def IMAGE2DA64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_array_info1 ADDR64:$ptr))]>; +def IMAGE2D64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_info0 ADDR64:$ptr))]>; +def IMAGE2D64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image2d_info1 ADDR64:$ptr))]>; +def IMAGE3D64_INFO0 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image3d_info0 ADDR64:$ptr))]>; +def IMAGE3D64_INFO1 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins MEMI64:$ptr), + !strconcat(IL_OP_MOV.Text, " $dst, $ptr"), + [(set GPRV4I32:$dst, (int_AMDIL_image3d_info1 ADDR64:$ptr))]>; +def IMAGE1D64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image1d_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE1DA64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image1d_array_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE2D64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image2d_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE2DA64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI64:$ptr, GPRV2I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, 
$data"), + [(int_AMDIL_image2d_array_write ADDR64:$ptr, GPRV2I32:$addy, GPRV4I32:$data)]>; +def IMAGE3D64_WRITE : ILFormat<IL_OP_UAV_STORE, (outs), + (ins MEMI64:$ptr, GPRV4I32:$addy, GPRV4I32:$data), + !strconcat(IL_OP_UAV_STORE.Text, + "_id($ptr) $addy, $data"), + [(int_AMDIL_image3d_write ADDR64:$ptr, GPRV4I32:$addy, GPRV4I32:$data)]>; +let hasSideEffects= 1 in { + // All of the noret atomic functions + def ATOM_G64_ADD_NORET : BinAtomNoRet64<IL_OP_UAV_ADD, + "_id($id)", atom_g_add_noret>; + def ATOM_G64_AND_NORET : BinAtomNoRet64<IL_OP_UAV_AND, + "_id($id)", atom_g_and_noret>; + def ATOM_G64_MAX_NORET : BinAtomNoRet64<IL_OP_UAV_MAX, + "_id($id)", atom_g_max_noret>; + def ATOM_G64_MIN_NORET : BinAtomNoRet64<IL_OP_UAV_MIN, + "_id($id)", atom_g_min_noret>; + def ATOM_G64_UMAX_NORET : BinAtomNoRet64<IL_OP_UAV_UMAX, + "_id($id)", atom_g_umax_noret>; + def ATOM_G64_UMIN_NORET : BinAtomNoRet64<IL_OP_UAV_UMIN, + "_id($id)", atom_g_umin_noret>; + def ATOM_G64_OR_NORET : BinAtomNoRet64<IL_OP_UAV_OR, + "_id($id)", atom_g_or_noret>; + def ATOM_G64_RSUB_NORET : BinAtomNoRet64<IL_OP_UAV_RSUB, + "_id($id)", atom_g_rsub_noret>; + def ATOM_G64_SUB_NORET : BinAtomNoRet64<IL_OP_UAV_SUB, + "_id($id)", atom_g_sub_noret>; + def ATOM_G64_XOR_NORET : BinAtomNoRet64<IL_OP_UAV_XOR, + "_id($id)", atom_g_xor_noret>; + def ATOM_G64_INC_NORET : BinAtomNoRet64<IL_OP_UAV_INC, + "_id($id)", atom_g_inc_noret>; + def ATOM_G64_DEC_NORET : BinAtomNoRet64<IL_OP_UAV_DEC, + "_id($id)", atom_g_dec_noret>; + def ATOM_G64_CMPXCHG_NORET : CmpXChgNoRet64<IL_OP_UAV_CMP, + "_id($id)", atom_g_cmpxchg_noret>; + def ATOM_A64_ADD_NORET : BinAtomNoRet64<IL_OP_UAV_ADD, + "_id($id)_arena", atom_g_add_noret>; + def ATOM_A64_AND_NORET : BinAtomNoRet64<IL_OP_UAV_AND, + "_id($id)_arena", atom_g_and_noret>; + def ATOM_A64_MAX_NORET : BinAtomNoRet64<IL_OP_UAV_MAX, + "_id($id)_arena", atom_g_max_noret>; + def ATOM_A64_MIN_NORET : BinAtomNoRet64<IL_OP_UAV_MIN, + "_id($id)_arena", atom_g_min_noret>; + def 
ATOM_A64_UMAX_NORET : BinAtomNoRet64<IL_OP_UAV_UMAX, + "_id($id)_arena", atom_g_umax_noret>; + def ATOM_A64_UMIN_NORET : BinAtomNoRet64<IL_OP_UAV_UMIN, + "_id($id)_arena", atom_g_umin_noret>; + def ATOM_A64_OR_NORET : BinAtomNoRet64<IL_OP_UAV_OR, + "_id($id)_arena", atom_g_or_noret>; + def ATOM_A64_RSUB_NORET : BinAtomNoRet64<IL_OP_UAV_RSUB, + "_id($id)_arena", atom_g_rsub_noret>; + def ATOM_A64_SUB_NORET : BinAtomNoRet64<IL_OP_UAV_SUB, + "_id($id)_arena", atom_g_sub_noret>; + def ATOM_A64_XOR_NORET : BinAtomNoRet64<IL_OP_UAV_XOR, + "_id($id)_arena", atom_g_xor_noret>; + def ATOM_A64_INC_NORET : BinAtomNoRet64<IL_OP_UAV_INC, + "_id($id)_arena", atom_g_inc_noret>; + def ATOM_A64_DEC_NORET : BinAtomNoRet64<IL_OP_UAV_DEC, + "_id($id)_arena", atom_g_dec_noret>; + def ATOM_A64_CMPXCHG_NORET : CmpXChgNoRet64<IL_OP_UAV_CMP, + "_id($id)_arena", atom_g_cmpxchg_noret>; + def ATOM_L64_ADD_NORET : BinAtomNoRet64<IL_OP_LDS_ADD, + "_resource($id)", atom_l_add_noret>; + def ATOM_L64_AND_NORET : BinAtomNoRet64<IL_OP_LDS_AND, + "_resource($id)", atom_l_and_noret>; + def ATOM_L64_MAX_NORET : BinAtomNoRet64<IL_OP_LDS_MAX, + "_resource($id)", atom_l_max_noret>; + def ATOM_L64_MIN_NORET : BinAtomNoRet64<IL_OP_LDS_MIN, + "_resource($id)", atom_l_min_noret>; + def ATOM_L64_UMAX_NORET : BinAtomNoRet64<IL_OP_LDS_UMAX, + "_resource($id)", atom_l_umax_noret>; + def ATOM_L64_UMIN_NORET : BinAtomNoRet64<IL_OP_LDS_UMIN, + "_resource($id)", atom_l_umin_noret>; + def ATOM_L64_MSKOR_NORET : TriAtomNoRet64<IL_OP_LDS_MSKOR, + "_resource($id)", atom_l_mskor_noret>; + def ATOM_L64_OR_NORET : BinAtomNoRet64<IL_OP_LDS_OR, + "_resource($id)", atom_l_or_noret>; + def ATOM_L64_RSUB_NORET : BinAtomNoRet64<IL_OP_LDS_RSUB, + "_resource($id)", atom_l_rsub_noret>; + def ATOM_L64_SUB_NORET : BinAtomNoRet64<IL_OP_LDS_SUB, + "_resource($id)", atom_l_sub_noret>; + def ATOM_L64_XOR_NORET : BinAtomNoRet64<IL_OP_LDS_XOR, + "_resource($id)", atom_l_xor_noret>; + def ATOM_L64_INC_NORET : BinAtomNoRet64<IL_OP_LDS_INC, + 
"_resource($id)", atom_l_inc_noret>; + def ATOM_L64_DEC_NORET : BinAtomNoRet64<IL_OP_LDS_DEC, + "_resource($id)", atom_l_dec_noret>; + def ATOM_L64_CMPXCHG_NORET : TriAtomNoRet64<IL_OP_LDS_CMP, + "_resource($id)", atom_l_cmpxchg_noret>; + def ATOM_R64_ADD_NORET : BinAtomNoRet64<IL_OP_GDS_ADD, + "_resource($id)", atom_r_add_noret>; + def ATOM_R64_AND_NORET : BinAtomNoRet64<IL_OP_GDS_AND, + "_resource($id)", atom_r_and_noret>; + def ATOM_R64_MAX_NORET : BinAtomNoRet64<IL_OP_GDS_MAX, + "_resource($id)", atom_r_max_noret>; + def ATOM_R64_MIN_NORET : BinAtomNoRet64<IL_OP_GDS_MIN, + "_resource($id)", atom_r_min_noret>; + def ATOM_R64_UMAX_NORET : BinAtomNoRet64<IL_OP_GDS_UMAX, + "_resource($id)", atom_r_umax_noret>; + def ATOM_R64_UMIN_NORET : BinAtomNoRet64<IL_OP_GDS_UMIN, + "_resource($id)", atom_r_umin_noret>; + def ATOM_R64_MSKOR_NORET : TriAtomNoRet64<IL_OP_GDS_MSKOR, + "_resource($id)", atom_r_mskor_noret>; + def ATOM_R64_OR_NORET : BinAtomNoRet64<IL_OP_GDS_OR, + "_resource($id)", atom_r_or_noret>; + def ATOM_R64_RSUB_NORET : BinAtomNoRet64<IL_OP_GDS_RSUB, + "_resource($id)", atom_r_rsub_noret>; + def ATOM_R64_SUB_NORET : BinAtomNoRet64<IL_OP_GDS_SUB, + "_resource($id)", atom_r_sub_noret>; + def ATOM_R64_XOR_NORET : BinAtomNoRet64<IL_OP_GDS_XOR, + "_resource($id)", atom_r_xor_noret>; + def ATOM_R64_INC_NORET : BinAtomNoRet64<IL_OP_GDS_INC, + "_resource($id)", atom_r_inc_noret>; + def ATOM_R64_DEC_NORET : BinAtomNoRet64<IL_OP_GDS_DEC, + "_resource($id)", atom_r_dec_noret>; + def ATOM_R64_CMPXCHG_NORET : CmpXChgNoRet64<IL_OP_GDS_CMP, + "_resource($id)", atom_r_cmpxchg_noret>; + def APPEND_ALLOC64_NORET : AppendNoRet64<IL_OP_APPEND_BUF_ALLOC, + "_id($id)", append_alloc_noret>; + def APPEND_CONSUME64_NORET : AppendNoRet64<IL_OP_APPEND_BUF_CONSUME, + "_id($id)", append_consume_noret>; + // All of the atomic functions that return + def ATOM_G64_ADD : BinAtom64<IL_OP_UAV_READ_ADD, + "_id($id)", atom_g_add>; + def ATOM_G64_AND : BinAtom64<IL_OP_UAV_READ_AND, + "_id($id)", 
atom_g_and>; + def ATOM_G64_MAX : BinAtom64<IL_OP_UAV_READ_MAX, + "_id($id)", atom_g_max>; + def ATOM_G64_MIN : BinAtom64<IL_OP_UAV_READ_MIN, + "_id($id)", atom_g_min>; + def ATOM_G64_UMAX : BinAtom64<IL_OP_UAV_READ_UMAX, + "_id($id)", atom_g_umax>; + def ATOM_G64_UMIN : BinAtom64<IL_OP_UAV_READ_UMIN, + "_id($id)", atom_g_umin>; + def ATOM_G64_OR : BinAtom64<IL_OP_UAV_READ_OR, + "_id($id)", atom_g_or>; + def ATOM_G64_RSUB : BinAtom64<IL_OP_UAV_READ_RSUB, + "_id($id)", atom_g_rsub>; + def ATOM_G64_SUB : BinAtom64<IL_OP_UAV_READ_SUB, + "_id($id)", atom_g_sub>; + def ATOM_G64_XOR : BinAtom64<IL_OP_UAV_READ_XOR, + "_id($id)", atom_g_xor>; + def ATOM_G64_INC : BinAtom64<IL_OP_UAV_READ_INC, + "_id($id)", atom_g_inc>; + def ATOM_G64_DEC : BinAtom64<IL_OP_UAV_READ_DEC, + "_id($id)", atom_g_dec>; + def ATOM_G64_XCHG : BinAtom64<IL_OP_UAV_READ_XCHG, + "_id($id)", atom_g_xchg>; + def ATOM_G64_CMPXCHG : CmpXChg64<IL_OP_UAV_READ_CMPXCHG, + "_id($id)", atom_g_cmpxchg>; + // Arena atomic accesses + def ATOM_A64_ADD : BinAtom64<IL_OP_UAV_READ_ADD, + "_id($id)_arena", atom_g_add>; + def ATOM_A64_AND : BinAtom64<IL_OP_UAV_READ_AND, + "_id($id)_arena", atom_g_and>; + def ATOM_A64_MAX : BinAtom64<IL_OP_UAV_READ_MAX, + "_id($id)_arena", atom_g_max>; + def ATOM_A64_MIN : BinAtom64<IL_OP_UAV_READ_MIN, + "_id($id)_arena", atom_g_min>; + def ATOM_A64_UMAX : BinAtom64<IL_OP_UAV_READ_UMAX, + "_id($id)_arena", atom_g_umax>; + def ATOM_A64_UMIN : BinAtom64<IL_OP_UAV_READ_UMIN, + "_id($id)_arena", atom_g_umin>; + def ATOM_A64_OR : BinAtom64<IL_OP_UAV_READ_OR, + "_id($id)_arena", atom_g_or>; + def ATOM_A64_RSUB : BinAtom64<IL_OP_UAV_READ_RSUB, + "_id($id)_arena", atom_g_rsub>; + def ATOM_A64_SUB : BinAtom64<IL_OP_UAV_READ_SUB, + "_id($id)_arena", atom_g_sub>; + def ATOM_A64_XOR : BinAtom64<IL_OP_UAV_READ_XOR, + "_id($id)_arena", atom_g_xor>; + def ATOM_A64_INC : BinAtom64<IL_OP_UAV_READ_INC, + "_id($id)_arena", atom_g_inc>; + def ATOM_A64_DEC : BinAtom64<IL_OP_UAV_READ_DEC, + "_id($id)_arena", 
atom_g_dec>; + def ATOM_A64_XCHG : BinAtom64<IL_OP_UAV_READ_XCHG, + "_id($id)_arena", atom_g_xchg>; + def ATOM_A64_CMPXCHG : CmpXChg64<IL_OP_UAV_READ_CMPXCHG, + "_id($id)_arena", atom_g_cmpxchg>; + def ATOM_L64_ADD : BinAtom64<IL_OP_LDS_READ_ADD, + "_resource($id)", atom_l_add>; + def ATOM_L64_AND : BinAtom64<IL_OP_LDS_READ_AND, + "_resource($id)", atom_l_and>; + def ATOM_L64_MAX : BinAtom64<IL_OP_LDS_READ_MAX, + "_resource($id)", atom_l_max>; + def ATOM_L64_MIN : BinAtom64<IL_OP_LDS_READ_MIN, + "_resource($id)", atom_l_min>; + def ATOM_L64_UMAX : BinAtom64<IL_OP_LDS_READ_UMAX, + "_resource($id)", atom_l_umax>; + def ATOM_L64_UMIN : BinAtom64<IL_OP_LDS_READ_UMIN, + "_resource($id)", atom_l_umin>; + def ATOM_L64_OR : BinAtom64<IL_OP_LDS_READ_OR, + "_resource($id)", atom_l_or>; + def ATOM_L64_MSKOR : TriAtom64<IL_OP_LDS_READ_MSKOR, + "_resource($id)", atom_l_mskor>; + def ATOM_L64_RSUB : BinAtom64<IL_OP_LDS_READ_RSUB, + "_resource($id)", atom_l_rsub>; + def ATOM_L64_SUB : BinAtom64<IL_OP_LDS_READ_SUB, + "_resource($id)", atom_l_sub>; + def ATOM_L64_XOR : BinAtom64<IL_OP_LDS_READ_XOR, + "_resource($id)", atom_l_xor>; + def ATOM_L64_INC : BinAtom64<IL_OP_LDS_READ_INC, + "_resource($id)", atom_l_inc>; + def ATOM_L64_DEC : BinAtom64<IL_OP_LDS_READ_DEC, + "_resource($id)", atom_l_dec>; + def ATOM_L64_XCHG : BinAtom64<IL_OP_LDS_READ_XCHG, + "_resource($id)", atom_l_xchg>; + def ATOM_L64_CMPXCHG : TriAtom64<IL_OP_LDS_READ_CMPXCHG, + "_resource($id)", atom_l_cmpxchg>; + def ATOM_R64_ADD : BinAtom64<IL_OP_GDS_READ_ADD, + "_resource($id)", atom_r_add>; + def ATOM_R64_AND : BinAtom64<IL_OP_GDS_READ_AND, + "_resource($id)", atom_r_and>; + def ATOM_R64_MAX : BinAtom64<IL_OP_GDS_READ_MAX, + "_resource($id)", atom_r_max>; + def ATOM_R64_MIN : BinAtom64<IL_OP_GDS_READ_MIN, + "_resource($id)", atom_r_min>; + def ATOM_R64_UMAX : BinAtom64<IL_OP_GDS_READ_UMAX, + "_resource($id)", atom_r_umax>; + def ATOM_R64_UMIN : BinAtom64<IL_OP_GDS_READ_UMIN, + "_resource($id)", atom_r_umin>; + def 
ATOM_R64_OR : BinAtom64<IL_OP_GDS_READ_OR, + "_resource($id)", atom_r_or>; + def ATOM_R64_MSKOR : TriAtom64<IL_OP_GDS_READ_MSKOR, + "_resource($id)", atom_r_mskor>; + def ATOM_R64_RSUB : BinAtom64<IL_OP_GDS_READ_RSUB, + "_resource($id)", atom_r_rsub>; + def ATOM_R64_SUB : BinAtom64<IL_OP_GDS_READ_SUB, + "_resource($id)", atom_r_sub>; + def ATOM_R64_XOR : BinAtom64<IL_OP_GDS_READ_XOR, + "_resource($id)", atom_r_xor>; + def ATOM_R64_INC : BinAtom64<IL_OP_GDS_READ_INC, + "_resource($id)", atom_r_inc>; + def ATOM_R64_DEC : BinAtom64<IL_OP_GDS_READ_DEC, + "_resource($id)", atom_r_dec>; + def ATOM_R64_XCHG : BinAtom64<IL_OP_GDS_READ_XCHG, + "_resource($id)", atom_r_xchg>; + def ATOM_R64_CMPXCHG : CmpXChg64<IL_OP_GDS_READ_CMPXCHG, + "_resource($id)", atom_r_cmpxchg>; + def APPEND_ALLOC64 : Append64<IL_OP_APPEND_BUF_ALLOC, + "_id($id)", append_alloc>; + def APPEND_CONSUME64 : Append64<IL_OP_APPEND_BUF_CONSUME, + "_id($id)", append_consume>; +} +} +/* +def SEMAPHORE_INIT : BinaryOpNoRet<IL_OP_SEMAPHORE_INIT, (outs), + (ins MEMI32:$ptr, i32imm:$val), + !strconcat(IL_OP_SEMAPHORE_INIT.Text, "_id($ptr)_value($val)"), + [(int_AMDIL_semaphore_init ADDR:$ptr, timm:$val)]>; + +def SEMAPHORE_WAIT : UnaryOpNoRet<IL_OP_SEMAPHORE_WAIT, (outs), + (ins MEMI32:$ptr), + !strconcat(IL_OP_SEMAPHORE_WAIT.Text, "_id($ptr)"), + [(int_AMDIL_semaphore_wait ADDR:$ptr)]>; + +def SEMAPHORE_SIGNAL : UnaryOpNoRet<IL_OP_SEMAPHORE_SIGNAL, (outs), + (ins MEMI32:$ptr), + !strconcat(IL_OP_SEMAPHORE_SIGNAL.Text, "_id($ptr)"), + [(int_AMDIL_semaphore_signal ADDR:$ptr)]>; +*/ diff --git a/src/gallium/drivers/radeon/AMDILIntrinsicInfo.cpp b/src/gallium/drivers/radeon/AMDILIntrinsicInfo.cpp new file mode 100644 index 00000000000..75729ac01a3 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILIntrinsicInfo.cpp @@ -0,0 +1,190 @@ +//===- AMDILIntrinsicInfo.cpp - AMDIL Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of 
Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file contains the AMDIL Implementation of the IntrinsicInfo class. +// +//===-----------------------------------------------------------------------===// + +#include "AMDILIntrinsicInfo.h" +#include "AMDIL.h" +#include "AMDILTargetMachine.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" + +using namespace llvm; + +#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN +#include "AMDILGenIntrinsics.inc" +#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN + +AMDILIntrinsicInfo::AMDILIntrinsicInfo(AMDILTargetMachine *tm) + : TargetIntrinsicInfo(), mTM(tm) +{ +} + +std::string +AMDILIntrinsicInfo::getName(unsigned int IntrID, Type **Tys, + unsigned int numTys) const +{ + static const char* const names[] = { +#define GET_INTRINSIC_NAME_TABLE +#include "AMDILGenIntrinsics.inc" +#undef GET_INTRINSIC_NAME_TABLE + }; + + //assert(!isOverloaded(IntrID) + //&& "AMDIL Intrinsics are not overloaded"); + if (IntrID < Intrinsic::num_intrinsics) { + return 0; + } + assert(IntrID < AMDGPUIntrinsic::num_AMDIL_intrinsics + && "Invalid intrinsic ID"); + + std::string Result(names[IntrID - Intrinsic::num_intrinsics]); + return Result; +} + + static bool +checkTruncation(const char *Name, unsigned int& Len) +{ + const char *ptr = Name + (Len - 1); + while(ptr != Name && *ptr != '_') { + --ptr; + } + // We don't want to truncate on atomic instructions + // but we do want to enter the check Truncation + // section so that we can translate the atomic + // instructions if we need to. 
+ if (!strncmp(Name, "__atom", 6)) { + return true; + } + if (strstr(ptr, "i32") + || strstr(ptr, "u32") + || strstr(ptr, "i64") + || strstr(ptr, "u64") + || strstr(ptr, "f32") + || strstr(ptr, "f64") + || strstr(ptr, "i16") + || strstr(ptr, "u16") + || strstr(ptr, "i8") + || strstr(ptr, "u8")) { + Len = (unsigned int)(ptr - Name); + return true; + } + return false; +} + +// We don't want to support both the OpenCL 1.0 atomics +// and the 1.1 atomics with different names, so we translate +// the 1.0 atomics to the 1.1 naming here if needed. +static char* +atomTranslateIfNeeded(const char *Name, unsigned int Len) +{ + char *buffer = NULL; + if (strncmp(Name, "__atom_", 7)) { + // If we are not starting with __atom_, then + // go ahead and continue on with the allocation. + buffer = new char[Len + 1]; + memcpy(buffer, Name, Len); + } else { + buffer = new char[Len + 3]; + memcpy(buffer, "__atomic_", 9); + memcpy(buffer + 9, Name + 7, Len - 7); + Len += 2; + } + buffer[Len] = '\0'; + return buffer; +} + +unsigned int +AMDILIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const +{ +#define GET_FUNCTION_RECOGNIZER +#include "AMDILGenIntrinsics.inc" +#undef GET_FUNCTION_RECOGNIZER + AMDGPUIntrinsic::ID IntrinsicID + = (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic; + if (checkTruncation(Name, Len)) { + char *buffer = atomTranslateIfNeeded(Name, Len); + IntrinsicID = getIntrinsicForGCCBuiltin("AMDIL", buffer); + delete [] buffer; + } else { + IntrinsicID = getIntrinsicForGCCBuiltin("AMDIL", Name); + } + if (!isValidIntrinsic(IntrinsicID)) { + return 0; + } + if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) { + return IntrinsicID; + } + return 0; +} + +bool +AMDILIntrinsicInfo::isOverloaded(unsigned id) const +{ + // Overload Table +#define GET_INTRINSIC_OVERLOAD_TABLE +#include "AMDILGenIntrinsics.inc" +#undef GET_INTRINSIC_OVERLOAD_TABLE +} + +/// This defines the "getAttributes(ID id)" method. 
+#define GET_INTRINSIC_ATTRIBUTES
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+
+/// Returns the declaration of intrinsic \p IntrID in module \p M, inserting
+/// it under the name given by getName(IntrID) if the module does not have it.
+///
+/// \p Tys and \p numTys are not referenced by the code visible here.
+Function*
+AMDILIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+    Type **Tys,
+    unsigned numTys) const
+{
+  assert(!isOverloaded(IntrID) && "AMDIL intrinsics are not overloaded");
+  AttrListPtr AList = getAttributes((AMDGPUIntrinsic::ID) IntrID);
+  LLVMContext& Context = M->getContext();
+  unsigned int id = IntrID;
+  Type *ResultTy = NULL;
+  std::vector<Type*> ArgTys;
+  bool IsVarArg = false;
+
+  // NOTE(review): the generated fragment below presumably fills in ResultTy,
+  // ArgTys and IsVarArg for 'id' and refers to these locals by name --
+  // confirm against AMDILGenIntrinsics.inc before renaming any of them.
+#define GET_INTRINSIC_GENERATOR
+#include "AMDILGenIntrinsics.inc"
+#undef GET_INTRINSIC_GENERATOR
+  // We need to add the resource ID argument for atomics.
+  // The test is a range check on the enum values, so it depends on the
+  // order in which the atomic intrinsics are declared.
+  if (id >= AMDGPUIntrinsic::AMDIL_atomic_add_gi32
+      && id <= AMDGPUIntrinsic::AMDIL_atomic_xor_ru32_noret) {
+    ArgTys.push_back(IntegerType::get(Context, 32));
+  }
+
+  return cast<Function>(M->getOrInsertFunction(getName(IntrID),
+        FunctionType::get(ResultTy, ArgTys, IsVarArg),
+        AList));
+}
+
+/// Because the code generator has to support different SC versions,
+/// this function is added to check that the intrinsic being used
+/// is actually valid. In the case where it isn't valid, the
+/// function call is not translated into an intrinsic and the
+/// fall back software emulated path should pick up the result.
+bool +AMDILIntrinsicInfo::isValidIntrinsic(unsigned int IntrID) const +{ + const AMDILSubtarget *stm = mTM->getSubtargetImpl(); + switch (IntrID) { + default: + return true; + case AMDGPUIntrinsic::AMDIL_convert_f32_i32_rpi: + case AMDGPUIntrinsic::AMDIL_convert_f32_i32_flr: + case AMDGPUIntrinsic::AMDIL_convert_f32_f16_near: + case AMDGPUIntrinsic::AMDIL_convert_f32_f16_neg_inf: + case AMDGPUIntrinsic::AMDIL_convert_f32_f16_plus_inf: + return stm->calVersion() >= CAL_VERSION_SC_139; + }; +} diff --git a/src/gallium/drivers/radeon/AMDILIntrinsicInfo.h b/src/gallium/drivers/radeon/AMDILIntrinsicInfo.h new file mode 100644 index 00000000000..513c6f06e85 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILIntrinsicInfo.h @@ -0,0 +1,49 @@ +//===- AMDILIntrinsicInfo.h - AMDIL Intrinsic Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface for the AMDIL Implementation of the Intrinsic Info class. 
+// +//===-----------------------------------------------------------------------===// +#ifndef _AMDIL_INTRINSICS_H_ +#define _AMDIL_INTRINSICS_H_ + +#include "llvm/Intrinsics.h" +#include "llvm/Target/TargetIntrinsicInfo.h" + +namespace llvm { + class AMDILTargetMachine; + namespace AMDGPUIntrinsic { + enum ID { + last_non_AMDIL_intrinsic = Intrinsic::num_intrinsics - 1, +#define GET_INTRINSIC_ENUM_VALUES +#include "AMDILGenIntrinsics.inc" +#undef GET_INTRINSIC_ENUM_VALUES + , num_AMDIL_intrinsics + }; + + } + + + class AMDILIntrinsicInfo : public TargetIntrinsicInfo { + AMDILTargetMachine *mTM; + public: + AMDILIntrinsicInfo(AMDILTargetMachine *tm); + std::string getName(unsigned int IntrId, Type **Tys = 0, + unsigned int numTys = 0) const; + unsigned int lookupName(const char *Name, unsigned int Len) const; + bool isOverloaded(unsigned int IID) const; + Function *getDeclaration(Module *M, unsigned int ID, + Type **Tys = 0, + unsigned int numTys = 0) const; + bool isValidIntrinsic(unsigned int) const; + }; // AMDILIntrinsicInfo +} + +#endif // _AMDIL_INTRINSICS_H_ + diff --git a/src/gallium/drivers/radeon/AMDILIntrinsics.td b/src/gallium/drivers/radeon/AMDILIntrinsics.td new file mode 100644 index 00000000000..ef361f4e6af --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILIntrinsics.td @@ -0,0 +1,705 @@ +//===- AMDILIntrinsics.td - Defines AMDIL Intrinscs -*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// +// This file defines all of the amdil-specific intrinsics +// +//===---------------------------------------------------------------===// + +let TargetPrefix = "AMDIL", isTarget = 1 in { +//------------- Synchronization Functions - OpenCL 6.11.9 --------------------// + def int_AMDIL_fence : GCCBuiltin<"mem_fence">, + UnaryIntNoRetInt; + def int_AMDIL_fence_global : GCCBuiltin<"mem_fence_global">, + UnaryIntNoRetInt; + def int_AMDIL_fence_local : GCCBuiltin<"mem_fence_local">, + UnaryIntNoRetInt; + def int_AMDIL_fence_region : GCCBuiltin<"mem_fence_region">, + UnaryIntNoRetInt; + def int_AMDIL_fence_read_only : GCCBuiltin<"read_mem_fence">, + UnaryIntNoRetInt; + def int_AMDIL_fence_read_only_global : GCCBuiltin<"read_mem_fence_global">, + UnaryIntNoRetInt; + def int_AMDIL_fence_read_only_local : GCCBuiltin<"read_mem_fence_local">, + UnaryIntNoRetInt; + def int_AMDIL_fence_read_only_region : GCCBuiltin<"read_mem_fence_region">, + UnaryIntNoRetInt; + def int_AMDIL_fence_write_only : GCCBuiltin<"write_mem_fence">, + UnaryIntNoRetInt; + def int_AMDIL_fence_write_only_global : GCCBuiltin<"write_mem_fence_global">, + UnaryIntNoRetInt; + def int_AMDIL_fence_write_only_local : GCCBuiltin<"write_mem_fence_local">, + UnaryIntNoRetInt; + def int_AMDIL_fence_write_only_region : GCCBuiltin<"write_mem_fence_region">, + UnaryIntNoRetInt; + + def int_AMDIL_early_exit : GCCBuiltin<"__amdil_early_exit">, + UnaryIntNoRetInt; + + def int_AMDIL_cmov_logical : GCCBuiltin<"__amdil_cmov_logical">, + TernaryIntInt; + def int_AMDIL_fabs : GCCBuiltin<"__amdil_fabs">, UnaryIntFloat; + def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; + + def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">, + TernaryIntInt; + def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">, + TernaryIntInt; + def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, + UnaryIntInt; + 
def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, + UnaryIntInt; + def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, + UnaryIntInt; + def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, + UnaryIntInt; + def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, + UnaryIntInt; + def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, + TernaryIntInt; + def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, + TernaryIntInt; + def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, + QuaternaryIntInt; + def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, + TernaryIntInt; + def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, + BinaryIntInt; + def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">, + TernaryIntInt; + def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">, + TernaryIntInt; + def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">, + TernaryIntFloat; + def int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, + BinaryIntInt; + def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, + BinaryIntInt; + def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, + BinaryIntInt; + def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, + BinaryIntInt; + def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, + BinaryIntInt; + def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, + BinaryIntInt; + def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">, + TernaryIntInt; + def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">, + TernaryIntInt; + def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, + BinaryIntInt; + def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, + BinaryIntInt; + def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, + BinaryIntInt; + def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, + BinaryIntInt; + def int_AMDIL_min : GCCBuiltin<"__amdil_min">, + BinaryIntFloat; + def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, + BinaryIntInt; + def int_AMDIL_max_u32 : 
GCCBuiltin<"__amdil_umax">, + BinaryIntInt; + def int_AMDIL_max : GCCBuiltin<"__amdil_max">, + BinaryIntFloat; + def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, + TernaryIntInt; + def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, + TernaryIntInt; + def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, + TernaryIntInt; + def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, + UnaryIntFloat; + def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, + TernaryIntFloat; + def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, + UnaryIntFloat; + def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, + UnaryIntFloat; + def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, + UnaryIntFloat; + def int_AMDIL_round_posinf : GCCBuiltin<"__amdil_round_posinf">, + UnaryIntFloat; + def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, + UnaryIntFloat; + def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, + UnaryIntFloat; + def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, + UnaryIntFloat; + def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, + UnaryIntFloat; + def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, + UnaryIntFloat; + def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, + UnaryIntFloat; + def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, + UnaryIntFloat; + def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, + UnaryIntFloat; + def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, + UnaryIntFloat; + def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; + def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; + def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; + def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, + UnaryIntFloat; + def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, + UnaryIntFloat; + def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, + UnaryIntFloat; + def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, + UnaryIntFloat; + def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, + UnaryIntFloat; + def 
int_AMDIL_log : GCCBuiltin<"__amdil_log">, + UnaryIntFloat; + def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, + UnaryIntFloat; + def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, + UnaryIntFloat; + def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, + UnaryIntFloat; + def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, + TernaryIntFloat; + def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, + UnaryIntFloat; + def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, + UnaryIntFloat; + def int_AMDIL_length : GCCBuiltin<"__amdil_length">, + UnaryIntFloat; + def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, + TernaryIntFloat; + def int_AMDIL_media_sad4 : GCCBuiltin<"__amdil_sad4">, + Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, + llvm_v4i32_ty, llvm_i32_ty], []>; + + def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, + Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; + def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, + Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; + def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, + Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; + def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, + ConvertIntITOF; + def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, + ConvertIntFTOI; + def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, + Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; + def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, + ConvertIntITOF; + def 
int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, + ConvertIntITOF; + def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty, llvm_float_ty], []>; + def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty], []>; + def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; + def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; +//===---------------------- Image functions begin ------------------------===// + def int_AMDIL_image1d_write : GCCBuiltin<"__amdil_image1d_write">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image1d_read_norm : GCCBuiltin<"__amdil_image1d_read_norm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image1d_read_unnorm : GCCBuiltin<"__amdil_image1d_read_unnorm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image1d_info0 : GCCBuiltin<"__amdil_image1d_info0">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image1d_info1 : GCCBuiltin<"__amdil_image1d_info1">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image1d_array_write : GCCBuiltin<"__amdil_image1d_array_write">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image1d_array_read_norm : GCCBuiltin<"__amdil_image1d_array_read_norm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image1d_array_read_unnorm : 
GCCBuiltin<"__amdil_image1d_array_read_unnorm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image1d_array_info0 : GCCBuiltin<"__amdil_image1d_array_info0">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image1d_array_info1 : GCCBuiltin<"__amdil_image1d_array_info1">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image2d_write : GCCBuiltin<"__amdil_image2d_write">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image2d_read_norm : GCCBuiltin<"__amdil_image2d_read_norm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image2d_read_unnorm : GCCBuiltin<"__amdil_image2d_read_unnorm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image2d_info0 : GCCBuiltin<"__amdil_image2d_info0">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image2d_info1 : GCCBuiltin<"__amdil_image2d_info1">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image2d_array_write : GCCBuiltin<"__amdil_image2d_array_write">, + Intrinsic<[], [llvm_ptr_ty, llvm_v2i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image2d_array_read_norm : GCCBuiltin<"__amdil_image2d_array_read_norm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image2d_array_read_unnorm : GCCBuiltin<"__amdil_image2d_array_read_unnorm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image2d_array_info0 : GCCBuiltin<"__amdil_image2d_array_info0">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image2d_array_info1 : GCCBuiltin<"__amdil_image2d_array_info1">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def 
int_AMDIL_image3d_write : GCCBuiltin<"__amdil_image3d_write">, + Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image3d_read_norm : GCCBuiltin<"__amdil_image3d_read_norm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image3d_read_unnorm : GCCBuiltin<"__amdil_image3d_read_unnorm">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_image3d_info0 : GCCBuiltin<"__amdil_image3d_info0">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + + def int_AMDIL_image3d_info1 : GCCBuiltin<"__amdil_image3d_info1">, + Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty], []>; + +//===---------------------- Image functions end --------------------------===// + + def int_AMDIL_append_alloc_i32 : GCCBuiltin<"__amdil_append_alloc">, + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>; + def int_AMDIL_append_consume_i32 : GCCBuiltin<"__amdil_append_consume">, + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>; + def int_AMDIL_append_alloc_i32_noret : GCCBuiltin<"__amdil_append_alloc_noret">, + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>; + def int_AMDIL_append_consume_i32_noret : GCCBuiltin<"__amdil_append_consume_noret">, + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrReadWriteArgMem]>; + + def int_AMDIL_get_global_id : GCCBuiltin<"__amdil_get_global_id_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_local_id : GCCBuiltin<"__amdil_get_local_id_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_group_id : GCCBuiltin<"__amdil_get_group_id_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_num_groups : GCCBuiltin<"__amdil_get_num_groups_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_local_size : GCCBuiltin<"__amdil_get_local_size_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_global_size : 
GCCBuiltin<"__amdil_get_global_size_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_global_offset : GCCBuiltin<"__amdil_get_global_offset_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; + def int_AMDIL_get_work_dim : GCCBuiltin<"get_work_dim">, + Intrinsic<[llvm_i32_ty], [], []>; + def int_AMDIL_get_printf_offset : GCCBuiltin<"__amdil_get_printf_offset">, + Intrinsic<[llvm_i32_ty], []>; + def int_AMDIL_get_printf_size : GCCBuiltin<"__amdil_get_printf_size">, + Intrinsic<[llvm_i32_ty], []>; + +/// Intrinsics for atomic instructions with no return value +/// Signed 32 bit integer atomics for global address space +def int_AMDIL_atomic_add_gi32_noret : GCCBuiltin<"__atomic_add_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_sub_gi32_noret : GCCBuiltin<"__atomic_sub_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_rsub_gi32_noret : GCCBuiltin<"__atomic_rsub_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_gi32_noret : GCCBuiltin<"__atomic_xchg_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_inc_gi32_noret : GCCBuiltin<"__atomic_inc_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_dec_gi32_noret : GCCBuiltin<"__atomic_dec_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_cmpxchg_gi32_noret : GCCBuiltin<"__atomic_cmpxchg_gi32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_min_gi32_noret : GCCBuiltin<"__atomic_min_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_max_gi32_noret : GCCBuiltin<"__atomic_max_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_and_gi32_noret : GCCBuiltin<"__atomic_and_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_or_gi32_noret : GCCBuiltin<"__atomic_or_gi32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xor_gi32_noret : GCCBuiltin<"__atomic_xor_gi32_noret">, + BinaryAtomicIntNoRet; + + + +/// Unsigned 32 bit integer atomics for global address space +def int_AMDIL_atomic_add_gu32_noret : 
GCCBuiltin<"__atomic_add_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_sub_gu32_noret : GCCBuiltin<"__atomic_sub_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_rsub_gu32_noret : GCCBuiltin<"__atomic_rsub_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_gu32_noret : GCCBuiltin<"__atomic_xchg_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_inc_gu32_noret : GCCBuiltin<"__atomic_inc_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_dec_gu32_noret : GCCBuiltin<"__atomic_dec_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_cmpxchg_gu32_noret : GCCBuiltin<"__atomic_cmpxchg_gu32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_min_gu32_noret : GCCBuiltin<"__atomic_min_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_max_gu32_noret : GCCBuiltin<"__atomic_max_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_and_gu32_noret : GCCBuiltin<"__atomic_and_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_or_gu32_noret : GCCBuiltin<"__atomic_or_gu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xor_gu32_noret : GCCBuiltin<"__atomic_xor_gu32_noret">, + BinaryAtomicIntNoRet; + + +/// Intrinsics for atomic instructions with a return value +/// Signed 32 bit integer atomics for global address space +def int_AMDIL_atomic_add_gi32 : GCCBuiltin<"__atomic_add_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_sub_gi32 : GCCBuiltin<"__atomic_sub_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_rsub_gi32 : GCCBuiltin<"__atomic_rsub_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_gi32 : GCCBuiltin<"__atomic_xchg_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_inc_gi32 : GCCBuiltin<"__atomic_inc_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_dec_gi32 : GCCBuiltin<"__atomic_dec_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_cmpxchg_gi32 : GCCBuiltin<"__atomic_cmpxchg_gi32">, + TernaryAtomicInt; +def int_AMDIL_atomic_min_gi32 : GCCBuiltin<"__atomic_min_gi32">, + 
BinaryAtomicInt; +def int_AMDIL_atomic_max_gi32 : GCCBuiltin<"__atomic_max_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_and_gi32 : GCCBuiltin<"__atomic_and_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_or_gi32 : GCCBuiltin<"__atomic_or_gi32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xor_gi32 : GCCBuiltin<"__atomic_xor_gi32">, + BinaryAtomicInt; + +/// 32 bit float atomics required by OpenCL +def int_AMDIL_atomic_xchg_gf32 : GCCBuiltin<"__atomic_xchg_gf32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_gf32_noret : GCCBuiltin<"__atomic_xchg_gf32_noret">, + BinaryAtomicIntNoRet; + +/// Unsigned 32 bit integer atomics for global address space +def int_AMDIL_atomic_add_gu32 : GCCBuiltin<"__atomic_add_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_sub_gu32 : GCCBuiltin<"__atomic_sub_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_rsub_gu32 : GCCBuiltin<"__atomic_rsub_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_gu32 : GCCBuiltin<"__atomic_xchg_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_inc_gu32 : GCCBuiltin<"__atomic_inc_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_dec_gu32 : GCCBuiltin<"__atomic_dec_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_cmpxchg_gu32 : GCCBuiltin<"__atomic_cmpxchg_gu32">, + TernaryAtomicInt; +def int_AMDIL_atomic_min_gu32 : GCCBuiltin<"__atomic_min_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_max_gu32 : GCCBuiltin<"__atomic_max_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_and_gu32 : GCCBuiltin<"__atomic_and_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_or_gu32 : GCCBuiltin<"__atomic_or_gu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xor_gu32 : GCCBuiltin<"__atomic_xor_gu32">, + BinaryAtomicInt; + + +/// Intrinsics for atomic instructions with no return value +/// Signed 32 bit integer atomics for local address space +def int_AMDIL_atomic_add_li32_noret : GCCBuiltin<"__atomic_add_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_sub_li32_noret : GCCBuiltin<"__atomic_sub_li32_noret">, + 
BinaryAtomicIntNoRet; +def int_AMDIL_atomic_rsub_li32_noret : GCCBuiltin<"__atomic_rsub_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_li32_noret : GCCBuiltin<"__atomic_xchg_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_inc_li32_noret : GCCBuiltin<"__atomic_inc_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_dec_li32_noret : GCCBuiltin<"__atomic_dec_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_cmpxchg_li32_noret : GCCBuiltin<"__atomic_cmpxchg_li32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_min_li32_noret : GCCBuiltin<"__atomic_min_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_max_li32_noret : GCCBuiltin<"__atomic_max_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_and_li32_noret : GCCBuiltin<"__atomic_and_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_or_li32_noret : GCCBuiltin<"__atomic_or_li32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_mskor_li32_noret : GCCBuiltin<"__atomic_mskor_li32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_xor_li32_noret : GCCBuiltin<"__atomic_xor_li32_noret">, + BinaryAtomicIntNoRet; + +/// Signed 32 bit integer atomics for region address space +def int_AMDIL_atomic_add_ri32_noret : GCCBuiltin<"__atomic_add_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_sub_ri32_noret : GCCBuiltin<"__atomic_sub_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_rsub_ri32_noret : GCCBuiltin<"__atomic_rsub_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_ri32_noret : GCCBuiltin<"__atomic_xchg_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_inc_ri32_noret : GCCBuiltin<"__atomic_inc_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_dec_ri32_noret : GCCBuiltin<"__atomic_dec_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_cmpxchg_ri32_noret : GCCBuiltin<"__atomic_cmpxchg_ri32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_min_ri32_noret : 
GCCBuiltin<"__atomic_min_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_max_ri32_noret : GCCBuiltin<"__atomic_max_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_and_ri32_noret : GCCBuiltin<"__atomic_and_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_or_ri32_noret : GCCBuiltin<"__atomic_or_ri32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_mskor_ri32_noret : GCCBuiltin<"__atomic_mskor_ri32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_xor_ri32_noret : GCCBuiltin<"__atomic_xor_ri32_noret">, + BinaryAtomicIntNoRet; + + + +/// Unsigned 32 bit integer atomics for local address space +def int_AMDIL_atomic_add_lu32_noret : GCCBuiltin<"__atomic_add_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_sub_lu32_noret : GCCBuiltin<"__atomic_sub_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_rsub_lu32_noret : GCCBuiltin<"__atomic_rsub_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_lu32_noret : GCCBuiltin<"__atomic_xchg_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_inc_lu32_noret : GCCBuiltin<"__atomic_inc_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_dec_lu32_noret : GCCBuiltin<"__atomic_dec_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_cmpxchg_lu32_noret : GCCBuiltin<"__atomic_cmpxchg_lu32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_min_lu32_noret : GCCBuiltin<"__atomic_min_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_max_lu32_noret : GCCBuiltin<"__atomic_max_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_and_lu32_noret : GCCBuiltin<"__atomic_and_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_or_lu32_noret : GCCBuiltin<"__atomic_or_lu32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_mskor_lu32_noret : GCCBuiltin<"__atomic_mskor_lu32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_xor_lu32_noret : GCCBuiltin<"__atomic_xor_lu32_noret">, + BinaryAtomicIntNoRet; + +/// 
Unsigned 32 bit integer atomics for region address space +def int_AMDIL_atomic_add_ru32_noret : GCCBuiltin<"__atomic_add_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_sub_ru32_noret : GCCBuiltin<"__atomic_sub_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_rsub_ru32_noret : GCCBuiltin<"__atomic_rsub_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_ru32_noret : GCCBuiltin<"__atomic_xchg_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_inc_ru32_noret : GCCBuiltin<"__atomic_inc_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_dec_ru32_noret : GCCBuiltin<"__atomic_dec_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_cmpxchg_ru32_noret : GCCBuiltin<"__atomic_cmpxchg_ru32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_min_ru32_noret : GCCBuiltin<"__atomic_min_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_max_ru32_noret : GCCBuiltin<"__atomic_max_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_and_ru32_noret : GCCBuiltin<"__atomic_and_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_or_ru32_noret : GCCBuiltin<"__atomic_or_ru32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_mskor_ru32_noret : GCCBuiltin<"__atomic_mskor_ru32_noret">, + TernaryAtomicIntNoRet; +def int_AMDIL_atomic_xor_ru32_noret : GCCBuiltin<"__atomic_xor_ru32_noret">, + BinaryAtomicIntNoRet; + +def int_AMDIL_get_cycle_count : GCCBuiltin<"__amdil_get_cycle_count">, + VoidIntLong; + +def int_AMDIL_compute_unit_id : GCCBuiltin<"__amdil_compute_unit_id">, + VoidIntInt; + +def int_AMDIL_wavefront_id : GCCBuiltin<"__amdil_wavefront_id">, + VoidIntInt; + + +/// Intrinsics for atomic instructions with a return value +/// Signed 32 bit integer atomics for local address space +def int_AMDIL_atomic_add_li32 : GCCBuiltin<"__atomic_add_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_sub_li32 : GCCBuiltin<"__atomic_sub_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_rsub_li32 : 
GCCBuiltin<"__atomic_rsub_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_li32 : GCCBuiltin<"__atomic_xchg_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_inc_li32 : GCCBuiltin<"__atomic_inc_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_dec_li32 : GCCBuiltin<"__atomic_dec_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_cmpxchg_li32 : GCCBuiltin<"__atomic_cmpxchg_li32">, + TernaryAtomicInt; +def int_AMDIL_atomic_min_li32 : GCCBuiltin<"__atomic_min_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_max_li32 : GCCBuiltin<"__atomic_max_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_and_li32 : GCCBuiltin<"__atomic_and_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_or_li32 : GCCBuiltin<"__atomic_or_li32">, + BinaryAtomicInt; +def int_AMDIL_atomic_mskor_li32 : GCCBuiltin<"__atomic_mskor_li32">, + TernaryAtomicInt; +def int_AMDIL_atomic_xor_li32 : GCCBuiltin<"__atomic_xor_li32">, + BinaryAtomicInt; + +/// Signed 32 bit integer atomics for region address space +def int_AMDIL_atomic_add_ri32 : GCCBuiltin<"__atomic_add_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_sub_ri32 : GCCBuiltin<"__atomic_sub_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_rsub_ri32 : GCCBuiltin<"__atomic_rsub_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_ri32 : GCCBuiltin<"__atomic_xchg_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_inc_ri32 : GCCBuiltin<"__atomic_inc_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_dec_ri32 : GCCBuiltin<"__atomic_dec_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_cmpxchg_ri32 : GCCBuiltin<"__atomic_cmpxchg_ri32">, + TernaryAtomicInt; +def int_AMDIL_atomic_min_ri32 : GCCBuiltin<"__atomic_min_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_max_ri32 : GCCBuiltin<"__atomic_max_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_and_ri32 : GCCBuiltin<"__atomic_and_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_or_ri32 : GCCBuiltin<"__atomic_or_ri32">, + BinaryAtomicInt; +def int_AMDIL_atomic_mskor_ri32 : GCCBuiltin<"__atomic_mskor_ri32">, + 
TernaryAtomicInt; +def int_AMDIL_atomic_xor_ri32 : GCCBuiltin<"__atomic_xor_ri32">, + BinaryAtomicInt; + +/// 32 bit float atomics required by OpenCL +def int_AMDIL_atomic_xchg_lf32 : GCCBuiltin<"__atomic_xchg_lf32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_lf32_noret : GCCBuiltin<"__atomic_xchg_lf32_noret">, + BinaryAtomicIntNoRet; +def int_AMDIL_atomic_xchg_rf32 : GCCBuiltin<"__atomic_xchg_rf32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_rf32_noret : GCCBuiltin<"__atomic_xchg_rf32_noret">, + BinaryAtomicIntNoRet; + +/// Unsigned 32 bit integer atomics for local address space +def int_AMDIL_atomic_add_lu32 : GCCBuiltin<"__atomic_add_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_sub_lu32 : GCCBuiltin<"__atomic_sub_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_rsub_lu32 : GCCBuiltin<"__atomic_rsub_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_lu32 : GCCBuiltin<"__atomic_xchg_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_inc_lu32 : GCCBuiltin<"__atomic_inc_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_dec_lu32 : GCCBuiltin<"__atomic_dec_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_cmpxchg_lu32 : GCCBuiltin<"__atomic_cmpxchg_lu32">, + TernaryAtomicInt; +def int_AMDIL_atomic_min_lu32 : GCCBuiltin<"__atomic_min_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_max_lu32 : GCCBuiltin<"__atomic_max_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_and_lu32 : GCCBuiltin<"__atomic_and_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_or_lu32 : GCCBuiltin<"__atomic_or_lu32">, + BinaryAtomicInt; +def int_AMDIL_atomic_mskor_lu32 : GCCBuiltin<"__atomic_mskor_lu32">, + TernaryAtomicInt; +def int_AMDIL_atomic_xor_lu32 : GCCBuiltin<"__atomic_xor_lu32">, + BinaryAtomicInt; + +/// Unsigned 32 bit integer atomics for region address space +def int_AMDIL_atomic_add_ru32 : GCCBuiltin<"__atomic_add_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_sub_ru32 : GCCBuiltin<"__atomic_sub_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_rsub_ru32 : 
GCCBuiltin<"__atomic_rsub_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_xchg_ru32 : GCCBuiltin<"__atomic_xchg_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_inc_ru32 : GCCBuiltin<"__atomic_inc_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_dec_ru32 : GCCBuiltin<"__atomic_dec_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_cmpxchg_ru32 : GCCBuiltin<"__atomic_cmpxchg_ru32">, + TernaryAtomicInt; +def int_AMDIL_atomic_min_ru32 : GCCBuiltin<"__atomic_min_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_max_ru32 : GCCBuiltin<"__atomic_max_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_and_ru32 : GCCBuiltin<"__atomic_and_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_or_ru32 : GCCBuiltin<"__atomic_or_ru32">, + BinaryAtomicInt; +def int_AMDIL_atomic_mskor_ru32 : GCCBuiltin<"__atomic_mskor_ru32">, + TernaryAtomicInt; +def int_AMDIL_atomic_xor_ru32 : GCCBuiltin<"__atomic_xor_ru32">, + BinaryAtomicInt; + +/// Semaphore signal/wait/init +def int_AMDIL_semaphore_init : GCCBuiltin<"__amdil_semaphore_init">, + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>; +def int_AMDIL_semaphore_wait : GCCBuiltin<"__amdil_semaphore_wait">, + Intrinsic<[], [llvm_ptr_ty]>; +def int_AMDIL_semaphore_signal : GCCBuiltin<"__amdil_semaphore_signal">, + Intrinsic<[], [llvm_ptr_ty]>; +def int_AMDIL_semaphore_size : GCCBuiltin<"__amdil_max_semaphore_size">, + Intrinsic<[llvm_i32_ty], []>; +} diff --git a/src/gallium/drivers/radeon/AMDILKernel.h b/src/gallium/drivers/radeon/AMDILKernel.h new file mode 100644 index 00000000000..ce7ea04c569 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILKernel.h @@ -0,0 +1,84 @@ +//===------------- AMDILKernel.h - AMDIL Kernel Class ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +// Definition of a AMDILKernel object and the various subclasses that +// are used. +//===----------------------------------------------------------------------===// +#ifndef _AMDIL_KERNEL_H_ +#define _AMDIL_KERNEL_H_ +#include "AMDIL.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Constant.h" +#include "llvm/Value.h" + +namespace llvm { + class AMDILSubtarget; + class AMDILTargetMachine; + /// structure that holds information for a single local/region address array + typedef struct _AMDILArrayMemRec { + uint32_t vecSize; // size of each vector + uint32_t offset; // offset into the memory section + bool isHW; // flag to specify if HW is used or SW is used + bool isRegion; // flag to specify if GDS is used or not + } AMDILArrayMem; + + /// structure that holds information about a constant address + /// space pointer that is a kernel argument + typedef struct _AMDILConstPtrRec { + const llvm::Value *base; + uint32_t size; + uint32_t offset; + uint32_t cbNum; // value of 0 means that it does not use hw CB + bool isArray; + bool isArgument; + bool usesHardware; + std::string name; + } AMDILConstPtr; + + /// Structure that holds information for all local/region address + /// arrays in the kernel + typedef struct _AMDILLocalArgRec { + llvm::SmallVector<AMDILArrayMem *, DEFAULT_VEC_SLOTS> local; + std::string name; // Kernel Name + } AMDILLocalArg; + + /// Structure that holds information for each kernel argument + typedef struct _AMDILkernelArgRec { + uint32_t reqGroupSize[3]; + uint32_t reqRegionSize[3]; + llvm::SmallVector<uint32_t, DEFAULT_VEC_SLOTS> argInfo; + bool mHasRWG; + bool mHasRWR; + } AMDILKernelAttr; + + /// Structure that holds information for each kernel + class AMDILKernel { + public: + AMDILKernel() {} + uint32_t curSize; + uint32_t curRSize; + uint32_t curHWSize; + uint32_t curHWRSize; + 
uint32_t constSize; + bool mKernel; + std::string mName; + AMDILKernelAttr *sgv; + AMDILLocalArg *lvgv; + llvm::SmallVector<struct _AMDILConstPtrRec, DEFAULT_VEC_SLOTS> constPtr; + uint32_t constSizes[HW_MAX_NUM_CB]; + llvm::SmallSet<uint32_t, OPENCL_MAX_READ_IMAGES> readOnly; + llvm::SmallSet<uint32_t, OPENCL_MAX_WRITE_IMAGES> writeOnly; + llvm::SmallVector<std::pair<uint32_t, const llvm::Constant *>, + DEFAULT_VEC_SLOTS> CPOffsets; + typedef llvm::SmallVector<struct _AMDILConstPtrRec, DEFAULT_VEC_SLOTS>::iterator constptr_iterator; + typedef llvm::SmallVector<AMDILArrayMem *, DEFAULT_VEC_SLOTS>::iterator arraymem_iterator; + }; // AMDILKernel +} // end llvm namespace +#endif // _AMDIL_KERNEL_H_ diff --git a/src/gallium/drivers/radeon/AMDILKernelManager.cpp b/src/gallium/drivers/radeon/AMDILKernelManager.cpp new file mode 100644 index 00000000000..4df81ff5078 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILKernelManager.cpp @@ -0,0 +1,1356 @@ +//===-- AMDILKernelManager.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//==-----------------------------------------------------------------------===//
+#include "AMDILKernelManager.h"
+
+#include "AMDILAlgorithms.tpp"
+#include "AMDILKernelManager.h"
+#ifdef UPSTREAM_LLVM
+#include "AMDILAsmPrinter.h"
+#endif
+#include "AMDILCompilerErrors.h"
+#include "AMDILDeviceInfo.h"
+#include "AMDILDevices.h"
+#include "AMDILGlobalManager.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILModuleInfo.h"
+#include "AMDILSubtarget.h"
+#include "AMDILTargetMachine.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <stdio.h>
+
+using namespace llvm;
+#define NUM_EXTRA_SLOTS_PER_IMAGE 1
+
+/// Write one diagnostic as a comment line: a message whose first character is
+/// 'E' is tagged ";error:", anything else ";warning:". \p ptr must point at a
+/// valid C string because its first character is read unconditionally.
+/// Always returns false.
+static bool errorPrint(const char *ptr, llvm::raw_ostream &O) {
+  if (ptr[0] == 'E') {
+    O << ";error:" << ptr << "\n";
+  } else {
+    O << ";warning:" << ptr << "\n";
+  }
+  return false;
+}
+
+#if 0
+static bool
+samplerPrint(StringMap<SamplerInfo>::iterator &data, llvm::raw_ostream &O) {
+  O << ";sampler:" << (*data).second.name << ":" << (*data).second.idx
+    << ":" << ((*data).second.val == (uint32_t)-1 ? 0 : 1)
+    << ":" << ((*data).second.val != (uint32_t)-1 ? (*data).second.val : 0)
+    << "\n";
+  return false;
+}
+#endif
+
+/// Emit a dcl_arena_uav_id declaration for \p val, but only for IDs at or
+/// above ARENA_SEGMENT_RESERVED_UAVS. Always returns false.
+static bool arenaPrint(uint32_t val, llvm::raw_ostream &O) {
+  if (val >= ARENA_SEGMENT_RESERVED_UAVS) {
+    O << "dcl_arena_uav_id(" << val << ")\n";
+  }
+  return false;
+}
+
+/// Emit a dcl_raw_uav_id declaration for \p val when the ID is below 8 or is
+/// exactly 11; other IDs produce no output. Always returns false.
+static bool uavPrint(uint32_t val, llvm::raw_ostream &O) {
+  if (val < 8 || val == 11){
+    O << "dcl_raw_uav_id(" << val << ")\n";
+  }
+  return false;
+}
+
+/// Emit a typeless, read/write UAV declaration (stride 4, length 4) for
+/// \p val. Always returns false.
+static bool uavPrintSI(uint32_t val, llvm::raw_ostream &O) {
+  O << "dcl_typeless_uav_id(" << val << ")_stride(4)_length(4)_access(read_write)\n";
+  return false;
+}
+
+/// Emit one ";printf_fmt:" metadata line for a (format string, PrintfInfo)
+/// pair: the printf ID, the operand count, each operand ID shifted right by
+/// three, the format length, and the format text with '\r' and '\n' written
+/// as the two-character escapes "\r" and "\n". Always returns false.
+static bool
+printfPrint(std::pair<const std::string, PrintfInfo *> &data, llvm::raw_ostream &O) {
+  O << ";printf_fmt:" << data.second->getPrintfID();
+  // Number of operands
+  O << ":" << data.second->getNumOperands();
+  // Size of each operand
+  for (size_t i = 0, e = data.second->getNumOperands(); i < e; ++i) {
+    O << ":" << (data.second->getOperandID(i) >> 3);
+  }
+  const char *ptr = data.first.c_str();
+  // The key's final character is not emitted. Guard the subtraction: with an
+  // empty key the unsigned "size() - 1" would wrap around and the loop below
+  // would read far past the end of the string.
+  uint32_t size = data.first.empty() ? 0 : data.first.size() - 1;
+  // The format string size
+  O << ":" << size << ":";
+  for (size_t i = 0; i < size; ++i) {
+    if (ptr[i] == '\r') {
+      O << "\\r";
+    } else if (ptr[i] == '\n') {
+      O << "\\n";
+    } else {
+      O << ptr[i];
+    }
+  }
+  O << ";\n"; // c_str() is cheap way to trim
+  return false;
+}
+
+
+/// Build the ";pointer:" metadata string for the pointer argument \p Ip of
+/// \p F and hand it to the machine-function info.
+void AMDILKernelManager::updatePtrArg(Function::const_arg_iterator Ip,
+                                      int numWriteImages, int raw_uav_buffer,
+                                      int counter, bool isKernel,
+                                      const Function *F) {
+  assert(F && "Cannot pass a NULL Pointer to F!");
+  assert(Ip->getType()->isPointerTy() &&
+         "Argument must be a pointer to be passed into this function!\n");
+  std::string ptrArg(";pointer:");
+  const char *symTab = "NoSymTab";
+  uint32_t ptrID = getUAVID(Ip);
+  const PointerType *PT = cast<PointerType>(Ip->getType());
+  uint32_t Align = 4;
+  const char *MemType = "uav";
+  if (PT->getElementType()->isSized()) {
+    Align = NextPowerOf2((uint32_t)mTM->getTargetData()->
+                         getTypeAllocSize(PT->getElementType()));
+  }
+  ptrArg +=
Ip->getName().str() + ":" + getTypeName(PT, symTab) + ":1:1:" + + itostr(counter * 16) + ":"; + switch (PT->getAddressSpace()) { + case AMDILAS::ADDRESS_NONE: + //O << "No Address space qualifier!"; + mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + assert(1); + break; + case AMDILAS::GLOBAL_ADDRESS: + if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) { + if (ptrID >= ARENA_SEGMENT_RESERVED_UAVS) { + ptrID = 8; + } + } + mMFI->uav_insert(ptrID); + break; + case AMDILAS::CONSTANT_ADDRESS: { + if (isKernel && mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)){ + const kernel t = mGM->getKernel(F->getName()); + if (mGM->usesHWConstant(t, Ip->getName())) { + MemType = "hc\0"; + ptrID = mGM->getConstPtrCB(t, Ip->getName()); + } else { + MemType = "c\0"; + mMFI->uav_insert(ptrID); + } + } else { + MemType = "c\0"; + mMFI->uav_insert(ptrID); + } + break; + } + default: + case AMDILAS::PRIVATE_ADDRESS: + if (mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) { + MemType = (mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV)) + ? 
"up\0" : "hp\0"; + } else { + MemType = "p\0"; + mMFI->uav_insert(ptrID); + } + break; + case AMDILAS::REGION_ADDRESS: + mMFI->setUsesRegion(); + if (mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) { + MemType = "hr\0"; + ptrID = 0; + } else { + MemType = "r\0"; + mMFI->uav_insert(ptrID); + } + break; + case AMDILAS::LOCAL_ADDRESS: + mMFI->setUsesLocal(); + if (mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) { + MemType = "hl\0"; + ptrID = 1; + } else { + MemType = "l\0"; + mMFI->uav_insert(ptrID); + } + break; + }; + ptrArg += std::string(MemType) + ":"; + ptrArg += itostr(ptrID) + ":"; + ptrArg += itostr(Align); + mMFI->addMetadata(ptrArg, true); +} + +AMDILKernelManager::AMDILKernelManager(AMDILTargetMachine *TM, + AMDILGlobalManager *GM) +{ + mTM = TM; + mSTM = mTM->getSubtargetImpl(); + mGM = GM; + clear(); +} + +AMDILKernelManager::~AMDILKernelManager() { + clear(); +} + +void +AMDILKernelManager::setMF(MachineFunction *MF) +{ + mMF = MF; + mMFI = MF->getInfo<AMDILMachineFunctionInfo>(); +} + +void AMDILKernelManager::clear() { + mUniqueID = 0; + mIsKernel = false; + mWasKernel = false; + mHasImageWrite = false; + mHasOutputInst = false; +} + +bool AMDILKernelManager::useCompilerWrite(const MachineInstr *MI) { + return (MI->getOpcode() == AMDIL::RETURN && wasKernel() && !mHasImageWrite + && !mHasOutputInst); +} + +void AMDILKernelManager::processArgMetadata(llvm::raw_ostream &O, + uint32_t buf, + bool isKernel) +{ + const Function *F = mMF->getFunction(); + const char * symTab = "NoSymTab"; + Function::const_arg_iterator Ip = F->arg_begin(); + Function::const_arg_iterator Ep = F->arg_end(); + + if (F->hasStructRetAttr()) { + assert(Ip != Ep && "Invalid struct return fucntion!"); + mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + ++Ip; + } + uint32_t mCBSize = 0; + int raw_uav_buffer = mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + bool MultiUAV = mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV); + bool 
ArenaSegment = + mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment); + int numWriteImages = + mSTM->getGlobalManager()->getNumWriteImages(F->getName()); + if (numWriteImages == OPENCL_MAX_WRITE_IMAGES || MultiUAV || ArenaSegment) { + if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + raw_uav_buffer = mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID); + } + } + uint32_t CounterNum = 0; + uint32_t ROArg = 0; + uint32_t WOArg = 0; + uint32_t NumArg = 0; + while (Ip != Ep) { + Type *cType = Ip->getType(); + if (cType->isIntOrIntVectorTy() || cType->isFPOrFPVectorTy()) { + std::string argMeta(";value:"); + argMeta += Ip->getName().str() + ":" + getTypeName(cType, symTab) + ":"; + int bitsize = cType->getPrimitiveSizeInBits(); + int numEle = 1; + if (cType->getTypeID() == Type::VectorTyID) { + numEle = cast<VectorType>(cType)->getNumElements(); + } + argMeta += itostr(numEle) + ":1:" + itostr(mCBSize << 4); + mMFI->addMetadata(argMeta, true); + + // FIXME: simplify + if ((bitsize / numEle) < 32) { + bitsize = numEle >> 2; + } else { + bitsize >>= 7; + } + if (!bitsize) { + bitsize = 1; + } + + mCBSize += bitsize; + ++NumArg; + } else if (const PointerType *PT = dyn_cast<PointerType>(cType)) { + Type *CT = PT->getElementType(); + const StructType *ST = dyn_cast<StructType>(CT); + if (ST && ST->isOpaque()) { + StringRef name = ST->getName(); + bool i1d = name.equals( "struct._image1d_t" ); + bool i1da = name.equals( "struct._image1d_array_t" ); + bool i1db = name.equals( "struct._image1d_buffer_t" ); + bool i2d = name.equals( "struct._image2d_t" ); + bool i2da = name.equals( "struct._image2d_array_t" ); + bool i3d = name.equals( "struct._image3d_t" ); + bool c32 = name.equals( "struct._counter32_t" ); + bool c64 = name.equals( "struct._counter64_t" ); + if (i1d || i1da || i1db || i2d | i2da || i3d) { + if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) { + std::string imageArg(";image:"); + imageArg += Ip->getName().str() + ":"; + if 
(i1d) imageArg += "1D:"; + else if (i1da) imageArg += "1DA:"; + else if (i1db) imageArg += "1DB:"; + else if (i2d) imageArg += "2D:"; + else if (i2da) imageArg += "2DA:"; + else if (i3d) imageArg += "3D:"; + + if (isKernel) { + if (mGM->isReadOnlyImage (mMF->getFunction()->getName(), + (ROArg + WOArg))) { + imageArg += "RO:" + itostr(ROArg); + O << "dcl_resource_id(" << ROArg << ")_type("; + if (i1d) O << "1d"; + else if (i1da) O << "1darray"; + else if (i1db) O << "buffer"; + else if (i2d) O << "2d"; + else if (i2da) O << "2darray"; + else if (i3d) O << "3d"; + O << ")_fmtx(unknown)_fmty(unknown)" + << "_fmtz(unknown)_fmtw(unknown)\n"; + ++ROArg; + } else if (mGM->isWriteOnlyImage(mMF->getFunction()->getName(), + (ROArg + WOArg))) { + uint32_t offset = 0; + offset += WOArg; + imageArg += "WO:" + itostr(offset & 0x7); + O << "dcl_uav_id(" << ((offset) & 0x7) << ")_type("; + if (i1d) O << "1d"; + else if (i1da) O << "1darray"; + else if (i1db) O << "buffer"; + else if (i2d) O << "2d"; + else if (i2da) O << "2darray"; + else if (i3d) O << "3d"; + O << ")_fmtx(uint)\n"; + ++WOArg; + } else { + imageArg += "RW:" + itostr(ROArg + WOArg); + } + } + imageArg += ":1:" + itostr(mCBSize * 16); + mMFI->addMetadata(imageArg, true); + mMFI->addi32Literal(mCBSize); + mCBSize += NUM_EXTRA_SLOTS_PER_IMAGE + 1; + ++NumArg; + } else { + mMFI->addErrorMsg(amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]); + ++NumArg; + } + } else if (c32 || c64) { + std::string counterArg(";counter:"); + counterArg += Ip->getName().str() + ":" + + itostr(c32 ? 
32 : 64) + ":" + + itostr(CounterNum++) + ":1:" + itostr(mCBSize * 16); + mMFI->addMetadata(counterArg, true); + ++NumArg; + ++mCBSize; + } else { + updatePtrArg(Ip, numWriteImages, raw_uav_buffer, mCBSize, isKernel, + F); + ++NumArg; + ++mCBSize; + } + } + else if (CT->getTypeID() == Type::StructTyID + && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + const TargetData *td = mTM->getTargetData(); + const StructLayout *sl = td->getStructLayout(dyn_cast<StructType>(CT)); + int bytesize = sl->getSizeInBytes(); + int reservedsize = (bytesize + 15) & ~15; + int numSlots = reservedsize >> 4; + if (!numSlots) { + numSlots = 1; + } + std::string structArg(";value:"); + structArg += Ip->getName().str() + ":struct:" + + itostr(bytesize) + ":1:" + itostr(mCBSize * 16); + mMFI->addMetadata(structArg, true); + mCBSize += numSlots; + ++NumArg; + } else if (CT->isIntOrIntVectorTy() + || CT->isFPOrFPVectorTy() + || CT->getTypeID() == Type::ArrayTyID + || CT->getTypeID() == Type::PointerTyID + || PT->getAddressSpace() != AMDILAS::PRIVATE_ADDRESS) { + updatePtrArg(Ip, numWriteImages, raw_uav_buffer, mCBSize, isKernel, F); + ++NumArg; + ++mCBSize; + } else { + assert(0 && "Cannot process current pointer argument"); + mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + ++NumArg; + } + } else { + assert(0 && "Cannot process current kernel argument"); + mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + ++NumArg; + } + ++Ip; + } +} + +void AMDILKernelManager::printHeader(AMDILAsmPrinter *AsmPrinter, + llvm::raw_ostream &O, + const std::string &name) { +#ifdef UPSTREAM_LLVM + mName = name; + std::string kernelName; + kernelName = name; + int kernelId = mGM->getOrCreateFunctionID(kernelName); + O << "func " << kernelId << " ; " << kernelName << "\n"; + if (mSTM->is64bit()) { + O << "mov " << AsmPrinter->getRegisterName(AMDIL::SDP) << ", cb0[8].xy\n"; + } else { + O << "mov " << AsmPrinter->getRegisterName(AMDIL::SDP) << ", cb0[8].x\n"; + } + O << "mov " << 
AsmPrinter->getRegisterName(AMDIL::SP) << ", l1.0\n"; +#endif +} + +void AMDILKernelManager::printGroupSize(llvm::raw_ostream& O) { + // The HD4XXX generation of hardware does not support a 3D launch, so we need + // to use dcl_num_thread_per_group to specify the launch size. If the launch + // size is specified via a kernel attribute, we print it here. Otherwise we + // use the the default size. + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + if (mGM->hasRWG(mName) + || !mMFI->usesLocal()) { + // if the user has specified what the required workgroup size is then we + // need to compile for that size and that size only. Otherwise we compile + // for the max workgroup size that is passed in as an option to the + // backend. + O << "dcl_num_thread_per_group "; + O << mGM->getLocal(mName, 0) << ", "; + O << mGM->getLocal(mName, 1) << ", "; + O << mGM->getLocal(mName, 2) << " \n"; + } else { + // If the kernel uses local memory, then the kernel is being + // compiled in single wavefront mode. So we have to generate code slightly + // different. + O << "dcl_num_thread_per_group " + << mSTM->device()->getWavefrontSize() + << ", 1, 1 \n"; + } + } else { + // Otherwise we generate for devices that support 3D launch natively. If + // the reqd_workgroup_size attribute was specified, then we can specify the + // exact launch dimensions. + if (mGM->hasRWG(mName)) { + O << "dcl_num_thread_per_group "; + O << mGM->getLocal(mName, 0) << ", "; + O << mGM->getLocal(mName, 1) << ", "; + O << mGM->getLocal(mName, 2) << " \n"; + } else { + // Otherwise we specify the largest workgroup size that can be launched. + O << "dcl_max_thread_per_group " << mGM->getLocal(mName, 3) << " \n"; + } + } + // Now that we have specified the workgroup size, lets declare the local + // memory size. If we are using hardware and we know the value at compile + // time, then we need to declare the correct value. Otherwise we should just + // declare the maximum size. 
+ if (mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) { + size_t kernelLocalSize = (mGM->getHWLocalSize(mName) + 3) & ~3; + if (kernelLocalSize > mSTM->device()->getMaxLDSSize()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_LOCAL_RESOURCES]); + } + // If there is a local pointer as a kernel argument, we don't know the size + // at compile time, so we reserve all of the space. + if (mMFI->usesLocal() && (mMFI->hasLocalArg() || !kernelLocalSize)) { + O << "dcl_lds_id(" << DEFAULT_LDS_ID << ") " + << mSTM->device()->getMaxLDSSize() << "\n"; + mMFI->setUsesMem(AMDILDevice::LDS_ID); + } else if (kernelLocalSize) { + // We know the size, so lets declare it correctly. + O << "dcl_lds_id(" << DEFAULT_LDS_ID << ") " + << kernelLocalSize << "\n"; + mMFI->setUsesMem(AMDILDevice::LDS_ID); + } + } + // If the device supports the region memory extension, which maps to our + // hardware GDS memory, then lets declare it so we can use it later on. + if (mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) { + size_t kernelGDSSize = (mGM->getHWRegionSize(mName) + 3) & ~3; + if (kernelGDSSize > mSTM->device()->getMaxGDSSize()) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_REGION_RESOURCES]); + } + // If there is a region pointer as a kernel argument, we don't know the size + // at compile time, so we reserved all of the space. + if (mMFI->usesRegion() && (mMFI->hasRegionArg() || !kernelGDSSize)) { + O << "dcl_gds_id(" << DEFAULT_GDS_ID << + ") " << mSTM->device()->getMaxGDSSize() << "\n"; + mMFI->setUsesMem(AMDILDevice::GDS_ID); + } else if (kernelGDSSize) { + // We know the size, so lets declare it. 
+ O << "dcl_gds_id(" << DEFAULT_GDS_ID << + ") " << kernelGDSSize << "\n"; + mMFI->setUsesMem(AMDILDevice::GDS_ID); + } + } +} + +void +AMDILKernelManager::printDecls(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O) { + // If we are a HD4XXX generation device, then we only support a single uav + // surface, so we declare it and leave + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + O << "dcl_raw_uav_id(" + << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) + << ")\n"; + mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID); + getIntrinsicSetup(AsmPrinter, O); + return; + } + // If we are supporting multiple uav's via the MultiUAV capability, then we + // need to print out the declarations here. MultiUAV conflicts with write + // images, so they only use 8 - NumWriteImages uav's. Therefore only pointers + // with ID's < 8 will get printed. + if (mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) { + binaryForEach(mMFI->uav_begin(), mMFI->uav_end(), uavPrint, O); + mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID); + } + // If arena segments are supported, then we should emit them now. Arena + // segments are similar to MultiUAV, except ArenaSegments are virtual and up + // to 1024 of them can coexist. These are more compiler hints for CAL and thus + // cannot overlap in any form. Each ID maps to a separate piece of memory and + // CAL determines whether the load/stores should go to the fast path/slow path + // based on the usage and instruction. + if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) { + binaryForEach(mMFI->uav_begin(), mMFI->uav_end(), arenaPrint, O); + } + // Now that we have printed out all of the arena and multi uav declaration, + // now we must print out the default raw uav id. This always exists on HD5XXX + // and HD6XXX hardware. The reason is that the hardware supports 12 UAV's and + // 11 are taken up by MultiUAV/Write Images and Arena.
However, if we do not + // have UAV 11 as the raw UAV and there are 8 write images, we must revert + // everything to the arena and not print out the default raw uav id. + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD5XXX + || mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX) { + if ((mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) < 11 && + mSTM->getGlobalManager()->getNumWriteImages(mName) + != OPENCL_MAX_WRITE_IMAGES + && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) + || mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) == 11) { + if (!mMFI->usesMem(AMDILDevice::RAW_UAV_ID) + && mMFI->uav_count(mSTM->device()-> + getResourceID(AMDILDevice::RAW_UAV_ID))) { + O << "dcl_raw_uav_id(" + << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + O << ")\n"; + mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID); + } + } + // If we have not printed out the arena ID yet, then do so here. + if (!mMFI->usesMem(AMDILDevice::ARENA_UAV_ID) + && mSTM->device()->usesHardware(AMDILDeviceInfo::ArenaUAV)) { + O << "dcl_arena_uav_id(" + << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID) << ")\n"; + mMFI->setUsesMem(AMDILDevice::ARENA_UAV_ID); + } + } else if (mSTM->device()->getGeneration() > AMDILDeviceInfo::HD6XXX) { + binaryForEach(mMFI->uav_begin(), mMFI->uav_end(), uavPrintSI, O); + mMFI->setUsesMem(AMDILDevice::RAW_UAV_ID); + } + getIntrinsicSetup(AsmPrinter, O); +} + +void AMDILKernelManager::getIntrinsicSetup(AMDILAsmPrinter *AsmPrinter, + llvm::raw_ostream &O) +{ + O << "mov r0.z, vThreadGrpIdFlat.x\n" + << "mov r1022.xyz0, vTidInGrp.xyz\n"; + if (mSTM->device()->getGeneration() > AMDILDeviceInfo::HD4XXX) { + O << "mov r1023.xyz0, vThreadGrpId.xyz\n"; + } else { + O << "imul r0.w, cb0[2].x, cb0[2].y\n" + // Calculates the local id. + // Calculates the group id. 
+ << "umod r1023.x, r0.z, cb0[2].x\n" + << "udiv r1023.y, r0.z, cb0[2].x\n" + << "umod r1023.y, r1023.y, cb0[2].y\n" + << "udiv r1023.z, r0.z, r0.w\n"; + } + // Calculates the global id. + if (mGM->hasRWG(mName) && 0) { + // Anytime we declare a literal, we need to reserve it, if it is not emitted + // in emitLiterals. + mMFI->addReservedLiterals(1); + O << "dcl_literal l" << mMFI->getNumLiterals() + 1 << ", "; + O << mGM->getLocal(mName, 0) << ", "; + O << mGM->getLocal(mName, 1) << ", "; + O << mGM->getLocal(mName, 2) << ", "; + O << "0\n"; + O << "imad r1021.xyz0, r1023.xyz, l" << mMFI->getNumLiterals() + 1 << ".xyz, r1022.xyz\n"; + mMFI->addReservedLiterals(1); + } else { + O << "imad r1021.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz\n"; + } + + // Add the global/group offset for multi-launch support. + O << "iadd r1021.xyz0, r1021.xyz0, cb0[6].xyz0\n" + << "iadd r1023.xyz0, r1023.xyz0, cb0[7].xyz0\n" + // moves the flat group id. + << "mov r1023.w, r0.z\n"; +#ifdef UPSTREAM_LLVM + if (mSTM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) { + if (mSTM->is64bit()) { + O << "umul " << AsmPrinter->getRegisterName(AMDIL::T2) + << ".x0, r1023.w, cb0[4].z\n" + << "i64add " << AsmPrinter->getRegisterName(AMDIL::T2) + << ".xy, " << AsmPrinter->getRegisterName(AMDIL::T2) + << ".xy, cb0[4].xy\n"; + + } else { + O << "imad " << AsmPrinter->getRegisterName(AMDIL::T2) + << ".x, r1023.w, cb0[4].y, cb0[4].x\n"; + } + } + // Shift the flat group id to be in bytes instead of dwords. 
+ O << "ishl r1023.w, r1023.w, l0.z\n"; + if (mSTM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem)) { + if (mSTM->is64bit()) { + O << "umul " << AsmPrinter->getRegisterName(AMDIL::T1) + << ".x0, vAbsTidFlat.x, cb0[3].z\n" + << "i64add " << AsmPrinter->getRegisterName(AMDIL::T1) + << ".xy, " << AsmPrinter->getRegisterName(AMDIL::T1) + << ".xy, cb0[3].xy\n"; + + } else { + O << "imad " << AsmPrinter->getRegisterName(AMDIL::T1) + << ".x, vAbsTidFlat.x, cb0[3].y, cb0[3].x\n"; + } + } else { + O << "mov " << AsmPrinter->getRegisterName(AMDIL::T1) << ".x, l0.0\n"; + } +#endif + if (mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) { + O << "udiv r1024.xyz, r1021.xyz, cb0[10].xyz\n"; + if (mGM->hasRWR(mName) && 0) { + // Anytime we declare a literal, we need to reserve it, if it is not emitted + // in emitLiterals. + mMFI->addReservedLiterals(1); + O << "dcl_literal l" << mMFI->getNumLiterals() + 1 << ", "; + O << mGM->getLocal(mName, 0) << ", "; + O << mGM->getLocal(mName, 1) << ", "; + O << mGM->getLocal(mName, 2) << ", "; + O << "0\n"; + O << "imad r1025.xyz0, r1023.xyz, l" << mMFI->getNumLiterals() + 1 << ".xyz, r1022.xyz\n"; + mMFI->addReservedLiterals(1); + } else { + O << "imad r1025.xyz0, r1023.xyz, cb0[1].xyz, r1022.xyz\n"; + } + } +} + +void AMDILKernelManager::printFooter(llvm::raw_ostream &O) { + O << "ret\n"; + O << "endfunc ; " << mName << "\n"; +} + +void +AMDILKernelManager::printMetaData(llvm::raw_ostream &O, uint32_t id, bool kernel) { + if (kernel) { + int kernelId = mGM->getOrCreateFunctionID(mName); + mMFI->addCalledFunc(id); + mUniqueID = kernelId; + mIsKernel = true; + } + printKernelArgs(O); + if (kernel) { + mIsKernel = false; + mMFI->eraseCalledFunc(id); + mUniqueID = id; + } +} + +void AMDILKernelManager::setKernel(bool kernel) { + mIsKernel = kernel; + if (kernel) { + mWasKernel = mIsKernel; + } +} + +void AMDILKernelManager::setID(uint32_t id) +{ + mUniqueID = id; +} + +void AMDILKernelManager::setName(const std::string &name) { + 
mName = name; +} + +bool AMDILKernelManager::isKernel() { + return mIsKernel; +} + +bool AMDILKernelManager::wasKernel() { + return mWasKernel; +} + +void AMDILKernelManager::setImageWrite() { + mHasImageWrite = true; +} + +void AMDILKernelManager::setOutputInst() { + mHasOutputInst = true; +} + +void AMDILKernelManager::printConstantToRegMapping( + AMDILAsmPrinter *RegNames, + uint32_t &LII, + llvm::raw_ostream &O, + uint32_t &Counter, + uint32_t Buffer, + uint32_t n, + const char *lit, + uint32_t fcall, + bool isImage, + bool isHWCB) +{ +#ifdef UPSTREAM_LLVM + // TODO: This needs to be enabled or SC will never statically index into the + // CB when a pointer is used. + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem) && isHWCB) { + const char *name = RegNames->getRegisterName(LII); + O << "mov " << name << ", l5.x\n"; + ++LII; + Counter++; + return; + } + for (uint32_t x = 0; x < n; ++x) { + const char *name = RegNames->getRegisterName(LII); + if (isImage) { + O << "mov " << name << ", l" << mMFI->getIntLits(Counter++) << "\n"; + } else { + O << "mov " << name << ", cb" <<Buffer<< "[" <<Counter++<< "]\n"; + } + switch(fcall) { + case 1093: + O << "ishr " << name << ", " << name << ".xxyy, l3.0y0y\n" + "ishl " << name << ", " << name << ", l3.y\n" + "ishr " << name << ", " << name << ", l3.y\n"; + break; + case 1092: + O << "ishr " << name << ", " << name << ".xx, l3.0y\n" + "ishl " << name << ", " << name << ", l3.y\n" + "ishr " << name << ", " << name << ", l3.y\n"; + break; + case 1091: + O << "ishr " << name << ", " << name << ".xxxx, l3.0zyx\n" + "ishl " << name << ", " << name << ", l3.x\n" + "ishr " << name << ", " << name << ", l3.x\n"; + break; + case 1090: + O << "ishr " << name << ", " << name << ".xx, l3.0z\n" + "ishl " << name << ".xy__, " << name << ".xy, l3.x\n" + "ishr " << name << ".xy__, " << name << ".xy, l3.x\n"; + break; + default: + break; + }; + if (lit) { + O << "ishl " << name << ", " << name + << ", " << lit << "\n"; + O << 
"ishr " << name << ", " << name + << ", " << lit << "\n"; + } + if (isImage) { + Counter += NUM_EXTRA_SLOTS_PER_IMAGE; + } + ++LII; + } +#endif +} + +void +AMDILKernelManager::printCopyStructPrivate(const StructType *ST, + llvm::raw_ostream &O, + size_t stackSize, + uint32_t Buffer, + uint32_t mLitIdx, + uint32_t &Counter) +{ + size_t n = ((stackSize + 15) & ~15) >> 4; + for (size_t x = 0; x < n; ++x) { + O << "mov r2, cb" << Buffer << "[" << Counter++ << "]\n"; + O << "mov r1.x, r0.x\n"; + if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + if (mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) { + O << "ishr r1.x, r1.x, l0.x\n"; + O << "mov x" << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID) + <<"[r1.x], r2\n"; + } else { + O << "uav_raw_store_id(" << + mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID) + << ") mem0, r1.x, r2\n"; + } + } else { + O << "uav_raw_store_id(" << + mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID) + << ") mem0, r1.x, r2\n"; + } + O << "iadd r0.x, r0.x, l" << mLitIdx << ".z\n"; + } +} + +void AMDILKernelManager::printKernelArgs(llvm::raw_ostream &O) { + std::string version(";version:"); + version += itostr(AMDIL_MAJOR_VERSION) + ":" + + itostr(AMDIL_MINOR_VERSION) + ":" + itostr(AMDIL_REVISION_NUMBER); + O << ";ARGSTART:" <<mName<< "\n"; + if (mIsKernel) { + O << version << "\n"; + O << ";device:" <<mSTM->getDeviceName() << "\n"; + } + O << ";uniqueid:" <<mUniqueID<< "\n"; + + size_t local = mGM->getLocalSize(mName); + size_t hwlocal = ((mGM->getHWLocalSize(mName) + 3) & (~0x3)); + size_t region = mGM->getRegionSize(mName); + size_t hwregion = ((mGM->getHWRegionSize(mName) + 3) & (~0x3)); + bool usehwlocal = mSTM->device()->usesHardware(AMDILDeviceInfo::LocalMem); + bool usehwprivate = mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem); + bool usehwregion = mSTM->device()->usesHardware(AMDILDeviceInfo::RegionMem); + bool useuavprivate = 
mSTM->device()->isSupported(AMDILDeviceInfo::PrivateUAV); + if (mIsKernel) { + O << ";memory:" << ((usehwprivate) ? + (useuavprivate) ? "uav" : "hw" : "" ) << "private:" + <<(((mMFI->getStackSize() + 15) & (~0xF)))<< "\n"; + } + if (mSTM->device()->isSupported(AMDILDeviceInfo::RegionMem)) { + O << ";memory:" << ((usehwregion) ? "hw" : "") << "region:" + << ((usehwregion) ? hwregion : hwregion + region) << "\n"; + } + O << ";memory:" << ((usehwlocal) ? "hw" : "") << "local:" + << ((usehwlocal) ? hwlocal : hwlocal + local) << "\n"; + + if (mIsKernel) { + if (mGM->hasRWG(mName)) { + O << ";cws:" << mGM->getLocal(mName, 0) << ":"; + O << mGM->getLocal(mName, 1) << ":"; + O << mGM->getLocal(mName, 2) << "\n"; + } + if (mGM->hasRWR(mName)) { + O << ";crs:" << mGM->getRegion(mName, 0) << ":"; + O << mGM->getRegion(mName, 1) << ":"; + O << mGM->getRegion(mName, 2) << "\n"; + } + } + if (mIsKernel) { + for (std::vector<std::string>::iterator ib = mMFI->kernel_md_begin(), + ie = mMFI->kernel_md_end(); ib != ie; ++ib) { + O << (*ib) << "\n"; + } + } + for (std::set<std::string>::iterator ib = mMFI->func_md_begin(), + ie = mMFI->func_md_end(); ib != ie; ++ib) { + O << (*ib) << "\n"; + } + if (!mMFI->func_empty()) { + O << ";function:" << mMFI->func_size(); + binaryForEach(mMFI->func_begin(), mMFI->func_end(), commaPrint, O); + O << "\n"; + } + + if (!mSTM->device()->isSupported(AMDILDeviceInfo::MacroDB) + && !mMFI->intr_empty()) { + O << ";intrinsic:" << mMFI->intr_size(); + binaryForEach(mMFI->intr_begin(), mMFI->intr_end(), commaPrint, O); + O << "\n"; + } + + if (!mIsKernel) { + binaryForEach(mMFI->printf_begin(), mMFI->printf_end(), printfPrint, O); + mMF->getMMI().getObjFileInfo<AMDILModuleInfo>().add_printf_offset( + mMFI->printf_size()); + } else { + for (StringMap<SamplerInfo>::iterator + smb = mMFI->sampler_begin(), + sme = mMFI->sampler_end(); smb != sme; ++ smb) { + O << ";sampler:" << (*smb).second.name << ":" << (*smb).second.idx + << ":" << ((*smb).second.val == 
(uint32_t)-1 ? 0 : 1) + << ":" << ((*smb).second.val != (uint32_t)-1 ? (*smb).second.val : 0) + << "\n"; + } + } + if (mSTM->is64bit()) { + O << ";memory:64bitABI\n"; + } + + if (mMFI->errors_empty()) { + binaryForEach(mMFI->errors_begin(), mMFI->errors_end(), errorPrint, O); + } + // This has to come last + if (mIsKernel + && mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + if (mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) > + mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + if (mMFI->uav_size() == 1) { + if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment) + && *(mMFI->uav_begin()) >= ARENA_SEGMENT_RESERVED_UAVS) { + O << ";uavid:" + << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID); + O << "\n"; + } else { + O << ";uavid:" << *(mMFI->uav_begin()) << "\n"; + } + } else if (mMFI->uav_count(mSTM->device()-> + getResourceID(AMDILDevice::RAW_UAV_ID))) { + O << ";uavid:" + << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + O << "\n"; + } else { + O << ";uavid:" + << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID); + O << "\n"; + } + } else if (mSTM->getGlobalManager()->getNumWriteImages(mName) != + OPENCL_MAX_WRITE_IMAGES + && !mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment) + && mMFI->uav_count(mSTM->device()-> + getResourceID(AMDILDevice::RAW_UAV_ID))) { + O << ";uavid:" + << mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) << "\n"; + } else if (mMFI->uav_size() == 1) { + O << ";uavid:" << *(mMFI->uav_begin()) << "\n"; + } else { + O << ";uavid:" + << mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID); + O << "\n"; + } + } + O << ";ARGEND:" << mName << "\n"; +} + +void AMDILKernelManager::printArgCopies(llvm::raw_ostream &O, + AMDILAsmPrinter *RegNames) +{ + Function::const_arg_iterator I = mMF->getFunction()->arg_begin(); + Function::const_arg_iterator Ie = mMF->getFunction()->arg_end(); + uint32_t Counter = 0; + + if (mMFI->getArgSize()) { + O << "dcl_cb cb1"; + O << 
"[" << (mMFI->getArgSize() >> 4) << "]\n"; + mMFI->setUsesMem(AMDILDevice::CONSTANT_ID); + } + const Function *F = mMF->getFunction(); + // Get the stack size + uint32_t stackSize = mMFI->getStackSize(); + uint32_t privateSize = mMFI->getScratchSize(); + uint32_t stackOffset = (privateSize + 15) & (~0xF); + if (stackSize + && mSTM->device()->usesHardware(AMDILDeviceInfo::PrivateMem)) { + // TODO: If the size is too large, we need to fall back to software emulated + // instead of using the hardware capability. + int size = (((stackSize + 15) & (~0xF)) >> 4); + if (size > 4096) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_PRIVATE_RESOURCES]); + } + if (size) { + // For any stack variables, we need to declare the literals for them so that + // we can use them when we copy our data to the stack. + mMFI->addReservedLiterals(1); + // Anytime we declare a literal, we need to reserve it, if it is not emitted + // in emitLiterals. +#ifdef UPSTREAM_LLVM + O << "dcl_literal l" << mMFI->getNumLiterals() << ", " << stackSize << ", " + << privateSize << ", 16, " << ((stackSize == privateSize) ? 
0 : stackOffset) << "\n" + << "iadd r0.x, " << RegNames->getRegisterName(AMDIL::T1) << ".x, l" + << mMFI->getNumLiterals() << ".w\n"; + if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { + O << "dcl_indexed_temp_array x" + << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID) << "[" + << size << "]\n"; + } else { + O << "dcl_typeless_uav_id(" + << mSTM->device()->getResourceID(AMDILDevice::SCRATCH_ID) + << ")_stride(4)_length(" << (size << 4 )<< ")_access(private)\n"; + + } + O << "mov " << RegNames->getRegisterName(AMDIL::FP) + << ".x, l" << mMFI->getNumLiterals() << ".0\n"; +#endif + mMFI->setUsesMem(AMDILDevice::SCRATCH_ID); + } + } + I = mMF->getFunction()->arg_begin(); + int32_t count = 0; + // uint32_t Image = 0; + bool displaced1 = false; + bool displaced2 = false; + uint32_t curReg = AMDIL::R1; + // TODO: We don't handle arguments that were pushed onto the stack! + for (; I != Ie; ++I) { + Type *curType = I->getType(); + unsigned int Buffer = 1; + O << "; Kernel arg setup: " << I->getName() << "\n"; + if (curType->isIntegerTy() || curType->isFloatingPointTy()) { + switch (curType->getPrimitiveSizeInBits()) { + default: + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1); + break; + case 16: + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1, + "l3.y" ); + break; + case 8: + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1, "l3.x" ); + break; + } +#ifdef UPSTREAM_LLVM + } else if (const VectorType *VT = dyn_cast<VectorType>(curType)) { + Type *ET = VT->getElementType(); + int numEle = VT->getNumElements(); + switch (ET->getPrimitiveSizeInBits()) { + default: + if (numEle == 3) { + O << "mov " << RegNames->getRegisterName(curReg); + O << ".x, cb" << Buffer << "[" << Counter << "].x\n"; + curReg++; + O << "mov " << RegNames->getRegisterName(curReg); + O << ".x, cb" << Buffer << "[" << Counter << "].y\n"; + curReg++; + O << "mov " << RegNames->getRegisterName(curReg); + O << ".x, cb" << 
Buffer << "[" << Counter << "].z\n"; + curReg++; + Counter++; + } else { + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, + (numEle+2) >> 2); + } + break; + case 64: + if (numEle == 3) { + O << "mov " << RegNames->getRegisterName(curReg); + O << ".xy, cb" << Buffer << "[" << Counter << "].xy\n"; + curReg++; + O << "mov " << RegNames->getRegisterName(curReg); + O << ".xy, cb" << Buffer << "[" << Counter++ << "].zw\n"; + curReg++; + O << "mov " << RegNames->getRegisterName(curReg); + O << ".xy, cb" << Buffer << "[" << Counter << "].xy\n"; + curReg++; + Counter++; + } else { + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, + (numEle) >> 1); + } + break; + case 16: + { + switch (numEle) { + default: + printConstantToRegMapping(RegNames, curReg, O, Counter, + Buffer, (numEle+2) >> 2, "l3.y", 1093); + if (numEle == 3) { + O << "mov " << RegNames->getRegisterName(curReg) << ".x, "; + O << RegNames->getRegisterName(curReg) << ".y\n"; + ++curReg; + O << "mov " << RegNames->getRegisterName(curReg) << ".x, "; + O << RegNames->getRegisterName(curReg) << ".z\n"; + ++curReg; + } + break; + case 2: + printConstantToRegMapping(RegNames, curReg, O, Counter, + Buffer, 1, "l3.y", 1092); + break; + } + break; + } + case 8: + { + switch (numEle) { + default: + printConstantToRegMapping(RegNames, curReg, O, Counter, + Buffer, (numEle+2) >> 2, "l3.x", 1091); + if (numEle == 3) { + O << "mov " << RegNames->getRegisterName(curReg) << ".x, "; + O << RegNames->getRegisterName(curReg) << ".y\n"; + ++curReg; + O << "mov " << RegNames->getRegisterName(curReg) << ".x, "; + O << RegNames->getRegisterName(curReg) << ".z\n"; + ++curReg; + } + break; + case 2: + printConstantToRegMapping(RegNames, curReg, O, Counter, + Buffer, 1, "l3.x", 1090); + break; + } + break; + } + } +#endif + } else if (const PointerType *PT = dyn_cast<PointerType>(curType)) { + Type *CT = PT->getElementType(); + const StructType *ST = dyn_cast<StructType>(CT); + if (ST && ST->isOpaque()) { 
+ bool i1d = ST->getName() == "struct._image1d_t"; + bool i1da = ST->getName() == "struct._image1d_array_t"; + bool i1db = ST->getName() == "struct._image1d_buffer_t"; + bool i2d = ST->getName() == "struct._image2d_t"; + bool i2da = ST->getName() == "struct._image2d_array_t"; + bool i3d = ST->getName() == "struct._image3d_t"; + bool is_image = i1d || i1da || i1db || i2d || i2da || i3d; + if (is_image) { + if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) { + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, + 1, NULL, 0, is_image); + } else { + mMFI->addErrorMsg( + amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]); + ++curReg; + } + } else { + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1); + } + } else if (CT->isStructTy() + && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + StructType *ST = dyn_cast<StructType>(CT); + bool i1d = ST->getName() == "struct._image1d_t"; + bool i1da = ST->getName() == "struct._image1d_array_t"; + bool i1db = ST->getName() == "struct._image1d_buffer_t"; + bool i2d = ST->getName() == "struct._image2d_t"; + bool i2da = ST->getName() == "struct._image2d_array_t"; + bool i3d = ST->getName() == "struct._image3d_t"; + bool is_image = i1d || i1da || i1db || i2d || i2da || i3d; + if (is_image) { + if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) { + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, + 1, NULL, 0, is_image); + } else { + mMFI->addErrorMsg(amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]); + ++curReg; + } + } else { + if (count) { + // Anytime we declare a literal, we need to reserve it, if it + // is not emitted in emitLiterals. 
+ mMFI->addReservedLiterals(1); + O << "dcl_literal l" << mMFI->getNumLiterals() << ", " + << -stackSize << ", " << stackSize << ", 16, " + << stackOffset << "\n"; + } + ++count; + size_t structSize; + structSize = (getTypeSize(ST) + 15) & ~15; + stackOffset += structSize; +#ifdef UPSTREAM_LLVM + O << "mov " << RegNames->getRegisterName((curReg)) << ", l" + << mMFI->getNumLiterals()<< ".w\n"; + if (!displaced1) { + O << "mov r1011, r1\n"; + displaced1 = true; + } + if (!displaced2 && strcmp(RegNames->getRegisterName(curReg), "r1")) { + O << "mov r1010, r2\n"; + displaced2 = true; + } +#endif + printCopyStructPrivate(ST, O, structSize, Buffer, mMFI->getNumLiterals(), + Counter); + ++curReg; + } + } else if (CT->isIntOrIntVectorTy() + || CT->isFPOrFPVectorTy() + || CT->isArrayTy() + || CT->isPointerTy() + || PT->getAddressSpace() != AMDILAS::PRIVATE_ADDRESS) { + if (PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS) { + const kernel& krnl = mGM->getKernel(F->getName()); + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, + 1, NULL, 0, false, + mGM->usesHWConstant(krnl, I->getName())); + } else if (PT->getAddressSpace() == AMDILAS::REGION_ADDRESS) { + // TODO: If we are region address space, the first region pointer, no + // array pointers exist, and hardware RegionMem is enabled then we can + // zero out register as the initial offset is zero. + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1); + } else if (PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) { + // TODO: If we are local address space, the first local pointer, no + // array pointers exist, and hardware LocalMem is enabled then we can + // zero out register as the initial offset is zero. 
+ printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1); + } else { + printConstantToRegMapping(RegNames, curReg, O, Counter, Buffer, 1); + } + } else { + assert(0 && "Current type is not supported!"); + mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + ++curReg; + } + } else { + assert(0 && "Current type is not supported!"); + mMFI->addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + ++curReg; + } + } + if (displaced1) { + O << "mov r1, r1011\n"; + } + if (displaced2) { + O << "mov r2, r1010\n"; + } + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) { + const kernel& krnl = mGM->getKernel(F->getName()); + uint32_t constNum = 0; + for (uint32_t x = 0; x < mSTM->device()->getMaxNumCBs(); ++x) { + if (krnl.constSizes[x]) { + O << "dcl_cb cb" << x + CB_BASE_OFFSET; + O << "[" << (((krnl.constSizes[x] + 15) & ~15) >> 4) << "]\n"; + ++constNum; + mMFI->setUsesMem(AMDILDevice::CONSTANT_ID); + } + } + // TODO: If we run out of constant resources, we need to push some of the + // constant pointers to the software emulated section. + if (constNum > mSTM->device()->getMaxNumCBs()) { + assert(0 && "Max constant buffer limit passed!"); + mMFI->addErrorMsg(amd::CompilerErrorMessage[INSUFFICIENT_CONSTANT_RESOURCES]); + } + } +} + + const char * +AMDILKernelManager::getTypeName(const Type *ptr, const char *symTab) +{ + // symTab argument is ignored... + LLVMContext& ctx = ptr->getContext(); + switch (ptr->getTypeID()) { + case Type::StructTyID: + { + const StructType *ST = cast<StructType>(ptr); + if (!ST->isOpaque()) + return "struct"; + // ptr is a pre-LLVM 3.0 "opaque" type. 
+ StringRef name = ST->getName(); + if (name.equals( "struct._event_t" )) return "event"; + if (name.equals( "struct._image1d_t" )) return "image1d"; + if (name.equals( "struct._image1d_array_t" )) return "image1d_array"; + if (name.equals( "struct._image2d_t" )) return "image2d"; + if (name.equals( "struct._image2d_array_t" )) return "image2d_array"; + if (name.equals( "struct._image3d_t" )) return "image3d"; + if (name.equals( "struct._counter32_t" )) return "counter32"; + if (name.equals( "struct._counter64_t" )) return "counter64"; + return "opaque"; + break; + } + case Type::FloatTyID: + return "float"; + case Type::DoubleTyID: + { + const AMDILSubtarget *mSTM= mTM->getSubtargetImpl(); + if (!mSTM->device()->usesHardware(AMDILDeviceInfo::DoubleOps)) { + mMFI->addErrorMsg(amd::CompilerErrorMessage[DOUBLE_NOT_SUPPORTED]); + } + return "double"; + } + case Type::IntegerTyID: + { + if (ptr == Type::getInt8Ty(ctx)) { + return "i8"; + } else if (ptr == Type::getInt16Ty(ctx)) { + return "i16"; + } else if (ptr == Type::getInt32Ty(ctx)) { + return "i32"; + } else if(ptr == Type::getInt64Ty(ctx)) { + return "i64"; + } + break; + } + default: + break; + case Type::ArrayTyID: + { + const ArrayType *AT = cast<ArrayType>(ptr); + const Type *name = AT->getElementType(); + return getTypeName(name, symTab); + break; + } + case Type::VectorTyID: + { + const VectorType *VT = cast<VectorType>(ptr); + const Type *name = VT->getElementType(); + return getTypeName(name, symTab); + break; + } + case Type::PointerTyID: + { + const PointerType *PT = cast<PointerType>(ptr); + const Type *name = PT->getElementType(); + return getTypeName(name, symTab); + break; + } + case Type::FunctionTyID: + { + const FunctionType *FT = cast<FunctionType>(ptr); + const Type *name = FT->getReturnType(); + return getTypeName(name, symTab); + break; + } + } + ptr->dump(); + mMFI->addErrorMsg(amd::CompilerErrorMessage[UNKNOWN_TYPE_NAME]); + return "unknown"; +} + +void 
AMDILKernelManager::emitLiterals(llvm::raw_ostream &O) { + char buffer[256]; + std::map<uint32_t, uint32_t>::iterator ilb, ile; + for (ilb = mMFI->begin_32(), ile = mMFI->end_32(); ilb != ile; ++ilb) { + uint32_t a = ilb->first; + O << "dcl_literal l" <<ilb->second<< ", "; + sprintf(buffer, "0x%08x, 0x%08x, 0x%08x, 0x%08x", a, a, a, a); + O << buffer << "; f32:i32 " << ilb->first << "\n"; + } + std::map<uint64_t, uint32_t>::iterator llb, lle; + for (llb = mMFI->begin_64(), lle = mMFI->end_64(); llb != lle; ++llb) { + uint32_t v[2]; + uint64_t a = llb->first; + memcpy(v, &a, sizeof(uint64_t)); + O << "dcl_literal l" <<llb->second<< ", "; + sprintf(buffer, "0x%08x, 0x%08x, 0x%08x, 0x%08x; f64:i64 ", + v[0], v[1], v[0], v[1]); + O << buffer << llb->first << "\n"; + } + std::map<std::pair<uint64_t, uint64_t>, uint32_t>::iterator vlb, vle; + for (vlb = mMFI->begin_128(), vle = mMFI->end_128(); vlb != vle; ++vlb) { + uint32_t v[2][2]; + uint64_t a = vlb->first.first; + uint64_t b = vlb->first.second; + memcpy(v[0], &a, sizeof(uint64_t)); + memcpy(v[1], &b, sizeof(uint64_t)); + O << "dcl_literal l" << vlb->second << ", "; + sprintf(buffer, "0x%08x, 0x%08x, 0x%08x, 0x%08x; f128:i128 ", + v[0][0], v[0][1], v[1][0], v[1][1]); + O << buffer << vlb->first.first << vlb->first.second << "\n"; + } +} + +// If the value is not known, then the uav is set, otherwise the mValueIDMap +// is used. 
+void AMDILKernelManager::setUAVID(const Value *value, uint32_t ID) { + if (value) { /* a NULL value is ignored: nothing is recorded and no default ID is changed */ + mValueIDMap[value] = ID; + } +} + +uint32_t AMDILKernelManager::getUAVID(const Value *value) { + if (mValueIDMap.find(value) != mValueIDMap.end()) { /* value was registered through setUAVID */ + return mValueIDMap[value]; + } + + if (mSTM->device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) { /* unregistered value: fall back to a device resource ID chosen by generation */ + return mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID); + } else { + return mSTM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + } +} + diff --git a/src/gallium/drivers/radeon/AMDILKernelManager.h b/src/gallium/drivers/radeon/AMDILKernelManager.h new file mode 100644 index 00000000000..d5eb296cbf2 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILKernelManager.h @@ -0,0 +1,177 @@ +//===-- AMDILKernelManager.h - Kernel metadata/ABI manager -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Class that handles the metadata/abi management for the +// ASM printer. Handles the parsing and generation of the metadata +// for each kernel and keeps track of its arguments.
+//
+//==-----------------------------------------------------------------------===//
+#ifndef _AMDILKERNELMANAGER_H_
+#define _AMDILKERNELMANAGER_H_
+#include "AMDIL.h"
+#include "AMDILDevice.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Function.h"
+
+#include <map>
+#include <set>
+#include <string>
+
+#define IMAGETYPE_2D 0
+#define IMAGETYPE_3D 1
+#define RESERVED_LIT_COUNT 6
+
+namespace llvm {
+class AMDILGlobalManager;
+class AMDILSubtarget;
+class AMDILMachineFunctionInfo;
+class AMDILTargetMachine;
+class AMDILAsmPrinter;
+class StructType;
+class Value;
+class TypeSymbolTable;
+class MachineFunction;
+class MachineInstr;
+class ConstantFP;
+class PrintfInfo;
+
+
+/// Per-kernel metadata/ABI manager used while printing AMDIL assembly.  It
+/// tracks the name, unique id and kernel-ness of the function being printed
+/// and a Value -> UAV id map.
+class AMDILKernelManager {
+public:
+  /// Selects in which build flavour an error message should be recorded.
+  typedef enum {
+    RELEASE_ONLY,
+    DEBUG_ONLY,
+    ALWAYS
+  } ErrorMsgEnum;
+  AMDILKernelManager(AMDILTargetMachine *TM, AMDILGlobalManager *GM);
+  virtual ~AMDILKernelManager();
+
+  /// Clear the state of the KernelManager, putting it back in its initial
+  /// state.
+  void clear();
+  /// Set the machine function the manager operates on.
+  void setMF(MachineFunction *MF);
+
+  /// Process the specific kernel parsing out the parameter information for the
+  /// kernel.
+  void processArgMetadata(llvm::raw_ostream &O,
+                          uint32_t buf, bool kernel);
+
+
+  /// Prints the header for the kernel which includes the groupsize declaration
+  /// and calculation of the local/group/global id's.
+  void printHeader(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O,
+                   const std::string &name);
+
+  virtual void printDecls(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O);
+  virtual void printGroupSize(llvm::raw_ostream &O);
+
+  /// Copies the data from the runtime setup constant buffers into registers so
+  /// that the program can correctly access memory or data that was set by the
+  /// host program.
+  void printArgCopies(llvm::raw_ostream &O, AMDILAsmPrinter* RegNames);
+
+  /// Prints out the end of the function.
+  void printFooter(llvm::raw_ostream &O);
+
+  /// Prints out the metadata for the specific function depending on whether it
+  /// is a kernel or not.
+  void printMetaData(llvm::raw_ostream &O, uint32_t id, bool isKernel = false);
+
+  /// Set bool value on whether to consider the function a kernel or a normal
+  /// function.
+  void setKernel(bool kernel);
+
+  /// Set the unique ID of the kernel/function.
+  void setID(uint32_t id);
+
+  /// Set the name of the kernel/function.
+  void setName(const std::string &name);
+
+  /// Flag to specify whether the function is a kernel or not.
+  bool isKernel();
+
+  /// Flag that specifies whether this function has a kernel wrapper.
+  bool wasKernel();
+
+  void getIntrinsicSetup(AMDILAsmPrinter *AsmPrinter, llvm::raw_ostream &O);
+
+  // Returns whether a compiler needs to insert a write to memory or not.
+  bool useCompilerWrite(const MachineInstr *MI);
+
+  // Set the flag that there exists an image write.
+  void setImageWrite();
+  void setOutputInst();
+
+  const char *getTypeName(const Type *name, const char * symTab);
+
+  void emitLiterals(llvm::raw_ostream &O);
+
+  // Set the uav id for the specific pointer value. A NULL value is ignored:
+  // the implementation stores no mapping (and no default ID) for it.
+  void setUAVID(const Value *value, uint32_t ID);
+
+  // Get the UAV id for the specific pointer value. When no id was recorded
+  // for the value, a device resource id is returned instead (the arena UAV on
+  // generations up to HD6XXX, the raw UAV otherwise).
+  uint32_t getUAVID(const Value *value);
+
+private:
+
+  /// Helper function that prints the actual metadata and should only be called
+  /// by printMetaData.
+  void printKernelArgs(llvm::raw_ostream &O);
+  void printCopyStructPrivate(const StructType *ST,
+                              llvm::raw_ostream &O,
+                              size_t stackSize,
+                              uint32_t Buffer,
+                              uint32_t mLitIdx,
+                              uint32_t &counter);
+  virtual void
+  printConstantToRegMapping(AMDILAsmPrinter *RegNames,
+                            uint32_t &LII,
+                            llvm::raw_ostream &O,
+                            uint32_t &counter,
+                            uint32_t Buffer,
+                            uint32_t n,
+                            const char *lit = NULL,
+                            uint32_t fcall = 0,
+                            bool isImage = false,
+                            bool isHWCB = false);
+  void updatePtrArg(llvm::Function::const_arg_iterator Ip,
+                    int numWriteImages,
+                    int raw_uav_buffer,
+                    int counter,
+                    bool isKernel,
+                    const Function *F);
+  /// Name of the current kernel.
+  std::string mName;
+  /// Unique ID of the current kernel/function (see setID).
+  uint32_t mUniqueID;
+  bool mIsKernel;
+  bool mWasKernel;
+  bool mCompilerWrite;
+  /// Flag to specify if an image write has occurred or not in order to not add
+  /// a compiler specific write if no other writes to memory occurred.
+  bool mHasImageWrite;
+  bool mHasOutputInst;
+
+  /// Map from const Value * to UAV ID.
+  std::map<const Value *, uint32_t> mValueIDMap;
+
+  AMDILTargetMachine * mTM;
+  const AMDILSubtarget * mSTM;
+  AMDILGlobalManager * mGM;
+  /// NOTE(review): the comment originally here described "the global offset of
+  /// the printf string id's", which matches none of the members below; it
+  /// presumably belonged to a member that was removed.  mMF is presumably the
+  /// function passed to setMF -- confirm against the implementation.
+  MachineFunction *mMF;
+  AMDILMachineFunctionInfo *mMFI;
+}; // class AMDILKernelManager
+
+} // llvm namespace
+#endif // _AMDILKERNELMANAGER_H_
diff --git a/src/gallium/drivers/radeon/AMDILLiteralManager.cpp b/src/gallium/drivers/radeon/AMDILLiteralManager.cpp
new file mode 100644
index 00000000000..43167f57001
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILLiteralManager.cpp
@@ -0,0 +1,128 @@
+//===--- AMDILLiteralManager.cpp - AMDIL Literal Manager Pass --*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "literal_manager" + +#include "AMDIL.h" + +#include "AMDILAlgorithms.tpp" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILSubtarget.h" +#include "AMDILTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; + + +// AMDIL Literal Manager traverses through all of the LOADCONST instructions and +// converts them from an immediate value to the literal index. The literal index +// is valid IL, but the immediate values are not. The Immediate values must be +// aggregated and declared for clarity and to reduce the number of literals that +// are used. It is also illegal to declare the same literal twice, so this keeps +// that from occuring. + +namespace { + class AMDILLiteralManager : public MachineFunctionPass { + public: + static char ID; + AMDILLiteralManager(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + virtual const char *getPassName() const; + + bool runOnMachineFunction(MachineFunction &MF); + private: + bool trackLiterals(MachineBasicBlock::iterator *bbb); + TargetMachine &TM; + const AMDILSubtarget *mSTM; + AMDILKernelManager *mKM; + AMDILMachineFunctionInfo *mMFI; + int32_t mLitIdx; + bool mChanged; + }; + char AMDILLiteralManager::ID = 0; +} + +namespace llvm { + FunctionPass * + createAMDILLiteralManager(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) { + return new AMDILLiteralManager(tm AMDIL_OPT_LEVEL_VAR); + } + +} + +AMDILLiteralManager::AMDILLiteralManager(TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) + : MachineFunctionPass(ID), + TM(tm) { +} + +bool AMDILLiteralManager::runOnMachineFunction(MachineFunction &MF) { + mChanged = false; + mMFI = MF.getInfo<AMDILMachineFunctionInfo>(); + const AMDILTargetMachine *amdtm = + reinterpret_cast<const AMDILTargetMachine *>(&TM); + mSTM = 
dynamic_cast<const AMDILSubtarget *>(amdtm->getSubtargetImpl()); + mKM = const_cast<AMDILKernelManager *>(mSTM->getKernelManager()); + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), + std::bind1st(std::mem_fun(&AMDILLiteralManager::trackLiterals), this)); + return mChanged; +} + +bool AMDILLiteralManager::trackLiterals(MachineBasicBlock::iterator *bbb) { + MachineInstr *MI = *bbb; + uint32_t Opcode = MI->getOpcode(); + switch(Opcode) { + default: + return false; + case AMDIL::LOADCONST_i8: + case AMDIL::LOADCONST_i16: + case AMDIL::LOADCONST_i32: + case AMDIL::LOADCONST_i64: + case AMDIL::LOADCONST_f32: + case AMDIL::LOADCONST_f64: + break; + }; + MachineOperand &dstOp = MI->getOperand(0); + MachineOperand &litOp = MI->getOperand(1); + if (!litOp.isImm() && !litOp.isFPImm()) { + return false; + } + if (!dstOp.isReg()) { + return false; + } + // Change the literal to the correct index for each literal that is found. + if (litOp.isImm()) { + int64_t immVal = litOp.getImm(); + uint32_t idx = MI->getOpcode() == AMDIL::LOADCONST_i64 + ? mMFI->addi64Literal(immVal) + : mMFI->addi32Literal(static_cast<int>(immVal), Opcode); + litOp.ChangeToImmediate(idx); + return false; + } + + if (litOp.isFPImm()) { + const ConstantFP *fpVal = litOp.getFPImm(); + uint32_t idx = MI->getOpcode() == AMDIL::LOADCONST_f64 + ? 
mMFI->addf64Literal(fpVal) + : mMFI->addf32Literal(fpVal); + litOp.ChangeToImmediate(idx); + return false; + } + + return false; +} + +const char* AMDILLiteralManager::getPassName() const { + return "AMDIL Constant Propagation"; +} + + diff --git a/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp b/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp new file mode 100644 index 00000000000..9366f2e7bcb --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp @@ -0,0 +1,158 @@ +//===---- AMDILMCCodeEmitter.cpp - Convert AMDIL text to AMDIL binary ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +//===---------------------------------------------------------------------===// + +#define DEBUG_TYPE "amdil-emitter" +#include "AMDIL.h" +#include "AMDILInstrInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +#if 0 +namespace { + class AMDILMCCodeEmitter : public MCCodeEmitter { + AMDILMCCodeEmitter(const AMDILMCCodeEmitter &);// DO NOT IMPLEMENT + void operator=(const AMDILMCCodeEmitter &); // DO NOT IMPLEMENT + const TargetMachine &TM; + const TargetInstrInfo &TII; + MCContext &Ctx; + bool Is64BitMode; + public: + AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit); + ~AMDILMCCodeEmitter(); + unsigned getNumFixupKinds() const; + const MCFixupKindInfo& getFixupKindInfo(MCFixupKind Kind) const; + static unsigned GetAMDILRegNum(const MCOperand &MO); + void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const; + void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, + raw_ostream &OS) const; + void EmitImmediate(const MCOperand &Disp, unsigned ImmSize, + MCFixupKind 
FixupKind, unsigned &CurByte, raw_ostream &os, + SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const; + + void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + }; // class AMDILMCCodeEmitter +}; // anonymous namespace + +namespace llvm { + MCCodeEmitter *createAMDILMCCodeEmitter(const Target &, + TargetMachine &TM, MCContext &Ctx) + { + return new AMDILMCCodeEmitter(TM, Ctx, false); + } +} + +AMDILMCCodeEmitter::AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx + , bool is64Bit) +: TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx) +{ + Is64BitMode = is64Bit; +} + +AMDILMCCodeEmitter::~AMDILMCCodeEmitter() +{ +} + +unsigned +AMDILMCCodeEmitter::getNumFixupKinds() const +{ + return 0; +} + +const MCFixupKindInfo & +AMDILMCCodeEmitter::getFixupKindInfo(MCFixupKind Kind) const +{ +// const static MCFixupKindInfo Infos[] = {}; + if (Kind < FirstTargetFixupKind) { + return MCCodeEmitter::getFixupKindInfo(Kind); + } + assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() && + "Invalid kind!"); + return MCCodeEmitter::getFixupKindInfo(Kind); + // return Infos[Kind - FirstTargetFixupKind]; + +} + +void +AMDILMCCodeEmitter::EmitByte(unsigned char C, unsigned &CurByte, + raw_ostream &OS) const +{ + OS << (char) C; + ++CurByte; +} +void +AMDILMCCodeEmitter::EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte, + raw_ostream &OS) const +{ + // Output the constant in little endian byte order + for (unsigned i = 0; i != Size; ++i) { + EmitByte(Val & 255, CurByte, OS); + Val >>= 8; + } +} +void +AMDILMCCodeEmitter::EmitImmediate(const MCOperand &DispOp, unsigned ImmSize, + MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const +{ + // If this is a simple integer displacement that doesn't require a relocation + // emit it now. 
+ if (DispOp.isImm()) { + EmitConstant(DispOp.getImm() + ImmOffset, ImmSize, CurByte, OS); + } + + // If we have an immoffset, add it to the expression + const MCExpr *Expr = DispOp.getExpr(); + + if (ImmOffset) { + Expr = MCBinaryExpr::CreateAdd(Expr, + MCConstantExpr::Create(ImmOffset, Ctx), Ctx); + } + // Emit a symbolic constant as a fixup and 4 zeros. + Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind)); + // TODO: Why the 4 zeros? + EmitConstant(0, ImmSize, CurByte, OS); +} + +void +AMDILMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const +{ +#if 0 + unsigned Opcode = MI.getOpcode(); + const TargetInstrDesc &Desc = TII.get(Opcode); + unsigned TSFlags = Desc.TSFlags; + + // Keep track of the current byte being emitted. + unsigned CurByte = 0; + + unsigned NumOps = Desc.getNumOperands(); + unsigned CurOp = 0; + + unsigned char BaseOpcode = 0; +#ifndef NDEBUG + // FIXME: Verify. + if (// !Desc.isVariadic() && + CurOp != NumOps) { + errs() << "Cannot encode all operands of: "; + MI.dump(); + errs() << '\n'; + abort(); + } +#endif +#endif +} +#endif diff --git a/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.cpp b/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.cpp new file mode 100644 index 00000000000..0061d29e7df --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.cpp @@ -0,0 +1,597 @@ +//===-- AMDILMachineFunctionInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +#include "AMDILMachineFunctionInfo.h" +#include "AMDILCompilerErrors.h" +#include "AMDILModuleInfo.h" +#include "AMDILSubtarget.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Support/FormattedStream.h" + +using namespace llvm; + +static const AMDILConstPtr *getConstPtr(const AMDILKernel *krnl, const std::string &arg) { + llvm::SmallVector<AMDILConstPtr, DEFAULT_VEC_SLOTS>::const_iterator begin, end; + for (begin = krnl->constPtr.begin(), end = krnl->constPtr.end(); + begin != end; ++begin) { + if (!strcmp(begin->name.data(),arg.c_str())) { + return &(*begin); + } + } + return NULL; +} + +void PrintfInfo::addOperand(size_t idx, uint32_t size) { + mOperands.resize((unsigned)(idx + 1)); + mOperands[(unsigned)idx] = size; +} + +uint32_t PrintfInfo::getPrintfID() { + return mPrintfID; +} + +void PrintfInfo::setPrintfID(uint32_t id) { + mPrintfID = id; +} + +size_t PrintfInfo::getNumOperands() { + return mOperands.size(); +} + +uint32_t PrintfInfo::getOperandID(uint32_t idx) { + return mOperands[idx]; +} + +AMDILMachineFunctionInfo::AMDILMachineFunctionInfo() + : CalleeSavedFrameSize(0), BytesToPopOnReturn(0), + DecorationStyle(None), ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), UsesLDS(false), LDSArg(false), + UsesGDS(false), GDSArg(false), + mReservedLits(9) +{ + for (uint32_t x = 0; x < AMDILDevice::MAX_IDS; ++x) { + mUsedMem[x] = false; + } + mMF = NULL; + mKernel = NULL; + mScratchSize = -1; + mArgSize = -1; + mStackSize = -1; +} + +AMDILMachineFunctionInfo::AMDILMachineFunctionInfo(MachineFunction& MF) + : CalleeSavedFrameSize(0), BytesToPopOnReturn(0), + 
DecorationStyle(None), ReturnAddrIndex(0), + TailCallReturnAddrDelta(0), + SRetReturnReg(0), UsesLDS(false), LDSArg(false), + UsesGDS(false), GDSArg(false), + mReservedLits(9) +{ + for (uint32_t x = 0; x < AMDILDevice::MAX_IDS; ++x) { + mUsedMem[x] = false; + } + const Function *F = MF.getFunction(); + mMF = &MF; + MachineModuleInfo &mmi = MF.getMMI(); + const AMDILTargetMachine *TM = + reinterpret_cast<const AMDILTargetMachine*>(&MF.getTarget()); + AMDILModuleInfo *AMI = &(mmi.getObjFileInfo<AMDILModuleInfo>()); + AMI->processModule(mmi.getModule(), TM); + mSTM = TM->getSubtargetImpl(); + mKernel = AMI->getKernel(F->getName()); + + mScratchSize = -1; + mArgSize = -1; + mStackSize = -1; +} + +AMDILMachineFunctionInfo::~AMDILMachineFunctionInfo() +{ + for (std::map<std::string, PrintfInfo*>::iterator pfb = printf_begin(), + pfe = printf_end(); pfb != pfe; ++pfb) { + delete pfb->second; + } +} +unsigned int +AMDILMachineFunctionInfo::getCalleeSavedFrameSize() const +{ + return CalleeSavedFrameSize; +} +void +AMDILMachineFunctionInfo::setCalleeSavedFrameSize(unsigned int bytes) +{ + CalleeSavedFrameSize = bytes; +} +unsigned int +AMDILMachineFunctionInfo::getBytesToPopOnReturn() const +{ + return BytesToPopOnReturn; +} +void +AMDILMachineFunctionInfo::setBytesToPopOnReturn(unsigned int bytes) +{ + BytesToPopOnReturn = bytes; +} +NameDecorationStyle +AMDILMachineFunctionInfo::getDecorationStyle() const +{ + return DecorationStyle; +} +void +AMDILMachineFunctionInfo::setDecorationStyle(NameDecorationStyle style) +{ + DecorationStyle = style; +} +int +AMDILMachineFunctionInfo::getRAIndex() const +{ + return ReturnAddrIndex; +} +void +AMDILMachineFunctionInfo::setRAIndex(int index) +{ + ReturnAddrIndex = index; +} +int +AMDILMachineFunctionInfo::getTCReturnAddrDelta() const +{ + return TailCallReturnAddrDelta; +} +void +AMDILMachineFunctionInfo::setTCReturnAddrDelta(int delta) +{ + TailCallReturnAddrDelta = delta; +} +unsigned int 
+AMDILMachineFunctionInfo::getSRetReturnReg() const +{ + return SRetReturnReg; +} +void +AMDILMachineFunctionInfo::setSRetReturnReg(unsigned int reg) +{ + SRetReturnReg = reg; +} + +void +AMDILMachineFunctionInfo::setUsesLocal() +{ + UsesLDS = true; +} + +bool +AMDILMachineFunctionInfo::usesLocal() const +{ + return UsesLDS; +} + +void +AMDILMachineFunctionInfo::setHasLocalArg() +{ + LDSArg = true; +} + +bool +AMDILMachineFunctionInfo::hasLocalArg() const +{ + return LDSArg; +} + + + +void +AMDILMachineFunctionInfo::setUsesRegion() +{ + UsesGDS = true; +} + +bool +AMDILMachineFunctionInfo::usesRegion() const +{ + return UsesGDS; +} + +void +AMDILMachineFunctionInfo::setHasRegionArg() +{ + GDSArg = true; +} + +bool +AMDILMachineFunctionInfo::hasRegionArg() const +{ + return GDSArg; +} + + +bool +AMDILMachineFunctionInfo::usesHWConstant(std::string name) const +{ + const AMDILConstPtr *curConst = getConstPtr(mKernel, name); + if (curConst) { + return curConst->usesHardware; + } else { + return false; + } +} + +uint32_t +AMDILMachineFunctionInfo::getLocal(uint32_t dim) +{ + if (mKernel && mKernel->sgv) { + AMDILKernelAttr *sgv = mKernel->sgv; + switch (dim) { + default: break; + case 0: + case 1: + case 2: + return sgv->reqGroupSize[dim]; + break; + case 3: + return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2]; + }; + } + switch (dim) { + default: + return 1; + case 3: + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); + case 2: + case 1: + case 0: + return mSTM->getDefaultSize(dim); + break; + }; + return 1; +} +bool +AMDILMachineFunctionInfo::isKernel() const +{ + return mKernel != NULL && mKernel->mKernel; +} + +AMDILKernel* +AMDILMachineFunctionInfo::getKernel() +{ + return mKernel; +} + +std::string +AMDILMachineFunctionInfo::getName() +{ + if (mMF) { + return mMF->getFunction()->getName(); + } else { + return ""; + } +} + +uint32_t +AMDILMachineFunctionInfo::getArgSize() +{ + if (mArgSize == -1) { + 
Function::const_arg_iterator I = mMF->getFunction()->arg_begin(); + Function::const_arg_iterator Ie = mMF->getFunction()->arg_end(); + uint32_t Counter = 0; + while (I != Ie) { + Type* curType = I->getType(); + if (curType->isIntegerTy() || curType->isFloatingPointTy()) { + ++Counter; + } else if (const VectorType *VT = dyn_cast<VectorType>(curType)) { + Type *ET = VT->getElementType(); + int numEle = VT->getNumElements(); + switch (ET->getPrimitiveSizeInBits()) { + default: + if (numEle == 3) { + Counter++; + } else { + Counter += ((numEle + 2) >> 2); + } + break; + case 64: + if (numEle == 3) { + Counter += 2; + } else { + Counter += (numEle >> 1); + } + break; + case 16: + case 8: + switch (numEle) { + default: + Counter += ((numEle + 2) >> 2); + case 2: + Counter++; + break; + } + break; + } + } else if (const PointerType *PT = dyn_cast<PointerType>(curType)) { + Type *CT = PT->getElementType(); + const StructType *ST = dyn_cast<StructType>(CT); + if (ST && ST->isOpaque()) { + bool i1d = ST->getName() == "struct._image1d_t"; + bool i1da = ST->getName() == "struct._image1d_array_t"; + bool i1db = ST->getName() == "struct._image1d_buffer_t"; + bool i2d = ST->getName() == "struct._image2d_t"; + bool i2da = ST->getName() == "struct._image2d_array_t"; + bool i3d = ST->getName() == "struct._image3d_t"; + bool is_image = i1d || i1da || i1db || i2d || i2da || i3d; + if (is_image) { + if (mSTM->device()->isSupported(AMDILDeviceInfo::Images)) { + Counter += 2; + } else { + addErrorMsg(amd::CompilerErrorMessage[NO_IMAGE_SUPPORT]); + } + } else { + Counter++; + } + } else if (CT->isStructTy() + && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + StructType *ST = dyn_cast<StructType>(CT); + Counter += ((getTypeSize(ST) + 15) & ~15) >> 4; + } else if (CT->isIntOrIntVectorTy() + || CT->isFPOrFPVectorTy() + || CT->isArrayTy() + || CT->isPointerTy() + || PT->getAddressSpace() != AMDILAS::PRIVATE_ADDRESS) { + ++Counter; + } else { + assert(0 && "Current type is not 
supported!"); + addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + } + } else { + assert(0 && "Current type is not supported!"); + addErrorMsg(amd::CompilerErrorMessage[INTERNAL_ERROR]); + } + ++I; + } + // Convert from slots to bytes by multiplying by 16(shift by 4). + mArgSize = Counter << 4; + } + return (uint32_t)mArgSize; +} + uint32_t +AMDILMachineFunctionInfo::getScratchSize() +{ + if (mScratchSize == -1) { + mScratchSize = 0; + Function::const_arg_iterator I = mMF->getFunction()->arg_begin(); + Function::const_arg_iterator Ie = mMF->getFunction()->arg_end(); + while (I != Ie) { + Type *curType = I->getType(); + mScratchSize += ((getTypeSize(curType) + 15) & ~15); + ++I; + } + mScratchSize += ((mScratchSize + 15) & ~15); + } + return (uint32_t)mScratchSize; +} + + uint32_t +AMDILMachineFunctionInfo::getStackSize() +{ + if (mStackSize == -1) { + uint32_t privSize = 0; + const MachineFrameInfo *MFI = mMF->getFrameInfo(); + privSize = MFI->getOffsetAdjustment() + MFI->getStackSize(); + const AMDILTargetMachine *TM = + reinterpret_cast<const AMDILTargetMachine*>(&mMF->getTarget()); + bool addStackSize = TM->getOptLevel() == CodeGenOpt::None; + Function::const_arg_iterator I = mMF->getFunction()->arg_begin(); + Function::const_arg_iterator Ie = mMF->getFunction()->arg_end(); + while (I != Ie) { + Type *curType = I->getType(); + ++I; + if (dyn_cast<PointerType>(curType)) { + Type *CT = dyn_cast<PointerType>(curType)->getElementType(); + if (CT->isStructTy() + && dyn_cast<PointerType>(curType)->getAddressSpace() + == AMDILAS::PRIVATE_ADDRESS) { + addStackSize = true; + } + } + } + if (addStackSize) { + privSize += getScratchSize(); + } + mStackSize = privSize; + } + return (uint32_t)mStackSize; + +} + +uint32_t +AMDILMachineFunctionInfo::addi32Literal(uint32_t val, int Opcode) { + // Since we have emulated 16/8/1 bit register types with a 32bit real + // register, we need to sign extend the constants to 32bits in order for + // comparisons against the 
constants to work correctly, this fixes some issues + // we had in conformance failing for saturation. + if (Opcode == AMDIL::LOADCONST_i16) { + val = (((int32_t)val << 16) >> 16); + } else if (Opcode == AMDIL::LOADCONST_i8) { + val = (((int32_t)val << 24) >> 24); + } + if (mIntLits.find(val) == mIntLits.end()) { + mIntLits[val] = getNumLiterals(); + } + return mIntLits[val]; +} + +uint32_t +AMDILMachineFunctionInfo::addi64Literal(uint64_t val) { + if (mLongLits.find(val) == mLongLits.end()) { + mLongLits[val] = getNumLiterals(); + } + return mLongLits[val]; +} + +uint32_t +AMDILMachineFunctionInfo::addi128Literal(uint64_t val_lo, uint64_t val_hi) { + std::pair<uint64_t, uint64_t> a; + a.first = val_lo; + a.second = val_hi; + if (mVecLits.find(a) == mVecLits.end()) { + mVecLits[a] = getNumLiterals(); + } + return mVecLits[a]; +} + +uint32_t +AMDILMachineFunctionInfo::addf32Literal(const ConstantFP *CFP) { + uint32_t val = (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(); + if (mIntLits.find(val) == mIntLits.end()) { + mIntLits[val] = getNumLiterals(); + } + return mIntLits[val]; +} + +uint32_t +AMDILMachineFunctionInfo::addf64Literal(const ConstantFP *CFP) { + union dtol_union { + double d; + uint64_t ul; + } dval; + const APFloat &APF = CFP->getValueAPF(); + if (&APF.getSemantics() == (const llvm::fltSemantics *)&APFloat::IEEEsingle) { + float fval = APF.convertToFloat(); + dval.d = (double)fval; + } else { + dval.d = APF.convertToDouble(); + } + if (mLongLits.find(dval.ul) == mLongLits.end()) { + mLongLits[dval.ul] = getNumLiterals(); + } + return mLongLits[dval.ul]; +} + + uint32_t +AMDILMachineFunctionInfo::getIntLits(uint32_t offset) +{ + return mIntLits[offset]; +} + + uint32_t +AMDILMachineFunctionInfo::getLongLits(uint64_t offset) +{ + return mLongLits[offset]; +} + + uint32_t +AMDILMachineFunctionInfo::getVecLits(uint64_t low64, uint64_t high64) +{ + return mVecLits[std::pair<uint64_t, uint64_t>(low64, high64)]; +} + +size_t 
+AMDILMachineFunctionInfo::getNumLiterals() const { + return mLongLits.size() + mIntLits.size() + mVecLits.size() + mReservedLits; +} + + void +AMDILMachineFunctionInfo::addReservedLiterals(uint32_t size) +{ + mReservedLits += size; +} + + uint32_t +AMDILMachineFunctionInfo::addSampler(std::string name, uint32_t val) +{ + if (mSamplerMap.find(name) != mSamplerMap.end()) { + SamplerInfo newVal = mSamplerMap[name]; + assert(newVal.val == val + && "Found a sampler with same name but different values!"); + return mSamplerMap[name].idx; + } else { + SamplerInfo curVal; + curVal.name = name; + curVal.val = val; + curVal.idx = mSamplerMap.size(); + mSamplerMap[name] = curVal; + return curVal.idx; + } +} + +void +AMDILMachineFunctionInfo::setUsesMem(unsigned id) { + assert(id < AMDILDevice::MAX_IDS && + "Must set the ID to be less than MAX_IDS!"); + mUsedMem[id] = true; +} + +bool +AMDILMachineFunctionInfo::usesMem(unsigned id) { + assert(id < AMDILDevice::MAX_IDS && + "Must set the ID to be less than MAX_IDS!"); + return mUsedMem[id]; +} + + void +AMDILMachineFunctionInfo::addErrorMsg(const char *msg, ErrorMsgEnum val) +{ + if (val == DEBUG_ONLY) { +#if defined(DEBUG) || defined(_DEBUG) + mErrors.insert(msg); +#endif + } else if (val == RELEASE_ONLY) { +#if !defined(DEBUG) && !defined(_DEBUG) + mErrors.insert(msg); +#endif + } else if (val == ALWAYS) { + mErrors.insert(msg); + } +} + + uint32_t +AMDILMachineFunctionInfo::addPrintfString(std::string &name, unsigned offset) +{ + if (mPrintfMap.find(name) != mPrintfMap.end()) { + return mPrintfMap[name]->getPrintfID(); + } else { + PrintfInfo *info = new PrintfInfo; + info->setPrintfID(mPrintfMap.size() + offset); + mPrintfMap[name] = info; + return info->getPrintfID(); + } +} + + void +AMDILMachineFunctionInfo::addPrintfOperand(std::string &name, + size_t idx, + uint32_t size) +{ + mPrintfMap[name]->addOperand(idx, size); +} + + void +AMDILMachineFunctionInfo::addMetadata(const char *md, bool kernelOnly) +{ + 
addMetadata(std::string(md), kernelOnly); +} + + void +AMDILMachineFunctionInfo::addMetadata(std::string md, bool kernelOnly) +{ + if (kernelOnly) { + mMetadataKernel.push_back(md); + } else { + mMetadataFunc.insert(md); + } +} + diff --git a/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.h b/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.h new file mode 100644 index 00000000000..45f57518184 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILMachineFunctionInfo.h @@ -0,0 +1,422 @@ +//== AMDILMachineFunctionInfo.h - AMD il Machine Function Info -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file declares AMDIL-specific per-machine-function information +// +//===----------------------------------------------------------------------===// +#ifndef _AMDILMACHINEFUNCTIONINFO_H_ +#define _AMDILMACHINEFUNCTIONINFO_H_ +#include "AMDIL.h" +#include "AMDILDevice.h" +#include "AMDILKernel.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Function.h" + +#include <map> +#include <set> +#include <string> + +namespace llvm +{ + class AMDILSubtarget; + class PrintfInfo { + uint32_t mPrintfID; + SmallVector<uint32_t, DEFAULT_VEC_SLOTS> mOperands; + public: + void addOperand(size_t idx, uint32_t size); + uint32_t getPrintfID(); + void setPrintfID(uint32_t idx); + size_t getNumOperands(); + uint32_t getOperandID(uint32_t idx); + }; // class PrintfInfo + + enum NameDecorationStyle + { + None, + StdCall, + FastCall + }; + typedef struct SamplerInfoRec { + std::string name; // The name of the sampler + uint32_t val; // The value of 
the sampler + uint32_t idx; // The sampler resource id + } SamplerInfo; + // Some typedefs that will help with using the various iterators + // of the machine function info class. + typedef std::map<uint32_t, uint32_t>::iterator lit32_iterator; + typedef std::map<uint64_t, uint32_t>::iterator lit64_iterator; + typedef std::map<std::pair<uint64_t, uint64_t>, uint32_t>::iterator + lit128_iterator; + typedef StringMap<SamplerInfo>::iterator sampler_iterator; + typedef DenseSet<uint32_t>::iterator func_iterator; + typedef DenseSet<uint32_t>::iterator intr_iterator; + typedef DenseSet<uint32_t>::iterator uav_iterator; + typedef DenseSet<uint32_t>::iterator read_image2d_iterator; + typedef DenseSet<uint32_t>::iterator read_image3d_iterator; + typedef DenseSet<uint32_t>::iterator write_image2d_iterator; + typedef DenseSet<uint32_t>::iterator write_image3d_iterator; + typedef DenseSet<const char*>::iterator error_iterator; + typedef std::map<std::string, PrintfInfo*>::iterator printf_iterator; + typedef std::set<std::string>::iterator func_md_iterator; + typedef std::vector<std::string>::iterator kernel_md_iterator; + // AMDILMachineFunctionInfo - This class is + // derived from MachineFunction private + // amdil target-specific information for each MachineFunction + class AMDILMachineFunctionInfo : public MachineFunctionInfo + { + // CalleeSavedFrameSize - Size of the callee-saved + // register portion of the + // stack frame in bytes. + unsigned int CalleeSavedFrameSize; + // BytesToPopOnReturn - Number of bytes function pops on return. + // Used on windows platform for stdcall & fastcall name decoration + unsigned int BytesToPopOnReturn; + // DecorationStyle - If the function requires additional + // name decoration, + // DecorationStyle holds the right way to do so. + NameDecorationStyle DecorationStyle; + // ReturnAddrIndex - FrameIndex for return slot. 
+ int ReturnAddrIndex; + + // TailCallReturnAddrDelta - Delta the ReturnAddr stack slot is moved + // Used for creating an area before the register spill area + // on the stack + // the returnaddr can be savely move to this area + int TailCallReturnAddrDelta; + + // SRetReturnReg - Some subtargets require that sret lowering includes + // returning the value of the returned struct in a register. + // This field holds the virtual register into which the sret + // argument is passed. + unsigned int SRetReturnReg; + + // UsesLocal - Specifies that this function uses LDS memory and + // that it needs to be allocated. + bool UsesLDS; + + // LDSArg - Flag that specifies if this function has an Local + // argument or not + bool LDSArg; + + // UsesGDS - Specifies that this function uses GDS memory and + // that it needs to be allocated. + bool UsesGDS; + + // GDSArg - Flag that specifies if this function has an Region + // argument or not + bool GDSArg; + + // The size in bytes required to host all of the kernel arguments. + // -1 means this value has not been determined yet. + int32_t mArgSize; + + // The size in bytes required to host the stack and the kernel arguments + // in private memory. + // -1 means this value has not been determined yet. + int32_t mScratchSize; + + // The size in bytes required to host the the kernel arguments + // on the stack. + // -1 means this value has not been determined yet. + int32_t mStackSize; + + /// A map of constant to literal mapping for all of the 32bit or + /// smaller literals in the current function. + std::map<uint32_t, uint32_t> mIntLits; + + /// A map of constant to literal mapping for all of the 64bit + /// literals in the current function. + std::map<uint64_t, uint32_t> mLongLits; + + /// A map of constant to literal mapping for all of the 128bit + /// literals in the current function. + std::map<std::pair<uint64_t, uint64_t>, uint32_t> mVecLits; + + /// The number of literals that should be reserved. 
+ /// TODO: Remove this when the wrapper emitter is added. + uint32_t mReservedLits; + + /// A map of name to sampler information that is used to emit + /// metadata to the IL stream that the runtimes can use for + /// hardware setup. + StringMap<SamplerInfo> mSamplerMap; + + /// Array of flags to specify if a specific memory type is used or not. + bool mUsedMem[AMDILDevice::MAX_IDS]; + + /// Set of all functions that this function calls. + DenseSet<uint32_t> mFuncs; + + /// Set of all intrinsics that this function calls. + DenseSet<uint32_t> mIntrs; + + /// Set of all read only 2D images. + DenseSet<uint32_t> mRO2D; + /// Set of all read only 3D images. + DenseSet<uint32_t> mRO3D; + /// Set of all write only 2D images. + DenseSet<uint32_t> mWO2D; + /// Set of all write only 3D images. + DenseSet<uint32_t> mWO3D; + /// Set of all the raw uavs. + DenseSet<uint32_t> mRawUAV; + /// Set of all the arena uavs. + DenseSet<uint32_t> mArenaUAV; + + /// A set of all errors that occured in the backend for this function. + DenseSet<const char *> mErrors; + + /// A mapping of printf data and the printf string + std::map<std::string, PrintfInfo*> mPrintfMap; + + /// A set of all of the metadata that is used for the current function. + std::set<std::string> mMetadataFunc; + + /// A set of all of the metadata that is used for the function wrapper. + std::vector<std::string> mMetadataKernel; + + /// Information about the kernel, NULL if the function is not a kernel. + AMDILKernel *mKernel; + + /// Pointer to the machine function that this information belongs to. + MachineFunction *mMF; + + /// Pointer to the subtarget for this function. 
+ const AMDILSubtarget *mSTM; + public: + AMDILMachineFunctionInfo(); + AMDILMachineFunctionInfo(MachineFunction &MF); + virtual ~AMDILMachineFunctionInfo(); + unsigned int + getCalleeSavedFrameSize() const; + void + setCalleeSavedFrameSize(unsigned int bytes); + + unsigned int + getBytesToPopOnReturn() const; + void + setBytesToPopOnReturn (unsigned int bytes); + + NameDecorationStyle + getDecorationStyle() const; + void + setDecorationStyle(NameDecorationStyle style); + + int + getRAIndex() const; + void + setRAIndex(int Index); + + int + getTCReturnAddrDelta() const; + void + setTCReturnAddrDelta(int delta); + + unsigned int + getSRetReturnReg() const; + void + setSRetReturnReg(unsigned int Reg); + + void + setUsesLocal(); + bool + usesLocal() const; + void + setHasLocalArg(); + bool + hasLocalArg() const; + + void + setUsesRegion(); + bool + usesRegion() const; + void + setHasRegionArg(); + bool + hasRegionArg() const; + + bool + usesHWConstant(std::string name) const; + uint32_t + getLocal(uint32_t); + bool + isKernel() const; + AMDILKernel* + getKernel(); + + std::string + getName(); + + /// Get the size in bytes that are required to host all of + /// arguments based on the argument alignment rules in the AMDIL + /// Metadata spec. + uint32_t getArgSize(); + + /// Get the size in bytes that are required to host all of + /// arguments and stack memory in scratch. + uint32_t getScratchSize(); + + /// Get the size in bytes that is required to host all of + /// the arguments on the stack. + uint32_t getStackSize(); + + /// + /// @param val value to add the lookup table + /// @param Opcode opcode of the literal instruction + /// @brief adds the specified value of the type represented by the + /// Opcode + /// to the literal to integer and integer to literal mappings. + /// + /// Add a 32bit integer value to the literal table. + uint32_t addi32Literal(uint32_t val, int Opcode = AMDIL::LOADCONST_i32); + + /// Add a 32bit floating point value to the literal table. 
+ uint32_t addf32Literal(const ConstantFP *CFP); + + /// Add a 64bit integer value to the literal table. + uint32_t addi64Literal(uint64_t val); + + /// Add a 128 bit integer value to the literal table. + uint32_t addi128Literal(uint64_t val_lo, uint64_t val_hi); + + /// Add a 64bit floating point literal as a 64bit integer value. + uint32_t addf64Literal(const ConstantFP *CFP); + + /// Get the number of literals that have currently been allocated. + size_t getNumLiterals() const; + + /// Get the literal ID of an Integer literal of the given offset. + uint32_t getIntLits(uint32_t lit); + + /// Get the literal ID of a Long literal of the given offset. + uint32_t getLongLits(uint64_t lit); + + /// Get the literal ID of a Long literal of the given offset. + uint32_t getVecLits(uint64_t low64, uint64_t high64); + + /// Add some literals to the number of reserved literals. + void addReservedLiterals(uint32_t); + + // Functions that return iterators to the beginning and end + // of the various literal maps. + // Functions that return the beginning and end of the 32bit literal map + lit32_iterator begin_32() { return mIntLits.begin(); } + lit32_iterator end_32() { return mIntLits.end(); } + + // Functions that return the beginning and end of the 64bit literal map + lit64_iterator begin_64() { return mLongLits.begin(); } + lit64_iterator end_64() { return mLongLits.end(); } + + // Functions that return the beginning and end of the 2x64bit literal map + lit128_iterator begin_128() { return mVecLits.begin(); } + lit128_iterator end_128() { return mVecLits.end(); } + + // Add a sampler to the set of known samplers for the current kernel. + uint32_t addSampler(std::string name, uint32_t value); + + // Iterators that point to the beginning and end of the sampler map. + sampler_iterator sampler_begin() { return mSamplerMap.begin(); } + sampler_iterator sampler_end() { return mSamplerMap.end(); } + + + /// Set the flag for the memory ID to true for the current function. 
+ void setUsesMem(unsigned); + /// Retrieve the flag for the memory ID. + bool usesMem(unsigned); + + /// Add called functions to the set of all functions this function calls. + void addCalledFunc(uint32_t id) { mFuncs.insert(id); } + void eraseCalledFunc(uint32_t id) { mFuncs.erase(id); } + size_t func_size() { return mFuncs.size(); } + bool func_empty() { return mFuncs.empty(); } + func_iterator func_begin() { return mFuncs.begin(); } + func_iterator func_end() { return mFuncs.end(); } + + /// Add called intrinsics to the set of all intrinscis this function calls. + void addCalledIntr(uint32_t id) { mIntrs.insert(id); } + size_t intr_size() { return mIntrs.size(); } + bool intr_empty() { return mIntrs.empty(); } + intr_iterator intr_begin() { return mIntrs.begin(); } + intr_iterator intr_end() { return mIntrs.end(); } + + /// Add a 2D read_only image id. + void addROImage2D(uint32_t id) { mRO2D.insert(id); } + size_t read_image2d_size() { return mRO2D.size(); } + read_image2d_iterator read_image2d_begin() { return mRO2D.begin(); } + read_image2d_iterator read_image2d_end() { return mRO2D.end(); } + + /// Add a 3D read_only image id. + void addROImage3D(uint32_t id) { mRO3D.insert(id); } + size_t read_image3d_size() { return mRO3D.size(); } + read_image3d_iterator read_image3d_begin() { return mRO3D.begin(); } + read_image3d_iterator read_image3d_end() { return mRO3D.end(); } + + /// Add a 2D write_only image id. + void addWOImage2D(uint32_t id) { mWO2D.insert(id); } + size_t write_image2d_size() { return mWO2D.size(); } + write_image2d_iterator write_image2d_begin() { return mWO2D.begin(); } + write_image2d_iterator write_image2d_end() { return mWO2D.end(); } + + /// Add a 3D write_only image id. 
+ void addWOImage3D(uint32_t id) { mWO3D.insert(id); } + size_t write_image3d_size() { return mWO3D.size(); } + write_image3d_iterator write_image3d_begin() { return mWO3D.begin(); } + write_image3d_iterator write_image3d_end() { return mWO3D.end(); } + + /// Add a raw uav id. + void uav_insert(uint32_t id) { mRawUAV.insert(id); } + bool uav_count(uint32_t id) { return mRawUAV.count(id); } + size_t uav_size() { return mRawUAV.size(); } + uav_iterator uav_begin() { return mRawUAV.begin(); } + uav_iterator uav_end() { return mRawUAV.end(); } + + /// Add an arena uav id. + void arena_insert(uint32_t id) { mArenaUAV.insert(id); } + bool arena_count(uint32_t id) { return mArenaUAV.count(id); } + size_t arena_size() { return mArenaUAV.size(); } + uav_iterator arena_begin() { return mArenaUAV.begin(); } + uav_iterator arena_end() { return mArenaUAV.end(); } + + // Add an error to the output for the current function. + typedef enum { + RELEASE_ONLY, /// Only emit error message in release mode. + DEBUG_ONLY, /// Only emit error message in debug mode. + ALWAYS /// Always emit the error message. + } ErrorMsgEnum; + /// Add an error message to the set of all error messages. 
+ void addErrorMsg(const char* msg, ErrorMsgEnum val = ALWAYS); + bool errors_empty() { return mErrors.empty(); } + error_iterator errors_begin() { return mErrors.begin(); } + error_iterator errors_end() { return mErrors.end(); } + + /// Add a string to the printf map + uint32_t addPrintfString(std::string &name, unsigned offset); + /// Add a operand to the printf string + void addPrintfOperand(std::string &name, size_t idx, uint32_t size); + bool printf_empty() { return mPrintfMap.empty(); } + size_t printf_size() { return mPrintfMap.size(); } + printf_iterator printf_begin() { return mPrintfMap.begin(); } + printf_iterator printf_end() { return mPrintfMap.end(); } + + /// Add a string to the metadata set for a function/kernel wrapper + void addMetadata(const char *md, bool kernelOnly = false); + void addMetadata(std::string md, bool kernelOnly = false); + func_md_iterator func_md_begin() { return mMetadataFunc.begin(); } + func_md_iterator func_md_end() { return mMetadataFunc.end(); } + kernel_md_iterator kernel_md_begin() { return mMetadataKernel.begin(); } + kernel_md_iterator kernel_md_end() { return mMetadataKernel.end(); } + }; +} // llvm namespace +#endif // _AMDILMACHINEFUNCTIONINFO_H_ diff --git a/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp new file mode 100644 index 00000000000..b8e536361f0 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp @@ -0,0 +1,173 @@ +//===-- AMDILMachinePeephole.cpp - AMDIL Machine Peephole Pass -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + + +#define DEBUG_TYPE "machine_peephole" +#if !defined(NDEBUG) +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME (false) +#endif + +#include "AMDIL.h" +#include "AMDILSubtarget.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +namespace +{ + class AMDILMachinePeephole : public MachineFunctionPass + { + public: + static char ID; + AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + //virtual ~AMDILMachinePeephole(); + virtual const char* + getPassName() const; + virtual bool + runOnMachineFunction(MachineFunction &MF); + private: + void insertFence(MachineBasicBlock::iterator &MIB); + TargetMachine &TM; + bool mDebug; + }; // AMDILMachinePeephole + char AMDILMachinePeephole::ID = 0; +} // anonymous namespace + +namespace llvm +{ + FunctionPass* + createAMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + { + return new AMDILMachinePeephole(tm AMDIL_OPT_LEVEL_VAR); + } +} // llvm namespace + +AMDILMachinePeephole::AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + : MachineFunctionPass(ID), TM(tm) +{ + mDebug = DEBUGME; +} + +bool +AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF) +{ + bool Changed = false; + const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + for (MachineFunction::iterator MBB = MF.begin(), MBE = MF.end(); + MBB != MBE; ++MBB) { + MachineBasicBlock *mb = MBB; + for (MachineBasicBlock::iterator MIB = mb->begin(), MIE = mb->end(); + MIB != MIE; ++MIB) { + MachineInstr *mi = MIB; + const char * name; + name = TM.getInstrInfo()->getName(mi->getOpcode()); + switch (mi->getOpcode()) { + default: + if (isAtomicInst(TM.getInstrInfo(), mi)) { + // If we don't support the hardware accellerated 
address spaces, + // then the atomic needs to be transformed to the global atomic. + if (strstr(name, "_L_") + && STM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) { + BuildMI(*mb, MIB, mi->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::ADD_i32), AMDIL::R1011) + .addReg(mi->getOperand(1).getReg()) + .addReg(AMDIL::T2); + mi->getOperand(1).setReg(AMDIL::R1011); + mi->setDesc( + TM.getInstrInfo()->get( + (mi->getOpcode() - AMDIL::ATOM_L_ADD) + AMDIL::ATOM_G_ADD)); + } else if (strstr(name, "_R_") + && STM->device()->usesSoftware(AMDILDeviceInfo::RegionMem)) { + assert(!"Software region memory is not supported!"); + mi->setDesc( + TM.getInstrInfo()->get( + (mi->getOpcode() - AMDIL::ATOM_R_ADD) + AMDIL::ATOM_G_ADD)); + } + } else if ((isLoadInst(TM.getInstrInfo(), mi) || isStoreInst(TM.getInstrInfo(), mi)) && isVolatileInst(TM.getInstrInfo(), mi)) { + insertFence(MIB); + } + continue; + break; + case AMDIL::USHR_i16: + case AMDIL::USHR_v2i16: + case AMDIL::USHR_v4i16: + case AMDIL::USHRVEC_i16: + case AMDIL::USHRVEC_v2i16: + case AMDIL::USHRVEC_v4i16: + if (TM.getSubtarget<AMDILSubtarget>() + .device()->usesSoftware(AMDILDeviceInfo::ShortOps)) { + unsigned lReg = MF.getRegInfo() + .createVirtualRegister(&AMDIL::GPRI32RegClass); + unsigned Reg = MF.getRegInfo() + .createVirtualRegister(&AMDIL::GPRV4I32RegClass); + BuildMI(*mb, MIB, mi->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::LOADCONST_i32), + lReg).addImm(0xFFFF); + BuildMI(*mb, MIB, mi->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::BINARY_AND_v4i32), + Reg) + .addReg(mi->getOperand(1).getReg()) + .addReg(lReg); + mi->getOperand(1).setReg(Reg); + } + break; + case AMDIL::USHR_i8: + case AMDIL::USHR_v2i8: + case AMDIL::USHR_v4i8: + case AMDIL::USHRVEC_i8: + case AMDIL::USHRVEC_v2i8: + case AMDIL::USHRVEC_v4i8: + if (TM.getSubtarget<AMDILSubtarget>() + .device()->usesSoftware(AMDILDeviceInfo::ByteOps)) { + unsigned lReg = MF.getRegInfo() + .createVirtualRegister(&AMDIL::GPRI32RegClass); + unsigned Reg = 
MF.getRegInfo() + .createVirtualRegister(&AMDIL::GPRV4I32RegClass); + BuildMI(*mb, MIB, mi->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::LOADCONST_i32), + lReg).addImm(0xFF); + BuildMI(*mb, MIB, mi->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::BINARY_AND_v4i32), + Reg) + .addReg(mi->getOperand(1).getReg()) + .addReg(lReg); + mi->getOperand(1).setReg(Reg); + } + break; + } + } + } + return Changed; +} + +const char* +AMDILMachinePeephole::getPassName() const +{ + return "AMDIL Generic Machine Peephole Optimization Pass"; +} + +void +AMDILMachinePeephole::insertFence(MachineBasicBlock::iterator &MIB) +{ + MachineInstr *MI = MIB; + MachineInstr *fence = BuildMI(*(MI->getParent()->getParent()), + MI->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::FENCE)).addReg(1); + + MI->getParent()->insert(MIB, fence); + fence = BuildMI(*(MI->getParent()->getParent()), + MI->getDebugLoc(), + TM.getInstrInfo()->get(AMDIL::FENCE)).addReg(1); + MIB = MI->getParent()->insertAfter(MIB, fence); +} diff --git a/src/gallium/drivers/radeon/AMDILModuleInfo.cpp b/src/gallium/drivers/radeon/AMDILModuleInfo.cpp new file mode 100644 index 00000000000..82c3e4ccca9 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILModuleInfo.cpp @@ -0,0 +1,1266 @@ +//===-- AMDILModuleInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// +#include "AMDILModuleInfo.h" +#include "AMDILDevices.h" +#include "AMDILKernel.h" +#include "AMDILSubtarget.h" + +#include "AMDILAlgorithms.tpp" +#include "AMDILModuleInfo.h" +#include "AMDILDevices.h" +#include "AMDILKernel.h" +#include "AMDILSubtarget.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Support/FormattedStream.h" + +#include <cstdio> + +#define CB_BASE_OFFSET 2 +using namespace llvm; + +AMDILModuleInfo::AMDILModuleInfo(const MachineModuleInfo &MMI) +{ + mMMI = &MMI; + mOffset = 0; + mReservedBuffs = 0; + symTab = NULL; + mCurrentCPOffset = 0; + mPrintfOffset = 0; +} + +AMDILModuleInfo::~AMDILModuleInfo() { + for (StringMap<AMDILKernel*>::iterator kb = mKernels.begin(), ke = mKernels.end(); + kb != ke; ++kb) { + StringMapEntry<AMDILKernel*> cur = *kb; + AMDILKernel *ptr = cur.getValue(); + delete ptr; + } +} + +static const AMDILConstPtr *getConstPtr(const AMDILKernel *krnl, const std::string &arg) { + llvm::SmallVector<AMDILConstPtr, DEFAULT_VEC_SLOTS>::const_iterator begin, end; + for (begin = krnl->constPtr.begin(), end = krnl->constPtr.end(); + begin != end; ++begin) { + if (!strcmp(begin->name.data(),arg.c_str())) { + return &(*begin); + } + } + return NULL; +} +#if 0 +static bool structContainsSub32bitType(const StructType *ST) { + StructType::element_iterator eib, eie; + for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { + Type *ptr = *eib; + uint32_t size = (uint32_t)GET_SCALAR_SIZE(ptr); + if (!size) { + if (const StructType *ST = dyn_cast<StructType>(ptr)) { + if (structContainsSub32bitType(ST)) { + return true; + } + } + } else if (size < 32) { + return true; + } + } + return false; +} +#endif + +void AMDILModuleInfo::processModule(const Module *M, + const AMDILTargetMachine *mTM) +{ + 
Module::const_global_iterator GI; + Module::const_global_iterator GE; + mSTM = mTM->getSubtargetImpl(); + for (GI = M->global_begin(), GE = M->global_end(); GI != GE; ++GI) { + const GlobalValue *GV = GI; + llvm::StringRef GVName = GV->getName(); + const char *name = GVName.data(); + if (!strncmp(name, "sgv", 3)) { + mKernelArgs[GVName] = parseSGV(GV); + } else if (!strncmp(name, "fgv", 3)) { + // we can ignore this since we don't care about the filename + // string + } else if (!strncmp(name, "lvgv", 4)) { + mLocalArgs[GVName] = parseLVGV(GV); + } else if (!strncmp(name, "llvm.image.annotations", 22)) { + parseImageAnnotate(GV); + } else if (!strncmp(name, "llvm.global.annotations", 23)) { + parseGlobalAnnotate(GV); + } else if (!strncmp(name, "llvm.constpointer.annotations", 29)) { + parseConstantPtrAnnotate(GV); + } else if (!strncmp(name, "llvm.readonlypointer.annotations", 32)) { + // These are skipped as we handle them later in AMDILPointerManager.cpp + } else if (GV->getType()->getAddressSpace() == 3) { // *** Match cl_kernel.h local AS # + parseAutoArray(GV, false); + } else if (strstr(name, "clregion")) { + parseAutoArray(GV, true); + } else if (!GV->use_empty() + && mIgnoreStr.find(GVName) == mIgnoreStr.end()) { + parseConstantPtr(GV); + } + } + allocateGlobalCB(); + + safeForEach(M->begin(), M->end(), + std::bind1st( + std::mem_fun(&AMDILModuleInfo::checkConstPtrsUseHW), + this)); +} + +void AMDILModuleInfo::allocateGlobalCB(void) { + uint32_t maxCBSize = mSTM->device()->getMaxCBSize(); + uint32_t offset = 0; + uint32_t curCB = 0; + uint32_t swoffset = 0; + for (StringMap<AMDILConstPtr>::iterator cpb = mConstMems.begin(), + cpe = mConstMems.end(); cpb != cpe; ++cpb) { + bool constHW = mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem); + cpb->second.usesHardware = false; + if (constHW) { + // If we have a limit on the max CB Size, then we need to make sure that + // the constant sizes fall within the limits. 
+ if (cpb->second.size <= maxCBSize) { + if (offset + cpb->second.size > maxCBSize) { + offset = 0; + curCB++; + } + if (curCB < mSTM->device()->getMaxNumCBs()) { + cpb->second.cbNum = curCB + CB_BASE_OFFSET; + cpb->second.offset = offset; + offset += (cpb->second.size + 15) & (~15); + cpb->second.usesHardware = true; + continue; + } + } + } + cpb->second.cbNum = 0; + cpb->second.offset = swoffset; + swoffset += (cpb->second.size + 15) & (~15); + } + if (!mConstMems.empty()) { + mReservedBuffs = curCB + 1; + } +} + +bool AMDILModuleInfo::checkConstPtrsUseHW(llvm::Module::const_iterator *FCI) +{ + Function::const_arg_iterator AI, AE; + const Function *func = *FCI; + std::string name = func->getName(); + if (!strstr(name.c_str(), "__OpenCL") + || !strstr(name.c_str(), "_AMDILKernel")) { + return false; + } + AMDILKernel *krnl = mKernels[name]; + if (mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem)) { + for (AI = func->arg_begin(), AE = func->arg_end(); + AI != AE; ++AI) { + const Argument *Arg = &(*AI); + const PointerType *P = dyn_cast<PointerType>(Arg->getType()); + if (!P) { + continue; + } + if (P->getAddressSpace() != AMDILAS::CONSTANT_ADDRESS) { + continue; + } + const AMDILConstPtr *ptr = getConstPtr(krnl, Arg->getName()); + if (ptr) { + continue; + } + AMDILConstPtr constAttr; + constAttr.name = Arg->getName(); + constAttr.size = this->mSTM->device()->getMaxCBSize(); + constAttr.base = Arg; + constAttr.isArgument = true; + constAttr.isArray = false; + constAttr.offset = 0; + constAttr.usesHardware = + mSTM->device()->usesHardware(AMDILDeviceInfo::ConstantMem); + if (constAttr.usesHardware) { + constAttr.cbNum = krnl->constPtr.size() + 2; + } else { + constAttr.cbNum = 0; + } + krnl->constPtr.push_back(constAttr); + } + } + // Now lets make sure that only the N largest buffers + // get allocated in hardware if we have too many buffers + uint32_t numPtrs = krnl->constPtr.size(); + if (numPtrs > (this->mSTM->device()->getMaxNumCBs() - mReservedBuffs)) 
{ + // TODO: Change this routine so it sorts + // AMDILConstPtr instead of pulling the sizes out + // and then grab the N largest and disable the rest + llvm::SmallVector<uint32_t, 16> sizes; + for (uint32_t x = 0; x < numPtrs; ++x) { + sizes.push_back(krnl->constPtr[x].size); + } + std::sort(sizes.begin(), sizes.end()); + uint32_t numToDisable = numPtrs - (mSTM->device()->getMaxNumCBs() - + mReservedBuffs); + uint32_t safeSize = sizes[numToDisable-1]; + for (uint32_t x = 0; x < numPtrs && numToDisable; ++x) { + if (krnl->constPtr[x].size <= safeSize) { + krnl->constPtr[x].usesHardware = false; + --numToDisable; + } + } + } + // Renumber all of the valid CB's so that + // they are linear increase + uint32_t CBid = 2 + mReservedBuffs; + for (uint32_t x = 0; x < numPtrs; ++x) { + if (krnl->constPtr[x].usesHardware) { + krnl->constPtr[x].cbNum = CBid++; + } + } + for (StringMap<AMDILConstPtr>::iterator cpb = mConstMems.begin(), + cpe = mConstMems.end(); cpb != cpe; ++cpb) { + if (cpb->second.usesHardware) { + krnl->constPtr.push_back(cpb->second); + } + } + for (uint32_t x = 0; x < krnl->constPtr.size(); ++x) { + AMDILConstPtr &c = krnl->constPtr[x]; + uint32_t cbNum = c.cbNum - CB_BASE_OFFSET; + if (cbNum < HW_MAX_NUM_CB && c.cbNum >= CB_BASE_OFFSET) { + if ((c.size + c.offset) > krnl->constSizes[cbNum]) { + krnl->constSizes[cbNum] = + ((c.size + c.offset) + 15) & ~15; + } + } else { + krnl->constPtr[x].usesHardware = false; + } + } + return false; +} + +int32_t AMDILModuleInfo::getArrayOffset(const llvm::StringRef &a) const { + StringMap<AMDILArrayMem>::const_iterator iter = mArrayMems.find(a); + if (iter != mArrayMems.end()) { + return iter->second.offset; + } else { + return -1; + } +} + +int32_t AMDILModuleInfo::getConstOffset(const llvm::StringRef &a) const { + StringMap<AMDILConstPtr>::const_iterator iter = mConstMems.find(a); + if (iter != mConstMems.end()) { + return iter->second.offset; + } else { + return -1; + } +} + +bool 
AMDILModuleInfo::getConstHWBit(const llvm::StringRef &name) const { + StringMap<AMDILConstPtr>::const_iterator iter = mConstMems.find(name); + if (iter != mConstMems.end()) { + return iter->second.usesHardware; + } else { + return false; + } +} + +// As of right now we only care about the required group size +// so we can skip the variable encoding +AMDILKernelAttr AMDILModuleInfo::parseSGV(const GlobalValue *G) { + AMDILKernelAttr nArg; + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + memset(&nArg, 0, sizeof(nArg)); + for (int x = 0; x < 3; ++x) { + nArg.reqGroupSize[x] = mSTM->getDefaultSize(x); + nArg.reqRegionSize[x] = mSTM->getDefaultSize(x); + } + if (!GV || !GV->hasInitializer()) { + return nArg; + } + const Constant *CV = GV->getInitializer(); + const ConstantDataArray *CA = dyn_cast_or_null<ConstantDataArray>(CV); + if (!CA || !CA->isString()) { + return nArg; + } + std::string init = CA->getAsString(); + size_t pos = init.find("RWG"); + if (pos != llvm::StringRef::npos) { + pos += 3; + std::string LWS = init.substr(pos, init.length() - pos); + const char *lws = LWS.c_str(); + sscanf(lws, "%u,%u,%u", &(nArg.reqGroupSize[0]), + &(nArg.reqGroupSize[1]), + &(nArg.reqGroupSize[2])); + nArg.mHasRWG = true; + } + pos = init.find("RWR"); + if (pos != llvm::StringRef::npos) { + pos += 3; + std::string LWS = init.substr(pos, init.length() - pos); + const char *lws = LWS.c_str(); + sscanf(lws, "%u,%u,%u", &(nArg.reqRegionSize[0]), + &(nArg.reqRegionSize[1]), + &(nArg.reqRegionSize[2])); + nArg.mHasRWR = true; + } + return nArg; +} + +AMDILLocalArg AMDILModuleInfo::parseLVGV(const GlobalValue *G) { + AMDILLocalArg nArg; + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + nArg.name = ""; + if (!GV || !GV->hasInitializer()) { + return nArg; + } + const ConstantArray *CA = + dyn_cast_or_null<ConstantArray>(GV->getInitializer()); + if (!CA) { + return nArg; + } + for (size_t x = 0, y = CA->getNumOperands(); x < y; ++x) { + const Value *local = 
CA->getOperand(x); + const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(local); + if (!CE || !CE->getNumOperands()) { + continue; + } + nArg.name = (*(CE->op_begin()))->getName(); + if (mArrayMems.find(nArg.name) != mArrayMems.end()) { + nArg.local.push_back(&(mArrayMems[nArg.name])); + } + } + return nArg; +} + +void AMDILModuleInfo::parseConstantPtrAnnotate(const GlobalValue *G) { + const GlobalVariable *GV = dyn_cast_or_null<GlobalVariable>(G); + const ConstantArray *CA = + dyn_cast_or_null<ConstantArray>(GV->getInitializer()); + if (!CA) { + return; + } + uint32_t numOps = CA->getNumOperands(); + for (uint32_t x = 0; x < numOps; ++x) { + const Value *V = CA->getOperand(x); + const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V); + if (!CS) { + continue; + } + assert(CS->getNumOperands() == 2 && "There can only be 2" + " fields, a name and size"); + const ConstantExpr *nameField = dyn_cast<ConstantExpr>(CS->getOperand(0)); + const ConstantInt *sizeField = dyn_cast<ConstantInt>(CS->getOperand(1)); + assert(nameField && "There must be a constant name field"); + assert(sizeField && "There must be a constant size field"); + const GlobalVariable *nameGV = + dyn_cast<GlobalVariable>(nameField->getOperand(0)); + const ConstantDataArray *nameArray = + dyn_cast<ConstantDataArray>(nameGV->getInitializer()); + // Lets add this string to the set of strings we should ignore processing + mIgnoreStr.insert(nameGV->getName()); + if (mConstMems.find(nameGV->getName()) + != mConstMems.end()) { + // If we already processesd this string as a constant, lets remove it from + // the list of known constants. This way we don't process unneeded data + // and don't generate code/metadata for strings that are never used. 
+ mConstMems.erase(mConstMems.find(nameGV->getName())); + } else { + mIgnoreStr.insert(CS->getOperand(0)->getName()); + } + AMDILConstPtr constAttr; + constAttr.name = nameArray->getAsString(); + constAttr.size = (sizeField->getZExtValue() + 15) & ~15; + constAttr.base = CS; + constAttr.isArgument = true; + constAttr.isArray = false; + constAttr.cbNum = 0; + constAttr.offset = 0; + constAttr.usesHardware = (constAttr.size <= mSTM->device()->getMaxCBSize()); + // Now that we have all our constant information, + // lets update the AMDILKernel + llvm::StringRef AMDILKernelName = G->getName().data() + 30; + AMDILKernel *k; + if (mKernels.find(AMDILKernelName) != mKernels.end()) { + k = mKernels[AMDILKernelName]; + } else { + k = new AMDILKernel; + k->curSize = 0; + k->curRSize = 0; + k->curHWSize = 0; + k->curHWRSize = 0; + k->constSize = 0; + k->lvgv = NULL; + k->sgv = NULL; + memset(k->constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB); + } + constAttr.cbNum = k->constPtr.size() + 2; + k->constPtr.push_back(constAttr); + mKernels[AMDILKernelName] = k; + } +} + +void AMDILModuleInfo::parseImageAnnotate(const GlobalValue *G) { + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + const ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer()); + if (!CA) { + return; + } + if (isa<GlobalValue>(CA)) { + return; + } + uint32_t e = CA->getNumOperands(); + if (!e) { + return; + } + AMDILKernel *k; + llvm::StringRef name = G->getName().data() + 23; + if (mKernels.find(name) != mKernels.end()) { + k = mKernels[name]; + } else { + k = new AMDILKernel; + k->curSize = 0; + k->curRSize = 0; + k->curHWSize = 0; + k->curHWRSize = 0; + k->constSize = 0; + k->lvgv = NULL; + k->sgv = NULL; + memset(k->constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB); + } + for (uint32_t i = 0; i != e; ++i) { + const Value *V = CA->getOperand(i); + const Constant *C = dyn_cast<Constant>(V); + const ConstantStruct *CS = dyn_cast<ConstantStruct>(C); + if (CS && CS->getNumOperands() == 2) { + 
if (mConstMems.find(CS->getOperand(0)->getOperand(0)->getName()) != + mConstMems.end()) { + // If we already processesd this string as a constant, lets remove it + // from the list of known constants. This way we don't process unneeded + // data and don't generate code/metadata for strings that are never + // used. + mConstMems.erase( + mConstMems.find(CS->getOperand(0)->getOperand(0)->getName())); + } else { + mIgnoreStr.insert(CS->getOperand(0)->getOperand(0)->getName()); + } + const ConstantInt *CI = dyn_cast<ConstantInt>(CS->getOperand(1)); + uint32_t val = (uint32_t)CI->getZExtValue(); + if (val == 1) { + k->readOnly.insert(i); + } else if (val == 2) { + k->writeOnly.insert(i); + } else { + assert(!"Unknown image type value!"); + } + } + } + mKernels[name] = k; +} + +void AMDILModuleInfo::parseAutoArray(const GlobalValue *GV, bool isRegion) { + const GlobalVariable *G = dyn_cast<GlobalVariable>(GV); + Type *Ty = (G) ? G->getType() : NULL; + AMDILArrayMem tmp; + tmp.isHW = true; + tmp.offset = 0; + tmp.vecSize = getTypeSize(Ty, true); + tmp.isRegion = isRegion; + mArrayMems[GV->getName()] = tmp; +} + +void AMDILModuleInfo::parseConstantPtr(const GlobalValue *GV) { + const GlobalVariable *G = dyn_cast<GlobalVariable>(GV); + Type *Ty = (G) ? 
G->getType() : NULL; + AMDILConstPtr constAttr; + constAttr.name = G->getName(); + constAttr.size = getTypeSize(Ty, true); + constAttr.base = GV; + constAttr.isArgument = false; + constAttr.isArray = true; + constAttr.offset = 0; + constAttr.cbNum = 0; + constAttr.usesHardware = false; + mConstMems[GV->getName()] = constAttr; +} + +void AMDILModuleInfo::parseGlobalAnnotate(const GlobalValue *G) { + const GlobalVariable *GV = dyn_cast<GlobalVariable>(G); + if (!GV->hasInitializer()) { + return; + } + const Constant *CT = GV->getInitializer(); + if (!CT || isa<GlobalValue>(CT)) { + return; + } + const ConstantArray *CA = dyn_cast<ConstantArray>(CT); + if (!CA) { + return; + } + + unsigned int nKernels = CA->getNumOperands(); + for (unsigned int i = 0, e = nKernels; i != e; ++i) { + parseKernelInformation(CA->getOperand(i)); + } +} + +void AMDILModuleInfo::parseKernelInformation(const Value *V) { + if (isa<GlobalValue>(V)) { + return; + } + const ConstantStruct *CS = dyn_cast_or_null<ConstantStruct>(V); + if (!CS) { + return; + } + uint32_t N = CS->getNumOperands(); + if (N != 5) { + return; + } + AMDILKernel *tmp; + + // The first operand is always a pointer to the AMDILKernel. + const Constant *CV = dyn_cast<Constant>(CS->getOperand(0)); + llvm::StringRef AMDILKernelName = ""; + if (CV->getNumOperands()) { + AMDILKernelName = (*(CV->op_begin()))->getName(); + } + + // If we have images, then we have already created the AMDILKernel and we just need + // to get the AMDILKernel information. + if (mKernels.find(AMDILKernelName) != mKernels.end()) { + tmp = mKernels[AMDILKernelName]; + } else { + tmp = new AMDILKernel; + tmp->curSize = 0; + tmp->curRSize = 0; + tmp->curHWSize = 0; + tmp->curHWRSize = 0; + tmp->constSize = 0; + tmp->lvgv = NULL; + tmp->sgv = NULL; + memset(tmp->constSizes, 0, sizeof(uint32_t) * HW_MAX_NUM_CB); + } + + + // The second operand is SGV, there can only be one so we don't need to worry + // about parsing out multiple data points. 
+ CV = dyn_cast<Constant>(CS->getOperand(1)); + + llvm::StringRef sgvName; + if (CV->getNumOperands()) { + sgvName = (*(CV->op_begin()))->getName(); + } + + if (mKernelArgs.find(sgvName) != mKernelArgs.end()) { + tmp->sgv = &mKernelArgs[sgvName]; + } + // The third operand is FGV, which is skipped + // The fourth operand is LVGV + // There can be multiple local arrays, so we + // need to handle each one seperatly + CV = dyn_cast<Constant>(CS->getOperand(3)); + llvm::StringRef lvgvName = ""; + if (CV->getNumOperands()) { + lvgvName = (*(CV->op_begin()))->getName(); + } + if (mLocalArgs.find(lvgvName) != mLocalArgs.end()) { + AMDILLocalArg *ptr = &mLocalArgs[lvgvName]; + tmp->lvgv = ptr; + llvm::SmallVector<AMDILArrayMem *, DEFAULT_VEC_SLOTS>::iterator ib, ie; + for (ib = ptr->local.begin(), ie = ptr->local.end(); ib != ie; ++ib) { + if ((*ib)->isRegion) { + if ((*ib)->isHW) { + (*ib)->offset = tmp->curHWRSize; + tmp->curHWRSize += ((*ib)->vecSize + 15) & ~15; + } else { + (*ib)->offset = tmp->curRSize; + tmp->curRSize += ((*ib)->vecSize + 15) & ~15; + } + } else { + if ((*ib)->isHW) { + (*ib)->offset = tmp->curHWSize; + tmp->curHWSize += ((*ib)->vecSize + 15) & ~15; + } else { + (*ib)->offset = tmp->curSize; + tmp->curSize += ((*ib)->vecSize + 15) & ~15; + } + } + } + } + + // The fifth operand is NULL + mKernels[AMDILKernelName] = tmp; +} + +AMDILKernel * +AMDILModuleInfo::getKernel(const llvm::StringRef &name) { + StringMap<AMDILKernel*>::iterator iter = mKernels.find(name); + if (iter == mKernels.end()) { + return NULL; + } else { + return iter->second; + } +} + +bool AMDILModuleInfo::isKernel(const llvm::StringRef &name) const { + return (mKernels.find(name) != mKernels.end()); +} + +bool AMDILModuleInfo::isWriteOnlyImage(const llvm::StringRef &name, + uint32_t iID) const { + const StringMap<AMDILKernel*>::const_iterator kiter = mKernels.find(name); + if (kiter == mKernels.end()) { + return false; + } + return kiter->second->writeOnly.count(iID); +} +#if 0 
+uint32_t +AMDILModuleInfo::getNumWriteImages(const llvm::StringRef &name) const { + char *env = NULL; + env = getenv("GPU_DISABLE_RAW_UAV"); + if (env && env[0] == '1') { + return 8; + } + const StringMap<AMDILKernel*>::const_iterator kiter = mKernels.find(name); + if (kiter == mKernels.end()) { + return 0; + } else { + return kiter->second->writeOnly.size(); + } +} +#endif +bool AMDILModuleInfo::isReadOnlyImage(const llvm::StringRef &name, + uint32_t iID) const { + const StringMap<AMDILKernel*>::const_iterator kiter = mKernels.find(name); + if (kiter == mKernels.end()) { + return false; + } + return kiter->second->readOnly.count(iID); +} +#if 0 +bool AMDILModuleInfo::hasRWG(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + AMDILKernelAttr *ptr = iter->second->sgv; + if (ptr) { + return ptr->mHasRWG; + } + } + return false; +} + +bool AMDILModuleInfo::hasRWR(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + AMDILKernelAttr *ptr = iter->second->sgv; + if (ptr) { + return ptr->mHasRWR; + } + } + return false; +} + +uint32_t +AMDILModuleInfo::getMaxGroupSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + AMDILKernelAttr *sgv = iter->second->sgv; + if (sgv) { + return sgv->reqGroupSize[0] * sgv->reqGroupSize[1] * sgv->reqGroupSize[2]; + } + } + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); +} + +uint32_t +AMDILModuleInfo::getMaxRegionSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + AMDILKernelAttr *sgv = iter->second->sgv; + if (sgv) { + return sgv->reqRegionSize[0] * + sgv->reqRegionSize[1] * + sgv->reqRegionSize[2]; + } + } + return mSTM->getDefaultSize(0) * 
+ mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); +} +uint32_t AMDILModuleInfo::getRegionSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second->curRSize; + } else { + return 0; + } +} + +uint32_t AMDILModuleInfo::getLocalSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second->curSize; + } else { + return 0; + } +} + +uint32_t AMDILModuleInfo::getConstSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second->constSize; + } else { + return 0; + } +} + +uint32_t +AMDILModuleInfo::getHWRegionSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second->curHWRSize; + } else { + return 0; + } +} + +uint32_t AMDILModuleInfo::getHWLocalSize(const llvm::StringRef &name) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end()) { + return iter->second->curHWSize; + } else { + return 0; + } +} +#endif + +int32_t AMDILModuleInfo::getArgID(const Argument *arg) { + DenseMap<const Argument *, int32_t>::iterator argiter = mArgIDMap.find(arg); + if (argiter != mArgIDMap.end()) { + return argiter->second; + } else { + return -1; + } +} + + +uint32_t +AMDILModuleInfo::getRegion(const llvm::StringRef &name, uint32_t dim) const { + StringMap<AMDILKernel*>::const_iterator iter = mKernels.find(name); + if (iter != mKernels.end() && iter->second->sgv) { + AMDILKernelAttr *sgv = iter->second->sgv; + switch (dim) { + default: break; + case 0: + case 1: + case 2: + return sgv->reqRegionSize[dim]; + break; + case 3: + return sgv->reqRegionSize[0] * + sgv->reqRegionSize[1] * + sgv->reqRegionSize[2]; + }; + } + 
switch (dim) { + default: + return 1; + case 3: + return mSTM->getDefaultSize(0) * + mSTM->getDefaultSize(1) * + mSTM->getDefaultSize(2); + case 2: + case 1: + case 0: + return mSTM->getDefaultSize(dim); + break; + }; + return 1; +} + +StringMap<AMDILConstPtr>::iterator AMDILModuleInfo::consts_begin() { + return mConstMems.begin(); +} + + +StringMap<AMDILConstPtr>::iterator AMDILModuleInfo::consts_end() { + return mConstMems.end(); +} + +bool AMDILModuleInfo::byteStoreExists(StringRef S) const { + return mByteStore.find(S) != mByteStore.end(); +} + +uint32_t AMDILModuleInfo::getConstPtrSize(const AMDILKernel *krnl, + const llvm::StringRef &arg) +{ + const AMDILConstPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->size; + } else { + return 0; + } +} + +uint32_t AMDILModuleInfo::getConstPtrOff(const AMDILKernel *krnl, + const llvm::StringRef &arg) +{ + const AMDILConstPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->offset; + } else { + return 0; + } +} + +uint32_t AMDILModuleInfo::getConstPtrCB(const AMDILKernel *krnl, + const llvm::StringRef &arg) +{ + const AMDILConstPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->cbNum; + } else { + return 0; + } +} + +void AMDILModuleInfo::calculateCPOffsets(const MachineFunction *MF, + AMDILKernel *krnl) +{ + const MachineConstantPool *MCP = MF->getConstantPool(); + if (!MCP) { + return; + } + const std::vector<MachineConstantPoolEntry> consts = MCP->getConstants(); + size_t numConsts = consts.size(); + for (size_t x = 0; x < numConsts; ++x) { + krnl->CPOffsets.push_back( + std::make_pair<uint32_t, const Constant*>( + mCurrentCPOffset, consts[x].Val.ConstVal)); + size_t curSize = getTypeSize(consts[x].Val.ConstVal->getType(), true); + // Align the size to the vector boundary + curSize = (curSize + 15) & (~15); + mCurrentCPOffset += curSize; + } +} + +bool AMDILModuleInfo::isConstPtrArray(const AMDILKernel *krnl, + const llvm::StringRef &arg) { 
+ const AMDILConstPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->isArray; + } else { + return false; + } +} + +bool AMDILModuleInfo::isConstPtrArgument(const AMDILKernel *krnl, + const llvm::StringRef &arg) +{ + const AMDILConstPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->isArgument; + } else { + return false; + } +} + +const Value *AMDILModuleInfo::getConstPtrValue(const AMDILKernel *krnl, + const llvm::StringRef &arg) { + const AMDILConstPtr *curConst = getConstPtr(krnl, arg); + if (curConst) { + return curConst->base; + } else { + return NULL; + } +} + +static void +dumpZeroElements(StructType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(IntegerType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(ArrayType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(VectorType * const T, llvm::raw_ostream &O, bool asBytes); +static void +dumpZeroElements(Type * const T, llvm::raw_ostream &O, bool asBytes); + +void dumpZeroElements(Type * const T, llvm::raw_ostream &O, bool asBytes) { + if (!T) { + return; + } + switch(T->getTypeID()) { + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + assert(0 && "These types are not supported by this backend"); + default: + case Type::DoubleTyID: + if (asBytes) { + O << ":0:0:0:0:0:0:0:0"; + } else { + O << ":0"; + } + break; + case Type::FloatTyID: + case Type::PointerTyID: + case Type::FunctionTyID: + if (asBytes) { + O << ":0:0:0:0"; + } else { + O << ":0"; + } + case Type::IntegerTyID: + dumpZeroElements(dyn_cast<IntegerType>(T), O, asBytes); + break; + case Type::StructTyID: + { + const StructType *ST = cast<StructType>(T); + if (!ST->isOpaque()) { + dumpZeroElements(dyn_cast<StructType>(T), O, asBytes); + } else { // A pre-LLVM 3.0 opaque type + if (asBytes) { + O << ":0:0:0:0"; + } else { + O << ":0"; + } + } + } + break; + 
case Type::ArrayTyID: + dumpZeroElements(dyn_cast<ArrayType>(T), O, asBytes); + break; + case Type::VectorTyID: + dumpZeroElements(dyn_cast<VectorType>(T), O, asBytes); + break; + }; +} + +void +dumpZeroElements(StructType * const ST, llvm::raw_ostream &O, bool asBytes) { + if (!ST) { + return; + } + Type *curType; + StructType::element_iterator eib = ST->element_begin(); + StructType::element_iterator eie = ST->element_end(); + for (;eib != eie; ++eib) { + curType = *eib; + dumpZeroElements(curType, O, asBytes); + } +} + +void +dumpZeroElements(IntegerType * const IT, llvm::raw_ostream &O, bool asBytes) { + if (asBytes) { + unsigned byteWidth = (IT->getBitWidth() >> 3); + for (unsigned x = 0; x < byteWidth; ++x) { + O << ":0"; + } + } +} + +void +dumpZeroElements(ArrayType * const AT, llvm::raw_ostream &O, bool asBytes) { + size_t size = AT->getNumElements(); + for (size_t x = 0; x < size; ++x) { + dumpZeroElements(AT->getElementType(), O, asBytes); + } +} + +void +dumpZeroElements(VectorType * const VT, llvm::raw_ostream &O, bool asBytes) { + size_t size = VT->getNumElements(); + for (size_t x = 0; x < size; ++x) { + dumpZeroElements(VT->getElementType(), O, asBytes); + } +} + +void AMDILModuleInfo::printConstantValue(const Constant *CAval, + llvm::raw_ostream &O, bool asBytes) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CAval)) { + bool isDouble = &CFP->getValueAPF().getSemantics()==&APFloat::IEEEdouble; + if (isDouble) { + double val = CFP->getValueAPF().convertToDouble(); + union dtol_union { + double d; + uint64_t l; + char c[8]; + } conv; + conv.d = val; + if (!asBytes) { + O << ":"; + O.write_hex(conv.l); + } else { + for (int i = 0; i < 8; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + } else { + float val = CFP->getValueAPF().convertToFloat(); + union ftoi_union { + float f; + uint32_t u; + char c[4]; + } conv; + conv.f = val; + if (!asBytes) { + O << ":"; + O.write_hex(conv.u); + } else { + for (int i = 0; i < 4; ++i) { 
+ O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + } + } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CAval)) { + uint64_t zVal = CI->getValue().getZExtValue(); + if (!asBytes) { + O << ":"; + O.write_hex(zVal); + } else { + switch (CI->getBitWidth()) { + default: + { + union ltob_union { + uint64_t l; + char c[8]; + } conv; + conv.l = zVal; + for (int i = 0; i < 8; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + break; + case 8: + O << ":"; + O.write_hex(zVal & 0xFF); + break; + case 16: + { + union stob_union { + uint16_t s; + char c[2]; + } conv; + conv.s = (uint16_t)zVal; + O << ":"; + O.write_hex((unsigned)conv.c[0] & 0xFF); + O << ":"; + O.write_hex((unsigned)conv.c[1] & 0xFF); + } + break; + case 32: + { + union itob_union { + uint32_t i; + char c[4]; + } conv; + conv.i = (uint32_t)zVal; + for (int i = 0; i < 4; ++i) { + O << ":"; + O.write_hex((unsigned)conv.c[i] & 0xFF); + } + } + break; + } + } + } else if (const ConstantVector *CV = dyn_cast<ConstantVector>(CAval)) { + int y = CV->getNumOperands()-1; + int x = 0; + for (; x < y; ++x) { + printConstantValue(CV->getOperand(x), O, asBytes); + } + printConstantValue(CV->getOperand(x), O, asBytes); + } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CAval)) { + int y = CS->getNumOperands(); + int x = 0; + for (; x < y; ++x) { + printConstantValue(CS->getOperand(x), O, asBytes); + } + } else if (const ConstantAggregateZero *CAZ + = dyn_cast<ConstantAggregateZero>(CAval)) { + int y = CAZ->getNumOperands(); + if (y > 0) { + int x = 0; + for (; x < y; ++x) { + printConstantValue((llvm::Constant *)CAZ->getOperand(x), + O, asBytes); + } + } else { + if (asBytes) { + dumpZeroElements(CAval->getType(), O, asBytes); + } else { + int y = getNumElements(CAval->getType())-1; + for (int x = 0; x < y; ++x) { + O << ":0"; + } + O << ":0"; + } + } + } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CAval)) { + int y = CA->getNumOperands(); + int x = 0; + 
for (; x < y; ++x) { + printConstantValue(CA->getOperand(x), O, asBytes); + } + } else if (dyn_cast<ConstantPointerNull>(CAval)) { + O << ":0"; + //assert(0 && "Hit condition which was not expected"); + } else if (dyn_cast<ConstantExpr>(CAval)) { + O << ":0"; + //assert(0 && "Hit condition which was not expected"); + } else if (dyn_cast<UndefValue>(CAval)) { + O << ":0"; + //assert(0 && "Hit condition which was not expected"); + } else { + assert(0 && "Hit condition which was not expected"); + } +} +#if 0 +static bool isStruct(Type * const T) +{ + if (!T) { + return false; + } + switch (T->getTypeID()) { + default: + return false; + case Type::PointerTyID: + return isStruct(T->getContainedType(0)); + case Type::StructTyID: + return true; + case Type::ArrayTyID: + case Type::VectorTyID: + return isStruct(dyn_cast<SequentialType>(T)->getElementType()); + }; + +} + +void AMDILModuleInfo::dumpDataToCB(llvm::raw_ostream &O, AMDILKernelManager *km, + uint32_t id) { + uint32_t size = 0; + for (StringMap<AMDILConstPtr>::iterator cmb = consts_begin(), + cme = consts_end(); cmb != cme; ++cmb) { + if (id == cmb->second.cbNum) { + size += (cmb->second.size + 15) & (~15); + } + } + if (id == 0) { + O << ";#DATASTART:" << (size + mCurrentCPOffset) << "\n"; + if (mCurrentCPOffset) { + for (StringMap<AMDILKernel*>::iterator kcpb = mKernels.begin(), + kcpe = mKernels.end(); kcpb != kcpe; ++kcpb) { + const AMDILKernel *k = kcpb->second; + size_t numConsts = k->CPOffsets.size(); + for (size_t x = 0; x < numConsts; ++x) { + size_t offset = k->CPOffsets[x].first; + const Constant *C = k->CPOffsets[x].second; + Type *Ty = C->getType(); + size_t size = (isStruct(Ty) ? 
getTypeSize(Ty, true) + : getNumElements(Ty)); + O << ";#" << km->getTypeName(Ty, symTab) << ":"; + O << offset << ":" << size ; + printConstantValue(C, O, isStruct(Ty)); + O << "\n"; + } + } + } + } else { + O << ";#DATASTART:" << id << ":" << size << "\n"; + } + + for (StringMap<AMDILConstPtr>::iterator cmb = consts_begin(), cme = consts_end(); + cmb != cme; ++cmb) { + if (cmb->second.cbNum != id) { + continue; + } + const GlobalVariable *G = dyn_cast<GlobalVariable>(cmb->second.base); + Type *Ty = (G) ? G->getType() : NULL; + size_t offset = cmb->second.offset; + const Constant *C = G->getInitializer(); + size_t size = (isStruct(Ty) + ? getTypeSize(Ty, true) + : getNumElements(Ty)); + O << ";#" << km->getTypeName(Ty, symTab) << ":"; + if (!id) { + O << (offset + mCurrentCPOffset) << ":" << size; + } else { + O << offset << ":" << size; + } + if (C) { + printConstantValue(C, O, isStruct(Ty)); + } else { + assert(0 && "Cannot have a constant pointer" + " without an initializer!"); + } + O <<"\n"; + } + if (id == 0) { + O << ";#DATAEND\n"; + } else { + O << ";#DATAEND:" << id << "\n"; + } +} + +void +AMDILModuleInfo::dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km) { + if (mConstMems.empty() && !mCurrentCPOffset) { + return; + } else { + llvm::DenseSet<uint32_t> const_set; + for (StringMap<AMDILConstPtr>::iterator cmb = consts_begin(), cme = consts_end(); + cmb != cme; ++cmb) { + const_set.insert(cmb->second.cbNum); + } + if (mCurrentCPOffset) { + const_set.insert(0); + } + for (llvm::DenseSet<uint32_t>::iterator setb = const_set.begin(), + sete = const_set.end(); setb != sete; ++setb) { + dumpDataToCB(O, km, *setb); + } + } +} +#endif +/// Create a function ID if it is not known or return the known +/// function ID. 
+uint32_t AMDILModuleInfo::getOrCreateFunctionID(const GlobalValue* func) { + if (func->getName().size()) { + return getOrCreateFunctionID(func->getName()); + } + uint32_t id; + if (mFuncPtrNames.find(func) == mFuncPtrNames.end()) { + id = mFuncPtrNames.size() + RESERVED_FUNCS + mFuncNames.size(); + mFuncPtrNames[func] = id; + } else { + id = mFuncPtrNames[func]; + } + return id; +} +uint32_t AMDILModuleInfo::getOrCreateFunctionID(const std::string &func) { + uint32_t id; + if (mFuncNames.find(func) == mFuncNames.end()) { + id = mFuncNames.size() + RESERVED_FUNCS + mFuncPtrNames.size(); + mFuncNames[func] = id; + } else { + id = mFuncNames[func]; + } + return id; +} diff --git a/src/gallium/drivers/radeon/AMDILModuleInfo.h b/src/gallium/drivers/radeon/AMDILModuleInfo.h new file mode 100644 index 00000000000..5111b87c338 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILModuleInfo.h @@ -0,0 +1,159 @@ +//===--------------- AMDILModuleInfo.h -------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This is an MMI implementation for AMDIL targets. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _AMDIL_MACHINE_MODULE_INFO_H_ +#define _AMDIL_MACHINE_MODULE_INFO_H_ +#include "AMDIL.h" +#include "AMDILKernel.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/Module.h" +#include "llvm/Support/raw_ostream.h" + +#include <set> +#include <string> + +namespace llvm { + class AMDILKernel; + class Argument; + class TypeSymbolTable; + class GlobalValue; + class MachineFunction; + class GlobalValue; + + class AMDILModuleInfo : public MachineModuleInfoImpl { + protected: + const MachineModuleInfo *mMMI; + public: + AMDILModuleInfo(const MachineModuleInfo &); + virtual ~AMDILModuleInfo(); + + void processModule(const Module *MF, const AMDILTargetMachine* mTM); + + /// Process the given module and parse out the global variable metadata passed + /// down from the frontend-compiler + + /// Returns true if the image ID corresponds to a read only image. + bool isReadOnlyImage(const llvm::StringRef &name, uint32_t iID) const; + + /// Returns true if the image ID corresponds to a write only image. + bool isWriteOnlyImage(const llvm::StringRef &name, uint32_t iID) const; + + /// Gets the group size of the kernel for the given dimension. + uint32_t getRegion(const llvm::StringRef &name, uint32_t dim) const; + + /// Get the offset of the array for the kernel. + int32_t getArrayOffset(const llvm::StringRef &name) const; + + /// Get the offset of the const memory for the kernel. + int32_t getConstOffset(const llvm::StringRef &name) const; + + /// Get the boolean value if this particular constant uses HW or not. + bool getConstHWBit(const llvm::StringRef &name) const; + + /// Get a reference to the kernel metadata information for the given function + /// name. 
+ AMDILKernel *getKernel(const llvm::StringRef &name); + bool isKernel(const llvm::StringRef &name) const; + + /// Dump the data section to the output stream for the given kernel. + //void dumpDataSection(llvm::raw_ostream &O, AMDILKernelManager *km); + + /// Iterate through the constants that are global to the compilation unit. + StringMap<AMDILConstPtr>::iterator consts_begin(); + StringMap<AMDILConstPtr>::iterator consts_end(); + + /// Query if the kernel has a byte store. + bool byteStoreExists(llvm::StringRef S) const; + + /// Query if the constant pointer is an argument. + bool isConstPtrArgument(const AMDILKernel *krnl, const llvm::StringRef &arg); + + /// Query if the constant pointer is an array that is globally scoped. + bool isConstPtrArray(const AMDILKernel *krnl, const llvm::StringRef &arg); + + /// Query the size of the constant pointer. + uint32_t getConstPtrSize(const AMDILKernel *krnl, const llvm::StringRef &arg); + + /// Query the offset of the constant pointer. + uint32_t getConstPtrOff(const AMDILKernel *krnl, const llvm::StringRef &arg); + + /// Query the constant buffer number for a constant pointer. + uint32_t getConstPtrCB(const AMDILKernel *krnl, const llvm::StringRef &arg); + + /// Query the Value* that the constant pointer originates from. + const Value *getConstPtrValue(const AMDILKernel *krnl, const llvm::StringRef &arg); + + /// Get the ID of the argument. + int32_t getArgID(const Argument *arg); + + /// Get the unique function ID for the specific function name and create a new + /// unique ID if it is not found. + uint32_t getOrCreateFunctionID(const GlobalValue* func); + uint32_t getOrCreateFunctionID(const std::string& func); + + /// Calculate the offsets of the constant pool for the given kernel and + /// machine function. 
+ void calculateCPOffsets(const MachineFunction *MF, AMDILKernel *krnl); + + void add_printf_offset(uint32_t offset) { mPrintfOffset += offset; } + uint32_t get_printf_offset() { return mPrintfOffset; } + + private: + /// Various functions that parse global value information and store them in + /// the global manager. This approach is used instead of dynamic parsing as it + /// might require more space, but should allow caching of data that gets + /// requested multiple times. + AMDILKernelAttr parseSGV(const GlobalValue *GV); + AMDILLocalArg parseLVGV(const GlobalValue *GV); + void parseGlobalAnnotate(const GlobalValue *G); + void parseImageAnnotate(const GlobalValue *G); + void parseConstantPtrAnnotate(const GlobalValue *G); + void printConstantValue(const Constant *CAval, + llvm::raw_ostream& O, + bool asByte); + void parseKernelInformation(const Value *V); + void parseAutoArray(const GlobalValue *G, bool isRegion); + void parseConstantPtr(const GlobalValue *G); + void allocateGlobalCB(); + bool checkConstPtrsUseHW(Module::const_iterator *F); + + llvm::StringMap<AMDILKernel*> mKernels; + llvm::StringMap<AMDILKernelAttr> mKernelArgs; + llvm::StringMap<AMDILArrayMem> mArrayMems; + llvm::StringMap<AMDILConstPtr> mConstMems; + llvm::StringMap<AMDILLocalArg> mLocalArgs; + llvm::StringMap<uint32_t> mFuncNames; + llvm::DenseMap<const GlobalValue*, uint32_t> mFuncPtrNames; + llvm::DenseMap<uint32_t, llvm::StringRef> mImageNameMap; + std::set<llvm::StringRef> mByteStore; + std::set<llvm::StringRef> mIgnoreStr; + llvm::DenseMap<const Argument *, int32_t> mArgIDMap; + const TypeSymbolTable *symTab; + const AMDILSubtarget *mSTM; + size_t mOffset; + uint32_t mReservedBuffs; + uint32_t mCurrentCPOffset; + uint32_t mPrintfOffset; + }; + + + +} // end namespace llvm + +#endif // _AMDIL_COFF_MACHINE_MODULE_INFO_H_ + diff --git a/src/gallium/drivers/radeon/AMDILMultiClass.td b/src/gallium/drivers/radeon/AMDILMultiClass.td new file mode 100644 index 00000000000..92691db52fd --- 
/dev/null +++ b/src/gallium/drivers/radeon/AMDILMultiClass.td @@ -0,0 +1,1440 @@
//===-- AMDILMultiClass.td - AMDIL instruction multiclasses -------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

// Multiclass that handles branch instructions.
// One pseudo branch is defined per scalar register class.
// NOTE(review): the _i8/_i16 variants carry the "i32" text and the _i64
// variant carries the "f64" text in their asm strings; the strings are kept
// exactly as they were - confirm whether that labelling is intended.
multiclass BranchConditional<SDNode Op> {
  def _i8 : ILFormat<IL_OP_IFC, (outs),
      (ins brtarget:$target, GPRI8:$src0),
      "; i32 Pseudo branch instruction",
      [(Op bb:$target, GPRI8:$src0)]>;
  def _i16 : ILFormat<IL_OP_IFC, (outs),
      (ins brtarget:$target, GPRI16:$src0),
      "; i32 Pseudo branch instruction",
      [(Op bb:$target, GPRI16:$src0)]>;
  def _i32 : ILFormat<IL_OP_IFC, (outs),
      (ins brtarget:$target, GPRI32:$src0),
      "; i32 Pseudo branch instruction",
      [(Op bb:$target, GPRI32:$src0)]>;
  def _f32 : ILFormat<IL_OP_IFC, (outs),
      (ins brtarget:$target, GPRF32:$src0),
      "; f32 Pseudo branch instruction",
      [(Op bb:$target, GPRF32:$src0)]>;
  def _i64 : ILFormat<IL_OP_IFC, (outs),
      (ins brtarget:$target, GPRI64:$src0),
      "; f64 Pseudo branch instruction",
      [(Op bb:$target, (i64 GPRI64:$src0))]>;
  def _f64 : ILFormat<IL_OP_IFC, (outs),
      (ins brtarget:$target, GPRF64:$src0),
      "; f64 Pseudo branch instruction",
      [(Op bb:$target, (f64 GPRF64:$src0))]>;
}

// Multiclass that handles compare instructions.
// When a definition is added here, a corresponding definition
// needs to be added at:
// AMDILISelLowering.cpp@EmitInstrWithCustomInserter
multiclass Compare<string asm> {
  // Scalar variants.
  def _i8 : ILFormat<IL_OP_CMP, (outs GPRI8:$dst),
      (ins i32imm:$cc, GPRI8:$src0, GPRI8:$src1),
      !strconcat("; i8 ", asm),
      [(set GPRI8:$dst, (IL_cmp imm:$cc, GPRI8:$src0, GPRI8:$src1))]>;
  def _i16 : ILFormat<IL_OP_CMP, (outs GPRI16:$dst),
      (ins i32imm:$cc, GPRI16:$src0, GPRI16:$src1),
      !strconcat("; i16 ", asm),
      [(set GPRI16:$dst, (IL_cmp imm:$cc, GPRI16:$src0, GPRI16:$src1))]>;
  def _i32 : ILFormat<IL_OP_CMP, (outs GPRI32:$dst),
      (ins i32imm:$cc, GPRI32:$src0, GPRI32:$src1),
      !strconcat("; i32 ", asm),
      [(set GPRI32:$dst, (IL_cmp imm:$cc, GPRI32:$src0, GPRI32:$src1))]>;
  def _i64 : ILFormat<IL_OP_CMP, (outs GPRI64:$dst),
      (ins i32imm:$cc, GPRI64:$src0, GPRI64:$src1),
      !strconcat("; i64 ", asm),
      [(set GPRI64:$dst, (IL_cmp imm:$cc, GPRI64:$src0, GPRI64:$src1))]>;
  def _f32 : ILFormat<IL_OP_CMP, (outs GPRF32:$dst),
      (ins i32imm:$cc, GPRF32:$src0, GPRF32:$src1),
      !strconcat("; f32 ", asm),
      [(set GPRF32:$dst, (IL_cmp imm:$cc, GPRF32:$src0, GPRF32:$src1))]>;
  def _f64 : ILFormat<IL_OP_CMP, (outs GPRF64:$dst),
      (ins i32imm:$cc, GPRF64:$src0, GPRF64:$src1),
      !strconcat("; f64 ", asm),
      [(set GPRF64:$dst, (IL_cmp imm:$cc, GPRF64:$src0, GPRF64:$src1))]>;

  // Two-element vector variants.
  def _v2i8 : ILFormat<IL_OP_CMP, (outs GPRV2I8:$dst),
      (ins i32imm:$cc, GPRV2I8:$src0, GPRV2I8:$src1),
      !strconcat("; i8 ", asm),
      [(set GPRV2I8:$dst, (IL_cmp imm:$cc, GPRV2I8:$src0, GPRV2I8:$src1))]>;
  def _v2i16 : ILFormat<IL_OP_CMP, (outs GPRV2I16:$dst),
      (ins i32imm:$cc, GPRV2I16:$src0, GPRV2I16:$src1),
      !strconcat("; i16 ", asm),
      [(set GPRV2I16:$dst, (IL_cmp imm:$cc, GPRV2I16:$src0, GPRV2I16:$src1))]>;
  def _v2i32 : ILFormat<IL_OP_CMP, (outs GPRV2I32:$dst),
      (ins i32imm:$cc, GPRV2I32:$src0, GPRV2I32:$src1),
      !strconcat("; i32 ", asm),
      [(set GPRV2I32:$dst, (IL_cmp imm:$cc, GPRV2I32:$src0, GPRV2I32:$src1))]>;
  def _v2i64 : ILFormat<IL_OP_CMP, (outs GPRV2I64:$dst),
      (ins i32imm:$cc, GPRV2I64:$src0, GPRV2I64:$src1),
      !strconcat("; i64 ", asm),
      [(set GPRV2I64:$dst, (IL_cmp imm:$cc, GPRV2I64:$src0, GPRV2I64:$src1))]>;
  def _v2f32 : ILFormat<IL_OP_CMP, (outs GPRV2F32:$dst),
      (ins i32imm:$cc, GPRV2F32:$src0, GPRV2F32:$src1),
      !strconcat("; f32 ", asm),
      [(set GPRV2F32:$dst, (IL_cmp imm:$cc, GPRV2F32:$src0, GPRV2F32:$src1))]>;
  def _v2f64 : ILFormat<IL_OP_CMP, (outs GPRV2F64:$dst),
      (ins i32imm:$cc, GPRV2F64:$src0, GPRV2F64:$src1),
      !strconcat("; f64 ", asm),
      [(set GPRV2F64:$dst, (IL_cmp imm:$cc, GPRV2F64:$src0, GPRV2F64:$src1))]>;

  // Four-element vector variants.
  def _v4i8 : ILFormat<IL_OP_CMP, (outs GPRV4I8:$dst),
      (ins i32imm:$cc, GPRV4I8:$src0, GPRV4I8:$src1),
      !strconcat("; i8 ", asm),
      [(set GPRV4I8:$dst, (IL_cmp imm:$cc, GPRV4I8:$src0, GPRV4I8:$src1))]>;
  def _v4i16 : ILFormat<IL_OP_CMP, (outs GPRV4I16:$dst),
      (ins i32imm:$cc, GPRV4I16:$src0, GPRV4I16:$src1),
      !strconcat("; i16 ", asm),
      [(set GPRV4I16:$dst, (IL_cmp imm:$cc, GPRV4I16:$src0, GPRV4I16:$src1))]>;
  def _v4i32 : ILFormat<IL_OP_CMP, (outs GPRV4I32:$dst),
      (ins i32imm:$cc, GPRV4I32:$src0, GPRV4I32:$src1),
      !strconcat("; i32 ", asm),
      [(set GPRV4I32:$dst, (IL_cmp imm:$cc, GPRV4I32:$src0, GPRV4I32:$src1))]>;
  def _v4f32 : ILFormat<IL_OP_CMP, (outs GPRV4F32:$dst),
      (ins i32imm:$cc, GPRV4F32:$src0, GPRV4F32:$src1),
      !strconcat("; f32 ", asm),
      [(set GPRV4F32:$dst, (IL_cmp imm:$cc, GPRV4F32:$src0, GPRV4F32:$src1))]>;
}

// Multiclass that handles constant values.
// Only the scalar variants are active; the vector variants below are
// commented out.
multiclass ILConstant<string asm> {
  def _i8 : ILFormat<IL_OP_MOV, (outs GPRI8:$dst),
      (ins i8imm:$val),
      asm, [(set GPRI8:$dst, imm:$val)]>;

  //  def _v2i8 : ILFormat<IL_OP_MOV, (outs GPRV2I8:$dst),
  //      (ins i8imm:$val),
  //      asm, [(set GPRV2I8:$dst, GPRV2I8:$val)]>;

  //def _v4i8 : ILFormat<IL_OP_MOV, (outs GPRV4I8:$dst),
  //(ins i8imm:$val),
  //asm, [(set GPRV4I8:$dst, GPRV4I8:$val)]>;

  def _i16 : ILFormat<IL_OP_MOV, (outs GPRI16:$dst),
      (ins i16imm:$val),
      asm, [(set GPRI16:$dst, imm:$val)]>;

  //  def _v2i16 : ILFormat<IL_OP_MOV, (outs GPRV2I16:$dst),
  //      (ins i16imm:$val),
  //      asm, [(set GPRV2I16:$dst, GPRV2I16:$val)]>;

  //  def _v4i16 : ILFormat<IL_OP_MOV, (outs GPRV4I16:$dst),
  //      (ins i16imm:$val),
  //      asm, [(set GPRV4I16:$dst, GPRV4I16:$val)]>;

  def _i32 : ILFormat<IL_OP_MOV, (outs GPRI32:$dst),
      (ins i32imm:$val),
      asm, [(set GPRI32:$dst, imm:$val)]>;

  //  def _v2i32 : ILFormat<IL_OP_MOV, (outs GPRV2I32:$dst),
  //      (ins i32imm:$val),
  //      asm, [(set GPRV2I32:$dst, GPRV2I32:$val)]>;

  //  def _v4i32 : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst),
  //      (ins GPRV4I32:$val),
  //      asm, [(set GPRV4I32:$dst, GPRV4I32:$val)]>;

  def _i64 : ILFormat<IL_OP_MOV, (outs GPRI64:$dst),
      (ins i64imm:$val),
      asm, [(set GPRI64:$dst, imm:$val)]>;

  //  def _v2i64 : ILFormat<IL_OP_MOV, (outs GPRV2I64:$dst),
  //      (ins i64imm:$val),
  //      asm, [(set GPRV2I64:$dst, GPRV2I64:$val)]>;

  def _f32 : ILFormat<IL_OP_MOV, (outs GPRF32:$dst),
      (ins f32imm:$val),
      asm, [(set GPRF32:$dst, fpimm:$val)]>;

  //  def _v2f32 : ILFormat<IL_OP_MOV, (outs GPRV2F32:$dst),
  //      (ins f32imm:$val),
  //      asm, [(set GPRV2F32:$dst, GPRV2F32:$val)]>;

  //  def _v4f32 : ILFormat<IL_OP_MOV, (outs GPRV4F32:$dst),
  //      (ins f32imm:$val),
  //      asm, [(set GPRV4F32:$dst, GPRV4F32:$val)]>;

  def _f64 : ILFormat<IL_OP_MOV, (outs GPRF64:$dst),
      (ins f64imm:$val),
      asm, [(set GPRF64:$dst, fpimm:$val)]>;

  //  def _v2f64 : ILFormat<IL_OP_MOV, (outs GPRV2F64:$dst),
  //      (ins f64imm:$val),
  //      asm, [(set GPRV2F64:$dst, GPRV2F64:$val)]>;

}

// Multiclass that handles truncating memory store operations matched by the
// global_*trunc_store patterns.
multiclass GTRUNCSTORE<string asm> {
  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_i8trunc_store GPRI16:$val, ADDR:$ptr)]>;
  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_i8trunc_store GPRI32:$val, ADDR:$ptr)]>;
  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_i8trunc_store GPRI64:$val, ADDR:$ptr)]>;
  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_i16trunc_store GPRI32:$val, ADDR:$ptr)]>;
  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_i16trunc_store GPRI64:$val, ADDR:$ptr)]>;
  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_i32trunc_store GPRI64:$val, ADDR:$ptr)]>;
  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_f32trunc_store GPRF64:$val, ADDR:$ptr)]>;
  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>;
  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>;
  def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
  def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
  def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>;
  def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
  def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
  def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(global_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>;
}

// Multiclass that handles truncating memory store operations matched by the
// local_*trunc_store patterns.
multiclass LTRUNCSTORE<string asm> {
  def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_i8trunc_store GPRI16:$val, ADDR:$ptr)]>;
  def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_i8trunc_store GPRI32:$val, ADDR:$ptr)]>;
  def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_i8trunc_store GPRI64:$val, ADDR:$ptr)]>;
  def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_i16trunc_store GPRI32:$val, ADDR:$ptr)]>;
  def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_i16trunc_store GPRI64:$val, ADDR:$ptr)]>;
  def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_i32trunc_store GPRI64:$val, ADDR:$ptr)]>;
  def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_f32trunc_store GPRF64:$val, ADDR:$ptr)]>;
  def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>;
  def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>;
  def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>;
  def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr),
      !strconcat(asm, " $val $ptr"),
      [(local_v4i8trunc_store GPRV4I16:$val,
ADDR:$ptr)]>; + def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>; + def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>; +} + +// Multiclass that handles memory store operations +multiclass PTRUNCSTORE<string asm> { + def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i8trunc_store GPRI16:$val, ADDR:$ptr)]>; + def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i8trunc_store GPRI32:$val, ADDR:$ptr)]>; + def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i8trunc_store GPRI64:$val, ADDR:$ptr)]>; + def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i16trunc_store GPRI32:$val, ADDR:$ptr)]>; + def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i16trunc_store GPRI64:$val, ADDR:$ptr)]>; + def _i64i32 : 
OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i32trunc_store GPRI64:$val, ADDR:$ptr)]>; + def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_f32trunc_store GPRF64:$val, ADDR:$ptr)]>; + def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>; + def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>; + def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>; + def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>; + def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>; + def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + 
[(private_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>; +} + +// Multiclass that handles memory store operations +multiclass RTRUNCSTORE<string asm> { + def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i8trunc_store GPRI16:$val, ADDR:$ptr)]>; + def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i8trunc_store GPRI32:$val, ADDR:$ptr)]>; + def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i8trunc_store GPRI64:$val, ADDR:$ptr)]>; + def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i16trunc_store GPRI32:$val, ADDR:$ptr)]>; + def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i16trunc_store GPRI64:$val, ADDR:$ptr)]>; + def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i32trunc_store GPRI64:$val, ADDR:$ptr)]>; + def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_f32trunc_store GPRF64:$val, ADDR:$ptr)]>; + def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i8trunc_store GPRV2I32:$val, ADDR:$ptr)]>; + def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v4i8trunc_store GPRV4I32:$val, ADDR:$ptr)]>; + def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i8trunc_store GPRV2I16:$val, ADDR:$ptr)]>; + def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v4i8trunc_store GPRV4I16:$val, ADDR:$ptr)]>; + def _v2i32i16 
: OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i16trunc_store GPRV2I32:$val, ADDR:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v4i16trunc_store GPRV4I32:$val, ADDR:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2f32trunc_store GPRV2F64:$val, ADDR:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i8trunc_store GPRV2I64:$val, ADDR:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i16trunc_store GPRV2I64:$val, ADDR:$ptr)]>; + def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i32trunc_store GPRV2I64:$val, ADDR:$ptr)]>; +} + + +// Multiclass that handles memory store operations +multiclass STORE<string asm, PatFrag OpNode> { + def _i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI8:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI8:$val, ADDR:$ptr)]>; + def _i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI16:$val, ADDR:$ptr)]>; + def _i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI32:$val, ADDR:$ptr)]>; + def _f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRF32:$val, ADDR:$ptr)]>; + def _i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI64:$val, ADDR:$ptr)]>; + def _f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRF64:$val, 
ADDR:$ptr)]>; + def _v4f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4F32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4F32:$val, ADDR:$ptr)]>; + def _v2f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2F32:$val, ADDR:$ptr)]>; + def _v4i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4I32:$val, ADDR:$ptr)]>; + def _v2i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I8:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I8:$val, ADDR:$ptr)]>; + def _v2i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I16:$val, ADDR:$ptr)]>; + def _v4i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I8:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4I8:$val, ADDR:$ptr)]>; + def _v4i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4I16:$val, ADDR:$ptr)]>; + def _v2i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I32:$val, ADDR:$ptr)]>; + def _v2f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2F64:$val, ADDR:$ptr)]>; + def _v2i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI32:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I64:$val, ADDR:$ptr)]>; +} + +// Multiclass that handles load operations +multiclass LOAD<string asm, PatFrag OpNode> { + def _i8 : OneInOneOut<IL_OP_MOV, (outs GPRI8:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI8:$dst, (OpNode ADDR:$ptr))]>; + def _i16 : OneInOneOut<IL_OP_MOV, (outs GPRI16:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI16:$dst, (OpNode ADDR:$ptr))]>; + def _i32 : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), (ins 
MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI32:$dst, (OpNode ADDR:$ptr))]>; + def _f32 : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRF32:$dst, (OpNode ADDR:$ptr))]>; + def _i64 : OneInOneOut<IL_OP_MOV, (outs GPRI64:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI64:$dst, (OpNode ADDR:$ptr))]>; + def _f64 : OneInOneOut<IL_OP_MOV, (outs GPRF64:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRF64:$dst, (OpNode ADDR:$ptr))]>; + def _v4f32 : OneInOneOut<IL_OP_MOV, (outs GPRV4F32:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4F32:$dst, (OpNode ADDR:$ptr))]>; + def _v2f32 : OneInOneOut<IL_OP_MOV, (outs GPRV2F32:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2F32:$dst, (OpNode ADDR:$ptr))]>; + def _v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2F64:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2F64:$dst, (OpNode ADDR:$ptr))]>; + def _v4i32 : OneInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4I32:$dst, (OpNode ADDR:$ptr))]>; + def _v2i8 : OneInOneOut<IL_OP_MOV, (outs GPRV2I8:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I8:$dst, (OpNode ADDR:$ptr))]>; + def _v2i16 : OneInOneOut<IL_OP_MOV, (outs GPRV2I16:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I16:$dst, (OpNode ADDR:$ptr))]>; + def _v4i8 : OneInOneOut<IL_OP_MOV, (outs GPRV4I8:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4I8:$dst, (OpNode ADDR:$ptr))]>; + def _v4i16 : OneInOneOut<IL_OP_MOV, (outs GPRV4I16:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4I16:$dst, (OpNode ADDR:$ptr))]>; + def _v2i32 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I32:$dst, (OpNode ADDR:$ptr))]>; + def _v2i64 : 
OneInOneOut<IL_OP_MOV, (outs GPRV2I64:$dst), (ins MEMI32:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I64:$dst, (OpNode ADDR:$ptr))]>; +} + +// Multiclass that handles memory store operations +multiclass GTRUNCSTORE64<string asm> { + def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>; + def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>; + def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>; + def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, 
MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>; + def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(global_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; +} + +// Multiclass that handles memory store operations +multiclass LTRUNCSTORE64<string asm> { + def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>; + def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + 
!strconcat(asm, " $val $ptr"), + [(local_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>; + def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>; + def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>; + def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i32 : 
OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(local_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; +} + +// Multiclass that handles memory store operations +multiclass PTRUNCSTORE64<string asm> { + def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>; + def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>; + def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>; + def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins 
GPRV4I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>; + def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i16trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(private_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; +} + +// Multiclass that handles memory store operations +multiclass RTRUNCSTORE64<string asm> { + def _i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i8trunc_store GPRI16:$val, ADDR64:$ptr)]>; + def _i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i8trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i8trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i16trunc_store GPRI32:$val, ADDR64:$ptr)]>; + def _i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, 
MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i16trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_i32trunc_store GPRI64:$val, ADDR64:$ptr)]>; + def _f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_f32trunc_store GPRF64:$val, ADDR64:$ptr)]>; + def _v2i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i8trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v4i8trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i8trunc_store GPRV2I16:$val, ADDR64:$ptr)]>; + def _v4i16i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v4i8trunc_store GPRV4I16:$val, ADDR64:$ptr)]>; + def _v2i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i16trunc_store GPRV2I32:$val, ADDR64:$ptr)]>; + def _v4i32i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v4i16trunc_store GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2f64f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2f32trunc_store GPRV2F64:$val, ADDR64:$ptr)]>; + def _v2i64i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i8trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; + def _v2i64i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i16trunc_store GPRV2I64:$val, 
ADDR64:$ptr)]>; + def _v2i64i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(region_v2i32trunc_store GPRV2I64:$val, ADDR64:$ptr)]>; +} + + +// Multiclass that handles memory store operations +multiclass STORE64<string asm, PatFrag OpNode> { + def _i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI8:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI8:$val, ADDR64:$ptr)]>; + def _i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI16:$val, ADDR64:$ptr)]>; + def _i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI32:$val, ADDR64:$ptr)]>; + def _f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRF32:$val, ADDR64:$ptr)]>; + def _i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRI64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRI64:$val, ADDR64:$ptr)]>; + def _f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRF64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRF64:$val, ADDR64:$ptr)]>; + def _v4f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4F32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4F32:$val, ADDR64:$ptr)]>; + def _v2f32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2F32:$val, ADDR64:$ptr)]>; + def _v4i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4I32:$val, ADDR64:$ptr)]>; + def _v2i8 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I8:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I8:$val, ADDR64:$ptr)]>; + def _v2i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I16:$val, ADDR64:$ptr)]>; + def _v4i8 : OneInOneOut<IL_OP_MOV, 
(outs), (ins GPRV4I8:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4I8:$val, ADDR64:$ptr)]>; + def _v4i16 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV4I16:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV4I16:$val, ADDR64:$ptr)]>; + def _v2i32 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I32:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I32:$val, ADDR64:$ptr)]>; + def _v2f64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2F64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2F64:$val, ADDR64:$ptr)]>; + def _v2i64 : OneInOneOut<IL_OP_MOV, (outs), (ins GPRV2I64:$val, MEMI64:$ptr), + !strconcat(asm, " $val $ptr"), + [(OpNode GPRV2I64:$val, ADDR64:$ptr)]>; +} + +// Multiclass that handles load operations +multiclass LOAD64<string asm, PatFrag OpNode> { + def _i8 : OneInOneOut<IL_OP_MOV, (outs GPRI8:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI8:$dst, (OpNode ADDR64:$ptr))]>; + def _i16 : OneInOneOut<IL_OP_MOV, (outs GPRI16:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI16:$dst, (OpNode ADDR64:$ptr))]>; + def _i32 : OneInOneOut<IL_OP_MOV, (outs GPRI32:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI32:$dst, (OpNode ADDR64:$ptr))]>; + def _f32 : OneInOneOut<IL_OP_MOV, (outs GPRF32:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRF32:$dst, (OpNode ADDR64:$ptr))]>; + def _i64 : OneInOneOut<IL_OP_MOV, (outs GPRI64:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRI64:$dst, (OpNode ADDR64:$ptr))]>; + def _f64 : OneInOneOut<IL_OP_MOV, (outs GPRF64:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRF64:$dst, (OpNode ADDR64:$ptr))]>; + def _v4f32 : OneInOneOut<IL_OP_MOV, (outs GPRV4F32:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4F32:$dst, (OpNode ADDR64:$ptr))]>; + def _v2f32 : OneInOneOut<IL_OP_MOV, (outs GPRV2F32:$dst), (ins MEMI64:$ptr), + 
!strconcat(asm, " $dst $ptr"), + [(set GPRV2F32:$dst, (OpNode ADDR64:$ptr))]>; + def _v2f64 : OneInOneOut<IL_OP_MOV, (outs GPRV2F64:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2F64:$dst, (OpNode ADDR64:$ptr))]>; + def _v4i32 : OneInOneOut<IL_OP_MOV, (outs GPRV4I32:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4I32:$dst, (OpNode ADDR64:$ptr))]>; + def _v2i8 : OneInOneOut<IL_OP_MOV, (outs GPRV2I8:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I8:$dst, (OpNode ADDR64:$ptr))]>; + def _v2i16 : OneInOneOut<IL_OP_MOV, (outs GPRV2I16:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I16:$dst, (OpNode ADDR64:$ptr))]>; + def _v4i8 : OneInOneOut<IL_OP_MOV, (outs GPRV4I8:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4I8:$dst, (OpNode ADDR64:$ptr))]>; + def _v4i16 : OneInOneOut<IL_OP_MOV, (outs GPRV4I16:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV4I16:$dst, (OpNode ADDR64:$ptr))]>; + def _v2i32 : OneInOneOut<IL_OP_MOV, (outs GPRV2I32:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I32:$dst, (OpNode ADDR64:$ptr))]>; + def _v2i64 : OneInOneOut<IL_OP_MOV, (outs GPRV2I64:$dst), (ins MEMI64:$ptr), + !strconcat(asm, " $dst $ptr"), + [(set GPRV2I64:$dst, (OpNode ADDR64:$ptr))]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<ILOpCode opc> { + def _i8 : UnaryOpNoRet<opc, (outs), (ins GPRI8:$src), + !strconcat(opc.Text, " $src"), []>; + def _i16 : UnaryOpNoRet<opc, (outs), (ins GPRI16:$src), + !strconcat(opc.Text, " $src"), []>; + def _i32 : UnaryOpNoRet<opc, (outs), (ins GPRI32:$src), + !strconcat(opc.Text, " $src"), []>; + def _i64 : UnaryOpNoRet<opc, (outs), (ins GPRI64:$src), + !strconcat(opc.Text, " $src"), []>; + def _f32 : UnaryOpNoRet<opc, (outs), (ins GPRF32:$src), + !strconcat(opc.Text, " $src"), []>; + def _f64 : UnaryOpNoRet<opc, (outs), (ins GPRF64:$src), + 
!strconcat(opc.Text, " $src"), []>; +} +// Only scalar types should generate flow control +multiclass BranchInstr2<ILOpCode opc> { + def _i8 : BinaryOpNoRet<opc, (outs), (ins GPRI8:$src0, GPRI8:$src1), + !strconcat(opc.Text, " $src0, $src1"), []>; + def _i16 : BinaryOpNoRet<opc, (outs), (ins GPRI16:$src0, GPRI16:$src1), + !strconcat(opc.Text, " $src0, $src1"), []>; + def _i32 : BinaryOpNoRet<opc, (outs), (ins GPRI32:$src0, GPRI32:$src1), + !strconcat(opc.Text, " $src0, $src1"), []>; + def _i64 : BinaryOpNoRet<opc, (outs), (ins GPRI64:$src0, GPRI64:$src1), + !strconcat(opc.Text, " $src0, $src1"), []>; + def _f32 : BinaryOpNoRet<opc, (outs), (ins GPRF32:$src0, GPRF32:$src1), + !strconcat(opc.Text, " $src0, $src1"), []>; + def _f64 : BinaryOpNoRet<opc, (outs), (ins GPRF64:$src0, GPRF64:$src1), + !strconcat(opc.Text, " $src0, $src1"), []>; +} + +// Class that handles the various vector extract patterns +multiclass VectorExtract<SDNode OpNode> { + def _v2f64 : ExtractVectorClass<GPRF64, GPRV2F64, OpNode>; + def _v4f32: ExtractVectorClass<GPRF32, GPRV4F32, OpNode>; + def _v2f32 : ExtractVectorClass<GPRF32, GPRV2F32, OpNode>; + def _v2i64 : ExtractVectorClass<GPRI64, GPRV2I64, OpNode>; + def _v4i8 : ExtractVectorClass<GPRI8, GPRV4I8, OpNode>; + def _v4i16 : ExtractVectorClass<GPRI16, GPRV4I16, OpNode>; + def _v4i32 : ExtractVectorClass<GPRI32, GPRV4I32, OpNode>; + def _v2i8 : ExtractVectorClass<GPRI8, GPRV2I8, OpNode>; + def _v2i16 : ExtractVectorClass<GPRI16, GPRV2I16, OpNode>; + def _v2i32 : ExtractVectorClass<GPRI32, GPRV2I32, OpNode>; +} + +multiclass VectorConcat<SDNode OpNode> { + def _v2f64 : VectorConcatClass<GPRV2F64, GPRF64, OpNode>; + def _v2i64 : VectorConcatClass<GPRV2I64, GPRI64, OpNode>; + def _v4f32 : VectorConcatClass<GPRV4F32, GPRV2F32, OpNode>; + def _v4i32 : VectorConcatClass<GPRV4I32, GPRV2I32, OpNode>; + def _v4i16 : VectorConcatClass<GPRV4I16, GPRV2I16, OpNode>; + def _v4i8 : VectorConcatClass<GPRV4I8, GPRV2I8, OpNode>; + def _v2f32 : 
VectorConcatClass<GPRV2F32, GPRF32, OpNode>; + def _v2i32 : VectorConcatClass<GPRV2I32, GPRI32, OpNode>; + def _v2i16 : VectorConcatClass<GPRV2I16, GPRI16, OpNode>; + def _v2i8 : VectorConcatClass<GPRV2I8, GPRI8, OpNode>; +} + +// Class that handles the various vector insert patterns +multiclass VectorInsert<SDNode OpNode> { + def _v2f64 : InsertVectorClass<IL_OP_I_ADD, GPRV2F64, + GPRF64, OpNode, "iadd">; + def _v4f32: InsertVectorClass<IL_OP_I_ADD, GPRV4F32, + GPRF32, OpNode, "iadd">; + def _v2f32 : InsertVectorClass<IL_OP_I_ADD, GPRV2F32, + GPRF32, OpNode, "iadd">; + def _v2i64 : InsertVectorClass<IL_OP_I_ADD, GPRV2I64, + GPRI64, OpNode, "iadd">; + def _v4i8 : InsertVectorClass<IL_OP_I_ADD, GPRV4I8, + GPRI8, OpNode, "iadd">; + def _v4i16 : InsertVectorClass<IL_OP_I_ADD, GPRV4I16, + GPRI16, OpNode, "iadd">; + def _v4i32 : InsertVectorClass<IL_OP_I_ADD, GPRV4I32, + GPRI32, OpNode, "iadd">; + def _v2i8 : InsertVectorClass<IL_OP_I_ADD, GPRV2I8, + GPRI8, OpNode, "iadd">; + def _v2i16 : InsertVectorClass<IL_OP_I_ADD, GPRV2I16, + GPRI16, OpNode, "iadd">; + def _v2i32 : InsertVectorClass<IL_OP_I_ADD, GPRV2I32, + GPRI32, OpNode, "iadd">; +} + +// generic class that handles math instruction for OneInOneOut instruction +// patterns +multiclass UnaryOpMC<ILOpCode OpCode, SDNode OpNode> { + def _i8 : UnaryOp<OpCode, OpNode, GPRI8, GPRI8>; + def _i16 : UnaryOp<OpCode, OpNode, GPRI16, GPRI16>; + def _i32 : UnaryOp<OpCode, OpNode, GPRI32, GPRI32>; + def _f32 : UnaryOp<OpCode, OpNode, GPRF32, GPRF32>; + def _f64 : UnaryOp<OpCode, OpNode, GPRF64, GPRF64>; + def _i64 : UnaryOp<OpCode, OpNode, GPRI64, GPRI64>; + def _v4f32: UnaryOp<OpCode, OpNode, GPRV4F32, GPRV4F32>; + def _v4i16 : UnaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16>; + def _v4i8 : UnaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8>; + def _v4i32 : UnaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32>; + def _v2f32 : UnaryOp<OpCode, OpNode, GPRV2F32, GPRV2F32>; + def _v2i16 : UnaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16>; + def _v2i8 : 
UnaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8>; + def _v2i32 : UnaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32>; + def _v2f64 : UnaryOp<OpCode, OpNode, GPRV2F64, GPRV2F64>; + def _v2i64 : UnaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64>; +} +multiclass UnaryOpMCVec<ILOpCode OpCode, SDNode OpNode> { + def _v4f32: UnaryOp<OpCode, OpNode, GPRV4F32, GPRF32>; + def _v4i16 : UnaryOp<OpCode, OpNode, GPRV4I16, GPRI16>; + def _v4i8 : UnaryOp<OpCode, OpNode, GPRV4I8, GPRI8>; + def _v4i32 : UnaryOp<OpCode, OpNode, GPRV4I32, GPRI32>; + def _v2f32 : UnaryOp<OpCode, OpNode, GPRV2F32, GPRF32>; + def _v2i16 : UnaryOp<OpCode, OpNode, GPRV2I16, GPRI16>; + def _v2i8 : UnaryOp<OpCode, OpNode, GPRV2I8, GPRI8>; + def _v2i32 : UnaryOp<OpCode, OpNode, GPRV2I32, GPRI32>; + def _v2f64 : UnaryOp<OpCode, OpNode, GPRV2F64, GPRF64>; + def _v2i64 : UnaryOp<OpCode, OpNode, GPRV2I64, GPRI64>; +} + +multiclass UnaryOpMCf32< +ILOpCode f32OpCode, + SDNode OpNode> { + def _f32 : UnaryOp<f32OpCode, OpNode, GPRF32, GPRF32>; + def _v4f32: UnaryOp<f32OpCode, OpNode, GPRV4F32, GPRV4F32>; + def _v2f32 : UnaryOp<f32OpCode, OpNode, GPRV2F32, GPRV2F32>; + } + +multiclass UnaryOpMCi32< +ILOpCode i32OpCode, + SDNode OpNode> { + def _i8 : UnaryOp<i32OpCode, OpNode, GPRI8, GPRI8>; + def _i16 : UnaryOp<i32OpCode, OpNode, GPRI16, GPRI16>; + def _i32 : UnaryOp<i32OpCode, OpNode, GPRI32, GPRI32>; + def _v4i16 : UnaryOp<i32OpCode, OpNode, GPRV4I16, GPRV4I16>; + def _v4i8 : UnaryOp<i32OpCode, OpNode, GPRV4I8, GPRV4I8>; + def _v4i32 : UnaryOp<i32OpCode, OpNode, GPRV4I32, GPRV4I32>; + def _v2i16 : UnaryOp<i32OpCode, OpNode, GPRV2I16, GPRV2I16>; + def _v2i8 : UnaryOp<i32OpCode, OpNode, GPRV2I8, GPRV2I8>; + def _v2i32 : UnaryOp<i32OpCode, OpNode, GPRV2I32, GPRV2I32>; + } + + +multiclass BinaryOpMC<ILOpCode OpCode, SDNode OpNode> { + def _i8 : BinaryOp<OpCode, OpNode, GPRI8, GPRI8, GPRI8>; + + def _i16 : BinaryOp<OpCode, OpNode, GPRI16, GPRI16, GPRI16>; + def _i32 : BinaryOp<OpCode, OpNode, GPRI32, GPRI32, GPRI32>; + def _f32 : 
BinaryOp<OpCode, OpNode, GPRF32, GPRF32, GPRF32>; + def _f64 : BinaryOp<OpCode, OpNode, GPRF64, GPRF64, GPRF64>; + def _i64 : BinaryOp<OpCode, OpNode, GPRI64, GPRI64, GPRI64>; + def _v4f32: BinaryOp<OpCode, OpNode, GPRV4F32, GPRV4F32, GPRV4F32>; + def _v4i16 : BinaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16, GPRV4I16>; + def _v4i8 : BinaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8, GPRV4I8>; + def _v4i32 : BinaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32, GPRV4I32>; + def _v2f32 : BinaryOp<OpCode, OpNode, GPRV2F32, GPRV2F32, GPRV2F32>; + def _v2i16 : BinaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16, GPRV2I16>; + def _v2i8 : BinaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8, GPRV2I8>; + def _v2i32 : BinaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32, GPRV2I32>; + def _v2f64 : BinaryOp<OpCode, OpNode, GPRV2F64, GPRV2F64, GPRV2F64>; + def _v2i64 : BinaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64, GPRV2I64>; +} + +multiclass BinaryOpMCInt<ILOpCode OpCode, SDNode OpNode> { + def _i8 : BinaryOp<OpCode, OpNode, GPRI8, GPRI8, GPRI8>; + + def _i16 : BinaryOp<OpCode, OpNode, GPRI16, GPRI16, GPRI16>; + def _i32 : BinaryOp<OpCode, OpNode, GPRI32, GPRI32, GPRI32>; + def _i64 : BinaryOp<OpCode, OpNode, GPRI64, GPRI64, GPRI64>; + def _v4i16 : BinaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16, GPRV4I16>; + def _v4i8 : BinaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8, GPRV4I8>; + def _v4i32 : BinaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32, GPRV4I32>; + def _v2i16 : BinaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16, GPRV2I16>; + def _v2i8 : BinaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8, GPRV2I8>; + def _v2i32 : BinaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32, GPRV2I32>; + def _v2i64 : BinaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64, GPRV2I64>; +} + +// generic class that handles math instruction for ThreeInOneOut +// instruction patterns +multiclass TernaryOpMC<ILOpCode OpCode, SDNode OpNode> { + def _i8 : TernaryOp<OpCode, OpNode, GPRI8, GPRI8, GPRI8, GPRI8>; + def _i16 : TernaryOp<OpCode, OpNode, GPRI16, GPRI16, GPRI16, GPRI16>; + def _i32 : 
TernaryOp<OpCode, OpNode, GPRI32, GPRI32, GPRI32, GPRI32>; + def _f32 : TernaryOp<OpCode, OpNode, GPRF32, GPRF32, GPRF32, GPRF32>; + def _f64 : TernaryOp<OpCode, OpNode, GPRF64, GPRF64, GPRF64, GPRF64>; + def _i64 : TernaryOp<OpCode, OpNode, GPRI64, GPRI64, GPRI64, GPRI64>; + def _v4f32: TernaryOp<OpCode, OpNode, GPRV4F32, GPRV4F32, + GPRV4F32, GPRV4F32>; + def _v4i8 : TernaryOp<OpCode, OpNode, GPRV4I8, GPRV4I8, + GPRV4I8, GPRV4I8>; + def _v4i16 : TernaryOp<OpCode, OpNode, GPRV4I16, GPRV4I16, + GPRV4I16, GPRV4I16>; + def _v4i32 : TernaryOp<OpCode, OpNode, GPRV4I32, GPRV4I32, + GPRV4I32, GPRV4I32>; + def _v2f32 : TernaryOp<OpCode, OpNode, GPRV2F32, GPRV2F32, + GPRV2F32, GPRV2F32>; + def _v2i8 : TernaryOp<OpCode, OpNode, GPRV2I8, GPRV2I8, + GPRV2I8, GPRV2I8>; + def _v2i16 : TernaryOp<OpCode, OpNode, GPRV2I16, GPRV2I16, + GPRV2I16, GPRV2I16>; + def _v2i32 : TernaryOp<OpCode, OpNode, GPRV2I32, GPRV2I32, + GPRV2I32, GPRV2I32>; + def _v2f64 : TernaryOp<OpCode, OpNode, GPRV2F64, GPRV2F64, + GPRV2F64, GPRV2F64>; + def _v2i64 : TernaryOp<OpCode, OpNode, GPRV2I64, GPRV2I64, + GPRV2I64, GPRV2I64>; +} +multiclass BinaryOpMCi32<ILOpCode i32OpCode, SDNode OpNode> { + def _i8 : BinaryOp<i32OpCode, OpNode, GPRI8, GPRI8, GPRI8>; + def _i16 : BinaryOp<i32OpCode, OpNode, GPRI16, GPRI16, GPRI16>; + def _i32 : BinaryOp<i32OpCode, OpNode, GPRI32, GPRI32, GPRI32>; + def _v4i16 : BinaryOp<i32OpCode, OpNode, GPRV4I16, + GPRV4I16, GPRV4I16>; + def _v4i8 : BinaryOp<i32OpCode, OpNode, GPRV4I8, + GPRV4I8, GPRV4I8>; + def _v4i32 : BinaryOp<i32OpCode, OpNode, GPRV4I32, + GPRV4I32, GPRV4I32>; + def _v2i16 : BinaryOp<i32OpCode, OpNode, GPRV2I16, + GPRV2I16, GPRV2I16>; + def _v2i8 : BinaryOp<i32OpCode, OpNode, GPRV2I8, + GPRV2I8, GPRV2I8>; + def _v2i32 : BinaryOp<i32OpCode, OpNode, GPRV2I32, + GPRV2I32, GPRV2I32>; +} +multiclass BinaryOpMCi64<ILOpCode i64OpCode, SDNode OpNode> { + def _i64 : BinaryOp<i64OpCode, OpNode, GPRI64, GPRI64, GPRI64>; + def _v2i64 : BinaryOp<i64OpCode, OpNode, GPRV2I64, + 
GPRV2I64, GPRV2I64>; +} +multiclass BinaryOpMCi32Const<ILOpCode i32OpCode, SDNode OpNode> { + def _i8 : BinaryOp<i32OpCode, OpNode, GPRI8, GPRI8, GPRI32>; + def _i16 : BinaryOp<i32OpCode, OpNode, GPRI16, GPRI16, GPRI32>; + def _i32 : BinaryOp<i32OpCode, OpNode, GPRI32, GPRI32, GPRI32>; + def _v4i16 : BinaryOp<i32OpCode, OpNode, GPRV4I32, + GPRV4I32, GPRI32>; + def _v4i8 : BinaryOp<i32OpCode, OpNode, GPRV4I32, + GPRV4I32, GPRI32>; + def _v4i32 : BinaryOp<i32OpCode, OpNode, GPRV4I32, + GPRV4I32, GPRI32>; + def _v2i16 : BinaryOp<i32OpCode, OpNode, GPRV2I32, + GPRV2I32, GPRI32>; + def _v2i8 : BinaryOp<i32OpCode, OpNode, GPRV2I32, + GPRV2I32, GPRI32>; + def _v2i32 : BinaryOp<i32OpCode, OpNode, GPRV2I32, + GPRV2I32, GPRI32>; +} +multiclass BinaryOpMCf32<ILOpCode f32OpCode, SDNode OpNode> { + def _f32 : BinaryOp<f32OpCode, OpNode, GPRF32, + GPRF32, GPRF32>; + def _v4f32: BinaryOp<f32OpCode, OpNode, GPRV4F32, + GPRV4F32, GPRV4F32>; + def _v2f32 : BinaryOp<f32OpCode, OpNode, GPRV2F32, + GPRV2F32, GPRV2F32>; +} + +multiclass TernaryOpMCf64<ILOpCode f64OpCode, SDNode OpNode> { + def _f64 : TernaryOp<f64OpCode, OpNode, GPRF64, + GPRF64, GPRF64, GPRF64>; +} + +multiclass TernaryOpMCf32<ILOpCode f32OpCode, SDNode OpNode> { + def _f32 : TernaryOp<f32OpCode, OpNode, GPRF32, + GPRF32, GPRF32, GPRF32>; + def _v4f32: TernaryOp<f32OpCode, OpNode, GPRV4F32, + GPRV4F32, GPRV4F32, GPRV4F32>; + def _v2f32 : TernaryOp<f32OpCode, OpNode, GPRV2F32, + GPRV2F32, GPRV2F32, GPRV2F32>; +} +multiclass BinaryOpMCFloat<ILOpCode f32OpCode, ILOpCode f64OpCode, + SDNode OpNode> { + def _f64 : BinaryOp<f64OpCode, OpNode, GPRF64, + GPRF64, GPRF64>; + def _v2f64 : BinaryOp<f64OpCode, OpNode, GPRV2F64, + GPRV2F64, GPRV2F64>; + def _f32 : BinaryOp<f32OpCode, OpNode, GPRF32, + GPRF32, GPRF32>; + def _v2f32 : BinaryOp<f32OpCode, OpNode, GPRV2F32, + GPRV2F32, GPRV2F32>; + def _v4f32: BinaryOp<f32OpCode, OpNode, GPRV4F32, + GPRV4F32, GPRV4F32>; + } + +multiclass TernaryOpMCScalar<ILOpCode opcode, SDNode node> 
+{ + def _i8: TernaryOp<opcode, node, GPRI8, GPRI8, GPRI8, GPRI8>; + def _i16: TernaryOp<opcode, node, GPRI16, GPRI8, GPRI16, GPRI16>; + def _i32: TernaryOp<opcode, node, GPRI32, GPRI8, GPRI32, GPRI32>; + def _i64: TernaryOp<opcode, node, GPRI64, GPRI8, GPRI64, GPRI64>; + def _f32: TernaryOp<opcode, node, GPRF32, GPRI8, GPRF32, GPRF32>; + def _f64: TernaryOp<opcode, node, GPRF64, GPRI8, GPRF64, GPRF64>; +} + + +multiclass BitConversion<ILOpCode opcode, RegisterClass Regs, SDNode OpNode> +{ + def _i8 : UnaryOp<opcode, OpNode, Regs, GPRI8>; + def _i16 : UnaryOp<opcode, OpNode, Regs, GPRI16>; + def _i32 : UnaryOp<opcode, OpNode, Regs, GPRI32>; + def _f32 : UnaryOp<opcode, OpNode, Regs, GPRF32>; + def _i64 : UnaryOp<opcode, OpNode, Regs, GPRI64>; + def _f64 : UnaryOp<opcode, OpNode, Regs, GPRF64>; + def _v2i8 : UnaryOp<opcode, OpNode, Regs, GPRV2I8>; + def _v2i16 : UnaryOp<opcode, OpNode, Regs, GPRV2I16>; + def _v2i32 : UnaryOp<opcode, OpNode, Regs, GPRV2I32>; + def _v2f32 : UnaryOp<opcode, OpNode, Regs, GPRV2F32>; + def _v2i64 : UnaryOp<opcode, OpNode, Regs, GPRV2I64>; + def _v2f64 : UnaryOp<opcode, OpNode, Regs, GPRV2F64>; + def _v4i8 : UnaryOp<opcode, OpNode, Regs, GPRV4I8>; + def _v4i16 : UnaryOp<opcode, OpNode, Regs, GPRV4I16>; + def _v4i32 : UnaryOp<opcode, OpNode, Regs, GPRV4I32>; + def _v4f32 : UnaryOp<opcode, OpNode, Regs, GPRV4F32>; +} + + +multiclass UnaryIntrinsicInt<ILOpCode opcode, Intrinsic intr> +{ +def _i32 : OneInOneOut<opcode, (outs GPRI32:$dst), + (ins GPRI32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRI32:$dst, (intr GPRI32:$src))]>; +def _v2i32 : OneInOneOut<opcode, (outs GPRV2I32:$dst), + (ins GPRV2I32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (intr GPRV2I32:$src))]>; +def _v4i32 : OneInOneOut<opcode, (outs GPRV4I32:$dst), + (ins GPRV4I32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV4I32:$dst, (intr GPRV4I32:$src))]>; +} + +multiclass IntrConvertF32TOF16<ILOpCode opcode, Intrinsic 
intr> +{ +def _i16 : OneInOneOut<opcode, (outs GPRI16:$dst), + (ins GPRF32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRI16:$dst, (intr GPRF32:$src))]>; +def _v2i16 : OneInOneOut<opcode, (outs GPRV2I16:$dst), + (ins GPRV2F32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2I16:$dst, (intr GPRV2F32:$src))]>; +def _v4i16 : OneInOneOut<opcode, (outs GPRV4I16:$dst), + (ins GPRV4F32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV4I16:$dst, (intr GPRV4F32:$src))]>; +} + + +multiclass IntrConvertF32TOI32<ILOpCode opcode, Intrinsic intr> +{ +def _i32 : OneInOneOut<opcode, (outs GPRI32:$dst), + (ins GPRF32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRI32:$dst, (intr GPRF32:$src))]>; +def _v2i32 : OneInOneOut<opcode, (outs GPRV2I32:$dst), + (ins GPRV2F32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (intr GPRV2F32:$src))]>; +def _v4i32 : OneInOneOut<opcode, (outs GPRV4I32:$dst), + (ins GPRV4F32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV4I32:$dst, (intr GPRV4F32:$src))]>; +} + +multiclass IntrConvertF64TOI32<ILOpCode opcode, Intrinsic intr> +{ +def _i32 : OneInOneOut<opcode, (outs GPRI32:$dst), + (ins GPRF64:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRI32:$dst, (intr GPRF64:$src))]>; +def _v2i32 : OneInOneOut<opcode, (outs GPRV2I32:$dst), + (ins GPRV2F64:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2I32:$dst, (intr GPRV2F64:$src))]>; +} + +multiclass IntrConvertF16TOF32<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst), + (ins GPRI16:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRF32:$dst, (intr GPRI16:$src))]>; +def _v2f32 : OneInOneOut<opcode, (outs GPRV2F32:$dst), + (ins GPRV2I16:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2F32:$dst, (intr GPRV2I16:$src))]>; +def _v4f32 : OneInOneOut<opcode, (outs GPRV4F32:$dst), + (ins GPRV4I16:$src), + !strconcat(opcode.Text, " $dst, 
$src"), + [(set GPRV4F32:$dst, (intr GPRV4I16:$src))]>; +} + + +multiclass IntrConvertI32TOF32<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst), + (ins GPRI32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRF32:$dst, (intr GPRI32:$src))]>; +def _v2f32 : OneInOneOut<opcode, (outs GPRV2F32:$dst), + (ins GPRV2I32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2F32:$dst, (intr GPRV2I32:$src))]>; +def _v4f32 : OneInOneOut<opcode, (outs GPRV4F32:$dst), + (ins GPRV4I32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV4F32:$dst, (intr GPRV4I32:$src))]>; +} + +multiclass BinaryIntrinsicLong<ILOpCode opcode, Intrinsic intr> +{ +def _i64 : TwoInOneOut<opcode, (outs GPRI64:$dst), + (ins GPRI64:$src, GPRI64:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRI64:$dst, + (intr GPRI64:$src, GPRI64:$src2))]>; +} + + +multiclass BinaryIntrinsicInt<ILOpCode opcode, Intrinsic intr> +{ +def _i32 : TwoInOneOut<opcode, (outs GPRI32:$dst), + (ins GPRI32:$src, GPRI32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRI32:$dst, + (intr GPRI32:$src, GPRI32:$src2))]>; +def _v2i32 : TwoInOneOut<opcode, (outs GPRV2I32:$dst), + (ins GPRV2I32:$src, GPRV2I32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRV2I32:$dst, + (intr GPRV2I32:$src, GPRV2I32:$src2))]>; +def _v4i32 : TwoInOneOut<opcode, (outs GPRV4I32:$dst), + (ins GPRV4I32:$src, GPRV4I32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRV4I32:$dst, + (intr GPRV4I32:$src, GPRV4I32:$src2))]>; +} + +multiclass TernaryIntrinsicInt<ILOpCode opcode, Intrinsic intr> +{ +def _i32 : ThreeInOneOut<opcode, (outs GPRI32:$dst), + (ins GPRI32:$src, GPRI32:$src2, GPRI32:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRI32:$dst, + (intr GPRI32:$src, GPRI32:$src2, GPRI32:$src3))]>; +def _v2i32 : ThreeInOneOut<opcode, (outs GPRV2I32:$dst), + (ins GPRV2I32:$src, GPRV2I32:$src2, 
GPRV2I32:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRV2I32:$dst, + (intr GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3))]>; +def _v4i32 : ThreeInOneOut<opcode, (outs GPRV4I32:$dst), + (ins GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRV4I32:$dst, + (intr GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3))]>; +} + +multiclass TernaryIntrinsicFloat<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : ThreeInOneOut<opcode, (outs GPRF32:$dst), + (ins GPRF32:$src, GPRF32:$src2, GPRF32:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRF32:$dst, + (intr GPRF32:$src, GPRF32:$src2, GPRF32:$src3))]>; +def _v2f32 : ThreeInOneOut<opcode, (outs GPRV2F32:$dst), + (ins GPRV2F32:$src, GPRV2F32:$src2, GPRV2F32:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRV2F32:$dst, + (intr GPRV2F32:$src, GPRV2F32:$src2, GPRV2F32:$src3))]>; +def _v4f32 : ThreeInOneOut<opcode, (outs GPRV4F32:$dst), + (ins GPRV4F32:$src, GPRV4F32:$src2, GPRV4F32:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRV4F32:$dst, + (intr GPRV4F32:$src, GPRV4F32:$src2, GPRV4F32:$src3))]>; +} + +multiclass BinaryIntrinsicDoubleScalar<ILOpCode opcode, Intrinsic intr> +{ +def _f64 : TwoInOneOut<opcode, (outs GPRF64:$dst), + (ins GPRF64:$src, GPRF64:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRF64:$dst, + (intr GPRF64:$src, GPRF64:$src2))]>; +} + +multiclass TernaryIntrinsicDoubleScalar<ILOpCode opcode, Intrinsic intr> +{ +def _f64 : ThreeInOneOut<opcode, (outs GPRF64:$dst), + (ins GPRF64:$src, GPRF64:$src2, GPRF64:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRF64:$dst, + (intr GPRF64:$src, GPRF64:$src2, GPRF64:$src3))]>; +} + + +multiclass TernaryIntrinsicLongScalar<ILOpCode opcode, Intrinsic intr> +{ +def _i64 : ThreeInOneOut<opcode, (outs GPRI64:$dst), + (ins GPRI64:$src, GPRI64:$src2, GPRI64:$src3), + 
!strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRI64:$dst, + (intr GPRI64:$src, GPRI64:$src2, GPRI64:$src3))]>; +} + +multiclass QuaternaryIntrinsicInt<ILOpCode opcode, Intrinsic intr> +{ +def _i32 : FourInOneOut<opcode, (outs GPRI32:$dst), + (ins GPRI32:$src, GPRI32:$src2, GPRI32:$src3, GPRI32:$src4), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3, $src4"), + [(set GPRI32:$dst, + (intr GPRI32:$src, GPRI32:$src2, GPRI32:$src3, GPRI32:$src4))]>; +def _v2i32 : FourInOneOut<opcode, (outs GPRV2I32:$dst), + (ins GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3, GPRV2I32:$src4), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3, $src4"), + [(set GPRV2I32:$dst, + (intr GPRV2I32:$src, GPRV2I32:$src2, GPRV2I32:$src3, GPRV2I32:$src4))]>; +def _v4i32 : FourInOneOut<opcode, (outs GPRV4I32:$dst), + (ins GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3, GPRV4I32:$src4), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3, $src4"), + [(set GPRV4I32:$dst, + (intr GPRV4I32:$src, GPRV4I32:$src2, GPRV4I32:$src3, GPRV4I32:$src4))]>; +} + +multiclass UnaryIntrinsicFloatScalar<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst), + (ins GPRF32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRF32:$dst, (intr GPRF32:$src))]>; +} + +multiclass UnaryIntrinsicFloat<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : OneInOneOut<opcode, (outs GPRF32:$dst), + (ins GPRF32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRF32:$dst, (intr GPRF32:$src))]>; +def _v2f32 : OneInOneOut<opcode, (outs GPRV2F32:$dst), + (ins GPRV2F32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2F32:$dst, (intr GPRV2F32:$src))]>; +def _v4f32 : OneInOneOut<opcode, (outs GPRV4F32:$dst), + (ins GPRV4F32:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV4F32:$dst, (intr GPRV4F32:$src))]>; +} + +multiclass BinaryIntrinsicFloatScalar<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : TwoInOneOut<opcode, (outs GPRF32:$dst), + (ins 
GPRF32:$src, GPRF32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRF32:$dst, + (intr GPRF32:$src, GPRF32:$src2))]>; +} +multiclass BinaryIntrinsicFloat<ILOpCode opcode, Intrinsic intr> +{ +def _f32 : TwoInOneOut<opcode, (outs GPRF32:$dst), + (ins GPRF32:$src, GPRF32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRF32:$dst, + (intr GPRF32:$src, GPRF32:$src2))]>; +def _v2f32 : TwoInOneOut<opcode, (outs GPRV2F32:$dst), + (ins GPRV2F32:$src, GPRV2F32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRV2F32:$dst, + (intr GPRV2F32:$src, GPRV2F32:$src2))]>; +def _v4f32 : TwoInOneOut<opcode, (outs GPRV4F32:$dst), + (ins GPRV4F32:$src, GPRV4F32:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRV4F32:$dst, + (intr GPRV4F32:$src, GPRV4F32:$src2))]>; +} + +multiclass UnaryIntrinsicDoubleScalar<ILOpCode opcode, Intrinsic intr> +{ +def _f64 : OneInOneOut<opcode, (outs GPRF64:$dst), + (ins GPRF64:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRF64:$dst, (intr GPRF64:$src))]>; +} + +multiclass UnaryIntrinsicDouble<ILOpCode opcode, Intrinsic intr> +{ +def _f64 : OneInOneOut<opcode, (outs GPRF64:$dst), + (ins GPRF64:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRF64:$dst, (intr GPRF64:$src))]>; +def _v2f64 : OneInOneOut<opcode, (outs GPRV2F64:$dst), + (ins GPRV2F64:$src), + !strconcat(opcode.Text, " $dst, $src"), + [(set GPRV2F64:$dst, (intr GPRV2F64:$src))]>; +} + +multiclass BinaryIntrinsicDouble<ILOpCode opcode, Intrinsic intr> +{ +def _f64 : TwoInOneOut<opcode, (outs GPRF64:$dst), + (ins GPRF64:$src, GPRF64:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRF64:$dst, + (intr GPRF64:$src, GPRF64:$src2))]>; +def _v2f64 : TwoInOneOut<opcode, (outs GPRV2F64:$dst), + (ins GPRV2F64:$src, GPRV2F64:$src2), + !strconcat(opcode.Text, " $dst, $src, $src2"), + [(set GPRV2F64:$dst, + (intr GPRV2F64:$src, GPRV2F64:$src2))]>; +} + +multiclass TernaryIntrinsicDouble<ILOpCode 
opcode, Intrinsic intr> +{ +def _f64 : ThreeInOneOut<opcode, (outs GPRF64:$dst), + (ins GPRF64:$src, GPRF64:$src2, GPRF64:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRF64:$dst, + (intr GPRF64:$src, GPRF64:$src2, GPRF64:$src3))]>; +def _v2f64 : ThreeInOneOut<opcode, (outs GPRV2F64:$dst), + (ins GPRV2F64:$src, GPRV2F64:$src2, GPRV2F64:$src3), + !strconcat(opcode.Text, " $dst, $src, $src2, $src3"), + [(set GPRV2F64:$dst, + (intr GPRV2F64:$src, GPRV2F64:$src2, GPRV2F64:$src3))]>; +} diff --git a/src/gallium/drivers/radeon/AMDILNIDevice.cpp b/src/gallium/drivers/radeon/AMDILNIDevice.cpp new file mode 100644 index 00000000000..8fda1c18ae5 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILNIDevice.cpp @@ -0,0 +1,71 @@ +//===-- AMDILNIDevice.cpp - Implement NI Device for AMDIL -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +#include "AMDILNIDevice.h" +#include "AMDILEvergreenDevice.h" +#include "AMDILSubtarget.h" + +using namespace llvm; + +AMDILNIDevice::AMDILNIDevice(AMDILSubtarget *ST) + : AMDILEvergreenDevice(ST) +{ + std::string name = ST->getDeviceName(); + if (name == "caicos") { + mDeviceFlag = OCL_DEVICE_CAICOS; + } else if (name == "turks") { + mDeviceFlag = OCL_DEVICE_TURKS; + } else if (name == "cayman") { + mDeviceFlag = OCL_DEVICE_CAYMAN; + } else { + mDeviceFlag = OCL_DEVICE_BARTS; + } +} +AMDILNIDevice::~AMDILNIDevice() +{ +} + +size_t +AMDILNIDevice::getMaxLDSSize() const +{ + if (usesHardware(AMDILDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_900; + } else { + return 0; + } +} + +uint32_t +AMDILNIDevice::getGeneration() const +{ + return AMDILDeviceInfo::HD6XXX; +} + + +AMDILCaymanDevice::AMDILCaymanDevice(AMDILSubtarget *ST) + : AMDILNIDevice(ST) +{ + setCaps(); +} + +AMDILCaymanDevice::~AMDILCaymanDevice() +{ +} + 
+void +AMDILCaymanDevice::setCaps() +{ + if (mSTM->isOverride(AMDILDeviceInfo::DoubleOps)) { + mHWBits.set(AMDILDeviceInfo::DoubleOps); + mHWBits.set(AMDILDeviceInfo::FMA); + } + mHWBits.set(AMDILDeviceInfo::Signed24BitOps); + mSWBits.reset(AMDILDeviceInfo::Signed24BitOps); + mSWBits.set(AMDILDeviceInfo::ArenaSegment); +} + diff --git a/src/gallium/drivers/radeon/AMDILNIDevice.h b/src/gallium/drivers/radeon/AMDILNIDevice.h new file mode 100644 index 00000000000..556670abba1 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILNIDevice.h @@ -0,0 +1,59 @@ +//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// Interface for the subtarget data classes. +// +//===---------------------------------------------------------------------===// +// This file will define the interface that each generation needs to +// implement in order to correctly answer queries on the capabilities of the +// specific hardware. +//===---------------------------------------------------------------------===// +#ifndef _AMDILNIDEVICE_H_ +#define _AMDILNIDEVICE_H_ +#include "AMDILEvergreenDevice.h" +#include "AMDILSubtarget.h" + +namespace llvm { + class AMDILSubtarget; +//===---------------------------------------------------------------------===// +// NI generation of devices and their respective sub classes +//===---------------------------------------------------------------------===// + +// The AMDILNIDevice is the base class for the Northern Islands series of +// cards. It is very similar to the AMDILEvergreenDevice, with the major +// exception being differences in wavefront size and hardware capabilities. 
The +// NI devices all have 64 wide wavefronts and also add support for signed 24 bit +// integer operations. + + class AMDILNIDevice : public AMDILEvergreenDevice { + public: + AMDILNIDevice(AMDILSubtarget*); + virtual ~AMDILNIDevice(); + virtual size_t getMaxLDSSize() const; + virtual uint32_t getGeneration() const; + protected: + }; // AMDILNIDevice + +// Just as the AMDILCypressDevice is the double capable version of the +// AMDILEvergreenDevice, the AMDILCaymanDevice is the double capable version of +// the AMDILNIDevice. The other major difference, though not as useful from this +// standpoint, is that the Cayman device has 4 wide ALUs, whereas the rest of the +// NI family is 5 wide. + + class AMDILCaymanDevice: public AMDILNIDevice { + public: + AMDILCaymanDevice(AMDILSubtarget*); + virtual ~AMDILCaymanDevice(); + private: + virtual void setCaps(); + }; // AMDILCaymanDevice + + static const unsigned int MAX_LDS_SIZE_900 = AMDILDevice::MAX_LDS_SIZE_800; +} // namespace llvm +#endif // _AMDILNIDEVICE_H_ diff --git a/src/gallium/drivers/radeon/AMDILNodes.td b/src/gallium/drivers/radeon/AMDILNodes.td new file mode 100644 index 00000000000..8cf07a5b27b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILNodes.td @@ -0,0 +1,325 @@ +//===- AMDILNodes.td - AMD IL nodes ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//==-----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Conversion DAG Nodes +//===----------------------------------------------------------------------===// +// Double to Single conversion +def IL_d2f : SDNode<"AMDILISD::DP_TO_FP" , SDTIL_DPToFPOp>; + +def IL_inttoany: SDNode<"AMDILISD::INTTOANY", SDTIL_IntToAny>; +//===----------------------------------------------------------------------===// +// Flow Control DAG Nodes +//===----------------------------------------------------------------------===// +def IL_brcond : SDNode<"AMDILISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>; + +//===----------------------------------------------------------------------===// +// Comparison DAG Nodes +//===----------------------------------------------------------------------===// +def IL_cmp : SDNode<"AMDILISD::CMP", SDTIL_Cmp>; + +//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_callseq_start : SDNode<"ISD::CALLSEQ_START", SDTIL_CallSeqStart, + [SDNPHasChain, SDNPOutGlue]>; +def IL_callseq_end : SDNode<"ISD::CALLSEQ_END", SDTIL_CallSeqEnd, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def IL_call : SDNode<"AMDILISD::CALL", SDTIL_Call, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def IL_retflag : SDNode<"AMDILISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +//===----------------------------------------------------------------------===// +// Arithmetic DAG Nodes +//===----------------------------------------------------------------------===// +// Address modification nodes +def IL_addaddrri : SDNode<"AMDILISD::ADDADDR", SDTIL_AddAddrri, + [SDNPCommutative, SDNPAssociative]>; +def IL_addaddrir : SDNode<"AMDILISD::ADDADDR", SDTIL_AddAddrir, + [SDNPCommutative, SDNPAssociative]>; + 
+//===--------------------------------------------------------------------===//
+// Instructions
+//===--------------------------------------------------------------------===//
+// Floating point math functions
+// NOTE(review): the heading is kept from the original, but judging by the
+// opcode names the list also holds non-floating-point nodes (OR/AND/XOR/NOT).
+def IL_cmov_logical : SDNode<"AMDILISD::CMOVLOG", SDTIL_GenTernaryOp>;
+def IL_add : SDNode<"AMDILISD::ADD", SDTIL_GenBinaryOp>;
+def IL_cmov : SDNode<"AMDILISD::CMOV", SDTIL_GenBinaryOp>;
+def IL_or : SDNode<"AMDILISD::OR", SDTIL_GenBinaryOp>;
+def IL_and : SDNode<"AMDILISD::AND", SDTIL_GenBinaryOp>;
+def IL_xor : SDNode<"AMDILISD::XOR", SDTIL_GenBinaryOp>;
+def IL_not : SDNode<"AMDILISD::NOT", SDTIL_GenUnaryOp>;
+def IL_div_inf : SDNode<"AMDILISD::DIV_INF", SDTIL_GenBinaryOp>;
+def IL_mad : SDNode<"AMDILISD::MAD", SDTIL_GenTernaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Integer functions
+//===----------------------------------------------------------------------===//
+def IL_inegate : SDNode<"AMDILISD::INEGATE", SDTIntUnaryOp>;
+def IL_umul : SDNode<"AMDILISD::UMUL", SDTIntBinOp,
+    [SDNPCommutative, SDNPAssociative]>;
+def IL_mov : SDNode<"AMDILISD::MOVE", SDTIL_GenUnaryOp>;
+def IL_phimov : SDNode<"AMDILISD::PHIMOVE", SDTIL_GenUnaryOp>;
+def IL_bitconv : SDNode<"AMDILISD::BITCONV", SDTIL_GenBitConv>;
+def IL_ffb_hi : SDNode<"AMDILISD::IFFB_HI", SDTIL_GenUnaryOp>;
+def IL_ffb_lo : SDNode<"AMDILISD::IFFB_LO", SDTIL_GenUnaryOp>;
+def IL_smax : SDNode<"AMDILISD::SMAX", SDTIL_GenBinaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Double functions
+//===----------------------------------------------------------------------===//
+def IL_dcreate : SDNode<"AMDILISD::DCREATE", SDTIL_DCreate>;
+def IL_dcomphi : SDNode<"AMDILISD::DCOMPHI", SDTIL_DComp>;
+def IL_dcomplo : SDNode<"AMDILISD::DCOMPLO", SDTIL_DComp>;
+def IL_dcreate2 : SDNode<"AMDILISD::DCREATE2", SDTIL_DCreate2>;
+def IL_dcomphi2 : SDNode<"AMDILISD::DCOMPHI2", SDTIL_DComp2>;
+def IL_dcomplo2 : SDNode<"AMDILISD::DCOMPLO2", SDTIL_DComp2>;
+
+//===----------------------------------------------------------------------===//
+// Long functions
+//===----------------------------------------------------------------------===//
+def IL_lcreate : SDNode<"AMDILISD::LCREATE", SDTIL_LCreate>;
+def IL_lcreate2 : SDNode<"AMDILISD::LCREATE2", SDTIL_LCreate2>;
+def IL_lcomphi : SDNode<"AMDILISD::LCOMPHI", SDTIL_LComp>;
+def IL_lcomphi2 : SDNode<"AMDILISD::LCOMPHI2", SDTIL_LComp2>;
+def IL_lcomplo : SDNode<"AMDILISD::LCOMPLO", SDTIL_LComp>;
+def IL_lcomplo2 : SDNode<"AMDILISD::LCOMPLO2", SDTIL_LComp2>;
+
+//===----------------------------------------------------------------------===//
+// Vector functions
+//===----------------------------------------------------------------------===//
+// All four vector nodes are declared with an empty property list.
+def IL_vbuild : SDNode<"AMDILISD::VBUILD", SDTIL_GenVecBuild,
+    []>;
+def IL_vextract : SDNode<"AMDILISD::VEXTRACT", SDTIL_GenVecExtract,
+    []>;
+def IL_vinsert : SDNode<"AMDILISD::VINSERT", SDTIL_GenVecInsert,
+    []>;
+def IL_vconcat : SDNode<"AMDILISD::VCONCAT", SDTIL_GenVecConcat,
+    []>;
+
+//===----------------------------------------------------------------------===//
+// AMDIL Image Custom SDNodes
+//===----------------------------------------------------------------------===//
+// Reads are chained and flagged SDNPMayLoad, writes are chained and flagged
+// SDNPMayStore; the info queries carry no properties.
+def image2d_read : SDNode<"AMDILISD::IMAGE2D_READ", SDTIL_ImageRead,
+    [SDNPHasChain, SDNPMayLoad]>;
+def image2d_write : SDNode<"AMDILISD::IMAGE2D_WRITE", SDTIL_ImageWrite,
+    [SDNPHasChain, SDNPMayStore]>;
+def image2d_info0 : SDNode<"AMDILISD::IMAGE2D_INFO0", SDTIL_ImageInfo, []>;
+def image2d_info1 : SDNode<"AMDILISD::IMAGE2D_INFO1", SDTIL_ImageInfo, []>;
+def image3d_read : SDNode<"AMDILISD::IMAGE3D_READ", SDTIL_ImageRead,
+    [SDNPHasChain, SDNPMayLoad]>;
+def image3d_write : SDNode<"AMDILISD::IMAGE3D_WRITE", SDTIL_ImageWrite3D,
+    [SDNPHasChain, SDNPMayStore]>;
+def image3d_info0 : SDNode<"AMDILISD::IMAGE3D_INFO0", SDTIL_ImageInfo, []>;
+def image3d_info1 : SDNode<"AMDILISD::IMAGE3D_INFO1", SDTIL_ImageInfo, []>;
+
+//===----------------------------------------------------------------------===//
+// AMDIL Atomic Custom SDNodes
+//===----------------------------------------------------------------------===//
+// Property convention used by every group below:
+//   * value-returning atomics: SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+//     SDNPMemOperand;
+//   * _noret atomics: the same without SDNPMayLoad, except the cmpxchg and
+//     xchg forms, which keep SDNPMayLoad.
+// Binary operations use SDTIL_BinAtom; cmpxchg and mskor use SDTIL_TriAtom.
+//===-------------- 32 bit global atomics with return values --------------===//
+def atom_g_add : SDNode<"AMDILISD::ATOM_G_ADD", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_and : SDNode<"AMDILISD::ATOM_G_AND", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_cmpxchg : SDNode<"AMDILISD::ATOM_G_CMPXCHG", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_dec : SDNode<"AMDILISD::ATOM_G_DEC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_inc : SDNode<"AMDILISD::ATOM_G_INC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_max : SDNode<"AMDILISD::ATOM_G_MAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umax : SDNode<"AMDILISD::ATOM_G_UMAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_min : SDNode<"AMDILISD::ATOM_G_MIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umin : SDNode<"AMDILISD::ATOM_G_UMIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_or : SDNode<"AMDILISD::ATOM_G_OR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_sub : SDNode<"AMDILISD::ATOM_G_SUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_rsub : SDNode<"AMDILISD::ATOM_G_RSUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xchg : SDNode<"AMDILISD::ATOM_G_XCHG", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+// SDNPMayLoad was missing on this record only; a value-returning atomic reads
+// memory, and atom_l_xor / atom_r_xor already declare it.
+def atom_g_xor : SDNode<"AMDILISD::ATOM_G_XOR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+//===------------- 32 bit global atomics without return values ------------===//
+def atom_g_add_noret : SDNode<"AMDILISD::ATOM_G_ADD_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_and_noret : SDNode<"AMDILISD::ATOM_G_AND_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_cmpxchg_noret : SDNode<"AMDILISD::ATOM_G_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+// NOTE(review): second record bound to ATOM_G_CMPXCHG_NORET, declared without
+// SDNPMayLoad. It looks like a leftover alias of atom_g_cmpxchg_noret; kept
+// unchanged because users outside this view may reference the name.
+def atom_g_cmp_noret : SDNode<"AMDILISD::ATOM_G_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_dec_noret : SDNode<"AMDILISD::ATOM_G_DEC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_inc_noret : SDNode<"AMDILISD::ATOM_G_INC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_max_noret : SDNode<"AMDILISD::ATOM_G_MAX_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umax_noret: SDNode<"AMDILISD::ATOM_G_UMAX_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_min_noret : SDNode<"AMDILISD::ATOM_G_MIN_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_umin_noret: SDNode<"AMDILISD::ATOM_G_UMIN_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_or_noret : SDNode<"AMDILISD::ATOM_G_OR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_sub_noret : SDNode<"AMDILISD::ATOM_G_SUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_rsub_noret : SDNode<"AMDILISD::ATOM_G_RSUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xchg_noret: SDNode<"AMDILISD::ATOM_G_XCHG_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_g_xor_noret : SDNode<"AMDILISD::ATOM_G_XOR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===--------------- 32 bit local atomics with return values --------------===//
+def atom_l_add : SDNode<"AMDILISD::ATOM_L_ADD", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_and : SDNode<"AMDILISD::ATOM_L_AND", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_cmpxchg : SDNode<"AMDILISD::ATOM_L_CMPXCHG", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_dec : SDNode<"AMDILISD::ATOM_L_DEC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_inc : SDNode<"AMDILISD::ATOM_L_INC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_max : SDNode<"AMDILISD::ATOM_L_MAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umax : SDNode<"AMDILISD::ATOM_L_UMAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_min : SDNode<"AMDILISD::ATOM_L_MIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umin : SDNode<"AMDILISD::ATOM_L_UMIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_or : SDNode<"AMDILISD::ATOM_L_OR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_mskor : SDNode<"AMDILISD::ATOM_L_MSKOR", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_sub : SDNode<"AMDILISD::ATOM_L_SUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_rsub : SDNode<"AMDILISD::ATOM_L_RSUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xchg : SDNode<"AMDILISD::ATOM_L_XCHG", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xor : SDNode<"AMDILISD::ATOM_L_XOR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+//===-------------- 32 bit local atomics without return values ------------===//
+def atom_l_add_noret : SDNode<"AMDILISD::ATOM_L_ADD_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_and_noret : SDNode<"AMDILISD::ATOM_L_AND_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_cmpxchg_noret : SDNode<"AMDILISD::ATOM_L_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_dec_noret : SDNode<"AMDILISD::ATOM_L_DEC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_inc_noret : SDNode<"AMDILISD::ATOM_L_INC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_max_noret : SDNode<"AMDILISD::ATOM_L_MAX_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umax_noret: SDNode<"AMDILISD::ATOM_L_UMAX_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_min_noret : SDNode<"AMDILISD::ATOM_L_MIN_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_umin_noret: SDNode<"AMDILISD::ATOM_L_UMIN_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_or_noret : SDNode<"AMDILISD::ATOM_L_OR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_mskor_noret : SDNode<"AMDILISD::ATOM_L_MSKOR_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_sub_noret : SDNode<"AMDILISD::ATOM_L_SUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_rsub_noret : SDNode<"AMDILISD::ATOM_L_RSUB_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xchg_noret: SDNode<"AMDILISD::ATOM_L_XCHG_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_l_xor_noret : SDNode<"AMDILISD::ATOM_L_XOR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===-------------- 32 bit region atomics with return values --------------===//
+def atom_r_add : SDNode<"AMDILISD::ATOM_R_ADD", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_and : SDNode<"AMDILISD::ATOM_R_AND", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_cmpxchg : SDNode<"AMDILISD::ATOM_R_CMPXCHG", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_dec : SDNode<"AMDILISD::ATOM_R_DEC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_inc : SDNode<"AMDILISD::ATOM_R_INC", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_max : SDNode<"AMDILISD::ATOM_R_MAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umax : SDNode<"AMDILISD::ATOM_R_UMAX", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_min : SDNode<"AMDILISD::ATOM_R_MIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umin : SDNode<"AMDILISD::ATOM_R_UMIN", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_or : SDNode<"AMDILISD::ATOM_R_OR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_mskor : SDNode<"AMDILISD::ATOM_R_MSKOR", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_sub : SDNode<"AMDILISD::ATOM_R_SUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_rsub : SDNode<"AMDILISD::ATOM_R_RSUB", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xchg : SDNode<"AMDILISD::ATOM_R_XCHG", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xor : SDNode<"AMDILISD::ATOM_R_XOR", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+
+//===------------- 32 bit region atomics without return values ------------===//
+def atom_r_add_noret : SDNode<"AMDILISD::ATOM_R_ADD_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_and_noret : SDNode<"AMDILISD::ATOM_R_AND_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_cmpxchg_noret : SDNode<"AMDILISD::ATOM_R_CMPXCHG_NORET",
+    SDTIL_TriAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_dec_noret : SDNode<"AMDILISD::ATOM_R_DEC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_inc_noret : SDNode<"AMDILISD::ATOM_R_INC_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_max_noret : SDNode<"AMDILISD::ATOM_R_MAX_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umax_noret: SDNode<"AMDILISD::ATOM_R_UMAX_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_min_noret : SDNode<"AMDILISD::ATOM_R_MIN_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_umin_noret: SDNode<"AMDILISD::ATOM_R_UMIN_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_or_noret : SDNode<"AMDILISD::ATOM_R_OR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_mskor_noret : SDNode<"AMDILISD::ATOM_R_MSKOR_NORET", SDTIL_TriAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_sub_noret : SDNode<"AMDILISD::ATOM_R_SUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_rsub_noret : SDNode<"AMDILISD::ATOM_R_RSUB_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xchg_noret: SDNode<"AMDILISD::ATOM_R_XCHG_NORET",
+    SDTIL_BinAtom, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>;
+def atom_r_xor_noret : SDNode<"AMDILISD::ATOM_R_XOR_NORET", SDTIL_BinAtom,
+    [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+//===--------------- 32 bit atomic counter instructions -------------------===//
+// Append-counter nodes. The value-returning forms are flagged as load+store,
+// the _noret forms as store only; none of them declares SDNPMemOperand.
+def append_alloc : SDNode<"AMDILISD::APPEND_ALLOC", SDTIL_Append,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+def append_consume : SDNode<"AMDILISD::APPEND_CONSUME", SDTIL_Append,
+    [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
+def append_alloc_noret : SDNode<"AMDILISD::APPEND_ALLOC_NORET", SDTIL_Append,
+    [SDNPHasChain, SDNPMayStore]>;
+def append_consume_noret : SDNode<"AMDILISD::APPEND_CONSUME_NORET",
+    SDTIL_Append, [SDNPHasChain, SDNPMayStore]>;
diff --git a/src/gallium/drivers/radeon/AMDILOperands.td b/src/gallium/drivers/radeon/AMDILOperands.td
new file mode 100644
index 00000000000..b22c67bfdba
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILOperands.td
@@ -0,0 +1,37 @@
+//===- AMDILOperands.td - AMD IL Operands ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Custom memory operand
+//===----------------------------------------------------------------------===//
+
+// 32-bit memory operand: two GPRI32 sub-operands, printed by printMemOperand.
+def MEMI32 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPRI32, GPRI32);
+}
+
+// 64-bit memory operand: two GPRI64 sub-operands, printed by printMemOperand.
+def MEMI64 : Operand<i64> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPRI64, GPRI64);
+}
+
+// Call target types
+// calltarget is an i32 operand; brtarget is typed OtherVT.
+def calltarget : Operand<i32>;
+def brtarget : Operand<OtherVT>;
+
+// The vector immediate operand definitions below are commented out in the
+// original file and therefore define nothing.
+// def v2i8imm : Operand<v2i8>;
+// def v4i8imm : Operand<v4i8>;
+// def v2i16imm : Operand<v2i16>;
+// def v4i16imm : Operand<v4i16>;
+// def v2i32imm : Operand<v2i32>;
+// def v4i32imm : Operand<v4i32>;
+// def v2i64imm : Operand<v2i64>;
+// def v2f32imm : Operand<v2f32>;
+// def v4f32imm : Operand<v4f32>;
+// def v2f64imm : Operand<v2f64>;
+
diff --git a/src/gallium/drivers/radeon/AMDILPatterns.td b/src/gallium/drivers/radeon/AMDILPatterns.td
new file mode 100644
index 00000000000..aa59bcb5b4e
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILPatterns.td
@@ -0,0 +1,504 @@
+//===- AMDILPatterns.td - AMDIL Target Patterns------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Store pattern fragments
+//===----------------------------------------------------------------------===//
+// Truncating stores filtered by the in-memory value type: each fragment
+// matches a truncstore whose StoreSDNode::getMemoryVT() equals the MVT named
+// in the fragment. cast<> is used, so N is required to be a StoreSDNode.
+def truncstorei64 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+def truncstorev2i8 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i8;
+}]>;
+def truncstorev2i16 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i16;
+}]>;
+def truncstorev2i32 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i32;
+}]>;
+def truncstorev2i64 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2i64;
+}]>;
+def truncstorev2f32 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2f32;
+}]>;
+def truncstorev2f64 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v2f64;
+}]>;
+def truncstorev4i8 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i8;
+}]>;
+def truncstorev4i16 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i16;
+}]>;
+def truncstorev4i32 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4i32;
+}]>;
+def truncstorev4f32 : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::v4f32;
+}]>;
+
+// Plain stores filtered by the is<Space>Store() predicate helpers, which are
+// defined outside this file.
+// NOTE(review): these predicates use dyn_cast<> and pass the result straight
+// to the helper; presumably the helper tolerates NULL -- confirm.
+def global_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_store : PatFrag<(ops node:$val, node:$ptr),
+    (store node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+// Global truncating stores: one fragment per memory type, each combining the
+// matching truncstore<VT> fragment with isGlobalStore().
+def global_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+// Global truncating stores for the wider vector memory types; each wraps the
+// matching truncstore<VT> fragment and filters with isGlobalStore().
+def global_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def global_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+// Private truncating stores. private_trunc_store matches any truncstore; the
+// typed variants wrap the matching truncstore<VT> fragment. All filter with
+// isPrivateStore().
+def private_trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+// Remaining private truncating stores: each wraps the matching truncstore<VT>
+// fragment and filters with isPrivateStore().
+def private_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def private_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+  return isPrivateStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+// Local truncating stores. local_trunc_store matches any truncstore; the
+// typed variants wrap the matching truncstore<VT> fragment. All filter with
+// isLocalStore().
+def local_trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def local_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+// Region truncating stores. region_trunc_store matches any truncstore; the
+// typed variants wrap the matching truncstore<VT> fragment. All filter with
+// isRegionStore().
+def region_trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstore node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei8 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei16 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei32 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorei64 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref32 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstoref64 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i8 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i16 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i32 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2i64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2i64 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f32 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v2f64trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev2f64 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4i8trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i8 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4i16trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i16 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4i32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4i32 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+def region_v4f32trunc_store : PatFrag<(ops node:$val, node:$ptr),
+    (truncstorev4f32 node:$val, node:$ptr), [{
+  return isRegionStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Load pattern fragments
+//===----------------------------------------------------------------------===//
+// Each address space gets four fragments: plain load, sign-extending load
+// (sextload), any-extending load (extload) and zero-extending load
+// (zextload). The fragment name states the extension kind it matches:
+//   *_aext_load -> extload, *_zext_load -> zextload.
+// The global/private/local/region/constant groups previously had these two
+// operators swapped; they now follow the same mapping as the cp_* group.
+// The is<Space>Load() helpers are defined outside this file.
+// Global address space loads
+def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def global_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def global_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def global_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Private address space loads
+def private_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def private_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def private_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def private_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isPrivateLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Local address space loads
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def local_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def local_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def local_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Region address space loads
+def region_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def region_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def region_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def region_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isRegionLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+// Constant address space loads
+// isConstantLoad() takes a second argument, always -1 here; its meaning is
+// defined by the helper, which is outside this file.
+def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+def constant_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+def constant_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+def constant_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+// Constant pool loads
+def cp_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def cp_sext_load : PatFrag<(ops node:$ptr), (sextload node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def cp_zext_load : PatFrag<(ops node:$ptr), (zextload node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+def cp_aext_load : PatFrag<(ops node:$ptr), (extload node:$ptr), [{
+  return isCPLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Complex addressing mode patterns
+//===----------------------------------------------------------------------===//
+// Two-operand address patterns. The 32-bit forms are selected by SelectADDR,
+// the 64-bit forms by SelectADDR64; the *F variants additionally list
+// frameindex as a root node.
+def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>;
+def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>;
+def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>;
+def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>;
+
+
+//===----------------------------------------------------------------------===//
+// Conditional Instruction Pattern Leafs
+//===----------------------------------------------------------------------===//
+// A condition code is matched as the i32 constant N.
+class IL_CC_Op<int N> : PatLeaf<(i32 N)>;
// Condition-code leafs. Prefix key: D = double, F = float, I = signed int,
// U = unsigned int, L = signed long, UL = unsigned long.
// Values are consecutive and must stay unique: two leafs with the same
// immediate are indistinguishable during pattern matching.
def IL_CC_D_EQ  : IL_CC_Op<0>;
def IL_CC_D_GE  : IL_CC_Op<1>;
def IL_CC_D_LT  : IL_CC_Op<2>;
def IL_CC_D_NE  : IL_CC_Op<3>;
def IL_CC_F_EQ  : IL_CC_Op<4>;
def IL_CC_F_GE  : IL_CC_Op<5>;
def IL_CC_F_LT  : IL_CC_Op<6>;
def IL_CC_F_NE  : IL_CC_Op<7>;
def IL_CC_I_EQ  : IL_CC_Op<8>;
def IL_CC_I_GE  : IL_CC_Op<9>;
def IL_CC_I_LT  : IL_CC_Op<10>;
def IL_CC_I_NE  : IL_CC_Op<11>;
def IL_CC_U_GE  : IL_CC_Op<12>;
def IL_CC_U_LT  : IL_CC_Op<13>;
// Pseudo IL comparison instructions that aren't natively supported
def IL_CC_F_GT  : IL_CC_Op<14>;
def IL_CC_U_GT  : IL_CC_Op<15>;
def IL_CC_I_GT  : IL_CC_Op<16>;
def IL_CC_D_GT  : IL_CC_Op<17>;
def IL_CC_F_LE  : IL_CC_Op<18>;
def IL_CC_U_LE  : IL_CC_Op<19>;
def IL_CC_I_LE  : IL_CC_Op<20>;
def IL_CC_D_LE  : IL_CC_Op<21>;
def IL_CC_F_UNE : IL_CC_Op<22>;
def IL_CC_F_UEQ : IL_CC_Op<23>;
def IL_CC_F_ULT : IL_CC_Op<24>;
def IL_CC_F_UGT : IL_CC_Op<25>;
def IL_CC_F_ULE : IL_CC_Op<26>;
def IL_CC_F_UGE : IL_CC_Op<27>;
def IL_CC_F_ONE : IL_CC_Op<28>;
def IL_CC_F_OEQ : IL_CC_Op<29>;
def IL_CC_F_OLT : IL_CC_Op<30>;
def IL_CC_F_OGT : IL_CC_Op<31>;
def IL_CC_F_OLE : IL_CC_Op<32>;
def IL_CC_F_OGE : IL_CC_Op<33>;
def IL_CC_D_UNE : IL_CC_Op<34>;
def IL_CC_D_UEQ : IL_CC_Op<35>;
def IL_CC_D_ULT : IL_CC_Op<36>;
def IL_CC_D_UGT : IL_CC_Op<37>;
def IL_CC_D_ULE : IL_CC_Op<38>;
def IL_CC_D_UGE : IL_CC_Op<39>;
// Was 30, which collided with IL_CC_F_OLT; 40 is the slot between
// IL_CC_D_UGE (39) and IL_CC_D_OEQ (41).
def IL_CC_D_ONE : IL_CC_Op<40>;
def IL_CC_D_OEQ : IL_CC_Op<41>;
def IL_CC_D_OLT : IL_CC_Op<42>;
def IL_CC_D_OGT : IL_CC_Op<43>;
def IL_CC_D_OLE : IL_CC_Op<44>;
def IL_CC_D_OGE : IL_CC_Op<45>;
def IL_CC_U_EQ  : IL_CC_Op<46>;
def IL_CC_U_NE  : IL_CC_Op<47>;
def IL_CC_F_O   : IL_CC_Op<48>;
def IL_CC_D_O   : IL_CC_Op<49>;
def IL_CC_F_UO  : IL_CC_Op<50>;
def IL_CC_D_UO  : IL_CC_Op<51>;
def IL_CC_L_LE  : IL_CC_Op<52>;
def IL_CC_L_GE  : IL_CC_Op<53>;
def IL_CC_L_EQ  : IL_CC_Op<54>;
def IL_CC_L_NE  : IL_CC_Op<55>;
def IL_CC_L_LT  : IL_CC_Op<56>;
def IL_CC_L_GT  : IL_CC_Op<57>;
def IL_CC_UL_LE : IL_CC_Op<58>;
def IL_CC_UL_GE :
IL_CC_Op<59>; +def IL_CC_UL_EQ : IL_CC_Op<60>; +def IL_CC_UL_NE : IL_CC_Op<61>; +def IL_CC_UL_LT : IL_CC_Op<62>; +def IL_CC_UL_GT : IL_CC_Op<63>; diff --git a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp new file mode 100644 index 00000000000..9383bfcb77b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp @@ -0,0 +1,1211 @@ +//===-- AMDILPeepholeOptimizer.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "PeepholeOpt" +#ifdef DEBUG +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME 0 +#endif + +#include "AMDILAlgorithms.tpp" +#include "AMDILDevices.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" + +#include <sstream> + +#if 0 +STATISTIC(PointerAssignments, "Number of dynamic pointer " + "assigments discovered"); +STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); +#endif +STATISTIC(LocalFuncs, "Number of get_local_size(N) functions removed"); + +using namespace llvm; +// The Peephole optimization pass is used to do simple last minute optimizations +// that are required for correct code or to remove redundant functions +namespace { +class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public 
FunctionPass { +public: + TargetMachine &TM; + static char ID; + AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + ~AMDILPeepholeOpt(); + const char *getPassName() const; + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; +protected: +private: + // Function to initiate all of the instruction level optimizations. + bool instLevelOptimizations(BasicBlock::iterator *inst); + // Quick check to see if we need to dump all of the pointers into the + // arena. If this is correct, then we set all pointers to exist in arena. This + // is a workaround for aliasing of pointers in a struct/union. + bool dumpAllIntoArena(Function &F); + // Because I don't want to invalidate any pointers while in the + // safeNestedForEachFunction. I push atomic conversions to a vector and handle + // it later. This function does the conversions if required. + void doAtomicConversionIfNeeded(Function &F); + // Because __amdil_is_constant cannot be properly evaluated if + // optimizations are disabled, the call's are placed in a vector + // and evaluated after the __amdil_image* functions are evaluated + // which should allow the __amdil_is_constant function to be + // evaluated correctly. + void doIsConstCallConversionIfNeeded(); + bool mChanged; + bool mDebug; + bool mRWGOpt; + bool mConvertAtomics; + CodeGenOpt::Level optLevel; + // Run a series of tests to see if we can optimize a CALL instruction. + bool optimizeCallInst(BasicBlock::iterator *bbb); + // A peephole optimization to optimize bit extract sequences. + bool optimizeBitExtract(Instruction *inst); + // A peephole optimization to optimize bit insert sequences. + bool optimizeBitInsert(Instruction *inst); + bool setupBitInsert(Instruction *base, + Instruction *&src, + Constant *&mask, + Constant *&shift); + // Expand the bit field insert instruction on versions of OpenCL that + // don't support it. 
+ bool expandBFI(CallInst *CI); + // Expand the bit field mask instruction on version of OpenCL that + // don't support it. + bool expandBFM(CallInst *CI); + // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in + // this case we need to expand them. These functions check for 24bit functions + // and then expand. + bool isSigned24BitOps(CallInst *CI); + void expandSigned24BitOps(CallInst *CI); + // One optimization that can occur is that if the required workgroup size is + // specified then the result of get_local_size is known at compile time and + // can be returned accordingly. + bool isRWGLocalOpt(CallInst *CI); + void expandRWGLocalOpt(CallInst *CI); + // On northern island cards, the division is slightly less accurate than on + // previous generations, so we need to utilize a more accurate division. So we + // can translate the accurate divide to a normal divide on all other cards. + bool convertAccurateDivide(CallInst *CI); + void expandAccurateDivide(CallInst *CI); + // If the alignment is set incorrectly, it can produce really inefficient + // code. This checks for this scenario and fixes it if possible. + bool correctMisalignedMemOp(Instruction *inst); + + // If we are in no opt mode, then we need to make sure that + // local samplers are properly propagated as constant propagation + // doesn't occur and we need to know the value of kernel defined + // samplers at compile time. 
+ bool propagateSamplerInst(CallInst *CI); + + LLVMContext *mCTX; + Function *mF; + const AMDILSubtarget *mSTM; + SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; + SmallVector<CallInst *, 16> isConstVec; +}; // class AMDILPeepholeOpt + char AMDILPeepholeOpt::ID = 0; +} // anonymous namespace + +namespace llvm { + FunctionPass * + createAMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + { + return new AMDILPeepholeOpt(tm AMDIL_OPT_LEVEL_VAR); + } +} // llvm namespace + +AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + : FunctionPass(ID), TM(tm) +{ + mDebug = DEBUGME; + optLevel = TM.getOptLevel(); + +} + +AMDILPeepholeOpt::~AMDILPeepholeOpt() +{ +} + +const char * +AMDILPeepholeOpt::getPassName() const +{ + return "AMDIL PeepHole Optimization Pass"; +} + +bool +containsPointerType(Type *Ty) +{ + if (!Ty) { + return false; + } + switch(Ty->getTypeID()) { + default: + return false; + case Type::StructTyID: { + const StructType *ST = dyn_cast<StructType>(Ty); + for (StructType::element_iterator stb = ST->element_begin(), + ste = ST->element_end(); stb != ste; ++stb) { + if (!containsPointerType(*stb)) { + continue; + } + return true; + } + break; + } + case Type::VectorTyID: + case Type::ArrayTyID: + return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); + case Type::PointerTyID: + return true; + }; + return false; +} + +bool +AMDILPeepholeOpt::dumpAllIntoArena(Function &F) +{ + bool dumpAll = false; + for (Function::const_arg_iterator cab = F.arg_begin(), + cae = F.arg_end(); cab != cae; ++cab) { + const Argument *arg = cab; + const PointerType *PT = dyn_cast<PointerType>(arg->getType()); + if (!PT) { + continue; + } + Type *DereferencedType = PT->getElementType(); + if (!dyn_cast<StructType>(DereferencedType) + ) { + continue; + } + if (!containsPointerType(DereferencedType)) { + continue; + } + // FIXME: Because a pointer inside of a struct/union may be aliased to + // another pointer we need 
to take the conservative approach and place all + // pointers into the arena until more advanced detection is implemented. + dumpAll = true; + } + return dumpAll; +} +void +AMDILPeepholeOpt::doIsConstCallConversionIfNeeded() +{ + if (isConstVec.empty()) { + return; + } + for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { + CallInst *CI = isConstVec[x]; + Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + CI->eraseFromParent(); + } + isConstVec.clear(); +} +void +AMDILPeepholeOpt::doAtomicConversionIfNeeded(Function &F) +{ + // Don't do anything if we don't have any atomic operations. + if (atomicFuncs.empty()) { + return; + } + // Change the function name for the atomic if it is required + uint32_t size = atomicFuncs.size(); + for (uint32_t x = 0; x < size; ++x) { + atomicFuncs[x].first->setOperand( + atomicFuncs[x].first->getNumOperands()-1, + atomicFuncs[x].second); + + } + mChanged = true; + if (mConvertAtomics) { + return; + } + // If we did not convert all of the atomics, then we need to make sure that + // the atomics that were not converted have their base pointers set to use the + // arena path. 
+ Function::arg_iterator argB = F.arg_begin(); + Function::arg_iterator argE = F.arg_end(); + AMDILKernelManager *KM = mSTM->getKernelManager(); + AMDILMachineFunctionInfo *mMFI = getAnalysis<MachineFunctionAnalysis>().getMF() + .getInfo<AMDILMachineFunctionInfo>(); + for (; argB != argE; ++argB) { + if (mSTM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) { + KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)); + mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)); + } else { + KM->setUAVID(argB,mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID)); + mMFI->uav_insert(mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID)); + } + } +} + +bool +AMDILPeepholeOpt::runOnFunction(Function &MF) +{ + mChanged = false; + mF = &MF; + mSTM = &TM.getSubtarget<AMDILSubtarget>(); + if (mDebug) { + MF.dump(); + } + mCTX = &MF.getType()->getContext(); + mConvertAtomics = true; + if (dumpAllIntoArena(MF)) { + for (Function::const_arg_iterator cab = MF.arg_begin(), + cae = MF.arg_end(); cab != cae; ++cab) { + const Argument *arg = cab; + AMDILKernelManager *KM = mSTM->getKernelManager(); + KM->setUAVID(getBasePointerValue(arg), + mSTM->device()->getResourceID(AMDILDevice::GLOBAL_ID)); + } + } + mRWGOpt = mSTM->getGlobalManager()->hasRWG(MF.getName()); + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), + std::bind1st(std::mem_fun(&AMDILPeepholeOpt::instLevelOptimizations), + this)); + + doAtomicConversionIfNeeded(MF); + doIsConstCallConversionIfNeeded(); + + if (mDebug) { + MF.dump(); + } + return mChanged; +} + +bool +AMDILPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) +{ + Instruction *inst = (*bbb); + CallInst *CI = dyn_cast<CallInst>(inst); + if (!CI) { + return false; + } + if (isSigned24BitOps(CI)) { + expandSigned24BitOps(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (isRWGLocalOpt(CI)) { + expandRWGLocalOpt(CI); + return false; + } + if (propagateSamplerInst(CI)) { + return false; + 
} + if (expandBFI(CI) || expandBFM(CI)) { + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (convertAccurateDivide(CI)) { + expandAccurateDivide(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + + StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); + if (calleeName.startswith("__amdil_is_constant")) { + // If we do not have optimizations, then this + // cannot be properly evaluated, so we add the + // call instruction to a vector and process + // them at the end of processing after the + // samplers have been correctly handled. + if (optLevel == CodeGenOpt::None) { + isConstVec.push_back(CI); + return false; + } else { + Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + } + + if (calleeName.equals("__amdil_is_asic_id_i32")) { + ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = CV; + if (Val) { + Val = ConstantInt::get(aType, + mSTM->device()->getDeviceFlag() & CV->getZExtValue()); + } else { + Val = ConstantInt::get(aType, 0); + } + CI->replaceAllUsesWith(Val); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); + if (!F) { + return false; + } + if (F->getName().startswith("__atom") && !CI->getNumUses() + && F->getName().find("_xchg") == StringRef::npos) { + std::string buffer(F->getName().str() + "_noret"); + F = dyn_cast<Function>( + F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); + atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F)); + } + + if (!mSTM->device()->isSupported(AMDILDeviceInfo::ArenaSegment) + && !mSTM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) { + return false; + } + if (!mConvertAtomics) { + 
return false; + } + StringRef name = F->getName(); + if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { + Value *ptr = CI->getOperand(0); + const Value *basePtr = getBasePointerValue(ptr); + const Argument *Arg = dyn_cast<Argument>(basePtr); + if (Arg) { + AMDILGlobalManager *GM = mSTM->getGlobalManager(); + int32_t id = GM->getArgID(Arg); + if (id >= 0) { + std::stringstream ss; + ss << name.data() << "_" << id << '\n'; + std::string val; + ss >> val; + F = dyn_cast<Function>( + F->getParent() ->getOrInsertFunction(val, F->getFunctionType())); + atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F)); + } else { + mConvertAtomics = false; + } + } else { + mConvertAtomics = false; + } + } + return false; +} + +bool +AMDILPeepholeOpt::setupBitInsert(Instruction *base, + Instruction *&src, + Constant *&mask, + Constant *&shift) +{ + if (!base) { + if (mDebug) { + dbgs() << "Null pointer passed into function.\n"; + } + return false; + } + bool andOp = false; + if (base->getOpcode() == Instruction::Shl) { + shift = dyn_cast<Constant>(base->getOperand(1)); + } else if (base->getOpcode() == Instruction::And) { + mask = dyn_cast<Constant>(base->getOperand(1)); + andOp = true; + } else { + if (mDebug) { + dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; + } + // If the base is neither a Shl or a And, we don't fit any of the patterns above. + return false; + } + src = dyn_cast<Instruction>(base->getOperand(0)); + if (!src) { + if (mDebug) { + dbgs() << "Failed setup since the base operand is not an instruction!\n"; + } + return false; + } + // If we find an 'and' operation, then we don't need to + // find the next operation as we already know the + // bits that are valid at this point. 
+ if (andOp) { + return true; + } + if (src->getOpcode() == Instruction::Shl && !shift) { + shift = dyn_cast<Constant>(src->getOperand(1)); + src = dyn_cast<Instruction>(src->getOperand(0)); + } else if (src->getOpcode() == Instruction::And && !mask) { + mask = dyn_cast<Constant>(src->getOperand(1)); + } + if (!mask && !shift) { + if (mDebug) { + dbgs() << "Failed setup since both mask and shift are NULL!\n"; + } + // Did not find a constant mask or a shift. + return false; + } + return true; +} +bool +AMDILPeepholeOpt::optimizeBitInsert(Instruction *inst) +{ + if (!inst) { + return false; + } + if (!inst->isBinaryOp()) { + return false; + } + if (inst->getOpcode() != Instruction::Or) { + return false; + } + if (optLevel == CodeGenOpt::None) { + return false; + } + // We want to do an optimization on a sequence of ops that in the end equals a + // single ISA instruction. + // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) + // Some simplified versions of this pattern are as follows: + // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 + // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E + // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B + // (A & B) | (D << F) when (1 << F) >= B + // (A << C) | (D & E) when (1 << C) >= E + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // The HD4XXX hardware doesn't support the ubit_insert instruction. + return false; + } + Type *aType = inst->getType(); + bool isVector = aType->isVectorTy(); + int numEle = 1; + // This optimization only works on 32bit integers. + if (aType->getScalarType() + != Type::getInt32Ty(inst->getContext())) { + return false; + } + if (isVector) { + const VectorType *VT = dyn_cast<VectorType>(aType); + numEle = VT->getNumElements(); + // We currently cannot support more than 4 elements in a intrinsic and we + // cannot support Vec3 types. + if (numEle > 4 || numEle == 3) { + return false; + } + } + // TODO: Handle vectors. 
+ if (isVector) { + if (mDebug) { + dbgs() << "!!! Vectors are not supported yet!\n"; + } + return false; + } + Instruction *LHSSrc = NULL, *RHSSrc = NULL; + Constant *LHSMask = NULL, *RHSMask = NULL; + Constant *LHSShift = NULL, *RHSShift = NULL; + Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); + Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); + if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { + if (mDebug) { + dbgs() << "Found an OR Operation that failed setup!\n"; + inst->dump(); + if (LHS) { LHS->dump(); } + if (LHSSrc) { LHSSrc->dump(); } + if (LHSMask) { LHSMask->dump(); } + if (LHSShift) { LHSShift->dump(); } + } + // There was an issue with the setup for BitInsert. + return false; + } + if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { + if (mDebug) { + dbgs() << "Found an OR Operation that failed setup!\n"; + inst->dump(); + if (RHS) { RHS->dump(); } + if (RHSSrc) { RHSSrc->dump(); } + if (RHSMask) { RHSMask->dump(); } + if (RHSShift) { RHSShift->dump(); } + } + // There was an issue with the setup for BitInsert. 
+ return false; + } + if (mDebug) { + dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; + dbgs() << "Op: "; inst->dump(); + dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } + dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } + } + Constant *offset = NULL; + Constant *width = NULL; + int32_t lhsMaskVal = 0, rhsMaskVal = 0; + int32_t lhsShiftVal = 0, rhsShiftVal = 0; + int32_t lhsMaskWidth = 0, rhsMaskWidth = 0; + int32_t lhsMaskOffset = 0, rhsMaskOffset = 0; + lhsMaskVal = (int32_t)(LHSMask + ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); + rhsMaskVal = (int32_t)(RHSMask + ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); + lhsShiftVal = (int32_t)(LHSShift + ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); + rhsShiftVal = (int32_t)(RHSShift + ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); + lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; + rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; + lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; + rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; + // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). + if (mDebug) { + dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")"); + dbgs() << (LHSShift ? 
" << C)" : ")") << " | ((D" ; + dbgs() << (RHSMask ? " & E)" : ")"); + dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n"); + dbgs() << "A = LHSSrc\t\tD = RHSSrc \n"; + dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n"; + dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n"; + dbgs() << "width(B) = " << lhsMaskWidth; + dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n"; + dbgs() << "offset(B) = " << lhsMaskOffset; + dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n"; + dbgs() << "Constraints: \n"; + dbgs() << "\t(1) B ^ E == 0\n"; + dbgs() << "\t(2-LHS) B is a mask\n"; + dbgs() << "\t(2-LHS) E is a mask\n"; + dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n"; + dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n"; + } + if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { + if (mDebug) { + dbgs() << lhsMaskVal << " ^ " << rhsMaskVal; + dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n"; + dbgs() << "Failed constraint 1!\n"; + } + return false; + } + if (mDebug) { + dbgs() << "LHS = " << lhsMaskOffset << ""; + dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = "; + dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)); + dbgs() << "\nRHS = " << rhsMaskOffset << ""; + dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = "; + dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)); + dbgs() << "\n"; + } + if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { + offset = ConstantInt::get(aType, lhsMaskOffset, false); + width = ConstantInt::get(aType, lhsMaskWidth, false); + RHSSrc = RHS; + if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { + if (mDebug) { + dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n"; + dbgs() << "Failed constraint 2!\n"; + } + return false; + } + if (!LHSShift) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", LHS); + } else if (lhsShiftVal != lhsMaskOffset) { + LHSSrc = 
BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", LHS); + } + if (mDebug) { + dbgs() << "Optimizing LHS!\n"; + } + } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { + offset = ConstantInt::get(aType, rhsMaskOffset, false); + width = ConstantInt::get(aType, rhsMaskWidth, false); + LHSSrc = RHSSrc; + RHSSrc = LHS; + if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { + if (mDebug) { + dbgs() << "Non-Mask: " << rhsMaskVal << "\n"; + dbgs() << "Failed constraint 2!\n"; + } + return false; + } + if (!RHSShift) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", RHS); + } else if (rhsShiftVal != rhsMaskOffset) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", RHS); + } + if (mDebug) { + dbgs() << "Optimizing RHS!\n"; + } + } else { + if (mDebug) { + dbgs() << "Failed constraint 3!\n"; + } + return false; + } + if (mDebug) { + dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } + } + if (!offset || !width) { + if (mDebug) { + dbgs() << "Either width or offset are NULL, failed detection!\n"; + } + return false; + } + // Lets create the function signature. 
+ std::vector<Type *> callTypes; + callTypes.push_back(aType); + callTypes.push_back(aType); + callTypes.push_back(aType); + callTypes.push_back(aType); + FunctionType *funcType = FunctionType::get(aType, callTypes, false); + std::string name = "__amdil_ubit_insert"; + if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } + Function *Func = + dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[4] = { + width, + offset, + LHSSrc, + RHSSrc + }; + CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); + if (mDebug) { + dbgs() << "Old Inst: "; + inst->dump(); + dbgs() << "New Inst: "; + CI->dump(); + dbgs() << "\n\n"; + } + CI->insertBefore(inst); + inst->replaceAllUsesWith(CI); + return true; +} + +bool +AMDILPeepholeOpt::optimizeBitExtract(Instruction *inst) +{ + if (!inst) { + return false; + } + if (!inst->isBinaryOp()) { + return false; + } + if (inst->getOpcode() != Instruction::And) { + return false; + } + if (optLevel == CodeGenOpt::None) { + return false; + } + // We want to do some simple optimizations on Shift right/And patterns. The + // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a + // value smaller than 32 and C is a mask. If C is a constant value, then the + // following transformation can occur. For signed integers, it turns into the + // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned + // integers, it turns into the function call dst = + // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract + // can be found in Section 7.9 of the ATI IL spec of the stream SDK for + // Evergreen hardware. + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX) { + // This does not work on HD4XXX hardware. 
+ return false; + } + Type *aType = inst->getType(); + bool isVector = aType->isVectorTy(); + int numEle = 1; + // This only works on 32bit integers + if (aType->getScalarType() + != Type::getInt32Ty(inst->getContext())) { + return false; + } + if (isVector) { + const VectorType *VT = dyn_cast<VectorType>(aType); + numEle = VT->getNumElements(); + // We currently cannot support more than 4 elements in a intrinsic and we + // cannot support Vec3 types. + if (numEle > 4 || numEle == 3) { + return false; + } + } + BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); + // If the first operand is not a shift instruction, then we can return as it + // doesn't match this pattern. + if (!ShiftInst || !ShiftInst->isShift()) { + return false; + } + // If we are a shift left, then we need don't match this pattern. + if (ShiftInst->getOpcode() == Instruction::Shl) { + return false; + } + bool isSigned = ShiftInst->isArithmeticShift(); + Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); + Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); + // Lets make sure that the shift value and the and mask are constant integers. 
+ if (!AndMask || !ShrVal) { + return false; + } + Constant *newMaskConst; + Constant *shiftValConst; + if (isVector) { + // Handle the vector case + std::vector<Constant *> maskVals; + std::vector<Constant *> shiftVals; + ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); + ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); + Type *scalarType = AndMaskVec->getType()->getScalarType(); + assert(AndMaskVec->getNumOperands() == + ShrValVec->getNumOperands() && "cannot have a " + "combination where the number of elements to a " + "shift and an and are different!"); + for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { + ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); + ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); + if (!AndCI || !ShiftIC) { + return false; + } + uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); + if (!isMask_32(maskVal)) { + return false; + } + maskVal = (uint32_t)CountTrailingOnes_32(maskVal); + uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); + // If the mask or shiftval is greater than the bitcount, then break out. + if (maskVal >= 32 || shiftVal >= 32) { + return false; + } + // If the mask val is greater than the the number of original bits left + // then this optimization is invalid. + if (maskVal > (32 - shiftVal)) { + return false; + } + maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); + shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); + } + newMaskConst = ConstantVector::get(maskVals); + shiftValConst = ConstantVector::get(shiftVals); + } else { + // Handle the scalar case + uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); + // This must be a mask value where all lower bits are set to 1 and then any + // bit higher is set to 0. 
+ if (!isMask_32(maskVal)) { + return false; + } + maskVal = (uint32_t)CountTrailingOnes_32(maskVal); + // Count the number of bits set in the mask, this is the width of the + // resulting bit set that is extracted from the source value. + uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); + // If the mask or shift val is greater than the bitcount, then break out. + if (maskVal >= 32 || shiftVal >= 32) { + return false; + } + // If the mask val is greater than the the number of original bits left then + // this optimization is invalid. + if (maskVal > (32 - shiftVal)) { + return false; + } + newMaskConst = ConstantInt::get(aType, maskVal, isSigned); + shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); + } + // Lets create the function signature. + std::vector<Type *> callTypes; + callTypes.push_back(aType); + callTypes.push_back(aType); + callTypes.push_back(aType); + FunctionType *funcType = FunctionType::get(aType, callTypes, false); + std::string name = "__amdil_ubit_extract"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + // Lets create the function. 
+ Function *Func = + dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[3] = { + newMaskConst, + shiftValConst, + ShiftInst->getOperand(0) + }; + // Lets create the Call with the operands + CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); + CI->insertBefore(inst); + inst->replaceAllUsesWith(CI); + return true; +} + +bool +AMDILPeepholeOpt::expandBFI(CallInst *CI) +{ + if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + if (!LHS->getName().startswith("__amdil_bfi")) { + return false; + } + Type* type = CI->getOperand(0)->getType(); + Constant *negOneConst = NULL; + if (type->isVectorTy()) { + std::vector<Constant *> negOneVals; + negOneConst = ConstantInt::get(CI->getContext(), + APInt(32, StringRef("-1"), 10)); + for (size_t x = 0, + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { + negOneVals.push_back(negOneConst); + } + negOneConst = ConstantVector::get(negOneVals); + } else { + negOneConst = ConstantInt::get(CI->getContext(), + APInt(32, StringRef("-1"), 10)); + } + // __amdil_bfi => (A & B) | (~A & C) + BinaryOperator *lhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(0), + CI->getOperand(1), "bfi_and", CI); + BinaryOperator *rhs = + BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, + "bfi_not", CI); + rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), + "bfi_and", CI); + lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); + CI->replaceAllUsesWith(lhs); + return true; +} + +bool +AMDILPeepholeOpt::expandBFM(CallInst *CI) +{ + if (!CI || mSTM->calVersion() <= CAL_VERSION_SC_150) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + if (!LHS->getName().startswith("__amdil_bfm")) { + return false; + } + // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 
0x1f) + Constant *newMaskConst = NULL; + Constant *newShiftConst = NULL; + Type* type = CI->getOperand(0)->getType(); + if (type->isVectorTy()) { + std::vector<Constant*> newMaskVals, newShiftVals; + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); + for (size_t x = 0, + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { + newMaskVals.push_back(newMaskConst); + newShiftVals.push_back(newShiftConst); + } + newMaskConst = ConstantVector::get(newMaskVals); + newShiftConst = ConstantVector::get(newShiftVals); + } else { + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); + } + BinaryOperator *lhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(0), + newMaskConst, "bfm_mask", CI); + lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, + lhs, "bfm_shl", CI); + lhs = BinaryOperator::Create(Instruction::Sub, lhs, + newShiftConst, "bfm_sub", CI); + BinaryOperator *rhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(1), + newMaskConst, "bfm_mask", CI); + lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); + CI->replaceAllUsesWith(lhs); + return true; +} + +bool +AMDILPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) +{ + Instruction *inst = (*bbb); + if (optimizeCallInst(bbb)) { + return true; + } + if (optimizeBitExtract(inst)) { + return false; + } + if (optimizeBitInsert(inst)) { + return false; + } + if (correctMisalignedMemOp(inst)) { + return false; + } + return false; +} +bool +AMDILPeepholeOpt::correctMisalignedMemOp(Instruction *inst) +{ + LoadInst *linst = dyn_cast<LoadInst>(inst); + StoreInst *sinst = dyn_cast<StoreInst>(inst); + unsigned alignment; + Type* Ty = inst->getType(); + if (linst) { + alignment = linst->getAlignment(); + Ty = inst->getType(); + } else if (sinst) { + alignment = sinst->getAlignment(); + Ty = 
sinst->getValueOperand()->getType(); + } else { + return false; + } + unsigned size = getTypeSize(Ty); + if (size == alignment || size < alignment) { + return false; + } + if (!Ty->isStructTy()) { + return false; + } + if (alignment < 4) { + if (linst) { + linst->setAlignment(0); + return true; + } else if (sinst) { + sinst->setAlignment(0); + return true; + } + } + return false; +} +bool +AMDILPeepholeOpt::isSigned24BitOps(CallInst *CI) +{ + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + std::string namePrefix = LHS->getName().substr(0, 14); + if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" + && namePrefix != "__amdil__imul24_high") { + return false; + } + if (mSTM->device()->usesHardware(AMDILDeviceInfo::Signed24BitOps)) { + return false; + } + return true; +} + +void +AMDILPeepholeOpt::expandSigned24BitOps(CallInst *CI) +{ + assert(isSigned24BitOps(CI) && "Must be a " + "signed 24 bit operation to call this function!"); + Value *LHS = CI->getOperand(CI->getNumOperands()-1); + // On 7XX and 8XX we do not have signed 24bit, so we need to + // expand it to the following: + // imul24 turns into 32bit imul + // imad24 turns into 32bit imad + // imul24_high turns into 32bit imulhigh + if (LHS->getName().substr(0, 14) == "__amdil_imad24") { + Type *aType = CI->getOperand(0)->getType(); + bool isVector = aType->isVectorTy(); + int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; + std::vector<Type*> callTypes; + callTypes.push_back(CI->getOperand(0)->getType()); + callTypes.push_back(CI->getOperand(1)->getType()); + callTypes.push_back(CI->getOperand(2)->getType()); + FunctionType *funcType = + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); + std::string name = "__amdil_imad"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + Function *Func = dyn_cast<Function>( + CI->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[3] = { + CI->getOperand(0), + CI->getOperand(1), + CI->getOperand(2) + }; + CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); + nCI->insertBefore(CI); + CI->replaceAllUsesWith(nCI); + } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { + BinaryOperator *mulOp = + BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), + CI->getOperand(1), "imul24", CI); + CI->replaceAllUsesWith(mulOp); + } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { + Type *aType = CI->getOperand(0)->getType(); + + bool isVector = aType->isVectorTy(); + int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; + std::vector<Type*> callTypes; + callTypes.push_back(CI->getOperand(0)->getType()); + callTypes.push_back(CI->getOperand(1)->getType()); + FunctionType *funcType = + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); + std::string name = "__amdil_imul_high"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + Function *Func = dyn_cast<Function>( + CI->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[2] = { + CI->getOperand(0), + CI->getOperand(1) + }; + CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); + nCI->insertBefore(CI); + CI->replaceAllUsesWith(nCI); + } +} + +bool +AMDILPeepholeOpt::isRWGLocalOpt(CallInst *CI) +{ + return (CI != NULL && mRWGOpt + && CI->getOperand(CI->getNumOperands() - 1)->getName() + == "__amdil_get_local_size_int"); +} + +void +AMDILPeepholeOpt::expandRWGLocalOpt(CallInst *CI) +{ + assert(isRWGLocalOpt(CI) && + "This optmization only works when the call inst is get_local_size!"); + std::vector<Constant *> consts; + for (uint32_t x = 0; x < 3; ++x) { + uint32_t val = mSTM->getGlobalManager()->getLocal(mF->getName(), x); + consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), val)); + } + consts.push_back(ConstantInt::get(Type::getInt32Ty(*mCTX), 0)); + Value *cVec = ConstantVector::get(consts); + CI->replaceAllUsesWith(cVec); + ++LocalFuncs; + return; +} + +bool +AMDILPeepholeOpt::convertAccurateDivide(CallInst *CI) +{ + if (!CI) { + return false; + } + if (mSTM->device()->getGeneration() == AMDILDeviceInfo::HD6XXX + && (mSTM->getDeviceName() == "cayman")) { + return false; + } + return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) + == "__amdil_improved_div"; +} + +void +AMDILPeepholeOpt::expandAccurateDivide(CallInst *CI) +{ + assert(convertAccurateDivide(CI) + && "expanding accurate divide can only happen if it is 
expandable!"); + BinaryOperator *divOp = + BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), + CI->getOperand(1), "fdiv32", CI); + CI->replaceAllUsesWith(divOp); +} + +bool +AMDILPeepholeOpt::propagateSamplerInst(CallInst *CI) +{ + if (optLevel != CodeGenOpt::None) { + return false; + } + + if (!CI) { + return false; + } + + unsigned funcNameIdx = 0; + funcNameIdx = CI->getNumOperands() - 1; + StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); + if (calleeName != "__amdil_image2d_read_norm" + && calleeName != "__amdil_image2d_read_unnorm" + && calleeName != "__amdil_image3d_read_norm" + && calleeName != "__amdil_image3d_read_unnorm") { + return false; + } + + unsigned samplerIdx = 2; + samplerIdx = 1; + Value *sampler = CI->getOperand(samplerIdx); + LoadInst *lInst = dyn_cast<LoadInst>(sampler); + if (!lInst) { + return false; + } + + if (lInst->getPointerAddressSpace() != AMDILAS::PRIVATE_ADDRESS) { + return false; + } + + GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); + // If we are loading from what is not a global value, then we + // fail and return. + if (!gv) { + return false; + } + + // If we don't have an initializer or we have an initializer and + // the initializer is not a 32bit integer, we fail. + if (!gv->hasInitializer() + || !gv->getInitializer()->getType()->isIntegerTy(32)) { + return false; + } + + // Now that we have the global variable initializer, lets replace + // all uses of the load instruction with the samplerVal and + // reparse the __amdil_is_constant() function. 
+ Constant *samplerVal = gv->getInitializer(); + lInst->replaceAllUsesWith(samplerVal); + return true; +} + +bool +AMDILPeepholeOpt::doInitialization(Module &M) +{ + return false; +} + +bool +AMDILPeepholeOpt::doFinalization(Module &M) +{ + return false; +} + +void +AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} diff --git a/src/gallium/drivers/radeon/AMDILPointerManager.cpp b/src/gallium/drivers/radeon/AMDILPointerManager.cpp new file mode 100644 index 00000000000..9cac61cb718 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILPointerManager.cpp @@ -0,0 +1,2551 @@ +//===-------- AMDILPointerManager.cpp - Manage Pointers for HW-------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// Implementation for the AMDILPointerManager classes. See header file for +// more documentation of class. 
+// TODO: This fails when function calls are enabled, must always be inlined +//===----------------------------------------------------------------------===// +#include "AMDILPointerManager.h" +#include "AMDILCompilerErrors.h" +#include "AMDILDeviceInfo.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Twine.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/GlobalValue.h" +#include "llvm/Instructions.h" +#include "llvm/Metadata.h" +#include "llvm/Module.h" +#include "llvm/Support/FormattedStream.h" + +#include <stdio.h> +using namespace llvm; +char AMDILPointerManager::ID = 0; +namespace llvm { + FunctionPass* + createAMDILPointerManager(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + { + return tm.getSubtarget<AMDILSubtarget>() + .device()->getPointerManager(tm AMDIL_OPT_LEVEL_VAR); + } +} + +AMDILPointerManager::AMDILPointerManager( + TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) : + MachineFunctionPass(ID), + TM(tm) +{ + mDebug = DEBUGME; + initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry()); +} + +AMDILPointerManager::~AMDILPointerManager() +{ +} + +const char* +AMDILPointerManager::getPassName() const +{ + return "AMD IL Default Pointer Manager Pass"; +} + +void +AMDILPointerManager::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesAll(); + AU.addRequiredID(MachineDominatorsID); + MachineFunctionPass::getAnalysisUsage(AU); +} + +AMDILEGPointerManager::AMDILEGPointerManager( + TargetMachine &tm + AMDIL_OPT_LEVEL_DECL) : + AMDILPointerManager(tm AMDIL_OPT_LEVEL_VAR), + TM(tm) +{ +} + 
+AMDILEGPointerManager::~AMDILEGPointerManager() +{ +} +std::string +findSamplerName(MachineInstr* MI, + FIPMap &FIToPtrMap, + RVPVec &lookupTable, + const TargetMachine *TM) +{ + std::string sampler = "unknown"; + assert(MI->getNumOperands() == 5 && "Only an " + "image read instruction with 5 arguments can " + "have a sampler."); + assert(MI->getOperand(3).isReg() && + "Argument 3 must be a register to call this function"); + unsigned reg = MI->getOperand(3).getReg(); + // If this register points to an argument, then + // we can return the argument name. + if (lookupTable[reg].second && dyn_cast<Argument>(lookupTable[reg].second)) { + return lookupTable[reg].second->getName(); + } + // Otherwise the sampler is coming from memory somewhere. + // If the sampler memory location can be tracked, then + // we ascertain the sampler name that way. + // The most common case is when optimizations are disabled + // or mem2reg is not enabled, then the sampler when it is + // an argument is passed through the frame index. + + // In the optimized case, the instruction that defined + // register from operand #3 is a private load. + MachineRegisterInfo ®Info = MI->getParent()->getParent()->getRegInfo(); + assert(!regInfo.def_empty(reg) + && "We don't have any defs of this register, but we aren't an argument!"); + MachineOperand *defOp = regInfo.getRegUseDefListHead(reg); + MachineInstr *defMI = defOp->getParent(); + if (isPrivateInst(TM->getInstrInfo(), defMI) && isLoadInst(TM->getInstrInfo(), defMI)) { + if (defMI->getOperand(1).isFI()) { + RegValPair &fiRVP = FIToPtrMap[reg]; + if (fiRVP.second && dyn_cast<Argument>(fiRVP.second)) { + return fiRVP.second->getName(); + } else { + // FIXME: Fix the case where the value stored is not a kernel argument. + assert(!"Found a private load of a sampler where the value isn't an argument!"); + } + } else { + // FIXME: Fix the case where someone dynamically loads a sampler value + // from private memory. 
This is problematic because we need to know the + // sampler value at compile time and if it is dynamically loaded, we won't + // know what sampler value to use. + assert(!"Found a private load of a sampler that isn't from a frame index!"); + } + } else { + // FIXME: Handle the case where the def is neither a private instruction + // and not a load instruction. This shouldn't occur, but putting an assertion + // just to make sure that it doesn't. + assert(!"Found a case which we don't handle."); + } + return sampler; +} + +const char* +AMDILEGPointerManager::getPassName() const +{ + return "AMD IL EG Pointer Manager Pass"; +} + +// Helper function to determine if the current pointer is from the +// local, region or private address spaces. + static bool +isLRPInst(MachineInstr *MI, + const AMDILTargetMachine *ATM) +{ + const AMDILSubtarget *STM + = ATM->getSubtargetImpl(); + if (!MI) { + return false; + } + if ((isRegionInst(ATM->getInstrInfo(), MI) + && STM->device()->usesHardware(AMDILDeviceInfo::RegionMem)) + || (isLocalInst(ATM->getInstrInfo(), MI) + && STM->device()->usesHardware(AMDILDeviceInfo::LocalMem)) + || (isPrivateInst(ATM->getInstrInfo(), MI) + && STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem))) { + return true; + } + return false; +} + +/// Helper function to determine if the I/O instruction uses +/// global device memory or not. 
+static bool +usesGlobal( + const AMDILTargetMachine *ATM, + MachineInstr *MI) { + const AMDILSubtarget *STM + = ATM->getSubtargetImpl(); + switch(MI->getOpcode()) { + ExpandCaseToAllTypes(AMDIL::GLOBALSTORE); + ExpandCaseToAllTruncTypes(AMDIL::GLOBALTRUNCSTORE); + ExpandCaseToAllTypes(AMDIL::GLOBALLOAD); + ExpandCaseToAllTypes(AMDIL::GLOBALSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::GLOBALZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::GLOBALAEXTLOAD); + return true; + ExpandCaseToAllTypes(AMDIL::REGIONLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONAEXTLOAD); + ExpandCaseToAllTypes(AMDIL::REGIONSTORE); + ExpandCaseToAllTruncTypes(AMDIL::REGIONTRUNCSTORE); + return !STM->device()->usesHardware(AMDILDeviceInfo::RegionMem); + ExpandCaseToAllTypes(AMDIL::LOCALLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALAEXTLOAD); + ExpandCaseToAllTypes(AMDIL::LOCALSTORE); + ExpandCaseToAllTruncTypes(AMDIL::LOCALTRUNCSTORE); + return !STM->device()->usesHardware(AMDILDeviceInfo::LocalMem); + ExpandCaseToAllTypes(AMDIL::CPOOLLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CPOOLAEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTSEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTAEXTLOAD); + ExpandCaseToAllTypes(AMDIL::CONSTANTZEXTLOAD); + return !STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem); + ExpandCaseToAllTypes(AMDIL::PRIVATELOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATESEXTLOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATEZEXTLOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATEAEXTLOAD); + ExpandCaseToAllTypes(AMDIL::PRIVATESTORE); + ExpandCaseToAllTruncTypes(AMDIL::PRIVATETRUNCSTORE); + return !STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem); + default: + return false; + } + return false; 
+} + +// Helper function that allocates the default resource ID for the +// respective I/O types. +static void +allocateDefaultID( + const AMDILTargetMachine *ATM, + AMDILAS::InstrResEnc &curRes, + MachineInstr *MI, + bool mDebug) +{ + AMDILMachineFunctionInfo *mMFI = + MI->getParent()->getParent()->getInfo<AMDILMachineFunctionInfo>(); + const AMDILSubtarget *STM + = ATM->getSubtargetImpl(); + if (mDebug) { + dbgs() << "Assigning instruction to default ID. Inst:"; + MI->dump(); + } + // If we use global memory, lets set the Operand to + // the ARENA_UAV_ID. + if (usesGlobal(ATM, MI)) { + curRes.bits.ResourceID = + STM->device()->getResourceID(AMDILDevice::GLOBAL_ID); + if (isAtomicInst(ATM->getInstrInfo(), MI)) { + MI->getOperand(MI->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + AMDILKernelManager *KM = STM->getKernelManager(); + if (curRes.bits.ResourceID == 8 + && !STM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) { + KM->setUAVID(NULL, curRes.bits.ResourceID); + mMFI->uav_insert(curRes.bits.ResourceID); + } + } else if (isPrivateInst(ATM->getInstrInfo(), MI)) { + curRes.bits.ResourceID = + STM->device()->getResourceID(AMDILDevice::SCRATCH_ID); + } else if (isLocalInst(ATM->getInstrInfo(), MI) || isLocalAtomic(ATM->getInstrInfo(), MI)) { + curRes.bits.ResourceID = + STM->device()->getResourceID(AMDILDevice::LDS_ID); + AMDILMachineFunctionInfo *mMFI = + MI->getParent()->getParent()->getInfo<AMDILMachineFunctionInfo>(); + mMFI->setUsesLocal(); + if (isAtomicInst(ATM->getInstrInfo(), MI)) { + assert(curRes.bits.ResourceID && "Atomic resource ID " + "cannot be zero!"); + MI->getOperand(MI->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + } else if (isRegionInst(ATM->getInstrInfo(), MI) || isRegionAtomic(ATM->getInstrInfo(), MI)) { + curRes.bits.ResourceID = + STM->device()->getResourceID(AMDILDevice::GDS_ID); + AMDILMachineFunctionInfo *mMFI = + MI->getParent()->getParent()->getInfo<AMDILMachineFunctionInfo>(); + 
mMFI->setUsesRegion(); + if (isAtomicInst(ATM->getInstrInfo(), MI)) { + assert(curRes.bits.ResourceID && "Atomic resource ID " + "cannot be zero!"); + (MI)->getOperand((MI)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + } else if (isConstantInst(ATM->getInstrInfo(), MI)) { + // If we are unknown constant instruction and the base pointer is known. + // Set the resource ID accordingly, otherwise use the default constant ID. + // FIXME: this should not require the base pointer to know what constant + // it is from. + AMDILGlobalManager *GM = STM->getGlobalManager(); + MachineFunction *MF = MI->getParent()->getParent(); + if (GM->isKernel(MF->getFunction()->getName())) { + const kernel &krnl = GM->getKernel(MF->getFunction()->getName()); + const Value *V = getBasePointerValue(MI); + if (V && !dyn_cast<AllocaInst>(V)) { + curRes.bits.ResourceID = GM->getConstPtrCB(krnl, V->getName()); + curRes.bits.HardwareInst = 1; + } else if (V && dyn_cast<AllocaInst>(V)) { + // FIXME: Need a better way to fix this. Requires a rewrite of how + // we lower global addresses to various address spaces. + // So for now, lets assume that there is only a single + // constant buffer that can be accessed from a load instruction + // that is derived from an alloca instruction. 
+ curRes.bits.ResourceID = 2; + curRes.bits.HardwareInst = 1; + } else { + if (isStoreInst(ATM->getInstrInfo(), MI)) { + if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on instruction: "; + MI->dump(); + } + curRes.bits.ByteStore = 1; + } + curRes.bits.ResourceID = STM->device()->getResourceID(AMDILDevice::CONSTANT_ID); + } + } else { + if (isStoreInst(ATM->getInstrInfo(), MI)) { + if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on instruction: "; + MI->dump(); + } + curRes.bits.ByteStore = 1; + } + curRes.bits.ResourceID = STM->device()->getResourceID(AMDILDevice::GLOBAL_ID); + AMDILKernelManager *KM = STM->getKernelManager(); + KM->setUAVID(NULL, curRes.bits.ResourceID); + mMFI->uav_insert(curRes.bits.ResourceID); + } + } else if (isAppendInst(ATM->getInstrInfo(), MI)) { + unsigned opcode = MI->getOpcode(); + if (opcode == AMDIL::APPEND_ALLOC + || opcode == AMDIL::APPEND_ALLOC_NORET) { + curRes.bits.ResourceID = 1; + } else { + curRes.bits.ResourceID = 2; + } + } + setAsmPrinterFlags(MI, curRes); +} + +// Function that parses the arguments and updates the lookupTable with the +// pointer -> register mapping. This function also checks for cacheable +// pointers and updates the CacheableSet with the arguments that +// can be cached based on the readonlypointer annotation. The final +// purpose of this function is to update the imageSet and counterSet +// with all pointers that are either images or atomic counters. 
+uint32_t +parseArguments(MachineFunction &MF, + RVPVec &lookupTable, + const AMDILTargetMachine *ATM, + CacheableSet &cacheablePtrs, + ImageSet &imageSet, + AppendSet &counterSet, + bool mDebug) +{ + const AMDILSubtarget *STM + = ATM->getSubtargetImpl(); + uint32_t writeOnlyImages = 0; + uint32_t readOnlyImages = 0; + std::string cachedKernelName = "llvm.readonlypointer.annotations."; + cachedKernelName.append(MF.getFunction()->getName()); + GlobalVariable *GV = MF.getFunction()->getParent() + ->getGlobalVariable(cachedKernelName); + unsigned cbNum = 0; + unsigned regNum = AMDIL::R1; + AMDILMachineFunctionInfo *mMFI = MF.getInfo<AMDILMachineFunctionInfo>(); + for (Function::const_arg_iterator I = MF.getFunction()->arg_begin(), + E = MF.getFunction()->arg_end(); I != E; ++I) { + const Argument *curArg = I; + if (mDebug) { + dbgs() << "Argument: "; + curArg->dump(); + } + Type *curType = curArg->getType(); + // We are either a scalar or vector type that + // is passed by value that is not a opaque/struct + // type. We just need to increment regNum + // the correct number of times to match the number + // of registers that it takes up. + if (curType->isFPOrFPVectorTy() || + curType->isIntOrIntVectorTy()) { + // We are scalar, so increment once and + // move on + if (!curType->isVectorTy()) { + lookupTable[regNum] = std::make_pair<unsigned, const Value*>(~0U, curArg); + ++regNum; + ++cbNum; + continue; + } + VectorType *VT = dyn_cast<VectorType>(curType); + // We are a vector type. If we are 64bit type, then + // we increment length / 2 times, otherwise we + // increment length / 4 times. The only corner case + // is with vec3 where the vector gets scalarized and + // therefor we need a loop count of 3. 
+ size_t loopCount = VT->getNumElements(); + if (loopCount != 3) { + if (VT->getScalarSizeInBits() == 64) { + loopCount = loopCount >> 1; + } else { + loopCount = (loopCount + 2) >> 2; + } + cbNum += loopCount; + } else { + cbNum++; + } + while (loopCount--) { + lookupTable[regNum] = std::make_pair<unsigned, const Value*>(~0U, curArg); + ++regNum; + } + } else if (curType->isPointerTy()) { + Type *CT = dyn_cast<PointerType>(curType)->getElementType(); + const StructType *ST = dyn_cast<StructType>(CT); + if (ST && ST->isOpaque()) { + StringRef name = ST->getName(); + bool i1d_type = name == "struct._image1d_t"; + bool i1da_type = name == "struct._image1d_array_t"; + bool i1db_type = name == "struct._image1d_buffer_t"; + bool i2d_type = name == "struct._image2d_t"; + bool i2da_type = name == "struct._image2d_array_t"; + bool i3d_type = name == "struct._image3d_t"; + bool c32_type = name == "struct._counter32_t"; + bool c64_type = name == "struct._counter64_t"; + if (i2d_type || i3d_type || i2da_type || + i1d_type || i1db_type || i1da_type) { + imageSet.insert(I); + uint32_t imageNum = readOnlyImages + writeOnlyImages; + if (STM->getGlobalManager() + ->isReadOnlyImage(MF.getFunction()->getName(), imageNum)) { + if (mDebug) { + dbgs() << "Pointer: '" << curArg->getName() + << "' is a read only image # " << readOnlyImages << "!\n"; + } + // We store the cbNum along with the image number so that we can + // correctly encode the 'info' intrinsics. + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + ((cbNum << 16 | readOnlyImages++), curArg); + } else if (STM->getGlobalManager() + ->isWriteOnlyImage(MF.getFunction()->getName(), imageNum)) { + if (mDebug) { + dbgs() << "Pointer: '" << curArg->getName() + << "' is a write only image # " << writeOnlyImages << "!\n"; + } + // We store the cbNum along with the image number so that we can + // correctly encode the 'info' intrinsics. 
+ lookupTable[regNum] = std::make_pair<unsigned, const Value*> + ((cbNum << 16 | writeOnlyImages++), curArg); + } else { + assert(!"Read/Write images are not supported!"); + } + ++regNum; + cbNum += 2; + continue; + } else if (c32_type || c64_type) { + if (mDebug) { + dbgs() << "Pointer: '" << curArg->getName() + << "' is a " << (c32_type ? "32" : "64") + << " bit atomic counter type!\n"; + } + counterSet.push_back(I); + } + } + + if (STM->device()->isSupported(AMDILDeviceInfo::CachedMem) + && GV && GV->hasInitializer()) { + const ConstantArray *nameArray + = dyn_cast_or_null<ConstantArray>(GV->getInitializer()); + if (nameArray) { + for (unsigned x = 0, y = nameArray->getNumOperands(); x < y; ++x) { + const GlobalVariable *gV= dyn_cast_or_null<GlobalVariable>( + nameArray->getOperand(x)->getOperand(0)); + const ConstantDataArray *argName = + dyn_cast_or_null<ConstantDataArray>(gV->getInitializer()); + if (!argName) { + continue; + } + std::string argStr = argName->getAsString(); + std::string curStr = curArg->getName(); + if (!strcmp(argStr.data(), curStr.data())) { + if (mDebug) { + dbgs() << "Pointer: '" << curArg->getName() + << "' is cacheable!\n"; + } + cacheablePtrs.insert(curArg); + } + } + } + } + uint32_t as = dyn_cast<PointerType>(curType)->getAddressSpace(); + // Handle the case where the kernel argument is a pointer + if (mDebug) { + dbgs() << "Pointer: " << curArg->getName() << " is assigned "; + if (as == AMDILAS::GLOBAL_ADDRESS) { + dbgs() << "uav " << STM->device() + ->getResourceID(AMDILDevice::GLOBAL_ID); + } else if (as == AMDILAS::PRIVATE_ADDRESS) { + dbgs() << "scratch " << STM->device() + ->getResourceID(AMDILDevice::SCRATCH_ID); + } else if (as == AMDILAS::LOCAL_ADDRESS) { + dbgs() << "lds " << STM->device() + ->getResourceID(AMDILDevice::LDS_ID); + } else if (as == AMDILAS::CONSTANT_ADDRESS) { + dbgs() << "cb " << STM->device() + ->getResourceID(AMDILDevice::CONSTANT_ID); + } else if (as == AMDILAS::REGION_ADDRESS) { + dbgs() << "gds " << 
STM->device() + ->getResourceID(AMDILDevice::GDS_ID); + } else { + assert(!"Found an address space that we don't support!"); + } + dbgs() << " @ register " << regNum << ". Inst: "; + curArg->dump(); + } + switch (as) { + default: + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + (STM->device()->getResourceID(AMDILDevice::GLOBAL_ID), curArg); + break; + case AMDILAS::LOCAL_ADDRESS: + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + (STM->device()->getResourceID(AMDILDevice::LDS_ID), curArg); + mMFI->setHasLocalArg(); + break; + case AMDILAS::REGION_ADDRESS: + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + (STM->device()->getResourceID(AMDILDevice::GDS_ID), curArg); + mMFI->setHasRegionArg(); + break; + case AMDILAS::CONSTANT_ADDRESS: + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + (STM->device()->getResourceID(AMDILDevice::CONSTANT_ID), curArg); + break; + case AMDILAS::PRIVATE_ADDRESS: + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + (STM->device()->getResourceID(AMDILDevice::SCRATCH_ID), curArg); + break; + } + // In this case we need to increment it once. + ++regNum; + ++cbNum; + } else { + // Is anything missing that is legal in CL? + assert(0 && "Current type is not supported!"); + lookupTable[regNum] = std::make_pair<unsigned, const Value*> + (STM->device()->getResourceID(AMDILDevice::GLOBAL_ID), curArg); + ++regNum; + ++cbNum; + } + } + return writeOnlyImages; +} +// The call stack is interesting in that even in SSA form, it assigns +// registers to the same value's over and over again. So we need to +// ignore the values that are assigned and just deal with the input +// and return registers. 
+static void +parseCall( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + RVPVec &lookupTable, + MachineBasicBlock::iterator &mBegin, + MachineBasicBlock::iterator mEnd, + bool mDebug) +{ + SmallVector<unsigned, 8> inputRegs; + AMDILAS::InstrResEnc curRes; + if (mDebug) { + dbgs() << "Parsing Call Stack Start.\n"; + } + MachineBasicBlock::iterator callInst = mBegin; + MachineInstr *CallMI = callInst; + getAsmPrinterFlags(CallMI, curRes); + MachineInstr *MI = --mBegin; + unsigned reg = AMDIL::R1; + // First we need to check the input registers. + do { + // We stop if we hit the beginning of the call stack + // adjustment. + if (MI->getOpcode() == AMDIL::ADJCALLSTACKDOWN + || MI->getOpcode() == AMDIL::ADJCALLSTACKUP + || MI->getNumOperands() != 2 + || !MI->getOperand(0).isReg()) { + break; + } + reg = MI->getOperand(0).getReg(); + if (MI->getOperand(1).isReg()) { + unsigned reg1 = MI->getOperand(1).getReg(); + inputRegs.push_back(reg1); + if (lookupTable[reg1].second) { + curRes.bits.PointerPath = 1; + } + } + lookupTable.erase(reg); + if ((signed)reg < 0 + || mBegin == CallMI->getParent()->begin()) { + break; + } + MI = --mBegin; + } while (1); + mBegin = callInst; + MI = ++mBegin; + // If the next registers operand 1 is not a register or that register + // is not R1, then we don't have any return values. + if (MI->getNumOperands() == 2 + && MI->getOperand(1).isReg() + && MI->getOperand(1).getReg() == AMDIL::R1) { + // Next we check the output register. + reg = MI->getOperand(0).getReg(); + // Now we link the inputs to the output. 
+ for (unsigned x = 0; x < inputRegs.size(); ++x) { + if (lookupTable[inputRegs[x]].second) { + curRes.bits.PointerPath = 1; + lookupTable[reg] = lookupTable[inputRegs[x]]; + InstToPtrMap[CallMI].insert( + lookupTable[reg].second); + break; + } + } + lookupTable.erase(MI->getOperand(1).getReg()); + } + setAsmPrinterFlags(CallMI, curRes); + if (mDebug) { + dbgs() << "Parsing Call Stack End.\n"; + } + return; +} + +// Detect if the current instruction conflicts with another instruction +// and add the instruction to the correct location accordingly. +static void +detectConflictInst( + MachineInstr *MI, + AMDILAS::InstrResEnc &curRes, + RVPVec &lookupTable, + InstPMap &InstToPtrMap, + bool isLoadStore, + unsigned reg, + unsigned dstReg, + bool mDebug) +{ + // If the instruction does not have a point path flag + // associated with it, then we know that no other pointer + // hits this instruciton. + if (!curRes.bits.PointerPath) { + if (dyn_cast<PointerType>(lookupTable[reg].second->getType())) { + curRes.bits.PointerPath = 1; + } + // We don't want to transfer to the register number + // between load/store because the load dest can be completely + // different pointer path and the store doesn't have a real + // destination register. 
+ if (!isLoadStore) { + if (mDebug) { + if (dyn_cast<PointerType>(lookupTable[reg].second->getType())) { + dbgs() << "Pointer: " << lookupTable[reg].second->getName(); + assert(dyn_cast<PointerType>(lookupTable[reg].second->getType()) + && "Must be a pointer type for an instruction!"); + switch (dyn_cast<PointerType>( + lookupTable[reg].second->getType())->getAddressSpace()) + { + case AMDILAS::GLOBAL_ADDRESS: dbgs() << " UAV: "; break; + case AMDILAS::LOCAL_ADDRESS: dbgs() << " LDS: "; break; + case AMDILAS::REGION_ADDRESS: dbgs() << " GDS: "; break; + case AMDILAS::PRIVATE_ADDRESS: dbgs() << " SCRATCH: "; break; + case AMDILAS::CONSTANT_ADDRESS: dbgs() << " CB: "; break; + + } + dbgs() << lookupTable[reg].first << " Reg: " << reg + << " assigned to reg " << dstReg << ". Inst: "; + MI->dump(); + } + } + // We don't want to do any copies if the register is not virtual + // as it is the result of a CALL. ParseCallInst handles the + // case where the input and output need to be linked up + // if it occurs. The easiest way to check for virtual + // is to check the top bit. + lookupTable[dstReg] = lookupTable[reg]; + } + } else { + if (dyn_cast<PointerType>(lookupTable[reg].second->getType())) { + // Otherwise we have a conflict between two pointers somehow. 
+ curRes.bits.ConflictPtr = 1; + if (mDebug) { + dbgs() << "Pointer: " << lookupTable[reg].second->getName(); + assert(dyn_cast<PointerType>(lookupTable[reg].second->getType()) + && "Must be a pointer type for a conflict instruction!"); + switch (dyn_cast<PointerType>( + lookupTable[reg].second->getType())->getAddressSpace()) + { + case AMDILAS::GLOBAL_ADDRESS: dbgs() << " UAV: "; break; + case AMDILAS::LOCAL_ADDRESS: dbgs() << " LDS: "; break; + case AMDILAS::REGION_ADDRESS: dbgs() << " GDS: "; break; + case AMDILAS::PRIVATE_ADDRESS: dbgs() << " SCRATCH: "; break; + case AMDILAS::CONSTANT_ADDRESS: dbgs() << " CB: "; break; + + } + dbgs() << lookupTable[reg].first << " Reg: " << reg; + if (InstToPtrMap[MI].size() > 1) { + dbgs() << " conflicts with:\n "; + for (PtrSet::iterator psib = InstToPtrMap[MI].begin(), + psie = InstToPtrMap[MI].end(); psib != psie; ++psib) { + dbgs() << "\t\tPointer: " << (*psib)->getName() << " "; + assert(dyn_cast<PointerType>((*psib)->getType()) + && "Must be a pointer type for a conflict instruction!"); + (*psib)->dump(); + } + } else { + dbgs() << "."; + } + dbgs() << " Inst: "; + MI->dump(); + } + } + // Add the conflicting values to the pointer set for the instruction + InstToPtrMap[MI].insert(lookupTable[reg].second); + // We don't want to add the destination register if + // we are a load or store. + if (!isLoadStore) { + InstToPtrMap[MI].insert(lookupTable[dstReg].second); + } + } + setAsmPrinterFlags(MI, curRes); +} + +// In this case we want to handle a load instruction. 
+static void +parseLoadInst( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + FIPMap &FIToPtrMap, + RVPVec &lookupTable, + CPoolSet &cpool, + BlockCacheableInfo &bci, + MachineInstr *MI, + bool mDebug) +{ + assert(isLoadInst(ATM->getInstrInfo(), MI) && "Only a load instruction can be parsed by " + "the parseLoadInst function."); + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + unsigned dstReg = MI->getOperand(0).getReg(); + unsigned idx = 0; + const Value *basePtr = NULL; + if (MI->getOperand(1).isReg()) { + idx = MI->getOperand(1).getReg(); + basePtr = lookupTable[idx].second; + // If we don't know what value the register + // is assigned to, then we need to special case + // this instruction. + } else if (MI->getOperand(1).isFI()) { + idx = MI->getOperand(1).getIndex(); + lookupTable[dstReg] = FIToPtrMap[idx]; + } else if (MI->getOperand(1).isCPI()) { + cpool.insert(MI); + } + // If we are a hardware local, then we don't need to track as there + // is only one resource ID that we need to know about, so we + // map it using allocateDefaultID, which maps it to the default. + // This is also the case for REGION_ADDRESS and PRIVATE_ADDRESS. + if (isLRPInst(MI, ATM) || !basePtr) { + allocateDefaultID(ATM, curRes, MI, mDebug); + return; + } + // We have a load instruction so we map this instruction + // to the pointer and insert it into the set of known + // load instructions. + InstToPtrMap[MI].insert(basePtr); + PtrToInstMap[basePtr].push_back(MI); + + if (isGlobalInst(ATM->getInstrInfo(), MI)) { + // Add to the cacheable set for the block. If there was a store earlier + // in the block, this call won't actually add it to the cacheable set. + bci.addPossiblyCacheableInst(ATM, MI); + } + + if (mDebug) { + dbgs() << "Assigning instruction to pointer "; + dbgs() << basePtr->getName() << ". 
Inst: "; + MI->dump(); + } + detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true, + idx, dstReg, mDebug); +} + +// In this case we want to handle a store instruction. +static void +parseStoreInst( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + FIPMap &FIToPtrMap, + RVPVec &lookupTable, + CPoolSet &cpool, + BlockCacheableInfo &bci, + MachineInstr *MI, + ByteSet &bytePtrs, + ConflictSet &conflictPtrs, + bool mDebug) +{ + assert(isStoreInst(ATM->getInstrInfo(), MI) && "Only a store instruction can be parsed by " + "the parseStoreInst function."); + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + unsigned dstReg = MI->getOperand(0).getReg(); + + // If the data part of the store instruction is known to + // be a pointer, then we need to mark this pointer as being + // a byte pointer. This is the conservative case that needs + // to be handled correctly. + if (lookupTable[dstReg].second && lookupTable[dstReg].first != ~0U) { + curRes.bits.ConflictPtr = 1; + if (mDebug) { + dbgs() << "Found a case where the pointer is being stored!\n"; + MI->dump(); + dbgs() << "Pointer is "; + lookupTable[dstReg].second->print(dbgs()); + dbgs() << "\n"; + } + //PtrToInstMap[lookupTable[dstReg].second].push_back(MI); + if (lookupTable[dstReg].second->getType()->isPointerTy()) { + conflictPtrs.insert(lookupTable[dstReg].second); + } + } + + // Before we go through the special cases, for the cacheable information + // all we care is if the store if global or not. + if (!isLRPInst(MI, ATM)) { + bci.setReachesExit(); + } + + // If the address is not a register address, + // then we need to lower it as an unknown id. 
+ if (!MI->getOperand(1).isReg()) { + if (MI->getOperand(1).isCPI()) { + if (mDebug) { + dbgs() << "Found an instruction with a CPI index #" + << MI->getOperand(1).getIndex() << "!\n"; + } + cpool.insert(MI); + } else if (MI->getOperand(1).isFI()) { + if (mDebug) { + dbgs() << "Found an instruction with a frame index #" + << MI->getOperand(1).getIndex() << "!\n"; + } + // If we are a frame index and we are storing a pointer there, lets + // go ahead and assign the pointer to the location within the frame + // index map so that we can get the value out later. + FIToPtrMap[MI->getOperand(1).getIndex()] = lookupTable[dstReg]; + } + + allocateDefaultID(ATM, curRes, MI, mDebug); + return; + } + unsigned reg = MI->getOperand(1).getReg(); + // If we don't know what value the register + // is assigned to, then we need to special case + // this instruction. + if (!lookupTable[reg].second) { + allocateDefaultID(ATM, curRes, MI, mDebug); + return; + } + // const Value *basePtr = lookupTable[reg].second; + // If we are a hardware local, then we don't need to track as there + // is only one resource ID that we need to know about, so we + // map it using allocateDefaultID, which maps it to the default. + // This is also the case for REGION_ADDRESS and PRIVATE_ADDRESS. + if (isLRPInst(MI, ATM)) { + allocateDefaultID(ATM, curRes, MI, mDebug); + return; + } + + // We have a store instruction so we map this instruction + // to the pointer and insert it into the set of known + // store instructions. + InstToPtrMap[MI].insert(lookupTable[reg].second); + PtrToInstMap[lookupTable[reg].second].push_back(MI); + uint16_t RegClass = MI->getDesc().OpInfo[0].RegClass; + switch (RegClass) { + default: + break; + case AMDIL::GPRI8RegClassID: + case AMDIL::GPRV2I8RegClassID: + case AMDIL::GPRI16RegClassID: + if (usesGlobal(ATM, MI)) { + if (mDebug) { + dbgs() << "Annotating instruction as Byte Store. 
Inst: "; + MI->dump(); + } + curRes.bits.ByteStore = 1; + setAsmPrinterFlags(MI, curRes); + const PointerType *PT = dyn_cast<PointerType>( + lookupTable[reg].second->getType()); + if (PT) { + bytePtrs.insert(lookupTable[reg].second); + } + } + break; + }; + // If we are a truncating store, then we need to determine the + // size of the pointer that we are truncating to, and if we + // are less than 32 bits, we need to mark the pointer as a + // byte store pointer. + switch (MI->getOpcode()) { + case AMDIL::GLOBALTRUNCSTORE_i16i8: + case AMDIL::GLOBALTRUNCSTORE_v2i16i8: + case AMDIL::GLOBALTRUNCSTORE_i32i8: + case AMDIL::GLOBALTRUNCSTORE_v2i32i8: + case AMDIL::GLOBALTRUNCSTORE_i64i8: + case AMDIL::GLOBALTRUNCSTORE_v2i64i8: + case AMDIL::GLOBALTRUNCSTORE_i32i16: + case AMDIL::GLOBALTRUNCSTORE_i64i16: + case AMDIL::GLOBALSTORE_i8: + case AMDIL::GLOBALSTORE_i16: + curRes.bits.ByteStore = 1; + setAsmPrinterFlags(MI, curRes); + bytePtrs.insert(lookupTable[reg].second); + break; + default: + break; + } + + if (mDebug) { + dbgs() << "Assigning instruction to pointer "; + dbgs() << lookupTable[reg].second->getName() << ". Inst: "; + MI->dump(); + } + detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true, + reg, dstReg, mDebug); +} + +// In this case we want to handle an atomic instruction. 
+static void +parseAtomicInst( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + RVPVec &lookupTable, + BlockCacheableInfo &bci, + MachineInstr *MI, + ByteSet &bytePtrs, + bool mDebug) +{ + assert(isAtomicInst(ATM->getInstrInfo(), MI) && "Only an atomic instruction can be parsed by " + "the parseAtomicInst function."); + AMDILAS::InstrResEnc curRes; + unsigned dstReg = MI->getOperand(0).getReg(); + unsigned reg = 0; + getAsmPrinterFlags(MI, curRes); + unsigned numOps = MI->getNumOperands(); + bool found = false; + while (--numOps) { + MachineOperand &Op = MI->getOperand(numOps); + if (!Op.isReg()) { + continue; + } + reg = Op.getReg(); + // If the register is not known to be owned by a pointer + // then we can ignore it + if (!lookupTable[reg].second) { + continue; + } + // if the pointer is known to be local, region or private, then we + // can ignore it. Although there are no private atomics, we still + // do this check so we don't have to write a new function to check + // for only local and region. + if (isLRPInst(MI, ATM)) { + continue; + } + found = true; + InstToPtrMap[MI].insert(lookupTable[reg].second); + PtrToInstMap[lookupTable[reg].second].push_back(MI); + + // We now know we have an atomic operation on global memory. + // This is a store so must update the cacheable information. + bci.setReachesExit(); + + // Only do if have SC with arena atomic bug fix (EPR 326883). + // TODO: enable once SC with EPR 326883 has been promoted to CAL. + if (ATM->getSubtargetImpl()->calVersion() >= CAL_VERSION_SC_150) { + // Force pointers that are used by atomics to be in the arena. + // If they were allowed to be accessed as RAW they would cause + // all access to use the slow complete path. 
+ if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on atomic instruction: "; + MI->dump(); + } + curRes.bits.ByteStore = 1; + bytePtrs.insert(lookupTable[reg].second); + } + + if (mDebug) { + dbgs() << "Assigning instruction to pointer "; + dbgs() << lookupTable[reg].second->getName() << ". Inst: "; + MI->dump(); + } + detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true, + reg, dstReg, mDebug); + } + if (!found) { + allocateDefaultID(ATM, curRes, MI, mDebug); + } +} +// In this case we want to handle a counter instruction. +static void +parseAppendInst( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + RVPVec &lookupTable, + MachineInstr *MI, + bool mDebug) +{ + assert(isAppendInst(ATM->getInstrInfo(), MI) && "Only an atomic counter instruction can be " + "parsed by the parseAppendInst function."); + AMDILAS::InstrResEnc curRes; + unsigned dstReg = MI->getOperand(0).getReg(); + unsigned reg = MI->getOperand(1).getReg(); + getAsmPrinterFlags(MI, curRes); + // If the register is not known to be owned by a pointer + // then we set it to the default + if (!lookupTable[reg].second) { + allocateDefaultID(ATM, curRes, MI, mDebug); + return; + } + InstToPtrMap[MI].insert(lookupTable[reg].second); + PtrToInstMap[lookupTable[reg].second].push_back(MI); + if (mDebug) { + dbgs() << "Assigning instruction to pointer "; + dbgs() << lookupTable[reg].second->getName() << ". Inst: "; + MI->dump(); + } + detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, true, + reg, dstReg, mDebug); +} +// In this case we want to handle an Image instruction. 
+static void +parseImageInst( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + FIPMap &FIToPtrMap, + RVPVec &lookupTable, + MachineInstr *MI, + bool mDebug) +{ + assert(isImageInst(ATM->getInstrInfo(), MI) && "Only an image instruction can be " + "parsed by the parseImageInst function."); + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + // AMDILKernelManager *km = + // (AMDILKernelManager *)ATM->getSubtargetImpl()->getKernelManager(); + AMDILMachineFunctionInfo *mMFI = MI->getParent()->getParent() + ->getInfo<AMDILMachineFunctionInfo>(); + if (MI->getOpcode() == AMDIL::IMAGE2D_WRITE + || MI->getOpcode() == AMDIL::IMAGE3D_WRITE) { + unsigned dstReg = MI->getOperand(0).getReg(); + curRes.bits.ResourceID = lookupTable[dstReg].first & 0xFFFF; + curRes.bits.isImage = 1; + InstToPtrMap[MI].insert(lookupTable[dstReg].second); + PtrToInstMap[lookupTable[dstReg].second].push_back(MI); + if (mDebug) { + dbgs() << "Assigning instruction to pointer "; + dbgs() << lookupTable[dstReg].second->getName() << ". Inst: "; + MI->dump(); + } + } else { + // unsigned dstReg = MI->getOperand(0).getReg(); + unsigned reg = MI->getOperand(1).getReg(); + + // If the register is not known to be owned by a pointer + // then we set it to the default + if (!lookupTable[reg].second) { + assert(!"This should not happen for images!"); + allocateDefaultID(ATM, curRes, MI, mDebug); + return; + } + InstToPtrMap[MI].insert(lookupTable[reg].second); + PtrToInstMap[lookupTable[reg].second].push_back(MI); + if (mDebug) { + dbgs() << "Assigning instruction to pointer "; + dbgs() << lookupTable[reg].second->getName() << ". Inst: "; + MI->dump(); + } + switch (MI->getOpcode()) { + case AMDIL::IMAGE2D_READ: + case AMDIL::IMAGE2D_READ_UNNORM: + case AMDIL::IMAGE3D_READ: + case AMDIL::IMAGE3D_READ_UNNORM: + curRes.bits.ResourceID = lookupTable[reg].first & 0xFFFF; + if (MI->getOperand(3).isReg()) { + // Our sampler is not a literal value. 
+ char buffer[256]; + memset(buffer, 0, sizeof(buffer)); + std::string sampler_name = ""; + unsigned reg = MI->getOperand(3).getReg(); + if (lookupTable[reg].second) { + sampler_name = lookupTable[reg].second->getName(); + } + if (sampler_name.empty()) { + sampler_name = findSamplerName(MI, lookupTable, FIToPtrMap, ATM); + } + uint32_t val = mMFI->addSampler(sampler_name, ~0U); + if (mDebug) { + dbgs() << "Mapping kernel sampler " << sampler_name + << " to sampler number " << val << " for Inst:\n"; + MI->dump(); + } + MI->getOperand(3).ChangeToImmediate(val); + } else { + // Our sampler is known at runtime as a literal, lets make sure + // that the metadata for it is known. + char buffer[256]; + memset(buffer, 0, sizeof(buffer)); + sprintf(buffer,"_%d", (int32_t)MI->getOperand(3).getImm()); + std::string sampler_name = std::string("unknown") + std::string(buffer); + uint32_t val = mMFI->addSampler(sampler_name, MI->getOperand(3).getImm()); + if (mDebug) { + dbgs() << "Mapping internal sampler " << sampler_name + << " to sampler number " << val << " for Inst:\n"; + MI->dump(); + } + MI->getOperand(3).setImm(val); + } + break; + case AMDIL::IMAGE2D_INFO0: + case AMDIL::IMAGE3D_INFO0: + curRes.bits.ResourceID = lookupTable[reg].first >> 16; + break; + case AMDIL::IMAGE2D_INFO1: + case AMDIL::IMAGE2DA_INFO1: + curRes.bits.ResourceID = (lookupTable[reg].first >> 16) + 1; + break; + }; + curRes.bits.isImage = 1; + } + setAsmPrinterFlags(MI, curRes); +} +// This case handles the rest of the instructions +static void +parseInstruction( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + RVPVec &lookupTable, + CPoolSet &cpool, + MachineInstr *MI, + bool mDebug) +{ + assert(!isAtomicInst(ATM->getInstrInfo(), MI) && !isStoreInst(ATM->getInstrInfo(), MI) && !isLoadInst(ATM->getInstrInfo(), MI) && + !isAppendInst(ATM->getInstrInfo(), MI) && !isImageInst(ATM->getInstrInfo(), MI) && + "Atomic/Load/Store/Append/Image insts should not be handled 
here!"); + unsigned numOps = MI->getNumOperands(); + // If we don't have any operands, we can skip this instruction + if (!numOps) { + return; + } + // if the dst operand is not a register, then we can skip + // this instruction. That is because we are probably a branch + // or jump instruction. + if (!MI->getOperand(0).isReg()) { + return; + } + // If we are a LOADCONST_i32, we might be a sampler, so we need + // to propogate the LOADCONST to IMAGE[2|3]D_READ instructions. + if (MI->getOpcode() == AMDIL::LOADCONST_i32) { + uint32_t val = MI->getOperand(1).getImm(); + MachineOperand* oldPtr = &MI->getOperand(0); + MachineOperand* moPtr = oldPtr->getNextOperandForReg(); + while (moPtr) { + oldPtr = moPtr; + moPtr = oldPtr->getNextOperandForReg(); + switch (oldPtr->getParent()->getOpcode()) { + default: + break; + case AMDIL::IMAGE2D_READ: + case AMDIL::IMAGE2D_READ_UNNORM: + case AMDIL::IMAGE3D_READ: + case AMDIL::IMAGE3D_READ_UNNORM: + if (mDebug) { + dbgs() << "Found a constant sampler for image read inst: "; + oldPtr->getParent()->print(dbgs()); + } + oldPtr->ChangeToImmediate(val); + break; + } + } + } + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + unsigned dstReg = MI->getOperand(0).getReg(); + unsigned reg = 0; + while (--numOps) { + MachineOperand &Op = MI->getOperand(numOps); + // if the operand is not a register, then we can ignore it + if (!Op.isReg()) { + if (Op.isCPI()) { + cpool.insert(MI); + } + continue; + } + reg = Op.getReg(); + // If the register is not known to be owned by a pointer + // then we can ignore it + if (!lookupTable[reg].second) { + continue; + } + detectConflictInst(MI, curRes, lookupTable, InstToPtrMap, false, + reg, dstReg, mDebug); + + } +} + +// This function parses the basic block and based on the instruction type, +// calls the function to finish parsing the instruction. 
+static void +parseBasicBlock( + const AMDILTargetMachine *ATM, + MachineBasicBlock *MB, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + FIPMap &FIToPtrMap, + RVPVec &lookupTable, + ByteSet &bytePtrs, + ConflictSet &conflictPtrs, + CPoolSet &cpool, + BlockCacheableInfo &bci, + bool mDebug) +{ + for (MachineBasicBlock::iterator mbb = MB->begin(), mbe = MB->end(); + mbb != mbe; ++mbb) { + MachineInstr *MI = mbb; + if (MI->getOpcode() == AMDIL::CALL) { + parseCall(ATM, InstToPtrMap, PtrToInstMap, lookupTable, + mbb, mbe, mDebug); + } + else if (isLoadInst(ATM->getInstrInfo(), MI)) { + parseLoadInst(ATM, InstToPtrMap, PtrToInstMap, + FIToPtrMap, lookupTable, cpool, bci, MI, mDebug); + } else if (isStoreInst(ATM->getInstrInfo(), MI)) { + parseStoreInst(ATM, InstToPtrMap, PtrToInstMap, + FIToPtrMap, lookupTable, cpool, bci, MI, bytePtrs, conflictPtrs, mDebug); + } else if (isAtomicInst(ATM->getInstrInfo(), MI)) { + parseAtomicInst(ATM, InstToPtrMap, PtrToInstMap, + lookupTable, bci, MI, bytePtrs, mDebug); + } else if (isAppendInst(ATM->getInstrInfo(), MI)) { + parseAppendInst(ATM, InstToPtrMap, PtrToInstMap, + lookupTable, MI, mDebug); + } else if (isImageInst(ATM->getInstrInfo(), MI)) { + parseImageInst(ATM, InstToPtrMap, PtrToInstMap, + FIToPtrMap, lookupTable, MI, mDebug); + } else { + parseInstruction(ATM, InstToPtrMap, PtrToInstMap, + lookupTable, cpool, MI, mDebug); + } + } +} + +// Follows the Reverse Post Order Traversal of the basic blocks to +// determine which order to parse basic blocks in. 
+void +parseFunction( + const AMDILPointerManager *PM, + const AMDILTargetMachine *ATM, + MachineFunction &MF, + InstPMap &InstToPtrMap, + PtrIMap &PtrToInstMap, + FIPMap &FIToPtrMap, + RVPVec &lookupTable, + ByteSet &bytePtrs, + ConflictSet &conflictPtrs, + CPoolSet &cpool, + MBBCacheableMap &mbbCacheable, + bool mDebug) +{ + if (mDebug) { + MachineDominatorTree *dominatorTree = &PM + ->getAnalysis<MachineDominatorTree>(); + dominatorTree->dump(); + } + + std::list<MachineBasicBlock*> prop_worklist; + + ReversePostOrderTraversal<MachineFunction*> RPOT(&MF); + for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator + curBlock = RPOT.begin(), endBlock = RPOT.end(); + curBlock != endBlock; ++curBlock) { + MachineBasicBlock *MB = (*curBlock); + BlockCacheableInfo &bci = mbbCacheable[MB]; + for (MachineBasicBlock::pred_iterator mbbit = MB->pred_begin(), + mbbitend = MB->pred_end(); + mbbit != mbbitend; + mbbit++) { + MBBCacheableMap::const_iterator mbbcmit = mbbCacheable.find(*mbbit); + if (mbbcmit != mbbCacheable.end() && + mbbcmit->second.storeReachesExit()) { + bci.setReachesTop(); + break; + } + } + + if (mDebug) { + dbgs() << "[BlockOrdering] Parsing CurrentBlock: " + << MB->getNumber() << "\n"; + } + parseBasicBlock(ATM, MB, InstToPtrMap, PtrToInstMap, + FIToPtrMap, lookupTable, bytePtrs, conflictPtrs, cpool, bci, mDebug); + + if (bci.storeReachesExit()) + prop_worklist.push_back(MB); + + if (mDebug) { + dbgs() << "BCI info: Top: " << bci.storeReachesTop() << " Exit: " + << bci.storeReachesExit() << "\n Instructions:\n"; + for (CacheableInstrSet::const_iterator cibit = bci.cacheableBegin(), + cibitend = bci.cacheableEnd(); + cibit != cibitend; + cibit++) + { + (*cibit)->dump(); + } + } + } + + // This loop pushes any "storeReachesExit" flags into successor + // blocks until the flags have been fully propagated. This will + // ensure that blocks that have reachable stores due to loops + // are labeled appropriately. 
+ while (!prop_worklist.empty()) { + MachineBasicBlock *wlb = prop_worklist.front(); + prop_worklist.pop_front(); + for (MachineBasicBlock::succ_iterator mbbit = wlb->succ_begin(), + mbbitend = wlb->succ_end(); + mbbit != mbbitend; + mbbit++) + { + BlockCacheableInfo &blockCache = mbbCacheable[*mbbit]; + if (!blockCache.storeReachesTop()) { + blockCache.setReachesTop(); + prop_worklist.push_back(*mbbit); + } + if (mDebug) { + dbgs() << "BCI Prop info: " << (*mbbit)->getNumber() << " Top: " + << blockCache.storeReachesTop() << " Exit: " + << blockCache.storeReachesExit() + << "\n"; + } + } + } +} + +// Helper function that dumps to dbgs() information about +// a pointer set. + void +dumpPointers(AppendSet &Ptrs, const char *str) +{ + if (Ptrs.empty()) { + return; + } + dbgs() << "[Dump]" << str << " found: " << "\n"; + for (AppendSet::iterator sb = Ptrs.begin(); + sb != Ptrs.end(); ++sb) { + (*sb)->dump(); + } + dbgs() << "\n"; +} +// Helper function that dumps to dbgs() information about +// a pointer set. + void +dumpPointers(PtrSet &Ptrs, const char *str) +{ + if (Ptrs.empty()) { + return; + } + dbgs() << "[Dump]" << str << " found: " << "\n"; + for (PtrSet::iterator sb = Ptrs.begin(); + sb != Ptrs.end(); ++sb) { + (*sb)->dump(); + } + dbgs() << "\n"; +} +// Function that detects all the conflicting pointers and adds +// the pointers that are detected to the conflict set, otherwise +// they are added to the raw or byte set based on their usage. 
+void +detectConflictingPointers( + const AMDILTargetMachine *ATM, + InstPMap &InstToPtrMap, + ByteSet &bytePtrs, + RawSet &rawPtrs, + ConflictSet &conflictPtrs, + bool mDebug) +{ + if (InstToPtrMap.empty()) { + return; + } + PtrSet aliasedPtrs; + const AMDILSubtarget *STM = ATM->getSubtargetImpl(); + for (InstPMap::iterator + mapIter = InstToPtrMap.begin(), iterEnd = InstToPtrMap.end(); + mapIter != iterEnd; ++mapIter) { + if (mDebug) { + dbgs() << "Instruction: "; + (mapIter)->first->dump(); + } + MachineInstr* MI = mapIter->first; + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + if (curRes.bits.isImage) { + continue; + } + bool byte = false; + // We might have a case where more than 1 pointers is going to the same + // I/O instruction + if (mDebug) { + dbgs() << "Base Pointer[s]:\n"; + } + for (PtrSet::iterator cfIter = mapIter->second.begin(), + cfEnd = mapIter->second.end(); cfIter != cfEnd; ++cfIter) { + if (mDebug) { + (*cfIter)->dump(); + } + if (bytePtrs.count(*cfIter)) { + if (mDebug) { + dbgs() << "Byte pointer found!\n"; + } + byte = true; + break; + } + } + if (byte) { + for (PtrSet::iterator cfIter = mapIter->second.begin(), + cfEnd = mapIter->second.end(); cfIter != cfEnd; ++cfIter) { + const Value *ptr = (*cfIter); + if (isLRPInst(mapIter->first, ATM)) { + // We don't need to deal with pointers to local/region/private + // memory regions + continue; + } + if (mDebug) { + dbgs() << "Adding pointer " << (ptr)->getName() + << " to byte set!\n"; + } + const PointerType *PT = dyn_cast<PointerType>(ptr->getType()); + if (PT) { + bytePtrs.insert(ptr); + } + } + } else { + for (PtrSet::iterator cfIter = mapIter->second.begin(), + cfEnd = mapIter->second.end(); cfIter != cfEnd; ++cfIter) { + const Value *ptr = (*cfIter); + // bool aliased = false; + if (isLRPInst(mapIter->first, ATM)) { + // We don't need to deal with pointers to local/region/private + // memory regions + continue; + } + const Argument *arg = 
dyn_cast_or_null<Argument>(*cfIter); + if (!arg) { + continue; + } + if (!STM->device()->isSupported(AMDILDeviceInfo::NoAlias) + && !arg->hasNoAliasAttr()) { + if (mDebug) { + dbgs() << "Possible aliased pointer found!\n"; + } + aliasedPtrs.insert(ptr); + } + if (mapIter->second.size() > 1) { + if (mDebug) { + dbgs() << "Adding pointer " << ptr->getName() + << " to conflict set!\n"; + } + const PointerType *PT = dyn_cast<PointerType>(ptr->getType()); + if (PT) { + conflictPtrs.insert(ptr); + } + } + if (mDebug) { + dbgs() << "Adding pointer " << ptr->getName() + << " to raw set!\n"; + } + const PointerType *PT = dyn_cast<PointerType>(ptr->getType()); + if (PT) { + rawPtrs.insert(ptr); + } + } + } + if (mDebug) { + dbgs() << "\n"; + } + } + // If we have any aliased pointers and byte pointers exist, + // then make sure that all of the aliased pointers are + // part of the byte pointer set. + if (!bytePtrs.empty()) { + for (PtrSet::iterator aIter = aliasedPtrs.begin(), + aEnd = aliasedPtrs.end(); aIter != aEnd; ++aIter) { + if (mDebug) { + dbgs() << "Moving " << (*aIter)->getName() + << " from raw to byte.\n"; + } + bytePtrs.insert(*aIter); + rawPtrs.erase(*aIter); + } + } +} +// Function that detects aliased constant pool operations. +void +detectAliasedCPoolOps( + TargetMachine &TM, + CPoolSet &cpool, + bool mDebug + ) +{ + const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + if (mDebug && !cpool.empty()) { + dbgs() << "Instructions w/ CPool Ops: \n"; + } + // The algorithm for detecting aliased cpool is as follows. 
+ // For each instruction that has a cpool argument + // follow def-use chain + // if instruction is a load and load is a private load, + // switch to constant pool load + for (CPoolSet::iterator cpb = cpool.begin(), cpe = cpool.end(); + cpb != cpe; ++cpb) { + if (mDebug) { + (*cpb)->dump(); + } + std::queue<MachineInstr*> queue; + std::set<MachineInstr*> visited; + queue.push(*cpb); + MachineInstr *cur; + while (!queue.empty()) { + cur = queue.front(); + queue.pop(); + if (visited.count(cur)) { + continue; + } + if (isLoadInst(TM.getInstrInfo(), cur) && isPrivateInst(TM.getInstrInfo(), cur)) { + // If we are a private load and the register is + // used in the address register, we need to + // switch from private to constant pool load. + if (mDebug) { + dbgs() << "Found an instruction that is a private load " + << "but should be a constant pool load.\n"; + cur->print(dbgs()); + dbgs() << "\n"; + } + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(cur, curRes); + curRes.bits.ResourceID = STM->device()->getResourceID(AMDILDevice::GLOBAL_ID); + curRes.bits.ConflictPtr = 1; + setAsmPrinterFlags(cur, curRes); + cur->setDesc(TM.getInstrInfo()->get( + (cur->getOpcode() - AMDIL::PRIVATEAEXTLOAD_f32) + + AMDIL::CPOOLAEXTLOAD_f32)); + } else { + if (cur->getOperand(0).isReg()) { + MachineOperand* ptr = cur->getOperand(0).getNextOperandForReg(); + while (ptr && !ptr->isDef() && ptr->isReg()) { + queue.push(ptr->getParent()); + ptr = ptr->getNextOperandForReg(); + } + } + } + visited.insert(cur); + } + } +} +// Function that detects fully cacheable pointers. Fully cacheable pointers +// are pointers that have no writes to them and -fno-alias is specified. 
+void +detectFullyCacheablePointers( + const AMDILTargetMachine *ATM, + PtrIMap &PtrToInstMap, + RawSet &rawPtrs, + CacheableSet &cacheablePtrs, + ConflictSet &conflictPtrs, + bool mDebug + ) +{ + if (PtrToInstMap.empty()) { + return; + } + const AMDILSubtarget *STM + = ATM->getSubtargetImpl(); + // 4XXX hardware doesn't support cached uav opcodes and we assume + // no aliasing for this to work. Also in debug mode we don't do + // any caching. + if (STM->device()->getGeneration() == AMDILDeviceInfo::HD4XXX + || !STM->device()->isSupported(AMDILDeviceInfo::CachedMem)) { + return; + } + if (STM->device()->isSupported(AMDILDeviceInfo::NoAlias)) { + for (PtrIMap::iterator mapIter = PtrToInstMap.begin(), + iterEnd = PtrToInstMap.end(); mapIter != iterEnd; ++mapIter) { + if (mDebug) { + dbgs() << "Instruction: "; + mapIter->first->dump(); + } + // Skip the pointer if we have already detected it. + if (cacheablePtrs.count(mapIter->first)) { + continue; + } + bool cacheable = true; + for (std::vector<MachineInstr*>::iterator + miBegin = mapIter->second.begin(), + miEnd = mapIter->second.end(); miBegin != miEnd; ++miBegin) { + if (isStoreInst(ATM->getInstrInfo(), *miBegin) || + isImageInst(ATM->getInstrInfo(), *miBegin) || + isAtomicInst(ATM->getInstrInfo(), *miBegin)) { + cacheable = false; + break; + } + } + // we aren't cacheable, so lets move on to the next instruction + if (!cacheable) { + continue; + } + // If we are in the conflict set, lets move to the next instruction + // FIXME: we need to check to see if the pointers that conflict with + // the current pointer are also cacheable. If they are, then add them + // to the cacheable list and not fail. + if (conflictPtrs.count(mapIter->first)) { + continue; + } + // Otherwise if we have no stores and no conflicting pointers, we can + // be added to the cacheable set. 
+ if (mDebug) { + dbgs() << "Adding pointer " << mapIter->first->getName(); + dbgs() << " to cached set!\n"; + } + const PointerType *PT = dyn_cast<PointerType>(mapIter->first->getType()); + if (PT) { + cacheablePtrs.insert(mapIter->first); + } + } + } +} + +// Are any of the pointers in PtrSet also in the BytePtrs or the CachePtrs? +static bool +ptrSetIntersectsByteOrCache( + PtrSet &cacheSet, + ByteSet &bytePtrs, + CacheableSet &cacheablePtrs + ) +{ + for (PtrSet::const_iterator psit = cacheSet.begin(), + psitend = cacheSet.end(); + psit != psitend; + psit++) { + if (bytePtrs.find(*psit) != bytePtrs.end() || + cacheablePtrs.find(*psit) != cacheablePtrs.end()) { + return true; + } + } + return false; +} + +// Function that detects which instructions are cacheable even if +// all instructions of the pointer are not cacheable. The resulting +// set of instructions will not contain Ptrs that are in the cacheable +// ptr set (under the assumption they will get marked cacheable already) +// or pointers in the byte set, since they are not cacheable. +void +detectCacheableInstrs( + MBBCacheableMap &bbCacheable, + InstPMap &InstToPtrMap, + CacheableSet &cacheablePtrs, + ByteSet &bytePtrs, + CacheableInstrSet &cacheableSet, + bool mDebug + ) + +{ + for (MBBCacheableMap::const_iterator mbbcit = bbCacheable.begin(), + mbbcitend = bbCacheable.end(); + mbbcit != mbbcitend; + mbbcit++) { + for (CacheableInstrSet::const_iterator bciit + = mbbcit->second.cacheableBegin(), + bciitend + = mbbcit->second.cacheableEnd(); + bciit != bciitend; + bciit++) { + if (!ptrSetIntersectsByteOrCache(InstToPtrMap[*bciit], + bytePtrs, + cacheablePtrs)) { + cacheableSet.insert(*bciit); + } + } + } +} +// This function annotates the cacheable pointers with the +// CacheableRead bit. The cacheable read bit is set +// when the number of write images is not equal to the max +// or if the default RAW_UAV_ID is equal to 11. 
The first +// condition means that there is a raw uav between 0 and 7 +// that is available for cacheable reads and the second +// condition means that UAV 11 is available for cacheable +// reads. +void +annotateCacheablePtrs( + TargetMachine &TM, + PtrIMap &PtrToInstMap, + CacheableSet &cacheablePtrs, + ByteSet &bytePtrs, + uint32_t numWriteImages, + bool mDebug) +{ + const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + // AMDILKernelManager *KM = (AMDILKernelManager*)STM->getKernelManager(); + PtrSet::iterator siBegin, siEnd; + std::vector<MachineInstr*>::iterator miBegin, miEnd; + AMDILMachineFunctionInfo *mMFI = NULL; + // First we can check the cacheable pointers + for (siBegin = cacheablePtrs.begin(), siEnd = cacheablePtrs.end(); + siBegin != siEnd; ++siBegin) { + assert(!bytePtrs.count(*siBegin) && "Found a cacheable pointer " + "that also exists as a byte pointer!"); + for (miBegin = PtrToInstMap[*siBegin].begin(), + miEnd = PtrToInstMap[*siBegin].end(); + miBegin != miEnd; ++miBegin) { + if (mDebug) { + dbgs() << "Annotating pointer as cacheable. Inst: "; + (*miBegin)->dump(); + } + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + assert(!curRes.bits.ByteStore && "No cacheable pointers should have the " + "byte Store flag set!"); + // If UAV11 is enabled, then we can enable cached reads. + if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) == 11) { + curRes.bits.CacheableRead = 1; + curRes.bits.ResourceID = 11; + setAsmPrinterFlags(*miBegin, curRes); + if (!mMFI) { + mMFI = (*miBegin)->getParent()->getParent() + ->getInfo<AMDILMachineFunctionInfo>(); + } + mMFI->uav_insert(curRes.bits.ResourceID); + } + } + } +} + +// A byte pointer is a pointer that along the pointer path has a +// byte store assigned to it. 
+void +annotateBytePtrs( + TargetMachine &TM, + PtrIMap &PtrToInstMap, + ByteSet &bytePtrs, + RawSet &rawPtrs, + bool mDebug + ) +{ + const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + AMDILKernelManager *KM = STM->getKernelManager(); + PtrSet::iterator siBegin, siEnd; + std::vector<MachineInstr*>::iterator miBegin, miEnd; + uint32_t arenaID = STM->device() + ->getResourceID(AMDILDevice::ARENA_UAV_ID); + if (STM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) { + arenaID = ARENA_SEGMENT_RESERVED_UAVS + 1; + } + AMDILMachineFunctionInfo *mMFI = NULL; + for (siBegin = bytePtrs.begin(), siEnd = bytePtrs.end(); + siBegin != siEnd; ++siBegin) { + const Value* val = (*siBegin); + const PointerType *PT = dyn_cast<PointerType>(val->getType()); + if (!PT) { + continue; + } + const Argument *curArg = dyn_cast<Argument>(val); + assert(!rawPtrs.count(*siBegin) && "Found a byte pointer " + "that also exists as a raw pointer!"); + bool arenaInc = false; + for (miBegin = PtrToInstMap[*siBegin].begin(), + miEnd = PtrToInstMap[*siBegin].end(); + miBegin != miEnd; ++miBegin) { + if (mDebug) { + dbgs() << "Annotating pointer as arena. Inst: "; + (*miBegin)->dump(); + } + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + + if (STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem) + && PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS) { + // If hardware constant mem is enabled, then we need to + // get the constant pointer CB number and use that to specify + // the resource ID. 
+ AMDILGlobalManager *GM = STM->getGlobalManager(); + const StringRef funcName = (*miBegin)->getParent()->getParent() + ->getFunction()->getName(); + if (GM->isKernel(funcName)) { + const kernel &krnl = GM->getKernel(funcName); + curRes.bits.ResourceID = GM->getConstPtrCB(krnl, + (*siBegin)->getName()); + curRes.bits.HardwareInst = 1; + } else { + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::CONSTANT_ID); + } + } else if (STM->device()->usesHardware(AMDILDeviceInfo::LocalMem) + && PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) { + // If hardware local mem is enabled, get the local mem ID from + // the device to use as the ResourceID + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::LDS_ID); + if (isAtomicInst(TM.getInstrInfo(), *miBegin)) { + assert(curRes.bits.ResourceID && "Atomic resource ID " + "cannot be non-zero!"); + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + } else if (STM->device()->usesHardware(AMDILDeviceInfo::RegionMem) + && PT->getAddressSpace() == AMDILAS::REGION_ADDRESS) { + // If hardware region mem is enabled, get the gds mem ID from + // the device to use as the ResourceID + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::GDS_ID); + if (isAtomicInst(TM.getInstrInfo(), *miBegin)) { + assert(curRes.bits.ResourceID && "Atomic resource ID " + "cannot be non-zero!"); + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + } else if (STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem) + && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::SCRATCH_ID); + } else { + if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on instruction: "; + (*miBegin)->print(dbgs()); + } + curRes.bits.ByteStore = 1; + curRes.bits.ResourceID = (curArg && curArg->hasNoAliasAttr()) ? 
arenaID + : STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID); + if (STM->device()->isSupported(AMDILDeviceInfo::ArenaSegment)) { + arenaInc = true; + } + if (isAtomicInst(TM.getInstrInfo(), *miBegin) && + STM->device()->isSupported(AMDILDeviceInfo::ArenaUAV)) { + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + // If we are an arena instruction, we need to switch the atomic opcode + // from the global version to the arena version. + MachineInstr *MI = *miBegin; + MI->setDesc( + TM.getInstrInfo()->get( + (MI->getOpcode() - AMDIL::ATOM_G_ADD) + AMDIL::ATOM_A_ADD)); + } + if (mDebug) { + dbgs() << "Annotating pointer as arena. Inst: "; + (*miBegin)->dump(); + } + } + setAsmPrinterFlags(*miBegin, curRes); + KM->setUAVID(*siBegin, curRes.bits.ResourceID); + if (!mMFI) { + mMFI = (*miBegin)->getParent()->getParent() + ->getInfo<AMDILMachineFunctionInfo>(); + } + mMFI->uav_insert(curRes.bits.ResourceID); + } + if (arenaInc) { + ++arenaID; + } + } +} +// An append pointer is a opaque object that has append instructions +// in its path. 
+void +annotateAppendPtrs( + TargetMachine &TM, + PtrIMap &PtrToInstMap, + AppendSet &appendPtrs, + bool mDebug) +{ + unsigned currentCounter = 0; + // const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + // AMDILKernelManager *KM = (AMDILKernelManager*)STM->getKernelManager(); + MachineFunction *MF = NULL; + for (AppendSet::iterator asBegin = appendPtrs.begin(), + asEnd = appendPtrs.end(); asBegin != asEnd; ++asBegin) + { + bool usesWrite = false; + bool usesRead = false; + const Value* curVal = *asBegin; + if (mDebug) { + dbgs() << "Counter: " << curVal->getName() + << " assigned the counter " << currentCounter << "\n"; + } + for (std::vector<MachineInstr*>::iterator + miBegin = PtrToInstMap[curVal].begin(), + miEnd = PtrToInstMap[curVal].end(); miBegin != miEnd; ++miBegin) { + MachineInstr *MI = *miBegin; + if (!MF) { + MF = MI->getParent()->getParent(); + } + unsigned opcode = MI->getOpcode(); + switch (opcode) { + default: + if (mDebug) { + dbgs() << "Skipping instruction: "; + MI->dump(); + } + break; + case AMDIL::APPEND_ALLOC: + case AMDIL::APPEND_ALLOC_NORET: + usesWrite = true; + MI->getOperand(1).ChangeToImmediate(currentCounter); + if (mDebug) { + dbgs() << "Assing to counter " << currentCounter << " Inst: "; + MI->dump(); + } + break; + case AMDIL::APPEND_CONSUME: + case AMDIL::APPEND_CONSUME_NORET: + usesRead = true; + MI->getOperand(1).ChangeToImmediate(currentCounter); + if (mDebug) { + dbgs() << "Assing to counter " << currentCounter << " Inst: "; + MI->dump(); + } + break; + }; + } + if (usesWrite && usesRead && MF) { + MF->getInfo<AMDILMachineFunctionInfo>()->addErrorMsg( + amd::CompilerErrorMessage[INCORRECT_COUNTER_USAGE]); + } + ++currentCounter; + } +} +// A raw pointer is any pointer that does not have byte store in its path. 
+static void +annotateRawPtrs( + TargetMachine &TM, + PtrIMap &PtrToInstMap, + RawSet &rawPtrs, + ByteSet &bytePtrs, + uint32_t numWriteImages, + bool mDebug + ) +{ + const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + AMDILKernelManager *KM = STM->getKernelManager(); + PtrSet::iterator siBegin, siEnd; + std::vector<MachineInstr*>::iterator miBegin, miEnd; + AMDILMachineFunctionInfo *mMFI = NULL; + + // Now all of the raw pointers will go to the raw uav. + for (siBegin = rawPtrs.begin(), siEnd = rawPtrs.end(); + siBegin != siEnd; ++siBegin) { + const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType()); + if (!PT) { + continue; + } + assert(!bytePtrs.count(*siBegin) && "Found a raw pointer " + " that also exists as a byte pointers!"); + for (miBegin = PtrToInstMap[*siBegin].begin(), + miEnd = PtrToInstMap[*siBegin].end(); + miBegin != miEnd; ++miBegin) { + if (mDebug) { + dbgs() << "Annotating pointer as raw. Inst: "; + (*miBegin)->dump(); + } + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + if (!curRes.bits.ConflictPtr) { + assert(!curRes.bits.ByteStore + && "Found a instruction that is marked as " + "raw but has a byte store bit set!"); + } else if (curRes.bits.ConflictPtr) { + if (curRes.bits.ByteStore) { + curRes.bits.ByteStore = 0; + } + } + if (STM->device()->usesHardware(AMDILDeviceInfo::ConstantMem) + && PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS) { + // If hardware constant mem is enabled, then we need to + // get the constant pointer CB number and use that to specify + // the resource ID. 
+ AMDILGlobalManager *GM = STM->getGlobalManager(); + const StringRef funcName = (*miBegin)->getParent()->getParent() + ->getFunction()->getName(); + if (GM->isKernel(funcName)) { + const kernel &krnl = GM->getKernel(funcName); + curRes.bits.ResourceID = GM->getConstPtrCB(krnl, + (*siBegin)->getName()); + curRes.bits.HardwareInst = 1; + } else { + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::CONSTANT_ID); + } + } else if (STM->device()->usesHardware(AMDILDeviceInfo::LocalMem) + && PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS) { + // If hardware local mem is enabled, get the local mem ID from + // the device to use as the ResourceID + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::LDS_ID); + if (isAtomicInst(TM.getInstrInfo(), *miBegin)) { + assert(curRes.bits.ResourceID && "Atomic resource ID " + "cannot be non-zero!"); + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + } else if (STM->device()->usesHardware(AMDILDeviceInfo::RegionMem) + && PT->getAddressSpace() == AMDILAS::REGION_ADDRESS) { + // If hardware region mem is enabled, get the gds mem ID from + // the device to use as the ResourceID + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::GDS_ID); + if (isAtomicInst(TM.getInstrInfo(), *miBegin)) { + assert(curRes.bits.ResourceID && "Atomic resource ID " + "cannot be non-zero!"); + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + } + } else if (STM->device()->usesHardware(AMDILDeviceInfo::PrivateMem) + && PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::SCRATCH_ID); + } else if (!STM->device()->isSupported(AMDILDeviceInfo::MultiUAV)) { + // If multi uav is enabled, then the resource ID is either the + // number of write images that are available or the device + // raw uav id if it is 11. 
+ if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) > + STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::RAW_UAV_ID); + } else if (numWriteImages != OPENCL_MAX_WRITE_IMAGES) { + if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) + < numWriteImages) { + curRes.bits.ResourceID = numWriteImages; + } else { + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::RAW_UAV_ID); + } + } else { + if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on instruction: "; + (*miBegin)->print(dbgs()); + } + curRes.bits.ByteStore = 1; + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::ARENA_UAV_ID); + } + if (isAtomicInst(TM.getInstrInfo(), *miBegin)) { + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + if (curRes.bits.ResourceID + == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + assert(0 && "Found an atomic instruction that has " + "an arena uav id!"); + } + } + KM->setUAVID(*siBegin, curRes.bits.ResourceID); + if (!mMFI) { + mMFI = (*miBegin)->getParent()->getParent() + ->getInfo<AMDILMachineFunctionInfo>(); + } + mMFI->uav_insert(curRes.bits.ResourceID); + } + setAsmPrinterFlags(*miBegin, curRes); + } + } + +} + +void +annotateCacheableInstrs( + TargetMachine &TM, + CacheableInstrSet &cacheableSet, + bool mDebug) +{ + const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>(); + // AMDILKernelManager *KM = (AMDILKernelManager*)STM->getKernelManager(); + + CacheableInstrSet::iterator miBegin, miEnd; + + for (miBegin = cacheableSet.begin(), + miEnd = cacheableSet.end(); + miBegin != miEnd; ++miBegin) { + if (mDebug) { + dbgs() << "Annotating instr as cacheable. Inst: "; + (*miBegin)->dump(); + } + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + // If UAV11 is enabled, then we can enable cached reads. 
+ if (STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID) == 11) { + curRes.bits.CacheableRead = 1; + curRes.bits.ResourceID = 11; + setAsmPrinterFlags(*miBegin, curRes); + } + } +} + +// Annotate the instructions along various pointer paths. The paths that +// are handled are the raw, byte and cacheable pointer paths. +static void +annotatePtrPath( + TargetMachine &TM, + PtrIMap &PtrToInstMap, + RawSet &rawPtrs, + ByteSet &bytePtrs, + CacheableSet &cacheablePtrs, + uint32_t numWriteImages, + bool mDebug + ) +{ + if (PtrToInstMap.empty()) { + return; + } + // First we can check the cacheable pointers + annotateCacheablePtrs(TM, PtrToInstMap, cacheablePtrs, + bytePtrs, numWriteImages, mDebug); + + // Next we annotate the byte pointers + annotateBytePtrs(TM, PtrToInstMap, bytePtrs, rawPtrs, mDebug); + + // Next we annotate the raw pointers + annotateRawPtrs(TM, PtrToInstMap, rawPtrs, bytePtrs, + numWriteImages, mDebug); +} +// Allocate MultiUAV pointer ID's for the raw/conflict pointers. +static void +allocateMultiUAVPointers( + MachineFunction &MF, + const AMDILTargetMachine *ATM, + PtrIMap &PtrToInstMap, + RawSet &rawPtrs, + ConflictSet &conflictPtrs, + CacheableSet &cacheablePtrs, + uint32_t numWriteImages, + bool mDebug) +{ + if (PtrToInstMap.empty()) { + return; + } + AMDILMachineFunctionInfo *mMFI = MF.getInfo<AMDILMachineFunctionInfo>(); + uint32_t curUAV = numWriteImages; + bool increment = true; + const AMDILSubtarget *STM + = ATM->getSubtargetImpl(); + // If the RAW_UAV_ID is a value that is larger than the max number of write + // images, then we use that UAV ID. + if (numWriteImages >= OPENCL_MAX_WRITE_IMAGES) { + curUAV = STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + increment = false; + } + AMDILKernelManager *KM = STM->getKernelManager(); + PtrSet::iterator siBegin, siEnd; + std::vector<MachineInstr*>::iterator miBegin, miEnd; + // First lets handle the raw pointers. 
+ for (siBegin = rawPtrs.begin(), siEnd = rawPtrs.end(); + siBegin != siEnd; ++siBegin) { + assert((*siBegin)->getType()->isPointerTy() && "We must be a pointer type " + "to be processed at this point!"); + const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType()); + if (conflictPtrs.count(*siBegin) || !PT) { + continue; + } + // We only want to process global address space pointers + if (PT->getAddressSpace() != AMDILAS::GLOBAL_ADDRESS) { + if ((PT->getAddressSpace() == AMDILAS::LOCAL_ADDRESS + && STM->device()->usesSoftware(AMDILDeviceInfo::LocalMem)) + || (PT->getAddressSpace() == AMDILAS::CONSTANT_ADDRESS + && STM->device()->usesSoftware(AMDILDeviceInfo::ConstantMem)) + || (PT->getAddressSpace() == AMDILAS::REGION_ADDRESS + && STM->device()->usesSoftware(AMDILDeviceInfo::RegionMem))) { + // If we are using software emulated hardware features, then + // we need to specify that they use the raw uav and not + // zero-copy uav. The easiest way to do this is to assume they + // conflict with another pointer. Any pointer that conflicts + // with another pointer is assigned to the raw uav or the + // arena uav if no raw uav exists. 
+ const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType()); + if (PT) { + conflictPtrs.insert(*siBegin); + } + } + if (PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + if (STM->device()->usesSoftware(AMDILDeviceInfo::PrivateMem)) { + const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType()); + if (PT) { + conflictPtrs.insert(*siBegin); + } + } else { + if (mDebug) { + dbgs() << "Scratch Pointer '" << (*siBegin)->getName() + << "' being assigned uav "<< + STM->device()->getResourceID(AMDILDevice::SCRATCH_ID) << "\n"; + } + for (miBegin = PtrToInstMap[*siBegin].begin(), + miEnd = PtrToInstMap[*siBegin].end(); + miBegin != miEnd; ++miBegin) { + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + curRes.bits.ResourceID = STM->device() + ->getResourceID(AMDILDevice::SCRATCH_ID); + if (mDebug) { + dbgs() << "Updated instruction to bitmask "; + dbgs().write_hex(curRes.u16all); + dbgs() << " with ResID " << curRes.bits.ResourceID; + dbgs() << ". Inst: "; + (*miBegin)->dump(); + } + setAsmPrinterFlags((*miBegin), curRes); + KM->setUAVID(*siBegin, curRes.bits.ResourceID); + mMFI->uav_insert(curRes.bits.ResourceID); + } + } + } + continue; + } + // If more than just UAV 11 is cacheable, then we can remove + // this check. + if (cacheablePtrs.count(*siBegin)) { + if (mDebug) { + dbgs() << "Raw Pointer '" << (*siBegin)->getName() + << "' is cacheable, not allocating a multi-uav for it!\n"; + } + continue; + } + if (mDebug) { + dbgs() << "Raw Pointer '" << (*siBegin)->getName() + << "' being assigned uav " << curUAV << "\n"; + } + if (PtrToInstMap[*siBegin].empty()) { + KM->setUAVID(*siBegin, curUAV); + mMFI->uav_insert(curUAV); + } + // For all instructions here, we are going to set the new UAV to the curUAV + // number and not the value that it currently is set to. 
+ for (miBegin = PtrToInstMap[*siBegin].begin(), + miEnd = PtrToInstMap[*siBegin].end(); + miBegin != miEnd; ++miBegin) { + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + curRes.bits.ResourceID = curUAV; + if (isAtomicInst(ATM->getInstrInfo(), *miBegin)) { + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + if (curRes.bits.ResourceID + == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + assert(0 && "Found an atomic instruction that has " + "an arena uav id!"); + } + } + if (curUAV == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on instruction: "; + (*miBegin)->print(dbgs()); + } + curRes.bits.ByteStore = 1; + curRes.bits.CacheableRead = 0; + } + if (mDebug) { + dbgs() << "Updated instruction to bitmask "; + dbgs().write_hex(curRes.u16all); + dbgs() << " with ResID " << curRes.bits.ResourceID; + dbgs() << ". Inst: "; + (*miBegin)->dump(); + } + setAsmPrinterFlags(*miBegin, curRes); + KM->setUAVID(*siBegin, curRes.bits.ResourceID); + mMFI->uav_insert(curRes.bits.ResourceID); + } + // If we make it here, we can increment the uav counter if we are less + // than the max write image count. Otherwise we set it to the default + // UAV and leave it. 
+ if (increment && curUAV < (OPENCL_MAX_WRITE_IMAGES - 1)) { + ++curUAV; + } else { + curUAV = STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + increment = false; + } + } + if (numWriteImages == 8) { + curUAV = STM->device()->getResourceID(AMDILDevice::RAW_UAV_ID); + } + // Now lets handle the conflict pointers + for (siBegin = conflictPtrs.begin(), siEnd = conflictPtrs.end(); + siBegin != siEnd; ++siBegin) { + assert((*siBegin)->getType()->isPointerTy() && "We must be a pointer type " + "to be processed at this point!"); + const PointerType *PT = dyn_cast<PointerType>((*siBegin)->getType()); + // We only want to process global address space pointers + if (!PT || PT->getAddressSpace() != AMDILAS::GLOBAL_ADDRESS) { + continue; + } + if (mDebug) { + dbgs() << "Conflict Pointer '" << (*siBegin)->getName() + << "' being assigned uav " << curUAV << "\n"; + } + if (PtrToInstMap[*siBegin].empty()) { + KM->setUAVID(*siBegin, curUAV); + mMFI->uav_insert(curUAV); + } + for (miBegin = PtrToInstMap[*siBegin].begin(), + miEnd = PtrToInstMap[*siBegin].end(); + miBegin != miEnd; ++miBegin) { + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(*miBegin, curRes); + curRes.bits.ResourceID = curUAV; + if (isAtomicInst(ATM->getInstrInfo(), *miBegin)) { + (*miBegin)->getOperand((*miBegin)->getNumOperands()-1) + .setImm(curRes.bits.ResourceID); + if (curRes.bits.ResourceID + == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + assert(0 && "Found an atomic instruction that has " + "an arena uav id!"); + } + } + if (curUAV == STM->device()->getResourceID(AMDILDevice::ARENA_UAV_ID)) { + if (mDebug) { + dbgs() << __LINE__ << ": Setting byte store bit on instruction: "; + (*miBegin)->print(dbgs()); + } + curRes.bits.ByteStore = 1; + } + if (mDebug) { + dbgs() << "Updated instruction to bitmask "; + dbgs().write_hex(curRes.u16all); + dbgs() << " with ResID " << curRes.bits.ResourceID; + dbgs() << ". 
Inst: "; + (*miBegin)->dump(); + } + setAsmPrinterFlags(*miBegin, curRes); + KM->setUAVID(*siBegin, curRes.bits.ResourceID); + mMFI->uav_insert(curRes.bits.ResourceID); + } + } +} +// The first thing we should do is to allocate the default +// ID for each load/store/atomic instruction so that +// it is correctly allocated. Everything else after this +// is just an optimization to more efficiently allocate +// resource ID's. +void +allocateDefaultIDs( + const AMDILTargetMachine *ATM, + MachineFunction &MF, + bool mDebug) +{ + for (MachineFunction::iterator mfBegin = MF.begin(), + mfEnd = MF.end(); mfBegin != mfEnd; ++mfBegin) { + MachineBasicBlock *MB = mfBegin; + for (MachineBasicBlock::iterator mbb = MB->begin(), mbe = MB->end(); + mbb != mbe; ++mbb) { + MachineInstr *MI = mbb; + if (isLoadInst(ATM->getInstrInfo(), MI) + || isStoreInst(ATM->getInstrInfo(), MI) + || isAtomicInst(ATM->getInstrInfo(), MI)) { + AMDILAS::InstrResEnc curRes; + getAsmPrinterFlags(MI, curRes); + allocateDefaultID(ATM, curRes, MI, mDebug); + } + } + } +} + + bool +AMDILEGPointerManager::runOnMachineFunction(MachineFunction &MF) +{ + bool changed = false; + const AMDILTargetMachine *ATM + = reinterpret_cast<const AMDILTargetMachine*>(&TM); + AMDILMachineFunctionInfo *mMFI = + MF.getInfo<AMDILMachineFunctionInfo>(); + if (mDebug) { + dbgs() << getPassName() << "\n"; + dbgs() << MF.getFunction()->getName() << "\n"; + MF.dump(); + } + // Start out by allocating the default ID's to all instructions in the + // function. + allocateDefaultIDs(ATM, MF, mDebug); + + // A set of all pointers are tracked in this map and + // if multiple pointers are detected, they go to the same + // set. + PtrIMap PtrToInstMap; + + // All of the instructions that are loads, stores or pointer + // conflicts are tracked in the map with a set of all values + // that reference the instruction stored. 
+ InstPMap InstToPtrMap; + + // In order to track across stack entries, we need a map between a + // frame index and a pointer. That way when we load from a frame + // index, we know what pointer was stored to the frame index. + FIPMap FIToPtrMap; + + // Set of all the pointers that are byte pointers. Byte pointers + // are required to have their instructions go to the arena. + ByteSet bytePtrs; + + // Set of all the pointers that are cacheable. All of the cache pointers + // are required to go to a raw uav and cannot go to arena. + CacheableSet cacheablePtrs; + + // Set of all the pointers that go into a raw buffer. A pointer can + // exist in either rawPtrs or bytePtrs but not both. + RawSet rawPtrs; + + // Set of all the pointers that end up having a conflicting instruction + // somewhere in the pointer path. + ConflictSet conflictPtrs; + + // Set of all pointers that are images + ImageSet images; + + // Set of all pointers that are counters + AppendSet counters; + + // Set of all pointers that load from a constant pool + CPoolSet cpool; + + // Mapping from BB to infomation about the cacheability of the + // global load instructions in it. + MBBCacheableMap bbCacheable; + + // A set of load instructions that are cacheable + // even if all the load instructions of the ptr are not. + CacheableInstrSet cacheableSet; + + // The lookup table holds all of the registers that + // are used as we assign pointers values to them. + // If two pointers collide on the lookup table, then + // we assign them to the same UAV. If one of the + // pointers is byte addressable, then we assign + // them to arena, otherwise we assign them to raw. + RVPVec lookupTable; + + // First we need to go through all of the arguments and assign the + // live in registers to the lookup table and the pointer mapping. + uint32_t numWriteImages = parseArguments(MF, lookupTable, ATM, + cacheablePtrs, images, counters, mDebug); + + // Lets do some error checking on the results of the parsing. 
+ if (counters.size() > OPENCL_MAX_NUM_ATOMIC_COUNTERS) { + mMFI->addErrorMsg( + amd::CompilerErrorMessage[INSUFFICIENT_COUNTER_RESOURCES]); + } + if (numWriteImages > OPENCL_MAX_WRITE_IMAGES + || (images.size() - numWriteImages > OPENCL_MAX_READ_IMAGES)) { + mMFI->addErrorMsg( + amd::CompilerErrorMessage[INSUFFICIENT_IMAGE_RESOURCES]); + } + + // Now lets parse all of the instructions and update our + // lookup tables. + parseFunction(this, ATM, MF, InstToPtrMap, PtrToInstMap, + FIToPtrMap, lookupTable, bytePtrs, conflictPtrs, cpool, + bbCacheable, mDebug); + + // We need to go over our pointer map and find all the conflicting + // pointers that have byte stores and put them in the bytePtr map. + // All conflicting pointers that don't have byte stores go into + // the rawPtr map. + detectConflictingPointers(ATM, InstToPtrMap, bytePtrs, rawPtrs, + conflictPtrs, mDebug); + + // The next step is to detect whether the pointer should be added to + // the fully cacheable set or not. A pointer is marked as cacheable if + // no store instruction exists. + detectFullyCacheablePointers(ATM, PtrToInstMap, rawPtrs, + cacheablePtrs, conflictPtrs, mDebug); + + // Disable partially cacheable for now when multiUAV is on. + // SC versions before SC139 have a bug that generates incorrect + // addressing for some cached accesses. + if (!ATM->getSubtargetImpl() + ->device()->isSupported(AMDILDeviceInfo::MultiUAV) && + ATM->getSubtargetImpl()->calVersion() >= CAL_VERSION_SC_139) { + // Now we take the set of loads that have no reachable stores and + // create a list of additional instructions (those that aren't already + // in a cacheablePtr set) that are safe to mark as cacheable. + detectCacheableInstrs(bbCacheable, InstToPtrMap, cacheablePtrs, + bytePtrs, cacheableSet, mDebug); + + // Annotate the additional instructions computed above as cacheable. + // Note that this should not touch any instructions annotated in + // annotatePtrPath. 
+ annotateCacheableInstrs(TM, cacheableSet, mDebug); + } + + // Now that we have detected everything we need to detect, lets go through an + // annotate the instructions along the pointer path for each of the + // various pointer types. + annotatePtrPath(TM, PtrToInstMap, rawPtrs, bytePtrs, + cacheablePtrs, numWriteImages, mDebug); + + // Annotate the atomic counter path if any exists. + annotateAppendPtrs(TM, PtrToInstMap, counters, mDebug); + + // If we support MultiUAV, then we need to determine how + // many write images exist so that way we know how many UAV are + // left to allocate to buffers. + if (ATM->getSubtargetImpl() + ->device()->isSupported(AMDILDeviceInfo::MultiUAV)) { + // We now have (OPENCL_MAX_WRITE_IMAGES - numPtrs) buffers open for + // multi-uav allocation. + allocateMultiUAVPointers(MF, ATM, PtrToInstMap, rawPtrs, + conflictPtrs, cacheablePtrs, numWriteImages, mDebug); + } + + // The last step is to detect if we have any alias constant pool operations. + // This is not likely, but does happen on occasion with double precision + // operations. + detectAliasedCPoolOps(TM, cpool, mDebug); + if (mDebug) { + dumpPointers(bytePtrs, "Byte Store Ptrs"); + dumpPointers(rawPtrs, "Raw Ptrs"); + dumpPointers(cacheablePtrs, "Cache Load Ptrs"); + dumpPointers(counters, "Atomic Counters"); + dumpPointers(images, "Images"); + } + return changed; +} + +// The default pointer manager just assigns the default ID's to +// each load/store instruction and does nothing else. This is +// the pointer manager for the 7XX series of cards. + bool +AMDILPointerManager::runOnMachineFunction(MachineFunction &MF) +{ + bool changed = false; + const AMDILTargetMachine *ATM + = reinterpret_cast<const AMDILTargetMachine*>(&TM); + if (mDebug) { + dbgs() << getPassName() << "\n"; + dbgs() << MF.getFunction()->getName() << "\n"; + MF.dump(); + } + // On the 7XX we don't have to do any special processing, so we + // can just allocate the default ID and be done with it. 
+ allocateDefaultIDs(ATM, MF, mDebug); + return changed; +} diff --git a/src/gallium/drivers/radeon/AMDILPointerManager.h b/src/gallium/drivers/radeon/AMDILPointerManager.h new file mode 100644 index 00000000000..2c471fb4d65 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILPointerManager.h @@ -0,0 +1,209 @@ +//===-------- AMDILPointerManager.h - Manage Pointers for HW ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// The AMDIL Pointer Manager is a class that does all the checking for +// different pointer characteristics. Pointers have attributes that need +// to be attached to them in order to correctly codegen them efficiently. +// This class will analyze the pointers of a function and then traverse the uses +// of the pointers and determine if a pointer can be cached, should belong in +// the arena, and what UAV it should belong to. There are seperate classes for +// each unique generation of devices. This pass only works in SSA form. 
+//===----------------------------------------------------------------------===// +#ifndef _AMDIL_POINTER_MANAGER_H_ +#define _AMDIL_POINTER_MANAGER_H_ +#undef DEBUG_TYPE +#undef DEBUGME +#define DEBUG_TYPE "PointerManager" +#if !defined(NDEBUG) +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME (false) +#endif +#include "AMDIL.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" + +#include <list> +#include <map> +#include <queue> +#include <set> + +namespace llvm { + class Value; + class MachineBasicBlock; + // Typedefing the multiple different set types to that it is + // easier to read what each set is supposed to handle. This + // also allows it easier to track which set goes to which + // argument in a function call. + typedef std::set<const Value*> PtrSet; + + // A Byte set is the set of all base pointers that must + // be allocated to the arena path. + typedef PtrSet ByteSet; + + // A Raw set is the set of all base pointers that can be + // allocated to the raw path. + typedef PtrSet RawSet; + + // A cacheable set is the set of all base pointers that + // are deamed cacheable based on annotations or + // compiler options. + typedef PtrSet CacheableSet; + + // A conflict set is a set of all base pointers whose + // use/def chains conflict with another base pointer. + typedef PtrSet ConflictSet; + + // An image set is a set of all read/write only image pointers. 
+ typedef PtrSet ImageSet; + + // An append set is a set of atomic counter base pointers + typedef std::vector<const Value*> AppendSet; + + // A ConstantSet is a set of constant pool instructions + typedef std::set<MachineInstr*> CPoolSet; + + // A CacheableInstSet set is a set of instructions that are cachable + // even if the pointer is not generally cacheable. + typedef std::set<MachineInstr*> CacheableInstrSet; + + // A pair that maps a virtual register to the equivalent base + // pointer value that it was derived from. + typedef std::pair<unsigned, const Value*> RegValPair; + + // A map that maps between the base pointe rvalue and an array + // of instructions that are part of the pointer chain. A pointer + // chain is a recursive def/use chain of all instructions that don't + // store data to memory unless the pointer is the data being stored. + typedef std::map<const Value*, std::vector<MachineInstr*> > PtrIMap; + + // A map that holds a set of all base pointers that are used in a machine + // instruction. This helps to detect when conflict pointers are found + // such as when pointer subtraction occurs. + typedef std::map<MachineInstr*, PtrSet> InstPMap; + + // A map that holds the frame index to RegValPair so that writes of + // pointers to the stack can be tracked. + typedef std::map<unsigned, RegValPair > FIPMap; + + // A small vector impl that holds all of the register to base pointer + // mappings for a given function. + typedef std::map<unsigned, RegValPair> RVPVec; + + + + // The default pointer manager. This handles pointer + // resource allocation for default ID's only. + // There is no special processing. 
+  // MachineFunctionPass that owns the per-function pointer bookkeeping.
+  // Derived classes override runOnMachineFunction and getPassName.
+  class AMDILPointerManager : public MachineFunctionPass
+  {
+    public:
+      AMDILPointerManager(
+          TargetMachine &tm
+          AMDIL_OPT_LEVEL_DECL);
+      virtual ~AMDILPointerManager();
+      virtual const char*
+        getPassName() const;
+      virtual bool
+        runOnMachineFunction(MachineFunction &F);
+      virtual void
+        getAnalysisUsage(AnalysisUsage &AU) const;
+      static char ID;
+    protected:
+      // Debug flag; protected, so derived pointer managers can read it.
+      bool mDebug;
+    private:
+      // Private to this class: AMDILEGPointerManager below declares its
+      // own TM reference rather than using this one.
+      TargetMachine &TM;
+  }; // class AMDILPointerManager
+
+  // The pointer manager for Evergreen and Northern Island
+  // devices. This pointer manager allocates and tracks
+  // cached memory, arena resources, raw resources and
+  // whether multi-uav is utilized or not.
+  class AMDILEGPointerManager : public AMDILPointerManager
+  {
+    public:
+      AMDILEGPointerManager(
+          TargetMachine &tm
+          AMDIL_OPT_LEVEL_DECL);
+      virtual ~AMDILEGPointerManager();
+      virtual const char*
+        getPassName() const;
+      virtual bool
+        runOnMachineFunction(MachineFunction &F);
+    private:
+      // Separate reference from the (private) one held by the base class.
+      TargetMachine &TM;
+  }; // class AMDILEGPointerManager
+
+  // Information related to the cacheability of instructions in a basic block.
+  // This is used during the parse phase of the pointer algorithm to track
+  // the reachability of stores within a basic block.
+  class BlockCacheableInfo {
+    public:
+      // Starts with no store reaching the block and an empty load set.
+      BlockCacheableInfo() :
+        mStoreReachesTop(false),
+        mStoreReachesExit(false),
+        mCacheableSet()
+    {};
+
+      bool storeReachesTop() const { return mStoreReachesTop; }
+      bool storeReachesExit() const { return mStoreReachesExit; }
+      // Read-only iteration over the loads currently recorded as cacheable.
+      CacheableInstrSet::const_iterator
+        cacheableBegin() const { return mCacheableSet.begin(); }
+      CacheableInstrSet::const_iterator
+        cacheableEnd() const { return mCacheableSet.end(); }
+
+      // Mark the block as having a global store that reaches it. This
+      // will also set the store reaches exit flag, and clear the list
+      // of loads (since they are now reachable by a store.)
+      // The list is only cleared the first time the top is marked.
+      // Returns true when the exit flag was not already set, i.e. when
+      // this call changed it.
+      bool setReachesTop() {
+        bool changedExit = !mStoreReachesExit;
+
+        if (!mStoreReachesTop)
+          mCacheableSet.clear();
+
+        mStoreReachesTop = true;
+        mStoreReachesExit = true;
+        return changedExit;
+      }
+
+      // Mark the block as having a store that reaches the exit of the
+      // block.
+      void setReachesExit() {
+        mStoreReachesExit = true;
+      }
+
+      // If the top or the exit of the block are not marked as reachable
+      // by a store, add the load to the list of cacheable loads.
+      // tm supplies the instruction info handed to isVolatileInst; load is
+      // the candidate instruction.
+      void addPossiblyCacheableInst(const TargetMachine * tm, MachineInstr *load) {
+        // By definition, if store reaches top, then store reaches exit.
+        // So, we only test for exit here.
+        // If we have a volatile load we cannot cache it.
+        if (mStoreReachesExit || isVolatileInst(tm->getInstrInfo(), load)) {
+          return;
+        }
+
+        mCacheableSet.insert(load);
+      }
+
+    private:
+      bool mStoreReachesTop; // Does a global store reach the top of this block?
+      bool mStoreReachesExit;// Does a global store reach the exit of this block?
+      CacheableInstrSet mCacheableSet; // The set of loads in the block not
+      // reachable by a global store.
+  };
+  // Map from MachineBasicBlock to its cacheable load info.
+  typedef std::map<MachineBasicBlock*, BlockCacheableInfo> MBBCacheableMap;
+} // end llvm namespace
+#endif // _AMDIL_POINTER_MANAGER_H_
diff --git a/src/gallium/drivers/radeon/AMDILPrintfConvert.cpp b/src/gallium/drivers/radeon/AMDILPrintfConvert.cpp
new file mode 100644
index 00000000000..95614f477c0
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILPrintfConvert.cpp
@@ -0,0 +1,293 @@
+//===-- AMDILPrintfConvert.cpp - Printf Conversion pass --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//==-----------------------------------------------------------------------===// + +#define DEBUG_TYPE "PrintfConvert" +#ifdef DEBUG +#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) +#else +#define DEBUGME 0 +#endif + +#include "AMDILAlgorithms.tpp" +#include "AMDILKernelManager.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILModuleInfo.h" +#include "AMDILTargetMachine.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/GlobalVariable.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Type.h" + +#include <cstdio> + +using namespace llvm; +namespace +{ + class LLVM_LIBRARY_VISIBILITY AMDILPrintfConvert : public FunctionPass + { + public: + TargetMachine &TM; + static char ID; + AMDILPrintfConvert(TargetMachine &tm AMDIL_OPT_LEVEL_DECL); + ~AMDILPrintfConvert(); + const char* getPassName() const; + bool runOnFunction(Function &F); + bool doInitialization(Module &M); + bool doFinalization(Module &M); + void getAnalysisUsage(AnalysisUsage &AU) const; + + private: + bool expandPrintf(BasicBlock::iterator *bbb); + AMDILMachineFunctionInfo *mMFI; + AMDILKernelManager *mKM; + bool mChanged; + SmallVector<int64_t, DEFAULT_VEC_SLOTS> bVecMap; + }; + char AMDILPrintfConvert::ID = 0; +} // anonymouse namespace + +namespace llvm +{ + FunctionPass* + createAMDILPrintfConvert(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + { + return new AMDILPrintfConvert(tm AMDIL_OPT_LEVEL_VAR); + } +} // llvm namespace +AMDILPrintfConvert::AMDILPrintfConvert(TargetMachine &tm AMDIL_OPT_LEVEL_DECL) + : FunctionPass(ID), TM(tm) +{ +} +AMDILPrintfConvert::~AMDILPrintfConvert() +{ +} + bool +AMDILPrintfConvert::expandPrintf(BasicBlock::iterator *bbb) +{ + Instruction *inst = (*bbb); + CallInst *CI = dyn_cast<CallInst>(inst); + if (!CI) { + return false; + } + int num_ops = 
CI->getNumOperands(); + if (!num_ops) { + return false; + } + if (CI->getOperand(num_ops - 1)->getName() != "printf") { + return false; + } + + Function *mF = inst->getParent()->getParent(); + uint64_t bytes = 0; + mChanged = true; + if (num_ops == 1) { + ++(*bbb); + Constant *newConst = ConstantInt::getSigned(CI->getType(), bytes); + CI->replaceAllUsesWith(newConst); + CI->eraseFromParent(); + return mChanged; + } + // Deal with the string here + Value *op = CI->getOperand(0); + ConstantExpr *GEPinst = dyn_cast<ConstantExpr>(op); + if (GEPinst) { + GlobalVariable *GVar + = dyn_cast<GlobalVariable>(GEPinst->getOperand(0)); + std::string str = "unknown"; + if (GVar && GVar->hasInitializer()) { + ConstantDataArray *CA + = dyn_cast<ConstantDataArray>(GVar->getInitializer()); + str = (CA->isString() ? CA->getAsString() : "unknown"); + } + uint64_t id = (uint64_t)mMFI->addPrintfString(str, + getAnalysis<MachineFunctionAnalysis>().getMF() + .getMMI().getObjFileInfo<AMDILModuleInfo>().get_printf_offset()); + std::string name = "___dumpStringID"; + Function *nF = NULL; + std::vector<Type*> types; + types.push_back(Type::getInt32Ty(mF->getContext())); + nF = mF->getParent()->getFunction(name); + if (!nF) { + nF = Function::Create( + FunctionType::get( + Type::getVoidTy(mF->getContext()), types, false), + GlobalValue::ExternalLinkage, + name, mF->getParent()); + } + Constant *C = ConstantInt::get( + Type::getInt32Ty(mF->getContext()), id, false); + CallInst *nCI = CallInst::Create(nF, C); + nCI->insertBefore(CI); + bytes = strlen(str.data()); + for (uint32_t x = 1, y = num_ops - 1; x < y; ++x) { + op = CI->getOperand(x); + Type *oType = op->getType(); + uint32_t eleCount = getNumElements(oType); + uint32_t eleSize = (uint32_t)GET_SCALAR_SIZE(oType); + if (!eleSize) { + // Default size is 32bits. + eleSize = 32; + } + if (!eleCount) { + // Default num elements is 1. 
+ eleCount = 1; + } + uint32_t totalSize = eleCount * eleSize; + mMFI->addPrintfOperand(str, (x - 1), + (uint32_t)totalSize); + } + } + for (uint32_t x = 1, y = num_ops - 1; x < y; ++x) { + op = CI->getOperand(x); + Type *oType = op->getType(); + if (oType->isFPOrFPVectorTy() + && (oType->getTypeID() != Type::VectorTyID)) { + Type *iType = NULL; + if (oType->isFloatTy()) { + iType = dyn_cast<Type>( + Type::getInt32Ty(oType->getContext())); + } else { + iType = dyn_cast<Type>( + Type::getInt64Ty(oType->getContext())); + } + op = new BitCastInst(op, iType, "printfBitCast", CI); + } else if (oType->getTypeID() == Type::VectorTyID) { + Type *iType = NULL; + uint32_t eleCount = getNumElements(oType); + uint32_t eleSize = (uint32_t)GET_SCALAR_SIZE(oType); + uint32_t totalSize = eleCount * eleSize; + switch (eleSize) { + default: + eleCount = totalSize / 64; + iType = dyn_cast<Type>( + Type::getInt64Ty(oType->getContext())); + break; + case 8: + if (eleCount >= 8) { + eleCount = totalSize / 64; + iType = dyn_cast<Type>( + Type::getInt64Ty(oType->getContext())); + } else if (eleCount >= 4) { + eleCount = 1; + iType = dyn_cast<Type>( + Type::getInt32Ty(oType->getContext())); + } else { + eleCount = 1; + iType = dyn_cast<Type>( + Type::getInt16Ty(oType->getContext())); + } + break; + case 16: + if (eleCount >= 4) { + eleCount = totalSize / 64; + iType = dyn_cast<Type>( + Type::getInt64Ty(oType->getContext())); + } else { + eleCount = 1; + iType = dyn_cast<Type>( + Type::getInt32Ty(oType->getContext())); + } + break; + } + if (eleCount > 1) { + iType = dyn_cast<Type>( + VectorType::get(iType, eleCount)); + } + op = new BitCastInst(op, iType, "printfBitCast", CI); + } + char buffer[256]; + uint32_t size = (uint32_t)GET_SCALAR_SIZE(oType); + if (size) { + sprintf(buffer, "___dumpBytes_v%db%u", + 1, + (uint32_t)getNumElements(oType) * (uint32_t)size); + } else { + const PointerType *PT = dyn_cast<PointerType>(oType); + if (PT->getAddressSpace() == 0 && + 
GET_SCALAR_SIZE(PT->getContainedType(0)) == 8 + && getNumElements(PT->getContainedType(0)) == 1) { + op = new BitCastInst(op, + Type::getInt8PtrTy(oType->getContext(), + AMDILAS::CONSTANT_ADDRESS), + "printfPtrCast", CI); + + sprintf(buffer, "___dumpBytes_v%dbs", 1); + } else { + op = new PtrToIntInst(op, + Type::getInt32Ty(oType->getContext()), + "printfPtrCast", CI); + sprintf(buffer, "___dumpBytes_v1b32"); + } + } + std::vector<Type*> types; + types.push_back(op->getType()); + std::string name = buffer; + Function *nF = NULL; + nF = mF->getParent()->getFunction(name); + if (!nF) { + nF = Function::Create( + FunctionType::get( + Type::getVoidTy(mF->getContext()), types, false), + GlobalValue::ExternalLinkage, + name, mF->getParent()); + } + CallInst *nCI = CallInst::Create(nF, op); + nCI->insertBefore(CI); + bytes += (size - 4); + } + ++(*bbb); + Constant *newConst = ConstantInt::getSigned(CI->getType(), bytes); + CI->replaceAllUsesWith(newConst); + CI->eraseFromParent(); + return mChanged; +} + bool +AMDILPrintfConvert::runOnFunction(Function &MF) +{ + mChanged = false; + mKM = TM.getSubtarget<AMDILSubtarget>().getKernelManager(); + mMFI = getAnalysis<MachineFunctionAnalysis>().getMF() + .getInfo<AMDILMachineFunctionInfo>(); + bVecMap.clear(); + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), + std::bind1st( + std::mem_fun( + &AMDILPrintfConvert::expandPrintf), this)); + return mChanged; +} + +const char* +AMDILPrintfConvert::getPassName() const +{ + return "AMDIL Printf Conversion Pass"; +} +bool +AMDILPrintfConvert::doInitialization(Module &M) +{ + return false; +} + +bool +AMDILPrintfConvert::doFinalization(Module &M) +{ + return false; +} + +void +AMDILPrintfConvert::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired<MachineFunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} diff --git a/src/gallium/drivers/radeon/AMDILProfiles.td b/src/gallium/drivers/radeon/AMDILProfiles.td new file mode 100644 
index 00000000000..60435a82b66
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILProfiles.td
@@ -0,0 +1,174 @@
+//===- AMDILProfiles.td - AMD IL Profiles ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+// These are used for custom selection dag type profiles
+//
+// Every profile below has the form
+//   SDTypeProfile<number of results, number of operands, [constraints]>
+// and the indices inside the constraints count the results first, then
+// the operands.
+
+//===----------------------------------------------------------------------===//
+// Custom Selection DAG Type Profiles
+//===----------------------------------------------------------------------===//
+// SDTCisDP - The specified operand has double type
+// Tablegen needs to be hacked to get this constraint to work
+//class SDTCisDP<int OpNum> : SDTypeConstraint<OpNum>;
+
+//===----------------------------------------------------------------------===//
+// Generic Profile Types
+//===----------------------------------------------------------------------===//
+
+// Result and all operands share one type.
+def SDTIL_GenUnaryOp : SDTypeProfile<1, 1, [
+    SDTCisSameAs<0, 1>
+    ]>;
+def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
+    ]>;
+// Index 1 is an integer; the result and indices 2 and 3 share one type.
+def SDTIL_GenCMovLog : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisInt<1>
+    ]>;
+def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
+    SDTCisEltOfVec<1, 0>
+    ]>;
+
+def SDTIL_GenVecExtract : SDTypeProfile<1, 2, [
+    SDTCisEltOfVec<0, 1>, SDTCisVT<2, i32>
+    ]>;
+
+def SDTIL_GenVecInsert : SDTypeProfile<1, 4, [
+    SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>,
+    SDTCisVT<3, i32>, SDTCisVT<4, i32>
+    ]>;
+
+def SDTIL_GenVecShuffle : SDTypeProfile <1, 2, [
+    SDTCisSameAs<0, 1>, SDTCisVT<2, i32>
+    ]>;
+
+def SDTIL_GenVecConcat : SDTypeProfile <1, 2, [
+    SDTCisSameAs<1, 2>
+    ]>;
+//===----------------------------------------------------------------------===//
+// Conversion Profile Types
+//===----------------------------------------------------------------------===//
+def SDTIL_DPToFPOp : SDTypeProfile<1, 1, [
+    SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>
+    ]>; // d2f
+
+def SDTIL_AnyToInt : SDTypeProfile<1, 1, [
+    SDTCisInt<0>
+    ]>;
+def SDTIL_IntToAny : SDTypeProfile<1, 1, [
+    SDTCisInt<1>
+    ]>;
+// Unconstrained one-result, one-operand profile.
+def SDTIL_GenBitConv : SDTypeProfile<1, 1, []>;
+//===----------------------------------------------------------------------===//
+// Scalar Profile Types
+//===----------------------------------------------------------------------===//
+
+// Add instruction pattern to handle offsets of memory operations.
+// The two variants differ in which operand is the pointer (1 or 2).
+def SDTIL_AddAddrri: SDTypeProfile<1, 2, [
+    SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisSameAs<0, 2>
+    ]>;
+def SDTIL_AddAddrir : SDTypeProfile<1, 2, [
+    SDTCisInt<0>, SDTCisPtrTy<2>, SDTCisSameAs<0, 1>
+    ]>;
+
+// *Create profiles: two operands of one type produce a wider result type.
+// *Comp profiles: one wide operand produces an i32 / v2i32 result.
+def SDTIL_LCreate : SDTypeProfile<1, 2, [
+    SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_LCreate2 : SDTypeProfile<1, 2, [
+    SDTCisVT<0, v2i64>, SDTCisVT<1, v2i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_LComp : SDTypeProfile<1, 1, [
+    SDTCisVT<0, i32>, SDTCisVT<1, i64>
+    ]>;
+def SDTIL_LComp2 : SDTypeProfile<1, 1, [
+    SDTCisVT<0, v2i32>, SDTCisVT<1, v2i64>
+    ]>;
+def SDTIL_DCreate : SDTypeProfile<1, 2, [
+    SDTCisVT<0, f64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_DComp : SDTypeProfile<1, 1, [
+    SDTCisVT<0, i32>, SDTCisVT<1, f64>
+    ]>;
+def SDTIL_DCreate2 : SDTypeProfile<1, 2, [
+    SDTCisVT<0, v2f64>, SDTCisVT<1, v2i32>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_DComp2 : SDTypeProfile<1, 1, [
+    SDTCisVT<0, v2i32>, SDTCisVT<1, v2f64>
+    ]>;
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Profile for Normal Call
+def SDTIL_Call : SDTypeProfile<0, 1, [
+    SDTCisVT<0, i32>
+    ]>;
+// Branch instruction where second and third are basic blocks
+// NOTE(review): the profile declares only two operands and constrains
+// operand 0 to OtherVT, so the wording above looks stale -- confirm.
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+// Comparison instruction
+def SDTIL_Cmp : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 2>, SDTCisSameAs<2,3>, SDTCisVT<1, i32>
+    ]>;
+
+
+//===----------------------------------------------------------------------===//
+// Call Sequence Profiles
+//===----------------------------------------------------------------------===//
+def SDTIL_CallSeqStart : SDCallSeqStart< [
+    SDTCisVT<0, i32>
+    ]>;
+def SDTIL_CallSeqEnd : SDCallSeqEnd< [
+    SDTCisVT<0, i32>, SDTCisVT<1, i32>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Image Operation Profiles
+//===----------------------------------------------------------------------===//
+def SDTIL_ImageRead : SDTypeProfile<1, 3,
+    [SDTCisVT<0, v4i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVT<3, v4f32>]>;
+def SDTIL_ImageWrite : SDTypeProfile<0, 3,
+    [SDTCisPtrTy<0>, SDTCisVT<1, v2i32>, SDTCisVT<2, v4i32>]>;
+def SDTIL_ImageWrite3D : SDTypeProfile<0, 3,
+    [SDTCisPtrTy<0>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32>]>;
+def SDTIL_ImageInfo : SDTypeProfile<1, 1,
+    [SDTCisVT<0, v4i32>, SDTCisPtrTy<1>]>;
+//===----------------------------------------------------------------------===//
+// Atomic Operation Profiles
+//===----------------------------------------------------------------------===//
+// The *NoRet variants have no result; the others produce an i32 result.
+def SDTIL_UniAtomNoRet : SDTypeProfile<0, 2, [
+    SDTCisPtrTy<0>, SDTCisVT<1, i32>
+    ]>;
+def SDTIL_BinAtomNoRet : SDTypeProfile<0, 3, [
+    SDTCisPtrTy<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>
+    ]>;
+def SDTIL_TriAtomNoRet : SDTypeProfile<0, 4, [
+    SDTCisPtrTy<0>, SDTCisVT<1, i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>
+    ]>;
+def SDTIL_UniAtom : SDTypeProfile<1, 2, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>
+    ]>;
+def SDTIL_BinAtom : SDTypeProfile<1, 3, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVT<3, i32>
+    ]>;
+def SDTIL_TriAtom : SDTypeProfile<1, 4, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, i32>,
+    SDTCisVT<3, i32>, SDTCisVT<4, i32>
+    ]>;
+
+def SDTIL_BinAtomFloat : SDTypeProfile<1, 3, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, f32>, SDTCisVT<3, f32>
+    ]>;
+def SDTIL_BinAtomNoRetFloat : SDTypeProfile<0, 3, [
+    SDTCisPtrTy<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>
+    ]>;
+
+def SDTIL_Append : SDTypeProfile<1, 1, [
+    SDTCisVT<0, i32>, SDTCisPtrTy<1>
+    ]>;
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
new file mode 100644
index 00000000000..5588233378c
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
@@ -0,0 +1,200 @@
+//===- AMDILRegisterInfo.cpp - AMDIL Register Information -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AMDIL implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDILRegisterInfo.h"
+#include "AMDIL.h"
+#include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+AMDILRegisterInfo::AMDILRegisterInfo(AMDILTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDILGenRegisterInfo(0), // RA???
+ TM(tm), TII(tii) +{ + baseOffset = 0; + nextFuncOffset = 0; +} + +const uint16_t* +AMDILRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const +{ + static const uint16_t CalleeSavedRegs[] = { 0 }; + // TODO: Does IL need to actually have any callee saved regs? + // I don't think we do since we can just use sequential registers + // Maybe this would be easier if every function call was inlined first + // and then there would be no callee issues to deal with + //TODO(getCalleeSavedRegs); + return CalleeSavedRegs; +} + +BitVector +AMDILRegisterInfo::getReservedRegs(const MachineFunction &MF) const +{ + BitVector Reserved(getNumRegs()); + // We reserve the first getNumRegs() registers as they are the ones passed + // in live-in/live-out + // and therefor cannot be killed by the scheduler. This works around a bug + // discovered + // that was causing the linearscan register allocator to kill registers + // inside of the + // function that were also passed as LiveIn registers. + for (unsigned int x = 0, y = 256; x < y; ++x) { + Reserved.set(x); + } + return Reserved; +} + +BitVector +AMDILRegisterInfo::getAllocatableSet(const MachineFunction &MF, + const TargetRegisterClass *RC = NULL) const +{ + BitVector Allocatable(getNumRegs()); + Allocatable.clear(); + return Allocatable; +} + +const TargetRegisterClass* const* +AMDILRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const +{ + static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 }; + // TODO: Keep in sync with getCalleeSavedRegs + //TODO(getCalleeSavedRegClasses); + return CalleeSavedRegClasses; +} +void +AMDILRegisterInfo::eliminateCallFramePseudoInstr( + MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) const +{ + MBB.erase(I); +} + +// For each frame index we find, we store the offset in the stack which is +// being pushed back into the global buffer. 
The offset into the stack where +// the value is stored is copied into a new register and the frame index is +// then replaced with that register. +void +AMDILRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, + int SPAdj, + RegScavenger *RS) const +{ + assert(SPAdj == 0 && "Unexpected"); + MachineInstr &MI = *II; + MachineFunction &MF = *MI.getParent()->getParent(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + unsigned int y = MI.getNumOperands(); + for (unsigned int x = 0; x < y; ++x) { + if (!MI.getOperand(x).isFI()) { + continue; + } + bool def = isStoreInst(TM.getInstrInfo(), &MI); + int FrameIndex = MI.getOperand(x).getIndex(); + int64_t Offset = MFI->getObjectOffset(FrameIndex); + //int64_t Size = MF.getFrameInfo()->getObjectSize(FrameIndex); + // An optimization is to only use the offsets if the size + // is larger than 4, which means we are storing an array + // instead of just a pointer. If we are size 4 then we can + // just do register copies since we don't need to worry about + // indexing dynamically + MachineInstr *nMI = MF.CreateMachineInstr( + TII.get(AMDIL::LOADCONST_i32), MI.getDebugLoc()); + nMI->addOperand(MachineOperand::CreateReg(AMDIL::DFP, true)); + nMI->addOperand( + MachineOperand::CreateImm(Offset)); + MI.getParent()->insert(II, nMI); + nMI = MF.CreateMachineInstr( + TII.get(AMDIL::ADD_i32), MI.getDebugLoc()); + nMI->addOperand(MachineOperand::CreateReg(AMDIL::DFP, true)); + nMI->addOperand(MachineOperand::CreateReg(AMDIL::DFP, false)); + nMI->addOperand(MachineOperand::CreateReg(AMDIL::FP, false)); + + MI.getParent()->insert(II, nMI); + if (MI.getOperand(x).isReg() == false) { + MI.getOperand(x).ChangeToRegister( + nMI->getOperand(0).getReg(), def); + } else { + MI.getOperand(x).setReg( + nMI->getOperand(0).getReg()); + } + } +} + +void +AMDILRegisterInfo::processFunctionBeforeFrameFinalized( + MachineFunction &MF) const +{ + //TODO(processFunctionBeforeFrameFinalized); + // Here we keep track of the amount of stack 
that the current function + // uses so + // that we can set the offset to the end of the stack and any other + // function call + // will not overwrite any stack variables. + // baseOffset = nextFuncOffset; + MachineFrameInfo *MFI = MF.getFrameInfo(); + + for (uint32_t x = 0, y = MFI->getNumObjects(); x < y; ++x) { + int64_t size = MFI->getObjectSize(x); + if (!(size % 4) && size > 1) { + nextFuncOffset += size; + } else { + nextFuncOffset += 16; + } + } +} +unsigned int +AMDILRegisterInfo::getRARegister() const +{ + return AMDIL::RA; +} + +unsigned int +AMDILRegisterInfo::getFrameRegister(const MachineFunction &MF) const +{ + return AMDIL::FP; +} + +unsigned int +AMDILRegisterInfo::getEHExceptionRegister() const +{ + assert(0 && "What is the exception register"); + return 0; +} + +unsigned int +AMDILRegisterInfo::getEHHandlerRegister() const +{ + assert(0 && "What is the exception handler register"); + return 0; +} + +int64_t +AMDILRegisterInfo::getStackSize() const +{ + return nextFuncOffset - baseOffset; +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDILGenRegisterInfo.inc" + diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.h b/src/gallium/drivers/radeon/AMDILRegisterInfo.h new file mode 100644 index 00000000000..5207cd8b466 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILRegisterInfo.h @@ -0,0 +1,91 @@ +//===- AMDILRegisterInfo.h - AMDIL Register Information Impl ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file contains the AMDIL implementation of the TargetRegisterInfo class. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDILREGISTERINFO_H_
+#define AMDILREGISTERINFO_H_
+
+#include "llvm/Target/TargetRegisterInfo.h"
+
+// Selects the header section of the TableGen-generated register info.
+#define GET_REGINFO_HEADER
+#include "AMDILGenRegisterInfo.inc"
+// See header file for explanation
+// NOTE(review): it is not clear which header the line above refers to.
+
+namespace llvm
+{
+
+  class AMDILTargetMachine;
+  class TargetInstrInfo;
+  class Type;
+
+  /// DWARFFlavour - Flavour of dwarf regnumbers
+  ///
+  namespace DWARFFlavour {
+    enum {
+      AMDIL_Generic = 0
+    };
+  }
+
+  /// Register information for the AMDIL target, layered on top of the
+  /// generated AMDILGenRegisterInfo.
+  struct AMDILRegisterInfo : public AMDILGenRegisterInfo
+  {
+    AMDILTargetMachine &TM;
+    const TargetInstrInfo &TII;
+
+    AMDILRegisterInfo(AMDILTargetMachine &tm, const TargetInstrInfo &tii);
+    /// Code Generation virtual methods...
+    const uint16_t * getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+    const TargetRegisterClass* const*
+      getCalleeSavedRegClasses(
+          const MachineFunction *MF = 0) const;
+
+    BitVector
+      getReservedRegs(const MachineFunction &MF) const;
+    BitVector
+      getAllocatableSet(const MachineFunction &MF,
+          const TargetRegisterClass *RC) const;
+
+    void
+      eliminateCallFramePseudoInstr(
+          MachineFunction &MF,
+          MachineBasicBlock &MBB,
+          MachineBasicBlock::iterator I) const;
+    void
+      eliminateFrameIndex(MachineBasicBlock::iterator II,
+          int SPAdj, RegScavenger *RS = NULL) const;
+
+    void
+      processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+    // Debug information queries.
+    unsigned int
+      getRARegister() const;
+
+    unsigned int
+      getFrameRegister(const MachineFunction &MF) const;
+
+    // Exception handling queries.
+    unsigned int
+      getEHExceptionRegister() const;
+    unsigned int
+      getEHHandlerRegister() const;
+
+    /// Difference between nextFuncOffset and baseOffset.
+    int64_t
+      getStackSize() const;
+    private:
+    // Declared mutable so that const member functions can update them.
+    mutable int64_t baseOffset;
+    mutable int64_t nextFuncOffset;
+  };
+
+} // end namespace llvm
+
+#endif // AMDILREGISTERINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.td b/src/gallium/drivers/radeon/AMDILRegisterInfo.td
new file mode 100644
index 00000000000..17f4b3b46a1
--- /dev/null
+++ b/src/gallium/drivers/radeon/AMDILRegisterInfo.td
@@ -0,0 +1,964 @@
+//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Declarations that describe the AMDIL register file
+//
+//===----------------------------------------------------------------------===//
+
+// Base class for every AMDIL register: `num` is stored in the 16-bit
+// Value field, `n` is the register name, and the register is placed in
+// the "AMDIL" namespace.
+class AMDILReg<bits<16> num, string n> : Register<n> {
+  field bits<16> Value;
+  let Value = num;
+  let Namespace = "AMDIL";
+}
+
+// We will start with 8 registers for each class before expanding to more
+// Since the swizzle is added based on the register class, we can leave it
+// off here and just specify different registers for different register classes
+// NOTE(review): the list below defines far more than 8 registers, so the
+// first sentence above looks stale.
+def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
+def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 :
AMDILReg<14, "r14">, DwarfRegNum<[14]>; +def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>; +def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>; +def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>; +def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>; +def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>; +def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>; +def R21 : AMDILReg<21, "r21">, DwarfRegNum<[21]>; +def R22 : AMDILReg<22, "r22">, DwarfRegNum<[22]>; +def R23 : AMDILReg<23, "r23">, DwarfRegNum<[23]>; +def R24 : AMDILReg<24, "r24">, DwarfRegNum<[24]>; +def R25 : AMDILReg<25, "r25">, DwarfRegNum<[25]>; +def R26 : AMDILReg<26, "r26">, DwarfRegNum<[26]>; +def R27 : AMDILReg<27, "r27">, DwarfRegNum<[27]>; +def R28 : AMDILReg<28, "r28">, DwarfRegNum<[28]>; +def R29 : AMDILReg<29, "r29">, DwarfRegNum<[29]>; +def R30 : AMDILReg<30, "r30">, DwarfRegNum<[30]>; +def R31 : AMDILReg<31, "r31">, DwarfRegNum<[31]>; +def R32 : AMDILReg<32, "r32">, DwarfRegNum<[32]>; +def R33 : AMDILReg<33, "r33">, DwarfRegNum<[33]>; +def R34 : AMDILReg<34, "r34">, DwarfRegNum<[34]>; +def R35 : AMDILReg<35, "r35">, DwarfRegNum<[35]>; +def R36 : AMDILReg<36, "r36">, DwarfRegNum<[36]>; +def R37 : AMDILReg<37, "r37">, DwarfRegNum<[37]>; +def R38 : AMDILReg<38, "r38">, DwarfRegNum<[38]>; +def R39 : AMDILReg<39, "r39">, DwarfRegNum<[39]>; +def R40 : AMDILReg<40, "r40">, DwarfRegNum<[40]>; +def R41 : AMDILReg<41, "r41">, DwarfRegNum<[41]>; +def R42 : AMDILReg<42, "r42">, DwarfRegNum<[42]>; +def R43 : AMDILReg<43, "r43">, DwarfRegNum<[43]>; +def R44 : AMDILReg<44, "r44">, DwarfRegNum<[44]>; +def R45 : AMDILReg<45, "r45">, DwarfRegNum<[45]>; +def R46 : AMDILReg<46, "r46">, DwarfRegNum<[46]>; +def R47 : AMDILReg<47, "r47">, DwarfRegNum<[47]>; +def R48 : AMDILReg<48, "r48">, DwarfRegNum<[48]>; +def R49 : AMDILReg<49, "r49">, DwarfRegNum<[49]>; +def R50 : AMDILReg<50, "r50">, DwarfRegNum<[50]>; +def R51 : AMDILReg<51, "r51">, DwarfRegNum<[51]>; +def R52 : AMDILReg<52, "r52">, DwarfRegNum<[52]>; +def R53 : 
AMDILReg<53, "r53">, DwarfRegNum<[53]>; +def R54 : AMDILReg<54, "r54">, DwarfRegNum<[54]>; +def R55 : AMDILReg<55, "r55">, DwarfRegNum<[55]>; +def R56 : AMDILReg<56, "r56">, DwarfRegNum<[56]>; +def R57 : AMDILReg<57, "r57">, DwarfRegNum<[57]>; +def R58 : AMDILReg<58, "r58">, DwarfRegNum<[58]>; +def R59 : AMDILReg<59, "r59">, DwarfRegNum<[59]>; +def R60 : AMDILReg<60, "r60">, DwarfRegNum<[60]>; +def R61 : AMDILReg<61, "r61">, DwarfRegNum<[61]>; +def R62 : AMDILReg<62, "r62">, DwarfRegNum<[62]>; +def R63 : AMDILReg<63, "r63">, DwarfRegNum<[63]>; +def R64 : AMDILReg<64, "r64">, DwarfRegNum<[64]>; +def R65 : AMDILReg<65, "r65">, DwarfRegNum<[65]>; +def R66 : AMDILReg<66, "r66">, DwarfRegNum<[66]>; +def R67 : AMDILReg<67, "r67">, DwarfRegNum<[67]>; +def R68 : AMDILReg<68, "r68">, DwarfRegNum<[68]>; +def R69 : AMDILReg<69, "r69">, DwarfRegNum<[69]>; +def R70 : AMDILReg<70, "r70">, DwarfRegNum<[70]>; +def R71 : AMDILReg<71, "r71">, DwarfRegNum<[71]>; +def R72 : AMDILReg<72, "r72">, DwarfRegNum<[72]>; +def R73 : AMDILReg<73, "r73">, DwarfRegNum<[73]>; +def R74 : AMDILReg<74, "r74">, DwarfRegNum<[74]>; +def R75 : AMDILReg<75, "r75">, DwarfRegNum<[75]>; +def R76 : AMDILReg<76, "r76">, DwarfRegNum<[76]>; +def R77 : AMDILReg<77, "r77">, DwarfRegNum<[77]>; +def R78 : AMDILReg<78, "r78">, DwarfRegNum<[78]>; +def R79 : AMDILReg<79, "r79">, DwarfRegNum<[79]>; +def R80 : AMDILReg<80, "r80">, DwarfRegNum<[80]>; +def R81 : AMDILReg<81, "r81">, DwarfRegNum<[81]>; +def R82 : AMDILReg<82, "r82">, DwarfRegNum<[82]>; +def R83 : AMDILReg<83, "r83">, DwarfRegNum<[83]>; +def R84 : AMDILReg<84, "r84">, DwarfRegNum<[84]>; +def R85 : AMDILReg<85, "r85">, DwarfRegNum<[85]>; +def R86 : AMDILReg<86, "r86">, DwarfRegNum<[86]>; +def R87 : AMDILReg<87, "r87">, DwarfRegNum<[87]>; +def R88 : AMDILReg<88, "r88">, DwarfRegNum<[88]>; +def R89 : AMDILReg<89, "r89">, DwarfRegNum<[89]>; +def R90 : AMDILReg<90, "r90">, DwarfRegNum<[90]>; +def R91 : AMDILReg<91, "r91">, DwarfRegNum<[91]>; +def R92 : 
AMDILReg<92, "r92">, DwarfRegNum<[92]>; +def R93 : AMDILReg<93, "r93">, DwarfRegNum<[93]>; +def R94 : AMDILReg<94, "r94">, DwarfRegNum<[94]>; +def R95 : AMDILReg<95, "r95">, DwarfRegNum<[95]>; +def R96 : AMDILReg<96, "r96">, DwarfRegNum<[96]>; +def R97 : AMDILReg<97, "r97">, DwarfRegNum<[97]>; +def R98 : AMDILReg<98, "r98">, DwarfRegNum<[98]>; +def R99 : AMDILReg<99, "r99">, DwarfRegNum<[99]>; +def R100 : AMDILReg<100, "r100">, DwarfRegNum<[100]>; +def R101 : AMDILReg<101, "r101">, DwarfRegNum<[101]>; +def R102 : AMDILReg<102, "r102">, DwarfRegNum<[102]>; +def R103 : AMDILReg<103, "r103">, DwarfRegNum<[103]>; +def R104 : AMDILReg<104, "r104">, DwarfRegNum<[104]>; +def R105 : AMDILReg<105, "r105">, DwarfRegNum<[105]>; +def R106 : AMDILReg<106, "r106">, DwarfRegNum<[106]>; +def R107 : AMDILReg<107, "r107">, DwarfRegNum<[107]>; +def R108 : AMDILReg<108, "r108">, DwarfRegNum<[108]>; +def R109 : AMDILReg<109, "r109">, DwarfRegNum<[109]>; +def R110 : AMDILReg<110, "r110">, DwarfRegNum<[110]>; +def R111 : AMDILReg<111, "r111">, DwarfRegNum<[111]>; +def R112 : AMDILReg<112, "r112">, DwarfRegNum<[112]>; +def R113 : AMDILReg<113, "r113">, DwarfRegNum<[113]>; +def R114 : AMDILReg<114, "r114">, DwarfRegNum<[114]>; +def R115 : AMDILReg<115, "r115">, DwarfRegNum<[115]>; +def R116 : AMDILReg<116, "r116">, DwarfRegNum<[116]>; +def R117 : AMDILReg<117, "r117">, DwarfRegNum<[117]>; +def R118 : AMDILReg<118, "r118">, DwarfRegNum<[118]>; +def R119 : AMDILReg<119, "r119">, DwarfRegNum<[119]>; +def R120 : AMDILReg<120, "r120">, DwarfRegNum<[120]>; +def R121 : AMDILReg<121, "r121">, DwarfRegNum<[121]>; +def R122 : AMDILReg<122, "r122">, DwarfRegNum<[122]>; +def R123 : AMDILReg<123, "r123">, DwarfRegNum<[123]>; +def R124 : AMDILReg<124, "r124">, DwarfRegNum<[124]>; +def R125 : AMDILReg<125, "r125">, DwarfRegNum<[125]>; +def R126 : AMDILReg<126, "r126">, DwarfRegNum<[126]>; +def R127 : AMDILReg<127, "r127">, DwarfRegNum<[127]>; +def R128 : AMDILReg<128, "r128">, DwarfRegNum<[128]>; +def 
R129 : AMDILReg<129, "r129">, DwarfRegNum<[129]>; +def R130 : AMDILReg<130, "r130">, DwarfRegNum<[130]>; +def R131 : AMDILReg<131, "r131">, DwarfRegNum<[131]>; +def R132 : AMDILReg<132, "r132">, DwarfRegNum<[132]>; +def R133 : AMDILReg<133, "r133">, DwarfRegNum<[133]>; +def R134 : AMDILReg<134, "r134">, DwarfRegNum<[134]>; +def R135 : AMDILReg<135, "r135">, DwarfRegNum<[135]>; +def R136 : AMDILReg<136, "r136">, DwarfRegNum<[136]>; +def R137 : AMDILReg<137, "r137">, DwarfRegNum<[137]>; +def R138 : AMDILReg<138, "r138">, DwarfRegNum<[138]>; +def R139 : AMDILReg<139, "r139">, DwarfRegNum<[139]>; +def R140 : AMDILReg<140, "r140">, DwarfRegNum<[140]>; +def R141 : AMDILReg<141, "r141">, DwarfRegNum<[141]>; +def R142 : AMDILReg<142, "r142">, DwarfRegNum<[142]>; +def R143 : AMDILReg<143, "r143">, DwarfRegNum<[143]>; +def R144 : AMDILReg<144, "r144">, DwarfRegNum<[144]>; +def R145 : AMDILReg<145, "r145">, DwarfRegNum<[145]>; +def R146 : AMDILReg<146, "r146">, DwarfRegNum<[146]>; +def R147 : AMDILReg<147, "r147">, DwarfRegNum<[147]>; +def R148 : AMDILReg<148, "r148">, DwarfRegNum<[148]>; +def R149 : AMDILReg<149, "r149">, DwarfRegNum<[149]>; +def R150 : AMDILReg<150, "r150">, DwarfRegNum<[150]>; +def R151 : AMDILReg<151, "r151">, DwarfRegNum<[151]>; +def R152 : AMDILReg<152, "r152">, DwarfRegNum<[152]>; +def R153 : AMDILReg<153, "r153">, DwarfRegNum<[153]>; +def R154 : AMDILReg<154, "r154">, DwarfRegNum<[154]>; +def R155 : AMDILReg<155, "r155">, DwarfRegNum<[155]>; +def R156 : AMDILReg<156, "r156">, DwarfRegNum<[156]>; +def R157 : AMDILReg<157, "r157">, DwarfRegNum<[157]>; +def R158 : AMDILReg<158, "r158">, DwarfRegNum<[158]>; +def R159 : AMDILReg<159, "r159">, DwarfRegNum<[159]>; +def R160 : AMDILReg<160, "r160">, DwarfRegNum<[160]>; +def R161 : AMDILReg<161, "r161">, DwarfRegNum<[161]>; +def R162 : AMDILReg<162, "r162">, DwarfRegNum<[162]>; +def R163 : AMDILReg<163, "r163">, DwarfRegNum<[163]>; +def R164 : AMDILReg<164, "r164">, DwarfRegNum<[164]>; +def R165 : 
AMDILReg<165, "r165">, DwarfRegNum<[165]>; +def R166 : AMDILReg<166, "r166">, DwarfRegNum<[166]>; +def R167 : AMDILReg<167, "r167">, DwarfRegNum<[167]>; +def R168 : AMDILReg<168, "r168">, DwarfRegNum<[168]>; +def R169 : AMDILReg<169, "r169">, DwarfRegNum<[169]>; +def R170 : AMDILReg<170, "r170">, DwarfRegNum<[170]>; +def R171 : AMDILReg<171, "r171">, DwarfRegNum<[171]>; +def R172 : AMDILReg<172, "r172">, DwarfRegNum<[172]>; +def R173 : AMDILReg<173, "r173">, DwarfRegNum<[173]>; +def R174 : AMDILReg<174, "r174">, DwarfRegNum<[174]>; +def R175 : AMDILReg<175, "r175">, DwarfRegNum<[175]>; +def R176 : AMDILReg<176, "r176">, DwarfRegNum<[176]>; +def R177 : AMDILReg<177, "r177">, DwarfRegNum<[177]>; +def R178 : AMDILReg<178, "r178">, DwarfRegNum<[178]>; +def R179 : AMDILReg<179, "r179">, DwarfRegNum<[179]>; +def R180 : AMDILReg<180, "r180">, DwarfRegNum<[180]>; +def R181 : AMDILReg<181, "r181">, DwarfRegNum<[181]>; +def R182 : AMDILReg<182, "r182">, DwarfRegNum<[182]>; +def R183 : AMDILReg<183, "r183">, DwarfRegNum<[183]>; +def R184 : AMDILReg<184, "r184">, DwarfRegNum<[184]>; +def R185 : AMDILReg<185, "r185">, DwarfRegNum<[185]>; +def R186 : AMDILReg<186, "r186">, DwarfRegNum<[186]>; +def R187 : AMDILReg<187, "r187">, DwarfRegNum<[187]>; +def R188 : AMDILReg<188, "r188">, DwarfRegNum<[188]>; +def R189 : AMDILReg<189, "r189">, DwarfRegNum<[189]>; +def R190 : AMDILReg<190, "r190">, DwarfRegNum<[190]>; +def R191 : AMDILReg<191, "r191">, DwarfRegNum<[191]>; +def R192 : AMDILReg<192, "r192">, DwarfRegNum<[192]>; +def R193 : AMDILReg<193, "r193">, DwarfRegNum<[193]>; +def R194 : AMDILReg<194, "r194">, DwarfRegNum<[194]>; +def R195 : AMDILReg<195, "r195">, DwarfRegNum<[195]>; +def R196 : AMDILReg<196, "r196">, DwarfRegNum<[196]>; +def R197 : AMDILReg<197, "r197">, DwarfRegNum<[197]>; +def R198 : AMDILReg<198, "r198">, DwarfRegNum<[198]>; +def R199 : AMDILReg<199, "r199">, DwarfRegNum<[199]>; +def R200 : AMDILReg<200, "r200">, DwarfRegNum<[200]>; +def R201 : AMDILReg<201, 
"r201">, DwarfRegNum<[201]>; +def R202 : AMDILReg<202, "r202">, DwarfRegNum<[202]>; +def R203 : AMDILReg<203, "r203">, DwarfRegNum<[203]>; +def R204 : AMDILReg<204, "r204">, DwarfRegNum<[204]>; +def R205 : AMDILReg<205, "r205">, DwarfRegNum<[205]>; +def R206 : AMDILReg<206, "r206">, DwarfRegNum<[206]>; +def R207 : AMDILReg<207, "r207">, DwarfRegNum<[207]>; +def R208 : AMDILReg<208, "r208">, DwarfRegNum<[208]>; +def R209 : AMDILReg<209, "r209">, DwarfRegNum<[209]>; +def R210 : AMDILReg<210, "r210">, DwarfRegNum<[210]>; +def R211 : AMDILReg<211, "r211">, DwarfRegNum<[211]>; +def R212 : AMDILReg<212, "r212">, DwarfRegNum<[212]>; +def R213 : AMDILReg<213, "r213">, DwarfRegNum<[213]>; +def R214 : AMDILReg<214, "r214">, DwarfRegNum<[214]>; +def R215 : AMDILReg<215, "r215">, DwarfRegNum<[215]>; +def R216 : AMDILReg<216, "r216">, DwarfRegNum<[216]>; +def R217 : AMDILReg<217, "r217">, DwarfRegNum<[217]>; +def R218 : AMDILReg<218, "r218">, DwarfRegNum<[218]>; +def R219 : AMDILReg<219, "r219">, DwarfRegNum<[219]>; +def R220 : AMDILReg<220, "r220">, DwarfRegNum<[220]>; +def R221 : AMDILReg<221, "r221">, DwarfRegNum<[221]>; +def R222 : AMDILReg<222, "r222">, DwarfRegNum<[222]>; +def R223 : AMDILReg<223, "r223">, DwarfRegNum<[223]>; +def R224 : AMDILReg<224, "r224">, DwarfRegNum<[224]>; +def R225 : AMDILReg<225, "r225">, DwarfRegNum<[225]>; +def R226 : AMDILReg<226, "r226">, DwarfRegNum<[226]>; +def R227 : AMDILReg<227, "r227">, DwarfRegNum<[227]>; +def R228 : AMDILReg<228, "r228">, DwarfRegNum<[228]>; +def R229 : AMDILReg<229, "r229">, DwarfRegNum<[229]>; +def R230 : AMDILReg<230, "r230">, DwarfRegNum<[230]>; +def R231 : AMDILReg<231, "r231">, DwarfRegNum<[231]>; +def R232 : AMDILReg<232, "r232">, DwarfRegNum<[232]>; +def R233 : AMDILReg<233, "r233">, DwarfRegNum<[233]>; +def R234 : AMDILReg<234, "r234">, DwarfRegNum<[234]>; +def R235 : AMDILReg<235, "r235">, DwarfRegNum<[235]>; +def R236 : AMDILReg<236, "r236">, DwarfRegNum<[236]>; +def R237 : AMDILReg<237, "r237">, 
DwarfRegNum<[237]>; +def R238 : AMDILReg<238, "r238">, DwarfRegNum<[238]>; +def R239 : AMDILReg<239, "r239">, DwarfRegNum<[239]>; +def R240 : AMDILReg<240, "r240">, DwarfRegNum<[240]>; +def R241 : AMDILReg<241, "r241">, DwarfRegNum<[241]>; +def R242 : AMDILReg<242, "r242">, DwarfRegNum<[242]>; +def R243 : AMDILReg<243, "r243">, DwarfRegNum<[243]>; +def R244 : AMDILReg<244, "r244">, DwarfRegNum<[244]>; +def R245 : AMDILReg<245, "r245">, DwarfRegNum<[245]>; +def R246 : AMDILReg<246, "r246">, DwarfRegNum<[246]>; +def R247 : AMDILReg<247, "r247">, DwarfRegNum<[247]>; +def R248 : AMDILReg<248, "r248">, DwarfRegNum<[248]>; +def R249 : AMDILReg<249, "r249">, DwarfRegNum<[249]>; +def R250 : AMDILReg<250, "r250">, DwarfRegNum<[250]>; +def R251 : AMDILReg<251, "r251">, DwarfRegNum<[251]>; +def R252 : AMDILReg<252, "r252">, DwarfRegNum<[252]>; +def R253 : AMDILReg<253, "r253">, DwarfRegNum<[253]>; +def R254 : AMDILReg<254, "r254">, DwarfRegNum<[254]>; +def R255 : AMDILReg<255, "r255">, DwarfRegNum<[255]>; +def R256 : AMDILReg<256, "r256">, DwarfRegNum<[256]>; +def R257 : AMDILReg<257, "r257">, DwarfRegNum<[257]>; +def R258 : AMDILReg<258, "r258">, DwarfRegNum<[258]>; +def R259 : AMDILReg<259, "r259">, DwarfRegNum<[259]>; +def R260 : AMDILReg<260, "r260">, DwarfRegNum<[260]>; +def R261 : AMDILReg<261, "r261">, DwarfRegNum<[261]>; +def R262 : AMDILReg<262, "r262">, DwarfRegNum<[262]>; +def R263 : AMDILReg<263, "r263">, DwarfRegNum<[263]>; +def R264 : AMDILReg<264, "r264">, DwarfRegNum<[264]>; +def R265 : AMDILReg<265, "r265">, DwarfRegNum<[265]>; +def R266 : AMDILReg<266, "r266">, DwarfRegNum<[266]>; +def R267 : AMDILReg<267, "r267">, DwarfRegNum<[267]>; +def R268 : AMDILReg<268, "r268">, DwarfRegNum<[268]>; +def R269 : AMDILReg<269, "r269">, DwarfRegNum<[269]>; +def R270 : AMDILReg<270, "r270">, DwarfRegNum<[270]>; +def R271 : AMDILReg<271, "r271">, DwarfRegNum<[271]>; +def R272 : AMDILReg<272, "r272">, DwarfRegNum<[272]>; +def R273 : AMDILReg<273, "r273">, DwarfRegNum<[273]>; 
+def R274 : AMDILReg<274, "r274">, DwarfRegNum<[274]>; +def R275 : AMDILReg<275, "r275">, DwarfRegNum<[275]>; +def R276 : AMDILReg<276, "r276">, DwarfRegNum<[276]>; +def R277 : AMDILReg<277, "r277">, DwarfRegNum<[277]>; +def R278 : AMDILReg<278, "r278">, DwarfRegNum<[278]>; +def R279 : AMDILReg<279, "r279">, DwarfRegNum<[279]>; +def R280 : AMDILReg<280, "r280">, DwarfRegNum<[280]>; +def R281 : AMDILReg<281, "r281">, DwarfRegNum<[281]>; +def R282 : AMDILReg<282, "r282">, DwarfRegNum<[282]>; +def R283 : AMDILReg<283, "r283">, DwarfRegNum<[283]>; +def R284 : AMDILReg<284, "r284">, DwarfRegNum<[284]>; +def R285 : AMDILReg<285, "r285">, DwarfRegNum<[285]>; +def R286 : AMDILReg<286, "r286">, DwarfRegNum<[286]>; +def R287 : AMDILReg<287, "r287">, DwarfRegNum<[287]>; +def R288 : AMDILReg<288, "r288">, DwarfRegNum<[288]>; +def R289 : AMDILReg<289, "r289">, DwarfRegNum<[289]>; +def R290 : AMDILReg<290, "r290">, DwarfRegNum<[290]>; +def R291 : AMDILReg<291, "r291">, DwarfRegNum<[291]>; +def R292 : AMDILReg<292, "r292">, DwarfRegNum<[292]>; +def R293 : AMDILReg<293, "r293">, DwarfRegNum<[293]>; +def R294 : AMDILReg<294, "r294">, DwarfRegNum<[294]>; +def R295 : AMDILReg<295, "r295">, DwarfRegNum<[295]>; +def R296 : AMDILReg<296, "r296">, DwarfRegNum<[296]>; +def R297 : AMDILReg<297, "r297">, DwarfRegNum<[297]>; +def R298 : AMDILReg<298, "r298">, DwarfRegNum<[298]>; +def R299 : AMDILReg<299, "r299">, DwarfRegNum<[299]>; +def R300 : AMDILReg<300, "r300">, DwarfRegNum<[300]>; +def R301 : AMDILReg<301, "r301">, DwarfRegNum<[301]>; +def R302 : AMDILReg<302, "r302">, DwarfRegNum<[302]>; +def R303 : AMDILReg<303, "r303">, DwarfRegNum<[303]>; +def R304 : AMDILReg<304, "r304">, DwarfRegNum<[304]>; +def R305 : AMDILReg<305, "r305">, DwarfRegNum<[305]>; +def R306 : AMDILReg<306, "r306">, DwarfRegNum<[306]>; +def R307 : AMDILReg<307, "r307">, DwarfRegNum<[307]>; +def R308 : AMDILReg<308, "r308">, DwarfRegNum<[308]>; +def R309 : AMDILReg<309, "r309">, DwarfRegNum<[309]>; +def R310 : 
AMDILReg<310, "r310">, DwarfRegNum<[310]>; +def R311 : AMDILReg<311, "r311">, DwarfRegNum<[311]>; +def R312 : AMDILReg<312, "r312">, DwarfRegNum<[312]>; +def R313 : AMDILReg<313, "r313">, DwarfRegNum<[313]>; +def R314 : AMDILReg<314, "r314">, DwarfRegNum<[314]>; +def R315 : AMDILReg<315, "r315">, DwarfRegNum<[315]>; +def R316 : AMDILReg<316, "r316">, DwarfRegNum<[316]>; +def R317 : AMDILReg<317, "r317">, DwarfRegNum<[317]>; +def R318 : AMDILReg<318, "r318">, DwarfRegNum<[318]>; +def R319 : AMDILReg<319, "r319">, DwarfRegNum<[319]>; +def R320 : AMDILReg<320, "r320">, DwarfRegNum<[320]>; +def R321 : AMDILReg<321, "r321">, DwarfRegNum<[321]>; +def R322 : AMDILReg<322, "r322">, DwarfRegNum<[322]>; +def R323 : AMDILReg<323, "r323">, DwarfRegNum<[323]>; +def R324 : AMDILReg<324, "r324">, DwarfRegNum<[324]>; +def R325 : AMDILReg<325, "r325">, DwarfRegNum<[325]>; +def R326 : AMDILReg<326, "r326">, DwarfRegNum<[326]>; +def R327 : AMDILReg<327, "r327">, DwarfRegNum<[327]>; +def R328 : AMDILReg<328, "r328">, DwarfRegNum<[328]>; +def R329 : AMDILReg<329, "r329">, DwarfRegNum<[329]>; +def R330 : AMDILReg<330, "r330">, DwarfRegNum<[330]>; +def R331 : AMDILReg<331, "r331">, DwarfRegNum<[331]>; +def R332 : AMDILReg<332, "r332">, DwarfRegNum<[332]>; +def R333 : AMDILReg<333, "r333">, DwarfRegNum<[333]>; +def R334 : AMDILReg<334, "r334">, DwarfRegNum<[334]>; +def R335 : AMDILReg<335, "r335">, DwarfRegNum<[335]>; +def R336 : AMDILReg<336, "r336">, DwarfRegNum<[336]>; +def R337 : AMDILReg<337, "r337">, DwarfRegNum<[337]>; +def R338 : AMDILReg<338, "r338">, DwarfRegNum<[338]>; +def R339 : AMDILReg<339, "r339">, DwarfRegNum<[339]>; +def R340 : AMDILReg<340, "r340">, DwarfRegNum<[340]>; +def R341 : AMDILReg<341, "r341">, DwarfRegNum<[341]>; +def R342 : AMDILReg<342, "r342">, DwarfRegNum<[342]>; +def R343 : AMDILReg<343, "r343">, DwarfRegNum<[343]>; +def R344 : AMDILReg<344, "r344">, DwarfRegNum<[344]>; +def R345 : AMDILReg<345, "r345">, DwarfRegNum<[345]>; +def R346 : AMDILReg<346, 
"r346">, DwarfRegNum<[346]>; +def R347 : AMDILReg<347, "r347">, DwarfRegNum<[347]>; +def R348 : AMDILReg<348, "r348">, DwarfRegNum<[348]>; +def R349 : AMDILReg<349, "r349">, DwarfRegNum<[349]>; +def R350 : AMDILReg<350, "r350">, DwarfRegNum<[350]>; +def R351 : AMDILReg<351, "r351">, DwarfRegNum<[351]>; +def R352 : AMDILReg<352, "r352">, DwarfRegNum<[352]>; +def R353 : AMDILReg<353, "r353">, DwarfRegNum<[353]>; +def R354 : AMDILReg<354, "r354">, DwarfRegNum<[354]>; +def R355 : AMDILReg<355, "r355">, DwarfRegNum<[355]>; +def R356 : AMDILReg<356, "r356">, DwarfRegNum<[356]>; +def R357 : AMDILReg<357, "r357">, DwarfRegNum<[357]>; +def R358 : AMDILReg<358, "r358">, DwarfRegNum<[358]>; +def R359 : AMDILReg<359, "r359">, DwarfRegNum<[359]>; +def R360 : AMDILReg<360, "r360">, DwarfRegNum<[360]>; +def R361 : AMDILReg<361, "r361">, DwarfRegNum<[361]>; +def R362 : AMDILReg<362, "r362">, DwarfRegNum<[362]>; +def R363 : AMDILReg<363, "r363">, DwarfRegNum<[363]>; +def R364 : AMDILReg<364, "r364">, DwarfRegNum<[364]>; +def R365 : AMDILReg<365, "r365">, DwarfRegNum<[365]>; +def R366 : AMDILReg<366, "r366">, DwarfRegNum<[366]>; +def R367 : AMDILReg<367, "r367">, DwarfRegNum<[367]>; +def R368 : AMDILReg<368, "r368">, DwarfRegNum<[368]>; +def R369 : AMDILReg<369, "r369">, DwarfRegNum<[369]>; +def R370 : AMDILReg<370, "r370">, DwarfRegNum<[370]>; +def R371 : AMDILReg<371, "r371">, DwarfRegNum<[371]>; +def R372 : AMDILReg<372, "r372">, DwarfRegNum<[372]>; +def R373 : AMDILReg<373, "r373">, DwarfRegNum<[373]>; +def R374 : AMDILReg<374, "r374">, DwarfRegNum<[374]>; +def R375 : AMDILReg<375, "r375">, DwarfRegNum<[375]>; +def R376 : AMDILReg<376, "r376">, DwarfRegNum<[376]>; +def R377 : AMDILReg<377, "r377">, DwarfRegNum<[377]>; +def R378 : AMDILReg<378, "r378">, DwarfRegNum<[378]>; +def R379 : AMDILReg<379, "r379">, DwarfRegNum<[379]>; +def R380 : AMDILReg<380, "r380">, DwarfRegNum<[380]>; +def R381 : AMDILReg<381, "r381">, DwarfRegNum<[381]>; +def R382 : AMDILReg<382, "r382">, 
DwarfRegNum<[382]>; +def R383 : AMDILReg<383, "r383">, DwarfRegNum<[383]>; +def R384 : AMDILReg<384, "r384">, DwarfRegNum<[384]>; +def R385 : AMDILReg<385, "r385">, DwarfRegNum<[385]>; +def R386 : AMDILReg<386, "r386">, DwarfRegNum<[386]>; +def R387 : AMDILReg<387, "r387">, DwarfRegNum<[387]>; +def R388 : AMDILReg<388, "r388">, DwarfRegNum<[388]>; +def R389 : AMDILReg<389, "r389">, DwarfRegNum<[389]>; +def R390 : AMDILReg<390, "r390">, DwarfRegNum<[390]>; +def R391 : AMDILReg<391, "r391">, DwarfRegNum<[391]>; +def R392 : AMDILReg<392, "r392">, DwarfRegNum<[392]>; +def R393 : AMDILReg<393, "r393">, DwarfRegNum<[393]>; +def R394 : AMDILReg<394, "r394">, DwarfRegNum<[394]>; +def R395 : AMDILReg<395, "r395">, DwarfRegNum<[395]>; +def R396 : AMDILReg<396, "r396">, DwarfRegNum<[396]>; +def R397 : AMDILReg<397, "r397">, DwarfRegNum<[397]>; +def R398 : AMDILReg<398, "r398">, DwarfRegNum<[398]>; +def R399 : AMDILReg<399, "r399">, DwarfRegNum<[399]>; +def R400 : AMDILReg<400, "r400">, DwarfRegNum<[400]>; +def R401 : AMDILReg<401, "r401">, DwarfRegNum<[401]>; +def R402 : AMDILReg<402, "r402">, DwarfRegNum<[402]>; +def R403 : AMDILReg<403, "r403">, DwarfRegNum<[403]>; +def R404 : AMDILReg<404, "r404">, DwarfRegNum<[404]>; +def R405 : AMDILReg<405, "r405">, DwarfRegNum<[405]>; +def R406 : AMDILReg<406, "r406">, DwarfRegNum<[406]>; +def R407 : AMDILReg<407, "r407">, DwarfRegNum<[407]>; +def R408 : AMDILReg<408, "r408">, DwarfRegNum<[408]>; +def R409 : AMDILReg<409, "r409">, DwarfRegNum<[409]>; +def R410 : AMDILReg<410, "r410">, DwarfRegNum<[410]>; +def R411 : AMDILReg<411, "r411">, DwarfRegNum<[411]>; +def R412 : AMDILReg<412, "r412">, DwarfRegNum<[412]>; +def R413 : AMDILReg<413, "r413">, DwarfRegNum<[413]>; +def R414 : AMDILReg<414, "r414">, DwarfRegNum<[414]>; +def R415 : AMDILReg<415, "r415">, DwarfRegNum<[415]>; +def R416 : AMDILReg<416, "r416">, DwarfRegNum<[416]>; +def R417 : AMDILReg<417, "r417">, DwarfRegNum<[417]>; +def R418 : AMDILReg<418, "r418">, DwarfRegNum<[418]>; 
+def R419 : AMDILReg<419, "r419">, DwarfRegNum<[419]>; +def R420 : AMDILReg<420, "r420">, DwarfRegNum<[420]>; +def R421 : AMDILReg<421, "r421">, DwarfRegNum<[421]>; +def R422 : AMDILReg<422, "r422">, DwarfRegNum<[422]>; +def R423 : AMDILReg<423, "r423">, DwarfRegNum<[423]>; +def R424 : AMDILReg<424, "r424">, DwarfRegNum<[424]>; +def R425 : AMDILReg<425, "r425">, DwarfRegNum<[425]>; +def R426 : AMDILReg<426, "r426">, DwarfRegNum<[426]>; +def R427 : AMDILReg<427, "r427">, DwarfRegNum<[427]>; +def R428 : AMDILReg<428, "r428">, DwarfRegNum<[428]>; +def R429 : AMDILReg<429, "r429">, DwarfRegNum<[429]>; +def R430 : AMDILReg<430, "r430">, DwarfRegNum<[430]>; +def R431 : AMDILReg<431, "r431">, DwarfRegNum<[431]>; +def R432 : AMDILReg<432, "r432">, DwarfRegNum<[432]>; +def R433 : AMDILReg<433, "r433">, DwarfRegNum<[433]>; +def R434 : AMDILReg<434, "r434">, DwarfRegNum<[434]>; +def R435 : AMDILReg<435, "r435">, DwarfRegNum<[435]>; +def R436 : AMDILReg<436, "r436">, DwarfRegNum<[436]>; +def R437 : AMDILReg<437, "r437">, DwarfRegNum<[437]>; +def R438 : AMDILReg<438, "r438">, DwarfRegNum<[438]>; +def R439 : AMDILReg<439, "r439">, DwarfRegNum<[439]>; +def R440 : AMDILReg<440, "r440">, DwarfRegNum<[440]>; +def R441 : AMDILReg<441, "r441">, DwarfRegNum<[441]>; +def R442 : AMDILReg<442, "r442">, DwarfRegNum<[442]>; +def R443 : AMDILReg<443, "r443">, DwarfRegNum<[443]>; +def R444 : AMDILReg<444, "r444">, DwarfRegNum<[444]>; +def R445 : AMDILReg<445, "r445">, DwarfRegNum<[445]>; +def R446 : AMDILReg<446, "r446">, DwarfRegNum<[446]>; +def R447 : AMDILReg<447, "r447">, DwarfRegNum<[447]>; +def R448 : AMDILReg<448, "r448">, DwarfRegNum<[448]>; +def R449 : AMDILReg<449, "r449">, DwarfRegNum<[449]>; +def R450 : AMDILReg<450, "r450">, DwarfRegNum<[450]>; +def R451 : AMDILReg<451, "r451">, DwarfRegNum<[451]>; +def R452 : AMDILReg<452, "r452">, DwarfRegNum<[452]>; +def R453 : AMDILReg<453, "r453">, DwarfRegNum<[453]>; +def R454 : AMDILReg<454, "r454">, DwarfRegNum<[454]>; +def R455 : 
AMDILReg<455, "r455">, DwarfRegNum<[455]>; +def R456 : AMDILReg<456, "r456">, DwarfRegNum<[456]>; +def R457 : AMDILReg<457, "r457">, DwarfRegNum<[457]>; +def R458 : AMDILReg<458, "r458">, DwarfRegNum<[458]>; +def R459 : AMDILReg<459, "r459">, DwarfRegNum<[459]>; +def R460 : AMDILReg<460, "r460">, DwarfRegNum<[460]>; +def R461 : AMDILReg<461, "r461">, DwarfRegNum<[461]>; +def R462 : AMDILReg<462, "r462">, DwarfRegNum<[462]>; +def R463 : AMDILReg<463, "r463">, DwarfRegNum<[463]>; +def R464 : AMDILReg<464, "r464">, DwarfRegNum<[464]>; +def R465 : AMDILReg<465, "r465">, DwarfRegNum<[465]>; +def R466 : AMDILReg<466, "r466">, DwarfRegNum<[466]>; +def R467 : AMDILReg<467, "r467">, DwarfRegNum<[467]>; +def R468 : AMDILReg<468, "r468">, DwarfRegNum<[468]>; +def R469 : AMDILReg<469, "r469">, DwarfRegNum<[469]>; +def R470 : AMDILReg<470, "r470">, DwarfRegNum<[470]>; +def R471 : AMDILReg<471, "r471">, DwarfRegNum<[471]>; +def R472 : AMDILReg<472, "r472">, DwarfRegNum<[472]>; +def R473 : AMDILReg<473, "r473">, DwarfRegNum<[473]>; +def R474 : AMDILReg<474, "r474">, DwarfRegNum<[474]>; +def R475 : AMDILReg<475, "r475">, DwarfRegNum<[475]>; +def R476 : AMDILReg<476, "r476">, DwarfRegNum<[476]>; +def R477 : AMDILReg<477, "r477">, DwarfRegNum<[477]>; +def R478 : AMDILReg<478, "r478">, DwarfRegNum<[478]>; +def R479 : AMDILReg<479, "r479">, DwarfRegNum<[479]>; +def R480 : AMDILReg<480, "r480">, DwarfRegNum<[480]>; +def R481 : AMDILReg<481, "r481">, DwarfRegNum<[481]>; +def R482 : AMDILReg<482, "r482">, DwarfRegNum<[482]>; +def R483 : AMDILReg<483, "r483">, DwarfRegNum<[483]>; +def R484 : AMDILReg<484, "r484">, DwarfRegNum<[484]>; +def R485 : AMDILReg<485, "r485">, DwarfRegNum<[485]>; +def R486 : AMDILReg<486, "r486">, DwarfRegNum<[486]>; +def R487 : AMDILReg<487, "r487">, DwarfRegNum<[487]>; +def R488 : AMDILReg<488, "r488">, DwarfRegNum<[488]>; +def R489 : AMDILReg<489, "r489">, DwarfRegNum<[489]>; +def R490 : AMDILReg<490, "r490">, DwarfRegNum<[490]>; +def R491 : AMDILReg<491, 
"r491">, DwarfRegNum<[491]>; +def R492 : AMDILReg<492, "r492">, DwarfRegNum<[492]>; +def R493 : AMDILReg<493, "r493">, DwarfRegNum<[493]>; +def R494 : AMDILReg<494, "r494">, DwarfRegNum<[494]>; +def R495 : AMDILReg<495, "r495">, DwarfRegNum<[495]>; +def R496 : AMDILReg<496, "r496">, DwarfRegNum<[496]>; +def R497 : AMDILReg<497, "r497">, DwarfRegNum<[497]>; +def R498 : AMDILReg<498, "r498">, DwarfRegNum<[498]>; +def R499 : AMDILReg<499, "r499">, DwarfRegNum<[499]>; +def R500 : AMDILReg<500, "r500">, DwarfRegNum<[500]>; +def R501 : AMDILReg<501, "r501">, DwarfRegNum<[501]>; +def R502 : AMDILReg<502, "r502">, DwarfRegNum<[502]>; +def R503 : AMDILReg<503, "r503">, DwarfRegNum<[503]>; +def R504 : AMDILReg<504, "r504">, DwarfRegNum<[504]>; +def R505 : AMDILReg<505, "r505">, DwarfRegNum<[505]>; +def R506 : AMDILReg<506, "r506">, DwarfRegNum<[506]>; +def R507 : AMDILReg<507, "r507">, DwarfRegNum<[507]>; +def R508 : AMDILReg<508, "r508">, DwarfRegNum<[508]>; +def R509 : AMDILReg<509, "r509">, DwarfRegNum<[509]>; +def R510 : AMDILReg<510, "r510">, DwarfRegNum<[510]>; +def R511 : AMDILReg<511, "r511">, DwarfRegNum<[511]>; +def R512 : AMDILReg<512, "r512">, DwarfRegNum<[512]>; +def R513 : AMDILReg<513, "r513">, DwarfRegNum<[513]>; +def R514 : AMDILReg<514, "r514">, DwarfRegNum<[514]>; +def R515 : AMDILReg<515, "r515">, DwarfRegNum<[515]>; +def R516 : AMDILReg<516, "r516">, DwarfRegNum<[516]>; +def R517 : AMDILReg<517, "r517">, DwarfRegNum<[517]>; +def R518 : AMDILReg<518, "r518">, DwarfRegNum<[518]>; +def R519 : AMDILReg<519, "r519">, DwarfRegNum<[519]>; +def R520 : AMDILReg<520, "r520">, DwarfRegNum<[520]>; +def R521 : AMDILReg<521, "r521">, DwarfRegNum<[521]>; +def R522 : AMDILReg<522, "r522">, DwarfRegNum<[522]>; +def R523 : AMDILReg<523, "r523">, DwarfRegNum<[523]>; +def R524 : AMDILReg<524, "r524">, DwarfRegNum<[524]>; +def R525 : AMDILReg<525, "r525">, DwarfRegNum<[525]>; +def R526 : AMDILReg<526, "r526">, DwarfRegNum<[526]>; +def R527 : AMDILReg<527, "r527">, 
DwarfRegNum<[527]>; +def R528 : AMDILReg<528, "r528">, DwarfRegNum<[528]>; +def R529 : AMDILReg<529, "r529">, DwarfRegNum<[529]>; +def R530 : AMDILReg<530, "r530">, DwarfRegNum<[530]>; +def R531 : AMDILReg<531, "r531">, DwarfRegNum<[531]>; +def R532 : AMDILReg<532, "r532">, DwarfRegNum<[532]>; +def R533 : AMDILReg<533, "r533">, DwarfRegNum<[533]>; +def R534 : AMDILReg<534, "r534">, DwarfRegNum<[534]>; +def R535 : AMDILReg<535, "r535">, DwarfRegNum<[535]>; +def R536 : AMDILReg<536, "r536">, DwarfRegNum<[536]>; +def R537 : AMDILReg<537, "r537">, DwarfRegNum<[537]>; +def R538 : AMDILReg<538, "r538">, DwarfRegNum<[538]>; +def R539 : AMDILReg<539, "r539">, DwarfRegNum<[539]>; +def R540 : AMDILReg<540, "r540">, DwarfRegNum<[540]>; +def R541 : AMDILReg<541, "r541">, DwarfRegNum<[541]>; +def R542 : AMDILReg<542, "r542">, DwarfRegNum<[542]>; +def R543 : AMDILReg<543, "r543">, DwarfRegNum<[543]>; +def R544 : AMDILReg<544, "r544">, DwarfRegNum<[544]>; +def R545 : AMDILReg<545, "r545">, DwarfRegNum<[545]>; +def R546 : AMDILReg<546, "r546">, DwarfRegNum<[546]>; +def R547 : AMDILReg<547, "r547">, DwarfRegNum<[547]>; +def R548 : AMDILReg<548, "r548">, DwarfRegNum<[548]>; +def R549 : AMDILReg<549, "r549">, DwarfRegNum<[549]>; +def R550 : AMDILReg<550, "r550">, DwarfRegNum<[550]>; +def R551 : AMDILReg<551, "r551">, DwarfRegNum<[551]>; +def R552 : AMDILReg<552, "r552">, DwarfRegNum<[552]>; +def R553 : AMDILReg<553, "r553">, DwarfRegNum<[553]>; +def R554 : AMDILReg<554, "r554">, DwarfRegNum<[554]>; +def R555 : AMDILReg<555, "r555">, DwarfRegNum<[555]>; +def R556 : AMDILReg<556, "r556">, DwarfRegNum<[556]>; +def R557 : AMDILReg<557, "r557">, DwarfRegNum<[557]>; +def R558 : AMDILReg<558, "r558">, DwarfRegNum<[558]>; +def R559 : AMDILReg<559, "r559">, DwarfRegNum<[559]>; +def R560 : AMDILReg<560, "r560">, DwarfRegNum<[560]>; +def R561 : AMDILReg<561, "r561">, DwarfRegNum<[561]>; +def R562 : AMDILReg<562, "r562">, DwarfRegNum<[562]>; +def R563 : AMDILReg<563, "r563">, DwarfRegNum<[563]>; 
+def R564 : AMDILReg<564, "r564">, DwarfRegNum<[564]>; +def R565 : AMDILReg<565, "r565">, DwarfRegNum<[565]>; +def R566 : AMDILReg<566, "r566">, DwarfRegNum<[566]>; +def R567 : AMDILReg<567, "r567">, DwarfRegNum<[567]>; +def R568 : AMDILReg<568, "r568">, DwarfRegNum<[568]>; +def R569 : AMDILReg<569, "r569">, DwarfRegNum<[569]>; +def R570 : AMDILReg<570, "r570">, DwarfRegNum<[570]>; +def R571 : AMDILReg<571, "r571">, DwarfRegNum<[571]>; +def R572 : AMDILReg<572, "r572">, DwarfRegNum<[572]>; +def R573 : AMDILReg<573, "r573">, DwarfRegNum<[573]>; +def R574 : AMDILReg<574, "r574">, DwarfRegNum<[574]>; +def R575 : AMDILReg<575, "r575">, DwarfRegNum<[575]>; +def R576 : AMDILReg<576, "r576">, DwarfRegNum<[576]>; +def R577 : AMDILReg<577, "r577">, DwarfRegNum<[577]>; +def R578 : AMDILReg<578, "r578">, DwarfRegNum<[578]>; +def R579 : AMDILReg<579, "r579">, DwarfRegNum<[579]>; +def R580 : AMDILReg<580, "r580">, DwarfRegNum<[580]>; +def R581 : AMDILReg<581, "r581">, DwarfRegNum<[581]>; +def R582 : AMDILReg<582, "r582">, DwarfRegNum<[582]>; +def R583 : AMDILReg<583, "r583">, DwarfRegNum<[583]>; +def R584 : AMDILReg<584, "r584">, DwarfRegNum<[584]>; +def R585 : AMDILReg<585, "r585">, DwarfRegNum<[585]>; +def R586 : AMDILReg<586, "r586">, DwarfRegNum<[586]>; +def R587 : AMDILReg<587, "r587">, DwarfRegNum<[587]>; +def R588 : AMDILReg<588, "r588">, DwarfRegNum<[588]>; +def R589 : AMDILReg<589, "r589">, DwarfRegNum<[589]>; +def R590 : AMDILReg<590, "r590">, DwarfRegNum<[590]>; +def R591 : AMDILReg<591, "r591">, DwarfRegNum<[591]>; +def R592 : AMDILReg<592, "r592">, DwarfRegNum<[592]>; +def R593 : AMDILReg<593, "r593">, DwarfRegNum<[593]>; +def R594 : AMDILReg<594, "r594">, DwarfRegNum<[594]>; +def R595 : AMDILReg<595, "r595">, DwarfRegNum<[595]>; +def R596 : AMDILReg<596, "r596">, DwarfRegNum<[596]>; +def R597 : AMDILReg<597, "r597">, DwarfRegNum<[597]>; +def R598 : AMDILReg<598, "r598">, DwarfRegNum<[598]>; +def R599 : AMDILReg<599, "r599">, DwarfRegNum<[599]>; +def R600 : 
AMDILReg<600, "r600">, DwarfRegNum<[600]>; +def R601 : AMDILReg<601, "r601">, DwarfRegNum<[601]>; +def R602 : AMDILReg<602, "r602">, DwarfRegNum<[602]>; +def R603 : AMDILReg<603, "r603">, DwarfRegNum<[603]>; +def R604 : AMDILReg<604, "r604">, DwarfRegNum<[604]>; +def R605 : AMDILReg<605, "r605">, DwarfRegNum<[605]>; +def R606 : AMDILReg<606, "r606">, DwarfRegNum<[606]>; +def R607 : AMDILReg<607, "r607">, DwarfRegNum<[607]>; +def R608 : AMDILReg<608, "r608">, DwarfRegNum<[608]>; +def R609 : AMDILReg<609, "r609">, DwarfRegNum<[609]>; +def R610 : AMDILReg<610, "r610">, DwarfRegNum<[610]>; +def R611 : AMDILReg<611, "r611">, DwarfRegNum<[611]>; +def R612 : AMDILReg<612, "r612">, DwarfRegNum<[612]>; +def R613 : AMDILReg<613, "r613">, DwarfRegNum<[613]>; +def R614 : AMDILReg<614, "r614">, DwarfRegNum<[614]>; +def R615 : AMDILReg<615, "r615">, DwarfRegNum<[615]>; +def R616 : AMDILReg<616, "r616">, DwarfRegNum<[616]>; +def R617 : AMDILReg<617, "r617">, DwarfRegNum<[617]>; +def R618 : AMDILReg<618, "r618">, DwarfRegNum<[618]>; +def R619 : AMDILReg<619, "r619">, DwarfRegNum<[619]>; +def R620 : AMDILReg<620, "r620">, DwarfRegNum<[620]>; +def R621 : AMDILReg<621, "r621">, DwarfRegNum<[621]>; +def R622 : AMDILReg<622, "r622">, DwarfRegNum<[622]>; +def R623 : AMDILReg<623, "r623">, DwarfRegNum<[623]>; +def R624 : AMDILReg<624, "r624">, DwarfRegNum<[624]>; +def R625 : AMDILReg<625, "r625">, DwarfRegNum<[625]>; +def R626 : AMDILReg<626, "r626">, DwarfRegNum<[626]>; +def R627 : AMDILReg<627, "r627">, DwarfRegNum<[627]>; +def R628 : AMDILReg<628, "r628">, DwarfRegNum<[628]>; +def R629 : AMDILReg<629, "r629">, DwarfRegNum<[629]>; +def R630 : AMDILReg<630, "r630">, DwarfRegNum<[630]>; +def R631 : AMDILReg<631, "r631">, DwarfRegNum<[631]>; +def R632 : AMDILReg<632, "r632">, DwarfRegNum<[632]>; +def R633 : AMDILReg<633, "r633">, DwarfRegNum<[633]>; +def R634 : AMDILReg<634, "r634">, DwarfRegNum<[634]>; +def R635 : AMDILReg<635, "r635">, DwarfRegNum<[635]>; +def R636 : AMDILReg<636, 
"r636">, DwarfRegNum<[636]>; +def R637 : AMDILReg<637, "r637">, DwarfRegNum<[637]>; +def R638 : AMDILReg<638, "r638">, DwarfRegNum<[638]>; +def R639 : AMDILReg<639, "r639">, DwarfRegNum<[639]>; +def R640 : AMDILReg<640, "r640">, DwarfRegNum<[640]>; +def R641 : AMDILReg<641, "r641">, DwarfRegNum<[641]>; +def R642 : AMDILReg<642, "r642">, DwarfRegNum<[642]>; +def R643 : AMDILReg<643, "r643">, DwarfRegNum<[643]>; +def R644 : AMDILReg<644, "r644">, DwarfRegNum<[644]>; +def R645 : AMDILReg<645, "r645">, DwarfRegNum<[645]>; +def R646 : AMDILReg<646, "r646">, DwarfRegNum<[646]>; +def R647 : AMDILReg<647, "r647">, DwarfRegNum<[647]>; +def R648 : AMDILReg<648, "r648">, DwarfRegNum<[648]>; +def R649 : AMDILReg<649, "r649">, DwarfRegNum<[649]>; +def R650 : AMDILReg<650, "r650">, DwarfRegNum<[650]>; +def R651 : AMDILReg<651, "r651">, DwarfRegNum<[651]>; +def R652 : AMDILReg<652, "r652">, DwarfRegNum<[652]>; +def R653 : AMDILReg<653, "r653">, DwarfRegNum<[653]>; +def R654 : AMDILReg<654, "r654">, DwarfRegNum<[654]>; +def R655 : AMDILReg<655, "r655">, DwarfRegNum<[655]>; +def R656 : AMDILReg<656, "r656">, DwarfRegNum<[656]>; +def R657 : AMDILReg<657, "r657">, DwarfRegNum<[657]>; +def R658 : AMDILReg<658, "r658">, DwarfRegNum<[658]>; +def R659 : AMDILReg<659, "r659">, DwarfRegNum<[659]>; +def R660 : AMDILReg<660, "r660">, DwarfRegNum<[660]>; +def R661 : AMDILReg<661, "r661">, DwarfRegNum<[661]>; +def R662 : AMDILReg<662, "r662">, DwarfRegNum<[662]>; +def R663 : AMDILReg<663, "r663">, DwarfRegNum<[663]>; +def R664 : AMDILReg<664, "r664">, DwarfRegNum<[664]>; +def R665 : AMDILReg<665, "r665">, DwarfRegNum<[665]>; +def R666 : AMDILReg<666, "r666">, DwarfRegNum<[666]>; +def R667 : AMDILReg<667, "r667">, DwarfRegNum<[667]>; +def R668 : AMDILReg<668, "r668">, DwarfRegNum<[668]>; +def R669 : AMDILReg<669, "r669">, DwarfRegNum<[669]>; +def R670 : AMDILReg<670, "r670">, DwarfRegNum<[670]>; +def R671 : AMDILReg<671, "r671">, DwarfRegNum<[671]>; +def R672 : AMDILReg<672, "r672">, 
DwarfRegNum<[672]>; +def R673 : AMDILReg<673, "r673">, DwarfRegNum<[673]>; +def R674 : AMDILReg<674, "r674">, DwarfRegNum<[674]>; +def R675 : AMDILReg<675, "r675">, DwarfRegNum<[675]>; +def R676 : AMDILReg<676, "r676">, DwarfRegNum<[676]>; +def R677 : AMDILReg<677, "r677">, DwarfRegNum<[677]>; +def R678 : AMDILReg<678, "r678">, DwarfRegNum<[678]>; +def R679 : AMDILReg<679, "r679">, DwarfRegNum<[679]>; +def R680 : AMDILReg<680, "r680">, DwarfRegNum<[680]>; +def R681 : AMDILReg<681, "r681">, DwarfRegNum<[681]>; +def R682 : AMDILReg<682, "r682">, DwarfRegNum<[682]>; +def R683 : AMDILReg<683, "r683">, DwarfRegNum<[683]>; +def R684 : AMDILReg<684, "r684">, DwarfRegNum<[684]>; +def R685 : AMDILReg<685, "r685">, DwarfRegNum<[685]>; +def R686 : AMDILReg<686, "r686">, DwarfRegNum<[686]>; +def R687 : AMDILReg<687, "r687">, DwarfRegNum<[687]>; +def R688 : AMDILReg<688, "r688">, DwarfRegNum<[688]>; +def R689 : AMDILReg<689, "r689">, DwarfRegNum<[689]>; +def R690 : AMDILReg<690, "r690">, DwarfRegNum<[690]>; +def R691 : AMDILReg<691, "r691">, DwarfRegNum<[691]>; +def R692 : AMDILReg<692, "r692">, DwarfRegNum<[692]>; +def R693 : AMDILReg<693, "r693">, DwarfRegNum<[693]>; +def R694 : AMDILReg<694, "r694">, DwarfRegNum<[694]>; +def R695 : AMDILReg<695, "r695">, DwarfRegNum<[695]>; +def R696 : AMDILReg<696, "r696">, DwarfRegNum<[696]>; +def R697 : AMDILReg<697, "r697">, DwarfRegNum<[697]>; +def R698 : AMDILReg<698, "r698">, DwarfRegNum<[698]>; +def R699 : AMDILReg<699, "r699">, DwarfRegNum<[699]>; +def R700 : AMDILReg<700, "r700">, DwarfRegNum<[700]>; +def R701 : AMDILReg<701, "r701">, DwarfRegNum<[701]>; +def R702 : AMDILReg<702, "r702">, DwarfRegNum<[702]>; +def R703 : AMDILReg<703, "r703">, DwarfRegNum<[703]>; +def R704 : AMDILReg<704, "r704">, DwarfRegNum<[704]>; +def R705 : AMDILReg<705, "r705">, DwarfRegNum<[705]>; +def R706 : AMDILReg<706, "r706">, DwarfRegNum<[706]>; +def R707 : AMDILReg<707, "r707">, DwarfRegNum<[707]>; +def R708 : AMDILReg<708, "r708">, DwarfRegNum<[708]>; 
+def R709 : AMDILReg<709, "r709">, DwarfRegNum<[709]>; +def R710 : AMDILReg<710, "r710">, DwarfRegNum<[710]>; +def R711 : AMDILReg<711, "r711">, DwarfRegNum<[711]>; +def R712 : AMDILReg<712, "r712">, DwarfRegNum<[712]>; +def R713 : AMDILReg<713, "r713">, DwarfRegNum<[713]>; +def R714 : AMDILReg<714, "r714">, DwarfRegNum<[714]>; +def R715 : AMDILReg<715, "r715">, DwarfRegNum<[715]>; +def R716 : AMDILReg<716, "r716">, DwarfRegNum<[716]>; +def R717 : AMDILReg<717, "r717">, DwarfRegNum<[717]>; +def R718 : AMDILReg<718, "r718">, DwarfRegNum<[718]>; +def R719 : AMDILReg<719, "r719">, DwarfRegNum<[719]>; +def R720 : AMDILReg<720, "r720">, DwarfRegNum<[720]>; +def R721 : AMDILReg<721, "r721">, DwarfRegNum<[721]>; +def R722 : AMDILReg<722, "r722">, DwarfRegNum<[722]>; +def R723 : AMDILReg<723, "r723">, DwarfRegNum<[723]>; +def R724 : AMDILReg<724, "r724">, DwarfRegNum<[724]>; +def R725 : AMDILReg<725, "r725">, DwarfRegNum<[725]>; +def R726 : AMDILReg<726, "r726">, DwarfRegNum<[726]>; +def R727 : AMDILReg<727, "r727">, DwarfRegNum<[727]>; +def R728 : AMDILReg<728, "r728">, DwarfRegNum<[728]>; +def R729 : AMDILReg<729, "r729">, DwarfRegNum<[729]>; +def R730 : AMDILReg<730, "r730">, DwarfRegNum<[730]>; +def R731 : AMDILReg<731, "r731">, DwarfRegNum<[731]>; +def R732 : AMDILReg<732, "r732">, DwarfRegNum<[732]>; +def R733 : AMDILReg<733, "r733">, DwarfRegNum<[733]>; +def R734 : AMDILReg<734, "r734">, DwarfRegNum<[734]>; +def R735 : AMDILReg<735, "r735">, DwarfRegNum<[735]>; +def R736 : AMDILReg<736, "r736">, DwarfRegNum<[736]>; +def R737 : AMDILReg<737, "r737">, DwarfRegNum<[737]>; +def R738 : AMDILReg<738, "r738">, DwarfRegNum<[738]>; +def R739 : AMDILReg<739, "r739">, DwarfRegNum<[739]>; +def R740 : AMDILReg<740, "r740">, DwarfRegNum<[740]>; +def R741 : AMDILReg<741, "r741">, DwarfRegNum<[741]>; +def R742 : AMDILReg<742, "r742">, DwarfRegNum<[742]>; +def R743 : AMDILReg<743, "r743">, DwarfRegNum<[743]>; +def R744 : AMDILReg<744, "r744">, DwarfRegNum<[744]>; +def R745 : 
AMDILReg<745, "r745">, DwarfRegNum<[745]>; +def R746 : AMDILReg<746, "r746">, DwarfRegNum<[746]>; +def R747 : AMDILReg<747, "r747">, DwarfRegNum<[747]>; +def R748 : AMDILReg<748, "r748">, DwarfRegNum<[748]>; +def R749 : AMDILReg<749, "r749">, DwarfRegNum<[749]>; +def R750 : AMDILReg<750, "r750">, DwarfRegNum<[750]>; +def R751 : AMDILReg<751, "r751">, DwarfRegNum<[751]>; +def R752 : AMDILReg<752, "r752">, DwarfRegNum<[752]>; +def R753 : AMDILReg<753, "r753">, DwarfRegNum<[753]>; +def R754 : AMDILReg<754, "r754">, DwarfRegNum<[754]>; +def R755 : AMDILReg<755, "r755">, DwarfRegNum<[755]>; +def R756 : AMDILReg<756, "r756">, DwarfRegNum<[756]>; +def R757 : AMDILReg<757, "r757">, DwarfRegNum<[757]>; +def R758 : AMDILReg<758, "r758">, DwarfRegNum<[758]>; +def R759 : AMDILReg<759, "r759">, DwarfRegNum<[759]>; +def R760 : AMDILReg<760, "r760">, DwarfRegNum<[760]>; +def R761 : AMDILReg<761, "r761">, DwarfRegNum<[761]>; +def R762 : AMDILReg<762, "r762">, DwarfRegNum<[762]>; +def R763 : AMDILReg<763, "r763">, DwarfRegNum<[763]>; +def R764 : AMDILReg<764, "r764">, DwarfRegNum<[764]>; +def R765 : AMDILReg<765, "r765">, DwarfRegNum<[765]>; +def R766 : AMDILReg<766, "r766">, DwarfRegNum<[766]>; +def R767 : AMDILReg<767, "r767">, DwarfRegNum<[767]>; + +// All registers between 1000 and 1024 are reserved and cannot be used +// unless commented in this section +// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local ID's +// r1020 is used to hold the frame index for local arrays +// r1019 is used to hold the dynamic stack allocation pointer +// r1018 is used as a temporary register for handwritten code +// r1017 is used as a temporary register for handwritten code +// r1016 is used as a temporary register for load/store code +// r1015 is used as a temporary register for data segment offset +// r1014 is used as a temporary register for store code +// r1013 is used as the section data pointer register +// r1012-r1010 and r1001-r1008 are used for 
temporary I/O registers +// r1009 is used as the frame pointer register +// r999 is used as the mem register. +// r998 is used as the return address register. +//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; +//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; +//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; +//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; +//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; +//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; +def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; +def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; +def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; +def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; +def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; +def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; +def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; +def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; +def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; +def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; +def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; +def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; +def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; +def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; +def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; +def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; +def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; +def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; +def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>; +def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; +def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; +def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; +def GPRI8 : RegisterClass<"AMDIL", [i8], 8, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add 
(sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV2I8 : RegisterClass<"AMDIL", [v2i8], 16, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV4I8 : RegisterClass<"AMDIL", [v4i8], 32, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRI16 : RegisterClass<"AMDIL", [i16], 16, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV2I16 : RegisterClass<"AMDIL", [v2i16], 32, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV4I16 : RegisterClass<"AMDIL", [v4i16], 64, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRI32 : RegisterClass<"AMDIL", [i32], 32, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRF32 : RegisterClass<"AMDIL", [f32], 32, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, 
SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +// For 64 bit integer emulation, the lower 32 bits are in x +// and the upper 32 bits are in y +def GPRI64 : RegisterClass<"AMDIL", [i64], 64, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRF64 : RegisterClass<"AMDIL", [f64], 64, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV4F32 : RegisterClass<"AMDIL", [v4f32], 128, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV4I32 : RegisterClass<"AMDIL", [v4i32], 128, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV2I32 : RegisterClass<"AMDIL", [v2i32], 64, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV2F32 : RegisterClass<"AMDIL", [v2f32], 64, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, 
R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV2I64 : RegisterClass<"AMDIL", [v2i64], 128, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRV2F64 : RegisterClass<"AMDIL", [v2f64], 128, + (add (sequence "R%u", 1, 767), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> +{ + let AltOrders = [(add (sequence "R%u", 1, 767))]; + let AltOrderSelect = [{ + return 1; + }]; + } + diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.cpp b/src/gallium/drivers/radeon/AMDILSIDevice.cpp new file mode 100644 index 00000000000..ce560984ef9 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILSIDevice.cpp @@ -0,0 +1,49 @@ +//===-- AMDILSIDevice.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===//
+#include "AMDILSIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSubtarget.h"
+ +using namespace llvm;
+
+// Builds the SI device description for the given subtarget. All of the
+// construction work is delegated to the AMDILEvergreenDevice base class.
+AMDILSIDevice::AMDILSIDevice(AMDILSubtarget *ST) : AMDILEvergreenDevice(ST) {}
+// The destructor body is intentionally empty; this class adds nothing that
+// needs to be released.
+AMDILSIDevice::~AMDILSIDevice() {}
+
+// Returns MAX_LDS_SIZE_900 when usesHardware() reports support for
+// AMDILDeviceInfo::LocalMem, and 0 otherwise.
+size_t AMDILSIDevice::getMaxLDSSize() const
+{
+  // No hardware local memory: report an LDS size of zero.
+  if (!usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return 0;
+  }
+  return MAX_LDS_SIZE_900;
+}
+
+// Identifies the generation of this device; always AMDILDeviceInfo::HD7XXX.
+uint32_t AMDILSIDevice::getGeneration() const
+{
+  const uint32_t generation = AMDILDeviceInfo::HD7XXX;
+  return generation;
+}
+
+// Returns the LLVM data-layout description string used for SI devices.
+// The string is a fixed literal; it specifies 64-bit pointers
+// ("p:64:64:64") followed by the size/alignment entries for the integer,
+// floating-point and vector types and the native integer widths.
+std::string AMDILSIDevice::getDataLayout() const
+{
+  // Adjacent literals are concatenated by the compiler into one string.
+  const char *layout =
+    "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
+    "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+    "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+    "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+    "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+    "-n8:16:32:64";
+  return std::string(layout);
+}
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.h b/src/gallium/drivers/radeon/AMDILSIDevice.h new file mode 100644 index 00000000000..69f35a0588d --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILSIDevice.h @@ -0,0 +1,45 @@ +//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
+// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===---------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
+// specific hardware.
+//===---------------------------------------------------------------------===//
+#ifndef _AMDILSIDEVICE_H_
+#define _AMDILSIDEVICE_H_
+#include "AMDILEvergreenDevice.h"
+#include "AMDILSubtarget.h"
+ +namespace llvm {
+ class AMDILSubtarget;
+//===---------------------------------------------------------------------===//
+// SI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+// The AMDILSIDevice is the base class for all Southern Islands series of
+// cards. It is very similar to the AMDILEvergreenDevice, with the major
+// exception being differences in wavefront size and hardware capabilities. The
+// SI devices are all 64 wide wavefronts and also add support for signed 24 bit
+// integer operations.
+
+// Device description for the SI generation. Derives from
+// AMDILEvergreenDevice and declares its own LDS-size, generation and
+// data-layout queries.
+class AMDILSIDevice : public AMDILEvergreenDevice {
+public:
+  // Creates the device description for the given subtarget.
+  AMDILSIDevice(AMDILSubtarget *ST);
+  virtual ~AMDILSIDevice();
+
+  // Maximum LDS size for this device (0 when local memory is not
+  // handled in hardware).
+  virtual size_t getMaxLDSSize() const;
+
+  // Generation identifier for this device (an AMDILDeviceInfo value).
+  virtual uint32_t getGeneration() const;
+
+  // LLVM data-layout description string for this device.
+  virtual std::string getDataLayout() const;
+}; // AMDILSIDevice
+
+} // namespace llvm
+#endif // _AMDILSIDEVICE_H_
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.cpp b/src/gallium/drivers/radeon/AMDILSubtarget.cpp new file mode 100644 index 00000000000..898833d9c0e --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILSubtarget.cpp @@ -0,0 +1,179 @@ +//===- AMDILSubtarget.cpp - AMDIL Subtarget Information -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file implements the AMD IL specific subclass of TargetSubtarget. +// +//===----------------------------------------------------------------------===// + +#include "AMDILSubtarget.h" +#include "AMDIL.h" +#include "AMDILDevices.h" +#include "AMDILGlobalManager.h" +#include "AMDILKernelManager.h" +#include "AMDILUtilityFunctions.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/SubtargetFeature.h" + +using namespace llvm; + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_TARGET_DESC +#include "AMDILGenSubtargetInfo.inc" + +AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS) : AMDILGenSubtargetInfo( TT, CPU, FS ) +{ + memset(CapsOverride, 0, sizeof(*CapsOverride) + * AMDILDeviceInfo::MaxNumberCapabilities); + // Default card + std::string GPU = "rv770"; + GPU = CPU; + mIs64bit = false; + mVersion = 0; + SmallVector<StringRef, DEFAULT_VEC_SLOTS> Features; + SplitString(FS, Features, ","); + mDefaultSize[0] = 64; + mDefaultSize[1] = 1; + mDefaultSize[2] = 1; + std::string newFeatures = ""; +#if defined(_DEBUG) || defined(DEBUG) + bool useTest = false; +#endif + for (size_t x = 0; x < Features.size(); ++x) { + if (Features[x].startswith("+mwgs")) { + SmallVector<StringRef, DEFAULT_VEC_SLOTS> sizes; + SplitString(Features[x], sizes, "-"); + 
size_t mDim = ::atoi(sizes[1].data()); + if (mDim > 3) { + mDim = 3; + } + for (size_t y = 0; y < mDim; ++y) { + mDefaultSize[y] = ::atoi(sizes[y+2].data()); + } +#if defined(_DEBUG) || defined(DEBUG) + } else if (!Features[x].compare("test")) { + useTest = true; +#endif + } else if (Features[x].startswith("+cal")) { + SmallVector<StringRef, DEFAULT_VEC_SLOTS> version; + SplitString(Features[x], version, "="); + mVersion = ::atoi(version[1].data()); + } else { + GPU = CPU; + if (x > 0) newFeatures += ','; + newFeatures += Features[x]; + } + } + // If we don't have a version then set it to + // -1 which enables everything. This is for + // offline devices. + if (!mVersion) { + mVersion = (uint32_t)-1; + } + for (int x = 0; x < 3; ++x) { + if (!mDefaultSize[x]) { + mDefaultSize[x] = 1; + } + } +#if defined(_DEBUG) || defined(DEBUG) + if (useTest) { + GPU = "kauai"; + } +#endif + ParseSubtargetFeatures(GPU, newFeatures); +#if defined(_DEBUG) || defined(DEBUG) + if (useTest) { + GPU = "test"; + } +#endif + mDevName = GPU; + mDevice = getDeviceFromName(mDevName, this, mIs64bit); +} +AMDILSubtarget::~AMDILSubtarget() +{ + delete mDevice; +} +bool +AMDILSubtarget::isOverride(AMDILDeviceInfo::Caps caps) const +{ + assert(caps < AMDILDeviceInfo::MaxNumberCapabilities && + "Caps index is out of bounds!"); + return CapsOverride[caps]; +} +bool +AMDILSubtarget::is64bit() const +{ + return mIs64bit; +} +bool +AMDILSubtarget::isTargetELF() const +{ + return false; +} +size_t +AMDILSubtarget::getDefaultSize(uint32_t dim) const +{ + if (dim > 3) { + return 1; + } else { + return mDefaultSize[dim]; + } +} +uint32_t +AMDILSubtarget::calVersion() const +{ + return mVersion; +} + +AMDILGlobalManager* +AMDILSubtarget::getGlobalManager() const +{ + return mGM; +} +void +AMDILSubtarget::setGlobalManager(AMDILGlobalManager *gm) const +{ + mGM = gm; +} + +AMDILKernelManager* +AMDILSubtarget::getKernelManager() const +{ + return mKM; +} +void 
+AMDILSubtarget::setKernelManager(AMDILKernelManager *km) const +{ + mKM = km; +} +std::string +AMDILSubtarget::getDataLayout() const +{ + if (!mDevice) { + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64"); + } + return mDevice->getDataLayout(); +} + +std::string +AMDILSubtarget::getDeviceName() const +{ + return mDevName; +} +const AMDILDevice * +AMDILSubtarget::device() const +{ + return mDevice; +} diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.h b/src/gallium/drivers/radeon/AMDILSubtarget.h new file mode 100644 index 00000000000..a4b0e34ada7 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILSubtarget.h @@ -0,0 +1,75 @@ +//=====-- AMDILSubtarget.h - Define Subtarget for the AMDIL ----*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file declares the AMDIL specific subclass of TargetSubtarget. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _AMDILSUBTARGET_H_ +#define _AMDILSUBTARGET_H_ + +#include "AMDILDevice.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#include <cstdlib> +#include <string> + +#define GET_SUBTARGETINFO_HEADER +#include "AMDILGenSubtargetInfo.inc" + +#define MAX_CB_SIZE (1 << 16) +namespace llvm { + class Module; + class AMDILKernelManager; + class AMDILGlobalManager; + class AMDILDevice; + class AMDILSubtarget : public AMDILGenSubtargetInfo { + private: + bool CapsOverride[AMDILDeviceInfo::MaxNumberCapabilities]; + mutable AMDILGlobalManager *mGM; + mutable AMDILKernelManager *mKM; + const AMDILDevice *mDevice; + size_t mDefaultSize[3]; + size_t mMinimumSize[3]; + std::string mDevName; + uint32_t mVersion; + bool mIs64bit; + bool mIs32on64bit; + public: + AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS); + virtual ~AMDILSubtarget(); + bool isOverride(AMDILDeviceInfo::Caps) const; + bool is64bit() const; + + // Helper functions to simplify if statements + bool isTargetELF() const; + AMDILGlobalManager* getGlobalManager() const; + void setGlobalManager(AMDILGlobalManager *gm) const; + AMDILKernelManager* getKernelManager() const; + void setKernelManager(AMDILKernelManager *gm) const; + const AMDILDevice* device() const; + std::string getDataLayout() const; + std::string getDeviceName() const; + virtual size_t getDefaultSize(uint32_t dim) const; + // Return the version of CAL that the backend should target. + uint32_t calVersion() const; + // ParseSubtargetFeatures - Parses features string setting specified + // subtarget options. Definition of function is + //auto generated by tblgen. 
+ void + ParseSubtargetFeatures( + llvm::StringRef CPU, + llvm::StringRef FS); + + }; + +} // end namespace llvm + +#endif // AMDILSUBTARGET_H_ diff --git a/src/gallium/drivers/radeon/AMDILTargetMachine.cpp b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp new file mode 100644 index 00000000000..6146dded3aa --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp @@ -0,0 +1,195 @@ +//===-- AMDILTargetMachine.cpp - Define TargetMachine for AMDIL -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "AMDILTargetMachine.h" +#include "AMDGPUTargetMachine.h" +#include "AMDILDevices.h" +#include "AMDILFrameLowering.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SchedulerRegistry.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Pass.h" +#include "llvm/PassManager.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Scalar.h" + +using namespace llvm; + +extern "C" void LLVMInitializeAMDILTarget() { + // Register the target + RegisterTargetMachine<AMDILTargetMachine> X(TheAMDILTarget); + RegisterTargetMachine<AMDGPUTargetMachine> Y(TheAMDGPUTarget); +} + +/// AMDILTargetMachine ctor - +/// +AMDILTargetMachine::AMDILTargetMachine(const Target &T, + StringRef TT, StringRef CPU, StringRef FS, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL +) +: + LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), + Subtarget(TT, 
CPU, FS), + DataLayout(Subtarget.getDataLayout()), + FrameLowering(TargetFrameLowering::StackGrowsUp, + Subtarget.device()->getStackAlignment(), 0), + InstrInfo(*this), //JITInfo(*this), + TLInfo(*this), + IntrinsicInfo(this), + ELFWriterInfo(false, true) +{ + setAsmVerbosityDefault(true); + setMCUseLoc(false); +} + +AMDILTargetLowering* +AMDILTargetMachine::getTargetLowering() const +{ + return const_cast<AMDILTargetLowering*>(&TLInfo); +} + +const AMDILInstrInfo* +AMDILTargetMachine::getInstrInfo() const +{ + return &InstrInfo; +} +const AMDILFrameLowering* +AMDILTargetMachine::getFrameLowering() const +{ + return &FrameLowering; +} + +const AMDILSubtarget* +AMDILTargetMachine::getSubtargetImpl() const +{ + return &Subtarget; +} + +const AMDILRegisterInfo* +AMDILTargetMachine::getRegisterInfo() const +{ + return &InstrInfo.getRegisterInfo(); +} + +const TargetData* +AMDILTargetMachine::getTargetData() const +{ + return &DataLayout; +} + +const AMDILELFWriterInfo* +AMDILTargetMachine::getELFWriterInfo() const +{ + return Subtarget.isTargetELF() ? 
&ELFWriterInfo : 0; +} + +const AMDILIntrinsicInfo* +AMDILTargetMachine::getIntrinsicInfo() const +{ + return &IntrinsicInfo; +} + + void +AMDILTargetMachine::dump(llvm::raw_ostream &O) +{ + if (!mDebugMode) { + return; + } + O << ";AMDIL Target Machine State Dump: \n"; +} + + void +AMDILTargetMachine::setDebug(bool debugMode) +{ + mDebugMode = debugMode; +} + +bool +AMDILTargetMachine::getDebug() const +{ + return mDebugMode; +} + +namespace { +class AMDILPassConfig : public TargetPassConfig { + +public: + AMDILPassConfig(AMDILTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AMDILTargetMachine &getAMDILTargetMachine() const { + return getTM<AMDILTargetMachine>(); + } + + virtual bool addPreISel(); + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + virtual bool addPreEmitPass(); +}; +} // End of anonymous namespace + +TargetPassConfig *AMDILTargetMachine::createPassConfig(PassManagerBase &PM) { + return new AMDILPassConfig(this, PM); +} + +bool AMDILPassConfig::addPreISel() +{ + return false; +} + +bool AMDILPassConfig::addInstSelector() +{ + PM.add(createAMDILBarrierDetect(*TM)); + PM.add(createAMDILPrintfConvert(*TM)); + PM.add(createAMDILInlinePass(*TM)); + PM.add(createAMDILPeepholeOpt(*TM)); + PM.add(createAMDILISelDag(getAMDILTargetMachine())); + return false; +} + +bool AMDILPassConfig::addPreRegAlloc() +{ + // If debugging, reduce code motion. Use less aggressive pre-RA scheduler + if (TM->getOptLevel() == CodeGenOpt::None) { + llvm::RegisterScheduler::setDefault(&llvm::createSourceListDAGScheduler); + } + + PM.add(createAMDILMachinePeephole(*TM)); + PM.add(createAMDILPointerManager(*TM)); + return false; +} + +bool AMDILPassConfig::addPostRegAlloc() { + return false; // -print-machineinstr should print after this. +} + +/// addPreEmitPass - This pass may be implemented by targets that want to run +/// passes immediately before machine code is emitted. 
This should return +/// true if -print-machineinstrs should print out the code after the passes. +bool AMDILPassConfig::addPreEmitPass() +{ + PM.add(createAMDILCFGPreparationPass(*TM)); + PM.add(createAMDILCFGStructurizerPass(*TM)); + PM.add(createAMDILLiteralManager(*TM)); + PM.add(createAMDILIOExpansion(*TM)); + return true; +} + diff --git a/src/gallium/drivers/radeon/AMDILTargetMachine.h b/src/gallium/drivers/radeon/AMDILTargetMachine.h new file mode 100644 index 00000000000..1c90e1c9d13 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILTargetMachine.h @@ -0,0 +1,75 @@ +//===-- AMDILTargetMachine.h - Define TargetMachine for AMDIL ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file declares the AMDIL specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDILTARGETMACHINE_H_ +#define AMDILTARGETMACHINE_H_ + +#include "AMDILELFWriterInfo.h" +#include "AMDILFrameLowering.h" +#include "AMDILISelLowering.h" +#include "AMDILInstrInfo.h" +#include "AMDILIntrinsicInfo.h" +#include "AMDILSubtarget.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm +{ + class raw_ostream; + + class AMDILTargetMachine : public LLVMTargetMachine + { + private: + AMDILSubtarget Subtarget; + const TargetData DataLayout; // Calculates type size & alignment + AMDILFrameLowering FrameLowering; + AMDILInstrInfo InstrInfo; + AMDILTargetLowering TLInfo; + AMDILIntrinsicInfo IntrinsicInfo; + AMDILELFWriterInfo ELFWriterInfo; + bool mDebugMode; + CodeGenOpt::Level mOptLevel; + + protected: + + public: + AMDILTargetMachine(const Target &T, + StringRef TT, StringRef CPU, StringRef FS, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model 
CM, + CodeGenOpt::Level OL); + + // Get Target/Subtarget specific information + virtual AMDILTargetLowering* getTargetLowering() const; + virtual const AMDILInstrInfo* getInstrInfo() const; + virtual const AMDILFrameLowering* getFrameLowering() const; + virtual const AMDILSubtarget* getSubtargetImpl() const; + virtual const AMDILRegisterInfo* getRegisterInfo() const; + virtual const TargetData* getTargetData() const; + virtual const AMDILIntrinsicInfo *getIntrinsicInfo() const; + virtual const AMDILELFWriterInfo *getELFWriterInfo() const; + + // Pass Pipeline Configuration + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + + void dump(llvm::raw_ostream &O); + void setDebug(bool debugMode); + bool getDebug() const; + CodeGenOpt::Level getOptLevel() const { return mOptLevel; } + + + }; // AMDILTargetMachine + +} // end namespace llvm + +#endif // AMDILTARGETMACHINE_H_ diff --git a/src/gallium/drivers/radeon/AMDILTokenDesc.td b/src/gallium/drivers/radeon/AMDILTokenDesc.td new file mode 100644 index 00000000000..b81f593506f --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILTokenDesc.td @@ -0,0 +1,120 @@ +//===-- AMDILTokenDesc.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===--------------------------------------------------------------------===// + +include "AMDILEnumeratedTypes.td" + +// Each token is 32 bits as specified in section 2.1 of the IL spec +class ILToken <bits<32> n> { + field bits<32> _bits = n; +} + +// Section 2.2.1 - IL Language Token +class ILLang<bits<8> client_type> : ILToken<0> { + let _bits{0-7} = client_type; +} + +// Section 2.2.2 - IL Version Token +class ILVersion<bits<8> minor_version, bits<8> major_version, ILShader shader_type> : ILToken<0> { + let _bits{0-7} = minor_version; + let _bits{8-15} = major_version; + let _bits{16-23} = shader_type.Value; +} + +// Section 2.2.3 - IL Opcode Token +class ILOpcode<ILOpCode opcode, bits<14> control, bit sec_mod_pre, bit pri_mod_pre> : ILToken<0> { + let _bits{0-15} = opcode.Value; + let _bits{16-29} = control; + let _bits{30} = sec_mod_pre; + let _bits{31} = pri_mod_pre; +} + +// Section 2.2.4 - IL Destination Token +class ILDst<AMDILReg register_num, ILRegType register_type, bit mod_pre, bits<2> relative_address, bit dimension, bit immediate_pre, bit extended> : ILToken<0> { + let _bits{0-15} = register_num.Value; + let _bits{16-21} = register_type.Value; + let _bits{22} = mod_pre; + let _bits{23-24} = relative_address; + let _bits{25} = dimension; + let _bits{26} = immediate_pre; + let _bits{31} = extended; +} + +// Section 2.2.5 - IL Destination Modifier Token +class ILDstMod<ILModDstComp x, ILModDstComp y, ILModDstComp z, ILModDstComp w, bit clamp, ILShiftScale shift_scale> : ILToken<0> { + let _bits{0-1} = x.Value; + let _bits{2-3} = y.Value; + let _bits{4-5} = z.Value; + let _bits{6-7} = w.Value; + let _bits{8} = clamp; + //let _bits{9-12} = shift_scale; +} + +// Section 2.2.6 - IL Source Token +class ILSrc<AMDILReg register_num, ILRegType register_type, bit mod_pre, bits<2> relative_address, bit dimension, bit immediate_pre, bit extended> : ILToken<0> { + let _bits{0-15} = register_num.Value; + let _bits{16-21} = register_type.Value; + let _bits{22} 
= mod_pre; + let _bits{23-24} = relative_address; + let _bits{25} = dimension; + let _bits{26} = immediate_pre; + let _bits{31} = extended; +} + +// Section 2.2.7 - IL Source Modifier Token +class ILSrcMod<ILComponentSelect swizzle_x, bit negate_x, ILComponentSelect swizzle_y, bit negate_y, + ILComponentSelect swizzle_z, bit negate_z, ILComponentSelect swizzle_w, bit negate_w, + bit invert, bit bias, bit x2, bit sign, bit abs, ILDivComp divComp, + bits<8> clamp> : ILToken<0> { + let _bits{0-2} = swizzle_x.Value; + let _bits{3} = negate_x; + let _bits{4-6} = swizzle_y.Value; + let _bits{7} = negate_y; + let _bits{8-10} = swizzle_z.Value; + let _bits{11} = negate_z; + let _bits{12-14} = swizzle_w.Value; + let _bits{15} = negate_w; + let _bits{16} = invert; + let _bits{17} = bias; + let _bits{18} = x2; + let _bits{19} = sign; + let _bits{20} = abs; + let _bits{21-23} = divComp.Value; + let _bits{24-31} = clamp; +} + +// Section 2.2.8 - IL Relative Address Token +class ILRelAddr<AMDILReg address_register, bit loop_relative, ILAddressing component> : ILToken<0> { + let _bits{0-15} = address_register.Value; + let _bits{16} = loop_relative; + let _bits{17-19} = component.Value; +} + +// IL Literal Token +class ILLiteral<bits<32> val> : ILToken<0> { + let _bits = val; +} + +// All tokens required for a destination register +class ILDstReg<ILDst Reg, ILDstMod Mod, ILRelAddr Rel, ILSrc Reg_Rel, ILSrcMod Reg_Rel_Mod> { + ILDst reg = Reg; + ILDstMod mod = Mod; + ILRelAddr rel = Rel; + ILSrc reg_rel = Reg_Rel; + ILSrcMod reg_rel_mod = Reg_Rel_Mod; +} + +// All tokens required for a source register +class ILSrcReg<ILSrc Reg, ILSrcMod Mod, ILRelAddr Rel, ILSrc Reg_Rel, ILSrcMod Reg_Rel_Mod> { + ILSrc reg = Reg; + ILSrcMod mod = Mod; + ILRelAddr rel = Rel; + ILSrc reg_rel = Reg_Rel; + ILSrcMod reg_rel_mod = Reg_Rel_Mod; +} + diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp b/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp new file mode 100644 index 
00000000000..f2ef4eb7771 --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp @@ -0,0 +1,683 @@ +//===-- AMDILUtilityFunctions.cpp - AMDIL Utility Functions ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file provides the implementations of functions that are declared in the +// AMDILUtilityFunctions.h file. +// +//===----------------------------------------------------------------------===// +#include "AMDILUtilityFunctions.h" +#include "AMDILISelLowering.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instruction.h" +#include "llvm/Instructions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Type.h" + +#include <cstdio> +#include <list> +#include <queue> + +#define GET_OPCODE_NAME(TII, MI) \ + TII->getName(MI->getOpcode()) + + +using namespace llvm; +int64_t GET_SCALAR_SIZE(llvm::Type *A) { + return A->getScalarSizeInBits(); +} + +const TargetRegisterClass * getRegClassFromID(unsigned int ID) { + switch (ID) { + default: + assert(0 && "Passed in ID does not match any register classes."); + return NULL; + case AMDIL::GPRI8RegClassID: + return &AMDIL::GPRI8RegClass; + case AMDIL::GPRI16RegClassID: + return &AMDIL::GPRI16RegClass; + case AMDIL::GPRI32RegClassID: + return &AMDIL::GPRI32RegClass; + case AMDIL::GPRF32RegClassID: + return &AMDIL::GPRF32RegClass; + case AMDIL::GPRI64RegClassID: + return &AMDIL::GPRI64RegClass; + case AMDIL::GPRF64RegClassID: + return &AMDIL::GPRF64RegClass; + case AMDIL::GPRV4F32RegClassID: + return &AMDIL::GPRV4F32RegClass; + case AMDIL::GPRV4I8RegClassID: + return &AMDIL::GPRV4I8RegClass; + case AMDIL::GPRV4I16RegClassID: + return
&AMDIL::GPRV4I16RegClass; + case AMDIL::GPRV4I32RegClassID: + return &AMDIL::GPRV4I32RegClass; + case AMDIL::GPRV2F32RegClassID: + return &AMDIL::GPRV2F32RegClass; + case AMDIL::GPRV2I8RegClassID: + return &AMDIL::GPRV2I8RegClass; + case AMDIL::GPRV2I16RegClassID: + return &AMDIL::GPRV2I16RegClass; + case AMDIL::GPRV2I32RegClassID: + return &AMDIL::GPRV2I32RegClass; + case AMDIL::GPRV2F64RegClassID: + return &AMDIL::GPRV2F64RegClass; + case AMDIL::GPRV2I64RegClassID: + return &AMDIL::GPRV2I64RegClass; + }; +} + +unsigned int getMoveInstFromID(unsigned int ID) { + switch (ID) { + default: + assert(0 && "Passed in ID does not match any move instructions."); + case AMDIL::GPRI8RegClassID: + return AMDIL::MOVE_i8; + case AMDIL::GPRI16RegClassID: + return AMDIL::MOVE_i16; + case AMDIL::GPRI32RegClassID: + return AMDIL::MOVE_i32; + case AMDIL::GPRF32RegClassID: + return AMDIL::MOVE_f32; + case AMDIL::GPRI64RegClassID: + return AMDIL::MOVE_i64; + case AMDIL::GPRF64RegClassID: + return AMDIL::MOVE_f64; + case AMDIL::GPRV4F32RegClassID: + return AMDIL::MOVE_v4f32; + case AMDIL::GPRV4I8RegClassID: + return AMDIL::MOVE_v4i8; + case AMDIL::GPRV4I16RegClassID: + return AMDIL::MOVE_v4i16; + case AMDIL::GPRV4I32RegClassID: + return AMDIL::MOVE_v4i32; + case AMDIL::GPRV2F32RegClassID: + return AMDIL::MOVE_v2f32; + case AMDIL::GPRV2I8RegClassID: + return AMDIL::MOVE_v2i8; + case AMDIL::GPRV2I16RegClassID: + return AMDIL::MOVE_v2i16; + case AMDIL::GPRV2I32RegClassID: + return AMDIL::MOVE_v2i32; + case AMDIL::GPRV2F64RegClassID: + return AMDIL::MOVE_v2f64; + case AMDIL::GPRV2I64RegClassID: + return AMDIL::MOVE_v2i64; + }; + return -1; +} + +unsigned int getPHIMoveInstFromID(unsigned int ID) { + switch (ID) { + default: + assert(0 && "Passed in ID does not match any move instructions."); + case AMDIL::GPRI8RegClassID: + return AMDIL::PHIMOVE_i8; + case AMDIL::GPRI16RegClassID: + return AMDIL::PHIMOVE_i16; + case AMDIL::GPRI32RegClassID: + return AMDIL::PHIMOVE_i32; + case 
AMDIL::GPRF32RegClassID: + return AMDIL::PHIMOVE_f32; + case AMDIL::GPRI64RegClassID: + return AMDIL::PHIMOVE_i64; + case AMDIL::GPRF64RegClassID: + return AMDIL::PHIMOVE_f64; + case AMDIL::GPRV4F32RegClassID: + return AMDIL::PHIMOVE_v4f32; + case AMDIL::GPRV4I8RegClassID: + return AMDIL::PHIMOVE_v4i8; + case AMDIL::GPRV4I16RegClassID: + return AMDIL::PHIMOVE_v4i16; + case AMDIL::GPRV4I32RegClassID: + return AMDIL::PHIMOVE_v4i32; + case AMDIL::GPRV2F32RegClassID: + return AMDIL::PHIMOVE_v2f32; + case AMDIL::GPRV2I8RegClassID: + return AMDIL::PHIMOVE_v2i8; + case AMDIL::GPRV2I16RegClassID: + return AMDIL::PHIMOVE_v2i16; + case AMDIL::GPRV2I32RegClassID: + return AMDIL::PHIMOVE_v2i32; + case AMDIL::GPRV2F64RegClassID: + return AMDIL::PHIMOVE_v2f64; + case AMDIL::GPRV2I64RegClassID: + return AMDIL::PHIMOVE_v2i64; + }; + return -1; +} + +const TargetRegisterClass* getRegClassFromType(unsigned int type) { + switch (type) { + default: + assert(0 && "Passed in type does not match any register classes."); + case MVT::i8: + return &AMDIL::GPRI8RegClass; + case MVT::i16: + return &AMDIL::GPRI16RegClass; + case MVT::i32: + return &AMDIL::GPRI32RegClass; + case MVT::f32: + return &AMDIL::GPRF32RegClass; + case MVT::i64: + return &AMDIL::GPRI64RegClass; + case MVT::f64: + return &AMDIL::GPRF64RegClass; + case MVT::v4f32: + return &AMDIL::GPRV4F32RegClass; + case MVT::v4i8: + return &AMDIL::GPRV4I8RegClass; + case MVT::v4i16: + return &AMDIL::GPRV4I16RegClass; + case MVT::v4i32: + return &AMDIL::GPRV4I32RegClass; + case MVT::v2f32: + return &AMDIL::GPRV2F32RegClass; + case MVT::v2i8: + return &AMDIL::GPRV2I8RegClass; + case MVT::v2i16: + return &AMDIL::GPRV2I16RegClass; + case MVT::v2i32: + return &AMDIL::GPRV2I32RegClass; + case MVT::v2f64: + return &AMDIL::GPRV2F64RegClass; + case MVT::v2i64: + return &AMDIL::GPRV2I64RegClass; + } +} + +void printSDNode(const SDNode *N) { + printf("Opcode: %d isTargetOpcode: %d isMachineOpcode: %d\n", + N->getOpcode(), N->isTargetOpcode(), 
N->isMachineOpcode()); + printf("Empty: %d OneUse: %d Size: %d NodeID: %d\n", + N->use_empty(), N->hasOneUse(), (int)N->use_size(), N->getNodeId()); + for (unsigned int i = 0; i < N->getNumOperands(); ++i) { + printf("OperandNum: %d ValueCount: %d ValueType: %d\n", + i, N->getNumValues(), N->getValueType(0) .getSimpleVT().SimpleTy); + printSDValue(N->getOperand(i), 0); + } +} + +void printSDValue(const SDValue &Op, int level) { + printf("\nOp: %p OpCode: %d NumOperands: %d ", (void*)&Op, Op.getOpcode(), + Op.getNumOperands()); + printf("IsTarget: %d IsMachine: %d ", Op.isTargetOpcode(), + Op.isMachineOpcode()); + if (Op.isMachineOpcode()) { + printf("MachineOpcode: %d\n", Op.getMachineOpcode()); + } else { + printf("\n"); + } + EVT vt = Op.getValueType(); + printf("ValueType: %d \n", vt.getSimpleVT().SimpleTy); + printf("UseEmpty: %d OneUse: %d\n", Op.use_empty(), Op.hasOneUse()); + if (level) { + printf("Children for %d:\n", level); + for (unsigned int i = 0; i < Op.getNumOperands(); ++i) { + printf("Child %d->%d:", level, i); + printSDValue(Op.getOperand(i), level - 1); + } + } +} + +bool isPHIMove(unsigned int opcode) { + switch (opcode) { + default: + return false; + ExpandCaseToAllTypes(AMDIL::PHIMOVE); + return true; + } + return false; +} + +bool isMove(unsigned int opcode) { + switch (opcode) { + default: + return false; + ExpandCaseToAllTypes(AMDIL::MOVE); + return true; + } + return false; +} + +bool isMoveOrEquivalent(unsigned int opcode) { + switch (opcode) { + default: + return isMove(opcode) || isPHIMove(opcode); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASCHAR); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASSHORT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASINT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASLONG); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASDOUBLE); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASFLOAT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2CHAR); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2SHORT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2INT); + 
ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2FLOAT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2LONG); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2DOUBLE); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4CHAR); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4SHORT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4INT); + ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4FLOAT); + case AMDIL::INTTOANY_i8: + case AMDIL::INTTOANY_i16: + case AMDIL::INTTOANY_i32: + case AMDIL::INTTOANY_f32: + case AMDIL::DLO: + case AMDIL::LLO: + case AMDIL::LLO_v2i64: + return true; + }; + return false; +} + +bool check_type(const Value *ptr, unsigned int addrspace) { + if (!ptr) { + return false; + } + Type *ptrType = ptr->getType(); + return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace; +} + +size_t getTypeSize(Type * const T, bool dereferencePtr) { + size_t size = 0; + if (!T) { + return size; + } + switch (T->getTypeID()) { + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + assert(0 && "These types are not supported by this backend"); + default: + case Type::FloatTyID: + case Type::DoubleTyID: + size = T->getPrimitiveSizeInBits() >> 3; + break; + case Type::PointerTyID: + size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); + break; + case Type::IntegerTyID: + size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); + break; + case Type::StructTyID: + size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); + break; + case Type::ArrayTyID: + size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); + break; + case Type::FunctionTyID: + size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); + break; + case Type::VectorTyID: + size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); + break; + }; + return size; +} + +size_t getTypeSize(StructType * const ST, bool dereferencePtr) { + size_t size = 0; + if (!ST) { + return size; + } + Type *curType; + StructType::element_iterator eib; + 
StructType::element_iterator eie; + for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { + curType = *eib; + size += getTypeSize(curType, dereferencePtr); + } + return size; +} + +size_t getTypeSize(IntegerType * const IT, bool dereferencePtr) { + return IT ? (IT->getBitWidth() >> 3) : 0; +} + +size_t getTypeSize(FunctionType * const FT, bool dereferencePtr) { + assert(0 && "Should not be able to calculate the size of an function type"); + return 0; +} + +size_t getTypeSize(ArrayType * const AT, bool dereferencePtr) { + return (size_t)(AT ? (getTypeSize(AT->getElementType(), + dereferencePtr) * AT->getNumElements()) + : 0); +} + +size_t getTypeSize(VectorType * const VT, bool dereferencePtr) { + return VT ? (VT->getBitWidth() >> 3) : 0; +} + +size_t getTypeSize(PointerType * const PT, bool dereferencePtr) { + if (!PT) { + return 0; + } + Type *CT = PT->getElementType(); + if (CT->getTypeID() == Type::StructTyID && + PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) { + return getTypeSize(dyn_cast<StructType>(CT)); + } else if (dereferencePtr) { + size_t size = 0; + for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { + size += getTypeSize(PT->getContainedType(x), dereferencePtr); + } + return size; + } else { + return 4; + } +} + +size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr) { + //assert(0 && "Should not be able to calculate the size of an opaque type"); + return 4; +} + +size_t getNumElements(Type * const T) { + size_t size = 0; + if (!T) { + return size; + } + switch (T->getTypeID()) { + case Type::X86_FP80TyID: + case Type::FP128TyID: + case Type::PPC_FP128TyID: + case Type::LabelTyID: + assert(0 && "These types are not supported by this backend"); + default: + case Type::FloatTyID: + case Type::DoubleTyID: + size = 1; + break; + case Type::PointerTyID: + size = getNumElements(dyn_cast<PointerType>(T)); + break; + case Type::IntegerTyID: + size = getNumElements(dyn_cast<IntegerType>(T)); + break; + case 
Type::StructTyID: + size = getNumElements(dyn_cast<StructType>(T)); + break; + case Type::ArrayTyID: + size = getNumElements(dyn_cast<ArrayType>(T)); + break; + case Type::FunctionTyID: + size = getNumElements(dyn_cast<FunctionType>(T)); + break; + case Type::VectorTyID: + size = getNumElements(dyn_cast<VectorType>(T)); + break; + }; + return size; +} + +size_t getNumElements(StructType * const ST) { + size_t size = 0; + if (!ST) { + return size; + } + Type *curType; + StructType::element_iterator eib; + StructType::element_iterator eie; + for (eib = ST->element_begin(), eie = ST->element_end(); + eib != eie; ++eib) { + curType = *eib; + size += getNumElements(curType); + } + return size; +} + +size_t getNumElements(IntegerType * const IT) { + return (!IT) ? 0 : 1; +} + +size_t getNumElements(FunctionType * const FT) { + assert(0 && "Should not be able to calculate the number of " + "elements of a function type"); + return 0; +} + +size_t getNumElements(ArrayType * const AT) { + return (!AT) ? 0 + : (size_t)(getNumElements(AT->getElementType()) * + AT->getNumElements()); +} + +size_t getNumElements(VectorType * const VT) { + return (!VT) ? 
0 + : VT->getNumElements() * getNumElements(VT->getElementType()); +} + +size_t getNumElements(PointerType * const PT) { + size_t size = 0; + if (!PT) { + return size; + } + for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { + size += getNumElements(PT->getContainedType(x)); + } + return size; +} + +const llvm::Value *getBasePointerValue(const llvm::Value *V) +{ + if (!V) { + return NULL; + } + const Value *ret = NULL; + ValueMap<const Value *, bool> ValueBitMap; + std::queue<const Value *, std::list<const Value *> > ValueQueue; + ValueQueue.push(V); + while (!ValueQueue.empty()) { + V = ValueQueue.front(); + if (ValueBitMap.find(V) == ValueBitMap.end()) { + ValueBitMap[V] = true; + if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) { + ret = V; + break; + } else if (dyn_cast<GlobalVariable>(V)) { + ret = V; + break; + } else if (dyn_cast<Constant>(V)) { + const ConstantExpr *CE = dyn_cast<ConstantExpr>(V); + if (CE) { + ValueQueue.push(CE->getOperand(0)); + } + } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) { + ret = AI; + break; + } else if (const Instruction *I = dyn_cast<Instruction>(V)) { + uint32_t numOps = I->getNumOperands(); + for (uint32_t x = 0; x < numOps; ++x) { + ValueQueue.push(I->getOperand(x)); + } + } else { + // assert(0 && "Found a Value that we didn't know how to handle!"); + } + } + ValueQueue.pop(); + } + return ret; +} + +const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI) { + const Value *moVal = NULL; + if (!MI->memoperands_empty()) { + const MachineMemOperand *memOp = (*MI->memoperands_begin()); + moVal = memOp ? 
memOp->getValue() : NULL; + moVal = getBasePointerValue(moVal); + } + return moVal; +} + +bool commaPrint(int i, llvm::raw_ostream &O) { + O << ":" << i; + return false; +} + +bool isLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + if (strstr(GET_OPCODE_NAME(TII, MI), "LOADCONST")) { + return false; + } + return strstr(GET_OPCODE_NAME(TII, MI), "LOAD"); +} + +bool isSWSExtLoadInst(MachineInstr *MI) +{ +switch (MI->getOpcode()) { + default: + break; + ExpandCaseToByteShortTypes(AMDIL::LOCALLOAD); + ExpandCaseToByteShortTypes(AMDIL::GLOBALLOAD); + ExpandCaseToByteShortTypes(AMDIL::REGIONLOAD); + ExpandCaseToByteShortTypes(AMDIL::PRIVATELOAD); + ExpandCaseToByteShortTypes(AMDIL::CPOOLLOAD); + ExpandCaseToByteShortTypes(AMDIL::CONSTANTLOAD); + return true; + }; + return false; +} + +bool isExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "EXTLOAD"); +} + +bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "SEXTLOAD"); +} + +bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "AEXTLOAD"); +} + +bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "ZEXTLOAD"); +} + +bool isStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "STORE"); +} + +bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "TRUNCSTORE"); +} + +bool isAtomicInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + return strstr(GET_OPCODE_NAME(TII, MI), "ATOM"); +} + +bool isVolatileInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) { + if (!MI->memoperands_empty()) { + for (MachineInstr::mmo_iterator mob = MI->memoperands_begin(), + moe = MI->memoperands_end(); mob != moe; ++mob) { + // If there is a 
volatile mem operand, this is a volatile instruction. + if ((*mob)->isVolatile()) { + return true; + } + } + } + return false; +} +bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "GLOBAL"); +} +bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "PRIVATE"); +} +bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "CONSTANT") + || strstr(GET_OPCODE_NAME(TII, MI), "CPOOL"); +} +bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "REGION"); +} +bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "LOCAL"); +} +bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "IMAGE"); +} +bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "APPEND"); +} +bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_R"); +} +bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_L"); +} +bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_G") + || isArenaAtomic(TII, MI); +} +bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI) +{ + return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_A"); +} + +const char* getSrcSwizzle(unsigned idx) { + const char *srcSwizzles[] = { + "", ".x000", ".0x00", ".00x0", ".000x", ".y000", ".0y00", ".00y0", ".000y", + ".z000", ".0z00", ".00z0", ".000z", ".w000", ".0w00", ".00w0", ".000w", + ".xy00", ".00xy", ".zw00", ".00zw", ".xyz0", 
".0xyz", ".xyzw", ".0000", + ".xxxx", ".yyyy", ".zzzz", ".wwww", ".xyxy", ".zwzw", ".xzxz", ".ywyw", + ".x0y0", ".0x0y", ".xy_neg(y)", "_neg(yw)", "_neg(x)", ".xy_neg(xy)", + "_neg(xyzw)", ".0yzw", ".x0zw", ".xy0w", ".x", ".y", ".z", ".w", ".xy", + ".zw" + }; + assert(idx < sizeof(srcSwizzles)/sizeof(srcSwizzles[0]) + && "Idx passed in is invalid!"); + return srcSwizzles[idx]; +} +const char* getDstSwizzle(unsigned idx) { + const char *dstSwizzles[] = { + "", ".x___", ".xy__", ".xyz_", ".xyzw", "._y__", "._yz_", "._yzw", ".__z_", + ".__zw", ".___w", ".x_zw", ".xy_w", ".x_z_", ".x__w", "._y_w", + }; + assert(idx < sizeof(dstSwizzles)/sizeof(dstSwizzles[0]) + && "Idx passed in is invalid!"); + return dstSwizzles[idx]; +} +/// Helper function to get the currently set flags +void getAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes) +{ + // We need 16 bits of information, but LLVMr127097 cut the field in half. + // So we have to use two different fields to store all of our information. + uint16_t upper = MI->getFlags() << 8; + uint16_t lower = MI->getAsmPrinterFlags(); + curRes.u16all = upper | lower; +} +/// Helper function to clear the currently set flags and add the new flags. +void setAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes) +{ + // We need 16 bits of information, but LLVMr127097 cut the field in half. + // So we have to use two different fields to store all of our information. 
+ MI->clearAsmPrinterFlags(); + MI->setFlags(0); + uint8_t lower = curRes.u16all & 0xFF; + uint8_t upper = (curRes.u16all >> 8) & 0xFF; + MI->setFlags(upper); + MI->setAsmPrinterFlag((llvm::MachineInstr::CommentFlag)lower); +} diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.h b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h new file mode 100644 index 00000000000..637c868b55c --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h @@ -0,0 +1,362 @@ +//===-- AMDILUtilityFunctions.h - AMDIL Utility Functions Header --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +// This file provides declarations for functions that are used across different +// classes and provide various conversions or utility to shorten the code +// +//===----------------------------------------------------------------------===// +#ifndef AMDILUTILITYFUNCTIONS_H_ +#define AMDILUTILITYFUNCTIONS_H_ + +#include "AMDIL.h" +#include "AMDILTargetMachine.h" +#include "llvm/ADT/SmallVector.h" + +// Utility functions from ID +// +namespace llvm { +class TargetRegisterClass; +class SDValue; +class SDNode; +class Value; +class Type; +class StructType; +class IntegerType; +class FunctionType; +class VectorType; +class ArrayType; +class PointerType; +class OpaqueType; +class MachineInstr; + +} +enum SrcSwizzles { + AMDIL_SRC_SWIZZLE_DEFAULT = 0, + AMDIL_SRC_SWIZZLE_X000, + AMDIL_SRC_SWIZZLE_0X00, + AMDIL_SRC_SWIZZLE_00X0, + AMDIL_SRC_SWIZZLE_000X, + AMDIL_SRC_SWIZZLE_Y000, + AMDIL_SRC_SWIZZLE_0Y00, + AMDIL_SRC_SWIZZLE_00Y0, + AMDIL_SRC_SWIZZLE_000Y, + AMDIL_SRC_SWIZZLE_Z000, + AMDIL_SRC_SWIZZLE_0Z00, + AMDIL_SRC_SWIZZLE_00Z0, + AMDIL_SRC_SWIZZLE_000Z, + AMDIL_SRC_SWIZZLE_W000, + AMDIL_SRC_SWIZZLE_0W00, + AMDIL_SRC_SWIZZLE_00W0, + AMDIL_SRC_SWIZZLE_000W, + 
AMDIL_SRC_SWIZZLE_XY00, + AMDIL_SRC_SWIZZLE_00XY, + AMDIL_SRC_SWIZZLE_ZW00, + AMDIL_SRC_SWIZZLE_00ZW, + AMDIL_SRC_SWIZZLE_XYZ0, + AMDIL_SRC_SWIZZLE_0XYZ, + AMDIL_SRC_SWIZZLE_XYZW, + AMDIL_SRC_SWIZZLE_0000, + AMDIL_SRC_SWIZZLE_XXXX, + AMDIL_SRC_SWIZZLE_YYYY, + AMDIL_SRC_SWIZZLE_ZZZZ, + AMDIL_SRC_SWIZZLE_WWWW, + AMDIL_SRC_SWIZZLE_XYXY, + AMDIL_SRC_SWIZZLE_ZWZW, + AMDIL_SRC_SWIZZLE_XZXZ, + AMDIL_SRC_SWIZZLE_YWYW, + AMDIL_SRC_SWIZZLE_X0Y0, + AMDIL_SRC_SWIZZLE_0X0Y, + AMDIL_SRC_SWIZZLE_XY_NEGY, + AMDIL_SRC_SWIZZLE_NEGYW, + AMDIL_SRC_SWIZZLE_NEGX, + AMDIL_SRC_SWIZZLE_XY_NEGXY, + AMDIL_SRC_SWIZZLE_NEG_XYZW, + AMDIL_SRC_SWIZZLE_0YZW, + AMDIL_SRC_SWIZZLE_X0ZW, + AMDIL_SRC_SWIZZLE_XY0W, + AMDIL_SRC_SWIZZLE_X, + AMDIL_SRC_SWIZZLE_Y, + AMDIL_SRC_SWIZZLE_Z, + AMDIL_SRC_SWIZZLE_W, + AMDIL_SRC_SWIZZLE_XY, + AMDIL_SRC_SWIZZLE_ZW, + AMDIL_SRC_SWIZZLE_LAST +}; +enum DstSwizzles { + AMDIL_DST_SWIZZLE_DEFAULT = 0, + AMDIL_DST_SWIZZLE_X___, + AMDIL_DST_SWIZZLE_XY__, + AMDIL_DST_SWIZZLE_XYZ_, + AMDIL_DST_SWIZZLE_XYZW, + AMDIL_DST_SWIZZLE__Y__, + AMDIL_DST_SWIZZLE__YZ_, + AMDIL_DST_SWIZZLE__YZW, + AMDIL_DST_SWIZZLE___Z_, + AMDIL_DST_SWIZZLE___ZW, + AMDIL_DST_SWIZZLE____W, + AMDIL_DST_SWIZZLE_X_ZW, + AMDIL_DST_SWIZZLE_XY_W, + AMDIL_DST_SWIZZLE_X_Z_, + AMDIL_DST_SWIZZLE_X__W, + AMDIL_DST_SWIZZLE__Y_W, + AMDIL_DST_SWIZZLE_LAST +}; +// Function to get the correct src swizzle string from ID +const char *getSrcSwizzle(unsigned); + +// Function to get the correct dst swizzle string from ID +const char *getDstSwizzle(unsigned); + +const llvm::TargetRegisterClass *getRegClassFromID(unsigned int ID); + +unsigned int getMoveInstFromID(unsigned int ID); +unsigned int getPHIMoveInstFromID(unsigned int ID); + +// Utility functions from Type. +const llvm::TargetRegisterClass *getRegClassFromType(unsigned int type); +unsigned int getTargetIndependentMoveFromType(unsigned int type); + +// Debug functions for SDNode and SDValue. 
+void printSDValue(const llvm::SDValue &Op, int level); +void printSDNode(const llvm::SDNode *N); + +// Functions to check whether an opcode is a move, a PHI move, or a +// move-equivalent instruction. +bool isMove(unsigned int opcode); +bool isPHIMove(unsigned int opcode); +bool isMoveOrEquivalent(unsigned int opcode); + +// Returns true when ptr is non-null and its pointer type is in address space +// addrspace. NOTE: the implementation does not check the result of its +// dyn_cast<PointerType>, so ptr must actually have pointer type. +bool check_type(const llvm::Value *ptr, unsigned int addrspace); + +// Group of functions that recursively calculate the size in bytes of a type +// based on its sub-types. A null type yields 0. When dereferencePtr is false +// a pointer counts as 4, except a pointer to a struct in the private address +// space, which is sized as the struct itself. +size_t getTypeSize(llvm::Type * const T, bool dereferencePtr = false); +size_t +getTypeSize(llvm::StructType * const ST, bool dereferencePtr = false); +size_t +getTypeSize(llvm::IntegerType * const IT, bool dereferencePtr = false); +size_t +getTypeSize(llvm::FunctionType * const FT, bool dereferencePtr = false); +size_t +getTypeSize(llvm::ArrayType * const AT, bool dereferencePtr = false); +size_t +getTypeSize(llvm::VectorType * const VT, bool dereferencePtr = false); +size_t +getTypeSize(llvm::PointerType * const PT, bool dereferencePtr = false); +size_t +getTypeSize(llvm::OpaqueType * const OT, bool dereferencePtr = false); + +// Group of functions that recursively calculate the number of elements of a +// structure based on its sub-types.
+size_t getNumElements(llvm::Type * const T); +size_t getNumElements(llvm::StructType * const ST); +size_t getNumElements(llvm::IntegerType * const IT); +size_t getNumElements(llvm::FunctionType * const FT); +size_t getNumElements(llvm::ArrayType * const AT); +size_t getNumElements(llvm::VectorType * const VT); +size_t getNumElements(llvm::PointerType * const PT); +size_t getNumElements(llvm::OpaqueType * const OT); +const llvm::Value *getBasePointerValue(const llvm::Value *V); +const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI); + + +int64_t GET_SCALAR_SIZE(llvm::Type* A); + +// Helper functions that check the opcode for status information +bool isLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isSWSExtLoadInst(llvm::MachineInstr *MI); +bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isAtomicInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isVolatileInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool 
isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); +bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI); + + +// Macros that are used to help with switch statements for various data types. +// However, these macros do not return anything, unlike the second set below. +#define ExpandCaseTo32bitIntTypes(Instr) \ +case Instr##_i8: \ +case Instr##_i16: \ +case Instr##_i32: + +#define ExpandCaseTo32bitIntTruncTypes(Instr) \ +case Instr##_i16i8: \ +case Instr##_i32i8: \ +case Instr##_i32i16: + +#define ExpandCaseToIntTypes(Instr) \ + ExpandCaseTo32bitIntTypes(Instr) \ +case Instr##_i64: + +#define ExpandCaseToIntTruncTypes(Instr) \ + ExpandCaseTo32bitIntTruncTypes(Instr) \ +case Instr##_i64i8:\ +case Instr##_i64i16:\ +case Instr##_i64i32:\ + +#define ExpandCaseToFloatTypes(Instr) \ + case Instr##_f32: \ +case Instr##_f64: + +#define ExpandCaseToFloatTruncTypes(Instr) \ +case Instr##_f64f32: + +#define ExpandCaseTo32bitScalarTypes(Instr) \ + ExpandCaseTo32bitIntTypes(Instr) \ +case Instr##_f32: + +#define ExpandCaseToAllScalarTypes(Instr) \ + ExpandCaseToFloatTypes(Instr) \ +ExpandCaseToIntTypes(Instr) + +#define ExpandCaseToAllScalarTruncTypes(Instr) \ + ExpandCaseToFloatTruncTypes(Instr) \ +ExpandCaseToIntTruncTypes(Instr) + +// Vector versions of above macros +#define ExpandCaseToVectorIntTypes(Instr) \ + case Instr##_v2i8: \ +case Instr##_v4i8: \ +case Instr##_v2i16: \ +case Instr##_v4i16: \ +case Instr##_v2i32: \ +case Instr##_v4i32: \ +case Instr##_v2i64: + +#define ExpandCaseToVectorIntTruncTypes(Instr) \ +case Instr##_v2i16i8: \ +case Instr##_v4i16i8: \ +case Instr##_v2i32i8: \ +case Instr##_v4i32i8: \ +case Instr##_v2i32i16: \ +case Instr##_v4i32i16: \ +case Instr##_v2i64i8: \ +case Instr##_v2i64i16: \ +case Instr##_v2i64i32: + +#define
ExpandCaseToVectorFloatTypes(Instr) \ + case Instr##_v2f32: \ +case Instr##_v4f32: \ +case Instr##_v2f64: + +#define ExpandCaseToVectorFloatTruncTypes(Instr) \ +case Instr##_v2f64f32: + +#define ExpandCaseToVectorByteTypes(Instr) \ + case Instr##_v4i8:\ +case Instr##_v2i16: \ +case Instr##_v4i16: + +#define ExpandCaseToAllVectorTypes(Instr) \ + ExpandCaseToVectorFloatTypes(Instr) \ +ExpandCaseToVectorIntTypes(Instr) + +#define ExpandCaseToAllVectorTruncTypes(Instr) \ + ExpandCaseToVectorFloatTruncTypes(Instr) \ +ExpandCaseToVectorIntTruncTypes(Instr) + +#define ExpandCaseToAllTypes(Instr) \ + ExpandCaseToAllVectorTypes(Instr) \ +ExpandCaseToAllScalarTypes(Instr) + +#define ExpandCaseToAllTruncTypes(Instr) \ + ExpandCaseToAllVectorTruncTypes(Instr) \ +ExpandCaseToAllScalarTruncTypes(Instr) + +#define ExpandCaseToPackedTypes(Instr) \ + case Instr##_v2i8: \ + case Instr##_v4i8: \ + case Instr##_v2i16: \ + case Instr##_v4i16: + +#define ExpandCaseToByteShortTypes(Instr) \ + case Instr##_i8: \ + case Instr##_i16: \ + ExpandCaseToPackedTypes(Instr) + +// Macros that expand into case statements with return values +#define ExpandCaseTo32bitIntReturn(Instr, Return) \ +case Instr##_i8: return Return##_i8;\ +case Instr##_i16: return Return##_i16;\ +case Instr##_i32: return Return##_i32; + +#define ExpandCaseToIntReturn(Instr, Return) \ + ExpandCaseTo32bitIntReturn(Instr, Return) \ +case Instr##_i64: return Return##_i64; + +#define ExpandCaseToFloatReturn(Instr, Return) \ + case Instr##_f32: return Return##_f32;\ +case Instr##_f64: return Return##_f64; + +#define ExpandCaseToAllScalarReturn(Instr, Return) \ + ExpandCaseToFloatReturn(Instr, Return) \ +ExpandCaseToIntReturn(Instr, Return) + +// These macros expand to common groupings of RegClass ID's +#define ExpandCaseTo1CompRegID \ +case AMDIL::GPRI8RegClassID: \ +case AMDIL::GPRI16RegClassID: \ +case AMDIL::GPRI32RegClassID: \ +case AMDIL::GPRF32RegClassID: + +#define ExpandCaseTo2CompRegID \ + case AMDIL::GPRI64RegClassID: \ 
+case AMDIL::GPRF64RegClassID: \ +case AMDIL::GPRV2I8RegClassID: \ +case AMDIL::GPRV2I16RegClassID: \ +case AMDIL::GPRV2I32RegClassID: \ +case AMDIL::GPRV2F32RegClassID: + +// Macros that expand to case statements for specific bitlengths +#define ExpandCaseTo8BitType(Instr) \ + case Instr##_i8: + +#define ExpandCaseTo16BitType(Instr) \ + case Instr##_v2i8: \ +case Instr##_i16: + +#define ExpandCaseTo32BitType(Instr) \ + case Instr##_v4i8: \ +case Instr##_v2i16: \ +case Instr##_i32: \ +case Instr##_f32: + +#define ExpandCaseTo64BitType(Instr) \ + case Instr##_v4i16: \ +case Instr##_v2i32: \ +case Instr##_v2f32: \ +case Instr##_i64: \ +case Instr##_f64: + +#define ExpandCaseTo128BitType(Instr) \ + case Instr##_v4i32: \ +case Instr##_v4f32: \ +case Instr##_v2i64: \ +case Instr##_v2f64: + +bool commaPrint(int i, llvm::raw_ostream &O); +/// Helper function to get the currently get/set flags. +void getAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes); +void setAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes); + +#endif // AMDILUTILITYFUNCTIONS_H_ diff --git a/src/gallium/drivers/radeon/AMDILVersion.td b/src/gallium/drivers/radeon/AMDILVersion.td new file mode 100644 index 00000000000..b8b02608d3b --- /dev/null +++ b/src/gallium/drivers/radeon/AMDILVersion.td @@ -0,0 +1,75 @@ +//===-- AMDILVersion.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===--------------------------------------------------------------------===// +// Intrinsic operation support +//===--------------------------------------------------------------------===// +let TargetPrefix = "AMDIL", isTarget = 1 in { +def int_AMDIL_barrier : GCCBuiltin<"barrier">, + BinaryIntNoRetInt; +def int_AMDIL_barrier_global : GCCBuiltin<"barrierGlobal">, + BinaryIntNoRetInt; +def int_AMDIL_barrier_local : GCCBuiltin<"barrierLocal">, + BinaryIntNoRetInt; +def int_AMDIL_barrier_region : GCCBuiltin<"barrierRegion">, + BinaryIntNoRetInt; +def int_AMDIL_get_region_id : GCCBuiltin<"__amdil_get_region_id_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; +def int_AMDIL_get_region_local_id : GCCBuiltin<"__amdil_get_region_local_id_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; +def int_AMDIL_get_num_regions : GCCBuiltin<"__amdil_get_num_regions_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; +def int_AMDIL_get_region_size : GCCBuiltin<"__amdil_get_region_size_int">, + Intrinsic<[llvm_v4i32_ty], [], []>; +} + +let isCall=1, isNotDuplicable=1 in { + let Predicates=[hasRegionAS] in { +def BARRIER_EGNI : BinaryOpNoRet<IL_OP_BARRIER, (outs), + (ins GPRI32:$flag, GPRI32:$id), + "fence_threads_memory_lds_gds_gws", + [(int_AMDIL_barrier GPRI32:$flag, GPRI32:$id)]>; +} +let Predicates=[noRegionAS] in { +def BARRIER_7XX : BinaryOpNoRet<IL_OP_BARRIER, (outs), + (ins GPRI32:$flag, GPRI32:$id), + "fence_threads_memory_lds", + [(int_AMDIL_barrier GPRI32:$flag, GPRI32:$id)]>; +} + +def BARRIER_LOCAL : BinaryOpNoRet<IL_OP_BARRIER_LOCAL, (outs), + (ins GPRI32:$flag, GPRI32:$id), + "fence_threads_lds", + [(int_AMDIL_barrier_local GPRI32:$flag, GPRI32:$id)]>; + +def BARRIER_GLOBAL : BinaryOpNoRet<IL_OP_BARRIER_GLOBAL, (outs), + (ins GPRI32:$flag, GPRI32:$id), + "fence_threads_memory", + [(int_AMDIL_barrier_global GPRI32:$flag, GPRI32:$id)]>; + +def BARRIER_REGION : BinaryOpNoRet<IL_OP_BARRIER_REGION, (outs), + (ins GPRI32:$flag, GPRI32:$id), + "fence_threads_gds", + 
[(int_AMDIL_barrier_region GPRI32:$flag, GPRI32:$id)]>; + +def GET_REGION_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1022.xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_region_id))]>; + +def GET_REGION_LOCAL_ID : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, r1022.xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_region_local_id))]>; + +def GET_REGION_SIZE : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[10].xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_region_size))]>; + +def GET_NUM_REGIONS : ILFormat<IL_OP_MOV, (outs GPRV4I32:$dst), + (ins), !strconcat(IL_OP_MOV.Text, " $dst, cb0[11].xyz0"), + [(set GPRV4I32:$dst, (int_AMDIL_get_num_regions))]>; + +} diff --git a/src/gallium/drivers/radeon/LICENSE.TXT b/src/gallium/drivers/radeon/LICENSE.TXT new file mode 100644 index 00000000000..a57de2e87a1 --- /dev/null +++ b/src/gallium/drivers/radeon/LICENSE.TXT @@ -0,0 +1,43 @@ +============================================================================== +LLVM Release License +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2012 University of Illinois at Urbana-Champaign. +All rights reserved. 
+ +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.cpp b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.cpp new file mode 100644 index 00000000000..5b62311c6e6 --- /dev/null +++ b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.cpp @@ -0,0 +1,107 @@ +//===-- MCTargetDesc/AMDILMCAsmInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDILMCAsmInfo.h" +#ifndef NULL +#define NULL 0 +#endif + +using namespace llvm; +AMDILMCAsmInfo::AMDILMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() +{ + //===------------------------------------------------------------------===// + HasSubsectionsViaSymbols = true; + HasMachoZeroFillDirective = false; + HasMachoTBSSDirective = false; + HasStaticCtorDtorReferenceInStaticMode = false; + LinkerRequiresNonEmptyDwarfLines = true; + MaxInstLength = 16; + PCSymbol = "$"; + SeparatorString = "\n"; + CommentColumn = 40; + CommentString = ";"; + LabelSuffix = ":"; + GlobalPrefix = "@"; + PrivateGlobalPrefix = ";."; + LinkerPrivateGlobalPrefix = "!"; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + AssemblerDialect = 0; + AllowQuotesInName = false; + AllowNameToStartWithDigit = false; + AllowPeriodsInName = false; + + //===--- Data Emission Directives -------------------------------------===// + ZeroDirective = ".zero"; + AsciiDirective = ".ascii\t"; + AscizDirective = ".asciz\t"; + Data8bitsDirective = ".byte\t"; + Data16bitsDirective = ".short\t"; + Data32bitsDirective = ".long\t"; + Data64bitsDirective = ".quad\t"; + GPRel32Directive = NULL; + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = 
true; + HasMicrosoftFastStdCallMangling = false; + + //===--- Alignment Information ----------------------------------------===// + AlignDirective = ".align\t"; + AlignmentIsInBytes = true; + TextAlignFillValue = 0; + + //===--- Global Variable Emission Directives --------------------------===// + GlobalDirective = ".global"; + ExternDirective = ".extern"; + HasSetDirective = false; + HasAggressiveSymbolFolding = true; + LCOMMDirectiveType = LCOMM::None; + COMMDirectiveAlignmentIsInBytes = false; + HasDotTypeDotSizeDirective = false; + HasSingleParameterDotFile = true; + HasNoDeadStrip = true; + HasSymbolResolver = false; + WeakRefDirective = ".weakref\t"; + WeakDefDirective = ".weakdef\t"; + LinkOnceDirective = NULL; + HiddenVisibilityAttr = MCSA_Hidden; + HiddenDeclarationVisibilityAttr = MCSA_Hidden; + ProtectedVisibilityAttr = MCSA_Protected; + + //===--- Dwarf Emission Directives -----------------------------------===// + HasLEB128 = true; + SupportsDebugInformation = true; + ExceptionsType = ExceptionHandling::None; + DwarfUsesInlineInfoSection = false; + DwarfSectionOffsetDirective = ".offset"; + DwarfUsesLabelOffsetForRanges = true; + + //===--- CBE Asm Translation Table -----------------------------------===// + AsmTransCBE = NULL; +} +const char* +AMDILMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const +{ + switch (AS) { + default: + return NULL; + case 0: + return NULL; + }; + return NULL; +} + +const MCSection* +AMDILMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const +{ + return NULL; +} diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.h b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.h new file mode 100644 index 00000000000..d354b03351c --- /dev/null +++ b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCAsmInfo.h @@ -0,0 +1,30 @@ +//===-- MCTargetDesc/AMDILMCAsmInfo.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the 
University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#ifndef AMDILMCASMINFO_H_ +#define AMDILMCASMINFO_H_ + +#include "llvm/MC/MCAsmInfo.h" +namespace llvm { + class Target; + class StringRef; + + class AMDILMCAsmInfo : public MCAsmInfo { + public: + explicit AMDILMCAsmInfo(const Target &T, StringRef &TT); + const char* + getDataASDirective(unsigned int Size, unsigned int AS) const; + const MCSection* getNonexecutableStackSection(MCContext &CTX) const; + }; +} // namespace llvm +#endif // AMDILMCASMINFO_H_ diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.cpp b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.cpp new file mode 100644 index 00000000000..5e60b00bf53 --- /dev/null +++ b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.cpp @@ -0,0 +1,66 @@ +#include "AMDILMCTargetDesc.h" +#include "AMDILMCAsmInfo.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "AMDILGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDILGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDILGenRegisterInfo.inc" + +using namespace llvm; + +static MCInstrInfo *createAMDILMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDILMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDILMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDILMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo *createAMDILMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + 
MCSubtargetInfo * X = new MCSubtargetInfo(); + InitAMDILMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createAMDILMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +extern "C" void LLVMInitializeAMDILTargetMC() { + + RegisterMCAsmInfo<AMDILMCAsmInfo> X(TheAMDILTarget); + RegisterMCAsmInfo<AMDILMCAsmInfo> Y(TheAMDGPUTarget); + + TargetRegistry::RegisterMCCodeGenInfo(TheAMDILTarget, createAMDILMCCodeGenInfo); + TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDILMCCodeGenInfo); + + TargetRegistry::RegisterMCInstrInfo(TheAMDILTarget, createAMDILMCInstrInfo); + TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDILMCInstrInfo); + + TargetRegistry::RegisterMCRegInfo(TheAMDILTarget, createAMDILMCRegisterInfo); + TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDILMCRegisterInfo); + + TargetRegistry::RegisterMCSubtargetInfo(TheAMDILTarget, createAMDILMCSubtargetInfo); + TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDILMCSubtargetInfo); + +} diff --git a/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.h b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.h new file mode 100644 index 00000000000..370769fea25 --- /dev/null +++ b/src/gallium/drivers/radeon/MCTargetDesc/AMDILMCTargetDesc.h @@ -0,0 +1,36 @@ +//===-- AMDILMCTargetDesc.h - AMDIL Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides AMDIL specific target descriptions. 
+// +//===----------------------------------------------------------------------===// +// + +#ifndef AMDILMCTARGETDESC_H +#define AMDILMCTARGETDESC_H + +namespace llvm { +class MCSubtargetInfo; +class Target; + +extern Target TheAMDILTarget; +extern Target TheAMDGPUTarget; + +} // End llvm namespace + +#define GET_REGINFO_ENUM +#include "AMDILGenRegisterInfo.inc" + +#define GET_INSTRINFO_ENUM +#include "AMDILGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDILGenSubtargetInfo.inc" + +#endif // AMDILMCTARGETDESC_H diff --git a/src/gallium/drivers/radeon/Makefile b/src/gallium/drivers/radeon/Makefile new file mode 100644 index 00000000000..807dc781c7c --- /dev/null +++ b/src/gallium/drivers/radeon/Makefile @@ -0,0 +1,77 @@ + +TOP = ../../../.. +include $(TOP)/configs/current + +include Makefile.sources + +LIBNAME = radeon + +LIBRARY_INCLUDES = -I$(TOP)/include + +TBLGEN = $(LLVM_BINDIR)/llvm-tblgen + +CXXFLAGS+= $(LLVM_CXXFLAGS) + +include ../../Makefile.template + +CXXFLAGS := $(filter-out -DDEBUG, $(CXXFLAGS)) + +tablegen = $(TBLGEN) -I $(LLVM_INCLUDEDIR) $1 $2 -o $3 + +gen: $(GENERATED_SOURCES) + +SIRegisterInfo.td: SIGenRegisterInfo.pl + $(PERL) $^ > $@ + +SIRegisterGetHWRegNum.inc: SIGenRegisterInfo.pl + $(PERL) $^ $@ > /dev/null + +R600ShaderPatterns.td: AMDGPUGenShaderPatterns.pl + $(PERL) $^ C > $@ + +R600RegisterInfo.td: R600GenRegisterInfo.pl + $(PERL) $^ > $@ + +AMDGPUInstrEnums.td: AMDGPUGenInstrEnums.pl + $(PERL) $^ td > $@ + +AMDGPUInstrEnums.h.include: AMDGPUGenInstrEnums.pl + $(PERL) $^ h > $@ + +AMDGPUInstrEnums.include: AMDGPUGenInstrEnums.pl + $(PERL) $^ inc > $@ + + +AMDILGenRegisterInfo.inc: *.td + $(call tablegen, -gen-register-info, AMDIL.td, $@) + +AMDILGenInstrInfo.inc: *.td + $(call tablegen, -gen-instr-info, AMDIL.td, $@) + +AMDILGenAsmWriter.inc: *.td + $(call tablegen, -gen-asm-writer, AMDIL.td, $@) + +AMDILGenDAGISel.inc: *.td + $(call tablegen, -gen-dag-isel, AMDIL.td, $@) + +AMDILGenCallingConv.inc: *.td + $(call 
tablegen, -gen-callingconv, AMDIL.td, $@) + +AMDILGenSubtargetInfo.inc: *.td + $(call tablegen, -gen-subtarget, AMDIL.td, $@) + +AMDILGenEDInfo.inc: *.td + $(call tablegen, -gen-enhanced-disassembly-info, AMDIL.td, $@) + +AMDILGenIntrinsics.inc: *.td + $(call tablegen, -gen-tgt-intrinsic, AMDIL.td, $@) + +AMDILGenCodeEmitter.inc: *.td + $(call tablegen, -gen-emitter, AMDIL.td, $@) + +LOADER_LIBS=$(shell llvm-config --libs bitreader asmparser) +loader: loader.o libradeon.a + gcc -o loader -L/usr/local/lib $(LDFLAGS) loader.o libradeon.a $(LLVM_LIBS) $(LOADER_LIBS) -lpthread -ldl -lstdc++ -lm + +# FIXME: Remove when this driver is converted to automake. +all: default diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources new file mode 100644 index 00000000000..96189e75a17 --- /dev/null +++ b/src/gallium/drivers/radeon/Makefile.sources @@ -0,0 +1,86 @@ + +GENERATED_SOURCES := \ + R600ShaderPatterns.td \ + R600RegisterInfo.td \ + AMDGPUInstrEnums.td \ + SIRegisterInfo.td \ + SIRegisterGetHWRegNum.inc \ + AMDILGenRegisterInfo.inc \ + AMDILGenInstrInfo.inc \ + AMDILGenAsmWriter.inc \ + AMDILGenDAGISel.inc \ + AMDILGenCallingConv.inc \ + AMDILGenSubtargetInfo.inc \ + AMDILGenEDInfo.inc \ + AMDILGenIntrinsics.inc \ + AMDILGenCodeEmitter.inc \ + AMDGPUInstrEnums.h.include \ + AMDGPUInstrEnums.include + +CPP_SOURCES := \ + AMDIL7XXDevice.cpp \ + AMDIL7XXIOExpansion.cpp \ + AMDIL789IOExpansion.cpp \ + AMDILAsmBackend.cpp \ + AMDILBarrierDetect.cpp \ + AMDILCFGStructurizer.cpp \ + AMDILDevice.cpp \ + AMDILDeviceInfo.cpp \ + AMDILEGIOExpansion.cpp \ + AMDILEvergreenDevice.cpp \ + AMDILELFWriterInfo.cpp \ + AMDILFrameLowering.cpp \ + AMDILGlobalManager.cpp \ + AMDILImageExpansion.cpp \ + AMDILInliner.cpp \ + AMDILInstrInfo.cpp \ + AMDILIntrinsicInfo.cpp \ + AMDILIOExpansion.cpp \ + AMDILISelDAGToDAG.cpp \ + AMDILISelLowering.cpp \ + AMDILKernelManager.cpp \ + AMDILLiteralManager.cpp \ + AMDILMachineFunctionInfo.cpp \ + 
AMDILMachinePeephole.cpp \ + AMDILMCCodeEmitter.cpp \ + AMDILModuleInfo.cpp \ + AMDILNIDevice.cpp \ + AMDILPeepholeOptimizer.cpp \ + AMDILPointerManager.cpp \ + AMDILPrintfConvert.cpp \ + AMDILRegisterInfo.cpp \ + AMDILSIDevice.cpp \ + AMDILSubtarget.cpp \ + AMDILTargetMachine.cpp \ + AMDILUtilityFunctions.cpp \ + AMDGPUTargetMachine.cpp \ + AMDGPUISelLowering.cpp \ + AMDGPUConvertToISA.cpp \ + AMDGPULowerShaderInstructions.cpp \ + AMDGPUReorderPreloadInstructions.cpp \ + AMDGPUInstrInfo.cpp \ + AMDGPURegisterInfo.cpp \ + AMDGPUUtil.cpp \ + R600CodeEmitter.cpp \ + R600ISelLowering.cpp \ + R600InstrInfo.cpp \ + R600KernelParameters.cpp \ + R600LowerInstructions.cpp \ + R600LowerShaderInstructions.cpp \ + R600RegisterInfo.cpp \ + SIAssignInterpRegs.cpp \ + SICodeEmitter.cpp \ + SIConvertToISA.cpp \ + SIInstrInfo.cpp \ + SIISelLowering.cpp \ + SILowerShaderInstructions.cpp \ + SIMachineFunctionInfo.cpp \ + SIPropagateImmReads.cpp \ + SIRegisterInfo.cpp \ + MCTargetDesc/AMDILMCAsmInfo.cpp \ + MCTargetDesc/AMDILMCTargetDesc.cpp \ + TargetInfo/AMDILTargetInfo.cpp \ + radeon_llvm_emit.cpp + +C_SOURCES := \ + radeon_setup_tgsi_llvm.c diff --git a/src/gallium/drivers/radeon/Processors.td b/src/gallium/drivers/radeon/Processors.td new file mode 100644 index 00000000000..6d1b411e207 --- /dev/null +++ b/src/gallium/drivers/radeon/Processors.td @@ -0,0 +1,28 @@ +//===-- Processors.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// AMDIL processors supported.
+// +//===----------------------------------------------------------------------===// +
+// Proc is a thin alias for Processor: it forwards the processor name, the
+// itinerary set and the subtarget-feature list to Processor unchanged.
+// NOTE(review): Processor is not defined in this file; presumably it is the
+// standard class from LLVM's Target.td -- confirm.
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
+: Processor<Name, itin, Features>;
+// One anonymous record per processor name accepted by this target. Each
+// record gives the name string, the itinerary set, and the feature list
+// handed to Processor.
+// rv710 / rv730: R600_EG_Itin, empty feature list.
+def : Proc<"rv710", R600_EG_Itin, []>;
+def : Proc<"rv730", R600_EG_Itin, []>;
+// rv770: FeatureFP64 only (no FeatureByteAddress / FeatureImages).
+def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>;
+// cedar .. cayman: all list FeatureByteAddress and FeatureImages; only
+// cypress and cayman additionally list FeatureFP64.
+def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+// "SI" is the only entry that uses SI_Itin instead of R600_EG_Itin; its
+// feature list is empty.
+def : Proc<"SI", SI_Itin, []>;
+
diff --git a/src/gallium/drivers/radeon/R600CodeEmitter.cpp b/src/gallium/drivers/radeon/R600CodeEmitter.cpp new file mode 100644 index 00000000000..d5f82cf69a1 --- /dev/null +++ b/src/gallium/drivers/radeon/R600CodeEmitter.cpp @@ -0,0 +1,776 @@ +//===-- R600CodeEmitter.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUUtil.h" +#include "AMDILCodeEmitter.h" +#include "AMDILInstrInfo.h" +#include "AMDILMachineFunctionInfo.h" +#include "AMDILUtilityFunctions.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Target/TargetMachine.h" + +#include <stdio.h> + +#define SRC_BYTE_COUNT 11 +#define DST_BYTE_COUNT 5 + +using namespace llvm; + +namespace { + + class R600CodeEmitter : public MachineFunctionPass, public AMDILCodeEmitter { + + private: + + static char ID; + formatted_raw_ostream &_OS; + const TargetMachine * TM; + const MachineRegisterInfo * MRI; + AMDILMachineFunctionInfo * MFI; + const R600RegisterInfo * TRI; + bool evergreenEncoding; + + bool isReduction; + unsigned reductionElement; + bool isLast; + + unsigned section_start; + + public: + + R600CodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID), + _OS(OS), TM(NULL), evergreenEncoding(false), isReduction(false), + isLast(true) { } + + const char *getPassName() const { return "AMDGPU Machine Code Emitter"; } + + bool runOnMachineFunction(MachineFunction &MF); + virtual uint64_t 
getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const; + + private: + + void emitALUInstr(MachineInstr &MI); + void emitSrc(const MachineOperand & MO); + void emitDst(const MachineOperand & MO); + void emitALU(MachineInstr &MI, unsigned numSrc); + void emitTexInstr(MachineInstr &MI); + void emitFCInstr(MachineInstr &MI); + + unsigned int getHWInst(const MachineInstr &MI); + + void emitNullBytes(unsigned int byteCount); + + void emitByte(unsigned int byte); + + void emitTwoBytes(uint32_t bytes); + + void emit(uint32_t value); + void emit(uint64_t value); + + unsigned getHWReg(unsigned regNo) const; + + unsigned getElement(unsigned regNo); + +}; + +} /* End anonymous namespace */ + +#define WRITE_MASK_X 0x1 +#define WRITE_MASK_Y 0x2 +#define WRITE_MASK_Z 0x4 +#define WRITE_MASK_W 0x8 + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum InstrTypes { + INSTR_ALU = 0, + INSTR_TEX, + INSTR_FC, + INSTR_NATIVE, + INSTR_VTX +}; + +enum FCInstr { + FC_IF = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK, + FC_BREAK_NZ_INT, + FC_CONTINUE, + FC_BREAK_Z_INT +}; + +enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY +}; + +char R600CodeEmitter::ID = 0; + +FunctionPass *llvm::createR600CodeEmitterPass(formatted_raw_ostream &OS) { + return new R600CodeEmitter(OS); +} + +bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) { + + TM = &MF.getTarget(); + MRI = &MF.getRegInfo(); + MFI = MF.getInfo<AMDILMachineFunctionInfo>(); + TRI = static_cast<const R600RegisterInfo *>(TM->getRegisterInfo()); + const AMDILSubtarget &STM = TM->getSubtarget<AMDILSubtarget>(); + std::string gpu = STM.getDeviceName(); + if (!gpu.compare(0,3, "rv7")) { + evergreenEncoding = false; + } else { + evergreenEncoding = true; + } + 
const AMDGPUTargetMachine *amdtm = + static_cast<const AMDGPUTargetMachine *>(&MF.getTarget()); + + if (amdtm->shouldDumpCode()) { + MF.dump(); + } + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + if (MI.getNumOperands() > 1 && MI.getOperand(0).isReg() && MI.getOperand(0).isDead()) { + continue; + } + if (isTexOp(MI.getOpcode())) { + emitTexInstr(MI); + } else if (isFCOp(MI.getOpcode())){ + emitFCInstr(MI); + } else if (isReductionOp(MI.getOpcode())) { + isReduction = true; + isLast = false; + for (reductionElement = 0; reductionElement < 4; reductionElement++) { + isLast = (reductionElement == 3); + emitALUInstr(MI); + } + isReduction = false; + } else if (MI.getOpcode() == AMDIL::RETURN) { + continue; + } else { + switch(MI.getOpcode()) { + case AMDIL::RAT_WRITE_CACHELESS_eg: + { + /* XXX: Support for autoencoding 64-bit instructions was added + * in LLVM 3.1. Until we drop support for 3.0, we will use Magic + * numbers for the high bits. */ + uint64_t high = 0x95c0100000000000; + uint64_t inst = getBinaryCodeForInstr(MI); + inst |= high; + /* Set End Of Program bit */ + /* XXX: Need better check of end of program. EOP should be + * encoded in one of the operands of the MI, and it should be + * set in a prior pass. 
*/ + MachineBasicBlock::iterator NextI = llvm::next(I); + MachineInstr &NextMI = *NextI; + if (NextMI.getOpcode() == AMDIL::RETURN) { + inst |= (((uint64_t)1) << 53); + } + emitByte(INSTR_NATIVE); + emit(inst); + break; + } + case AMDIL::VTX_READ_eg: + { + emitByte(INSTR_VTX); + /* inst */ + emitByte(0); + + /* fetch_type */ + emitByte(2); + + /* buffer_id */ + emitByte(MI.getOperand(2).getImm()); + + /* src_gpr */ + emitByte(getHWReg(MI.getOperand(1).getReg())); + + /* src_sel_x */ + emitByte(TRI->getHWRegChan(MI.getOperand(1).getReg())); + + /* mega_fetch_count */ + emitByte(3); + + /* dst_gpr */ + emitByte(getHWReg(MI.getOperand(0).getReg())); + + /* dst_sel_x */ + emitByte(0); + + /* dst_sel_y */ + emitByte(7); + + /* dst_sel_z */ + emitByte(7); + + /* dst_sel_w */ + emitByte(7); + + /* use_const_fields */ + emitByte(1); + + /* data_format */ + emitByte(0); + + /* num_format_all */ + emitByte(0); + + /* format_comp_all */ + emitByte(0); + + /* srf_mode_all */ + emitByte(0); + + /* offset */ + emitByte(0); + + /* endian */ + emitByte(0); + break; + } + + default: + emitALUInstr(MI); + break; + } + } + } + } + return false; +} + +void R600CodeEmitter::emitALUInstr(MachineInstr &MI) +{ + + unsigned numOperands = MI.getNumOperands(); + + /* Some instructions are just place holder instructions that represent + * operations that the GPU does automatically. They should be ignored. 
*/ + if (isPlaceHolderOpcode(MI.getOpcode())) { + return; + } + + /* We need to handle some opcodes differently */ + switch (MI.getOpcode()) { + default: break; + + /* Custom swizzle instructions, ignore the last two operands */ + case AMDIL::SET_CHAN: + numOperands = 2; + break; + + case AMDIL::VEXTRACT_v4f32: + numOperands = 2; + break; + + /* XXX: Temp Hack */ + case AMDIL::STORE_OUTPUT: + numOperands = 2; + break; + } + + /* XXX Check if instruction writes a result */ + if (numOperands < 1) { + return; + } + const MachineOperand dstOp = MI.getOperand(0); + + /* Emit instruction type */ + emitByte(0); + + unsigned int opIndex; + for (opIndex = 1; opIndex < numOperands; opIndex++) { + /* Literal constants are always stored as the last operand. */ + if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) { + break; + } + emitSrc(MI.getOperand(opIndex)); + } + + /* Emit zeros for unused sources */ + for ( ; opIndex < 4; opIndex++) { + emitNullBytes(SRC_BYTE_COUNT); + } + + emitDst(dstOp); + + emitALU(MI, numOperands - 1); +} + +void R600CodeEmitter::emitSrc(const MachineOperand & MO) +{ + uint32_t value = 0; + /* Emit the source select (2 bytes). For GPRs, this is the register index. + * For other potential instruction operands, (e.g. constant registers) the + * value of the source select is defined in the r600isa docs. */ + if (MO.isReg()) { + unsigned reg = MO.getReg(); + emitTwoBytes(getHWReg(reg)); + if (reg == AMDIL::ALU_LITERAL_X) { + const MachineInstr * parent = MO.getParent(); + unsigned immOpIndex = parent->getNumOperands() - 1; + MachineOperand immOp = parent->getOperand(immOpIndex); + if (immOp.isFPImm()) { + value = immOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue(); + } else { + assert(immOp.isImm()); + value = immOp.getImm(); + } + } + } else { + /* XXX: Handle other operand types. 
*/ + emitTwoBytes(0); + } + + /* Emit the source channel (1 byte) */ + if (isReduction) { + emitByte(reductionElement); + } else if (MO.isReg()) { + const MachineInstr * parent = MO.getParent(); + /* The source channel for EXTRACT is stored in operand 2. */ + if (parent->getOpcode() == AMDIL::VEXTRACT_v4f32) { + emitByte(parent->getOperand(2).getImm()); + } else { + emitByte(TRI->getHWRegChan(MO.getReg())); + } + } else { + emitByte(0); + } + + /* XXX: Emit isNegated (1 byte) */ + if ((!(MO.getTargetFlags() & MO_FLAG_ABS)) + && (MO.getTargetFlags() & MO_FLAG_NEG || + (MO.isReg() && + (MO.getReg() == AMDIL::NEG_ONE || MO.getReg() == AMDIL::NEG_HALF)))){ + emitByte(1); + } else { + emitByte(0); + } + + /* Emit isAbsolute (1 byte) */ + if (MO.getTargetFlags() & MO_FLAG_ABS) { + emitByte(1); + } else { + emitByte(0); + } + + /* XXX: Emit relative addressing mode (1 byte) */ + emitByte(0); + + /* Emit kc_bank, This will be adjusted later by r600_asm */ + emitByte(0); + + /* Emit the literal value, if applicable (4 bytes). */ + emit(value); + +} + +void R600CodeEmitter::emitDst(const MachineOperand & MO) +{ + if (MO.isReg()) { + /* Emit the destination register index (1 byte) */ + emitByte(getHWReg(MO.getReg())); + + /* Emit the element of the destination register (1 byte)*/ + const MachineInstr * parent = MO.getParent(); + if (isReduction) { + emitByte(reductionElement); + + /* The destination element for SET_CHAN is stored in the 3rd operand. */ + } else if (parent->getOpcode() == AMDIL::SET_CHAN) { + emitByte(parent->getOperand(2).getImm()); + } else if (parent->getOpcode() == AMDIL::VCREATE_v4f32) { + emitByte(ELEMENT_X); + } else { + emitByte(TRI->getHWRegChan(MO.getReg())); + } + + /* Emit isClamped (1 byte) */ + if (MO.getTargetFlags() & MO_FLAG_CLAMP) { + emitByte(1); + } else { + emitByte(0); + } + + /* Emit writemask (1 byte). 
*/ + if ((isReduction && reductionElement != TRI->getHWRegChan(MO.getReg())) + || MO.getTargetFlags() & MO_FLAG_MASK) { + emitByte(0); + } else { + emitByte(1); + } + + /* XXX: Emit relative addressing mode */ + emitByte(0); + } else { + /* XXX: Handle other operand types. Are there any for destination regs? */ + emitNullBytes(DST_BYTE_COUNT); + } +} + +void R600CodeEmitter::emitALU(MachineInstr &MI, unsigned numSrc) +{ + /* Emit the instruction (2 bytes) */ + emitTwoBytes(getHWInst(MI)); + + /* Emit isLast (for this instruction group) (1 byte) */ + if (isLast) { + emitByte(1); + } else { + emitByte(0); + } + /* Emit isOp3 (1 byte) */ + if (numSrc == 3) { + emitByte(1); + } else { + emitByte(0); + } + + /* XXX: Emit predicate (1 byte) */ + emitByte(0); + + /* XXX: Emit bank swizzle. (1 byte) Do we need this? It looks like + * r600_asm.c sets it. */ + emitByte(0); + + /* XXX: Emit bank_swizzle_force (1 byte) Not sure what this is for. */ + emitByte(0); + + /* XXX: Emit OMOD (1 byte) Not implemented. */ + emitByte(0); + + /* XXX: Emit index_mode. I think this is for indirect addressing, so we + * don't need to worry about it. */ + emitByte(0); +} + +void R600CodeEmitter::emitTexInstr(MachineInstr &MI) +{ + + int64_t sampler = MI.getOperand(2).getImm(); + int64_t textureType = MI.getOperand(3).getImm(); + unsigned opcode = MI.getOpcode(); + unsigned srcSelect[4] = {0, 1, 2, 3}; + + /* Emit instruction type */ + emitByte(1); + + /* Emit instruction */ + emitByte(getHWInst(MI)); + + /* XXX: Emit resource id r600_shader.c uses sampler + 1. Why? 
*/ + emitByte(sampler + 1 + 1); + + /* Emit source register */ + emitByte(getHWReg(MI.getOperand(1).getReg())); + + /* XXX: Emit src isRelativeAddress */ + emitByte(0); + + /* Emit destination register */ + emitByte(getHWReg(MI.getOperand(0).getReg())); + + /* XXX: Emit dst isRealtiveAddress */ + emitByte(0); + + /* XXX: Emit dst select */ + emitByte(0); /* X */ + emitByte(1); /* Y */ + emitByte(2); /* Z */ + emitByte(3); /* W */ + + /* XXX: Emit lod bias */ + emitByte(0); + + /* XXX: Emit coord types */ + unsigned coordType[4] = {1, 1, 1, 1}; + + if (textureType == TEXTURE_RECT + || textureType == TEXTURE_SHADOWRECT) { + coordType[ELEMENT_X] = 0; + coordType[ELEMENT_Y] = 0; + } + + if (textureType == TEXTURE_1D_ARRAY + || textureType == TEXTURE_SHADOW1D_ARRAY) { + if (opcode == AMDIL::TEX_SAMPLE_C_L || opcode == AMDIL::TEX_SAMPLE_C_LB) { + coordType[ELEMENT_Y] = 0; + } else { + coordType[ELEMENT_Z] = 0; + srcSelect[ELEMENT_Z] = ELEMENT_Y; + } + } else if (textureType == TEXTURE_2D_ARRAY + || textureType == TEXTURE_SHADOW2D_ARRAY) { + coordType[ELEMENT_Z] = 0; + } + + for (unsigned i = 0; i < 4; i++) { + emitByte(coordType[i]); + } + + /* XXX: Emit offsets */ + emitByte(0); /* X */ + emitByte(0); /* Y */ + emitByte(0); /* Z */ + /* There is no OFFSET_W */ + + /* Emit sampler id */ + emitByte(sampler); + + /* XXX:Emit source select */ + if ((textureType == TEXTURE_SHADOW1D + || textureType == TEXTURE_SHADOW2D + || textureType == TEXTURE_SHADOWRECT + || textureType == TEXTURE_SHADOW1D_ARRAY) + && opcode != AMDIL::TEX_SAMPLE_C_L + && opcode != AMDIL::TEX_SAMPLE_C_LB) { + srcSelect[ELEMENT_W] = ELEMENT_Z; + } + + for (unsigned i = 0; i < 4; i++) { + emitByte(srcSelect[i]); + } +} + +void R600CodeEmitter::emitFCInstr(MachineInstr &MI) +{ + /* Emit instruction type */ + emitByte(INSTR_FC); + + /* Emit SRC */ + unsigned numOperands = MI.getNumOperands(); + if (numOperands > 0) { + assert(numOperands == 1); + emitSrc(MI.getOperand(0)); + } else { + 
emitNullBytes(SRC_BYTE_COUNT); + } + + /* Emit FC Instruction */ + enum FCInstr instr; + switch (MI.getOpcode()) { + case AMDIL::BREAK_LOGICALZ_f32: + instr = FC_BREAK; + break; + case AMDIL::BREAK_LOGICALNZ_i32: + instr = FC_BREAK_NZ_INT; + break; + case AMDIL::BREAK_LOGICALZ_i32: + instr = FC_BREAK_Z_INT; + break; + case AMDIL::CONTINUE_LOGICALNZ_f32: + instr = FC_CONTINUE; + break; + /* XXX: This assumes that all IFs will be if (x != 0). If we add + * optimizations this might not be the case */ + case AMDIL::IF_LOGICALNZ_f32: + case AMDIL::IF_LOGICALNZ_i32: + instr = FC_IF; + break; + case AMDIL::IF_LOGICALZ_f32: + abort(); + break; + case AMDIL::ELSE: + instr = FC_ELSE; + break; + case AMDIL::ENDIF: + instr = FC_ENDIF; + break; + case AMDIL::ENDLOOP: + instr = FC_ENDLOOP; + break; + case AMDIL::WHILELOOP: + instr = FC_BGNLOOP; + break; + default: + abort(); + break; + } + emitByte(instr); +} + +#define INSTR_FLOAT2_V(inst, hw) \ + case AMDIL:: inst##_v4f32: \ + case AMDIL:: inst##_v2f32: return HW_INST2(hw); + +#define INSTR_FLOAT2_S(inst, hw) \ + case AMDIL:: inst##_f32: return HW_INST2(hw); + +#define INSTR_FLOAT2(inst, hw) \ + INSTR_FLOAT2_V(inst, hw) \ + INSTR_FLOAT2_S(inst, hw) + +unsigned int R600CodeEmitter::getHWInst(const MachineInstr &MI) +{ + + /* XXX: Lower these to MOV before the code emitter. */ + switch (MI.getOpcode()) { + case AMDIL::STORE_OUTPUT: + case AMDIL::VCREATE_v4i32: + case AMDIL::VCREATE_v4f32: + case AMDIL::VEXTRACT_v4f32: + case AMDIL::VINSERT_v4f32: + case AMDIL::LOADCONST_i32: + case AMDIL::LOADCONST_f32: + case AMDIL::MOVE_v4i32: + case AMDIL::SET_CHAN: + /* Instructons to reinterpret bits as ... 
*/ + case AMDIL::IL_ASINT_f32: + case AMDIL::IL_ASINT_i32: + case AMDIL::IL_ASFLOAT_f32: + case AMDIL::IL_ASFLOAT_i32: + return 0x19; + + default: + return getBinaryCodeForInstr(MI); + } +} + +void R600CodeEmitter::emitNullBytes(unsigned int byteCount) +{ + for (unsigned int i = 0; i < byteCount; i++) { + emitByte(0); + } +} + +void R600CodeEmitter::emitByte(unsigned int byte) +{ + _OS.write((uint8_t) byte & 0xff); +} +void R600CodeEmitter::emitTwoBytes(unsigned int bytes) +{ + _OS.write((uint8_t) (bytes & 0xff)); + _OS.write((uint8_t) ((bytes >> 8) & 0xff)); +} + +void R600CodeEmitter::emit(uint32_t value) +{ + for (unsigned i = 0; i < 4; i++) { + _OS.write((uint8_t) ((value >> (8 * i)) & 0xff)); + } +} + +void R600CodeEmitter::emit(uint64_t value) +{ + for (unsigned i = 0; i < 8; i++) { + emitByte((value >> (8 * i)) & 0xff); + } +} + +unsigned R600CodeEmitter::getHWReg(unsigned regNo) const +{ + unsigned hwReg; + + hwReg = TRI->getHWRegIndex(regNo); + if (AMDIL::R600_CReg32RegClass.contains(regNo)) { + hwReg += 512; + } + return hwReg; +} + +uint64_t R600CodeEmitter::getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const +{ + if (MO.isReg()) { + return getHWReg(MO.getReg()); + } else { + return MO.getImm(); + } +} + + +RegElement maskBitToElement(unsigned int maskBit) +{ + switch (maskBit) { + case WRITE_MASK_X: return ELEMENT_X; + case WRITE_MASK_Y: return ELEMENT_Y; + case WRITE_MASK_Z: return ELEMENT_Z; + case WRITE_MASK_W: return ELEMENT_W; + default: + assert("Invalid maskBit"); + return ELEMENT_X; + } +} + +unsigned int dstSwizzleToWriteMask(unsigned swizzle) +{ + switch(swizzle) { + default: + case AMDIL_DST_SWIZZLE_DEFAULT: + return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE_X___: + return WRITE_MASK_X; + case AMDIL_DST_SWIZZLE_XY__: + return WRITE_MASK_X | WRITE_MASK_Y; + case AMDIL_DST_SWIZZLE_XYZ_: + return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z; + case AMDIL_DST_SWIZZLE_XYZW: + return 
WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE__Y__: + return WRITE_MASK_Y; + case AMDIL_DST_SWIZZLE__YZ_: + return WRITE_MASK_Y | WRITE_MASK_Z; + case AMDIL_DST_SWIZZLE__YZW: + return WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE___Z_: + return WRITE_MASK_Z; + case AMDIL_DST_SWIZZLE___ZW: + return WRITE_MASK_Z | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE____W: + return WRITE_MASK_W; + case AMDIL_DST_SWIZZLE_X_ZW: + return WRITE_MASK_X | WRITE_MASK_Z | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE_XY_W: + return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE_X_Z_: + return WRITE_MASK_X | WRITE_MASK_Z; + case AMDIL_DST_SWIZZLE_X__W: + return WRITE_MASK_X | WRITE_MASK_W; + case AMDIL_DST_SWIZZLE__Y_W: + return WRITE_MASK_Y | WRITE_MASK_W; + } +} + +#include "AMDILGenCodeEmitter.inc" + diff --git a/src/gallium/drivers/radeon/R600GenRegisterInfo.pl b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl new file mode 100644 index 00000000000..396a69f11f6 --- /dev/null +++ b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl @@ -0,0 +1,171 @@ +#===-- R600GenRegisterInfo.pl - TODO: Add brief description -------===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+#
+#===----------------------------------------------------------------------===#
+#
+# Prints R600 TableGen register definitions to stdout and writes the C++ getHWRegIndexGen()/getHWRegChanGen() lookups to R600HwRegInfo.include.
+#
+#===----------------------------------------------------------------------===#
+
+use strict;
+use warnings;
+
+use AMDGPUConstants;
+
+my $CREG_MAX = CONST_REG_COUNT - 1;
+my $TREG_MAX = TEMP_REG_COUNT - 1;
+
+print <<STRING;
+
+class R600Reg <string name> : Register<name> {
+  let Namespace = "AMDIL";
+}
+
+class R600Reg_128<string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDIL";
+  let SubRegIndices = [sel_x, sel_y, sel_z, sel_w];
+}
+
+STRING
+
+my $i;
+
+### REG DEFS ###
+
+my @creg_list = print_reg_defs(CONST_REG_COUNT * 4, "C");
+my @treg_list = print_reg_defs(TEMP_REG_COUNT * 4, "T");
+
+my @t128reg;
+my @treg_x;
+for (my $i = 0; $i < TEMP_REG_COUNT; $i++) {
+  my $name = "T$i\_XYZW";
+  print qq{def $name : R600Reg_128 <"T$i.XYZW", [T$i\_X, T$i\_Y, T$i\_Z, T$i\_W] >;\n};
+  $t128reg[$i] = $name;
+  $treg_x[$i] = "T$i\_X";
+}
+
+my $treg_string = join(",", @treg_list);
+my $creg_list = join(",", @creg_list);
+my $t128_string = join(",", @t128reg);
+my $treg_x_string = join(",", @treg_x);
+print <<STRING;
+
+class RegSet <dag s> {
+  dag set = s;
+}
+
+def ZERO : R600Reg<"0.0">;
+def HALF : R600Reg<"0.5">;
+def ONE : R600Reg<"1.0">;
+def ONE_INT : R600Reg<"1">;
+def NEG_HALF : R600Reg<"-0.5">;
+def NEG_ONE : R600Reg<"-1.0">;
+def PV_X : R600Reg<"pv.x">;
+def ALU_LITERAL_X : R600Reg<"literal.x">;
+
+def R600_CReg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    $creg_list)>;
+
+def R600_TReg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    $treg_string)>;
+
+def R600_TReg32_X : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    $treg_x_string)>;
+
+def R600_Reg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
+    R600_TReg32,
+    R600_CReg32,
+    ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>;
+
+def R600_Reg128 : RegisterClass<"AMDIL", [v4f32], 128, (add
+    $t128_string)>
+{
+  let SubRegClasses = [(R600_TReg32 sel_x, sel_y, sel_z, sel_w)];
+}
+
+STRING
+
+my %index_map;
+my %chan_map;
+
+for ($i = 0; $i <= $#creg_list; $i++) {
+  push(@{$index_map{get_hw_index($i)}}, $creg_list[$i]);
+  push(@{$chan_map{get_chan_str($i)}}, $creg_list[$i]);
+}
+
+for ($i = 0; $i <= $#treg_list; $i++) {
+  push(@{$index_map{get_hw_index($i)}}, $treg_list[$i]);
+  push(@{$chan_map{get_chan_str($i)}}, $treg_list[$i]);
+}
+
+for ($i = 0; $i <= $#t128reg; $i++) {
+  push(@{$index_map{$i}}, $t128reg[$i]);
+  push(@{$chan_map{'X'}}, $t128reg[$i]);
+}
+
+open(OUTFILE, ">", "R600HwRegInfo.include") or die("Cannot open R600HwRegInfo.include: $!");
+
+print OUTFILE <<STRING;
+
+unsigned R600RegisterInfo::getHWRegIndexGen(unsigned reg) const
+{
+  switch(reg) {
+  default: assert(!"Unknown register"); return 0;
+STRING
+foreach my $key (sort(keys(%index_map))) {
+  foreach my $reg (@{$index_map{$key}}) {
+    print OUTFILE " case AMDIL::$reg:\n";
+  }
+  print OUTFILE " return $key;\n\n";
+}
+
+print OUTFILE " }\n}\n\n";
+
+print OUTFILE <<STRING;
+
+unsigned R600RegisterInfo::getHWRegChanGen(unsigned reg) const
+{
+  switch(reg) {
+  default: assert(!"Unknown register"); return 0;
+STRING
+
+foreach my $key (sort(keys(%chan_map))) {
+  foreach my $reg (@{$chan_map{$key}}) {
+    print OUTFILE " case AMDIL::$reg:\n";
+  }
+  my $val;
+  if ($key eq 'X') {
+    $val = 0;
+  } elsif ($key eq 'Y') {
+    $val = 1;
+  } elsif ($key eq 'Z') {
+    $val = 2;
+  } elsif ($key eq 'W') {
+    $val = 3;
+  } else {
+    die("Unknown chan value; $key");
+  }
+  print OUTFILE " return $val;\n\n";
+}
+
+print OUTFILE " }\n}\n\n";
+
+sub print_reg_defs {
+  my ($count, $prefix) = @_;
+
+  my @reg_list;
+
+  for (my $i = 0; $i < $count; $i++) {
+    my $hw_index = get_hw_index($i);
+    my $chan= get_chan_str($i);
+    my $name = "$prefix$hw_index\_$chan";
+    print qq{def $name : R600Reg <"$prefix$hw_index.$chan">;\n};
+    $reg_list[$i] = $name;
+  }
+  return @reg_list;
+}
+
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.cpp b/src/gallium/drivers/radeon/R600ISelLowering.cpp
new file mode 100644
index
00000000000..104f4c5aeb8
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600ISelLowering.cpp
@@ -0,0 +1,102 @@
+//===-- R600ISelLowering.cpp - R600 custom instruction lowering -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom inserter for R600: TGID_*/TIDIG_* become live-ins of T1_*/T0_*; NGROUPS_*/GLOBAL_SIZE_*/LOCAL_SIZE_* become VTX_READ_eg loads at dword offsets 0-8.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600ISelLowering.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
+    AMDGPUTargetLowering(TM),
+    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo()))
+{
+  setOperationAction(ISD::MUL, MVT::i64, Expand);
+// setSchedulingPreference(Sched::VLIW);
+}
+
+MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
+    MachineInstr * MI, MachineBasicBlock * BB) const
+{
+  MachineFunction * MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+
+  switch (MI->getOpcode()) {
+  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  /* XXX: Use helper function from AMDGPULowerShaderInstructions here */
+  case AMDIL::TGID_X:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T1_X);
+    break;
+  case AMDIL::TGID_Y:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T1_Y);
+    break;
+  case AMDIL::TGID_Z:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T1_Z);
+    break;
+  case AMDIL::TIDIG_X:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T0_X);
+    break;
+  case AMDIL::TIDIG_Y:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T0_Y);
+    break;
+  case AMDIL::TIDIG_Z:
+    addLiveIn(MI, MF, MRI, TII, AMDIL::T0_Z);
+    break;
+  case AMDIL::NGROUPS_X:
+    lowerImplicitParameter(MI, *BB, MRI, 0);
+    break;
+  case AMDIL::NGROUPS_Y:
+    lowerImplicitParameter(MI, *BB, MRI, 1);
+    break;
+  case AMDIL::NGROUPS_Z:
+    lowerImplicitParameter(MI, *BB, MRI, 2);
+    break;
+  case AMDIL::GLOBAL_SIZE_X:
+    lowerImplicitParameter(MI, *BB, MRI, 3);
+    break;
+  case AMDIL::GLOBAL_SIZE_Y:
+    lowerImplicitParameter(MI, *BB, MRI, 4);
+    break;
+  case AMDIL::GLOBAL_SIZE_Z:
+    lowerImplicitParameter(MI, *BB, MRI, 5);
+    break;
+  case AMDIL::LOCAL_SIZE_X:
+    lowerImplicitParameter(MI, *BB, MRI, 6);
+    break;
+  case AMDIL::LOCAL_SIZE_Y:
+    lowerImplicitParameter(MI, *BB, MRI, 7);
+    break;
+  case AMDIL::LOCAL_SIZE_Z:
+    lowerImplicitParameter(MI, *BB, MRI, 8);
+    break;
+  }
+  MI->eraseFromParent();
+  return BB;
+}
+
+void R600TargetLowering::lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+    MachineRegisterInfo & MRI, unsigned dword_offset) const
+{
+  MachineBasicBlock::iterator I = *MI;
+  unsigned offsetReg = MRI.createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+  MRI.setRegClass(MI->getOperand(0).getReg(), &AMDIL::R600_TReg32_XRegClass);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::MOV), offsetReg)
+          .addReg(AMDIL::ALU_LITERAL_X)
+          .addImm(dword_offset * 4);
+
+  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::VTX_READ_eg))
+          .addOperand(MI->getOperand(0))
+          .addReg(offsetReg)
+          .addImm(0);
+}
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.h b/src/gallium/drivers/radeon/R600ISelLowering.h
new file mode 100644
index 00000000000..fd26bf538c4
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600ISelLowering.h
@@ -0,0 +1,40 @@
+//===-- R600ISelLowering.h - R600 custom instruction lowering interface -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares R600TargetLowering, the AMDGPUTargetLowering subclass that provides the R600 EmitInstrWithCustomInserter() override.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600ISELLOWERING_H
+#define R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+
+class R600TargetLowering : public AMDGPUTargetLowering
+{
+public:
+  R600TargetLowering(TargetMachine &TM);
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+      MachineBasicBlock * BB) const;
+
+private:
+  const R600InstrInfo * TII;
+
+  void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
+      MachineRegisterInfo & MRI, unsigned dword_offset) const;
+
+};
+
+} // End namespace llvm;
+
+#endif // R600ISELLOWERING_H
diff --git a/src/gallium/drivers/radeon/R600InstrFormats.td b/src/gallium/drivers/radeon/R600InstrFormats.td
new file mode 100644
index 00000000000..0890eb64509
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600InstrFormats.td
@@ -0,0 +1,16 @@
+//===-- R600InstrFormats.td - R600 instruction format stub -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE(review): ALUInst below is an incomplete stub (empty first template argument, no body or terminator); presumably this file is not included by the build - confirm.
+//
+//===----------------------------------------------------------------------===//
+
+
+class ALUInst <bits<10> op, dag outs, dag ins, string asm, list<dag> pattern>
+  : InstR600 <, outs, ins , asm, pattern>
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.cpp b/src/gallium/drivers/radeon/R600InstrInfo.cpp
new file mode 100644
index 00000000000..bcee89c4f91
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600InstrInfo.cpp
@@ -0,0 +1,109 @@
+//===-- R600InstrInfo.cpp - R600 instruction information -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600InstrInfo: physical register copies via MOV, AMDIL-to-ISA opcode mapping, and selection of per-generation (r600/eg/cm) opcode variants.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600InstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "R600RegisterInfo.h"
+
+using namespace llvm;
+
+R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
+  : AMDGPUInstrInfo(tm),
+    RI(tm, *this),
+    TM(tm)
+  { }
+
+const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const
+{
+  return RI;
+}
+
+bool R600InstrInfo::isTrig(const MachineInstr &MI) const
+{
+  return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
+}
+
+void
+R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const
+{
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg)
+      && AMDIL::GPRI32RegClass.contains(SrcReg)) {
+    SrcReg = AMDIL::T0_X;
+  }
+  BuildMI(MBB, MI, DL, get(AMDIL::MOV), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
+unsigned R600InstrInfo::getISAOpcode(unsigned opcode) const
+{
+  switch (opcode) {
+    default: return AMDGPUInstrInfo::getISAOpcode(opcode);
+    case AMDIL::CUSTOM_ADD_i32:
+      return AMDIL::ADD_INT;
+    case AMDIL::CUSTOM_XOR_i32:
+      return AMDIL::XOR_INT;
+    case AMDIL::MOVE_f32:
+    case AMDIL::MOVE_i32:
+      return AMDIL::MOV;
+    case AMDIL::SHR_i32:
+      return getLSHRop();
+  }
+}
+
+unsigned R600InstrInfo::getLSHRop() const
+{
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+  if (gen < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::LSHR_r600;
+  } else {
+    return AMDIL::LSHR_eg;
+  }
+}
+
+unsigned R600InstrInfo::getMULHI_UINT() const
+{
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+
+  if (gen < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::MULHI_UINT_r600;
+  } else {
+    return AMDIL::MULHI_UINT_eg;
+  }
+}
+
+unsigned R600InstrInfo::getMULLO_UINT() const
+{
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+
+  if (gen < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::MULLO_UINT_r600;
+  } else {
+    return AMDIL::MULLO_UINT_eg;
+  }
+}
+
+unsigned R600InstrInfo::getRECIP_UINT() const
+{
+  const AMDILDevice * dev = TM.getSubtarget<AMDILSubtarget>().device();
+
+  if (dev->getGeneration() < AMDILDeviceInfo::HD5XXX) {
+    return AMDIL::RECIP_UINT_r600;
+  } else if (dev->getDeviceFlag() != OCL_DEVICE_CAYMAN) {
+    return AMDIL::RECIP_UINT_eg;
+  } else {
+    return AMDIL::RECIP_UINT_cm;
+  }
+}
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.h b/src/gallium/drivers/radeon/R600InstrInfo.h
new file mode 100644
index 00000000000..aedaa9f47f3
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600InstrInfo.h
@@ -0,0 +1,74 @@
+//===-- R600InstrInfo.h - R600 instruction information interface -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares R600InstrInfo (the AMDGPUInstrInfo subclass for R600) and the R600_InstFlag bit values tested against instruction TSFlags.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600INSTRUCTIONINFO_H_
+#define R600INSTRUCTIONINFO_H_
+
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "R600RegisterInfo.h"
+
+#include <map>
+
+namespace llvm {
+
+  struct InstrGroup {
+    unsigned amdil;
+    unsigned r600;
+    unsigned eg;
+    unsigned cayman;
+  };
+
+  class AMDGPUTargetMachine;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineInstrBuilder;
+
+  class R600InstrInfo : public AMDGPUInstrInfo {
+  private:
+  const R600RegisterInfo RI;
+  AMDGPUTargetMachine &TM;
+
+  public:
+  explicit R600InstrInfo(AMDGPUTargetMachine &tm);
+
+  const R600RegisterInfo &getRegisterInfo() const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  virtual unsigned getISAOpcode(unsigned opcode) const;
+  bool isTrig(const MachineInstr &MI) const;
+
+  unsigned getLSHRop() const;
+  unsigned getMULHI_UINT() const;
+  unsigned getMULLO_UINT() const;
+  unsigned getRECIP_UINT() const;
+
+  };
+
+} // End llvm namespace
+
+namespace R600_InstFlag {
+  enum TIF {
+    TRANS_ONLY = (1 << 0),
+    TEX = (1 << 1),
+    REDUCTION = (1 << 2),
+    FC = (1 << 3),
+    TRIG = (1 << 4),
+    OP3 = (1 << 5)
+  };
+}
+
+#endif // R600INSTRUCTIONINFO_H_
diff --git a/src/gallium/drivers/radeon/R600Instructions.td b/src/gallium/drivers/radeon/R600Instructions.td
new file mode 100644
index 00000000000..913e27f1f9c
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600Instructions.td
@@ -0,0 +1,931 @@
+//===-- R600Instructions.td - R600 instruction definitions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TableGen instruction formats, subtarget predicates, and instruction definitions shared by R600, R700, Evergreen and Cayman.
+//
+//===----------------------------------------------------------------------===//
+
+include "R600Intrinsics.td"
+
+class InstR600 <bits<32> inst, dag outs, dag ins, string asm, list<dag> pattern,
+                InstrItinClass itin>
+    : AMDGPUInst <outs, ins, asm, pattern> {
+
+  field bits<32> Inst;
+  bit Trig = 0;
+  bit Op3 = 0;
+
+  let Inst = inst;
+  let Namespace = "AMDIL";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = itin;
+
+  let TSFlags{4} = Trig;
+  let TSFlags{5} = Op3;
+}
+
+class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst <outs, ins, asm, pattern>
+{
+  field bits<64> Inst;
+
+  let Namespace = "AMDIL";
+}
+
+def MEMri : Operand<iPTRAny> {
+  let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index);
+}
+
+def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>;
+
+class R600_ALU {
+
+  bits<7> DST_GPR = 0;
+  bits<9> SRC0_SEL = 0;
+  bits<1> SRC0_NEG = 0;
+  bits<9> SRC1_SEL = 0;
+  bits<1> SRC1_NEG = 0;
+  bits<1> CLAMP = 0;
+
+}
+
+
+class R600_1OP <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins R600_Reg32:$src, variable_ops),
+          !strconcat(opName, " $dst, $src"),
+          pattern,
+          itin
+  >;
+
+class R600_2OP <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins R600_Reg32:$src0, R600_Reg32:$src1, variable_ops),
+          !strconcat(opName, " $dst, $src0, $src1"),
+          pattern,
+          itin
+  >;
+
+class R600_3OP <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          (ins R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2, variable_ops),
+          !strconcat(opName, " $dst, $src0, $src1, $src2"),
+          pattern,
+          itin>{
+
+    let Op3 = 1;
+  }
+
+class R600_REDUCTION <bits<32> inst, dag ins, string asm, list<dag> pattern,
+                      InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg32:$dst),
+          ins,
+          asm,
+          pattern,
+          itin
+
+  >;
+
+class R600_TEX <bits<32> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+  InstR600 <inst,
+          (outs R600_Reg128:$dst),
+          (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2),
+          !strconcat(opName, " $dst, $src0, $src1, $src2"),
+          pattern,
+          itin
+  >;
+
+def TEX_SHADOW : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return (TType >= 6 && TType <= 8) || TType == 11 || TType == 12;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, dag outs, dag ins,
+                 string asm> :
+    InstR600ISA <outs, ins, asm, []>
+{
+  bits<7> RW_GPR;
+  bits<7> INDEX_GPR;
+  bits<4> RAT_ID;
+
+  bits<2> RIM;
+  bits<2> TYPE;
+  bits<1> RW_REL;
+  bits<2> ELEM_SIZE;
+
+  bits<12> ARRAY_SIZE;
+  bits<4> COMP_MASK;
+  bits<4> BURST_COUNT;
+  bits<1> VPM;
+  bits<1> EOP;
+  bits<1> MARK;
+  bits<1> BARRIER;
+
+  /* CF_ALLOC_EXPORT_WORD0_RAT */
+  let Inst{3-0} = RAT_ID;
+  let Inst{9-4} = rat_inst;
+  let Inst{10} = 0; /* Reserved */
+  let Inst{12-11} = RIM;
+  let Inst{14-13} = TYPE;
+  let Inst{21-15} = RW_GPR;
+  let Inst{22} = RW_REL;
+  let Inst{29-23} = INDEX_GPR;
+  let Inst{31-30} = ELEM_SIZE;
+
+  /* CF_ALLOC_EXPORT_WORD1_BUF */
+/* XXX: We can't have auto encoding of 64-bit instructions until LLVM 3.1 :( */
+/*
+  let Inst{43-32} = ARRAY_SIZE;
+  let Inst{47-44} = COMP_MASK;
+  let Inst{51-48} = BURST_COUNT;
+  let Inst{52} = VPM;
+  let Inst{53} = EOP;
+  let Inst{61-54} = cf_inst;
+  let Inst{62} = MARK;
+  let Inst{63} = BARRIER;
+*/
+}
+
+/*
+def store_global : PatFrag<(ops node:$value, node:$ptr),
+                           (store node:$value, node:$ptr),
+                           [{
+                            const Value *Src;
+                            const PointerType *Type;
+                            if ((src = cast<StoreSDNode>(N)->getSrcValue() &&
+                                 PT = dyn_cast<PointerType>(Src->getType()))) {
+                              return PT->getAddressSpace() == 1;
+                            }
+                            return false;
+                           }]>;
+
+*/
+
+def load_param : PatFrag<(ops node:$ptr),
+                         (load node:$ptr),
+                         [{
+                           return true; // NOTE(review): makes the address-space check below unreachable; confirm this is intended.
+                           const Value *Src = cast<LoadSDNode>(N)->getSrcValue();
+                           if (Src) {
+                             PointerType * PT = dyn_cast<PointerType>(Src->getType());
+                             return PT && PT->getAddressSpace() == AMDILAS::PARAM_I_ADDRESS;
+                           }
+                           return false;
+                         }]>;
+
+//class EG_CF <bits<32> inst, string asm> :
+//    InstR600 <inst, (outs), (ins), asm, []>;
+
+/* XXX: We will use this when we emit the real ISA.
+  bits<24> ADDR = 0;
+  bits<3> JTS = 0;
+
+  bits<3> PC = 0;
+  bits<5> CF_CONST = 0;
+  bits<2> COND = 0;
+  bits<6> COUNT = 0;
+  bits<1> VPM = 0;
+  bits<1> EOP = 0;
+  bits<8> CF_INST = 0;
+  bits<1> WQM = 0;
+  bits<1> B = 0;
+
+  let Inst{23-0} = ADDR;
+  let Inst{26-24} = JTS;
+  let Inst{34-32} = PC;
+  let Inst{39-35} = CF_CONST;
+  let Inst{41-40} = COND;
+  let Inst{47-42} = COUNT;
+  let Inst{52} = VPM;
+  let Inst{53} = EOP;
+  let Inst{61-54} = CF_INST;
+  let Inst{62} = WQM;
+  let Inst{63} = B;
+//}
+*/
+def isR600 : Predicate<"Subtarget.device()"
+                            "->getGeneration() == AMDILDeviceInfo::HD4XXX">;
+def isEG : Predicate<"Subtarget.device()"
+                            "->getGeneration() >= AMDILDeviceInfo::HD5XXX && "
+                            "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
+def isCayman : Predicate<"Subtarget.device()"
+                            "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
+def isEGorCayman : Predicate<"Subtarget.device()"
+                            "->getGeneration() >= AMDILDeviceInfo::HD5XXX">;
+
+def isR600toCayman : Predicate<
+                     "Subtarget.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX">;
+
+
+let Predicates = [isR600toCayman] in {
+
+/* ------------------------------------------- */
+/* Common Instructions R600, R700, Evergreen, Cayman */
+/* ------------------------------------------- */
+let Gen = AMDGPUGen.R600_CAYMAN in {
+
+def ADD : R600_2OP <
+  0x0, "ADD",
+  [(set R600_Reg32:$dst, (fadd R600_Reg32:$src0, R600_Reg32:$src1))] > {
+  let AMDILOp = AMDILInst.ADD_f32;
+}
+// Non-IEEE MUL: 0 * anything = 0
+def MUL : R600_2OP <
+  0x1, "MUL NON-IEEE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_mul R600_Reg32:$src0, R600_Reg32:$src1))]
+>;
+
+def MUL_IEEE : R600_2OP <
+  0x2, "MUL_IEEE",
+  [(set R600_Reg32:$dst, (fmul R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.MUL_IEEE_f32;
+}
+
+def MAX : R600_2OP <
+  0x3, "MAX",
+  [(set R600_Reg32:$dst, (int_AMDIL_max R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.MAX_f32;
+}
+
+def MIN : R600_2OP <
+  0x4, "MIN",
+  [(set R600_Reg32:$dst, (int_AMDIL_min R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.MIN_f32;
+}
+
+/* For the SET* instructions there is a naming conflict in TargetSelectionDAG.td,
+ * so some of the instruction names don't match the asm string.
+ * XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
+ */
+
+def SETE : R600_2OP <
+  0x08, "SETE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_seq R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.FEQ;
+}
+
+def SGT : R600_2OP <
+  0x09, "SETGT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_sgt R600_Reg32:$src0, R600_Reg32:$src1))]
+>;
+
+def SGE : R600_2OP <
+  0xA, "SETGE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_sge R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.FGE;
+}
+
+def SNE : R600_2OP <
+  0xB, "SETNE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_sne R600_Reg32:$src0, R600_Reg32:$src1))]> {
+  let AMDILOp = AMDILInst.FNE;
+}
+
+def FRACT : R600_1OP <
+  0x10, "FRACT",
+  []> {
+  let AMDILOp = AMDILInst.FRAC_f32;
+}
+
+def TRUNC : R600_1OP <
+  0x11, "TRUNC",
+  [(set R600_Reg32:$dst, (int_AMDGPU_trunc R600_Reg32:$src))]
+>;
+
+def FLOOR : R600_1OP <
+  0x14, "FLOOR",
+  [(set R600_Reg32:$dst, (int_AMDGPU_floor R600_Reg32:$src))]
+>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+def KILLGT : R600_2OP <
+  0x2D, "KILLGT",
+  []
+>;
+
+def AND_INT : R600_2OP <
+  0x30, "AND_INT",
+  []> {
+  let AMDILOp = AMDILInst.AND_i32;
+}
+
+def XOR_INT : R600_2OP <
+  0x32, "XOR_INT",
+  []
+>;
+
+def ADD_INT : R600_2OP <
+  0x34, "ADD_INT",
+  []>{
+  let AMDILOp = AMDILInst.ADD_i32;
+}
+
+def SUB_INT : R600_2OP <
+  0x35, "SUB_INT",
+  []
+>;
+
+def SETE_INT : R600_2OP <
+  0x3A, "SETE_INT",
+  []>{
+  let AMDILOp = AMDILInst.IEQ;
+}
+
+def SETGT_INT : R600_2OP <
+  0x3B, "SETGT_INT",
+  []
+>;
+
+def SETGE_INT : R600_2OP <
+  0x3C, "SETGE_INT",
+  []>{
+  let AMDILOp = AMDILInst.IGE;
+}
+
+def SETNE_INT : R600_2OP <
+  0x3D, "SETNE_INT",
+  []>{
+  let AMDILOp = AMDILInst.INE;
+}
+
+def SETGT_UINT : R600_2OP <
+  0x3E, "SETGT_UINT",
+  []>{
+  let AMDILOp = AMDILInst.UGT;
+}
+
+def SETGE_UINT : R600_2OP <
+  0x3F, "SETGE_UINT",
+  []>{
+  let AMDILOp = AMDILInst.UGE;
+}
+
+def CNDE_INT : R600_3OP <
+  0x1C, "CNDE_INT",
+  []
+>;
+
+/* Texture instructions */
+
+def TEX_SAMPLE : R600_TEX <
+  0x10, "TEX_SAMPLE",
+  [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C : R600_TEX <
+  0x18, "TEX_SAMPLE_C",
+  [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+def TEX_SAMPLE_L : R600_TEX <
+  0x11, "TEX_SAMPLE_L",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C_L : R600_TEX <
+  0x19, "TEX_SAMPLE_C_L",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+def TEX_SAMPLE_LB : R600_TEX <
+  0x12, "TEX_SAMPLE_LB",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C_LB : R600_TEX <
+  0x1A, "TEX_SAMPLE_C_LB",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+def TEX_SAMPLE_G : R600_TEX <
+  0x14, "TEX_SAMPLE_G",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_SAMPLE_C_G : R600_TEX <
+  0x1C, "TEX_SAMPLE_C_G",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, imm:$src1, TEX_SHADOW:$src2))]
+>;
+
+} // End Gen R600_CAYMAN
+
+def KILP : Pat <
+  (int_AMDGPU_kilp),
+  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+/* Helper classes for common instructions */
+
+class MUL_LIT_Common <bits<32> inst> : R600_3OP <
+  inst, "MUL_LIT",
+  []
+>;
+
+class MULADD_Common <bits<32> inst> : R600_3OP <
+  inst, "MULADD",
+  []> {
+  let AMDILOp = AMDILInst.MAD_f32;
+}
+
+class CNDE_Common <bits<32> inst> : R600_3OP <
+  inst, "CNDE",
+  []> {
+  let AMDILOp = AMDILInst.CMOVLOG_f32;
+}
+
+class CNDGT_Common <bits<32> inst> : R600_3OP <
+  inst, "CNDGT",
+  []
+>;
+
+class CNDGE_Common <bits<32> inst> : R600_3OP <
+  inst, "CNDGE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_cndlt R600_Reg32:$src0, R600_Reg32:$src2, R600_Reg32:$src1))]
+>;
+
+class DOT4_Common <bits<32> inst> : R600_REDUCTION <
+  inst,
+  (ins R600_Reg128:$src0, R600_Reg128:$src1),
+  "DOT4 $dst, $src0, $src1",
+  [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
+>;
+
+class EXP_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "EXP_IEEE",
+  []> {
+  let AMDILOp = AMDILInst.EXP_f32;
+}
+
+class FLT_TO_INT_Common <bits<32> inst> : R600_1OP <
+  inst, "FLT_TO_INT", []> {
+  let AMDILOp = AMDILInst.FTOI;
+}
+
+class INT_TO_FLT_Common <bits<32> inst> : R600_1OP <
+  inst, "INT_TO_FLT", []> {
+  let AMDILOp = AMDILInst.ITOF;
+}
+
+class LOG_CLAMPED_Common <bits<32> inst> : R600_1OP <
+  inst, "LOG_CLAMPED",
+  []
+>;
+
+class LOG_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "LOG_IEEE",
+  []> {
+  let AMDILOp = AMDILInst.LOG_f32;
+}
+
+class LSHL_Common <bits<32> inst> : R600_2OP <
+  inst, "LSHL",
+  [] >{
+  let AMDILOp = AMDILInst.SHL_i32;
+}
+
+class LSHR_Common <bits<32> inst> : R600_2OP <
+  inst, "LSHR",
+  [] >{
+  let AMDILOp = AMDILInst.USHR_i32;
+}
+
+class MULHI_INT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULHI_INT",
+  [] >{
+  let AMDILOp = AMDILInst.SMULHI_i32;
+}
+
+/* Helper classes shared by the R600/R700 and Evergreen/Cayman definitions.
+ * Each class takes the generation-specific ALU opcode as 'inst'. */
+
+class MULHI_UINT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULHI_UINT $dst, $src0, $src1",
+  []
+>;
+
+class MULLO_INT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULLO_INT $dst, $src0, $src1",
+  [] >{
+  let AMDILOp = AMDILInst.SMUL_i32;
+}
+
+class MULLO_UINT_Common <bits<32> inst> : R600_2OP <
+  inst, "MULLO_UINT $dst, $src0, $src1",
+  []
+>;
+
+class RECIP_CLAMPED_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIP_CLAMPED",
+  []
+>;
+
+/* NOTE(review): this is selected for int_AMDGPU_rcp but is mapped to the AMDIL
+ * RSQ_f32 opcode; confirm that RSQ_f32 really is the plain reciprocal here. */
+class RECIP_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIP_IEEE",
+  [(set R600_Reg32:$dst, (int_AMDGPU_rcp R600_Reg32:$src))]> {
+  let AMDILOp = AMDILInst.RSQ_f32;
+}
+
+/* The mnemonic has to be RECIP_UINT: RECIP_INT is a different ALU
+ * instruction with its own opcode. */
+class RECIP_UINT_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIP_UINT $dst, $src",
+  []
+>;
+
+class RECIPSQRT_CLAMPED_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIPSQRT_CLAMPED",
+  [(set R600_Reg32:$dst, (int_AMDGPU_rsq R600_Reg32:$src))]
+>;
+
+class RECIPSQRT_IEEE_Common <bits<32> inst> : R600_1OP <
+  inst, "RECIPSQRT_IEEE",
+  []
+>;
+
+class SIN_Common <bits<32> inst> : R600_1OP <
+  inst, "SIN",
+  []>{
+  let AMDILOp = AMDILInst.SIN_f32;
+  let Trig = 1;
+}
+
+class COS_Common <bits<32> inst> : R600_1OP <
+  inst, "COS",
+  []> {
+  let AMDILOp = AMDILInst.COS_f32;
+  let Trig = 1;
+}
+
+/* Helper patterns for complex intrinsics */
+/* -------------------------------------- */
+
+/* div(a, b) = a * recip(b) */
+class DIV_Common <InstR600 recip_ieee> : Pat<
+  (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
+  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+>;
+
+/* lrp(a, b, c) = a * b + (1 - a) * c */
+class LRP_Common <InstR600 muladd> : Pat <
+  (int_AMDGPU_lrp R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+  (muladd R600_Reg32:$src0, R600_Reg32:$src1, (MUL (SUB_f32 ONE, R600_Reg32:$src0), R600_Reg32:$src2))
+>;
+
+class SSG_Common <InstR600 cndgt, InstR600 cndge> : Pat <
+  (int_AMDGPU_ssg R600_Reg32:$src),
+  (cndgt R600_Reg32:$src, (f32 ONE), (cndge R600_Reg32:$src, (f32 ZERO), (f32 NEG_ONE)))
+>;
+
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
+  (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
+  (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
+>;
+
+/* ---------------------- */
+/* R600 / R700 Only Instructions */
+/* ---------------------- */
+
+let Predicates = [isR600] in {
+
+let Gen = AMDGPUGen.R600 in {
+
+  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+  def MULADD_r600 : MULADD_Common<0x10>;
+  def CNDE_r600 : CNDE_Common<0x18>;
+  def CNDGT_r600 : CNDGT_Common<0x19>;
+  def CNDGE_r600 : CNDGE_Common<0x1A>;
+  def DOT4_r600 : DOT4_Common<0x50>;
+  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+  def SIN_r600 : SIN_Common<0x6E>;
+  def COS_r600 : COS_Common<0x6F>;
+  def LSHR_r600 : LSHR_Common<0x71>;
+  def LSHL_r600 : LSHL_Common<0x72>;
+  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+  /* 0x77 is RECIP_INT on R600/R700; RECIP_UINT is the next opcode. */
+  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
+
+} // End AMDGPUGen.R600
+
+  def DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+  def LRP_r600 : LRP_Common<MULADD_r600>;
+  def POW_r600 : POW_Common<LOG_IEEE_r600, EXP_IEEE_r600, MUL, GPRF32>;
+  def SSG_r600 : SSG_Common<CNDGT_r600, CNDGE_r600>;
+  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
+
+}
+
+/* ----------------- */
+/* R700+ Trig helper */
+/* ----------------- */
+
+/*
+class TRIG_HELPER_r700 <InstR600 trig_inst>: Pat <
+  (trig_inst R600_Reg32:$src),
+  (trig_inst (fmul R600_Reg32:$src, (PI))))
+>;
+*/
+
+/* ---------------------- */ +/* Evergreen Instructions */ +/* ---------------------- */ + + +let Predicates = [isEG] in { + +let Gen = AMDGPUGen.EG in { + +def RAT_WRITE_CACHELESS_eg : + EG_CF_RAT <0x57, 0x2, (outs), (ins R600_TReg32_X:$rw_gpr, + R600_TReg32_X:$index_gpr, i32imm:$rat_id), ""> +{ +/* + let Inst{3-0} = RAT_ID; + let Inst{21-15} = RW_GPR; + let Inst{29-23} = INDEX_GPR; + /* Property of the UAV */ + let Inst{31-30} = ELEM_SIZE; +*/ + let RIM = 0; + /* XXX: Have a separate instruction for non-indexed writes. */ + let TYPE = 1; + let RW_REL = 0; + let ELEM_SIZE = 0; + +/* + let ARRAY_SIZE = 0; + let COMP_MASK = 1; + let BURST_COUNT = 0; + let VPM = 0; + let EOP = 0; + let MARK = 0; + let BARRIER = 1; +*/ +} + +def VTX_READ_eg : InstR600ISA < (outs R600_TReg32_X:$dst), + (ins R600_TReg32_X:$src, i32imm:$buffer_id), + "VTX_READ_eg $dst, $src", []> +{ +/* + bits<7> DST_GPR; + bits<7> SRC_GPR; + bits<8> BUFFER_ID; +*/ + /* If any of these fields below need to be calculated at compile time, add + * an ins operand for them and move them to the list of operands above. */ + + /* XXX: This instruction is manually encoded, so none of these values are used. 
+ */ +/* + bits<5> VC_INST = 0; //VC_INST_FETCH + bits<2> FETCH_TYPE = 2; + bits<1> FETCH_WHOLE_QUAD = 1; + bits<1> SRC_REL = 0; + bits<2> SRC_SEL_X = 0; + bits<6> MEGA_FETCH_COUNT = 4; +*/ +/* + + bits<1> DST_REL = 0; + bits<3> DST_SEL_X = 0; + bits<3> DST_SEL_Y = 7; //Masked + bits<3> DST_SEL_Z = 7; //Masked + bits<3> DST_SEL_W = 7; //Masked + bits<1> USE_CONST_FIELDS = 1; //Masked + bits<6> DATA_FORMAT = 0; + bits<2> NUM_FORMAT_ALL = 0; + bits<1> FORMAT_COMP_ALL = 0; + bits<1> SRF_MODE_ALL = 0; +*/ + +/* + let Inst{4-0} = VC_INST; + let Inst{6-5} = FETCH_TYPE; + let Inst{7} = FETCH_WHOLE_QUAD; + let Inst{15-8} = BUFFER_ID; + let Inst{22-16} = SRC_GPR; + let Inst{23} = SRC_REL; + let Inst{25-24} = SRC_SEL_X; + let Inst{31-26} = MEGA_FETCH_COUNT; +*/ + /* DST_GPR is OK to leave uncommented, because LLVM 3.0 only prevents you + * from statically setting bits > 31. This field will be set by + * getMachineValueOp which can set bits > 31. + */ +// let Inst{32-38} = DST_GPR; + + /* XXX: Uncomment for LLVM 3.1 which supports 64-bit instructions */ + +/* + let Inst{39} = DST_REL; + let Inst{40} = 0; //Reserved + let Inst{43-41} = DST_SEL_X; + let Inst{46-44} = DST_SEL_Y; + let Inst{49-47} = DST_SEL_Z; + let Inst{52-50} = DST_SEL_W; + let Inst{53} = USE_CONST_FIELDS; + let Inst{59-54} = DATA_FORMAT; + let Inst{61-60} = NUM_FORMAT_ALL; + let Inst{62} = FORMAT_COMP_ALL; + let Inst{63} = SRF_MODE_ALL; +*/ +} + + + +} // End AMDGPUGen.EG +/* XXX: Need to convert PTR to rat_id */ +/* +def : Pat <(store_global (f32 R600_Reg32:$value), node:$ptr), + (RAT_WRITE_CACHELESS_eg (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (f32 R600_Reg32:$value), + sel_x), + (f32 ZERO), 0, R600_Reg32:$ptr)>; +*/ + +class VTX_Param_Read_Pattern <ValueType vt> : Pat < + (vt (load_param ADDRParam:$mem)), + (VTX_READ_eg (i32 R600_Reg32:$mem), 0)>; + +def : VTX_Param_Read_Pattern <f32>; +def : VTX_Param_Read_Pattern <i32>; + +} // End isEG Predicate + +/* ------------------------------- */ +/* Evergreen / 
Cayman Instructions */ +/* ------------------------------- */ + +let Predicates = [isEGorCayman] in { + +class TRIG_eg <InstR600 trig, Intrinsic intr> : Pat< + (intr R600_Reg32:$src), + (trig (MUL (MOV (LOADCONST_i32 CONST.TWO_PI_INV)), R600_Reg32:$src)) +>; + +let Gen = AMDGPUGen.EG_CAYMAN in { + + def MULADD_eg : MULADD_Common<0x14>; + def LSHR_eg : LSHR_Common<0x16>; + def LSHL_eg : LSHL_Common<0x17>; + def CNDE_eg : CNDE_Common<0x19>; + def CNDGT_eg : CNDGT_Common<0x1A>; + def CNDGE_eg : CNDGE_Common<0x1B>; + def MUL_LIT_eg : MUL_LIT_Common<0x1F>; + def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50>; + def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; + def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; + def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; + def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; + def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; + def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; + def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; + def SIN_eg : SIN_Common<0x8D>; + def COS_eg : COS_Common<0x8E>; + def MULLO_INT_eg : MULLO_INT_Common<0x8F>; + def MULHI_INT_eg : MULHI_INT_Common<0x90>; + def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; + def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; + def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; + def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + def DOT4_eg : DOT4_Common<0xBE>; + +} // End AMDGPUGen.EG_CAYMAN + + def DIV_eg : DIV_Common<RECIP_IEEE_eg>; + def LRP_eg : LRP_Common<MULADD_eg>; + def POW_eg : POW_Common<LOG_IEEE_eg, EXP_IEEE_eg, MUL, GPRF32>; + def SSG_eg : SSG_Common<CNDGT_eg, CNDGE_eg>; + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; + + def : TRIG_eg <SIN_eg, int_AMDGPU_sin>; + def : TRIG_eg <COS_eg, int_AMDGPU_cos>; + +} + +let Predicates = [isCayman] in { + +let Gen = AMDGPUGen.CAYMAN in { + + /* XXX: I'm not sure if this opcode is correct. 
*/ + def RECIP_UINT_cm : RECIP_UINT_Common<0x77>; + +} // End AMDGPUGen.CAYMAN + +} // End isCayman + +/* Other Instructions */ + +let isCodeGenOnly = 1 in { +/* + def SWIZZLE : AMDGPUShaderInst < + (outs GPRV4F32:$dst), + (ins GPRV4F32:$src0, i32imm:$src1), + "SWIZZLE $dst, $src0, $src1", + [(set GPRV4F32:$dst, (int_AMDGPU_swizzle GPRV4F32:$src0, imm:$src1))] + >; +*/ + + def LAST : AMDGPUShaderInst < + (outs), + (ins), + "LAST", + [] + >; + + def GET_CHAN : AMDGPUShaderInst < + (outs R600_Reg32:$dst), + (ins R600_Reg128:$src0, i32imm:$src1), + "GET_CHAN $dst, $src0, $src1", + [] + >; + + def SET_CHAN : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins R600_Reg32:$src0, i32imm:$src1), + "SET_CHAN $dst, $src0, $src1", + [] + >; + + def MULLIT : AMDGPUShaderInst < + (outs R600_Reg128:$dst), + (ins R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2), + "MULLIT $dst, $src0, $src1", + [(set R600_Reg128:$dst, (int_AMDGPU_mullit R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2))] + >; + +let usesCustomInserter = 1, isPseudo = 1 in { + +class R600PreloadInst <string asm, Intrinsic intr> : AMDGPUInst < + (outs R600_TReg32:$dst), + (ins), + asm, + [(set R600_TReg32:$dst, (intr))] +>; + +def TGID_X : R600PreloadInst <"TGID_X", int_r600_read_tgid_x>; +def TGID_Y : R600PreloadInst <"TGID_Y", int_r600_read_tgid_y>; +def TGID_Z : R600PreloadInst <"TGID_Z", int_r600_read_tgid_z>; + +def TIDIG_X : R600PreloadInst <"TIDIG_X", int_r600_read_tidig_x>; +def TIDIG_Y : R600PreloadInst <"TIDIG_Y", int_r600_read_tidig_y>; +def TIDIG_Z : R600PreloadInst <"TIDIG_Z", int_r600_read_tidig_z>; + +def NGROUPS_X : R600PreloadInst <"NGROUPS_X", int_r600_read_ngroups_x>; +def NGROUPS_Y : R600PreloadInst <"NGROUPS_Y", int_r600_read_ngroups_y>; +def NGROUPS_Z : R600PreloadInst <"NGROUPS_Z", int_r600_read_ngroups_z>; + +def GLOBAL_SIZE_X : R600PreloadInst <"GLOBAL_SIZE_X", + int_r600_read_global_size_x>; +def GLOBAL_SIZE_Y : R600PreloadInst <"GLOBAL_SIZE_Y", + int_r600_read_global_size_y>; 
+def GLOBAL_SIZE_Z : R600PreloadInst <"GLOBAL_SIZE_Z", + int_r600_read_global_size_z>; + +def LOCAL_SIZE_X : R600PreloadInst <"LOCAL_SIZE_X", + int_r600_read_local_size_x>; +def LOCAL_SIZE_Y : R600PreloadInst <"LOCAL_SIZE_Y", + int_r600_read_local_size_y>; +def LOCAL_SIZE_Z : R600PreloadInst <"LOCAL_SIZE_Z", + int_r600_read_local_size_z>; + +} // End usesCustomInserter = 1, isPseudo = 1 + +} // End isCodeGenOnly = 1 + + + +include "R600ShaderPatterns.td" + +// We need this pattern to avoid having real registers in PHI nodes. +// For some reason this pattern only works when it comes after the other +// instruction defs. +def : Pat < + (int_R600_load_input imm:$src), + (LOAD_INPUT imm:$src) +>; + +} // End isR600toCayman Predicate diff --git a/src/gallium/drivers/radeon/R600Intrinsics.td b/src/gallium/drivers/radeon/R600Intrinsics.td new file mode 100644 index 00000000000..8038fee1a3c --- /dev/null +++ b/src/gallium/drivers/radeon/R600Intrinsics.td @@ -0,0 +1,40 @@ +//===-- R600Intrinsics.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadWriteArgMem]>; +} + +let TargetPrefix = "r600", isTarget = 1 in { + +class R600ReadPreloadRegisterIntrinsic<string name> + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, + GCCBuiltin<name>; + +multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> { + def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>; + def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>; + def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>; +} + +defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_global_size">; +defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_local_size">; +defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_ngroups">; +defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_tgid">; +defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_tidig">; +} // End TargetPrefix = "r600" diff --git a/src/gallium/drivers/radeon/R600KernelParameters.cpp b/src/gallium/drivers/radeon/R600KernelParameters.cpp new file mode 100644 index 00000000000..3fdf48a2bf2 --- /dev/null +++ b/src/gallium/drivers/radeon/R600KernelParameters.cpp @@ -0,0 +1,503 @@ +//===-- R600KernelParameters.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include <llvm-c/Core.h> +#include "R600KernelParameters.h" +#include "R600OpenCLUtils.h" +#include "llvm/Constants.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/TypeBuilder.h" +// #include "llvm/CodeGen/Function.h" + +namespace AMDILAS { +enum AddressSpaces { + PRIVATE_ADDRESS = 0, // Address space for private memory. + GLOBAL_ADDRESS = 1, // Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, // Address space for constant memory. + LOCAL_ADDRESS = 3, // Address space for local memory. + REGION_ADDRESS = 4, // Address space for region memory. + ADDRESS_NONE = 5, // Address space for unknown memory. + PARAM_D_ADDRESS = 6, // Address space for direct addressable parameter memory (CONST0) + PARAM_I_ADDRESS = 7, // Address space for indirect addressable parameter memory (VTX1) + LAST_ADDRESS = 8 +}; +} + + +#include <map> +#include <set> + +using namespace llvm; +using namespace std; + +#define CONSTANT_CACHE_SIZE_DW 127 + +class R600KernelParameters : public llvm::FunctionPass +{ + const llvm::TargetData * TD; + LLVMContext* Context; + Module *mod; + + struct param + { + param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0), indirect(false), specialID(0) {} + + llvm::Value* val; + llvm::Value* ptr_val; + int offset_in_dw; + int size_in_dw; + + bool indirect; + + string specialType; + int specialID; + + int end() { return offset_in_dw + size_in_dw; } + /* The first 9 dwords are reserved for the grid sizes. 
*/
+        int get_rat_offset() { return 9 + offset_in_dw; }
+    };
+
+    /* One entry per kernel argument, appended by AddParam(). */
+    std::vector<param> params;
+
+    int getLastSpecialID(const string& TypeName);
+
+    int getListSize();
+    void AddParam(llvm::Argument* arg);
+    int calculateArgumentSize(llvm::Argument* arg);
+    void RunAna(llvm::Function* fun);
+    void Replace(llvm::Function* fun);
+    bool isIndirect(Value* val, set<Value*>& visited);
+    void Propagate(llvm::Function* fun);
+    void Propagate(llvm::Value* v, const llvm::Twine& name, bool indirect = false);
+    Value* ConstantRead(Function* fun, param& p);
+    Value* handleSpecial(Function* fun, param& p);
+    bool isSpecialType(Type*);
+    string getSpecialTypeName(Type*);
+public:
+    static char ID;
+    /* Every pointer member is initialised explicitly so that a pass object
+     * created without TargetData (or used before doInitialization() has run)
+     * holds NULL rather than an indeterminate pointer. */
+    R600KernelParameters() : FunctionPass(ID), TD(NULL), Context(NULL), mod(NULL) {}
+    R600KernelParameters(const llvm::TargetData* TD) : FunctionPass(ID), TD(TD), Context(NULL), mod(NULL) {}
+    bool runOnFunction (llvm::Function &F);
+    void getAnalysisUsage(AnalysisUsage &AU) const;
+    const char *getPassName() const;
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+};
+
+char R600KernelParameters::ID = 0;
+
+static RegisterPass<R600KernelParameters> X("kerparam", "OpenCL Kernel Parameter conversion", false, false);
+
+/* Returns the specialID of the last recorded parameter whose specialType
+ * equals TypeName, or -1 when there is none. */
+int R600KernelParameters::getLastSpecialID(const string& TypeName)
+{
+    int lastID = -1;
+
+    for (vector<param>::iterator i = params.begin(); i != params.end(); ++i)
+    {
+        if (i->specialType == TypeName)
+        {
+            lastID = i->specialID;
+        }
+    }
+
+    return lastID;
+}
+
+/* Returns the size in dwords of the parameter list collected so far, i.e. the
+ * end offset of the last recorded parameter (0 for an empty list). */
+int R600KernelParameters::getListSize()
+{
+    if (params.empty())
+    {
+        return 0;
+    }
+
+    return params.back().end();
+}
+
+bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
+{
+    if (isa<LoadInst>(val))
+    {
+        return false;
+    }
+
+    if (isa<IntegerType>(val->getType()))
+    {
+        assert(0 and "Internal error");
+        return false;
+    }
+
+    /* Each value is examined only once. */
+    if (visited.count(val))
+    {
+        return false;
+    }
+
+    visited.insert(val);
+
+    if (isa<GetElementPtrInst>(val))
+    {
+        GetElementPtrInst* GEP = 
dyn_cast<GetElementPtrInst>(val); + GetElementPtrInst::op_iterator i = GEP->op_begin(); + + for (i++; i != GEP->op_end(); i++) + { + if (!isa<Constant>(*i)) + { + return true; + } + } + } + + for (Value::use_iterator i = val->use_begin(); i != val->use_end(); i++) + { + Value* v2 = dyn_cast<Value>(*i); + + if (v2) + { + if (isIndirect(v2, visited)) + { + return true; + } + } + } + + return false; +} + +void R600KernelParameters::AddParam(llvm::Argument* arg) +{ + param p; + + p.val = dyn_cast<Value>(arg); + p.offset_in_dw = getListSize(); + p.size_in_dw = calculateArgumentSize(arg); + + if (isa<PointerType>(arg->getType()) and arg->hasByValAttr()) + { + set<Value*> visited; + p.indirect = isIndirect(p.val, visited); + } + + params.push_back(p); +} + +int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg) +{ + Type* t = arg->getType(); + + if (arg->hasByValAttr() and dyn_cast<PointerType>(t)) + { + t = dyn_cast<PointerType>(t)->getElementType(); + } + + int store_size_in_dw = (TD->getTypeStoreSize(t) + 3)/4; + + assert(store_size_in_dw); + + return store_size_in_dw; +} + + +void R600KernelParameters::RunAna(llvm::Function* fun) +{ + assert(isOpenCLKernel(fun)); + + for (Function::arg_iterator i = fun->arg_begin(); i != fun->arg_end(); i++) + { + AddParam(i); + } + +} + +void R600KernelParameters::Replace(llvm::Function* fun) +{ + for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++) + { + Value *new_val; + + if (isSpecialType(i->val->getType())) + { + new_val = handleSpecial(fun, *i); + } + else + { + new_val = ConstantRead(fun, *i); + } + if (new_val) + { + i->val->replaceAllUsesWith(new_val); + } + } +} + +void R600KernelParameters::Propagate(llvm::Function* fun) +{ + for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++) + { + if (i->ptr_val) + { + Propagate(i->ptr_val, i->val->getName(), i->indirect); + } + } +} + +void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect) +{ + 
LoadInst* load = dyn_cast<LoadInst>(v); + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(v); + + unsigned addrspace; + + if (indirect) + { + addrspace = AMDILAS::PARAM_I_ADDRESS; + } + else + { + addrspace = AMDILAS::PARAM_D_ADDRESS; + } + + if (GEP and GEP->getType()->getAddressSpace() != addrspace) + { + Value* op = GEP->getPointerOperand(); + + if (dyn_cast<PointerType>(op->getType())->getAddressSpace() != addrspace) + { + op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(op->getType())->getElementType(), addrspace), name, dyn_cast<Instruction>(v)); + } + + vector<Value*> params(GEP->idx_begin(), GEP->idx_end()); + + GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name, dyn_cast<Instruction>(v)); + GEP2->setIsInBounds(GEP->isInBounds()); + v = dyn_cast<Value>(GEP2); + GEP->replaceAllUsesWith(GEP2); + GEP->eraseFromParent(); + load = NULL; + } + + if (load) + { + if (load->getPointerAddressSpace() != addrspace) ///normally at this point we have the right address space + { + Value *orig_ptr = load->getPointerOperand(); + PointerType *orig_ptr_type = dyn_cast<PointerType>(orig_ptr->getType()); + + Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(), addrspace); + + Value* new_ptr = orig_ptr; + + if (orig_ptr->getType() != new_ptr_type) + { + new_ptr = new BitCastInst(orig_ptr, new_ptr_type, "prop_cast", load); + } + + Value* new_load = new LoadInst(new_ptr, name, load); + load->replaceAllUsesWith(new_load); + load->eraseFromParent(); + } + + return; + } + + vector<User*> users(v->use_begin(), v->use_end()); + + for (int i = 0; i < int(users.size()); i++) + { + Value* v2 = dyn_cast<Value>(users[i]); + + if (v2) + { + Propagate(v2, name, indirect); + } + } +} + +Value* R600KernelParameters::ConstantRead(Function* fun, param& p) +{ + assert(fun->front().begin() != fun->front().end()); + + Instruction *first_inst = fun->front().begin(); + IRBuilder <> builder (first_inst); +/* First 3 dwords are reserved for the 
dimmension info */ + + if (!p.val->hasNUsesOrMore(1)) + { + return NULL; + } + unsigned addrspace; + + if (p.indirect) + { + addrspace = AMDILAS::PARAM_I_ADDRESS; + } + else + { + addrspace = AMDILAS::PARAM_D_ADDRESS; + } + + Argument *arg = dyn_cast<Argument>(p.val); + Type * argType = p.val->getType(); + PointerType * argPtrType = dyn_cast<PointerType>(p.val->getType()); + + if (argPtrType and arg->hasByValAttr()) + { + Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(Type::getInt32Ty(*Context), addrspace)); + Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr, ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName(), first_inst); + param_ptr = new BitCastInst(param_ptr, PointerType::get(argPtrType->getElementType(), addrspace), arg->getName(), first_inst); + p.ptr_val = param_ptr; + return param_ptr; + } + else + { + Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(argType, addrspace)); + + Value* param_ptr = builder.CreateGEP(param_addr_space_ptr, + ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName()); + + Value* param_value = builder.CreateLoad(param_ptr, arg->getName()); + + return param_value; + } +} + +Value* R600KernelParameters::handleSpecial(Function* fun, param& p) +{ + string name = getSpecialTypeName(p.val->getType()); + int ID; + + assert(!name.empty()); + + if (name == "image2d_t" or name == "image3d_t") + { + int lastID = max(getLastSpecialID("image2d_t"), getLastSpecialID("image3d_t")); + + if (lastID == -1) + { + ID = 2; ///ID0 and ID1 are used internally by the driver + } + else + { + ID = lastID + 1; + } + } + else if (name == "sampler_t") + { + int lastID = getLastSpecialID("sampler_t"); + + if (lastID == -1) + { + ID = 0; + } + else + { + ID = lastID + 1; + } + } + else + { + ///TODO: give some error message + return NULL; + } + + p.specialType = name; + p.specialID = ID; + + Instruction *first_inst = fun->front().begin(); + + 
return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context), p.specialID), p.val->getType(), "resourceID", first_inst); +} + + +bool R600KernelParameters::isSpecialType(Type* t) +{ + return !getSpecialTypeName(t).empty(); +} + +string R600KernelParameters::getSpecialTypeName(Type* t) +{ + PointerType *pt = dyn_cast<PointerType>(t); + StructType *st = NULL; + + if (pt) + { + st = dyn_cast<StructType>(pt->getElementType()); + } + + if (st) + { + string prefix = "struct.opencl_builtin_type_"; + + string name = st->getName().str(); + + if (name.substr(0, prefix.length()) == prefix) + { + return name.substr(prefix.length(), name.length()); + } + } + + return ""; +} + + +bool R600KernelParameters::runOnFunction (Function &F) +{ + if (!isOpenCLKernel(&F)) + { + return false; + } + +// F.dump(); + + RunAna(&F); + Replace(&F); + Propagate(&F); + + mod->dump(); + return false; +} + +void R600KernelParameters::getAnalysisUsage(AnalysisUsage &AU) const +{ +// AU.addRequired<FunctionAnalysis>(); + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} + +const char *R600KernelParameters::getPassName() const +{ + return "OpenCL Kernel parameter conversion to memory"; +} + +bool R600KernelParameters::doInitialization(Module &M) +{ + Context = &M.getContext(); + mod = &M; + + return false; +} + +bool R600KernelParameters::doFinalization(Module &M) +{ + return false; +} + +llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD) +{ + FunctionPass *p = new R600KernelParameters(TD); + + return p; +} + + diff --git a/src/gallium/drivers/radeon/R600KernelParameters.h b/src/gallium/drivers/radeon/R600KernelParameters.h new file mode 100644 index 00000000000..904a469a5f0 --- /dev/null +++ b/src/gallium/drivers/radeon/R600KernelParameters.h @@ -0,0 +1,28 @@ +//===-- R600KernelParameters.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#ifndef KERNELPARAMETERS_H +#define KERNELPARAMETERS_H + +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/Function.h" +#include "llvm/Pass.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Value.h" + +#include <vector> + +llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD); + + +#endif diff --git a/src/gallium/drivers/radeon/R600LowerInstructions.cpp b/src/gallium/drivers/radeon/R600LowerInstructions.cpp new file mode 100644 index 00000000000..b9f9c7cdb4a --- /dev/null +++ b/src/gallium/drivers/radeon/R600LowerInstructions.cpp @@ -0,0 +1,546 @@ +//===-- R600LowerInstructions.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// Machine pass that rewrites AMDIL instructions into their R600 equivalents.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUUtil.h"
+#include "AMDIL.h"
+#include "AMDILMachineFunctionInfo.h"
+#include "AMDILRegisterInfo.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#include <stdio.h>
+
+using namespace llvm;
+
+namespace {
+  class R600LowerInstructionsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+    AMDILMachineFunctionInfo * MFI;
+    const R600InstrInfo * TII;
+    MachineRegisterInfo * MRI;
+
+    void lowerFLT(MachineInstr &MI); /* NOTE(review): never defined here; FLT is lowered inline */
+
+    void calcAddress(const MachineOperand &ptrOp,
+                     const MachineOperand &indexOp,
+                     unsigned indexReg,
+                     MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator I) const;
+
+    void divMod(MachineInstr &MI,
+                MachineBasicBlock &MBB,
+                MachineBasicBlock::iterator I,
+                bool div = true) const;
+
+  public:
+    R600LowerInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm),
+      TII(static_cast<const R600InstrInfo*>(tm.getInstrInfo())),
+      MRI(NULL)
+      { }
+
+    const char *getPassName() const { return "R600 Lower Instructions"; }
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char R600LowerInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createR600LowerInstructionsPass(TargetMachine &tm) {
+  return new R600LowerInstructionsPass(tm);
+}
+
+bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+  MFI = MF.getInfo<AMDILMachineFunctionInfo>();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = I;
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I); /* taken only while I is valid; MI may be erased below */
+      MachineInstr &MI = *I;
+      switch(MI.getOpcode()) {
+      case AMDIL::FLT: /* NOTE(review): emits FGE with swapped sources - confirm strictness */
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FGE))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::ABS_i32:
+        {
+          unsigned setgt = MRI->createVirtualRegister(
+                           &AMDIL::R600_TReg32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT),
+                  setgt)
+                  .addOperand(MI.getOperand(1))
+                  .addReg(AMDIL::ZERO);
+
+          unsigned add_int = MRI->createVirtualRegister(
+                             &AMDIL::R600_TReg32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT),
+                  add_int)
+                  .addReg(setgt)
+                  .addOperand(MI.getOperand(1));
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::XOR_INT))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(setgt)
+                  .addReg(add_int);
+
+          break;
+        }
+
+      /* XXX: We could propagate the ABS flag to all of the uses of Operand0 and
+       * remove the ABS instruction.*/
+      case AMDIL::FABS_f32:
+      case AMDIL::ABS_f32:
+        MI.getOperand(1).addTargetFlag(MO_FLAG_ABS);
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::MOVE_f32))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::BINARY_OR_f32:
+        {
+          unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FTOI), tmp0)
+                  .addOperand(MI.getOperand(1));
+          unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::FTOI), tmp1)
+                  .addOperand(MI.getOperand(2));
+          unsigned tmp2 = MRI->createVirtualRegister(&AMDIL::GPRI32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::BINARY_OR_i32), tmp2)
+                  .addReg(tmp0)
+                  .addReg(tmp1);
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::ITOF), MI.getOperand(0).getReg())
+                  .addReg(tmp2);
+          break;
+        }
+      case AMDIL::CMOVLOG_f32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(MI.getOpcode()))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1))
+                .addOperand(MI.getOperand(3))
+                .addOperand(MI.getOperand(2));
+        break;
+
+      case AMDIL::CMOVLOG_i32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(1))
+                .addOperand(MI.getOperand(3))
+                .addOperand(MI.getOperand(2));
+        break;
+
+      case AMDIL::CLAMP_f32:
+        {
+          MachineOperand lowOp = MI.getOperand(2);
+          MachineOperand highOp = MI.getOperand(3);
+          if (lowOp.isReg() && highOp.isReg()
+              && lowOp.getReg() == AMDIL::ZERO && highOp.getReg() == AMDIL::ONE) {
+            MI.getOperand(0).addTargetFlag(MO_FLAG_CLAMP);
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV))
+                    .addOperand(MI.getOperand(0))
+                    .addOperand(MI.getOperand(1));
+          } else {
+            /* XXX: Handle other cases */
+            abort();
+          }
+          break;
+        }
+
+      case AMDIL::UDIV_i32:
+        divMod(MI, MBB, I);
+        break;
+
+      /* XXX: Figure out the semantics of DIV_INF_f32 and make sure this is OK */
+/*      case AMDIL::DIV_INF_f32:
+        {
+          unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::GPRF32RegClass);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TM.getInstrInfo()->get(AMDIL::RECIP_CLAMPED), tmp0)
+                  .addOperand(MI.getOperand(2));
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TM.getInstrInfo()->get(AMDIL::MUL_IEEE_f32))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(tmp0)
+                  .addOperand(MI.getOperand(1));
+          break;
+        }
+*/        /* XXX: This is an optimization */
+
+      case AMDIL::GLOBALLOAD_f32:
+      case AMDIL::GLOBALLOAD_i32:
+        {
+          MachineOperand &ptrOperand = MI.getOperand(1);
+          MachineOperand &indexOperand = MI.getOperand(2);
+          unsigned indexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+
+          /* Calculate the address within the VTX buffer */
+          calcAddress(ptrOperand, indexOperand, indexReg, MBB, I);
+
+          /* Make sure the VTX_READ_eg writes to the X chan */
+          MRI->setRegClass(MI.getOperand(0).getReg(),
+                           &AMDIL::R600_TReg32_XRegClass);
+
+          /* Add the VTX_READ_eg instruction */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(AMDIL::VTX_READ_eg))
+                  .addOperand(MI.getOperand(0))
+                  .addReg(indexReg)
+                  .addImm(1);
+          break;
+        }
+
+      case AMDIL::GLOBALSTORE_i32:
+      case AMDIL::GLOBALSTORE_f32:
+        {
+          MachineOperand &ptrOperand = MI.getOperand(1);
+          MachineOperand &indexOperand = MI.getOperand(2);
+          unsigned rwReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+          unsigned byteIndexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+          unsigned shiftReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+          unsigned indexReg =
+                   MRI->createVirtualRegister(&AMDIL::R600_TReg32_XRegClass);
+
+          /* Move the store value to the correct register class */
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::COPY), rwReg)
+                  .addOperand(MI.getOperand(0));
+
+          /* Calculate the address in the RAT */
+          calcAddress(ptrOperand, indexOperand, byteIndexReg, MBB, I);
+
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV), shiftReg)
+                  .addReg(AMDIL::ALU_LITERAL_X)
+                  .addImm(2);
+
+          /* XXX: Check GPU family */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(AMDIL::LSHR_eg), indexReg)
+                  .addReg(byteIndexReg)
+                  .addReg(shiftReg);
+
+          /* XXX: Check GPU Family */
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(AMDIL::RAT_WRITE_CACHELESS_eg))
+                  .addReg(rwReg)
+                  .addReg(indexReg)
+                  .addImm(0);
+          break;
+        }
+      case AMDIL::ILT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGT_INT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+      case AMDIL::LOADCONST_f32:
+      case AMDIL::LOADCONST_i32:
+        {
+          bool canInline = false;
+          unsigned inlineReg;
+          MachineOperand & dstOp = MI.getOperand(0);
+          MachineOperand & immOp = MI.getOperand(1);
+          if (immOp.isFPImm()) {
+            const ConstantFP * cfp = immOp.getFPImm();
+            if (cfp->isZero()) {
+              canInline = true;
+              inlineReg = AMDIL::ZERO;
+            } else if (cfp->isExactlyValue(1.0f)) {
+              canInline = true;
+              inlineReg = AMDIL::ONE;
+            } else if (cfp->isExactlyValue(0.5f)) {
+              canInline = true;
+              inlineReg = AMDIL::HALF;
+            }
+          }
+
+          if (canInline) {
+            MachineOperand * use = dstOp.getNextOperandForReg();
+            /* The lowering operation for CLAMP needs to have the immediates
+             * as operands, so we must propagate them. */
+            while (use) {
+              MachineOperand * next = use->getNextOperandForReg();
+              if (use->getParent()->getOpcode() == AMDIL::CLAMP_f32) {
+                use->setReg(inlineReg);
+              }
+              use = next;
+            }
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::COPY))
+                    .addOperand(dstOp)
+                    .addReg(inlineReg);
+          } else {
+            BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::MOV))
+                    .addOperand(dstOp)
+                    .addReg(AMDIL::ALU_LITERAL_X)
+                    .addOperand(immOp);
+          }
+          break;
+        }
+
+      case AMDIL::MASK_WRITE:
+        {
+          unsigned maskedRegister = MI.getOperand(0).getReg();
+          assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+          MachineInstr * defInstr = MRI->getVRegDef(maskedRegister);
+          MachineOperand * def = defInstr->findRegisterDefOperand(maskedRegister);
+          def->addTargetFlag(MO_FLAG_MASK);
+          break;
+        }
+
+      case AMDIL::VEXTRACT_v4f32:
+        MI.getOperand(2).setImm(MI.getOperand(2).getImm() - 1);
+        continue;
+
+      case AMDIL::NEGATE_i32:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT))
+                .addOperand(MI.getOperand(0))
+                .addReg(AMDIL::ZERO)
+                .addOperand(MI.getOperand(1));
+        break;
+
+      case AMDIL::NEG_f32:
+        {
+          MI.getOperand(1).addTargetFlag(MO_FLAG_NEG);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(TII->getISAOpcode(AMDIL::MOV)))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+          break;
+        }
+
+      case AMDIL::SUB_f32:
+        {
+          MI.getOperand(2).addTargetFlag(MO_FLAG_NEG);
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TII->get(TII->getISAOpcode(AMDIL::ADD_f32)))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1))
+                  .addOperand(MI.getOperand(2));
+          break;
+        }
+
+      case AMDIL::VINSERT_v4f32:
+        {
+
+          int64_t swz = MI.getOperand(4).getImm();
+          int64_t chan;
+          switch (swz) {
+          case (1 << 0):
+            chan = 0;
+            break;
+          case (1 << 8):
+            chan = 1;
+            break;
+          case (1 << 16):
+            chan = 2;
+            break;
+          case (1 << 24):
+            chan = 3;
+            break;
+          default:
+            chan = 0;
+            fprintf(stderr, "swizzle: %lld\n", (long long)swz);
+            abort();
+            break;
+          }
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TM.getInstrInfo()->get(AMDIL::SET_CHAN))
+                  .addOperand(MI.getOperand(1))
+                  .addOperand(MI.getOperand(2))
+                  .addImm(chan);
+
+          BuildMI(MBB, I, MBB.findDebugLoc(I),
+                  TM.getInstrInfo()->get(AMDIL::COPY))
+                  .addOperand(MI.getOperand(0))
+                  .addOperand(MI.getOperand(1));
+          break;
+        }
+
+      default:
+        continue;
+      }
+      MI.eraseFromParent();
+    }
+  }
+  return false;
+}
+
+void R600LowerInstructionsPass::calcAddress(const MachineOperand &ptrOp,
+                                            const MachineOperand &indexOp,
+                                            unsigned indexReg,
+                                            MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator I) const
+{
+  /* Optimize the case where the indexOperand is 0 */
+  if (indexOp.isImm() && indexOp.getImm() == 0) {
+    assert(ptrOp.isReg());
+    BuildMI(MBB, I, MBB.findDebugLoc(I),
+            TII->get(AMDIL::COPY), indexReg)
+            .addOperand(ptrOp);
+  } else {
+    BuildMI(MBB, I, MBB.findDebugLoc(I),
+            TII->get(AMDIL::ADD_INT), indexReg)
+            .addOperand(indexOp)
+            .addOperand(ptrOp);
+  }
+}
+
+/* Mostly copied from tgsi_divmod() in r600_shader.c */
+void R600LowerInstructionsPass::divMod(MachineInstr &MI,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       bool div) const
+{
+  unsigned dst = MI.getOperand(0).getReg();
+  MachineOperand &numerator = MI.getOperand(1);
+  MachineOperand &denominator = MI.getOperand(2);
+  /* rcp = RECIP(denominator) = 2^32 / denominator + e
+   * e is rounding error */
+  unsigned rcp = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getRECIP_UINT()), rcp)
+          .addOperand(denominator);
+
+  /* rcp_lo = lo(rcp * denominator) */
+  unsigned rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()), rcp_lo)
+          .addReg(rcp)
+          .addOperand(denominator);
+
+  /* rcp_hi = HI (rcp * denominator) */
+  unsigned rcp_hi = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), rcp_hi)
+          .addReg(rcp)
+          .addOperand(denominator);
+
+  unsigned neg_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), neg_rcp_lo)
+          .addReg(AMDIL::ZERO)
+          .addReg(rcp_lo);
+
+  unsigned abs_rcp_lo = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), abs_rcp_lo)
+          .addReg(rcp_hi)
+          .addReg(neg_rcp_lo)
+          .addReg(rcp_lo);
+
+  unsigned e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), e)
+          .addReg(abs_rcp_lo)
+          .addReg(rcp);
+
+  unsigned rcp_plus_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), rcp_plus_e)
+          .addReg(rcp)
+          .addReg(e);
+
+  unsigned rcp_sub_e = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), rcp_sub_e)
+          .addReg(rcp)
+          .addReg(e);
+
+  /* tmp0 = rcp_hi == 0 ? rcp_plus_e : rcp_sub_e */
+  unsigned tmp0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), tmp0)
+          .addReg(rcp_hi)
+          .addReg(rcp_plus_e)
+          .addReg(rcp_sub_e);
+
+  unsigned q = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULHI_UINT()), q)
+          .addReg(tmp0)
+          .addOperand(numerator);
+
+  /* num_sub_r = q * denominator */
+  unsigned num_sub_r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(TII->getMULLO_UINT()),
+          num_sub_r)
+          .addReg(q)
+          .addOperand(denominator);
+
+  unsigned r = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), r)
+          .addOperand(numerator)
+          .addReg(num_sub_r);
+
+  unsigned r_ge_den = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_den)
+          .addReg(r)
+          .addOperand(denominator);
+
+  unsigned r_ge_zero = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT), r_ge_zero)
+          .addOperand(numerator)
+          .addReg(num_sub_r);
+
+  unsigned tmp1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::AND_INT), tmp1)
+          .addReg(r_ge_den)
+          .addReg(r_ge_zero);
+
+  unsigned val0 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  unsigned val1 = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  unsigned result = MRI->createVirtualRegister(&AMDIL::R600_TReg32RegClass);
+  if (div) {
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val0)
+            .addReg(q)
+            .addReg(AMDIL::ONE_INT);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val1)
+            .addReg(q)
+            .addReg(AMDIL::ONE_INT);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
+            .addReg(tmp1)
+            .addReg(q)
+            .addReg(val0);
+  } else {
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SUB_INT), val0)
+            .addReg(r)
+            .addOperand(denominator);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::ADD_INT), val1)
+            .addReg(r)
+            .addOperand(denominator);
+
+    BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), result)
+            .addReg(tmp1)
+            .addReg(r)
+            .addReg(val0);
+  }
+
+  /* XXX: Do we need to set to MAX_INT if denominator is 0? */
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::CNDE_INT), dst)
+          .addReg(r_ge_zero)
+          .addReg(val1)
+          .addReg(result);
+}
diff --git a/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp b/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
new file mode 100644
index 00000000000..394ee7006ce
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
@@ -0,0 +1,143 @@
+//===-- R600LowerShaderInstructions.cpp - Lower shader I/O pseudo instrs --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Lowers LOAD_INPUT / STORE_OUTPUT and drops RESERVE_REG / EXPORT_REG for R600.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPULowerShaderInstructions.h"
+#include "AMDIL.h"
+#include "AMDILInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class R600LowerShaderInstructionsPass : public MachineFunctionPass,
+        public AMDGPULowerShaderInstructionsPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+    void lowerEXPORT_REG_FAKE(MachineInstr &MI, MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator I);
+    void lowerLOAD_INPUT(MachineInstr & MI);
+    bool lowerSTORE_OUTPUT(MachineInstr & MI, MachineBasicBlock &MBB,
+        MachineBasicBlock::iterator I);
+
+  public:
+    R600LowerShaderInstructionsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+      bool runOnMachineFunction(MachineFunction &MF);
+
+      const char *getPassName() const { return "R600 Lower Shader Instructions"; }
+    };
+} /* End anonymous namespace */
+
+char R600LowerShaderInstructionsPass::ID = 0;
+
+FunctionPass *llvm::createR600LowerShaderInstructionsPass(TargetMachine &tm) {
+    return new R600LowerShaderInstructionsPass(tm);
+}
+
+#define INSTR_CASE_FLOAT_V(inst) \
+  case AMDIL:: inst##_v4f32: \
+
+#define INSTR_CASE_FLOAT_S(inst) \
+  case AMDIL:: inst##_f32:
+
+#define INSTR_CASE_FLOAT(inst) \
+  INSTR_CASE_FLOAT_V(inst) \
+  INSTR_CASE_FLOAT_S(inst)
+bool R600LowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
+{
+  MRI = &MF.getRegInfo();
+
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
+      MachineInstr &MI = *I;
+      bool deleteInstr = false;
+      switch (MI.getOpcode()) {
+
+      default: break;
+
+      case AMDIL::RESERVE_REG:
+      case AMDIL::EXPORT_REG:
+        deleteInstr = true;
+        break;
+
+      case AMDIL::LOAD_INPUT:
+        lowerLOAD_INPUT(MI);
+        deleteInstr = true;
+        break;
+
+      case AMDIL::STORE_OUTPUT:
+        deleteInstr = lowerSTORE_OUTPUT(MI, MBB, I);
+        break;
+
+      }
+
+      ++I; /* advance before the erase below so I never refers to a deleted MI */
+
+      if (deleteInstr) {
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  return false;
+}
+
+/* The goal of this function is to replace the virtual destination register of
+ * a LOAD_INPUT instruction with the physical register selected by its index.
+ *
+ * XXX: I don't think this is the right way to assign physical registers,
+ * but I'm not sure of another way to do this.
+ */
+void R600LowerShaderInstructionsPass::lowerLOAD_INPUT(MachineInstr &MI)
+{
+  MachineOperand &dst = MI.getOperand(0);
+  MachineOperand &arg = MI.getOperand(1);
+  int64_t inputIndex = arg.getImm();
+  const TargetRegisterClass * inputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
+  unsigned newRegister = inputClass->getRegister(inputIndex);
+  unsigned dstReg = dst.getReg();
+
+  preloadRegister(MI.getParent()->getParent(), TM.getInstrInfo(), newRegister,
+                  dstReg);
+}
+
+bool R600LowerShaderInstructionsPass::lowerSTORE_OUTPUT(MachineInstr &MI,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
+{
+  MachineOperand &valueOp = MI.getOperand(1);
+  MachineOperand &indexOp = MI.getOperand(2);
+  unsigned valueReg = valueOp.getReg();
+  int64_t outputIndex = indexOp.getImm();
+  const TargetRegisterClass * outputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
+  unsigned newRegister = outputClass->getRegister(outputIndex);
+
+  BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::COPY),
+                  newRegister)
+                  .addReg(valueReg);
+
+  if (!MRI->isLiveOut(newRegister))
+    MRI->addLiveOut(newRegister);
+
+  return true; /* the caller uses this to erase the STORE_OUTPUT */
+
+}
diff --git a/src/gallium/drivers/radeon/R600OpenCLUtils.h b/src/gallium/drivers/radeon/R600OpenCLUtils.h
new file mode 100644
index
00000000000..91e41d63d0d
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600OpenCLUtils.h
@@ -0,0 +1,49 @@
+//===-- OpenCLUtils.h - OpenCL kernel detection helper --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests whether a function is listed in the module's "opencl.kernels" metadata.
+//
+//===----------------------------------------------------------------------===//
+#ifndef OPENCLUTILS_H
+#define OPENCLUTILS_H
+
+#include "llvm/Function.h"
+
+#include <llvm/Module.h>
+
+static inline bool isOpenCLKernel(const llvm::Function* fun)
+{
+  const llvm::Module *mod = fun->getParent(); /* read-only lookup: no const_cast */
+  llvm::NamedMDNode * md = mod->getNamedMetadata("opencl.kernels"); /* NULL if absent */
+
+  if (!md or !md->getNumOperands())
+  {
+    return false;
+  }
+
+  for (int i = 0; i < int(md->getNumOperands()); i++)
+  {
+    if (!md->getOperand(i) or !md->getOperand(i)->getOperand(0))
+    {
+      continue;
+    }
+
+    assert(md->getOperand(i)->getNumOperands() == 1);
+
+    if (md->getOperand(i)->getOperand(0)->getName() == fun->getName())
+    {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
+#endif
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.cpp b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
new file mode 100644
index 00000000000..96507b104cf
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
@@ -0,0 +1,102 @@
+//===-- R600RegisterInfo.cpp - R600 register information ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Reserved registers, ISA register classes and hardware register encoding.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600RegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+
+using namespace llvm;
+
+R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
+    const TargetInstrInfo &tii)
+: AMDGPURegisterInfo(tm, tii),
+  TM(tm),
+  TII(tii)
+  { }
+
+BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const
+{
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AMDIL::ZERO);
+  Reserved.set(AMDIL::HALF);
+  Reserved.set(AMDIL::ONE);
+  Reserved.set(AMDIL::ONE_INT);
+  Reserved.set(AMDIL::NEG_HALF);
+  Reserved.set(AMDIL::NEG_ONE);
+  Reserved.set(AMDIL::PV_X);
+  Reserved.set(AMDIL::ALU_LITERAL_X);
+
+  for (TargetRegisterClass::iterator I = AMDIL::R600_CReg32RegClass.begin(),
+                        E = AMDIL::R600_CReg32RegClass.end(); I != E; ++I) {
+    Reserved.set(*I);
+  }
+
+  for (MachineFunction::const_iterator BB = MF.begin(),
+                                 BB_E = MF.end(); BB != BB_E; ++BB) {
+    const MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+                                                                  I != E; ++I) {
+      const MachineInstr &MI = *I;
+      if (MI.getOpcode() == AMDIL::RESERVE_REG) { /* only physical operands are reserved */
+        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) {
+          Reserved.set(MI.getOperand(0).getReg());
+        }
+      }
+    }
+  }
+  return Reserved;
+}
+
+const TargetRegisterClass *
+R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const
+{
+  switch (rc->getID()) {
+  case AMDIL::GPRV4F32RegClassID:
+  case AMDIL::GPRV4I32RegClassID:
+    return &AMDIL::R600_Reg128RegClass;
+  case AMDIL::GPRF32RegClassID:
+  case AMDIL::GPRI32RegClassID:
+    return &AMDIL::R600_Reg32RegClass;
+  default: return rc; /* classes without a mapping are returned unchanged */
+  }
+}
+
+unsigned R600RegisterInfo::getHWRegIndex(unsigned reg) const
+{
+  switch(reg) {
+  case AMDIL::ZERO: return 248;
+  case AMDIL::ONE:
+  case AMDIL::NEG_ONE: return 249;
+  case AMDIL::ONE_INT: return 250;
+  case AMDIL::HALF:
+  case AMDIL::NEG_HALF: return 252;
+  case AMDIL::ALU_LITERAL_X: return 253;
+  default: return getHWRegIndexGen(reg);
+  }
+}
+
+unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const
+{
+  switch(reg) {
+  case AMDIL::ZERO:
+  case AMDIL::ONE:
+  case AMDIL::ONE_INT:
+  case AMDIL::NEG_ONE:
+  case AMDIL::HALF:
+  case AMDIL::NEG_HALF:
+  case AMDIL::ALU_LITERAL_X:
+    return 0;
+  default: return getHWRegChanGen(reg);
+  }
+}
+
+#include "R600HwRegInfo.include"
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.h b/src/gallium/drivers/radeon/R600RegisterInfo.h
new file mode 100644
index 00000000000..95a44f971a0
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600RegisterInfo.h
@@ -0,0 +1,44 @@
+//===-- R600RegisterInfo.h - R600 register information --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares R600RegisterInfo, the R600 specialization of AMDGPURegisterInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600REGISTERINFO_H_
+#define R600REGISTERINFO_H_
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDILRegisterInfo.h"
+
+namespace llvm {
+
+  class R600TargetMachine;
+  class TargetInstrInfo;
+
+  struct R600RegisterInfo : public AMDGPURegisterInfo
+  {
+    AMDGPUTargetMachine &TM;
+    const TargetInstrInfo &TII;
+
+    R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+    virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+    virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass * rc) const;
+    unsigned getHWRegIndex(unsigned reg) const;
+    unsigned getHWRegChan(unsigned reg) const;
+private:
+    unsigned getHWRegChanGen(unsigned reg) const;
+    unsigned getHWRegIndexGen(unsigned reg) const;
+  };
+} // End namespace llvm
+
+#endif // R600REGISTERINFO_H_
diff --git a/src/gallium/drivers/radeon/R600Schedule.td b/src/gallium/drivers/radeon/R600Schedule.td
new file mode 100644
index 00000000000..c6b1ca61bb5
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600Schedule.td
@@ -0,0 +1,34 @@
+//===-- R600Schedule.td - R600 scheduling itineraries -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Functional units, itinerary classes and the R600_EG_Itin itineraries.
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+
+def R600_EG_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>
+  ]
+>;
diff --git a/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
new file mode 100644
index 00000000000..b0bdf701a74
--- /dev/null
+++ b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
@@ -0,0 +1,110 @@
+//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replaces the interpolation input registers that are used with live-in VGPRs.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDIL.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SIAssignInterpRegsPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+
+  public:
+    SIAssignInterpRegsPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    const char *getPassName() const { return "SI Assign intrpolation registers"; }
+  };
+} // End anonymous namespace
+
+char SIAssignInterpRegsPass::ID = 0;
+
+#define INTERP_VALUES 16
+
+struct interp_info {
+  bool enabled;        /* set when any register of the group has a use */
+  unsigned regs[3];    /* registers belonging to this interpolation value */
+  unsigned reg_count;  /* number of valid entries in regs */
+};
+
+
+FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
+  return new SIAssignInterpRegsPass(tm);
+}
+
+bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF)
+{
+
+  struct interp_info InterpUse[INTERP_VALUES] = {
+    {false, {AMDIL::PERSP_SAMPLE_I, AMDIL::PERSP_SAMPLE_J}, 2},
+    {false, {AMDIL::PERSP_CENTER_I, AMDIL::PERSP_CENTER_J}, 2},
+    {false, {AMDIL::PERSP_CENTROID_I, AMDIL::PERSP_CENTROID_J}, 2},
+    {false, {AMDIL::PERSP_I_W, AMDIL::PERSP_J_W, AMDIL::PERSP_1_W}, 3},
+    {false, {AMDIL::LINEAR_SAMPLE_I, AMDIL::LINEAR_SAMPLE_J}, 2},
+    {false, {AMDIL::LINEAR_CENTER_I, AMDIL::LINEAR_CENTER_J}, 2},
+    {false, {AMDIL::LINEAR_CENTROID_I, AMDIL::LINEAR_CENTROID_J}, 2},
+    {false, {AMDIL::LINE_STIPPLE_TEX_COORD}, 1},
+    {false, {AMDIL::POS_X_FLOAT}, 1},
+    {false, {AMDIL::POS_Y_FLOAT}, 1},
+    {false, {AMDIL::POS_Z_FLOAT}, 1},
+    {false, {AMDIL::POS_W_FLOAT}, 1},
+    {false, {AMDIL::FRONT_FACE}, 1},
+    {false, {AMDIL::ANCILLARY}, 1},
+    {false, {AMDIL::SAMPLE_COVERAGE}, 1},
+    {false, {AMDIL::POS_FIXED_PT}, 1}
+  };
+
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  /* First pass, mark the interpolation values that are used. */
+  for (unsigned interp_idx = 0; interp_idx < INTERP_VALUES; interp_idx++) {
+    for (unsigned reg_idx = 0; reg_idx < InterpUse[interp_idx].reg_count;
+                                                               reg_idx++) {
+      InterpUse[interp_idx].enabled |= /* any used register enables the group */
+                            !MRI.use_empty(InterpUse[interp_idx].regs[reg_idx]);
+    }
+  }
+
+  unsigned used_vgprs = 0;
+
+  /* Second pass, replace with VGPRs. */
+  for (unsigned interp_idx = 0; interp_idx < INTERP_VALUES; interp_idx++) {
+    if (!InterpUse[interp_idx].enabled) {
+      continue;
+    }
+    MFI->spi_ps_input_addr |= (1 << interp_idx);
+
+    for (unsigned reg_idx = 0; reg_idx < InterpUse[interp_idx].reg_count;
+                                                  reg_idx++, used_vgprs++) {
+      unsigned new_reg = AMDIL::VReg_32RegisterClass->getRegister(used_vgprs);
+      unsigned virt_reg = MRI.createVirtualRegister(AMDIL::VReg_32RegisterClass);
+      MRI.replaceRegWith(InterpUse[interp_idx].regs[reg_idx], virt_reg);
+      AMDGPU::utilAddLiveIn(&MF, MRI, TM.getInstrInfo(), new_reg, virt_reg);
+    }
+  }
+
+  return false;
+}
diff --git a/src/gallium/drivers/radeon/SICodeEmitter.cpp b/src/gallium/drivers/radeon/SICodeEmitter.cpp
new file mode 100644
index 00000000000..0553f0e7d42
--- /dev/null
+++ b/src/gallium/drivers/radeon/SICodeEmitter.cpp
@@ -0,0 +1,274 @@
+//===-- SICodeEmitter.cpp - SI machine code emitter -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Writes register usage state and encoded SI instructions to the output stream.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPUUtil.h"
+#include "AMDILCodeEmitter.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <stdio.h>
+
+#define LITERAL_REG 255
+#define VGPR_BIT(src_idx) (1ULL << (8 * (src_idx))) /* 64-bit: shifted by up to 32 more */
+using namespace llvm;
+
+namespace {
+
+  class SICodeEmitter : public MachineFunctionPass, public AMDILCodeEmitter {
+
+  private:
+    static char ID;
+    formatted_raw_ostream &_OS;
+    const TargetMachine *TM;
+    void emitState(MachineFunction & MF);
+    void emitInstr(MachineInstr &MI);
+
+    void outputBytes(uint64_t value, unsigned bytes);
+    unsigned GPRAlign(const MachineInstr &MI, unsigned OpNo, unsigned shift)
+                                                                      const;
+
+  public:
+    SICodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
+        _OS(OS), TM(NULL) { }
+    const char *getPassName() const { return "SI Code Emitter"; }
+    bool runOnMachineFunction(MachineFunction &MF);
+    virtual uint64_t getMachineOpValue(const MachineInstr &MI,
+                                       const MachineOperand &MO) const;
+    virtual unsigned GPR4AlignEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual unsigned GPR2AlignEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual uint64_t i32LiteralEncode(const MachineInstr &MI, unsigned OpNo)
+                                                                      const;
+    virtual uint64_t VOPPostEncode(const MachineInstr &MI,
+                                   uint64_t Value) const;
+  };
+}
+
+char SICodeEmitter::ID = 0;
+
+FunctionPass *llvm::createSICodeEmitterPass(formatted_raw_ostream &OS) {
+  return new SICodeEmitter(OS);
+}
+
+void SICodeEmitter::emitState(MachineFunction & MF)
+{
+  unsigned maxSGPR = 0;
+  unsigned maxVGPR = 0;
+  bool VCCUsed = false;
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      unsigned numOperands = MI.getNumOperands();
+      for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
+        MachineOperand & MO = MI.getOperand(op_idx);
+        unsigned maxUsed;
+        unsigned width = 0;
+        bool isSGPR = false;
+        unsigned reg;
+        unsigned hwReg;
+        if (!MO.isReg()) {
+          continue;
+        }
+        reg = MO.getReg();
+        if (reg == AMDIL::VCC) {
+          VCCUsed = true;
+          continue;
+        }
+        if (AMDIL::SReg_32RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 1;
+        } else if (AMDIL::VReg_32RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 1;
+        } else if (AMDIL::SReg_64RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 2;
+        } else if (AMDIL::VReg_64RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 2;
+        } else if (AMDIL::SReg_128RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 4;
+        } else if (AMDIL::VReg_128RegClass.contains(reg)) {
+          isSGPR = false;
+          width = 4;
+        } else if (AMDIL::SReg_256RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 8;
+        } else {
+          assert(!"Unknown register class"); /* '!' outside the literal so it can fire */
+        }
+        hwReg = RI->getHWRegNum(reg);
+        maxUsed = ((hwReg + 1) * width) - 1;
+        if (isSGPR) {
+          maxSGPR = maxUsed > maxSGPR ? maxUsed : maxSGPR;
+        } else {
+          maxVGPR = maxUsed > maxVGPR ? maxUsed : maxVGPR;
+        }
+      }
+    }
+  }
+  if (VCCUsed) {
+    maxSGPR += 2;
+  }
+  outputBytes(maxSGPR + 1, 4);
+  outputBytes(maxVGPR + 1, 4);
+  outputBytes(MFI->spi_ps_input_addr, 4);
+}
+
+bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
+{
+  MF.dump();
+  TM = &MF.getTarget();
+  const AMDGPUInstrInfo * TII =
+                        static_cast<const AMDGPUInstrInfo*>(TM->getInstrInfo());
+
+  emitState(MF);
+
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (!TII->isRegPreload(MI) && MI.getOpcode() != AMDIL::KILL
+          && MI.getOpcode() != AMDIL::RETURN) {
+        emitInstr(MI);
+      }
+    }
+  }
+  return false;
+}
+
+void SICodeEmitter::emitInstr(MachineInstr &MI)
+{
+  const SIInstrInfo * SII = static_cast<const SIInstrInfo*>(TM->getInstrInfo());
+
+  uint64_t hwInst = getBinaryCodeForInstr(MI);
+
+  if ((hwInst & 0xffffffff) == 0xffffffff) {
+    fprintf(stderr, "Unsupported Instruction: \n");
+    MI.dump();
+    abort();
+  }
+
+//  hwInst |= SII->getBinaryCode(MI);
+
+  unsigned bytes = SII->getEncodingBytes(MI);
+  outputBytes(hwInst, bytes);
+}
+
+uint64_t SICodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                          const MachineOperand &MO) const
+{
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+
+  switch(MO.getType()) {
+  case MachineOperand::MO_Register:
+    return RI->getBinaryCode(MO.getReg());
+
+  case MachineOperand::MO_Immediate:
+    return MO.getImm();
+
+  case MachineOperand::MO_FPImmediate:
+    /* XXX: Not all instructions can use inline literals */
+    /* XXX: We should make sure this is a 32-bit constant */
+    return LITERAL_REG | (MO.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue() << 32);
+  default:
+    llvm_unreachable("Encoding of this operand type is not supported yet.");
+    break;
+  }
+}
+
+unsigned SICodeEmitter::GPRAlign(const MachineInstr &MI, unsigned OpNo,
+                                 unsigned shift) const
+{
+  const SIRegisterInfo * RI =
+                static_cast<const SIRegisterInfo*>(TM->getRegisterInfo());
+  unsigned regCode = RI->getHWRegNum(MI.getOperand(OpNo).getReg());
+  return regCode >> shift;
+}
+
+unsigned SICodeEmitter::GPR4AlignEncode(const MachineInstr &MI,
+                                        unsigned OpNo) const
+{
+  return GPRAlign(MI, OpNo, 2);
+}
+
+unsigned SICodeEmitter::GPR2AlignEncode(const MachineInstr &MI,
+                                        unsigned OpNo) const
+{
+  return GPRAlign(MI, OpNo, 1);
+}
+
+uint64_t SICodeEmitter::i32LiteralEncode(const MachineInstr &MI,
+                                         unsigned OpNo) const
+{
+  return LITERAL_REG | ((uint64_t)MI.getOperand(OpNo).getImm() << 32);
+}
+
+/* Set the "VGPR" bit for VOP args that can take either a VGPR or a SGPR.
+ * XXX: It would be nice if we could handle this without a PostEncode function.
+ */
+uint64_t SICodeEmitter::VOPPostEncode(const MachineInstr &MI,
+                                      uint64_t Value) const
+{
+  const SIInstrInfo * SII = static_cast<const SIInstrInfo*>(TM->getInstrInfo());
+  unsigned encodingType = SII->getEncodingType(MI);
+  unsigned numSrcOps;
+  unsigned vgprBitOffset;
+
+  if (encodingType == SIInstrEncodingType::VOP3) {
+    numSrcOps = 3;
+    vgprBitOffset = 32;
+  } else {
+    numSrcOps = 1;
+    vgprBitOffset = 0;
+  }
+
+  /* Add one to skip over the destination reg operand. */
+  for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) {
+    if (!MI.getOperand(opIdx).isReg()) {
+      continue;
+    }
+    unsigned reg = MI.getOperand(opIdx).getReg();
+    if (AMDIL::VReg_32RegClass.contains(reg)
+        || AMDIL::VReg_64RegClass.contains(reg)) {
+      Value |= (VGPR_BIT(opIdx)) << vgprBitOffset;
+    }
+  }
+  return Value;
+}
+
+
+void SICodeEmitter::outputBytes(uint64_t value, unsigned bytes)
+{
+  for (unsigned i = 0; i < bytes; i++) {
+    _OS.write((uint8_t) ((value >> (8 * i)) & 0xff));
+  }
+}
diff --git a/src/gallium/drivers/radeon/SIConvertToISA.cpp b/src/gallium/drivers/radeon/SIConvertToISA.cpp
new file mode 100644
index 00000000000..44e65398a61
--- /dev/null
+++ b/src/gallium/drivers/radeon/SIConvertToISA.cpp
@@ -0,0 +1,89 @@
+//===-- SIConvertToISA.cpp - TODO: Add brief description -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+  class SIConvertToISAPass : public MachineFunctionPass {
+
+  private:
+    static char ID;
+    TargetMachine &TM;
+    void convertVCREATE_v4f32(MachineInstr &MI, MachineBasicBlock::iterator I,
+                              MachineBasicBlock &MBB, MachineFunction &MF);
+
+  public:
+    SIConvertToISAPass(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm) { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  };
+} /* End anonymous namespace */
+
+char SIConvertToISAPass::ID = 0;
+
+FunctionPass *llvm::createSIConvertToISAPass(TargetMachine &tm) {
+  return new
SIConvertToISAPass(tm); +} + +/* Replace every AMDIL::VCREATE_v4f32 in MF with the instructions built by + * convertVCREATE_v4f32() and erase the original instruction. + * Returns true when at least one instruction was replaced. + */ +bool SIConvertToISAPass::runOnMachineFunction(MachineFunction &MF) +{ + bool changed = false; + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) { + /* Step past the current instruction before it can be erased. The + * iterator is only advanced from a real instruction, never from + * MBB.end(), which also keeps empty blocks safe. */ + MachineBasicBlock::iterator Cur = I++; + MachineInstr &MI = *Cur; + + switch (MI.getOpcode()) { + default: continue; + case AMDIL::VCREATE_v4f32: + convertVCREATE_v4f32(MI, Cur, MBB, MF); + break; + } + /* The replacement copied the operands it needs, so MI can be deleted. */ + MI.eraseFromParent(); + changed = true; + } + } + return changed; +} + +/* Insert before I an IMPLICIT_DEF of a fresh VReg_128 virtual register and an + * INSERT_SUBREG that writes MI's operand 1 into its sel_x sub-register, with + * MI's operand 0 as the result. Operand 1's register is set to VReg_32. + * MI itself is left in place; the caller removes it. + */ +void SIConvertToISAPass::convertVCREATE_v4f32(MachineInstr &MI, + MachineBasicBlock::iterator I, MachineBasicBlock &MBB, MachineFunction &MF) +{ + MachineInstrBuilder implicitDef; + MachineInstrBuilder insertSubreg; + MachineRegisterInfo & MRI = MF.getRegInfo(); + unsigned tmp = MRI.createVirtualRegister(&AMDIL::VReg_128RegClass); + + implicitDef = BuildMI(MF, MBB.findDebugLoc(I), + TM.getInstrInfo()->get(AMDIL::IMPLICIT_DEF), tmp); + + MRI.setRegClass(MI.getOperand(1).getReg(), &AMDIL::VReg_32RegClass); + insertSubreg = BuildMI(MF, MBB.findDebugLoc(I), + TM.getInstrInfo()->get(AMDIL::INSERT_SUBREG)) + .addOperand(MI.getOperand(0)) + .addReg(tmp) + .addOperand(MI.getOperand(1)) + .addImm(AMDIL::sel_x); + + MBB.insert(I, implicitDef); + MBB.insert(I, insertSubreg); +} diff --git a/src/gallium/drivers/radeon/SIGenRegisterInfo.pl b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl new file mode 100644 index 00000000000..644daa1bc22 --- /dev/null +++ b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl @@ -0,0 +1,278 @@ +#===-- SIGenRegisterInfo.pl - TODO: Add brief description -------===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. 
+# +#===----------------------------------------------------------------------===# +# +# TODO: Add full description +# +#===----------------------------------------------------------------------===# + + +use strict; +use warnings; + +my $SGPR_COUNT = 104; +my $VGPR_COUNT = 256; + +my $SGPR_MAX_IDX = $SGPR_COUNT - 1; +my $VGPR_MAX_IDX = $VGPR_COUNT - 1; + +my $INDEX_FILE = defined($ARGV[0]) ? $ARGV[0] : ''; + +print <<STRING; + +let Namespace = "AMDIL" in { + def low : SubRegIndex; + def high : SubRegIndex; + + def sub0 : SubRegIndex; + def sub1 : SubRegIndex; + def sub2 : SubRegIndex; + def sub3 : SubRegIndex; + def sub4 : SubRegIndex; + def sub5 : SubRegIndex; + def sub6 : SubRegIndex; + def sub7 : SubRegIndex; +} + +class SIReg <string n> : Register<n> { + let Namespace = "AMDIL"; +} + +class SI_64 <string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDIL"; + let SubRegIndices = [low, high]; +} + +class SI_128 <string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDIL"; + let SubRegIndices = [sel_x, sel_y, sel_z, sel_w]; +} + +class SI_256 <string n, list<Register> subregs> : RegisterWithSubRegs<n, subregs> { + let Namespace = "AMDIL"; + let SubRegIndices = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7]; +} + +class SGPR_32 <bits<8> num, string name> : SIReg<name> { + field bits<8> Num; + + let Num = num; +} + + +class VGPR_32 <bits<9> num, string name, Register gprf32_alias> : SIReg<name> { + field bits<9> Num; + + let Num = num; + let Aliases = [gprf32_alias]; +} + +class SGPR_64 <bits<8> num, string name, list<Register> subregs> : + SI_64 <name, subregs>; + +class VGPR_64 <bits<9> num, string name, list<Register> subregs> : + SI_64 <name, subregs>; + +class SGPR_128 <bits<8> num, string name, list<Register> subregs> : + SI_128 <name, subregs>; + +class VGPR_128 <bits<9> num, string name, list<Register> subregs> : + SI_128 <name, subregs>; + +class SGPR_256 <bits<8> num, string name, 
list<Register> subregs> : + SI_256 <name, subregs>; + +def VCC : SIReg<"VCC">; +def SCC : SIReg<"SCC">; +def SREG_LIT_0 : SIReg <"S LIT 0">; + +def M0 : SIReg <"M0">; + +//Interpolation registers + +def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">; +def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">; +def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">; +def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">; +def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">; +def PERSP_CENTROID_J : SIReg <"PERSP_CENTROID_J">; +def PERSP_I_W : SIReg <"PERSP_I_W">; +def PERSP_J_W : SIReg <"PERSP_J_W">; +def PERSP_1_W : SIReg <"PERSP_1_W">; +def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">; +def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">; +def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">; +def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">; +def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">; +def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">; +def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">; +def POS_X_FLOAT : SIReg <"POS_X_FLOAT">; +def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">; +def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">; +def POS_W_FLOAT : SIReg <"POS_W_FLOAT">; +def FRONT_FACE : SIReg <"FRONT_FACE">; +def ANCILLARY : SIReg <"ANCILLARY">; +def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">; +def POS_FIXED_PT : SIReg <"POS_FIXED_PT">; + +STRING + +# 32-bit registers: print one def per SGPR and per VGPR and remember each +# name in @SGPR / @VGPR for the hardware index table written further down. +# Every VGPR is defined with an alias register R<i+1>. + +my @SGPR; +for (my $i = 0; $i < $SGPR_COUNT; $i++) { + print "def SGPR$i : SGPR_32 <$i, \"SGPR$i\">;\n"; + $SGPR[$i] = "SGPR$i"; +} + +my @VGPR; +my @GPRF32; +for (my $i = 0; $i < $VGPR_COUNT; $i++) { + my $gprf32_num = $i + 1; + my $gprf32_name = "R$gprf32_num"; + print "def VGPR$i : VGPR_32 <$i, \"VGPR$i\", $gprf32_name>;\n"; + $VGPR[$i] = "VGPR$i"; + $GPRF32[$i] = $gprf32_name; +} + +print <<STRING; + +def SReg_32 : RegisterClass<"AMDIL", [f32, i32], 32, + (add (sequence "SGPR%u", 0, $SGPR_MAX_IDX), SREG_LIT_0, M0) +>; + +def VReg_32 : RegisterClass<"AMDIL", [f32, i32], 32, + (add (sequence "VGPR%u", 0, $VGPR_MAX_IDX), + PERSP_SAMPLE_I, 
PERSP_SAMPLE_J, + PERSP_CENTER_I, PERSP_CENTER_J, + PERSP_CENTROID_I, PERSP_CENTROID_J, + PERSP_I_W, PERSP_J_W, PERSP_1_W, + LINEAR_SAMPLE_I, LINEAR_SAMPLE_J, + LINEAR_CENTER_I, LINEAR_CENTER_J, + LINEAR_CENTROID_I, LINEAR_CENTROID_J, + LINE_STIPPLE_TEX_COORD, + POS_X_FLOAT, + POS_Y_FLOAT, + POS_Z_FLOAT, + POS_W_FLOAT, + FRONT_FACE, + ANCILLARY, + SAMPLE_COVERAGE, + POS_FIXED_PT + ) +>; + +def AllReg_32 : RegisterClass<"AMDIL", [f32, i32], 32, + (add VReg_32, + SReg_32, + (sequence "R%u", 1, $VGPR_COUNT)) +>; + +def CCReg : RegisterClass<"AMDIL", [f32], 32, (add VCC, SCC)>; + +STRING + +my @subregs_64 = ('low', 'high'); +my @subregs_128 = ('sel_x', 'sel_y', 'sel_z', 'sel_w'); +my @subregs_256 = ('sub0', 'sub1', 'sub2', 'sub3', 'sub4', 'sub5', 'sub6', 'sub7'); + +my @SGPR64 = print_sgpr_class(64, \@subregs_64, ('i64', 'iPTRAny')); +my @SGPR128 = print_sgpr_class(128, \@subregs_128, ('v4f32')); +my @SGPR256 = print_sgpr_class(256, \@subregs_256, ('v8i32')); + +my @VGPR64 = print_vgpr_class(64, \@subregs_64, ('i64')); +my @VGPR128 = print_vgpr_class(128, \@subregs_128, ('v4f32')); + + +my $sgpr64_list = join(',', @SGPR64); +my $vgpr64_list = join(',', @VGPR64); +print <<STRING; + +def AllReg_64 : RegisterClass<"AMDIL", [f64, i64], 64, + (add $sgpr64_list, $vgpr64_list) +>; + +STRING + +# When an index file name was given, write the C++ definition of +# SIRegisterInfo::getHWRegNum() to it. %hw_values maps a hardware index to +# the names of all registers that start at that index: entry $i of an +# N-component list starts at index $i * N. +if ($INDEX_FILE ne '') { + open(my $fh, ">", $INDEX_FILE) + or die "Cannot open $INDEX_FILE for writing: $!\n"; + my %hw_values; + + for (my $i = 0; $i <= $#SGPR; $i++) { + push (@{$hw_values{$i}}, $SGPR[$i]); + } + + for (my $i = 0; $i <= $#SGPR64; $i++) { + push (@{$hw_values{$i * 2}}, $SGPR64[$i]) + } + + for (my $i = 0; $i <= $#SGPR128; $i++) { + push (@{$hw_values{$i * 4}}, $SGPR128[$i]); + } + + for (my $i = 0; $i <= $#SGPR256; $i++) { + push (@{$hw_values{$i * 8}}, $SGPR256[$i]); + } + + for (my $i = 0; $i <= $#VGPR; $i++) { + push (@{$hw_values{$i}}, $VGPR[$i]); + } + for (my $i = 0; $i <= $#VGPR64; $i++) { + push (@{$hw_values{$i * 2}}, $VGPR64[$i]); + } + + for (my $i = 0; $i <= $#VGPR128; $i++) { + push (@{$hw_values{$i * 4}}, 
$VGPR128[$i]); + } + + + print $fh "unsigned SIRegisterInfo::getHWRegNum(unsigned reg) const\n{\n switch(reg) {\n"; + # Visit the indices in numeric order so the generated file is identical + # from run to run; plain keys() order is not stable. + for my $key (sort { $a <=> $b } keys(%hw_values)) { + my @names = @{$hw_values{$key}}; + for my $regname (@names) { + print $fh " case AMDIL::$regname:\n" + } + print $fh " return $key;\n"; + } + print $fh " default: return 0;\n }\n}\n"; + # Buffered write errors only surface at close time. + close($fh) or die "Cannot close $INDEX_FILE: $!\n"; +} + + + + +# Print the SGPR tuples and the SReg_<width> class; returns the tuple names. +sub print_sgpr_class { + my ($reg_width, $sub_reg_ref, @types) = @_; + return print_reg_class('SReg', 'SGPR', $reg_width, $SGPR_COUNT, $sub_reg_ref, @types); +} + +# Print the VGPR tuples and the VReg_<width> class; returns the tuple names. +sub print_vgpr_class { + my ($reg_width, $sub_reg_ref, @types) = @_; + return print_reg_class('VReg', 'VGPR', $reg_width, $VGPR_COUNT, $sub_reg_ref, @types); +} + +sub print_reg_class { + my ($class_prefix, $reg_prefix, $reg_width, $reg_count, $sub_reg_ref, @types) = @_; + my @registers; + my $component_count = $reg_width / 32; + + for (my $i = 0; $i < $reg_count; $i += $component_count) { + my $reg_name = $reg_prefix . $i . '_' . $reg_width; + my @sub_regs; + for (my $idx = 0; $idx < $component_count; $idx++) { + my $sub_idx = $i + $idx; + push(@sub_regs, $reg_prefix . $sub_idx); + } + print "def $reg_name : $reg_prefix\_$reg_width <$i, \"$reg_name\", [ ", join(',', @sub_regs) , "]>;\n"; + push (@registers, $reg_name); + } + my $reg_list = join(', ', @registers); + + print "def $class_prefix\_$reg_width : RegisterClass<\"AMDIL\", [" . join (', ', @types) . 
"], $reg_width,\n (add $reg_list)\n>{\n"; + print " let SubRegClasses = [($class_prefix\_", ($reg_width / $component_count) , ' ', join(', ', @{$sub_reg_ref}), ")];\n}\n"; + return @registers; +} diff --git a/src/gallium/drivers/radeon/SIISelLowering.cpp b/src/gallium/drivers/radeon/SIISelLowering.cpp new file mode 100644 index 00000000000..1a4b47ecbf5 --- /dev/null +++ b/src/gallium/drivers/radeon/SIISelLowering.cpp @@ -0,0 +1,151 @@ +//===-- SIISelLowering.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM) : + AMDGPUTargetLowering(TM), + TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) +{ + addRegisterClass(MVT::v4f32, &AMDIL::VReg_128RegClass); + addRegisterClass(MVT::f32, &AMDIL::VReg_32RegClass); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal); +} + +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const +{ + const struct TargetInstrInfo * TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); + MachineBasicBlock::iterator I = MI; + + if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) { + AppendS_WAITCNT(MI, *BB, llvm::next(I)); + } + + switch (MI->getOpcode()) { + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDIL::SI_INTERP: + 
LowerSI_INTERP(MI, *BB, I, MRI); + break; + case AMDIL::SI_INTERP_CONST: + LowerSI_INTERP_CONST(MI, *BB, I); + break; + case AMDIL::SI_V_CNDLT: + LowerSI_V_CNDLT(MI, *BB, I, MRI); + break; + case AMDIL::USE_SGPR_32: + case AMDIL::USE_SGPR_64: + lowerUSE_SGPR(MI, BB->getParent(), MRI); + MI->eraseFromParent(); + break; + case AMDIL::VS_LOAD_BUFFER_INDEX: + addLiveIn(MI, BB->getParent(), MRI, TII, AMDIL::VGPR0); + MI->eraseFromParent(); + break; + } + return BB; +} + +void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I) const +{ + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::S_WAITCNT)) + .addImm(0); +} + +void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const +{ + unsigned tmp = MRI.createVirtualRegister(&AMDIL::VReg_32RegClass); + MachineOperand dst = MI->getOperand(0); + MachineOperand iReg = MI->getOperand(1); + MachineOperand jReg = MI->getOperand(2); + MachineOperand attr_chan = MI->getOperand(3); + MachineOperand attr = MI->getOperand(4); + MachineOperand params = MI->getOperand(5); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::S_MOV_B32)) + .addReg(AMDIL::M0) + .addOperand(params); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_INTERP_P1_F32), tmp) + .addOperand(iReg) + .addOperand(attr_chan) + .addOperand(attr); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_INTERP_P2_F32)) + .addOperand(dst) + .addReg(tmp) + .addOperand(jReg) + .addOperand(attr_chan) + .addOperand(attr); + + MI->eraseFromParent(); +} + +void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, + MachineBasicBlock &BB, MachineBasicBlock::iterator I) const +{ + MachineOperand dst = MI->getOperand(0); + MachineOperand attr_chan = MI->getOperand(1); + MachineOperand attr = MI->getOperand(2); + MachineOperand params = MI->getOperand(3); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::S_MOV_B32)) + 
.addReg(AMDIL::M0) + .addOperand(params); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_INTERP_MOV_F32)) + .addOperand(dst) + .addOperand(attr_chan) + .addOperand(attr); + + MI->eraseFromParent(); +} + +void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const +{ + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_CMP_LT_F32_e32)) + .addOperand(MI->getOperand(1)) + .addReg(AMDIL::SREG_LIT_0); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDIL::V_CNDMASK_B32)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(3)); + + MI->eraseFromParent(); +} + +void SITargetLowering::lowerUSE_SGPR(MachineInstr *MI, + MachineFunction * MF, MachineRegisterInfo & MRI) const +{ + const struct TargetInstrInfo * TII = getTargetMachine().getInstrInfo(); + unsigned dstReg = MI->getOperand(0).getReg(); + int64_t newIndex = MI->getOperand(1).getImm(); + const TargetRegisterClass * dstClass = MRI.getRegClass(dstReg); + + unsigned newReg = dstClass->getRegister(newIndex); + addLiveIn(MI, MF, MRI, TII, newReg); +} + diff --git a/src/gallium/drivers/radeon/SIISelLowering.h b/src/gallium/drivers/radeon/SIISelLowering.h new file mode 100644 index 00000000000..e7a79f8e215 --- /dev/null +++ b/src/gallium/drivers/radeon/SIISelLowering.h @@ -0,0 +1,44 @@ +//===-- SIISelLowering.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Declares SITargetLowering, the AMDGPUTargetLowering subclass whose +// EmitInstrWithCustomInserter() expands the SI pseudo instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef SIISELLOWERING_H +#define SIISELLOWERING_H + +#include "AMDGPUISelLowering.h" +#include "SIInstrInfo.h" + +namespace llvm { + +/* Target lowering for SI. The private helpers are called from + * EmitInstrWithCustomInserter(), which picks one by the opcode of the + * instruction being expanded. */ +class SITargetLowering : public AMDGPUTargetLowering +{ + const SIInstrInfo * TII; + + /* Insert an S_WAITCNT with immediate 0 before I. */ + void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I) const; + /* Expand SI_INTERP into S_MOV_B32 (M0 and the params operand), + * V_INTERP_P1_F32 and V_INTERP_P2_F32 before I, then erase MI. */ + void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; + /* Expand SI_INTERP_CONST into S_MOV_B32 (M0 and the params operand) and + * V_INTERP_MOV_F32 before I, then erase MI. */ + void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I) const; + /* Expand SI_V_CNDLT into V_CMP_LT_F32_e32 (operand 1 against SREG_LIT_0) + * and V_CNDMASK_B32 before I, then erase MI. */ + void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const; + /* Look up the register at index "operand 1" in the register class of MI's + * destination and pass it to addLiveIn(). MI is not erased here. */ + void lowerUSE_SGPR(MachineInstr *MI, MachineFunction * MF, + MachineRegisterInfo & MRI) const; +public: + /* Registers v4f32 with VReg_128 and f32 with VReg_32. */ + SITargetLowering(TargetMachine &tm); + /* Appends an S_WAITCNT after MI when its TSFlags carry NEED_WAIT, expands + * the SI pseudo instructions and returns BB; other opcodes are forwarded + * to AMDGPUTargetLowering. */ + virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI, + MachineBasicBlock * BB) const; +}; + +} // End namespace llvm + +#endif //SIISELLOWERING_H diff --git a/src/gallium/drivers/radeon/SIInstrFormats.td b/src/gallium/drivers/radeon/SIInstrFormats.td new file mode 100644 index 00000000000..caf9b0ef120 --- /dev/null +++ b/src/gallium/drivers/radeon/SIInstrFormats.td @@ -0,0 +1,128 @@ +//===-- SIInstrFormats.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +class VOP3_32 <bits<9> op, string opName, list<dag> pattern> + : VOP3 <op, (outs VReg_32:$dst), (ins AllReg_32:$src0, AllReg_32:$src1, AllReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>; + +class VOP3_64 <bits<9> op, string opName, list<dag> pattern> + : VOP3 <op, (outs VReg_64:$dst), (ins AllReg_64:$src0, AllReg_64:$src1, AllReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>; + + +class SOP1_32 <bits<8> op, string opName, list<dag> pattern> + : SOP1 <op, (outs SReg_32:$dst), (ins SReg_32:$src0), opName, pattern>; + +class SOP1_64 <bits<8> op, string opName, list<dag> pattern> + : SOP1 <op, (outs SReg_64:$dst), (ins SReg_64:$src0), opName, pattern>; + +class SOP2_32 <bits<7> op, string opName, list<dag> pattern> + : SOP2 <op, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>; + +class SOP2_64 <bits<7> op, string opName, list<dag> pattern> + : SOP2 <op, (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + +class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOP1 < + op, (outs vrc:$dst), (ins arc:$src0), opName, pattern + >; + +multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern, + bits<16> amdil = AMDILInst.NONE> { + + let AMDILOp = amdil in { + def _e32: VOP1_Helper <op, VReg_32, AllReg_32, opName, pattern>; + } + + def _e64 : VOP3_32 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> { + + def _e32 : VOP1_Helper <op, VReg_64, AllReg_64, opName, pattern>; + + def _e64 : VOP3_64 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +class 
VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOP2 < + op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern + >; + +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern, + bits<16> amdil = AMDILInst.NONE> { + + let AMDILOp = amdil in { + def _e32 : VOP2_Helper <op, VReg_32, AllReg_32, opName, pattern>; + } + + def _e64 : VOP3_32 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> { + def _e32: VOP2_Helper <op, VReg_64, AllReg_64, opName, pattern>; + + def _e64 : VOP3_64 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +class SOPK_32 <bits<5> op, string opName, list<dag> pattern> + : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>; + +class SOPK_64 <bits<5> op, string opName, list<dag> pattern> + : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>; + +class VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOPC < + op, (outs), (ins arc:$src0, vrc:$src1), opName, pattern + >; + +multiclass VOPC_32 <bits<8> op, string opName, list<dag> pattern> { + + def _e32 : VOPC_Helper <op, VReg_32, AllReg_32, opName, pattern>; + + def _e64 : VOP3_32 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern> { + + def _e32 : VOPC_Helper <op, VReg_64, AllReg_64, opName, pattern>; + + def _e64 : VOP3_64 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +class SOPC_32 <bits<7> op, string opName, list<dag> pattern> + : SOPC <op, (outs CCReg:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>; + +class SOPC_64 <bits<7> op, string opName, list<dag> pattern> + : SOPC <op, (outs CCReg:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + diff --git 
a/src/gallium/drivers/radeon/SIInstrInfo.cpp b/src/gallium/drivers/radeon/SIInstrInfo.cpp new file mode 100644 index 00000000000..6f92e96c6e7 --- /dev/null +++ b/src/gallium/drivers/radeon/SIInstrInfo.cpp @@ -0,0 +1,173 @@ +//===-- SIInstrInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implementation of SIInstrInfo: physical register copies, encoding type and +// size queries, and conversion of AMDIL instructions to SI instructions. +// +//===----------------------------------------------------------------------===// + + +#include "SIInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" + +#include <stdio.h> + +using namespace llvm; + +SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm) + : AMDGPUInstrInfo(tm), + RI(tm, *this), + TM(tm) + { } + +const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const +{ + return RI; +} + +/* Copy SrcReg to DestReg with a single V_MOV_B32_e32 inserted before MI. + * NOTE(review): the same opcode is used whatever the register classes are; + * confirm that only 32-bit VGPR destinations reach this point. */ +void +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const +{ + BuildMI(MBB, MI, DL, get(AMDIL::V_MOV_B32_e32), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)); +} + +/* Return the encoding type kept in the TSFlags bits selected by + * SI_INSTR_FLAGS_ENCODING_MASK. */ +unsigned SIInstrInfo::getEncodingType(const MachineInstr &MI) const +{ + return get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; +} + +/* Return the size of the encoded instruction in bytes, either 8 or 4. */ +unsigned SIInstrInfo::getEncodingBytes(const MachineInstr &MI) const +{ + + /* Instructions with literal constants are expanded to 64-bits, and + * the constant is stored in bits [63:32] */ + for (unsigned i = 0; i < MI.getNumOperands(); i++) { + if (MI.getOperand(i).getType() == MachineOperand::MO_FPImmediate) { + return 8; + } + } + + /* This instruction always has a literal */ + if (MI.getOpcode() == AMDIL::S_MOV_IMM_I32) { + return 8; + } + + unsigned encoding_type = getEncodingType(MI); + switch (encoding_type) { + case 
SIInstrEncodingType::EXP: + case SIInstrEncodingType::LDS: + case SIInstrEncodingType::MUBUF: + case SIInstrEncodingType::MTBUF: + case SIInstrEncodingType::MIMG: + case SIInstrEncodingType::VOP3: + return 8; + default: + return 4; + } +} + +/* Convert an AMDIL instruction to its SI form. ABS_f32 and CLAMP_f32 have + * dedicated conversions; every other opcode goes through + * AMDGPUInstrInfo::convertToISA(). */ +MachineInstr * SIInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const +{ + + switch (MI.getOpcode()) { + default: break; + case AMDIL::ABS_f32: return convertABS_f32(MI, MF, DL); + case AMDIL::CLAMP_f32: return convertCLAMP_f32(MI, MF, DL); + } + + MachineInstr * newMI = AMDGPUInstrInfo::convertToISA(MI, MF, DL); + const MCInstrDesc &newDesc = get(newMI->getOpcode()); + + /* If this instruction was converted to a VOP3, we need to add the extra + * operands for abs, clamp, omod, and negate. */ + if (getEncodingType(*newMI) == SIInstrEncodingType::VOP3 + && newMI->getNumOperands() < newDesc.getNumOperands()) { + MachineInstrBuilder builder(newMI); + /* Pad with zero immediates up to the operand count of the descriptor. */ + for (unsigned op_idx = newMI->getNumOperands(); + op_idx < newDesc.getNumOperands(); op_idx++) { + builder.addImm(0); + } + } + return newMI; +} + +/* Map MAD_f32 to V_MAD_LEGACY_F32 and defer to AMDGPUInstrInfo for every + * other opcode. */ +unsigned SIInstrInfo::getISAOpcode(unsigned AMDILopcode) const +{ + switch (AMDILopcode) { + case AMDIL::MAD_f32: return AMDIL::V_MAD_LEGACY_F32; + default: return AMDGPUInstrInfo::getISAOpcode(AMDILopcode); + } +} + +/* Build and return a V_MOV_B32_e64 with the ABS immediate set to 1, using + * operand 0 of absInstr as destination and operand 1 as every source. */ +MachineInstr * SIInstrInfo::convertABS_f32(MachineInstr & absInstr, + MachineFunction &MF, DebugLoc DL) const +{ + MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineOperand &dst = absInstr.getOperand(0); + + /* Convert the destination register to the VReg_32 class */ + if (TargetRegisterInfo::isVirtualRegister(dst.getReg())) { + MRI.setRegClass(dst.getReg(), AMDIL::VReg_32RegisterClass); + } + + return BuildMI(MF, DL, get(AMDIL::V_MOV_B32_e64)) + .addOperand(absInstr.getOperand(0)) + .addOperand(absInstr.getOperand(1)) + /* VSRC1-2 are unused, but we still need to fill all the + * operand slots, so we just reuse the VSRC0 operand */ + .addOperand(absInstr.getOperand(1)) + 
.addOperand(absInstr.getOperand(1)) + .addImm(1) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD + .addImm(0); // NEG +} + +/* Build and return a V_MOV_B32_e64 with the CLAMP immediate set to 1, using + * operand 0 of clampInstr as destination and operand 1 as every source. + * Operands 2 and 3 of clampInstr are not read. */ +MachineInstr * SIInstrInfo::convertCLAMP_f32(MachineInstr & clampInstr, + MachineFunction &MF, DebugLoc DL) const +{ + MachineRegisterInfo &MRI = MF.getRegInfo(); + /* XXX: HACK assume that low == zero and high == one for now until + * we have a way to propagate the immediates. */ + +/* + uint32_t zero = (uint32_t)APFloat(0.0f).bitcastToAPInt().getZExtValue(); + uint32_t one = (uint32_t)APFloat(1.0f).bitcastToAPInt().getZExtValue(); + uint32_t low = clampInstr.getOperand(2).getImm(); + uint32_t high = clampInstr.getOperand(3).getImm(); +*/ +// if (low == zero && high == one) { + + /* Convert the destination register to the VReg_32 class */ + if (TargetRegisterInfo::isVirtualRegister(clampInstr.getOperand(0).getReg())) { + MRI.setRegClass(clampInstr.getOperand(0).getReg(), + AMDIL::VReg_32RegisterClass); + } + return BuildMI(MF, DL, get(AMDIL::V_MOV_B32_e64)) + .addOperand(clampInstr.getOperand(0)) + .addOperand(clampInstr.getOperand(1)) + /* VSRC1-2 are unused, but we still need to fill all the + * operand slots, so we just reuse the VSRC0 operand */ + .addOperand(clampInstr.getOperand(1)) + .addOperand(clampInstr.getOperand(1)) + .addImm(0) // ABS + .addImm(1) // CLAMP + .addImm(0) // OMOD + .addImm(0); // NEG +// } else { + /* XXX: Handle other cases */ +// abort(); +// } +} diff --git a/src/gallium/drivers/radeon/SIInstrInfo.h b/src/gallium/drivers/radeon/SIInstrInfo.h new file mode 100644 index 00000000000..bd76c3f94aa --- /dev/null +++ b/src/gallium/drivers/radeon/SIInstrInfo.h @@ -0,0 +1,95 @@ +//===-- SIInstrInfo.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Declares SIInstrInfo, the AMDGPUInstrInfo subclass for SI, together with +// the encoding type and flag constants that are read back from TSFlags. +// +//===----------------------------------------------------------------------===// + + +#ifndef SIINSTRINFO_H +#define SIINSTRINFO_H + +#include "AMDGPUInstrInfo.h" +#include "SIRegisterInfo.h" + +namespace llvm { + + /* Instruction information for SI. */ + class SIInstrInfo : public AMDGPUInstrInfo { + private: + const SIRegisterInfo RI; + AMDGPUTargetMachine &TM; + + /* Build the V_MOV_B32_e64 (ABS immediate set) that replaces ABS_f32. */ + MachineInstr * convertABS_f32(MachineInstr & absInstr, MachineFunction &MF, + DebugLoc DL) const; + + /* Build the V_MOV_B32_e64 (CLAMP immediate set) that replaces CLAMP_f32. */ + MachineInstr * convertCLAMP_f32(MachineInstr & clampInstr, + MachineFunction &MF, DebugLoc DL) const; + + public: + explicit SIInstrInfo(AMDGPUTargetMachine &tm); + + const SIRegisterInfo &getRegisterInfo() const; + + /* Emits a V_MOV_B32_e32 from SrcReg to DestReg before MI. */ + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const; + + /* Returns the SIInstrEncodingType value stored in the TSFlags of MI. */ + unsigned getEncodingType(const MachineInstr &MI) const; + + /* Returns the encoded size of MI in bytes: 8 when MI has a floating-point + * immediate operand, is S_MOV_IMM_I32, or uses the EXP, LDS, MUBUF, MTBUF, + * MIMG or VOP3 encoding; 4 otherwise. */ + unsigned getEncodingBytes(const MachineInstr &MI) const; + + /* NOTE(review): no definition of this method appears in the SIInstrInfo.cpp + * added next to this header; confirm it is defined elsewhere. */ + uint64_t getBinaryCode(const MachineInstr &MI, bool encodOpcode = false) const; + + /* Converts ABS_f32 and CLAMP_f32 itself, forwards other opcodes to + * AMDGPUInstrInfo::convertToISA() and pads a VOP3 result with zero + * immediates up to the operand count of its descriptor. */ + virtual MachineInstr * convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const; + + /* Maps MAD_f32 to V_MAD_LEGACY_F32; other opcodes go to AMDGPUInstrInfo. */ + virtual unsigned getISAOpcode(unsigned AMDILopcode) const; + + }; + +} // End namespace llvm + +/* These must be kept in sync with SIInstructions.td and also the + * InstrEncodingInfo array in SIInstrInfo.cpp. 
+ * + * NOTE: This enum is only used to identify the encoding type within LLVM, + * the actual encoding type that is part of the instruction format is different + */ +namespace SIInstrEncodingType { + enum Encoding { + EXP = 0, + LDS = 1, + MIMG = 2, + MTBUF = 3, + MUBUF = 4, + SMRD = 5, + SOP1 = 6, + SOP2 = 7, + SOPC = 8, + SOPK = 9, + SOPP = 10, + VINTRP = 11, + VOP1 = 12, + VOP2 = 13, + VOP3 = 14, + VOPC = 15 + }; +} + +/* Mask applied to TSFlags by SIInstrInfo::getEncodingType() to extract the + * SIInstrEncodingType value; 0xf covers the sixteen values 0-15 above. */ +#define SI_INSTR_FLAGS_ENCODING_MASK 0xf + +namespace SIInstrFlags { + enum Flags { + /* First 4 bits are the instruction encoding */ + /* Bit 4: SITargetLowering::EmitInstrWithCustomInserter() appends an + * S_WAITCNT after an instruction whose TSFlags have this bit set. */ + NEED_WAIT = 1 << 4 + }; +} + +#endif //SIINSTRINFO_H diff --git a/src/gallium/drivers/radeon/SIInstrInfo.td b/src/gallium/drivers/radeon/SIInstrInfo.td new file mode 100644 index 00000000000..ffa18d05019 --- /dev/null +++ b/src/gallium/drivers/radeon/SIInstrInfo.td @@ -0,0 +1,472 @@ +//===-- SIInstrInfo.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: Add full description
+//
+//===----------------------------------------------------------------------===//
+
+
+
+// Common base of every SI instruction record. Two per-instruction properties
+// are packed into TSFlags: the 4-bit EncodingType in bits 3-0 and the 1-bit
+// NeedWait flag in bit 4. The EncodingType numbers used by the format classes
+// below are annotated with the matching SIInstrEncodingType enumerator from
+// SIInstrInfo.h and must stay in sync with it.
+class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
+    AMDGPUInst<outs, ins, asm, pattern> {
+
+  field bits<4> EncodingType = 0;
+  field bits<1> NeedWait = 0;
+
+  let TSFlags{3-0} = EncodingType;
+  let TSFlags{4} = NeedWait;
+
+}
+
+// Instruction with a 32-bit encoding word (`Inst`).
+class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  field bits<32> Inst;
+}
+
+// Instruction with a 64-bit encoding word (`Inst`).
+class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> :
+    InstSI <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+}
+
+// Register operand of class `rc` encoded through the "GPR4AlignEncode"
+// encoder method (the encoder itself is defined outside this file).
+class GPR4Align <RegisterClass rc> : Operand <vAny> {
+  let EncoderMethod = "GPR4AlignEncode";
+  let MIOperandInfo = (ops rc:$reg);
+}
+
+// Register operand of class `rc` and value type `vt` encoded through the
+// "GPR2AlignEncode" encoder method (defined outside this file).
+class GPR2Align <RegisterClass rc, ValueType vt> : Operand <vt> {
+  let EncoderMethod = "GPR2AlignEncode";
+  let MIOperandInfo = (ops rc:$reg);
+}
+
+// i32 operand encoded through the "i32LiteralEncode" encoder method.
+def i32Literal : Operand <i32> {
+  let EncoderMethod = "i32LiteralEncode";
+}
+
+// EXP instruction: 64-bit encoding, no outputs, five immediates and four
+// VReg_32 sources. Inst{31-26} is fixed to 0x3e; Inst{25-13} is not assigned
+// here.
+// NOTE(review): the operand names ($en, $tgt, ...) differ in case from the
+// bits<> field names (EN, TGT, ...); how the fields get their values is not
+// visible in this file -- confirm against the code emitter.
+def EXP : Enc64<
+  (outs),
+  (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3",
+  [] > {
+
+  bits<4> EN;
+  bits<6> TGT;
+  bits<1> COMPR;
+  bits<1> DONE;
+  bits<1> VM;
+  bits<8> VSRC0;
+  bits<8> VSRC1;
+  bits<8> VSRC2;
+  bits<8> VSRC3;
+
+  let Inst{3-0} = EN;
+  let Inst{9-4} = TGT;
+  let Inst{10} = COMPR;
+  let Inst{11} = DONE;
+  let Inst{12} = VM;
+  let Inst{31-26} = 0x3e;
+  let Inst{39-32} = VSRC0;
+  let Inst{47-40} = VSRC1;
+  let Inst{55-48} = VSRC2;
+  let Inst{63-56} = VSRC3;
+  let EncodingType = 0; //SIInstrEncodingType::EXP
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+}
+
+// MIMG format: 64-bit encoding with a 7-bit opcode in Inst{24-18} and the
+// fixed value 0x3c in Inst{31-26}. Bits not listed below (Inst{7-0} and
+// Inst{63-58}) are left unassigned by this class.
+class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<4> DMASK;
+  bits<1> UNORM;
+  bits<1> GLC;
+  bits<1> DA;
+  bits<1> R128;
+  bits<1> TFE;
+  bits<1> LWE;
+  bits<1> SLC;
+  bits<8> VADDR;
+  bits<5> SRSRC;
+  bits<5> SSAMP;
+
+  let Inst{11-8} = DMASK;
+  let Inst{12} = UNORM;
+  let Inst{13} = GLC;
+  let Inst{14} = DA;
+  let Inst{15} = R128;
+  let Inst{16} = TFE;
+  let Inst{17} = LWE;
+  let Inst{24-18} = op;
+  let Inst{25} = SLC;
+  let Inst{31-26} = 0x3c;
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC;
+  let Inst{57-53} = SSAMP;
+
+  let EncodingType = 2; //SIInstrEncodingType::MIMG
+
+}
+
+// MTBUF format: 64-bit encoding with a 3-bit opcode in Inst{18-16} and the
+// fixed value 0x3a in Inst{31-26}. Inst{53} is left unassigned. Records of
+// this format set NeedWait, usesCustomInserter and neverHasSideEffects.
+class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64<outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<4> DFMT;
+  bits<3> NFMT;
+  bits<8> VADDR;
+  bits<5> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{18-16} = op;
+  let Inst{22-19} = DFMT;
+  let Inst{25-23} = NFMT;
+  let Inst{31-26} = 0x3a; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC;
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+  let EncodingType = 3; //SIInstrEncodingType::MTBUF
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+  let neverHasSideEffects = 1;
+}
+
+// MUBUF format: 64-bit encoding with a 7-bit opcode in Inst{24-18} and the
+// fixed value 0x38 in Inst{31-26}. Inst{17}, Inst{25} and Inst{53} are left
+// unassigned. Same NeedWait / usesCustomInserter / neverHasSideEffects
+// settings as MTBUF.
+class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64<outs, ins, asm, pattern> {
+
+  bits<8> VDATA;
+  bits<12> OFFSET;
+  bits<1> OFFEN;
+  bits<1> IDXEN;
+  bits<1> GLC;
+  bits<1> ADDR64;
+  bits<1> LDS;
+  bits<8> VADDR;
+  bits<5> SRSRC;
+  bits<1> SLC;
+  bits<1> TFE;
+  bits<8> SOFFSET;
+
+  let Inst{11-0} = OFFSET;
+  let Inst{12} = OFFEN;
+  let Inst{13} = IDXEN;
+  let Inst{14} = GLC;
+  let Inst{15} = ADDR64;
+  let Inst{16} = LDS;
+  let Inst{24-18} = op;
+  let Inst{31-26} = 0x38; //encoding
+  let Inst{39-32} = VADDR;
+  let Inst{47-40} = VDATA;
+  let Inst{52-48} = SRSRC;
+  let Inst{54} = SLC;
+  let Inst{55} = TFE;
+  let Inst{63-56} = SOFFSET;
+  let EncodingType = 4; //SIInstrEncodingType::MUBUF
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+  let neverHasSideEffects = 1;
+}
+
+// SMRD format: 32-bit encoding with a 5-bit opcode in Inst{26-22} and the
+// fixed value 0x18 in Inst{31-27}. The IMM bit (Inst{8}) defaults to 0 and is
+// overridden by the records created through SMRD_Helper below.
+class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32<outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<8> OFFSET;
+  bits<6> SBASE;
+  bits<1> IMM = 0; // Determined by subclasses
+
+  let Inst{7-0} = OFFSET;
+  let Inst{8} = IMM;
+  let Inst{14-9} = SBASE;
+  let Inst{21-15} = SDST;
+  let Inst{26-22} = op;
+  let Inst{31-27} = 0x18; //encoding
+  let EncodingType = 5; //SIInstrEncodingType::SMRD
+
+  let NeedWait = 1;
+  let usesCustomInserter = 1;
+}
+
+// SOP1 format: 32-bit encoding, 8-bit opcode in Inst{15-8}, one source
+// (SSRC0) and one destination (SDST); Inst{31-23} is fixed to 0x17d.
+class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32<outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<8> SSRC0;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = op;
+  let Inst{22-16} = SDST;
+  let Inst{31-23} = 0x17d; //encoding;
+  let EncodingType = 6; //SIInstrEncodingType::SOP1
+}
+
+// SOP2 format: 32-bit encoding, 7-bit opcode in Inst{29-23}, two sources and
+// one destination; Inst{31-30} is fixed to 0x2.
+class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<7> SDST;
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = SDST;
+  let Inst{29-23} = op;
+  let Inst{31-30} = 0x2; // encoding
+  let EncodingType = 7; // SIInstrEncodingType::SOP2
+}
+
+// SOPC format: 32-bit encoding, 7-bit opcode in Inst{22-16}, two sources and
+// no destination field; Inst{31-23} is fixed to 0x17e.
+class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32<outs, ins, asm, pattern> {
+
+  bits<8> SSRC0;
+  bits<8> SSRC1;
+
+  let Inst{7-0} = SSRC0;
+  let Inst{15-8} = SSRC1;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17e;
+  let EncodingType = 8; // SIInstrEncodingType::SOPC
+}
+
+// SOPK format: 32-bit encoding, 5-bit opcode in Inst{27-23}, a 16-bit
+// immediate (SIMM16) and one destination; Inst{31-28} is fixed to 0xb.
+class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins , asm, pattern> {
+
+  bits <7> SDST;
+  bits <16> SIMM16;
+
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = SDST;
+  let Inst{27-23} = op;
+  let Inst{31-28} = 0xb; //encoding
+  let EncodingType = 9; // SIInstrEncodingType::SOPK
+}
+
+// SOPP format: 32-bit encoding, 7-bit opcode in Inst{22-16} and a 16-bit
+// immediate; Inst{31-23} is fixed to 0x17f. Unlike the other format classes
+// it takes no `outs` or `pattern` parameters: outputs are always empty and
+// the pattern list is always [].
+class SOPP <bits<7> op, dag ins, string asm> : Enc32 <
+  (outs),
+  ins,
+  asm,
+  [] > {
+
+  bits <16> SIMM16;
+
+  let Inst{15-0} = SIMM16;
+  let Inst{22-16} = op;
+  let Inst{31-23} = 0x17f; // encoding
+  let EncodingType = 10; // SIInstrEncodingType::SOPP
+}
+
+
+// VINTRP format: 32-bit encoding, 2-bit opcode in Inst{17-16}; Inst{31-26} is
+// fixed to 0x32. Every VINTRP record is marked as using register M0.
+class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<8> VSRC;
+  bits<2> ATTRCHAN;
+  bits<6> ATTR;
+
+  let Inst{7-0} = VSRC;
+  let Inst{9-8} = ATTRCHAN;
+  let Inst{15-10} = ATTR;
+  let Inst{17-16} = op;
+  let Inst{25-18} = VDST;
+  let Inst{31-26} = 0x32; // encoding
+  let EncodingType = 11; // SIInstrEncodingType::VINTRP
+
+  let Uses = [M0];
+}
+
+// VOP1 format: 32-bit encoding, 8-bit opcode in Inst{16-9}, 9-bit SRC0 and
+// 8-bit VDST; Inst{31-25} is fixed to 0x3f. The encoded word is passed
+// through the "VOPPostEncode" post-encoder method (defined outside this file).
+class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = op;
+  let Inst{24-17} = VDST;
+  let Inst{31-25} = 0x3f; //encoding
+
+  let EncodingType = 12; // SIInstrEncodingType::VOP1
+  let PostEncoderMethod = "VOPPostEncode";
+}
+
+// VOP2 format: 32-bit encoding, 6-bit opcode in Inst{30-25}, 9-bit SRC0,
+// 8-bit VSRC1 and 8-bit VDST; Inst{31} is fixed to 0. Uses "VOPPostEncode".
+class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<8> VSRC1;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = VDST;
+  let Inst{30-25} = op;
+  let Inst{31} = 0x0; //encoding
+
+  let EncodingType = 13; // SIInstrEncodingType::VOP2
+  let PostEncoderMethod = "VOPPostEncode";
+}
+
+// VOP3 format: 64-bit encoding, 9-bit opcode in Inst{25-17}, three 9-bit
+// sources plus the ABS, CLAMP, OMOD and NEG fields; Inst{31-26} is
+// fixed to 0x34. Inst{16-12} is left unassigned. Uses "VOPPostEncode".
+class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> VDST;
+  bits<9> SRC0;
+  bits<9> SRC1;
+  bits<9> SRC2;
+  bits<3> ABS;
+  bits<1> CLAMP;
+  bits<2> OMOD;
+  bits<3> NEG;
+
+  let Inst{7-0} = VDST;
+  let Inst{10-8} = ABS;
+  let Inst{11} = CLAMP;
+  let Inst{25-17} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = SRC0;
+  let Inst{49-41} = SRC1;
+  let Inst{58-50} = SRC2;
+  let Inst{60-59} = OMOD;
+  let Inst{63-61} = NEG;
+
+  let EncodingType = 14; // SIInstrEncodingType::VOP3
+  let PostEncoderMethod = "VOPPostEncode";
+}
+
+// VOPC format: 32-bit encoding, 8-bit opcode in Inst{24-17}, two sources and
+// no destination field; Inst{31-25} is fixed to 0x3e. Every VOPC record is
+// marked as defining VCC. Uses "VOPPostEncode".
+class VOPC <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc32 <outs, ins, asm, pattern> {
+
+  bits<9> SRC0;
+  bits<8> VSRC1;
+
+  let Inst{8-0} = SRC0;
+  let Inst{16-9} = VSRC1;
+  let Inst{24-17} = op;
+  let Inst{31-25} = 0x3e;
+
+  let EncodingType = 15; //SIInstrEncodingType::VOPC
+  let PostEncoderMethod = "VOPPostEncode";
+
+  let Defs = [VCC];
+}
+
+// MIMG instruction with a fixed operand list: a VReg_128 result, eight
+// immediates, a VReg_128 address, and GPR4Align-wrapped SReg_256 / SReg_128
+// operands. No selection pattern is attached.
+class MIMG_Load_Helper <bits<7> op, string asm> : MIMG <
+  op,
+  (outs VReg_128:$vdata),
+  (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr,
+       GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp),
+  asm,
+  []
+>;
+
+// MUBUF load: the result register class is supplied by `regClass`; sets
+// mayLoad. No selection pattern is attached.
+class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
+  op,
+  (outs regClass:$dst),
+  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc,
+       i1imm:$tfe, SReg_32:$soffset),
+  asm,
+  []> {
+  let mayLoad = 1;
+}
+
+// MTBUF load: the result register class is supplied by `regClass`; sets
+// mayLoad. No selection pattern is attached.
+class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs regClass:$dst),
+  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+       i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc,
+       i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
+  asm,
+  []> {
+  let mayLoad = 1;
+}
+
+// MTBUF store: no outputs, the stored value ($vdata, class `regClass`) is the
+// first input; sets mayStore. No selection pattern is attached.
+class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
+  op,
+  (outs),
+  (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+       i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
+       GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset),
+  asm,
+  []> {
+  let mayStore = 1;
+}
+
+/*XXX: We should be able to infer the imm bit based on the arg types */
+// Defines two SMRD records per use: <name>_SGPR takes the offset in an
+// SReg_32 register and sets IMM = 0; <name>_IMM takes an i32 immediate offset
+// and sets IMM = 1. Both take the base as a GPR2Align-wrapped SReg_64.
+multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass dstClass> {
+
+  def _SGPR : SMRD <
+    op,
+    (outs dstClass:$dst),
+    (ins SReg_32:$offset, GPR2Align<SReg_64,i64>:$sbase),
+    asm,
+    []
+  > {
+    let IMM = 0;
+  }
+
+  def _IMM : SMRD <
+    op,
+    (outs dstClass:$dst),
+    (ins i32imm:$offset, GPR2Align<SReg_64,i64>:$sbase),
+    asm,
+    []
+  > {
+    let IMM = 1;
+  }
+}
+
+// Operand of value type `vt` whose sub-operands are given by `opInfo` and
+// which is encoded through the "encodeOperand" encoder method.
+class SIOperand <ValueType vt, dag opInfo>: Operand <vt> {
+  let EncoderMethod = "encodeOperand";
+  let MIOperandInfo = opInfo;
+}
+
+// Matches an i32 immediate whose value, viewed as int32_t, is in [0, 0xff].
+def IMM8bit : ImmLeaf <
+  i32,
+  [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}]
+>;
+
+// Matches an i16 immediate whose value, viewed as int16_t, is in [0, 0xfff].
+def IMM12bit : ImmLeaf <
+  i16,
+  [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}]
+>;
+
+include "SIInstrFormats.td"
+
+// Shader instruction selected for the int_AMDGPU_load_const intrinsic: takes
+// an immediate source and produces a GPRF32 result.
+def LOAD_CONST : AMDGPUShaderInst <
+  (outs GPRF32:$dst),
+  (ins i32imm:$src),
+  "LOAD_CONST $dst, $src",
+  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+>;
+
+include "SIInstructions.td"
diff --git a/src/gallium/drivers/radeon/SIInstructions.td b/src/gallium/drivers/radeon/SIInstructions.td
new file mode 100644
index 00000000000..003d3d0b215
--- /dev/null
+++ b/src/gallium/drivers/radeon/SIInstructions.td
@@ -0,0 +1,962 @@
+//===-- SIInstructions.td - TODO: Add brief description -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +def isSI : Predicate<"Subtarget.device()" + "->getGeneration() == AMDILDeviceInfo::HD7XXX">; + +let Predicates = [isSI] in { +let Gen = AMDGPUGen.SI in { + +def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>; +def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>; +def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>; +def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>; +def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>; +def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>; +def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>; +def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>; +def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>; +def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>; +////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>; +////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>; +////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>; +////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>; +////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>; +////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>; +////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>; +////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>; +//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>; +//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>; +def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>; +//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>; +//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>; +//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>; +////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>; +////def 
S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>; +////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>; +////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>; +def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>; +def S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; +def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; +def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; +def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>; +def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>; +def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>; +////def S_ANDN2_SAVEEXEC_B64 : SOP1_ANDN2 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>; +////def S_ORN2_SAVEEXEC_B64 : SOP1_ORN2 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>; +def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>; +def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>; +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>; +def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>; +def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>; +def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>; +def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>; +def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>; +def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; +//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>; +def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; +def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; +def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; +def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; +def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; +def S_CMPK_EQ_I32 : SOPK_32 <0x00000003, "S_CMPK_EQ_I32", []>; +def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; +def S_CMPK_GT_I32 : SOPK_32 <0x00000005, 
"S_CMPK_GT_I32", []>; +def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; +def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>; +def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>; +def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>; +def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>; +def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; +def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; +def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; +def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; +def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; +def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; +//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; +def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>; +def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>; +def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; +//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; +//def EXP : EXP_ <0x00000000, "EXP", []>; +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>; +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>; +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>; +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>; +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>; +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>; +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>; +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>; +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>; +defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>; +defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>; +defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>; +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>; +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>; +defm 
V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>; +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>; +defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>; +defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>; +defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>; +defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>; +defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>; +defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>; +defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>; +defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>; +defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>; +defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>; +defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>; +defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>; +defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>; +defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>; +defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>; +defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>; +defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>; +defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>; +defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>; +defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>; +defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>; +defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>; +defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>; +defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>; +defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>; +defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>; +defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>; +defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>; +defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>; +defm 
V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>; +defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>; +defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>; +defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>; +defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>; +defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>; +defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>; +defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>; +defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>; +defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>; +defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>; +defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>; +defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>; +defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>; +defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>; +defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>; +defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>; +defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>; +defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>; +defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>; +defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>; +defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>; +defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>; +defm V_CMPS_GT_F32 : VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>; +defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>; +defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>; +defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>; +defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>; +defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>; +defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>; +defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, 
"V_CMPS_NGT_F32", []>; +defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>; +defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>; +defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>; +defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>; +defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>; +defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>; +defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>; +defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>; +defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>; +defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>; +defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>; +defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>; +defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>; +defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>; +defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>; +defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>; +defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>; +defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>; +defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>; +defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>; +defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>; +defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>; +defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>; +defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>; +defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>; +defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>; +defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>; +defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>; +defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>; +defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, 
"V_CMPS_NGE_F64", []>; +defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>; +defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>; +defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>; +defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>; +defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>; +defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>; +defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>; +defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>; +defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>; +defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>; +defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>; +defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>; +defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>; +defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>; +defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>; +defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, "V_CMPSX_NGE_F64", []>; +defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>; +defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>; +defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>; +defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>; +defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>; +defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>; +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>; +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>; +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>; +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>; +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>; +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>; +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>; +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", 
[]>; +defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>; +defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>; +defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>; +defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>; +defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>; +defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>; +defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>; +defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>; +defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>; +defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>; +defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>; +defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>; +defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>; +defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>; +defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>; +defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>; +defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>; +defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>; +defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>; +defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>; +defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>; +defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>; +defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>; +defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>; +defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>; +defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>; +defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", []>; +defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>; +defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>; +defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>; +defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>; +defm V_CMP_T_U32 : 
VOPC_32 <0x000000c7, "V_CMP_T_U32", []>; +defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>; +defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>; +defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>; +defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>; +defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>; +defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>; +defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>; +defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>; +defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>; +defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>; +defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>; +defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>; +defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>; +defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>; +defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>; +defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>; +defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>; +defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>; +defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>; +defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>; +defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>; +defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>; +defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>; +defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>; +defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>; +defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>; +defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>; +defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>; +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, 
"BUFFER_LOAD_FORMAT_XY", []>; +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; +def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>; +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>; +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; +//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>; +//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>; +//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>; +//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>; +//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>; +//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>; +//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>; +//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; +//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>; +//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; +//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>; +//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>; +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; +//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>; +//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>; +//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>; +//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, 
"BUFFER_ATOMIC_UMIN", []>; +//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>; +//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>; +//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>; +//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>; +//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>; +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>; +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>; +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>; +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>; +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, 
"BUFFER_ATOMIC_FCMPSWAP_X2", []>; +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>; +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; +def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>; +//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>; +//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>; +//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; + +let mayLoad = 0, neverHasSideEffects = 1 in { + +defm S_LOAD_DWORD : SMRD_Helper <0x00000000, "S_LOAD_DWORD", SReg_32>; +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256>; +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; +//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>; +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; + +} // End mayLoad, neverHasSideEffects + +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", 
[]>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; +//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; +//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>; +//def 
IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; +def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">; +//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; +//def IMAGE_SAMPLE_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_D", 0x00000022>; +//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; +//def IMAGE_SAMPLE_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_L", 0x00000024>; +//def IMAGE_SAMPLE_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_B", 0x00000025>; +//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; +//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; +//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>; +//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; +//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>; +//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; +//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>; +//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>; +//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; +//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; +//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; +//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>; +//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>; +//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>; +//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>; +//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>; +//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>; +//def 
IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>; +//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>; +//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>; +//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>; +//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>; +//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>; +//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; +//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; +//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>; +//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>; +//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>; +//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>; +//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>; +//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>; +//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>; +//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>; +//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>; +//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>; +//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>; +//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>; +//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>; +//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>; +//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>; +//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>; +//def 
IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>; +//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>; +//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>; +//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>; +//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>; +//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>; +//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>; +//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>; +//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>; +//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>; +//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; +//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; +//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; +//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>; +//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>; +//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>; +//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>; +//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; + +let neverHasSideEffects = 1 in { +defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", [], AMDILInst.MOVE_f32>; +} // End neverHasSideEffects +defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; 
+//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; +//defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", []>; +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +//defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", []>; +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; +//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>; +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>; +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; +//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>; +//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>; +//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>; +//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; +//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; +//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", []>; +defm V_TRUNC_F32 : VOP1_32 <0x00000021, "V_TRUNC_F32", []>; +defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>; +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", []>; +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", []>; +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", []>; +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; +defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>; +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", 
[]>; +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_LEGACY_F32 : VOP1_32 < + 0x0000002d, "V_RSQ_LEGACY_F32", + [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))] +>; +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; +defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; +defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>; +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; +defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>; +defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>; +defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>; +defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>; +defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; +defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; +defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>; +defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>; +defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>; +//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>; +defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>; +defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; +//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; +defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; +//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; +defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; +defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; +defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; + +def V_INTERP_P1_F32 : VINTRP < + 0x00000000, + (outs VReg_32:$dst), + (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr), + "V_INTERP_P1_F32", + [] +>; + +def V_INTERP_P2_F32 : VINTRP < + 0x00000001, + (outs VReg_32:$dst), + (ins VReg_32:$src0, VReg_32:$j, 
i32imm:$attr_chan, i32imm:$attr), + "V_INTERP_P2_F32", + []> { + + let Constraints = "$src0 = $dst"; + let DisableEncoding = "$src0"; + +} + +def V_INTERP_MOV_F32 : VINTRP < + 0x00000002, + (outs VReg_32:$dst), + (ins i32imm:$attr_chan, i32imm:$attr), + "V_INTERP_MOV_F32", + []> { + let VSRC = 0; +} + +//def V_INTERP_MOV_F32 : VINTRP_32 <0x00000002, "V_INTERP_MOV_F32", []>; +//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM"> { + let SIMM16 = 0; + let isTerminator = 1; +} +//def S_BRANCH : SOPP_ <0x00000002, "S_BRANCH", []>; +//def S_CBRANCH_SCC0 : SOPP_SCC0 <0x00000004, "S_CBRANCH_SCC0", []>; +//def S_CBRANCH_SCC1 : SOPP_SCC1 <0x00000005, "S_CBRANCH_SCC1", []>; +//def S_CBRANCH_VCCZ : SOPP_ <0x00000006, "S_CBRANCH_VCCZ", []>; +//def S_CBRANCH_VCCNZ : SOPP_ <0x00000007, "S_CBRANCH_VCCNZ", []>; +//def S_CBRANCH_EXECZ : SOPP_ <0x00000008, "S_CBRANCH_EXECZ", []>; +//def S_CBRANCH_EXECNZ : SOPP_ <0x00000009, "S_CBRANCH_EXECNZ", []>; +//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>; +def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16">; +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; +//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>; +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; + +/* XXX: No VOP3 version of this instruction yet */ +def V_CNDMASK_B32 : VOP2_Helper < + 0x00000000, VReg_32, AllReg_32, "V_CNDMASK_B32", []> { + let VDST = 0; + let Uses = [VCC]; +} +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; +defm V_WRITELANE_B32 : VOP2_32 
<0x00000002, "V_WRITELANE_B32", []>; + +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", [], AMDILInst.ADD_f32>; + +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>; +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; +defm V_MUL_LEGACY_F32 : VOP2_32 < + 0x00000007, "V_MUL_LEGACY_F32", + [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))] +>; +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", []>; +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>; +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", []>; + +defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", [], + AMDILInst.MAX_f32>; +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; +defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>; +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; +defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>; +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>; +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; +defm V_MADMK_F32 : VOP2_32 <0x00000020, 
"V_MADMK_F32", []>; +defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; +//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; +//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32", []>; +defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32", []>; +defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>; +defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>; +defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>; +defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>; +defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; +////def V_CVT_PKRTZ_F16_F32 : VOP2_F16 <0x0000002f, "V_CVT_PKRTZ_F16_F32", []>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; +def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, 
"S_BITCMP0_B32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +let neverHasSideEffects = 1 in { + +def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; +def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; +//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>; +//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>; + +} // End neverHasSideEffects +def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; +def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; +def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; +def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; +def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>; +def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>; +def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>; +def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>; +def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>; +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; +def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; +def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; +def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; +////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; +////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; +////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; +////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; +////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; +////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; +////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; +////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; +////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; +//def 
V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; +def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; +def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>; +def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>; +def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>; +def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; +def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; +def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; +def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; +def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; +//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; +def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>; +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>; +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>; +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>; +def 
S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>; +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>; +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>; +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; +def S_CSELECT_B32 : SOP2_32 <0x0000000a, "S_CSELECT_B32", []>; +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", []>; +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; +////def S_ANDN2_B32 : SOP2_ANDN2 <0x00000014, "S_ANDN2_B32", []>; +////def S_ANDN2_B64 : SOP2_ANDN2 <0x00000015, "S_ANDN2_B64", []>; +////def S_ORN2_B32 : SOP2_ORN2 <0x00000016, "S_ORN2_B32", []>; +////def S_ORN2_B64 : SOP2_ORN2 <0x00000017, "S_ORN2_B64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>; +def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>; +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>; +def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>; +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>; +def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>; +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; +def S_BFE_I64 : SOP2_64 
<0x0000002a, "S_BFE_I64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+
+/* Vector move taking an f32 immediate as its source.  It is encoded with
+ * VOP1 opcode 0x1, the same opcode V_MOV_B32 uses. */
+def V_MOV_IMM : VOP1 <
+  0x1,
+  (outs VReg_32:$dst),
+  (ins f32imm:$src0),
+  "V_MOV_IMM",
+  []
+>;
+
+/* Scalar move of a 32-bit literal into an SGPR (SOP1 opcode 0x3).  The
+ * SILowerShaderInstructions pass emits this when it lowers SET_M0. */
+def S_MOV_IMM_I32 : SOP1 <
+  0x3,
+  (outs SReg_32:$dst),
+  (ins i32Literal:$src0),
+  "S_MOV_IMM",
+  [] > {
+  let neverHasSideEffects = 1;
+}
+
+/* Everything below is a codegen-only pseudo instruction. */
+let isCodeGenOnly = 1, isPseudo = 1 in {
+
+/* Selected from int_SI_set_M0; operand 0 is $dst and operand 1 is the
+ * immediate.  Rewritten to S_MOV_IMM_I32 by SILowerShaderInstructions. */
+def SET_M0 : InstSI <
+  (outs SReg_32:$dst),
+  (ins i32imm:$src0),
+  "SET_M0",
+  [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))]
+>;
+
+/* Pattern-less pseudo carrying a ($reg, $val) pair; its encoding is all
+ * zeros. */
+def CONFIG_WRITE : InstSI <
+  (outs i32imm:$reg),
+  (ins i32imm:$val),
+  "CONFIG_WRITE $reg, $val",
+  [] > {
+  field bits<32> Inst = 0;
+}
+
+/* The pseudos in this group are all flagged usesCustomInserter. */
+let usesCustomInserter = 1 in {
+
+/* Selected from the three-operand int_AMDGPU_cndlt intrinsic. */
+def SI_V_CNDLT : InstSI <
+  (outs VReg_32:$dst),
+  (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2),
+  "SI_V_CNDLT $dst, $src0, $src1, $src2",
+  [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))]
+>;
+
+/* Interpolation pseudo taking the $i/$j VGPRs, the attribute channel and
+ * attribute immediates, and a $params SGPR.  It has no pattern of its own;
+ * the int_SI_fs_interp_linear_center pattern further down selects it. */
+def SI_INTERP : InstSI <
+  (outs VReg_32:$dst),
+  (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
+  "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params",
+  []
+>;
+
+/* Selected directly from int_SI_fs_interp_constant; takes no $i/$j inputs. */
+def SI_INTERP_CONST : InstSI <
+  (outs VReg_32:$dst),
+  (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params),
+  "SI_INTERP_CONST $dst, $attr_chan, $attr, $params",
+  [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan,
+                                                 imm:$attr, SReg_32:$params))]
+>;
+
+
+/* USE_SGPR_32 / USE_SGPR_64: selected from int_SI_use_sgpr with a 32-bit or
+ * 64-bit SGPR result respectively.  Both set PreloadReg and use an all-zero
+ * encoding. */
+def USE_SGPR_32 : InstSI <
+  (outs SReg_32:$dst),
+  (ins i32imm:$src0),
+  "USE_SGPR_32",
+  [(set SReg_32:$dst, (int_SI_use_sgpr imm:$src0))]
+
+> {
+  field bits<32> Inst = 0;
+  let PreloadReg = 1;
+}
+
+def USE_SGPR_64 : InstSI <
+  (outs SReg_64:$dst),
+  (ins i32imm:$src0),
+  "USE_SGPR_64",
+  [(set SReg_64:$dst, (int_SI_use_sgpr imm:$src0))]
+
+> {
+  field bits<32> Inst = 0;
+  let PreloadReg = 1;
+}
+
+/* Operand-less pseudo selected from int_SI_vs_load_buffer_index. */
+def VS_LOAD_BUFFER_INDEX : InstSI <
+  (outs VReg_32:$dst),
+  (ins),
+  "VS_LOAD_BUFFER_INDEX",
+  [(set 
VReg_32:$dst, (int_SI_vs_load_buffer_index))]> {
+
+  field bits<32> Inst = 0;
+  let PreloadReg = 1;
+}
+
+} // end usesCustomInserter
+
+} // end IsCodeGenOnly, isPseudo
+
+} // end Gen = AMDGPUGen.SI
+
+/* int_SI_vs_load_input */
+/* The 128-bit value fetched by S_LOAD_DWORDX4_IMM from $tlst_sgpr at
+ * $t_offset is passed to BUFFER_LOAD_FORMAT_XYZW together with the buffer
+ * index VGPR (presumably it is the buffer resource descriptor -- confirm). */
+def : Pat<
+  (int_SI_vs_load_input SReg_64:$tlst_sgpr, IMM8bit:$t_offset, IMM12bit:$attr_offset,
+                        VReg_32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
+                           VReg_32:$buf_idx_vgpr,
+                           (S_LOAD_DWORDX4_IMM imm:$t_offset, SReg_64:$tlst_sgpr),
+                           0, 0, (i32 SREG_LIT_0))
+>;
+
+/* int_SI_load_const */
+/* Only offsets that match IMM8bit are handled here. */
+
+def : Pat <
+  (int_SI_load_const SReg_64:$const_ptr, IMM8bit:$offset),
+  (S_LOAD_DWORD_IMM imm:$offset, SReg_64:$const_ptr)
+>;
+
+
+/* XXX: Complete this pattern with some form of a scalar move immediate */
+/*
+def : Pat <
+  (int_SI_load_const SReg_64:$const_ptr, imm:$offset),
+  (S_LOAD_DWORD_SGPR imm:$offset, SReg_64:$const_ptr)
+>;
+*/
+
+/* int_SI_export */
+/* The operand order differs between the two sides: the intrinsic takes
+ * (en, vm, done, tgt, compr, src0..src3) while EXP takes
+ * (en, tgt, compr, done, vm, src0..src3). */
+def : Pat <
+  (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr,
+                 VReg_32:$src0,VReg_32:$src1, VReg_32:$src2, VReg_32:$src3),
+  (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm,
+       VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3)
+>;
+
+/* int_SI_sample */
+/* The resource is loaded with S_LOAD_DWORDX8_IMM and the sampler with
+ * S_LOAD_DWORDX4_IMM, each from its own 64-bit SGPR base plus an immediate
+ * offset. */
+def : Pat <
+  (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_64:$rsrc, imm:$rsrc_offset,
+                 SReg_64:$sampler, imm:$sampler_offset),
+  (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord,
+                (S_LOAD_DWORDX8_IMM imm:$rsrc_offset, SReg_64:$rsrc), /* Resource */
+                (S_LOAD_DWORDX4_IMM imm:$sampler_offset, SReg_64:$sampler)) /* Sampler */
+>;
+
+
+/* Extract element pattern */
+/* Maps vector_extract at constant index sub_idx onto EXTRACT_SUBREG of
+ * sub_reg. */
+class Extract_Element <ValueType sub_type, ValueType vec_type,
+                     RegisterClass vec_class, int sub_idx,
+                     SubRegIndex sub_reg>: Pat<
+  (sub_type (vector_extract (vec_type vec_class:$src), sub_idx)),
+  (EXTRACT_SUBREG vec_class:$src, sub_reg)
+>;
+
+def : Extract_Element <f32, v4f32, VReg_128, 0, sel_x>;
+def : Extract_Element <f32, v4f32, VReg_128, 1, sel_y>;
+def : Extract_Element <f32, v4f32, VReg_128, 2, sel_z>;
+def : Extract_Element <f32, v4f32, VReg_128, 3, sel_w>;
+
+/* Insert element pattern */
+/* Maps vector_insert at constant index sub_idx onto INSERT_SUBREG of
+ * sub_reg. */
+class Insert_Element <ValueType elem_type, ValueType vec_type,
+                      RegisterClass elem_class, RegisterClass vec_class,
+                      int sub_idx, SubRegIndex sub_reg> : Pat <
+
+  (vec_type (vector_insert (vec_type vec_class:$vec),
+                           (elem_type elem_class:$elem), sub_idx)),
+  (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg)
+>;
+
+/* NOTE(review): these instantiations match indices 4..7 for sel_x..sel_w,
+ * whereas Extract_Element above uses 0..3 -- confirm this is intended. */
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sel_x>;
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sel_y>;
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sel_z>;
+def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sel_w>;
+
+/*
+def : Pat<
+  (int_SI_vs_load_buffer_index),
+  (COPY_TO_REGCLASS (f32 VGPR0), VReg_32)
+>;
+*/
+
+/********** ====================== **********/
+/********** Interpolation Patterns **********/
+/********** ====================== **********/
+
+/* Linear-center interpolation: SI_INTERP is fed LINEAR_CENTER_I and
+ * LINEAR_CENTER_J as its $i and $j operands. */
+def : Pat <
+  (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params),
+  (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan,
+             imm:$attr, SReg_32:$params)
+>;
+
+/********** ================== **********/
+/********** Intrinsic Patterns **********/
+/********** ================== **********/
+
+/* llvm.AMDGPU.pow */
+/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */
+def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>;
+
+} // End isSI predicate
diff --git a/src/gallium/drivers/radeon/SIIntrinsics.td b/src/gallium/drivers/radeon/SIIntrinsics.td
new file mode 100644
index 00000000000..e3014e13916
--- /dev/null
+++ b/src/gallium/drivers/radeon/SIIntrinsics.td
@@ -0,0 +1,34 @@
+//===-- SIIntrinsics.td - SI Intrinsic Definitions -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +let TargetPrefix = "SI", isTarget = 1 in { + + def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>; + /* XXX: We may need a seperate intrinsic here for loading integer values */ + def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>; + def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], []>; + def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], []> ; + + def int_SI_sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty]>; + def int_SI_use_sgpr : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty], [IntrNoMem]>; + + + /* Interpolation Intrinsics */ + + def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>; + class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; + + def int_SI_fs_interp_linear_center : Interp; + def int_SI_fs_interp_constant : Interp; +} diff --git a/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp b/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp new file mode 100644 index 00000000000..5d49d88dc7c --- /dev/null +++ b/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp @@ -0,0 +1,90 @@ +//===-- SILowerShaderInstructions.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +#include "AMDGPU.h" +#include "AMDGPULowerShaderInstructions.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + class SILowerShaderInstructionsPass : public MachineFunctionPass, + public AMDGPULowerShaderInstructionsPass { + + private: + static char ID; + TargetMachine &TM; + + public: + SILowerShaderInstructionsPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { return "SI Lower Shader Instructions"; } + + void lowerRETURN(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + void lowerSET_M0(MachineInstr &MI, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + }; +} /* End anonymous namespace */ + +char SILowerShaderInstructionsPass::ID = 0; + +FunctionPass *llvm::createSILowerShaderInstructionsPass(TargetMachine &tm) { + return new SILowerShaderInstructionsPass(tm); +} + +bool SILowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF) +{ + MRI = &MF.getRegInfo(); + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next, Next = llvm::next(I) ) { + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + case AMDIL::RETURN: + lowerRETURN(MBB, I); + break; + case AMDIL::SET_M0: + lowerSET_M0(MI, MBB, I); + break; + default: continue; + } + MI.removeFromParent(); + } + } + + return false; +} + +void SILowerShaderInstructionsPass::lowerRETURN(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) +{ + const struct TargetInstrInfo * TII = 
TM.getInstrInfo(); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_ENDPGM)); +} + +void SILowerShaderInstructionsPass::lowerSET_M0(MachineInstr &MI, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I) +{ + const struct TargetInstrInfo * TII = TM.getInstrInfo(); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_MOV_IMM_I32)) + .addReg(AMDIL::M0) + .addOperand(MI.getOperand(1)); +} diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp new file mode 100644 index 00000000000..a69353af9a6 --- /dev/null +++ b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp @@ -0,0 +1,62 @@ +//===-- SIMachineFunctionInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +#include "SIMachineFunctionInfo.h" +#include "AMDGPU.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + + +SIMachineFunctionInfo::SIMachineFunctionInfo() + : AMDILMachineFunctionInfo(), + spi_ps_input_addr(0) + { } + +SIMachineFunctionInfo::SIMachineFunctionInfo(MachineFunction &MF) + : AMDILMachineFunctionInfo(MF), + spi_ps_input_addr(0) + { } + + +namespace { + class SIInitMachineFunctionInfoPass : public MachineFunctionPass { + + private: + static char ID; + TargetMachine &TM; + + public: + SIInitMachineFunctionInfoPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + virtual bool runOnMachineFunction(MachineFunction &MF); + }; +} // End anonymous namespace + +char SIInitMachineFunctionInfoPass::ID = 0; + +FunctionPass *llvm::createSIInitMachineFunctionInfoPass(TargetMachine &tm) { + return new SIInitMachineFunctionInfoPass(tm); 
+} + +/* A MachineFunction's MachineFunctionInfo is initialized in the first call to + * getInfo(). We need to initialize it as an SIMachineFunctionInfo object + * before any of the AMDIL passes otherwise it will be an + * AMDILMachineFunctionInfo object and we won't be able to use it. + */ +bool SIInitMachineFunctionInfoPass::runOnMachineFunction(MachineFunction &MF) +{ + SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); + return false; +} diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.h b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h new file mode 100644 index 00000000000..66feee9ef08 --- /dev/null +++ b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h @@ -0,0 +1,36 @@ +//===-- SIMachineFunctionInfo.h - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +#ifndef _SIMACHINEFUNCTIONINFO_H_ +#define _SIMACHINEFUNCTIONINFO_H_ + +#include "AMDILMachineFunctionInfo.h" + +namespace llvm { + +class SIMachineFunctionInfo : public AMDILMachineFunctionInfo { + + private: + + public: + SIMachineFunctionInfo(); + SIMachineFunctionInfo(MachineFunction &MF); + unsigned spi_ps_input_addr; + +}; + +} // End namespace llvm + + +#endif //_SIMACHINEFUNCTIONINFO_H_ diff --git a/src/gallium/drivers/radeon/SIPropagateImmReads.cpp b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp new file mode 100644 index 00000000000..e9b51b07046 --- /dev/null +++ b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp @@ -0,0 +1,71 @@ +//===-- SIPropagateImmReads.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of
Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUUtil.h" +#include "AMDILMachineFunctionInfo.h" +#include "SIInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +namespace { + class SIPropagateImmReadsPass : public MachineFunctionPass { + + private: + static char ID; + TargetMachine &TM; + + public: + SIPropagateImmReadsPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + }; +} /* End anonymous namespace */ + +char SIPropagateImmReadsPass::ID = 0; + +FunctionPass *llvm::createSIPropagateImmReadsPass(TargetMachine &tm) { + return new SIPropagateImmReadsPass(tm); +} + +bool SIPropagateImmReadsPass::runOnMachineFunction(MachineFunction &MF) +{ + AMDILMachineFunctionInfo * MFI = MF.getInfo<AMDILMachineFunctionInfo>(); + const SIInstrInfo * TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next, Next = llvm::next(I)) { + MachineInstr &MI = *I; + + switch (MI.getOpcode()) { + case AMDIL::LOADCONST_f32: + case AMDIL::LOADCONST_i32: + break; + default: + continue; + } + + /* XXX: Create and use S_MOV_IMM for SREGs */ + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::V_MOV_IMM)) + .addOperand(MI.getOperand(0)) + .addOperand(MI.getOperand(1)); + + MI.eraseFromParent(); + } + } +} diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.cpp b/src/gallium/drivers/radeon/SIRegisterInfo.cpp new file mode 100644 index 00000000000..da2ec36a773 --- /dev/null +++ 
b/src/gallium/drivers/radeon/SIRegisterInfo.cpp @@ -0,0 +1,66 @@ +//===-- SIRegisterInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +#include "SIRegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "AMDGPUUtil.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPURegisterInfo(tm, tii), + TM(tm), + TII(tii) + { } + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const +{ + BitVector Reserved(getNumRegs()); + return Reserved; +} + +unsigned SIRegisterInfo::getBinaryCode(unsigned reg) const +{ + switch (reg) { + case AMDIL::M0: return 124; + case AMDIL::SREG_LIT_0: return 128; + default: return getHWRegNum(reg); + } +} + +bool SIRegisterInfo::isBaseRegClass(unsigned regClassID) const +{ + switch (regClassID) { + default: return true; + case AMDIL::AllReg_32RegClassID: + case AMDIL::AllReg_64RegClassID: + return false; + } +} + +const TargetRegisterClass * +SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const +{ + switch (rc->getID()) { + case AMDIL::GPRF32RegClassID: + return &AMDIL::VReg_32RegClass; + case AMDIL::GPRV4F32RegClassID: + case AMDIL::GPRV4I32RegClassID: + return &AMDIL::VReg_128RegClass; + default: return rc; + } +} + +#include "SIRegisterGetHWRegNum.inc" diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.h b/src/gallium/drivers/radeon/SIRegisterInfo.h new file mode 100644 index 00000000000..c797e3c8ace --- /dev/null +++ b/src/gallium/drivers/radeon/SIRegisterInfo.h @@ -0,0 +1,46 @@ +//===-- SIRegisterInfo.h - TODO: Add brief description -------===// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +#ifndef SIREGISTERINFO_H_ +#define SIREGISTERINFO_H_ + +#include "AMDGPURegisterInfo.h" + +namespace llvm { + + class AMDGPUTargetMachine; + class TargetInstrInfo; + + struct SIRegisterInfo : public AMDGPURegisterInfo + { + AMDGPUTargetMachine &TM; + const TargetInstrInfo &TII; + + SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii); + + virtual BitVector getReservedRegs(const MachineFunction &MF) const; + virtual unsigned getBinaryCode(unsigned reg) const; + + virtual bool isBaseRegClass(unsigned regClassID) const; + + virtual const TargetRegisterClass * + getISARegClass(const TargetRegisterClass * rc) const; + + unsigned getHWRegNum(unsigned reg) const; + + }; + +} // End namespace llvm + +#endif // SIREGISTERINFO_H_ diff --git a/src/gallium/drivers/radeon/SISchedule.td b/src/gallium/drivers/radeon/SISchedule.td new file mode 100644 index 00000000000..9e99268e9ca --- /dev/null +++ b/src/gallium/drivers/radeon/SISchedule.td @@ -0,0 +1,15 @@ +//===-- SISchedule.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + + +def SI_Itin : ProcessorItineraries <[], [], []>; diff --git a/src/gallium/drivers/radeon/TargetInfo/AMDILTargetInfo.cpp b/src/gallium/drivers/radeon/TargetInfo/AMDILTargetInfo.cpp new file mode 100644 index 00000000000..5dee0cb7c05 --- /dev/null +++ b/src/gallium/drivers/radeon/TargetInfo/AMDILTargetInfo.cpp @@ -0,0 +1,32 @@ +//===-- TargetInfo/AMDILTargetInfo.cpp - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// TODO: Add full description +// +//===----------------------------------------------------------------------===// + +#include "AMDIL.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +/// The target for the AMDIL backend +Target llvm::TheAMDILTarget; + +/// The target for the AMDGPU backend +Target llvm::TheAMDGPUTarget; + +/// Extern function to initialize the targets for the AMDIL backend +extern "C" void LLVMInitializeAMDILTargetInfo() { + RegisterTarget<Triple::amdil, false> + IL(TheAMDILTarget, "amdil", "ATI graphics cards"); + + RegisterTarget<Triple::r600, false> + R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX"); +} diff --git a/src/gallium/drivers/radeon/loader.cpp b/src/gallium/drivers/radeon/loader.cpp new file mode 100644 index 00000000000..5b46cade602 --- /dev/null +++ b/src/gallium/drivers/radeon/loader.cpp @@ -0,0 +1,34 @@ + +#include "radeon_llvm.h" + +#include <llvm/Support/CommandLine.h> +#include <llvm/Support/IRReader.h> +#include <llvm/Support/SourceMgr.h> +#include <llvm/LLVMContext.h> +#include <llvm/Module.h> +#include <stdio.h> + +#include 
<llvm-c/Core.h> + +using namespace llvm; + +static cl::opt<std::string> +InputFilename(cl::Positional, cl::desc("<input bitcode>"), cl::init("-")); + + + +int main(int argc, char ** argv) +{ + unsigned char * bytes; + unsigned byte_count; + + std::auto_ptr<Module> M; + LLVMContext &Context = getGlobalContext(); + SMDiagnostic Err; + cl::ParseCommandLineOptions(argc, argv, "llvm system compiler\n"); + M.reset(ParseIRFile(InputFilename, Err, Context)); + + Module * mod = M.get(); + + radeon_llvm_compile(wrap(mod), &bytes, &byte_count, "SI", 1); +} diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h new file mode 100644 index 00000000000..14c9ecbb865 --- /dev/null +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -0,0 +1,136 @@ +/* + * Copyright 2011 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: Tom Stellard <[email protected]> + * + */ + +#ifndef LLVM_GPU_H +#define LLVM_GPU_H + +#include <llvm-c/Core.h> +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_tgsi.h" + +#define RADEON_LLVM_MAX_INPUTS 16 * 4 +#define RADEON_LLVM_MAX_OUTPUTS 16 * 4 +#define RADEON_LLVM_MAX_BRANCH_DEPTH 16 +#define RADEON_LLVM_MAX_LOOP_DEPTH 16 + +#ifdef __cplusplus +extern "C" { +#endif + +struct radeon_llvm_branch { + LLVMBasicBlockRef endif_block; + LLVMBasicBlockRef if_block; + LLVMBasicBlockRef else_block; + unsigned has_else; +}; + +struct radeon_llvm_loop { + LLVMBasicBlockRef loop_block; + LLVMBasicBlockRef endloop_block; +}; + +struct radeon_llvm_context { + + struct lp_build_tgsi_soa_context soa; + + /*=== Front end configuration ===*/ + + /* Special Intrinsics */ + + /** Write to an output register: float store_output(float, i32) */ + const char * store_output_intr; + + /** Swizzle a vector value: <4 x float> swizzle(<4 x float>, i32) + * The swizzle is an unsigned integer that encodes a TGSI_SWIZZLE_* value + * in 2-bits. + * Swizzle{0-1} = X Channel + * Swizzle{2-3} = Y Channel + * Swizzle{4-5} = Z Channel + * Swizzle{6-7} = W Channel + */ + const char * swizzle_intr; + + /* Instructions that are not described by any of the TGSI opcodes. */ + + /** This function is responsible for initializing the inputs array and will be + * called once for each input declared in the TGSI shader. + */ + void (*load_input)(struct radeon_llvm_context *, + unsigned input_index, + const struct tgsi_full_declaration *decl); + + + /** User data to use with the callbacks */ + void * userdata; + + /** This array contains the input values for the shader. Typically these + * values will be in the form of a target intrinsic that will inform the + * backend how to load the actual inputs to the shader.
+ */ + LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; + LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; + unsigned output_reg_count; + + unsigned reserved_reg_count; + /*=== Private Members ===*/ + + struct radeon_llvm_branch branch[RADEON_LLVM_MAX_BRANCH_DEPTH]; + struct radeon_llvm_loop loop[RADEON_LLVM_MAX_LOOP_DEPTH]; + + unsigned branch_depth; + unsigned loop_depth; + + + LLVMValueRef main_fn; + + struct gallivm_state gallivm; +}; + +unsigned radeon_llvm_compile( + LLVMModuleRef M, + unsigned char ** bytes, + unsigned * byte_count, + const char * gpu_family, + unsigned dump); + +void radeon_llvm_context_init(struct radeon_llvm_context * ctx); + +void radeon_llvm_dispose(struct radeon_llvm_context * ctx); + +inline static struct radeon_llvm_context * radeon_llvm_context( + struct lp_build_tgsi_context * bld_base) +{ + return (struct radeon_llvm_context*)bld_base; +} + +unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan); + +void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx); + +#ifdef __cplusplus +} +#endif +#endif /* LLVM_GPU_H */ diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.cpp b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp new file mode 100644 index 00000000000..1bc6a15610a --- /dev/null +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp @@ -0,0 +1,145 @@ +/* + * Copyright 2011 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Authors: Tom Stellard <[email protected]> + * + */ +#include "radeon_llvm.h" + +#include <llvm/LLVMContext.h> +#include <llvm/Module.h> +#include <llvm/PassManager.h> +#include <llvm/ADT/Triple.h> +#include <llvm/Support/FormattedStream.h> +#include <llvm/Support/Host.h> +#include <llvm/Support/IRReader.h> +#include <llvm/Support/SourceMgr.h> +#include <llvm/Support/TargetRegistry.h> +#include <llvm/Support/TargetSelect.h> +#include <llvm/Target/TargetData.h> +#include <llvm/Target/TargetMachine.h> + +#include <llvm/Transforms/Scalar.h> + +#include <llvm-c/Target.h> + +#include <iostream> +#include <stdlib.h> +#include <stdio.h> + +using namespace llvm; + +#ifndef EXTERNAL_LLVM +extern "C" { + +void LLVMInitializeAMDILTargetMC(void); +void LLVMInitializeAMDILTarget(void); +void LLVMInitializeAMDILTargetInfo(void); +} +#endif + +/** + * Compile an LLVM module to machine code. + * + * @param bytes This function allocates memory for the byte stream, it is the + * caller's responsibility to free it. + */ +extern "C" unsigned +radeon_llvm_compile(LLVMModuleRef M, unsigned char ** bytes, + unsigned * byte_count, const char * gpu_family, + unsigned dump) { + +#if HAVE_LLVM > 0x0300 + Triple AMDGPUTriple(sys::getDefaultTargetTriple()); +#else + Triple AMDGPUTriple(sys::getHostTriple()); +#endif + + +#ifdef EXTERNAL_LLVM + /* XXX: Can we just initialize the AMDGPU target here? 
*/ + InitializeAllTargets(); + InitializeAllTargetMCs(); +#else + LLVMInitializeAMDILTargetInfo(); + LLVMInitializeAMDILTarget(); + LLVMInitializeAMDILTargetMC(); +#endif + std::string err; + const Target * AMDGPUTarget = TargetRegistry::lookupTarget("r600", err); + fprintf(stderr, "%s\n", err.c_str()); + if(!AMDGPUTarget) { + fprintf(stderr, "Can't find target\n"); + return 1; + } + + Triple::ArchType Arch = Triple::getArchTypeForLLVMName("r600"); + if (Arch == Triple::UnknownArch) { + fprintf(stderr, "Unknown Arch\n"); + } + AMDGPUTriple.setArch(Arch); + + Module * mod = unwrap(M); + std::string FS = gpu_family; +#if HAVE_LLVM > 0x0300 + TargetOptions TO; +#endif + + std::auto_ptr<TargetMachine> tm(AMDGPUTarget->createTargetMachine( + AMDGPUTriple.getTriple(), gpu_family, "" /* Features */, + TO, Reloc::Default, CodeModel::Default, + CodeGenOpt::Default + )); + TargetMachine &AMDGPUTargetMachine = *tm.get(); + /* XXX: Use TargetMachine.Options in 3.0 */ + if (dump) { + mod->dump(); + } + PassManager PM; + PM.add(new TargetData(*AMDGPUTargetMachine.getTargetData())); + PM.add(createPromoteMemoryToRegisterPass()); + AMDGPUTargetMachine.setAsmVerbosityDefault(true); + + std::string CodeString; + raw_string_ostream oStream(CodeString); + formatted_raw_ostream out(oStream); + + /* Optional extra parameter true / false to disable verify */ + if (AMDGPUTargetMachine.addPassesToEmitFile(PM, out, TargetMachine::CGFT_AssemblyFile, +#if HAVE_LLVM <= 0x300 + CodeGenOpt::Default, +#endif + true)){ + fprintf(stderr, "AddingPasses failed.\n"); + return 1; + } + PM.run(*mod); + + out.flush(); + std::string &data = oStream.str(); + + *bytes = (unsigned char*)malloc(data.length() * sizeof(unsigned char)); + memcpy(*bytes, data.c_str(), data.length() * sizeof(unsigned char)); + *byte_count = data.length(); + + return 0; +} diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c new file mode 100644 index
00000000000..62de9da28de --- /dev/null +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -0,0 +1,660 @@ +/* + * Copyright 2011 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Authors: Tom Stellard <[email protected]> + * + */ +#include "radeon_llvm.h" + +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_gather.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_swizzle.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_parse.h" +#include "util/u_math.h" +#include "util/u_debug.h" + +#include <llvm-c/Transforms/Scalar.h> + +static struct radeon_llvm_loop * get_current_loop(struct radeon_llvm_context * ctx) +{ + return ctx->loop_depth > 0 ? 
ctx->loop + (ctx->loop_depth - 1) : NULL; +} + +static struct radeon_llvm_branch * get_current_branch( + struct radeon_llvm_context * ctx) +{ + return ctx->branch_depth > 0 ? + ctx->branch + (ctx->branch_depth - 1) : NULL; +} + +unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan) +{ + return (index * 4) + chan; +} + +static void radeon_llvm_fetch_args_2_reverse_soa( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + assert(emit_data->info->num_src == 2); + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, + 1, emit_data->chan); + emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, + 0, emit_data->chan); + emit_data->arg_count = 2; + emit_data->dst_type = LLVMTypeOf(emit_data->args[0]); +} + +static LLVMValueRef emit_swizzle( + struct lp_build_tgsi_context * bld_base, + LLVMValueRef value, + unsigned swizzle_x, + unsigned swizzle_y, + unsigned swizzle_z, + unsigned swizzle_w) +{ + unsigned char swizzles[4]; + swizzles[0] = swizzle_x; + swizzles[1] = swizzle_y; + swizzles[2] = swizzle_z; + swizzles[3] = swizzle_w; + + + return lp_build_swizzle_aos(&bld_base->base, value, swizzles); +} + +static LLVMValueRef +emit_array_index( + struct lp_build_tgsi_soa_context *bld, + const struct tgsi_full_src_register *reg, + unsigned swizzle) +{ + struct gallivm_state * gallivm = bld->bld_base.base.gallivm; + + LLVMValueRef addr = LLVMBuildLoad(gallivm->builder, + bld->addr[reg->Indirect.Index][swizzle], ""); + LLVMValueRef offset = lp_build_const_int32(gallivm, reg->Register.Index); + LLVMValueRef hw_index = LLVMBuildAdd(gallivm->builder, addr, offset, ""); + LLVMValueRef soa_index = LLVMBuildMul(gallivm->builder, hw_index, + lp_build_const_int32(gallivm, 4), ""); + LLVMValueRef array_index = LLVMBuildAdd(gallivm->builder, soa_index, + lp_build_const_int32(gallivm, swizzle), ""); + + return array_index; +} + +static LLVMValueRef +emit_fetch_immediate( + struct lp_build_tgsi_context *bld_base, + 
const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + return bld->immediates[reg->Register.Index][swizzle]; +} + +static LLVMValueRef +emit_fetch_input( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + if (swizzle == ~0) { + LLVMValueRef values[TGSI_NUM_CHANNELS] = {}; + unsigned chan; + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + values[chan] = ctx->inputs[radeon_llvm_reg_index_soa( + reg->Register.Index, chan)]; + } + return lp_build_gather_values(bld_base->base.gallivm, values, + TGSI_NUM_CHANNELS); + } else { + return ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]; + } +} + +static LLVMValueRef +emit_fetch_temporary( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + if (reg->Register.Indirect) { + LLVMValueRef array_index = emit_array_index(bld, reg, swizzle); + LLVMValueRef ptr = LLVMBuildGEP(builder, bld->temps_array, &array_index, + 1, ""); + return LLVMBuildLoad(builder, ptr, ""); + } else { + LLVMValueRef temp_ptr; + temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle); + return LLVMBuildLoad(builder, temp_ptr, ""); + } +} + +static LLVMValueRef +emit_fetch_output( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + LLVMBuilderRef builder = bld_base->base.gallivm->builder; + if (reg->Register.Indirect) { + LLVMValueRef array_index = emit_array_index(bld, reg, swizzle); + 
LLVMValueRef ptr = LLVMBuildGEP(builder, bld->outputs_array, &array_index, + 1, ""); + return LLVMBuildLoad(builder, ptr, ""); + } else { + LLVMValueRef temp_ptr; + temp_ptr = lp_get_output_ptr(bld, reg->Register.Index, swizzle); + return LLVMBuildLoad(builder, temp_ptr, ""); + } +} + +static void emit_declaration( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_declaration *decl) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + switch(decl->Declaration.File) { + case TGSI_FILE_ADDRESS: + { + unsigned idx; + for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { + unsigned chan; + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + ctx->soa.addr[idx][chan] = lp_build_alloca( + &ctx->gallivm, + ctx->soa.bld_base.uint_bld.elem_type, ""); + } + } + break; + } + + case TGSI_FILE_TEMPORARY: + lp_emit_declaration_soa(bld_base, decl); + break; + + case TGSI_FILE_INPUT: + { + unsigned idx; + for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { + ctx->load_input(ctx, idx, decl); + } + } + break; + + case TGSI_FILE_OUTPUT: + { + unsigned idx; + for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) { + unsigned chan; + assert(idx < RADEON_LLVM_MAX_OUTPUTS); + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + ctx->soa.outputs[idx][chan] = lp_build_alloca(&ctx->gallivm, + ctx->soa.bld_base.base.elem_type, ""); + } + } + + ctx->output_reg_count = MAX2(ctx->output_reg_count, + decl->Range.Last + 1); + break; + } + + default: + break; + } +} + +static void +emit_store( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_instruction * inst, + const struct tgsi_opcode_info * info, + LLVMValueRef dst[4]) +{ + struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); + struct gallivm_state *gallivm = bld->bld_base.base.gallivm; + struct lp_build_context base = bld->bld_base.base; + const struct tgsi_full_dst_register *reg = &inst->Dst[0]; + LLVMBuilderRef builder = 
bld->bld_base.base.gallivm->builder; + LLVMValueRef temp_ptr; + unsigned chan, chan_index; + boolean is_vec_store = FALSE; + if (dst[0]) { + LLVMTypeKind k = LLVMGetTypeKind(LLVMTypeOf(dst[0])); + is_vec_store = (k == LLVMVectorTypeKind); + } + + if (is_vec_store) { + LLVMValueRef values[4] = {}; + TGSI_FOR_EACH_DST0_ENABLED_CHANNEL(inst, chan) { + LLVMValueRef index = lp_build_const_int32(gallivm, chan); + values[chan] = LLVMBuildExtractElement(gallivm->builder, + dst[0], index, ""); + } + bld_base->emit_store(bld_base, inst, info, values); + return; + } + + TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) { + LLVMValueRef value = dst[chan_index]; + + if (inst->Instruction.Saturate != TGSI_SAT_NONE) { + struct lp_build_emit_data clamp_emit_data; + + memset(&clamp_emit_data, 0, sizeof(clamp_emit_data)); + clamp_emit_data.arg_count = 3; + clamp_emit_data.args[0] = value; + clamp_emit_data.args[2] = base.one; + + switch(inst->Instruction.Saturate) { + case TGSI_SAT_ZERO_ONE: + clamp_emit_data.args[1] = base.zero; + break; + case TGSI_SAT_MINUS_PLUS_ONE: + clamp_emit_data.args[1] = LLVMConstReal( + base.elem_type, -1.0f); + break; + default: + assert(0); + } + value = lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP, + &clamp_emit_data); + } + + switch(reg->Register.File) { + case TGSI_FILE_OUTPUT: + temp_ptr = bld->outputs[reg->Register.Index][chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, chan_index); + break; + + default: + return; + } + LLVMBuildStore(builder, value, temp_ptr); + } +} + +static void bgnloop_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + LLVMBasicBlockRef loop_block; + LLVMBasicBlockRef endloop_block; + endloop_block = 
LLVMAppendBasicBlockInContext(gallivm->context, + ctx->main_fn, "ENDLOOP"); + loop_block = LLVMInsertBasicBlockInContext(gallivm->context, + endloop_block, "LOOP"); + LLVMBuildBr(gallivm->builder, loop_block); + LLVMPositionBuilderAtEnd(gallivm->builder, loop_block); + ctx->loop_depth++; + ctx->loop[ctx->loop_depth - 1].loop_block = loop_block; + ctx->loop[ctx->loop_depth - 1].endloop_block = endloop_block; +} + +static void brk_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + struct radeon_llvm_loop * current_loop = get_current_loop(ctx); + + LLVMBuildBr(gallivm->builder, current_loop->endloop_block); +} + +static void cont_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + struct radeon_llvm_loop * current_loop = get_current_loop(ctx); + + LLVMBuildBr(gallivm->builder, current_loop->loop_block); +} + +static void else_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + struct radeon_llvm_branch * current_branch = get_current_branch(ctx); + LLVMBasicBlockRef current_block = LLVMGetInsertBlock(gallivm->builder); + + /* We need to add a terminator to the current block if the previous + * instruction was an ENDIF.Example: + * IF + * [code] + * IF + * [code] + * ELSE + * [code] + * ENDIF <-- + * ELSE<-- + * [code] + * ENDIF + */ + + if (current_block != current_branch->if_block) { + LLVMBuildBr(gallivm->builder, 
current_branch->endif_block); + } + if (!LLVMGetBasicBlockTerminator(current_branch->if_block)) { + LLVMBuildBr(gallivm->builder, current_branch->endif_block); + } + current_branch->has_else = 1; + LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->else_block); +} + +static void endif_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + struct radeon_llvm_branch * current_branch = get_current_branch(ctx); + LLVMBasicBlockRef current_block = LLVMGetInsertBlock(gallivm->builder); + + /* If we have consecutive ENDIF instructions, then the first ENDIF + * will not have a terminator, so we need to add one. */ + if (current_block != current_branch->if_block + && current_block != current_branch->else_block + && !LLVMGetBasicBlockTerminator(current_block)) { + + LLVMBuildBr(gallivm->builder, current_branch->endif_block); + } + if (!LLVMGetBasicBlockTerminator(current_branch->else_block)) { + LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->else_block); + LLVMBuildBr(gallivm->builder, current_branch->endif_block); + } + + if (!LLVMGetBasicBlockTerminator(current_branch->if_block)) { + LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->if_block); + LLVMBuildBr(gallivm->builder, current_branch->endif_block); + } + + LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->endif_block); + ctx->branch_depth--; +} + +static void endloop_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + struct radeon_llvm_loop * current_loop = get_current_loop(ctx); + + if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(gallivm->builder))) { + 
LLVMBuildBr(gallivm->builder, current_loop->loop_block); + } + + LLVMPositionBuilderAtEnd(gallivm->builder, current_loop->endloop_block); + ctx->loop_depth--; +} + +static void if_emit( + const struct lp_build_tgsi_action * action, + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); + struct gallivm_state * gallivm = bld_base->base.gallivm; + LLVMValueRef cond; + LLVMBasicBlockRef if_block, else_block, endif_block; + cond = LLVMBuildFCmp(gallivm->builder, LLVMRealOEQ, emit_data->args[0], + bld_base->base.one, ""); + + endif_block = LLVMAppendBasicBlockInContext(gallivm->context, + ctx->main_fn, "ENDIF"); + if_block = LLVMInsertBasicBlockInContext(gallivm->context, + endif_block, "IF"); + else_block = LLVMInsertBasicBlockInContext(gallivm->context, + endif_block, "ELSE"); + LLVMBuildCondBr(gallivm->builder, cond, if_block, else_block); + LLVMPositionBuilderAtEnd(gallivm->builder, if_block); + + ctx->branch_depth++; + ctx->branch[ctx->branch_depth - 1].endif_block = endif_block; + ctx->branch[ctx->branch_depth - 1].if_block = if_block; + ctx->branch[ctx->branch_depth - 1].else_block = else_block; + ctx->branch[ctx->branch_depth - 1].has_else = 0; +} + +static void tex_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + /* XXX: lp_build_swizzle_aos() was failing with wrong arg types, + * when we used CHAN_ALL. 
We should be able to get this to work, + * but for now we will swizzle it ourselves + emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst, + 0, CHAN_ALL); + + */ + + LLVMValueRef coords[4]; + unsigned chan; + for (chan = 0; chan < 4; chan++) { + coords[chan] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, chan); + } + + emit_data->arg_count = 1; + emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, + coords, 4); + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); +} + +void radeon_llvm_context_init(struct radeon_llvm_context * ctx) +{ + struct lp_type type; + LLVMTypeRef main_fn_type; + LLVMBasicBlockRef main_fn_body; + + /* Initialize the gallivm object: + * We are only using the module, context, and builder fields of this struct. + * This should be enough for us to be able to pass our gallivm struct to the + * helper functions in the gallivm module. + */ + memset(&ctx->gallivm, 0, sizeof (ctx->gallivm)); + memset(&ctx->soa, 0, sizeof(ctx->soa)); + ctx->gallivm.context = LLVMContextCreate(); + ctx->gallivm.module = LLVMModuleCreateWithNameInContext("tgsi", + ctx->gallivm.context); + ctx->gallivm.builder = LLVMCreateBuilderInContext(ctx->gallivm.context); + + /* Setup the module */ + main_fn_type = LLVMFunctionType(LLVMVoidTypeInContext(ctx->gallivm.context), + NULL, 0, 0); + ctx->main_fn = LLVMAddFunction(ctx->gallivm.module, "main", main_fn_type); + main_fn_body = LLVMAppendBasicBlockInContext(ctx->gallivm.context, + ctx->main_fn, "main_body"); + LLVMPositionBuilderAtEnd(ctx->gallivm.builder, main_fn_body); + + ctx->store_output_intr = "llvm.AMDGPU.store.output."; + ctx->swizzle_intr = "llvm.AMDGPU.swizzle"; + struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base; + + /* XXX: We need to revisit this.I think the correct way to do this is + * to use length = 4 here and use the elem_bld for everything. 
*/ + type.floating = TRUE; + type.sign = TRUE; + type.width = 32; + type.length = 1; + + lp_build_context_init(&bld_base->base, &ctx->gallivm, type); + lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type)); + + bld_base->soa = 1; + bld_base->emit_store = emit_store; + bld_base->emit_swizzle = emit_swizzle; + bld_base->emit_declaration = emit_declaration; + bld_base->emit_immediate = lp_emit_immediate_soa; + + bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate; + bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input; + bld_base->emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch_temporary; + bld_base->emit_fetch_funcs[TGSI_FILE_OUTPUT] = emit_fetch_output; + + /* Allocate outputs */ + ctx->soa.outputs = ctx->outputs; + + /* XXX: Is there a better way to initialize all this ? */ + + lp_set_default_actions(bld_base); + + bld_base->op_actions[TGSI_OPCODE_ABS].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.AMDIL.fabs."; + bld_base->op_actions[TGSI_OPCODE_ARL].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_ARL].intr_name = "llvm.AMDGPU.arl"; + bld_base->op_actions[TGSI_OPCODE_BGNLOOP].emit = bgnloop_emit; + bld_base->op_actions[TGSI_OPCODE_BRK].emit = brk_emit; + bld_base->op_actions[TGSI_OPCODE_CONT].emit = cont_emit; + bld_base->op_actions[TGSI_OPCODE_CLAMP].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_CLAMP].intr_name = "llvm.AMDIL.clamp."; + bld_base->op_actions[TGSI_OPCODE_CMP].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt"; + bld_base->op_actions[TGSI_OPCODE_COS].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.AMDGPU.cos"; + bld_base->op_actions[TGSI_OPCODE_DDX].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx"; + bld_base->op_actions[TGSI_OPCODE_DDY].emit = 
lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy"; + bld_base->op_actions[TGSI_OPCODE_DIV].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_DIV].intr_name = "llvm.AMDGPU.div"; + bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit; + bld_base->op_actions[TGSI_OPCODE_ENDIF].emit = endif_emit; + bld_base->op_actions[TGSI_OPCODE_ENDLOOP].emit = endloop_emit; + bld_base->op_actions[TGSI_OPCODE_EX2].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_EX2].intr_name = "llvm.AMDIL.exp."; + bld_base->op_actions[TGSI_OPCODE_FLR].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_FLR].intr_name = "llvm.AMDGPU.floor"; + bld_base->op_actions[TGSI_OPCODE_FRC].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction."; + bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit; + bld_base->op_actions[TGSI_OPCODE_KIL].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_KIL].intr_name = "llvm.AMDGPU.kill"; + bld_base->op_actions[TGSI_OPCODE_KILP].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_KILP].intr_name = "llvm.AMDGPU.kilp"; + bld_base->op_actions[TGSI_OPCODE_LG2].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_LG2].intr_name = "llvm.AMDIL.log."; + bld_base->op_actions[TGSI_OPCODE_LRP].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_LRP].intr_name = "llvm.AMDGPU.lrp"; + bld_base->op_actions[TGSI_OPCODE_MIN].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.AMDIL.min."; + bld_base->op_actions[TGSI_OPCODE_MAD].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_MAD].intr_name = "llvm.AMDIL.mad."; + bld_base->op_actions[TGSI_OPCODE_MAX].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.AMDIL.max."; + bld_base->op_actions[TGSI_OPCODE_MUL].emit = lp_build_tgsi_intrinsic; + 
bld_base->op_actions[TGSI_OPCODE_MUL].intr_name = "llvm.AMDGPU.mul"; + bld_base->op_actions[TGSI_OPCODE_POW].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_POW].intr_name = "llvm.AMDGPU.pow"; + bld_base->op_actions[TGSI_OPCODE_RCP].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_RCP].intr_name = "llvm.AMDGPU.rcp"; + bld_base->op_actions[TGSI_OPCODE_SSG].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SSG].intr_name = "llvm.AMDGPU.ssg"; + bld_base->op_actions[TGSI_OPCODE_SGE].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SGE].intr_name = "llvm.AMDGPU.sge."; + bld_base->op_actions[TGSI_OPCODE_SEQ].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SEQ].intr_name = "llvm.AMDGPU.seq"; + bld_base->op_actions[TGSI_OPCODE_SLE].fetch_args = radeon_llvm_fetch_args_2_reverse_soa; + bld_base->op_actions[TGSI_OPCODE_SLE].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SLE].intr_name = "llvm.AMDGPU.sge"; + bld_base->op_actions[TGSI_OPCODE_SLT].fetch_args = radeon_llvm_fetch_args_2_reverse_soa; + bld_base->op_actions[TGSI_OPCODE_SLT].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SLT].intr_name = "llvm.AMDGPU.sgt"; + bld_base->op_actions[TGSI_OPCODE_SNE].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SNE].intr_name = "llvm.AMDGPU.sne"; + bld_base->op_actions[TGSI_OPCODE_SGT].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SGT].intr_name = "llvm.AMDGPU.sgt"; + bld_base->op_actions[TGSI_OPCODE_SIN].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_SIN].intr_name = "llvm.AMDGPU.sin"; + bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex"; + bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb"; + 
bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd"; + bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args; + bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl"; + bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex"; + bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = lp_build_tgsi_intrinsic; + bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc"; + + bld_base->rsq_action.emit = lp_build_tgsi_intrinsic; + bld_base->rsq_action.intr_name = "llvm.AMDGPU.rsq"; +} + +void radeon_llvm_finalize_module(struct radeon_llvm_context * ctx) +{ + struct gallivm_state * gallivm = ctx->soa.bld_base.base.gallivm; + /* End the main function with Return*/ + LLVMBuildRetVoid(gallivm->builder); + + /* Create the pass manager */ + ctx->gallivm.passmgr = LLVMCreateFunctionPassManagerForModule( + gallivm->module); + + /* This pass should eliminate all the load and store instructions */ + LLVMAddPromoteMemoryToRegisterPass(gallivm->passmgr); + + /* Add some optimization passes */ + LLVMAddScalarReplAggregatesPass(gallivm->passmgr); + LLVMAddCFGSimplificationPass(gallivm->passmgr); + + /* Run the passs */ + LLVMRunFunctionPassManager(gallivm->passmgr, ctx->main_fn); + + LLVMDisposeBuilder(gallivm->builder); + LLVMDisposePassManager(gallivm->passmgr); + +} + +void radeon_llvm_dispose(struct radeon_llvm_context * ctx) +{ + LLVMDisposeModule(ctx->soa.bld_base.base.gallivm->module); + LLVMContextDispose(ctx->soa.bld_base.base.gallivm->context); +} diff --git a/src/gallium/drivers/radeonsi/Android.mk b/src/gallium/drivers/radeonsi/Android.mk new file mode 100644 index 00000000000..f7e01a3f658 --- /dev/null +++ b/src/gallium/drivers/radeonsi/Android.mk @@ -0,0 +1,38 @@ +# Mesa 3-D graphics library +# +# Copyright (C) 2010-2011 Chia-I Wu <[email protected]> +# Copyright (C) 2010-2011 LunarG Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +LOCAL_PATH := $(call my-dir) + +# get C_SOURCES +include $(LOCAL_PATH)/Makefile.sources + +include $(CLEAR_VARS) + +LOCAL_SRC_FILES := $(C_SOURCES) + +LOCAL_C_INCLUDES := + +LOCAL_MODULE := libmesa_pipe_radeonsi + +include $(GALLIUM_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) diff --git a/src/gallium/drivers/radeonsi/Makefile b/src/gallium/drivers/radeonsi/Makefile new file mode 100644 index 00000000000..90f6f472730 --- /dev/null +++ b/src/gallium/drivers/radeonsi/Makefile @@ -0,0 +1,24 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = radeonsi + +LIBRARY_INCLUDES = \ + -I$(TOP)/include \ + -I$(TOP)/src/gallium/drivers/radeon/ + + +# get C_SOURCES +include Makefile.sources + +LIBRADEON = $(TOP)/src/gallium/drivers/radeon/libradeon.a + +EXTRA_OBJECTS = \ + $(LIBRADEON) + +CFLAGS+=$(RADEON_CFLAGS) + +include ../../Makefile.template + +# FIXME: Remove when this driver is converted to automake. 
+all: default diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources new file mode 100644 index 00000000000..394cfe93e07 --- /dev/null +++ b/src/gallium/drivers/radeonsi/Makefile.sources @@ -0,0 +1,13 @@ +C_SOURCES := \ + r600_blit.c \ + r600_buffer.c \ + r600_hw_context.c \ + radeonsi_pipe.c \ + r600_query.c \ + r600_resource.c \ + radeonsi_shader.c \ + r600_texture.c \ + evergreen_hw_context.c \ + evergreen_state.c \ + r600_translate.c \ + r600_state_common.c diff --git a/src/gallium/drivers/radeonsi/SConscript b/src/gallium/drivers/radeonsi/SConscript new file mode 100644 index 00000000000..f2d2bec6e42 --- /dev/null +++ b/src/gallium/drivers/radeonsi/SConscript @@ -0,0 +1,17 @@ +Import('*') + +env = env.Clone() + +env.Append(CPPPATH = [ + '#/include', + '#/src/gallium/drivers/radeon', +]) + +radeonsi = env.ConvenienceLibrary( + target = 'radeonsi', + source = env.ParseSourceList('Makefile.sources', 'C_SOURCES') + ) + +env.Alias('radeonsi', radeonsi) + +Export('radeonsi') diff --git a/src/gallium/drivers/radeonsi/evergreen_hw_context.c b/src/gallium/drivers/radeonsi/evergreen_hw_context.c new file mode 100644 index 00000000000..549673f4a0b --- /dev/null +++ b/src/gallium/drivers/radeonsi/evergreen_hw_context.c @@ -0,0 +1,561 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jerome Glisse + */ +#include "r600.h" +#include "r600_hw_context_priv.h" +#include "radeonsi_pipe.h" +#include "sid.h" +#include "util/u_memory.h" +#include <errno.h> + +#define GROUP_FORCE_NEW_BLOCK 0 + +static const struct r600_reg si_config_reg_list[] = { + {R_0088B0_VGT_VTX_VECT_EJECT_REG, REG_FLAG_FLUSH_CHANGE}, + {R_0088C8_VGT_ESGS_RING_SIZE, REG_FLAG_FLUSH_CHANGE}, + {R_0088CC_VGT_GSVS_RING_SIZE, REG_FLAG_FLUSH_CHANGE}, + {R_008958_VGT_PRIMITIVE_TYPE, 0}, + {R_008A14_PA_CL_ENHANCE, REG_FLAG_FLUSH_CHANGE}, + {R_009100_SPI_CONFIG_CNTL, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE}, + {R_00913C_SPI_CONFIG_CNTL_1, REG_FLAG_ENABLE_ALWAYS | REG_FLAG_FLUSH_CHANGE}, +}; + +static const struct r600_reg si_context_reg_list[] = { + {R_028000_DB_RENDER_CONTROL, 0}, + {R_028004_DB_COUNT_CONTROL, 0}, + {R_028008_DB_DEPTH_VIEW, 0}, + {R_02800C_DB_RENDER_OVERRIDE, 0}, + {R_028010_DB_RENDER_OVERRIDE2, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028014_DB_HTILE_DATA_BASE, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028020_DB_DEPTH_BOUNDS_MIN, 0}, + {R_028024_DB_DEPTH_BOUNDS_MAX, 0}, + {R_028028_DB_STENCIL_CLEAR, 0}, + {R_02802C_DB_DEPTH_CLEAR, 0}, + {R_028030_PA_SC_SCREEN_SCISSOR_TL, 0}, + {R_028034_PA_SC_SCREEN_SCISSOR_BR, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_02803C_DB_DEPTH_INFO, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028040_DB_Z_INFO, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028044_DB_STENCIL_INFO, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + 
{R_028048_DB_Z_READ_BASE, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_02804C_DB_STENCIL_READ_BASE, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028050_DB_Z_WRITE_BASE, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028054_DB_STENCIL_WRITE_BASE, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028058_DB_DEPTH_SIZE, 0}, + {R_02805C_DB_DEPTH_SLICE, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028080_TA_BC_BASE_ADDR, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028200_PA_SC_WINDOW_OFFSET, 0}, + {R_028204_PA_SC_WINDOW_SCISSOR_TL, 0}, + {R_028208_PA_SC_WINDOW_SCISSOR_BR, 0}, + {R_02820C_PA_SC_CLIPRECT_RULE, 0}, + {R_028210_PA_SC_CLIPRECT_0_TL, 0}, + {R_028214_PA_SC_CLIPRECT_0_BR, 0}, + {R_028218_PA_SC_CLIPRECT_1_TL, 0}, + {R_02821C_PA_SC_CLIPRECT_1_BR, 0}, + {R_028220_PA_SC_CLIPRECT_2_TL, 0}, + {R_028224_PA_SC_CLIPRECT_2_BR, 0}, + {R_028228_PA_SC_CLIPRECT_3_TL, 0}, + {R_02822C_PA_SC_CLIPRECT_3_BR, 0}, + {R_028230_PA_SC_EDGERULE, 0}, + {R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0}, + {R_028238_CB_TARGET_MASK, 0}, + {R_02823C_CB_SHADER_MASK, 0}, + {R_028240_PA_SC_GENERIC_SCISSOR_TL, 0}, + {R_028244_PA_SC_GENERIC_SCISSOR_BR, 0}, + {R_028250_PA_SC_VPORT_SCISSOR_0_TL, 0}, + {R_028254_PA_SC_VPORT_SCISSOR_0_BR, 0}, + {R_0282D0_PA_SC_VPORT_ZMIN_0, 0}, + {R_0282D4_PA_SC_VPORT_ZMAX_0, 0}, + {R_028350_PA_SC_RASTER_CONFIG, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028400_VGT_MAX_VTX_INDX, 0}, + {R_028404_VGT_MIN_VTX_INDX, 0}, + {R_028408_VGT_INDX_OFFSET, 0}, + {R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, 0}, + {R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028414_CB_BLEND_RED, 0}, + {R_028418_CB_BLEND_GREEN, 0}, + {R_02841C_CB_BLEND_BLUE, 0}, + {R_028420_CB_BLEND_ALPHA, 0}, + {R_028430_DB_STENCILREFMASK, 0}, + {R_028434_DB_STENCILREFMASK_BF, 0}, + {R_02843C_PA_CL_VPORT_XSCALE_0, 0}, + {R_028440_PA_CL_VPORT_XOFFSET_0, 0}, + {R_028444_PA_CL_VPORT_YSCALE_0, 0}, + {R_028448_PA_CL_VPORT_YOFFSET_0, 0}, + {R_02844C_PA_CL_VPORT_ZSCALE_0, 
0}, + {R_028450_PA_CL_VPORT_ZOFFSET_0, 0}, + {R_0285BC_PA_CL_UCP_0_X, 0}, + {R_0285C0_PA_CL_UCP_0_Y, 0}, + {R_0285C4_PA_CL_UCP_0_Z, 0}, + {R_0285C8_PA_CL_UCP_0_W, 0}, + {R_0285CC_PA_CL_UCP_1_X, 0}, + {R_0285D0_PA_CL_UCP_1_Y, 0}, + {R_0285D4_PA_CL_UCP_1_Z, 0}, + {R_0285D8_PA_CL_UCP_1_W, 0}, + {R_0285DC_PA_CL_UCP_2_X, 0}, + {R_0285E0_PA_CL_UCP_2_Y, 0}, + {R_0285E4_PA_CL_UCP_2_Z, 0}, + {R_0285E8_PA_CL_UCP_2_W, 0}, + {R_0285EC_PA_CL_UCP_3_X, 0}, + {R_0285F0_PA_CL_UCP_3_Y, 0}, + {R_0285F4_PA_CL_UCP_3_Z, 0}, + {R_0285F8_PA_CL_UCP_3_W, 0}, + {R_0285FC_PA_CL_UCP_4_X, 0}, + {R_028600_PA_CL_UCP_4_Y, 0}, + {R_028604_PA_CL_UCP_4_Z, 0}, + {R_028608_PA_CL_UCP_4_W, 0}, + {R_02860C_PA_CL_UCP_5_X, 0}, + {R_028610_PA_CL_UCP_5_Y, 0}, + {R_028614_PA_CL_UCP_5_Z, 0}, + {R_028618_PA_CL_UCP_5_W, 0}, + {R_028644_SPI_PS_INPUT_CNTL_0, 0}, + {R_028648_SPI_PS_INPUT_CNTL_1, 0}, + {R_02864C_SPI_PS_INPUT_CNTL_2, 0}, + {R_028650_SPI_PS_INPUT_CNTL_3, 0}, + {R_028654_SPI_PS_INPUT_CNTL_4, 0}, + {R_028658_SPI_PS_INPUT_CNTL_5, 0}, + {R_02865C_SPI_PS_INPUT_CNTL_6, 0}, + {R_028660_SPI_PS_INPUT_CNTL_7, 0}, + {R_028664_SPI_PS_INPUT_CNTL_8, 0}, + {R_028668_SPI_PS_INPUT_CNTL_9, 0}, + {R_02866C_SPI_PS_INPUT_CNTL_10, 0}, + {R_028670_SPI_PS_INPUT_CNTL_11, 0}, + {R_028674_SPI_PS_INPUT_CNTL_12, 0}, + {R_028678_SPI_PS_INPUT_CNTL_13, 0}, + {R_02867C_SPI_PS_INPUT_CNTL_14, 0}, + {R_028680_SPI_PS_INPUT_CNTL_15, 0}, + {R_028684_SPI_PS_INPUT_CNTL_16, 0}, + {R_028688_SPI_PS_INPUT_CNTL_17, 0}, + {R_02868C_SPI_PS_INPUT_CNTL_18, 0}, + {R_028690_SPI_PS_INPUT_CNTL_19, 0}, + {R_028694_SPI_PS_INPUT_CNTL_20, 0}, + {R_028698_SPI_PS_INPUT_CNTL_21, 0}, + {R_02869C_SPI_PS_INPUT_CNTL_22, 0}, + {R_0286A0_SPI_PS_INPUT_CNTL_23, 0}, + {R_0286A4_SPI_PS_INPUT_CNTL_24, 0}, + {R_0286A8_SPI_PS_INPUT_CNTL_25, 0}, + {R_0286AC_SPI_PS_INPUT_CNTL_26, 0}, + {R_0286B0_SPI_PS_INPUT_CNTL_27, 0}, + {R_0286B4_SPI_PS_INPUT_CNTL_28, 0}, + {R_0286B8_SPI_PS_INPUT_CNTL_29, 0}, + {R_0286BC_SPI_PS_INPUT_CNTL_30, 0}, + {R_0286C0_SPI_PS_INPUT_CNTL_31, 0}, + 
{R_0286C4_SPI_VS_OUT_CONFIG, 0}, + {R_0286CC_SPI_PS_INPUT_ENA, 0}, + {R_0286D0_SPI_PS_INPUT_ADDR, 0}, + {R_0286D4_SPI_INTERP_CONTROL_0, 0}, + {R_0286D8_SPI_PS_IN_CONTROL, 0}, + {R_0286E0_SPI_BARYC_CNTL, 0}, + {R_02870C_SPI_SHADER_POS_FORMAT, 0}, + {R_028710_SPI_SHADER_Z_FORMAT, 0}, + {R_028714_SPI_SHADER_COL_FORMAT, 0}, + {R_028780_CB_BLEND0_CONTROL, 0}, + {R_028784_CB_BLEND1_CONTROL, 0}, + {R_028788_CB_BLEND2_CONTROL, 0}, + {R_02878C_CB_BLEND3_CONTROL, 0}, + {R_028790_CB_BLEND4_CONTROL, 0}, + {R_028794_CB_BLEND5_CONTROL, 0}, + {R_028798_CB_BLEND6_CONTROL, 0}, + {R_02879C_CB_BLEND7_CONTROL, 0}, + {R_0287D4_PA_CL_POINT_X_RAD, 0}, + {R_0287D8_PA_CL_POINT_Y_RAD, 0}, + {R_0287DC_PA_CL_POINT_SIZE, 0}, + {R_0287E0_PA_CL_POINT_CULL_RAD, 0}, + {R_028800_DB_DEPTH_CONTROL, 0}, + {R_028804_DB_EQAA, 0}, + {R_028808_CB_COLOR_CONTROL, 0}, + {R_02880C_DB_SHADER_CONTROL, 0}, + {R_028810_PA_CL_CLIP_CNTL, 0}, + {R_028814_PA_SU_SC_MODE_CNTL, 0}, + {R_028818_PA_CL_VTE_CNTL, 0}, + {R_02881C_PA_CL_VS_OUT_CNTL, 0}, + {R_028820_PA_CL_NANINF_CNTL, 0}, + {R_028824_PA_SU_LINE_STIPPLE_CNTL, 0}, + {R_028828_PA_SU_LINE_STIPPLE_SCALE, 0}, + {R_02882C_PA_SU_PRIM_FILTER_CNTL, 0}, + {R_028A00_PA_SU_POINT_SIZE, 0}, + {R_028A04_PA_SU_POINT_MINMAX, 0}, + {R_028A08_PA_SU_LINE_CNTL, 0}, + {R_028A0C_PA_SC_LINE_STIPPLE, 0}, + {R_028A10_VGT_OUTPUT_PATH_CNTL, 0}, + {R_028A14_VGT_HOS_CNTL, 0}, + {R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0}, + {R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0}, + {R_028A20_VGT_HOS_REUSE_DEPTH, 0}, + {R_028A24_VGT_GROUP_PRIM_TYPE, 0}, + {R_028A28_VGT_GROUP_FIRST_DECR, 0}, + {R_028A2C_VGT_GROUP_DECR, 0}, + {R_028A30_VGT_GROUP_VECT_0_CNTL, 0}, + {R_028A34_VGT_GROUP_VECT_1_CNTL, 0}, + {R_028A38_VGT_GROUP_VECT_0_FMT_CNTL, 0}, + {R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL, 0}, + {R_028A40_VGT_GS_MODE, 0}, + {R_028A48_PA_SC_MODE_CNTL_0, 0}, + {R_028A4C_PA_SC_MODE_CNTL_1, 0}, + {R_028A50_VGT_ENHANCE, 0}, + {R_028A54_VGT_GS_PER_ES, 0}, + {R_028A58_VGT_ES_PER_GS, 0}, + {R_028A5C_VGT_GS_PER_VS, 0}, + 
{R_028A60_VGT_GSVS_RING_OFFSET_1, 0}, + {R_028A64_VGT_GSVS_RING_OFFSET_2, 0}, + {R_028A68_VGT_GSVS_RING_OFFSET_3, 0}, + {R_028A6C_VGT_GS_OUT_PRIM_TYPE, 0}, + {R_028A70_IA_ENHANCE, 0}, + {R_028A84_VGT_PRIMITIVEID_EN, 0}, + {R_028A8C_VGT_PRIMITIVEID_RESET, 0}, + {R_028AA0_VGT_INSTANCE_STEP_RATE_0, 0}, + {R_028AA4_VGT_INSTANCE_STEP_RATE_1, 0}, + {R_028AA8_IA_MULTI_VGT_PARAM, 0}, + {R_028AAC_VGT_ESGS_RING_ITEMSIZE, 0}, + {R_028AB0_VGT_GSVS_RING_ITEMSIZE, 0}, + {R_028AB4_VGT_REUSE_OFF, 0}, + {R_028AB8_VGT_VTX_CNT_EN, 0}, + {R_028ABC_DB_HTILE_SURFACE, 0}, + {R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0}, + {R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0}, + {R_028AC8_DB_PRELOAD_CONTROL, 0}, + {R_028B54_VGT_SHADER_STAGES_EN, 0}, + {R_028B70_DB_ALPHA_TO_MASK, 0}, + {R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, 0}, + {R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 0}, + {R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, 0}, + {R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, 0}, + {R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, 0}, + {R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, 0}, + {R_028B94_VGT_STRMOUT_CONFIG, 0}, + {R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0}, + {R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0}, + {R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0}, + {R_028BDC_PA_SC_LINE_CNTL, 0}, + {R_028BE0_PA_SC_AA_CONFIG, 0}, + {R_028BE4_PA_SU_VTX_CNTL, 0}, + {R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 0}, + {R_028BEC_PA_CL_GB_VERT_DISC_ADJ, 0}, + {R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, 0}, + {R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, 0}, + {R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0}, + {R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, 0}, + {R_028C00_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2, 0}, + {R_028C04_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3, 0}, + {R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0}, + {R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, 0}, + {R_028C10_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2, 0}, + {R_028C14_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3, 0}, + {R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0}, + {R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, 0}, + 
{R_028C20_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2, 0}, + {R_028C24_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3, 0}, + {R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0}, + {R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, 0}, + {R_028C30_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2, 0}, + {R_028C34_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3, 0}, + {R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 0}, + {R_028C3C_PA_SC_AA_MASK_X0Y1_X1Y1, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028C60_CB_COLOR0_BASE, REG_FLAG_NEED_BO}, + {R_028C64_CB_COLOR0_PITCH, 0}, + {R_028C68_CB_COLOR0_SLICE, 0}, + {R_028C6C_CB_COLOR0_VIEW, 0}, + {R_028C70_CB_COLOR0_INFO, REG_FLAG_NEED_BO}, + {R_028C74_CB_COLOR0_ATTRIB, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028C9C_CB_COLOR1_BASE, REG_FLAG_NEED_BO}, + {R_028CA0_CB_COLOR1_PITCH, 0}, + {R_028CA4_CB_COLOR1_SLICE, 0}, + {R_028CA8_CB_COLOR1_VIEW, 0}, + {R_028CAC_CB_COLOR1_INFO, REG_FLAG_NEED_BO}, + {R_028CB0_CB_COLOR1_ATTRIB, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028CD8_CB_COLOR2_BASE, REG_FLAG_NEED_BO}, + {R_028CDC_CB_COLOR2_PITCH, 0}, + {R_028CE0_CB_COLOR2_SLICE, 0}, + {R_028CE4_CB_COLOR2_VIEW, 0}, + {R_028CE8_CB_COLOR2_INFO, REG_FLAG_NEED_BO}, + {R_028CEC_CB_COLOR2_ATTRIB, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028D14_CB_COLOR3_BASE, REG_FLAG_NEED_BO}, + {R_028D18_CB_COLOR3_PITCH, 0}, + {R_028D1C_CB_COLOR3_SLICE, 0}, + {R_028D20_CB_COLOR3_VIEW, 0}, + {R_028D24_CB_COLOR3_INFO, REG_FLAG_NEED_BO}, + {R_028D28_CB_COLOR3_ATTRIB, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028D50_CB_COLOR4_BASE, REG_FLAG_NEED_BO}, + {R_028D54_CB_COLOR4_PITCH, 0}, + {R_028D58_CB_COLOR4_SLICE, 0}, + {R_028D5C_CB_COLOR4_VIEW, 0}, + {R_028D60_CB_COLOR4_INFO, REG_FLAG_NEED_BO}, + {R_028D64_CB_COLOR4_ATTRIB, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028D8C_CB_COLOR5_BASE, REG_FLAG_NEED_BO}, + {R_028D90_CB_COLOR5_PITCH, 0}, + {R_028D94_CB_COLOR5_SLICE, 0}, + {R_028D98_CB_COLOR5_VIEW, 0}, + {R_028D9C_CB_COLOR5_INFO, REG_FLAG_NEED_BO}, + {R_028DA0_CB_COLOR5_ATTRIB, 
REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028DC8_CB_COLOR6_BASE, REG_FLAG_NEED_BO}, + {R_028DCC_CB_COLOR6_PITCH, 0}, + {R_028DD0_CB_COLOR6_SLICE, 0}, + {R_028DD4_CB_COLOR6_VIEW, 0}, + {R_028DD8_CB_COLOR6_INFO, REG_FLAG_NEED_BO}, + {R_028DDC_CB_COLOR6_ATTRIB, REG_FLAG_NEED_BO}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_028E04_CB_COLOR7_BASE, REG_FLAG_NEED_BO}, + {R_028E08_CB_COLOR7_PITCH, 0}, + {R_028E0C_CB_COLOR7_SLICE, 0}, + {R_028E10_CB_COLOR7_VIEW, 0}, + {R_028E14_CB_COLOR7_INFO, REG_FLAG_NEED_BO}, + {R_028E18_CB_COLOR7_ATTRIB, REG_FLAG_NEED_BO}, +}; + +static const struct r600_reg si_sh_reg_list[] = { + {R_00B020_SPI_SHADER_PGM_LO_PS, REG_FLAG_NEED_BO}, + {R_00B024_SPI_SHADER_PGM_HI_PS, REG_FLAG_NEED_BO}, + {R_00B028_SPI_SHADER_PGM_RSRC1_PS, 0}, + {R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B030_SPI_SHADER_USER_DATA_PS_0, REG_FLAG_NEED_BO}, + {R_00B034_SPI_SHADER_USER_DATA_PS_1, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B038_SPI_SHADER_USER_DATA_PS_2, REG_FLAG_NEED_BO}, + {R_00B03C_SPI_SHADER_USER_DATA_PS_3, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B040_SPI_SHADER_USER_DATA_PS_4, REG_FLAG_NEED_BO}, + {R_00B044_SPI_SHADER_USER_DATA_PS_5, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B048_SPI_SHADER_USER_DATA_PS_6, REG_FLAG_NEED_BO}, + {R_00B04C_SPI_SHADER_USER_DATA_PS_7, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B050_SPI_SHADER_USER_DATA_PS_8, REG_FLAG_NEED_BO}, + {R_00B054_SPI_SHADER_USER_DATA_PS_9, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B058_SPI_SHADER_USER_DATA_PS_10, REG_FLAG_NEED_BO}, + {R_00B05C_SPI_SHADER_USER_DATA_PS_11, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B060_SPI_SHADER_USER_DATA_PS_12, REG_FLAG_NEED_BO}, + {R_00B064_SPI_SHADER_USER_DATA_PS_13, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B068_SPI_SHADER_USER_DATA_PS_14, REG_FLAG_NEED_BO}, + {R_00B06C_SPI_SHADER_USER_DATA_PS_15, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B120_SPI_SHADER_PGM_LO_VS, REG_FLAG_NEED_BO}, + {R_00B124_SPI_SHADER_PGM_HI_VS, REG_FLAG_NEED_BO}, + 
{R_00B128_SPI_SHADER_PGM_RSRC1_VS, 0}, + {R_00B12C_SPI_SHADER_PGM_RSRC2_VS, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B130_SPI_SHADER_USER_DATA_VS_0, REG_FLAG_NEED_BO}, + {R_00B134_SPI_SHADER_USER_DATA_VS_1, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B138_SPI_SHADER_USER_DATA_VS_2, REG_FLAG_NEED_BO}, + {R_00B13C_SPI_SHADER_USER_DATA_VS_3, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B140_SPI_SHADER_USER_DATA_VS_4, REG_FLAG_NEED_BO}, + {R_00B144_SPI_SHADER_USER_DATA_VS_5, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B148_SPI_SHADER_USER_DATA_VS_6, REG_FLAG_NEED_BO}, + {R_00B14C_SPI_SHADER_USER_DATA_VS_7, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B150_SPI_SHADER_USER_DATA_VS_8, REG_FLAG_NEED_BO}, + {R_00B154_SPI_SHADER_USER_DATA_VS_9, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B158_SPI_SHADER_USER_DATA_VS_10, REG_FLAG_NEED_BO}, + {R_00B15C_SPI_SHADER_USER_DATA_VS_11, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B160_SPI_SHADER_USER_DATA_VS_12, REG_FLAG_NEED_BO}, + {R_00B164_SPI_SHADER_USER_DATA_VS_13, 0}, + {GROUP_FORCE_NEW_BLOCK, 0}, + {R_00B168_SPI_SHADER_USER_DATA_VS_14, REG_FLAG_NEED_BO}, + {R_00B16C_SPI_SHADER_USER_DATA_VS_15, 0}, +}; + +int si_context_init(struct r600_context *ctx) +{ + int r; + + LIST_INITHEAD(&ctx->active_query_list); + + /* init dirty list */ + LIST_INITHEAD(&ctx->dirty); + LIST_INITHEAD(&ctx->enable_list); + + ctx->range = calloc(NUM_RANGES, sizeof(struct r600_range)); + if (!ctx->range) { + r = -ENOMEM; + goto out_err; + } + + /* add blocks */ + r = r600_context_add_block(ctx, si_config_reg_list, + Elements(si_config_reg_list), PKT3_SET_CONFIG_REG, SI_CONFIG_REG_OFFSET); + if (r) + goto out_err; + r = r600_context_add_block(ctx, si_context_reg_list, + Elements(si_context_reg_list), PKT3_SET_CONTEXT_REG, SI_CONTEXT_REG_OFFSET); + if (r) + goto out_err; + r = r600_context_add_block(ctx, si_sh_reg_list, + Elements(si_sh_reg_list), PKT3_SET_SH_REG, SI_SH_REG_OFFSET); + if (r) + goto out_err; + + + /* PS SAMPLER */ + /* VS SAMPLER */ + + /* PS SAMPLER BORDER */ 
+ /* VS SAMPLER BORDER */ + + /* PS RESOURCES */ + /* VS RESOURCES */ + + ctx->cs = ctx->ws->cs_create(ctx->ws); + + r600_init_cs(ctx); + ctx->max_db = 8; + return 0; +out_err: + r600_context_fini(ctx); + return r; +} + +static inline void evergreen_context_ps_partial_flush(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + + if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING)) + return; + + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); + + ctx->flags &= ~R600_CONTEXT_DRAW_PENDING; +} + +void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw) +{ + struct radeon_winsys_cs *cs = ctx->cs; + unsigned ndwords = 7; + uint32_t *pm4; + uint64_t va; + + if (draw->indices) { + ndwords = 11; + } + if (ctx->num_cs_dw_queries_suspend) + ndwords += 6; + + /* when increasing ndwords, bump the max limit too */ + assert(ndwords <= R600_MAX_DRAW_CS_DWORDS); + + /* queries need some special values + * (this is non-zero if any query is active) */ + if (ctx->num_cs_dw_queries_suspend) { + pm4 = &cs->buf[cs->cdw]; + pm4[0] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + pm4[1] = (R_028004_DB_COUNT_CONTROL - SI_CONTEXT_REG_OFFSET) >> 2; + pm4[2] = S_028004_PERFECT_ZPASS_COUNTS(1); + pm4[3] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + pm4[4] = (R_02800C_DB_RENDER_OVERRIDE - SI_CONTEXT_REG_OFFSET) >> 2; + pm4[5] = draw->db_render_override | S_02800C_NOOP_CULL_DISABLE(1); + cs->cdw += 6; + ndwords -= 6; + } + + /* draw packet */ + pm4 = &cs->buf[cs->cdw]; + pm4[0] = PKT3(PKT3_INDEX_TYPE, 0, ctx->predicate_drawing); + pm4[1] = draw->vgt_index_type; + pm4[2] = PKT3(PKT3_NUM_INSTANCES, 0, ctx->predicate_drawing); + pm4[3] = draw->vgt_num_instances; + if (draw->indices) { + va = r600_resource_va(&ctx->screen->screen, (void*)draw->indices); + va += draw->indices_bo_offset; + pm4[4] = PKT3(PKT3_DRAW_INDEX, 3, ctx->predicate_drawing); + pm4[5] = va; + pm4[6] = (va >> 32UL) & 0xFF; + 
pm4[7] = draw->vgt_num_indices; + pm4[8] = draw->vgt_draw_initiator; + pm4[9] = PKT3(PKT3_NOP, 0, ctx->predicate_drawing); + pm4[10] = r600_context_bo_reloc(ctx, draw->indices, RADEON_USAGE_READ); + } else { + pm4[4] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, ctx->predicate_drawing); + pm4[5] = draw->vgt_num_indices; + pm4[6] = draw->vgt_draw_initiator; + } + cs->cdw += ndwords; +} + +void evergreen_flush_vgt_streamout(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + cs->buf[cs->cdw++] = (R_0084FC_CP_STRMOUT_CNTL - SI_CONFIG_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = 0; + + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0); + + cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0); + cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */ + cs->buf[cs->cdw++] = R_0084FC_CP_STRMOUT_CNTL >> 2; /* register */ + cs->buf[cs->cdw++] = 0; + cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* reference value */ + cs->buf[cs->cdw++] = S_0084FC_OFFSET_UPDATE_DONE(1); /* mask */ + cs->buf[cs->cdw++] = 4; /* poll interval */ +} + +void evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit) +{ + struct radeon_winsys_cs *cs = ctx->cs; + + if (buffer_enable_bit) { + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + cs->buf[cs->cdw++] = (R_028B94_VGT_STRMOUT_CONFIG - SI_CONTEXT_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = S_028B94_STREAMOUT_0_EN(1); + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + cs->buf[cs->cdw++] = (R_028B98_VGT_STRMOUT_BUFFER_CONFIG - SI_CONTEXT_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = S_028B98_STREAM_0_BUFFER_EN(buffer_enable_bit); + } else { + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + cs->buf[cs->cdw++] = (R_028B94_VGT_STRMOUT_CONFIG - SI_CONTEXT_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = 
S_028B94_STREAMOUT_0_EN(0); + } +} diff --git a/src/gallium/drivers/radeonsi/evergreen_state.c b/src/gallium/drivers/radeonsi/evergreen_state.c new file mode 100644 index 00000000000..5049c7b2db6 --- /dev/null +++ b/src/gallium/drivers/radeonsi/evergreen_state.c @@ -0,0 +1,2169 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +/* TODO: + * - fix mask for depth control & cull for query + */ +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "util/u_blitter.h" +#include "util/u_double_list.h" +#include "util/u_transfer.h" +#include "util/u_surface.h" +#include "util/u_pack_color.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_framebuffer.h" +#include "pipebuffer/pb_buffer.h" +#include "r600.h" +#include "sid.h" +#include "r600_resource.h" +#include "radeonsi_pipe.h" + +static uint32_t si_translate_blend_function(int blend_func) +{ + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028780_COMB_DST_PLUS_SRC; + case PIPE_BLEND_SUBTRACT: + return V_028780_COMB_SRC_MINUS_DST; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028780_COMB_DST_MINUS_SRC; + case PIPE_BLEND_MIN: + return V_028780_COMB_MIN_DST_SRC; + case PIPE_BLEND_MAX: + return V_028780_COMB_MAX_DST_SRC; + default: + R600_ERR("Unknown blend function %d\n", blend_func); + assert(0); + break; + } + return 0; +} + +static uint32_t si_translate_blend_factor(int blend_fact) +{ + switch (blend_fact) { + case PIPE_BLENDFACTOR_ONE: + return V_028780_BLEND_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return V_028780_BLEND_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028780_BLEND_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return V_028780_BLEND_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return V_028780_BLEND_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return V_028780_BLEND_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return V_028780_BLEND_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return V_028780_BLEND_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + return V_028780_BLEND_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return V_028780_BLEND_ONE_MINUS_SRC_COLOR; + case 
PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028780_BLEND_ONE_MINUS_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return V_028780_BLEND_ONE_MINUS_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return V_028780_BLEND_ONE_MINUS_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return V_028780_BLEND_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return V_028780_BLEND_SRC1_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return V_028780_BLEND_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return V_028780_BLEND_INV_SRC1_ALPHA; + default: + R600_ERR("Bad blend factor %d not supported!\n", blend_fact); + assert(0); + break; + } + return 0; +} + +#if 0 +static uint32_t r600_translate_stencil_op(int s_op) +{ + switch (s_op) { + case PIPE_STENCIL_OP_KEEP: + return V_028800_STENCIL_KEEP; + case PIPE_STENCIL_OP_ZERO: + return V_028800_STENCIL_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return V_028800_STENCIL_REPLACE; + case PIPE_STENCIL_OP_INCR: + return V_028800_STENCIL_INCR; + case PIPE_STENCIL_OP_DECR: + return V_028800_STENCIL_DECR; + case PIPE_STENCIL_OP_INCR_WRAP: + return V_028800_STENCIL_INCR_WRAP; + case PIPE_STENCIL_OP_DECR_WRAP: + return V_028800_STENCIL_DECR_WRAP; + case PIPE_STENCIL_OP_INVERT: + return V_028800_STENCIL_INVERT; + default: + R600_ERR("Unknown stencil op %d", s_op); + assert(0); + break; + } + return 0; +} +#endif + +static uint32_t si_translate_fill(uint32_t func) +{ + switch(func) { + case PIPE_POLYGON_MODE_FILL: + return V_028814_X_DRAW_TRIANGLES; + case PIPE_POLYGON_MODE_LINE: + return V_028814_X_DRAW_LINES; + case PIPE_POLYGON_MODE_POINT: + return V_028814_X_DRAW_POINTS; + default: + assert(0); + return V_028814_X_DRAW_POINTS; + } +} + +/* translates straight */ +static uint32_t si_translate_ds_func(int func) +{ + return func; +} + +static unsigned 
si_tex_wrap(unsigned wrap) +{ + switch (wrap) { + default: + case PIPE_TEX_WRAP_REPEAT: + return V_008F30_SQ_TEX_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return V_008F30_SQ_TEX_CLAMP_HALF_BORDER; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return V_008F30_SQ_TEX_MIRROR; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER; + } +} + +static unsigned si_tex_filter(unsigned filter) +{ + switch (filter) { + default: + case PIPE_TEX_FILTER_NEAREST: + return V_008F38_SQ_TEX_XY_FILTER_POINT; + case PIPE_TEX_FILTER_LINEAR: + return V_008F38_SQ_TEX_XY_FILTER_BILINEAR; + } +} + +static unsigned si_tex_mipfilter(unsigned filter) +{ + switch (filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + return V_008F38_SQ_TEX_Z_FILTER_POINT; + case PIPE_TEX_MIPFILTER_LINEAR: + return V_008F38_SQ_TEX_Z_FILTER_LINEAR; + default: + case PIPE_TEX_MIPFILTER_NONE: + return V_008F38_SQ_TEX_Z_FILTER_NONE; + } +} + +static unsigned si_tex_compare(unsigned compare) +{ + switch (compare) { + default: + case PIPE_FUNC_NEVER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER; + case PIPE_FUNC_LESS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS; + case PIPE_FUNC_EQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL; + case PIPE_FUNC_LEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL; + case PIPE_FUNC_GREATER: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER; + case PIPE_FUNC_NOTEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL; + case PIPE_FUNC_GEQUAL: + return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL; + case PIPE_FUNC_ALWAYS: + return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS; + } +} + +static unsigned si_tex_dim(unsigned dim) +{ + switch (dim) { + 
default: + case PIPE_TEXTURE_1D: + return V_008F1C_SQ_RSRC_IMG_1D; + case PIPE_TEXTURE_1D_ARRAY: + return V_008F1C_SQ_RSRC_IMG_1D_ARRAY; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + return V_008F1C_SQ_RSRC_IMG_2D; + case PIPE_TEXTURE_2D_ARRAY: + return V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + case PIPE_TEXTURE_3D: + return V_008F1C_SQ_RSRC_IMG_3D; + case PIPE_TEXTURE_CUBE: + return V_008F1C_SQ_RSRC_IMG_CUBE; + } +} + +static uint32_t si_translate_dbformat(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return V_028040_Z_16; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_028040_Z_24; /* XXX no longer supported on SI */ + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_028040_Z_32_FLOAT; + default: + return ~0U; + } +} + +static uint32_t si_translate_colorswap(enum pipe_format format) +{ + switch (format) { + /* 8-bit buffers. */ + case PIPE_FORMAT_L4A4_UNORM: + case PIPE_FORMAT_A4R4_UNORM: + return V_028C70_SWAP_ALT; + + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_A8_UINT: + case PIPE_FORMAT_A8_SINT: + case PIPE_FORMAT_R4A4_UNORM: + return V_028C70_SWAP_ALT_REV; + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_I8_UINT: + case PIPE_FORMAT_I8_SINT: + case PIPE_FORMAT_L8_UINT: + case PIPE_FORMAT_L8_SINT: + case PIPE_FORMAT_L8_SRGB: + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R8_SNORM: + case PIPE_FORMAT_R8_UINT: + case PIPE_FORMAT_R8_SINT: + return V_028C70_SWAP_STD; + + /* 16-bit buffers. 
*/ + case PIPE_FORMAT_B5G6R5_UNORM: + return V_028C70_SWAP_STD_REV; + + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_B5G5R5X1_UNORM: + return V_028C70_SWAP_ALT; + + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + return V_028C70_SWAP_ALT; + + case PIPE_FORMAT_Z16_UNORM: + return V_028C70_SWAP_STD; + + case PIPE_FORMAT_L8A8_UNORM: + case PIPE_FORMAT_L8A8_UINT: + case PIPE_FORMAT_L8A8_SINT: + case PIPE_FORMAT_L8A8_SRGB: + return V_028C70_SWAP_ALT; + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8_UINT: + case PIPE_FORMAT_R8G8_SINT: + return V_028C70_SWAP_STD; + + case PIPE_FORMAT_R16_UNORM: + case PIPE_FORMAT_R16_UINT: + case PIPE_FORMAT_R16_SINT: + case PIPE_FORMAT_R16_FLOAT: + return V_028C70_SWAP_STD; + + /* 32-bit buffers. */ + case PIPE_FORMAT_A8B8G8R8_SRGB: + return V_028C70_SWAP_STD_REV; + case PIPE_FORMAT_B8G8R8A8_SRGB: + return V_028C70_SWAP_ALT; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_B8G8R8X8_UNORM: + return V_028C70_SWAP_ALT; + + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + return V_028C70_SWAP_ALT_REV; + case PIPE_FORMAT_R8G8B8A8_SNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R8G8B8A8_SSCALED: + case PIPE_FORMAT_R8G8B8A8_USCALED: + case PIPE_FORMAT_R8G8B8A8_SINT: + case PIPE_FORMAT_R8G8B8A8_UINT: + case PIPE_FORMAT_R8G8B8X8_UNORM: + return V_028C70_SWAP_STD; + + case PIPE_FORMAT_A8B8G8R8_UNORM: + case PIPE_FORMAT_X8B8G8R8_UNORM: + /* case PIPE_FORMAT_R8SG8SB8UX8U_NORM: */ + return V_028C70_SWAP_STD_REV; + + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_028C70_SWAP_STD; + + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return V_028C70_SWAP_STD; + + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_R10G10B10X2_SNORM: + case PIPE_FORMAT_R10SG10SB10SA2U_NORM: + return V_028C70_SWAP_STD; + + case PIPE_FORMAT_B10G10R10A2_UNORM: + case PIPE_FORMAT_B10G10R10A2_UINT: + return V_028C70_SWAP_ALT; + + case 
PIPE_FORMAT_R11G11B10_FLOAT: + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R32_UINT: + case PIPE_FORMAT_R32_SINT: + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_R16G16_FLOAT: + case PIPE_FORMAT_R16G16_UNORM: + case PIPE_FORMAT_R16G16_UINT: + case PIPE_FORMAT_R16G16_SINT: + return V_028C70_SWAP_STD; + + /* 64-bit buffers. */ + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R32G32_UINT: + case PIPE_FORMAT_R32G32_SINT: + case PIPE_FORMAT_R16G16B16A16_UNORM: + case PIPE_FORMAT_R16G16B16A16_SNORM: + case PIPE_FORMAT_R16G16B16A16_USCALED: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_UINT: + case PIPE_FORMAT_R16G16B16A16_SINT: + case PIPE_FORMAT_R16G16B16A16_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + + /* 128-bit buffers. */ + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R32G32B32A32_SNORM: + case PIPE_FORMAT_R32G32B32A32_UNORM: + case PIPE_FORMAT_R32G32B32A32_SSCALED: + case PIPE_FORMAT_R32G32B32A32_USCALED: + case PIPE_FORMAT_R32G32B32A32_SINT: + case PIPE_FORMAT_R32G32B32A32_UINT: + return V_028C70_SWAP_STD; + default: + R600_ERR("unsupported colorswap format %d\n", format); + return ~0U; + } + return ~0U; +} + +static uint32_t si_translate_colorformat(enum pipe_format format) +{ + switch (format) { + /* 8-bit buffers. */ + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_A8_UINT: + case PIPE_FORMAT_A8_SINT: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_I8_UINT: + case PIPE_FORMAT_I8_SINT: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_L8_UINT: + case PIPE_FORMAT_L8_SINT: + case PIPE_FORMAT_L8_SRGB: + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R8_SNORM: + case PIPE_FORMAT_R8_UINT: + case PIPE_FORMAT_R8_SINT: + return V_028C70_COLOR_8; + + /* 16-bit buffers. 
*/ + case PIPE_FORMAT_B5G6R5_UNORM: + return V_028C70_COLOR_5_6_5; + + case PIPE_FORMAT_B5G5R5A1_UNORM: + case PIPE_FORMAT_B5G5R5X1_UNORM: + return V_028C70_COLOR_1_5_5_5; + + case PIPE_FORMAT_B4G4R4A4_UNORM: + case PIPE_FORMAT_B4G4R4X4_UNORM: + return V_028C70_COLOR_4_4_4_4; + + case PIPE_FORMAT_L8A8_UNORM: + case PIPE_FORMAT_L8A8_UINT: + case PIPE_FORMAT_L8A8_SINT: + case PIPE_FORMAT_L8A8_SRGB: + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8_UINT: + case PIPE_FORMAT_R8G8_SINT: + return V_028C70_COLOR_8_8; + + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_R16_UNORM: + case PIPE_FORMAT_R16_UINT: + case PIPE_FORMAT_R16_SINT: + case PIPE_FORMAT_R16_FLOAT: + case PIPE_FORMAT_R16G16_FLOAT: + return V_028C70_COLOR_16; + + /* 32-bit buffers. */ + case PIPE_FORMAT_A8B8G8R8_SRGB: + case PIPE_FORMAT_A8B8G8R8_UNORM: + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_SRGB: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_B8G8R8X8_UNORM: + case PIPE_FORMAT_R8G8B8A8_SNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R8G8B8X8_UNORM: + case PIPE_FORMAT_R8SG8SB8UX8U_NORM: + case PIPE_FORMAT_X8B8G8R8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8A8_SSCALED: + case PIPE_FORMAT_R8G8B8A8_USCALED: + case PIPE_FORMAT_R8G8B8A8_SINT: + case PIPE_FORMAT_R8G8B8A8_UINT: + return V_028C70_COLOR_8_8_8_8; + + case PIPE_FORMAT_R10G10B10A2_UNORM: + case PIPE_FORMAT_R10G10B10X2_SNORM: + case PIPE_FORMAT_B10G10R10A2_UNORM: + case PIPE_FORMAT_B10G10R10A2_UINT: + case PIPE_FORMAT_R10SG10SB10SA2U_NORM: + return V_028C70_COLOR_2_10_10_10; + + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return V_028C70_COLOR_8_24; + + case PIPE_FORMAT_X8Z24_UNORM: + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + return V_028C70_COLOR_24_8; + + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return V_028C70_COLOR_X24_8_32_FLOAT; + + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT: + return V_028C70_COLOR_32; + + 
case PIPE_FORMAT_R16G16_SSCALED: + case PIPE_FORMAT_R16G16_UNORM: + case PIPE_FORMAT_R16G16_UINT: + case PIPE_FORMAT_R16G16_SINT: + return V_028C70_COLOR_16_16; + + case PIPE_FORMAT_R11G11B10_FLOAT: + return V_028C70_COLOR_10_11_11; + + /* 64-bit buffers. */ + case PIPE_FORMAT_R16G16B16_USCALED: + case PIPE_FORMAT_R16G16B16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_UINT: + case PIPE_FORMAT_R16G16B16A16_SINT: + case PIPE_FORMAT_R16G16B16A16_USCALED: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_UNORM: + case PIPE_FORMAT_R16G16B16A16_SNORM: + case PIPE_FORMAT_R16G16B16_FLOAT: + case PIPE_FORMAT_R16G16B16A16_FLOAT: + return V_028C70_COLOR_16_16_16_16; + + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R32G32_USCALED: + case PIPE_FORMAT_R32G32_SSCALED: + case PIPE_FORMAT_R32G32_SINT: + case PIPE_FORMAT_R32G32_UINT: + return V_028C70_COLOR_32_32; + + /* 128-bit buffers. */ + case PIPE_FORMAT_R32G32B32A32_SNORM: + case PIPE_FORMAT_R32G32B32A32_UNORM: + case PIPE_FORMAT_R32G32B32A32_SSCALED: + case PIPE_FORMAT_R32G32B32A32_USCALED: + case PIPE_FORMAT_R32G32B32A32_SINT: + case PIPE_FORMAT_R32G32B32A32_UINT: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return V_028C70_COLOR_32_32_32_32; + + /* YUV buffers. */ + case PIPE_FORMAT_UYVY: + case PIPE_FORMAT_YUYV: + /* 96-bit buffers. */ + case PIPE_FORMAT_R32G32B32_FLOAT: + /* 8-bit buffers. */ + case PIPE_FORMAT_L4A4_UNORM: + case PIPE_FORMAT_R4A4_UNORM: + case PIPE_FORMAT_A4R4_UNORM: + default: + return ~0U; /* Unsupported. */ + } +} + +static uint32_t si_colorformat_endian_swap(uint32_t colorformat) +{ + if (R600_BIG_ENDIAN) { + switch(colorformat) { + /* 8-bit buffers. */ + case V_028C70_COLOR_8: + return V_028C70_ENDIAN_NONE; + + /* 16-bit buffers. */ + case V_028C70_COLOR_5_6_5: + case V_028C70_COLOR_1_5_5_5: + case V_028C70_COLOR_4_4_4_4: + case V_028C70_COLOR_16: + case V_028C70_COLOR_8_8: + return V_028C70_ENDIAN_8IN16; + + /* 32-bit buffers. 
/*
 * Translate a gallium pipe_format into a V_008F14_IMG_DATA_FORMAT_* value.
 *
 * @screen          not used by this function.
 * @format          the format being translated.
 * @desc            util_format description of @format (colorspace, channel
 *                  count and per-channel sizes are read from it).
 * @first_non_void  index of the first non-void channel; values outside
 *                  0..3 make uniform formats untranslatable.
 *
 * Returns ~0 when no encoding is found.
 */
static uint32_t si_translate_texformat(struct pipe_screen *screen,
				       enum pipe_format format,
				       const struct util_format_description *desc,
				       int first_non_void)
{
	boolean uniform = TRUE;
	int i;

	/* Colorspace (return non-RGB formats directly). */
	switch (desc->colorspace) {
	/* Depth stencil formats */
	case UTIL_FORMAT_COLORSPACE_ZS:
		switch (format) {
		case PIPE_FORMAT_Z16_UNORM:
			return V_008F14_IMG_DATA_FORMAT_16;
		case PIPE_FORMAT_X24S8_UINT:
		case PIPE_FORMAT_Z24X8_UNORM:
		case PIPE_FORMAT_Z24_UNORM_S8_UINT:
			return V_008F14_IMG_DATA_FORMAT_24_8;
		case PIPE_FORMAT_S8X24_UINT:
		case PIPE_FORMAT_X8Z24_UNORM:
		case PIPE_FORMAT_S8_UINT_Z24_UNORM:
			return V_008F14_IMG_DATA_FORMAT_8_24;
		case PIPE_FORMAT_S8_UINT:
			return V_008F14_IMG_DATA_FORMAT_8;
		case PIPE_FORMAT_Z32_FLOAT:
			return V_008F14_IMG_DATA_FORMAT_32;
		case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
			return V_008F14_IMG_DATA_FORMAT_X24_8_32;
		default:
			goto out_unknown;
		}

	case UTIL_FORMAT_COLORSPACE_YUV:
		goto out_unknown; /* TODO */

	/* sRGB and every other colorspace fall through to the size-based lookup */
	case UTIL_FORMAT_COLORSPACE_SRGB:
		break;

	default:
		break;
	}

	/* TODO compressed formats */

	/* Two formats are matched by name before the size-based lookup. */
	if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
		return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
	} else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
		return V_008F14_IMG_DATA_FORMAT_10_11_11;
	}

	/* R8G8Bx_SNORM - TODO CxV8U8 */

	/* See whether the components are of the same size. */
	for (i = 1; i < desc->nr_channels; i++) {
		uniform = uniform && desc->channel[0].size == desc->channel[i].size;
	}

	/* Non-uniform formats. */
	if (!uniform) {
		switch(desc->nr_channels) {
		case 3:
			if (desc->channel[0].size == 5 &&
			    desc->channel[1].size == 6 &&
			    desc->channel[2].size == 5) {
				return V_008F14_IMG_DATA_FORMAT_5_6_5;
			}
			goto out_unknown;
		case 4:
			if (desc->channel[0].size == 5 &&
			    desc->channel[1].size == 5 &&
			    desc->channel[2].size == 5 &&
			    desc->channel[3].size == 1) {
				return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
			}
			if (desc->channel[0].size == 10 &&
			    desc->channel[1].size == 10 &&
			    desc->channel[2].size == 10 &&
			    desc->channel[3].size == 2) {
				return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
			}
			goto out_unknown;
		}
		/* any other channel count with mixed sizes is unknown */
		goto out_unknown;
	}

	/* channel[] is indexed with first_non_void below, so reject out-of-range values */
	if (first_non_void < 0 || first_non_void > 3)
		goto out_unknown;

	/* uniform formats */
	switch (desc->channel[first_non_void].size) {
	case 4:
		switch (desc->nr_channels) {
		case 2:
			return V_008F14_IMG_DATA_FORMAT_4_4;
		case 4:
			return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
		}
		break;
	case 8:
		switch (desc->nr_channels) {
		case 1:
			return V_008F14_IMG_DATA_FORMAT_8;
		case 2:
			return V_008F14_IMG_DATA_FORMAT_8_8;
		case 4:
			return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
		}
		break;
	case 16:
		switch (desc->nr_channels) {
		case 1:
			return V_008F14_IMG_DATA_FORMAT_16;
		case 2:
			return V_008F14_IMG_DATA_FORMAT_16_16;
		case 4:
			return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
		}
		break;
	case 32:
		switch (desc->nr_channels) {
		case 1:
			return V_008F14_IMG_DATA_FORMAT_32;
		case 2:
			return V_008F14_IMG_DATA_FORMAT_32_32;
		case 3:
			return V_008F14_IMG_DATA_FORMAT_32_32_32;
		case 4:
			return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
		}
		/* no break needed: control continues into out_unknown */
	}

out_unknown:
	/* R600_ERR("Unable to handle texformat %d %s\n", format, util_format_name(format)); */
	return ~0;
}
return si_translate_texformat(screen, format, util_format_description(format), + util_format_get_first_non_void_channel(format)) != ~0U; +} + +uint32_t si_translate_vertexformat(struct pipe_screen *screen, + enum pipe_format format, + const struct util_format_description *desc, + int first_non_void) +{ + uint32_t result = si_translate_texformat(screen, format, desc, first_non_void); + + if (result == V_008F0C_BUF_DATA_FORMAT_INVALID || + result > V_008F0C_BUF_DATA_FORMAT_32_32_32_32) + result = ~0; + + return result; +} + +static bool si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format) +{ + return si_translate_vertexformat(screen, format, util_format_description(format), + util_format_get_first_non_void_channel(format)) != ~0U; +} + +static bool r600_is_colorbuffer_format_supported(enum pipe_format format) +{ + return si_translate_colorformat(format) != ~0U && + si_translate_colorswap(format) != ~0U; +} + +static bool r600_is_zs_format_supported(enum pipe_format format) +{ + return si_translate_dbformat(format) != ~0U; +} + +boolean si_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned usage) +{ + unsigned retval = 0; + + if (target >= PIPE_MAX_TEXTURE_TYPES) { + R600_ERR("r600: unsupported texture type %d\n", target); + return FALSE; + } + + if (!util_format_is_supported(format, usage)) + return FALSE; + + /* Multisample */ + if (sample_count > 1) + return FALSE; + + if ((usage & PIPE_BIND_SAMPLER_VIEW) && + si_is_sampler_format_supported(screen, format)) { + retval |= PIPE_BIND_SAMPLER_VIEW; + } + + if ((usage & (PIPE_BIND_RENDER_TARGET | + PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED)) && + r600_is_colorbuffer_format_supported(format)) { + retval |= usage & + (PIPE_BIND_RENDER_TARGET | + PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT | + PIPE_BIND_SHARED); + } + + if ((usage & PIPE_BIND_DEPTH_STENCIL) && + 
r600_is_zs_format_supported(format)) { + retval |= PIPE_BIND_DEPTH_STENCIL; + } + + if ((usage & PIPE_BIND_VERTEX_BUFFER) && + si_is_vertex_format_supported(screen, format)) { + retval |= PIPE_BIND_VERTEX_BUFFER; + } + + if (usage & PIPE_BIND_TRANSFER_READ) + retval |= PIPE_BIND_TRANSFER_READ; + if (usage & PIPE_BIND_TRANSFER_WRITE) + retval |= PIPE_BIND_TRANSFER_WRITE; + + return retval == usage; +} + +static void evergreen_set_blend_color(struct pipe_context *ctx, + const struct pipe_blend_color *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state); + + if (rstate == NULL) + return; + + rstate->id = R600_PIPE_STATE_BLEND_COLOR; + r600_pipe_state_add_reg(rstate, R_028414_CB_BLEND_RED, fui(state->color[0]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028418_CB_BLEND_GREEN, fui(state->color[1]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_02841C_CB_BLEND_BLUE, fui(state->color[2]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028420_CB_BLEND_ALPHA, fui(state->color[3]), NULL, 0); + + free(rctx->states[R600_PIPE_STATE_BLEND_COLOR]); + rctx->states[R600_PIPE_STATE_BLEND_COLOR] = rstate; + r600_context_pipe_state_set(rctx, rstate); +} + +static void *evergreen_create_blend_state(struct pipe_context *ctx, + const struct pipe_blend_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_blend *blend = CALLOC_STRUCT(r600_pipe_blend); + struct r600_pipe_state *rstate; + uint32_t color_control, target_mask; + /* FIXME there is more then 8 framebuffer */ + unsigned blend_cntl[8]; + + if (blend == NULL) { + return NULL; + } + + rstate = &blend->rstate; + + rstate->id = R600_PIPE_STATE_BLEND; + + target_mask = 0; + color_control = S_028808_MODE(V_028808_CB_NORMAL); + if (state->logicop_enable) { + color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4)); + } else { + color_control |= S_028808_ROP3(0xcc); + } + /* we pretend 8 
buffer are used, CB_SHADER_MASK will disable unused one */ + if (state->independent_blend_enable) { + for (int i = 0; i < 8; i++) { + target_mask |= (state->rt[i].colormask << (4 * i)); + } + } else { + for (int i = 0; i < 8; i++) { + target_mask |= (state->rt[0].colormask << (4 * i)); + } + } + blend->cb_target_mask = target_mask; + + r600_pipe_state_add_reg(rstate, R_028808_CB_COLOR_CONTROL, + color_control, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, ~0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028C3C_PA_SC_AA_MASK_X0Y1_X1Y1, ~0, NULL, 0); + + for (int i = 0; i < 8; i++) { + /* state->rt entries > 0 only written if independent blending */ + const int j = state->independent_blend_enable ? i : 0; + + unsigned eqRGB = state->rt[j].rgb_func; + unsigned srcRGB = state->rt[j].rgb_src_factor; + unsigned dstRGB = state->rt[j].rgb_dst_factor; + unsigned eqA = state->rt[j].alpha_func; + unsigned srcA = state->rt[j].alpha_src_factor; + unsigned dstA = state->rt[j].alpha_dst_factor; + + blend_cntl[i] = 0; + if (!state->rt[j].blend_enable) + continue; + + blend_cntl[i] |= S_028780_ENABLE(1); + blend_cntl[i] |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB)); + blend_cntl[i] |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB)); + blend_cntl[i] |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB)); + + if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) { + blend_cntl[i] |= S_028780_SEPARATE_ALPHA_BLEND(1); + blend_cntl[i] |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA)); + blend_cntl[i] |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA)); + blend_cntl[i] |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA)); + } + } + for (int i = 0; i < 8; i++) { + r600_pipe_state_add_reg(rstate, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl[i], NULL, 0); + } + + return rstate; +} + +static void *evergreen_create_dsa_state(struct pipe_context *ctx, + const struct 
pipe_depth_stencil_alpha_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_dsa *dsa = CALLOC_STRUCT(r600_pipe_dsa); + unsigned db_depth_control, alpha_test_control, alpha_ref; + unsigned db_render_override, db_render_control; + struct r600_pipe_state *rstate; + + if (dsa == NULL) { + return NULL; + } + + dsa->valuemask[0] = state->stencil[0].valuemask; + dsa->valuemask[1] = state->stencil[1].valuemask; + dsa->writemask[0] = state->stencil[0].writemask; + dsa->writemask[1] = state->stencil[1].writemask; + + rstate = &dsa->rstate; + + rstate->id = R600_PIPE_STATE_DSA; + db_depth_control = S_028800_Z_ENABLE(state->depth.enabled) | + S_028800_Z_WRITE_ENABLE(state->depth.writemask) | + S_028800_ZFUNC(state->depth.func); + + /* stencil */ + if (state->stencil[0].enabled) { + db_depth_control |= S_028800_STENCIL_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC(si_translate_ds_func(state->stencil[0].func)); + //db_depth_control |= S_028800_STENCILFAIL(r600_translate_stencil_op(state->stencil[0].fail_op)); + //db_depth_control |= S_028800_STENCILZPASS(r600_translate_stencil_op(state->stencil[0].zpass_op)); + //db_depth_control |= S_028800_STENCILZFAIL(r600_translate_stencil_op(state->stencil[0].zfail_op)); + + if (state->stencil[1].enabled) { + db_depth_control |= S_028800_BACKFACE_ENABLE(1); + db_depth_control |= S_028800_STENCILFUNC_BF(si_translate_ds_func(state->stencil[1].func)); + //db_depth_control |= S_028800_STENCILFAIL_BF(r600_translate_stencil_op(state->stencil[1].fail_op)); + //db_depth_control |= S_028800_STENCILZPASS_BF(r600_translate_stencil_op(state->stencil[1].zpass_op)); + //db_depth_control |= S_028800_STENCILZFAIL_BF(r600_translate_stencil_op(state->stencil[1].zfail_op)); + } + } + + /* alpha */ + alpha_test_control = 0; + alpha_ref = 0; + if (state->alpha.enabled) { + //alpha_test_control = S_028410_ALPHA_FUNC(state->alpha.func); + //alpha_test_control |= S_028410_ALPHA_TEST_ENABLE(1); + alpha_ref = 
fui(state->alpha.ref_value); + } + dsa->alpha_ref = alpha_ref; + + /* misc */ + db_render_control = 0; + db_render_override = S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE) | + S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | + S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE); + /* TODO db_render_override depends on query */ + r600_pipe_state_add_reg(rstate, R_028020_DB_DEPTH_BOUNDS_MIN, 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028024_DB_DEPTH_BOUNDS_MAX, 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028028_DB_STENCIL_CLEAR, 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_02802C_DB_DEPTH_CLEAR, 0x3F800000, NULL, 0); + //r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028000_DB_RENDER_CONTROL, db_render_control, NULL, 0); + r600_pipe_state_add_reg(rstate, R_02800C_DB_RENDER_OVERRIDE, db_render_override, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028AC8_DB_PRELOAD_CONTROL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028B70_DB_ALPHA_TO_MASK, 0x0000AA00, NULL, 0); + dsa->db_render_override = db_render_override; + + return rstate; +} + +static void *evergreen_create_rs_state(struct pipe_context *ctx, + const struct pipe_rasterizer_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_rasterizer *rs = CALLOC_STRUCT(r600_pipe_rasterizer); + struct r600_pipe_state *rstate; + unsigned tmp; + unsigned prov_vtx = 1, polygon_dual_mode; + unsigned clip_rule; + float psize_min, psize_max; + + if (rs == NULL) { + return NULL; + } + + polygon_dual_mode = (state->fill_front != PIPE_POLYGON_MODE_FILL || + state->fill_back != PIPE_POLYGON_MODE_FILL); 
+ + if (state->flatshade_first) + prov_vtx = 0; + + rstate = &rs->rstate; + rs->flatshade = state->flatshade; + rs->sprite_coord_enable = state->sprite_coord_enable; + rs->pa_sc_line_stipple = state->line_stipple_enable ? + S_028A0C_LINE_PATTERN(state->line_stipple_pattern) | + S_028A0C_REPEAT_COUNT(state->line_stipple_factor) : 0; + rs->pa_su_sc_mode_cntl = + S_028814_PROVOKING_VTX_LAST(prov_vtx) | + S_028814_CULL_FRONT(state->rasterizer_discard || (state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) | + S_028814_CULL_BACK(state->rasterizer_discard || (state->cull_face & PIPE_FACE_BACK) ? 1 : 0) | + S_028814_FACE(!state->front_ccw) | + S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) | + S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) | + S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri) | + S_028814_POLY_MODE(polygon_dual_mode) | + S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) | + S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)); + rs->pa_cl_clip_cntl = + S_028810_PS_UCP_MODE(3) | + S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip) | + S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip) | + S_028810_DX_LINEAR_ATTR_CLIP_ENA(1); + rs->pa_cl_vs_out_cntl = + S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) | + S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex); + + clip_rule = state->scissor ? 
0xAAAA : 0xFFFF; + + /* offset */ + rs->offset_units = state->offset_units; + rs->offset_scale = state->offset_scale * 12.0f; + + rstate->id = R600_PIPE_STATE_RASTERIZER; + tmp = S_0286D4_FLAT_SHADE_ENA(state->flatshade); + if (state->sprite_coord_enable) { + tmp |= S_0286D4_PNT_SPRITE_ENA(1) | + S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) | + S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) | + S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) | + S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1); + if (state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT) { + tmp |= S_0286D4_PNT_SPRITE_TOP_1(1); + } + } + r600_pipe_state_add_reg(rstate, R_0286D4_SPI_INTERP_CONTROL_0, tmp, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028820_PA_CL_NANINF_CNTL, 0x00000000, NULL, 0); + /* point size 12.4 fixed point */ + tmp = (unsigned)(state->point_size * 8.0); + r600_pipe_state_add_reg(rstate, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp), NULL, 0); + + if (state->point_size_per_vertex) { + psize_min = util_get_min_point_size(state); + psize_max = 8192; + } else { + /* Force the point size to be as if the vertex output was disabled. */ + psize_min = state->point_size; + psize_max = state->point_size; + } + /* Divide by two, because 0.5 = 1 pixel. 
*/ + r600_pipe_state_add_reg(rstate, R_028A04_PA_SU_POINT_MINMAX, + S_028A04_MIN_SIZE(r600_pack_float_12p4(psize_min/2)) | + S_028A04_MAX_SIZE(r600_pack_float_12p4(psize_max/2)), + NULL, 0); + + tmp = (unsigned)state->line_width * 8; + r600_pipe_state_add_reg(rstate, R_028A08_PA_SU_LINE_CNTL, S_028A08_WIDTH(tmp), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A48_PA_SC_MODE_CNTL_0, + S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable), + NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028BDC_PA_SC_LINE_CNTL, 0x00000400, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BE4_PA_SU_VTX_CNTL, + S_028BE4_PIX_CENTER(state->gl_rasterization_rules), + NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 0x3F800000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, 0x3F800000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, 0x3F800000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, 0x3F800000, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp), NULL, 0); + r600_pipe_state_add_reg(rstate, R_02820C_PA_SC_CLIPRECT_RULE, clip_rule, NULL, 0); + return rstate; +} + +static void *si_create_sampler_state(struct pipe_context *ctx, + const struct pipe_sampler_state *state) +{ + struct si_pipe_sampler_state *rstate = CALLOC_STRUCT(si_pipe_sampler_state); + union util_color uc; + unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 
2 : 0; + unsigned border_color_type; + + if (rstate == NULL) { + return NULL; + } + + util_pack_color(state->border_color.f, PIPE_FORMAT_B8G8R8A8_UNORM, &uc); + switch (uc.ui) { + case 0x000000FF: /* opaque black */ + border_color_type = 0; + break; + case 0x00000000: /* transparent black */ + border_color_type = 1; + break; + case 0xFFFFFFFF: /* white */ + border_color_type = 1; + break; + default: /* Use border color pointer */ + border_color_type = 3; + } + + rstate->val[0] = si_tex_wrap(state->wrap_s) | + si_tex_wrap(state->wrap_t) << 3 | + si_tex_wrap(state->wrap_r) << 6 | + (state->max_anisotropy & 0x7) << 9 | /* XXX */ + si_tex_compare(state->compare_func) << 12 | + !state->normalized_coords << 15 | + aniso_flag_offset << 16 | /* XXX */ + !state->seamless_cube_map << 28 | + si_tex_mipfilter(state->min_mip_filter) << 29; + rstate->val[1] = S_FIXED(CLAMP(state->min_lod, 0, 15), 8) | + S_FIXED(CLAMP(state->max_lod, 0, 15), 8) << 12; + rstate->val[2] = S_FIXED(CLAMP(state->lod_bias, -16, 16), 8) | + si_tex_filter(state->mag_img_filter) << 20 | + si_tex_filter(state->min_img_filter) << 22; + rstate->val[3] = border_color_type << 30; + +#if 0 + if (border_color_type == 3) { + r600_pipe_state_add_reg_noblock(rstate, R_00A404_TD_PS_SAMPLER0_BORDER_RED, fui(state->border_color.f[0]), NULL, 0); + r600_pipe_state_add_reg_noblock(rstate, R_00A408_TD_PS_SAMPLER0_BORDER_GREEN, fui(state->border_color.f[1]), NULL, 0); + r600_pipe_state_add_reg_noblock(rstate, R_00A40C_TD_PS_SAMPLER0_BORDER_BLUE, fui(state->border_color.f[2]), NULL, 0); + r600_pipe_state_add_reg_noblock(rstate, R_00A410_TD_PS_SAMPLER0_BORDER_ALPHA, fui(state->border_color.f[3]), NULL, 0); + } +#endif + return rstate; +} + +static void si_delete_sampler_state(struct pipe_context *ctx, + void *state) +{ + free(state); +} + +static unsigned si_map_swizzle(unsigned swizzle) +{ + switch (swizzle) { + case UTIL_FORMAT_SWIZZLE_Y: + return V_008F1C_SQ_SEL_Y; + case UTIL_FORMAT_SWIZZLE_Z: + return V_008F1C_SQ_SEL_Z; 
+ case UTIL_FORMAT_SWIZZLE_W: + return V_008F1C_SQ_SEL_W; + case UTIL_FORMAT_SWIZZLE_0: + return V_008F1C_SQ_SEL_0; + case UTIL_FORMAT_SWIZZLE_1: + return V_008F1C_SQ_SEL_1; + default: /* UTIL_FORMAT_SWIZZLE_X */ + return V_008F1C_SQ_SEL_X; + } +} + +static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state) +{ + struct si_pipe_sampler_view *view = CALLOC_STRUCT(si_pipe_sampler_view); + struct r600_resource_texture *tmp = (struct r600_resource_texture*)texture; + const struct util_format_description *desc = util_format_description(state->format); + unsigned format, num_format, endian; + uint32_t pitch = 0; + unsigned char state_swizzle[4], swizzle[4], array_mode = 0, tile_type = 0; + unsigned height, depth; + int first_non_void; + uint64_t va; + + if (view == NULL) + return NULL; + + /* initialize base object */ + view->base = *state; + view->base.texture = NULL; + pipe_reference(NULL, &texture->reference); + view->base.texture = texture; + view->base.reference.count = 1; + view->base.context = ctx; + + state_swizzle[0] = state->swizzle_r; + state_swizzle[1] = state->swizzle_g; + state_swizzle[2] = state->swizzle_b; + state_swizzle[3] = state->swizzle_a; + util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle); + + first_non_void = util_format_get_first_non_void_channel(state->format); + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_FLOAT: + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + break; + case UTIL_FORMAT_TYPE_FIXED: + num_format = V_008F14_IMG_NUM_FORMAT_USCALED; /* XXX */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + num_format = V_008F14_IMG_NUM_FORMAT_SNORM; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + default: + num_format = V_008F14_IMG_NUM_FORMAT_UNORM; + } + + format = si_translate_texformat(ctx->screen, state->format, desc, first_non_void); + if (format == ~0) { + format = 0; + } + + if (tmp->depth && 
!tmp->is_flushing_texture) { + r600_texture_depth_flush(ctx, texture, TRUE); + tmp = tmp->flushed_depth_texture; + } + + endian = si_colorformat_endian_swap(format); + + height = texture->height0; + depth = texture->depth0; + + pitch = align(tmp->pitch_in_blocks[0] * + util_format_get_blockwidth(state->format), 8); + array_mode = tmp->array_mode[0]; + tile_type = tmp->tile_type; + + if (texture->target == PIPE_TEXTURE_1D_ARRAY) { + height = 1; + depth = texture->array_size; + } else if (texture->target == PIPE_TEXTURE_2D_ARRAY) { + depth = texture->array_size; + } + + va = r600_resource_va(ctx->screen, texture); + view->state[0] = (va + tmp->offset[0]) >> 8; + view->state[1] = ((va + tmp->offset[0]) >> 40) & 0xff; + view->state[1] |= (S_008F14_DATA_FORMAT(format) | + S_008F14_NUM_FORMAT(num_format)); + view->state[2] = (S_008F18_WIDTH(texture->width0 - 1) | + S_008F18_HEIGHT(height - 1)); + view->state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(state->u.tex.first_level) | + S_008F1C_LAST_LEVEL(state->u.tex.last_level) | + S_008F1C_TYPE(si_tex_dim(texture->target))); + view->state[4] = (S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH((pitch / 8) - 1)); + view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) | + S_008F24_LAST_ARRAY(state->u.tex.last_layer)); + view->state[6] = 0; + view->state[7] = 0; + + return &view->base; +} + +static void evergreen_set_vs_sampler_view(struct pipe_context *ctx, unsigned count, + struct pipe_sampler_view **views) +{ +} + +static void evergreen_set_ps_sampler_view(struct pipe_context *ctx, unsigned count, + struct pipe_sampler_view **views) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct si_pipe_sampler_view **resource = (struct si_pipe_sampler_view **)views; + struct r600_pipe_state *rstate = &rctx->ps_samplers.rstate; 
+ struct r600_resource *bo; + int i; + int has_depth = 0; + uint64_t va; + char *ptr; + + if (!count) + goto out; + + r600_inval_texture_cache(rctx); + + bo = (struct r600_resource*) + pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, + count * sizeof(resource[0]->state)); + ptr = rctx->ws->buffer_map(bo->buf, rctx->cs, PIPE_TRANSFER_WRITE); + + for (i = 0; i < count; i++, ptr += sizeof(resource[0]->state)) { + pipe_sampler_view_reference( + (struct pipe_sampler_view **)&rctx->ps_samplers.views[i], + views[i]); + + if (resource[i]) { + if (((struct r600_resource_texture *)resource[i]->base.texture)->depth) + has_depth = 1; + + memcpy(ptr, resource[i]->state, sizeof(resource[0]->state)); + } else + memset(ptr, 0, sizeof(resource[0]->state)); + } + + rctx->ws->buffer_unmap(bo->buf); + + for (i = count; i < NUM_TEX_UNITS; i++) { + if (rctx->ps_samplers.views[i]) + pipe_sampler_view_reference((struct pipe_sampler_view **)&rctx->ps_samplers.views[i], NULL); + } + + va = r600_resource_va(ctx->screen, (void *)bo); + r600_pipe_state_add_reg(rstate, R_00B040_SPI_SHADER_USER_DATA_PS_4, va, bo, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, R_00B044_SPI_SHADER_USER_DATA_PS_5, va >> 32, NULL, 0); + r600_context_pipe_state_set(rctx, rstate); + +out: + rctx->have_depth_texture = has_depth; + rctx->ps_samplers.n_views = count; +} + +static void evergreen_bind_ps_sampler(struct pipe_context *ctx, unsigned count, void **states) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct si_pipe_sampler_state **rstates = (struct si_pipe_sampler_state **)states; + struct r600_pipe_state *rstate = &rctx->ps_samplers.rstate; + struct r600_resource *bo; + uint64_t va; + char *ptr; + int i; + + if (!count) + goto out; + + r600_inval_texture_cache(rctx); + + bo = (struct r600_resource*) + pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, + count * sizeof(rstates[0]->val)); + ptr = rctx->ws->buffer_map(bo->buf, rctx->cs, 
PIPE_TRANSFER_WRITE); + + for (i = 0; i < count; i++, ptr += sizeof(rstates[0]->val)) { + memcpy(ptr, rstates[i]->val, sizeof(rstates[0]->val)); + } + + rctx->ws->buffer_unmap(bo->buf); + + va = r600_resource_va(ctx->screen, (void *)bo); + r600_pipe_state_add_reg(rstate, R_00B038_SPI_SHADER_USER_DATA_PS_2, va, bo, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, R_00B03C_SPI_SHADER_USER_DATA_PS_3, va >> 32, NULL, 0); + r600_context_pipe_state_set(rctx, rstate); + +out: + rctx->ps_samplers.n_samplers = count; +} + +static void evergreen_bind_vs_sampler(struct pipe_context *ctx, unsigned count, void **states) +{ +} + +static void evergreen_set_clip_state(struct pipe_context *ctx, + const struct pipe_clip_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state); + + if (rstate == NULL) + return; + + rctx->clip = *state; + rstate->id = R600_PIPE_STATE_CLIP; + for (int i = 0; i < 6; i++) { + r600_pipe_state_add_reg(rstate, + R_0285BC_PA_CL_UCP_0_X + i * 16, + fui(state->ucp[i][0]), NULL, 0); + r600_pipe_state_add_reg(rstate, + R_0285C0_PA_CL_UCP_0_Y + i * 16, + fui(state->ucp[i][1]) , NULL, 0); + r600_pipe_state_add_reg(rstate, + R_0285C4_PA_CL_UCP_0_Z + i * 16, + fui(state->ucp[i][2]), NULL, 0); + r600_pipe_state_add_reg(rstate, + R_0285C8_PA_CL_UCP_0_W + i * 16, + fui(state->ucp[i][3]), NULL, 0); + } + + free(rctx->states[R600_PIPE_STATE_CLIP]); + rctx->states[R600_PIPE_STATE_CLIP] = rstate; + r600_context_pipe_state_set(rctx, rstate); +} + +static void evergreen_set_polygon_stipple(struct pipe_context *ctx, + const struct pipe_poly_stipple *state) +{ +} + +static void evergreen_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) +{ +} + +static void evergreen_set_scissor_state(struct pipe_context *ctx, + const struct pipe_scissor_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = 
CALLOC_STRUCT(r600_pipe_state); + uint32_t tl, br; + + if (rstate == NULL) + return; + + rstate->id = R600_PIPE_STATE_SCISSOR; + tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny); + br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy); + r600_pipe_state_add_reg(rstate, + R_028210_PA_SC_CLIPRECT_0_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028214_PA_SC_CLIPRECT_0_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028218_PA_SC_CLIPRECT_1_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_02821C_PA_SC_CLIPRECT_1_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028220_PA_SC_CLIPRECT_2_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028224_PA_SC_CLIPRECT_2_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028228_PA_SC_CLIPRECT_3_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_02822C_PA_SC_CLIPRECT_3_BR, br, + NULL, 0); + + free(rctx->states[R600_PIPE_STATE_SCISSOR]); + rctx->states[R600_PIPE_STATE_SCISSOR] = rstate; + r600_context_pipe_state_set(rctx, rstate); +} + +static void evergreen_set_viewport_state(struct pipe_context *ctx, + const struct pipe_viewport_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state); + + if (rstate == NULL) + return; + + rctx->viewport = *state; + rstate->id = R600_PIPE_STATE_VIEWPORT; + r600_pipe_state_add_reg(rstate, R_0282D0_PA_SC_VPORT_ZMIN_0, 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_0282D4_PA_SC_VPORT_ZMAX_0, 0x3F800000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028350_PA_SC_RASTER_CONFIG, 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_02843C_PA_CL_VPORT_XSCALE_0, fui(state->scale[0]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028444_PA_CL_VPORT_YSCALE_0, fui(state->scale[1]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_02844C_PA_CL_VPORT_ZSCALE_0, fui(state->scale[2]), NULL, 0); + r600_pipe_state_add_reg(rstate, 
R_028440_PA_CL_VPORT_XOFFSET_0, fui(state->translate[0]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028448_PA_CL_VPORT_YOFFSET_0, fui(state->translate[1]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028450_PA_CL_VPORT_ZOFFSET_0, fui(state->translate[2]), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028818_PA_CL_VTE_CNTL, 0x0000043F, NULL, 0); + + free(rctx->states[R600_PIPE_STATE_VIEWPORT]); + rctx->states[R600_PIPE_STATE_VIEWPORT] = rstate; + r600_context_pipe_state_set(rctx, rstate); +} + +static void evergreen_cb(struct r600_context *rctx, struct r600_pipe_state *rstate, + const struct pipe_framebuffer_state *state, int cb) +{ + struct r600_resource_texture *rtex; + struct r600_surface *surf; + unsigned level = state->cbufs[cb]->u.tex.level; + unsigned pitch, slice; + unsigned color_info; + unsigned format, swap, ntype, endian; + uint64_t offset; + unsigned tile_type; + const struct util_format_description *desc; + int i; + unsigned blend_clamp = 0, blend_bypass = 0; + + surf = (struct r600_surface *)state->cbufs[cb]; + rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture; + + if (rtex->depth) + rctx->have_depth_fb = TRUE; + + if (rtex->depth && !rtex->is_flushing_texture) { + r600_texture_depth_flush(&rctx->context, state->cbufs[cb]->texture, TRUE); + rtex = rtex->flushed_depth_texture; + } + + /* XXX quite sure for dx10+ hw don't need any offset hacks */ + offset = r600_texture_get_offset(rtex, + level, state->cbufs[cb]->u.tex.first_layer); + pitch = rtex->pitch_in_blocks[level] / 8 - 1; + slice = rtex->pitch_in_blocks[level] * surf->aligned_height / 64 - 1; + desc = util_format_description(surf->base.format); + for (i = 0; i < 4; i++) { + if (desc->channel[i].type != UTIL_FORMAT_TYPE_VOID) { + break; + } + } + if (desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT) { + ntype = V_028C70_NUMBER_FLOAT; + } else { + ntype = V_028C70_NUMBER_UNORM; + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + ntype = V_028C70_NUMBER_SRGB; + else if 
(desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) { + if (desc->channel[i].normalized) + ntype = V_028C70_NUMBER_SNORM; + else if (desc->channel[i].pure_integer) + ntype = V_028C70_NUMBER_SINT; + } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) { + if (desc->channel[i].normalized) + ntype = V_028C70_NUMBER_UNORM; + else if (desc->channel[i].pure_integer) + ntype = V_028C70_NUMBER_UINT; + } + } + + format = si_translate_colorformat(surf->base.format); + swap = si_translate_colorswap(surf->base.format); + if (rtex->resource.b.b.b.usage == PIPE_USAGE_STAGING) { + endian = V_028C70_ENDIAN_NONE; + } else { + endian = si_colorformat_endian_swap(format); + } + + /* blend clamp should be set for all NORM/SRGB types */ + if (ntype == V_028C70_NUMBER_UNORM || + ntype == V_028C70_NUMBER_SNORM || + ntype == V_028C70_NUMBER_SRGB) + blend_clamp = 1; + + /* set blend bypass according to docs if SINT/UINT or + 8/24 COLOR variants */ + if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT || + format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 || + format == V_028C70_COLOR_X24_8_32_FLOAT) { + blend_clamp = 0; + blend_bypass = 1; + } + + color_info = S_028C70_FORMAT(format) | + S_028C70_COMP_SWAP(swap) | + //S_028C70_ARRAY_MODE(rtex->array_mode[level]) | + S_028C70_BLEND_CLAMP(blend_clamp) | + S_028C70_BLEND_BYPASS(blend_bypass) | + S_028C70_NUMBER_TYPE(ntype) | + S_028C70_ENDIAN(endian); + + color_info |= S_028C70_LINEAR_GENERAL(1); + + rctx->alpha_ref_dirty = true; + + offset += r600_resource_va(rctx->context.screen, state->cbufs[cb]->texture); + offset >>= 8; + + /* FIXME handle enabling of CB beyond BASE8 which has different offset */ + r600_pipe_state_add_reg(rstate, + R_028C60_CB_COLOR0_BASE + cb * 0x3C, + offset, &rtex->resource, RADEON_USAGE_READWRITE); + r600_pipe_state_add_reg(rstate, + R_028C64_CB_COLOR0_PITCH + cb * 0x3C, + S_028C64_TILE_MAX(pitch), + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028C68_CB_COLOR0_SLICE + cb * 
0x3C, + S_028C68_TILE_MAX(slice), + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028C6C_CB_COLOR0_VIEW + cb * 0x3C, + 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028C70_CB_COLOR0_INFO + cb * 0x3C, + color_info, &rtex->resource, RADEON_USAGE_READWRITE); + r600_pipe_state_add_reg(rstate, + R_028C74_CB_COLOR0_ATTRIB + cb * 0x3C, + 0, + &rtex->resource, RADEON_USAGE_READWRITE); +} + +static void evergreen_db(struct r600_context *rctx, struct r600_pipe_state *rstate, + const struct pipe_framebuffer_state *state) +{ + struct r600_resource_texture *rtex; + struct r600_surface *surf; + unsigned level, first_layer, pitch, slice, format, array_mode; + uint64_t offset; + + if (state->zsbuf == NULL) { + r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO, 0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO, 0, NULL, 0); + return; + } + + surf = (struct r600_surface *)state->zsbuf; + level = surf->base.u.tex.level; + rtex = (struct r600_resource_texture*)surf->base.texture; + + /* XXX remove this once tiling is properly supported */ + array_mode = 0;/*rtex->array_mode[level] ? 
rtex->array_mode[level] : + V_028C70_ARRAY_1D_TILED_THIN1;*/ + + first_layer = surf->base.u.tex.first_layer; + offset = r600_texture_get_offset(rtex, level, first_layer); + pitch = rtex->pitch_in_blocks[level] / 8 - 1; + slice = rtex->pitch_in_blocks[level] * surf->aligned_height / 64 - 1; + format = si_translate_dbformat(rtex->real_format); + + offset += r600_resource_va(rctx->context.screen, surf->base.texture); + offset >>= 8; + + r600_pipe_state_add_reg(rstate, R_028048_DB_Z_READ_BASE, + offset, &rtex->resource, RADEON_USAGE_READWRITE); + r600_pipe_state_add_reg(rstate, R_028050_DB_Z_WRITE_BASE, + offset, &rtex->resource, RADEON_USAGE_READWRITE); + r600_pipe_state_add_reg(rstate, R_028008_DB_DEPTH_VIEW, 0x00000000, NULL, 0); + + if (rtex->stencil) { + uint64_t stencil_offset = + r600_texture_get_offset(rtex->stencil, level, first_layer); + + stencil_offset += r600_resource_va(rctx->context.screen, (void*)rtex->stencil); + stencil_offset >>= 8; + + r600_pipe_state_add_reg(rstate, R_02804C_DB_STENCIL_READ_BASE, + stencil_offset, &rtex->stencil->resource, RADEON_USAGE_READWRITE); + r600_pipe_state_add_reg(rstate, R_028054_DB_STENCIL_WRITE_BASE, + stencil_offset, &rtex->stencil->resource, RADEON_USAGE_READWRITE); + r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO, + 1, NULL, 0); + } else { + r600_pipe_state_add_reg(rstate, R_028044_DB_STENCIL_INFO, + 0, NULL, 0); + } + + r600_pipe_state_add_reg(rstate, R_02803C_DB_DEPTH_INFO, 0x1, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028040_DB_Z_INFO, + /*S_028040_ARRAY_MODE(array_mode) |*/ S_028040_FORMAT(format), + NULL, 0); + r600_pipe_state_add_reg(rstate, R_028058_DB_DEPTH_SIZE, + S_028058_PITCH_TILE_MAX(pitch), + NULL, 0); + r600_pipe_state_add_reg(rstate, R_02805C_DB_DEPTH_SLICE, + S_02805C_SLICE_TILE_MAX(slice), + NULL, 0); +} + +static void evergreen_set_framebuffer_state(struct pipe_context *ctx, + const struct pipe_framebuffer_state *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + 
struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state); + uint32_t shader_mask, tl, br; + int tl_x, tl_y, br_x, br_y; + + if (rstate == NULL) + return; + + r600_flush_framebuffer(rctx, false); + + /* unreference old buffer and reference new one */ + rstate->id = R600_PIPE_STATE_FRAMEBUFFER; + + util_copy_framebuffer_state(&rctx->framebuffer, state); + + /* build states */ + rctx->have_depth_fb = 0; + rctx->nr_cbufs = state->nr_cbufs; + for (int i = 0; i < state->nr_cbufs; i++) { + evergreen_cb(rctx, rstate, state, i); + } + evergreen_db(rctx, rstate, state); + + shader_mask = 0; + for (int i = 0; i < state->nr_cbufs; i++) { + shader_mask |= 0xf << (i * 4); + } + tl_x = 0; + tl_y = 0; + br_x = state->width; + br_y = state->height; +#if 0 /* These shouldn't be necessary on SI, see PA_SC_ENHANCE register */ + /* EG hw workaround */ + if (br_x == 0) + tl_x = 1; + if (br_y == 0) + tl_y = 1; + /* cayman hw workaround */ + if (rctx->chip_class == CAYMAN) { + if (br_x == 1 && br_y == 1) + br_x = 2; + } +#endif + tl = S_028240_TL_X(tl_x) | S_028240_TL_Y(tl_y); + br = S_028244_BR_X(br_x) | S_028244_BR_Y(br_y); + + r600_pipe_state_add_reg(rstate, + R_028240_PA_SC_GENERIC_SCISSOR_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028244_PA_SC_GENERIC_SCISSOR_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028250_PA_SC_VPORT_SCISSOR_0_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028254_PA_SC_VPORT_SCISSOR_0_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028030_PA_SC_SCREEN_SCISSOR_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028034_PA_SC_SCREEN_SCISSOR_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028204_PA_SC_WINDOW_SCISSOR_TL, tl, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028208_PA_SC_WINDOW_SCISSOR_BR, br, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028200_PA_SC_WINDOW_OFFSET, 0x00000000, + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028230_PA_SC_EDGERULE, 0xAAAAAAAA, + NULL, 0); + + 
r600_pipe_state_add_reg(rstate, R_02823C_CB_SHADER_MASK, + shader_mask, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028BE0_PA_SC_AA_CONFIG, + 0x00000000, NULL, 0); + + free(rctx->states[R600_PIPE_STATE_FRAMEBUFFER]); + rctx->states[R600_PIPE_STATE_FRAMEBUFFER] = rstate; + r600_context_pipe_state_set(rctx, rstate); + + if (state->zsbuf) { + cayman_polygon_offset_update(rctx); + } +} + +void cayman_init_state_functions(struct r600_context *rctx) +{ + rctx->context.create_blend_state = evergreen_create_blend_state; + rctx->context.create_depth_stencil_alpha_state = evergreen_create_dsa_state; + rctx->context.create_fs_state = si_create_shader_state; + rctx->context.create_rasterizer_state = evergreen_create_rs_state; + rctx->context.create_sampler_state = si_create_sampler_state; + rctx->context.create_sampler_view = evergreen_create_sampler_view; + rctx->context.create_vertex_elements_state = si_create_vertex_elements; + rctx->context.create_vs_state = si_create_shader_state; + rctx->context.bind_blend_state = r600_bind_blend_state; + rctx->context.bind_depth_stencil_alpha_state = r600_bind_dsa_state; + rctx->context.bind_fragment_sampler_states = evergreen_bind_ps_sampler; + rctx->context.bind_fs_state = r600_bind_ps_shader; + rctx->context.bind_rasterizer_state = r600_bind_rs_state; + rctx->context.bind_vertex_elements_state = r600_bind_vertex_elements; + rctx->context.bind_vertex_sampler_states = evergreen_bind_vs_sampler; + rctx->context.bind_vs_state = r600_bind_vs_shader; + rctx->context.delete_blend_state = r600_delete_state; + rctx->context.delete_depth_stencil_alpha_state = r600_delete_state; + rctx->context.delete_fs_state = r600_delete_ps_shader; + rctx->context.delete_rasterizer_state = r600_delete_rs_state; + rctx->context.delete_sampler_state = si_delete_sampler_state; + rctx->context.delete_vertex_elements_state = r600_delete_vertex_element; + rctx->context.delete_vs_state = r600_delete_vs_shader; + rctx->context.set_blend_color = 
evergreen_set_blend_color; + rctx->context.set_clip_state = evergreen_set_clip_state; + rctx->context.set_constant_buffer = r600_set_constant_buffer; + rctx->context.set_fragment_sampler_views = evergreen_set_ps_sampler_view; + rctx->context.set_framebuffer_state = evergreen_set_framebuffer_state; + rctx->context.set_polygon_stipple = evergreen_set_polygon_stipple; + rctx->context.set_sample_mask = evergreen_set_sample_mask; + rctx->context.set_scissor_state = evergreen_set_scissor_state; + rctx->context.set_stencil_ref = r600_set_pipe_stencil_ref; + rctx->context.set_vertex_buffers = r600_set_vertex_buffers; + rctx->context.set_index_buffer = r600_set_index_buffer; + rctx->context.set_vertex_sampler_views = evergreen_set_vs_sampler_view; + rctx->context.set_viewport_state = evergreen_set_viewport_state; + rctx->context.sampler_view_destroy = r600_sampler_view_destroy; + rctx->context.redefine_user_buffer = u_default_redefine_user_buffer; + rctx->context.texture_barrier = r600_texture_barrier; + rctx->context.create_stream_output_target = r600_create_so_target; + rctx->context.stream_output_target_destroy = r600_so_target_destroy; + rctx->context.set_stream_output_targets = r600_set_so_targets; +} + +void si_init_config(struct r600_context *rctx) +{ + struct r600_pipe_state *rstate = &rctx->config; + unsigned tmp; + + r600_pipe_state_add_reg(rstate, R_028A4C_PA_SC_MODE_CNTL_1, 0x0, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028A10_VGT_OUTPUT_PATH_CNTL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A14_VGT_HOS_CNTL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A18_VGT_HOS_MAX_TESS_LEVEL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A20_VGT_HOS_REUSE_DEPTH, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A24_VGT_GROUP_PRIM_TYPE, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A28_VGT_GROUP_FIRST_DECR, 0x0, NULL, 0); + 
r600_pipe_state_add_reg(rstate, R_028A2C_VGT_GROUP_DECR, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A30_VGT_GROUP_VECT_0_CNTL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A34_VGT_GROUP_VECT_1_CNTL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A38_VGT_GROUP_VECT_0_FMT_CNTL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A40_VGT_GS_MODE, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A84_VGT_PRIMITIVEID_EN, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028B94_VGT_STRMOUT_CONFIG, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028AA8_IA_MULTI_VGT_PARAM, S_028AA8_SWITCH_ON_EOP(1) | S_028AA8_PARTIAL_VS_WAVE_ON(1) | S_028AA8_PRIMGROUP_SIZE(63), NULL, 0); + r600_pipe_state_add_reg(rstate, R_028AB4_VGT_REUSE_OFF, 0x00000000, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028AB8_VGT_VTX_CNT_EN, 0x0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028810_PA_CL_CLIP_CNTL, 0x0, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028B54_VGT_SHADER_STAGES_EN, 0, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 0x76543210, NULL, 0); + r600_pipe_state_add_reg(rstate, R_028BD8_PA_SC_CENTROID_PRIORITY_1, 0xfedcba98, NULL, 0); + + r600_pipe_state_add_reg(rstate, R_028804_DB_EQAA, 0x110000, NULL, 0); + r600_context_pipe_state_set(rctx, rstate); +} + +void cayman_polygon_offset_update(struct r600_context *rctx) +{ + struct r600_pipe_state state; + + state.id = R600_PIPE_STATE_POLYGON_OFFSET; + state.nregs = 0; + if (rctx->rasterizer && rctx->framebuffer.zsbuf) { + float offset_units = rctx->rasterizer->offset_units; + unsigned offset_db_fmt_cntl = 0, depth; + + switch 
(rctx->framebuffer.zsbuf->texture->format) { + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + depth = -24; + offset_units *= 2.0f; + break; + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + depth = -23; + offset_units *= 1.0f; + offset_db_fmt_cntl |= S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1); + break; + case PIPE_FORMAT_Z16_UNORM: + depth = -16; + offset_units *= 4.0f; + break; + default: + return; + } + /* FIXME some of those reg can be computed with cso */ + offset_db_fmt_cntl |= S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(depth); + r600_pipe_state_add_reg(&state, + R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, + fui(rctx->rasterizer->offset_scale), NULL, 0); + r600_pipe_state_add_reg(&state, + R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, + fui(offset_units), NULL, 0); + r600_pipe_state_add_reg(&state, + R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, + fui(rctx->rasterizer->offset_scale), NULL, 0); + r600_pipe_state_add_reg(&state, + R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, + fui(offset_units), NULL, 0); + r600_pipe_state_add_reg(&state, + R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, + offset_db_fmt_cntl, NULL, 0); + r600_context_pipe_state_set(rctx, &state); + } +} + +void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = &shader->rstate; + struct r600_shader *rshader = &shader->shader; + unsigned i, exports_ps, num_cout, spi_ps_in_control, db_shader_control; + int pos_index = -1, face_index = -1; + int ninterp = 0; + boolean have_linear = FALSE, have_centroid = FALSE, have_perspective = FALSE; + unsigned spi_baryc_cntl; + uint64_t va; + + if (si_pipe_shader_create(ctx, shader)) + return; + + rstate->nregs = 0; + + db_shader_control = S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); + for (i = 0; i < rshader->ninput; i++) { + /* evergreen NUM_INTERP only contains values interpolated into the LDS, + POSITION goes via GPRs from the 
SC so isn't counted */ + if (rshader->input[i].name == TGSI_SEMANTIC_POSITION) + pos_index = i; + else if (rshader->input[i].name == TGSI_SEMANTIC_FACE) + face_index = i; + else { + ninterp++; + if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR) + have_linear = TRUE; + if (rshader->input[i].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) + have_perspective = TRUE; + if (rshader->input[i].centroid) + have_centroid = TRUE; + } + } + + for (i = 0; i < rshader->noutput; i++) { + if (rshader->output[i].name == TGSI_SEMANTIC_POSITION) + db_shader_control |= S_02880C_Z_EXPORT_ENABLE(1); + if (rshader->output[i].name == TGSI_SEMANTIC_STENCIL) + db_shader_control |= 0; // XXX OP_VAL or TEST_VAL? + } + if (rshader->uses_kill) + db_shader_control |= S_02880C_KILL_ENABLE(1); + + exports_ps = 0; + num_cout = 0; + for (i = 0; i < rshader->noutput; i++) { + if (rshader->output[i].name == TGSI_SEMANTIC_POSITION || + rshader->output[i].name == TGSI_SEMANTIC_STENCIL) + exports_ps |= 1; + else if (rshader->output[i].name == TGSI_SEMANTIC_COLOR) { + if (rshader->fs_write_all) + num_cout = rshader->nr_cbufs; + else + num_cout++; + } + } + if (!exports_ps) { + /* always at least export 1 component per pixel */ + exports_ps = 2; + } + + if (ninterp == 0) { + ninterp = 1; + have_perspective = TRUE; + } + + if (!have_perspective && !have_linear) + have_perspective = TRUE; + + spi_ps_in_control = S_0286D8_NUM_INTERP(ninterp); + + spi_baryc_cntl = 0; + if (have_perspective) + spi_baryc_cntl |= have_centroid ? + S_0286E0_PERSP_CENTROID_CNTL(1) : S_0286E0_PERSP_CENTER_CNTL(1); + if (have_linear) + spi_baryc_cntl |= have_centroid ? 
+ S_0286E0_LINEAR_CENTROID_CNTL(1) : S_0286E0_LINEAR_CENTER_CNTL(1); + + r600_pipe_state_add_reg(rstate, + R_0286E0_SPI_BARYC_CNTL, + spi_baryc_cntl, + NULL, 0); + + r600_pipe_state_add_reg(rstate, + R_0286CC_SPI_PS_INPUT_ENA, + shader->spi_ps_input_ena, + NULL, 0); + + r600_pipe_state_add_reg(rstate, + R_0286D0_SPI_PS_INPUT_ADDR, + shader->spi_ps_input_ena, + NULL, 0); + + r600_pipe_state_add_reg(rstate, + R_0286D8_SPI_PS_IN_CONTROL, + spi_ps_in_control, + NULL, 0); + + /* XXX: Depends on Z buffer format? */ + r600_pipe_state_add_reg(rstate, + R_028710_SPI_SHADER_Z_FORMAT, + 0, + NULL, 0); + + /* XXX: Depends on color buffer format? */ + r600_pipe_state_add_reg(rstate, + R_028714_SPI_SHADER_COL_FORMAT, + S_028714_COL0_EXPORT_FORMAT(V_028714_SPI_SHADER_32_ABGR), + NULL, 0); + + va = r600_resource_va(ctx->screen, (void *)shader->bo); + r600_pipe_state_add_reg(rstate, + R_00B020_SPI_SHADER_PGM_LO_PS, + va >> 8, + shader->bo, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, + R_00B024_SPI_SHADER_PGM_HI_PS, + va >> 40, + shader->bo, RADEON_USAGE_READ); + + /* Last 2 reserved SGPRs are used for VCC */ + /* XXX: Hard-coding 2 SGPRs for constant buffer */ + r600_pipe_state_add_reg(rstate, + R_00B028_SPI_SHADER_PGM_RSRC1_PS, + S_00B028_VGPRS(shader->num_vgprs / 4) | + S_00B028_SGPRS((shader->num_sgprs + 2 + 2 + 1) / 8), + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_00B02C_SPI_SHADER_PGM_RSRC2_PS, + S_00B02C_USER_SGPR(6), + NULL, 0); + + r600_pipe_state_add_reg(rstate, R_02880C_DB_SHADER_CONTROL, + db_shader_control, + NULL, 0); + + shader->sprite_coord_enable = rctx->sprite_coord_enable; +} + +void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = &shader->rstate; + struct r600_shader *rshader = &shader->shader; + unsigned nparams, i; + uint64_t va; + + if (si_pipe_shader_create(ctx, shader)) + return; + + /* clear previous register */ + 
rstate->nregs = 0; + + /* Certain attributes (position, psize, etc.) don't count as params. + * VS is required to export at least one param and r600_shader_from_tgsi() + * takes care of adding a dummy export. + */ + for (nparams = 0, i = 0 ; i < rshader->noutput; i++) { + if (rshader->output[i].name != TGSI_SEMANTIC_POSITION) + nparams++; + } + if (nparams < 1) + nparams = 1; + + r600_pipe_state_add_reg(rstate, + R_0286C4_SPI_VS_OUT_CONFIG, + S_0286C4_VS_EXPORT_COUNT(nparams - 1), + NULL, 0); + + r600_pipe_state_add_reg(rstate, + R_02870C_SPI_SHADER_POS_FORMAT, + S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS1_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS2_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) | + S_02870C_POS3_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP), + NULL, 0); + + va = r600_resource_va(ctx->screen, (void *)shader->bo); + r600_pipe_state_add_reg(rstate, + R_00B120_SPI_SHADER_PGM_LO_VS, + va >> 8, + shader->bo, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, + R_00B124_SPI_SHADER_PGM_HI_VS, + va >> 40, + shader->bo, RADEON_USAGE_READ); + + /* Last 2 reserved SGPRs are used for VCC */ + /* XXX: Hard-coding 2 SGPRs for constant buffer */ + r600_pipe_state_add_reg(rstate, + R_00B128_SPI_SHADER_PGM_RSRC1_VS, + S_00B128_VGPRS(shader->num_vgprs / 4) | + S_00B128_SGPRS((shader->num_sgprs + 2 + 2 + 2) / 8), + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_00B12C_SPI_SHADER_PGM_RSRC2_VS, + S_00B12C_USER_SGPR(2 + 2), + NULL, 0); +} + +void si_update_spi_map(struct r600_context *rctx) +{ + struct r600_shader *ps = &rctx->ps_shader->shader; + struct r600_shader *vs = &rctx->vs_shader->shader; + struct r600_pipe_state *rstate = &rctx->spi; + unsigned i, j, tmp; + + rstate->nregs = 0; + + for (i = 0; i < ps->ninput; i++) { + tmp = 0; + + if (ps->input[i].name == TGSI_SEMANTIC_COLOR || + ps->input[i].name == TGSI_SEMANTIC_BCOLOR || + ps->input[i].name == TGSI_SEMANTIC_POSITION) { + tmp |= S_028644_FLAT_SHADE(1); + } + + if 
(ps->input[i].name == TGSI_SEMANTIC_GENERIC && + rctx->sprite_coord_enable & (1 << ps->input[i].sid)) { + tmp |= S_028644_PT_SPRITE_TEX(1); + } + + for (j = 0; j < vs->noutput; j++) { + if (ps->input[i].name == vs->output[j].name && + ps->input[i].sid == vs->output[j].sid) { + tmp |= S_028644_OFFSET(ps->input[i].sid); + break; + } + } + + if (j == vs->noutput) { + /* No corresponding output found, load defaults into input */ + tmp |= S_028644_OFFSET(0x20); + } + + r600_pipe_state_add_reg(rstate, R_028644_SPI_PS_INPUT_CNTL_0 + i * 4, + tmp, NULL, 0); + } + + if (rstate->nregs > 0) + r600_context_pipe_state_set(rctx, rstate); +} + +void *cayman_create_db_flush_dsa(struct r600_context *rctx) +{ + struct pipe_depth_stencil_alpha_state dsa; + struct r600_pipe_state *rstate; + + memset(&dsa, 0, sizeof(dsa)); + + rstate = rctx->context.create_depth_stencil_alpha_state(&rctx->context, &dsa); + r600_pipe_state_add_reg(rstate, + R_028000_DB_RENDER_CONTROL, + S_028000_DEPTH_COPY(1) | + S_028000_STENCIL_COPY(1) | + S_028000_COPY_CENTROID(1), + NULL, 0); + return rstate; +} diff --git a/src/gallium/drivers/radeonsi/r600.h b/src/gallium/drivers/radeonsi/r600.h new file mode 100644 index 00000000000..56915ab966f --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600.h @@ -0,0 +1,245 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jerome Glisse + */ +#ifndef R600_H +#define R600_H + +#include "../../winsys/radeon/drm/radeon_winsys.h" +#include "util/u_double_list.h" +#include "util/u_vbuf.h" + +#define R600_ERR(fmt, args...) \ + fprintf(stderr, "EE %s:%d %s - "fmt, __FILE__, __LINE__, __func__, ##args) + +struct winsys_handle; + +enum radeon_family { + CHIP_UNKNOWN, + CHIP_CAYMAN, + CHIP_TAHITI, + CHIP_PITCAIRN, + CHIP_VERDE, + CHIP_LAST, +}; + +enum chip_class { + CAYMAN, + TAHITI, +}; + +struct r600_tiling_info { + unsigned num_channels; + unsigned num_banks; + unsigned group_bytes; +}; + +struct r600_resource { + struct u_vbuf_resource b; + + /* Winsys objects. */ + struct pb_buffer *buf; + struct radeon_winsys_cs_handle *cs_buf; + + /* Resource state. 
*/ + unsigned domains; +}; + +/* R600/R700 STATES */ +#define R600_GROUP_MAX 16 +#define R600_BLOCK_MAX_BO 32 +#define R600_BLOCK_MAX_REG 128 + +/* each range covers 9 bits of dword space = 512 dwords = 2k bytes */ +/* there is a block entry for each register so 512 blocks */ +/* we have no registers to read/write below 0x8000 (0x2000 in dw space) */ +/* we use some fake offsets at 0x40000 to do evergreen sampler borders so take 0x42000 as a max bound*/ +#define RANGE_OFFSET_START 0x8000 +#define HASH_SHIFT 9 +#define NUM_RANGES (0x42000 - RANGE_OFFSET_START) / (4 << HASH_SHIFT) /* 128 << 9 = 64k */ + +#define CTX_RANGE_ID(offset) ((((offset - RANGE_OFFSET_START) >> 2) >> HASH_SHIFT) & 255) +#define CTX_BLOCK_ID(offset) (((offset - RANGE_OFFSET_START) >> 2) & ((1 << HASH_SHIFT) - 1)) + +struct r600_pipe_reg { + uint32_t value; + struct r600_block *block; + struct r600_resource *bo; + enum radeon_bo_usage bo_usage; + uint32_t id; +}; + +struct r600_pipe_state { + unsigned id; + unsigned nregs; + struct r600_pipe_reg regs[R600_BLOCK_MAX_REG]; +}; + +#define R600_BLOCK_STATUS_ENABLED (1 << 0) +#define R600_BLOCK_STATUS_DIRTY (1 << 1) + +struct r600_block_reloc { + struct r600_resource *bo; + enum radeon_bo_usage bo_usage; + unsigned bo_pm4_index; +}; + +struct r600_block { + struct list_head list; + struct list_head enable_list; + unsigned status; + unsigned flags; + unsigned start_offset; + unsigned pm4_ndwords; + unsigned nbo; + uint16_t nreg; + uint16_t nreg_dirty; + uint32_t *reg; + uint32_t pm4[R600_BLOCK_MAX_REG]; + unsigned pm4_bo_index[R600_BLOCK_MAX_REG]; + struct r600_block_reloc reloc[R600_BLOCK_MAX_BO]; +}; + +struct r600_range { + struct r600_block **blocks; +}; + +struct r600_query { + union { + uint64_t u64; + boolean b; + struct pipe_query_data_so_statistics so; + } result; + /* The kind of query */ + unsigned type; + /* Offset of the first result for current query */ + unsigned results_start; + /* Offset of the next free result after current query 
data */ + unsigned results_end; + /* Size of the result in memory for both begin_query and end_query, + * this can be one or two numbers, or it could even be a size of a structure. */ + unsigned result_size; + /* The buffer where query results are stored. It's used as a ring, + * data blocks for current query are stored sequentially from + * results_start to results_end, with wrapping on the buffer end */ + struct r600_resource *buffer; + /* The number of dwords for begin_query or end_query. */ + unsigned num_cs_dw; + /* linked list of queries */ + struct list_head list; +}; + +struct r600_so_target { + struct pipe_stream_output_target b; + + /* The buffer where BUFFER_FILLED_SIZE is stored. */ + struct r600_resource *filled_size; + unsigned stride; + unsigned so_index; +}; + +#define R600_CONTEXT_DRAW_PENDING (1 << 0) +#define R600_CONTEXT_DST_CACHES_DIRTY (1 << 1) +#define R600_CONTEXT_CHECK_EVENT_FLUSH (1 << 2) + +struct r600_draw { + uint32_t vgt_num_indices; + uint32_t vgt_num_instances; + uint32_t vgt_index_type; + uint32_t vgt_draw_initiator; + uint32_t indices_bo_offset; + unsigned db_render_override; + unsigned db_render_control; + struct r600_resource *indices; +}; + +struct r600_context; +struct r600_screen; + +void r600_get_backend_mask(struct r600_context *ctx); +void r600_context_fini(struct r600_context *ctx); +void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state); +void r600_context_flush(struct r600_context *ctx, unsigned flags); +void r600_context_draw(struct r600_context *ctx, const struct r600_draw *draw); + +struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type); +void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query); +boolean r600_context_query_result(struct r600_context *ctx, + struct r600_query *query, + boolean wait, void *vresult); +void r600_query_begin(struct r600_context *ctx, struct r600_query *query); +void r600_query_end(struct 
r600_context *ctx, struct r600_query *query); +void r600_context_queries_suspend(struct r600_context *ctx); +void r600_context_queries_resume(struct r600_context *ctx); +void r600_query_predication(struct r600_context *ctx, struct r600_query *query, int operation, + int flag_wait); +void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence, + unsigned offset, unsigned value); +void r600_inval_shader_cache(struct r600_context *ctx); +void r600_inval_texture_cache(struct r600_context *ctx); +void r600_inval_vertex_cache(struct r600_context *ctx); +void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now); + +void r600_context_streamout_begin(struct r600_context *ctx); +void r600_context_streamout_end(struct r600_context *ctx); +void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t); +void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in); +void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block); +void r600_context_block_resource_emit_dirty(struct r600_context *ctx, struct r600_block *block); + +int si_context_init(struct r600_context *ctx); +void evergreen_context_draw(struct r600_context *ctx, const struct r600_draw *draw); + +void _r600_pipe_state_add_reg(struct r600_context *ctx, + struct r600_pipe_state *state, + uint32_t offset, uint32_t value, + uint32_t range_id, uint32_t block_id, + struct r600_resource *bo, + enum radeon_bo_usage usage); + +void r600_pipe_state_add_reg_noblock(struct r600_pipe_state *state, + uint32_t offset, uint32_t value, + struct r600_resource *bo, + enum radeon_bo_usage usage); + +#define r600_pipe_state_add_reg(state, offset, value, bo, usage) _r600_pipe_state_add_reg(rctx, state, offset, value, CTX_RANGE_ID(offset), CTX_BLOCK_ID(offset), bo, usage) + +static inline void r600_pipe_state_mod_reg(struct r600_pipe_state *state, + uint32_t value) +{ + state->regs[state->nregs].value = value; + 
state->nregs++; +} + +static inline void r600_pipe_state_mod_reg_bo(struct r600_pipe_state *state, + uint32_t value, struct r600_resource *bo, + enum radeon_bo_usage usage) +{ + state->regs[state->nregs].value = value; + state->regs[state->nregs].bo = bo; + state->regs[state->nregs].bo_usage = usage; + state->nregs++; +} + +#endif diff --git a/src/gallium/drivers/radeonsi/r600_blit.c b/src/gallium/drivers/radeonsi/r600_blit.c new file mode 100644 index 00000000000..65158089acb --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_blit.c @@ -0,0 +1,379 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "util/u_surface.h" +#include "util/u_blitter.h" +#include "util/u_format.h" +#include "radeonsi_pipe.h" + +enum r600_blitter_op /* bitmask */ +{ + R600_SAVE_TEXTURES = 1, + R600_SAVE_FRAMEBUFFER = 2, + R600_DISABLE_RENDER_COND = 4, + + R600_CLEAR = 0, + + R600_CLEAR_SURFACE = R600_SAVE_FRAMEBUFFER, + + R600_COPY = R600_SAVE_FRAMEBUFFER | R600_SAVE_TEXTURES | + R600_DISABLE_RENDER_COND, + + R600_DECOMPRESS = R600_SAVE_FRAMEBUFFER | R600_DISABLE_RENDER_COND, +}; + +static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + r600_context_queries_suspend(rctx); + + util_blitter_save_blend(rctx->blitter, rctx->states[R600_PIPE_STATE_BLEND]); + util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->states[R600_PIPE_STATE_DSA]); + if (rctx->states[R600_PIPE_STATE_STENCIL_REF]) { + util_blitter_save_stencil_ref(rctx->blitter, &rctx->stencil_ref); + } + util_blitter_save_rasterizer(rctx->blitter, rctx->states[R600_PIPE_STATE_RASTERIZER]); + util_blitter_save_fragment_shader(rctx->blitter, rctx->ps_shader); + util_blitter_save_vertex_shader(rctx->blitter, rctx->vs_shader); + util_blitter_save_vertex_elements(rctx->blitter, rctx->vertex_elements); + if (rctx->states[R600_PIPE_STATE_VIEWPORT]) { + util_blitter_save_viewport(rctx->blitter, &rctx->viewport); + } + util_blitter_save_vertex_buffers(rctx->blitter, + rctx->vbuf_mgr->nr_vertex_buffers, + rctx->vbuf_mgr->vertex_buffer); + util_blitter_save_so_targets(rctx->blitter, rctx->num_so_targets, + (struct pipe_stream_output_target**)rctx->so_targets); + + if (op & R600_SAVE_FRAMEBUFFER) + util_blitter_save_framebuffer(rctx->blitter, &rctx->framebuffer); + + if (op & R600_SAVE_TEXTURES) { + util_blitter_save_fragment_sampler_states( + rctx->blitter, rctx->ps_samplers.n_samplers, + (void**)rctx->ps_samplers.samplers); + + util_blitter_save_fragment_sampler_views( + rctx->blitter, rctx->ps_samplers.n_views, + (struct 
pipe_sampler_view**)rctx->ps_samplers.views); + } + + if ((op & R600_DISABLE_RENDER_COND) && rctx->current_render_cond) { + rctx->saved_render_cond = rctx->current_render_cond; + rctx->saved_render_cond_mode = rctx->current_render_cond_mode; + rctx->context.render_condition(&rctx->context, NULL, 0); + } + +} + +static void r600_blitter_end(struct pipe_context *ctx) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + if (rctx->saved_render_cond) { + rctx->context.render_condition(&rctx->context, + rctx->saved_render_cond, + rctx->saved_render_cond_mode); + rctx->saved_render_cond = NULL; + } + r600_context_queries_resume(rctx); +} + +static unsigned u_num_layers(struct pipe_resource *r, unsigned level) +{ + switch (r->target) { + case PIPE_TEXTURE_CUBE: + return 6; + case PIPE_TEXTURE_3D: + return u_minify(r->depth0, level); + case PIPE_TEXTURE_1D_ARRAY: + return r->array_size; + case PIPE_TEXTURE_2D_ARRAY: + return r->array_size; + default: + return 1; + } +} + +void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + unsigned layer, level; + float depth = 1.0f; + + if (!texture->dirty_db) + return; + + for (level = 0; level <= texture->resource.b.b.b.last_level; level++) { + unsigned num_layers = u_num_layers(&texture->resource.b.b.b, level); + + for (layer = 0; layer < num_layers; layer++) { + struct pipe_surface *zsurf, *cbsurf, surf_tmpl; + + surf_tmpl.format = texture->real_format; + surf_tmpl.u.tex.level = level; + surf_tmpl.u.tex.first_layer = layer; + surf_tmpl.u.tex.last_layer = layer; + surf_tmpl.usage = PIPE_BIND_DEPTH_STENCIL; + + zsurf = ctx->create_surface(ctx, &texture->resource.b.b.b, &surf_tmpl); + + surf_tmpl.format = texture->flushed_depth_texture->real_format; + surf_tmpl.usage = PIPE_BIND_RENDER_TARGET; + cbsurf = ctx->create_surface(ctx, + (struct pipe_resource*)texture->flushed_depth_texture, &surf_tmpl); + + 
r600_blitter_begin(ctx, R600_DECOMPRESS); + util_blitter_custom_depth_stencil(rctx->blitter, zsurf, cbsurf, rctx->custom_dsa_flush, depth); + r600_blitter_end(ctx); + + pipe_surface_reference(&zsurf, NULL); + pipe_surface_reference(&cbsurf, NULL); + } + } + + texture->dirty_db = FALSE; +} + +void r600_flush_depth_textures(struct r600_context *rctx) +{ + unsigned int i; + + /* FIXME: This handles fragment shader textures only. */ + + for (i = 0; i < rctx->ps_samplers.n_views; ++i) { + struct si_pipe_sampler_view *view; + struct r600_resource_texture *tex; + + view = rctx->ps_samplers.views[i]; + if (!view) continue; + + tex = (struct r600_resource_texture *)view->base.texture; + if (!tex->depth) + continue; + + if (tex->is_flushing_texture) + continue; + + r600_blit_uncompress_depth(&rctx->context, tex); + } + + /* also check CB here */ + for (i = 0; i < rctx->framebuffer.nr_cbufs; i++) { + struct r600_resource_texture *tex; + tex = (struct r600_resource_texture *)rctx->framebuffer.cbufs[i]->texture; + + if (!tex->depth) + continue; + + if (tex->is_flushing_texture) + continue; + + r600_blit_uncompress_depth(&rctx->context, tex); + } +} + +static void r600_clear(struct pipe_context *ctx, unsigned buffers, + const union pipe_color_union *color, + double depth, unsigned stencil) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct pipe_framebuffer_state *fb = &rctx->framebuffer; + + r600_blitter_begin(ctx, R600_CLEAR); + util_blitter_clear(rctx->blitter, fb->width, fb->height, + fb->nr_cbufs, buffers, fb->nr_cbufs ? 
fb->cbufs[0]->format : PIPE_FORMAT_NONE, + color, depth, stencil); + r600_blitter_end(ctx); +} + +static void r600_clear_render_target(struct pipe_context *ctx, + struct pipe_surface *dst, + const union pipe_color_union *color, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + r600_blitter_begin(ctx, R600_CLEAR_SURFACE); + util_blitter_clear_render_target(rctx->blitter, dst, color, + dstx, dsty, width, height); + r600_blitter_end(ctx); +} + +static void r600_clear_depth_stencil(struct pipe_context *ctx, + struct pipe_surface *dst, + unsigned clear_flags, + double depth, + unsigned stencil, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + r600_blitter_begin(ctx, R600_CLEAR_SURFACE); + util_blitter_clear_depth_stencil(rctx->blitter, dst, clear_flags, depth, stencil, + dstx, dsty, width, height); + r600_blitter_end(ctx); +} + + + +/* Copy a block of pixels from one surface to another using HW. 
*/ +static void r600_hw_copy_region(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + r600_blitter_begin(ctx, R600_COPY); + util_blitter_copy_texture(rctx->blitter, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box, TRUE); + r600_blitter_end(ctx); +} + +struct texture_orig_info { + unsigned format; + unsigned width0; + unsigned height0; +}; + +static void r600_compressed_to_blittable(struct pipe_resource *tex, + unsigned level, + struct texture_orig_info *orig) +{ + struct r600_resource_texture *rtex = (struct r600_resource_texture*)tex; + unsigned pixsize = util_format_get_blocksize(rtex->real_format); + int new_format; + int new_height, new_width; + + orig->format = tex->format; + orig->width0 = tex->width0; + orig->height0 = tex->height0; + + if (pixsize == 8) + new_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */ + else + new_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */ + + new_width = util_format_get_nblocksx(tex->format, orig->width0); + new_height = util_format_get_nblocksy(tex->format, orig->height0); + + tex->width0 = new_width; + tex->height0 = new_height; + tex->format = new_format; +} + +static void r600_reset_blittable_to_compressed(struct pipe_resource *tex, + struct texture_orig_info *orig) +{ + tex->format = orig->format; + tex->width0 = orig->width0; + tex->height0 = orig->height0; +} + +static void r600_resource_copy_region(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct r600_resource_texture *rsrc = (struct r600_resource_texture*)src; + struct texture_orig_info orig_info[2]; + struct pipe_box sbox; + const struct pipe_box 
*psbox; + boolean restore_orig[2]; + + memset(orig_info, 0, sizeof(orig_info)); + + /* Fallback for buffers. */ + if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) { + util_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, + src, src_level, src_box); + return; + } + + if (rsrc->depth && !rsrc->is_flushing_texture) + r600_texture_depth_flush(ctx, src, FALSE); + + restore_orig[0] = restore_orig[1] = FALSE; + + if (util_format_is_compressed(src->format)) { + r600_compressed_to_blittable(src, src_level, &orig_info[0]); + restore_orig[0] = TRUE; + sbox.x = util_format_get_nblocksx(orig_info[0].format, src_box->x); + sbox.y = util_format_get_nblocksy(orig_info[0].format, src_box->y); + sbox.z = src_box->z; + sbox.width = util_format_get_nblocksx(orig_info[0].format, src_box->width); + sbox.height = util_format_get_nblocksy(orig_info[0].format, src_box->height); + sbox.depth = src_box->depth; + psbox=&sbox; + } else + psbox=src_box; + + if (util_format_is_compressed(dst->format)) { + r600_compressed_to_blittable(dst, dst_level, &orig_info[1]); + restore_orig[1] = TRUE; + /* translate the dst box as well */ + dstx = util_format_get_nblocksx(orig_info[1].format, dstx); + dsty = util_format_get_nblocksy(orig_info[1].format, dsty); + } + + r600_hw_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, + src, src_level, psbox); + + if (restore_orig[0]) + r600_reset_blittable_to_compressed(src, &orig_info[0]); + + if (restore_orig[1]) + r600_reset_blittable_to_compressed(dst, &orig_info[1]); +} + +void r600_init_blit_functions(struct r600_context *rctx) +{ + rctx->context.clear = r600_clear; + rctx->context.clear_render_target = r600_clear_render_target; + rctx->context.clear_depth_stencil = r600_clear_depth_stencil; + rctx->context.resource_copy_region = r600_resource_copy_region; +} + +void r600_blit_push_depth(struct pipe_context *ctx, struct r600_resource_texture *texture) +{ + struct pipe_box sbox; + + sbox.x = sbox.y = sbox.z = 0; + sbox.width = 
texture->resource.b.b.b.width0; + sbox.height = texture->resource.b.b.b.height0; + /* XXX that might be wrong */ + sbox.depth = 1; + + r600_hw_copy_region(ctx, (struct pipe_resource *)texture, 0, + 0, 0, 0, + (struct pipe_resource *)texture->flushed_depth_texture, 0, + &sbox); +} diff --git a/src/gallium/drivers/radeonsi/r600_buffer.c b/src/gallium/drivers/radeonsi/r600_buffer.c new file mode 100644 index 00000000000..bb885df8dcd --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_buffer.c @@ -0,0 +1,282 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Jerome Glisse + * Corbin Simpson <[email protected]> + */ +#include <byteswap.h> + +#include "pipe/p_screen.h" +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "util/u_upload_mgr.h" + +#include "r600.h" +#include "radeonsi_pipe.h" + +static void r600_buffer_destroy(struct pipe_screen *screen, + struct pipe_resource *buf) +{ + struct r600_screen *rscreen = (struct r600_screen*)screen; + struct r600_resource *rbuffer = r600_resource(buf); + + pb_reference(&rbuffer->buf, NULL); + util_slab_free(&rscreen->pool_buffers, rbuffer); +} + +static struct pipe_transfer *r600_get_transfer(struct pipe_context *ctx, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box) +{ + struct r600_context *rctx = (struct r600_context*)ctx; + struct pipe_transfer *transfer = util_slab_alloc(&rctx->pool_transfers); + + transfer->resource = resource; + transfer->level = level; + transfer->usage = usage; + transfer->box = *box; + transfer->stride = 0; + transfer->layer_stride = 0; + transfer->data = NULL; + + /* Note strides are zero, this is ok for buffers, but not for + * textures 2d & higher at least. 
+ */ + return transfer; +} + +static void *r600_buffer_transfer_map(struct pipe_context *pipe, + struct pipe_transfer *transfer) +{ + struct r600_resource *rbuffer = r600_resource(transfer->resource); + struct r600_context *rctx = (struct r600_context*)pipe; + uint8_t *data; + + if (rbuffer->b.user_ptr) + return (uint8_t*)rbuffer->b.user_ptr + transfer->box.x; + + data = rctx->ws->buffer_map(rbuffer->buf, rctx->cs, transfer->usage); + if (!data) + return NULL; + + return (uint8_t*)data + transfer->box.x; +} + +static void r600_buffer_transfer_unmap(struct pipe_context *pipe, + struct pipe_transfer *transfer) +{ + struct r600_resource *rbuffer = r600_resource(transfer->resource); + struct r600_context *rctx = (struct r600_context*)pipe; + + if (rbuffer->b.user_ptr) + return; + + rctx->ws->buffer_unmap(rbuffer->buf); +} + +static void r600_buffer_transfer_flush_region(struct pipe_context *pipe, + struct pipe_transfer *transfer, + const struct pipe_box *box) +{ +} + +static void r600_transfer_destroy(struct pipe_context *ctx, + struct pipe_transfer *transfer) +{ + struct r600_context *rctx = (struct r600_context*)ctx; + util_slab_free(&rctx->pool_transfers, transfer); +} + +static void r600_buffer_transfer_inline_write(struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned layer_stride) +{ + struct r600_context *rctx = (struct r600_context*)pipe; + struct r600_resource *rbuffer = r600_resource(resource); + uint8_t *map = NULL; + + assert(rbuffer->b.user_ptr == NULL); + + map = rctx->ws->buffer_map(rbuffer->buf, rctx->cs, + PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD_RANGE | usage); + + memcpy(map + box->x, data, box->width); + + rctx->ws->buffer_unmap(rbuffer->buf); +} + +static const struct u_resource_vtbl r600_buffer_vtbl = +{ + u_default_resource_get_handle, /* get_handle */ + r600_buffer_destroy, /* resource_destroy */ + r600_get_transfer, /* 
get_transfer */ + r600_transfer_destroy, /* transfer_destroy */ + r600_buffer_transfer_map, /* transfer_map */ + r600_buffer_transfer_flush_region, /* transfer_flush_region */ + r600_buffer_transfer_unmap, /* transfer_unmap */ + r600_buffer_transfer_inline_write /* transfer_inline_write */ +}; + +bool r600_init_resource(struct r600_screen *rscreen, + struct r600_resource *res, + unsigned size, unsigned alignment, + unsigned bind, unsigned usage) +{ + uint32_t initial_domain, domains; + + /* Staging resources particpate in transfers and blits only + * and are used for uploads and downloads from regular + * resources. We generate them internally for some transfers. + */ + if (usage == PIPE_USAGE_STAGING) { + domains = RADEON_DOMAIN_GTT; + initial_domain = RADEON_DOMAIN_GTT; + } else { + domains = RADEON_DOMAIN_GTT | RADEON_DOMAIN_VRAM; + + switch(usage) { + case PIPE_USAGE_DYNAMIC: + case PIPE_USAGE_STREAM: + case PIPE_USAGE_STAGING: + initial_domain = RADEON_DOMAIN_GTT; + break; + case PIPE_USAGE_DEFAULT: + case PIPE_USAGE_STATIC: + case PIPE_USAGE_IMMUTABLE: + default: + initial_domain = RADEON_DOMAIN_VRAM; + break; + } + } + + res->buf = rscreen->ws->buffer_create(rscreen->ws, size, alignment, bind, initial_domain); + if (!res->buf) { + return false; + } + + res->cs_buf = rscreen->ws->buffer_get_cs_handle(res->buf); + res->domains = domains; + return true; +} + +struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + struct r600_screen *rscreen = (struct r600_screen*)screen; + struct r600_resource *rbuffer; + /* XXX We probably want a different alignment for buffers and textures. 
*/ + unsigned alignment = 4096; + + rbuffer = util_slab_alloc(&rscreen->pool_buffers); + + rbuffer->b.b.b = *templ; + pipe_reference_init(&rbuffer->b.b.b.reference, 1); + rbuffer->b.b.b.screen = screen; + rbuffer->b.b.vtbl = &r600_buffer_vtbl; + rbuffer->b.user_ptr = NULL; + + if (!r600_init_resource(rscreen, rbuffer, templ->width0, alignment, templ->bind, templ->usage)) { + util_slab_free(&rscreen->pool_buffers, rbuffer); + return NULL; + } + return &rbuffer->b.b.b; +} + +struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen, + void *ptr, unsigned bytes, + unsigned bind) +{ + struct r600_screen *rscreen = (struct r600_screen*)screen; + struct r600_resource *rbuffer; + + rbuffer = util_slab_alloc(&rscreen->pool_buffers); + + pipe_reference_init(&rbuffer->b.b.b.reference, 1); + rbuffer->b.b.vtbl = &r600_buffer_vtbl; + rbuffer->b.b.b.screen = screen; + rbuffer->b.b.b.target = PIPE_BUFFER; + rbuffer->b.b.b.format = PIPE_FORMAT_R8_UNORM; + rbuffer->b.b.b.usage = PIPE_USAGE_IMMUTABLE; + rbuffer->b.b.b.bind = bind; + rbuffer->b.b.b.width0 = bytes; + rbuffer->b.b.b.height0 = 1; + rbuffer->b.b.b.depth0 = 1; + rbuffer->b.b.b.array_size = 1; + rbuffer->b.b.b.flags = 0; + rbuffer->b.user_ptr = ptr; + rbuffer->buf = NULL; + return &rbuffer->b.b.b; +} + +void r600_upload_index_buffer(struct r600_context *rctx, + struct pipe_index_buffer *ib, unsigned count) +{ + struct r600_resource *rbuffer = r600_resource(ib->buffer); + + u_upload_data(rctx->vbuf_mgr->uploader, 0, count * ib->index_size, + rbuffer->b.user_ptr, &ib->offset, &ib->buffer); +} + +void r600_upload_const_buffer(struct r600_context *rctx, struct r600_resource **rbuffer, + uint32_t *const_offset) +{ + if ((*rbuffer)->b.user_ptr) { + uint8_t *ptr = (*rbuffer)->b.user_ptr; + unsigned size = (*rbuffer)->b.b.b.width0; + + *rbuffer = NULL; + + if (R600_BIG_ENDIAN) { + uint32_t *tmpPtr; + unsigned i; + + if (!(tmpPtr = malloc(size))) { + R600_ERR("Failed to allocate BE swap buffer.\n"); + return; + } + + 
for (i = 0; i < size / 4; ++i) { + tmpPtr[i] = bswap_32(((uint32_t *)ptr)[i]); + } + + u_upload_data(rctx->vbuf_mgr->uploader, 0, size, tmpPtr, const_offset, + (struct pipe_resource**)rbuffer); + + free(tmpPtr); + } else { + u_upload_data(rctx->vbuf_mgr->uploader, 0, size, ptr, const_offset, + (struct pipe_resource**)rbuffer); + } + } else { + *const_offset = 0; + } +} diff --git a/src/gallium/drivers/radeonsi/r600_hw_context.c b/src/gallium/drivers/radeonsi/r600_hw_context.c new file mode 100644 index 00000000000..494b0d34283 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_hw_context.c @@ -0,0 +1,1151 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Jerome Glisse + */ +#include "r600_hw_context_priv.h" +#include "radeonsi_pipe.h" +#include "sid.h" +#include "util/u_memory.h" +#include <errno.h> + +#define GROUP_FORCE_NEW_BLOCK 0 + +/* Get backends mask */ +void r600_get_backend_mask(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + struct r600_resource *buffer; + uint32_t *results; + unsigned num_backends = ctx->screen->info.r600_num_backends; + unsigned i, mask = 0; + + /* if backend_map query is supported by the kernel */ + if (ctx->screen->info.r600_backend_map_valid) { + unsigned num_tile_pipes = ctx->screen->info.r600_num_tile_pipes; + unsigned backend_map = ctx->screen->info.r600_backend_map; + unsigned item_width, item_mask; + + if (ctx->chip_class >= CAYMAN) { + item_width = 4; + item_mask = 0x7; + } + + while(num_tile_pipes--) { + i = backend_map & item_mask; + mask |= (1<<i); + backend_map >>= item_width; + } + if (mask != 0) { + ctx->backend_mask = mask; + return; + } + } + + /* otherwise backup path for older kernels */ + + /* create buffer for event data */ + buffer = (struct r600_resource*) + pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_STAGING, ctx->max_db*16); + if (!buffer) + goto err; + + /* initialize buffer with zeroes */ + results = ctx->ws->buffer_map(buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE); + if (results) { + uint64_t va = 0; + + memset(results, 0, ctx->max_db * 4 * 4); + ctx->ws->buffer_unmap(buffer->buf); + + /* emit EVENT_WRITE for ZPASS_DONE */ + va = r600_resource_va(&ctx->screen->screen, (void *)buffer); + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1); + cs->buf[cs->cdw++] = va; + cs->buf[cs->cdw++] = va >> 32; + + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, buffer, RADEON_USAGE_WRITE); + + /* analyze results */ + results = ctx->ws->buffer_map(buffer->buf, ctx->cs, 
PIPE_TRANSFER_READ); + if (results) { + for(i = 0; i < ctx->max_db; i++) { + /* at least highest bit will be set if backend is used */ + if (results[i*4 + 1]) + mask |= (1<<i); + } + ctx->ws->buffer_unmap(buffer->buf); + } + } + + pipe_resource_reference((struct pipe_resource**)&buffer, NULL); + + if (mask != 0) { + ctx->backend_mask = mask; + return; + } + +err: + /* fallback to old method - set num_backends lower bits to 1 */ + ctx->backend_mask = (~((uint32_t)0))>>(32-num_backends); + return; +} + +static inline void r600_context_ps_partial_flush(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + + if (!(ctx->flags & R600_CONTEXT_DRAW_PENDING)) + return; + + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); + + ctx->flags &= ~R600_CONTEXT_DRAW_PENDING; +} + +void r600_init_cs(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + + /* All asics require this one */ + cs->buf[cs->cdw++] = PKT3(PKT3_CONTEXT_CONTROL, 1, 0); + cs->buf[cs->cdw++] = 0x80000000; + cs->buf[cs->cdw++] = 0x80000000; + + ctx->init_dwords = cs->cdw; +} + +static void r600_init_block(struct r600_context *ctx, + struct r600_block *block, + const struct r600_reg *reg, int index, int nreg, + unsigned opcode, unsigned offset_base) +{ + int i = index; + int j, n = nreg; + + /* initialize block */ + block->flags = 0; + block->status |= R600_BLOCK_STATUS_DIRTY; /* dirty all blocks at start */ + block->start_offset = reg[i].offset; + block->pm4[block->pm4_ndwords++] = PKT3(opcode, n, 0); + block->pm4[block->pm4_ndwords++] = (block->start_offset - offset_base) >> 2; + block->reg = &block->pm4[block->pm4_ndwords]; + block->pm4_ndwords += n; + block->nreg = n; + block->nreg_dirty = n; + LIST_INITHEAD(&block->list); + LIST_INITHEAD(&block->enable_list); + + for (j = 0; j < n; j++) { + if (reg[i+j].flags & REG_FLAG_DIRTY_ALWAYS) { + block->flags |= REG_FLAG_DIRTY_ALWAYS; + } + if 
(reg[i+j].flags & REG_FLAG_ENABLE_ALWAYS) { + if (!(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_ENABLED; + LIST_ADDTAIL(&block->enable_list, &ctx->enable_list); + LIST_ADDTAIL(&block->list,&ctx->dirty); + } + } + if (reg[i+j].flags & REG_FLAG_FLUSH_CHANGE) { + block->flags |= REG_FLAG_FLUSH_CHANGE; + } + + if (reg[i+j].flags & REG_FLAG_NEED_BO) { + block->nbo++; + assert(block->nbo < R600_BLOCK_MAX_BO); + block->pm4_bo_index[j] = block->nbo; + block->pm4[block->pm4_ndwords++] = PKT3(PKT3_NOP, 0, 0); + block->pm4[block->pm4_ndwords++] = 0x00000000; + block->reloc[block->nbo].bo_pm4_index = block->pm4_ndwords - 1; + } + } + /* check that we stay in limit */ + assert(block->pm4_ndwords < R600_BLOCK_MAX_REG); +} + +int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg, + unsigned opcode, unsigned offset_base) +{ + struct r600_block *block; + struct r600_range *range; + int offset; + + for (unsigned i = 0, n = 0; i < nreg; i += n) { + /* ignore new block balise */ + if (reg[i].offset == GROUP_FORCE_NEW_BLOCK) { + n = 1; + continue; + } + + /* register that need relocation are in their own group */ + /* find number of consecutive registers */ + n = 0; + offset = reg[i].offset; + while (reg[i + n].offset == offset) { + n++; + offset += 4; + if ((n + i) >= nreg) + break; + if (n >= (R600_BLOCK_MAX_REG - 2)) + break; + } + + /* allocate new block */ + block = calloc(1, sizeof(struct r600_block)); + if (block == NULL) { + return -ENOMEM; + } + ctx->nblocks++; + for (int j = 0; j < n; j++) { + range = &ctx->range[CTX_RANGE_ID(reg[i + j].offset)]; + /* create block table if it doesn't exist */ + if (!range->blocks) + range->blocks = calloc(1 << HASH_SHIFT, sizeof(void *)); + if (!range->blocks) + return -1; + + range->blocks[CTX_BLOCK_ID(reg[i + j].offset)] = block; + } + + r600_init_block(ctx, block, reg, i, n, opcode, offset_base); + + } + return 0; +} + + +/* initialize */ +void 
r600_context_fini(struct r600_context *ctx) +{ + struct r600_block *block; + struct r600_range *range; + + for (int i = 0; i < NUM_RANGES; i++) { + if (!ctx->range[i].blocks) + continue; + for (int j = 0; j < (1 << HASH_SHIFT); j++) { + block = ctx->range[i].blocks[j]; + if (block) { + for (int k = 0, offset = block->start_offset; k < block->nreg; k++, offset += 4) { + range = &ctx->range[CTX_RANGE_ID(offset)]; + range->blocks[CTX_BLOCK_ID(offset)] = NULL; + } + for (int k = 1; k <= block->nbo; k++) { + pipe_resource_reference((struct pipe_resource**)&block->reloc[k].bo, NULL); + } + free(block); + } + } + free(ctx->range[i].blocks); + } + free(ctx->range); + free(ctx->blocks); + ctx->ws->cs_destroy(ctx->cs); +} + +int r600_setup_block_table(struct r600_context *ctx) +{ + /* setup block table */ + int c = 0; + ctx->blocks = calloc(ctx->nblocks, sizeof(void*)); + if (!ctx->blocks) + return -ENOMEM; + for (int i = 0; i < NUM_RANGES; i++) { + if (!ctx->range[i].blocks) + continue; + for (int j = 0, add; j < (1 << HASH_SHIFT); j++) { + if (!ctx->range[i].blocks[j]) + continue; + + add = 1; + for (int k = 0; k < c; k++) { + if (ctx->blocks[k] == ctx->range[i].blocks[j]) { + add = 0; + break; + } + } + if (add) { + assert(c < ctx->nblocks); + ctx->blocks[c++] = ctx->range[i].blocks[j]; + j += (ctx->range[i].blocks[j]->nreg) - 1; + } + } + } + + return 0; +} + +void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, + boolean count_draw_in) +{ + struct r600_atom *state; + + /* The number of dwords we already used in the CS so far. */ + num_dw += ctx->cs->cdw; + + if (count_draw_in) { + /* The number of dwords all the dirty states would take. */ + LIST_FOR_EACH_ENTRY(state, &ctx->dirty_states, head) { + num_dw += state->num_dw; + } + + num_dw += ctx->pm4_dirty_cdwords; + + /* The upper-bound of how much a draw command would take. */ + num_dw += R600_MAX_DRAW_CS_DWORDS; + } + + /* Count in queries_suspend. 
*/ + num_dw += ctx->num_cs_dw_queries_suspend; + + /* Count in streamout_end at the end of CS. */ + num_dw += ctx->num_cs_dw_streamout_end; + + /* Count in render_condition(NULL) at the end of CS. */ + if (ctx->predicate_drawing) { + num_dw += 3; + } + + /* Count in framebuffer cache flushes at the end of CS. */ + num_dw += 7; /* one SURFACE_SYNC and CACHE_FLUSH_AND_INV (r6xx-only) */ + + /* Save 16 dwords for the fence mechanism. */ + num_dw += 16; + + /* Flush if there's not enough space. */ + if (num_dw > RADEON_MAX_CMDBUF_DWORDS) { + radeonsi_flush(&ctx->context, NULL, RADEON_FLUSH_ASYNC); + } +} + +void r600_context_dirty_block(struct r600_context *ctx, + struct r600_block *block, + int dirty, int index) +{ + if ((index + 1) > block->nreg_dirty) + block->nreg_dirty = index + 1; + + if ((dirty != (block->status & R600_BLOCK_STATUS_DIRTY)) || !(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_DIRTY; + ctx->pm4_dirty_cdwords += block->pm4_ndwords; + if (!(block->status & R600_BLOCK_STATUS_ENABLED)) { + block->status |= R600_BLOCK_STATUS_ENABLED; + LIST_ADDTAIL(&block->enable_list, &ctx->enable_list); + } + LIST_ADDTAIL(&block->list,&ctx->dirty); + + if (block->flags & REG_FLAG_FLUSH_CHANGE) { + r600_context_ps_partial_flush(ctx); + } + } +} + +void r600_context_pipe_state_set(struct r600_context *ctx, struct r600_pipe_state *state) +{ + struct r600_block *block; + int dirty; + for (int i = 0; i < state->nregs; i++) { + unsigned id, reloc_id; + struct r600_pipe_reg *reg = &state->regs[i]; + + block = reg->block; + id = reg->id; + + dirty = block->status & R600_BLOCK_STATUS_DIRTY; + + if (reg->value != block->reg[id]) { + block->reg[id] = reg->value; + dirty |= R600_BLOCK_STATUS_DIRTY; + } + if (block->flags & REG_FLAG_DIRTY_ALWAYS) + dirty |= R600_BLOCK_STATUS_DIRTY; + if (block->pm4_bo_index[id]) { + /* find relocation */ + reloc_id = block->pm4_bo_index[id]; + pipe_resource_reference((struct 
pipe_resource**)&block->reloc[reloc_id].bo, &reg->bo->b.b.b); + block->reloc[reloc_id].bo_usage = reg->bo_usage; + /* always force dirty for relocs for now */ + dirty |= R600_BLOCK_STATUS_DIRTY; + } + + if (dirty) + r600_context_dirty_block(ctx, block, dirty, id); + } +} + +struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset) +{ + struct r600_range *range; + struct r600_block *block; + unsigned id; + + range = &ctx->range[CTX_RANGE_ID(offset)]; + block = range->blocks[CTX_BLOCK_ID(offset)]; + offset -= block->start_offset; + id = block->pm4_bo_index[offset >> 2]; + if (block->reloc[id].bo) { + return block->reloc[id].bo; + } + return NULL; +} + +void r600_context_block_emit_dirty(struct r600_context *ctx, struct r600_block *block) +{ + struct radeon_winsys_cs *cs = ctx->cs; + int optional = block->nbo == 0 && !(block->flags & REG_FLAG_DIRTY_ALWAYS); + int cp_dwords = block->pm4_ndwords, start_dword = 0; + int new_dwords = 0; + int nbo = block->nbo; + + if (block->nreg_dirty == 0 && optional) { + goto out; + } + + if (nbo) { + ctx->flags |= R600_CONTEXT_CHECK_EVENT_FLUSH; + + for (int j = 0; j < block->nreg; j++) { + if (block->pm4_bo_index[j]) { + /* find relocation */ + struct r600_block_reloc *reloc = &block->reloc[block->pm4_bo_index[j]]; + block->pm4[reloc->bo_pm4_index] = + r600_context_bo_reloc(ctx, reloc->bo, reloc->bo_usage); + nbo--; + if (nbo == 0) + break; + } + } + ctx->flags &= ~R600_CONTEXT_CHECK_EVENT_FLUSH; + } + + optional &= (block->nreg_dirty != block->nreg); + if (optional) { + new_dwords = block->nreg_dirty; + start_dword = cs->cdw; + cp_dwords = new_dwords + 2; + } + memcpy(&cs->buf[cs->cdw], block->pm4, cp_dwords * 4); + cs->cdw += cp_dwords; + + if (optional) { + uint32_t newword; + + newword = cs->buf[start_dword]; + newword &= PKT_COUNT_C; + newword |= PKT_COUNT_S(new_dwords); + cs->buf[start_dword] = newword; + } +out: + block->status ^= R600_BLOCK_STATUS_DIRTY; + block->nreg_dirty = 0; + 
LIST_DELINIT(&block->list); +} + +void r600_inval_shader_cache(struct r600_context *ctx) +{ + ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_ICACHE_ACTION_ENA(1); + ctx->atom_surface_sync.flush_flags |= S_0085F0_SH_KCACHE_ACTION_ENA(1); + r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom); +} + +void r600_inval_texture_cache(struct r600_context *ctx) +{ + ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1); + r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom); +} + +void r600_inval_vertex_cache(struct r600_context *ctx) +{ + /* Some GPUs don't have the vertex cache and must use the texture cache instead. */ + ctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1); + r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom); +} + +void r600_flush_framebuffer(struct r600_context *ctx, bool flush_now) +{ + if (!(ctx->flags & R600_CONTEXT_DST_CACHES_DIRTY)) + return; + + ctx->atom_surface_sync.flush_flags |= + r600_get_cb_flush_flags(ctx) | + (ctx->framebuffer.zsbuf ? S_0085F0_DB_ACTION_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) : 0); + + if (flush_now) { + r600_emit_atom(ctx, &ctx->atom_surface_sync.atom); + } else { + r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom); + } + + ctx->flags &= ~R600_CONTEXT_DST_CACHES_DIRTY; +} + +void r600_context_flush(struct r600_context *ctx, unsigned flags) +{ + struct radeon_winsys_cs *cs = ctx->cs; + struct r600_block *enable_block = NULL; + bool queries_suspended = false; + bool streamout_suspended = false; + + if (cs->cdw == ctx->init_dwords) + return; + + /* suspend queries */ + if (ctx->num_cs_dw_queries_suspend) { + r600_context_queries_suspend(ctx); + queries_suspended = true; + } + + if (ctx->num_cs_dw_streamout_end) { + r600_context_streamout_end(ctx); + streamout_suspended = true; + } + + r600_flush_framebuffer(ctx, true); + + /* partial flush is needed to avoid lockups on some chips with user fences */ + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = 
EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); + + /* Flush the CS. */ + ctx->ws->cs_flush(ctx->cs, flags); + + ctx->pm4_dirty_cdwords = 0; + ctx->flags = 0; + + r600_init_cs(ctx); + + if (streamout_suspended) { + ctx->streamout_start = TRUE; + ctx->streamout_append_bitmask = ~0; + } + + /* resume queries */ + if (queries_suspended) { + r600_context_queries_resume(ctx); + } + + /* set all valid group as dirty so they get reemited on + * next draw command + */ + LIST_FOR_EACH_ENTRY(enable_block, &ctx->enable_list, enable_list) { + if(!(enable_block->status & R600_BLOCK_STATUS_DIRTY)) { + LIST_ADDTAIL(&enable_block->list,&ctx->dirty); + enable_block->status |= R600_BLOCK_STATUS_DIRTY; + } + ctx->pm4_dirty_cdwords += enable_block->pm4_ndwords; + enable_block->nreg_dirty = enable_block->nreg; + } +} + +void r600_context_emit_fence(struct r600_context *ctx, struct r600_resource *fence_bo, unsigned offset, unsigned value) +{ + struct radeon_winsys_cs *cs = ctx->cs; + uint64_t va; + + r600_need_cs_space(ctx, 10, FALSE); + + va = r600_resource_va(&ctx->screen->screen, (void*)fence_bo); + va = va + (offset << 2); + + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4); + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); + cs->buf[cs->cdw++] = va & 0xFFFFFFFFUL; /* ADDRESS_LO */ + /* DATA_SEL | INT_EN | ADDRESS_HI */ + cs->buf[cs->cdw++] = (1 << 29) | (0 << 24) | ((va >> 32UL) & 0xFF); + cs->buf[cs->cdw++] = value; /* DATA_LO */ + cs->buf[cs->cdw++] = 0; /* DATA_HI */ + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, fence_bo, RADEON_USAGE_WRITE); +} + +static unsigned r600_query_read_result(char *map, unsigned start_index, unsigned end_index, + bool test_status_bit) +{ + uint32_t *current_result = (uint32_t*)map; + uint64_t start, end; + + start 
= (uint64_t)current_result[start_index] | + (uint64_t)current_result[start_index+1] << 32; + end = (uint64_t)current_result[end_index] | + (uint64_t)current_result[end_index+1] << 32; + + if (!test_status_bit || + ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL))) { + return end - start; + } + return 0; +} + +static boolean r600_query_result(struct r600_context *ctx, struct r600_query *query, boolean wait) +{ + unsigned results_base = query->results_start; + char *map; + + map = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, + PIPE_TRANSFER_READ | + (wait ? 0 : PIPE_TRANSFER_DONTBLOCK)); + if (!map) + return FALSE; + + /* count all results across all data blocks */ + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + while (results_base != query->results_end) { + query->result.u64 += + r600_query_read_result(map + results_base, 0, 2, true); + results_base = (results_base + 16) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_OCCLUSION_PREDICATE: + while (results_base != query->results_end) { + query->result.b = query->result.b || + r600_query_read_result(map + results_base, 0, 2, true) != 0; + results_base = (results_base + 16) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_TIME_ELAPSED: + while (results_base != query->results_end) { + query->result.u64 += + r600_query_read_result(map + results_base, 0, 2, false); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + /* SAMPLE_STREAMOUTSTATS stores this structure: + * { + * u64 NumPrimitivesWritten; + * u64 PrimitiveStorageNeeded; + * } + * We only need NumPrimitivesWritten here. 
*/ + while (results_base != query->results_end) { + query->result.u64 += + r600_query_read_result(map + results_base, 2, 6, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + /* Here we read PrimitiveStorageNeeded. */ + while (results_base != query->results_end) { + query->result.u64 += + r600_query_read_result(map + results_base, 0, 4, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_SO_STATISTICS: + while (results_base != query->results_end) { + query->result.so.num_primitives_written += + r600_query_read_result(map + results_base, 2, 6, true); + query->result.so.primitives_storage_needed += + r600_query_read_result(map + results_base, 0, 4, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + while (results_base != query->results_end) { + query->result.b = query->result.b || + r600_query_read_result(map + results_base, 2, 6, true) != + r600_query_read_result(map + results_base, 0, 4, true); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + } + break; + default: + assert(0); + } + + query->results_start = query->results_end; + ctx->ws->buffer_unmap(query->buffer->buf); + return TRUE; +} + +void r600_query_begin(struct r600_context *ctx, struct r600_query *query) +{ + struct radeon_winsys_cs *cs = ctx->cs; + unsigned new_results_end, i; + uint32_t *results; + uint64_t va; + + r600_need_cs_space(ctx, query->num_cs_dw * 2, TRUE); + + new_results_end = (query->results_end + query->result_size) % query->buffer->b.b.b.width0; + + /* collect current results if query buffer is full */ + if (new_results_end == query->results_start) { + r600_query_result(ctx, query, TRUE); + } + + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + 
results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE); + if (results) { + results = (uint32_t*)((char*)results + query->results_end); + memset(results, 0, query->result_size); + + /* Set top bits for unused backends */ + for (i = 0; i < ctx->max_db; i++) { + if (!(ctx->backend_mask & (1<<i))) { + results[(i * 4)+1] = 0x80000000; + results[(i * 4)+3] = 0x80000000; + } + } + ctx->ws->buffer_unmap(query->buffer->buf); + } + break; + case PIPE_QUERY_TIME_ELAPSED: + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + results = ctx->ws->buffer_map(query->buffer->buf, ctx->cs, PIPE_TRANSFER_WRITE); + /* The mapping can fail; skip the clear, as the occlusion case above does, + * instead of writing through a NULL-based pointer. + */ + if (results) { + results = (uint32_t*)((char*)results + query->results_end); + memset(results, 0, query->result_size); + ctx->ws->buffer_unmap(query->buffer->buf); + } + break; + default: + assert(0); + } + + /* emit begin query */ + va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer); + va += query->results_end; + + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1); + cs->buf[cs->cdw++] = va; + cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3); + /* NOTE(review): the other query types emit va here; this writes the + * buffer-relative offset only - confirm this is intended. + */ + cs->buf[cs->cdw++] = query->results_end; + cs->buf[cs->cdw++] = 0; + break; + case PIPE_QUERY_TIME_ELAPSED: + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); + cs->buf[cs->cdw++] = va; + cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF); +
cs->buf[cs->cdw++] = 0; + cs->buf[cs->cdw++] = 0; + break; + default: + assert(0); + } + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE); + + ctx->num_cs_dw_queries_suspend += query->num_cs_dw; +} + +void r600_query_end(struct r600_context *ctx, struct r600_query *query) +{ + struct radeon_winsys_cs *cs = ctx->cs; + uint64_t va; + + va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer); + /* emit end query */ + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + va += query->results_end + 8; + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1); + cs->buf[cs->cdw++] = va; + cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 2, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SAMPLE_STREAMOUTSTATS) | EVENT_INDEX(3); + cs->buf[cs->cdw++] = query->results_end + query->result_size/2; + cs->buf[cs->cdw++] = 0; + break; + case PIPE_QUERY_TIME_ELAPSED: + va += query->results_end + query->result_size/2; + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE_EOP, 4, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5); + cs->buf[cs->cdw++] = va; + cs->buf[cs->cdw++] = (3 << 29) | ((va >> 32UL) & 0xFF); + cs->buf[cs->cdw++] = 0; + cs->buf[cs->cdw++] = 0; + break; + default: + assert(0); + } + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, RADEON_USAGE_WRITE); + + query->results_end = (query->results_end + query->result_size) % query->buffer->b.b.b.width0; + ctx->num_cs_dw_queries_suspend -= query->num_cs_dw; +} + +void r600_query_predication(struct r600_context *ctx, struct 
r600_query *query, int operation, + int flag_wait) +{ + struct radeon_winsys_cs *cs = ctx->cs; + uint64_t va; + + if (operation == PREDICATION_OP_CLEAR) { + r600_need_cs_space(ctx, 3, FALSE); + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0); + cs->buf[cs->cdw++] = 0; + cs->buf[cs->cdw++] = PRED_OP(PREDICATION_OP_CLEAR); + } else { + unsigned results_base = query->results_start; + unsigned count; + uint32_t op; + + /* find count of the query data blocks */ + count = (query->buffer->b.b.b.width0 + query->results_end - query->results_start) % query->buffer->b.b.b.width0; + count /= query->result_size; + + r600_need_cs_space(ctx, 5 * count, TRUE); + + op = PRED_OP(operation) | PREDICATION_DRAW_VISIBLE | + (flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW); + va = r600_resource_va(&ctx->screen->screen, (void*)query->buffer); + + /* emit predicate packets for all data blocks */ + while (results_base != query->results_end) { + cs->buf[cs->cdw++] = PKT3(PKT3_SET_PREDICATION, 1, 0); + cs->buf[cs->cdw++] = (va + results_base) & 0xFFFFFFFFUL; + cs->buf[cs->cdw++] = op | (((va + results_base) >> 32UL) & 0xFF); + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, query->buffer, + RADEON_USAGE_READ); + results_base = (results_base + query->result_size) % query->buffer->b.b.b.width0; + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; + } + } +} + +struct r600_query *r600_context_query_create(struct r600_context *ctx, unsigned query_type) +{ + struct r600_query *query; + unsigned buffer_size = 4096; + + query = CALLOC_STRUCT(r600_query); + if (query == NULL) + return NULL; + + query->type = query_type; + + switch (query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + query->result_size = 16 * ctx->max_db; + query->num_cs_dw = 6; + break; + case PIPE_QUERY_TIME_ELAPSED: + query->result_size = 16; + query->num_cs_dw = 8; + break; + case 
PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + /* NumPrimitivesWritten, PrimitiveStorageNeeded. */ + query->result_size = 32; + query->num_cs_dw = 6; + break; + default: + assert(0); + FREE(query); + return NULL; + } + + /* adjust buffer size to simplify offsets wrapping math */ + buffer_size -= buffer_size % query->result_size; + + /* Queries are normally read by the CPU after + * being written by the gpu, hence staging is probably a good + * usage pattern. + */ + query->buffer = (struct r600_resource*) + pipe_buffer_create(&ctx->screen->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STAGING, buffer_size); + if (!query->buffer) { + FREE(query); + return NULL; + } + return query; +} + +void r600_context_query_destroy(struct r600_context *ctx, struct r600_query *query) +{ + pipe_resource_reference((struct pipe_resource**)&query->buffer, NULL); + free(query); +} + +boolean r600_context_query_result(struct r600_context *ctx, + struct r600_query *query, + boolean wait, void *vresult) +{ + boolean *result_b = (boolean*)vresult; + uint64_t *result_u64 = (uint64_t*)vresult; + struct pipe_query_data_so_statistics *result_so = + (struct pipe_query_data_so_statistics*)vresult; + + if (!r600_query_result(ctx, query, wait)) + return FALSE; + + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + *result_u64 = query->result.u64; + break; + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + *result_b = query->result.b; + break; + case PIPE_QUERY_TIME_ELAPSED: + *result_u64 = (1000000 * query->result.u64) / ctx->screen->info.r600_clock_crystal_freq; + break; + case PIPE_QUERY_SO_STATISTICS: + *result_so = query->result.so; + break; + default: + assert(0); + } + return TRUE; +} + +void r600_context_queries_suspend(struct r600_context *ctx) +{ + struct r600_query *query; + + 
LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) { + r600_query_end(ctx, query); + } + assert(ctx->num_cs_dw_queries_suspend == 0); +} + +void r600_context_queries_resume(struct r600_context *ctx) +{ + struct r600_query *query; + + assert(ctx->num_cs_dw_queries_suspend == 0); + + LIST_FOR_EACH_ENTRY(query, &ctx->active_query_list, list) { + r600_query_begin(ctx, query); + } +} + +void r600_context_streamout_begin(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + struct r600_so_target **t = ctx->so_targets; + unsigned *strides = ctx->vs_shader_so_strides; + unsigned buffer_en, i; + + buffer_en = (ctx->num_so_targets >= 1 && t[0] ? 1 : 0) | + (ctx->num_so_targets >= 2 && t[1] ? 2 : 0) | + (ctx->num_so_targets >= 3 && t[2] ? 4 : 0) | + (ctx->num_so_targets >= 4 && t[3] ? 8 : 0); + + ctx->num_cs_dw_streamout_end = + 12 + /* flush_vgt_streamout */ + util_bitcount(buffer_en) * 8 + + 3; + + r600_need_cs_space(ctx, + 12 + /* flush_vgt_streamout */ + 6 + /* enables */ + util_bitcount(buffer_en & ctx->streamout_append_bitmask) * 8 + + util_bitcount(buffer_en & ~ctx->streamout_append_bitmask) * 6 + + ctx->num_cs_dw_streamout_end, TRUE); + + if (ctx->chip_class >= CAYMAN) { + evergreen_flush_vgt_streamout(ctx); + evergreen_set_streamout_enable(ctx, buffer_en); + } + + for (i = 0; i < ctx->num_so_targets; i++) { +#if 0 + if (t[i]) { + t[i]->stride = strides[i]; + t[i]->so_index = i; + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 3, 0); + cs->buf[cs->cdw++] = (R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + + 16*i - SI_CONTEXT_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = (t[i]->b.buffer_offset + + t[i]->b.buffer_size) >> 2; /* BUFFER_SIZE (in DW) */ + cs->buf[cs->cdw++] = strides[i] >> 2; /* VTX_STRIDE (in DW) */ + cs->buf[cs->cdw++] = 0; /* BUFFER_BASE */ + + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = + r600_context_bo_reloc(ctx, r600_resource(t[i]->b.buffer), + RADEON_USAGE_WRITE); + + if (ctx->streamout_append_bitmask & (1 << i)) { 
+ /* Append. */ + cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0); + cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM); /* control */ + cs->buf[cs->cdw++] = 0; /* unused */ + cs->buf[cs->cdw++] = 0; /* unused */ + cs->buf[cs->cdw++] = 0; /* src address lo */ + cs->buf[cs->cdw++] = 0; /* src address hi */ + + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = + r600_context_bo_reloc(ctx, t[i]->filled_size, + RADEON_USAGE_READ); + } else { + /* Start from the beginning. */ + cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0); + cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET); /* control */ + cs->buf[cs->cdw++] = 0; /* unused */ + cs->buf[cs->cdw++] = 0; /* unused */ + cs->buf[cs->cdw++] = t[i]->b.buffer_offset >> 2; /* buffer offset in DW */ + cs->buf[cs->cdw++] = 0; /* unused */ + } + } +#endif + } +} + +void r600_context_streamout_end(struct r600_context *ctx) +{ + struct radeon_winsys_cs *cs = ctx->cs; + struct r600_so_target **t = ctx->so_targets; + unsigned i, flush_flags = 0; + + evergreen_flush_vgt_streamout(ctx); + + for (i = 0; i < ctx->num_so_targets; i++) { +#if 0 + if (t[i]) { + cs->buf[cs->cdw++] = PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0); + cs->buf[cs->cdw++] = STRMOUT_SELECT_BUFFER(i) | + STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) | + STRMOUT_STORE_BUFFER_FILLED_SIZE; /* control */ + cs->buf[cs->cdw++] = 0; /* dst address lo */ + cs->buf[cs->cdw++] = 0; /* dst address hi */ + cs->buf[cs->cdw++] = 0; /* unused */ + cs->buf[cs->cdw++] = 0; /* unused */ + + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = + r600_context_bo_reloc(ctx, t[i]->filled_size, + RADEON_USAGE_WRITE); + + flush_flags |= S_0085F0_SO0_DEST_BASE_ENA(1) << i; + } +#endif + } + + evergreen_set_streamout_enable(ctx, 0); + + ctx->atom_surface_sync.flush_flags |= flush_flags; + r600_atom_dirty(ctx, &ctx->atom_surface_sync.atom); + + 
ctx->num_cs_dw_streamout_end = 0; + + /* XXX print some debug info */ + for (i = 0; i < ctx->num_so_targets; i++) { + if (!t[i]) + continue; + + /* buffer_map is called with PIPE_TRANSFER_* flags at the other call sites + * in this file; RADEON_USAGE_* values are relocation usage, not map flags. + */ + uint32_t *ptr = ctx->ws->buffer_map(t[i]->filled_size->buf, ctx->cs, PIPE_TRANSFER_READ); + if (!ptr) + continue; /* mapping failed, nothing to print */ + printf("FILLED_SIZE%i: %u\n", i, *ptr); + ctx->ws->buffer_unmap(t[i]->filled_size->buf); + } +} + +void r600_context_draw_opaque_count(struct r600_context *ctx, struct r600_so_target *t) +{ + struct radeon_winsys_cs *cs = ctx->cs; + r600_need_cs_space(ctx, 14 + 21, TRUE); + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + cs->buf[cs->cdw++] = (R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET - SI_CONTEXT_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = 0; + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONTEXT_REG, 1, 0); + cs->buf[cs->cdw++] = (R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE - SI_CONTEXT_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = t->stride >> 2; + +#if 0 + cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0); + cs->buf[cs->cdw++] = COPY_DW_SRC_IS_MEM | COPY_DW_DST_IS_REG; + cs->buf[cs->cdw++] = 0; /* src address lo */ + cs->buf[cs->cdw++] = 0; /* src address hi */ + cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* dst register */ + cs->buf[cs->cdw++] = 0; /* unused */ +#endif + + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, t->filled_size, RADEON_USAGE_READ); + +#if 0 /* I have not found this useful yet.
*/ + cs->buf[cs->cdw++] = PKT3(PKT3_COPY_DW, 4, 0); + cs->buf[cs->cdw++] = COPY_DW_SRC_IS_REG | COPY_DW_DST_IS_REG; + cs->buf[cs->cdw++] = R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2; /* src register */ + cs->buf[cs->cdw++] = 0; /* unused */ + cs->buf[cs->cdw++] = R_0085F4_CP_COHER_SIZE >> 2; /* dst register */ + cs->buf[cs->cdw++] = 0; /* unused */ + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + cs->buf[cs->cdw++] = (R_0085F0_CP_COHER_CNTL - SI_CONFIG_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = S_0085F0_SO0_DEST_BASE_ENA(1) << t->so_index; + + cs->buf[cs->cdw++] = PKT3(PKT3_SET_CONFIG_REG, 1, 0); + cs->buf[cs->cdw++] = (R_0085F8_CP_COHER_BASE - SI_CONFIG_REG_OFFSET) >> 2; + cs->buf[cs->cdw++] = t->b.buffer_offset >> 2; + + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = r600_context_bo_reloc(ctx, (struct r600_resource*)t->b.buffer, + RADEON_USAGE_WRITE); + + cs->buf[cs->cdw++] = PKT3(PKT3_WAIT_REG_MEM, 5, 0); + cs->buf[cs->cdw++] = WAIT_REG_MEM_EQUAL; /* wait until the register is equal to the reference value */ + cs->buf[cs->cdw++] = R_0085FC_CP_COHER_STATUS >> 2; /* register */ + cs->buf[cs->cdw++] = 0; + cs->buf[cs->cdw++] = 0; /* reference value */ + cs->buf[cs->cdw++] = 0xffffffff; /* mask */ + cs->buf[cs->cdw++] = 4; /* poll interval */ +#endif +} diff --git a/src/gallium/drivers/radeonsi/r600_hw_context_priv.h b/src/gallium/drivers/radeonsi/r600_hw_context_priv.h new file mode 100644 index 00000000000..7d5394e9f58 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_hw_context_priv.h @@ -0,0 +1,76 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to 
permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jerome Glisse + */ +#ifndef R600_PRIV_H +#define R600_PRIV_H + +#include "radeonsi_pipe.h" +#include "util/u_hash_table.h" +#include "os/os_thread.h" + +#define R600_MAX_DRAW_CS_DWORDS 17 + +#define PKT_COUNT_C 0xC000FFFF +#define PKT_COUNT_S(x) (((x) & 0x3FFF) << 16) + +/* these flags are used in register flags and added into block flags */ +#define REG_FLAG_NEED_BO 1 +#define REG_FLAG_DIRTY_ALWAYS 2 +#define REG_FLAG_RV6XX_SBU 4 +#define REG_FLAG_NOT_R600 8 +#define REG_FLAG_ENABLE_ALWAYS 16 +#define REG_FLAG_FLUSH_CHANGE 64 + +struct r600_reg { + unsigned offset; + unsigned flags; +}; + +/* + * r600_hw_context.c + */ +struct r600_resource *r600_context_reg_bo(struct r600_context *ctx, unsigned offset); +int r600_context_add_block(struct r600_context *ctx, const struct r600_reg *reg, unsigned nreg, + unsigned opcode, unsigned offset_base); +void r600_context_dirty_block(struct r600_context *ctx, struct r600_block *block, + int dirty, int index); +int r600_setup_block_table(struct r600_context *ctx); +void r600_init_cs(struct r600_context *ctx); + +/* + * evergreen_hw_context.c + */ +void evergreen_flush_vgt_streamout(struct r600_context *ctx); +void 
evergreen_set_streamout_enable(struct r600_context *ctx, unsigned buffer_enable_bit); + + +static INLINE unsigned r600_context_bo_reloc(struct r600_context *ctx, struct r600_resource *rbo, + enum radeon_bo_usage usage) +{ + assert(usage); + return ctx->ws->cs_add_reloc(ctx->cs, rbo->cs_buf, usage, rbo->domains) * 4; +} + +#endif diff --git a/src/gallium/drivers/radeonsi/r600_query.c b/src/gallium/drivers/radeonsi/r600_query.c new file mode 100644 index 00000000000..bbf7c046f57 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_query.c @@ -0,0 +1,130 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#include "radeonsi_pipe.h" +#include "sid.h" + +static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + return (struct pipe_query*)r600_context_query_create(rctx, query_type); +} + +static void r600_destroy_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + r600_context_query_destroy(rctx, (struct r600_query *)query); +} + +static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + memset(&rquery->result, 0, sizeof(rquery->result)); + rquery->results_start = rquery->results_end; + r600_query_begin(rctx, (struct r600_query *)query); + LIST_ADDTAIL(&rquery->list, &rctx->active_query_list); +} + +static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + r600_query_end(rctx, rquery); + LIST_DELINIT(&rquery->list); +} + +static boolean r600_get_query_result(struct pipe_context *ctx, + struct pipe_query *query, + boolean wait, union pipe_query_result *vresult) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + + return r600_context_query_result(rctx, rquery, wait, vresult); +} + +static void r600_render_condition(struct pipe_context *ctx, + struct pipe_query *query, + uint mode) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_query *rquery = (struct r600_query *)query; + int wait_flag = 0; + + /* If we already have nonzero result, render unconditionally */ + if (query != NULL && rquery->result.u64 != 0) { + if (rctx->current_render_cond) { + r600_render_condition(ctx, NULL, 0); + } + return; + } + + 
rctx->current_render_cond = query; + rctx->current_render_cond_mode = mode; + + if (query == NULL) { + if (rctx->predicate_drawing) { + rctx->predicate_drawing = false; + r600_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, 1); + } + return; + } + + if (mode == PIPE_RENDER_COND_WAIT || + mode == PIPE_RENDER_COND_BY_REGION_WAIT) { + wait_flag = 1; + } + + rctx->predicate_drawing = true; + + switch (rquery->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + r600_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + r600_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag); + break; + default: + assert(0); + } +} + +void r600_init_query_functions(struct r600_context *rctx) +{ + rctx->context.create_query = r600_create_query; + rctx->context.destroy_query = r600_destroy_query; + rctx->context.begin_query = r600_begin_query; + rctx->context.end_query = r600_end_query; + rctx->context.get_query_result = r600_get_query_result; + + if (rctx->screen->info.r600_num_backends > 0) + rctx->context.render_condition = r600_render_condition; +} diff --git a/src/gallium/drivers/radeonsi/r600_resource.c b/src/gallium/drivers/radeonsi/r600_resource.c new file mode 100644 index 00000000000..7bdf6d6bd5f --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_resource.c @@ -0,0 +1,64 @@ +/* + * Copyright 2010 Marek Olšák <[email protected] + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, 
subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "radeonsi_pipe.h" + +static struct pipe_resource *r600_resource_create(struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + if (templ->target == PIPE_BUFFER) { + return r600_buffer_create(screen, templ); + } else { + return r600_texture_create(screen, templ); + } +} + +static struct pipe_resource *r600_resource_from_handle(struct pipe_screen * screen, + const struct pipe_resource *templ, + struct winsys_handle *whandle) +{ + if (templ->target == PIPE_BUFFER) { + return NULL; + } else { + return r600_texture_from_handle(screen, templ, whandle); + } +} + +void r600_init_screen_resource_functions(struct pipe_screen *screen) +{ + screen->resource_create = r600_resource_create; + screen->resource_from_handle = r600_resource_from_handle; + screen->resource_get_handle = u_resource_get_handle_vtbl; + screen->resource_destroy = u_resource_destroy_vtbl; + screen->user_buffer_create = r600_user_buffer_create; +} + +void r600_init_context_resource_functions(struct r600_context *r600) +{ + r600->context.get_transfer = u_get_transfer_vtbl; + r600->context.transfer_map = u_transfer_map_vtbl; + r600->context.transfer_flush_region = u_transfer_flush_region_vtbl; + r600->context.transfer_unmap = u_transfer_unmap_vtbl; + r600->context.transfer_destroy = 
u_transfer_destroy_vtbl; + r600->context.transfer_inline_write = u_transfer_inline_write_vtbl; +} diff --git a/src/gallium/drivers/radeonsi/r600_resource.h b/src/gallium/drivers/radeonsi/r600_resource.h new file mode 100644 index 00000000000..d6f97b0d5a5 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_resource.h @@ -0,0 +1,105 @@ +/* + * Copyright 2010 Marek Olšák <[email protected] + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef R600_RESOURCE_H +#define R600_RESOURCE_H + +#include "util/u_transfer.h" +#include "util/u_vbuf.h" + +/* flag to indicate a resource is to be used as a transfer so should not be tiled */ +#define R600_RESOURCE_FLAG_TRANSFER PIPE_RESOURCE_FLAG_DRV_PRIV + +/* Texture transfer. */ +struct r600_transfer { + /* Base class. */ + struct pipe_transfer transfer; + /* Buffer transfer. 
*/ + struct pipe_transfer *buffer_transfer; + unsigned offset; + struct pipe_resource *staging_texture; +}; + +struct r600_resource_texture { + struct r600_resource resource; + + /* If this resource is a depth-stencil buffer on evergreen, this contains + * the depth part of the format. There is a separate stencil resource + * for the stencil buffer below. */ + enum pipe_format real_format; + + unsigned offset[PIPE_MAX_TEXTURE_LEVELS]; + unsigned pitch_in_bytes[PIPE_MAX_TEXTURE_LEVELS]; /* transfer */ + unsigned pitch_in_blocks[PIPE_MAX_TEXTURE_LEVELS]; /* texture resource */ + unsigned layer_size[PIPE_MAX_TEXTURE_LEVELS]; + unsigned array_mode[PIPE_MAX_TEXTURE_LEVELS]; + unsigned pitch_override; + unsigned size; + unsigned tile_type; + unsigned depth; + unsigned dirty_db; + struct r600_resource_texture *stencil; /* Stencil is in a separate buffer on Evergreen. */ + struct r600_resource_texture *flushed_depth_texture; + boolean is_flushing_texture; +}; + +#define R600_TEX_IS_TILED(tex, level) ((tex)->array_mode[level] != V_038000_ARRAY_LINEAR_GENERAL && (tex)->array_mode[level] != V_038000_ARRAY_LINEAR_ALIGNED) + +struct r600_surface { + struct pipe_surface base; + unsigned aligned_height; +}; + +void r600_init_screen_resource_functions(struct pipe_screen *screen); + +/* r600_texture */ +struct pipe_resource *r600_texture_create(struct pipe_screen *screen, + const struct pipe_resource *templ); +struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, + const struct pipe_resource *base, + struct winsys_handle *whandle); + +static INLINE struct r600_resource *r600_resource(struct pipe_resource *r) +{ + return (struct r600_resource*)r; +} + +int r600_texture_depth_flush(struct pipe_context *ctx, struct pipe_resource *texture, boolean just_create); + +/* r600_texture.c texture transfer functions. 
*/ +struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx, + struct pipe_resource *texture, + unsigned level, + unsigned usage, + const struct pipe_box *box); +void r600_texture_transfer_destroy(struct pipe_context *ctx, + struct pipe_transfer *trans); +void* r600_texture_transfer_map(struct pipe_context *ctx, + struct pipe_transfer* transfer); +void r600_texture_transfer_unmap(struct pipe_context *ctx, + struct pipe_transfer* transfer); + +struct r600_context; + +void r600_upload_const_buffer(struct r600_context *rctx, struct r600_resource **rbuffer, uint32_t *offset); + +#endif diff --git a/src/gallium/drivers/radeonsi/r600_state_common.c b/src/gallium/drivers/radeonsi/r600_state_common.c new file mode 100644 index 00000000000..4ba83dec903 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_state_common.c @@ -0,0 +1,899 @@ +/* + * Copyright 2010 Red Hat Inc. + * 2010 Jerome Glisse + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: Dave Airlie <[email protected]> + * Jerome Glisse <[email protected]> + */ +#include "util/u_blitter.h" +#include "util/u_memory.h" +#include "util/u_format.h" +#include "pipebuffer/pb_buffer.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "r600_hw_context_priv.h" +#include "radeonsi_pipe.h" +#include "sid.h" + +static void r600_emit_surface_sync(struct r600_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->cs; + struct r600_atom_surface_sync *a = (struct r600_atom_surface_sync*)atom; + + cs->buf[cs->cdw++] = PKT3(PKT3_SURFACE_SYNC, 3, 0); + cs->buf[cs->cdw++] = a->flush_flags; /* CP_COHER_CNTL */ + cs->buf[cs->cdw++] = 0xffffffff; /* CP_COHER_SIZE */ + cs->buf[cs->cdw++] = 0; /* CP_COHER_BASE */ + cs->buf[cs->cdw++] = 0x0000000A; /* POLL_INTERVAL */ + + a->flush_flags = 0; +} + +static void r600_emit_r6xx_flush_and_inv(struct r600_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->cs; + cs->buf[cs->cdw++] = PKT3(PKT3_EVENT_WRITE, 0, 0); + cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0); +} + +static void r600_init_atom(struct r600_atom *atom, + void (*emit)(struct r600_context *ctx, struct r600_atom *state), + unsigned num_dw, + enum r600_atom_flags flags) +{ + atom->emit = emit; + atom->num_dw = num_dw; + atom->flags = flags; +} + +void r600_init_common_atoms(struct r600_context *rctx) +{ + r600_init_atom(&rctx->atom_surface_sync.atom, r600_emit_surface_sync, 5, EMIT_EARLY); + r600_init_atom(&rctx->atom_r6xx_flush_and_inv, r600_emit_r6xx_flush_and_inv, 2, EMIT_EARLY); +} + +unsigned r600_get_cb_flush_flags(struct r600_context *rctx) +{ + unsigned 
flags = 0; + + if (rctx->framebuffer.nr_cbufs) { + flags |= S_0085F0_CB_ACTION_ENA(1) | + (((1 << rctx->framebuffer.nr_cbufs) - 1) << S_0085F0_CB0_DEST_BASE_ENA_SHIFT); + } + + return flags; +} + +void r600_texture_barrier(struct pipe_context *ctx) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + rctx->atom_surface_sync.flush_flags |= S_0085F0_TC_ACTION_ENA(1) | r600_get_cb_flush_flags(rctx); + r600_atom_dirty(rctx, &rctx->atom_surface_sync.atom); +} + +static bool r600_conv_pipe_prim(unsigned pprim, unsigned *prim) +{ + static const int prim_conv[] = { + V_008958_DI_PT_POINTLIST, + V_008958_DI_PT_LINELIST, + V_008958_DI_PT_LINELOOP, + V_008958_DI_PT_LINESTRIP, + V_008958_DI_PT_TRILIST, + V_008958_DI_PT_TRISTRIP, + V_008958_DI_PT_TRIFAN, + V_008958_DI_PT_QUADLIST, + V_008958_DI_PT_QUADSTRIP, + V_008958_DI_PT_POLYGON, + -1, + -1, + -1, + -1 + }; + + *prim = prim_conv[pprim]; + if (*prim == -1) { + fprintf(stderr, "%s:%d unsupported %d\n", __func__, __LINE__, pprim); + return false; + } + return true; +} + +/* common state between evergreen and r600 */ +void r600_bind_blend_state(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_blend *blend = (struct r600_pipe_blend *)state; + struct r600_pipe_state *rstate; + + if (state == NULL) + return; + rstate = &blend->rstate; + rctx->states[rstate->id] = rstate; + rctx->cb_target_mask = blend->cb_target_mask; + rctx->cb_color_control = blend->cb_color_control; + + r600_context_pipe_state_set(rctx, rstate); +} + +static void r600_set_stencil_ref(struct pipe_context *ctx, + const struct r600_stencil_ref *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = CALLOC_STRUCT(r600_pipe_state); + + if (rstate == NULL) + return; + + rstate->id = R600_PIPE_STATE_STENCIL_REF; + r600_pipe_state_add_reg(rstate, + R_028430_DB_STENCILREFMASK, + S_028430_STENCILTESTVAL(state->ref_value[0]) | + 
S_028430_STENCILMASK(state->valuemask[0]) | + S_028430_STENCILWRITEMASK(state->writemask[0]), + NULL, 0); + r600_pipe_state_add_reg(rstate, + R_028434_DB_STENCILREFMASK_BF, + S_028434_STENCILTESTVAL_BF(state->ref_value[1]) | + S_028434_STENCILMASK_BF(state->valuemask[1]) | + S_028434_STENCILWRITEMASK_BF(state->writemask[1]), + NULL, 0); + + free(rctx->states[R600_PIPE_STATE_STENCIL_REF]); + rctx->states[R600_PIPE_STATE_STENCIL_REF] = rstate; + r600_context_pipe_state_set(rctx, rstate); +} + +void r600_set_pipe_stencil_ref(struct pipe_context *ctx, + const struct pipe_stencil_ref *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_dsa *dsa = (struct r600_pipe_dsa*)rctx->states[R600_PIPE_STATE_DSA]; + struct r600_stencil_ref ref; + + rctx->stencil_ref = *state; + + if (!dsa) + return; + + ref.ref_value[0] = state->ref_value[0]; + ref.ref_value[1] = state->ref_value[1]; + ref.valuemask[0] = dsa->valuemask[0]; + ref.valuemask[1] = dsa->valuemask[1]; + ref.writemask[0] = dsa->writemask[0]; + ref.writemask[1] = dsa->writemask[1]; + + r600_set_stencil_ref(ctx, &ref); +} + +void r600_bind_dsa_state(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_dsa *dsa = state; + struct r600_pipe_state *rstate; + struct r600_stencil_ref ref; + + if (state == NULL) + return; + rstate = &dsa->rstate; + rctx->states[rstate->id] = rstate; + rctx->alpha_ref = dsa->alpha_ref; + rctx->alpha_ref_dirty = true; + r600_context_pipe_state_set(rctx, rstate); + + ref.ref_value[0] = rctx->stencil_ref.ref_value[0]; + ref.ref_value[1] = rctx->stencil_ref.ref_value[1]; + ref.valuemask[0] = dsa->valuemask[0]; + ref.valuemask[1] = dsa->valuemask[1]; + ref.writemask[0] = dsa->writemask[0]; + ref.writemask[1] = dsa->writemask[1]; + + r600_set_stencil_ref(ctx, &ref); +} + +void r600_bind_rs_state(struct pipe_context *ctx, void *state) +{ + struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer 
*)state; + struct r600_context *rctx = (struct r600_context *)ctx; + + if (state == NULL) + return; + + rctx->sprite_coord_enable = rs->sprite_coord_enable; + rctx->pa_sc_line_stipple = rs->pa_sc_line_stipple; + rctx->pa_su_sc_mode_cntl = rs->pa_su_sc_mode_cntl; + rctx->pa_cl_clip_cntl = rs->pa_cl_clip_cntl; + rctx->pa_cl_vs_out_cntl = rs->pa_cl_vs_out_cntl; + + rctx->rasterizer = rs; + + rctx->states[rs->rstate.id] = &rs->rstate; + r600_context_pipe_state_set(rctx, &rs->rstate); + + if (rctx->chip_class >= CAYMAN) { + cayman_polygon_offset_update(rctx); + } +} + +void r600_delete_rs_state(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_rasterizer *rs = (struct r600_pipe_rasterizer *)state; + + if (rctx->rasterizer == rs) { + rctx->rasterizer = NULL; + } + if (rctx->states[rs->rstate.id] == &rs->rstate) { + rctx->states[rs->rstate.id] = NULL; + } + free(rs); +} + +void r600_sampler_view_destroy(struct pipe_context *ctx, + struct pipe_sampler_view *state) +{ + struct r600_pipe_sampler_view *resource = (struct r600_pipe_sampler_view *)state; + + pipe_resource_reference(&state->texture, NULL); + FREE(resource); +} + +void r600_delete_state(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_state *rstate = (struct r600_pipe_state *)state; + + if (rctx->states[rstate->id] == rstate) { + rctx->states[rstate->id] = NULL; + } + for (int i = 0; i < rstate->nregs; i++) { + pipe_resource_reference((struct pipe_resource**)&rstate->regs[i].bo, NULL); + } + free(rstate); +} + +void r600_bind_vertex_elements(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_vertex_element *v = (struct r600_vertex_element*)state; + + rctx->vertex_elements = v; + if (v) { + r600_inval_shader_cache(rctx); + u_vbuf_bind_vertex_elements(rctx->vbuf_mgr, state, + v->vmgr_elements); + + 
rctx->states[v->rstate.id] = &v->rstate; + r600_context_pipe_state_set(rctx, &v->rstate); + } +} + +void r600_delete_vertex_element(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_vertex_element *v = (struct r600_vertex_element*)state; + + if (rctx->states[v->rstate.id] == &v->rstate) { + rctx->states[v->rstate.id] = NULL; + } + if (rctx->vertex_elements == state) + rctx->vertex_elements = NULL; + + u_vbuf_destroy_vertex_elements(rctx->vbuf_mgr, v->vmgr_elements); + FREE(state); +} + + +void r600_set_index_buffer(struct pipe_context *ctx, + const struct pipe_index_buffer *ib) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + u_vbuf_set_index_buffer(rctx->vbuf_mgr, ib); +} + +void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + u_vbuf_set_vertex_buffers(rctx->vbuf_mgr, count, buffers); +} + +void *si_create_vertex_elements(struct pipe_context *ctx, + unsigned count, + const struct pipe_vertex_element *elements) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_vertex_element *v = CALLOC_STRUCT(r600_vertex_element); + + assert(count < 32); + if (!v) + return NULL; + + v->count = count; + v->vmgr_elements = + u_vbuf_create_vertex_elements(rctx->vbuf_mgr, count, + elements, v->elements); + + return v; +} + +void *si_create_shader_state(struct pipe_context *ctx, + const struct pipe_shader_state *state) +{ + struct si_pipe_shader *shader = CALLOC_STRUCT(si_pipe_shader); + + shader->tokens = tgsi_dup_tokens(state->tokens); + shader->so = state->stream_output; + + return shader; +} + +void r600_bind_ps_shader(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + if (rctx->ps_shader != state) + rctx->shader_dirty = true; + + /* TODO delete old shader */ + rctx->ps_shader = (struct si_pipe_shader 
*)state; + if (state) { + r600_inval_shader_cache(rctx); + r600_context_pipe_state_set(rctx, &rctx->ps_shader->rstate); + } +} + +void r600_bind_vs_shader(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + + if (rctx->vs_shader != state) + rctx->shader_dirty = true; + + /* TODO delete old shader */ + rctx->vs_shader = (struct si_pipe_shader *)state; + if (state) { + r600_inval_shader_cache(rctx); + r600_context_pipe_state_set(rctx, &rctx->vs_shader->rstate); + } +} + +void r600_delete_ps_shader(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct si_pipe_shader *shader = (struct si_pipe_shader *)state; + + if (rctx->ps_shader == shader) { + rctx->ps_shader = NULL; + } + + free(shader->tokens); + si_pipe_shader_destroy(ctx, shader); + free(shader); +} + +void r600_delete_vs_shader(struct pipe_context *ctx, void *state) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct si_pipe_shader *shader = (struct si_pipe_shader *)state; + + if (rctx->vs_shader == shader) { + rctx->vs_shader = NULL; + } + + free(shader->tokens); + si_pipe_shader_destroy(ctx, shader); + free(shader); +} + +static void r600_update_alpha_ref(struct r600_context *rctx) +{ +#if 0 + unsigned alpha_ref; + struct r600_pipe_state rstate; + + alpha_ref = rctx->alpha_ref; + rstate.nregs = 0; + if (rctx->export_16bpc) + alpha_ref &= ~0x1FFF; + r600_pipe_state_add_reg(&rstate, R_028438_SX_ALPHA_REF, alpha_ref, NULL, 0); + + r600_context_pipe_state_set(rctx, &rstate); + rctx->alpha_ref_dirty = false; +#endif +} + +void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index, + struct pipe_resource *buffer) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_resource *rbuffer = r600_resource(buffer); + struct r600_pipe_state *rstate; + uint64_t va_offset; + uint32_t offset; + + /* Note that the state tracker can unbind constant buffers by + * 
passing NULL here. + */ + if (buffer == NULL) { + return; + } + + r600_inval_shader_cache(rctx); + + r600_upload_const_buffer(rctx, &rbuffer, &offset); + va_offset = r600_resource_va(ctx->screen, (void*)rbuffer); + va_offset += offset; + //va_offset >>= 8; + + switch (shader) { + case PIPE_SHADER_VERTEX: + rstate = &rctx->vs_const_buffer; + rstate->nregs = 0; + r600_pipe_state_add_reg(rstate, + R_00B138_SPI_SHADER_USER_DATA_VS_2, + va_offset, rbuffer, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, + R_00B13C_SPI_SHADER_USER_DATA_VS_3, + va_offset >> 32, NULL, 0); + break; + case PIPE_SHADER_FRAGMENT: + rstate = &rctx->ps_const_buffer; + rstate->nregs = 0; + r600_pipe_state_add_reg(rstate, + R_00B030_SPI_SHADER_USER_DATA_PS_0, + va_offset, rbuffer, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, + R_00B034_SPI_SHADER_USER_DATA_PS_1, + va_offset >> 32, NULL, 0); + break; + default: + R600_ERR("unsupported %d\n", shader); + return; + } + + r600_context_pipe_state_set(rctx, rstate); + + if (buffer != &rbuffer->b.b.b) + pipe_resource_reference((struct pipe_resource**)&rbuffer, NULL); +} + +struct pipe_stream_output_target * +r600_create_so_target(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_so_target *t; + void *ptr; + + t = CALLOC_STRUCT(r600_so_target); + if (!t) { + return NULL; + } + + t->b.reference.count = 1; + t->b.context = ctx; + pipe_resource_reference(&t->b.buffer, buffer); + t->b.buffer_offset = buffer_offset; + t->b.buffer_size = buffer_size; + + t->filled_size = (struct r600_resource*) + pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_STATIC, 4); + ptr = rctx->ws->buffer_map(t->filled_size->buf, rctx->cs, PIPE_TRANSFER_WRITE); + memset(ptr, 0, t->filled_size->buf->size); + rctx->ws->buffer_unmap(t->filled_size->buf); + + return &t->b; +} + +void r600_so_target_destroy(struct pipe_context *ctx, + 
struct pipe_stream_output_target *target) +{ + struct r600_so_target *t = (struct r600_so_target*)target; + pipe_resource_reference(&t->b.buffer, NULL); + pipe_resource_reference((struct pipe_resource**)&t->filled_size, NULL); + FREE(t); +} + +void r600_set_so_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + unsigned append_bitmask) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + unsigned i; + + /* Stop streamout. */ + if (rctx->num_so_targets) { + r600_context_streamout_end(rctx); + } + + /* Set the new targets. */ + for (i = 0; i < num_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->so_targets[i], targets[i]); + } + for (; i < rctx->num_so_targets; i++) { + pipe_so_target_reference((struct pipe_stream_output_target**)&rctx->so_targets[i], NULL); + } + + rctx->num_so_targets = num_targets; + rctx->streamout_start = num_targets != 0; + rctx->streamout_append_bitmask = append_bitmask; +} + +static void r600_vertex_buffer_update(struct r600_context *rctx) +{ + struct pipe_context *ctx = &rctx->context; + struct r600_pipe_state *rstate = &rctx->vs_user_data; + struct r600_resource *rbuffer, *t_list_buffer; + struct pipe_vertex_buffer *vertex_buffer; + unsigned i, count, offset; + uint32_t *ptr; + uint64_t va; + + r600_inval_vertex_cache(rctx); + + if (rctx->vertex_elements->vbuffer_need_offset) { + /* one resource per vertex elements */ + count = rctx->vertex_elements->count; + } else { + /* bind vertex buffer once */ + count = rctx->vbuf_mgr->nr_real_vertex_buffers; + } + assert(count <= 256 / 4); + + t_list_buffer = (struct r600_resource*) + pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_IMMUTABLE, 4 * 4 * count); + if (t_list_buffer == NULL) + return; + + ptr = (uint32_t*)rctx->ws->buffer_map(t_list_buffer->buf, + rctx->cs, + PIPE_TRANSFER_WRITE); + + for (i = 0 ; i < count; i++, ptr += 4) { + struct pipe_vertex_element *velem = 
&rctx->vertex_elements->elements[i]; + const struct util_format_description *desc; + unsigned data_format, num_format; + int first_non_void; + + if (rctx->vertex_elements->vbuffer_need_offset) { + /* one resource per vertex elements */ + unsigned vbuffer_index; + vbuffer_index = rctx->vertex_elements->elements[i].vertex_buffer_index; + vertex_buffer = &rctx->vbuf_mgr->real_vertex_buffer[vbuffer_index]; + rbuffer = (struct r600_resource*)vertex_buffer->buffer; + offset = rctx->vertex_elements->vbuffer_offset[i]; + } else { + /* bind vertex buffer once */ + vertex_buffer = &rctx->vbuf_mgr->real_vertex_buffer[i]; + rbuffer = (struct r600_resource*)vertex_buffer->buffer; + offset = 0; + } + if (vertex_buffer == NULL || rbuffer == NULL) + continue; + offset += vertex_buffer->buffer_offset; + + va = r600_resource_va(ctx->screen, (void*)rbuffer); + va += offset; + + desc = util_format_description(velem->src_format); + first_non_void = util_format_get_first_non_void_channel(velem->src_format); + data_format = si_translate_vertexformat(ctx->screen, + velem->src_format, + desc, first_non_void); + + switch (desc->channel[first_non_void].type) { + case UTIL_FORMAT_TYPE_FIXED: + num_format = V_008F0C_BUF_NUM_FORMAT_USCALED; /* XXX */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + num_format = V_008F0C_BUF_NUM_FORMAT_SNORM; + break; + case UTIL_FORMAT_TYPE_UNSIGNED: + num_format = V_008F0C_BUF_NUM_FORMAT_UNORM; + break; + case UTIL_FORMAT_TYPE_FLOAT: + default: + num_format = V_008F14_IMG_NUM_FORMAT_FLOAT; + } + + /* Fill in T# buffer resource description */ + ptr[0] = va & 0xFFFFFFFF; + ptr[1] = ((va >> 32) & 0xFFFF) | + (vertex_buffer->stride & 0x3FFF) << 16; + ptr[2] = (vertex_buffer->buffer->width0 - offset) / vertex_buffer->stride; + /* XXX: Hardcoding RGBA */ + ptr[3] = 4 | 5 << 3 | 6 << 6 | 7 << 9 | + num_format << 12 | data_format << 15; + + r600_context_bo_reloc(rctx, rbuffer, RADEON_USAGE_READ); + } + + rstate->nregs = 0; + + va = r600_resource_va(ctx->screen, 
(void*)t_list_buffer); + r600_pipe_state_add_reg(rstate, + R_00B130_SPI_SHADER_USER_DATA_VS_0, + va, t_list_buffer, RADEON_USAGE_READ); + r600_pipe_state_add_reg(rstate, + R_00B134_SPI_SHADER_USER_DATA_VS_1, + va >> 32, + NULL, 0); + + r600_context_pipe_state_set(rctx, rstate); +} + +static void si_update_derived_state(struct r600_context *rctx) +{ + struct pipe_context * ctx = (struct pipe_context*)rctx; + + if (!rctx->blitter->running) { + if (rctx->have_depth_fb || rctx->have_depth_texture) + r600_flush_depth_textures(rctx); + } + + if (rctx->shader_dirty) { + si_pipe_shader_destroy(&rctx->context, rctx->vs_shader); + } + + if (rctx->shader_dirty || + (rctx->ps_shader->shader.fs_write_all && + (rctx->ps_shader->shader.nr_cbufs != rctx->nr_cbufs)) || + (rctx->sprite_coord_enable && + (rctx->ps_shader->sprite_coord_enable != rctx->sprite_coord_enable))) { + si_pipe_shader_destroy(&rctx->context, rctx->ps_shader); + } + + if (rctx->alpha_ref_dirty) { + r600_update_alpha_ref(rctx); + } + + if (!rctx->vs_shader->bo) { + si_pipe_shader_vs(ctx, rctx->vs_shader); + + r600_context_pipe_state_set(rctx, &rctx->vs_shader->rstate); + } + + if (!rctx->ps_shader->bo) { + si_pipe_shader_ps(ctx, rctx->ps_shader); + + r600_context_pipe_state_set(rctx, &rctx->ps_shader->rstate); + } + + if (rctx->shader_dirty) { + si_update_spi_map(rctx); + rctx->shader_dirty = false; + } +} + +void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_pipe_dsa *dsa = (struct r600_pipe_dsa*)rctx->states[R600_PIPE_STATE_DSA]; + struct pipe_draw_info info = *dinfo; + struct r600_draw rdraw = {}; + struct pipe_index_buffer ib = {}; + unsigned prim, mask, ls_mask = 0; + struct r600_block *dirty_block = NULL, *next_block = NULL; + struct r600_atom *state = NULL, *next_state = NULL; + int i; + + if ((!info.count && (info.indexed || !info.count_from_stream_output)) || + (info.indexed && 
!rctx->vbuf_mgr->index_buffer.buffer) || + !r600_conv_pipe_prim(info.mode, &prim)) { + return; + } + + if (!rctx->ps_shader || !rctx->vs_shader) + return; + + si_update_derived_state(rctx); + + u_vbuf_draw_begin(rctx->vbuf_mgr, &info); + r600_vertex_buffer_update(rctx); + + rdraw.vgt_num_indices = info.count; + rdraw.vgt_num_instances = info.instance_count; + + if (info.indexed) { + /* Initialize the index buffer struct. */ + pipe_resource_reference(&ib.buffer, rctx->vbuf_mgr->index_buffer.buffer); + ib.index_size = rctx->vbuf_mgr->index_buffer.index_size; + ib.offset = rctx->vbuf_mgr->index_buffer.offset + info.start * ib.index_size; + + /* Translate or upload, if needed. */ + r600_translate_index_buffer(rctx, &ib, info.count); + + if (u_vbuf_resource(ib.buffer)->user_ptr) { + r600_upload_index_buffer(rctx, &ib, info.count); + } + + /* Initialize the r600_draw struct with index buffer info. */ + if (ib.index_size == 4) { + rdraw.vgt_index_type = V_028A7C_VGT_INDEX_32 | + (R600_BIG_ENDIAN ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0); + } else { + rdraw.vgt_index_type = V_028A7C_VGT_INDEX_16 | + (R600_BIG_ENDIAN ? 
V_028A7C_VGT_DMA_SWAP_16_BIT : 0); + } + rdraw.indices = (struct r600_resource*)ib.buffer; + rdraw.indices_bo_offset = ib.offset; + rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_DMA; + } else { + info.index_bias = info.start; + rdraw.vgt_draw_initiator = V_0287F0_DI_SRC_SEL_AUTO_INDEX; + if (info.count_from_stream_output) { + rdraw.vgt_draw_initiator |= S_0287F0_USE_OPAQUE(1); + + r600_context_draw_opaque_count(rctx, (struct r600_so_target*)info.count_from_stream_output); + } + } + + rctx->vs_shader_so_strides = rctx->vs_shader->so_strides; + + mask = (1ULL << ((unsigned)rctx->framebuffer.nr_cbufs * 4)) - 1; + + if (rctx->vgt.id != R600_PIPE_STATE_VGT) { + rctx->vgt.id = R600_PIPE_STATE_VGT; + rctx->vgt.nregs = 0; + r600_pipe_state_add_reg(&rctx->vgt, R_008958_VGT_PRIMITIVE_TYPE, prim, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028238_CB_TARGET_MASK, rctx->cb_target_mask & mask, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028400_VGT_MAX_VTX_INDX, ~0, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028404_VGT_MIN_VTX_INDX, 0, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028408_VGT_INDX_OFFSET, info.index_bias, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info.restart_index, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info.primitive_restart, NULL, 0); +#if 0 + r600_pipe_state_add_reg(&rctx->vgt, R_03CFF0_SQ_VTX_BASE_VTX_LOC, 0, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_03CFF4_SQ_VTX_START_INST_LOC, info.start_instance, NULL, 0); +#endif + r600_pipe_state_add_reg(&rctx->vgt, R_028A0C_PA_SC_LINE_STIPPLE, 0, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028814_PA_SU_SC_MODE_CNTL, 0, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_02881C_PA_CL_VS_OUT_CNTL, 0, NULL, 0); + r600_pipe_state_add_reg(&rctx->vgt, R_028810_PA_CL_CLIP_CNTL, 0x0, NULL, 0); + } + + rctx->vgt.nregs = 0; + r600_pipe_state_mod_reg(&rctx->vgt, prim); + 
r600_pipe_state_mod_reg(&rctx->vgt, rctx->cb_target_mask & mask); + r600_pipe_state_mod_reg(&rctx->vgt, ~0); + r600_pipe_state_mod_reg(&rctx->vgt, 0); + r600_pipe_state_mod_reg(&rctx->vgt, info.index_bias); + r600_pipe_state_mod_reg(&rctx->vgt, info.restart_index); + r600_pipe_state_mod_reg(&rctx->vgt, info.primitive_restart); +#if 0 + r600_pipe_state_mod_reg(&rctx->vgt, 0); + r600_pipe_state_mod_reg(&rctx->vgt, info.start_instance); +#endif + + if (prim == V_008958_DI_PT_LINELIST) + ls_mask = 1; + else if (prim == V_008958_DI_PT_LINESTRIP) + ls_mask = 2; + r600_pipe_state_mod_reg(&rctx->vgt, S_028A0C_AUTO_RESET_CNTL(ls_mask) | rctx->pa_sc_line_stipple); + + if (info.mode == PIPE_PRIM_QUADS || info.mode == PIPE_PRIM_QUAD_STRIP || info.mode == PIPE_PRIM_POLYGON) { + r600_pipe_state_mod_reg(&rctx->vgt, S_028814_PROVOKING_VTX_LAST(1) | rctx->pa_su_sc_mode_cntl); + } else { + r600_pipe_state_mod_reg(&rctx->vgt, rctx->pa_su_sc_mode_cntl); + } + r600_pipe_state_mod_reg(&rctx->vgt, + rctx->pa_cl_vs_out_cntl /*| + (rctx->rasterizer->clip_plane_enable & rctx->vs_shader->shader.clip_dist_write)*/); + r600_pipe_state_mod_reg(&rctx->vgt, + rctx->pa_cl_clip_cntl /*| + (rctx->vs_shader->shader.clip_dist_write || + rctx->vs_shader->shader.vs_prohibit_ucps ? + 0 : rctx->rasterizer->clip_plane_enable & 0x3F)*/); + + r600_context_pipe_state_set(rctx, &rctx->vgt); + + rdraw.db_render_override = dsa->db_render_override; + rdraw.db_render_control = dsa->db_render_control; + + /* Emit states. */ + r600_need_cs_space(rctx, 0, TRUE); + + LIST_FOR_EACH_ENTRY_SAFE(state, next_state, &rctx->dirty_states, head) { + r600_emit_atom(rctx, state); + } + LIST_FOR_EACH_ENTRY_SAFE(dirty_block, next_block, &rctx->dirty,list) { + r600_context_block_emit_dirty(rctx, dirty_block); + } + rctx->pm4_dirty_cdwords = 0; + + /* Enable stream out if needed. 
*/ + if (rctx->streamout_start) { + r600_context_streamout_begin(rctx); + rctx->streamout_start = FALSE; + } + + for (i = 0; i < NUM_TEX_UNITS; i++) { + if (rctx->ps_samplers.views[i]) + r600_context_bo_reloc(rctx, + (struct r600_resource*)rctx->ps_samplers.views[i]->base.texture, + RADEON_USAGE_READ); + } + + if (rctx->chip_class >= CAYMAN) { + evergreen_context_draw(rctx, &rdraw); + } + + rctx->flags |= R600_CONTEXT_DST_CACHES_DIRTY | R600_CONTEXT_DRAW_PENDING; + + if (rctx->framebuffer.zsbuf) + { + struct pipe_resource *tex = rctx->framebuffer.zsbuf->texture; + ((struct r600_resource_texture *)tex)->dirty_db = TRUE; + } + + pipe_resource_reference(&ib.buffer, NULL); + u_vbuf_draw_end(rctx->vbuf_mgr); +} + +void _r600_pipe_state_add_reg(struct r600_context *ctx, + struct r600_pipe_state *state, + uint32_t offset, uint32_t value, + uint32_t range_id, uint32_t block_id, + struct r600_resource *bo, + enum radeon_bo_usage usage) +{ + struct r600_range *range; + struct r600_block *block; + + if (bo) assert(usage); + + range = &ctx->range[range_id]; + block = range->blocks[block_id]; + state->regs[state->nregs].block = block; + state->regs[state->nregs].id = (offset - block->start_offset) >> 2; + + state->regs[state->nregs].value = value; + state->regs[state->nregs].bo = bo; + state->regs[state->nregs].bo_usage = usage; + + state->nregs++; + assert(state->nregs < R600_BLOCK_MAX_REG); +} + +void r600_pipe_state_add_reg_noblock(struct r600_pipe_state *state, + uint32_t offset, uint32_t value, + struct r600_resource *bo, + enum radeon_bo_usage usage) +{ + if (bo) assert(usage); + + state->regs[state->nregs].id = offset; + state->regs[state->nregs].block = NULL; + state->regs[state->nregs].value = value; + state->regs[state->nregs].bo = bo; + state->regs[state->nregs].bo_usage = usage; + + state->nregs++; + assert(state->nregs < R600_BLOCK_MAX_REG); +} diff --git a/src/gallium/drivers/radeonsi/r600_texture.c b/src/gallium/drivers/radeonsi/r600_texture.c new file mode 
100644 index 00000000000..c9e1b832113 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_texture.c @@ -0,0 +1,825 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Jerome Glisse + * Corbin Simpson + */ +#include <errno.h> +#include "pipe/p_screen.h" +#include "util/u_format.h" +#include "util/u_format_s3tc.h" +#include "util/u_math.h" +#include "util/u_inlines.h" +#include "util/u_memory.h" +#include "pipebuffer/pb_buffer.h" +#include "radeonsi_pipe.h" +#include "r600_resource.h" +#include "sid.h" + +/* Copy from a full GPU texture to a transfer's staging one. 
*/ +static void r600_copy_to_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) +{ + struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; + struct pipe_resource *texture = transfer->resource; + + ctx->resource_copy_region(ctx, rtransfer->staging_texture, + 0, 0, 0, 0, texture, transfer->level, + &transfer->box); +} + + +/* Copy from a transfer's staging texture to a full GPU one. */ +static void r600_copy_from_staging_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) +{ + struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; + struct pipe_resource *texture = transfer->resource; + struct pipe_box sbox; + + sbox.x = sbox.y = sbox.z = 0; + sbox.width = transfer->box.width; + sbox.height = transfer->box.height; + /* XXX that might be wrong */ + sbox.depth = 1; + ctx->resource_copy_region(ctx, texture, transfer->level, + transfer->box.x, transfer->box.y, transfer->box.z, + rtransfer->staging_texture, + 0, &sbox); +} + +unsigned r600_texture_get_offset(struct r600_resource_texture *rtex, + unsigned level, unsigned layer) +{ + unsigned offset = rtex->offset[level]; + + switch (rtex->resource.b.b.b.target) { + case PIPE_TEXTURE_3D: + case PIPE_TEXTURE_CUBE: + default: + return offset + layer * rtex->layer_size[level]; + } +} + +static unsigned r600_get_block_alignment(struct pipe_screen *screen, + enum pipe_format format, + unsigned array_mode) +{ + struct r600_screen* rscreen = (struct r600_screen *)screen; + unsigned pixsize = util_format_get_blocksize(format); + int p_align; + + switch(array_mode) { +#if 0 + case V_038000_ARRAY_1D_TILED_THIN1: + p_align = MAX2(8, + ((rscreen->tiling_info.group_bytes / 8 / pixsize))); + break; + case V_038000_ARRAY_2D_TILED_THIN1: + p_align = MAX2(rscreen->tiling_info.num_banks, + (((rscreen->tiling_info.group_bytes / 8 / pixsize)) * + rscreen->tiling_info.num_banks)) * 8; + break; + case V_038000_ARRAY_LINEAR_ALIGNED: + p_align = MAX2(64, 
rscreen->tiling_info.group_bytes / pixsize); + break; + case V_038000_ARRAY_LINEAR_GENERAL: +#endif + default: + p_align = rscreen->tiling_info.group_bytes / pixsize; + break; + } + return p_align; +} + +static unsigned r600_get_height_alignment(struct pipe_screen *screen, + unsigned array_mode) +{ + struct r600_screen* rscreen = (struct r600_screen *)screen; + int h_align; + + switch (array_mode) { +#if 0 + case V_038000_ARRAY_2D_TILED_THIN1: + h_align = rscreen->tiling_info.num_channels * 8; + break; + case V_038000_ARRAY_1D_TILED_THIN1: + case V_038000_ARRAY_LINEAR_ALIGNED: + h_align = 8; + break; + case V_038000_ARRAY_LINEAR_GENERAL: +#endif + default: + h_align = 1; + break; + } + return h_align; +} + +static unsigned r600_get_base_alignment(struct pipe_screen *screen, + enum pipe_format format, + unsigned array_mode) +{ + struct r600_screen* rscreen = (struct r600_screen *)screen; + unsigned pixsize = util_format_get_blocksize(format); + int p_align = r600_get_block_alignment(screen, format, array_mode); + int h_align = r600_get_height_alignment(screen, array_mode); + int b_align; + + switch (array_mode) { +#if 0 + case V_038000_ARRAY_2D_TILED_THIN1: + b_align = MAX2(rscreen->tiling_info.num_banks * rscreen->tiling_info.num_channels * 8 * 8 * pixsize, + p_align * pixsize * h_align); + break; + case V_038000_ARRAY_1D_TILED_THIN1: + case V_038000_ARRAY_LINEAR_ALIGNED: + case V_038000_ARRAY_LINEAR_GENERAL: +#endif + default: + b_align = rscreen->tiling_info.group_bytes; + break; + } + return b_align; +} + +static unsigned mip_minify(unsigned size, unsigned level) +{ + unsigned val; + val = u_minify(size, level); + if (level > 0) + val = util_next_power_of_two(val); + return val; +} + +static unsigned r600_texture_get_nblocksx(struct pipe_screen *screen, + struct r600_resource_texture *rtex, + unsigned level) +{ + struct pipe_resource *ptex = &rtex->resource.b.b.b; + unsigned nblocksx, block_align, width; + unsigned blocksize = 
util_format_get_blocksize(rtex->real_format); + + if (rtex->pitch_override) + return rtex->pitch_override / blocksize; + + width = mip_minify(ptex->width0, level); + nblocksx = util_format_get_nblocksx(rtex->real_format, width); + + block_align = r600_get_block_alignment(screen, rtex->real_format, + rtex->array_mode[level]); + nblocksx = align(nblocksx, block_align); + return nblocksx; +} + +static unsigned r600_texture_get_nblocksy(struct pipe_screen *screen, + struct r600_resource_texture *rtex, + unsigned level) +{ + struct pipe_resource *ptex = &rtex->resource.b.b.b; + unsigned height, tile_height; + + height = mip_minify(ptex->height0, level); + height = util_format_get_nblocksy(rtex->real_format, height); + tile_height = r600_get_height_alignment(screen, + rtex->array_mode[level]); + + /* XXX Hack around an alignment issue. Less tests fail with this. + * + * The thing is depth-stencil buffers should be tiled, i.e. + * the alignment should be >=8. If I make them tiled, stencil starts + * working because it no longer overlaps with the depth buffer + * in memory, but texturing like drawpix-stencil breaks. 
*/ + if (util_format_is_depth_or_stencil(rtex->real_format) && tile_height < 8) + tile_height = 8; + + height = align(height, tile_height); + return height; +} + +static void r600_texture_set_array_mode(struct pipe_screen *screen, + struct r600_resource_texture *rtex, + unsigned level, unsigned array_mode) +{ + struct pipe_resource *ptex = &rtex->resource.b.b.b; + + switch (array_mode) { +#if 0 + case V_0280A0_ARRAY_LINEAR_GENERAL: + case V_0280A0_ARRAY_LINEAR_ALIGNED: + case V_0280A0_ARRAY_1D_TILED_THIN1: +#endif + default: + rtex->array_mode[level] = array_mode; + break; +#if 0 + case V_0280A0_ARRAY_2D_TILED_THIN1: + { + unsigned w, h, tile_height, tile_width; + + tile_height = r600_get_height_alignment(screen, array_mode); + tile_width = r600_get_block_alignment(screen, rtex->real_format, array_mode); + + w = mip_minify(ptex->width0, level); + h = mip_minify(ptex->height0, level); + if (w <= tile_width || h <= tile_height) + rtex->array_mode[level] = V_0280A0_ARRAY_1D_TILED_THIN1; + else + rtex->array_mode[level] = array_mode; + } + break; +#endif + } +} + +static void r600_setup_miptree(struct pipe_screen *screen, + struct r600_resource_texture *rtex, + unsigned array_mode) +{ + struct pipe_resource *ptex = &rtex->resource.b.b.b; + enum chip_class chipc = ((struct r600_screen*)screen)->chip_class; + unsigned size, layer_size, i, offset; + unsigned nblocksx, nblocksy; + + for (i = 0, offset = 0; i <= ptex->last_level; i++) { + unsigned blocksize = util_format_get_blocksize(rtex->real_format); + unsigned base_align = r600_get_base_alignment(screen, rtex->real_format, array_mode); + + r600_texture_set_array_mode(screen, rtex, i, array_mode); + + nblocksx = r600_texture_get_nblocksx(screen, rtex, i); + nblocksy = r600_texture_get_nblocksy(screen, rtex, i); + + if (chipc >= CAYMAN /*&& array_mode == V_038000_ARRAY_LINEAR_GENERAL*/) + layer_size = align(nblocksx, 64) * nblocksy * blocksize; + else + layer_size = nblocksx * nblocksy * blocksize; + + if (ptex->target 
== PIPE_TEXTURE_CUBE) { + if (chipc >= CAYMAN) + size = layer_size * 8; + } + else if (ptex->target == PIPE_TEXTURE_3D) + size = layer_size * u_minify(ptex->depth0, i); + else + size = layer_size * ptex->array_size; + + /* align base image and start of miptree */ + if ((i == 0) || (i == 1)) + offset = align(offset, base_align); + rtex->offset[i] = offset; + rtex->layer_size[i] = layer_size; + rtex->pitch_in_blocks[i] = nblocksx; /* CB talks in elements */ + rtex->pitch_in_bytes[i] = nblocksx * blocksize; + + offset += size; + } + rtex->size = offset; +} + +/* Figure out whether u_blitter will fallback to a transfer operation. + * If so, don't use a staging resource. + */ +static boolean permit_hardware_blit(struct pipe_screen *screen, + const struct pipe_resource *res) +{ + unsigned bind; + + if (util_format_is_depth_or_stencil(res->format)) + bind = PIPE_BIND_DEPTH_STENCIL; + else + bind = PIPE_BIND_RENDER_TARGET; + + /* hackaround for S3TC */ + if (util_format_is_compressed(res->format)) + return TRUE; + + if (!screen->is_format_supported(screen, + res->format, + res->target, + res->nr_samples, + bind)) + return FALSE; + + if (!screen->is_format_supported(screen, + res->format, + res->target, + res->nr_samples, + PIPE_BIND_SAMPLER_VIEW)) + return FALSE; + + switch (res->usage) { + case PIPE_USAGE_STREAM: + case PIPE_USAGE_STAGING: + return FALSE; + + default: + return TRUE; + } +} + +static boolean r600_texture_get_handle(struct pipe_screen* screen, + struct pipe_resource *ptex, + struct winsys_handle *whandle) +{ + struct r600_resource_texture *rtex = (struct r600_resource_texture*)ptex; + struct r600_resource *resource = &rtex->resource; + struct r600_screen *rscreen = (struct r600_screen*)screen; + + return rscreen->ws->buffer_get_handle(resource->buf, + rtex->pitch_in_bytes[0], whandle); +} + +static void r600_texture_destroy(struct pipe_screen *screen, + struct pipe_resource *ptex) +{ + struct r600_resource_texture *rtex = (struct 
r600_resource_texture*)ptex; + struct r600_resource *resource = &rtex->resource; + + if (rtex->flushed_depth_texture) + pipe_resource_reference((struct pipe_resource **)&rtex->flushed_depth_texture, NULL); + + if (rtex->stencil) + pipe_resource_reference((struct pipe_resource **)&rtex->stencil, NULL); + + pb_reference(&resource->buf, NULL); + FREE(rtex); +} + +static const struct u_resource_vtbl r600_texture_vtbl = +{ + r600_texture_get_handle, /* get_handle */ + r600_texture_destroy, /* resource_destroy */ + r600_texture_get_transfer, /* get_transfer */ + r600_texture_transfer_destroy, /* transfer_destroy */ + r600_texture_transfer_map, /* transfer_map */ + u_default_transfer_flush_region,/* transfer_flush_region */ + r600_texture_transfer_unmap, /* transfer_unmap */ + u_default_transfer_inline_write /* transfer_inline_write */ +}; + +static struct r600_resource_texture * +r600_texture_create_object(struct pipe_screen *screen, + const struct pipe_resource *base, + unsigned array_mode, + unsigned pitch_in_bytes_override, + unsigned max_buffer_size, + struct pb_buffer *buf, + boolean alloc_bo) +{ + struct r600_resource_texture *rtex; + struct r600_resource *resource; + struct r600_screen *rscreen = (struct r600_screen*)screen; + + rtex = CALLOC_STRUCT(r600_resource_texture); + if (rtex == NULL) + return NULL; + + resource = &rtex->resource; + resource->b.b.b = *base; + resource->b.b.vtbl = &r600_texture_vtbl; + pipe_reference_init(&resource->b.b.b.reference, 1); + resource->b.b.b.screen = screen; + rtex->pitch_override = pitch_in_bytes_override; + rtex->real_format = base->format; + + /* We must split depth and stencil into two separate buffers on Evergreen. 
*/ + if (!(base->flags & R600_RESOURCE_FLAG_TRANSFER) && + ((struct r600_screen*)screen)->chip_class >= CAYMAN && + util_format_is_depth_and_stencil(base->format)) { + struct pipe_resource stencil; + unsigned stencil_pitch_override = 0; + + switch (base->format) { + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + rtex->real_format = PIPE_FORMAT_Z24X8_UNORM; + break; + case PIPE_FORMAT_S8_UINT_Z24_UNORM: + rtex->real_format = PIPE_FORMAT_X8Z24_UNORM; + break; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + rtex->real_format = PIPE_FORMAT_Z32_FLOAT; + break; + default: + assert(0); + FREE(rtex); + return NULL; + } + + /* Divide the pitch in bytes by 4 for stencil, because it has a smaller pixel size. */ + if (pitch_in_bytes_override) { + assert(base->format == PIPE_FORMAT_Z24_UNORM_S8_UINT || + base->format == PIPE_FORMAT_S8_UINT_Z24_UNORM); + stencil_pitch_override = pitch_in_bytes_override / 4; + } + + /* Allocate the stencil buffer. */ + stencil = *base; + stencil.format = PIPE_FORMAT_S8_UINT; + rtex->stencil = r600_texture_create_object(screen, &stencil, array_mode, + stencil_pitch_override, + max_buffer_size, NULL, FALSE); + if (!rtex->stencil) { + FREE(rtex); + return NULL; + } + /* Proceed in creating the depth buffer. */ + } + + /* only mark depth textures the HW can hit as depth textures */ + if (util_format_is_depth_or_stencil(rtex->real_format) && permit_hardware_blit(screen, base)) + rtex->depth = 1; + + r600_setup_miptree(screen, rtex, array_mode); + + /* If we initialized separate stencil for Evergreen. place it after depth. */ + if (rtex->stencil) { + unsigned stencil_align, stencil_offset; + + stencil_align = r600_get_base_alignment(screen, rtex->stencil->real_format, array_mode); + stencil_offset = align(rtex->size, stencil_align); + + for (unsigned i = 0; i <= rtex->stencil->resource.b.b.b.last_level; i++) + rtex->stencil->offset[i] += stencil_offset; + + rtex->size = stencil_offset + rtex->stencil->size; + } + + /* Now create the backing buffer. 
*/ + if (!buf && alloc_bo) { + struct pipe_resource *ptex = &rtex->resource.b.b.b; + unsigned base_align = r600_get_base_alignment(screen, ptex->format, array_mode); + + if (!r600_init_resource(rscreen, resource, rtex->size, base_align, base->bind, base->usage)) { + pipe_resource_reference((struct pipe_resource**)&rtex->stencil, NULL); + FREE(rtex); + return NULL; + } + } else if (buf) { + resource->buf = buf; + resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf); + resource->domains = RADEON_DOMAIN_GTT | RADEON_DOMAIN_VRAM; + } + + if (rtex->stencil) { + pb_reference(&rtex->stencil->resource.buf, rtex->resource.buf); + rtex->stencil->resource.cs_buf = rtex->resource.cs_buf; + rtex->stencil->resource.domains = rtex->resource.domains; + } + return rtex; +} + +DEBUG_GET_ONCE_BOOL_OPTION(tiling_enabled, "R600_TILING", FALSE); + +struct pipe_resource *r600_texture_create(struct pipe_screen *screen, + const struct pipe_resource *templ) +{ + struct r600_screen *rscreen = (struct r600_screen*)screen; + unsigned array_mode = 0; + + if (!(templ->flags & R600_RESOURCE_FLAG_TRANSFER) && + !(templ->bind & PIPE_BIND_SCANOUT)) { +#if 0 + if (util_format_is_compressed(templ->format)) { + array_mode = V_038000_ARRAY_1D_TILED_THIN1; + } + else if (debug_get_option_tiling_enabled() && + rscreen->info.drm_minor >= 9 && + permit_hardware_blit(screen, templ)) { + array_mode = V_038000_ARRAY_2D_TILED_THIN1; + } +#endif + } + + return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode, + 0, 0, NULL, TRUE); +} + +static struct pipe_surface *r600_create_surface(struct pipe_context *pipe, + struct pipe_resource *texture, + const struct pipe_surface *surf_tmpl) +{ + struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture; + struct r600_surface *surface = CALLOC_STRUCT(r600_surface); + unsigned level = surf_tmpl->u.tex.level; + + assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer); + if (surface == NULL) + return NULL; + /* 
XXX no offset */ +/* offset = r600_texture_get_offset(rtex, level, surf_tmpl->u.tex.first_layer);*/ + pipe_reference_init(&surface->base.reference, 1); + pipe_resource_reference(&surface->base.texture, texture); + surface->base.context = pipe; + surface->base.format = surf_tmpl->format; + surface->base.width = mip_minify(texture->width0, level); + surface->base.height = mip_minify(texture->height0, level); + surface->base.usage = surf_tmpl->usage; + surface->base.texture = texture; + surface->base.u.tex.first_layer = surf_tmpl->u.tex.first_layer; + surface->base.u.tex.last_layer = surf_tmpl->u.tex.last_layer; + surface->base.u.tex.level = level; + + surface->aligned_height = r600_texture_get_nblocksy(pipe->screen, + rtex, level); + return &surface->base; +} + +static void r600_surface_destroy(struct pipe_context *pipe, + struct pipe_surface *surface) +{ + pipe_resource_reference(&surface->texture, NULL); + FREE(surface); +} + +struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, + const struct pipe_resource *templ, + struct winsys_handle *whandle) +{ + struct r600_screen *rscreen = (struct r600_screen*)screen; + struct pb_buffer *buf = NULL; + unsigned stride = 0; + unsigned array_mode = 0; + enum radeon_bo_layout micro, macro; + + /* Support only 2D textures without mipmaps */ + if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || + templ->depth0 != 1 || templ->last_level != 0) + return NULL; + + buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride); + if (!buf) + return NULL; + + rscreen->ws->buffer_get_tiling(buf, µ, ¯o, NULL, NULL, NULL, NULL, NULL); + +#if 0 + if (macro == RADEON_LAYOUT_TILED) + array_mode = V_0280A0_ARRAY_2D_TILED_THIN1; + else if (micro == RADEON_LAYOUT_TILED) + array_mode = V_0280A0_ARRAY_1D_TILED_THIN1; + else +#endif + array_mode = 0; + + return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode, + stride, 0, buf, FALSE); +} + +int 
r600_texture_depth_flush(struct pipe_context *ctx, + struct pipe_resource *texture, boolean just_create) +{ + struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture; + struct pipe_resource resource; + + if (rtex->flushed_depth_texture) + goto out; + + resource.target = texture->target; + resource.format = texture->format; + resource.width0 = texture->width0; + resource.height0 = texture->height0; + resource.depth0 = texture->depth0; + resource.array_size = texture->array_size; + resource.last_level = texture->last_level; + resource.nr_samples = texture->nr_samples; + resource.usage = PIPE_USAGE_DYNAMIC; + resource.bind = texture->bind | PIPE_BIND_DEPTH_STENCIL; + resource.flags = R600_RESOURCE_FLAG_TRANSFER | texture->flags; + + rtex->flushed_depth_texture = (struct r600_resource_texture *)ctx->screen->resource_create(ctx->screen, &resource); + if (rtex->flushed_depth_texture == NULL) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + return -ENOMEM; + } + + ((struct r600_resource_texture *)rtex->flushed_depth_texture)->is_flushing_texture = TRUE; +out: + if (just_create) + return 0; + + /* XXX: only do this if the depth texture has actually changed: + */ + r600_blit_uncompress_depth(ctx, rtex); + return 0; +} + +/* Needs adjustment for pixelformat: + */ +static INLINE unsigned u_box_volume( const struct pipe_box *box ) +{ + return box->width * box->depth * box->height; +}; + +struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx, + struct pipe_resource *texture, + unsigned level, + unsigned usage, + const struct pipe_box *box) +{ + struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture; + struct pipe_resource resource; + struct r600_transfer *trans; + int r; + boolean use_staging_texture = FALSE; + +#if 0 + /* We cannot map a tiled texture directly because the data is + * in a different order, therefore we do detiling using a blit. 
+ * + * Also, use a temporary in GTT memory for read transfers, as + * the CPU is much happier reading out of cached system memory + * than uncached VRAM. + */ + if (R600_TEX_IS_TILED(rtex, level)) + use_staging_texture = TRUE; +#endif + + if ((usage & PIPE_TRANSFER_READ) && u_box_volume(box) > 1024) + use_staging_texture = TRUE; + + /* XXX: Use a staging texture for uploads if the underlying BO + * is busy. No interface for checking that currently? so do + * it eagerly whenever the transfer doesn't require a readback + * and might block. + */ + if ((usage & PIPE_TRANSFER_WRITE) && + !(usage & (PIPE_TRANSFER_READ | + PIPE_TRANSFER_DONTBLOCK | + PIPE_TRANSFER_UNSYNCHRONIZED))) + use_staging_texture = TRUE; + + if (!permit_hardware_blit(ctx->screen, texture) || + (texture->flags & R600_RESOURCE_FLAG_TRANSFER)) + use_staging_texture = FALSE; + + if (use_staging_texture && (usage & PIPE_TRANSFER_MAP_DIRECTLY)) + return NULL; + + trans = CALLOC_STRUCT(r600_transfer); + if (trans == NULL) + return NULL; + pipe_resource_reference(&trans->transfer.resource, texture); + trans->transfer.level = level; + trans->transfer.usage = usage; + trans->transfer.box = *box; + if (rtex->depth) { + /* XXX: only readback the rectangle which is being mapped? 
+ */ + /* XXX: when discard is true, no need to read back from depth texture + */ + r = r600_texture_depth_flush(ctx, texture, FALSE); + if (r < 0) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + pipe_resource_reference(&trans->transfer.resource, NULL); + FREE(trans); + return NULL; + } + trans->transfer.stride = rtex->flushed_depth_texture->pitch_in_bytes[level]; + trans->offset = r600_texture_get_offset(rtex->flushed_depth_texture, level, box->z); + return &trans->transfer; + } else if (use_staging_texture) { + resource.target = PIPE_TEXTURE_2D; + resource.format = texture->format; + resource.width0 = box->width; + resource.height0 = box->height; + resource.depth0 = 1; + resource.array_size = 1; + resource.last_level = 0; + resource.nr_samples = 0; + resource.usage = PIPE_USAGE_STAGING; + resource.bind = 0; + resource.flags = R600_RESOURCE_FLAG_TRANSFER; + /* For texture reading, the temporary (detiled) texture is used as + * a render target when blitting from a tiled texture. */ + if (usage & PIPE_TRANSFER_READ) { + resource.bind |= PIPE_BIND_RENDER_TARGET; + } + /* For texture writing, the temporary texture is used as a sampler + * when blitting into a tiled texture. */ + if (usage & PIPE_TRANSFER_WRITE) { + resource.bind |= PIPE_BIND_SAMPLER_VIEW; + } + /* Create the temporary texture. */ + trans->staging_texture = ctx->screen->resource_create(ctx->screen, &resource); + if (trans->staging_texture == NULL) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + pipe_resource_reference(&trans->transfer.resource, NULL); + FREE(trans); + return NULL; + } + + trans->transfer.stride = + ((struct r600_resource_texture *)trans->staging_texture)->pitch_in_bytes[0]; + if (usage & PIPE_TRANSFER_READ) { + r600_copy_to_staging_texture(ctx, trans); + /* Always referenced in the blit. 
*/ + radeonsi_flush(ctx, NULL, 0); + } + return &trans->transfer; + } + trans->transfer.stride = rtex->pitch_in_bytes[level]; + trans->transfer.layer_stride = rtex->layer_size[level]; + trans->offset = r600_texture_get_offset(rtex, level, box->z); + return &trans->transfer; +} + +void r600_texture_transfer_destroy(struct pipe_context *ctx, + struct pipe_transfer *transfer) +{ + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + struct pipe_resource *texture = transfer->resource; + struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture; + + if (rtransfer->staging_texture) { + if (transfer->usage & PIPE_TRANSFER_WRITE) { + r600_copy_from_staging_texture(ctx, rtransfer); + } + pipe_resource_reference(&rtransfer->staging_texture, NULL); + } + + if (rtex->depth && !rtex->is_flushing_texture) { + if ((transfer->usage & PIPE_TRANSFER_WRITE) && rtex->flushed_depth_texture) + r600_blit_push_depth(ctx, rtex); + } + + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); +} + +void* r600_texture_transfer_map(struct pipe_context *ctx, + struct pipe_transfer* transfer) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + struct pb_buffer *buf; + enum pipe_format format = transfer->resource->format; + unsigned offset = 0; + char *map; + + if (rtransfer->staging_texture) { + buf = ((struct r600_resource *)rtransfer->staging_texture)->buf; + } else { + struct r600_resource_texture *rtex = (struct r600_resource_texture*)transfer->resource; + + if (rtex->flushed_depth_texture) + buf = ((struct r600_resource *)rtex->flushed_depth_texture)->buf; + else + buf = ((struct r600_resource *)transfer->resource)->buf; + + offset = rtransfer->offset + + transfer->box.y / util_format_get_blockheight(format) * transfer->stride + + transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format); + } + + if (!(map = rctx->ws->buffer_map(buf, 
rctx->cs, transfer->usage))) { + return NULL; + } + + return map + offset; +} + +void r600_texture_transfer_unmap(struct pipe_context *ctx, + struct pipe_transfer* transfer) +{ + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + struct r600_context *rctx = (struct r600_context*)ctx; + struct pb_buffer *buf; + + if (rtransfer->staging_texture) { + buf = ((struct r600_resource *)rtransfer->staging_texture)->buf; + } else { + struct r600_resource_texture *rtex = (struct r600_resource_texture*)transfer->resource; + + if (rtex->flushed_depth_texture) { + buf = ((struct r600_resource *)rtex->flushed_depth_texture)->buf; + } else { + buf = ((struct r600_resource *)transfer->resource)->buf; + } + } + rctx->ws->buffer_unmap(buf); +} + +void r600_init_surface_functions(struct r600_context *r600) +{ + r600->context.create_surface = r600_create_surface; + r600->context.surface_destroy = r600_surface_destroy; +} diff --git a/src/gallium/drivers/radeonsi/r600_translate.c b/src/gallium/drivers/radeonsi/r600_translate.c new file mode 100644 index 00000000000..6551044b553 --- /dev/null +++ b/src/gallium/drivers/radeonsi/r600_translate.c @@ -0,0 +1,54 @@ +/* + * Copyright 2010 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: Dave Airlie <[email protected]> + */ + +#include "util/u_index_modify.h" +#include "util/u_inlines.h" +#include "util/u_upload_mgr.h" +#include "radeonsi_pipe.h" + + +void r600_translate_index_buffer(struct r600_context *r600, + struct pipe_index_buffer *ib, + unsigned count) +{ + struct pipe_resource *out_buffer = NULL; + unsigned out_offset; + void *ptr; + + switch (ib->index_size) { + case 1: + u_upload_alloc(r600->vbuf_mgr->uploader, 0, count * 2, + &out_offset, &out_buffer, &ptr); + + util_shorten_ubyte_elts_to_userptr( + &r600->context, ib->buffer, 0, ib->offset, count, ptr); + + pipe_resource_reference(&ib->buffer, NULL); + ib->buffer = out_buffer; + ib->offset = out_offset; + ib->index_size = 2; + break; + } +} diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.c b/src/gallium/drivers/radeonsi/radeonsi_pipe.c new file mode 100644 index 00000000000..9e849525cc9 --- /dev/null +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.c @@ -0,0 +1,731 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following 
conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#include <stdio.h> +#include <errno.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "util/u_blitter.h" +#include "util/u_double_list.h" +#include "util/u_format.h" +#include "util/u_format_s3tc.h" +#include "util/u_transfer.h" +#include "util/u_surface.h" +#include "util/u_pack_color.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_upload_mgr.h" +#include "vl/vl_decoder.h" +#include "vl/vl_video_buffer.h" +#include "os/os_time.h" +#include "pipebuffer/pb_buffer.h" +#include "r600.h" +#include "sid.h" +#include "r600_resource.h" +#include "radeonsi_pipe.h" +#include "r600_hw_context_priv.h" + +/* + * pipe_context + */ +static struct r600_fence *r600_create_fence(struct r600_context *rctx) +{ + struct r600_screen *rscreen = rctx->screen; + struct r600_fence *fence = NULL; + + pipe_mutex_lock(rscreen->fences.mutex); + + if (!rscreen->fences.bo) { + /* Create the shared buffer object */ + rscreen->fences.bo = (struct r600_resource*) + pipe_buffer_create(&rscreen->screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_STAGING, 4096); + if (!rscreen->fences.bo) { + R600_ERR("r600: failed to create bo for fence objects\n"); + goto out; + } + 
rscreen->fences.data = rctx->ws->buffer_map(rscreen->fences.bo->buf, + rctx->cs, + PIPE_TRANSFER_READ_WRITE); + } + + if (!LIST_IS_EMPTY(&rscreen->fences.pool)) { + struct r600_fence *entry; + + /* Try to find a freed fence that has been signalled */ + LIST_FOR_EACH_ENTRY(entry, &rscreen->fences.pool, head) { + if (rscreen->fences.data[entry->index] != 0) { + LIST_DELINIT(&entry->head); + fence = entry; + break; + } + } + } + + if (!fence) { + /* Allocate a new fence */ + struct r600_fence_block *block; + unsigned index; + + if ((rscreen->fences.next_index + 1) >= 1024) { + R600_ERR("r600: too many concurrent fences\n"); + goto out; + } + + index = rscreen->fences.next_index++; + + if (!(index % FENCE_BLOCK_SIZE)) { + /* Allocate a new block */ + block = CALLOC_STRUCT(r600_fence_block); + if (block == NULL) + goto out; + + LIST_ADD(&block->head, &rscreen->fences.blocks); + } else { + block = LIST_ENTRY(struct r600_fence_block, rscreen->fences.blocks.next, head); + } + + fence = &block->fences[index % FENCE_BLOCK_SIZE]; + fence->index = index; + } + + pipe_reference_init(&fence->reference, 1); + + rscreen->fences.data[fence->index] = 0; + r600_context_emit_fence(rctx, rscreen->fences.bo, fence->index, 1); + + /* Create a dummy BO so that fence_finish without a timeout can sleep waiting for completion */ + fence->sleep_bo = (struct r600_resource*) + pipe_buffer_create(&rctx->screen->screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_STAGING, 1); + /* Add the fence as a dummy relocation. 
*/ + r600_context_bo_reloc(rctx, fence->sleep_bo, RADEON_USAGE_READWRITE); + +out: + pipe_mutex_unlock(rscreen->fences.mutex); + return fence; +} + + +void radeonsi_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence, + unsigned flags) +{ + struct r600_context *rctx = (struct r600_context *)ctx; + struct r600_fence **rfence = (struct r600_fence**)fence; + struct pipe_query *render_cond = NULL; + unsigned render_cond_mode = 0; + + if (rfence) + *rfence = r600_create_fence(rctx); + + /* Disable render condition. */ + if (rctx->current_render_cond) { + render_cond = rctx->current_render_cond; + render_cond_mode = rctx->current_render_cond_mode; + ctx->render_condition(ctx, NULL, 0); + } + + r600_context_flush(rctx, flags); + + /* Re-enable render condition. */ + if (render_cond) { + ctx->render_condition(ctx, render_cond, render_cond_mode); + } +} + +static void r600_flush_from_st(struct pipe_context *ctx, + struct pipe_fence_handle **fence) +{ + radeonsi_flush(ctx, fence, 0); +} + +static void r600_flush_from_winsys(void *ctx, unsigned flags) +{ + radeonsi_flush((struct pipe_context*)ctx, NULL, flags); +} + +static void r600_update_num_contexts(struct r600_screen *rscreen, int diff) +{ + pipe_mutex_lock(rscreen->mutex_num_contexts); + if (diff > 0) { + rscreen->num_contexts++; + + if (rscreen->num_contexts > 1) + util_slab_set_thread_safety(&rscreen->pool_buffers, + UTIL_SLAB_MULTITHREADED); + } else { + rscreen->num_contexts--; + + if (rscreen->num_contexts <= 1) + util_slab_set_thread_safety(&rscreen->pool_buffers, + UTIL_SLAB_SINGLETHREADED); + } + pipe_mutex_unlock(rscreen->mutex_num_contexts); +} + +static void r600_destroy_context(struct pipe_context *context) +{ + struct r600_context *rctx = (struct r600_context *)context; + + rctx->context.delete_depth_stencil_alpha_state(&rctx->context, rctx->custom_dsa_flush); + util_unreference_framebuffer_state(&rctx->framebuffer); + + r600_context_fini(rctx); + + util_blitter_destroy(rctx->blitter); + + for 
(int i = 0; i < R600_PIPE_NSTATES; i++) { + free(rctx->states[i]); + } + + u_vbuf_destroy(rctx->vbuf_mgr); + util_slab_destroy(&rctx->pool_transfers); + + r600_update_num_contexts(rctx->screen, -1); + + FREE(rctx); +} + +static struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv) +{ + struct r600_context *rctx = CALLOC_STRUCT(r600_context); + struct r600_screen* rscreen = (struct r600_screen *)screen; + + if (rctx == NULL) + return NULL; + + r600_update_num_contexts(rscreen, 1); + + rctx->context.screen = screen; + rctx->context.priv = priv; + rctx->context.destroy = r600_destroy_context; + rctx->context.flush = r600_flush_from_st; + + /* Easy accessing of screen/winsys. */ + rctx->screen = rscreen; + rctx->ws = rscreen->ws; + rctx->family = rscreen->family; + rctx->chip_class = rscreen->chip_class; + + r600_init_blit_functions(rctx); + r600_init_query_functions(rctx); + r600_init_context_resource_functions(rctx); + r600_init_surface_functions(rctx); + rctx->context.draw_vbo = r600_draw_vbo; + + rctx->context.create_video_decoder = vl_create_decoder; + rctx->context.create_video_buffer = vl_video_buffer_create; + + r600_init_common_atoms(rctx); + + switch (rctx->chip_class) { + case TAHITI: + cayman_init_state_functions(rctx); + if (si_context_init(rctx)) { + r600_destroy_context(&rctx->context); + return NULL; + } + si_init_config(rctx); + rctx->custom_dsa_flush = cayman_create_db_flush_dsa(rctx); + break; + default: + R600_ERR("Unsupported chip class %d.\n", rctx->chip_class); + r600_destroy_context(&rctx->context); + return NULL; + } + + rctx->ws->cs_set_flush_callback(rctx->cs, r600_flush_from_winsys, rctx); + + util_slab_create(&rctx->pool_transfers, + sizeof(struct pipe_transfer), 64, + UTIL_SLAB_SINGLETHREADED); + + rctx->vbuf_mgr = u_vbuf_create(&rctx->context, 1024 * 1024, 256, + PIPE_BIND_VERTEX_BUFFER | + PIPE_BIND_INDEX_BUFFER | + PIPE_BIND_CONSTANT_BUFFER, + U_VERTEX_FETCH_DWORD_ALIGNED); + if (!rctx->vbuf_mgr) { + 
r600_destroy_context(&rctx->context); + return NULL; + } + rctx->vbuf_mgr->caps.format_fixed32 = 0; + + rctx->blitter = util_blitter_create(&rctx->context); + if (rctx->blitter == NULL) { + r600_destroy_context(&rctx->context); + return NULL; + } + + LIST_INITHEAD(&rctx->dirty_states); + + r600_get_backend_mask(rctx); /* this emits commands and must be last */ + + return &rctx->context; +} + +/* + * pipe_screen + */ +static const char* r600_get_vendor(struct pipe_screen* pscreen) +{ + return "X.Org"; +} + +static const char *r600_get_family_name(enum radeon_family family) +{ + switch(family) { + case CHIP_CAYMAN: return "AMD CAYMAN"; + default: return "AMD unknown"; + } +} + +static const char* r600_get_name(struct pipe_screen* pscreen) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + + return r600_get_family_name(rscreen->family); +} + +static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + enum radeon_family family = rscreen->family; + + switch (param) { + /* Supported features (boolean caps). 
*/ + case PIPE_CAP_NPOT_TEXTURES: + case PIPE_CAP_TWO_SIDED_STENCIL: + case PIPE_CAP_DUAL_SOURCE_BLEND: + case PIPE_CAP_ANISOTROPIC_FILTER: + case PIPE_CAP_POINT_SPRITE: + case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_TEXTURE_SHADOW_MAP: + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_BLEND_EQUATION_SEPARATE: + case PIPE_CAP_TEXTURE_SWIZZLE: + case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE: + case PIPE_CAP_DEPTH_CLIP_DISABLE: + case PIPE_CAP_SHADER_STENCIL_EXPORT: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_SM3: + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_PRIMITIVE_RESTART: + case PIPE_CAP_CONDITIONAL_RENDER: + case PIPE_CAP_TEXTURE_BARRIER: + case PIPE_CAP_INDEP_BLEND_ENABLE: + case PIPE_CAP_INDEP_BLEND_FUNC: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: + return 1; + + case PIPE_CAP_GLSL_FEATURE_LEVEL: + return debug_get_bool_option("R600_GLSL130", FALSE) ? 130 : 120; + + /* Unsupported features. */ + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + case PIPE_CAP_SCALED_RESOLVE: + case PIPE_CAP_TGSI_CAN_COMPACT_VARYINGS: + case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + case PIPE_CAP_VERTEX_COLOR_CLAMPED: + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + return 0; + + /* Stream output. */ + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 4 : 0; + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + return debug_get_bool_option("R600_STREAMOUT", FALSE) ? 1 : 0; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return 16*4; + + /* Texturing. 
*/ + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 15; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return rscreen->info.drm_minor >= 9 ? 16384 : 0; + case PIPE_CAP_MAX_COMBINED_SAMPLERS: + return 32; + + /* Render targets. */ + case PIPE_CAP_MAX_RENDER_TARGETS: + /* FIXME some r6xx are buggy and can only do 4 */ + return 8; + + /* Timer queries, present when the clock frequency is non zero. */ + case PIPE_CAP_TIMER_QUERY: + return rscreen->info.r600_clock_crystal_freq != 0; + + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -8; + + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 7; + } + return 0; +} + +static float r600_get_paramf(struct pipe_screen* pscreen, + enum pipe_capf param) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + enum radeon_family family = rscreen->family; + + switch (param) { + case PIPE_CAPF_MAX_LINE_WIDTH: + case PIPE_CAPF_MAX_LINE_WIDTH_AA: + case PIPE_CAPF_MAX_POINT_WIDTH: + case PIPE_CAPF_MAX_POINT_WIDTH_AA: + return 16384.0f; + case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY: + return 16.0f; + case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS: + return 16.0f; + case PIPE_CAPF_GUARD_BAND_LEFT: + case PIPE_CAPF_GUARD_BAND_TOP: + case PIPE_CAPF_GUARD_BAND_RIGHT: + case PIPE_CAPF_GUARD_BAND_BOTTOM: + return 0.0f; + } + return 0.0f; +} + +static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enum pipe_shader_cap param) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + switch(shader) + { + case PIPE_SHADER_FRAGMENT: + case PIPE_SHADER_VERTEX: + break; + case PIPE_SHADER_GEOMETRY: + /* TODO: support and enable geometry programs */ + return 0; + default: + /* TODO: support tessellation */ + return 0; + } + + /* TODO: all these should be fixed, since r600 surely supports much more! 
*/ + switch (param) { + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + return 16384; + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return 8; /* FIXME */ + case PIPE_SHADER_CAP_MAX_INPUTS: + if(shader == PIPE_SHADER_FRAGMENT) + return 34; + else + return 32; + case PIPE_SHADER_CAP_MAX_TEMPS: + return 256; /* Max native temporaries. */ + case PIPE_SHADER_CAP_MAX_ADDRS: + /* FIXME Isn't this equal to TEMPS? */ + return 1; /* Max native address registers */ + case PIPE_SHADER_CAP_MAX_CONSTS: + return R600_MAX_CONST_BUFFER_SIZE; + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return R600_MAX_CONST_BUFFERS; + case PIPE_SHADER_CAP_MAX_PREDS: + return 0; /* FIXME */ + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: + case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: + case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: + case PIPE_SHADER_CAP_INTEGERS: + return 0; + case PIPE_SHADER_CAP_SUBROUTINES: + return 0; + case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS: + return 16; + } + return 0; +} + +static int r600_get_video_param(struct pipe_screen *screen, + enum pipe_video_profile profile, + enum pipe_video_cap param) +{ + switch (param) { + case PIPE_VIDEO_CAP_SUPPORTED: + return vl_profile_supported(screen, profile); + case PIPE_VIDEO_CAP_NPOT_TEXTURES: + return 1; + case PIPE_VIDEO_CAP_MAX_WIDTH: + case PIPE_VIDEO_CAP_MAX_HEIGHT: + return vl_video_buffer_max_size(screen); + case PIPE_VIDEO_CAP_PREFERED_FORMAT: + return PIPE_FORMAT_NV12; + default: + return 0; + } +} + +static void r600_destroy_screen(struct pipe_screen* pscreen) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + + if (rscreen == NULL) + return; + + if (rscreen->fences.bo) { + struct r600_fence_block *entry, *tmp; + + LIST_FOR_EACH_ENTRY_SAFE(entry, tmp, &rscreen->fences.blocks, head) { + 
LIST_DEL(&entry->head); + FREE(entry); + } + + rscreen->ws->buffer_unmap(rscreen->fences.bo->buf); + pipe_resource_reference((struct pipe_resource**)&rscreen->fences.bo, NULL); + } + pipe_mutex_destroy(rscreen->fences.mutex); + + rscreen->ws->destroy(rscreen->ws); + + util_slab_destroy(&rscreen->pool_buffers); + pipe_mutex_destroy(rscreen->mutex_num_contexts); + FREE(rscreen); +} + +static void r600_fence_reference(struct pipe_screen *pscreen, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ + struct r600_fence **oldf = (struct r600_fence**)ptr; + struct r600_fence *newf = (struct r600_fence*)fence; + + if (pipe_reference(&(*oldf)->reference, &newf->reference)) { + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + pipe_mutex_lock(rscreen->fences.mutex); + pipe_resource_reference((struct pipe_resource**)&(*oldf)->sleep_bo, NULL); + LIST_ADDTAIL(&(*oldf)->head, &rscreen->fences.pool); + pipe_mutex_unlock(rscreen->fences.mutex); + } + + *ptr = fence; +} + +static boolean r600_fence_signalled(struct pipe_screen *pscreen, + struct pipe_fence_handle *fence) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + struct r600_fence *rfence = (struct r600_fence*)fence; + + return rscreen->fences.data[rfence->index]; +} + +static boolean r600_fence_finish(struct pipe_screen *pscreen, + struct pipe_fence_handle *fence, + uint64_t timeout) +{ + struct r600_screen *rscreen = (struct r600_screen *)pscreen; + struct r600_fence *rfence = (struct r600_fence*)fence; + int64_t start_time = 0; + unsigned spins = 0; + + if (timeout != PIPE_TIMEOUT_INFINITE) { + start_time = os_time_get(); + + /* Convert to microseconds. 
*/ + timeout /= 1000; + } + + while (rscreen->fences.data[rfence->index] == 0) { + /* Special-case infinite timeout - wait for the dummy BO to become idle */ + if (timeout == PIPE_TIMEOUT_INFINITE) { + rscreen->ws->buffer_wait(rfence->sleep_bo->buf, RADEON_USAGE_READWRITE); + break; + } + + /* The dummy BO will be busy until the CS including the fence has completed, or + * the GPU is reset. Don't bother continuing to spin when the BO is idle. */ + if (!rscreen->ws->buffer_is_busy(rfence->sleep_bo->buf, RADEON_USAGE_READWRITE)) + break; + + if (++spins % 256) + continue; +#ifdef PIPE_OS_UNIX + sched_yield(); +#else + os_time_sleep(10); +#endif + if (timeout != PIPE_TIMEOUT_INFINITE && + os_time_get() - start_time >= timeout) { + break; + } + } + + return rscreen->fences.data[rfence->index] != 0; +} + +static int evergreen_interpret_tiling(struct r600_screen *rscreen, uint32_t tiling_config) +{ + switch (tiling_config & 0xf) { + case 0: + rscreen->tiling_info.num_channels = 1; + break; + case 1: + rscreen->tiling_info.num_channels = 2; + break; + case 2: + rscreen->tiling_info.num_channels = 4; + break; + case 3: + rscreen->tiling_info.num_channels = 8; + break; + default: + return -EINVAL; + } + + switch ((tiling_config & 0xf0) >> 4) { + case 0: + rscreen->tiling_info.num_banks = 4; + break; + case 1: + rscreen->tiling_info.num_banks = 8; + break; + case 2: + rscreen->tiling_info.num_banks = 16; + break; + default: + return -EINVAL; + } + + switch ((tiling_config & 0xf00) >> 8) { + case 0: + rscreen->tiling_info.group_bytes = 256; + break; + case 1: + rscreen->tiling_info.group_bytes = 512; + break; + default: + return -EINVAL; + } + return 0; +} + +static int r600_init_tiling(struct r600_screen *rscreen) +{ + uint32_t tiling_config = rscreen->info.r600_tiling_config; + + /* set default group bytes, overridden by tiling info ioctl */ + rscreen->tiling_info.group_bytes = 512; + + if (!tiling_config) + return 0; + + return evergreen_interpret_tiling(rscreen, 
tiling_config); +} + +static unsigned radeon_family_from_device(unsigned device) +{ + switch (device) { +#define CHIPSET(pciid, name, family) case pciid: return CHIP_##family; +#include "pci_ids/radeonsi_pci_ids.h" +#undef CHIPSET + default: + return CHIP_UNKNOWN; + } +} + +struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) +{ + struct r600_screen *rscreen = CALLOC_STRUCT(r600_screen); + if (rscreen == NULL) { + return NULL; + } + + rscreen->ws = ws; + ws->query_info(ws, &rscreen->info); + + rscreen->family = radeon_family_from_device(rscreen->info.pci_id); + if (rscreen->family == CHIP_UNKNOWN) { + fprintf(stderr, "r600: Unknown chipset 0x%04X\n", rscreen->info.pci_id); + FREE(rscreen); + return NULL; + } + + /* setup class */ + if (rscreen->family >= CHIP_TAHITI) { + rscreen->chip_class = TAHITI; + } else { + fprintf(stderr, "r600: Unsupported family %d\n", rscreen->family); + FREE(rscreen); + return NULL; + } + + if (r600_init_tiling(rscreen)) { + FREE(rscreen); + return NULL; + } + + rscreen->screen.destroy = r600_destroy_screen; + rscreen->screen.get_name = r600_get_name; + rscreen->screen.get_vendor = r600_get_vendor; + rscreen->screen.get_param = r600_get_param; + rscreen->screen.get_shader_param = r600_get_shader_param; + rscreen->screen.get_paramf = r600_get_paramf; + rscreen->screen.get_video_param = r600_get_video_param; + rscreen->screen.is_format_supported = si_is_format_supported; + rscreen->screen.is_video_format_supported = vl_video_buffer_is_format_supported; + rscreen->screen.context_create = r600_create_context; + rscreen->screen.fence_reference = r600_fence_reference; + rscreen->screen.fence_signalled = r600_fence_signalled; + rscreen->screen.fence_finish = r600_fence_finish; + r600_init_screen_resource_functions(&rscreen->screen); + + util_format_s3tc_init(); + + util_slab_create(&rscreen->pool_buffers, + sizeof(struct r600_resource), 64, + UTIL_SLAB_SINGLETHREADED); + + pipe_mutex_init(rscreen->mutex_num_contexts); + + 
rscreen->fences.bo = NULL; + rscreen->fences.data = NULL; + rscreen->fences.next_index = 0; + LIST_INITHEAD(&rscreen->fences.pool); + LIST_INITHEAD(&rscreen->fences.blocks); + pipe_mutex_init(rscreen->fences.mutex); + + return &rscreen->screen; +} diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h new file mode 100644 index 00000000000..f4a1219d860 --- /dev/null +++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h @@ -0,0 +1,490 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Jerome Glisse + */ +#ifndef RADEONSI_PIPE_H +#define RADEONSI_PIPE_H + +#include "../../winsys/radeon/drm/radeon_winsys.h" + +#include "pipe/p_state.h" +#include "pipe/p_screen.h" +#include "pipe/p_context.h" +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_slab.h" +#include "util/u_vbuf.h" +#include "r600.h" +#include "radeonsi_public.h" +#include "r600_resource.h" + +#define R600_MAX_CONST_BUFFERS 1 +#define R600_MAX_CONST_BUFFER_SIZE 4096 + +#ifdef PIPE_ARCH_BIG_ENDIAN +#define R600_BIG_ENDIAN 1 +#else +#define R600_BIG_ENDIAN 0 +#endif + +enum r600_atom_flags { + /* When set, atoms are added at the beginning of the dirty list + * instead of the end. */ + EMIT_EARLY = (1 << 0) +}; + +/* This encapsulates a state or an operation which can be emitted into the GPU + * command stream. It's not limited to states only, it can be used for anything + * that wants to write commands into the CS (e.g. cache flushes). */ +struct r600_atom { + void (*emit)(struct r600_context *ctx, struct r600_atom *state); + + unsigned num_dw; + enum r600_atom_flags flags; + bool dirty; + + struct list_head head; +}; + +struct r600_atom_surface_sync { + struct r600_atom atom; + unsigned flush_flags; /* CP_COHER_CNTL */ +}; + +enum r600_pipe_state_id { + R600_PIPE_STATE_BLEND = 0, + R600_PIPE_STATE_BLEND_COLOR, + R600_PIPE_STATE_CONFIG, + R600_PIPE_STATE_SEAMLESS_CUBEMAP, + R600_PIPE_STATE_CLIP, + R600_PIPE_STATE_SCISSOR, + R600_PIPE_STATE_VIEWPORT, + R600_PIPE_STATE_RASTERIZER, + R600_PIPE_STATE_VGT, + R600_PIPE_STATE_FRAMEBUFFER, + R600_PIPE_STATE_DSA, + R600_PIPE_STATE_STENCIL_REF, + R600_PIPE_STATE_PS_SHADER, + R600_PIPE_STATE_VS_SHADER, + R600_PIPE_STATE_CONSTANT, + R600_PIPE_STATE_SAMPLER, + R600_PIPE_STATE_RESOURCE, + R600_PIPE_STATE_POLYGON_OFFSET, + R600_PIPE_NSTATES +}; + +struct r600_pipe_fences { + struct r600_resource *bo; + unsigned *data; + unsigned next_index; + /* linked list of preallocated blocks */ + struct list_head blocks; + /* 
linked list of freed fences */ + struct list_head pool; + pipe_mutex mutex; +}; + +struct r600_screen { + struct pipe_screen screen; + struct radeon_winsys *ws; + unsigned family; + enum chip_class chip_class; + struct radeon_info info; + struct r600_tiling_info tiling_info; + struct util_slab_mempool pool_buffers; + struct r600_pipe_fences fences; + + unsigned num_contexts; + + /* for thread-safe write access to num_contexts */ + pipe_mutex mutex_num_contexts; +}; + +struct si_pipe_sampler_view { + struct pipe_sampler_view base; + uint32_t state[8]; +}; + +struct si_pipe_sampler_state { + uint32_t val[4]; +}; + +struct r600_pipe_rasterizer { + struct r600_pipe_state rstate; + boolean flatshade; + unsigned sprite_coord_enable; + unsigned pa_sc_line_stipple; + unsigned pa_su_sc_mode_cntl; + unsigned pa_cl_clip_cntl; + unsigned pa_cl_vs_out_cntl; + float offset_units; + float offset_scale; +}; + +struct r600_pipe_blend { + struct r600_pipe_state rstate; + unsigned cb_target_mask; + unsigned cb_color_control; +}; + +struct r600_pipe_dsa { + struct r600_pipe_state rstate; + unsigned alpha_ref; + unsigned db_render_override; + unsigned db_render_control; + ubyte valuemask[2]; + ubyte writemask[2]; +}; + +struct r600_vertex_element +{ + unsigned count; + struct pipe_vertex_element elements[PIPE_MAX_ATTRIBS]; + struct u_vbuf_elements *vmgr_elements; + unsigned fs_size; + struct r600_pipe_state rstate; + /* if the offset is too big for the fetch instruction we need to alter the + * offset of the vertex buffer; record here the offset that needs to be added + */ + unsigned vbuffer_need_offset; + unsigned vbuffer_offset[PIPE_MAX_ATTRIBS]; +}; + +struct r600_shader_io { + unsigned name; + unsigned gpr; + unsigned done; + int sid; + unsigned interpolate; + boolean centroid; + unsigned lds_pos; /* for evergreen */ +}; + +struct r600_shader { + unsigned ninput; + unsigned noutput; + struct r600_shader_io input[32]; + struct r600_shader_io output[32]; + boolean uses_kill; + boolean fs_write_all; + 
unsigned nr_cbufs; +}; + +struct si_pipe_shader { + struct r600_shader shader; + struct r600_pipe_state rstate; + struct r600_resource *bo; + struct r600_vertex_element vertex_elements; + struct tgsi_token *tokens; + unsigned num_sgprs; + unsigned num_vgprs; + unsigned spi_ps_input_ena; + unsigned sprite_coord_enable; + struct pipe_stream_output_info so; + unsigned so_strides[4]; +}; + +/* needed for blitter save */ +#define NUM_TEX_UNITS 16 + +struct r600_textures_info { + struct r600_pipe_state rstate; + struct si_pipe_sampler_view *views[NUM_TEX_UNITS]; + struct si_pipe_sampler_state *samplers[NUM_TEX_UNITS]; + unsigned n_views; + unsigned n_samplers; + bool samplers_dirty; + bool is_array_sampler[NUM_TEX_UNITS]; +}; + +struct r600_fence { + struct pipe_reference reference; + unsigned index; /* in the shared bo */ + struct r600_resource *sleep_bo; + struct list_head head; +}; + +#define FENCE_BLOCK_SIZE 16 + +struct r600_fence_block { + struct r600_fence fences[FENCE_BLOCK_SIZE]; + struct list_head head; +}; + +#define R600_CONSTANT_ARRAY_SIZE 256 +#define R600_RESOURCE_ARRAY_SIZE 160 + +struct r600_stencil_ref +{ + ubyte ref_value[2]; + ubyte valuemask[2]; + ubyte writemask[2]; +}; + +struct r600_context { + struct pipe_context context; + struct blitter_context *blitter; + enum radeon_family family; + enum chip_class chip_class; + void *custom_dsa_flush; + struct r600_screen *screen; + struct radeon_winsys *ws; + struct r600_pipe_state *states[R600_PIPE_NSTATES]; + struct r600_vertex_element *vertex_elements; + struct pipe_framebuffer_state framebuffer; + unsigned cb_target_mask; + unsigned cb_color_control; + unsigned pa_sc_line_stipple; + unsigned pa_su_sc_mode_cntl; + unsigned pa_cl_clip_cntl; + unsigned pa_cl_vs_out_cntl; + /* for saving when using blitter */ + struct pipe_stencil_ref stencil_ref; + struct pipe_viewport_state viewport; + struct pipe_clip_state clip; + struct r600_pipe_state config; + struct si_pipe_shader *ps_shader; + struct si_pipe_shader 
*vs_shader; + struct r600_pipe_state vs_const_buffer; + struct r600_pipe_state vs_user_data; + struct r600_pipe_state ps_const_buffer; + struct r600_pipe_rasterizer *rasterizer; + struct r600_pipe_state vgt; + struct r600_pipe_state spi; + struct pipe_query *current_render_cond; + unsigned current_render_cond_mode; + struct pipe_query *saved_render_cond; + unsigned saved_render_cond_mode; + /* shader information */ + unsigned sprite_coord_enable; + boolean export_16bpc; + unsigned alpha_ref; + boolean alpha_ref_dirty; + unsigned nr_cbufs; + struct r600_textures_info vs_samplers; + struct r600_textures_info ps_samplers; + boolean shader_dirty; + + struct u_vbuf *vbuf_mgr; + struct util_slab_mempool pool_transfers; + boolean have_depth_texture, have_depth_fb; + + unsigned default_ps_gprs, default_vs_gprs; + + /* States based on r600_state. */ + struct list_head dirty_states; + struct r600_atom_surface_sync atom_surface_sync; + struct r600_atom atom_r6xx_flush_and_inv; + + /* Below are variables from the old r600_context. + */ + struct radeon_winsys_cs *cs; + + struct r600_range *range; + unsigned nblocks; + struct r600_block **blocks; + struct list_head dirty; + struct list_head enable_list; + unsigned pm4_dirty_cdwords; + unsigned ctx_pm4_ndwords; + unsigned init_dwords; + + /* The list of active queries. Only one query of each type can be active. 
*/ + struct list_head active_query_list; + unsigned num_cs_dw_queries_suspend; + unsigned num_cs_dw_streamout_end; + + unsigned backend_mask; + unsigned max_db; /* for OQ */ + unsigned flags; + boolean predicate_drawing; + + unsigned num_so_targets; + struct r600_so_target *so_targets[PIPE_MAX_SO_BUFFERS]; + boolean streamout_start; + unsigned streamout_append_bitmask; + unsigned *vs_so_stride_in_dw; + unsigned *vs_shader_so_strides; +}; + +static INLINE void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom) +{ + atom->emit(rctx, atom); + atom->dirty = false; + if (atom->head.next && atom->head.prev) + LIST_DELINIT(&atom->head); +} + +static INLINE void r600_atom_dirty(struct r600_context *rctx, struct r600_atom *state) +{ + if (!state->dirty) { + if (state->flags & EMIT_EARLY) { + LIST_ADD(&state->head, &rctx->dirty_states); + } else { + LIST_ADDTAIL(&state->head, &rctx->dirty_states); + } + state->dirty = true; + } +} + +/* evergreen_state.c */ +void cayman_init_state_functions(struct r600_context *rctx); +void si_init_config(struct r600_context *rctx); +void si_pipe_shader_ps(struct pipe_context *ctx, struct si_pipe_shader *shader); +void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader); +void si_update_spi_map(struct r600_context *rctx); +void *cayman_create_db_flush_dsa(struct r600_context *rctx); +void cayman_polygon_offset_update(struct r600_context *rctx); +uint32_t si_translate_vertexformat(struct pipe_screen *screen, + enum pipe_format format, + const struct util_format_description *desc, + int first_non_void); +boolean si_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned sample_count, + unsigned usage); + +/* r600_blit.c */ +void r600_init_blit_functions(struct r600_context *rctx); +void r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *texture); +void r600_blit_push_depth(struct pipe_context *ctx, struct 
r600_resource_texture *texture); +void r600_flush_depth_textures(struct r600_context *rctx); + +/* r600_buffer.c */ +bool r600_init_resource(struct r600_screen *rscreen, + struct r600_resource *res, + unsigned size, unsigned alignment, + unsigned bind, unsigned usage); +struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, + const struct pipe_resource *templ); +struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen, + void *ptr, unsigned bytes, + unsigned bind); +void r600_upload_index_buffer(struct r600_context *rctx, + struct pipe_index_buffer *ib, unsigned count); + + +/* r600_pipe.c */ +void radeonsi_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence, + unsigned flags); + +/* r600_query.c */ +void r600_init_query_functions(struct r600_context *rctx); + +/* r600_resource.c */ +void r600_init_context_resource_functions(struct r600_context *r600); + +/* radeonsi_shader.c */ +int si_pipe_shader_create(struct pipe_context *ctx, struct si_pipe_shader *shader); +void si_pipe_shader_destroy(struct pipe_context *ctx, struct si_pipe_shader *shader); + +/* r600_texture.c */ +void r600_init_screen_texture_functions(struct pipe_screen *screen); +void r600_init_surface_functions(struct r600_context *r600); +unsigned r600_texture_get_offset(struct r600_resource_texture *rtex, + unsigned level, unsigned layer); + +/* r600_translate.c */ +void r600_translate_index_buffer(struct r600_context *r600, + struct pipe_index_buffer *ib, + unsigned count); + +/* r600_state_common.c */ +void r600_init_common_atoms(struct r600_context *rctx); +unsigned r600_get_cb_flush_flags(struct r600_context *rctx); +void r600_texture_barrier(struct pipe_context *ctx); +void r600_set_index_buffer(struct pipe_context *ctx, + const struct pipe_index_buffer *ib); +void r600_set_vertex_buffers(struct pipe_context *ctx, unsigned count, + const struct pipe_vertex_buffer *buffers); +void *si_create_vertex_elements(struct pipe_context *ctx, + unsigned count, + 
const struct pipe_vertex_element *elements); +void r600_delete_vertex_element(struct pipe_context *ctx, void *state); +void r600_bind_blend_state(struct pipe_context *ctx, void *state); +void r600_bind_dsa_state(struct pipe_context *ctx, void *state); +void r600_bind_rs_state(struct pipe_context *ctx, void *state); +void r600_delete_rs_state(struct pipe_context *ctx, void *state); +void r600_sampler_view_destroy(struct pipe_context *ctx, + struct pipe_sampler_view *state); +void r600_delete_state(struct pipe_context *ctx, void *state); +void r600_bind_vertex_elements(struct pipe_context *ctx, void *state); +void *si_create_shader_state(struct pipe_context *ctx, + const struct pipe_shader_state *state); +void r600_bind_ps_shader(struct pipe_context *ctx, void *state); +void r600_bind_vs_shader(struct pipe_context *ctx, void *state); +void r600_delete_ps_shader(struct pipe_context *ctx, void *state); +void r600_delete_vs_shader(struct pipe_context *ctx, void *state); +void r600_set_constant_buffer(struct pipe_context *ctx, uint shader, uint index, + struct pipe_resource *buffer); +struct pipe_stream_output_target * +r600_create_so_target(struct pipe_context *ctx, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size); +void r600_so_target_destroy(struct pipe_context *ctx, + struct pipe_stream_output_target *target); +void r600_set_so_targets(struct pipe_context *ctx, + unsigned num_targets, + struct pipe_stream_output_target **targets, + unsigned append_bitmask); +void r600_set_pipe_stencil_ref(struct pipe_context *ctx, + const struct pipe_stencil_ref *state); +void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info); + +/* + * common helpers + */ +static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits) +{ + return value * (1 << frac_bits); +} +#define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y)) + +static inline unsigned r600_tex_aniso_filter(unsigned filter) +{ + if (filter <= 1) return 0; + if (filter <= 2) 
return 1; + if (filter <= 4) return 2; + if (filter <= 8) return 3; + /* else */ return 4; +} + +/* 12.4 fixed-point */ +static INLINE unsigned r600_pack_float_12p4(float x) +{ + return x <= 0 ? 0 : + x >= 4096 ? 0xffff : x * 16; +} + +static INLINE uint64_t r600_resource_va(struct pipe_screen *screen, struct pipe_resource *resource) +{ + struct r600_screen *rscreen = (struct r600_screen*)screen; + struct r600_resource *rresource = (struct r600_resource*)resource; + + return rscreen->ws->buffer_get_virtual_address(rresource->cs_buf); +} + +#endif diff --git a/src/gallium/drivers/radeonsi/radeonsi_public.h b/src/gallium/drivers/radeonsi/radeonsi_public.h new file mode 100644 index 00000000000..5dcec0fc93b --- /dev/null +++ b/src/gallium/drivers/radeonsi/radeonsi_public.h @@ -0,0 +1,30 @@ +/* + * Copyright 2010 Jerome Glisse <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +#ifndef RADEONSI_PUBLIC_H +#define RADEONSI_PUBLIC_H + +struct radeon_winsys; + +struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws); + +#endif diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c new file mode 100644 index 00000000000..50f2e39314f --- /dev/null +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c @@ -0,0 +1,565 @@ + +#include "gallivm/lp_bld_tgsi_action.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_intr.h" +#include "gallivm/lp_bld_tgsi.h" +#include "radeon_llvm.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_dump.h" + +#include "radeonsi_pipe.h" +#include "radeonsi_shader.h" +#include "sid.h" + +#include <assert.h> +#include <errno.h> +#include <stdio.h> + +/* +static ps_remap_inputs( + struct tgsi_llvm_context * tl_ctx, + unsigned tgsi_index, + unsigned tgsi_chan) +{ + : +} + +struct si_input +{ + struct list_head head; + unsigned tgsi_index; + unsigned tgsi_chan; + unsigned order; +}; +*/ + + +struct si_shader_context +{ + struct radeon_llvm_context radeon_bld; + struct r600_context *rctx; + struct tgsi_parse_context parse; + struct tgsi_token * tokens; + struct si_pipe_shader *shader; + unsigned type; /* TGSI_PROCESSOR_* specifies the type of shader. 
*/ +/* unsigned num_inputs; */ +/* struct list_head inputs; */ +/* unsigned * input_mappings *//* From TGSI to SI hw */ +/* struct tgsi_shader_info info;*/ +}; + +static struct si_shader_context * si_shader_context( + struct lp_build_tgsi_context * bld_base) +{ + return (struct si_shader_context *)bld_base; +} + + +#define PERSPECTIVE_BASE 0 +#define LINEAR_BASE 9 + +#define SAMPLE_OFFSET 0 +#define CENTER_OFFSET 2 +#define CENTROID_OFSET 4 + +#define USE_SGPR_MAX_SUFFIX_LEN 5 + +enum sgpr_type { + SGPR_I32, + SGPR_I64, + SGPR_PTR_V4I32, + SGPR_PTR_V8I32 +}; + +static LLVMValueRef use_sgpr( + struct gallivm_state * gallivm, + enum sgpr_type type, + unsigned sgpr) +{ + LLVMValueRef sgpr_index; + LLVMValueRef sgpr_value; + LLVMTypeRef ret_type; + + sgpr_index = lp_build_const_int32(gallivm, sgpr); + + if (type == SGPR_I32) { + ret_type = LLVMInt32TypeInContext(gallivm->context); + return lp_build_intrinsic_unary(gallivm->builder, + "llvm.SI.use.sgpr.i32", + ret_type, sgpr_index); + } + + ret_type = LLVMInt64TypeInContext(gallivm->context); + sgpr_value = lp_build_intrinsic_unary(gallivm->builder, + "llvm.SI.use.sgpr.i64", + ret_type, sgpr_index); + + switch (type) { + case SGPR_I64: + return sgpr_value; + case SGPR_PTR_V4I32: + ret_type = LLVMInt32TypeInContext(gallivm->context); + ret_type = LLVMVectorType(ret_type, 4); + ret_type = LLVMPointerType(ret_type, + 0 /*XXX: Specify address space*/); + return LLVMBuildIntToPtr(gallivm->builder, sgpr_value, + ret_type, ""); + case SGPR_PTR_V8I32: + ret_type = LLVMInt32TypeInContext(gallivm->context); + ret_type = LLVMVectorType(ret_type, 8); + ret_type = LLVMPointerType(ret_type, + 0 /*XXX: Specify address space*/); + return LLVMBuildIntToPtr(gallivm->builder, sgpr_value, + ret_type, ""); + default: + assert(!"Unsupported SGPR type in use_sgpr()"); + return NULL; + } +} + +static void declare_input_vs( + struct si_shader_context * si_shader_ctx, + unsigned input_index, + const struct tgsi_full_declaration *decl) +{ + 
LLVMValueRef t_list_ptr; + LLVMValueRef t_offset; + LLVMValueRef attribute_offset; + LLVMValueRef buffer_index_reg; + LLVMValueRef args[4]; + LLVMTypeRef vec4_type; + LLVMValueRef input; + struct lp_build_context * uint = &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; + struct lp_build_context * base = &si_shader_ctx->radeon_bld.soa.bld_base.base; + struct r600_context *rctx = si_shader_ctx->rctx; + struct pipe_vertex_element *velem = &rctx->vertex_elements->elements[input_index]; + unsigned chan; + + /* XXX: Communicate with the rest of the driver about which SGPR the T# + * list pointer is going to be stored in. Hard code to SGPR[0-1] for + * now */ + t_list_ptr = use_sgpr(base->gallivm, SGPR_I64, 0); + + t_offset = lp_build_const_int32(base->gallivm, + 4 * velem->vertex_buffer_index); + attribute_offset = lp_build_const_int32(base->gallivm, velem->src_offset); + + /* Load the buffer index, which is always stored in VGPR0 + * for Vertex Shaders */ + buffer_index_reg = lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.vs.load.buffer.index", uint->elem_type, NULL, 0); + + vec4_type = LLVMVectorType(base->elem_type, 4); + args[0] = t_list_ptr; + args[1] = t_offset; + args[2] = attribute_offset; + args[3] = buffer_index_reg; + input = lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.vs.load.input", vec4_type, args, 4); + + /* Break up the vec4 into individual components */ + for (chan = 0; chan < 4; chan++) { + LLVMValueRef llvm_chan = lp_build_const_int32(base->gallivm, chan); + /* XXX: Use a helper function for this. There is one in + * tgsi_llvm.c. 
*/ + si_shader_ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] = + LLVMBuildExtractElement(base->gallivm->builder, + input, llvm_chan, ""); + } +} + +static void declare_input_fs( + struct si_shader_context * si_shader_ctx, + unsigned input_index, + const struct tgsi_full_declaration *decl) +{ + const char * intr_name; + unsigned chan; + struct lp_build_context * base = + &si_shader_ctx->radeon_bld.soa.bld_base.base; + struct gallivm_state * gallivm = base->gallivm; + + /* This value is: + * [15:0] NewPrimMask (Bit mask for each quad. It is set it the + * quad begins a new primitive. Bit 0 always needs + * to be unset) + * [32:16] ParamOffset + * + */ + LLVMValueRef params = use_sgpr(base->gallivm, SGPR_I32, 6); + + + /* XXX: Is this the input_index? */ + LLVMValueRef attr_number = lp_build_const_int32(gallivm, input_index); + + /* XXX: Handle all possible interpolation modes */ + switch (decl->Declaration.Interpolate) { + case TGSI_INTERPOLATE_COLOR: + if (si_shader_ctx->rctx->rasterizer->flatshade) + intr_name = "llvm.SI.fs.interp.constant"; + else + intr_name = "llvm.SI.fs.interp.linear.center"; + break; + case TGSI_INTERPOLATE_CONSTANT: + intr_name = "llvm.SI.fs.interp.constant"; + break; + case TGSI_INTERPOLATE_LINEAR: + intr_name = "llvm.SI.fs.interp.linear.center"; + break; + default: + fprintf(stderr, "Warning: Unhandled interpolation mode.\n"); + return; + } + + /* XXX: Could there be more than TGSI_NUM_CHANNELS (4) ? 
*/ + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + LLVMValueRef args[3]; + LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan); + unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan); + LLVMTypeRef input_type = LLVMFloatTypeInContext(gallivm->context); + args[0] = llvm_chan; + args[1] = attr_number; + args[2] = params; + si_shader_ctx->radeon_bld.inputs[soa_index] = + lp_build_intrinsic(gallivm->builder, intr_name, + input_type, args, 3); + } +} + +static void declare_input( + struct radeon_llvm_context * radeon_bld, + unsigned input_index, + const struct tgsi_full_declaration *decl) +{ + struct si_shader_context * si_shader_ctx = + si_shader_context(&radeon_bld->soa.bld_base); + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) { + declare_input_vs(si_shader_ctx, input_index, decl); + } else if (si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT) { + declare_input_fs(si_shader_ctx, input_index, decl); + } else { + fprintf(stderr, "Warning: Unsupported shader type,\n"); + } +} + +static LLVMValueRef fetch_constant( + struct lp_build_tgsi_context * bld_base, + const struct tgsi_full_src_register *reg, + enum tgsi_opcode_type type, + unsigned swizzle) +{ + struct lp_build_context * base = &bld_base->base; + + LLVMValueRef const_ptr; + LLVMValueRef offset; + + /* XXX: Assume the pointer to the constant buffer is being stored in + * SGPR[2:3] */ + const_ptr = use_sgpr(base->gallivm, SGPR_I64, 1); + + /* XXX: This assumes that the constant buffer is not packed, so + * CONST[0].x will have an offset of 0 and CONST[1].x will have an + * offset of 4. 
*/ + offset = lp_build_const_int32(base->gallivm, + (reg->Register.Index * 4) + swizzle); + + return lp_build_intrinsic_binary(base->gallivm->builder, + "llvm.SI.load.const", base->elem_type, const_ptr, offset); +} + + +/* Declare some intrinsics with the correct attributes */ +static void si_llvm_emit_prologue(struct lp_build_tgsi_context * bld_base) +{ + LLVMValueRef function; + struct gallivm_state * gallivm = bld_base->base.gallivm; + + LLVMTypeRef i64 = LLVMInt64TypeInContext(gallivm->context); + LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); + + /* declare i32 @llvm.SI.use.sgpr.i32(i32) */ + function = lp_declare_intrinsic(gallivm->module, "llvm.SI.use.sgpr.i32", + i32, &i32, 1); + LLVMAddFunctionAttr(function, LLVMReadNoneAttribute); + + /* declare i64 @llvm.SI.use.sgpr.i64(i32) */ + function = lp_declare_intrinsic(gallivm->module, "llvm.SI.use.sgpr.i64", + i64, &i32, 1); + LLVMAddFunctionAttr(function, LLVMReadNoneAttribute); +} + +/* XXX: This is partially implemented for VS only at this point. It is not complete */ +static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) +{ + struct si_shader_context * si_shader_ctx = si_shader_context(bld_base); + struct r600_shader * shader = &si_shader_ctx->shader->shader; + struct lp_build_context * base = &bld_base->base; + struct lp_build_context * uint = + &si_shader_ctx->radeon_bld.soa.bld_base.uint_bld; + struct tgsi_parse_context *parse = &si_shader_ctx->parse; + LLVMValueRef last_args[9] = { 0 }; + + while (!tgsi_parse_end_of_tokens(parse)) { + /* XXX: component_bits controls which components of the output + * registers actually get exported. (e.g bit 0 means export + * X component, bit 1 means export Y component, etc.) I'm + * hard coding this to 0xf for now. In the future, we might + * want to do something else. 
*/ + unsigned component_bits = 0xf; + unsigned chan; + struct tgsi_full_declaration *d = + &parse->FullToken.FullDeclaration; + LLVMValueRef args[9]; + unsigned target; + unsigned index; + unsigned color_count = 0; + unsigned param_count = 0; + int i; + + tgsi_parse_token(parse); + if (parse->FullToken.Token.Type != TGSI_TOKEN_TYPE_DECLARATION) + continue; + + switch (d->Declaration.File) { + case TGSI_FILE_INPUT: + i = shader->ninput++; + shader->input[i].name = d->Semantic.Name; + shader->input[i].sid = d->Semantic.Index; + shader->input[i].interpolate = d->Declaration.Interpolate; + shader->input[i].centroid = d->Declaration.Centroid; + break; + case TGSI_FILE_OUTPUT: + i = shader->noutput++; + shader->output[i].name = d->Semantic.Name; + shader->output[i].sid = d->Semantic.Index; + shader->output[i].interpolate = d->Declaration.Interpolate; + break; + } + + if (d->Declaration.File != TGSI_FILE_OUTPUT) + continue; + + for (index = d->Range.First; index <= d->Range.Last; index++) { + for (chan = 0; chan < 4; chan++ ) { + LLVMValueRef out_ptr = + si_shader_ctx->radeon_bld.soa.outputs + [index][chan]; + /* +5 because the first output value will be + * the 6th argument to the intrinsic. */ + args[chan + 5]= LLVMBuildLoad( + base->gallivm->builder, out_ptr, ""); + } + + /* XXX: We probably need to keep track of the output + * values, so we know what we are passing to the next + * stage. 
*/ + + /* Select the correct target */ + switch(d->Semantic.Name) { + case TGSI_SEMANTIC_POSITION: + target = V_008DFC_SQ_EXP_POS; + break; + case TGSI_SEMANTIC_COLOR: + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX) { + target = V_008DFC_SQ_EXP_PARAM + param_count; + param_count++; + } else { + target = V_008DFC_SQ_EXP_MRT + color_count; + color_count++; + } + break; + case TGSI_SEMANTIC_GENERIC: + target = V_008DFC_SQ_EXP_PARAM + param_count; + param_count++; + break; + default: + target = 0; + fprintf(stderr, + "Warning: SI unhandled output type:%d\n", + d->Semantic.Name); + } + + /* Specify which components to enable */ + args[0] = lp_build_const_int32(base->gallivm, + component_bits); + + /* Specify whether the EXEC mask represents the valid mask */ + args[1] = lp_build_const_int32(base->gallivm, 0); + + /* Specify whether this is the last export */ + args[2] = lp_build_const_int32(base->gallivm, 0); + + /* Specify the target we are exporting */ + args[3] = lp_build_const_int32(base->gallivm, target); + + /* Set COMPR flag to zero to export data as 32-bit */ + args[4] = uint->zero; + + if (si_shader_ctx->type == TGSI_PROCESSOR_VERTEX ? 
+ (d->Semantic.Name == TGSI_SEMANTIC_POSITION) : + (d->Semantic.Name == TGSI_SEMANTIC_COLOR)) { + if (last_args[0]) { + lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + last_args, 9); + } + + memcpy(last_args, args, sizeof(args)); + } else { + lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + args, 9); + } + + } + } + + /* Specify whether the EXEC mask represents the valid mask */ + last_args[1] = lp_build_const_int32(base->gallivm, + si_shader_ctx->type == TGSI_PROCESSOR_FRAGMENT); + + /* Specify that this is the last export */ + last_args[2] = lp_build_const_int32(base->gallivm, 1); + + lp_build_intrinsic(base->gallivm->builder, + "llvm.SI.export", + LLVMVoidTypeInContext(base->gallivm->context), + last_args, 9); + +/* XXX: Look up what this function does */ +/* ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);*/ +} + +static void tex_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + /* WriteMask */ + emit_data->args[0] = lp_build_const_int32(bld_base->base.gallivm, + emit_data->inst->Dst[0].Register.WriteMask); + + /* Coordinates */ + /* XXX: Not all sample instructions need 4 address arguments. */ + emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst, + 0, LP_CHAN_ALL); + + /* Resource */ + emit_data->args[2] = use_sgpr(bld_base->base.gallivm, SGPR_I64, 2); + emit_data->args[3] = lp_build_const_int32(bld_base->base.gallivm, + 32 * emit_data->inst->Src[2].Register.Index); + + /* Sampler */ + emit_data->args[4] = use_sgpr(bld_base->base.gallivm, SGPR_I64, 1); + emit_data->args[5] = lp_build_const_int32(bld_base->base.gallivm, + 16 * emit_data->inst->Src[2].Register.Index); + + /* Dimensions */ + /* XXX: We might want to pass this information to the shader at some. 
*/ +/* emit_data->args[4] = lp_build_const_int32(bld_base->base.gallivm, + emit_data->inst->Texture.Texture); +*/ + + emit_data->arg_count = 6; + /* XXX: To optimize, we could use a float or v2f32, if the last bits of + * the writemask are clear */ + emit_data->dst_type = LLVMVectorType( + LLVMFloatTypeInContext(bld_base->base.gallivm->context), + 4); +} + +static const struct lp_build_tgsi_action tex_action = { + .fetch_args = tex_fetch_args, + .emit = lp_build_tgsi_intrinsic, + .intr_name = "llvm.SI.sample" +}; + + +int si_pipe_shader_create( + struct pipe_context *ctx, + struct si_pipe_shader *shader) +{ + struct r600_context *rctx = (struct r600_context*)ctx; + struct si_shader_context si_shader_ctx; + struct tgsi_shader_info shader_info; + struct lp_build_tgsi_context * bld_base; + LLVMModuleRef mod; + unsigned char * inst_bytes; + unsigned inst_byte_count; + unsigned i; + + radeon_llvm_context_init(&si_shader_ctx.radeon_bld); + bld_base = &si_shader_ctx.radeon_bld.soa.bld_base; + + tgsi_scan_shader(shader->tokens, &shader_info); + bld_base->info = &shader_info; + bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = fetch_constant; + bld_base->emit_prologue = si_llvm_emit_prologue; + bld_base->emit_epilogue = si_llvm_emit_epilogue; + + bld_base->op_actions[TGSI_OPCODE_TEX] = tex_action; + + si_shader_ctx.radeon_bld.load_input = declare_input; + si_shader_ctx.tokens = shader->tokens; + tgsi_parse_init(&si_shader_ctx.parse, si_shader_ctx.tokens); + si_shader_ctx.shader = shader; + si_shader_ctx.type = si_shader_ctx.parse.FullHeader.Processor.Processor; + si_shader_ctx.rctx = rctx; + + shader->shader.nr_cbufs = rctx->nr_cbufs; + + lp_build_tgsi_llvm(bld_base, shader->tokens); + + radeon_llvm_finalize_module(&si_shader_ctx.radeon_bld); + + mod = bld_base->base.gallivm->module; + tgsi_dump(shader->tokens, 0); + LLVMDumpModule(mod); + radeon_llvm_compile(mod, &inst_bytes, &inst_byte_count, "SI", 1 /* dump */); + fprintf(stderr, "SI CODE:\n"); + for (i = 0; i < 
inst_byte_count; i+=4 ) { + fprintf(stderr, "%02x%02x%02x%02x\n", inst_bytes[i + 3], + inst_bytes[i + 2], inst_bytes[i + 1], + inst_bytes[i]); + } + + shader->num_sgprs = util_le32_to_cpu(*(uint32_t*)inst_bytes); + shader->num_vgprs = util_le32_to_cpu(*(uint32_t*)(inst_bytes + 4)); + shader->spi_ps_input_ena = util_le32_to_cpu(*(uint32_t*)(inst_bytes + 8)); + + tgsi_parse_free(&si_shader_ctx.parse); + + /* copy new shader */ + if (shader->bo == NULL) { + uint32_t *ptr; + + shader->bo = (struct r600_resource*) + pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, inst_byte_count); + if (shader->bo == NULL) { + return -ENOMEM; + } + ptr = (uint32_t*)rctx->ws->buffer_map(shader->bo->buf, rctx->cs, PIPE_TRANSFER_WRITE); + if (0 /*R600_BIG_ENDIAN*/) { + for (i = 0; i < (inst_byte_count-12)/4; ++i) { + ptr[i] = util_bswap32(*(uint32_t*)(inst_bytes+12 + i*4)); + } + } else { + memcpy(ptr, inst_bytes + 12, inst_byte_count - 12); + } + rctx->ws->buffer_unmap(shader->bo->buf); + } + + free(inst_bytes); + + return 0; +} + +void si_pipe_shader_destroy(struct pipe_context *ctx, struct si_pipe_shader *shader) +{ + pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL); + + memset(&shader->shader,0,sizeof(struct r600_shader)); +} diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.h b/src/gallium/drivers/radeonsi/radeonsi_shader.h new file mode 100644 index 00000000000..cd742f57da1 --- /dev/null +++ b/src/gallium/drivers/radeonsi/radeonsi_shader.h @@ -0,0 +1,4 @@ + +struct tgsi_token; + +void si_test(struct tgsi_token * token, unsigned type); diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h new file mode 100644 index 00000000000..325445c97e0 --- /dev/null +++ b/src/gallium/drivers/radeonsi/sid.h @@ -0,0 +1,7668 @@ +/* + * Southern Islands Register documentation + * + * Copyright (C) 2011 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef SID_H +#define SID_H + +/* si values */ +#define SI_CONFIG_REG_OFFSET 0x00008000 +#define SI_CONFIG_REG_END 0x0000B000 +#define SI_SH_REG_OFFSET 0x0000B000 +#define SI_SH_REG_END 0x0000C000 +#define SI_CONTEXT_REG_OFFSET 0x00028000 +#define SI_CONTEXT_REG_END 0x00029000 + +#define EVENT_TYPE_PS_PARTIAL_FLUSH 0x10 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 +#define EVENT_TYPE_ZPASS_DONE 0x15 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 +#define EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH 0x1f +#define EVENT_TYPE_SAMPLE_STREAMOUTSTATS 0x20 +#define EVENT_TYPE(x) ((x) << 0) +#define EVENT_INDEX(x) ((x) << 8) + /* 0 - any non-TS event + * 1 - ZPASS_DONE + * 2 - SAMPLE_PIPELINESTAT + * 3 - SAMPLE_STREAMOUTSTAT* + * 4 - *S_PARTIAL_FLUSH + * 5 - TS events + */ + +#define PREDICATION_OP_CLEAR 0x0 +#define PREDICATION_OP_ZPASS 0x1 +#define PREDICATION_OP_PRIMCOUNT 0x2 + +#define PRED_OP(x) ((x) << 16) + +#define PREDICATION_CONTINUE (1 << 31) + +#define PREDICATION_HINT_WAIT (0 << 12) +#define PREDICATION_HINT_NOWAIT_DRAW (1 << 12) + +#define PREDICATION_DRAW_NOT_VISIBLE (0 << 8) +#define PREDICATION_DRAW_VISIBLE (1 << 8) + +#define R600_TEXEL_PITCH_ALIGNMENT_MASK 0x7 + +#define PKT3_NOP 0x10 +#define PKT3_SET_PREDICATION 0x20 +#define PKT3_COND_EXEC 0x22 +#define PKT3_PRED_EXEC 0x23 +#define PKT3_START_3D_CMDBUF 0x24 +#define PKT3_DRAW_INDEX_2 0x27 +#define PKT3_CONTEXT_CONTROL 0x28 +#define PKT3_INDEX_TYPE 0x2A +#define PKT3_DRAW_INDEX 0x2B +#define PKT3_DRAW_INDEX_AUTO 0x2D +#define PKT3_DRAW_INDEX_IMMD 0x2E +#define PKT3_NUM_INSTANCES 0x2F +#define PKT3_STRMOUT_BUFFER_UPDATE 0x34 +#define PKT3_MEM_SEMAPHORE 0x39 +#define PKT3_MPEG_INDEX 0x3A +#define PKT3_WAIT_REG_MEM 0x3C +#define WAIT_REG_MEM_EQUAL 3 +#define PKT3_MEM_WRITE 0x3D +#define PKT3_INDIRECT_BUFFER 0x32 +#define PKT3_SURFACE_SYNC 0x43 +#define PKT3_ME_INITIALIZE 0x44 +#define PKT3_COND_WRITE 0x45 +#define PKT3_EVENT_WRITE 0x46 +#define PKT3_EVENT_WRITE_EOP 0x47 +#define 
PKT3_EVENT_WRITE_EOS 0x48 +#define PKT3_ONE_REG_WRITE 0x57 +#define PKT3_SET_CONFIG_REG 0x68 +#define PKT3_SET_CONTEXT_REG 0x69 +#define PKT3_SET_SH_REG 0x76 + +#define PKT_TYPE_S(x) (((x) & 0x3) << 30) +#define PKT_TYPE_G(x) (((x) >> 30) & 0x3) +#define PKT_TYPE_C 0x3FFFFFFF +#define PKT_COUNT_S(x) (((x) & 0x3FFF) << 16) +#define PKT_COUNT_G(x) (((x) >> 16) & 0x3FFF) +#define PKT_COUNT_C 0xC000FFFF +#define PKT0_BASE_INDEX_S(x) (((x) & 0xFFFF) << 0) +#define PKT0_BASE_INDEX_G(x) (((x) >> 0) & 0xFFFF) +#define PKT0_BASE_INDEX_C 0xFFFF0000 +#define PKT3_IT_OPCODE_S(x) (((x) & 0xFF) << 8) +#define PKT3_IT_OPCODE_G(x) (((x) >> 8) & 0xFF) +#define PKT3_IT_OPCODE_C 0xFFFF00FF +#define PKT3_PREDICATE(x) (((x) >> 0) & 0x1) +#define PKT0(index, count) (PKT_TYPE_S(0) | PKT0_BASE_INDEX_S(index) | PKT_COUNT_S(count)) +#define PKT3(op, count, predicate) (PKT_TYPE_S(3) | PKT3_IT_OPCODE_S(op) | PKT_COUNT_S(count) | PKT3_PREDICATE(predicate)) + +#define R_0084FC_CP_STRMOUT_CNTL 0x0084FC +#define S_0084FC_OFFSET_UPDATE_DONE(x) (((x) & 0x1) << 0) +#define R_0085F0_CP_COHER_CNTL 0x0085F0 +#define S_0085F0_DEST_BASE_0_ENA(x) (((x) & 0x1) << 0) +#define G_0085F0_DEST_BASE_0_ENA(x) (((x) >> 0) & 0x1) +#define C_0085F0_DEST_BASE_0_ENA 0xFFFFFFFE +#define S_0085F0_DEST_BASE_1_ENA(x) (((x) & 0x1) << 1) +#define G_0085F0_DEST_BASE_1_ENA(x) (((x) >> 1) & 0x1) +#define C_0085F0_DEST_BASE_1_ENA 0xFFFFFFFD +#define S_0085F0_CB0_DEST_BASE_ENA_SHIFT 6 +#define S_0085F0_CB0_DEST_BASE_ENA(x) (((x) & 0x1) << 6) +#define G_0085F0_CB0_DEST_BASE_ENA(x) (((x) >> 6) & 0x1) +#define C_0085F0_CB0_DEST_BASE_ENA 0xFFFFFFBF +#define S_0085F0_CB1_DEST_BASE_ENA(x) (((x) & 0x1) << 7) +#define G_0085F0_CB1_DEST_BASE_ENA(x) (((x) >> 7) & 0x1) +#define C_0085F0_CB1_DEST_BASE_ENA 0xFFFFFF7F +#define S_0085F0_CB2_DEST_BASE_ENA(x) (((x) & 0x1) << 8) +#define G_0085F0_CB2_DEST_BASE_ENA(x) (((x) >> 8) & 0x1) +#define C_0085F0_CB2_DEST_BASE_ENA 0xFFFFFEFF +#define S_0085F0_CB3_DEST_BASE_ENA(x) (((x) & 0x1) << 9) 
+#define G_0085F0_CB3_DEST_BASE_ENA(x) (((x) >> 9) & 0x1) +#define C_0085F0_CB3_DEST_BASE_ENA 0xFFFFFDFF +#define S_0085F0_CB4_DEST_BASE_ENA(x) (((x) & 0x1) << 10) +#define G_0085F0_CB4_DEST_BASE_ENA(x) (((x) >> 10) & 0x1) +#define C_0085F0_CB4_DEST_BASE_ENA 0xFFFFFBFF +#define S_0085F0_CB5_DEST_BASE_ENA(x) (((x) & 0x1) << 11) +#define G_0085F0_CB5_DEST_BASE_ENA(x) (((x) >> 11) & 0x1) +#define C_0085F0_CB5_DEST_BASE_ENA 0xFFFFF7FF +#define S_0085F0_CB6_DEST_BASE_ENA(x) (((x) & 0x1) << 12) +#define G_0085F0_CB6_DEST_BASE_ENA(x) (((x) >> 12) & 0x1) +#define C_0085F0_CB6_DEST_BASE_ENA 0xFFFFEFFF +#define S_0085F0_CB7_DEST_BASE_ENA(x) (((x) & 0x1) << 13) +#define G_0085F0_CB7_DEST_BASE_ENA(x) (((x) >> 13) & 0x1) +#define C_0085F0_CB7_DEST_BASE_ENA 0xFFFFDFFF +#define S_0085F0_DB_DEST_BASE_ENA(x) (((x) & 0x1) << 14) +#define G_0085F0_DB_DEST_BASE_ENA(x) (((x) >> 14) & 0x1) +#define C_0085F0_DB_DEST_BASE_ENA 0xFFFFBFFF +#define S_0085F0_DEST_BASE_2_ENA(x) (((x) & 0x1) << 19) +#define G_0085F0_DEST_BASE_2_ENA(x) (((x) >> 19) & 0x1) +#define C_0085F0_DEST_BASE_2_ENA 0xFFF7FFFF +#define S_0085F0_DEST_BASE_3_ENA(x) (((x) & 0x1) << 21) +#define G_0085F0_DEST_BASE_3_ENA(x) (((x) >> 21) & 0x1) +#define C_0085F0_DEST_BASE_3_ENA 0xFFDFFFFF +#define S_0085F0_TCL1_ACTION_ENA(x) (((x) & 0x1) << 22) +#define G_0085F0_TCL1_ACTION_ENA(x) (((x) >> 22) & 0x1) +#define C_0085F0_TCL1_ACTION_ENA 0xFFBFFFFF +#define S_0085F0_TC_ACTION_ENA(x) (((x) & 0x1) << 23) +#define G_0085F0_TC_ACTION_ENA(x) (((x) >> 23) & 0x1) +#define C_0085F0_TC_ACTION_ENA 0xFF7FFFFF +#define S_0085F0_CB_ACTION_ENA(x) (((x) & 0x1) << 25) +#define G_0085F0_CB_ACTION_ENA(x) (((x) >> 25) & 0x1) +#define C_0085F0_CB_ACTION_ENA 0xFDFFFFFF +#define S_0085F0_DB_ACTION_ENA(x) (((x) & 0x1) << 26) +#define G_0085F0_DB_ACTION_ENA(x) (((x) >> 26) & 0x1) +#define C_0085F0_DB_ACTION_ENA 0xFBFFFFFF +#define S_0085F0_SH_KCACHE_ACTION_ENA(x) (((x) & 0x1) << 27) +#define G_0085F0_SH_KCACHE_ACTION_ENA(x) (((x) >> 27) & 0x1) +#define 
C_0085F0_SH_KCACHE_ACTION_ENA 0xF7FFFFFF +#define S_0085F0_SH_ICACHE_ACTION_ENA(x) (((x) & 0x1) << 29) +#define G_0085F0_SH_ICACHE_ACTION_ENA(x) (((x) >> 29) & 0x1) +#define C_0085F0_SH_ICACHE_ACTION_ENA 0xDFFFFFFF +#define R_0085F4_CP_COHER_SIZE 0x0085F4 +#define R_0085F8_CP_COHER_BASE 0x0085F8 +#define R_0088B0_VGT_VTX_VECT_EJECT_REG 0x0088B0 +#define S_0088B0_PRIM_COUNT(x) (((x) & 0x3FF) << 0) +#define G_0088B0_PRIM_COUNT(x) (((x) >> 0) & 0x3FF) +#define C_0088B0_PRIM_COUNT 0xFFFFFC00 +#define R_0088C4_VGT_CACHE_INVALIDATION 0x0088C4 +#define S_0088C4_VS_NO_EXTRA_BUFFER(x) (((x) & 0x1) << 5) +#define G_0088C4_VS_NO_EXTRA_BUFFER(x) (((x) >> 5) & 0x1) +#define C_0088C4_VS_NO_EXTRA_BUFFER 0xFFFFFFDF +#define S_0088C4_STREAMOUT_FULL_FLUSH(x) (((x) & 0x1) << 13) +#define G_0088C4_STREAMOUT_FULL_FLUSH(x) (((x) >> 13) & 0x1) +#define C_0088C4_STREAMOUT_FULL_FLUSH 0xFFFFDFFF +#define S_0088C4_ES_LIMIT(x) (((x) & 0x1F) << 16) +#define G_0088C4_ES_LIMIT(x) (((x) >> 16) & 0x1F) +#define C_0088C4_ES_LIMIT 0xFFE0FFFF +#define R_0088C8_VGT_ESGS_RING_SIZE 0x0088C8 +#define R_0088CC_VGT_GSVS_RING_SIZE 0x0088CC +#define R_0088D4_VGT_GS_VERTEX_REUSE 0x0088D4 +#define S_0088D4_VERT_REUSE(x) (((x) & 0x1F) << 0) +#define G_0088D4_VERT_REUSE(x) (((x) >> 0) & 0x1F) +#define C_0088D4_VERT_REUSE 0xFFFFFFE0 +#define R_008958_VGT_PRIMITIVE_TYPE 0x008958 +#define S_008958_PRIM_TYPE(x) (((x) & 0x3F) << 0) +#define G_008958_PRIM_TYPE(x) (((x) >> 0) & 0x3F) +#define C_008958_PRIM_TYPE 0xFFFFFFC0 +#define V_008958_DI_PT_NONE 0x00 +#define V_008958_DI_PT_POINTLIST 0x01 +#define V_008958_DI_PT_LINELIST 0x02 +#define V_008958_DI_PT_LINESTRIP 0x03 +#define V_008958_DI_PT_TRILIST 0x04 +#define V_008958_DI_PT_TRIFAN 0x05 +#define V_008958_DI_PT_TRISTRIP 0x06 +#define V_008958_DI_PT_UNUSED_0 0x07 +#define V_008958_DI_PT_UNUSED_1 0x08 +#define V_008958_DI_PT_PATCH 0x09 +#define V_008958_DI_PT_LINELIST_ADJ 0x0A +#define V_008958_DI_PT_LINESTRIP_ADJ 0x0B +#define V_008958_DI_PT_TRILIST_ADJ 0x0C +#define 
V_008958_DI_PT_TRISTRIP_ADJ 0x0D +#define V_008958_DI_PT_UNUSED_3 0x0E +#define V_008958_DI_PT_UNUSED_4 0x0F +#define V_008958_DI_PT_TRI_WITH_WFLAGS 0x10 +#define V_008958_DI_PT_RECTLIST 0x11 +#define V_008958_DI_PT_LINELOOP 0x12 +#define V_008958_DI_PT_QUADLIST 0x13 +#define V_008958_DI_PT_QUADSTRIP 0x14 +#define V_008958_DI_PT_POLYGON 0x15 +#define V_008958_DI_PT_2D_COPY_RECT_LIST_V0 0x16 +#define V_008958_DI_PT_2D_COPY_RECT_LIST_V1 0x17 +#define V_008958_DI_PT_2D_COPY_RECT_LIST_V2 0x18 +#define V_008958_DI_PT_2D_COPY_RECT_LIST_V3 0x19 +#define V_008958_DI_PT_2D_FILL_RECT_LIST 0x1A +#define V_008958_DI_PT_2D_LINE_STRIP 0x1B +#define V_008958_DI_PT_2D_TRI_STRIP 0x1C +#define R_00895C_VGT_INDEX_TYPE 0x00895C +#define S_00895C_INDEX_TYPE(x) (((x) & 0x03) << 0) +#define G_00895C_INDEX_TYPE(x) (((x) >> 0) & 0x03) +#define C_00895C_INDEX_TYPE 0xFFFFFFFC +#define V_00895C_DI_INDEX_SIZE_16_BIT 0x00 +#define V_00895C_DI_INDEX_SIZE_32_BIT 0x01 +#define R_008960_VGT_STRMOUT_BUFFER_FILLED_SIZE_0 0x008960 +#define R_008964_VGT_STRMOUT_BUFFER_FILLED_SIZE_1 0x008964 +#define R_008968_VGT_STRMOUT_BUFFER_FILLED_SIZE_2 0x008968 +#define R_00896C_VGT_STRMOUT_BUFFER_FILLED_SIZE_3 0x00896C +#define R_008970_VGT_NUM_INDICES 0x008970 +#define R_008974_VGT_NUM_INSTANCES 0x008974 +#define R_008988_VGT_TF_RING_SIZE 0x008988 +#define S_008988_SIZE(x) (((x) & 0xFFFF) << 0) +#define G_008988_SIZE(x) (((x) >> 0) & 0xFFFF) +#define C_008988_SIZE 0xFFFF0000 +#define R_0089B0_VGT_HS_OFFCHIP_PARAM 0x0089B0 +#define S_0089B0_OFFCHIP_BUFFERING(x) (((x) & 0x7F) << 0) +#define G_0089B0_OFFCHIP_BUFFERING(x) (((x) >> 0) & 0x7F) +#define C_0089B0_OFFCHIP_BUFFERING 0xFFFFFF80 +#define R_0089B8_VGT_TF_MEMORY_BASE 0x0089B8 +#define R_008A14_PA_CL_ENHANCE 0x008A14 +#define S_008A14_CLIP_VTX_REORDER_ENA(x) (((x) & 0x1) << 0) +#define G_008A14_CLIP_VTX_REORDER_ENA(x) (((x) >> 0) & 0x1) +#define C_008A14_CLIP_VTX_REORDER_ENA 0xFFFFFFFE +#define S_008A14_NUM_CLIP_SEQ(x) (((x) & 0x03) << 1) +#define 
G_008A14_NUM_CLIP_SEQ(x) (((x) >> 1) & 0x03) +#define C_008A14_NUM_CLIP_SEQ 0xFFFFFFF9 +#define S_008A14_CLIPPED_PRIM_SEQ_STALL(x) (((x) & 0x1) << 3) +#define G_008A14_CLIPPED_PRIM_SEQ_STALL(x) (((x) >> 3) & 0x1) +#define C_008A14_CLIPPED_PRIM_SEQ_STALL 0xFFFFFFF7 +#define S_008A14_VE_NAN_PROC_DISABLE(x) (((x) & 0x1) << 4) +#define G_008A14_VE_NAN_PROC_DISABLE(x) (((x) >> 4) & 0x1) +#define C_008A14_VE_NAN_PROC_DISABLE 0xFFFFFFEF +#define R_008A60_PA_SU_LINE_STIPPLE_VALUE 0x008A60 +#define S_008A60_LINE_STIPPLE_VALUE(x) (((x) & 0xFFFFFF) << 0) +#define G_008A60_LINE_STIPPLE_VALUE(x) (((x) >> 0) & 0xFFFFFF) +#define C_008A60_LINE_STIPPLE_VALUE 0xFF000000 +#define R_008B10_PA_SC_LINE_STIPPLE_STATE 0x008B10 +#define S_008B10_CURRENT_PTR(x) (((x) & 0x0F) << 0) +#define G_008B10_CURRENT_PTR(x) (((x) >> 0) & 0x0F) +#define C_008B10_CURRENT_PTR 0xFFFFFFF0 +#define S_008B10_CURRENT_COUNT(x) (((x) & 0xFF) << 8) +#define G_008B10_CURRENT_COUNT(x) (((x) >> 8) & 0xFF) +#define C_008B10_CURRENT_COUNT 0xFFFF00FF +#define R_008BF0_PA_SC_ENHANCE 0x008BF0 +#define S_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x) (((x) & 0x1) << 0) +#define G_008BF0_ENABLE_PA_SC_OUT_OF_ORDER(x) (((x) >> 0) & 0x1) +#define C_008BF0_ENABLE_PA_SC_OUT_OF_ORDER 0xFFFFFFFE +#define S_008BF0_DISABLE_SC_DB_TILE_FIX(x) (((x) & 0x1) << 1) +#define G_008BF0_DISABLE_SC_DB_TILE_FIX(x) (((x) >> 1) & 0x1) +#define C_008BF0_DISABLE_SC_DB_TILE_FIX 0xFFFFFFFD +#define S_008BF0_DISABLE_AA_MASK_FULL_FIX(x) (((x) & 0x1) << 2) +#define G_008BF0_DISABLE_AA_MASK_FULL_FIX(x) (((x) >> 2) & 0x1) +#define C_008BF0_DISABLE_AA_MASK_FULL_FIX 0xFFFFFFFB +#define S_008BF0_ENABLE_1XMSAA_SAMPLE_LOCATIONS(x) (((x) & 0x1) << 3) +#define G_008BF0_ENABLE_1XMSAA_SAMPLE_LOCATIONS(x) (((x) >> 3) & 0x1) +#define C_008BF0_ENABLE_1XMSAA_SAMPLE_LOCATIONS 0xFFFFFFF7 +#define S_008BF0_ENABLE_1XMSAA_SAMPLE_LOC_CENTROID(x) (((x) & 0x1) << 4) +#define G_008BF0_ENABLE_1XMSAA_SAMPLE_LOC_CENTROID(x) (((x) >> 4) & 0x1) +#define 
C_008BF0_ENABLE_1XMSAA_SAMPLE_LOC_CENTROID 0xFFFFFFEF +#define S_008BF0_DISABLE_SCISSOR_FIX(x) (((x) & 0x1) << 5) +#define G_008BF0_DISABLE_SCISSOR_FIX(x) (((x) >> 5) & 0x1) +#define C_008BF0_DISABLE_SCISSOR_FIX 0xFFFFFFDF +#define S_008BF0_DISABLE_PW_BUBBLE_COLLAPSE(x) (((x) & 0x03) << 6) +#define G_008BF0_DISABLE_PW_BUBBLE_COLLAPSE(x) (((x) >> 6) & 0x03) +#define C_008BF0_DISABLE_PW_BUBBLE_COLLAPSE 0xFFFFFF3F +#define S_008BF0_SEND_UNLIT_STILES_TO_PACKER(x) (((x) & 0x1) << 8) +#define G_008BF0_SEND_UNLIT_STILES_TO_PACKER(x) (((x) >> 8) & 0x1) +#define C_008BF0_SEND_UNLIT_STILES_TO_PACKER 0xFFFFFEFF +#define S_008BF0_DISABLE_DUALGRAD_PERF_OPTIMIZATION(x) (((x) & 0x1) << 9) +#define G_008BF0_DISABLE_DUALGRAD_PERF_OPTIMIZATION(x) (((x) >> 9) & 0x1) +#define C_008BF0_DISABLE_DUALGRAD_PERF_OPTIMIZATION 0xFFFFFDFF +#define R_008C08_SQC_CACHES 0x008C08 +#define S_008C08_INST_INVALIDATE(x) (((x) & 0x1) << 0) +#define G_008C08_INST_INVALIDATE(x) (((x) >> 0) & 0x1) +#define C_008C08_INST_INVALIDATE 0xFFFFFFFE +#define S_008C08_DATA_INVALIDATE(x) (((x) & 0x1) << 1) +#define G_008C08_DATA_INVALIDATE(x) (((x) >> 1) & 0x1) +#define C_008C08_DATA_INVALIDATE 0xFFFFFFFD +#define R_008C0C_SQ_RANDOM_WAVE_PRI 0x008C0C +#define S_008C0C_RET(x) (((x) & 0x7F) << 0) +#define G_008C0C_RET(x) (((x) >> 0) & 0x7F) +#define C_008C0C_RET 0xFFFFFF80 +#define S_008C0C_RUI(x) (((x) & 0x07) << 7) +#define G_008C0C_RUI(x) (((x) >> 7) & 0x07) +#define C_008C0C_RUI 0xFFFFFC7F +#define S_008C0C_RNG(x) (((x) & 0x7FF) << 10) +#define G_008C0C_RNG(x) (((x) >> 10) & 0x7FF) +#define C_008C0C_RNG 0xFFE003FF +#if 0 +#define R_008DFC_SQ_INST 0x008DFC +#define R_008DFC_SQ_VOP1 0x008DFC +#define S_008DFC_SRC0(x) (((x) & 0x1FF) << 0) +#define G_008DFC_SRC0(x) (((x) >> 0) & 0x1FF) +#define C_008DFC_SRC0 0xFFFFFE00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E 
+#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define 
V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define V_008DFC_SQ_SRC_VGPR 0x100 +#define 
S_008DFC_OP(x) (((x) & 0xFF) << 9) +#define G_008DFC_OP(x) (((x) >> 9) & 0xFF) +#define C_008DFC_OP 0xFFFE01FF +#define V_008DFC_SQ_V_NOP 0x00 +#define V_008DFC_SQ_V_MOV_B32 0x01 +#define V_008DFC_SQ_V_READFIRSTLANE_B32 0x02 +#define V_008DFC_SQ_V_CVT_I32_F64 0x03 +#define V_008DFC_SQ_V_CVT_F64_I32 0x04 +#define V_008DFC_SQ_V_CVT_F32_I32 0x05 +#define V_008DFC_SQ_V_CVT_F32_U32 0x06 +#define V_008DFC_SQ_V_CVT_U32_F32 0x07 +#define V_008DFC_SQ_V_CVT_I32_F32 0x08 +#define V_008DFC_SQ_V_MOV_FED_B32 0x09 +#define V_008DFC_SQ_V_CVT_F16_F32 0x0A +#define V_008DFC_SQ_V_CVT_F32_F16 0x0B +#define V_008DFC_SQ_V_CVT_RPI_I32_F32 0x0C +#define V_008DFC_SQ_V_CVT_FLR_I32_F32 0x0D +#define V_008DFC_SQ_V_CVT_OFF_F32_I4 0x0E +#define V_008DFC_SQ_V_CVT_F32_F64 0x0F +#define V_008DFC_SQ_V_CVT_F64_F32 0x10 +#define V_008DFC_SQ_V_CVT_F32_UBYTE0 0x11 +#define V_008DFC_SQ_V_CVT_F32_UBYTE1 0x12 +#define V_008DFC_SQ_V_CVT_F32_UBYTE2 0x13 +#define V_008DFC_SQ_V_CVT_F32_UBYTE3 0x14 +#define V_008DFC_SQ_V_CVT_U32_F64 0x15 +#define V_008DFC_SQ_V_CVT_F64_U32 0x16 +#define V_008DFC_SQ_V_FRACT_F32 0x20 +#define V_008DFC_SQ_V_TRUNC_F32 0x21 +#define V_008DFC_SQ_V_CEIL_F32 0x22 +#define V_008DFC_SQ_V_RNDNE_F32 0x23 +#define V_008DFC_SQ_V_FLOOR_F32 0x24 +#define V_008DFC_SQ_V_EXP_F32 0x25 +#define V_008DFC_SQ_V_LOG_CLAMP_F32 0x26 +#define V_008DFC_SQ_V_LOG_F32 0x27 +#define V_008DFC_SQ_V_RCP_CLAMP_F32 0x28 +#define V_008DFC_SQ_V_RCP_LEGACY_F32 0x29 +#define V_008DFC_SQ_V_RCP_F32 0x2A +#define V_008DFC_SQ_V_RCP_IFLAG_F32 0x2B +#define V_008DFC_SQ_V_RSQ_CLAMP_F32 0x2C +#define V_008DFC_SQ_V_RSQ_LEGACY_F32 0x2D +#define V_008DFC_SQ_V_RSQ_F32 0x2E +#define V_008DFC_SQ_V_RCP_F64 0x2F +#define V_008DFC_SQ_V_RCP_CLAMP_F64 0x30 +#define V_008DFC_SQ_V_RSQ_F64 0x31 +#define V_008DFC_SQ_V_RSQ_CLAMP_F64 0x32 +#define V_008DFC_SQ_V_SQRT_F32 0x33 +#define V_008DFC_SQ_V_SQRT_F64 0x34 +#define V_008DFC_SQ_V_SIN_F32 0x35 +#define V_008DFC_SQ_V_COS_F32 0x36 +#define V_008DFC_SQ_V_NOT_B32 0x37 +#define 
V_008DFC_SQ_V_BFREV_B32 0x38 +#define V_008DFC_SQ_V_FFBH_U32 0x39 +#define V_008DFC_SQ_V_FFBL_B32 0x3A +#define V_008DFC_SQ_V_FFBH_I32 0x3B +#define V_008DFC_SQ_V_FREXP_EXP_I32_F64 0x3C +#define V_008DFC_SQ_V_FREXP_MANT_F64 0x3D +#define V_008DFC_SQ_V_FRACT_F64 0x3E +#define V_008DFC_SQ_V_FREXP_EXP_I32_F32 0x3F +#define V_008DFC_SQ_V_FREXP_MANT_F32 0x40 +#define V_008DFC_SQ_V_CLREXCP 0x41 +#define V_008DFC_SQ_V_MOVRELD_B32 0x42 +#define V_008DFC_SQ_V_MOVRELS_B32 0x43 +#define V_008DFC_SQ_V_MOVRELSD_B32 0x44 +#define S_008DFC_VDST(x) (((x) & 0xFF) << 17) +#define G_008DFC_VDST(x) (((x) >> 17) & 0xFF) +#define C_008DFC_VDST 0xFE01FFFF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_ENCODING(x) (((x) & 0x7F) << 25) +#define G_008DFC_ENCODING(x) (((x) >> 25) & 0x7F) +#define C_008DFC_ENCODING 0x01FFFFFF +#define V_008DFC_SQ_ENC_VOP1_FIELD 0x3F +#define R_008DFC_SQ_MIMG_1 0x008DFC +#define S_008DFC_VADDR(x) (((x) & 0xFF) << 0) +#define G_008DFC_VADDR(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VADDR 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VDATA(x) (((x) & 0xFF) << 8) +#define G_008DFC_VDATA(x) (((x) >> 8) & 0xFF) +#define C_008DFC_VDATA 0xFFFF00FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_SRSRC(x) (((x) & 0x1F) << 16) +#define G_008DFC_SRSRC(x) (((x) >> 16) & 0x1F) +#define C_008DFC_SRSRC 0xFFE0FFFF +#define S_008DFC_SSAMP(x) (((x) & 0x1F) << 21) +#define G_008DFC_SSAMP(x) (((x) >> 21) & 0x1F) +#define C_008DFC_SSAMP 0xFC1FFFFF +#define R_008DFC_SQ_VOP3_1 0x008DFC +#define S_008DFC_SRC0(x) (((x) & 0x1FF) << 0) +#define G_008DFC_SRC0(x) (((x) >> 0) & 0x1FF) +#define C_008DFC_SRC0 0xFFFFFE00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 
+#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define 
V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define V_008DFC_SQ_SRC_VGPR 0x100 +#define S_008DFC_SRC1(x) (((x) & 0x1FF) << 9) +#define G_008DFC_SRC1(x) (((x) >> 9) & 0x1FF) +#define C_008DFC_SRC1 0xFFFC01FF +#define V_008DFC_SQ_SGPR 0x00 
+#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 
+#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define 
V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define V_008DFC_SQ_SRC_VGPR 0x100 +#define S_008DFC_SRC2(x) (((x) & 0x1FF) << 18) +#define G_008DFC_SRC2(x) (((x) >> 18) & 0x1FF) +#define C_008DFC_SRC2 0xF803FFFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 
0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define 
V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define V_008DFC_SQ_SRC_VGPR 0x100 +#define S_008DFC_OMOD(x) (((x) & 0x03) << 27) +#define G_008DFC_OMOD(x) (((x) >> 27) & 0x03) +#define C_008DFC_OMOD 0xE7FFFFFF +#define V_008DFC_SQ_OMOD_OFF 0x00 +#define V_008DFC_SQ_OMOD_M2 0x01 +#define V_008DFC_SQ_OMOD_M4 0x02 +#define V_008DFC_SQ_OMOD_D2 0x03 +#define S_008DFC_NEG(x) (((x) & 0x07) << 29) +#define G_008DFC_NEG(x) (((x) >> 29) & 0x07) +#define C_008DFC_NEG 0x1FFFFFFF +#define R_008DFC_SQ_MUBUF_1 0x008DFC +#define S_008DFC_VADDR(x) (((x) & 0xFF) << 0) +#define G_008DFC_VADDR(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VADDR 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VDATA(x) (((x) & 0xFF) << 8) +#define G_008DFC_VDATA(x) (((x) >> 8) & 0xFF) +#define C_008DFC_VDATA 0xFFFF00FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_SRSRC(x) (((x) & 0x1F) << 16) +#define G_008DFC_SRSRC(x) (((x) >> 16) & 0x1F) +#define C_008DFC_SRSRC 0xFFE0FFFF +#define S_008DFC_SLC(x) (((x) & 0x1) << 22) +#define G_008DFC_SLC(x) (((x) >> 22) & 0x1) +#define C_008DFC_SLC 0xFFBFFFFF +#define S_008DFC_TFE(x) (((x) & 0x1) << 23) +#define G_008DFC_TFE(x) (((x) >> 23) & 0x1) +#define C_008DFC_TFE 0xFF7FFFFF +#define S_008DFC_SOFFSET(x) (((x) & 0xFF) << 24) +#define G_008DFC_SOFFSET(x) (((x) >> 24) & 0xFF) +#define C_008DFC_SOFFSET 0x00FFFFFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 
+#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA 
+#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define R_008DFC_SQ_DS_0 0x008DFC +#define S_008DFC_OFFSET0(x) (((x) & 0xFF) << 0) +#define G_008DFC_OFFSET0(x) (((x) >> 0) & 0xFF) +#define 
C_008DFC_OFFSET0 0xFFFFFF00 +#define S_008DFC_OFFSET1(x) (((x) & 0xFF) << 8) +#define G_008DFC_OFFSET1(x) (((x) >> 8) & 0xFF) +#define C_008DFC_OFFSET1 0xFFFF00FF +#define S_008DFC_GDS(x) (((x) & 0x1) << 17) +#define G_008DFC_GDS(x) (((x) >> 17) & 0x1) +#define C_008DFC_GDS 0xFFFDFFFF +#define S_008DFC_OP(x) (((x) & 0xFF) << 18) +#define G_008DFC_OP(x) (((x) >> 18) & 0xFF) +#define C_008DFC_OP 0xFC03FFFF +#define V_008DFC_SQ_DS_ADD_U32 0x00 +#define V_008DFC_SQ_DS_SUB_U32 0x01 +#define V_008DFC_SQ_DS_RSUB_U32 0x02 +#define V_008DFC_SQ_DS_INC_U32 0x03 +#define V_008DFC_SQ_DS_DEC_U32 0x04 +#define V_008DFC_SQ_DS_MIN_I32 0x05 +#define V_008DFC_SQ_DS_MAX_I32 0x06 +#define V_008DFC_SQ_DS_MIN_U32 0x07 +#define V_008DFC_SQ_DS_MAX_U32 0x08 +#define V_008DFC_SQ_DS_AND_B32 0x09 +#define V_008DFC_SQ_DS_OR_B32 0x0A +#define V_008DFC_SQ_DS_XOR_B32 0x0B +#define V_008DFC_SQ_DS_MSKOR_B32 0x0C +#define V_008DFC_SQ_DS_WRITE_B32 0x0D +#define V_008DFC_SQ_DS_WRITE2_B32 0x0E +#define V_008DFC_SQ_DS_WRITE2ST64_B32 0x0F +#define V_008DFC_SQ_DS_CMPST_B32 0x10 +#define V_008DFC_SQ_DS_CMPST_F32 0x11 +#define V_008DFC_SQ_DS_MIN_F32 0x12 +#define V_008DFC_SQ_DS_MAX_F32 0x13 +#define V_008DFC_SQ_DS_GWS_INIT 0x19 +#define V_008DFC_SQ_DS_GWS_SEMA_V 0x1A +#define V_008DFC_SQ_DS_GWS_SEMA_BR 0x1B +#define V_008DFC_SQ_DS_GWS_SEMA_P 0x1C +#define V_008DFC_SQ_DS_GWS_BARRIER 0x1D +#define V_008DFC_SQ_DS_WRITE_B8 0x1E +#define V_008DFC_SQ_DS_WRITE_B16 0x1F +#define V_008DFC_SQ_DS_ADD_RTN_U32 0x20 +#define V_008DFC_SQ_DS_SUB_RTN_U32 0x21 +#define V_008DFC_SQ_DS_RSUB_RTN_U32 0x22 +#define V_008DFC_SQ_DS_INC_RTN_U32 0x23 +#define V_008DFC_SQ_DS_DEC_RTN_U32 0x24 +#define V_008DFC_SQ_DS_MIN_RTN_I32 0x25 +#define V_008DFC_SQ_DS_MAX_RTN_I32 0x26 +#define V_008DFC_SQ_DS_MIN_RTN_U32 0x27 +#define V_008DFC_SQ_DS_MAX_RTN_U32 0x28 +#define V_008DFC_SQ_DS_AND_RTN_B32 0x29 +#define V_008DFC_SQ_DS_OR_RTN_B32 0x2A +#define V_008DFC_SQ_DS_XOR_RTN_B32 0x2B +#define V_008DFC_SQ_DS_MSKOR_RTN_B32 0x2C +#define 
V_008DFC_SQ_DS_WRXCHG_RTN_B32 0x2D +#define V_008DFC_SQ_DS_WRXCHG2_RTN_B32 0x2E +#define V_008DFC_SQ_DS_WRXCHG2ST64_RTN_B32 0x2F +#define V_008DFC_SQ_DS_CMPST_RTN_B32 0x30 +#define V_008DFC_SQ_DS_CMPST_RTN_F32 0x31 +#define V_008DFC_SQ_DS_MIN_RTN_F32 0x32 +#define V_008DFC_SQ_DS_MAX_RTN_F32 0x33 +#define V_008DFC_SQ_DS_SWIZZLE_B32 0x35 +#define V_008DFC_SQ_DS_READ_B32 0x36 +#define V_008DFC_SQ_DS_READ2_B32 0x37 +#define V_008DFC_SQ_DS_READ2ST64_B32 0x38 +#define V_008DFC_SQ_DS_READ_I8 0x39 +#define V_008DFC_SQ_DS_READ_U8 0x3A +#define V_008DFC_SQ_DS_READ_I16 0x3B +#define V_008DFC_SQ_DS_READ_U16 0x3C +#define V_008DFC_SQ_DS_CONSUME 0x3D +#define V_008DFC_SQ_DS_APPEND 0x3E +#define V_008DFC_SQ_DS_ORDERED_COUNT 0x3F +#define V_008DFC_SQ_DS_ADD_U64 0x40 +#define V_008DFC_SQ_DS_SUB_U64 0x41 +#define V_008DFC_SQ_DS_RSUB_U64 0x42 +#define V_008DFC_SQ_DS_INC_U64 0x43 +#define V_008DFC_SQ_DS_DEC_U64 0x44 +#define V_008DFC_SQ_DS_MIN_I64 0x45 +#define V_008DFC_SQ_DS_MAX_I64 0x46 +#define V_008DFC_SQ_DS_MIN_U64 0x47 +#define V_008DFC_SQ_DS_MAX_U64 0x48 +#define V_008DFC_SQ_DS_AND_B64 0x49 +#define V_008DFC_SQ_DS_OR_B64 0x4A +#define V_008DFC_SQ_DS_XOR_B64 0x4B +#define V_008DFC_SQ_DS_MSKOR_B64 0x4C +#define V_008DFC_SQ_DS_WRITE_B64 0x4D +#define V_008DFC_SQ_DS_WRITE2_B64 0x4E +#define V_008DFC_SQ_DS_WRITE2ST64_B64 0x4F +#define V_008DFC_SQ_DS_CMPST_B64 0x50 +#define V_008DFC_SQ_DS_CMPST_F64 0x51 +#define V_008DFC_SQ_DS_MIN_F64 0x52 +#define V_008DFC_SQ_DS_MAX_F64 0x53 +#define V_008DFC_SQ_DS_ADD_RTN_U64 0x60 +#define V_008DFC_SQ_DS_SUB_RTN_U64 0x61 +#define V_008DFC_SQ_DS_RSUB_RTN_U64 0x62 +#define V_008DFC_SQ_DS_INC_RTN_U64 0x63 +#define V_008DFC_SQ_DS_DEC_RTN_U64 0x64 +#define V_008DFC_SQ_DS_MIN_RTN_I64 0x65 +#define V_008DFC_SQ_DS_MAX_RTN_I64 0x66 +#define V_008DFC_SQ_DS_MIN_RTN_U64 0x67 +#define V_008DFC_SQ_DS_MAX_RTN_U64 0x68 +#define V_008DFC_SQ_DS_AND_RTN_B64 0x69 +#define V_008DFC_SQ_DS_OR_RTN_B64 0x6A +#define V_008DFC_SQ_DS_XOR_RTN_B64 0x6B +#define 
V_008DFC_SQ_DS_MSKOR_RTN_B64 0x6C +#define V_008DFC_SQ_DS_WRXCHG_RTN_B64 0x6D +#define V_008DFC_SQ_DS_WRXCHG2_RTN_B64 0x6E +#define V_008DFC_SQ_DS_WRXCHG2ST64_RTN_B64 0x6F +#define V_008DFC_SQ_DS_CMPST_RTN_B64 0x70 +#define V_008DFC_SQ_DS_CMPST_RTN_F64 0x71 +#define V_008DFC_SQ_DS_MIN_RTN_F64 0x72 +#define V_008DFC_SQ_DS_MAX_RTN_F64 0x73 +#define V_008DFC_SQ_DS_READ_B64 0x76 +#define V_008DFC_SQ_DS_READ2_B64 0x77 +#define V_008DFC_SQ_DS_READ2ST64_B64 0x78 +#define V_008DFC_SQ_DS_ADD_SRC2_U32 0x80 +#define V_008DFC_SQ_DS_SUB_SRC2_U32 0x81 +#define V_008DFC_SQ_DS_RSUB_SRC2_U32 0x82 +#define V_008DFC_SQ_DS_INC_SRC2_U32 0x83 +#define V_008DFC_SQ_DS_DEC_SRC2_U32 0x84 +#define V_008DFC_SQ_DS_MIN_SRC2_I32 0x85 +#define V_008DFC_SQ_DS_MAX_SRC2_I32 0x86 +#define V_008DFC_SQ_DS_MIN_SRC2_U32 0x87 +#define V_008DFC_SQ_DS_MAX_SRC2_U32 0x88 +#define V_008DFC_SQ_DS_AND_SRC2_B32 0x89 +#define V_008DFC_SQ_DS_OR_SRC2_B32 0x8A +#define V_008DFC_SQ_DS_XOR_SRC2_B32 0x8B +#define V_008DFC_SQ_DS_WRITE_SRC2_B32 0x8D +#define V_008DFC_SQ_DS_MIN_SRC2_F32 0x92 +#define V_008DFC_SQ_DS_MAX_SRC2_F32 0x93 +#define V_008DFC_SQ_DS_ADD_SRC2_U64 0xC0 +#define V_008DFC_SQ_DS_SUB_SRC2_U64 0xC1 +#define V_008DFC_SQ_DS_RSUB_SRC2_U64 0xC2 +#define V_008DFC_SQ_DS_INC_SRC2_U64 0xC3 +#define V_008DFC_SQ_DS_DEC_SRC2_U64 0xC4 +#define V_008DFC_SQ_DS_MIN_SRC2_I64 0xC5 +#define V_008DFC_SQ_DS_MAX_SRC2_I64 0xC6 +#define V_008DFC_SQ_DS_MIN_SRC2_U64 0xC7 +#define V_008DFC_SQ_DS_MAX_SRC2_U64 0xC8 +#define V_008DFC_SQ_DS_AND_SRC2_B64 0xC9 +#define V_008DFC_SQ_DS_OR_SRC2_B64 0xCA +#define V_008DFC_SQ_DS_XOR_SRC2_B64 0xCB +#define V_008DFC_SQ_DS_WRITE_SRC2_B64 0xCD +#define V_008DFC_SQ_DS_MIN_SRC2_F64 0xD2 +#define V_008DFC_SQ_DS_MAX_SRC2_F64 0xD3 +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_DS_FIELD 0x36 +#define R_008DFC_SQ_SOPC 0x008DFC +#define S_008DFC_SSRC0(x) (((x) & 0xFF) << 0) +#define 
G_008DFC_SSRC0(x) (((x) >> 0) & 0xFF) +#define C_008DFC_SSRC0 0xFFFFFF00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 
+#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 
0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define S_008DFC_SSRC1(x) (((x) & 0xFF) << 8) +#define G_008DFC_SSRC1(x) (((x) >> 8) & 0xFF) +#define C_008DFC_SSRC1 0xFFFF00FF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define 
V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define 
V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define S_008DFC_OP(x) (((x) & 0x7F) << 16) +#define G_008DFC_OP(x) (((x) >> 16) & 0x7F) +#define C_008DFC_OP 0xFF80FFFF +#define V_008DFC_SQ_S_CMP_EQ_I32 0x00 +#define V_008DFC_SQ_S_CMP_LG_I32 0x01 +#define V_008DFC_SQ_S_CMP_GT_I32 0x02 +#define V_008DFC_SQ_S_CMP_GE_I32 0x03 +#define V_008DFC_SQ_S_CMP_LT_I32 0x04 +#define V_008DFC_SQ_S_CMP_LE_I32 0x05 +#define V_008DFC_SQ_S_CMP_EQ_U32 0x06 +#define V_008DFC_SQ_S_CMP_LG_U32 0x07 +#define V_008DFC_SQ_S_CMP_GT_U32 0x08 +#define V_008DFC_SQ_S_CMP_GE_U32 0x09 +#define V_008DFC_SQ_S_CMP_LT_U32 0x0A +#define V_008DFC_SQ_S_CMP_LE_U32 0x0B +#define V_008DFC_SQ_S_BITCMP0_B32 0x0C +#define V_008DFC_SQ_S_BITCMP1_B32 0x0D +#define V_008DFC_SQ_S_BITCMP0_B64 0x0E +#define V_008DFC_SQ_S_BITCMP1_B64 0x0F +#define V_008DFC_SQ_S_SETVSKIP 0x10 +#define S_008DFC_ENCODING(x) (((x) & 0x1FF) << 23) +#define G_008DFC_ENCODING(x) (((x) >> 23) & 0x1FF) +#define C_008DFC_ENCODING 0x007FFFFF +#define V_008DFC_SQ_ENC_SOPC_FIELD 0x17E +#endif +#define R_008DFC_SQ_EXP_0 0x008DFC +#define S_008DFC_EN(x) (((x) & 0x0F) << 0) +#define G_008DFC_EN(x) (((x) >> 0) & 0x0F) +#define C_008DFC_EN 0xFFFFFFF0 +#define S_008DFC_TGT(x) (((x) & 0x3F) << 4) +#define G_008DFC_TGT(x) (((x) >> 4) & 0x3F) +#define C_008DFC_TGT 0xFFFFFC0F +#define V_008DFC_SQ_EXP_MRT 0x00 +#define V_008DFC_SQ_EXP_MRTZ 0x08 +#define V_008DFC_SQ_EXP_NULL 0x09 +#define V_008DFC_SQ_EXP_POS 0x0C +#define V_008DFC_SQ_EXP_PARAM 0x20 +#define S_008DFC_COMPR(x) (((x) & 0x1) << 
10) +#define G_008DFC_COMPR(x) (((x) >> 10) & 0x1) +#define C_008DFC_COMPR 0xFFFFFBFF +#define S_008DFC_DONE(x) (((x) & 0x1) << 11) +#define G_008DFC_DONE(x) (((x) >> 11) & 0x1) +#define C_008DFC_DONE 0xFFFFF7FF +#define S_008DFC_VM(x) (((x) & 0x1) << 12) +#define G_008DFC_VM(x) (((x) >> 12) & 0x1) +#define C_008DFC_VM 0xFFFFEFFF +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_EXP_FIELD 0x3E +#if 0 +#define R_008DFC_SQ_MIMG_0 0x008DFC +#define S_008DFC_DMASK(x) (((x) & 0x0F) << 8) +#define G_008DFC_DMASK(x) (((x) >> 8) & 0x0F) +#define C_008DFC_DMASK 0xFFFFF0FF +#define S_008DFC_UNORM(x) (((x) & 0x1) << 12) +#define G_008DFC_UNORM(x) (((x) >> 12) & 0x1) +#define C_008DFC_UNORM 0xFFFFEFFF +#define S_008DFC_GLC(x) (((x) & 0x1) << 13) +#define G_008DFC_GLC(x) (((x) >> 13) & 0x1) +#define C_008DFC_GLC 0xFFFFDFFF +#define S_008DFC_DA(x) (((x) & 0x1) << 14) +#define G_008DFC_DA(x) (((x) >> 14) & 0x1) +#define C_008DFC_DA 0xFFFFBFFF +#define S_008DFC_R128(x) (((x) & 0x1) << 15) +#define G_008DFC_R128(x) (((x) >> 15) & 0x1) +#define C_008DFC_R128 0xFFFF7FFF +#define S_008DFC_TFE(x) (((x) & 0x1) << 16) +#define G_008DFC_TFE(x) (((x) >> 16) & 0x1) +#define C_008DFC_TFE 0xFFFEFFFF +#define S_008DFC_LWE(x) (((x) & 0x1) << 17) +#define G_008DFC_LWE(x) (((x) >> 17) & 0x1) +#define C_008DFC_LWE 0xFFFDFFFF +#define S_008DFC_OP(x) (((x) & 0x7F) << 18) +#define G_008DFC_OP(x) (((x) >> 18) & 0x7F) +#define C_008DFC_OP 0xFE03FFFF +#define V_008DFC_SQ_IMAGE_LOAD 0x00 +#define V_008DFC_SQ_IMAGE_LOAD_MIP 0x01 +#define V_008DFC_SQ_IMAGE_LOAD_PCK 0x02 +#define V_008DFC_SQ_IMAGE_LOAD_PCK_SGN 0x03 +#define V_008DFC_SQ_IMAGE_LOAD_MIP_PCK 0x04 +#define V_008DFC_SQ_IMAGE_LOAD_MIP_PCK_SGN 0x05 +#define V_008DFC_SQ_IMAGE_STORE 0x08 +#define V_008DFC_SQ_IMAGE_STORE_MIP 0x09 +#define V_008DFC_SQ_IMAGE_STORE_PCK 0x0A +#define V_008DFC_SQ_IMAGE_STORE_MIP_PCK 0x0B +#define 
V_008DFC_SQ_IMAGE_GET_RESINFO 0x0E +#define V_008DFC_SQ_IMAGE_ATOMIC_SWAP 0x0F +#define V_008DFC_SQ_IMAGE_ATOMIC_CMPSWAP 0x10 +#define V_008DFC_SQ_IMAGE_ATOMIC_ADD 0x11 +#define V_008DFC_SQ_IMAGE_ATOMIC_SUB 0x12 +#define V_008DFC_SQ_IMAGE_ATOMIC_RSUB 0x13 +#define V_008DFC_SQ_IMAGE_ATOMIC_SMIN 0x14 +#define V_008DFC_SQ_IMAGE_ATOMIC_UMIN 0x15 +#define V_008DFC_SQ_IMAGE_ATOMIC_SMAX 0x16 +#define V_008DFC_SQ_IMAGE_ATOMIC_UMAX 0x17 +#define V_008DFC_SQ_IMAGE_ATOMIC_AND 0x18 +#define V_008DFC_SQ_IMAGE_ATOMIC_OR 0x19 +#define V_008DFC_SQ_IMAGE_ATOMIC_XOR 0x1A +#define V_008DFC_SQ_IMAGE_ATOMIC_INC 0x1B +#define V_008DFC_SQ_IMAGE_ATOMIC_DEC 0x1C +#define V_008DFC_SQ_IMAGE_ATOMIC_FCMPSWAP 0x1D +#define V_008DFC_SQ_IMAGE_ATOMIC_FMIN 0x1E +#define V_008DFC_SQ_IMAGE_ATOMIC_FMAX 0x1F +#define V_008DFC_SQ_IMAGE_SAMPLE 0x20 +#define V_008DFC_SQ_IMAGE_SAMPLE_CL 0x21 +#define V_008DFC_SQ_IMAGE_SAMPLE_D 0x22 +#define V_008DFC_SQ_IMAGE_SAMPLE_D_CL 0x23 +#define V_008DFC_SQ_IMAGE_SAMPLE_L 0x24 +#define V_008DFC_SQ_IMAGE_SAMPLE_B 0x25 +#define V_008DFC_SQ_IMAGE_SAMPLE_B_CL 0x26 +#define V_008DFC_SQ_IMAGE_SAMPLE_LZ 0x27 +#define V_008DFC_SQ_IMAGE_SAMPLE_C 0x28 +#define V_008DFC_SQ_IMAGE_SAMPLE_C_CL 0x29 +#define V_008DFC_SQ_IMAGE_SAMPLE_C_D 0x2A +#define V_008DFC_SQ_IMAGE_SAMPLE_C_D_CL 0x2B +#define V_008DFC_SQ_IMAGE_SAMPLE_C_L 0x2C +#define V_008DFC_SQ_IMAGE_SAMPLE_C_B 0x2D +#define V_008DFC_SQ_IMAGE_SAMPLE_C_B_CL 0x2E +#define V_008DFC_SQ_IMAGE_SAMPLE_C_LZ 0x2F +#define V_008DFC_SQ_IMAGE_SAMPLE_O 0x30 +#define V_008DFC_SQ_IMAGE_SAMPLE_CL_O 0x31 +#define V_008DFC_SQ_IMAGE_SAMPLE_D_O 0x32 +#define V_008DFC_SQ_IMAGE_SAMPLE_D_CL_O 0x33 +#define V_008DFC_SQ_IMAGE_SAMPLE_L_O 0x34 +#define V_008DFC_SQ_IMAGE_SAMPLE_B_O 0x35 +#define V_008DFC_SQ_IMAGE_SAMPLE_B_CL_O 0x36 +#define V_008DFC_SQ_IMAGE_SAMPLE_LZ_O 0x37 +#define V_008DFC_SQ_IMAGE_SAMPLE_C_O 0x38 +#define V_008DFC_SQ_IMAGE_SAMPLE_C_CL_O 0x39 +#define V_008DFC_SQ_IMAGE_SAMPLE_C_D_O 0x3A +#define V_008DFC_SQ_IMAGE_SAMPLE_C_D_CL_O 0x3B 
+#define V_008DFC_SQ_IMAGE_SAMPLE_C_L_O 0x3C +#define V_008DFC_SQ_IMAGE_SAMPLE_C_B_O 0x3D +#define V_008DFC_SQ_IMAGE_SAMPLE_C_B_CL_O 0x3E +#define V_008DFC_SQ_IMAGE_SAMPLE_C_LZ_O 0x3F +#define V_008DFC_SQ_IMAGE_GATHER4 0x40 +#define V_008DFC_SQ_IMAGE_GATHER4_CL 0x41 +#define V_008DFC_SQ_IMAGE_GATHER4_L 0x44 +#define V_008DFC_SQ_IMAGE_GATHER4_B 0x45 +#define V_008DFC_SQ_IMAGE_GATHER4_B_CL 0x46 +#define V_008DFC_SQ_IMAGE_GATHER4_LZ 0x47 +#define V_008DFC_SQ_IMAGE_GATHER4_C 0x48 +#define V_008DFC_SQ_IMAGE_GATHER4_C_CL 0x49 +#define V_008DFC_SQ_IMAGE_GATHER4_C_L 0x4C +#define V_008DFC_SQ_IMAGE_GATHER4_C_B 0x4D +#define V_008DFC_SQ_IMAGE_GATHER4_C_B_CL 0x4E +#define V_008DFC_SQ_IMAGE_GATHER4_C_LZ 0x4F +#define V_008DFC_SQ_IMAGE_GATHER4_O 0x50 +#define V_008DFC_SQ_IMAGE_GATHER4_CL_O 0x51 +#define V_008DFC_SQ_IMAGE_GATHER4_L_O 0x54 +#define V_008DFC_SQ_IMAGE_GATHER4_B_O 0x55 +#define V_008DFC_SQ_IMAGE_GATHER4_B_CL_O 0x56 +#define V_008DFC_SQ_IMAGE_GATHER4_LZ_O 0x57 +#define V_008DFC_SQ_IMAGE_GATHER4_C_O 0x58 +#define V_008DFC_SQ_IMAGE_GATHER4_C_CL_O 0x59 +#define V_008DFC_SQ_IMAGE_GATHER4_C_L_O 0x5C +#define V_008DFC_SQ_IMAGE_GATHER4_C_B_O 0x5D +#define V_008DFC_SQ_IMAGE_GATHER4_C_B_CL_O 0x5E +#define V_008DFC_SQ_IMAGE_GATHER4_C_LZ_O 0x5F +#define V_008DFC_SQ_IMAGE_GET_LOD 0x60 +#define V_008DFC_SQ_IMAGE_SAMPLE_CD 0x68 +#define V_008DFC_SQ_IMAGE_SAMPLE_CD_CL 0x69 +#define V_008DFC_SQ_IMAGE_SAMPLE_C_CD 0x6A +#define V_008DFC_SQ_IMAGE_SAMPLE_C_CD_CL 0x6B +#define V_008DFC_SQ_IMAGE_SAMPLE_CD_O 0x6C +#define V_008DFC_SQ_IMAGE_SAMPLE_CD_CL_O 0x6D +#define V_008DFC_SQ_IMAGE_SAMPLE_C_CD_O 0x6E +#define V_008DFC_SQ_IMAGE_SAMPLE_C_CD_CL_O 0x6F +#define V_008DFC_SQ_IMAGE_RSRC256 0x7E +#define V_008DFC_SQ_IMAGE_SAMPLER 0x7F +#define S_008DFC_SLC(x) (((x) & 0x1) << 25) +#define G_008DFC_SLC(x) (((x) >> 25) & 0x1) +#define C_008DFC_SLC 0xFDFFFFFF +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF 
+#define V_008DFC_SQ_ENC_MIMG_FIELD 0x3C +#define R_008DFC_SQ_SOPP 0x008DFC +#define S_008DFC_SIMM16(x) (((x) & 0xFFFF) << 0) +#define G_008DFC_SIMM16(x) (((x) >> 0) & 0xFFFF) +#define C_008DFC_SIMM16 0xFFFF0000 +#define S_008DFC_OP(x) (((x) & 0x7F) << 16) +#define G_008DFC_OP(x) (((x) >> 16) & 0x7F) +#define C_008DFC_OP 0xFF80FFFF +#define V_008DFC_SQ_S_NOP 0x00 +#define V_008DFC_SQ_S_ENDPGM 0x01 +#define V_008DFC_SQ_S_BRANCH 0x02 +#define V_008DFC_SQ_S_CBRANCH_SCC0 0x04 +#define V_008DFC_SQ_S_CBRANCH_SCC1 0x05 +#define V_008DFC_SQ_S_CBRANCH_VCCZ 0x06 +#define V_008DFC_SQ_S_CBRANCH_VCCNZ 0x07 +#define V_008DFC_SQ_S_CBRANCH_EXECZ 0x08 +#define V_008DFC_SQ_S_CBRANCH_EXECNZ 0x09 +#define V_008DFC_SQ_S_BARRIER 0x0A +#define V_008DFC_SQ_S_WAITCNT 0x0C +#define V_008DFC_SQ_S_SETHALT 0x0D +#define V_008DFC_SQ_S_SLEEP 0x0E +#define V_008DFC_SQ_S_SETPRIO 0x0F +#define V_008DFC_SQ_S_SENDMSG 0x10 +#define V_008DFC_SQ_S_SENDMSGHALT 0x11 +#define V_008DFC_SQ_S_TRAP 0x12 +#define V_008DFC_SQ_S_ICACHE_INV 0x13 +#define V_008DFC_SQ_S_INCPERFLEVEL 0x14 +#define V_008DFC_SQ_S_DECPERFLEVEL 0x15 +#define V_008DFC_SQ_S_TTRACEDATA 0x16 +#define S_008DFC_ENCODING(x) (((x) & 0x1FF) << 23) +#define G_008DFC_ENCODING(x) (((x) >> 23) & 0x1FF) +#define C_008DFC_ENCODING 0x007FFFFF +#define V_008DFC_SQ_ENC_SOPP_FIELD 0x17F +#define R_008DFC_SQ_VINTRP 0x008DFC +#define S_008DFC_VSRC(x) (((x) & 0xFF) << 0) +#define G_008DFC_VSRC(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VSRC 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_ATTRCHAN(x) (((x) & 0x03) << 8) +#define G_008DFC_ATTRCHAN(x) (((x) >> 8) & 0x03) +#define C_008DFC_ATTRCHAN 0xFFFFFCFF +#define V_008DFC_SQ_CHAN_X 0x00 +#define V_008DFC_SQ_CHAN_Y 0x01 +#define V_008DFC_SQ_CHAN_Z 0x02 +#define V_008DFC_SQ_CHAN_W 0x03 +#define S_008DFC_ATTR(x) (((x) & 0x3F) << 10) +#define G_008DFC_ATTR(x) (((x) >> 10) & 0x3F) +#define C_008DFC_ATTR 0xFFFF03FF +#define V_008DFC_SQ_ATTR 0x00 +#define S_008DFC_OP(x) (((x) & 0x03) << 16) +#define 
G_008DFC_OP(x) (((x) >> 16) & 0x03) +#define C_008DFC_OP 0xFFFCFFFF +#define V_008DFC_SQ_V_INTERP_P1_F32 0x00 +#define V_008DFC_SQ_V_INTERP_P2_F32 0x01 +#define V_008DFC_SQ_V_INTERP_MOV_F32 0x02 +#define S_008DFC_VDST(x) (((x) & 0xFF) << 18) +#define G_008DFC_VDST(x) (((x) >> 18) & 0xFF) +#define C_008DFC_VDST 0xFC03FFFF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_VINTRP_FIELD 0x32 +#define R_008DFC_SQ_MTBUF_0 0x008DFC +#define S_008DFC_OFFSET(x) (((x) & 0xFFF) << 0) +#define G_008DFC_OFFSET(x) (((x) >> 0) & 0xFFF) +#define C_008DFC_OFFSET 0xFFFFF000 +#define S_008DFC_OFFEN(x) (((x) & 0x1) << 12) +#define G_008DFC_OFFEN(x) (((x) >> 12) & 0x1) +#define C_008DFC_OFFEN 0xFFFFEFFF +#define S_008DFC_IDXEN(x) (((x) & 0x1) << 13) +#define G_008DFC_IDXEN(x) (((x) >> 13) & 0x1) +#define C_008DFC_IDXEN 0xFFFFDFFF +#define S_008DFC_GLC(x) (((x) & 0x1) << 14) +#define G_008DFC_GLC(x) (((x) >> 14) & 0x1) +#define C_008DFC_GLC 0xFFFFBFFF +#define S_008DFC_ADDR64(x) (((x) & 0x1) << 15) +#define G_008DFC_ADDR64(x) (((x) >> 15) & 0x1) +#define C_008DFC_ADDR64 0xFFFF7FFF +#define S_008DFC_OP(x) (((x) & 0x07) << 16) +#define G_008DFC_OP(x) (((x) >> 16) & 0x07) +#define C_008DFC_OP 0xFFF8FFFF +#define V_008DFC_SQ_TBUFFER_LOAD_FORMAT_X 0x00 +#define V_008DFC_SQ_TBUFFER_LOAD_FORMAT_XY 0x01 +#define V_008DFC_SQ_TBUFFER_LOAD_FORMAT_XYZ 0x02 +#define V_008DFC_SQ_TBUFFER_LOAD_FORMAT_XYZW 0x03 +#define V_008DFC_SQ_TBUFFER_STORE_FORMAT_X 0x04 +#define V_008DFC_SQ_TBUFFER_STORE_FORMAT_XY 0x05 +#define V_008DFC_SQ_TBUFFER_STORE_FORMAT_XYZ 0x06 +#define V_008DFC_SQ_TBUFFER_STORE_FORMAT_XYZW 0x07 +#define S_008DFC_DFMT(x) (((x) & 0x0F) << 19) +#define G_008DFC_DFMT(x) (((x) >> 19) & 0x0F) +#define C_008DFC_DFMT 0xFF87FFFF +#define S_008DFC_NFMT(x) (((x) & 0x07) << 23) +#define G_008DFC_NFMT(x) (((x) >> 23) & 0x07) +#define C_008DFC_NFMT 0xFC7FFFFF 
+#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_MTBUF_FIELD 0x3A +#define R_008DFC_SQ_SMRD 0x008DFC +#define S_008DFC_OFFSET(x) (((x) & 0xFF) << 0) +#define G_008DFC_OFFSET(x) (((x) >> 0) & 0xFF) +#define C_008DFC_OFFSET 0xFFFFFF00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define S_008DFC_IMM(x) (((x) & 0x1) << 8) +#define G_008DFC_IMM(x) (((x) >> 8) & 0x1) +#define C_008DFC_IMM 0xFFFFFEFF +#define S_008DFC_SBASE(x) (((x) & 0x3F) << 9) +#define G_008DFC_SBASE(x) (((x) >> 9) & 0x3F) +#define C_008DFC_SBASE 0xFFFF81FF +#define S_008DFC_SDST(x) (((x) & 0x7F) << 15) +#define G_008DFC_SDST(x) (((x) >> 15) & 0x7F) +#define C_008DFC_SDST 0xFFC07FFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define 
V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define S_008DFC_OP(x) (((x) & 0x1F) << 22) +#define G_008DFC_OP(x) (((x) >> 22) & 0x1F) +#define C_008DFC_OP 0xF83FFFFF +#define V_008DFC_SQ_S_LOAD_DWORD 0x00 +#define V_008DFC_SQ_S_LOAD_DWORDX2 0x01 +#define V_008DFC_SQ_S_LOAD_DWORDX4 0x02 +#define V_008DFC_SQ_S_LOAD_DWORDX8 0x03 +#define V_008DFC_SQ_S_LOAD_DWORDX16 0x04 +#define V_008DFC_SQ_S_BUFFER_LOAD_DWORD 0x08 +#define V_008DFC_SQ_S_BUFFER_LOAD_DWORDX2 0x09 +#define V_008DFC_SQ_S_BUFFER_LOAD_DWORDX4 0x0A +#define V_008DFC_SQ_S_BUFFER_LOAD_DWORDX8 0x0B +#define V_008DFC_SQ_S_BUFFER_LOAD_DWORDX16 0x0C +#define V_008DFC_SQ_S_MEMTIME 0x1E +#define V_008DFC_SQ_S_DCACHE_INV 0x1F +#define S_008DFC_ENCODING(x) (((x) & 0x1F) << 27) +#define G_008DFC_ENCODING(x) (((x) >> 27) & 0x1F) +#define C_008DFC_ENCODING 0x07FFFFFF +#define V_008DFC_SQ_ENC_SMRD_FIELD 0x18 +#define R_008DFC_SQ_EXP_1 0x008DFC +#define S_008DFC_VSRC0(x) (((x) & 0xFF) << 0) +#define G_008DFC_VSRC0(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VSRC0 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VSRC1(x) (((x) & 0xFF) << 8) +#define G_008DFC_VSRC1(x) (((x) >> 8) & 0xFF) +#define C_008DFC_VSRC1 0xFFFF00FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VSRC2(x) (((x) & 0xFF) << 16) +#define G_008DFC_VSRC2(x) (((x) >> 16) & 0xFF) +#define C_008DFC_VSRC2 0xFF00FFFF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VSRC3(x) (((x) & 0xFF) << 24) +#define G_008DFC_VSRC3(x) (((x) >> 24) & 0xFF) +#define C_008DFC_VSRC3 0x00FFFFFF +#define V_008DFC_SQ_VGPR 0x00 +#define R_008DFC_SQ_DS_1 0x008DFC +#define S_008DFC_ADDR(x) (((x) & 0xFF) << 0) +#define G_008DFC_ADDR(x) (((x) >> 0) & 0xFF) +#define C_008DFC_ADDR 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_DATA0(x) (((x) & 0xFF) << 8) +#define G_008DFC_DATA0(x) (((x) >> 8) & 0xFF) +#define C_008DFC_DATA0 0xFFFF00FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_DATA1(x) (((x) & 0xFF) << 16) +#define G_008DFC_DATA1(x) (((x) >> 16) 
& 0xFF) +#define C_008DFC_DATA1 0xFF00FFFF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VDST(x) (((x) & 0xFF) << 24) +#define G_008DFC_VDST(x) (((x) >> 24) & 0xFF) +#define C_008DFC_VDST 0x00FFFFFF +#define V_008DFC_SQ_VGPR 0x00 +#define R_008DFC_SQ_VOPC 0x008DFC +#define S_008DFC_SRC0(x) (((x) & 0x1FF) << 0) +#define G_008DFC_SRC0(x) (((x) >> 0) & 0x1FF) +#define C_008DFC_SRC0 0xFFFFFE00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define 
V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define 
V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define V_008DFC_SQ_SRC_VGPR 0x100 +#define S_008DFC_VSRC1(x) (((x) & 0xFF) << 9) +#define G_008DFC_VSRC1(x) (((x) >> 9) & 0xFF) +#define C_008DFC_VSRC1 0xFFFE01FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_OP(x) (((x) & 0xFF) << 17) +#define G_008DFC_OP(x) (((x) >> 17) & 0xFF) +#define C_008DFC_OP 0xFE01FFFF +#define V_008DFC_SQ_V_CMP_F_F32 0x00 +#define V_008DFC_SQ_V_CMP_LT_F32 0x01 +#define V_008DFC_SQ_V_CMP_EQ_F32 0x02 +#define V_008DFC_SQ_V_CMP_LE_F32 0x03 +#define V_008DFC_SQ_V_CMP_GT_F32 0x04 +#define V_008DFC_SQ_V_CMP_LG_F32 0x05 +#define V_008DFC_SQ_V_CMP_GE_F32 0x06 +#define V_008DFC_SQ_V_CMP_O_F32 0x07 +#define V_008DFC_SQ_V_CMP_U_F32 0x08 +#define V_008DFC_SQ_V_CMP_NGE_F32 0x09 +#define V_008DFC_SQ_V_CMP_NLG_F32 0x0A +#define V_008DFC_SQ_V_CMP_NGT_F32 0x0B +#define V_008DFC_SQ_V_CMP_NLE_F32 0x0C +#define V_008DFC_SQ_V_CMP_NEQ_F32 0x0D +#define V_008DFC_SQ_V_CMP_NLT_F32 0x0E +#define V_008DFC_SQ_V_CMP_TRU_F32 0x0F +#define V_008DFC_SQ_V_CMPX_F_F32 0x10 +#define V_008DFC_SQ_V_CMPX_LT_F32 0x11 +#define V_008DFC_SQ_V_CMPX_EQ_F32 0x12 +#define V_008DFC_SQ_V_CMPX_LE_F32 0x13 +#define V_008DFC_SQ_V_CMPX_GT_F32 0x14 +#define V_008DFC_SQ_V_CMPX_LG_F32 0x15 +#define V_008DFC_SQ_V_CMPX_GE_F32 0x16 +#define V_008DFC_SQ_V_CMPX_O_F32 0x17 +#define V_008DFC_SQ_V_CMPX_U_F32 0x18 +#define V_008DFC_SQ_V_CMPX_NGE_F32 0x19 +#define V_008DFC_SQ_V_CMPX_NLG_F32 0x1A +#define V_008DFC_SQ_V_CMPX_NGT_F32 0x1B +#define 
V_008DFC_SQ_V_CMPX_NLE_F32 0x1C +#define V_008DFC_SQ_V_CMPX_NEQ_F32 0x1D +#define V_008DFC_SQ_V_CMPX_NLT_F32 0x1E +#define V_008DFC_SQ_V_CMPX_TRU_F32 0x1F +#define V_008DFC_SQ_V_CMP_F_F64 0x20 +#define V_008DFC_SQ_V_CMP_LT_F64 0x21 +#define V_008DFC_SQ_V_CMP_EQ_F64 0x22 +#define V_008DFC_SQ_V_CMP_LE_F64 0x23 +#define V_008DFC_SQ_V_CMP_GT_F64 0x24 +#define V_008DFC_SQ_V_CMP_LG_F64 0x25 +#define V_008DFC_SQ_V_CMP_GE_F64 0x26 +#define V_008DFC_SQ_V_CMP_O_F64 0x27 +#define V_008DFC_SQ_V_CMP_U_F64 0x28 +#define V_008DFC_SQ_V_CMP_NGE_F64 0x29 +#define V_008DFC_SQ_V_CMP_NLG_F64 0x2A +#define V_008DFC_SQ_V_CMP_NGT_F64 0x2B +#define V_008DFC_SQ_V_CMP_NLE_F64 0x2C +#define V_008DFC_SQ_V_CMP_NEQ_F64 0x2D +#define V_008DFC_SQ_V_CMP_NLT_F64 0x2E +#define V_008DFC_SQ_V_CMP_TRU_F64 0x2F +#define V_008DFC_SQ_V_CMPX_F_F64 0x30 +#define V_008DFC_SQ_V_CMPX_LT_F64 0x31 +#define V_008DFC_SQ_V_CMPX_EQ_F64 0x32 +#define V_008DFC_SQ_V_CMPX_LE_F64 0x33 +#define V_008DFC_SQ_V_CMPX_GT_F64 0x34 +#define V_008DFC_SQ_V_CMPX_LG_F64 0x35 +#define V_008DFC_SQ_V_CMPX_GE_F64 0x36 +#define V_008DFC_SQ_V_CMPX_O_F64 0x37 +#define V_008DFC_SQ_V_CMPX_U_F64 0x38 +#define V_008DFC_SQ_V_CMPX_NGE_F64 0x39 +#define V_008DFC_SQ_V_CMPX_NLG_F64 0x3A +#define V_008DFC_SQ_V_CMPX_NGT_F64 0x3B +#define V_008DFC_SQ_V_CMPX_NLE_F64 0x3C +#define V_008DFC_SQ_V_CMPX_NEQ_F64 0x3D +#define V_008DFC_SQ_V_CMPX_NLT_F64 0x3E +#define V_008DFC_SQ_V_CMPX_TRU_F64 0x3F +#define V_008DFC_SQ_V_CMPS_F_F32 0x40 +#define V_008DFC_SQ_V_CMPS_LT_F32 0x41 +#define V_008DFC_SQ_V_CMPS_EQ_F32 0x42 +#define V_008DFC_SQ_V_CMPS_LE_F32 0x43 +#define V_008DFC_SQ_V_CMPS_GT_F32 0x44 +#define V_008DFC_SQ_V_CMPS_LG_F32 0x45 +#define V_008DFC_SQ_V_CMPS_GE_F32 0x46 +#define V_008DFC_SQ_V_CMPS_O_F32 0x47 +#define V_008DFC_SQ_V_CMPS_U_F32 0x48 +#define V_008DFC_SQ_V_CMPS_NGE_F32 0x49 +#define V_008DFC_SQ_V_CMPS_NLG_F32 0x4A +#define V_008DFC_SQ_V_CMPS_NGT_F32 0x4B +#define V_008DFC_SQ_V_CMPS_NLE_F32 0x4C +#define V_008DFC_SQ_V_CMPS_NEQ_F32 0x4D +#define 
V_008DFC_SQ_V_CMPS_NLT_F32 0x4E +#define V_008DFC_SQ_V_CMPS_TRU_F32 0x4F +#define V_008DFC_SQ_V_CMPSX_F_F32 0x50 +#define V_008DFC_SQ_V_CMPSX_LT_F32 0x51 +#define V_008DFC_SQ_V_CMPSX_EQ_F32 0x52 +#define V_008DFC_SQ_V_CMPSX_LE_F32 0x53 +#define V_008DFC_SQ_V_CMPSX_GT_F32 0x54 +#define V_008DFC_SQ_V_CMPSX_LG_F32 0x55 +#define V_008DFC_SQ_V_CMPSX_GE_F32 0x56 +#define V_008DFC_SQ_V_CMPSX_O_F32 0x57 +#define V_008DFC_SQ_V_CMPSX_U_F32 0x58 +#define V_008DFC_SQ_V_CMPSX_NGE_F32 0x59 +#define V_008DFC_SQ_V_CMPSX_NLG_F32 0x5A +#define V_008DFC_SQ_V_CMPSX_NGT_F32 0x5B +#define V_008DFC_SQ_V_CMPSX_NLE_F32 0x5C +#define V_008DFC_SQ_V_CMPSX_NEQ_F32 0x5D +#define V_008DFC_SQ_V_CMPSX_NLT_F32 0x5E +#define V_008DFC_SQ_V_CMPSX_TRU_F32 0x5F +#define V_008DFC_SQ_V_CMPS_F_F64 0x60 +#define V_008DFC_SQ_V_CMPS_LT_F64 0x61 +#define V_008DFC_SQ_V_CMPS_EQ_F64 0x62 +#define V_008DFC_SQ_V_CMPS_LE_F64 0x63 +#define V_008DFC_SQ_V_CMPS_GT_F64 0x64 +#define V_008DFC_SQ_V_CMPS_LG_F64 0x65 +#define V_008DFC_SQ_V_CMPS_GE_F64 0x66 +#define V_008DFC_SQ_V_CMPS_O_F64 0x67 +#define V_008DFC_SQ_V_CMPS_U_F64 0x68 +#define V_008DFC_SQ_V_CMPS_NGE_F64 0x69 +#define V_008DFC_SQ_V_CMPS_NLG_F64 0x6A +#define V_008DFC_SQ_V_CMPS_NGT_F64 0x6B +#define V_008DFC_SQ_V_CMPS_NLE_F64 0x6C +#define V_008DFC_SQ_V_CMPS_NEQ_F64 0x6D +#define V_008DFC_SQ_V_CMPS_NLT_F64 0x6E +#define V_008DFC_SQ_V_CMPS_TRU_F64 0x6F +#define V_008DFC_SQ_V_CMPSX_F_F64 0x70 +#define V_008DFC_SQ_V_CMPSX_LT_F64 0x71 +#define V_008DFC_SQ_V_CMPSX_EQ_F64 0x72 +#define V_008DFC_SQ_V_CMPSX_LE_F64 0x73 +#define V_008DFC_SQ_V_CMPSX_GT_F64 0x74 +#define V_008DFC_SQ_V_CMPSX_LG_F64 0x75 +#define V_008DFC_SQ_V_CMPSX_GE_F64 0x76 +#define V_008DFC_SQ_V_CMPSX_O_F64 0x77 +#define V_008DFC_SQ_V_CMPSX_U_F64 0x78 +#define V_008DFC_SQ_V_CMPSX_NGE_F64 0x79 +#define V_008DFC_SQ_V_CMPSX_NLG_F64 0x7A +#define V_008DFC_SQ_V_CMPSX_NGT_F64 0x7B +#define V_008DFC_SQ_V_CMPSX_NLE_F64 0x7C +#define V_008DFC_SQ_V_CMPSX_NEQ_F64 0x7D +#define V_008DFC_SQ_V_CMPSX_NLT_F64 0x7E 
+#define V_008DFC_SQ_V_CMPSX_TRU_F64 0x7F +#define V_008DFC_SQ_V_CMP_F_I32 0x80 +#define V_008DFC_SQ_V_CMP_LT_I32 0x81 +#define V_008DFC_SQ_V_CMP_EQ_I32 0x82 +#define V_008DFC_SQ_V_CMP_LE_I32 0x83 +#define V_008DFC_SQ_V_CMP_GT_I32 0x84 +#define V_008DFC_SQ_V_CMP_NE_I32 0x85 +#define V_008DFC_SQ_V_CMP_GE_I32 0x86 +#define V_008DFC_SQ_V_CMP_T_I32 0x87 +#define V_008DFC_SQ_V_CMP_CLASS_F32 0x88 +#define V_008DFC_SQ_V_CMPX_F_I32 0x90 +#define V_008DFC_SQ_V_CMPX_LT_I32 0x91 +#define V_008DFC_SQ_V_CMPX_EQ_I32 0x92 +#define V_008DFC_SQ_V_CMPX_LE_I32 0x93 +#define V_008DFC_SQ_V_CMPX_GT_I32 0x94 +#define V_008DFC_SQ_V_CMPX_NE_I32 0x95 +#define V_008DFC_SQ_V_CMPX_GE_I32 0x96 +#define V_008DFC_SQ_V_CMPX_T_I32 0x97 +#define V_008DFC_SQ_V_CMPX_CLASS_F32 0x98 +#define V_008DFC_SQ_V_CMP_F_I64 0xA0 +#define V_008DFC_SQ_V_CMP_LT_I64 0xA1 +#define V_008DFC_SQ_V_CMP_EQ_I64 0xA2 +#define V_008DFC_SQ_V_CMP_LE_I64 0xA3 +#define V_008DFC_SQ_V_CMP_GT_I64 0xA4 +#define V_008DFC_SQ_V_CMP_NE_I64 0xA5 +#define V_008DFC_SQ_V_CMP_GE_I64 0xA6 +#define V_008DFC_SQ_V_CMP_T_I64 0xA7 +#define V_008DFC_SQ_V_CMP_CLASS_F64 0xA8 +#define V_008DFC_SQ_V_CMPX_F_I64 0xB0 +#define V_008DFC_SQ_V_CMPX_LT_I64 0xB1 +#define V_008DFC_SQ_V_CMPX_EQ_I64 0xB2 +#define V_008DFC_SQ_V_CMPX_LE_I64 0xB3 +#define V_008DFC_SQ_V_CMPX_GT_I64 0xB4 +#define V_008DFC_SQ_V_CMPX_NE_I64 0xB5 +#define V_008DFC_SQ_V_CMPX_GE_I64 0xB6 +#define V_008DFC_SQ_V_CMPX_T_I64 0xB7 +#define V_008DFC_SQ_V_CMPX_CLASS_F64 0xB8 +#define V_008DFC_SQ_V_CMP_F_U32 0xC0 +#define V_008DFC_SQ_V_CMP_LT_U32 0xC1 +#define V_008DFC_SQ_V_CMP_EQ_U32 0xC2 +#define V_008DFC_SQ_V_CMP_LE_U32 0xC3 +#define V_008DFC_SQ_V_CMP_GT_U32 0xC4 +#define V_008DFC_SQ_V_CMP_NE_U32 0xC5 +#define V_008DFC_SQ_V_CMP_GE_U32 0xC6 +#define V_008DFC_SQ_V_CMP_T_U32 0xC7 +#define V_008DFC_SQ_V_CMPX_F_U32 0xD0 +#define V_008DFC_SQ_V_CMPX_LT_U32 0xD1 +#define V_008DFC_SQ_V_CMPX_EQ_U32 0xD2 +#define V_008DFC_SQ_V_CMPX_LE_U32 0xD3 +#define V_008DFC_SQ_V_CMPX_GT_U32 0xD4 +#define 
V_008DFC_SQ_V_CMPX_NE_U32 0xD5 +#define V_008DFC_SQ_V_CMPX_GE_U32 0xD6 +#define V_008DFC_SQ_V_CMPX_T_U32 0xD7 +#define V_008DFC_SQ_V_CMP_F_U64 0xE0 +#define V_008DFC_SQ_V_CMP_LT_U64 0xE1 +#define V_008DFC_SQ_V_CMP_EQ_U64 0xE2 +#define V_008DFC_SQ_V_CMP_LE_U64 0xE3 +#define V_008DFC_SQ_V_CMP_GT_U64 0xE4 +#define V_008DFC_SQ_V_CMP_NE_U64 0xE5 +#define V_008DFC_SQ_V_CMP_GE_U64 0xE6 +#define V_008DFC_SQ_V_CMP_T_U64 0xE7 +#define V_008DFC_SQ_V_CMPX_F_U64 0xF0 +#define V_008DFC_SQ_V_CMPX_LT_U64 0xF1 +#define V_008DFC_SQ_V_CMPX_EQ_U64 0xF2 +#define V_008DFC_SQ_V_CMPX_LE_U64 0xF3 +#define V_008DFC_SQ_V_CMPX_GT_U64 0xF4 +#define V_008DFC_SQ_V_CMPX_NE_U64 0xF5 +#define V_008DFC_SQ_V_CMPX_GE_U64 0xF6 +#define V_008DFC_SQ_V_CMPX_T_U64 0xF7 +#define S_008DFC_ENCODING(x) (((x) & 0x7F) << 25) +#define G_008DFC_ENCODING(x) (((x) >> 25) & 0x7F) +#define C_008DFC_ENCODING 0x01FFFFFF +#define V_008DFC_SQ_ENC_VOPC_FIELD 0x3E +#define R_008DFC_SQ_SOP1 0x008DFC +#define S_008DFC_SSRC0(x) (((x) & 0xFF) << 0) +#define G_008DFC_SSRC0(x) (((x) >> 0) & 0xFF) +#define C_008DFC_SSRC0 0xFFFFFF00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define 
V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define 
V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define S_008DFC_OP(x) (((x) & 0xFF) << 8) +#define G_008DFC_OP(x) (((x) >> 8) & 0xFF) +#define C_008DFC_OP 0xFFFF00FF +#define V_008DFC_SQ_S_MOV_B32 0x03 +#define V_008DFC_SQ_S_MOV_B64 0x04 +#define V_008DFC_SQ_S_CMOV_B32 0x05 +#define V_008DFC_SQ_S_CMOV_B64 0x06 +#define V_008DFC_SQ_S_NOT_B32 0x07 +#define V_008DFC_SQ_S_NOT_B64 0x08 +#define V_008DFC_SQ_S_WQM_B32 0x09 +#define V_008DFC_SQ_S_WQM_B64 0x0A +#define V_008DFC_SQ_S_BREV_B32 0x0B +#define V_008DFC_SQ_S_BREV_B64 0x0C +#define V_008DFC_SQ_S_BCNT0_I32_B32 0x0D +#define V_008DFC_SQ_S_BCNT0_I32_B64 0x0E +#define V_008DFC_SQ_S_BCNT1_I32_B32 0x0F +#define V_008DFC_SQ_S_BCNT1_I32_B64 0x10 +#define V_008DFC_SQ_S_FF0_I32_B32 0x11 +#define V_008DFC_SQ_S_FF0_I32_B64 0x12 +#define 
V_008DFC_SQ_S_FF1_I32_B32 0x13 +#define V_008DFC_SQ_S_FF1_I32_B64 0x14 +#define V_008DFC_SQ_S_FLBIT_I32_B32 0x15 +#define V_008DFC_SQ_S_FLBIT_I32_B64 0x16 +#define V_008DFC_SQ_S_FLBIT_I32 0x17 +#define V_008DFC_SQ_S_FLBIT_I32_I64 0x18 +#define V_008DFC_SQ_S_SEXT_I32_I8 0x19 +#define V_008DFC_SQ_S_SEXT_I32_I16 0x1A +#define V_008DFC_SQ_S_BITSET0_B32 0x1B +#define V_008DFC_SQ_S_BITSET0_B64 0x1C +#define V_008DFC_SQ_S_BITSET1_B32 0x1D +#define V_008DFC_SQ_S_BITSET1_B64 0x1E +#define V_008DFC_SQ_S_GETPC_B64 0x1F +#define V_008DFC_SQ_S_SETPC_B64 0x20 +#define V_008DFC_SQ_S_SWAPPC_B64 0x21 +#define V_008DFC_SQ_S_RFE_B64 0x22 +#define V_008DFC_SQ_S_AND_SAVEEXEC_B64 0x24 +#define V_008DFC_SQ_S_OR_SAVEEXEC_B64 0x25 +#define V_008DFC_SQ_S_XOR_SAVEEXEC_B64 0x26 +#define V_008DFC_SQ_S_ANDN2_SAVEEXEC_B64 0x27 +#define V_008DFC_SQ_S_ORN2_SAVEEXEC_B64 0x28 +#define V_008DFC_SQ_S_NAND_SAVEEXEC_B64 0x29 +#define V_008DFC_SQ_S_NOR_SAVEEXEC_B64 0x2A +#define V_008DFC_SQ_S_XNOR_SAVEEXEC_B64 0x2B +#define V_008DFC_SQ_S_QUADMASK_B32 0x2C +#define V_008DFC_SQ_S_QUADMASK_B64 0x2D +#define V_008DFC_SQ_S_MOVRELS_B32 0x2E +#define V_008DFC_SQ_S_MOVRELS_B64 0x2F +#define V_008DFC_SQ_S_MOVRELD_B32 0x30 +#define V_008DFC_SQ_S_MOVRELD_B64 0x31 +#define V_008DFC_SQ_S_CBRANCH_JOIN 0x32 +#define V_008DFC_SQ_S_MOV_REGRD_B32 0x33 +#define V_008DFC_SQ_S_ABS_I32 0x34 +#define V_008DFC_SQ_S_MOV_FED_B32 0x35 +#define S_008DFC_SDST(x) (((x) & 0x7F) << 16) +#define G_008DFC_SDST(x) (((x) >> 16) & 0x7F) +#define C_008DFC_SDST 0xFF80FFFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 
0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define S_008DFC_ENCODING(x) (((x) & 0x1FF) << 23) +#define G_008DFC_ENCODING(x) (((x) >> 23) & 0x1FF) +#define C_008DFC_ENCODING 0x007FFFFF +#define V_008DFC_SQ_ENC_SOP1_FIELD 0x17D +#define R_008DFC_SQ_MTBUF_1 0x008DFC +#define S_008DFC_VADDR(x) (((x) & 0xFF) << 0) +#define G_008DFC_VADDR(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VADDR 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VDATA(x) (((x) & 0xFF) << 8) +#define G_008DFC_VDATA(x) (((x) >> 8) & 0xFF) +#define C_008DFC_VDATA 0xFFFF00FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_SRSRC(x) (((x) & 0x1F) << 16) +#define G_008DFC_SRSRC(x) (((x) >> 16) & 0x1F) +#define C_008DFC_SRSRC 0xFFE0FFFF +#define S_008DFC_SLC(x) (((x) & 0x1) << 22) +#define G_008DFC_SLC(x) (((x) >> 22) & 0x1) +#define C_008DFC_SLC 0xFFBFFFFF +#define S_008DFC_TFE(x) (((x) & 0x1) << 23) +#define G_008DFC_TFE(x) (((x) >> 23) & 0x1) +#define C_008DFC_TFE 0xFF7FFFFF +#define S_008DFC_SOFFSET(x) (((x) & 0xFF) << 24) +#define G_008DFC_SOFFSET(x) (((x) >> 24) & 0xFF) +#define C_008DFC_SOFFSET 0x00FFFFFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F 
+#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define 
V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define R_008DFC_SQ_SOP2 0x008DFC +#define S_008DFC_SSRC0(x) (((x) & 0xFF) << 0) +#define G_008DFC_SSRC0(x) (((x) >> 0) & 0xFF) +#define C_008DFC_SSRC0 0xFFFFFF00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define 
V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 
0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define S_008DFC_SSRC1(x) (((x) & 0xFF) << 8) +#define G_008DFC_SSRC1(x) (((x) >> 8) & 0xFF) +#define C_008DFC_SSRC1 0xFFFF00FF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C 
+#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define 
V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 +#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define 
V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define S_008DFC_SDST(x) (((x) & 0x7F) << 16) +#define G_008DFC_SDST(x) (((x) >> 16) & 0x7F) +#define C_008DFC_SDST 0xFF80FFFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define S_008DFC_OP(x) (((x) & 0x7F) << 23) +#define G_008DFC_OP(x) (((x) >> 23) & 0x7F) +#define C_008DFC_OP 0xC07FFFFF +#define V_008DFC_SQ_S_ADD_U32 0x00 +#define V_008DFC_SQ_S_SUB_U32 0x01 +#define V_008DFC_SQ_S_ADD_I32 0x02 +#define V_008DFC_SQ_S_SUB_I32 0x03 +#define V_008DFC_SQ_S_ADDC_U32 0x04 +#define V_008DFC_SQ_S_SUBB_U32 0x05 +#define V_008DFC_SQ_S_MIN_I32 0x06 +#define V_008DFC_SQ_S_MIN_U32 0x07 +#define V_008DFC_SQ_S_MAX_I32 0x08 +#define V_008DFC_SQ_S_MAX_U32 0x09 +#define V_008DFC_SQ_S_CSELECT_B32 0x0A +#define V_008DFC_SQ_S_CSELECT_B64 0x0B +#define V_008DFC_SQ_S_AND_B32 0x0E +#define V_008DFC_SQ_S_AND_B64 0x0F +#define V_008DFC_SQ_S_OR_B32 0x10 +#define V_008DFC_SQ_S_OR_B64 0x11 +#define V_008DFC_SQ_S_XOR_B32 0x12 +#define V_008DFC_SQ_S_XOR_B64 0x13 +#define V_008DFC_SQ_S_ANDN2_B32 0x14 +#define V_008DFC_SQ_S_ANDN2_B64 0x15 +#define V_008DFC_SQ_S_ORN2_B32 0x16 +#define V_008DFC_SQ_S_ORN2_B64 0x17 +#define V_008DFC_SQ_S_NAND_B32 0x18 +#define V_008DFC_SQ_S_NAND_B64 0x19 +#define V_008DFC_SQ_S_NOR_B32 0x1A +#define V_008DFC_SQ_S_NOR_B64 0x1B +#define V_008DFC_SQ_S_XNOR_B32 0x1C +#define 
V_008DFC_SQ_S_XNOR_B64 0x1D +#define V_008DFC_SQ_S_LSHL_B32 0x1E +#define V_008DFC_SQ_S_LSHL_B64 0x1F +#define V_008DFC_SQ_S_LSHR_B32 0x20 +#define V_008DFC_SQ_S_LSHR_B64 0x21 +#define V_008DFC_SQ_S_ASHR_I32 0x22 +#define V_008DFC_SQ_S_ASHR_I64 0x23 +#define V_008DFC_SQ_S_BFM_B32 0x24 +#define V_008DFC_SQ_S_BFM_B64 0x25 +#define V_008DFC_SQ_S_MUL_I32 0x26 +#define V_008DFC_SQ_S_BFE_U32 0x27 +#define V_008DFC_SQ_S_BFE_I32 0x28 +#define V_008DFC_SQ_S_BFE_U64 0x29 +#define V_008DFC_SQ_S_BFE_I64 0x2A +#define V_008DFC_SQ_S_CBRANCH_G_FORK 0x2B +#define V_008DFC_SQ_S_ABSDIFF_I32 0x2C +#define S_008DFC_ENCODING(x) (((x) & 0x03) << 30) +#define G_008DFC_ENCODING(x) (((x) >> 30) & 0x03) +#define C_008DFC_ENCODING 0x3FFFFFFF +#define V_008DFC_SQ_ENC_SOP2_FIELD 0x02 +#define R_008DFC_SQ_SOPK 0x008DFC +#define S_008DFC_SIMM16(x) (((x) & 0xFFFF) << 0) +#define G_008DFC_SIMM16(x) (((x) >> 0) & 0xFFFF) +#define C_008DFC_SIMM16 0xFFFF0000 +#define S_008DFC_SDST(x) (((x) & 0x7F) << 16) +#define G_008DFC_SDST(x) (((x) >> 16) & 0x7F) +#define C_008DFC_SDST 0xFF80FFFF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define S_008DFC_OP(x) (((x) & 0x1F) << 23) +#define G_008DFC_OP(x) (((x) >> 23) & 0x1F) +#define C_008DFC_OP 0xF07FFFFF +#define V_008DFC_SQ_S_MOVK_I32 0x00 +#define V_008DFC_SQ_S_CMOVK_I32 0x02 +#define 
V_008DFC_SQ_S_CMPK_EQ_I32 0x03 +#define V_008DFC_SQ_S_CMPK_LG_I32 0x04 +#define V_008DFC_SQ_S_CMPK_GT_I32 0x05 +#define V_008DFC_SQ_S_CMPK_GE_I32 0x06 +#define V_008DFC_SQ_S_CMPK_LT_I32 0x07 +#define V_008DFC_SQ_S_CMPK_LE_I32 0x08 +#define V_008DFC_SQ_S_CMPK_EQ_U32 0x09 +#define V_008DFC_SQ_S_CMPK_LG_U32 0x0A +#define V_008DFC_SQ_S_CMPK_GT_U32 0x0B +#define V_008DFC_SQ_S_CMPK_GE_U32 0x0C +#define V_008DFC_SQ_S_CMPK_LT_U32 0x0D +#define V_008DFC_SQ_S_CMPK_LE_U32 0x0E +#define V_008DFC_SQ_S_ADDK_I32 0x0F +#define V_008DFC_SQ_S_MULK_I32 0x10 +#define V_008DFC_SQ_S_CBRANCH_I_FORK 0x11 +#define V_008DFC_SQ_S_GETREG_B32 0x12 +#define V_008DFC_SQ_S_SETREG_B32 0x13 +#define V_008DFC_SQ_S_GETREG_REGRD_B32 0x14 +#define V_008DFC_SQ_S_SETREG_IMM32_B32 0x15 +#define S_008DFC_ENCODING(x) (((x) & 0x0F) << 28) +#define G_008DFC_ENCODING(x) (((x) >> 28) & 0x0F) +#define C_008DFC_ENCODING 0x0FFFFFFF +#define V_008DFC_SQ_ENC_SOPK_FIELD 0x0B +#define R_008DFC_SQ_VOP3_0 0x008DFC +#define S_008DFC_VDST(x) (((x) & 0xFF) << 0) +#define G_008DFC_VDST(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VDST 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_ABS(x) (((x) & 0x07) << 8) +#define G_008DFC_ABS(x) (((x) >> 8) & 0x07) +#define C_008DFC_ABS 0xFFFFF8FF +#define S_008DFC_CLAMP(x) (((x) & 0x1) << 11) +#define G_008DFC_CLAMP(x) (((x) >> 11) & 0x1) +#define C_008DFC_CLAMP 0xFFFFF7FF +#define S_008DFC_OP(x) (((x) & 0x1FF) << 17) +#define G_008DFC_OP(x) (((x) >> 17) & 0x1FF) +#define C_008DFC_OP 0xFC01FFFF +#define V_008DFC_SQ_V_OPC_OFFSET 0x00 +#define V_008DFC_SQ_V_OP2_OFFSET 0x100 +#define V_008DFC_SQ_V_MAD_LEGACY_F32 0x140 +#define V_008DFC_SQ_V_MAD_F32 0x141 +#define V_008DFC_SQ_V_MAD_I32_I24 0x142 +#define V_008DFC_SQ_V_MAD_U32_U24 0x143 +#define V_008DFC_SQ_V_CUBEID_F32 0x144 +#define V_008DFC_SQ_V_CUBESC_F32 0x145 +#define V_008DFC_SQ_V_CUBETC_F32 0x146 +#define V_008DFC_SQ_V_CUBEMA_F32 0x147 +#define V_008DFC_SQ_V_BFE_U32 0x148 +#define V_008DFC_SQ_V_BFE_I32 0x149 +#define 
V_008DFC_SQ_V_BFI_B32 0x14A +#define V_008DFC_SQ_V_FMA_F32 0x14B +#define V_008DFC_SQ_V_FMA_F64 0x14C +#define V_008DFC_SQ_V_LERP_U8 0x14D +#define V_008DFC_SQ_V_ALIGNBIT_B32 0x14E +#define V_008DFC_SQ_V_ALIGNBYTE_B32 0x14F +#define V_008DFC_SQ_V_MULLIT_F32 0x150 +#define V_008DFC_SQ_V_MIN3_F32 0x151 +#define V_008DFC_SQ_V_MIN3_I32 0x152 +#define V_008DFC_SQ_V_MIN3_U32 0x153 +#define V_008DFC_SQ_V_MAX3_F32 0x154 +#define V_008DFC_SQ_V_MAX3_I32 0x155 +#define V_008DFC_SQ_V_MAX3_U32 0x156 +#define V_008DFC_SQ_V_MED3_F32 0x157 +#define V_008DFC_SQ_V_MED3_I32 0x158 +#define V_008DFC_SQ_V_MED3_U32 0x159 +#define V_008DFC_SQ_V_SAD_U8 0x15A +#define V_008DFC_SQ_V_SAD_HI_U8 0x15B +#define V_008DFC_SQ_V_SAD_U16 0x15C +#define V_008DFC_SQ_V_SAD_U32 0x15D +#define V_008DFC_SQ_V_CVT_PK_U8_F32 0x15E +#define V_008DFC_SQ_V_DIV_FIXUP_F32 0x15F +#define V_008DFC_SQ_V_DIV_FIXUP_F64 0x160 +#define V_008DFC_SQ_V_LSHL_B64 0x161 +#define V_008DFC_SQ_V_LSHR_B64 0x162 +#define V_008DFC_SQ_V_ASHR_I64 0x163 +#define V_008DFC_SQ_V_ADD_F64 0x164 +#define V_008DFC_SQ_V_MUL_F64 0x165 +#define V_008DFC_SQ_V_MIN_F64 0x166 +#define V_008DFC_SQ_V_MAX_F64 0x167 +#define V_008DFC_SQ_V_LDEXP_F64 0x168 +#define V_008DFC_SQ_V_MUL_LO_U32 0x169 +#define V_008DFC_SQ_V_MUL_HI_U32 0x16A +#define V_008DFC_SQ_V_MUL_LO_I32 0x16B +#define V_008DFC_SQ_V_MUL_HI_I32 0x16C +#define V_008DFC_SQ_V_DIV_SCALE_F32 0x16D +#define V_008DFC_SQ_V_DIV_SCALE_F64 0x16E +#define V_008DFC_SQ_V_DIV_FMAS_F32 0x16F +#define V_008DFC_SQ_V_DIV_FMAS_F64 0x170 +#define V_008DFC_SQ_V_MSAD_U8 0x171 +#define V_008DFC_SQ_V_QSAD_U8 0x172 +#define V_008DFC_SQ_V_MQSAD_U8 0x173 +#define V_008DFC_SQ_V_TRIG_PREOP_F64 0x174 +#define V_008DFC_SQ_V_OP1_OFFSET 0x180 +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_VOP3_FIELD 0x34 +#define R_008DFC_SQ_VOP2 0x008DFC +#define S_008DFC_SRC0(x) (((x) & 0x1FF) << 0) +#define G_008DFC_SRC0(x) 
(((x) >> 0) & 0x1FF) +#define C_008DFC_SRC0 0xFFFFFE00 +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define V_008DFC_SQ_M0 0x7C +#define V_008DFC_SQ_EXEC_LO 0x7E +#define V_008DFC_SQ_EXEC_HI 0x7F +#define V_008DFC_SQ_SRC_0 0x80 +#define V_008DFC_SQ_SRC_1_INT 0x81 +#define V_008DFC_SQ_SRC_2_INT 0x82 +#define V_008DFC_SQ_SRC_3_INT 0x83 +#define V_008DFC_SQ_SRC_4_INT 0x84 +#define V_008DFC_SQ_SRC_5_INT 0x85 +#define V_008DFC_SQ_SRC_6_INT 0x86 +#define V_008DFC_SQ_SRC_7_INT 0x87 +#define V_008DFC_SQ_SRC_8_INT 0x88 +#define V_008DFC_SQ_SRC_9_INT 0x89 +#define V_008DFC_SQ_SRC_10_INT 0x8A +#define V_008DFC_SQ_SRC_11_INT 0x8B +#define V_008DFC_SQ_SRC_12_INT 0x8C +#define V_008DFC_SQ_SRC_13_INT 0x8D +#define V_008DFC_SQ_SRC_14_INT 0x8E +#define V_008DFC_SQ_SRC_15_INT 0x8F +#define V_008DFC_SQ_SRC_16_INT 0x90 +#define V_008DFC_SQ_SRC_17_INT 0x91 +#define V_008DFC_SQ_SRC_18_INT 0x92 +#define V_008DFC_SQ_SRC_19_INT 0x93 +#define V_008DFC_SQ_SRC_20_INT 0x94 +#define V_008DFC_SQ_SRC_21_INT 0x95 +#define V_008DFC_SQ_SRC_22_INT 0x96 +#define V_008DFC_SQ_SRC_23_INT 0x97 +#define V_008DFC_SQ_SRC_24_INT 0x98 +#define V_008DFC_SQ_SRC_25_INT 0x99 +#define V_008DFC_SQ_SRC_26_INT 0x9A +#define V_008DFC_SQ_SRC_27_INT 0x9B +#define V_008DFC_SQ_SRC_28_INT 0x9C +#define V_008DFC_SQ_SRC_29_INT 0x9D +#define V_008DFC_SQ_SRC_30_INT 0x9E +#define V_008DFC_SQ_SRC_31_INT 0x9F +#define V_008DFC_SQ_SRC_32_INT 0xA0 +#define 
V_008DFC_SQ_SRC_33_INT 0xA1 +#define V_008DFC_SQ_SRC_34_INT 0xA2 +#define V_008DFC_SQ_SRC_35_INT 0xA3 +#define V_008DFC_SQ_SRC_36_INT 0xA4 +#define V_008DFC_SQ_SRC_37_INT 0xA5 +#define V_008DFC_SQ_SRC_38_INT 0xA6 +#define V_008DFC_SQ_SRC_39_INT 0xA7 +#define V_008DFC_SQ_SRC_40_INT 0xA8 +#define V_008DFC_SQ_SRC_41_INT 0xA9 +#define V_008DFC_SQ_SRC_42_INT 0xAA +#define V_008DFC_SQ_SRC_43_INT 0xAB +#define V_008DFC_SQ_SRC_44_INT 0xAC +#define V_008DFC_SQ_SRC_45_INT 0xAD +#define V_008DFC_SQ_SRC_46_INT 0xAE +#define V_008DFC_SQ_SRC_47_INT 0xAF +#define V_008DFC_SQ_SRC_48_INT 0xB0 +#define V_008DFC_SQ_SRC_49_INT 0xB1 +#define V_008DFC_SQ_SRC_50_INT 0xB2 +#define V_008DFC_SQ_SRC_51_INT 0xB3 +#define V_008DFC_SQ_SRC_52_INT 0xB4 +#define V_008DFC_SQ_SRC_53_INT 0xB5 +#define V_008DFC_SQ_SRC_54_INT 0xB6 +#define V_008DFC_SQ_SRC_55_INT 0xB7 +#define V_008DFC_SQ_SRC_56_INT 0xB8 +#define V_008DFC_SQ_SRC_57_INT 0xB9 +#define V_008DFC_SQ_SRC_58_INT 0xBA +#define V_008DFC_SQ_SRC_59_INT 0xBB +#define V_008DFC_SQ_SRC_60_INT 0xBC +#define V_008DFC_SQ_SRC_61_INT 0xBD +#define V_008DFC_SQ_SRC_62_INT 0xBE +#define V_008DFC_SQ_SRC_63_INT 0xBF +#define V_008DFC_SQ_SRC_64_INT 0xC0 +#define V_008DFC_SQ_SRC_M_1_INT 0xC1 +#define V_008DFC_SQ_SRC_M_2_INT 0xC2 +#define V_008DFC_SQ_SRC_M_3_INT 0xC3 +#define V_008DFC_SQ_SRC_M_4_INT 0xC4 +#define V_008DFC_SQ_SRC_M_5_INT 0xC5 +#define V_008DFC_SQ_SRC_M_6_INT 0xC6 +#define V_008DFC_SQ_SRC_M_7_INT 0xC7 +#define V_008DFC_SQ_SRC_M_8_INT 0xC8 +#define V_008DFC_SQ_SRC_M_9_INT 0xC9 +#define V_008DFC_SQ_SRC_M_10_INT 0xCA +#define V_008DFC_SQ_SRC_M_11_INT 0xCB +#define V_008DFC_SQ_SRC_M_12_INT 0xCC +#define V_008DFC_SQ_SRC_M_13_INT 0xCD +#define V_008DFC_SQ_SRC_M_14_INT 0xCE +#define V_008DFC_SQ_SRC_M_15_INT 0xCF +#define V_008DFC_SQ_SRC_M_16_INT 0xD0 +#define V_008DFC_SQ_SRC_0_5 0xF0 +#define V_008DFC_SQ_SRC_M_0_5 0xF1 +#define V_008DFC_SQ_SRC_1 0xF2 +#define V_008DFC_SQ_SRC_M_1 0xF3 +#define V_008DFC_SQ_SRC_2 0xF4 +#define V_008DFC_SQ_SRC_M_2 0xF5 
+#define V_008DFC_SQ_SRC_4 0xF6 +#define V_008DFC_SQ_SRC_M_4 0xF7 +#define V_008DFC_SQ_SRC_VCCZ 0xFB +#define V_008DFC_SQ_SRC_EXECZ 0xFC +#define V_008DFC_SQ_SRC_SCC 0xFD +#define V_008DFC_SQ_SRC_LDS_DIRECT 0xFE +#define V_008DFC_SQ_SRC_VGPR 0x100 +#define S_008DFC_VSRC1(x) (((x) & 0xFF) << 9) +#define G_008DFC_VSRC1(x) (((x) >> 9) & 0xFF) +#define C_008DFC_VSRC1 0xFFFE01FF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_VDST(x) (((x) & 0xFF) << 17) +#define G_008DFC_VDST(x) (((x) >> 17) & 0xFF) +#define C_008DFC_VDST 0xFE01FFFF +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_OP(x) (((x) & 0x3F) << 25) +#define G_008DFC_OP(x) (((x) >> 25) & 0x3F) +#define C_008DFC_OP 0x81FFFFFF +#define V_008DFC_SQ_V_CNDMASK_B32 0x00 +#define V_008DFC_SQ_V_READLANE_B32 0x01 +#define V_008DFC_SQ_V_WRITELANE_B32 0x02 +#define V_008DFC_SQ_V_ADD_F32 0x03 +#define V_008DFC_SQ_V_SUB_F32 0x04 +#define V_008DFC_SQ_V_SUBREV_F32 0x05 +#define V_008DFC_SQ_V_MAC_LEGACY_F32 0x06 +#define V_008DFC_SQ_V_MUL_LEGACY_F32 0x07 +#define V_008DFC_SQ_V_MUL_F32 0x08 +#define V_008DFC_SQ_V_MUL_I32_I24 0x09 +#define V_008DFC_SQ_V_MUL_HI_I32_I24 0x0A +#define V_008DFC_SQ_V_MUL_U32_U24 0x0B +#define V_008DFC_SQ_V_MUL_HI_U32_U24 0x0C +#define V_008DFC_SQ_V_MIN_LEGACY_F32 0x0D +#define V_008DFC_SQ_V_MAX_LEGACY_F32 0x0E +#define V_008DFC_SQ_V_MIN_F32 0x0F +#define V_008DFC_SQ_V_MAX_F32 0x10 +#define V_008DFC_SQ_V_MIN_I32 0x11 +#define V_008DFC_SQ_V_MAX_I32 0x12 +#define V_008DFC_SQ_V_MIN_U32 0x13 +#define V_008DFC_SQ_V_MAX_U32 0x14 +#define V_008DFC_SQ_V_LSHR_B32 0x15 +#define V_008DFC_SQ_V_LSHRREV_B32 0x16 +#define V_008DFC_SQ_V_ASHR_I32 0x17 +#define V_008DFC_SQ_V_ASHRREV_I32 0x18 +#define V_008DFC_SQ_V_LSHL_B32 0x19 +#define V_008DFC_SQ_V_LSHLREV_B32 0x1A +#define V_008DFC_SQ_V_AND_B32 0x1B +#define V_008DFC_SQ_V_OR_B32 0x1C +#define V_008DFC_SQ_V_XOR_B32 0x1D +#define V_008DFC_SQ_V_BFM_B32 0x1E +#define V_008DFC_SQ_V_MAC_F32 0x1F +#define V_008DFC_SQ_V_MADMK_F32 0x20 +#define V_008DFC_SQ_V_MADAK_F32 0x21 
+#define V_008DFC_SQ_V_BCNT_U32_B32 0x22 +#define V_008DFC_SQ_V_MBCNT_LO_U32_B32 0x23 +#define V_008DFC_SQ_V_MBCNT_HI_U32_B32 0x24 +#define V_008DFC_SQ_V_ADD_I32 0x25 +#define V_008DFC_SQ_V_SUB_I32 0x26 +#define V_008DFC_SQ_V_SUBREV_I32 0x27 +#define V_008DFC_SQ_V_ADDC_U32 0x28 +#define V_008DFC_SQ_V_SUBB_U32 0x29 +#define V_008DFC_SQ_V_SUBBREV_U32 0x2A +#define V_008DFC_SQ_V_LDEXP_F32 0x2B +#define V_008DFC_SQ_V_CVT_PKACCUM_U8_F32 0x2C +#define V_008DFC_SQ_V_CVT_PKNORM_I16_F32 0x2D +#define V_008DFC_SQ_V_CVT_PKNORM_U16_F32 0x2E +#define V_008DFC_SQ_V_CVT_PKRTZ_F16_F32 0x2F +#define V_008DFC_SQ_V_CVT_PK_U16_U32 0x30 +#define V_008DFC_SQ_V_CVT_PK_I16_I32 0x31 +#define S_008DFC_ENCODING(x) (((x) & 0x1) << 31) +#define G_008DFC_ENCODING(x) (((x) >> 31) & 0x1) +#define C_008DFC_ENCODING 0x7FFFFFFF +#define R_008DFC_SQ_VOP3_0_SDST_ENC 0x008DFC +#define S_008DFC_VDST(x) (((x) & 0xFF) << 0) +#define G_008DFC_VDST(x) (((x) >> 0) & 0xFF) +#define C_008DFC_VDST 0xFFFFFF00 +#define V_008DFC_SQ_VGPR 0x00 +#define S_008DFC_SDST(x) (((x) & 0x7F) << 8) +#define G_008DFC_SDST(x) (((x) >> 8) & 0x7F) +#define C_008DFC_SDST 0xFFFF80FF +#define V_008DFC_SQ_SGPR 0x00 +#define V_008DFC_SQ_VCC_LO 0x6A +#define V_008DFC_SQ_VCC_HI 0x6B +#define V_008DFC_SQ_TBA_LO 0x6C +#define V_008DFC_SQ_TBA_HI 0x6D +#define V_008DFC_SQ_TMA_LO 0x6E +#define V_008DFC_SQ_TMA_HI 0x6F +#define V_008DFC_SQ_TTMP0 0x70 +#define V_008DFC_SQ_TTMP1 0x71 +#define V_008DFC_SQ_TTMP2 0x72 +#define V_008DFC_SQ_TTMP3 0x73 +#define V_008DFC_SQ_TTMP4 0x74 +#define V_008DFC_SQ_TTMP5 0x75 +#define V_008DFC_SQ_TTMP6 0x76 +#define V_008DFC_SQ_TTMP7 0x77 +#define V_008DFC_SQ_TTMP8 0x78 +#define V_008DFC_SQ_TTMP9 0x79 +#define V_008DFC_SQ_TTMP10 0x7A +#define V_008DFC_SQ_TTMP11 0x7B +#define S_008DFC_OP(x) (((x) & 0x1FF) << 17) +#define G_008DFC_OP(x) (((x) >> 17) & 0x1FF) +#define C_008DFC_OP 0xFC01FFFF +#define V_008DFC_SQ_V_OPC_OFFSET 0x00 +#define V_008DFC_SQ_V_OP2_OFFSET 0x100 +#define V_008DFC_SQ_V_MAD_LEGACY_F32 0x140 
+#define V_008DFC_SQ_V_MAD_F32 0x141 +#define V_008DFC_SQ_V_MAD_I32_I24 0x142 +#define V_008DFC_SQ_V_MAD_U32_U24 0x143 +#define V_008DFC_SQ_V_CUBEID_F32 0x144 +#define V_008DFC_SQ_V_CUBESC_F32 0x145 +#define V_008DFC_SQ_V_CUBETC_F32 0x146 +#define V_008DFC_SQ_V_CUBEMA_F32 0x147 +#define V_008DFC_SQ_V_BFE_U32 0x148 +#define V_008DFC_SQ_V_BFE_I32 0x149 +#define V_008DFC_SQ_V_BFI_B32 0x14A +#define V_008DFC_SQ_V_FMA_F32 0x14B +#define V_008DFC_SQ_V_FMA_F64 0x14C +#define V_008DFC_SQ_V_LERP_U8 0x14D +#define V_008DFC_SQ_V_ALIGNBIT_B32 0x14E +#define V_008DFC_SQ_V_ALIGNBYTE_B32 0x14F +#define V_008DFC_SQ_V_MULLIT_F32 0x150 +#define V_008DFC_SQ_V_MIN3_F32 0x151 +#define V_008DFC_SQ_V_MIN3_I32 0x152 +#define V_008DFC_SQ_V_MIN3_U32 0x153 +#define V_008DFC_SQ_V_MAX3_F32 0x154 +#define V_008DFC_SQ_V_MAX3_I32 0x155 +#define V_008DFC_SQ_V_MAX3_U32 0x156 +#define V_008DFC_SQ_V_MED3_F32 0x157 +#define V_008DFC_SQ_V_MED3_I32 0x158 +#define V_008DFC_SQ_V_MED3_U32 0x159 +#define V_008DFC_SQ_V_SAD_U8 0x15A +#define V_008DFC_SQ_V_SAD_HI_U8 0x15B +#define V_008DFC_SQ_V_SAD_U16 0x15C +#define V_008DFC_SQ_V_SAD_U32 0x15D +#define V_008DFC_SQ_V_CVT_PK_U8_F32 0x15E +#define V_008DFC_SQ_V_DIV_FIXUP_F32 0x15F +#define V_008DFC_SQ_V_DIV_FIXUP_F64 0x160 +#define V_008DFC_SQ_V_LSHL_B64 0x161 +#define V_008DFC_SQ_V_LSHR_B64 0x162 +#define V_008DFC_SQ_V_ASHR_I64 0x163 +#define V_008DFC_SQ_V_ADD_F64 0x164 +#define V_008DFC_SQ_V_MUL_F64 0x165 +#define V_008DFC_SQ_V_MIN_F64 0x166 +#define V_008DFC_SQ_V_MAX_F64 0x167 +#define V_008DFC_SQ_V_LDEXP_F64 0x168 +#define V_008DFC_SQ_V_MUL_LO_U32 0x169 +#define V_008DFC_SQ_V_MUL_HI_U32 0x16A +#define V_008DFC_SQ_V_MUL_LO_I32 0x16B +#define V_008DFC_SQ_V_MUL_HI_I32 0x16C +#define V_008DFC_SQ_V_DIV_SCALE_F32 0x16D +#define V_008DFC_SQ_V_DIV_SCALE_F64 0x16E +#define V_008DFC_SQ_V_DIV_FMAS_F32 0x16F +#define V_008DFC_SQ_V_DIV_FMAS_F64 0x170 +#define V_008DFC_SQ_V_MSAD_U8 0x171 +#define V_008DFC_SQ_V_QSAD_U8 0x172 +#define V_008DFC_SQ_V_MQSAD_U8 0x173 +#define 
V_008DFC_SQ_V_TRIG_PREOP_F64 0x174 +#define V_008DFC_SQ_V_OP1_OFFSET 0x180 +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_VOP3_FIELD 0x34 +#define R_008DFC_SQ_MUBUF_0 0x008DFC +#define S_008DFC_OFFSET(x) (((x) & 0xFFF) << 0) +#define G_008DFC_OFFSET(x) (((x) >> 0) & 0xFFF) +#define C_008DFC_OFFSET 0xFFFFF000 +#define S_008DFC_OFFEN(x) (((x) & 0x1) << 12) +#define G_008DFC_OFFEN(x) (((x) >> 12) & 0x1) +#define C_008DFC_OFFEN 0xFFFFEFFF +#define S_008DFC_IDXEN(x) (((x) & 0x1) << 13) +#define G_008DFC_IDXEN(x) (((x) >> 13) & 0x1) +#define C_008DFC_IDXEN 0xFFFFDFFF +#define S_008DFC_GLC(x) (((x) & 0x1) << 14) +#define G_008DFC_GLC(x) (((x) >> 14) & 0x1) +#define C_008DFC_GLC 0xFFFFBFFF +#define S_008DFC_ADDR64(x) (((x) & 0x1) << 15) +#define G_008DFC_ADDR64(x) (((x) >> 15) & 0x1) +#define C_008DFC_ADDR64 0xFFFF7FFF +#define S_008DFC_LDS(x) (((x) & 0x1) << 16) +#define G_008DFC_LDS(x) (((x) >> 16) & 0x1) +#define C_008DFC_LDS 0xFFFEFFFF +#define S_008DFC_OP(x) (((x) & 0x7F) << 18) +#define G_008DFC_OP(x) (((x) >> 18) & 0x7F) +#define C_008DFC_OP 0xFE03FFFF +#define V_008DFC_SQ_BUFFER_LOAD_FORMAT_X 0x00 +#define V_008DFC_SQ_BUFFER_LOAD_FORMAT_XY 0x01 +#define V_008DFC_SQ_BUFFER_LOAD_FORMAT_XYZ 0x02 +#define V_008DFC_SQ_BUFFER_LOAD_FORMAT_XYZW 0x03 +#define V_008DFC_SQ_BUFFER_STORE_FORMAT_X 0x04 +#define V_008DFC_SQ_BUFFER_STORE_FORMAT_XY 0x05 +#define V_008DFC_SQ_BUFFER_STORE_FORMAT_XYZ 0x06 +#define V_008DFC_SQ_BUFFER_STORE_FORMAT_XYZW 0x07 +#define V_008DFC_SQ_BUFFER_LOAD_UBYTE 0x08 +#define V_008DFC_SQ_BUFFER_LOAD_SBYTE 0x09 +#define V_008DFC_SQ_BUFFER_LOAD_USHORT 0x0A +#define V_008DFC_SQ_BUFFER_LOAD_SSHORT 0x0B +#define V_008DFC_SQ_BUFFER_LOAD_DWORD 0x0C +#define V_008DFC_SQ_BUFFER_LOAD_DWORDX2 0x0D +#define V_008DFC_SQ_BUFFER_LOAD_DWORDX4 0x0E +#define V_008DFC_SQ_BUFFER_STORE_BYTE 0x18 +#define V_008DFC_SQ_BUFFER_STORE_SHORT 0x1A +#define 
V_008DFC_SQ_BUFFER_STORE_DWORD 0x1C +#define V_008DFC_SQ_BUFFER_STORE_DWORDX2 0x1D +#define V_008DFC_SQ_BUFFER_STORE_DWORDX4 0x1E +#define V_008DFC_SQ_BUFFER_ATOMIC_SWAP 0x30 +#define V_008DFC_SQ_BUFFER_ATOMIC_CMPSWAP 0x31 +#define V_008DFC_SQ_BUFFER_ATOMIC_ADD 0x32 +#define V_008DFC_SQ_BUFFER_ATOMIC_SUB 0x33 +#define V_008DFC_SQ_BUFFER_ATOMIC_RSUB 0x34 +#define V_008DFC_SQ_BUFFER_ATOMIC_SMIN 0x35 +#define V_008DFC_SQ_BUFFER_ATOMIC_UMIN 0x36 +#define V_008DFC_SQ_BUFFER_ATOMIC_SMAX 0x37 +#define V_008DFC_SQ_BUFFER_ATOMIC_UMAX 0x38 +#define V_008DFC_SQ_BUFFER_ATOMIC_AND 0x39 +#define V_008DFC_SQ_BUFFER_ATOMIC_OR 0x3A +#define V_008DFC_SQ_BUFFER_ATOMIC_XOR 0x3B +#define V_008DFC_SQ_BUFFER_ATOMIC_INC 0x3C +#define V_008DFC_SQ_BUFFER_ATOMIC_DEC 0x3D +#define V_008DFC_SQ_BUFFER_ATOMIC_FCMPSWAP 0x3E +#define V_008DFC_SQ_BUFFER_ATOMIC_FMIN 0x3F +#define V_008DFC_SQ_BUFFER_ATOMIC_FMAX 0x40 +#define V_008DFC_SQ_BUFFER_ATOMIC_SWAP_X2 0x50 +#define V_008DFC_SQ_BUFFER_ATOMIC_CMPSWAP_X2 0x51 +#define V_008DFC_SQ_BUFFER_ATOMIC_ADD_X2 0x52 +#define V_008DFC_SQ_BUFFER_ATOMIC_SUB_X2 0x53 +#define V_008DFC_SQ_BUFFER_ATOMIC_RSUB_X2 0x54 +#define V_008DFC_SQ_BUFFER_ATOMIC_SMIN_X2 0x55 +#define V_008DFC_SQ_BUFFER_ATOMIC_UMIN_X2 0x56 +#define V_008DFC_SQ_BUFFER_ATOMIC_SMAX_X2 0x57 +#define V_008DFC_SQ_BUFFER_ATOMIC_UMAX_X2 0x58 +#define V_008DFC_SQ_BUFFER_ATOMIC_AND_X2 0x59 +#define V_008DFC_SQ_BUFFER_ATOMIC_OR_X2 0x5A +#define V_008DFC_SQ_BUFFER_ATOMIC_XOR_X2 0x5B +#define V_008DFC_SQ_BUFFER_ATOMIC_INC_X2 0x5C +#define V_008DFC_SQ_BUFFER_ATOMIC_DEC_X2 0x5D +#define V_008DFC_SQ_BUFFER_ATOMIC_FCMPSWAP_X2 0x5E +#define V_008DFC_SQ_BUFFER_ATOMIC_FMIN_X2 0x5F +#define V_008DFC_SQ_BUFFER_ATOMIC_FMAX_X2 0x60 +#define V_008DFC_SQ_BUFFER_WBINVL1_SC 0x70 +#define V_008DFC_SQ_BUFFER_WBINVL1 0x71 +#define S_008DFC_ENCODING(x) (((x) & 0x3F) << 26) +#define G_008DFC_ENCODING(x) (((x) >> 26) & 0x3F) +#define C_008DFC_ENCODING 0x03FFFFFF +#define V_008DFC_SQ_ENC_MUBUF_FIELD 0x38 +#endif +#define 
R_008F00_SQ_BUF_RSRC_WORD0 0x008F00 +#define R_008F04_SQ_BUF_RSRC_WORD1 0x008F04 +#define S_008F04_BASE_ADDRESS_HI(x) (((x) & 0xFFFF) << 0) +#define G_008F04_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFFFF) +#define C_008F04_BASE_ADDRESS_HI 0xFFFF0000 +#define S_008F04_STRIDE(x) (((x) & 0x3FFF) << 16) +#define G_008F04_STRIDE(x) (((x) >> 16) & 0x3FFF) +#define C_008F04_STRIDE 0xC000FFFF +#define S_008F04_CACHE_SWIZZLE(x) (((x) & 0x1) << 30) +#define G_008F04_CACHE_SWIZZLE(x) (((x) >> 30) & 0x1) +#define C_008F04_CACHE_SWIZZLE 0xBFFFFFFF +#define S_008F04_SWIZZLE_ENABLE(x) (((x) & 0x1) << 31) +#define G_008F04_SWIZZLE_ENABLE(x) (((x) >> 31) & 0x1) +#define C_008F04_SWIZZLE_ENABLE 0x7FFFFFFF +#define R_008F08_SQ_BUF_RSRC_WORD2 0x008F08 +#define R_008F0C_SQ_BUF_RSRC_WORD3 0x008F0C +#define S_008F0C_DST_SEL_X(x) (((x) & 0x07) << 0) +#define G_008F0C_DST_SEL_X(x) (((x) >> 0) & 0x07) +#define C_008F0C_DST_SEL_X 0xFFFFFFF8 +#define V_008F0C_SQ_SEL_0 0x00 +#define V_008F0C_SQ_SEL_1 0x01 +#define V_008F0C_SQ_SEL_RESERVED_0 0x02 +#define V_008F0C_SQ_SEL_RESERVED_1 0x03 +#define V_008F0C_SQ_SEL_X 0x04 +#define V_008F0C_SQ_SEL_Y 0x05 +#define V_008F0C_SQ_SEL_Z 0x06 +#define V_008F0C_SQ_SEL_W 0x07 +#define S_008F0C_DST_SEL_Y(x) (((x) & 0x07) << 3) +#define G_008F0C_DST_SEL_Y(x) (((x) >> 3) & 0x07) +#define C_008F0C_DST_SEL_Y 0xFFFFFFC7 +#define V_008F0C_SQ_SEL_0 0x00 +#define V_008F0C_SQ_SEL_1 0x01 +#define V_008F0C_SQ_SEL_RESERVED_0 0x02 +#define V_008F0C_SQ_SEL_RESERVED_1 0x03 +#define V_008F0C_SQ_SEL_X 0x04 +#define V_008F0C_SQ_SEL_Y 0x05 +#define V_008F0C_SQ_SEL_Z 0x06 +#define V_008F0C_SQ_SEL_W 0x07 +#define S_008F0C_DST_SEL_Z(x) (((x) & 0x07) << 6) +#define G_008F0C_DST_SEL_Z(x) (((x) >> 6) & 0x07) +#define C_008F0C_DST_SEL_Z 0xFFFFFE3F +#define V_008F0C_SQ_SEL_0 0x00 +#define V_008F0C_SQ_SEL_1 0x01 +#define V_008F0C_SQ_SEL_RESERVED_0 0x02 +#define V_008F0C_SQ_SEL_RESERVED_1 0x03 +#define V_008F0C_SQ_SEL_X 0x04 +#define V_008F0C_SQ_SEL_Y 0x05 +#define V_008F0C_SQ_SEL_Z 0x06 
+#define V_008F0C_SQ_SEL_W 0x07 +#define S_008F0C_DST_SEL_W(x) (((x) & 0x07) << 9) +#define G_008F0C_DST_SEL_W(x) (((x) >> 9) & 0x07) +#define C_008F0C_DST_SEL_W 0xFFFFF1FF +#define V_008F0C_SQ_SEL_0 0x00 +#define V_008F0C_SQ_SEL_1 0x01 +#define V_008F0C_SQ_SEL_RESERVED_0 0x02 +#define V_008F0C_SQ_SEL_RESERVED_1 0x03 +#define V_008F0C_SQ_SEL_X 0x04 +#define V_008F0C_SQ_SEL_Y 0x05 +#define V_008F0C_SQ_SEL_Z 0x06 +#define V_008F0C_SQ_SEL_W 0x07 +#define S_008F0C_NUM_FORMAT(x) (((x) & 0x07) << 12) +#define G_008F0C_NUM_FORMAT(x) (((x) >> 12) & 0x07) +#define C_008F0C_NUM_FORMAT 0xFFFF8FFF +#define V_008F0C_BUF_NUM_FORMAT_UNORM 0x00 +#define V_008F0C_BUF_NUM_FORMAT_SNORM 0x01 +#define V_008F0C_BUF_NUM_FORMAT_USCALED 0x02 +#define V_008F0C_BUF_NUM_FORMAT_SSCALED 0x03 +#define V_008F0C_BUF_NUM_FORMAT_UINT 0x04 +#define V_008F0C_BUF_NUM_FORMAT_SINT 0x05 +#define V_008F0C_BUF_NUM_FORMAT_SNORM_OGL 0x06 +#define V_008F0C_BUF_NUM_FORMAT_FLOAT 0x07 +#define S_008F0C_DATA_FORMAT(x) (((x) & 0x0F) << 15) +#define G_008F0C_DATA_FORMAT(x) (((x) >> 15) & 0x0F) +#define C_008F0C_DATA_FORMAT 0xFFF87FFF +#define V_008F0C_BUF_DATA_FORMAT_INVALID 0x00 +#define V_008F0C_BUF_DATA_FORMAT_8 0x01 +#define V_008F0C_BUF_DATA_FORMAT_16 0x02 +#define V_008F0C_BUF_DATA_FORMAT_8_8 0x03 +#define V_008F0C_BUF_DATA_FORMAT_32 0x04 +#define V_008F0C_BUF_DATA_FORMAT_16_16 0x05 +#define V_008F0C_BUF_DATA_FORMAT_10_11_11 0x06 +#define V_008F0C_BUF_DATA_FORMAT_11_11_10 0x07 +#define V_008F0C_BUF_DATA_FORMAT_10_10_10_2 0x08 +#define V_008F0C_BUF_DATA_FORMAT_2_10_10_10 0x09 +#define V_008F0C_BUF_DATA_FORMAT_8_8_8_8 0x0A +#define V_008F0C_BUF_DATA_FORMAT_32_32 0x0B +#define V_008F0C_BUF_DATA_FORMAT_16_16_16_16 0x0C +#define V_008F0C_BUF_DATA_FORMAT_32_32_32 0x0D +#define V_008F0C_BUF_DATA_FORMAT_32_32_32_32 0x0E +#define V_008F0C_BUF_DATA_FORMAT_RESERVED_15 0x0F +#define S_008F0C_ELEMENT_SIZE(x) (((x) & 0x03) << 19) +#define G_008F0C_ELEMENT_SIZE(x) (((x) >> 19) & 0x03) +#define C_008F0C_ELEMENT_SIZE 
0xFFE7FFFF +#define S_008F0C_INDEX_STRIDE(x) (((x) & 0x03) << 21) +#define G_008F0C_INDEX_STRIDE(x) (((x) >> 21) & 0x03) +#define C_008F0C_INDEX_STRIDE 0xFF9FFFFF +#define S_008F0C_ADD_TID_ENABLE(x) (((x) & 0x1) << 23) +#define G_008F0C_ADD_TID_ENABLE(x) (((x) >> 23) & 0x1) +#define C_008F0C_ADD_TID_ENABLE 0xFF7FFFFF +#define S_008F0C_HASH_ENABLE(x) (((x) & 0x1) << 25) +#define G_008F0C_HASH_ENABLE(x) (((x) >> 25) & 0x1) +#define C_008F0C_HASH_ENABLE 0xFDFFFFFF +#define S_008F0C_HEAP(x) (((x) & 0x1) << 26) +#define G_008F0C_HEAP(x) (((x) >> 26) & 0x1) +#define C_008F0C_HEAP 0xFBFFFFFF +#define S_008F0C_TYPE(x) (((x) & 0x03) << 30) +#define G_008F0C_TYPE(x) (((x) >> 30) & 0x03) +#define C_008F0C_TYPE 0x3FFFFFFF +#define V_008F0C_SQ_RSRC_BUF 0x00 +#define V_008F0C_SQ_RSRC_BUF_RSVD_1 0x01 +#define V_008F0C_SQ_RSRC_BUF_RSVD_2 0x02 +#define V_008F0C_SQ_RSRC_BUF_RSVD_3 0x03 +#define R_008F10_SQ_IMG_RSRC_WORD0 0x008F10 +#define R_008F14_SQ_IMG_RSRC_WORD1 0x008F14 +#define S_008F14_BASE_ADDRESS_HI(x) (((x) & 0xFF) << 0) +#define G_008F14_BASE_ADDRESS_HI(x) (((x) >> 0) & 0xFF) +#define C_008F14_BASE_ADDRESS_HI 0xFFFFFF00 +#define S_008F14_MIN_LOD(x) (((x) & 0xFFF) << 8) +#define G_008F14_MIN_LOD(x) (((x) >> 8) & 0xFFF) +#define C_008F14_MIN_LOD 0xFFF000FF +#define S_008F14_DATA_FORMAT(x) (((x) & 0x3F) << 20) +#define G_008F14_DATA_FORMAT(x) (((x) >> 20) & 0x3F) +#define C_008F14_DATA_FORMAT 0xFC0FFFFF +#define V_008F14_IMG_DATA_FORMAT_INVALID 0x00 +#define V_008F14_IMG_DATA_FORMAT_8 0x01 +#define V_008F14_IMG_DATA_FORMAT_16 0x02 +#define V_008F14_IMG_DATA_FORMAT_8_8 0x03 +#define V_008F14_IMG_DATA_FORMAT_32 0x04 +#define V_008F14_IMG_DATA_FORMAT_16_16 0x05 +#define V_008F14_IMG_DATA_FORMAT_10_11_11 0x06 +#define V_008F14_IMG_DATA_FORMAT_11_11_10 0x07 +#define V_008F14_IMG_DATA_FORMAT_10_10_10_2 0x08 +#define V_008F14_IMG_DATA_FORMAT_2_10_10_10 0x09 +#define V_008F14_IMG_DATA_FORMAT_8_8_8_8 0x0A +#define V_008F14_IMG_DATA_FORMAT_32_32 0x0B +#define 
V_008F14_IMG_DATA_FORMAT_16_16_16_16 0x0C +#define V_008F14_IMG_DATA_FORMAT_32_32_32 0x0D +#define V_008F14_IMG_DATA_FORMAT_32_32_32_32 0x0E +#define V_008F14_IMG_DATA_FORMAT_RESERVED_15 0x0F +#define V_008F14_IMG_DATA_FORMAT_5_6_5 0x10 +#define V_008F14_IMG_DATA_FORMAT_1_5_5_5 0x11 +#define V_008F14_IMG_DATA_FORMAT_5_5_5_1 0x12 +#define V_008F14_IMG_DATA_FORMAT_4_4_4_4 0x13 +#define V_008F14_IMG_DATA_FORMAT_8_24 0x14 +#define V_008F14_IMG_DATA_FORMAT_24_8 0x15 +#define V_008F14_IMG_DATA_FORMAT_X24_8_32 0x16 +#define V_008F14_IMG_DATA_FORMAT_RESERVED_23 0x17 +#define V_008F14_IMG_DATA_FORMAT_RESERVED_24 0x18 +#define V_008F14_IMG_DATA_FORMAT_RESERVED_25 0x19 +#define V_008F14_IMG_DATA_FORMAT_RESERVED_26 0x1A +#define V_008F14_IMG_DATA_FORMAT_RESERVED_27 0x1B +#define V_008F14_IMG_DATA_FORMAT_RESERVED_28 0x1C +#define V_008F14_IMG_DATA_FORMAT_RESERVED_29 0x1D +#define V_008F14_IMG_DATA_FORMAT_RESERVED_30 0x1E +#define V_008F14_IMG_DATA_FORMAT_RESERVED_31 0x1F +#define V_008F14_IMG_DATA_FORMAT_GB_GR 0x20 +#define V_008F14_IMG_DATA_FORMAT_BG_RG 0x21 +#define V_008F14_IMG_DATA_FORMAT_5_9_9_9 0x22 +#define V_008F14_IMG_DATA_FORMAT_RESERVED_42 0x2A +#define V_008F14_IMG_DATA_FORMAT_RESERVED_43 0x2B +#define V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1 0x2C +#define V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1 0x2D +#define V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1 0x2E +#define V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2 0x2F +#define V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2 0x30 +#define V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4 0x31 +#define V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1 0x32 +#define V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2 0x33 +#define V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2 0x34 +#define V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4 0x35 +#define V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8 0x36 +#define V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4 0x37 +#define V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8 0x38 +#define V_008F14_IMG_DATA_FORMAT_4_4 0x39 +#define V_008F14_IMG_DATA_FORMAT_6_5_5 0x3A +#define 
V_008F14_IMG_DATA_FORMAT_1 0x3B +#define V_008F14_IMG_DATA_FORMAT_1_REVERSED 0x3C +#define V_008F14_IMG_DATA_FORMAT_32_AS_8 0x3D +#define V_008F14_IMG_DATA_FORMAT_32_AS_8_8 0x3E +#define V_008F14_IMG_DATA_FORMAT_32_AS_32_32_32_32 0x3F +#define S_008F14_NUM_FORMAT(x) (((x) & 0x0F) << 26) +#define G_008F14_NUM_FORMAT(x) (((x) >> 26) & 0x0F) +#define C_008F14_NUM_FORMAT 0xC3FFFFFF +#define V_008F14_IMG_NUM_FORMAT_UNORM 0x00 +#define V_008F14_IMG_NUM_FORMAT_SNORM 0x01 +#define V_008F14_IMG_NUM_FORMAT_USCALED 0x02 +#define V_008F14_IMG_NUM_FORMAT_SSCALED 0x03 +#define V_008F14_IMG_NUM_FORMAT_UINT 0x04 +#define V_008F14_IMG_NUM_FORMAT_SINT 0x05 +#define V_008F14_IMG_NUM_FORMAT_SNORM_OGL 0x06 +#define V_008F14_IMG_NUM_FORMAT_FLOAT 0x07 +#define V_008F14_IMG_NUM_FORMAT_RESERVED_8 0x08 +#define V_008F14_IMG_NUM_FORMAT_SRGB 0x09 +#define V_008F14_IMG_NUM_FORMAT_UBNORM 0x0A +#define V_008F14_IMG_NUM_FORMAT_UBNORM_OGL 0x0B +#define V_008F14_IMG_NUM_FORMAT_UBINT 0x0C +#define V_008F14_IMG_NUM_FORMAT_UBSCALED 0x0D +#define V_008F14_IMG_NUM_FORMAT_RESERVED_14 0x0E +#define V_008F14_IMG_NUM_FORMAT_RESERVED_15 0x0F +#define R_008F18_SQ_IMG_RSRC_WORD2 0x008F18 +#define S_008F18_WIDTH(x) (((x) & 0x3FFF) << 0) +#define G_008F18_WIDTH(x) (((x) >> 0) & 0x3FFF) +#define C_008F18_WIDTH 0xFFFFC000 +#define S_008F18_HEIGHT(x) (((x) & 0x3FFF) << 14) +#define G_008F18_HEIGHT(x) (((x) >> 14) & 0x3FFF) +#define C_008F18_HEIGHT 0xF0003FFF +#define S_008F18_PERF_MOD(x) (((x) & 0x07) << 28) +#define G_008F18_PERF_MOD(x) (((x) >> 28) & 0x07) +#define C_008F18_PERF_MOD 0x8FFFFFFF +#define S_008F18_INTERLACED(x) (((x) & 0x1) << 31) +#define G_008F18_INTERLACED(x) (((x) >> 31) & 0x1) +#define C_008F18_INTERLACED 0x7FFFFFFF +#define R_008F1C_SQ_IMG_RSRC_WORD3 0x008F1C +#define S_008F1C_DST_SEL_X(x) (((x) & 0x07) << 0) +#define G_008F1C_DST_SEL_X(x) (((x) >> 0) & 0x07) +#define C_008F1C_DST_SEL_X 0xFFFFFFF8 +#define V_008F1C_SQ_SEL_0 0x00 +#define V_008F1C_SQ_SEL_1 0x01 +#define 
V_008F1C_SQ_SEL_RESERVED_0 0x02 +#define V_008F1C_SQ_SEL_RESERVED_1 0x03 +#define V_008F1C_SQ_SEL_X 0x04 +#define V_008F1C_SQ_SEL_Y 0x05 +#define V_008F1C_SQ_SEL_Z 0x06 +#define V_008F1C_SQ_SEL_W 0x07 +#define S_008F1C_DST_SEL_Y(x) (((x) & 0x07) << 3) +#define G_008F1C_DST_SEL_Y(x) (((x) >> 3) & 0x07) +#define C_008F1C_DST_SEL_Y 0xFFFFFFC7 +#define V_008F1C_SQ_SEL_0 0x00 +#define V_008F1C_SQ_SEL_1 0x01 +#define V_008F1C_SQ_SEL_RESERVED_0 0x02 +#define V_008F1C_SQ_SEL_RESERVED_1 0x03 +#define V_008F1C_SQ_SEL_X 0x04 +#define V_008F1C_SQ_SEL_Y 0x05 +#define V_008F1C_SQ_SEL_Z 0x06 +#define V_008F1C_SQ_SEL_W 0x07 +#define S_008F1C_DST_SEL_Z(x) (((x) & 0x07) << 6) +#define G_008F1C_DST_SEL_Z(x) (((x) >> 6) & 0x07) +#define C_008F1C_DST_SEL_Z 0xFFFFFE3F +#define V_008F1C_SQ_SEL_0 0x00 +#define V_008F1C_SQ_SEL_1 0x01 +#define V_008F1C_SQ_SEL_RESERVED_0 0x02 +#define V_008F1C_SQ_SEL_RESERVED_1 0x03 +#define V_008F1C_SQ_SEL_X 0x04 +#define V_008F1C_SQ_SEL_Y 0x05 +#define V_008F1C_SQ_SEL_Z 0x06 +#define V_008F1C_SQ_SEL_W 0x07 +#define S_008F1C_DST_SEL_W(x) (((x) & 0x07) << 9) +#define G_008F1C_DST_SEL_W(x) (((x) >> 9) & 0x07) +#define C_008F1C_DST_SEL_W 0xFFFFF1FF +#define V_008F1C_SQ_SEL_0 0x00 +#define V_008F1C_SQ_SEL_1 0x01 +#define V_008F1C_SQ_SEL_RESERVED_0 0x02 +#define V_008F1C_SQ_SEL_RESERVED_1 0x03 +#define V_008F1C_SQ_SEL_X 0x04 +#define V_008F1C_SQ_SEL_Y 0x05 +#define V_008F1C_SQ_SEL_Z 0x06 +#define V_008F1C_SQ_SEL_W 0x07 +#define S_008F1C_BASE_LEVEL(x) (((x) & 0x0F) << 12) +#define G_008F1C_BASE_LEVEL(x) (((x) >> 12) & 0x0F) +#define C_008F1C_BASE_LEVEL 0xFFFF0FFF +#define S_008F1C_LAST_LEVEL(x) (((x) & 0x0F) << 16) +#define G_008F1C_LAST_LEVEL(x) (((x) >> 16) & 0x0F) +#define C_008F1C_LAST_LEVEL 0xFFF0FFFF +#define S_008F1C_TILING_INDEX(x) (((x) & 0x1F) << 20) +#define G_008F1C_TILING_INDEX(x) (((x) >> 20) & 0x1F) +#define C_008F1C_TILING_INDEX 0xFE0FFFFF +#define S_008F1C_POW2_PAD(x) (((x) & 0x1) << 25) +#define G_008F1C_POW2_PAD(x) (((x) >> 25) & 0x1) +#define 
C_008F1C_POW2_PAD 0xFDFFFFFF +#define S_008F1C_TYPE(x) (((x) & 0x0F) << 28) +#define G_008F1C_TYPE(x) (((x) >> 28) & 0x0F) +#define C_008F1C_TYPE 0x0FFFFFFF +#define V_008F1C_SQ_RSRC_IMG_RSVD_0 0x00 +#define V_008F1C_SQ_RSRC_IMG_RSVD_1 0x01 +#define V_008F1C_SQ_RSRC_IMG_RSVD_2 0x02 +#define V_008F1C_SQ_RSRC_IMG_RSVD_3 0x03 +#define V_008F1C_SQ_RSRC_IMG_RSVD_4 0x04 +#define V_008F1C_SQ_RSRC_IMG_RSVD_5 0x05 +#define V_008F1C_SQ_RSRC_IMG_RSVD_6 0x06 +#define V_008F1C_SQ_RSRC_IMG_RSVD_7 0x07 +#define V_008F1C_SQ_RSRC_IMG_1D 0x08 +#define V_008F1C_SQ_RSRC_IMG_2D 0x09 +#define V_008F1C_SQ_RSRC_IMG_3D 0x0A +#define V_008F1C_SQ_RSRC_IMG_CUBE 0x0B +#define V_008F1C_SQ_RSRC_IMG_1D_ARRAY 0x0C +#define V_008F1C_SQ_RSRC_IMG_2D_ARRAY 0x0D +#define V_008F1C_SQ_RSRC_IMG_2D_MSAA 0x0E +#define V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY 0x0F +#define R_008F20_SQ_IMG_RSRC_WORD4 0x008F20 +#define S_008F20_DEPTH(x) (((x) & 0x1FFF) << 0) +#define G_008F20_DEPTH(x) (((x) >> 0) & 0x1FFF) +#define C_008F20_DEPTH 0xFFFFE000 +#define S_008F20_PITCH(x) (((x) & 0x3FFF) << 13) +#define G_008F20_PITCH(x) (((x) >> 13) & 0x3FFF) +#define C_008F20_PITCH 0xF8001FFF +#define R_008F24_SQ_IMG_RSRC_WORD5 0x008F24 +#define S_008F24_BASE_ARRAY(x) (((x) & 0x1FFF) << 0) +#define G_008F24_BASE_ARRAY(x) (((x) >> 0) & 0x1FFF) +#define C_008F24_BASE_ARRAY 0xFFFFE000 +#define S_008F24_LAST_ARRAY(x) (((x) & 0x1FFF) << 13) +#define G_008F24_LAST_ARRAY(x) (((x) >> 13) & 0x1FFF) +#define C_008F24_LAST_ARRAY 0xFC001FFF +#define R_008F28_SQ_IMG_RSRC_WORD6 0x008F28 +#define S_008F28_MIN_LOD_WARN(x) (((x) & 0xFFF) << 0) +#define G_008F28_MIN_LOD_WARN(x) (((x) >> 0) & 0xFFF) +#define C_008F28_MIN_LOD_WARN 0xFFFFF000 +#define R_008F2C_SQ_IMG_RSRC_WORD7 0x008F2C +#define R_008F30_SQ_IMG_SAMP_WORD0 0x008F30 +#define S_008F30_CLAMP_X(x) (((x) & 0x07) << 0) +#define G_008F30_CLAMP_X(x) (((x) >> 0) & 0x07) +#define C_008F30_CLAMP_X 0xFFFFFFF8 +#define V_008F30_SQ_TEX_WRAP 0x00 +#define V_008F30_SQ_TEX_MIRROR 0x01 +#define 
V_008F30_SQ_TEX_CLAMP_LAST_TEXEL 0x02 +#define V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL 0x03 +#define V_008F30_SQ_TEX_CLAMP_HALF_BORDER 0x04 +#define V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER 0x05 +#define V_008F30_SQ_TEX_CLAMP_BORDER 0x06 +#define V_008F30_SQ_TEX_MIRROR_ONCE_BORDER 0x07 +#define S_008F30_CLAMP_Y(x) (((x) & 0x07) << 3) +#define G_008F30_CLAMP_Y(x) (((x) >> 3) & 0x07) +#define C_008F30_CLAMP_Y 0xFFFFFFC7 +#define V_008F30_SQ_TEX_WRAP 0x00 +#define V_008F30_SQ_TEX_MIRROR 0x01 +#define V_008F30_SQ_TEX_CLAMP_LAST_TEXEL 0x02 +#define V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL 0x03 +#define V_008F30_SQ_TEX_CLAMP_HALF_BORDER 0x04 +#define V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER 0x05 +#define V_008F30_SQ_TEX_CLAMP_BORDER 0x06 +#define V_008F30_SQ_TEX_MIRROR_ONCE_BORDER 0x07 +#define S_008F30_CLAMP_Z(x) (((x) & 0x07) << 6) +#define G_008F30_CLAMP_Z(x) (((x) >> 6) & 0x07) +#define C_008F30_CLAMP_Z 0xFFFFFE3F +#define V_008F30_SQ_TEX_WRAP 0x00 +#define V_008F30_SQ_TEX_MIRROR 0x01 +#define V_008F30_SQ_TEX_CLAMP_LAST_TEXEL 0x02 +#define V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL 0x03 +#define V_008F30_SQ_TEX_CLAMP_HALF_BORDER 0x04 +#define V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER 0x05 +#define V_008F30_SQ_TEX_CLAMP_BORDER 0x06 +#define V_008F30_SQ_TEX_MIRROR_ONCE_BORDER 0x07 +#define S_008F30_DEPTH_COMPARE_FUNC(x) (((x) & 0x07) << 12) +#define G_008F30_DEPTH_COMPARE_FUNC(x) (((x) >> 12) & 0x07) +#define C_008F30_DEPTH_COMPARE_FUNC 0xFFFF8FFF +#define V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER 0x00 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_LESS 0x01 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL 0x02 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL 0x03 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER 0x04 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL 0x05 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL 0x06 +#define V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS 0x07 +#define S_008F30_FORCE_UNNORMALIZED(x) (((x) & 0x1) << 15) +#define G_008F30_FORCE_UNNORMALIZED(x) (((x) >> 15) & 0x1) 
+#define C_008F30_FORCE_UNNORMALIZED 0xFFFF7FFF +#define S_008F30_MC_COORD_TRUNC(x) (((x) & 0x1) << 19) +#define G_008F30_MC_COORD_TRUNC(x) (((x) >> 19) & 0x1) +#define C_008F30_MC_COORD_TRUNC 0xFFF7FFFF +#define S_008F30_FORCE_DEGAMMA(x) (((x) & 0x1) << 20) +#define G_008F30_FORCE_DEGAMMA(x) (((x) >> 20) & 0x1) +#define C_008F30_FORCE_DEGAMMA 0xFFEFFFFF +#define S_008F30_TRUNC_COORD(x) (((x) & 0x1) << 27) +#define G_008F30_TRUNC_COORD(x) (((x) >> 27) & 0x1) +#define C_008F30_TRUNC_COORD 0xF7FFFFFF +#define S_008F30_DISABLE_CUBE_WRAP(x) (((x) & 0x1) << 28) +#define G_008F30_DISABLE_CUBE_WRAP(x) (((x) >> 28) & 0x1) +#define C_008F30_DISABLE_CUBE_WRAP 0xEFFFFFFF +#define S_008F30_FILTER_MODE(x) (((x) & 0x03) << 29) +#define G_008F30_FILTER_MODE(x) (((x) >> 29) & 0x03) +#define C_008F30_FILTER_MODE 0x9FFFFFFF +#define R_008F34_SQ_IMG_SAMP_WORD1 0x008F34 +#define S_008F34_MIN_LOD(x) (((x) & 0xFFF) << 0) +#define G_008F34_MIN_LOD(x) (((x) >> 0) & 0xFFF) +#define C_008F34_MIN_LOD 0xFFFFF000 +#define S_008F34_MAX_LOD(x) (((x) & 0xFFF) << 12) +#define G_008F34_MAX_LOD(x) (((x) >> 12) & 0xFFF) +#define C_008F34_MAX_LOD 0xFF000FFF +#define S_008F34_PERF_MIP(x) (((x) & 0x0F) << 24) +#define G_008F34_PERF_MIP(x) (((x) >> 24) & 0x0F) +#define C_008F34_PERF_MIP 0xF0FFFFFF +#define S_008F34_PERF_Z(x) (((x) & 0x0F) << 28) +#define G_008F34_PERF_Z(x) (((x) >> 28) & 0x0F) +#define C_008F34_PERF_Z 0x0FFFFFFF +#define R_008F38_SQ_IMG_SAMP_WORD2 0x008F38 +#define S_008F38_LOD_BIAS(x) (((x) & 0x3FFF) << 0) +#define G_008F38_LOD_BIAS(x) (((x) >> 0) & 0x3FFF) +#define C_008F38_LOD_BIAS 0xFFFFC000 +#define S_008F38_LOD_BIAS_SEC(x) (((x) & 0x3F) << 14) +#define G_008F38_LOD_BIAS_SEC(x) (((x) >> 14) & 0x3F) +#define C_008F38_LOD_BIAS_SEC 0xFFF03FFF +#define S_008F38_XY_MAG_FILTER(x) (((x) & 0x03) << 20) +#define G_008F38_XY_MAG_FILTER(x) (((x) >> 20) & 0x03) +#define C_008F38_XY_MAG_FILTER 0xFFCFFFFF +#define V_008F38_SQ_TEX_XY_FILTER_POINT 0x00 +#define V_008F38_SQ_TEX_XY_FILTER_BILINEAR 
0x01 +#define S_008F38_XY_MIN_FILTER(x) (((x) & 0x03) << 22) +#define G_008F38_XY_MIN_FILTER(x) (((x) >> 22) & 0x03) +#define C_008F38_XY_MIN_FILTER 0xFF3FFFFF +#define V_008F38_SQ_TEX_XY_FILTER_POINT 0x00 +#define V_008F38_SQ_TEX_XY_FILTER_BILINEAR 0x01 +#define S_008F38_Z_FILTER(x) (((x) & 0x03) << 24) +#define G_008F38_Z_FILTER(x) (((x) >> 24) & 0x03) +#define C_008F38_Z_FILTER 0xFCFFFFFF +#define V_008F38_SQ_TEX_Z_FILTER_NONE 0x00 +#define V_008F38_SQ_TEX_Z_FILTER_POINT 0x01 +#define V_008F38_SQ_TEX_Z_FILTER_LINEAR 0x02 +#define S_008F38_MIP_FILTER(x) (((x) & 0x03) << 26) +#define G_008F38_MIP_FILTER(x) (((x) >> 26) & 0x03) +#define C_008F38_MIP_FILTER 0xF3FFFFFF +#define V_008F38_SQ_TEX_Z_FILTER_NONE 0x00 +#define V_008F38_SQ_TEX_Z_FILTER_POINT 0x01 +#define V_008F38_SQ_TEX_Z_FILTER_LINEAR 0x02 +#define S_008F38_MIP_POINT_PRECLAMP(x) (((x) & 0x1) << 28) +#define G_008F38_MIP_POINT_PRECLAMP(x) (((x) >> 28) & 0x1) +#define C_008F38_MIP_POINT_PRECLAMP 0xEFFFFFFF +#define S_008F38_DISABLE_LSB_CEIL(x) (((x) & 0x1) << 29) +#define G_008F38_DISABLE_LSB_CEIL(x) (((x) >> 29) & 0x1) +#define C_008F38_DISABLE_LSB_CEIL 0xDFFFFFFF +#define S_008F38_FILTER_PREC_FIX(x) (((x) & 0x1) << 30) +#define G_008F38_FILTER_PREC_FIX(x) (((x) >> 30) & 0x1) +#define C_008F38_FILTER_PREC_FIX 0xBFFFFFFF +#define R_008F3C_SQ_IMG_SAMP_WORD3 0x008F3C +#define S_008F3C_BORDER_COLOR_PTR(x) (((x) & 0xFFF) << 0) +#define G_008F3C_BORDER_COLOR_PTR(x) (((x) >> 0) & 0xFFF) +#define C_008F3C_BORDER_COLOR_PTR 0xFFFFF000 +#define S_008F3C_BORDER_COLOR_TYPE(x) (((x) & 0x03) << 30) +#define G_008F3C_BORDER_COLOR_TYPE(x) (((x) >> 30) & 0x03) +#define C_008F3C_BORDER_COLOR_TYPE 0x3FFFFFFF +#define V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK 0x00 +#define V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK 0x01 +#define V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE 0x02 +#define V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER 0x03 +#define R_0090DC_SPI_DYN_GPR_LOCK_EN 0x0090DC +#define S_0090DC_VS_LOW_THRESHOLD(x) (((x) & 0x0F) << 0) 
+#define G_0090DC_VS_LOW_THRESHOLD(x) (((x) >> 0) & 0x0F) +#define C_0090DC_VS_LOW_THRESHOLD 0xFFFFFFF0 +#define S_0090DC_GS_LOW_THRESHOLD(x) (((x) & 0x0F) << 4) +#define G_0090DC_GS_LOW_THRESHOLD(x) (((x) >> 4) & 0x0F) +#define C_0090DC_GS_LOW_THRESHOLD 0xFFFFFF0F +#define S_0090DC_ES_LOW_THRESHOLD(x) (((x) & 0x0F) << 8) +#define G_0090DC_ES_LOW_THRESHOLD(x) (((x) >> 8) & 0x0F) +#define C_0090DC_ES_LOW_THRESHOLD 0xFFFFF0FF +#define S_0090DC_HS_LOW_THRESHOLD(x) (((x) & 0x0F) << 12) +#define G_0090DC_HS_LOW_THRESHOLD(x) (((x) >> 12) & 0x0F) +#define C_0090DC_HS_LOW_THRESHOLD 0xFFFF0FFF +#define S_0090DC_LS_LOW_THRESHOLD(x) (((x) & 0x0F) << 16) +#define G_0090DC_LS_LOW_THRESHOLD(x) (((x) >> 16) & 0x0F) +#define C_0090DC_LS_LOW_THRESHOLD 0xFFF0FFFF +#define R_0090E0_SPI_STATIC_THREAD_MGMT_1 0x0090E0 +#define S_0090E0_PS_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_0090E0_PS_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_0090E0_PS_CU_EN 0xFFFF0000 +#define S_0090E0_VS_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_0090E0_VS_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_0090E0_VS_CU_EN 0x0000FFFF +#define R_0090E4_SPI_STATIC_THREAD_MGMT_2 0x0090E4 +#define S_0090E4_GS_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_0090E4_GS_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_0090E4_GS_CU_EN 0xFFFF0000 +#define S_0090E4_ES_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_0090E4_ES_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_0090E4_ES_CU_EN 0x0000FFFF +#define R_0090E8_SPI_STATIC_THREAD_MGMT_3 0x0090E8 +#define S_0090E8_LSHS_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_0090E8_LSHS_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_0090E8_LSHS_CU_EN 0xFFFF0000 +#define R_0090EC_SPI_PS_MAX_WAVE_ID 0x0090EC +#define S_0090EC_MAX_WAVE_ID(x) (((x) & 0xFFF) << 0) +#define G_0090EC_MAX_WAVE_ID(x) (((x) >> 0) & 0xFFF) +#define C_0090EC_MAX_WAVE_ID 0xFFFFF000 +#define R_0090F0_SPI_ARB_PRIORITY 0x0090F0 +#define S_0090F0_RING_ORDER_TS0(x) (((x) & 0x07) << 0) +#define G_0090F0_RING_ORDER_TS0(x) (((x) >> 0) & 0x07) +#define 
C_0090F0_RING_ORDER_TS0 0xFFFFFFF8 +#define V_0090F0_X_R0 0x00 +#define S_0090F0_RING_ORDER_TS1(x) (((x) & 0x07) << 3) +#define G_0090F0_RING_ORDER_TS1(x) (((x) >> 3) & 0x07) +#define C_0090F0_RING_ORDER_TS1 0xFFFFFFC7 +#define S_0090F0_RING_ORDER_TS2(x) (((x) & 0x07) << 6) +#define G_0090F0_RING_ORDER_TS2(x) (((x) >> 6) & 0x07) +#define C_0090F0_RING_ORDER_TS2 0xFFFFFE3F +#define R_0090F4_SPI_ARB_CYCLES_0 0x0090F4 +#define S_0090F4_TS0_DURATION(x) (((x) & 0xFFFF) << 0) +#define G_0090F4_TS0_DURATION(x) (((x) >> 0) & 0xFFFF) +#define C_0090F4_TS0_DURATION 0xFFFF0000 +#define S_0090F4_TS1_DURATION(x) (((x) & 0xFFFF) << 16) +#define G_0090F4_TS1_DURATION(x) (((x) >> 16) & 0xFFFF) +#define C_0090F4_TS1_DURATION 0x0000FFFF +#define R_0090F8_SPI_ARB_CYCLES_1 0x0090F8 +#define S_0090F8_TS2_DURATION(x) (((x) & 0xFFFF) << 0) +#define G_0090F8_TS2_DURATION(x) (((x) >> 0) & 0xFFFF) +#define C_0090F8_TS2_DURATION 0xFFFF0000 +#define R_009100_SPI_CONFIG_CNTL 0x009100 +#define S_009100_GPR_WRITE_PRIORITY(x) (((x) & 0x1FFFFF) << 0) +#define G_009100_GPR_WRITE_PRIORITY(x) (((x) >> 0) & 0x1FFFFF) +#define C_009100_GPR_WRITE_PRIORITY 0xFFE00000 +#define S_009100_EXP_PRIORITY_ORDER(x) (((x) & 0x07) << 21) +#define G_009100_EXP_PRIORITY_ORDER(x) (((x) >> 21) & 0x07) +#define C_009100_EXP_PRIORITY_ORDER 0xFF1FFFFF +#define S_009100_ENABLE_SQG_TOP_EVENTS(x) (((x) & 0x1) << 24) +#define G_009100_ENABLE_SQG_TOP_EVENTS(x) (((x) >> 24) & 0x1) +#define C_009100_ENABLE_SQG_TOP_EVENTS 0xFEFFFFFF +#define S_009100_ENABLE_SQG_BOP_EVENTS(x) (((x) & 0x1) << 25) +#define G_009100_ENABLE_SQG_BOP_EVENTS(x) (((x) >> 25) & 0x1) +#define C_009100_ENABLE_SQG_BOP_EVENTS 0xFDFFFFFF +#define S_009100_RSRC_MGMT_RESET(x) (((x) & 0x1) << 26) +#define G_009100_RSRC_MGMT_RESET(x) (((x) >> 26) & 0x1) +#define C_009100_RSRC_MGMT_RESET 0xFBFFFFFF +#define R_00913C_SPI_CONFIG_CNTL_1 0x00913C +#define S_00913C_VTX_DONE_DELAY(x) (((x) & 0x0F) << 0) +#define G_00913C_VTX_DONE_DELAY(x) (((x) >> 0) & 0x0F) +#define 
C_00913C_VTX_DONE_DELAY 0xFFFFFFF0 +#define V_00913C_X_DELAY_14_CLKS 0x00 +#define V_00913C_X_DELAY_16_CLKS 0x01 +#define V_00913C_X_DELAY_18_CLKS 0x02 +#define V_00913C_X_DELAY_20_CLKS 0x03 +#define V_00913C_X_DELAY_22_CLKS 0x04 +#define V_00913C_X_DELAY_24_CLKS 0x05 +#define V_00913C_X_DELAY_26_CLKS 0x06 +#define V_00913C_X_DELAY_28_CLKS 0x07 +#define V_00913C_X_DELAY_30_CLKS 0x08 +#define V_00913C_X_DELAY_32_CLKS 0x09 +#define V_00913C_X_DELAY_34_CLKS 0x0A +#define V_00913C_X_DELAY_4_CLKS 0x0B +#define V_00913C_X_DELAY_6_CLKS 0x0C +#define V_00913C_X_DELAY_8_CLKS 0x0D +#define V_00913C_X_DELAY_10_CLKS 0x0E +#define V_00913C_X_DELAY_12_CLKS 0x0F +#define S_00913C_INTERP_ONE_PRIM_PER_ROW(x) (((x) & 0x1) << 4) +#define G_00913C_INTERP_ONE_PRIM_PER_ROW(x) (((x) >> 4) & 0x1) +#define C_00913C_INTERP_ONE_PRIM_PER_ROW 0xFFFFFFEF +#define S_00913C_PC_LIMIT_ENABLE(x) (((x) & 0x1) << 6) +#define G_00913C_PC_LIMIT_ENABLE(x) (((x) >> 6) & 0x1) +#define C_00913C_PC_LIMIT_ENABLE 0xFFFFFFBF +#define S_00913C_PC_LIMIT_STRICT(x) (((x) & 0x1) << 7) +#define G_00913C_PC_LIMIT_STRICT(x) (((x) >> 7) & 0x1) +#define C_00913C_PC_LIMIT_STRICT 0xFFFFFF7F +#define S_00913C_PC_LIMIT_SIZE(x) (((x) & 0xFFFF) << 16) +#define G_00913C_PC_LIMIT_SIZE(x) (((x) >> 16) & 0xFFFF) +#define C_00913C_PC_LIMIT_SIZE 0x0000FFFF +#define R_00936C_SPI_RESOURCE_RESERVE_CU_AB_0 0x00936C +#define S_00936C_TYPE_A(x) (((x) & 0x0F) << 0) +#define G_00936C_TYPE_A(x) (((x) >> 0) & 0x0F) +#define C_00936C_TYPE_A 0xFFFFFFF0 +#define S_00936C_VGPR_A(x) (((x) & 0x07) << 4) +#define G_00936C_VGPR_A(x) (((x) >> 4) & 0x07) +#define C_00936C_VGPR_A 0xFFFFFF8F +#define S_00936C_SGPR_A(x) (((x) & 0x07) << 7) +#define G_00936C_SGPR_A(x) (((x) >> 7) & 0x07) +#define C_00936C_SGPR_A 0xFFFFFC7F +#define S_00936C_LDS_A(x) (((x) & 0x07) << 10) +#define G_00936C_LDS_A(x) (((x) >> 10) & 0x07) +#define C_00936C_LDS_A 0xFFFFE3FF +#define S_00936C_WAVES_A(x) (((x) & 0x03) << 13) +#define G_00936C_WAVES_A(x) (((x) >> 13) & 0x03) 
+#define C_00936C_WAVES_A 0xFFFF9FFF +#define S_00936C_EN_A(x) (((x) & 0x1) << 15) +#define G_00936C_EN_A(x) (((x) >> 15) & 0x1) +#define C_00936C_EN_A 0xFFFF7FFF +#define S_00936C_TYPE_B(x) (((x) & 0x0F) << 16) +#define G_00936C_TYPE_B(x) (((x) >> 16) & 0x0F) +#define C_00936C_TYPE_B 0xFFF0FFFF +#define S_00936C_VGPR_B(x) (((x) & 0x07) << 20) +#define G_00936C_VGPR_B(x) (((x) >> 20) & 0x07) +#define C_00936C_VGPR_B 0xFF8FFFFF +#define S_00936C_SGPR_B(x) (((x) & 0x07) << 23) +#define G_00936C_SGPR_B(x) (((x) >> 23) & 0x07) +#define C_00936C_SGPR_B 0xFC7FFFFF +#define S_00936C_LDS_B(x) (((x) & 0x07) << 26) +#define G_00936C_LDS_B(x) (((x) >> 26) & 0x07) +#define C_00936C_LDS_B 0xE3FFFFFF +#define S_00936C_WAVES_B(x) (((x) & 0x03) << 29) +#define G_00936C_WAVES_B(x) (((x) >> 29) & 0x03) +#define C_00936C_WAVES_B 0x9FFFFFFF +#define S_00936C_EN_B(x) (((x) & 0x1) << 31) +#define G_00936C_EN_B(x) (((x) >> 31) & 0x1) +#define C_00936C_EN_B 0x7FFFFFFF +#define R_00950C_TA_CS_BC_BASE_ADDR 0x00950C +#define R_009858_DB_SUBTILE_CONTROL 0x009858 +#define S_009858_MSAA1_X(x) (((x) & 0x03) << 0) +#define G_009858_MSAA1_X(x) (((x) >> 0) & 0x03) +#define C_009858_MSAA1_X 0xFFFFFFFC +#define S_009858_MSAA1_Y(x) (((x) & 0x03) << 2) +#define G_009858_MSAA1_Y(x) (((x) >> 2) & 0x03) +#define C_009858_MSAA1_Y 0xFFFFFFF3 +#define S_009858_MSAA2_X(x) (((x) & 0x03) << 4) +#define G_009858_MSAA2_X(x) (((x) >> 4) & 0x03) +#define C_009858_MSAA2_X 0xFFFFFFCF +#define S_009858_MSAA2_Y(x) (((x) & 0x03) << 6) +#define G_009858_MSAA2_Y(x) (((x) >> 6) & 0x03) +#define C_009858_MSAA2_Y 0xFFFFFF3F +#define S_009858_MSAA4_X(x) (((x) & 0x03) << 8) +#define G_009858_MSAA4_X(x) (((x) >> 8) & 0x03) +#define C_009858_MSAA4_X 0xFFFFFCFF +#define S_009858_MSAA4_Y(x) (((x) & 0x03) << 10) +#define G_009858_MSAA4_Y(x) (((x) >> 10) & 0x03) +#define C_009858_MSAA4_Y 0xFFFFF3FF +#define S_009858_MSAA8_X(x) (((x) & 0x03) << 12) +#define G_009858_MSAA8_X(x) (((x) >> 12) & 0x03) +#define C_009858_MSAA8_X 0xFFFFCFFF 
+#define S_009858_MSAA8_Y(x) (((x) & 0x03) << 14) +#define G_009858_MSAA8_Y(x) (((x) >> 14) & 0x03) +#define C_009858_MSAA8_Y 0xFFFF3FFF +#define S_009858_MSAA16_X(x) (((x) & 0x03) << 16) +#define G_009858_MSAA16_X(x) (((x) >> 16) & 0x03) +#define C_009858_MSAA16_X 0xFFFCFFFF +#define S_009858_MSAA16_Y(x) (((x) & 0x03) << 18) +#define G_009858_MSAA16_Y(x) (((x) >> 18) & 0x03) +#define C_009858_MSAA16_Y 0xFFF3FFFF +#define R_009910_GB_TILE_MODE0 0x009910 +#define S_009910_MICRO_TILE_MODE(x) (((x) & 0x03) << 0) +#define G_009910_MICRO_TILE_MODE(x) (((x) >> 0) & 0x03) +#define C_009910_MICRO_TILE_MODE 0xFFFFFFFC +#define V_009910_ADDR_SURF_DISPLAY_MICRO_TILING 0x00 +#define V_009910_ADDR_SURF_THIN_MICRO_TILING 0x01 +#define V_009910_ADDR_SURF_DEPTH_MICRO_TILING 0x02 +#define V_009910_ADDR_SURF_THICK_MICRO_TILING 0x03 +#define S_009910_ARRAY_MODE(x) (((x) & 0x0F) << 2) +#define G_009910_ARRAY_MODE(x) (((x) >> 2) & 0x0F) +#define C_009910_ARRAY_MODE 0xFFFFFFC3 +#define V_009910_ARRAY_LINEAR_GENERAL 0x00 +#define V_009910_ARRAY_LINEAR_ALIGNED 0x01 +#define V_009910_ARRAY_1D_TILED_THIN1 0x02 +#define V_009910_ARRAY_1D_TILED_THICK 0x03 +#define V_009910_ARRAY_2D_TILED_THIN1 0x04 +#define V_009910_ARRAY_2D_TILED_THICK 0x07 +#define V_009910_ARRAY_2D_TILED_XTHICK 0x08 +#define V_009910_ARRAY_3D_TILED_THIN1 0x0C +#define V_009910_ARRAY_3D_TILED_THICK 0x0D +#define V_009910_ARRAY_3D_TILED_XTHICK 0x0E +#define V_009910_ARRAY_POWER_SAVE 0x0F +#define S_009910_PIPE_CONFIG(x) (((x) & 0x1F) << 6) +#define G_009910_PIPE_CONFIG(x) (((x) >> 6) & 0x1F) +#define C_009910_PIPE_CONFIG 0xFFFFF83F +#define V_009910_ADDR_SURF_P2 0x00 +#define V_009910_ADDR_SURF_P2_RESERVED0 0x01 +#define V_009910_ADDR_SURF_P2_RESERVED1 0x02 +#define V_009910_ADDR_SURF_P2_RESERVED2 0x03 +#define V_009910_X_ADDR_SURF_P4_8X16 0x04 +#define V_009910_X_ADDR_SURF_P4_16X16 0x05 +#define V_009910_X_ADDR_SURF_P4_16X32 0x06 +#define V_009910_X_ADDR_SURF_P4_32X32 0x07 +#define V_009910_X_ADDR_SURF_P8_16X16_8X16 0x08 
+#define V_009910_X_ADDR_SURF_P8_16X32_8X16 0x09 +#define V_009910_X_ADDR_SURF_P8_32X32_8X16 0x0A +#define V_009910_X_ADDR_SURF_P8_16X32_16X16 0x0B +#define V_009910_X_ADDR_SURF_P8_32X32_16X16 0x0C +#define V_009910_X_ADDR_SURF_P8_32X32_16X32 0x0D +#define V_009910_X_ADDR_SURF_P8_32X64_32X32 0x0E +#define S_009910_TILE_SPLIT(x) (((x) & 0x07) << 11) +#define G_009910_TILE_SPLIT(x) (((x) >> 11) & 0x07) +#define C_009910_TILE_SPLIT 0xFFFFC7FF +#define V_009910_ADDR_SURF_TILE_SPLIT_64B 0x00 +#define V_009910_ADDR_SURF_TILE_SPLIT_128B 0x01 +#define V_009910_ADDR_SURF_TILE_SPLIT_256B 0x02 +#define V_009910_ADDR_SURF_TILE_SPLIT_512B 0x03 +#define V_009910_ADDR_SURF_TILE_SPLIT_1KB 0x04 +#define V_009910_ADDR_SURF_TILE_SPLIT_2KB 0x05 +#define V_009910_ADDR_SURF_TILE_SPLIT_4KB 0x06 +#define S_009910_BANK_WIDTH(x) (((x) & 0x03) << 14) +#define G_009910_BANK_WIDTH(x) (((x) >> 14) & 0x03) +#define C_009910_BANK_WIDTH 0xFFFF3FFF +#define V_009910_ADDR_SURF_BANK_WIDTH_1 0x00 +#define V_009910_ADDR_SURF_BANK_WIDTH_2 0x01 +#define V_009910_ADDR_SURF_BANK_WIDTH_4 0x02 +#define V_009910_ADDR_SURF_BANK_WIDTH_8 0x03 +#define S_009910_BANK_HEIGHT(x) (((x) & 0x03) << 16) +#define G_009910_BANK_HEIGHT(x) (((x) >> 16) & 0x03) +#define C_009910_BANK_HEIGHT 0xFFFCFFFF +#define V_009910_ADDR_SURF_BANK_HEIGHT_1 0x00 +#define V_009910_ADDR_SURF_BANK_HEIGHT_2 0x01 +#define V_009910_ADDR_SURF_BANK_HEIGHT_4 0x02 +#define V_009910_ADDR_SURF_BANK_HEIGHT_8 0x03 +#define S_009910_MACRO_TILE_ASPECT(x) (((x) & 0x03) << 18) +#define G_009910_MACRO_TILE_ASPECT(x) (((x) >> 18) & 0x03) +#define C_009910_MACRO_TILE_ASPECT 0xFFF3FFFF +#define V_009910_ADDR_SURF_MACRO_ASPECT_1 0x00 +#define V_009910_ADDR_SURF_MACRO_ASPECT_2 0x01 +#define V_009910_ADDR_SURF_MACRO_ASPECT_4 0x02 +#define V_009910_ADDR_SURF_MACRO_ASPECT_8 0x03 +#define S_009910_NUM_BANKS(x) (((x) & 0x03) << 20) +#define G_009910_NUM_BANKS(x) (((x) >> 20) & 0x03) +#define C_009910_NUM_BANKS 0xFFCFFFFF +#define V_009910_ADDR_SURF_2_BANK 0x00 
+#define V_009910_ADDR_SURF_4_BANK 0x01 +#define V_009910_ADDR_SURF_8_BANK 0x02 +#define V_009910_ADDR_SURF_16_BANK 0x03 +#define R_00B020_SPI_SHADER_PGM_LO_PS 0x00B020 +#define R_00B024_SPI_SHADER_PGM_HI_PS 0x00B024 +#define S_00B024_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B024_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B024_MEM_BASE 0xFFFFFF00 +#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B028_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B028_VGPRS 0xFFFFFFC0 +#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B028_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B028_SGPRS 0xFFFFFC3F +#define S_00B028_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B028_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B028_PRIORITY 0xFFFFF3FF +#define S_00B028_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B028_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B028_FLOAT_MODE 0xFFF00FFF +#define S_00B028_PRIV(x) (((x) & 0x1) << 20) +#define G_00B028_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B028_PRIV 0xFFEFFFFF +#define S_00B028_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B028_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B028_DX10_CLAMP 0xFFDFFFFF +#define S_00B028_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B028_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B028_DEBUG_MODE 0xFFBFFFFF +#define S_00B028_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B028_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B028_IEEE_MODE 0xFF7FFFFF +#define S_00B028_CU_GROUP_DISABLE(x) (((x) & 0x1) << 24) +#define G_00B028_CU_GROUP_DISABLE(x) (((x) >> 24) & 0x1) +#define C_00B028_CU_GROUP_DISABLE 0xFEFFFFFF +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C +#define S_00B02C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B02C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B02C_SCRATCH_EN 0xFFFFFFFE +#define S_00B02C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B02C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B02C_USER_SGPR 
0xFFFFFFC1 +#define S_00B02C_WAVE_CNT_EN(x) (((x) & 0x1) << 7) +#define G_00B02C_WAVE_CNT_EN(x) (((x) >> 7) & 0x1) +#define C_00B02C_WAVE_CNT_EN 0xFFFFFF7F +#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) +#define G_00B02C_EXTRA_LDS_SIZE(x) (((x) >> 8) & 0xFF) +#define C_00B02C_EXTRA_LDS_SIZE 0xFFFF00FF +#define S_00B02C_EXCP_EN(x) (((x) & 0x7F) << 16) +#define G_00B02C_EXCP_EN(x) (((x) >> 16) & 0x7F) +#define C_00B02C_EXCP_EN 0xFF80FFFF +#define R_00B030_SPI_SHADER_USER_DATA_PS_0 0x00B030 +#define R_00B034_SPI_SHADER_USER_DATA_PS_1 0x00B034 +#define R_00B038_SPI_SHADER_USER_DATA_PS_2 0x00B038 +#define R_00B03C_SPI_SHADER_USER_DATA_PS_3 0x00B03C +#define R_00B040_SPI_SHADER_USER_DATA_PS_4 0x00B040 +#define R_00B044_SPI_SHADER_USER_DATA_PS_5 0x00B044 +#define R_00B048_SPI_SHADER_USER_DATA_PS_6 0x00B048 +#define R_00B04C_SPI_SHADER_USER_DATA_PS_7 0x00B04C +#define R_00B050_SPI_SHADER_USER_DATA_PS_8 0x00B050 +#define R_00B054_SPI_SHADER_USER_DATA_PS_9 0x00B054 +#define R_00B058_SPI_SHADER_USER_DATA_PS_10 0x00B058 +#define R_00B05C_SPI_SHADER_USER_DATA_PS_11 0x00B05C +#define R_00B060_SPI_SHADER_USER_DATA_PS_12 0x00B060 +#define R_00B064_SPI_SHADER_USER_DATA_PS_13 0x00B064 +#define R_00B068_SPI_SHADER_USER_DATA_PS_14 0x00B068 +#define R_00B06C_SPI_SHADER_USER_DATA_PS_15 0x00B06C +#define R_00B120_SPI_SHADER_PGM_LO_VS 0x00B120 +#define R_00B124_SPI_SHADER_PGM_HI_VS 0x00B124 +#define S_00B124_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B124_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B124_MEM_BASE 0xFFFFFF00 +#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 +#define S_00B128_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B128_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B128_VGPRS 0xFFFFFFC0 +#define S_00B128_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B128_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B128_SGPRS 0xFFFFFC3F +#define S_00B128_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B128_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B128_PRIORITY 0xFFFFF3FF +#define 
S_00B128_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B128_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B128_FLOAT_MODE 0xFFF00FFF +#define S_00B128_PRIV(x) (((x) & 0x1) << 20) +#define G_00B128_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B128_PRIV 0xFFEFFFFF +#define S_00B128_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B128_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B128_DX10_CLAMP 0xFFDFFFFF +#define S_00B128_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B128_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B128_DEBUG_MODE 0xFFBFFFFF +#define S_00B128_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B128_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B128_IEEE_MODE 0xFF7FFFFF +#define S_00B128_VGPR_COMP_CNT(x) (((x) & 0x03) << 24) +#define G_00B128_VGPR_COMP_CNT(x) (((x) >> 24) & 0x03) +#define C_00B128_VGPR_COMP_CNT 0xFCFFFFFF +#define S_00B128_CU_GROUP_ENABLE(x) (((x) & 0x1) << 26) +#define G_00B128_CU_GROUP_ENABLE(x) (((x) >> 26) & 0x1) +#define C_00B128_CU_GROUP_ENABLE 0xFBFFFFFF +#define R_00B12C_SPI_SHADER_PGM_RSRC2_VS 0x00B12C +#define S_00B12C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B12C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B12C_SCRATCH_EN 0xFFFFFFFE +#define S_00B12C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B12C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B12C_USER_SGPR 0xFFFFFFC1 +#define S_00B12C_OC_LDS_EN(x) (((x) & 0x1) << 7) +#define G_00B12C_OC_LDS_EN(x) (((x) >> 7) & 0x1) +#define C_00B12C_OC_LDS_EN 0xFFFFFF7F +#define S_00B12C_SO_BASE0_EN(x) (((x) & 0x1) << 8) +#define G_00B12C_SO_BASE0_EN(x) (((x) >> 8) & 0x1) +#define C_00B12C_SO_BASE0_EN 0xFFFFFEFF +#define S_00B12C_SO_BASE1_EN(x) (((x) & 0x1) << 9) +#define G_00B12C_SO_BASE1_EN(x) (((x) >> 9) & 0x1) +#define C_00B12C_SO_BASE1_EN 0xFFFFFDFF +#define S_00B12C_SO_BASE2_EN(x) (((x) & 0x1) << 10) +#define G_00B12C_SO_BASE2_EN(x) (((x) >> 10) & 0x1) +#define C_00B12C_SO_BASE2_EN 0xFFFFFBFF +#define S_00B12C_SO_BASE3_EN(x) (((x) & 0x1) << 11) +#define 
G_00B12C_SO_BASE3_EN(x) (((x) >> 11) & 0x1) +#define C_00B12C_SO_BASE3_EN 0xFFFFF7FF +#define S_00B12C_SO_EN(x) (((x) & 0x1) << 12) +#define G_00B12C_SO_EN(x) (((x) >> 12) & 0x1) +#define C_00B12C_SO_EN 0xFFFFEFFF +#define S_00B12C_EXCP_EN(x) (((x) & 0x7F) << 13) +#define G_00B12C_EXCP_EN(x) (((x) >> 13) & 0x7F) +#define C_00B12C_EXCP_EN 0xFFF01FFF +#define R_00B130_SPI_SHADER_USER_DATA_VS_0 0x00B130 +#define R_00B134_SPI_SHADER_USER_DATA_VS_1 0x00B134 +#define R_00B138_SPI_SHADER_USER_DATA_VS_2 0x00B138 +#define R_00B13C_SPI_SHADER_USER_DATA_VS_3 0x00B13C +#define R_00B140_SPI_SHADER_USER_DATA_VS_4 0x00B140 +#define R_00B144_SPI_SHADER_USER_DATA_VS_5 0x00B144 +#define R_00B148_SPI_SHADER_USER_DATA_VS_6 0x00B148 +#define R_00B14C_SPI_SHADER_USER_DATA_VS_7 0x00B14C +#define R_00B150_SPI_SHADER_USER_DATA_VS_8 0x00B150 +#define R_00B154_SPI_SHADER_USER_DATA_VS_9 0x00B154 +#define R_00B158_SPI_SHADER_USER_DATA_VS_10 0x00B158 +#define R_00B15C_SPI_SHADER_USER_DATA_VS_11 0x00B15C +#define R_00B160_SPI_SHADER_USER_DATA_VS_12 0x00B160 +#define R_00B164_SPI_SHADER_USER_DATA_VS_13 0x00B164 +#define R_00B168_SPI_SHADER_USER_DATA_VS_14 0x00B168 +#define R_00B16C_SPI_SHADER_USER_DATA_VS_15 0x00B16C +#define R_00B220_SPI_SHADER_PGM_LO_GS 0x00B220 +#define R_00B224_SPI_SHADER_PGM_HI_GS 0x00B224 +#define S_00B224_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B224_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B224_MEM_BASE 0xFFFFFF00 +#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define S_00B228_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B228_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B228_VGPRS 0xFFFFFFC0 +#define S_00B228_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B228_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B228_SGPRS 0xFFFFFC3F +#define S_00B228_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B228_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B228_PRIORITY 0xFFFFF3FF +#define S_00B228_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B228_FLOAT_MODE(x) (((x) >> 12) & 0xFF) 
+#define C_00B228_FLOAT_MODE 0xFFF00FFF +#define S_00B228_PRIV(x) (((x) & 0x1) << 20) +#define G_00B228_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B228_PRIV 0xFFEFFFFF +#define S_00B228_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B228_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B228_DX10_CLAMP 0xFFDFFFFF +#define S_00B228_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B228_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B228_DEBUG_MODE 0xFFBFFFFF +#define S_00B228_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B228_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B228_IEEE_MODE 0xFF7FFFFF +#define S_00B228_CU_GROUP_ENABLE(x) (((x) & 0x1) << 24) +#define G_00B228_CU_GROUP_ENABLE(x) (((x) >> 24) & 0x1) +#define C_00B228_CU_GROUP_ENABLE 0xFEFFFFFF +#define R_00B22C_SPI_SHADER_PGM_RSRC2_GS 0x00B22C +#define S_00B22C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B22C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B22C_SCRATCH_EN 0xFFFFFFFE +#define S_00B22C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B22C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B22C_USER_SGPR 0xFFFFFFC1 +#define S_00B22C_EXCP_EN(x) (((x) & 0x7F) << 7) +#define G_00B22C_EXCP_EN(x) (((x) >> 7) & 0x7F) +#define C_00B22C_EXCP_EN 0xFFFFC07F +#define R_00B230_SPI_SHADER_USER_DATA_GS_0 0x00B230 +#define R_00B320_SPI_SHADER_PGM_LO_ES 0x00B320 +#define R_00B324_SPI_SHADER_PGM_HI_ES 0x00B324 +#define S_00B324_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B324_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B324_MEM_BASE 0xFFFFFF00 +#define R_00B328_SPI_SHADER_PGM_RSRC1_ES 0x00B328 +#define S_00B328_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B328_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B328_VGPRS 0xFFFFFFC0 +#define S_00B328_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B328_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B328_SGPRS 0xFFFFFC3F +#define S_00B328_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B328_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B328_PRIORITY 0xFFFFF3FF +#define S_00B328_FLOAT_MODE(x) (((x) & 
0xFF) << 12) +#define G_00B328_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B328_FLOAT_MODE 0xFFF00FFF +#define S_00B328_PRIV(x) (((x) & 0x1) << 20) +#define G_00B328_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B328_PRIV 0xFFEFFFFF +#define S_00B328_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B328_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B328_DX10_CLAMP 0xFFDFFFFF +#define S_00B328_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B328_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B328_DEBUG_MODE 0xFFBFFFFF +#define S_00B328_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B328_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B328_IEEE_MODE 0xFF7FFFFF +#define S_00B328_VGPR_COMP_CNT(x) (((x) & 0x03) << 24) +#define G_00B328_VGPR_COMP_CNT(x) (((x) >> 24) & 0x03) +#define C_00B328_VGPR_COMP_CNT 0xFCFFFFFF +#define S_00B328_CU_GROUP_ENABLE(x) (((x) & 0x1) << 26) +#define G_00B328_CU_GROUP_ENABLE(x) (((x) >> 26) & 0x1) +#define C_00B328_CU_GROUP_ENABLE 0xFBFFFFFF +#define R_00B32C_SPI_SHADER_PGM_RSRC2_ES 0x00B32C +#define S_00B32C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B32C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B32C_SCRATCH_EN 0xFFFFFFFE +#define S_00B32C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B32C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B32C_USER_SGPR 0xFFFFFFC1 +#define S_00B32C_OC_LDS_EN(x) (((x) & 0x1) << 7) +#define G_00B32C_OC_LDS_EN(x) (((x) >> 7) & 0x1) +#define C_00B32C_OC_LDS_EN 0xFFFFFF7F +#define S_00B32C_EXCP_EN(x) (((x) & 0x7F) << 8) +#define G_00B32C_EXCP_EN(x) (((x) >> 8) & 0x7F) +#define C_00B32C_EXCP_EN 0xFFFF80FF +#define R_00B330_SPI_SHADER_USER_DATA_ES_0 0x00B330 +#define R_00B420_SPI_SHADER_PGM_LO_HS 0x00B420 +#define R_00B424_SPI_SHADER_PGM_HI_HS 0x00B424 +#define S_00B424_MEM_BASE(x) (((x) & 0xFF) << 0) +#define G_00B424_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B424_MEM_BASE 0xFFFFFF00 +#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428 +#define S_00B428_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B428_VGPRS(x) 
(((x) >> 0) & 0x3F) +#define C_00B428_VGPRS 0xFFFFFFC0 +#define S_00B428_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B428_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B428_SGPRS 0xFFFFFC3F +#define S_00B428_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B428_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B428_PRIORITY 0xFFFFF3FF +#define S_00B428_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B428_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B428_FLOAT_MODE 0xFFF00FFF +#define S_00B428_PRIV(x) (((x) & 0x1) << 20) +#define G_00B428_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B428_PRIV 0xFFEFFFFF +#define S_00B428_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B428_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B428_DX10_CLAMP 0xFFDFFFFF +#define S_00B428_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B428_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B428_DEBUG_MODE 0xFFBFFFFF +#define S_00B428_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B428_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B428_IEEE_MODE 0xFF7FFFFF +#define R_00B42C_SPI_SHADER_PGM_RSRC2_HS 0x00B42C +#define S_00B42C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B42C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B42C_SCRATCH_EN 0xFFFFFFFE +#define S_00B42C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B42C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B42C_USER_SGPR 0xFFFFFFC1 +#define S_00B42C_OC_LDS_EN(x) (((x) & 0x1) << 7) +#define G_00B42C_OC_LDS_EN(x) (((x) >> 7) & 0x1) +#define C_00B42C_OC_LDS_EN 0xFFFFFF7F +#define S_00B42C_TG_SIZE_EN(x) (((x) & 0x1) << 8) +#define G_00B42C_TG_SIZE_EN(x) (((x) >> 8) & 0x1) +#define C_00B42C_TG_SIZE_EN 0xFFFFFEFF +#define S_00B42C_EXCP_EN(x) (((x) & 0x7F) << 9) +#define G_00B42C_EXCP_EN(x) (((x) >> 9) & 0x7F) +#define C_00B42C_EXCP_EN 0xFFFF01FF +#define R_00B430_SPI_SHADER_USER_DATA_HS_0 0x00B430 +#define R_00B520_SPI_SHADER_PGM_LO_LS 0x00B520 +#define R_00B524_SPI_SHADER_PGM_HI_LS 0x00B524 +#define S_00B524_MEM_BASE(x) (((x) & 0xFF) << 0) +#define 
G_00B524_MEM_BASE(x) (((x) >> 0) & 0xFF) +#define C_00B524_MEM_BASE 0xFFFFFF00 +#define R_00B528_SPI_SHADER_PGM_RSRC1_LS 0x00B528 +#define S_00B528_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B528_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B528_VGPRS 0xFFFFFFC0 +#define S_00B528_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B528_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B528_SGPRS 0xFFFFFC3F +#define S_00B528_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B528_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B528_PRIORITY 0xFFFFF3FF +#define S_00B528_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B528_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B528_FLOAT_MODE 0xFFF00FFF +#define S_00B528_PRIV(x) (((x) & 0x1) << 20) +#define G_00B528_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B528_PRIV 0xFFEFFFFF +#define S_00B528_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B528_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B528_DX10_CLAMP 0xFFDFFFFF +#define S_00B528_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B528_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B528_DEBUG_MODE 0xFFBFFFFF +#define S_00B528_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B528_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B528_IEEE_MODE 0xFF7FFFFF +#define S_00B528_VGPR_COMP_CNT(x) (((x) & 0x03) << 24) +#define G_00B528_VGPR_COMP_CNT(x) (((x) >> 24) & 0x03) +#define C_00B528_VGPR_COMP_CNT 0xFCFFFFFF +#define R_00B52C_SPI_SHADER_PGM_RSRC2_LS 0x00B52C +#define S_00B52C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B52C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B52C_SCRATCH_EN 0xFFFFFFFE +#define S_00B52C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B52C_USER_SGPR(x) (((x) >> 1) & 0x1F) +#define C_00B52C_USER_SGPR 0xFFFFFFC1 +#define S_00B52C_LDS_SIZE(x) (((x) & 0x1FF) << 7) +#define G_00B52C_LDS_SIZE(x) (((x) >> 7) & 0x1FF) +#define C_00B52C_LDS_SIZE 0xFFFF007F +#define S_00B52C_EXCP_EN(x) (((x) & 0x7F) << 16) +#define G_00B52C_EXCP_EN(x) (((x) >> 16) & 0x7F) +#define C_00B52C_EXCP_EN 0xFF80FFFF 
+#define R_00B530_SPI_SHADER_USER_DATA_LS_0 0x00B530 +#define R_00B800_COMPUTE_DISPATCH_INITIATOR 0x00B800 +#define S_00B800_COMPUTE_SHADER_EN(x) (((x) & 0x1) << 0) +#define G_00B800_COMPUTE_SHADER_EN(x) (((x) >> 0) & 0x1) +#define C_00B800_COMPUTE_SHADER_EN 0xFFFFFFFE +#define S_00B800_PARTIAL_TG_EN(x) (((x) & 0x1) << 1) +#define G_00B800_PARTIAL_TG_EN(x) (((x) >> 1) & 0x1) +#define C_00B800_PARTIAL_TG_EN 0xFFFFFFFD +#define S_00B800_FORCE_START_AT_000(x) (((x) & 0x1) << 2) +#define G_00B800_FORCE_START_AT_000(x) (((x) >> 2) & 0x1) +#define C_00B800_FORCE_START_AT_000 0xFFFFFFFB +#define S_00B800_ORDERED_APPEND_ENBL(x) (((x) & 0x1) << 3) +#define G_00B800_ORDERED_APPEND_ENBL(x) (((x) >> 3) & 0x1) +#define C_00B800_ORDERED_APPEND_ENBL 0xFFFFFFF7 +#define R_00B804_COMPUTE_DIM_X 0x00B804 +#define R_00B808_COMPUTE_DIM_Y 0x00B808 +#define R_00B80C_COMPUTE_DIM_Z 0x00B80C +#define R_00B810_COMPUTE_START_X 0x00B810 +#define R_00B814_COMPUTE_START_Y 0x00B814 +#define R_00B818_COMPUTE_START_Z 0x00B818 +#define R_00B81C_COMPUTE_NUM_THREAD_X 0x00B81C +#define S_00B81C_NUM_THREAD_FULL(x) (((x) & 0xFFFF) << 0) +#define G_00B81C_NUM_THREAD_FULL(x) (((x) >> 0) & 0xFFFF) +#define C_00B81C_NUM_THREAD_FULL 0xFFFF0000 +#define S_00B81C_NUM_THREAD_PARTIAL(x) (((x) & 0xFFFF) << 16) +#define G_00B81C_NUM_THREAD_PARTIAL(x) (((x) >> 16) & 0xFFFF) +#define C_00B81C_NUM_THREAD_PARTIAL 0x0000FFFF +#define R_00B820_COMPUTE_NUM_THREAD_Y 0x00B820 +#define S_00B820_NUM_THREAD_FULL(x) (((x) & 0xFFFF) << 0) +#define G_00B820_NUM_THREAD_FULL(x) (((x) >> 0) & 0xFFFF) +#define C_00B820_NUM_THREAD_FULL 0xFFFF0000 +#define S_00B820_NUM_THREAD_PARTIAL(x) (((x) & 0xFFFF) << 16) +#define G_00B820_NUM_THREAD_PARTIAL(x) (((x) >> 16) & 0xFFFF) +#define C_00B820_NUM_THREAD_PARTIAL 0x0000FFFF +#define R_00B824_COMPUTE_NUM_THREAD_Z 0x00B824 +#define S_00B824_NUM_THREAD_FULL(x) (((x) & 0xFFFF) << 0) +#define G_00B824_NUM_THREAD_FULL(x) (((x) >> 0) & 0xFFFF) +#define C_00B824_NUM_THREAD_FULL 0xFFFF0000 +#define 
S_00B824_NUM_THREAD_PARTIAL(x) (((x) & 0xFFFF) << 16) +#define G_00B824_NUM_THREAD_PARTIAL(x) (((x) >> 16) & 0xFFFF) +#define C_00B824_NUM_THREAD_PARTIAL 0x0000FFFF +#define R_00B82C_COMPUTE_MAX_WAVE_ID 0x00B82C +#define S_00B82C_MAX_WAVE_ID(x) (((x) & 0xFFF) << 0) +#define G_00B82C_MAX_WAVE_ID(x) (((x) >> 0) & 0xFFF) +#define C_00B82C_MAX_WAVE_ID 0xFFFFF000 +#define R_00B830_COMPUTE_PGM_LO 0x00B830 +#define R_00B834_COMPUTE_PGM_HI 0x00B834 +#define S_00B834_DATA(x) (((x) & 0xFF) << 0) +#define G_00B834_DATA(x) (((x) >> 0) & 0xFF) +#define C_00B834_DATA 0xFFFFFF00 +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF +#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B84C_SCRATCH_EN(x) (((x) & 0x1) << 0) +#define G_00B84C_SCRATCH_EN(x) (((x) >> 0) & 0x1) +#define C_00B84C_SCRATCH_EN 0xFFFFFFFE +#define S_00B84C_USER_SGPR(x) (((x) & 0x1F) << 1) +#define G_00B84C_USER_SGPR(x) (((x) >> 1) & 
0x1F) +#define C_00B84C_USER_SGPR 0xFFFFFFC1 +#define S_00B84C_TGID_X_EN(x) (((x) & 0x1) << 7) +#define G_00B84C_TGID_X_EN(x) (((x) >> 7) & 0x1) +#define C_00B84C_TGID_X_EN 0xFFFFFF7F +#define S_00B84C_TGID_Y_EN(x) (((x) & 0x1) << 8) +#define G_00B84C_TGID_Y_EN(x) (((x) >> 8) & 0x1) +#define C_00B84C_TGID_Y_EN 0xFFFFFEFF +#define S_00B84C_TGID_Z_EN(x) (((x) & 0x1) << 9) +#define G_00B84C_TGID_Z_EN(x) (((x) >> 9) & 0x1) +#define C_00B84C_TGID_Z_EN 0xFFFFFDFF +#define S_00B84C_TG_SIZE_EN(x) (((x) & 0x1) << 10) +#define G_00B84C_TG_SIZE_EN(x) (((x) >> 10) & 0x1) +#define C_00B84C_TG_SIZE_EN 0xFFFFFBFF +#define S_00B84C_TIDIG_COMP_CNT(x) (((x) & 0x03) << 11) +#define G_00B84C_TIDIG_COMP_CNT(x) (((x) >> 11) & 0x03) +#define C_00B84C_TIDIG_COMP_CNT 0xFFFFE7FF +#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) +#define G_00B84C_LDS_SIZE(x) (((x) >> 15) & 0x1FF) +#define C_00B84C_LDS_SIZE 0xFF007FFF +#define S_00B84C_EXCP_EN(x) (((x) & 0x7F) << 24) +#define G_00B84C_EXCP_EN(x) (((x) >> 24) & 0x7F) +#define C_00B84C_EXCP_EN 0x80FFFFFF +#define R_00B854_COMPUTE_RESOURCE_LIMITS 0x00B854 +#define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) +#define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) +#define C_00B854_WAVES_PER_SH 0xFFFFFFC0 +#define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12) +#define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F) +#define C_00B854_TG_PER_CU 0xFFFF0FFF +#define S_00B854_LOCK_THRESHOLD(x) (((x) & 0x3F) << 16) +#define G_00B854_LOCK_THRESHOLD(x) (((x) >> 16) & 0x3F) +#define C_00B854_LOCK_THRESHOLD 0xFFC0FFFF +#define S_00B854_SIMD_DEST_CNTL(x) (((x) & 0x1) << 22) +#define G_00B854_SIMD_DEST_CNTL(x) (((x) >> 22) & 0x1) +#define C_00B854_SIMD_DEST_CNTL 0xFFBFFFFF +#define R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 0x00B858 +#define S_00B858_SH0_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_00B858_SH0_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_00B858_SH0_CU_EN 0xFFFF0000 +#define S_00B858_SH1_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_00B858_SH1_CU_EN(x) (((x) >> 
16) & 0xFFFF) +#define C_00B858_SH1_CU_EN 0x0000FFFF +#define R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1 0x00B85C +#define S_00B85C_SH0_CU_EN(x) (((x) & 0xFFFF) << 0) +#define G_00B85C_SH0_CU_EN(x) (((x) >> 0) & 0xFFFF) +#define C_00B85C_SH0_CU_EN 0xFFFF0000 +#define S_00B85C_SH1_CU_EN(x) (((x) & 0xFFFF) << 16) +#define G_00B85C_SH1_CU_EN(x) (((x) >> 16) & 0xFFFF) +#define C_00B85C_SH1_CU_EN 0x0000FFFF +#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860 +#define S_00B860_WAVES(x) (((x) & 0xFFF) << 0) +#define G_00B860_WAVES(x) (((x) >> 0) & 0xFFF) +#define C_00B860_WAVES 0xFFFFF000 +#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define G_00B860_WAVESIZE(x) (((x) >> 12) & 0x1FFF) +#define C_00B860_WAVESIZE 0xFE000FFF +#define R_00B900_COMPUTE_USER_DATA_0 0x00B900 +#define R_028000_DB_RENDER_CONTROL 0x028000 +#define S_028000_DEPTH_CLEAR_ENABLE(x) (((x) & 0x1) << 0) +#define G_028000_DEPTH_CLEAR_ENABLE(x) (((x) >> 0) & 0x1) +#define C_028000_DEPTH_CLEAR_ENABLE 0xFFFFFFFE +#define S_028000_STENCIL_CLEAR_ENABLE(x) (((x) & 0x1) << 1) +#define G_028000_STENCIL_CLEAR_ENABLE(x) (((x) >> 1) & 0x1) +#define C_028000_STENCIL_CLEAR_ENABLE 0xFFFFFFFD +#define S_028000_DEPTH_COPY(x) (((x) & 0x1) << 2) +#define G_028000_DEPTH_COPY(x) (((x) >> 2) & 0x1) +#define C_028000_DEPTH_COPY 0xFFFFFFFB +#define S_028000_STENCIL_COPY(x) (((x) & 0x1) << 3) +#define G_028000_STENCIL_COPY(x) (((x) >> 3) & 0x1) +#define C_028000_STENCIL_COPY 0xFFFFFFF7 +#define S_028000_RESUMMARIZE_ENABLE(x) (((x) & 0x1) << 4) +#define G_028000_RESUMMARIZE_ENABLE(x) (((x) >> 4) & 0x1) +#define C_028000_RESUMMARIZE_ENABLE 0xFFFFFFEF +#define S_028000_STENCIL_COMPRESS_DISABLE(x) (((x) & 0x1) << 5) +#define G_028000_STENCIL_COMPRESS_DISABLE(x) (((x) >> 5) & 0x1) +#define C_028000_STENCIL_COMPRESS_DISABLE 0xFFFFFFDF +#define S_028000_DEPTH_COMPRESS_DISABLE(x) (((x) & 0x1) << 6) +#define G_028000_DEPTH_COMPRESS_DISABLE(x) (((x) >> 6) & 0x1) +#define C_028000_DEPTH_COMPRESS_DISABLE 0xFFFFFFBF +#define 
S_028000_COPY_CENTROID(x) (((x) & 0x1) << 7) +#define G_028000_COPY_CENTROID(x) (((x) >> 7) & 0x1) +#define C_028000_COPY_CENTROID 0xFFFFFF7F +#define S_028000_COPY_SAMPLE(x) (((x) & 0x0F) << 8) +#define G_028000_COPY_SAMPLE(x) (((x) >> 8) & 0x0F) +#define C_028000_COPY_SAMPLE 0xFFFFF0FF +#define R_028004_DB_COUNT_CONTROL 0x028004 +#define S_028004_ZPASS_INCREMENT_DISABLE(x) (((x) & 0x1) << 0) +#define G_028004_ZPASS_INCREMENT_DISABLE(x) (((x) >> 0) & 0x1) +#define C_028004_ZPASS_INCREMENT_DISABLE 0xFFFFFFFE +#define S_028004_PERFECT_ZPASS_COUNTS(x) (((x) & 0x1) << 1) +#define G_028004_PERFECT_ZPASS_COUNTS(x) (((x) >> 1) & 0x1) +#define C_028004_PERFECT_ZPASS_COUNTS 0xFFFFFFFD +#define S_028004_SAMPLE_RATE(x) (((x) & 0x07) << 4) +#define G_028004_SAMPLE_RATE(x) (((x) >> 4) & 0x07) +#define C_028004_SAMPLE_RATE 0xFFFFFF8F +#define R_028008_DB_DEPTH_VIEW 0x028008 +#define S_028008_SLICE_START(x) (((x) & 0x7FF) << 0) +#define G_028008_SLICE_START(x) (((x) >> 0) & 0x7FF) +#define C_028008_SLICE_START 0xFFFFF800 +#define S_028008_SLICE_MAX(x) (((x) & 0x7FF) << 13) +#define G_028008_SLICE_MAX(x) (((x) >> 13) & 0x7FF) +#define C_028008_SLICE_MAX 0xFF001FFF +#define S_028008_Z_READ_ONLY(x) (((x) & 0x1) << 24) +#define G_028008_Z_READ_ONLY(x) (((x) >> 24) & 0x1) +#define C_028008_Z_READ_ONLY 0xFEFFFFFF +#define S_028008_STENCIL_READ_ONLY(x) (((x) & 0x1) << 25) +#define G_028008_STENCIL_READ_ONLY(x) (((x) >> 25) & 0x1) +#define C_028008_STENCIL_READ_ONLY 0xFDFFFFFF +#define R_02800C_DB_RENDER_OVERRIDE 0x02800C +#define S_02800C_FORCE_HIZ_ENABLE(x) (((x) & 0x03) << 0) +#define G_02800C_FORCE_HIZ_ENABLE(x) (((x) >> 0) & 0x03) +#define C_02800C_FORCE_HIZ_ENABLE 0xFFFFFFFC +#define V_02800C_FORCE_OFF 0x00 +#define V_02800C_FORCE_ENABLE 0x01 +#define V_02800C_FORCE_DISABLE 0x02 +#define V_02800C_FORCE_RESERVED 0x03 +#define S_02800C_FORCE_HIS_ENABLE0(x) (((x) & 0x03) << 2) +#define G_02800C_FORCE_HIS_ENABLE0(x) (((x) >> 2) & 0x03) +#define C_02800C_FORCE_HIS_ENABLE0 0xFFFFFFF3 
+#define V_02800C_FORCE_OFF 0x00 +#define V_02800C_FORCE_ENABLE 0x01 +#define V_02800C_FORCE_DISABLE 0x02 +#define V_02800C_FORCE_RESERVED 0x03 +#define S_02800C_FORCE_HIS_ENABLE1(x) (((x) & 0x03) << 4) +#define G_02800C_FORCE_HIS_ENABLE1(x) (((x) >> 4) & 0x03) +#define C_02800C_FORCE_HIS_ENABLE1 0xFFFFFFCF +#define V_02800C_FORCE_OFF 0x00 +#define V_02800C_FORCE_ENABLE 0x01 +#define V_02800C_FORCE_DISABLE 0x02 +#define V_02800C_FORCE_RESERVED 0x03 +#define S_02800C_FORCE_SHADER_Z_ORDER(x) (((x) & 0x1) << 6) +#define G_02800C_FORCE_SHADER_Z_ORDER(x) (((x) >> 6) & 0x1) +#define C_02800C_FORCE_SHADER_Z_ORDER 0xFFFFFFBF +#define S_02800C_FAST_Z_DISABLE(x) (((x) & 0x1) << 7) +#define G_02800C_FAST_Z_DISABLE(x) (((x) >> 7) & 0x1) +#define C_02800C_FAST_Z_DISABLE 0xFFFFFF7F +#define S_02800C_FAST_STENCIL_DISABLE(x) (((x) & 0x1) << 8) +#define G_02800C_FAST_STENCIL_DISABLE(x) (((x) >> 8) & 0x1) +#define C_02800C_FAST_STENCIL_DISABLE 0xFFFFFEFF +#define S_02800C_NOOP_CULL_DISABLE(x) (((x) & 0x1) << 9) +#define G_02800C_NOOP_CULL_DISABLE(x) (((x) >> 9) & 0x1) +#define C_02800C_NOOP_CULL_DISABLE 0xFFFFFDFF +#define S_02800C_FORCE_COLOR_KILL(x) (((x) & 0x1) << 10) +#define G_02800C_FORCE_COLOR_KILL(x) (((x) >> 10) & 0x1) +#define C_02800C_FORCE_COLOR_KILL 0xFFFFFBFF +#define S_02800C_FORCE_Z_READ(x) (((x) & 0x1) << 11) +#define G_02800C_FORCE_Z_READ(x) (((x) >> 11) & 0x1) +#define C_02800C_FORCE_Z_READ 0xFFFFF7FF +#define S_02800C_FORCE_STENCIL_READ(x) (((x) & 0x1) << 12) +#define G_02800C_FORCE_STENCIL_READ(x) (((x) >> 12) & 0x1) +#define C_02800C_FORCE_STENCIL_READ 0xFFFFEFFF +#define S_02800C_FORCE_FULL_Z_RANGE(x) (((x) & 0x03) << 13) +#define G_02800C_FORCE_FULL_Z_RANGE(x) (((x) >> 13) & 0x03) +#define C_02800C_FORCE_FULL_Z_RANGE 0xFFFF9FFF +#define V_02800C_FORCE_OFF 0x00 +#define V_02800C_FORCE_ENABLE 0x01 +#define V_02800C_FORCE_DISABLE 0x02 +#define V_02800C_FORCE_RESERVED 0x03 +#define S_02800C_FORCE_QC_SMASK_CONFLICT(x) (((x) & 0x1) << 15) +#define 
G_02800C_FORCE_QC_SMASK_CONFLICT(x) (((x) >> 15) & 0x1) +#define C_02800C_FORCE_QC_SMASK_CONFLICT 0xFFFF7FFF +#define S_02800C_DISABLE_VIEWPORT_CLAMP(x) (((x) & 0x1) << 16) +#define G_02800C_DISABLE_VIEWPORT_CLAMP(x) (((x) >> 16) & 0x1) +#define C_02800C_DISABLE_VIEWPORT_CLAMP 0xFFFEFFFF +#define S_02800C_IGNORE_SC_ZRANGE(x) (((x) & 0x1) << 17) +#define G_02800C_IGNORE_SC_ZRANGE(x) (((x) >> 17) & 0x1) +#define C_02800C_IGNORE_SC_ZRANGE 0xFFFDFFFF +#define S_02800C_DISABLE_FULLY_COVERED(x) (((x) & 0x1) << 18) +#define G_02800C_DISABLE_FULLY_COVERED(x) (((x) >> 18) & 0x1) +#define C_02800C_DISABLE_FULLY_COVERED 0xFFFBFFFF +#define S_02800C_FORCE_Z_LIMIT_SUMM(x) (((x) & 0x03) << 19) +#define G_02800C_FORCE_Z_LIMIT_SUMM(x) (((x) >> 19) & 0x03) +#define C_02800C_FORCE_Z_LIMIT_SUMM 0xFFE7FFFF +#define V_02800C_FORCE_SUMM_OFF 0x00 +#define V_02800C_FORCE_SUMM_MINZ 0x01 +#define V_02800C_FORCE_SUMM_MAXZ 0x02 +#define V_02800C_FORCE_SUMM_BOTH 0x03 +#define S_02800C_MAX_TILES_IN_DTT(x) (((x) & 0x1F) << 21) +#define G_02800C_MAX_TILES_IN_DTT(x) (((x) >> 21) & 0x1F) +#define C_02800C_MAX_TILES_IN_DTT 0xFC1FFFFF +#define S_02800C_DISABLE_TILE_RATE_TILES(x) (((x) & 0x1) << 26) +#define G_02800C_DISABLE_TILE_RATE_TILES(x) (((x) >> 26) & 0x1) +#define C_02800C_DISABLE_TILE_RATE_TILES 0xFBFFFFFF +#define S_02800C_FORCE_Z_DIRTY(x) (((x) & 0x1) << 27) +#define G_02800C_FORCE_Z_DIRTY(x) (((x) >> 27) & 0x1) +#define C_02800C_FORCE_Z_DIRTY 0xF7FFFFFF +#define S_02800C_FORCE_STENCIL_DIRTY(x) (((x) & 0x1) << 28) +#define G_02800C_FORCE_STENCIL_DIRTY(x) (((x) >> 28) & 0x1) +#define C_02800C_FORCE_STENCIL_DIRTY 0xEFFFFFFF +#define S_02800C_FORCE_Z_VALID(x) (((x) & 0x1) << 29) +#define G_02800C_FORCE_Z_VALID(x) (((x) >> 29) & 0x1) +#define C_02800C_FORCE_Z_VALID 0xDFFFFFFF +#define S_02800C_FORCE_STENCIL_VALID(x) (((x) & 0x1) << 30) +#define G_02800C_FORCE_STENCIL_VALID(x) (((x) >> 30) & 0x1) +#define C_02800C_FORCE_STENCIL_VALID 0xBFFFFFFF +#define S_02800C_PRESERVE_COMPRESSION(x) (((x) & 
0x1) << 31) +#define G_02800C_PRESERVE_COMPRESSION(x) (((x) >> 31) & 0x1) +#define C_02800C_PRESERVE_COMPRESSION 0x7FFFFFFF +#define R_028010_DB_RENDER_OVERRIDE2 0x028010 +#define S_028010_PARTIAL_SQUAD_LAUNCH_CONTROL(x) (((x) & 0x03) << 0) +#define G_028010_PARTIAL_SQUAD_LAUNCH_CONTROL(x) (((x) >> 0) & 0x03) +#define C_028010_PARTIAL_SQUAD_LAUNCH_CONTROL 0xFFFFFFFC +#define V_028010_PSLC_AUTO 0x00 +#define V_028010_PSLC_ON_HANG_ONLY 0x01 +#define V_028010_PSLC_ASAP 0x02 +#define V_028010_PSLC_COUNTDOWN 0x03 +#define S_028010_PARTIAL_SQUAD_LAUNCH_COUNTDOWN(x) (((x) & 0x07) << 2) +#define G_028010_PARTIAL_SQUAD_LAUNCH_COUNTDOWN(x) (((x) >> 2) & 0x07) +#define C_028010_PARTIAL_SQUAD_LAUNCH_COUNTDOWN 0xFFFFFFE3 +#define S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATIO(x) (((x) & 0x1) << 5) +#define G_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATIO(x) (((x) >> 5) & 0x1) +#define C_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATIO 0xFFFFFFDF +#define S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(x) (((x) & 0x1) << 6) +#define G_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(x) (((x) >> 6) & 0x1) +#define C_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION 0xFFFFFFBF +#define S_028010_DISABLE_COLOR_ON_VALIDATION(x) (((x) & 0x1) << 7) +#define G_028010_DISABLE_COLOR_ON_VALIDATION(x) (((x) >> 7) & 0x1) +#define C_028010_DISABLE_COLOR_ON_VALIDATION 0xFFFFFF7F +#define S_028010_DECOMPRESS_Z_ON_FLUSH(x) (((x) & 0x1) << 8) +#define G_028010_DECOMPRESS_Z_ON_FLUSH(x) (((x) >> 8) & 0x1) +#define C_028010_DECOMPRESS_Z_ON_FLUSH 0xFFFFFEFF +#define S_028010_DISABLE_REG_SNOOP(x) (((x) & 0x1) << 9) +#define G_028010_DISABLE_REG_SNOOP(x) (((x) >> 9) & 0x1) +#define C_028010_DISABLE_REG_SNOOP 0xFFFFFDFF +#define S_028010_DEPTH_BOUNDS_HIER_DEPTH_DISABLE(x) (((x) & 0x1) << 10) +#define G_028010_DEPTH_BOUNDS_HIER_DEPTH_DISABLE(x) (((x) >> 10) & 0x1) +#define C_028010_DEPTH_BOUNDS_HIER_DEPTH_DISABLE 0xFFFFFBFF +#define R_028014_DB_HTILE_DATA_BASE 0x028014 +#define R_028020_DB_DEPTH_BOUNDS_MIN 0x028020 +#define 
R_028024_DB_DEPTH_BOUNDS_MAX 0x028024 +#define R_028028_DB_STENCIL_CLEAR 0x028028 +#define S_028028_CLEAR(x) (((x) & 0xFF) << 0) +#define G_028028_CLEAR(x) (((x) >> 0) & 0xFF) +#define C_028028_CLEAR 0xFFFFFF00 +#define R_02802C_DB_DEPTH_CLEAR 0x02802C +#define R_028030_PA_SC_SCREEN_SCISSOR_TL 0x028030 +#define S_028030_TL_X(x) (((x) & 0xFFFF) << 0) +#define G_028030_TL_X(x) (((x) >> 0) & 0xFFFF) +#define C_028030_TL_X 0xFFFF0000 +#define S_028030_TL_Y(x) (((x) & 0xFFFF) << 16) +#define G_028030_TL_Y(x) (((x) >> 16) & 0xFFFF) +#define C_028030_TL_Y 0x0000FFFF +#define R_028034_PA_SC_SCREEN_SCISSOR_BR 0x028034 +#define S_028034_BR_X(x) (((x) & 0xFFFF) << 0) +#define G_028034_BR_X(x) (((x) >> 0) & 0xFFFF) +#define C_028034_BR_X 0xFFFF0000 +#define S_028034_BR_Y(x) (((x) & 0xFFFF) << 16) +#define G_028034_BR_Y(x) (((x) >> 16) & 0xFFFF) +#define C_028034_BR_Y 0x0000FFFF +#define R_02803C_DB_DEPTH_INFO 0x02803C +#define S_02803C_ADDR5_SWIZZLE_MASK(x) (((x) & 0x0F) << 0) +#define G_02803C_ADDR5_SWIZZLE_MASK(x) (((x) >> 0) & 0x0F) +#define C_02803C_ADDR5_SWIZZLE_MASK 0xFFFFFFF0 +#define R_028040_DB_Z_INFO 0x028040 +#define S_028040_FORMAT(x) (((x) & 0x03) << 0) +#define G_028040_FORMAT(x) (((x) >> 0) & 0x03) +#define C_028040_FORMAT 0xFFFFFFFC +#define V_028040_Z_INVALID 0x00 +#define V_028040_Z_16 0x01 +#define V_028040_Z_24 0x02 /* deprecated */ +#define V_028040_Z_32_FLOAT 0x03 +#define S_028040_NUM_SAMPLES(x) (((x) & 0x03) << 2) +#define G_028040_NUM_SAMPLES(x) (((x) >> 2) & 0x03) +#define C_028040_NUM_SAMPLES 0xFFFFFFF3 +#define S_028040_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) +#define G_028040_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) +#define C_028040_TILE_MODE_INDEX 0xFF8FFFFF +#define S_028040_ALLOW_EXPCLEAR(x) (((x) & 0x1) << 27) +#define G_028040_ALLOW_EXPCLEAR(x) (((x) >> 27) & 0x1) +#define C_028040_ALLOW_EXPCLEAR 0xF7FFFFFF +#define S_028040_READ_SIZE(x) (((x) & 0x1) << 28) +#define G_028040_READ_SIZE(x) (((x) >> 28) & 0x1) +#define C_028040_READ_SIZE 
0xEFFFFFFF +#define S_028040_TILE_SURFACE_ENABLE(x) (((x) & 0x1) << 29) +#define G_028040_TILE_SURFACE_ENABLE(x) (((x) >> 29) & 0x1) +#define C_028040_TILE_SURFACE_ENABLE 0xDFFFFFFF +#define S_028040_ZRANGE_PRECISION(x) (((x) & 0x1) << 31) +#define G_028040_ZRANGE_PRECISION(x) (((x) >> 31) & 0x1) +#define C_028040_ZRANGE_PRECISION 0x7FFFFFFF +#define R_028044_DB_STENCIL_INFO 0x028044 +#define S_028044_FORMAT(x) (((x) & 0x1) << 0) +#define G_028044_FORMAT(x) (((x) >> 0) & 0x1) +#define C_028044_FORMAT 0xFFFFFFFE +#define S_028044_TILE_MODE_INDEX(x) (((x) & 0x07) << 20) +#define G_028044_TILE_MODE_INDEX(x) (((x) >> 20) & 0x07) +#define C_028044_TILE_MODE_INDEX 0xFF8FFFFF +#define S_028044_ALLOW_EXPCLEAR(x) (((x) & 0x1) << 27) +#define G_028044_ALLOW_EXPCLEAR(x) (((x) >> 27) & 0x1) +#define C_028044_ALLOW_EXPCLEAR 0xF7FFFFFF +#define S_028044_TILE_STENCIL_DISABLE(x) (((x) & 0x1) << 29) +#define G_028044_TILE_STENCIL_DISABLE(x) (((x) >> 29) & 0x1) +#define C_028044_TILE_STENCIL_DISABLE 0xDFFFFFFF +#define R_028048_DB_Z_READ_BASE 0x028048 +#define R_02804C_DB_STENCIL_READ_BASE 0x02804C +#define R_028050_DB_Z_WRITE_BASE 0x028050 +#define R_028054_DB_STENCIL_WRITE_BASE 0x028054 +#define R_028058_DB_DEPTH_SIZE 0x028058 +#define S_028058_PITCH_TILE_MAX(x) (((x) & 0x7FF) << 0) +#define G_028058_PITCH_TILE_MAX(x) (((x) >> 0) & 0x7FF) +#define C_028058_PITCH_TILE_MAX 0xFFFFF800 +#define S_028058_HEIGHT_TILE_MAX(x) (((x) & 0x7FF) << 11) +#define G_028058_HEIGHT_TILE_MAX(x) (((x) >> 11) & 0x7FF) +#define C_028058_HEIGHT_TILE_MAX 0xFFC007FF +#define R_02805C_DB_DEPTH_SLICE 0x02805C +#define S_02805C_SLICE_TILE_MAX(x) (((x) & 0x3FFFFF) << 0) +#define G_02805C_SLICE_TILE_MAX(x) (((x) >> 0) & 0x3FFFFF) +#define C_02805C_SLICE_TILE_MAX 0xFFC00000 +#define R_028080_TA_BC_BASE_ADDR 0x028080 +#define R_028200_PA_SC_WINDOW_OFFSET 0x028200 +#define S_028200_WINDOW_X_OFFSET(x) (((x) & 0xFFFF) << 0) +#define G_028200_WINDOW_X_OFFSET(x) (((x) >> 0) & 0xFFFF) +#define C_028200_WINDOW_X_OFFSET 
0xFFFF0000 +#define S_028200_WINDOW_Y_OFFSET(x) (((x) & 0xFFFF) << 16) +#define G_028200_WINDOW_Y_OFFSET(x) (((x) >> 16) & 0xFFFF) +#define C_028200_WINDOW_Y_OFFSET 0x0000FFFF +#define R_028204_PA_SC_WINDOW_SCISSOR_TL 0x028204 +#define S_028204_TL_X(x) (((x) & 0x7FFF) << 0) +#define G_028204_TL_X(x) (((x) >> 0) & 0x7FFF) +#define C_028204_TL_X 0xFFFF8000 +#define S_028204_TL_Y(x) (((x) & 0x7FFF) << 16) +#define G_028204_TL_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028204_TL_Y 0x8000FFFF +#define S_028204_WINDOW_OFFSET_DISABLE(x) (((x) & 0x1) << 31) +#define G_028204_WINDOW_OFFSET_DISABLE(x) (((x) >> 31) & 0x1) +#define C_028204_WINDOW_OFFSET_DISABLE 0x7FFFFFFF +#define R_028208_PA_SC_WINDOW_SCISSOR_BR 0x028208 +#define S_028208_BR_X(x) (((x) & 0x7FFF) << 0) +#define G_028208_BR_X(x) (((x) >> 0) & 0x7FFF) +#define C_028208_BR_X 0xFFFF8000 +#define S_028208_BR_Y(x) (((x) & 0x7FFF) << 16) +#define G_028208_BR_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028208_BR_Y 0x8000FFFF +#define R_02820C_PA_SC_CLIPRECT_RULE 0x02820C +#define S_02820C_CLIP_RULE(x) (((x) & 0xFFFF) << 0) +#define G_02820C_CLIP_RULE(x) (((x) >> 0) & 0xFFFF) +#define C_02820C_CLIP_RULE 0xFFFF0000 +#define R_028210_PA_SC_CLIPRECT_0_TL 0x028210 +#define S_028210_TL_X(x) (((x) & 0x7FFF) << 0) +#define G_028210_TL_X(x) (((x) >> 0) & 0x7FFF) +#define C_028210_TL_X 0xFFFF8000 +#define S_028210_TL_Y(x) (((x) & 0x7FFF) << 16) +#define G_028210_TL_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028210_TL_Y 0x8000FFFF +#define R_028214_PA_SC_CLIPRECT_0_BR 0x028214 +#define S_028214_BR_X(x) (((x) & 0x7FFF) << 0) +#define G_028214_BR_X(x) (((x) >> 0) & 0x7FFF) +#define C_028214_BR_X 0xFFFF8000 +#define S_028214_BR_Y(x) (((x) & 0x7FFF) << 16) +#define G_028214_BR_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028214_BR_Y 0x8000FFFF +#define R_028218_PA_SC_CLIPRECT_1_TL 0x028218 +#define R_02821C_PA_SC_CLIPRECT_1_BR 0x02821C +#define R_028220_PA_SC_CLIPRECT_2_TL 0x028220 +#define R_028224_PA_SC_CLIPRECT_2_BR 0x028224 +#define 
R_028228_PA_SC_CLIPRECT_3_TL 0x028228 +#define R_02822C_PA_SC_CLIPRECT_3_BR 0x02822C +#define R_028230_PA_SC_EDGERULE 0x028230 +#define S_028230_ER_TRI(x) (((x) & 0x0F) << 0) +#define G_028230_ER_TRI(x) (((x) >> 0) & 0x0F) +#define C_028230_ER_TRI 0xFFFFFFF0 +#define S_028230_ER_POINT(x) (((x) & 0x0F) << 4) +#define G_028230_ER_POINT(x) (((x) >> 4) & 0x0F) +#define C_028230_ER_POINT 0xFFFFFF0F +#define S_028230_ER_RECT(x) (((x) & 0x0F) << 8) +#define G_028230_ER_RECT(x) (((x) >> 8) & 0x0F) +#define C_028230_ER_RECT 0xFFFFF0FF +#define S_028230_ER_LINE_LR(x) (((x) & 0x3F) << 12) +#define G_028230_ER_LINE_LR(x) (((x) >> 12) & 0x3F) +#define C_028230_ER_LINE_LR 0xFFFC0FFF +#define S_028230_ER_LINE_RL(x) (((x) & 0x3F) << 18) +#define G_028230_ER_LINE_RL(x) (((x) >> 18) & 0x3F) +#define C_028230_ER_LINE_RL 0xFF03FFFF +#define S_028230_ER_LINE_TB(x) (((x) & 0x0F) << 24) +#define G_028230_ER_LINE_TB(x) (((x) >> 24) & 0x0F) +#define C_028230_ER_LINE_TB 0xF0FFFFFF +#define S_028230_ER_LINE_BT(x) (((x) & 0x0F) << 28) +#define G_028230_ER_LINE_BT(x) (((x) >> 28) & 0x0F) +#define C_028230_ER_LINE_BT 0x0FFFFFFF +#define R_028234_PA_SU_HARDWARE_SCREEN_OFFSET 0x028234 +#define S_028234_HW_SCREEN_OFFSET_X(x) (((x) & 0x1FF) << 0) +#define G_028234_HW_SCREEN_OFFSET_X(x) (((x) >> 0) & 0x1FF) +#define C_028234_HW_SCREEN_OFFSET_X 0xFFFFFE00 +#define S_028234_HW_SCREEN_OFFSET_Y(x) (((x) & 0x1FF) << 16) +#define G_028234_HW_SCREEN_OFFSET_Y(x) (((x) >> 16) & 0x1FF) +#define C_028234_HW_SCREEN_OFFSET_Y 0xFE00FFFF +#define R_028238_CB_TARGET_MASK 0x028238 +#define S_028238_TARGET0_ENABLE(x) (((x) & 0x0F) << 0) +#define G_028238_TARGET0_ENABLE(x) (((x) >> 0) & 0x0F) +#define C_028238_TARGET0_ENABLE 0xFFFFFFF0 +#define S_028238_TARGET1_ENABLE(x) (((x) & 0x0F) << 4) +#define G_028238_TARGET1_ENABLE(x) (((x) >> 4) & 0x0F) +#define C_028238_TARGET1_ENABLE 0xFFFFFF0F +#define S_028238_TARGET2_ENABLE(x) (((x) & 0x0F) << 8) +#define G_028238_TARGET2_ENABLE(x) (((x) >> 8) & 0x0F) +#define 
C_028238_TARGET2_ENABLE 0xFFFFF0FF +#define S_028238_TARGET3_ENABLE(x) (((x) & 0x0F) << 12) +#define G_028238_TARGET3_ENABLE(x) (((x) >> 12) & 0x0F) +#define C_028238_TARGET3_ENABLE 0xFFFF0FFF +#define S_028238_TARGET4_ENABLE(x) (((x) & 0x0F) << 16) +#define G_028238_TARGET4_ENABLE(x) (((x) >> 16) & 0x0F) +#define C_028238_TARGET4_ENABLE 0xFFF0FFFF +#define S_028238_TARGET5_ENABLE(x) (((x) & 0x0F) << 20) +#define G_028238_TARGET5_ENABLE(x) (((x) >> 20) & 0x0F) +#define C_028238_TARGET5_ENABLE 0xFF0FFFFF +#define S_028238_TARGET6_ENABLE(x) (((x) & 0x0F) << 24) +#define G_028238_TARGET6_ENABLE(x) (((x) >> 24) & 0x0F) +#define C_028238_TARGET6_ENABLE 0xF0FFFFFF +#define S_028238_TARGET7_ENABLE(x) (((x) & 0x0F) << 28) +#define G_028238_TARGET7_ENABLE(x) (((x) >> 28) & 0x0F) +#define C_028238_TARGET7_ENABLE 0x0FFFFFFF +#define R_02823C_CB_SHADER_MASK 0x02823C +#define S_02823C_OUTPUT0_ENABLE(x) (((x) & 0x0F) << 0) +#define G_02823C_OUTPUT0_ENABLE(x) (((x) >> 0) & 0x0F) +#define C_02823C_OUTPUT0_ENABLE 0xFFFFFFF0 +#define S_02823C_OUTPUT1_ENABLE(x) (((x) & 0x0F) << 4) +#define G_02823C_OUTPUT1_ENABLE(x) (((x) >> 4) & 0x0F) +#define C_02823C_OUTPUT1_ENABLE 0xFFFFFF0F +#define S_02823C_OUTPUT2_ENABLE(x) (((x) & 0x0F) << 8) +#define G_02823C_OUTPUT2_ENABLE(x) (((x) >> 8) & 0x0F) +#define C_02823C_OUTPUT2_ENABLE 0xFFFFF0FF +#define S_02823C_OUTPUT3_ENABLE(x) (((x) & 0x0F) << 12) +#define G_02823C_OUTPUT3_ENABLE(x) (((x) >> 12) & 0x0F) +#define C_02823C_OUTPUT3_ENABLE 0xFFFF0FFF +#define S_02823C_OUTPUT4_ENABLE(x) (((x) & 0x0F) << 16) +#define G_02823C_OUTPUT4_ENABLE(x) (((x) >> 16) & 0x0F) +#define C_02823C_OUTPUT4_ENABLE 0xFFF0FFFF +#define S_02823C_OUTPUT5_ENABLE(x) (((x) & 0x0F) << 20) +#define G_02823C_OUTPUT5_ENABLE(x) (((x) >> 20) & 0x0F) +#define C_02823C_OUTPUT5_ENABLE 0xFF0FFFFF +#define S_02823C_OUTPUT6_ENABLE(x) (((x) & 0x0F) << 24) +#define G_02823C_OUTPUT6_ENABLE(x) (((x) >> 24) & 0x0F) +#define C_02823C_OUTPUT6_ENABLE 0xF0FFFFFF +#define 
S_02823C_OUTPUT7_ENABLE(x) (((x) & 0x0F) << 28) +#define G_02823C_OUTPUT7_ENABLE(x) (((x) >> 28) & 0x0F) +#define C_02823C_OUTPUT7_ENABLE 0x0FFFFFFF +#define R_028240_PA_SC_GENERIC_SCISSOR_TL 0x028240 +#define S_028240_TL_X(x) (((x) & 0x7FFF) << 0) +#define G_028240_TL_X(x) (((x) >> 0) & 0x7FFF) +#define C_028240_TL_X 0xFFFF8000 +#define S_028240_TL_Y(x) (((x) & 0x7FFF) << 16) +#define G_028240_TL_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028240_TL_Y 0x8000FFFF +#define S_028240_WINDOW_OFFSET_DISABLE(x) (((x) & 0x1) << 31) +#define G_028240_WINDOW_OFFSET_DISABLE(x) (((x) >> 31) & 0x1) +#define C_028240_WINDOW_OFFSET_DISABLE 0x7FFFFFFF +#define R_028244_PA_SC_GENERIC_SCISSOR_BR 0x028244 +#define S_028244_BR_X(x) (((x) & 0x7FFF) << 0) +#define G_028244_BR_X(x) (((x) >> 0) & 0x7FFF) +#define C_028244_BR_X 0xFFFF8000 +#define S_028244_BR_Y(x) (((x) & 0x7FFF) << 16) +#define G_028244_BR_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028244_BR_Y 0x8000FFFF +#define R_028250_PA_SC_VPORT_SCISSOR_0_TL 0x028250 +#define S_028250_TL_X(x) (((x) & 0x7FFF) << 0) +#define G_028250_TL_X(x) (((x) >> 0) & 0x7FFF) +#define C_028250_TL_X 0xFFFF8000 +#define S_028250_TL_Y(x) (((x) & 0x7FFF) << 16) +#define G_028250_TL_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028250_TL_Y 0x8000FFFF +#define S_028250_WINDOW_OFFSET_DISABLE(x) (((x) & 0x1) << 31) +#define G_028250_WINDOW_OFFSET_DISABLE(x) (((x) >> 31) & 0x1) +#define C_028250_WINDOW_OFFSET_DISABLE 0x7FFFFFFF +#define R_028254_PA_SC_VPORT_SCISSOR_0_BR 0x028254 +#define S_028254_BR_X(x) (((x) & 0x7FFF) << 0) +#define G_028254_BR_X(x) (((x) >> 0) & 0x7FFF) +#define C_028254_BR_X 0xFFFF8000 +#define S_028254_BR_Y(x) (((x) & 0x7FFF) << 16) +#define G_028254_BR_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028254_BR_Y 0x8000FFFF +#define R_0282D0_PA_SC_VPORT_ZMIN_0 0x0282D0 +#define R_0282D4_PA_SC_VPORT_ZMAX_0 0x0282D4 +#define R_028350_PA_SC_RASTER_CONFIG 0x028350 +#define S_028350_RB_MAP_PKR0(x) (((x) & 0x03) << 0) +#define G_028350_RB_MAP_PKR0(x) (((x) >> 0) & 
0x03) +#define C_028350_RB_MAP_PKR0 0xFFFFFFFC +#define V_028350_RASTER_CONFIG_RB_MAP_0 0x00 +#define V_028350_RASTER_CONFIG_RB_MAP_1 0x01 +#define V_028350_RASTER_CONFIG_RB_MAP_2 0x02 +#define V_028350_RASTER_CONFIG_RB_MAP_3 0x03 +#define S_028350_RB_MAP_PKR1(x) (((x) & 0x03) << 2) +#define G_028350_RB_MAP_PKR1(x) (((x) >> 2) & 0x03) +#define C_028350_RB_MAP_PKR1 0xFFFFFFF3 +#define V_028350_RASTER_CONFIG_RB_MAP_0 0x00 +#define V_028350_RASTER_CONFIG_RB_MAP_1 0x01 +#define V_028350_RASTER_CONFIG_RB_MAP_2 0x02 +#define V_028350_RASTER_CONFIG_RB_MAP_3 0x03 +#define S_028350_RB_XSEL2(x) (((x) & 0x03) << 4) +#define G_028350_RB_XSEL2(x) (((x) >> 4) & 0x03) +#define C_028350_RB_XSEL2 0xFFFFFFCF +#define V_028350_RASTER_CONFIG_RB_XSEL2_0 0x00 +#define V_028350_RASTER_CONFIG_RB_XSEL2_1 0x01 +#define V_028350_RASTER_CONFIG_RB_XSEL2_2 0x02 +#define V_028350_RASTER_CONFIG_RB_XSEL2_3 0x03 +#define S_028350_RB_XSEL(x) (((x) & 0x1) << 6) +#define G_028350_RB_XSEL(x) (((x) >> 6) & 0x1) +#define C_028350_RB_XSEL 0xFFFFFFBF +#define S_028350_RB_YSEL(x) (((x) & 0x1) << 7) +#define G_028350_RB_YSEL(x) (((x) >> 7) & 0x1) +#define C_028350_RB_YSEL 0xFFFFFF7F +#define S_028350_PKR_MAP(x) (((x) & 0x03) << 8) +#define G_028350_PKR_MAP(x) (((x) >> 8) & 0x03) +#define C_028350_PKR_MAP 0xFFFFFCFF +#define V_028350_RASTER_CONFIG_PKR_MAP_0 0x00 +#define V_028350_RASTER_CONFIG_PKR_MAP_1 0x01 +#define V_028350_RASTER_CONFIG_PKR_MAP_2 0x02 +#define V_028350_RASTER_CONFIG_PKR_MAP_3 0x03 +#define S_028350_PKR_XSEL(x) (((x) & 0x03) << 10) +#define G_028350_PKR_XSEL(x) (((x) >> 10) & 0x03) +#define C_028350_PKR_XSEL 0xFFFFF3FF +#define V_028350_RASTER_CONFIG_PKR_XSEL_0 0x00 +#define V_028350_RASTER_CONFIG_PKR_XSEL_1 0x01 +#define V_028350_RASTER_CONFIG_PKR_XSEL_2 0x02 +#define V_028350_RASTER_CONFIG_PKR_XSEL_3 0x03 +#define S_028350_PKR_YSEL(x) (((x) & 0x03) << 12) +#define G_028350_PKR_YSEL(x) (((x) >> 12) & 0x03) +#define C_028350_PKR_YSEL 0xFFFFCFFF +#define V_028350_RASTER_CONFIG_PKR_YSEL_0 
0x00 +#define V_028350_RASTER_CONFIG_PKR_YSEL_1 0x01 +#define V_028350_RASTER_CONFIG_PKR_YSEL_2 0x02 +#define V_028350_RASTER_CONFIG_PKR_YSEL_3 0x03 +#define S_028350_SC_MAP(x) (((x) & 0x03) << 16) +#define G_028350_SC_MAP(x) (((x) >> 16) & 0x03) +#define C_028350_SC_MAP 0xFFFCFFFF +#define V_028350_RASTER_CONFIG_SC_MAP_0 0x00 +#define V_028350_RASTER_CONFIG_SC_MAP_1 0x01 +#define V_028350_RASTER_CONFIG_SC_MAP_2 0x02 +#define V_028350_RASTER_CONFIG_SC_MAP_3 0x03 +#define S_028350_SC_XSEL(x) (((x) & 0x03) << 18) +#define G_028350_SC_XSEL(x) (((x) >> 18) & 0x03) +#define C_028350_SC_XSEL 0xFFF3FFFF +#define V_028350_RASTER_CONFIG_SC_XSEL_8_WIDE_TILE 0x00 +#define V_028350_RASTER_CONFIG_SC_XSEL_16_WIDE_TILE 0x01 +#define V_028350_RASTER_CONFIG_SC_XSEL_32_WIDE_TILE 0x02 +#define V_028350_RASTER_CONFIG_SC_XSEL_64_WIDE_TILE 0x03 +#define S_028350_SC_YSEL(x) (((x) & 0x03) << 20) +#define G_028350_SC_YSEL(x) (((x) >> 20) & 0x03) +#define C_028350_SC_YSEL 0xFFCFFFFF +#define V_028350_RASTER_CONFIG_SC_YSEL_8_WIDE_TILE 0x00 +#define V_028350_RASTER_CONFIG_SC_YSEL_16_WIDE_TILE 0x01 +#define V_028350_RASTER_CONFIG_SC_YSEL_32_WIDE_TILE 0x02 +#define V_028350_RASTER_CONFIG_SC_YSEL_64_WIDE_TILE 0x03 +#define S_028350_SE_MAP(x) (((x) & 0x03) << 24) +#define G_028350_SE_MAP(x) (((x) >> 24) & 0x03) +#define C_028350_SE_MAP 0xFCFFFFFF +#define V_028350_RASTER_CONFIG_SE_MAP_0 0x00 +#define V_028350_RASTER_CONFIG_SE_MAP_1 0x01 +#define V_028350_RASTER_CONFIG_SE_MAP_2 0x02 +#define V_028350_RASTER_CONFIG_SE_MAP_3 0x03 +#define S_028350_SE_XSEL(x) (((x) & 0x03) << 26) +#define G_028350_SE_XSEL(x) (((x) >> 26) & 0x03) +#define C_028350_SE_XSEL 0xF3FFFFFF +#define V_028350_RASTER_CONFIG_SE_XSEL_8_WIDE_TILE 0x00 +#define V_028350_RASTER_CONFIG_SE_XSEL_16_WIDE_TILE 0x01 +#define V_028350_RASTER_CONFIG_SE_XSEL_32_WIDE_TILE 0x02 +#define V_028350_RASTER_CONFIG_SE_XSEL_64_WIDE_TILE 0x03 +#define S_028350_SE_YSEL(x) (((x) & 0x03) << 28) +#define G_028350_SE_YSEL(x) (((x) >> 28) & 0x03) +#define 
C_028350_SE_YSEL 0xCFFFFFFF +#define V_028350_RASTER_CONFIG_SE_YSEL_8_WIDE_TILE 0x00 +#define V_028350_RASTER_CONFIG_SE_YSEL_16_WIDE_TILE 0x01 +#define V_028350_RASTER_CONFIG_SE_YSEL_32_WIDE_TILE 0x02 +#define V_028350_RASTER_CONFIG_SE_YSEL_64_WIDE_TILE 0x03 +#define R_028400_VGT_MAX_VTX_INDX 0x028400 +#define R_028404_VGT_MIN_VTX_INDX 0x028404 +#define R_028408_VGT_INDX_OFFSET 0x028408 +#define R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX 0x02840C +#define R_028414_CB_BLEND_RED 0x028414 +#define R_028418_CB_BLEND_GREEN 0x028418 +#define R_02841C_CB_BLEND_BLUE 0x02841C +#define R_028420_CB_BLEND_ALPHA 0x028420 +#define R_02842C_DB_STENCIL_CONTROL 0x02842C +#define S_02842C_STENCILFAIL(x) (((x) & 0x0F) << 0) +#define G_02842C_STENCILFAIL(x) (((x) >> 0) & 0x0F) +#define C_02842C_STENCILFAIL 0xFFFFFFF0 +#define V_02842C_STENCIL_KEEP 0x00 +#define V_02842C_STENCIL_ZERO 0x01 +#define V_02842C_STENCIL_ONES 0x02 +#define V_02842C_STENCIL_REPLACE_TEST 0x03 +#define V_02842C_STENCIL_REPLACE_OP 0x04 +#define V_02842C_STENCIL_ADD_CLAMP 0x05 +#define V_02842C_STENCIL_SUB_CLAMP 0x06 +#define V_02842C_STENCIL_INVERT 0x07 +#define V_02842C_STENCIL_ADD_WRAP 0x08 +#define V_02842C_STENCIL_SUB_WRAP 0x09 +#define V_02842C_STENCIL_AND 0x0A +#define V_02842C_STENCIL_OR 0x0B +#define V_02842C_STENCIL_XOR 0x0C +#define V_02842C_STENCIL_NAND 0x0D +#define V_02842C_STENCIL_NOR 0x0E +#define V_02842C_STENCIL_XNOR 0x0F +#define S_02842C_STENCILZPASS(x) (((x) & 0x0F) << 4) +#define G_02842C_STENCILZPASS(x) (((x) >> 4) & 0x0F) +#define C_02842C_STENCILZPASS 0xFFFFFF0F +#define V_02842C_STENCIL_KEEP 0x00 +#define V_02842C_STENCIL_ZERO 0x01 +#define V_02842C_STENCIL_ONES 0x02 +#define V_02842C_STENCIL_REPLACE_TEST 0x03 +#define V_02842C_STENCIL_REPLACE_OP 0x04 +#define V_02842C_STENCIL_ADD_CLAMP 0x05 +#define V_02842C_STENCIL_SUB_CLAMP 0x06 +#define V_02842C_STENCIL_INVERT 0x07 +#define V_02842C_STENCIL_ADD_WRAP 0x08 +#define V_02842C_STENCIL_SUB_WRAP 0x09 +#define V_02842C_STENCIL_AND 0x0A +#define 
V_02842C_STENCIL_OR 0x0B +#define V_02842C_STENCIL_XOR 0x0C +#define V_02842C_STENCIL_NAND 0x0D +#define V_02842C_STENCIL_NOR 0x0E +#define V_02842C_STENCIL_XNOR 0x0F +#define S_02842C_STENCILZFAIL(x) (((x) & 0x0F) << 8) +#define G_02842C_STENCILZFAIL(x) (((x) >> 8) & 0x0F) +#define C_02842C_STENCILZFAIL 0xFFFFF0FF +#define V_02842C_STENCIL_KEEP 0x00 +#define V_02842C_STENCIL_ZERO 0x01 +#define V_02842C_STENCIL_ONES 0x02 +#define V_02842C_STENCIL_REPLACE_TEST 0x03 +#define V_02842C_STENCIL_REPLACE_OP 0x04 +#define V_02842C_STENCIL_ADD_CLAMP 0x05 +#define V_02842C_STENCIL_SUB_CLAMP 0x06 +#define V_02842C_STENCIL_INVERT 0x07 +#define V_02842C_STENCIL_ADD_WRAP 0x08 +#define V_02842C_STENCIL_SUB_WRAP 0x09 +#define V_02842C_STENCIL_AND 0x0A +#define V_02842C_STENCIL_OR 0x0B +#define V_02842C_STENCIL_XOR 0x0C +#define V_02842C_STENCIL_NAND 0x0D +#define V_02842C_STENCIL_NOR 0x0E +#define V_02842C_STENCIL_XNOR 0x0F +#define S_02842C_STENCILFAIL_BF(x) (((x) & 0x0F) << 12) +#define G_02842C_STENCILFAIL_BF(x) (((x) >> 12) & 0x0F) +#define C_02842C_STENCILFAIL_BF 0xFFFF0FFF +#define V_02842C_STENCIL_KEEP 0x00 +#define V_02842C_STENCIL_ZERO 0x01 +#define V_02842C_STENCIL_ONES 0x02 +#define V_02842C_STENCIL_REPLACE_TEST 0x03 +#define V_02842C_STENCIL_REPLACE_OP 0x04 +#define V_02842C_STENCIL_ADD_CLAMP 0x05 +#define V_02842C_STENCIL_SUB_CLAMP 0x06 +#define V_02842C_STENCIL_INVERT 0x07 +#define V_02842C_STENCIL_ADD_WRAP 0x08 +#define V_02842C_STENCIL_SUB_WRAP 0x09 +#define V_02842C_STENCIL_AND 0x0A +#define V_02842C_STENCIL_OR 0x0B +#define V_02842C_STENCIL_XOR 0x0C +#define V_02842C_STENCIL_NAND 0x0D +#define V_02842C_STENCIL_NOR 0x0E +#define V_02842C_STENCIL_XNOR 0x0F +#define S_02842C_STENCILZPASS_BF(x) (((x) & 0x0F) << 16) +#define G_02842C_STENCILZPASS_BF(x) (((x) >> 16) & 0x0F) +#define C_02842C_STENCILZPASS_BF 0xFFF0FFFF +#define V_02842C_STENCIL_KEEP 0x00 +#define V_02842C_STENCIL_ZERO 0x01 +#define V_02842C_STENCIL_ONES 0x02 +#define V_02842C_STENCIL_REPLACE_TEST 0x03 
+#define V_02842C_STENCIL_REPLACE_OP 0x04 +#define V_02842C_STENCIL_ADD_CLAMP 0x05 +#define V_02842C_STENCIL_SUB_CLAMP 0x06 +#define V_02842C_STENCIL_INVERT 0x07 +#define V_02842C_STENCIL_ADD_WRAP 0x08 +#define V_02842C_STENCIL_SUB_WRAP 0x09 +#define V_02842C_STENCIL_AND 0x0A +#define V_02842C_STENCIL_OR 0x0B +#define V_02842C_STENCIL_XOR 0x0C +#define V_02842C_STENCIL_NAND 0x0D +#define V_02842C_STENCIL_NOR 0x0E +#define V_02842C_STENCIL_XNOR 0x0F +#define S_02842C_STENCILZFAIL_BF(x) (((x) & 0x0F) << 20) +#define G_02842C_STENCILZFAIL_BF(x) (((x) >> 20) & 0x0F) +#define C_02842C_STENCILZFAIL_BF 0xFF0FFFFF +#define V_02842C_STENCIL_KEEP 0x00 +#define V_02842C_STENCIL_ZERO 0x01 +#define V_02842C_STENCIL_ONES 0x02 +#define V_02842C_STENCIL_REPLACE_TEST 0x03 +#define V_02842C_STENCIL_REPLACE_OP 0x04 +#define V_02842C_STENCIL_ADD_CLAMP 0x05 +#define V_02842C_STENCIL_SUB_CLAMP 0x06 +#define V_02842C_STENCIL_INVERT 0x07 +#define V_02842C_STENCIL_ADD_WRAP 0x08 +#define V_02842C_STENCIL_SUB_WRAP 0x09 +#define V_02842C_STENCIL_AND 0x0A +#define V_02842C_STENCIL_OR 0x0B +#define V_02842C_STENCIL_XOR 0x0C +#define V_02842C_STENCIL_NAND 0x0D +#define V_02842C_STENCIL_NOR 0x0E +#define V_02842C_STENCIL_XNOR 0x0F +#define R_028430_DB_STENCILREFMASK 0x028430 +#define S_028430_STENCILTESTVAL(x) (((x) & 0xFF) << 0) +#define G_028430_STENCILTESTVAL(x) (((x) >> 0) & 0xFF) +#define C_028430_STENCILTESTVAL 0xFFFFFF00 +#define S_028430_STENCILMASK(x) (((x) & 0xFF) << 8) +#define G_028430_STENCILMASK(x) (((x) >> 8) & 0xFF) +#define C_028430_STENCILMASK 0xFFFF00FF +#define S_028430_STENCILWRITEMASK(x) (((x) & 0xFF) << 16) +#define G_028430_STENCILWRITEMASK(x) (((x) >> 16) & 0xFF) +#define C_028430_STENCILWRITEMASK 0xFF00FFFF +#define S_028430_STENCILOPVAL(x) (((x) & 0xFF) << 24) +#define G_028430_STENCILOPVAL(x) (((x) >> 24) & 0xFF) +#define C_028430_STENCILOPVAL 0x00FFFFFF +#define R_028434_DB_STENCILREFMASK_BF 0x028434 +#define S_028434_STENCILTESTVAL_BF(x) (((x) & 0xFF) << 0) +#define 
G_028434_STENCILTESTVAL_BF(x) (((x) >> 0) & 0xFF) +#define C_028434_STENCILTESTVAL_BF 0xFFFFFF00 +#define S_028434_STENCILMASK_BF(x) (((x) & 0xFF) << 8) +#define G_028434_STENCILMASK_BF(x) (((x) >> 8) & 0xFF) +#define C_028434_STENCILMASK_BF 0xFFFF00FF +#define S_028434_STENCILWRITEMASK_BF(x) (((x) & 0xFF) << 16) +#define G_028434_STENCILWRITEMASK_BF(x) (((x) >> 16) & 0xFF) +#define C_028434_STENCILWRITEMASK_BF 0xFF00FFFF +#define S_028434_STENCILOPVAL_BF(x) (((x) & 0xFF) << 24) +#define G_028434_STENCILOPVAL_BF(x) (((x) >> 24) & 0xFF) +#define C_028434_STENCILOPVAL_BF 0x00FFFFFF +#define R_02843C_PA_CL_VPORT_XSCALE_0 0x02843C +#define R_028440_PA_CL_VPORT_XOFFSET_0 0x028440 +#define R_028444_PA_CL_VPORT_YSCALE_0 0x028444 +#define R_028448_PA_CL_VPORT_YOFFSET_0 0x028448 +#define R_02844C_PA_CL_VPORT_ZSCALE_0 0x02844C +#define R_028450_PA_CL_VPORT_ZOFFSET_0 0x028450 +#define R_0285BC_PA_CL_UCP_0_X 0x0285BC +#define R_0285C0_PA_CL_UCP_0_Y 0x0285C0 +#define R_0285C4_PA_CL_UCP_0_Z 0x0285C4 +#define R_0285C8_PA_CL_UCP_0_W 0x0285C8 +#define R_0285CC_PA_CL_UCP_1_X 0x0285CC +#define R_0285D0_PA_CL_UCP_1_Y 0x0285D0 +#define R_0285D4_PA_CL_UCP_1_Z 0x0285D4 +#define R_0285D8_PA_CL_UCP_1_W 0x0285D8 +#define R_0285DC_PA_CL_UCP_2_X 0x0285DC +#define R_0285E0_PA_CL_UCP_2_Y 0x0285E0 +#define R_0285E4_PA_CL_UCP_2_Z 0x0285E4 +#define R_0285E8_PA_CL_UCP_2_W 0x0285E8 +#define R_0285EC_PA_CL_UCP_3_X 0x0285EC +#define R_0285F0_PA_CL_UCP_3_Y 0x0285F0 +#define R_0285F4_PA_CL_UCP_3_Z 0x0285F4 +#define R_0285F8_PA_CL_UCP_3_W 0x0285F8 +#define R_0285FC_PA_CL_UCP_4_X 0x0285FC +#define R_028600_PA_CL_UCP_4_Y 0x028600 +#define R_028604_PA_CL_UCP_4_Z 0x028604 +#define R_028608_PA_CL_UCP_4_W 0x028608 +#define R_02860C_PA_CL_UCP_5_X 0x02860C +#define R_028610_PA_CL_UCP_5_Y 0x028610 +#define R_028614_PA_CL_UCP_5_Z 0x028614 +#define R_028618_PA_CL_UCP_5_W 0x028618 +#define R_028644_SPI_PS_INPUT_CNTL_0 0x028644 +#define S_028644_OFFSET(x) (((x) & 0x3F) << 0) +#define G_028644_OFFSET(x) (((x) >> 0) & 
0x3F) +#define C_028644_OFFSET 0xFFFFFFC0 +#define S_028644_DEFAULT_VAL(x) (((x) & 0x03) << 8) +#define G_028644_DEFAULT_VAL(x) (((x) >> 8) & 0x03) +#define C_028644_DEFAULT_VAL 0xFFFFFCFF +#define V_028644_X_0_0F 0x00 +#define S_028644_FLAT_SHADE(x) (((x) & 0x1) << 10) +#define G_028644_FLAT_SHADE(x) (((x) >> 10) & 0x1) +#define C_028644_FLAT_SHADE 0xFFFFFBFF +#define S_028644_CYL_WRAP(x) (((x) & 0x0F) << 13) +#define G_028644_CYL_WRAP(x) (((x) >> 13) & 0x0F) +#define C_028644_CYL_WRAP 0xFFFE1FFF +#define S_028644_PT_SPRITE_TEX(x) (((x) & 0x1) << 17) +#define G_028644_PT_SPRITE_TEX(x) (((x) >> 17) & 0x1) +#define C_028644_PT_SPRITE_TEX 0xFFFDFFFF +#define R_028648_SPI_PS_INPUT_CNTL_1 0x028648 +#define R_02864C_SPI_PS_INPUT_CNTL_2 0x02864C +#define R_028650_SPI_PS_INPUT_CNTL_3 0x028650 +#define R_028654_SPI_PS_INPUT_CNTL_4 0x028654 +#define R_028658_SPI_PS_INPUT_CNTL_5 0x028658 +#define R_02865C_SPI_PS_INPUT_CNTL_6 0x02865C +#define R_028660_SPI_PS_INPUT_CNTL_7 0x028660 +#define R_028664_SPI_PS_INPUT_CNTL_8 0x028664 +#define R_028668_SPI_PS_INPUT_CNTL_9 0x028668 +#define R_02866C_SPI_PS_INPUT_CNTL_10 0x02866C +#define R_028670_SPI_PS_INPUT_CNTL_11 0x028670 +#define R_028674_SPI_PS_INPUT_CNTL_12 0x028674 +#define R_028678_SPI_PS_INPUT_CNTL_13 0x028678 +#define R_02867C_SPI_PS_INPUT_CNTL_14 0x02867C +#define R_028680_SPI_PS_INPUT_CNTL_15 0x028680 +#define R_028684_SPI_PS_INPUT_CNTL_16 0x028684 +#define R_028688_SPI_PS_INPUT_CNTL_17 0x028688 +#define R_02868C_SPI_PS_INPUT_CNTL_18 0x02868C +#define R_028690_SPI_PS_INPUT_CNTL_19 0x028690 +#define R_028694_SPI_PS_INPUT_CNTL_20 0x028694 +#define R_028698_SPI_PS_INPUT_CNTL_21 0x028698 +#define R_02869C_SPI_PS_INPUT_CNTL_22 0x02869C +#define R_0286A0_SPI_PS_INPUT_CNTL_23 0x0286A0 +#define R_0286A4_SPI_PS_INPUT_CNTL_24 0x0286A4 +#define R_0286A8_SPI_PS_INPUT_CNTL_25 0x0286A8 +#define R_0286AC_SPI_PS_INPUT_CNTL_26 0x0286AC +#define R_0286B0_SPI_PS_INPUT_CNTL_27 0x0286B0 +#define R_0286B4_SPI_PS_INPUT_CNTL_28 0x0286B4 +#define 
R_0286B8_SPI_PS_INPUT_CNTL_29 0x0286B8 +#define R_0286BC_SPI_PS_INPUT_CNTL_30 0x0286BC +#define R_0286C0_SPI_PS_INPUT_CNTL_31 0x0286C0 +#define R_0286C4_SPI_VS_OUT_CONFIG 0x0286C4 +#define S_0286C4_VS_EXPORT_COUNT(x) (((x) & 0x1F) << 1) +#define G_0286C4_VS_EXPORT_COUNT(x) (((x) >> 1) & 0x1F) +#define C_0286C4_VS_EXPORT_COUNT 0xFFFFFFC1 +#define S_0286C4_VS_HALF_PACK(x) (((x) & 0x1) << 6) +#define G_0286C4_VS_HALF_PACK(x) (((x) >> 6) & 0x1) +#define C_0286C4_VS_HALF_PACK 0xFFFFFFBF +#define S_0286C4_VS_EXPORTS_FOG(x) (((x) & 0x1) << 7) +#define G_0286C4_VS_EXPORTS_FOG(x) (((x) >> 7) & 0x1) +#define C_0286C4_VS_EXPORTS_FOG 0xFFFFFF7F +#define S_0286C4_VS_OUT_FOG_VEC_ADDR(x) (((x) & 0x1F) << 8) +#define G_0286C4_VS_OUT_FOG_VEC_ADDR(x) (((x) >> 8) & 0x1F) +#define C_0286C4_VS_OUT_FOG_VEC_ADDR 0xFFFFE0FF +#define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC +#define S_0286CC_PERSP_SAMPLE_ENA(x) (((x) & 0x1) << 0) +#define G_0286CC_PERSP_SAMPLE_ENA(x) (((x) >> 0) & 0x1) +#define C_0286CC_PERSP_SAMPLE_ENA 0xFFFFFFFE +#define S_0286CC_PERSP_CENTER_ENA(x) (((x) & 0x1) << 1) +#define G_0286CC_PERSP_CENTER_ENA(x) (((x) >> 1) & 0x1) +#define C_0286CC_PERSP_CENTER_ENA 0xFFFFFFFD +#define S_0286CC_PERSP_CENTROID_ENA(x) (((x) & 0x1) << 2) +#define G_0286CC_PERSP_CENTROID_ENA(x) (((x) >> 2) & 0x1) +#define C_0286CC_PERSP_CENTROID_ENA 0xFFFFFFFB +#define S_0286CC_PERSP_PULL_MODEL_ENA(x) (((x) & 0x1) << 3) +#define G_0286CC_PERSP_PULL_MODEL_ENA(x) (((x) >> 3) & 0x1) +#define C_0286CC_PERSP_PULL_MODEL_ENA 0xFFFFFFF7 +#define S_0286CC_LINEAR_SAMPLE_ENA(x) (((x) & 0x1) << 4) +#define G_0286CC_LINEAR_SAMPLE_ENA(x) (((x) >> 4) & 0x1) +#define C_0286CC_LINEAR_SAMPLE_ENA 0xFFFFFFEF +#define S_0286CC_LINEAR_CENTER_ENA(x) (((x) & 0x1) << 5) +#define G_0286CC_LINEAR_CENTER_ENA(x) (((x) >> 5) & 0x1) +#define C_0286CC_LINEAR_CENTER_ENA 0xFFFFFFDF +#define S_0286CC_LINEAR_CENTROID_ENA(x) (((x) & 0x1) << 6) +#define G_0286CC_LINEAR_CENTROID_ENA(x) (((x) >> 6) & 0x1) +#define C_0286CC_LINEAR_CENTROID_ENA 
0xFFFFFFBF +#define S_0286CC_LINE_STIPPLE_TEX_ENA(x) (((x) & 0x1) << 7) +#define G_0286CC_LINE_STIPPLE_TEX_ENA(x) (((x) >> 7) & 0x1) +#define C_0286CC_LINE_STIPPLE_TEX_ENA 0xFFFFFF7F +#define S_0286CC_POS_X_FLOAT_ENA(x) (((x) & 0x1) << 8) +#define G_0286CC_POS_X_FLOAT_ENA(x) (((x) >> 8) & 0x1) +#define C_0286CC_POS_X_FLOAT_ENA 0xFFFFFEFF +#define S_0286CC_POS_Y_FLOAT_ENA(x) (((x) & 0x1) << 9) +#define G_0286CC_POS_Y_FLOAT_ENA(x) (((x) >> 9) & 0x1) +#define C_0286CC_POS_Y_FLOAT_ENA 0xFFFFFDFF +#define S_0286CC_POS_Z_FLOAT_ENA(x) (((x) & 0x1) << 10) +#define G_0286CC_POS_Z_FLOAT_ENA(x) (((x) >> 10) & 0x1) +#define C_0286CC_POS_Z_FLOAT_ENA 0xFFFFFBFF +#define S_0286CC_POS_W_FLOAT_ENA(x) (((x) & 0x1) << 11) +#define G_0286CC_POS_W_FLOAT_ENA(x) (((x) >> 11) & 0x1) +#define C_0286CC_POS_W_FLOAT_ENA 0xFFFFF7FF +#define S_0286CC_FRONT_FACE_ENA(x) (((x) & 0x1) << 12) +#define G_0286CC_FRONT_FACE_ENA(x) (((x) >> 12) & 0x1) +#define C_0286CC_FRONT_FACE_ENA 0xFFFFEFFF +#define S_0286CC_ANCILLARY_ENA(x) (((x) & 0x1) << 13) +#define G_0286CC_ANCILLARY_ENA(x) (((x) >> 13) & 0x1) +#define C_0286CC_ANCILLARY_ENA 0xFFFFDFFF +#define S_0286CC_SAMPLE_COVERAGE_ENA(x) (((x) & 0x1) << 14) +#define G_0286CC_SAMPLE_COVERAGE_ENA(x) (((x) >> 14) & 0x1) +#define C_0286CC_SAMPLE_COVERAGE_ENA 0xFFFFBFFF +#define S_0286CC_POS_FIXED_PT_ENA(x) (((x) & 0x1) << 15) +#define G_0286CC_POS_FIXED_PT_ENA(x) (((x) >> 15) & 0x1) +#define C_0286CC_POS_FIXED_PT_ENA 0xFFFF7FFF +#define R_0286D0_SPI_PS_INPUT_ADDR 0x0286D0 +#define S_0286D0_PERSP_SAMPLE_ENA(x) (((x) & 0x1) << 0) +#define G_0286D0_PERSP_SAMPLE_ENA(x) (((x) >> 0) & 0x1) +#define C_0286D0_PERSP_SAMPLE_ENA 0xFFFFFFFE +#define S_0286D0_PERSP_CENTER_ENA(x) (((x) & 0x1) << 1) +#define G_0286D0_PERSP_CENTER_ENA(x) (((x) >> 1) & 0x1) +#define C_0286D0_PERSP_CENTER_ENA 0xFFFFFFFD +#define S_0286D0_PERSP_CENTROID_ENA(x) (((x) & 0x1) << 2) +#define G_0286D0_PERSP_CENTROID_ENA(x) (((x) >> 2) & 0x1) +#define C_0286D0_PERSP_CENTROID_ENA 0xFFFFFFFB +#define 
S_0286D0_PERSP_PULL_MODEL_ENA(x) (((x) & 0x1) << 3) +#define G_0286D0_PERSP_PULL_MODEL_ENA(x) (((x) >> 3) & 0x1) +#define C_0286D0_PERSP_PULL_MODEL_ENA 0xFFFFFFF7 +#define S_0286D0_LINEAR_SAMPLE_ENA(x) (((x) & 0x1) << 4) +#define G_0286D0_LINEAR_SAMPLE_ENA(x) (((x) >> 4) & 0x1) +#define C_0286D0_LINEAR_SAMPLE_ENA 0xFFFFFFEF +#define S_0286D0_LINEAR_CENTER_ENA(x) (((x) & 0x1) << 5) +#define G_0286D0_LINEAR_CENTER_ENA(x) (((x) >> 5) & 0x1) +#define C_0286D0_LINEAR_CENTER_ENA 0xFFFFFFDF +#define S_0286D0_LINEAR_CENTROID_ENA(x) (((x) & 0x1) << 6) +#define G_0286D0_LINEAR_CENTROID_ENA(x) (((x) >> 6) & 0x1) +#define C_0286D0_LINEAR_CENTROID_ENA 0xFFFFFFBF +#define S_0286D0_LINE_STIPPLE_TEX_ENA(x) (((x) & 0x1) << 7) +#define G_0286D0_LINE_STIPPLE_TEX_ENA(x) (((x) >> 7) & 0x1) +#define C_0286D0_LINE_STIPPLE_TEX_ENA 0xFFFFFF7F +#define S_0286D0_POS_X_FLOAT_ENA(x) (((x) & 0x1) << 8) +#define G_0286D0_POS_X_FLOAT_ENA(x) (((x) >> 8) & 0x1) +#define C_0286D0_POS_X_FLOAT_ENA 0xFFFFFEFF +#define S_0286D0_POS_Y_FLOAT_ENA(x) (((x) & 0x1) << 9) +#define G_0286D0_POS_Y_FLOAT_ENA(x) (((x) >> 9) & 0x1) +#define C_0286D0_POS_Y_FLOAT_ENA 0xFFFFFDFF +#define S_0286D0_POS_Z_FLOAT_ENA(x) (((x) & 0x1) << 10) +#define G_0286D0_POS_Z_FLOAT_ENA(x) (((x) >> 10) & 0x1) +#define C_0286D0_POS_Z_FLOAT_ENA 0xFFFFFBFF +#define S_0286D0_POS_W_FLOAT_ENA(x) (((x) & 0x1) << 11) +#define G_0286D0_POS_W_FLOAT_ENA(x) (((x) >> 11) & 0x1) +#define C_0286D0_POS_W_FLOAT_ENA 0xFFFFF7FF +#define S_0286D0_FRONT_FACE_ENA(x) (((x) & 0x1) << 12) +#define G_0286D0_FRONT_FACE_ENA(x) (((x) >> 12) & 0x1) +#define C_0286D0_FRONT_FACE_ENA 0xFFFFEFFF +#define S_0286D0_ANCILLARY_ENA(x) (((x) & 0x1) << 13) +#define G_0286D0_ANCILLARY_ENA(x) (((x) >> 13) & 0x1) +#define C_0286D0_ANCILLARY_ENA 0xFFFFDFFF +#define S_0286D0_SAMPLE_COVERAGE_ENA(x) (((x) & 0x1) << 14) +#define G_0286D0_SAMPLE_COVERAGE_ENA(x) (((x) >> 14) & 0x1) +#define C_0286D0_SAMPLE_COVERAGE_ENA 0xFFFFBFFF +#define S_0286D0_POS_FIXED_PT_ENA(x) (((x) & 0x1) << 15) 
+#define G_0286D0_POS_FIXED_PT_ENA(x) (((x) >> 15) & 0x1) +#define C_0286D0_POS_FIXED_PT_ENA 0xFFFF7FFF +#define R_0286D4_SPI_INTERP_CONTROL_0 0x0286D4 +#define S_0286D4_FLAT_SHADE_ENA(x) (((x) & 0x1) << 0) +#define G_0286D4_FLAT_SHADE_ENA(x) (((x) >> 0) & 0x1) +#define C_0286D4_FLAT_SHADE_ENA 0xFFFFFFFE +#define S_0286D4_PNT_SPRITE_ENA(x) (((x) & 0x1) << 1) +#define G_0286D4_PNT_SPRITE_ENA(x) (((x) >> 1) & 0x1) +#define C_0286D4_PNT_SPRITE_ENA 0xFFFFFFFD +#define S_0286D4_PNT_SPRITE_OVRD_X(x) (((x) & 0x07) << 2) +#define G_0286D4_PNT_SPRITE_OVRD_X(x) (((x) >> 2) & 0x07) +#define C_0286D4_PNT_SPRITE_OVRD_X 0xFFFFFFE3 +#define V_0286D4_SPI_PNT_SPRITE_SEL_0 0x00 +#define V_0286D4_SPI_PNT_SPRITE_SEL_1 0x01 +#define V_0286D4_SPI_PNT_SPRITE_SEL_S 0x02 +#define V_0286D4_SPI_PNT_SPRITE_SEL_T 0x03 +#define V_0286D4_SPI_PNT_SPRITE_SEL_NONE 0x04 +#define S_0286D4_PNT_SPRITE_OVRD_Y(x) (((x) & 0x07) << 5) +#define G_0286D4_PNT_SPRITE_OVRD_Y(x) (((x) >> 5) & 0x07) +#define C_0286D4_PNT_SPRITE_OVRD_Y 0xFFFFFF1F +#define V_0286D4_SPI_PNT_SPRITE_SEL_0 0x00 +#define V_0286D4_SPI_PNT_SPRITE_SEL_1 0x01 +#define V_0286D4_SPI_PNT_SPRITE_SEL_S 0x02 +#define V_0286D4_SPI_PNT_SPRITE_SEL_T 0x03 +#define V_0286D4_SPI_PNT_SPRITE_SEL_NONE 0x04 +#define S_0286D4_PNT_SPRITE_OVRD_Z(x) (((x) & 0x07) << 8) +#define G_0286D4_PNT_SPRITE_OVRD_Z(x) (((x) >> 8) & 0x07) +#define C_0286D4_PNT_SPRITE_OVRD_Z 0xFFFFF8FF +#define V_0286D4_SPI_PNT_SPRITE_SEL_0 0x00 +#define V_0286D4_SPI_PNT_SPRITE_SEL_1 0x01 +#define V_0286D4_SPI_PNT_SPRITE_SEL_S 0x02 +#define V_0286D4_SPI_PNT_SPRITE_SEL_T 0x03 +#define V_0286D4_SPI_PNT_SPRITE_SEL_NONE 0x04 +#define S_0286D4_PNT_SPRITE_OVRD_W(x) (((x) & 0x07) << 11) +#define G_0286D4_PNT_SPRITE_OVRD_W(x) (((x) >> 11) & 0x07) +#define C_0286D4_PNT_SPRITE_OVRD_W 0xFFFFC7FF +#define V_0286D4_SPI_PNT_SPRITE_SEL_0 0x00 +#define V_0286D4_SPI_PNT_SPRITE_SEL_1 0x01 +#define V_0286D4_SPI_PNT_SPRITE_SEL_S 0x02 +#define V_0286D4_SPI_PNT_SPRITE_SEL_T 0x03 +#define 
V_0286D4_SPI_PNT_SPRITE_SEL_NONE 0x04 +#define S_0286D4_PNT_SPRITE_TOP_1(x) (((x) & 0x1) << 14) +#define G_0286D4_PNT_SPRITE_TOP_1(x) (((x) >> 14) & 0x1) +#define C_0286D4_PNT_SPRITE_TOP_1 0xFFFFBFFF +#define R_0286D8_SPI_PS_IN_CONTROL 0x0286D8 +#define S_0286D8_NUM_INTERP(x) (((x) & 0x3F) << 0) +#define G_0286D8_NUM_INTERP(x) (((x) >> 0) & 0x3F) +#define C_0286D8_NUM_INTERP 0xFFFFFFC0 +#define S_0286D8_PARAM_GEN(x) (((x) & 0x1) << 6) +#define G_0286D8_PARAM_GEN(x) (((x) >> 6) & 0x1) +#define C_0286D8_PARAM_GEN 0xFFFFFFBF +#define S_0286D8_FOG_ADDR(x) (((x) & 0x7F) << 7) +#define G_0286D8_FOG_ADDR(x) (((x) >> 7) & 0x7F) +#define C_0286D8_FOG_ADDR 0xFFFFC07F +#define S_0286D8_BC_OPTIMIZE_DISABLE(x) (((x) & 0x1) << 14) +#define G_0286D8_BC_OPTIMIZE_DISABLE(x) (((x) >> 14) & 0x1) +#define C_0286D8_BC_OPTIMIZE_DISABLE 0xFFFFBFFF +#define S_0286D8_PASS_FOG_THROUGH_PS(x) (((x) & 0x1) << 15) +#define G_0286D8_PASS_FOG_THROUGH_PS(x) (((x) >> 15) & 0x1) +#define C_0286D8_PASS_FOG_THROUGH_PS 0xFFFF7FFF +#define R_0286E0_SPI_BARYC_CNTL 0x0286E0 +#define S_0286E0_PERSP_CENTER_CNTL(x) (((x) & 0x1) << 0) +#define G_0286E0_PERSP_CENTER_CNTL(x) (((x) >> 0) & 0x1) +#define C_0286E0_PERSP_CENTER_CNTL 0xFFFFFFFE +#define S_0286E0_PERSP_CENTROID_CNTL(x) (((x) & 0x1) << 4) +#define G_0286E0_PERSP_CENTROID_CNTL(x) (((x) >> 4) & 0x1) +#define C_0286E0_PERSP_CENTROID_CNTL 0xFFFFFFEF +#define S_0286E0_LINEAR_CENTER_CNTL(x) (((x) & 0x1) << 8) +#define G_0286E0_LINEAR_CENTER_CNTL(x) (((x) >> 8) & 0x1) +#define C_0286E0_LINEAR_CENTER_CNTL 0xFFFFFEFF +#define S_0286E0_LINEAR_CENTROID_CNTL(x) (((x) & 0x1) << 12) +#define G_0286E0_LINEAR_CENTROID_CNTL(x) (((x) >> 12) & 0x1) +#define C_0286E0_LINEAR_CENTROID_CNTL 0xFFFFEFFF +#define S_0286E0_POS_FLOAT_LOCATION(x) (((x) & 0x03) << 16) +#define G_0286E0_POS_FLOAT_LOCATION(x) (((x) >> 16) & 0x03) +#define C_0286E0_POS_FLOAT_LOCATION 0xFFFCFFFF +#define V_0286E0_X_CALCULATE_PER_PIXEL_FLOATING_POINT_POSITION_AT 0x00 +#define S_0286E0_POS_FLOAT_ULC(x) 
(((x) & 0x1) << 20) +#define G_0286E0_POS_FLOAT_ULC(x) (((x) >> 20) & 0x1) +#define C_0286E0_POS_FLOAT_ULC 0xFFEFFFFF +#define S_0286E0_FRONT_FACE_ALL_BITS(x) (((x) & 0x1) << 24) +#define G_0286E0_FRONT_FACE_ALL_BITS(x) (((x) >> 24) & 0x1) +#define C_0286E0_FRONT_FACE_ALL_BITS 0xFEFFFFFF +#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8 +#define S_0286E8_WAVES(x) (((x) & 0xFFF) << 0) +#define G_0286E8_WAVES(x) (((x) >> 0) & 0xFFF) +#define C_0286E8_WAVES 0xFFFFF000 +#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12) +#define G_0286E8_WAVESIZE(x) (((x) >> 12) & 0x1FFF) +#define C_0286E8_WAVESIZE 0xFE000FFF +#define R_028704_SPI_WAVE_MGMT_1 0x028704 +#define S_028704_NUM_PS_WAVES(x) (((x) & 0x3F) << 0) +#define G_028704_NUM_PS_WAVES(x) (((x) >> 0) & 0x3F) +#define C_028704_NUM_PS_WAVES 0xFFFFFFC0 +#define S_028704_NUM_VS_WAVES(x) (((x) & 0x3F) << 6) +#define G_028704_NUM_VS_WAVES(x) (((x) >> 6) & 0x3F) +#define C_028704_NUM_VS_WAVES 0xFFFFF03F +#define S_028704_NUM_GS_WAVES(x) (((x) & 0x3F) << 12) +#define G_028704_NUM_GS_WAVES(x) (((x) >> 12) & 0x3F) +#define C_028704_NUM_GS_WAVES 0xFFFC0FFF +#define S_028704_NUM_ES_WAVES(x) (((x) & 0x3F) << 18) +#define G_028704_NUM_ES_WAVES(x) (((x) >> 18) & 0x3F) +#define C_028704_NUM_ES_WAVES 0xFF03FFFF +#define S_028704_NUM_HS_WAVES(x) (((x) & 0x3F) << 24) +#define G_028704_NUM_HS_WAVES(x) (((x) >> 24) & 0x3F) +#define C_028704_NUM_HS_WAVES 0xC0FFFFFF +#define R_028708_SPI_WAVE_MGMT_2 0x028708 +#define S_028708_NUM_LS_WAVES(x) (((x) & 0x3F) << 0) +#define G_028708_NUM_LS_WAVES(x) (((x) >> 0) & 0x3F) +#define C_028708_NUM_LS_WAVES 0xFFFFFFC0 +#define R_02870C_SPI_SHADER_POS_FORMAT 0x02870C +#define S_02870C_POS0_EXPORT_FORMAT(x) (((x) & 0x0F) << 0) +#define G_02870C_POS0_EXPORT_FORMAT(x) (((x) >> 0) & 0x0F) +#define C_02870C_POS0_EXPORT_FORMAT 0xFFFFFFF0 +#define V_02870C_SPI_SHADER_NONE 0x00 +#define V_02870C_SPI_SHADER_1COMP 0x01 +#define V_02870C_SPI_SHADER_2COMP 0x02 +#define V_02870C_SPI_SHADER_4COMPRESS 0x03 +#define 
V_02870C_SPI_SHADER_4COMP 0x04 +#define S_02870C_POS1_EXPORT_FORMAT(x) (((x) & 0x0F) << 4) +#define G_02870C_POS1_EXPORT_FORMAT(x) (((x) >> 4) & 0x0F) +#define C_02870C_POS1_EXPORT_FORMAT 0xFFFFFF0F +#define V_02870C_SPI_SHADER_NONE 0x00 +#define V_02870C_SPI_SHADER_1COMP 0x01 +#define V_02870C_SPI_SHADER_2COMP 0x02 +#define V_02870C_SPI_SHADER_4COMPRESS 0x03 +#define V_02870C_SPI_SHADER_4COMP 0x04 +#define S_02870C_POS2_EXPORT_FORMAT(x) (((x) & 0x0F) << 8) +#define G_02870C_POS2_EXPORT_FORMAT(x) (((x) >> 8) & 0x0F) +#define C_02870C_POS2_EXPORT_FORMAT 0xFFFFF0FF +#define V_02870C_SPI_SHADER_NONE 0x00 +#define V_02870C_SPI_SHADER_1COMP 0x01 +#define V_02870C_SPI_SHADER_2COMP 0x02 +#define V_02870C_SPI_SHADER_4COMPRESS 0x03 +#define V_02870C_SPI_SHADER_4COMP 0x04 +#define S_02870C_POS3_EXPORT_FORMAT(x) (((x) & 0x0F) << 12) +#define G_02870C_POS3_EXPORT_FORMAT(x) (((x) >> 12) & 0x0F) +#define C_02870C_POS3_EXPORT_FORMAT 0xFFFF0FFF +#define V_02870C_SPI_SHADER_NONE 0x00 +#define V_02870C_SPI_SHADER_1COMP 0x01 +#define V_02870C_SPI_SHADER_2COMP 0x02 +#define V_02870C_SPI_SHADER_4COMPRESS 0x03 +#define V_02870C_SPI_SHADER_4COMP 0x04 +#define R_028710_SPI_SHADER_Z_FORMAT 0x028710 +#define S_028710_Z_EXPORT_FORMAT(x) (((x) & 0x0F) << 0) +#define G_028710_Z_EXPORT_FORMAT(x) (((x) >> 0) & 0x0F) +#define C_028710_Z_EXPORT_FORMAT 0xFFFFFFF0 +#define V_028710_SPI_SHADER_ZERO 0x00 +#define V_028710_SPI_SHADER_32_R 0x01 +#define V_028710_SPI_SHADER_32_GR 0x02 +#define V_028710_SPI_SHADER_32_AR 0x03 +#define V_028710_SPI_SHADER_FP16_ABGR 0x04 +#define V_028710_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028710_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028710_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028710_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028710_SPI_SHADER_32_ABGR 0x09 +#define R_028714_SPI_SHADER_COL_FORMAT 0x028714 +#define S_028714_COL0_EXPORT_FORMAT(x) (((x) & 0x0F) << 0) +#define G_028714_COL0_EXPORT_FORMAT(x) (((x) >> 0) & 0x0F) +#define C_028714_COL0_EXPORT_FORMAT 0xFFFFFFF0 
+#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL1_EXPORT_FORMAT(x) (((x) & 0x0F) << 4) +#define G_028714_COL1_EXPORT_FORMAT(x) (((x) >> 4) & 0x0F) +#define C_028714_COL1_EXPORT_FORMAT 0xFFFFFF0F +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL2_EXPORT_FORMAT(x) (((x) & 0x0F) << 8) +#define G_028714_COL2_EXPORT_FORMAT(x) (((x) >> 8) & 0x0F) +#define C_028714_COL2_EXPORT_FORMAT 0xFFFFF0FF +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL3_EXPORT_FORMAT(x) (((x) & 0x0F) << 12) +#define G_028714_COL3_EXPORT_FORMAT(x) (((x) >> 12) & 0x0F) +#define C_028714_COL3_EXPORT_FORMAT 0xFFFF0FFF +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 
+#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL4_EXPORT_FORMAT(x) (((x) & 0x0F) << 16) +#define G_028714_COL4_EXPORT_FORMAT(x) (((x) >> 16) & 0x0F) +#define C_028714_COL4_EXPORT_FORMAT 0xFFF0FFFF +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL5_EXPORT_FORMAT(x) (((x) & 0x0F) << 20) +#define G_028714_COL5_EXPORT_FORMAT(x) (((x) >> 20) & 0x0F) +#define C_028714_COL5_EXPORT_FORMAT 0xFF0FFFFF +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL6_EXPORT_FORMAT(x) (((x) & 0x0F) << 24) +#define G_028714_COL6_EXPORT_FORMAT(x) (((x) >> 24) & 0x0F) +#define C_028714_COL6_EXPORT_FORMAT 0xF0FFFFFF +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define 
V_028714_SPI_SHADER_32_ABGR 0x09 +#define S_028714_COL7_EXPORT_FORMAT(x) (((x) & 0x0F) << 28) +#define G_028714_COL7_EXPORT_FORMAT(x) (((x) >> 28) & 0x0F) +#define C_028714_COL7_EXPORT_FORMAT 0x0FFFFFFF +#define V_028714_SPI_SHADER_ZERO 0x00 +#define V_028714_SPI_SHADER_32_R 0x01 +#define V_028714_SPI_SHADER_32_GR 0x02 +#define V_028714_SPI_SHADER_32_AR 0x03 +#define V_028714_SPI_SHADER_FP16_ABGR 0x04 +#define V_028714_SPI_SHADER_UNORM16_ABGR 0x05 +#define V_028714_SPI_SHADER_SNORM16_ABGR 0x06 +#define V_028714_SPI_SHADER_UINT16_ABGR 0x07 +#define V_028714_SPI_SHADER_SINT16_ABGR 0x08 +#define V_028714_SPI_SHADER_32_ABGR 0x09 +#define R_028780_CB_BLEND0_CONTROL 0x028780 +#define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0) +#define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F) +#define C_028780_COLOR_SRCBLEND 0xFFFFFFE0 +#define V_028780_BLEND_ZERO 0x00 +#define V_028780_BLEND_ONE 0x01 +#define V_028780_BLEND_SRC_COLOR 0x02 +#define V_028780_BLEND_ONE_MINUS_SRC_COLOR 0x03 +#define V_028780_BLEND_SRC_ALPHA 0x04 +#define V_028780_BLEND_ONE_MINUS_SRC_ALPHA 0x05 +#define V_028780_BLEND_DST_ALPHA 0x06 +#define V_028780_BLEND_ONE_MINUS_DST_ALPHA 0x07 +#define V_028780_BLEND_DST_COLOR 0x08 +#define V_028780_BLEND_ONE_MINUS_DST_COLOR 0x09 +#define V_028780_BLEND_SRC_ALPHA_SATURATE 0x0A +#define V_028780_BLEND_CONSTANT_COLOR 0x0D +#define V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR 0x0E +#define V_028780_BLEND_SRC1_COLOR 0x0F +#define V_028780_BLEND_INV_SRC1_COLOR 0x10 +#define V_028780_BLEND_SRC1_ALPHA 0x11 +#define V_028780_BLEND_INV_SRC1_ALPHA 0x12 +#define V_028780_BLEND_CONSTANT_ALPHA 0x13 +#define V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA 0x14 +#define S_028780_COLOR_COMB_FCN(x) (((x) & 0x07) << 5) +#define G_028780_COLOR_COMB_FCN(x) (((x) >> 5) & 0x07) +#define C_028780_COLOR_COMB_FCN 0xFFFFFF1F +#define V_028780_COMB_DST_PLUS_SRC 0x00 +#define V_028780_COMB_SRC_MINUS_DST 0x01 +#define V_028780_COMB_MIN_DST_SRC 0x02 +#define V_028780_COMB_MAX_DST_SRC 0x03 +#define 
V_028780_COMB_DST_MINUS_SRC 0x04 +#define S_028780_COLOR_DESTBLEND(x) (((x) & 0x1F) << 8) +#define G_028780_COLOR_DESTBLEND(x) (((x) >> 8) & 0x1F) +#define C_028780_COLOR_DESTBLEND 0xFFFFE0FF +#define V_028780_BLEND_ZERO 0x00 +#define V_028780_BLEND_ONE 0x01 +#define V_028780_BLEND_SRC_COLOR 0x02 +#define V_028780_BLEND_ONE_MINUS_SRC_COLOR 0x03 +#define V_028780_BLEND_SRC_ALPHA 0x04 +#define V_028780_BLEND_ONE_MINUS_SRC_ALPHA 0x05 +#define V_028780_BLEND_DST_ALPHA 0x06 +#define V_028780_BLEND_ONE_MINUS_DST_ALPHA 0x07 +#define V_028780_BLEND_DST_COLOR 0x08 +#define V_028780_BLEND_ONE_MINUS_DST_COLOR 0x09 +#define V_028780_BLEND_SRC_ALPHA_SATURATE 0x0A +#define V_028780_BLEND_CONSTANT_COLOR 0x0D +#define V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR 0x0E +#define V_028780_BLEND_SRC1_COLOR 0x0F +#define V_028780_BLEND_INV_SRC1_COLOR 0x10 +#define V_028780_BLEND_SRC1_ALPHA 0x11 +#define V_028780_BLEND_INV_SRC1_ALPHA 0x12 +#define V_028780_BLEND_CONSTANT_ALPHA 0x13 +#define V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA 0x14 +#define S_028780_ALPHA_SRCBLEND(x) (((x) & 0x1F) << 16) +#define G_028780_ALPHA_SRCBLEND(x) (((x) >> 16) & 0x1F) +#define C_028780_ALPHA_SRCBLEND 0xFFE0FFFF +#define V_028780_BLEND_ZERO 0x00 +#define V_028780_BLEND_ONE 0x01 +#define V_028780_BLEND_SRC_COLOR 0x02 +#define V_028780_BLEND_ONE_MINUS_SRC_COLOR 0x03 +#define V_028780_BLEND_SRC_ALPHA 0x04 +#define V_028780_BLEND_ONE_MINUS_SRC_ALPHA 0x05 +#define V_028780_BLEND_DST_ALPHA 0x06 +#define V_028780_BLEND_ONE_MINUS_DST_ALPHA 0x07 +#define V_028780_BLEND_DST_COLOR 0x08 +#define V_028780_BLEND_ONE_MINUS_DST_COLOR 0x09 +#define V_028780_BLEND_SRC_ALPHA_SATURATE 0x0A +#define V_028780_BLEND_CONSTANT_COLOR 0x0D +#define V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR 0x0E +#define V_028780_BLEND_SRC1_COLOR 0x0F +#define V_028780_BLEND_INV_SRC1_COLOR 0x10 +#define V_028780_BLEND_SRC1_ALPHA 0x11 +#define V_028780_BLEND_INV_SRC1_ALPHA 0x12 +#define V_028780_BLEND_CONSTANT_ALPHA 0x13 +#define 
V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA 0x14 +#define S_028780_ALPHA_COMB_FCN(x) (((x) & 0x07) << 21) +#define G_028780_ALPHA_COMB_FCN(x) (((x) >> 21) & 0x07) +#define C_028780_ALPHA_COMB_FCN 0xFF1FFFFF +#define V_028780_COMB_DST_PLUS_SRC 0x00 +#define V_028780_COMB_SRC_MINUS_DST 0x01 +#define V_028780_COMB_MIN_DST_SRC 0x02 +#define V_028780_COMB_MAX_DST_SRC 0x03 +#define V_028780_COMB_DST_MINUS_SRC 0x04 +#define S_028780_ALPHA_DESTBLEND(x) (((x) & 0x1F) << 24) +#define G_028780_ALPHA_DESTBLEND(x) (((x) >> 24) & 0x1F) +#define C_028780_ALPHA_DESTBLEND 0xE0FFFFFF +#define V_028780_BLEND_ZERO 0x00 +#define V_028780_BLEND_ONE 0x01 +#define V_028780_BLEND_SRC_COLOR 0x02 +#define V_028780_BLEND_ONE_MINUS_SRC_COLOR 0x03 +#define V_028780_BLEND_SRC_ALPHA 0x04 +#define V_028780_BLEND_ONE_MINUS_SRC_ALPHA 0x05 +#define V_028780_BLEND_DST_ALPHA 0x06 +#define V_028780_BLEND_ONE_MINUS_DST_ALPHA 0x07 +#define V_028780_BLEND_DST_COLOR 0x08 +#define V_028780_BLEND_ONE_MINUS_DST_COLOR 0x09 +#define V_028780_BLEND_SRC_ALPHA_SATURATE 0x0A +#define V_028780_BLEND_CONSTANT_COLOR 0x0D +#define V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR 0x0E +#define V_028780_BLEND_SRC1_COLOR 0x0F +#define V_028780_BLEND_INV_SRC1_COLOR 0x10 +#define V_028780_BLEND_SRC1_ALPHA 0x11 +#define V_028780_BLEND_INV_SRC1_ALPHA 0x12 +#define V_028780_BLEND_CONSTANT_ALPHA 0x13 +#define V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA 0x14 +#define S_028780_SEPARATE_ALPHA_BLEND(x) (((x) & 0x1) << 29) +#define G_028780_SEPARATE_ALPHA_BLEND(x) (((x) >> 29) & 0x1) +#define C_028780_SEPARATE_ALPHA_BLEND 0xDFFFFFFF +#define S_028780_ENABLE(x) (((x) & 0x1) << 30) +#define G_028780_ENABLE(x) (((x) >> 30) & 0x1) +#define C_028780_ENABLE 0xBFFFFFFF +#define S_028780_DISABLE_ROP3(x) (((x) & 0x1) << 31) +#define G_028780_DISABLE_ROP3(x) (((x) >> 31) & 0x1) +#define C_028780_DISABLE_ROP3 0x7FFFFFFF +#define R_028784_CB_BLEND1_CONTROL 0x028784 +#define R_028788_CB_BLEND2_CONTROL 0x028788 +#define R_02878C_CB_BLEND3_CONTROL 0x02878C +#define 
R_028790_CB_BLEND4_CONTROL 0x028790 +#define R_028794_CB_BLEND5_CONTROL 0x028794 +#define R_028798_CB_BLEND6_CONTROL 0x028798 +#define R_02879C_CB_BLEND7_CONTROL 0x02879C +#define R_0287D4_PA_CL_POINT_X_RAD 0x0287D4 +#define R_0287D8_PA_CL_POINT_Y_RAD 0x0287D8 +#define R_0287DC_PA_CL_POINT_SIZE 0x0287DC +#define R_0287E0_PA_CL_POINT_CULL_RAD 0x0287E0 +#define R_0287E4_VGT_DMA_BASE_HI 0x0287E4 +#define S_0287E4_BASE_ADDR(x) (((x) & 0xFF) << 0) +#define G_0287E4_BASE_ADDR(x) (((x) >> 0) & 0xFF) +#define C_0287E4_BASE_ADDR 0xFFFFFF00 +#define R_0287E8_VGT_DMA_BASE 0x0287E8 +#define R_0287F0_VGT_DRAW_INITIATOR 0x0287F0 +#define S_0287F0_SOURCE_SELECT(x) (((x) & 0x03) << 0) +#define G_0287F0_SOURCE_SELECT(x) (((x) >> 0) & 0x03) +#define C_0287F0_SOURCE_SELECT 0xFFFFFFFC +#define V_0287F0_DI_SRC_SEL_DMA 0x00 +#define V_0287F0_DI_SRC_SEL_IMMEDIATE 0x01 +#define V_0287F0_DI_SRC_SEL_AUTO_INDEX 0x02 +#define V_0287F0_DI_SRC_SEL_RESERVED 0x03 +#define S_0287F0_MAJOR_MODE(x) (((x) & 0x03) << 2) +#define G_0287F0_MAJOR_MODE(x) (((x) >> 2) & 0x03) +#define C_0287F0_MAJOR_MODE 0xFFFFFFF3 +#define V_0287F0_DI_MAJOR_MODE_0 0x00 +#define V_0287F0_DI_MAJOR_MODE_1 0x01 +#define S_0287F0_NOT_EOP(x) (((x) & 0x1) << 5) +#define G_0287F0_NOT_EOP(x) (((x) >> 5) & 0x1) +#define C_0287F0_NOT_EOP 0xFFFFFFDF +#define S_0287F0_USE_OPAQUE(x) (((x) & 0x1) << 6) +#define G_0287F0_USE_OPAQUE(x) (((x) >> 6) & 0x1) +#define C_0287F0_USE_OPAQUE 0xFFFFFFBF +#define R_0287F4_VGT_IMMED_DATA 0x0287F4 +#define R_028800_DB_DEPTH_CONTROL 0x028800 +#define S_028800_STENCIL_ENABLE(x) (((x) & 0x1) << 0) +#define G_028800_STENCIL_ENABLE(x) (((x) >> 0) & 0x1) +#define C_028800_STENCIL_ENABLE 0xFFFFFFFE +#define S_028800_Z_ENABLE(x) (((x) & 0x1) << 1) +#define G_028800_Z_ENABLE(x) (((x) >> 1) & 0x1) +#define C_028800_Z_ENABLE 0xFFFFFFFD +#define S_028800_Z_WRITE_ENABLE(x) (((x) & 0x1) << 2) +#define G_028800_Z_WRITE_ENABLE(x) (((x) >> 2) & 0x1) +#define C_028800_Z_WRITE_ENABLE 0xFFFFFFFB +#define 
S_028800_DEPTH_BOUNDS_ENABLE(x) (((x) & 0x1) << 3) +#define G_028800_DEPTH_BOUNDS_ENABLE(x) (((x) >> 3) & 0x1) +#define C_028800_DEPTH_BOUNDS_ENABLE 0xFFFFFFF7 +#define S_028800_ZFUNC(x) (((x) & 0x07) << 4) +#define G_028800_ZFUNC(x) (((x) >> 4) & 0x07) +#define C_028800_ZFUNC 0xFFFFFF8F +#define V_028800_FRAG_NEVER 0x00 +#define V_028800_FRAG_LESS 0x01 +#define V_028800_FRAG_EQUAL 0x02 +#define V_028800_FRAG_LEQUAL 0x03 +#define V_028800_FRAG_GREATER 0x04 +#define V_028800_FRAG_NOTEQUAL 0x05 +#define V_028800_FRAG_GEQUAL 0x06 +#define V_028800_FRAG_ALWAYS 0x07 +#define S_028800_BACKFACE_ENABLE(x) (((x) & 0x1) << 7) +#define G_028800_BACKFACE_ENABLE(x) (((x) >> 7) & 0x1) +#define C_028800_BACKFACE_ENABLE 0xFFFFFF7F +#define S_028800_STENCILFUNC(x) (((x) & 0x07) << 8) +#define G_028800_STENCILFUNC(x) (((x) >> 8) & 0x07) +#define C_028800_STENCILFUNC 0xFFFFF8FF +#define V_028800_REF_NEVER 0x00 +#define V_028800_REF_LESS 0x01 +#define V_028800_REF_EQUAL 0x02 +#define V_028800_REF_LEQUAL 0x03 +#define V_028800_REF_GREATER 0x04 +#define V_028800_REF_NOTEQUAL 0x05 +#define V_028800_REF_GEQUAL 0x06 +#define V_028800_REF_ALWAYS 0x07 +#define S_028800_STENCILFUNC_BF(x) (((x) & 0x07) << 20) +#define G_028800_STENCILFUNC_BF(x) (((x) >> 20) & 0x07) +#define C_028800_STENCILFUNC_BF 0xFF8FFFFF +#define V_028800_REF_NEVER 0x00 +#define V_028800_REF_LESS 0x01 +#define V_028800_REF_EQUAL 0x02 +#define V_028800_REF_LEQUAL 0x03 +#define V_028800_REF_GREATER 0x04 +#define V_028800_REF_NOTEQUAL 0x05 +#define V_028800_REF_GEQUAL 0x06 +#define V_028800_REF_ALWAYS 0x07 +#define S_028800_ENABLE_COLOR_WRITES_ON_DEPTH_FAIL(x) (((x) & 0x1) << 30) +#define G_028800_ENABLE_COLOR_WRITES_ON_DEPTH_FAIL(x) (((x) >> 30) & 0x1) +#define C_028800_ENABLE_COLOR_WRITES_ON_DEPTH_FAIL 0xBFFFFFFF +#define S_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x) (((x) & 0x1) << 31) +#define G_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS(x) (((x) >> 31) & 0x1) +#define C_028800_DISABLE_COLOR_WRITES_ON_DEPTH_PASS 
0x7FFFFFFF +#define R_028804_DB_EQAA 0x028804 +#define R_028808_CB_COLOR_CONTROL 0x028808 +#define S_028808_DEGAMMA_ENABLE(x) (((x) & 0x1) << 3) +#define G_028808_DEGAMMA_ENABLE(x) (((x) >> 3) & 0x1) +#define C_028808_DEGAMMA_ENABLE 0xFFFFFFF7 +#define S_028808_MODE(x) (((x) & 0x07) << 4) +#define G_028808_MODE(x) (((x) >> 4) & 0x07) +#define C_028808_MODE 0xFFFFFF8F +#define V_028808_CB_DISABLE 0x00 +#define V_028808_CB_NORMAL 0x01 +#define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02 +#define V_028808_CB_RESOLVE 0x03 +#define V_028808_CB_FMASK_DECOMPRESS 0x05 +#define S_028808_ROP3(x) (((x) & 0xFF) << 16) +#define G_028808_ROP3(x) (((x) >> 16) & 0xFF) +#define C_028808_ROP3 0xFF00FFFF +#define V_028808_X_0X00 0x00 +#define V_028808_X_0X05 0x05 +#define V_028808_X_0X0A 0x0A +#define V_028808_X_0X0F 0x0F +#define V_028808_X_0X11 0x11 +#define V_028808_X_0X22 0x22 +#define V_028808_X_0X33 0x33 +#define V_028808_X_0X44 0x44 +#define V_028808_X_0X50 0x50 +#define V_028808_X_0X55 0x55 +#define V_028808_X_0X5A 0x5A +#define V_028808_X_0X5F 0x5F +#define V_028808_X_0X66 0x66 +#define V_028808_X_0X77 0x77 +#define V_028808_X_0X88 0x88 +#define V_028808_X_0X99 0x99 +#define V_028808_X_0XA0 0xA0 +#define V_028808_X_0XA5 0xA5 +#define V_028808_X_0XAA 0xAA +#define V_028808_X_0XAF 0xAF +#define V_028808_X_0XBB 0xBB +#define V_028808_X_0XCC 0xCC +#define V_028808_X_0XDD 0xDD +#define V_028808_X_0XEE 0xEE +#define V_028808_X_0XF0 0xF0 +#define V_028808_X_0XF5 0xF5 +#define V_028808_X_0XFA 0xFA +#define V_028808_X_0XFF 0xFF +#define R_02880C_DB_SHADER_CONTROL 0x02880C +#define S_02880C_Z_EXPORT_ENABLE(x) (((x) & 0x1) << 0) +#define G_02880C_Z_EXPORT_ENABLE(x) (((x) >> 0) & 0x1) +#define C_02880C_Z_EXPORT_ENABLE 0xFFFFFFFE +#define S_02880C_STENCIL_TEST_VAL_EXPORT_ENAB(x) (((x) & 0x1) << 1) +#define G_02880C_STENCIL_TEST_VAL_EXPORT_ENAB(x) (((x) >> 1) & 0x1) +#define C_02880C_STENCIL_TEST_VAL_EXPORT_ENAB 0xFFFFFFFD +#define S_02880C_STENCIL_OP_VAL_EXPORT_ENABLE(x) (((x) & 0x1) << 2) 
+#define G_02880C_STENCIL_OP_VAL_EXPORT_ENABLE(x) (((x) >> 2) & 0x1) +#define C_02880C_STENCIL_OP_VAL_EXPORT_ENABLE 0xFFFFFFFB +#define S_02880C_Z_ORDER(x) (((x) & 0x03) << 4) +#define G_02880C_Z_ORDER(x) (((x) >> 4) & 0x03) +#define C_02880C_Z_ORDER 0xFFFFFFCF +#define V_02880C_LATE_Z 0x00 +#define V_02880C_EARLY_Z_THEN_LATE_Z 0x01 +#define V_02880C_RE_Z 0x02 +#define V_02880C_EARLY_Z_THEN_RE_Z 0x03 +#define S_02880C_KILL_ENABLE(x) (((x) & 0x1) << 6) +#define G_02880C_KILL_ENABLE(x) (((x) >> 6) & 0x1) +#define C_02880C_KILL_ENABLE 0xFFFFFFBF +#define S_02880C_COVERAGE_TO_MASK_ENABLE(x) (((x) & 0x1) << 7) +#define G_02880C_COVERAGE_TO_MASK_ENABLE(x) (((x) >> 7) & 0x1) +#define C_02880C_COVERAGE_TO_MASK_ENABLE 0xFFFFFF7F +#define S_02880C_MASK_EXPORT_ENABLE(x) (((x) & 0x1) << 8) +#define G_02880C_MASK_EXPORT_ENABLE(x) (((x) >> 8) & 0x1) +#define C_02880C_MASK_EXPORT_ENABLE 0xFFFFFEFF +#define S_02880C_EXEC_ON_HIER_FAIL(x) (((x) & 0x1) << 9) +#define G_02880C_EXEC_ON_HIER_FAIL(x) (((x) >> 9) & 0x1) +#define C_02880C_EXEC_ON_HIER_FAIL 0xFFFFFDFF +#define S_02880C_EXEC_ON_NOOP(x) (((x) & 0x1) << 10) +#define G_02880C_EXEC_ON_NOOP(x) (((x) >> 10) & 0x1) +#define C_02880C_EXEC_ON_NOOP 0xFFFFFBFF +#define S_02880C_ALPHA_TO_MASK_DISABLE(x) (((x) & 0x1) << 11) +#define G_02880C_ALPHA_TO_MASK_DISABLE(x) (((x) >> 11) & 0x1) +#define C_02880C_ALPHA_TO_MASK_DISABLE 0xFFFFF7FF +#define S_02880C_DEPTH_BEFORE_SHADER(x) (((x) & 0x1) << 12) +#define G_02880C_DEPTH_BEFORE_SHADER(x) (((x) >> 12) & 0x1) +#define C_02880C_DEPTH_BEFORE_SHADER 0xFFFFEFFF +#define R_028810_PA_CL_CLIP_CNTL 0x028810 +#define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0) +#define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1) +#define C_028810_UCP_ENA_0 0xFFFFFFFE +#define S_028810_UCP_ENA_1(x) (((x) & 0x1) << 1) +#define G_028810_UCP_ENA_1(x) (((x) >> 1) & 0x1) +#define C_028810_UCP_ENA_1 0xFFFFFFFD +#define S_028810_UCP_ENA_2(x) (((x) & 0x1) << 2) +#define G_028810_UCP_ENA_2(x) (((x) >> 2) & 0x1) +#define 
C_028810_UCP_ENA_2 0xFFFFFFFB +#define S_028810_UCP_ENA_3(x) (((x) & 0x1) << 3) +#define G_028810_UCP_ENA_3(x) (((x) >> 3) & 0x1) +#define C_028810_UCP_ENA_3 0xFFFFFFF7 +#define S_028810_UCP_ENA_4(x) (((x) & 0x1) << 4) +#define G_028810_UCP_ENA_4(x) (((x) >> 4) & 0x1) +#define C_028810_UCP_ENA_4 0xFFFFFFEF +#define S_028810_UCP_ENA_5(x) (((x) & 0x1) << 5) +#define G_028810_UCP_ENA_5(x) (((x) >> 5) & 0x1) +#define C_028810_UCP_ENA_5 0xFFFFFFDF +#define S_028810_PS_UCP_Y_SCALE_NEG(x) (((x) & 0x1) << 13) +#define G_028810_PS_UCP_Y_SCALE_NEG(x) (((x) >> 13) & 0x1) +#define C_028810_PS_UCP_Y_SCALE_NEG 0xFFFFDFFF +#define S_028810_PS_UCP_MODE(x) (((x) & 0x03) << 14) +#define G_028810_PS_UCP_MODE(x) (((x) >> 14) & 0x03) +#define C_028810_PS_UCP_MODE 0xFFFF3FFF +#define S_028810_CLIP_DISABLE(x) (((x) & 0x1) << 16) +#define G_028810_CLIP_DISABLE(x) (((x) >> 16) & 0x1) +#define C_028810_CLIP_DISABLE 0xFFFEFFFF +#define S_028810_UCP_CULL_ONLY_ENA(x) (((x) & 0x1) << 17) +#define G_028810_UCP_CULL_ONLY_ENA(x) (((x) >> 17) & 0x1) +#define C_028810_UCP_CULL_ONLY_ENA 0xFFFDFFFF +#define S_028810_BOUNDARY_EDGE_FLAG_ENA(x) (((x) & 0x1) << 18) +#define G_028810_BOUNDARY_EDGE_FLAG_ENA(x) (((x) >> 18) & 0x1) +#define C_028810_BOUNDARY_EDGE_FLAG_ENA 0xFFFBFFFF +#define S_028810_DX_CLIP_SPACE_DEF(x) (((x) & 0x1) << 19) +#define G_028810_DX_CLIP_SPACE_DEF(x) (((x) >> 19) & 0x1) +#define C_028810_DX_CLIP_SPACE_DEF 0xFFF7FFFF +#define S_028810_DIS_CLIP_ERR_DETECT(x) (((x) & 0x1) << 20) +#define G_028810_DIS_CLIP_ERR_DETECT(x) (((x) >> 20) & 0x1) +#define C_028810_DIS_CLIP_ERR_DETECT 0xFFEFFFFF +#define S_028810_VTX_KILL_OR(x) (((x) & 0x1) << 21) +#define G_028810_VTX_KILL_OR(x) (((x) >> 21) & 0x1) +#define C_028810_VTX_KILL_OR 0xFFDFFFFF +#define S_028810_DX_RASTERIZATION_KILL(x) (((x) & 0x1) << 22) +#define G_028810_DX_RASTERIZATION_KILL(x) (((x) >> 22) & 0x1) +#define C_028810_DX_RASTERIZATION_KILL 0xFFBFFFFF +#define S_028810_DX_LINEAR_ATTR_CLIP_ENA(x) (((x) & 0x1) << 24) +#define 
G_028810_DX_LINEAR_ATTR_CLIP_ENA(x) (((x) >> 24) & 0x1) +#define C_028810_DX_LINEAR_ATTR_CLIP_ENA 0xFEFFFFFF +#define S_028810_VTE_VPORT_PROVOKE_DISABLE(x) (((x) & 0x1) << 25) +#define G_028810_VTE_VPORT_PROVOKE_DISABLE(x) (((x) >> 25) & 0x1) +#define C_028810_VTE_VPORT_PROVOKE_DISABLE 0xFDFFFFFF +#define S_028810_ZCLIP_NEAR_DISABLE(x) (((x) & 0x1) << 26) +#define G_028810_ZCLIP_NEAR_DISABLE(x) (((x) >> 26) & 0x1) +#define C_028810_ZCLIP_NEAR_DISABLE 0xFBFFFFFF +#define S_028810_ZCLIP_FAR_DISABLE(x) (((x) & 0x1) << 27) +#define G_028810_ZCLIP_FAR_DISABLE(x) (((x) >> 27) & 0x1) +#define C_028810_ZCLIP_FAR_DISABLE 0xF7FFFFFF +#define R_028814_PA_SU_SC_MODE_CNTL 0x028814 +#define S_028814_CULL_FRONT(x) (((x) & 0x1) << 0) +#define G_028814_CULL_FRONT(x) (((x) >> 0) & 0x1) +#define C_028814_CULL_FRONT 0xFFFFFFFE +#define S_028814_CULL_BACK(x) (((x) & 0x1) << 1) +#define G_028814_CULL_BACK(x) (((x) >> 1) & 0x1) +#define C_028814_CULL_BACK 0xFFFFFFFD +#define S_028814_FACE(x) (((x) & 0x1) << 2) +#define G_028814_FACE(x) (((x) >> 2) & 0x1) +#define C_028814_FACE 0xFFFFFFFB +#define S_028814_POLY_MODE(x) (((x) & 0x03) << 3) +#define G_028814_POLY_MODE(x) (((x) >> 3) & 0x03) +#define C_028814_POLY_MODE 0xFFFFFFE7 +#define V_028814_X_DISABLE_POLY_MODE 0x00 +#define V_028814_X_DUAL_MODE 0x01 +#define S_028814_POLYMODE_FRONT_PTYPE(x) (((x) & 0x07) << 5) +#define G_028814_POLYMODE_FRONT_PTYPE(x) (((x) >> 5) & 0x07) +#define C_028814_POLYMODE_FRONT_PTYPE 0xFFFFFF1F +#define V_028814_X_DRAW_POINTS 0x00 +#define V_028814_X_DRAW_LINES 0x01 +#define V_028814_X_DRAW_TRIANGLES 0x02 +#define S_028814_POLYMODE_BACK_PTYPE(x) (((x) & 0x07) << 8) +#define G_028814_POLYMODE_BACK_PTYPE(x) (((x) >> 8) & 0x07) +#define C_028814_POLYMODE_BACK_PTYPE 0xFFFFF8FF +#define V_028814_X_DRAW_POINTS 0x00 +#define V_028814_X_DRAW_LINES 0x01 +#define V_028814_X_DRAW_TRIANGLES 0x02 +#define S_028814_POLY_OFFSET_FRONT_ENABLE(x) (((x) & 0x1) << 11) +#define G_028814_POLY_OFFSET_FRONT_ENABLE(x) (((x) >> 11) & 
0x1) +#define C_028814_POLY_OFFSET_FRONT_ENABLE 0xFFFFF7FF +#define S_028814_POLY_OFFSET_BACK_ENABLE(x) (((x) & 0x1) << 12) +#define G_028814_POLY_OFFSET_BACK_ENABLE(x) (((x) >> 12) & 0x1) +#define C_028814_POLY_OFFSET_BACK_ENABLE 0xFFFFEFFF +#define S_028814_POLY_OFFSET_PARA_ENABLE(x) (((x) & 0x1) << 13) +#define G_028814_POLY_OFFSET_PARA_ENABLE(x) (((x) >> 13) & 0x1) +#define C_028814_POLY_OFFSET_PARA_ENABLE 0xFFFFDFFF +#define S_028814_VTX_WINDOW_OFFSET_ENABLE(x) (((x) & 0x1) << 16) +#define G_028814_VTX_WINDOW_OFFSET_ENABLE(x) (((x) >> 16) & 0x1) +#define C_028814_VTX_WINDOW_OFFSET_ENABLE 0xFFFEFFFF +#define S_028814_PROVOKING_VTX_LAST(x) (((x) & 0x1) << 19) +#define G_028814_PROVOKING_VTX_LAST(x) (((x) >> 19) & 0x1) +#define C_028814_PROVOKING_VTX_LAST 0xFFF7FFFF +#define S_028814_PERSP_CORR_DIS(x) (((x) & 0x1) << 20) +#define G_028814_PERSP_CORR_DIS(x) (((x) >> 20) & 0x1) +#define C_028814_PERSP_CORR_DIS 0xFFEFFFFF +#define S_028814_MULTI_PRIM_IB_ENA(x) (((x) & 0x1) << 21) +#define G_028814_MULTI_PRIM_IB_ENA(x) (((x) >> 21) & 0x1) +#define C_028814_MULTI_PRIM_IB_ENA 0xFFDFFFFF +#define R_028818_PA_CL_VTE_CNTL 0x028818 +#define S_028818_VPORT_X_SCALE_ENA(x) (((x) & 0x1) << 0) +#define G_028818_VPORT_X_SCALE_ENA(x) (((x) >> 0) & 0x1) +#define C_028818_VPORT_X_SCALE_ENA 0xFFFFFFFE +#define S_028818_VPORT_X_OFFSET_ENA(x) (((x) & 0x1) << 1) +#define G_028818_VPORT_X_OFFSET_ENA(x) (((x) >> 1) & 0x1) +#define C_028818_VPORT_X_OFFSET_ENA 0xFFFFFFFD +#define S_028818_VPORT_Y_SCALE_ENA(x) (((x) & 0x1) << 2) +#define G_028818_VPORT_Y_SCALE_ENA(x) (((x) >> 2) & 0x1) +#define C_028818_VPORT_Y_SCALE_ENA 0xFFFFFFFB +#define S_028818_VPORT_Y_OFFSET_ENA(x) (((x) & 0x1) << 3) +#define G_028818_VPORT_Y_OFFSET_ENA(x) (((x) >> 3) & 0x1) +#define C_028818_VPORT_Y_OFFSET_ENA 0xFFFFFFF7 +#define S_028818_VPORT_Z_SCALE_ENA(x) (((x) & 0x1) << 4) +#define G_028818_VPORT_Z_SCALE_ENA(x) (((x) >> 4) & 0x1) +#define C_028818_VPORT_Z_SCALE_ENA 0xFFFFFFEF +#define 
S_028818_VPORT_Z_OFFSET_ENA(x) (((x) & 0x1) << 5) +#define G_028818_VPORT_Z_OFFSET_ENA(x) (((x) >> 5) & 0x1) +#define C_028818_VPORT_Z_OFFSET_ENA 0xFFFFFFDF +#define S_028818_VTX_XY_FMT(x) (((x) & 0x1) << 8) +#define G_028818_VTX_XY_FMT(x) (((x) >> 8) & 0x1) +#define C_028818_VTX_XY_FMT 0xFFFFFEFF +#define S_028818_VTX_Z_FMT(x) (((x) & 0x1) << 9) +#define G_028818_VTX_Z_FMT(x) (((x) >> 9) & 0x1) +#define C_028818_VTX_Z_FMT 0xFFFFFDFF +#define S_028818_VTX_W0_FMT(x) (((x) & 0x1) << 10) +#define G_028818_VTX_W0_FMT(x) (((x) >> 10) & 0x1) +#define C_028818_VTX_W0_FMT 0xFFFFFBFF +#define R_02881C_PA_CL_VS_OUT_CNTL 0x02881C +#define S_02881C_CLIP_DIST_ENA_0(x) (((x) & 0x1) << 0) +#define G_02881C_CLIP_DIST_ENA_0(x) (((x) >> 0) & 0x1) +#define C_02881C_CLIP_DIST_ENA_0 0xFFFFFFFE +#define S_02881C_CLIP_DIST_ENA_1(x) (((x) & 0x1) << 1) +#define G_02881C_CLIP_DIST_ENA_1(x) (((x) >> 1) & 0x1) +#define C_02881C_CLIP_DIST_ENA_1 0xFFFFFFFD +#define S_02881C_CLIP_DIST_ENA_2(x) (((x) & 0x1) << 2) +#define G_02881C_CLIP_DIST_ENA_2(x) (((x) >> 2) & 0x1) +#define C_02881C_CLIP_DIST_ENA_2 0xFFFFFFFB +#define S_02881C_CLIP_DIST_ENA_3(x) (((x) & 0x1) << 3) +#define G_02881C_CLIP_DIST_ENA_3(x) (((x) >> 3) & 0x1) +#define C_02881C_CLIP_DIST_ENA_3 0xFFFFFFF7 +#define S_02881C_CLIP_DIST_ENA_4(x) (((x) & 0x1) << 4) +#define G_02881C_CLIP_DIST_ENA_4(x) (((x) >> 4) & 0x1) +#define C_02881C_CLIP_DIST_ENA_4 0xFFFFFFEF +#define S_02881C_CLIP_DIST_ENA_5(x) (((x) & 0x1) << 5) +#define G_02881C_CLIP_DIST_ENA_5(x) (((x) >> 5) & 0x1) +#define C_02881C_CLIP_DIST_ENA_5 0xFFFFFFDF +#define S_02881C_CLIP_DIST_ENA_6(x) (((x) & 0x1) << 6) +#define G_02881C_CLIP_DIST_ENA_6(x) (((x) >> 6) & 0x1) +#define C_02881C_CLIP_DIST_ENA_6 0xFFFFFFBF +#define S_02881C_CLIP_DIST_ENA_7(x) (((x) & 0x1) << 7) +#define G_02881C_CLIP_DIST_ENA_7(x) (((x) >> 7) & 0x1) +#define C_02881C_CLIP_DIST_ENA_7 0xFFFFFF7F +#define S_02881C_CULL_DIST_ENA_0(x) (((x) & 0x1) << 8) +#define G_02881C_CULL_DIST_ENA_0(x) (((x) >> 8) & 0x1) 
+#define C_02881C_CULL_DIST_ENA_0 0xFFFFFEFF +#define S_02881C_CULL_DIST_ENA_1(x) (((x) & 0x1) << 9) +#define G_02881C_CULL_DIST_ENA_1(x) (((x) >> 9) & 0x1) +#define C_02881C_CULL_DIST_ENA_1 0xFFFFFDFF +#define S_02881C_CULL_DIST_ENA_2(x) (((x) & 0x1) << 10) +#define G_02881C_CULL_DIST_ENA_2(x) (((x) >> 10) & 0x1) +#define C_02881C_CULL_DIST_ENA_2 0xFFFFFBFF +#define S_02881C_CULL_DIST_ENA_3(x) (((x) & 0x1) << 11) +#define G_02881C_CULL_DIST_ENA_3(x) (((x) >> 11) & 0x1) +#define C_02881C_CULL_DIST_ENA_3 0xFFFFF7FF +#define S_02881C_CULL_DIST_ENA_4(x) (((x) & 0x1) << 12) +#define G_02881C_CULL_DIST_ENA_4(x) (((x) >> 12) & 0x1) +#define C_02881C_CULL_DIST_ENA_4 0xFFFFEFFF +#define S_02881C_CULL_DIST_ENA_5(x) (((x) & 0x1) << 13) +#define G_02881C_CULL_DIST_ENA_5(x) (((x) >> 13) & 0x1) +#define C_02881C_CULL_DIST_ENA_5 0xFFFFDFFF +#define S_02881C_CULL_DIST_ENA_6(x) (((x) & 0x1) << 14) +#define G_02881C_CULL_DIST_ENA_6(x) (((x) >> 14) & 0x1) +#define C_02881C_CULL_DIST_ENA_6 0xFFFFBFFF +#define S_02881C_CULL_DIST_ENA_7(x) (((x) & 0x1) << 15) +#define G_02881C_CULL_DIST_ENA_7(x) (((x) >> 15) & 0x1) +#define C_02881C_CULL_DIST_ENA_7 0xFFFF7FFF +#define S_02881C_USE_VTX_POINT_SIZE(x) (((x) & 0x1) << 16) +#define G_02881C_USE_VTX_POINT_SIZE(x) (((x) >> 16) & 0x1) +#define C_02881C_USE_VTX_POINT_SIZE 0xFFFEFFFF +#define S_02881C_USE_VTX_EDGE_FLAG(x) (((x) & 0x1) << 17) +#define G_02881C_USE_VTX_EDGE_FLAG(x) (((x) >> 17) & 0x1) +#define C_02881C_USE_VTX_EDGE_FLAG 0xFFFDFFFF +#define S_02881C_USE_VTX_RENDER_TARGET_INDX(x) (((x) & 0x1) << 18) +#define G_02881C_USE_VTX_RENDER_TARGET_INDX(x) (((x) >> 18) & 0x1) +#define C_02881C_USE_VTX_RENDER_TARGET_INDX 0xFFFBFFFF +#define S_02881C_USE_VTX_VIEWPORT_INDX(x) (((x) & 0x1) << 19) +#define G_02881C_USE_VTX_VIEWPORT_INDX(x) (((x) >> 19) & 0x1) +#define C_02881C_USE_VTX_VIEWPORT_INDX 0xFFF7FFFF +#define S_02881C_USE_VTX_KILL_FLAG(x) (((x) & 0x1) << 20) +#define G_02881C_USE_VTX_KILL_FLAG(x) (((x) >> 20) & 0x1) +#define 
C_02881C_USE_VTX_KILL_FLAG 0xFFEFFFFF +#define S_02881C_VS_OUT_MISC_VEC_ENA(x) (((x) & 0x1) << 21) +#define G_02881C_VS_OUT_MISC_VEC_ENA(x) (((x) >> 21) & 0x1) +#define C_02881C_VS_OUT_MISC_VEC_ENA 0xFFDFFFFF +#define S_02881C_VS_OUT_CCDIST0_VEC_ENA(x) (((x) & 0x1) << 22) +#define G_02881C_VS_OUT_CCDIST0_VEC_ENA(x) (((x) >> 22) & 0x1) +#define C_02881C_VS_OUT_CCDIST0_VEC_ENA 0xFFBFFFFF +#define S_02881C_VS_OUT_CCDIST1_VEC_ENA(x) (((x) & 0x1) << 23) +#define G_02881C_VS_OUT_CCDIST1_VEC_ENA(x) (((x) >> 23) & 0x1) +#define C_02881C_VS_OUT_CCDIST1_VEC_ENA 0xFF7FFFFF +#define S_02881C_VS_OUT_MISC_SIDE_BUS_ENA(x) (((x) & 0x1) << 24) +#define G_02881C_VS_OUT_MISC_SIDE_BUS_ENA(x) (((x) >> 24) & 0x1) +#define C_02881C_VS_OUT_MISC_SIDE_BUS_ENA 0xFEFFFFFF +#define S_02881C_USE_VTX_GS_CUT_FLAG(x) (((x) & 0x1) << 25) +#define G_02881C_USE_VTX_GS_CUT_FLAG(x) (((x) >> 25) & 0x1) +#define C_02881C_USE_VTX_GS_CUT_FLAG 0xFDFFFFFF +#define R_028820_PA_CL_NANINF_CNTL 0x028820 +#define S_028820_VTE_XY_INF_DISCARD(x) (((x) & 0x1) << 0) +#define G_028820_VTE_XY_INF_DISCARD(x) (((x) >> 0) & 0x1) +#define C_028820_VTE_XY_INF_DISCARD 0xFFFFFFFE +#define S_028820_VTE_Z_INF_DISCARD(x) (((x) & 0x1) << 1) +#define G_028820_VTE_Z_INF_DISCARD(x) (((x) >> 1) & 0x1) +#define C_028820_VTE_Z_INF_DISCARD 0xFFFFFFFD +#define S_028820_VTE_W_INF_DISCARD(x) (((x) & 0x1) << 2) +#define G_028820_VTE_W_INF_DISCARD(x) (((x) >> 2) & 0x1) +#define C_028820_VTE_W_INF_DISCARD 0xFFFFFFFB +#define S_028820_VTE_0XNANINF_IS_0(x) (((x) & 0x1) << 3) +#define G_028820_VTE_0XNANINF_IS_0(x) (((x) >> 3) & 0x1) +#define C_028820_VTE_0XNANINF_IS_0 0xFFFFFFF7 +#define S_028820_VTE_XY_NAN_RETAIN(x) (((x) & 0x1) << 4) +#define G_028820_VTE_XY_NAN_RETAIN(x) (((x) >> 4) & 0x1) +#define C_028820_VTE_XY_NAN_RETAIN 0xFFFFFFEF +#define S_028820_VTE_Z_NAN_RETAIN(x) (((x) & 0x1) << 5) +#define G_028820_VTE_Z_NAN_RETAIN(x) (((x) >> 5) & 0x1) +#define C_028820_VTE_Z_NAN_RETAIN 0xFFFFFFDF +#define S_028820_VTE_W_NAN_RETAIN(x) (((x) & 0x1) 
<< 6) +#define G_028820_VTE_W_NAN_RETAIN(x) (((x) >> 6) & 0x1) +#define C_028820_VTE_W_NAN_RETAIN 0xFFFFFFBF +#define S_028820_VTE_W_RECIP_NAN_IS_0(x) (((x) & 0x1) << 7) +#define G_028820_VTE_W_RECIP_NAN_IS_0(x) (((x) >> 7) & 0x1) +#define C_028820_VTE_W_RECIP_NAN_IS_0 0xFFFFFF7F +#define S_028820_VS_XY_NAN_TO_INF(x) (((x) & 0x1) << 8) +#define G_028820_VS_XY_NAN_TO_INF(x) (((x) >> 8) & 0x1) +#define C_028820_VS_XY_NAN_TO_INF 0xFFFFFEFF +#define S_028820_VS_XY_INF_RETAIN(x) (((x) & 0x1) << 9) +#define G_028820_VS_XY_INF_RETAIN(x) (((x) >> 9) & 0x1) +#define C_028820_VS_XY_INF_RETAIN 0xFFFFFDFF +#define S_028820_VS_Z_NAN_TO_INF(x) (((x) & 0x1) << 10) +#define G_028820_VS_Z_NAN_TO_INF(x) (((x) >> 10) & 0x1) +#define C_028820_VS_Z_NAN_TO_INF 0xFFFFFBFF +#define S_028820_VS_Z_INF_RETAIN(x) (((x) & 0x1) << 11) +#define G_028820_VS_Z_INF_RETAIN(x) (((x) >> 11) & 0x1) +#define C_028820_VS_Z_INF_RETAIN 0xFFFFF7FF +#define S_028820_VS_W_NAN_TO_INF(x) (((x) & 0x1) << 12) +#define G_028820_VS_W_NAN_TO_INF(x) (((x) >> 12) & 0x1) +#define C_028820_VS_W_NAN_TO_INF 0xFFFFEFFF +#define S_028820_VS_W_INF_RETAIN(x) (((x) & 0x1) << 13) +#define G_028820_VS_W_INF_RETAIN(x) (((x) >> 13) & 0x1) +#define C_028820_VS_W_INF_RETAIN 0xFFFFDFFF +#define S_028820_VS_CLIP_DIST_INF_DISCARD(x) (((x) & 0x1) << 14) +#define G_028820_VS_CLIP_DIST_INF_DISCARD(x) (((x) >> 14) & 0x1) +#define C_028820_VS_CLIP_DIST_INF_DISCARD 0xFFFFBFFF +#define S_028820_VTE_NO_OUTPUT_NEG_0(x) (((x) & 0x1) << 20) +#define G_028820_VTE_NO_OUTPUT_NEG_0(x) (((x) >> 20) & 0x1) +#define C_028820_VTE_NO_OUTPUT_NEG_0 0xFFEFFFFF +#define R_028824_PA_SU_LINE_STIPPLE_CNTL 0x028824 +#define S_028824_LINE_STIPPLE_RESET(x) (((x) & 0x03) << 0) +#define G_028824_LINE_STIPPLE_RESET(x) (((x) >> 0) & 0x03) +#define C_028824_LINE_STIPPLE_RESET 0xFFFFFFFC +#define S_028824_EXPAND_FULL_LENGTH(x) (((x) & 0x1) << 2) +#define G_028824_EXPAND_FULL_LENGTH(x) (((x) >> 2) & 0x1) +#define C_028824_EXPAND_FULL_LENGTH 0xFFFFFFFB +#define 
S_028824_FRACTIONAL_ACCUM(x) (((x) & 0x1) << 3) +#define G_028824_FRACTIONAL_ACCUM(x) (((x) >> 3) & 0x1) +#define C_028824_FRACTIONAL_ACCUM 0xFFFFFFF7 +#define S_028824_DIAMOND_ADJUST(x) (((x) & 0x1) << 4) +#define G_028824_DIAMOND_ADJUST(x) (((x) >> 4) & 0x1) +#define C_028824_DIAMOND_ADJUST 0xFFFFFFEF +#define R_028828_PA_SU_LINE_STIPPLE_SCALE 0x028828 +#define R_02882C_PA_SU_PRIM_FILTER_CNTL 0x02882C +#define S_02882C_TRIANGLE_FILTER_DISABLE(x) (((x) & 0x1) << 0) +#define G_02882C_TRIANGLE_FILTER_DISABLE(x) (((x) >> 0) & 0x1) +#define C_02882C_TRIANGLE_FILTER_DISABLE 0xFFFFFFFE +#define S_02882C_LINE_FILTER_DISABLE(x) (((x) & 0x1) << 1) +#define G_02882C_LINE_FILTER_DISABLE(x) (((x) >> 1) & 0x1) +#define C_02882C_LINE_FILTER_DISABLE 0xFFFFFFFD +#define S_02882C_POINT_FILTER_DISABLE(x) (((x) & 0x1) << 2) +#define G_02882C_POINT_FILTER_DISABLE(x) (((x) >> 2) & 0x1) +#define C_02882C_POINT_FILTER_DISABLE 0xFFFFFFFB +#define S_02882C_RECTANGLE_FILTER_DISABLE(x) (((x) & 0x1) << 3) +#define G_02882C_RECTANGLE_FILTER_DISABLE(x) (((x) >> 3) & 0x1) +#define C_02882C_RECTANGLE_FILTER_DISABLE 0xFFFFFFF7 +#define S_02882C_TRIANGLE_EXPAND_ENA(x) (((x) & 0x1) << 4) +#define G_02882C_TRIANGLE_EXPAND_ENA(x) (((x) >> 4) & 0x1) +#define C_02882C_TRIANGLE_EXPAND_ENA 0xFFFFFFEF +#define S_02882C_LINE_EXPAND_ENA(x) (((x) & 0x1) << 5) +#define G_02882C_LINE_EXPAND_ENA(x) (((x) >> 5) & 0x1) +#define C_02882C_LINE_EXPAND_ENA 0xFFFFFFDF +#define S_02882C_POINT_EXPAND_ENA(x) (((x) & 0x1) << 6) +#define G_02882C_POINT_EXPAND_ENA(x) (((x) >> 6) & 0x1) +#define C_02882C_POINT_EXPAND_ENA 0xFFFFFFBF +#define S_02882C_RECTANGLE_EXPAND_ENA(x) (((x) & 0x1) << 7) +#define G_02882C_RECTANGLE_EXPAND_ENA(x) (((x) >> 7) & 0x1) +#define C_02882C_RECTANGLE_EXPAND_ENA 0xFFFFFF7F +#define S_02882C_PRIM_EXPAND_CONSTANT(x) (((x) & 0xFF) << 8) +#define G_02882C_PRIM_EXPAND_CONSTANT(x) (((x) >> 8) & 0xFF) +#define C_02882C_PRIM_EXPAND_CONSTANT 0xFFFF00FF +#define R_028A00_PA_SU_POINT_SIZE 0x028A00 +#define 
S_028A00_HEIGHT(x) (((x) & 0xFFFF) << 0) +#define G_028A00_HEIGHT(x) (((x) >> 0) & 0xFFFF) +#define C_028A00_HEIGHT 0xFFFF0000 +#define S_028A00_WIDTH(x) (((x) & 0xFFFF) << 16) +#define G_028A00_WIDTH(x) (((x) >> 16) & 0xFFFF) +#define C_028A00_WIDTH 0x0000FFFF +#define R_028A04_PA_SU_POINT_MINMAX 0x028A04 +#define S_028A04_MIN_SIZE(x) (((x) & 0xFFFF) << 0) +#define G_028A04_MIN_SIZE(x) (((x) >> 0) & 0xFFFF) +#define C_028A04_MIN_SIZE 0xFFFF0000 +#define S_028A04_MAX_SIZE(x) (((x) & 0xFFFF) << 16) +#define G_028A04_MAX_SIZE(x) (((x) >> 16) & 0xFFFF) +#define C_028A04_MAX_SIZE 0x0000FFFF +#define R_028A08_PA_SU_LINE_CNTL 0x028A08 +#define S_028A08_WIDTH(x) (((x) & 0xFFFF) << 0) +#define G_028A08_WIDTH(x) (((x) >> 0) & 0xFFFF) +#define C_028A08_WIDTH 0xFFFF0000 +#define R_028A0C_PA_SC_LINE_STIPPLE 0x028A0C +#define S_028A0C_LINE_PATTERN(x) (((x) & 0xFFFF) << 0) +#define G_028A0C_LINE_PATTERN(x) (((x) >> 0) & 0xFFFF) +#define C_028A0C_LINE_PATTERN 0xFFFF0000 +#define S_028A0C_REPEAT_COUNT(x) (((x) & 0xFF) << 16) +#define G_028A0C_REPEAT_COUNT(x) (((x) >> 16) & 0xFF) +#define C_028A0C_REPEAT_COUNT 0xFF00FFFF +#define S_028A0C_PATTERN_BIT_ORDER(x) (((x) & 0x1) << 28) +#define G_028A0C_PATTERN_BIT_ORDER(x) (((x) >> 28) & 0x1) +#define C_028A0C_PATTERN_BIT_ORDER 0xEFFFFFFF +#define S_028A0C_AUTO_RESET_CNTL(x) (((x) & 0x03) << 29) +#define G_028A0C_AUTO_RESET_CNTL(x) (((x) >> 29) & 0x03) +#define C_028A0C_AUTO_RESET_CNTL 0x9FFFFFFF +#define R_028A10_VGT_OUTPUT_PATH_CNTL 0x028A10 +#define S_028A10_PATH_SELECT(x) (((x) & 0x07) << 0) +#define G_028A10_PATH_SELECT(x) (((x) >> 0) & 0x07) +#define C_028A10_PATH_SELECT 0xFFFFFFF8 +#define V_028A10_VGT_OUTPATH_VTX_REUSE 0x00 +#define V_028A10_VGT_OUTPATH_TESS_EN 0x01 +#define V_028A10_VGT_OUTPATH_PASSTHRU 0x02 +#define V_028A10_VGT_OUTPATH_GS_BLOCK 0x03 +#define V_028A10_VGT_OUTPATH_HS_BLOCK 0x04 +#define R_028A14_VGT_HOS_CNTL 0x028A14 +#define S_028A14_TESS_MODE(x) (((x) & 0x03) << 0) +#define G_028A14_TESS_MODE(x) (((x) >> 0) & 
0x03) +#define C_028A14_TESS_MODE 0xFFFFFFFC +#define R_028A18_VGT_HOS_MAX_TESS_LEVEL 0x028A18 +#define R_028A1C_VGT_HOS_MIN_TESS_LEVEL 0x028A1C +#define R_028A20_VGT_HOS_REUSE_DEPTH 0x028A20 +#define S_028A20_REUSE_DEPTH(x) (((x) & 0xFF) << 0) +#define G_028A20_REUSE_DEPTH(x) (((x) >> 0) & 0xFF) +#define C_028A20_REUSE_DEPTH 0xFFFFFF00 +#define R_028A24_VGT_GROUP_PRIM_TYPE 0x028A24 +#define S_028A24_PRIM_TYPE(x) (((x) & 0x1F) << 0) +#define G_028A24_PRIM_TYPE(x) (((x) >> 0) & 0x1F) +#define C_028A24_PRIM_TYPE 0xFFFFFFE0 +#define V_028A24_VGT_GRP_3D_POINT 0x00 +#define V_028A24_VGT_GRP_3D_LINE 0x01 +#define V_028A24_VGT_GRP_3D_TRI 0x02 +#define V_028A24_VGT_GRP_3D_RECT 0x03 +#define V_028A24_VGT_GRP_3D_QUAD 0x04 +#define V_028A24_VGT_GRP_2D_COPY_RECT_V0 0x05 +#define V_028A24_VGT_GRP_2D_COPY_RECT_V1 0x06 +#define V_028A24_VGT_GRP_2D_COPY_RECT_V2 0x07 +#define V_028A24_VGT_GRP_2D_COPY_RECT_V3 0x08 +#define V_028A24_VGT_GRP_2D_FILL_RECT 0x09 +#define V_028A24_VGT_GRP_2D_LINE 0x0A +#define V_028A24_VGT_GRP_2D_TRI 0x0B +#define V_028A24_VGT_GRP_PRIM_INDEX_LINE 0x0C +#define V_028A24_VGT_GRP_PRIM_INDEX_TRI 0x0D +#define V_028A24_VGT_GRP_PRIM_INDEX_QUAD 0x0E +#define V_028A24_VGT_GRP_3D_LINE_ADJ 0x0F +#define V_028A24_VGT_GRP_3D_TRI_ADJ 0x10 +#define V_028A24_VGT_GRP_3D_PATCH 0x11 +#define S_028A24_RETAIN_ORDER(x) (((x) & 0x1) << 14) +#define G_028A24_RETAIN_ORDER(x) (((x) >> 14) & 0x1) +#define C_028A24_RETAIN_ORDER 0xFFFFBFFF +#define S_028A24_RETAIN_QUADS(x) (((x) & 0x1) << 15) +#define G_028A24_RETAIN_QUADS(x) (((x) >> 15) & 0x1) +#define C_028A24_RETAIN_QUADS 0xFFFF7FFF +#define S_028A24_PRIM_ORDER(x) (((x) & 0x07) << 16) +#define G_028A24_PRIM_ORDER(x) (((x) >> 16) & 0x07) +#define C_028A24_PRIM_ORDER 0xFFF8FFFF +#define V_028A24_VGT_GRP_LIST 0x00 +#define V_028A24_VGT_GRP_STRIP 0x01 +#define V_028A24_VGT_GRP_FAN 0x02 +#define V_028A24_VGT_GRP_LOOP 0x03 +#define V_028A24_VGT_GRP_POLYGON 0x04 +#define R_028A28_VGT_GROUP_FIRST_DECR 0x028A28 +#define 
S_028A28_FIRST_DECR(x) (((x) & 0x0F) << 0) +#define G_028A28_FIRST_DECR(x) (((x) >> 0) & 0x0F) +#define C_028A28_FIRST_DECR 0xFFFFFFF0 +#define R_028A2C_VGT_GROUP_DECR 0x028A2C +#define S_028A2C_DECR(x) (((x) & 0x0F) << 0) +#define G_028A2C_DECR(x) (((x) >> 0) & 0x0F) +#define C_028A2C_DECR 0xFFFFFFF0 +#define R_028A30_VGT_GROUP_VECT_0_CNTL 0x028A30 +#define S_028A30_COMP_X_EN(x) (((x) & 0x1) << 0) +#define G_028A30_COMP_X_EN(x) (((x) >> 0) & 0x1) +#define C_028A30_COMP_X_EN 0xFFFFFFFE +#define S_028A30_COMP_Y_EN(x) (((x) & 0x1) << 1) +#define G_028A30_COMP_Y_EN(x) (((x) >> 1) & 0x1) +#define C_028A30_COMP_Y_EN 0xFFFFFFFD +#define S_028A30_COMP_Z_EN(x) (((x) & 0x1) << 2) +#define G_028A30_COMP_Z_EN(x) (((x) >> 2) & 0x1) +#define C_028A30_COMP_Z_EN 0xFFFFFFFB +#define S_028A30_COMP_W_EN(x) (((x) & 0x1) << 3) +#define G_028A30_COMP_W_EN(x) (((x) >> 3) & 0x1) +#define C_028A30_COMP_W_EN 0xFFFFFFF7 +#define S_028A30_STRIDE(x) (((x) & 0xFF) << 8) +#define G_028A30_STRIDE(x) (((x) >> 8) & 0xFF) +#define C_028A30_STRIDE 0xFFFF00FF +#define S_028A30_SHIFT(x) (((x) & 0xFF) << 16) +#define G_028A30_SHIFT(x) (((x) >> 16) & 0xFF) +#define C_028A30_SHIFT 0xFF00FFFF +#define R_028A34_VGT_GROUP_VECT_1_CNTL 0x028A34 +#define S_028A34_COMP_X_EN(x) (((x) & 0x1) << 0) +#define G_028A34_COMP_X_EN(x) (((x) >> 0) & 0x1) +#define C_028A34_COMP_X_EN 0xFFFFFFFE +#define S_028A34_COMP_Y_EN(x) (((x) & 0x1) << 1) +#define G_028A34_COMP_Y_EN(x) (((x) >> 1) & 0x1) +#define C_028A34_COMP_Y_EN 0xFFFFFFFD +#define S_028A34_COMP_Z_EN(x) (((x) & 0x1) << 2) +#define G_028A34_COMP_Z_EN(x) (((x) >> 2) & 0x1) +#define C_028A34_COMP_Z_EN 0xFFFFFFFB +#define S_028A34_COMP_W_EN(x) (((x) & 0x1) << 3) +#define G_028A34_COMP_W_EN(x) (((x) >> 3) & 0x1) +#define C_028A34_COMP_W_EN 0xFFFFFFF7 +#define S_028A34_STRIDE(x) (((x) & 0xFF) << 8) +#define G_028A34_STRIDE(x) (((x) >> 8) & 0xFF) +#define C_028A34_STRIDE 0xFFFF00FF +#define S_028A34_SHIFT(x) (((x) & 0xFF) << 16) +#define G_028A34_SHIFT(x) (((x) >> 16) & 
0xFF) +#define C_028A34_SHIFT 0xFF00FFFF +#define R_028A38_VGT_GROUP_VECT_0_FMT_CNTL 0x028A38 +#define S_028A38_X_CONV(x) (((x) & 0x0F) << 0) +#define G_028A38_X_CONV(x) (((x) >> 0) & 0x0F) +#define C_028A38_X_CONV 0xFFFFFFF0 +#define V_028A38_VGT_GRP_INDEX_16 0x00 +#define V_028A38_VGT_GRP_INDEX_32 0x01 +#define V_028A38_VGT_GRP_UINT_16 0x02 +#define V_028A38_VGT_GRP_UINT_32 0x03 +#define V_028A38_VGT_GRP_SINT_16 0x04 +#define V_028A38_VGT_GRP_SINT_32 0x05 +#define V_028A38_VGT_GRP_FLOAT_32 0x06 +#define V_028A38_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A38_X_OFFSET(x) (((x) & 0x0F) << 4) +#define G_028A38_X_OFFSET(x) (((x) >> 4) & 0x0F) +#define C_028A38_X_OFFSET 0xFFFFFF0F +#define S_028A38_Y_CONV(x) (((x) & 0x0F) << 8) +#define G_028A38_Y_CONV(x) (((x) >> 8) & 0x0F) +#define C_028A38_Y_CONV 0xFFFFF0FF +#define V_028A38_VGT_GRP_INDEX_16 0x00 +#define V_028A38_VGT_GRP_INDEX_32 0x01 +#define V_028A38_VGT_GRP_UINT_16 0x02 +#define V_028A38_VGT_GRP_UINT_32 0x03 +#define V_028A38_VGT_GRP_SINT_16 0x04 +#define V_028A38_VGT_GRP_SINT_32 0x05 +#define V_028A38_VGT_GRP_FLOAT_32 0x06 +#define V_028A38_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A38_Y_OFFSET(x) (((x) & 0x0F) << 12) +#define G_028A38_Y_OFFSET(x) (((x) >> 12) & 0x0F) +#define C_028A38_Y_OFFSET 0xFFFF0FFF +#define S_028A38_Z_CONV(x) (((x) & 0x0F) << 16) +#define G_028A38_Z_CONV(x) (((x) >> 16) & 0x0F) +#define C_028A38_Z_CONV 0xFFF0FFFF +#define V_028A38_VGT_GRP_INDEX_16 0x00 +#define V_028A38_VGT_GRP_INDEX_32 0x01 +#define V_028A38_VGT_GRP_UINT_16 0x02 +#define V_028A38_VGT_GRP_UINT_32 0x03 +#define V_028A38_VGT_GRP_SINT_16 0x04 +#define V_028A38_VGT_GRP_SINT_32 0x05 +#define V_028A38_VGT_GRP_FLOAT_32 0x06 +#define V_028A38_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A38_Z_OFFSET(x) (((x) & 0x0F) << 20) +#define G_028A38_Z_OFFSET(x) (((x) >> 20) & 0x0F) +#define C_028A38_Z_OFFSET 
0xFF0FFFFF +#define S_028A38_W_CONV(x) (((x) & 0x0F) << 24) +#define G_028A38_W_CONV(x) (((x) >> 24) & 0x0F) +#define C_028A38_W_CONV 0xF0FFFFFF +#define V_028A38_VGT_GRP_INDEX_16 0x00 +#define V_028A38_VGT_GRP_INDEX_32 0x01 +#define V_028A38_VGT_GRP_UINT_16 0x02 +#define V_028A38_VGT_GRP_UINT_32 0x03 +#define V_028A38_VGT_GRP_SINT_16 0x04 +#define V_028A38_VGT_GRP_SINT_32 0x05 +#define V_028A38_VGT_GRP_FLOAT_32 0x06 +#define V_028A38_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A38_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A38_W_OFFSET(x) (((x) & 0x0F) << 28) +#define G_028A38_W_OFFSET(x) (((x) >> 28) & 0x0F) +#define C_028A38_W_OFFSET 0x0FFFFFFF +#define R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL 0x028A3C +#define S_028A3C_X_CONV(x) (((x) & 0x0F) << 0) +#define G_028A3C_X_CONV(x) (((x) >> 0) & 0x0F) +#define C_028A3C_X_CONV 0xFFFFFFF0 +#define V_028A3C_VGT_GRP_INDEX_16 0x00 +#define V_028A3C_VGT_GRP_INDEX_32 0x01 +#define V_028A3C_VGT_GRP_UINT_16 0x02 +#define V_028A3C_VGT_GRP_UINT_32 0x03 +#define V_028A3C_VGT_GRP_SINT_16 0x04 +#define V_028A3C_VGT_GRP_SINT_32 0x05 +#define V_028A3C_VGT_GRP_FLOAT_32 0x06 +#define V_028A3C_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A3C_X_OFFSET(x) (((x) & 0x0F) << 4) +#define G_028A3C_X_OFFSET(x) (((x) >> 4) & 0x0F) +#define C_028A3C_X_OFFSET 0xFFFFFF0F +#define S_028A3C_Y_CONV(x) (((x) & 0x0F) << 8) +#define G_028A3C_Y_CONV(x) (((x) >> 8) & 0x0F) +#define C_028A3C_Y_CONV 0xFFFFF0FF +#define V_028A3C_VGT_GRP_INDEX_16 0x00 +#define V_028A3C_VGT_GRP_INDEX_32 0x01 +#define V_028A3C_VGT_GRP_UINT_16 0x02 +#define V_028A3C_VGT_GRP_UINT_32 0x03 +#define V_028A3C_VGT_GRP_SINT_16 0x04 +#define V_028A3C_VGT_GRP_SINT_32 0x05 +#define V_028A3C_VGT_GRP_FLOAT_32 0x06 +#define V_028A3C_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A3C_Y_OFFSET(x) (((x) & 0x0F) << 12) +#define G_028A3C_Y_OFFSET(x) (((x) >> 12) & 0x0F) +#define C_028A3C_Y_OFFSET 0xFFFF0FFF +#define 
S_028A3C_Z_CONV(x) (((x) & 0x0F) << 16) +#define G_028A3C_Z_CONV(x) (((x) >> 16) & 0x0F) +#define C_028A3C_Z_CONV 0xFFF0FFFF +#define V_028A3C_VGT_GRP_INDEX_16 0x00 +#define V_028A3C_VGT_GRP_INDEX_32 0x01 +#define V_028A3C_VGT_GRP_UINT_16 0x02 +#define V_028A3C_VGT_GRP_UINT_32 0x03 +#define V_028A3C_VGT_GRP_SINT_16 0x04 +#define V_028A3C_VGT_GRP_SINT_32 0x05 +#define V_028A3C_VGT_GRP_FLOAT_32 0x06 +#define V_028A3C_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A3C_Z_OFFSET(x) (((x) & 0x0F) << 20) +#define G_028A3C_Z_OFFSET(x) (((x) >> 20) & 0x0F) +#define C_028A3C_Z_OFFSET 0xFF0FFFFF +#define S_028A3C_W_CONV(x) (((x) & 0x0F) << 24) +#define G_028A3C_W_CONV(x) (((x) >> 24) & 0x0F) +#define C_028A3C_W_CONV 0xF0FFFFFF +#define V_028A3C_VGT_GRP_INDEX_16 0x00 +#define V_028A3C_VGT_GRP_INDEX_32 0x01 +#define V_028A3C_VGT_GRP_UINT_16 0x02 +#define V_028A3C_VGT_GRP_UINT_32 0x03 +#define V_028A3C_VGT_GRP_SINT_16 0x04 +#define V_028A3C_VGT_GRP_SINT_32 0x05 +#define V_028A3C_VGT_GRP_FLOAT_32 0x06 +#define V_028A3C_VGT_GRP_AUTO_PRIM 0x07 +#define V_028A3C_VGT_GRP_FIX_1_23_TO_FLOAT 0x08 +#define S_028A3C_W_OFFSET(x) (((x) & 0x0F) << 28) +#define G_028A3C_W_OFFSET(x) (((x) >> 28) & 0x0F) +#define C_028A3C_W_OFFSET 0x0FFFFFFF +#define R_028A40_VGT_GS_MODE 0x028A40 +#define S_028A40_MODE(x) (((x) & 0x07) << 0) +#define G_028A40_MODE(x) (((x) >> 0) & 0x07) +#define C_028A40_MODE 0xFFFFFFF8 +#define V_028A40_GS_OFF 0x00 +#define V_028A40_GS_SCENARIO_A 0x01 +#define V_028A40_GS_SCENARIO_B 0x02 +#define V_028A40_GS_SCENARIO_G 0x03 +#define V_028A40_GS_SCENARIO_C 0x04 +#define V_028A40_SPRITE_EN 0x05 +#define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4) +#define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03) +#define C_028A40_CUT_MODE 0xFFFFFFCF +#define V_028A40_GS_CUT_1024 0x00 +#define V_028A40_GS_CUT_512 0x01 +#define V_028A40_GS_CUT_256 0x02 +#define V_028A40_GS_CUT_128 0x03 +#define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11) +#define 
G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1) +#define C_028A40_GS_C_PACK_EN 0xFFFFF7FF +#define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13) +#define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1) +#define C_028A40_ES_PASSTHRU 0xFFFFDFFF +#define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14) +#define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1) +#define C_028A40_COMPUTE_MODE 0xFFFFBFFF +#define S_028A40_FAST_COMPUTE_MODE(x) (((x) & 0x1) << 15) +#define G_028A40_FAST_COMPUTE_MODE(x) (((x) >> 15) & 0x1) +#define C_028A40_FAST_COMPUTE_MODE 0xFFFF7FFF +#define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16) +#define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1) +#define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF +#define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17) +#define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1) +#define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF +#define S_028A40_SUPPRESS_CUTS(x) (((x) & 0x1) << 18) +#define G_028A40_SUPPRESS_CUTS(x) (((x) >> 18) & 0x1) +#define C_028A40_SUPPRESS_CUTS 0xFFFBFFFF +#define S_028A40_ES_WRITE_OPTIMIZE(x) (((x) & 0x1) << 19) +#define G_028A40_ES_WRITE_OPTIMIZE(x) (((x) >> 19) & 0x1) +#define C_028A40_ES_WRITE_OPTIMIZE 0xFFF7FFFF +#define S_028A40_GS_WRITE_OPTIMIZE(x) (((x) & 0x1) << 20) +#define G_028A40_GS_WRITE_OPTIMIZE(x) (((x) >> 20) & 0x1) +#define C_028A40_GS_WRITE_OPTIMIZE 0xFFEFFFFF +#define R_028A48_PA_SC_MODE_CNTL_0 0x028A48 +#define S_028A48_MSAA_ENABLE(x) (((x) & 0x1) << 0) +#define G_028A48_MSAA_ENABLE(x) (((x) >> 0) & 0x1) +#define C_028A48_MSAA_ENABLE 0xFFFFFFFE +#define S_028A48_VPORT_SCISSOR_ENABLE(x) (((x) & 0x1) << 1) +#define G_028A48_VPORT_SCISSOR_ENABLE(x) (((x) >> 1) & 0x1) +#define C_028A48_VPORT_SCISSOR_ENABLE 0xFFFFFFFD +#define S_028A48_LINE_STIPPLE_ENABLE(x) (((x) & 0x1) << 2) +#define G_028A48_LINE_STIPPLE_ENABLE(x) (((x) >> 2) & 0x1) +#define C_028A48_LINE_STIPPLE_ENABLE 0xFFFFFFFB +#define S_028A48_SEND_UNLIT_STILES_TO_PKR(x) (((x) & 0x1) << 3) +#define 
G_028A48_SEND_UNLIT_STILES_TO_PKR(x) (((x) >> 3) & 0x1) +#define C_028A48_SEND_UNLIT_STILES_TO_PKR 0xFFFFFFF7 +#define R_028A4C_PA_SC_MODE_CNTL_1 0x028A4C +#define S_028A4C_WALK_SIZE(x) (((x) & 0x1) << 0) +#define G_028A4C_WALK_SIZE(x) (((x) >> 0) & 0x1) +#define C_028A4C_WALK_SIZE 0xFFFFFFFE +#define S_028A4C_WALK_ALIGNMENT(x) (((x) & 0x1) << 1) +#define G_028A4C_WALK_ALIGNMENT(x) (((x) >> 1) & 0x1) +#define C_028A4C_WALK_ALIGNMENT 0xFFFFFFFD +#define S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(x) (((x) & 0x1) << 2) +#define G_028A4C_WALK_ALIGN8_PRIM_FITS_ST(x) (((x) >> 2) & 0x1) +#define C_028A4C_WALK_ALIGN8_PRIM_FITS_ST 0xFFFFFFFB +#define S_028A4C_WALK_FENCE_ENABLE(x) (((x) & 0x1) << 3) +#define G_028A4C_WALK_FENCE_ENABLE(x) (((x) >> 3) & 0x1) +#define C_028A4C_WALK_FENCE_ENABLE 0xFFFFFFF7 +#define S_028A4C_WALK_FENCE_SIZE(x) (((x) & 0x07) << 4) +#define G_028A4C_WALK_FENCE_SIZE(x) (((x) >> 4) & 0x07) +#define C_028A4C_WALK_FENCE_SIZE 0xFFFFFF8F +#define S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(x) (((x) & 0x1) << 7) +#define G_028A4C_SUPERTILE_WALK_ORDER_ENABLE(x) (((x) >> 7) & 0x1) +#define C_028A4C_SUPERTILE_WALK_ORDER_ENABLE 0xFFFFFF7F +#define S_028A4C_TILE_WALK_ORDER_ENABLE(x) (((x) & 0x1) << 8) +#define G_028A4C_TILE_WALK_ORDER_ENABLE(x) (((x) >> 8) & 0x1) +#define C_028A4C_TILE_WALK_ORDER_ENABLE 0xFFFFFEFF +#define S_028A4C_TILE_COVER_DISABLE(x) (((x) & 0x1) << 9) +#define G_028A4C_TILE_COVER_DISABLE(x) (((x) >> 9) & 0x1) +#define C_028A4C_TILE_COVER_DISABLE 0xFFFFFDFF +#define S_028A4C_TILE_COVER_NO_SCISSOR(x) (((x) & 0x1) << 10) +#define G_028A4C_TILE_COVER_NO_SCISSOR(x) (((x) >> 10) & 0x1) +#define C_028A4C_TILE_COVER_NO_SCISSOR 0xFFFFFBFF +#define S_028A4C_ZMM_LINE_EXTENT(x) (((x) & 0x1) << 11) +#define G_028A4C_ZMM_LINE_EXTENT(x) (((x) >> 11) & 0x1) +#define C_028A4C_ZMM_LINE_EXTENT 0xFFFFF7FF +#define S_028A4C_ZMM_LINE_OFFSET(x) (((x) & 0x1) << 12) +#define G_028A4C_ZMM_LINE_OFFSET(x) (((x) >> 12) & 0x1) +#define C_028A4C_ZMM_LINE_OFFSET 0xFFFFEFFF +#define 
S_028A4C_ZMM_RECT_EXTENT(x) (((x) & 0x1) << 13) +#define G_028A4C_ZMM_RECT_EXTENT(x) (((x) >> 13) & 0x1) +#define C_028A4C_ZMM_RECT_EXTENT 0xFFFFDFFF +#define S_028A4C_KILL_PIX_POST_HI_Z(x) (((x) & 0x1) << 14) +#define G_028A4C_KILL_PIX_POST_HI_Z(x) (((x) >> 14) & 0x1) +#define C_028A4C_KILL_PIX_POST_HI_Z 0xFFFFBFFF +#define S_028A4C_KILL_PIX_POST_DETAIL_MASK(x) (((x) & 0x1) << 15) +#define G_028A4C_KILL_PIX_POST_DETAIL_MASK(x) (((x) >> 15) & 0x1) +#define C_028A4C_KILL_PIX_POST_DETAIL_MASK 0xFFFF7FFF +#define S_028A4C_PS_ITER_SAMPLE(x) (((x) & 0x1) << 16) +#define G_028A4C_PS_ITER_SAMPLE(x) (((x) >> 16) & 0x1) +#define C_028A4C_PS_ITER_SAMPLE 0xFFFEFFFF +#define S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x) (((x) & 0x1) << 17) +#define G_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC(x) (((x) >> 17) & 0x1) +#define C_028A4C_MULTI_SHADER_ENGINE_PRIM_DISC 0xFFFDFFFF +#define S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) & 0x1) << 25) +#define G_028A4C_FORCE_EOV_CNTDWN_ENABLE(x) (((x) >> 25) & 0x1) +#define C_028A4C_FORCE_EOV_CNTDWN_ENABLE 0xFDFFFFFF +#define S_028A4C_FORCE_EOV_REZ_ENABLE(x) (((x) & 0x1) << 26) +#define G_028A4C_FORCE_EOV_REZ_ENABLE(x) (((x) >> 26) & 0x1) +#define C_028A4C_FORCE_EOV_REZ_ENABLE 0xFBFFFFFF +#define S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(x) (((x) & 0x1) << 27) +#define G_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(x) (((x) >> 27) & 0x1) +#define C_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE 0xF7FFFFFF +#define S_028A4C_OUT_OF_ORDER_WATER_MARK(x) (((x) & 0x07) << 28) +#define G_028A4C_OUT_OF_ORDER_WATER_MARK(x) (((x) >> 28) & 0x07) +#define C_028A4C_OUT_OF_ORDER_WATER_MARK 0x8FFFFFFF +#define R_028A50_VGT_ENHANCE 0x028A50 +#define R_028A54_VGT_GS_PER_ES 0x028A54 +#define S_028A54_GS_PER_ES(x) (((x) & 0x7FF) << 0) +#define G_028A54_GS_PER_ES(x) (((x) >> 0) & 0x7FF) +#define C_028A54_GS_PER_ES 0xFFFFF800 +#define R_028A58_VGT_ES_PER_GS 0x028A58 +#define S_028A58_ES_PER_GS(x) (((x) & 0x7FF) << 0) +#define G_028A58_ES_PER_GS(x) (((x) >> 0) & 0x7FF) +#define 
C_028A58_ES_PER_GS 0xFFFFF800 +#define R_028A5C_VGT_GS_PER_VS 0x028A5C +#define S_028A5C_GS_PER_VS(x) (((x) & 0x0F) << 0) +#define G_028A5C_GS_PER_VS(x) (((x) >> 0) & 0x0F) +#define C_028A5C_GS_PER_VS 0xFFFFFFF0 +#define R_028A60_VGT_GSVS_RING_OFFSET_1 0x028A60 +#define S_028A60_OFFSET(x) (((x) & 0x7FFF) << 0) +#define G_028A60_OFFSET(x) (((x) >> 0) & 0x7FFF) +#define C_028A60_OFFSET 0xFFFF8000 +#define R_028A64_VGT_GSVS_RING_OFFSET_2 0x028A64 +#define S_028A64_OFFSET(x) (((x) & 0x7FFF) << 0) +#define G_028A64_OFFSET(x) (((x) >> 0) & 0x7FFF) +#define C_028A64_OFFSET 0xFFFF8000 +#define R_028A68_VGT_GSVS_RING_OFFSET_3 0x028A68 +#define S_028A68_OFFSET(x) (((x) & 0x7FFF) << 0) +#define G_028A68_OFFSET(x) (((x) >> 0) & 0x7FFF) +#define C_028A68_OFFSET 0xFFFF8000 +#define R_028A6C_VGT_GS_OUT_PRIM_TYPE 0x028A6C +#define S_028A6C_OUTPRIM_TYPE(x) (((x) & 0x3F) << 0) +#define G_028A6C_OUTPRIM_TYPE(x) (((x) >> 0) & 0x3F) +#define C_028A6C_OUTPRIM_TYPE 0xFFFFFFC0 +#define S_028A6C_OUTPRIM_TYPE_1(x) (((x) & 0x3F) << 8) +#define G_028A6C_OUTPRIM_TYPE_1(x) (((x) >> 8) & 0x3F) +#define C_028A6C_OUTPRIM_TYPE_1 0xFFFFC0FF +#define S_028A6C_OUTPRIM_TYPE_2(x) (((x) & 0x3F) << 16) +#define G_028A6C_OUTPRIM_TYPE_2(x) (((x) >> 16) & 0x3F) +#define C_028A6C_OUTPRIM_TYPE_2 0xFFC0FFFF +#define S_028A6C_OUTPRIM_TYPE_3(x) (((x) & 0x3F) << 22) +#define G_028A6C_OUTPRIM_TYPE_3(x) (((x) >> 22) & 0x3F) +#define C_028A6C_OUTPRIM_TYPE_3 0xF03FFFFF +#define S_028A6C_UNIQUE_TYPE_PER_STREAM(x) (((x) & 0x1) << 31) +#define G_028A6C_UNIQUE_TYPE_PER_STREAM(x) (((x) >> 31) & 0x1) +#define C_028A6C_UNIQUE_TYPE_PER_STREAM 0x7FFFFFFF +#define R_028A70_IA_ENHANCE 0x028A70 +#define R_028A74_VGT_DMA_SIZE 0x028A74 +#define R_028A78_VGT_DMA_MAX_SIZE 0x028A78 +#define R_028A7C_VGT_DMA_INDEX_TYPE 0x028A7C +#define S_028A7C_INDEX_TYPE(x) (((x) & 0x03) << 0) +#define G_028A7C_INDEX_TYPE(x) (((x) >> 0) & 0x03) +#define C_028A7C_INDEX_TYPE 0xFFFFFFFC +#define V_028A7C_VGT_INDEX_16 0x00 +#define V_028A7C_VGT_INDEX_32 
0x01 +#define S_028A7C_SWAP_MODE(x) (((x) & 0x03) << 2) +#define G_028A7C_SWAP_MODE(x) (((x) >> 2) & 0x03) +#define C_028A7C_SWAP_MODE 0xFFFFFFF3 +#define V_028A7C_VGT_DMA_SWAP_NONE 0x00 +#define V_028A7C_VGT_DMA_SWAP_16_BIT 0x01 +#define V_028A7C_VGT_DMA_SWAP_32_BIT 0x02 +#define V_028A7C_VGT_DMA_SWAP_WORD 0x03 +#define R_028A84_VGT_PRIMITIVEID_EN 0x028A84 +#define S_028A84_PRIMITIVEID_EN(x) (((x) & 0x1) << 0) +#define G_028A84_PRIMITIVEID_EN(x) (((x) >> 0) & 0x1) +#define C_028A84_PRIMITIVEID_EN 0xFFFFFFFE +#define S_028A84_DISABLE_RESET_ON_EOI(x) (((x) & 0x1) << 1) +#define G_028A84_DISABLE_RESET_ON_EOI(x) (((x) >> 1) & 0x1) +#define C_028A84_DISABLE_RESET_ON_EOI 0xFFFFFFFD +#define R_028A88_VGT_DMA_NUM_INSTANCES 0x028A88 +#define R_028A8C_VGT_PRIMITIVEID_RESET 0x028A8C +#define R_028A90_VGT_EVENT_INITIATOR 0x028A90 +#define S_028A90_EVENT_TYPE(x) (((x) & 0x3F) << 0) +#define G_028A90_EVENT_TYPE(x) (((x) >> 0) & 0x3F) +#define C_028A90_EVENT_TYPE 0xFFFFFFC0 +#define V_028A90_SAMPLE_STREAMOUTSTATS1 0x01 +#define V_028A90_SAMPLE_STREAMOUTSTATS2 0x02 +#define V_028A90_SAMPLE_STREAMOUTSTATS3 0x03 +#define V_028A90_CACHE_FLUSH_TS 0x04 +#define V_028A90_CONTEXT_DONE 0x05 +#define V_028A90_CACHE_FLUSH 0x06 +#define V_028A90_CS_PARTIAL_FLUSH 0x07 +#define V_028A90_VGT_STREAMOUT_SYNC 0x08 +#define V_028A90_VGT_STREAMOUT_RESET 0x0A +#define V_028A90_END_OF_PIPE_INCR_DE 0x0B +#define V_028A90_END_OF_PIPE_IB_END 0x0C +#define V_028A90_RST_PIX_CNT 0x0D +#define V_028A90_VS_PARTIAL_FLUSH 0x0F +#define V_028A90_PS_PARTIAL_FLUSH 0x10 +#define V_028A90_FLUSH_HS_OUTPUT 0x11 +#define V_028A90_FLUSH_LS_OUTPUT 0x12 +#define V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT 0x14 +#define V_028A90_ZPASS_DONE 0x15 +#define V_028A90_CACHE_FLUSH_AND_INV_EVENT 0x16 +#define V_028A90_PERFCOUNTER_START 0x17 +#define V_028A90_PERFCOUNTER_STOP 0x18 +#define V_028A90_PIPELINESTAT_START 0x19 +#define V_028A90_PIPELINESTAT_STOP 0x1A +#define V_028A90_PERFCOUNTER_SAMPLE 0x1B +#define V_028A90_FLUSH_ES_OUTPUT 
0x1C +#define V_028A90_FLUSH_GS_OUTPUT 0x1D +#define V_028A90_SAMPLE_PIPELINESTAT 0x1E +#define V_028A90_SO_VGTSTREAMOUT_FLUSH 0x1F +#define V_028A90_SAMPLE_STREAMOUTSTATS 0x20 +#define V_028A90_RESET_VTX_CNT 0x21 +#define V_028A90_BLOCK_CONTEXT_DONE 0x22 +#define V_028A90_CS_CONTEXT_DONE 0x23 +#define V_028A90_VGT_FLUSH 0x24 +#define V_028A90_SC_SEND_DB_VPZ 0x27 +#define V_028A90_BOTTOM_OF_PIPE_TS 0x28 +#define V_028A90_DB_CACHE_FLUSH_AND_INV 0x2A +#define V_028A90_FLUSH_AND_INV_DB_DATA_TS 0x2B +#define V_028A90_FLUSH_AND_INV_DB_META 0x2C +#define V_028A90_FLUSH_AND_INV_CB_DATA_TS 0x2D +#define V_028A90_FLUSH_AND_INV_CB_META 0x2E +#define V_028A90_CS_DONE 0x2F +#define V_028A90_PS_DONE 0x30 +#define V_028A90_FLUSH_AND_INV_CB_PIXEL_DATA 0x31 +#define V_028A90_THREAD_TRACE_START 0x33 +#define V_028A90_THREAD_TRACE_STOP 0x34 +#define V_028A90_THREAD_TRACE_MARKER 0x35 +#define V_028A90_THREAD_TRACE_FLUSH 0x36 +#define V_028A90_THREAD_TRACE_FINISH 0x37 +#define S_028A90_ADDRESS_HI(x) (((x) & 0x1FF) << 18) +#define G_028A90_ADDRESS_HI(x) (((x) >> 18) & 0x1FF) +#define C_028A90_ADDRESS_HI 0xF803FFFF +#define S_028A90_EXTENDED_EVENT(x) (((x) & 0x1) << 27) +#define G_028A90_EXTENDED_EVENT(x) (((x) >> 27) & 0x1) +#define C_028A90_EXTENDED_EVENT 0xF7FFFFFF +#define R_028A94_VGT_MULTI_PRIM_IB_RESET_EN 0x028A94 +#define S_028A94_RESET_EN(x) (((x) & 0x1) << 0) +#define G_028A94_RESET_EN(x) (((x) >> 0) & 0x1) +#define C_028A94_RESET_EN 0xFFFFFFFE +#define R_028AA0_VGT_INSTANCE_STEP_RATE_0 0x028AA0 +#define R_028AA4_VGT_INSTANCE_STEP_RATE_1 0x028AA4 +#define R_028AA8_IA_MULTI_VGT_PARAM 0x028AA8 +#define S_028AA8_PRIMGROUP_SIZE(x) (((x) & 0xFFFF) << 0) +#define G_028AA8_PRIMGROUP_SIZE(x) (((x) >> 0) & 0xFFFF) +#define C_028AA8_PRIMGROUP_SIZE 0xFFFF0000 +#define S_028AA8_PARTIAL_VS_WAVE_ON(x) (((x) & 0x1) << 16) +#define G_028AA8_PARTIAL_VS_WAVE_ON(x) (((x) >> 16) & 0x1) +#define C_028AA8_PARTIAL_VS_WAVE_ON 0xFFFEFFFF +#define S_028AA8_SWITCH_ON_EOP(x) (((x) & 0x1) << 17) +#define 
G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1) +#define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF +#define S_028AA8_PARTIAL_ES_WAVE_ON(x) (((x) & 0x1) << 18) +#define G_028AA8_PARTIAL_ES_WAVE_ON(x) (((x) >> 18) & 0x1) +#define C_028AA8_PARTIAL_ES_WAVE_ON 0xFFFBFFFF +#define S_028AA8_SWITCH_ON_EOI(x) (((x) & 0x1) << 19) +#define G_028AA8_SWITCH_ON_EOI(x) (((x) >> 19) & 0x1) +#define C_028AA8_SWITCH_ON_EOI 0xFFF7FFFF +#define R_028AAC_VGT_ESGS_RING_ITEMSIZE 0x028AAC +#define S_028AAC_ITEMSIZE(x) (((x) & 0x7FFF) << 0) +#define G_028AAC_ITEMSIZE(x) (((x) >> 0) & 0x7FFF) +#define C_028AAC_ITEMSIZE 0xFFFF8000 +#define R_028AB0_VGT_GSVS_RING_ITEMSIZE 0x028AB0 +#define S_028AB0_ITEMSIZE(x) (((x) & 0x7FFF) << 0) +#define G_028AB0_ITEMSIZE(x) (((x) >> 0) & 0x7FFF) +#define C_028AB0_ITEMSIZE 0xFFFF8000 +#define R_028AB4_VGT_REUSE_OFF 0x028AB4 +#define S_028AB4_REUSE_OFF(x) (((x) & 0x1) << 0) +#define G_028AB4_REUSE_OFF(x) (((x) >> 0) & 0x1) +#define C_028AB4_REUSE_OFF 0xFFFFFFFE +#define R_028AB8_VGT_VTX_CNT_EN 0x028AB8 +#define S_028AB8_VTX_CNT_EN(x) (((x) & 0x1) << 0) +#define G_028AB8_VTX_CNT_EN(x) (((x) >> 0) & 0x1) +#define C_028AB8_VTX_CNT_EN 0xFFFFFFFE +#define R_028ABC_DB_HTILE_SURFACE 0x028ABC +#define S_028ABC_LINEAR(x) (((x) & 0x1) << 0) +#define G_028ABC_LINEAR(x) (((x) >> 0) & 0x1) +#define C_028ABC_LINEAR 0xFFFFFFFE +#define S_028ABC_FULL_CACHE(x) (((x) & 0x1) << 1) +#define G_028ABC_FULL_CACHE(x) (((x) >> 1) & 0x1) +#define C_028ABC_FULL_CACHE 0xFFFFFFFD +#define S_028ABC_HTILE_USES_PRELOAD_WIN(x) (((x) & 0x1) << 2) +#define G_028ABC_HTILE_USES_PRELOAD_WIN(x) (((x) >> 2) & 0x1) +#define C_028ABC_HTILE_USES_PRELOAD_WIN 0xFFFFFFFB +#define S_028ABC_PRELOAD(x) (((x) & 0x1) << 3) +#define G_028ABC_PRELOAD(x) (((x) >> 3) & 0x1) +#define C_028ABC_PRELOAD 0xFFFFFFF7 +#define S_028ABC_PREFETCH_WIDTH(x) (((x) & 0x3F) << 4) +#define G_028ABC_PREFETCH_WIDTH(x) (((x) >> 4) & 0x3F) +#define C_028ABC_PREFETCH_WIDTH 0xFFFFFC0F +#define S_028ABC_PREFETCH_HEIGHT(x) (((x) & 0x3F) << 10) 
+#define G_028ABC_PREFETCH_HEIGHT(x) (((x) >> 10) & 0x3F) +#define C_028ABC_PREFETCH_HEIGHT 0xFFFF03FF +#define S_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x) (((x) & 0x1) << 16) +#define G_028ABC_DST_OUTSIDE_ZERO_TO_ONE(x) (((x) >> 16) & 0x1) +#define C_028ABC_DST_OUTSIDE_ZERO_TO_ONE 0xFFFEFFFF +#define R_028AC0_DB_SRESULTS_COMPARE_STATE0 0x028AC0 +#define S_028AC0_COMPAREFUNC0(x) (((x) & 0x07) << 0) +#define G_028AC0_COMPAREFUNC0(x) (((x) >> 0) & 0x07) +#define C_028AC0_COMPAREFUNC0 0xFFFFFFF8 +#define V_028AC0_REF_NEVER 0x00 +#define V_028AC0_REF_LESS 0x01 +#define V_028AC0_REF_EQUAL 0x02 +#define V_028AC0_REF_LEQUAL 0x03 +#define V_028AC0_REF_GREATER 0x04 +#define V_028AC0_REF_NOTEQUAL 0x05 +#define V_028AC0_REF_GEQUAL 0x06 +#define V_028AC0_REF_ALWAYS 0x07 +#define S_028AC0_COMPAREVALUE0(x) (((x) & 0xFF) << 4) +#define G_028AC0_COMPAREVALUE0(x) (((x) >> 4) & 0xFF) +#define C_028AC0_COMPAREVALUE0 0xFFFFF00F +#define S_028AC0_COMPAREMASK0(x) (((x) & 0xFF) << 12) +#define G_028AC0_COMPAREMASK0(x) (((x) >> 12) & 0xFF) +#define C_028AC0_COMPAREMASK0 0xFFF00FFF +#define S_028AC0_ENABLE0(x) (((x) & 0x1) << 24) +#define G_028AC0_ENABLE0(x) (((x) >> 24) & 0x1) +#define C_028AC0_ENABLE0 0xFEFFFFFF +#define R_028AC4_DB_SRESULTS_COMPARE_STATE1 0x028AC4 +#define S_028AC4_COMPAREFUNC1(x) (((x) & 0x07) << 0) +#define G_028AC4_COMPAREFUNC1(x) (((x) >> 0) & 0x07) +#define C_028AC4_COMPAREFUNC1 0xFFFFFFF8 +#define V_028AC4_REF_NEVER 0x00 +#define V_028AC4_REF_LESS 0x01 +#define V_028AC4_REF_EQUAL 0x02 +#define V_028AC4_REF_LEQUAL 0x03 +#define V_028AC4_REF_GREATER 0x04 +#define V_028AC4_REF_NOTEQUAL 0x05 +#define V_028AC4_REF_GEQUAL 0x06 +#define V_028AC4_REF_ALWAYS 0x07 +#define S_028AC4_COMPAREVALUE1(x) (((x) & 0xFF) << 4) +#define G_028AC4_COMPAREVALUE1(x) (((x) >> 4) & 0xFF) +#define C_028AC4_COMPAREVALUE1 0xFFFFF00F +#define S_028AC4_COMPAREMASK1(x) (((x) & 0xFF) << 12) +#define G_028AC4_COMPAREMASK1(x) (((x) >> 12) & 0xFF) +#define C_028AC4_COMPAREMASK1 0xFFF00FFF +#define 
S_028AC4_ENABLE1(x) (((x) & 0x1) << 24) +#define G_028AC4_ENABLE1(x) (((x) >> 24) & 0x1) +#define C_028AC4_ENABLE1 0xFEFFFFFF +#define R_028AC8_DB_PRELOAD_CONTROL 0x028AC8 +#define S_028AC8_START_X(x) (((x) & 0xFF) << 0) +#define G_028AC8_START_X(x) (((x) >> 0) & 0xFF) +#define C_028AC8_START_X 0xFFFFFF00 +#define S_028AC8_START_Y(x) (((x) & 0xFF) << 8) +#define G_028AC8_START_Y(x) (((x) >> 8) & 0xFF) +#define C_028AC8_START_Y 0xFFFF00FF +#define S_028AC8_MAX_X(x) (((x) & 0xFF) << 16) +#define G_028AC8_MAX_X(x) (((x) >> 16) & 0xFF) +#define C_028AC8_MAX_X 0xFF00FFFF +#define S_028AC8_MAX_Y(x) (((x) & 0xFF) << 24) +#define G_028AC8_MAX_Y(x) (((x) >> 24) & 0xFF) +#define C_028AC8_MAX_Y 0x00FFFFFF +#define R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 0x028AD0 +#define R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 0x028AD4 +#define S_028AD4_STRIDE(x) (((x) & 0x3FF) << 0) +#define G_028AD4_STRIDE(x) (((x) >> 0) & 0x3FF) +#define C_028AD4_STRIDE 0xFFFFFC00 +#define R_028ADC_VGT_STRMOUT_BUFFER_OFFSET_0 0x028ADC +#define R_028AE0_VGT_STRMOUT_BUFFER_SIZE_1 0x028AE0 +#define R_028AE4_VGT_STRMOUT_VTX_STRIDE_1 0x028AE4 +#define S_028AE4_STRIDE(x) (((x) & 0x3FF) << 0) +#define G_028AE4_STRIDE(x) (((x) >> 0) & 0x3FF) +#define C_028AE4_STRIDE 0xFFFFFC00 +#define R_028AEC_VGT_STRMOUT_BUFFER_OFFSET_1 0x028AEC +#define R_028AF0_VGT_STRMOUT_BUFFER_SIZE_2 0x028AF0 +#define R_028AF4_VGT_STRMOUT_VTX_STRIDE_2 0x028AF4 +#define S_028AF4_STRIDE(x) (((x) & 0x3FF) << 0) +#define G_028AF4_STRIDE(x) (((x) >> 0) & 0x3FF) +#define C_028AF4_STRIDE 0xFFFFFC00 +#define R_028AFC_VGT_STRMOUT_BUFFER_OFFSET_2 0x028AFC +#define R_028B00_VGT_STRMOUT_BUFFER_SIZE_3 0x028B00 +#define R_028B04_VGT_STRMOUT_VTX_STRIDE_3 0x028B04 +#define S_028B04_STRIDE(x) (((x) & 0x3FF) << 0) +#define G_028B04_STRIDE(x) (((x) >> 0) & 0x3FF) +#define C_028B04_STRIDE 0xFFFFFC00 +#define R_028B0C_VGT_STRMOUT_BUFFER_OFFSET_3 0x028B0C +#define R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET 0x028B28 +#define R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE 
0x028B2C +#define R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE 0x028B30 +#define S_028B30_VERTEX_STRIDE(x) (((x) & 0x1FF) << 0) +#define G_028B30_VERTEX_STRIDE(x) (((x) >> 0) & 0x1FF) +#define C_028B30_VERTEX_STRIDE 0xFFFFFE00 +#define R_028B38_VGT_GS_MAX_VERT_OUT 0x028B38 +#define S_028B38_MAX_VERT_OUT(x) (((x) & 0x7FF) << 0) +#define G_028B38_MAX_VERT_OUT(x) (((x) >> 0) & 0x7FF) +#define C_028B38_MAX_VERT_OUT 0xFFFFF800 +#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54 +#define S_028B54_LS_EN(x) (((x) & 0x03) << 0) +#define G_028B54_LS_EN(x) (((x) >> 0) & 0x03) +#define C_028B54_LS_EN 0xFFFFFFFC +#define V_028B54_LS_STAGE_OFF 0x00 +#define V_028B54_LS_STAGE_ON 0x01 +#define V_028B54_CS_STAGE_ON 0x02 +#define S_028B54_HS_EN(x) (((x) & 0x1) << 2) +#define G_028B54_HS_EN(x) (((x) >> 2) & 0x1) +#define C_028B54_HS_EN 0xFFFFFFFB +#define S_028B54_ES_EN(x) (((x) & 0x03) << 3) +#define G_028B54_ES_EN(x) (((x) >> 3) & 0x03) +#define C_028B54_ES_EN 0xFFFFFFE7 +#define V_028B54_ES_STAGE_OFF 0x00 +#define V_028B54_ES_STAGE_DS 0x01 +#define V_028B54_ES_STAGE_REAL 0x02 +#define S_028B54_GS_EN(x) (((x) & 0x1) << 5) +#define G_028B54_GS_EN(x) (((x) >> 5) & 0x1) +#define C_028B54_GS_EN 0xFFFFFFDF +#define S_028B54_VS_EN(x) (((x) & 0x03) << 6) +#define G_028B54_VS_EN(x) (((x) >> 6) & 0x03) +#define C_028B54_VS_EN 0xFFFFFF3F +#define V_028B54_VS_STAGE_REAL 0x00 +#define V_028B54_VS_STAGE_DS 0x01 +#define V_028B54_VS_STAGE_COPY_SHADER 0x02 +#define S_028B54_DYNAMIC_HS(x) (((x) & 0x1) << 8) +#define G_028B54_DYNAMIC_HS(x) (((x) >> 8) & 0x1) +#define C_028B54_DYNAMIC_HS 0xFFFFFEFF +#define R_028B58_VGT_LS_HS_CONFIG 0x028B58 +#define S_028B58_NUM_PATCHES(x) (((x) & 0xFF) << 0) +#define G_028B58_NUM_PATCHES(x) (((x) >> 0) & 0xFF) +#define C_028B58_NUM_PATCHES 0xFFFFFF00 +#define S_028B58_HS_NUM_INPUT_CP(x) (((x) & 0x3F) << 8) +#define G_028B58_HS_NUM_INPUT_CP(x) (((x) >> 8) & 0x3F) +#define C_028B58_HS_NUM_INPUT_CP 0xFFFFC0FF +#define S_028B58_HS_NUM_OUTPUT_CP(x) (((x) & 0x3F) << 14) 
+#define G_028B58_HS_NUM_OUTPUT_CP(x) (((x) >> 14) & 0x3F) +#define C_028B58_HS_NUM_OUTPUT_CP 0xFFF03FFF +#define R_028B5C_VGT_GS_VERT_ITEMSIZE 0x028B5C +#define S_028B5C_ITEMSIZE(x) (((x) & 0x7FFF) << 0) +#define G_028B5C_ITEMSIZE(x) (((x) >> 0) & 0x7FFF) +#define C_028B5C_ITEMSIZE 0xFFFF8000 +#define R_028B60_VGT_GS_VERT_ITEMSIZE_1 0x028B60 +#define S_028B60_ITEMSIZE(x) (((x) & 0x7FFF) << 0) +#define G_028B60_ITEMSIZE(x) (((x) >> 0) & 0x7FFF) +#define C_028B60_ITEMSIZE 0xFFFF8000 +#define R_028B64_VGT_GS_VERT_ITEMSIZE_2 0x028B64 +#define S_028B64_ITEMSIZE(x) (((x) & 0x7FFF) << 0) +#define G_028B64_ITEMSIZE(x) (((x) >> 0) & 0x7FFF) +#define C_028B64_ITEMSIZE 0xFFFF8000 +#define R_028B68_VGT_GS_VERT_ITEMSIZE_3 0x028B68 +#define S_028B68_ITEMSIZE(x) (((x) & 0x7FFF) << 0) +#define G_028B68_ITEMSIZE(x) (((x) >> 0) & 0x7FFF) +#define C_028B68_ITEMSIZE 0xFFFF8000 +#define R_028B6C_VGT_TF_PARAM 0x028B6C +#define S_028B6C_TYPE(x) (((x) & 0x03) << 0) +#define G_028B6C_TYPE(x) (((x) >> 0) & 0x03) +#define C_028B6C_TYPE 0xFFFFFFFC +#define V_028B6C_TESS_ISOLINE 0x00 +#define V_028B6C_TESS_TRIANGLE 0x01 +#define V_028B6C_TESS_QUAD 0x02 +#define S_028B6C_PARTITIONING(x) (((x) & 0x07) << 2) +#define G_028B6C_PARTITIONING(x) (((x) >> 2) & 0x07) +#define C_028B6C_PARTITIONING 0xFFFFFFE3 +#define V_028B6C_PART_INTEGER 0x00 +#define V_028B6C_PART_POW2 0x01 +#define V_028B6C_PART_FRAC_ODD 0x02 +#define V_028B6C_PART_FRAC_EVEN 0x03 +#define S_028B6C_TOPOLOGY(x) (((x) & 0x07) << 5) +#define G_028B6C_TOPOLOGY(x) (((x) >> 5) & 0x07) +#define C_028B6C_TOPOLOGY 0xFFFFFF1F +#define V_028B6C_OUTPUT_POINT 0x00 +#define V_028B6C_OUTPUT_LINE 0x01 +#define V_028B6C_OUTPUT_TRIANGLE_CW 0x02 +#define V_028B6C_OUTPUT_TRIANGLE_CCW 0x03 +#define S_028B6C_RESERVED_REDUC_AXIS(x) (((x) & 0x1) << 8) +#define G_028B6C_RESERVED_REDUC_AXIS(x) (((x) >> 8) & 0x1) +#define C_028B6C_RESERVED_REDUC_AXIS 0xFFFFFEFF +#define S_028B6C_NUM_DS_WAVES_PER_SIMD(x) (((x) & 0x0F) << 10) +#define 
G_028B6C_NUM_DS_WAVES_PER_SIMD(x) (((x) >> 10) & 0x0F) +#define C_028B6C_NUM_DS_WAVES_PER_SIMD 0xFFFFC3FF +#define S_028B6C_DISABLE_DONUTS(x) (((x) & 0x1) << 14) +#define G_028B6C_DISABLE_DONUTS(x) (((x) >> 14) & 0x1) +#define C_028B6C_DISABLE_DONUTS 0xFFFFBFFF +#define R_028B70_DB_ALPHA_TO_MASK 0x028B70 +#define S_028B70_ALPHA_TO_MASK_ENABLE(x) (((x) & 0x1) << 0) +#define G_028B70_ALPHA_TO_MASK_ENABLE(x) (((x) >> 0) & 0x1) +#define C_028B70_ALPHA_TO_MASK_ENABLE 0xFFFFFFFE +#define S_028B70_ALPHA_TO_MASK_OFFSET0(x) (((x) & 0x03) << 8) +#define G_028B70_ALPHA_TO_MASK_OFFSET0(x) (((x) >> 8) & 0x03) +#define C_028B70_ALPHA_TO_MASK_OFFSET0 0xFFFFFCFF +#define S_028B70_ALPHA_TO_MASK_OFFSET1(x) (((x) & 0x03) << 10) +#define G_028B70_ALPHA_TO_MASK_OFFSET1(x) (((x) >> 10) & 0x03) +#define C_028B70_ALPHA_TO_MASK_OFFSET1 0xFFFFF3FF +#define S_028B70_ALPHA_TO_MASK_OFFSET2(x) (((x) & 0x03) << 12) +#define G_028B70_ALPHA_TO_MASK_OFFSET2(x) (((x) >> 12) & 0x03) +#define C_028B70_ALPHA_TO_MASK_OFFSET2 0xFFFFCFFF +#define S_028B70_ALPHA_TO_MASK_OFFSET3(x) (((x) & 0x03) << 14) +#define G_028B70_ALPHA_TO_MASK_OFFSET3(x) (((x) >> 14) & 0x03) +#define C_028B70_ALPHA_TO_MASK_OFFSET3 0xFFFF3FFF +#define S_028B70_OFFSET_ROUND(x) (((x) & 0x1) << 16) +#define G_028B70_OFFSET_ROUND(x) (((x) >> 16) & 0x1) +#define C_028B70_OFFSET_ROUND 0xFFFEFFFF +#define R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL 0x028B78 +#define S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(x) (((x) & 0xFF) << 0) +#define G_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(x) (((x) >> 0) & 0xFF) +#define C_028B78_POLY_OFFSET_NEG_NUM_DB_BITS 0xFFFFFF00 +#define S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(x) (((x) & 0x1) << 8) +#define G_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(x) (((x) >> 8) & 0x1) +#define C_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT 0xFFFFFEFF +#define R_028B7C_PA_SU_POLY_OFFSET_CLAMP 0x028B7C +#define R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE 0x028B80 +#define R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET 0x028B84 +#define R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE 
0x028B88 +#define R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET 0x028B8C +#define R_028B90_VGT_GS_INSTANCE_CNT 0x028B90 +#define S_028B90_ENABLE(x) (((x) & 0x1) << 0) +#define G_028B90_ENABLE(x) (((x) >> 0) & 0x1) +#define C_028B90_ENABLE 0xFFFFFFFE +#define S_028B90_CNT(x) (((x) & 0x7F) << 2) +#define G_028B90_CNT(x) (((x) >> 2) & 0x7F) +#define C_028B90_CNT 0xFFFFFE03 +#define R_028B94_VGT_STRMOUT_CONFIG 0x028B94 +#define S_028B94_STREAMOUT_0_EN(x) (((x) & 0x1) << 0) +#define G_028B94_STREAMOUT_0_EN(x) (((x) >> 0) & 0x1) +#define C_028B94_STREAMOUT_0_EN 0xFFFFFFFE +#define S_028B94_STREAMOUT_1_EN(x) (((x) & 0x1) << 1) +#define G_028B94_STREAMOUT_1_EN(x) (((x) >> 1) & 0x1) +#define C_028B94_STREAMOUT_1_EN 0xFFFFFFFD +#define S_028B94_STREAMOUT_2_EN(x) (((x) & 0x1) << 2) +#define G_028B94_STREAMOUT_2_EN(x) (((x) >> 2) & 0x1) +#define C_028B94_STREAMOUT_2_EN 0xFFFFFFFB +#define S_028B94_STREAMOUT_3_EN(x) (((x) & 0x1) << 3) +#define G_028B94_STREAMOUT_3_EN(x) (((x) >> 3) & 0x1) +#define C_028B94_STREAMOUT_3_EN 0xFFFFFFF7 +#define S_028B94_RAST_STREAM(x) (((x) & 0x07) << 4) +#define G_028B94_RAST_STREAM(x) (((x) >> 4) & 0x07) +#define C_028B94_RAST_STREAM 0xFFFFFF8F +#define S_028B94_RAST_STREAM_MASK(x) (((x) & 0x0F) << 8) +#define G_028B94_RAST_STREAM_MASK(x) (((x) >> 8) & 0x0F) +#define C_028B94_RAST_STREAM_MASK 0xFFFFF0FF +#define S_028B94_USE_RAST_STREAM_MASK(x) (((x) & 0x1) << 31) +#define G_028B94_USE_RAST_STREAM_MASK(x) (((x) >> 31) & 0x1) +#define C_028B94_USE_RAST_STREAM_MASK 0x7FFFFFFF +#define R_028B98_VGT_STRMOUT_BUFFER_CONFIG 0x028B98 +#define S_028B98_STREAM_0_BUFFER_EN(x) (((x) & 0x0F) << 0) +#define G_028B98_STREAM_0_BUFFER_EN(x) (((x) >> 0) & 0x0F) +#define C_028B98_STREAM_0_BUFFER_EN 0xFFFFFFF0 +#define S_028B98_STREAM_1_BUFFER_EN(x) (((x) & 0x0F) << 4) +#define G_028B98_STREAM_1_BUFFER_EN(x) (((x) >> 4) & 0x0F) +#define C_028B98_STREAM_1_BUFFER_EN 0xFFFFFF0F +#define S_028B98_STREAM_2_BUFFER_EN(x) (((x) & 0x0F) << 8) +#define G_028B98_STREAM_2_BUFFER_EN(x) 
(((x) >> 8) & 0x0F) +#define C_028B98_STREAM_2_BUFFER_EN 0xFFFFF0FF +#define S_028B98_STREAM_3_BUFFER_EN(x) (((x) & 0x0F) << 12) +#define G_028B98_STREAM_3_BUFFER_EN(x) (((x) >> 12) & 0x0F) +#define C_028B98_STREAM_3_BUFFER_EN 0xFFFF0FFF +#define R_028BD4_PA_SC_CENTROID_PRIORITY_0 0x028BD4 +#define S_028BD4_DISTANCE_0(x) (((x) & 0x0F) << 0) +#define G_028BD4_DISTANCE_0(x) (((x) >> 0) & 0x0F) +#define C_028BD4_DISTANCE_0 0xFFFFFFF0 +#define S_028BD4_DISTANCE_1(x) (((x) & 0x0F) << 4) +#define G_028BD4_DISTANCE_1(x) (((x) >> 4) & 0x0F) +#define C_028BD4_DISTANCE_1 0xFFFFFF0F +#define S_028BD4_DISTANCE_2(x) (((x) & 0x0F) << 8) +#define G_028BD4_DISTANCE_2(x) (((x) >> 8) & 0x0F) +#define C_028BD4_DISTANCE_2 0xFFFFF0FF +#define S_028BD4_DISTANCE_3(x) (((x) & 0x0F) << 12) +#define G_028BD4_DISTANCE_3(x) (((x) >> 12) & 0x0F) +#define C_028BD4_DISTANCE_3 0xFFFF0FFF +#define S_028BD4_DISTANCE_4(x) (((x) & 0x0F) << 16) +#define G_028BD4_DISTANCE_4(x) (((x) >> 16) & 0x0F) +#define C_028BD4_DISTANCE_4 0xFFF0FFFF +#define S_028BD4_DISTANCE_5(x) (((x) & 0x0F) << 20) +#define G_028BD4_DISTANCE_5(x) (((x) >> 20) & 0x0F) +#define C_028BD4_DISTANCE_5 0xFF0FFFFF +#define S_028BD4_DISTANCE_6(x) (((x) & 0x0F) << 24) +#define G_028BD4_DISTANCE_6(x) (((x) >> 24) & 0x0F) +#define C_028BD4_DISTANCE_6 0xF0FFFFFF +#define S_028BD4_DISTANCE_7(x) (((x) & 0x0F) << 28) +#define G_028BD4_DISTANCE_7(x) (((x) >> 28) & 0x0F) +#define C_028BD4_DISTANCE_7 0x0FFFFFFF +#define R_028BD8_PA_SC_CENTROID_PRIORITY_1 0x028BD8 +#define S_028BD8_DISTANCE_8(x) (((x) & 0x0F) << 0) +#define G_028BD8_DISTANCE_8(x) (((x) >> 0) & 0x0F) +#define C_028BD8_DISTANCE_8 0xFFFFFFF0 +#define S_028BD8_DISTANCE_9(x) (((x) & 0x0F) << 4) +#define G_028BD8_DISTANCE_9(x) (((x) >> 4) & 0x0F) +#define C_028BD8_DISTANCE_9 0xFFFFFF0F +#define S_028BD8_DISTANCE_10(x) (((x) & 0x0F) << 8) +#define G_028BD8_DISTANCE_10(x) (((x) >> 8) & 0x0F) +#define C_028BD8_DISTANCE_10 0xFFFFF0FF +#define S_028BD8_DISTANCE_11(x) (((x) & 0x0F) << 12) 
+#define G_028BD8_DISTANCE_11(x) (((x) >> 12) & 0x0F) +#define C_028BD8_DISTANCE_11 0xFFFF0FFF +#define S_028BD8_DISTANCE_12(x) (((x) & 0x0F) << 16) +#define G_028BD8_DISTANCE_12(x) (((x) >> 16) & 0x0F) +#define C_028BD8_DISTANCE_12 0xFFF0FFFF +#define S_028BD8_DISTANCE_13(x) (((x) & 0x0F) << 20) +#define G_028BD8_DISTANCE_13(x) (((x) >> 20) & 0x0F) +#define C_028BD8_DISTANCE_13 0xFF0FFFFF +#define S_028BD8_DISTANCE_14(x) (((x) & 0x0F) << 24) +#define G_028BD8_DISTANCE_14(x) (((x) >> 24) & 0x0F) +#define C_028BD8_DISTANCE_14 0xF0FFFFFF +#define S_028BD8_DISTANCE_15(x) (((x) & 0x0F) << 28) +#define G_028BD8_DISTANCE_15(x) (((x) >> 28) & 0x0F) +#define C_028BD8_DISTANCE_15 0x0FFFFFFF +#define R_028BDC_PA_SC_LINE_CNTL 0x028BDC +#define S_028BDC_EXPAND_LINE_WIDTH(x) (((x) & 0x1) << 9) +#define G_028BDC_EXPAND_LINE_WIDTH(x) (((x) >> 9) & 0x1) +#define C_028BDC_EXPAND_LINE_WIDTH 0xFFFFFDFF +#define S_028BDC_LAST_PIXEL(x) (((x) & 0x1) << 10) +#define G_028BDC_LAST_PIXEL(x) (((x) >> 10) & 0x1) +#define C_028BDC_LAST_PIXEL 0xFFFFFBFF +#define S_028BDC_PERPENDICULAR_ENDCAP_ENA(x) (((x) & 0x1) << 11) +#define G_028BDC_PERPENDICULAR_ENDCAP_ENA(x) (((x) >> 11) & 0x1) +#define C_028BDC_PERPENDICULAR_ENDCAP_ENA 0xFFFFF7FF +#define S_028BDC_DX10_DIAMOND_TEST_ENA(x) (((x) & 0x1) << 12) +#define G_028BDC_DX10_DIAMOND_TEST_ENA(x) (((x) >> 12) & 0x1) +#define C_028BDC_DX10_DIAMOND_TEST_ENA 0xFFFFEFFF +#define R_028BE0_PA_SC_AA_CONFIG 0x028BE0 +#define S_028BE0_MSAA_NUM_SAMPLES(x) (((x) & 0x07) << 0) +#define G_028BE0_MSAA_NUM_SAMPLES(x) (((x) >> 0) & 0x07) +#define C_028BE0_MSAA_NUM_SAMPLES 0xFFFFFFF8 +#define S_028BE0_AA_MASK_CENTROID_DTMN(x) (((x) & 0x1) << 4) +#define G_028BE0_AA_MASK_CENTROID_DTMN(x) (((x) >> 4) & 0x1) +#define C_028BE0_AA_MASK_CENTROID_DTMN 0xFFFFFFEF +#define S_028BE0_MAX_SAMPLE_DIST(x) (((x) & 0x0F) << 13) +#define G_028BE0_MAX_SAMPLE_DIST(x) (((x) >> 13) & 0x0F) +#define C_028BE0_MAX_SAMPLE_DIST 0xFFFE1FFF +#define S_028BE0_MSAA_EXPOSED_SAMPLES(x) (((x) & 
0x07) << 20) +#define G_028BE0_MSAA_EXPOSED_SAMPLES(x) (((x) >> 20) & 0x07) +#define C_028BE0_MSAA_EXPOSED_SAMPLES 0xFF8FFFFF +#define S_028BE0_DETAIL_TO_EXPOSED_MODE(x) (((x) & 0x03) << 24) +#define G_028BE0_DETAIL_TO_EXPOSED_MODE(x) (((x) >> 24) & 0x03) +#define C_028BE0_DETAIL_TO_EXPOSED_MODE 0xFCFFFFFF +#define R_028BE4_PA_SU_VTX_CNTL 0x028BE4 +#define S_028BE4_PIX_CENTER(x) (((x) & 0x1) << 0) +#define G_028BE4_PIX_CENTER(x) (((x) >> 0) & 0x1) +#define C_028BE4_PIX_CENTER 0xFFFFFFFE +#define S_028BE4_ROUND_MODE(x) (((x) & 0x03) << 1) +#define G_028BE4_ROUND_MODE(x) (((x) >> 1) & 0x03) +#define C_028BE4_ROUND_MODE 0xFFFFFFF9 +#define V_028BE4_X_TRUNCATE 0x00 +#define V_028BE4_X_ROUND 0x01 +#define V_028BE4_X_ROUND_TO_EVEN 0x02 +#define V_028BE4_X_ROUND_TO_ODD 0x03 +#define S_028BE4_QUANT_MODE(x) (((x) & 0x07) << 3) +#define G_028BE4_QUANT_MODE(x) (((x) >> 3) & 0x07) +#define C_028BE4_QUANT_MODE 0xFFFFFFC7 +#define V_028BE4_X_16_8_FIXED_POINT_1_16TH 0x00 +#define V_028BE4_X_16_8_FIXED_POINT_1_8TH 0x01 +#define V_028BE4_X_16_8_FIXED_POINT_1_4TH 0x02 +#define V_028BE4_X_16_8_FIXED_POINT_1_2 0x03 +#define V_028BE4_X_16_8_FIXED_POINT_1 0x04 +#define V_028BE4_X_16_8_FIXED_POINT_1_256TH 0x05 +#define V_028BE4_X_14_10_FIXED_POINT_1_1024TH 0x06 +#define V_028BE4_X_12_12_FIXED_POINT_1_4096TH 0x07 +#define R_028BE8_PA_CL_GB_VERT_CLIP_ADJ 0x028BE8 +#define R_028BEC_PA_CL_GB_VERT_DISC_ADJ 0x028BEC +#define R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ 0x028BF0 +#define R_028BF4_PA_CL_GB_HORZ_DISC_ADJ 0x028BF4 +#define R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 0x028BF8 +#define S_028BF8_S0_X(x) (((x) & 0x0F) << 0) +#define G_028BF8_S0_X(x) (((x) >> 0) & 0x0F) +#define C_028BF8_S0_X 0xFFFFFFF0 +#define S_028BF8_S0_Y(x) (((x) & 0x0F) << 4) +#define G_028BF8_S0_Y(x) (((x) >> 4) & 0x0F) +#define C_028BF8_S0_Y 0xFFFFFF0F +#define S_028BF8_S1_X(x) (((x) & 0x0F) << 8) +#define G_028BF8_S1_X(x) (((x) >> 8) & 0x0F) +#define C_028BF8_S1_X 0xFFFFF0FF +#define S_028BF8_S1_Y(x) (((x) & 0x0F) << 12) 
+#define G_028BF8_S1_Y(x) (((x) >> 12) & 0x0F) +#define C_028BF8_S1_Y 0xFFFF0FFF +#define S_028BF8_S2_X(x) (((x) & 0x0F) << 16) +#define G_028BF8_S2_X(x) (((x) >> 16) & 0x0F) +#define C_028BF8_S2_X 0xFFF0FFFF +#define S_028BF8_S2_Y(x) (((x) & 0x0F) << 20) +#define G_028BF8_S2_Y(x) (((x) >> 20) & 0x0F) +#define C_028BF8_S2_Y 0xFF0FFFFF +#define S_028BF8_S3_X(x) (((x) & 0x0F) << 24) +#define G_028BF8_S3_X(x) (((x) >> 24) & 0x0F) +#define C_028BF8_S3_X 0xF0FFFFFF +#define S_028BF8_S3_Y(x) (((x) & 0x0F) << 28) +#define G_028BF8_S3_Y(x) (((x) >> 28) & 0x0F) +#define C_028BF8_S3_Y 0x0FFFFFFF +#define R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 0x028BFC +#define S_028BFC_S4_X(x) (((x) & 0x0F) << 0) +#define G_028BFC_S4_X(x) (((x) >> 0) & 0x0F) +#define C_028BFC_S4_X 0xFFFFFFF0 +#define S_028BFC_S4_Y(x) (((x) & 0x0F) << 4) +#define G_028BFC_S4_Y(x) (((x) >> 4) & 0x0F) +#define C_028BFC_S4_Y 0xFFFFFF0F +#define S_028BFC_S5_X(x) (((x) & 0x0F) << 8) +#define G_028BFC_S5_X(x) (((x) >> 8) & 0x0F) +#define C_028BFC_S5_X 0xFFFFF0FF +#define S_028BFC_S5_Y(x) (((x) & 0x0F) << 12) +#define G_028BFC_S5_Y(x) (((x) >> 12) & 0x0F) +#define C_028BFC_S5_Y 0xFFFF0FFF +#define S_028BFC_S6_X(x) (((x) & 0x0F) << 16) +#define G_028BFC_S6_X(x) (((x) >> 16) & 0x0F) +#define C_028BFC_S6_X 0xFFF0FFFF +#define S_028BFC_S6_Y(x) (((x) & 0x0F) << 20) +#define G_028BFC_S6_Y(x) (((x) >> 20) & 0x0F) +#define C_028BFC_S6_Y 0xFF0FFFFF +#define S_028BFC_S7_X(x) (((x) & 0x0F) << 24) +#define G_028BFC_S7_X(x) (((x) >> 24) & 0x0F) +#define C_028BFC_S7_X 0xF0FFFFFF +#define S_028BFC_S7_Y(x) (((x) & 0x0F) << 28) +#define G_028BFC_S7_Y(x) (((x) >> 28) & 0x0F) +#define C_028BFC_S7_Y 0x0FFFFFFF +#define R_028C00_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 0x028C00 +#define S_028C00_S8_X(x) (((x) & 0x0F) << 0) +#define G_028C00_S8_X(x) (((x) >> 0) & 0x0F) +#define C_028C00_S8_X 0xFFFFFFF0 +#define S_028C00_S8_Y(x) (((x) & 0x0F) << 4) +#define G_028C00_S8_Y(x) (((x) >> 4) & 0x0F) +#define C_028C00_S8_Y 0xFFFFFF0F +#define 
S_028C00_S9_X(x) (((x) & 0x0F) << 8) +#define G_028C00_S9_X(x) (((x) >> 8) & 0x0F) +#define C_028C00_S9_X 0xFFFFF0FF +#define S_028C00_S9_Y(x) (((x) & 0x0F) << 12) +#define G_028C00_S9_Y(x) (((x) >> 12) & 0x0F) +#define C_028C00_S9_Y 0xFFFF0FFF +#define S_028C00_S10_X(x) (((x) & 0x0F) << 16) +#define G_028C00_S10_X(x) (((x) >> 16) & 0x0F) +#define C_028C00_S10_X 0xFFF0FFFF +#define S_028C00_S10_Y(x) (((x) & 0x0F) << 20) +#define G_028C00_S10_Y(x) (((x) >> 20) & 0x0F) +#define C_028C00_S10_Y 0xFF0FFFFF +#define S_028C00_S11_X(x) (((x) & 0x0F) << 24) +#define G_028C00_S11_X(x) (((x) >> 24) & 0x0F) +#define C_028C00_S11_X 0xF0FFFFFF +#define S_028C00_S11_Y(x) (((x) & 0x0F) << 28) +#define G_028C00_S11_Y(x) (((x) >> 28) & 0x0F) +#define C_028C00_S11_Y 0x0FFFFFFF +#define R_028C04_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 0x028C04 +#define S_028C04_S12_X(x) (((x) & 0x0F) << 0) +#define G_028C04_S12_X(x) (((x) >> 0) & 0x0F) +#define C_028C04_S12_X 0xFFFFFFF0 +#define S_028C04_S12_Y(x) (((x) & 0x0F) << 4) +#define G_028C04_S12_Y(x) (((x) >> 4) & 0x0F) +#define C_028C04_S12_Y 0xFFFFFF0F +#define S_028C04_S13_X(x) (((x) & 0x0F) << 8) +#define G_028C04_S13_X(x) (((x) >> 8) & 0x0F) +#define C_028C04_S13_X 0xFFFFF0FF +#define S_028C04_S13_Y(x) (((x) & 0x0F) << 12) +#define G_028C04_S13_Y(x) (((x) >> 12) & 0x0F) +#define C_028C04_S13_Y 0xFFFF0FFF +#define S_028C04_S14_X(x) (((x) & 0x0F) << 16) +#define G_028C04_S14_X(x) (((x) >> 16) & 0x0F) +#define C_028C04_S14_X 0xFFF0FFFF +#define S_028C04_S14_Y(x) (((x) & 0x0F) << 20) +#define G_028C04_S14_Y(x) (((x) >> 20) & 0x0F) +#define C_028C04_S14_Y 0xFF0FFFFF +#define S_028C04_S15_X(x) (((x) & 0x0F) << 24) +#define G_028C04_S15_X(x) (((x) >> 24) & 0x0F) +#define C_028C04_S15_X 0xF0FFFFFF +#define S_028C04_S15_Y(x) (((x) & 0x0F) << 28) +#define G_028C04_S15_Y(x) (((x) >> 28) & 0x0F) +#define C_028C04_S15_Y 0x0FFFFFFF +#define R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 0x028C08 +#define S_028C08_S0_X(x) (((x) & 0x0F) << 0) +#define 
G_028C08_S0_X(x) (((x) >> 0) & 0x0F) +#define C_028C08_S0_X 0xFFFFFFF0 +#define S_028C08_S0_Y(x) (((x) & 0x0F) << 4) +#define G_028C08_S0_Y(x) (((x) >> 4) & 0x0F) +#define C_028C08_S0_Y 0xFFFFFF0F +#define S_028C08_S1_X(x) (((x) & 0x0F) << 8) +#define G_028C08_S1_X(x) (((x) >> 8) & 0x0F) +#define C_028C08_S1_X 0xFFFFF0FF +#define S_028C08_S1_Y(x) (((x) & 0x0F) << 12) +#define G_028C08_S1_Y(x) (((x) >> 12) & 0x0F) +#define C_028C08_S1_Y 0xFFFF0FFF +#define S_028C08_S2_X(x) (((x) & 0x0F) << 16) +#define G_028C08_S2_X(x) (((x) >> 16) & 0x0F) +#define C_028C08_S2_X 0xFFF0FFFF +#define S_028C08_S2_Y(x) (((x) & 0x0F) << 20) +#define G_028C08_S2_Y(x) (((x) >> 20) & 0x0F) +#define C_028C08_S2_Y 0xFF0FFFFF +#define S_028C08_S3_X(x) (((x) & 0x0F) << 24) +#define G_028C08_S3_X(x) (((x) >> 24) & 0x0F) +#define C_028C08_S3_X 0xF0FFFFFF +#define S_028C08_S3_Y(x) (((x) & 0x0F) << 28) +#define G_028C08_S3_Y(x) (((x) >> 28) & 0x0F) +#define C_028C08_S3_Y 0x0FFFFFFF +#define R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 0x028C0C +#define S_028C0C_S4_X(x) (((x) & 0x0F) << 0) +#define G_028C0C_S4_X(x) (((x) >> 0) & 0x0F) +#define C_028C0C_S4_X 0xFFFFFFF0 +#define S_028C0C_S4_Y(x) (((x) & 0x0F) << 4) +#define G_028C0C_S4_Y(x) (((x) >> 4) & 0x0F) +#define C_028C0C_S4_Y 0xFFFFFF0F +#define S_028C0C_S5_X(x) (((x) & 0x0F) << 8) +#define G_028C0C_S5_X(x) (((x) >> 8) & 0x0F) +#define C_028C0C_S5_X 0xFFFFF0FF +#define S_028C0C_S5_Y(x) (((x) & 0x0F) << 12) +#define G_028C0C_S5_Y(x) (((x) >> 12) & 0x0F) +#define C_028C0C_S5_Y 0xFFFF0FFF +#define S_028C0C_S6_X(x) (((x) & 0x0F) << 16) +#define G_028C0C_S6_X(x) (((x) >> 16) & 0x0F) +#define C_028C0C_S6_X 0xFFF0FFFF +#define S_028C0C_S6_Y(x) (((x) & 0x0F) << 20) +#define G_028C0C_S6_Y(x) (((x) >> 20) & 0x0F) +#define C_028C0C_S6_Y 0xFF0FFFFF +#define S_028C0C_S7_X(x) (((x) & 0x0F) << 24) +#define G_028C0C_S7_X(x) (((x) >> 24) & 0x0F) +#define C_028C0C_S7_X 0xF0FFFFFF +#define S_028C0C_S7_Y(x) (((x) & 0x0F) << 28) +#define G_028C0C_S7_Y(x) (((x) >> 28) 
& 0x0F) +#define C_028C0C_S7_Y 0x0FFFFFFF +#define R_028C10_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 0x028C10 +#define S_028C10_S8_X(x) (((x) & 0x0F) << 0) +#define G_028C10_S8_X(x) (((x) >> 0) & 0x0F) +#define C_028C10_S8_X 0xFFFFFFF0 +#define S_028C10_S8_Y(x) (((x) & 0x0F) << 4) +#define G_028C10_S8_Y(x) (((x) >> 4) & 0x0F) +#define C_028C10_S8_Y 0xFFFFFF0F +#define S_028C10_S9_X(x) (((x) & 0x0F) << 8) +#define G_028C10_S9_X(x) (((x) >> 8) & 0x0F) +#define C_028C10_S9_X 0xFFFFF0FF +#define S_028C10_S9_Y(x) (((x) & 0x0F) << 12) +#define G_028C10_S9_Y(x) (((x) >> 12) & 0x0F) +#define C_028C10_S9_Y 0xFFFF0FFF +#define S_028C10_S10_X(x) (((x) & 0x0F) << 16) +#define G_028C10_S10_X(x) (((x) >> 16) & 0x0F) +#define C_028C10_S10_X 0xFFF0FFFF +#define S_028C10_S10_Y(x) (((x) & 0x0F) << 20) +#define G_028C10_S10_Y(x) (((x) >> 20) & 0x0F) +#define C_028C10_S10_Y 0xFF0FFFFF +#define S_028C10_S11_X(x) (((x) & 0x0F) << 24) +#define G_028C10_S11_X(x) (((x) >> 24) & 0x0F) +#define C_028C10_S11_X 0xF0FFFFFF +#define S_028C10_S11_Y(x) (((x) & 0x0F) << 28) +#define G_028C10_S11_Y(x) (((x) >> 28) & 0x0F) +#define C_028C10_S11_Y 0x0FFFFFFF +#define R_028C14_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 0x028C14 +#define S_028C14_S12_X(x) (((x) & 0x0F) << 0) +#define G_028C14_S12_X(x) (((x) >> 0) & 0x0F) +#define C_028C14_S12_X 0xFFFFFFF0 +#define S_028C14_S12_Y(x) (((x) & 0x0F) << 4) +#define G_028C14_S12_Y(x) (((x) >> 4) & 0x0F) +#define C_028C14_S12_Y 0xFFFFFF0F +#define S_028C14_S13_X(x) (((x) & 0x0F) << 8) +#define G_028C14_S13_X(x) (((x) >> 8) & 0x0F) +#define C_028C14_S13_X 0xFFFFF0FF +#define S_028C14_S13_Y(x) (((x) & 0x0F) << 12) +#define G_028C14_S13_Y(x) (((x) >> 12) & 0x0F) +#define C_028C14_S13_Y 0xFFFF0FFF +#define S_028C14_S14_X(x) (((x) & 0x0F) << 16) +#define G_028C14_S14_X(x) (((x) >> 16) & 0x0F) +#define C_028C14_S14_X 0xFFF0FFFF +#define S_028C14_S14_Y(x) (((x) & 0x0F) << 20) +#define G_028C14_S14_Y(x) (((x) >> 20) & 0x0F) +#define C_028C14_S14_Y 0xFF0FFFFF +#define 
S_028C14_S15_X(x) (((x) & 0x0F) << 24) +#define G_028C14_S15_X(x) (((x) >> 24) & 0x0F) +#define C_028C14_S15_X 0xF0FFFFFF +#define S_028C14_S15_Y(x) (((x) & 0x0F) << 28) +#define G_028C14_S15_Y(x) (((x) >> 28) & 0x0F) +#define C_028C14_S15_Y 0x0FFFFFFF +#define R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 0x028C18 +#define S_028C18_S0_X(x) (((x) & 0x0F) << 0) +#define G_028C18_S0_X(x) (((x) >> 0) & 0x0F) +#define C_028C18_S0_X 0xFFFFFFF0 +#define S_028C18_S0_Y(x) (((x) & 0x0F) << 4) +#define G_028C18_S0_Y(x) (((x) >> 4) & 0x0F) +#define C_028C18_S0_Y 0xFFFFFF0F +#define S_028C18_S1_X(x) (((x) & 0x0F) << 8) +#define G_028C18_S1_X(x) (((x) >> 8) & 0x0F) +#define C_028C18_S1_X 0xFFFFF0FF +#define S_028C18_S1_Y(x) (((x) & 0x0F) << 12) +#define G_028C18_S1_Y(x) (((x) >> 12) & 0x0F) +#define C_028C18_S1_Y 0xFFFF0FFF +#define S_028C18_S2_X(x) (((x) & 0x0F) << 16) +#define G_028C18_S2_X(x) (((x) >> 16) & 0x0F) +#define C_028C18_S2_X 0xFFF0FFFF +#define S_028C18_S2_Y(x) (((x) & 0x0F) << 20) +#define G_028C18_S2_Y(x) (((x) >> 20) & 0x0F) +#define C_028C18_S2_Y 0xFF0FFFFF +#define S_028C18_S3_X(x) (((x) & 0x0F) << 24) +#define G_028C18_S3_X(x) (((x) >> 24) & 0x0F) +#define C_028C18_S3_X 0xF0FFFFFF +#define S_028C18_S3_Y(x) (((x) & 0x0F) << 28) +#define G_028C18_S3_Y(x) (((x) >> 28) & 0x0F) +#define C_028C18_S3_Y 0x0FFFFFFF +#define R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 0x028C1C +#define S_028C1C_S4_X(x) (((x) & 0x0F) << 0) +#define G_028C1C_S4_X(x) (((x) >> 0) & 0x0F) +#define C_028C1C_S4_X 0xFFFFFFF0 +#define S_028C1C_S4_Y(x) (((x) & 0x0F) << 4) +#define G_028C1C_S4_Y(x) (((x) >> 4) & 0x0F) +#define C_028C1C_S4_Y 0xFFFFFF0F +#define S_028C1C_S5_X(x) (((x) & 0x0F) << 8) +#define G_028C1C_S5_X(x) (((x) >> 8) & 0x0F) +#define C_028C1C_S5_X 0xFFFFF0FF +#define S_028C1C_S5_Y(x) (((x) & 0x0F) << 12) +#define G_028C1C_S5_Y(x) (((x) >> 12) & 0x0F) +#define C_028C1C_S5_Y 0xFFFF0FFF +#define S_028C1C_S6_X(x) (((x) & 0x0F) << 16) +#define G_028C1C_S6_X(x) (((x) >> 16) & 0x0F) 
+#define C_028C1C_S6_X 0xFFF0FFFF +#define S_028C1C_S6_Y(x) (((x) & 0x0F) << 20) +#define G_028C1C_S6_Y(x) (((x) >> 20) & 0x0F) +#define C_028C1C_S6_Y 0xFF0FFFFF +#define S_028C1C_S7_X(x) (((x) & 0x0F) << 24) +#define G_028C1C_S7_X(x) (((x) >> 24) & 0x0F) +#define C_028C1C_S7_X 0xF0FFFFFF +#define S_028C1C_S7_Y(x) (((x) & 0x0F) << 28) +#define G_028C1C_S7_Y(x) (((x) >> 28) & 0x0F) +#define C_028C1C_S7_Y 0x0FFFFFFF +#define R_028C20_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 0x028C20 +#define S_028C20_S8_X(x) (((x) & 0x0F) << 0) +#define G_028C20_S8_X(x) (((x) >> 0) & 0x0F) +#define C_028C20_S8_X 0xFFFFFFF0 +#define S_028C20_S8_Y(x) (((x) & 0x0F) << 4) +#define G_028C20_S8_Y(x) (((x) >> 4) & 0x0F) +#define C_028C20_S8_Y 0xFFFFFF0F +#define S_028C20_S9_X(x) (((x) & 0x0F) << 8) +#define G_028C20_S9_X(x) (((x) >> 8) & 0x0F) +#define C_028C20_S9_X 0xFFFFF0FF +#define S_028C20_S9_Y(x) (((x) & 0x0F) << 12) +#define G_028C20_S9_Y(x) (((x) >> 12) & 0x0F) +#define C_028C20_S9_Y 0xFFFF0FFF +#define S_028C20_S10_X(x) (((x) & 0x0F) << 16) +#define G_028C20_S10_X(x) (((x) >> 16) & 0x0F) +#define C_028C20_S10_X 0xFFF0FFFF +#define S_028C20_S10_Y(x) (((x) & 0x0F) << 20) +#define G_028C20_S10_Y(x) (((x) >> 20) & 0x0F) +#define C_028C20_S10_Y 0xFF0FFFFF +#define S_028C20_S11_X(x) (((x) & 0x0F) << 24) +#define G_028C20_S11_X(x) (((x) >> 24) & 0x0F) +#define C_028C20_S11_X 0xF0FFFFFF +#define S_028C20_S11_Y(x) (((x) & 0x0F) << 28) +#define G_028C20_S11_Y(x) (((x) >> 28) & 0x0F) +#define C_028C20_S11_Y 0x0FFFFFFF +#define R_028C24_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 0x028C24 +#define S_028C24_S12_X(x) (((x) & 0x0F) << 0) +#define G_028C24_S12_X(x) (((x) >> 0) & 0x0F) +#define C_028C24_S12_X 0xFFFFFFF0 +#define S_028C24_S12_Y(x) (((x) & 0x0F) << 4) +#define G_028C24_S12_Y(x) (((x) >> 4) & 0x0F) +#define C_028C24_S12_Y 0xFFFFFF0F +#define S_028C24_S13_X(x) (((x) & 0x0F) << 8) +#define G_028C24_S13_X(x) (((x) >> 8) & 0x0F) +#define C_028C24_S13_X 0xFFFFF0FF +#define S_028C24_S13_Y(x) (((x) & 0x0F) 
<< 12) +#define G_028C24_S13_Y(x) (((x) >> 12) & 0x0F) +#define C_028C24_S13_Y 0xFFFF0FFF +#define S_028C24_S14_X(x) (((x) & 0x0F) << 16) +#define G_028C24_S14_X(x) (((x) >> 16) & 0x0F) +#define C_028C24_S14_X 0xFFF0FFFF +#define S_028C24_S14_Y(x) (((x) & 0x0F) << 20) +#define G_028C24_S14_Y(x) (((x) >> 20) & 0x0F) +#define C_028C24_S14_Y 0xFF0FFFFF +#define S_028C24_S15_X(x) (((x) & 0x0F) << 24) +#define G_028C24_S15_X(x) (((x) >> 24) & 0x0F) +#define C_028C24_S15_X 0xF0FFFFFF +#define S_028C24_S15_Y(x) (((x) & 0x0F) << 28) +#define G_028C24_S15_Y(x) (((x) >> 28) & 0x0F) +#define C_028C24_S15_Y 0x0FFFFFFF +#define R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 0x028C28 +#define S_028C28_S0_X(x) (((x) & 0x0F) << 0) +#define G_028C28_S0_X(x) (((x) >> 0) & 0x0F) +#define C_028C28_S0_X 0xFFFFFFF0 +#define S_028C28_S0_Y(x) (((x) & 0x0F) << 4) +#define G_028C28_S0_Y(x) (((x) >> 4) & 0x0F) +#define C_028C28_S0_Y 0xFFFFFF0F +#define S_028C28_S1_X(x) (((x) & 0x0F) << 8) +#define G_028C28_S1_X(x) (((x) >> 8) & 0x0F) +#define C_028C28_S1_X 0xFFFFF0FF +#define S_028C28_S1_Y(x) (((x) & 0x0F) << 12) +#define G_028C28_S1_Y(x) (((x) >> 12) & 0x0F) +#define C_028C28_S1_Y 0xFFFF0FFF +#define S_028C28_S2_X(x) (((x) & 0x0F) << 16) +#define G_028C28_S2_X(x) (((x) >> 16) & 0x0F) +#define C_028C28_S2_X 0xFFF0FFFF +#define S_028C28_S2_Y(x) (((x) & 0x0F) << 20) +#define G_028C28_S2_Y(x) (((x) >> 20) & 0x0F) +#define C_028C28_S2_Y 0xFF0FFFFF +#define S_028C28_S3_X(x) (((x) & 0x0F) << 24) +#define G_028C28_S3_X(x) (((x) >> 24) & 0x0F) +#define C_028C28_S3_X 0xF0FFFFFF +#define S_028C28_S3_Y(x) (((x) & 0x0F) << 28) +#define G_028C28_S3_Y(x) (((x) >> 28) & 0x0F) +#define C_028C28_S3_Y 0x0FFFFFFF +#define R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 0x028C2C +#define S_028C2C_S4_X(x) (((x) & 0x0F) << 0) +#define G_028C2C_S4_X(x) (((x) >> 0) & 0x0F) +#define C_028C2C_S4_X 0xFFFFFFF0 +#define S_028C2C_S4_Y(x) (((x) & 0x0F) << 4) +#define G_028C2C_S4_Y(x) (((x) >> 4) & 0x0F) +#define C_028C2C_S4_Y 
0xFFFFFF0F +#define S_028C2C_S5_X(x) (((x) & 0x0F) << 8) +#define G_028C2C_S5_X(x) (((x) >> 8) & 0x0F) +#define C_028C2C_S5_X 0xFFFFF0FF +#define S_028C2C_S5_Y(x) (((x) & 0x0F) << 12) +#define G_028C2C_S5_Y(x) (((x) >> 12) & 0x0F) +#define C_028C2C_S5_Y 0xFFFF0FFF +#define S_028C2C_S6_X(x) (((x) & 0x0F) << 16) +#define G_028C2C_S6_X(x) (((x) >> 16) & 0x0F) +#define C_028C2C_S6_X 0xFFF0FFFF +#define S_028C2C_S6_Y(x) (((x) & 0x0F) << 20) +#define G_028C2C_S6_Y(x) (((x) >> 20) & 0x0F) +#define C_028C2C_S6_Y 0xFF0FFFFF +#define S_028C2C_S7_X(x) (((x) & 0x0F) << 24) +#define G_028C2C_S7_X(x) (((x) >> 24) & 0x0F) +#define C_028C2C_S7_X 0xF0FFFFFF +#define S_028C2C_S7_Y(x) (((x) & 0x0F) << 28) +#define G_028C2C_S7_Y(x) (((x) >> 28) & 0x0F) +#define C_028C2C_S7_Y 0x0FFFFFFF +#define R_028C30_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 0x028C30 +#define S_028C30_S8_X(x) (((x) & 0x0F) << 0) +#define G_028C30_S8_X(x) (((x) >> 0) & 0x0F) +#define C_028C30_S8_X 0xFFFFFFF0 +#define S_028C30_S8_Y(x) (((x) & 0x0F) << 4) +#define G_028C30_S8_Y(x) (((x) >> 4) & 0x0F) +#define C_028C30_S8_Y 0xFFFFFF0F +#define S_028C30_S9_X(x) (((x) & 0x0F) << 8) +#define G_028C30_S9_X(x) (((x) >> 8) & 0x0F) +#define C_028C30_S9_X 0xFFFFF0FF +#define S_028C30_S9_Y(x) (((x) & 0x0F) << 12) +#define G_028C30_S9_Y(x) (((x) >> 12) & 0x0F) +#define C_028C30_S9_Y 0xFFFF0FFF +#define S_028C30_S10_X(x) (((x) & 0x0F) << 16) +#define G_028C30_S10_X(x) (((x) >> 16) & 0x0F) +#define C_028C30_S10_X 0xFFF0FFFF +#define S_028C30_S10_Y(x) (((x) & 0x0F) << 20) +#define G_028C30_S10_Y(x) (((x) >> 20) & 0x0F) +#define C_028C30_S10_Y 0xFF0FFFFF +#define S_028C30_S11_X(x) (((x) & 0x0F) << 24) +#define G_028C30_S11_X(x) (((x) >> 24) & 0x0F) +#define C_028C30_S11_X 0xF0FFFFFF +#define S_028C30_S11_Y(x) (((x) & 0x0F) << 28) +#define G_028C30_S11_Y(x) (((x) >> 28) & 0x0F) +#define C_028C30_S11_Y 0x0FFFFFFF +#define R_028C34_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 0x028C34 +#define S_028C34_S12_X(x) (((x) & 0x0F) << 0) +#define 
G_028C34_S12_X(x) (((x) >> 0) & 0x0F) +#define C_028C34_S12_X 0xFFFFFFF0 +#define S_028C34_S12_Y(x) (((x) & 0x0F) << 4) +#define G_028C34_S12_Y(x) (((x) >> 4) & 0x0F) +#define C_028C34_S12_Y 0xFFFFFF0F +#define S_028C34_S13_X(x) (((x) & 0x0F) << 8) +#define G_028C34_S13_X(x) (((x) >> 8) & 0x0F) +#define C_028C34_S13_X 0xFFFFF0FF +#define S_028C34_S13_Y(x) (((x) & 0x0F) << 12) +#define G_028C34_S13_Y(x) (((x) >> 12) & 0x0F) +#define C_028C34_S13_Y 0xFFFF0FFF +#define S_028C34_S14_X(x) (((x) & 0x0F) << 16) +#define G_028C34_S14_X(x) (((x) >> 16) & 0x0F) +#define C_028C34_S14_X 0xFFF0FFFF +#define S_028C34_S14_Y(x) (((x) & 0x0F) << 20) +#define G_028C34_S14_Y(x) (((x) >> 20) & 0x0F) +#define C_028C34_S14_Y 0xFF0FFFFF +#define S_028C34_S15_X(x) (((x) & 0x0F) << 24) +#define G_028C34_S15_X(x) (((x) >> 24) & 0x0F) +#define C_028C34_S15_X 0xF0FFFFFF +#define S_028C34_S15_Y(x) (((x) & 0x0F) << 28) +#define G_028C34_S15_Y(x) (((x) >> 28) & 0x0F) +#define C_028C34_S15_Y 0x0FFFFFFF +#define R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0 0x028C38 +#define S_028C38_AA_MASK_X0Y0(x) (((x) & 0xFFFF) << 0) +#define G_028C38_AA_MASK_X0Y0(x) (((x) >> 0) & 0xFFFF) +#define C_028C38_AA_MASK_X0Y0 0xFFFF0000 +#define S_028C38_AA_MASK_X1Y0(x) (((x) & 0xFFFF) << 16) +#define G_028C38_AA_MASK_X1Y0(x) (((x) >> 16) & 0xFFFF) +#define C_028C38_AA_MASK_X1Y0 0x0000FFFF +#define R_028C3C_PA_SC_AA_MASK_X0Y1_X1Y1 0x028C3C +#define S_028C3C_AA_MASK_X0Y1(x) (((x) & 0xFFFF) << 0) +#define G_028C3C_AA_MASK_X0Y1(x) (((x) >> 0) & 0xFFFF) +#define C_028C3C_AA_MASK_X0Y1 0xFFFF0000 +#define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16) +#define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF) +#define C_028C3C_AA_MASK_X1Y1 0x0000FFFF +#define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58 +#define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0) +#define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF) +#define C_028C58_VTX_REUSE_DEPTH 0xFFFFFF00 +#define R_028C5C_VGT_OUT_DEALLOC_CNTL 0x028C5C +#define 
S_028C5C_DEALLOC_DIST(x) (((x) & 0x7F) << 0) +#define G_028C5C_DEALLOC_DIST(x) (((x) >> 0) & 0x7F) +#define C_028C5C_DEALLOC_DIST 0xFFFFFF80 +#define R_028C60_CB_COLOR0_BASE 0x028C60 +#define R_028C64_CB_COLOR0_PITCH 0x028C64 +#define S_028C64_TILE_MAX(x) (((x) & 0x7FF) << 0) +#define G_028C64_TILE_MAX(x) (((x) >> 0) & 0x7FF) +#define C_028C64_TILE_MAX 0xFFFFF800 +#define R_028C68_CB_COLOR0_SLICE 0x028C68 +#define S_028C68_TILE_MAX(x) (((x) & 0x3FFFFF) << 0) +#define G_028C68_TILE_MAX(x) (((x) >> 0) & 0x3FFFFF) +#define C_028C68_TILE_MAX 0xFFC00000 +#define R_028C6C_CB_COLOR0_VIEW 0x028C6C +#define S_028C6C_SLICE_START(x) (((x) & 0x7FF) << 0) +#define G_028C6C_SLICE_START(x) (((x) >> 0) & 0x7FF) +#define C_028C6C_SLICE_START 0xFFFFF800 +#define S_028C6C_SLICE_MAX(x) (((x) & 0x7FF) << 13) +#define G_028C6C_SLICE_MAX(x) (((x) >> 13) & 0x7FF) +#define C_028C6C_SLICE_MAX 0xFF001FFF +#define R_028C70_CB_COLOR0_INFO 0x028C70 +#define S_028C70_ENDIAN(x) (((x) & 0x03) << 0) +#define G_028C70_ENDIAN(x) (((x) >> 0) & 0x03) +#define C_028C70_ENDIAN 0xFFFFFFFC +#define V_028C70_ENDIAN_NONE 0x00 +#define V_028C70_ENDIAN_8IN16 0x01 +#define V_028C70_ENDIAN_8IN32 0x02 +#define V_028C70_ENDIAN_8IN64 0x03 +#define S_028C70_FORMAT(x) (((x) & 0x1F) << 2) +#define G_028C70_FORMAT(x) (((x) >> 2) & 0x1F) +#define C_028C70_FORMAT 0xFFFFFF83 +#define V_028C70_COLOR_INVALID 0x00 +#define V_028C70_COLOR_8 0x01 +#define V_028C70_COLOR_16 0x02 +#define V_028C70_COLOR_8_8 0x03 +#define V_028C70_COLOR_32 0x04 +#define V_028C70_COLOR_16_16 0x05 +#define V_028C70_COLOR_10_11_11 0x06 +#define V_028C70_COLOR_11_11_10 0x07 +#define V_028C70_COLOR_10_10_10_2 0x08 +#define V_028C70_COLOR_2_10_10_10 0x09 +#define V_028C70_COLOR_8_8_8_8 0x0A +#define V_028C70_COLOR_32_32 0x0B +#define V_028C70_COLOR_16_16_16_16 0x0C +#define V_028C70_COLOR_32_32_32_32 0x0E +#define V_028C70_COLOR_5_6_5 0x10 +#define V_028C70_COLOR_1_5_5_5 0x11 +#define V_028C70_COLOR_5_5_5_1 0x12 +#define V_028C70_COLOR_4_4_4_4 0x13 
+/*
+ * CB_COLOR0_INFO (register 0x028C70), continued: last FORMAT values and the
+ * remaining bit-fields, followed by CB_COLOR0_ATTRIB (0x028C74) and the
+ * CB_COLOR0_CMASK offset.
+ *
+ * Macro families as used below:
+ *   S_<reg>_<FIELD>(x)  mask x to the field width and shift it into place
+ *   G_<reg>_<FIELD>(x)  shift a register value down and mask out the field
+ *   C_<reg>_<FIELD>     AND-mask with the field's bits cleared
+ *   V_<reg>_<NAME>      named constant; presumably a value for the field
+ *                       defined just before it (TODO confirm against the
+ *                       hardware register reference)
+ */
+#define V_028C70_COLOR_8_24 0x14
+#define V_028C70_COLOR_24_8 0x15
+#define V_028C70_COLOR_X24_8_32_FLOAT 0x16
+/* LINEAR_GENERAL: 1 bit at bit 7 */
+#define S_028C70_LINEAR_GENERAL(x) (((x) & 0x1) << 7)
+#define G_028C70_LINEAR_GENERAL(x) (((x) >> 7) & 0x1)
+#define C_028C70_LINEAR_GENERAL 0xFFFFFF7F
+/* NUMBER_TYPE: 3 bits at bits 8..10 (values 0x02 and 0x03 have no name here) */
+#define S_028C70_NUMBER_TYPE(x) (((x) & 0x07) << 8)
+#define G_028C70_NUMBER_TYPE(x) (((x) >> 8) & 0x07)
+#define C_028C70_NUMBER_TYPE 0xFFFFF8FF
+#define V_028C70_NUMBER_UNORM 0x00
+#define V_028C70_NUMBER_SNORM 0x01
+#define V_028C70_NUMBER_UINT 0x04
+#define V_028C70_NUMBER_SINT 0x05
+#define V_028C70_NUMBER_SRGB 0x06
+#define V_028C70_NUMBER_FLOAT 0x07
+/* COMP_SWAP: 2 bits at bits 11..12 */
+#define S_028C70_COMP_SWAP(x) (((x) & 0x03) << 11)
+#define G_028C70_COMP_SWAP(x) (((x) >> 11) & 0x03)
+#define C_028C70_COMP_SWAP 0xFFFFE7FF
+#define V_028C70_SWAP_STD 0x00
+#define V_028C70_SWAP_ALT 0x01
+#define V_028C70_SWAP_STD_REV 0x02
+#define V_028C70_SWAP_ALT_REV 0x03
+/* Single-bit flags, bits 13..19 */
+#define S_028C70_FAST_CLEAR(x) (((x) & 0x1) << 13)
+#define G_028C70_FAST_CLEAR(x) (((x) >> 13) & 0x1)
+#define C_028C70_FAST_CLEAR 0xFFFFDFFF
+#define S_028C70_COMPRESSION(x) (((x) & 0x1) << 14)
+#define G_028C70_COMPRESSION(x) (((x) >> 14) & 0x1)
+#define C_028C70_COMPRESSION 0xFFFFBFFF
+#define S_028C70_BLEND_CLAMP(x) (((x) & 0x1) << 15)
+#define G_028C70_BLEND_CLAMP(x) (((x) >> 15) & 0x1)
+#define C_028C70_BLEND_CLAMP 0xFFFF7FFF
+#define S_028C70_BLEND_BYPASS(x) (((x) & 0x1) << 16)
+#define G_028C70_BLEND_BYPASS(x) (((x) >> 16) & 0x1)
+#define C_028C70_BLEND_BYPASS 0xFFFEFFFF
+#define S_028C70_SIMPLE_FLOAT(x) (((x) & 0x1) << 17)
+#define G_028C70_SIMPLE_FLOAT(x) (((x) >> 17) & 0x1)
+#define C_028C70_SIMPLE_FLOAT 0xFFFDFFFF
+#define S_028C70_ROUND_MODE(x) (((x) & 0x1) << 18)
+#define G_028C70_ROUND_MODE(x) (((x) >> 18) & 0x1)
+#define C_028C70_ROUND_MODE 0xFFFBFFFF
+#define S_028C70_CMASK_IS_LINEAR(x) (((x) & 0x1) << 19)
+#define G_028C70_CMASK_IS_LINEAR(x) (((x) >> 19) & 0x1)
+#define C_028C70_CMASK_IS_LINEAR 0xFFF7FFFF
+/* BLEND_OPT_DONT_RD_DST: 3 bits at bits 20..22 */
+#define S_028C70_BLEND_OPT_DONT_RD_DST(x) (((x) & 0x07) << 20)
+#define G_028C70_BLEND_OPT_DONT_RD_DST(x) (((x) >> 20) & 0x07)
+#define C_028C70_BLEND_OPT_DONT_RD_DST 0xFF8FFFFF
+#define V_028C70_FORCE_OPT_AUTO 0x00
+#define V_028C70_FORCE_OPT_DISABLE 0x01
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_0 0x02
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_0 0x03
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_0 0x04
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_1 0x05
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_1 0x06
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_1 0x07
+/* BLEND_OPT_DISCARD_PIXEL: 3 bits at bits 23..25 */
+#define S_028C70_BLEND_OPT_DISCARD_PIXEL(x) (((x) & 0x07) << 23)
+#define G_028C70_BLEND_OPT_DISCARD_PIXEL(x) (((x) >> 23) & 0x07)
+#define C_028C70_BLEND_OPT_DISCARD_PIXEL 0xFC7FFFFF
+/*
+ * The eight V_028C70_FORCE_OPT_* definitions below repeat the group above
+ * verbatim. The replacement lists are token-identical, so the preprocessor
+ * accepts the redefinition; they add no new names.
+ */
+#define V_028C70_FORCE_OPT_AUTO 0x00
+#define V_028C70_FORCE_OPT_DISABLE 0x01
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_0 0x02
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_0 0x03
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_0 0x04
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_A_1 0x05
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_RGB_1 0x06
+#define V_028C70_FORCE_OPT_ENABLE_IF_SRC_ARGB_1 0x07
+/* CB_COLOR0_ATTRIB: fields at bits 0..4, 5..9, 12..14, 15..16 and 17 */
+#define R_028C74_CB_COLOR0_ATTRIB 0x028C74
+#define S_028C74_TILE_MODE_INDEX(x) (((x) & 0x1F) << 0)
+#define G_028C74_TILE_MODE_INDEX(x) (((x) >> 0) & 0x1F)
+#define C_028C74_TILE_MODE_INDEX 0xFFFFFFE0
+#define S_028C74_FMASK_TILE_MODE_INDEX(x) (((x) & 0x1F) << 5)
+#define G_028C74_FMASK_TILE_MODE_INDEX(x) (((x) >> 5) & 0x1F)
+#define C_028C74_FMASK_TILE_MODE_INDEX 0xFFFFFC1F
+#define S_028C74_NUM_SAMPLES(x) (((x) & 0x07) << 12)
+#define G_028C74_NUM_SAMPLES(x) (((x) >> 12) & 0x07)
+#define C_028C74_NUM_SAMPLES 0xFFFF8FFF
+#define S_028C74_NUM_FRAGMENTS(x) (((x) & 0x03) << 15)
+#define G_028C74_NUM_FRAGMENTS(x) (((x) >> 15) & 0x03)
+#define C_028C74_NUM_FRAGMENTS 0xFFFE7FFF
+#define S_028C74_FORCE_DST_ALPHA_1(x) (((x) & 0x1) << 17)
+#define G_028C74_FORCE_DST_ALPHA_1(x) (((x) >> 17) & 0x1)
+#define C_028C74_FORCE_DST_ALPHA_1 0xFFFDFFFF
+/* CB_COLOR0_CMASK: offset only, no field macros are defined for it */
+#define R_028C7C_CB_COLOR0_CMASK 0x028C7C
+/*
+ * CB_COLOR0 tail (CMASK_SLICE, FMASK, FMASK_SLICE, CLEAR_WORD0/1) followed by
+ * the register offsets of colour buffers 1..3.
+ *
+ * Every R_<offset>_<NAME> macro encodes its own value in its name, and the
+ * registers of each colour buffer sit 0x3C above the same register of the
+ * previous one (e.g. CB_COLOR1_BASE 0x028C9C, CB_COLOR2_BASE 0x028CD8).
+ * Field macros (S_/G_/C_) are only provided for the CB_COLOR0 registers.
+ */
+#define R_028C80_CB_COLOR0_CMASK_SLICE 0x028C80
+/* TILE_MAX: 14 bits at bits 0..13 */
+#define S_028C80_TILE_MAX(x) (((x) & 0x3FFF) << 0)
+#define G_028C80_TILE_MAX(x) (((x) >> 0) & 0x3FFF)
+#define C_028C80_TILE_MAX 0xFFFFC000
+#define R_028C84_CB_COLOR0_FMASK 0x028C84
+#define R_028C88_CB_COLOR0_FMASK_SLICE 0x028C88
+/* TILE_MAX: 22 bits at bits 0..21 */
+#define S_028C88_TILE_MAX(x) (((x) & 0x3FFFFF) << 0)
+#define G_028C88_TILE_MAX(x) (((x) >> 0) & 0x3FFFFF)
+#define C_028C88_TILE_MAX 0xFFC00000
+#define R_028C8C_CB_COLOR0_CLEAR_WORD0 0x028C8C
+#define R_028C90_CB_COLOR0_CLEAR_WORD1 0x028C90
+/* Colour buffer 1 */
+#define R_028C9C_CB_COLOR1_BASE 0x028C9C
+#define R_028CA0_CB_COLOR1_PITCH 0x028CA0
+#define R_028CA4_CB_COLOR1_SLICE 0x028CA4
+#define R_028CA8_CB_COLOR1_VIEW 0x028CA8
+#define R_028CAC_CB_COLOR1_INFO 0x028CAC
+#define R_028CB0_CB_COLOR1_ATTRIB 0x028CB0
+#define R_028CB8_CB_COLOR1_CMASK 0x028CB8
+/*
+ * Misnamed alias kept for existing users: the register lives at 0x028CB8
+ * (CB_COLOR0_CMASK 0x028C7C + 0x3C), not at 0x028CD4 as the name suggests.
+ * Prefer R_028CB8_CB_COLOR1_CMASK in new code.
+ */
+#define R_028CD4_CB_COLOR1_CMASK R_028CB8_CB_COLOR1_CMASK
+#define R_028CBC_CB_COLOR1_CMASK_SLICE 0x028CBC
+#define R_028CC0_CB_COLOR1_FMASK 0x028CC0
+#define R_028CC4_CB_COLOR1_FMASK_SLICE 0x028CC4
+#define R_028CC8_CB_COLOR1_CLEAR_WORD0 0x028CC8
+#define R_028CCC_CB_COLOR1_CLEAR_WORD1 0x028CCC
+/* Colour buffer 2 */
+#define R_028CD8_CB_COLOR2_BASE 0x028CD8
+#define R_028CDC_CB_COLOR2_PITCH 0x028CDC
+#define R_028CE0_CB_COLOR2_SLICE 0x028CE0
+#define R_028CE4_CB_COLOR2_VIEW 0x028CE4
+#define R_028CE8_CB_COLOR2_INFO 0x028CE8
+#define R_028CEC_CB_COLOR2_ATTRIB 0x028CEC
+#define R_028CF4_CB_COLOR2_CMASK 0x028CF4
+#define R_028CF8_CB_COLOR2_CMASK_SLICE 0x028CF8
+#define R_028CFC_CB_COLOR2_FMASK 0x028CFC
+#define R_028D00_CB_COLOR2_FMASK_SLICE 0x028D00
+#define R_028D04_CB_COLOR2_CLEAR_WORD0 0x028D04
+#define R_028D08_CB_COLOR2_CLEAR_WORD1 0x028D08
+/* Colour buffer 3 (continues after this span) */
+#define R_028D14_CB_COLOR3_BASE 0x028D14
+#define R_028D18_CB_COLOR3_PITCH 0x028D18
+#define R_028D1C_CB_COLOR3_SLICE 0x028D1C
+#define R_028D20_CB_COLOR3_VIEW 0x028D20
+#define R_028D24_CB_COLOR3_INFO 0x028D24
+#define R_028D28_CB_COLOR3_ATTRIB 0x028D28
+#define R_028D30_CB_COLOR3_CMASK 0x028D30
+#define R_028D34_CB_COLOR3_CMASK_SLICE 0x028D34
+#define R_028D38_CB_COLOR3_FMASK 0x028D38
+#define R_028D3C_CB_COLOR3_FMASK_SLICE 0x028D3C +#define R_028D40_CB_COLOR3_CLEAR_WORD0 0x028D40 +#define R_028D44_CB_COLOR3_CLEAR_WORD1 0x028D44 +#define R_028D50_CB_COLOR4_BASE 0x028D50 +#define R_028D54_CB_COLOR4_PITCH 0x028D54 +#define R_028D58_CB_COLOR4_SLICE 0x028D58 +#define R_028D5C_CB_COLOR4_VIEW 0x028D5C +#define R_028D60_CB_COLOR4_INFO 0x028D60 +#define R_028D64_CB_COLOR4_ATTRIB 0x028D64 +#define R_028D6C_CB_COLOR4_CMASK 0x028D6C +#define R_028D70_CB_COLOR4_CMASK_SLICE 0x028D70 +#define R_028D74_CB_COLOR4_FMASK 0x028D74 +#define R_028D78_CB_COLOR4_FMASK_SLICE 0x028D78 +#define R_028D7C_CB_COLOR4_CLEAR_WORD0 0x028D7C +#define R_028D80_CB_COLOR4_CLEAR_WORD1 0x028D80 +#define R_028D8C_CB_COLOR5_BASE 0x028D8C +#define R_028D90_CB_COLOR5_PITCH 0x028D90 +#define R_028D94_CB_COLOR5_SLICE 0x028D94 +#define R_028D98_CB_COLOR5_VIEW 0x028D98 +#define R_028D9C_CB_COLOR5_INFO 0x028D9C +#define R_028DA0_CB_COLOR5_ATTRIB 0x028DA0 +#define R_028DA8_CB_COLOR5_CMASK 0x028DA8 +#define R_028DAC_CB_COLOR5_CMASK_SLICE 0x028DAC +#define R_028DB0_CB_COLOR5_FMASK 0x028DB0 +#define R_028DB4_CB_COLOR5_FMASK_SLICE 0x028DB4 +#define R_028DB8_CB_COLOR5_CLEAR_WORD0 0x028DB8 +#define R_028DBC_CB_COLOR5_CLEAR_WORD1 0x028DBC +#define R_028DC8_CB_COLOR6_BASE 0x028DC8 +#define R_028DCC_CB_COLOR6_PITCH 0x028DCC +#define R_028DD0_CB_COLOR6_SLICE 0x028DD0 +#define R_028DD4_CB_COLOR6_VIEW 0x028DD4 +#define R_028DD8_CB_COLOR6_INFO 0x028DD8 +#define R_028DDC_CB_COLOR6_ATTRIB 0x028DDC +#define R_028DE4_CB_COLOR6_CMASK 0x028DE4 +#define R_028DE8_CB_COLOR6_CMASK_SLICE 0x028DE8 +#define R_028DEC_CB_COLOR6_FMASK 0x028DEC +#define R_028DF0_CB_COLOR6_FMASK_SLICE 0x028DF0 +#define R_028DF4_CB_COLOR6_CLEAR_WORD0 0x028DF4 +#define R_028DF8_CB_COLOR6_CLEAR_WORD1 0x028DF8 +#define R_028E04_CB_COLOR7_BASE 0x028E04 +#define R_028E08_CB_COLOR7_PITCH 0x028E08 +#define R_028E0C_CB_COLOR7_SLICE 0x028E0C +#define R_028E10_CB_COLOR7_VIEW 0x028E10 +#define R_028E14_CB_COLOR7_INFO 0x028E14 +#define 
R_028E18_CB_COLOR7_ATTRIB 0x028E18 +#define R_028E20_CB_COLOR7_CMASK 0x028E20 +#define R_028E24_CB_COLOR7_CMASK_SLICE 0x028E24 +#define R_028E28_CB_COLOR7_FMASK 0x028E28 +#define R_028E2C_CB_COLOR7_FMASK_SLICE 0x028E2C +#define R_028E30_CB_COLOR7_CLEAR_WORD0 0x028E30 +#define R_028E34_CB_COLOR7_CLEAR_WORD1 0x028E34 + +#endif /* _SID_H */ + diff --git a/src/gallium/targets/dri-radeonsi/Makefile b/src/gallium/targets/dri-radeonsi/Makefile new file mode 100644 index 00000000000..f76d71bec98 --- /dev/null +++ b/src/gallium/targets/dri-radeonsi/Makefile @@ -0,0 +1,26 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = radeonsi_dri.so + +PIPE_DRIVERS = \ + $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a \ + $(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \ + $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \ + $(TOP)/src/gallium/drivers/trace/libtrace.a \ + $(TOP)/src/gallium/drivers/rbug/librbug.a \ + $(TOP)/src/gallium/drivers/noop/libnoop.a + +C_SOURCES = \ + target.c \ + $(COMMON_GALLIUM_SOURCES) \ + $(DRIVER_SOURCES) + +DRIVER_DEFINES = \ + -DGALLIUM_RBUG -DGALLIUM_TRACE -DGALLIUM_NOOP + +include ../Makefile.dri + +DRI_LIB_DEPS += -ldrm_radeon + +symlinks: diff --git a/src/gallium/targets/dri-radeonsi/SConscript b/src/gallium/targets/dri-radeonsi/SConscript new file mode 100644 index 00000000000..2b5c151fba6 --- /dev/null +++ b/src/gallium/targets/dri-radeonsi/SConscript @@ -0,0 +1,25 @@ +Import('*') + +env = drienv.Clone() + +env.Append(CPPDEFINES = ['GALLIUM_RBUG', 'GALLIUM_TRACE']) + +env.Prepend(LIBS = [ + st_dri, + radeonwinsys, + radeonsi, + trace, + rbug, + mesa, + glsl, + gallium, + COMMON_DRI_DRM_OBJECTS +]) + +module = env.SharedLibrary( + target ='radeonsi_dri.so', + source = 'target.c', + SHLIBPREFIX = '', +) + +env.Alias('dri-radeonsi', module) diff --git a/src/gallium/targets/dri-radeonsi/target.c b/src/gallium/targets/dri-radeonsi/target.c new file mode 100644 index 00000000000..1350ba2883d --- /dev/null +++ 
b/src/gallium/targets/dri-radeonsi/target.c @@ -0,0 +1,40 @@ +#include "state_tracker/drm_driver.h" +#include "target-helpers/inline_debug_helper.h" +#include "radeon/drm/radeon_drm_public.h" +#include "radeonsi/radeonsi_public.h" + +static struct pipe_screen *create_screen(int fd) +{ + struct radeon_winsys *radeon; + struct pipe_screen *screen; + + radeon = radeon_drm_winsys_create(fd); + if (!radeon) + return NULL; + + screen = radeonsi_screen_create(radeon); + if (!screen) + return NULL; + + screen = debug_screen_wrap(screen); + + return screen; +} + +static const struct drm_conf_ret throttle_ret = { + .type = DRM_CONF_INT, + .val.val_int = 2, +}; + +static const struct drm_conf_ret *drm_configuration(enum drm_conf conf) +{ + switch (conf) { + case DRM_CONF_THROTTLE: + return &throttle_ret; + default: + break; + } + return NULL; +} + +DRM_DRIVER_DESCRIPTOR("radeonsi", "radeon", create_screen, drm_configuration) diff --git a/src/gallium/targets/egl-static/Android.mk b/src/gallium/targets/egl-static/Android.mk index 21b6dc27921..99c08120d4b 100644 --- a/src/gallium/targets/egl-static/Android.mk +++ b/src/gallium/targets/egl-static/Android.mk @@ -65,6 +65,9 @@ endif ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),) LOCAL_CFLAGS += -D_EGL_PIPE_R600=1 endif +ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),) +LOCAL_CFLAGS += -D_EGL_PIPE_RADEONSI=1 +endif ifneq ($(filter vmwgfx, $(MESA_GPU_DRIVERS)),) LOCAL_CFLAGS += -D_EGL_PIPE_VMWGFX=1 endif diff --git a/src/gallium/targets/egl-static/Makefile b/src/gallium/targets/egl-static/Makefile index 02a55eef160..2c6656bce5e 100644 --- a/src/gallium/targets/egl-static/Makefile +++ b/src/gallium/targets/egl-static/Makefile @@ -130,6 +130,17 @@ egl_SYS += -ldrm_radeon endif endif +# radeonsi +ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),) +ifneq ($(findstring radeonsi,$(GALLIUM_DRIVERS_DIRS)),) +egl_CPPFLAGS += -D_EGL_PIPE_RADEONSI=1 +egl_LIBS += \ + $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \ + 
$(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a +egl_SYS += -ldrm_radeon +endif +endif + # vmwgfx ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),) egl_CPPFLAGS += -D_EGL_PIPE_VMWGFX=1 diff --git a/src/gallium/targets/egl-static/SConscript b/src/gallium/targets/egl-static/SConscript index e657e9f2ff5..d831b110764 100644 --- a/src/gallium/targets/egl-static/SConscript +++ b/src/gallium/targets/egl-static/SConscript @@ -98,11 +98,12 @@ if env['HAVE_DRM']: ]) if env['HAVE_DRM_RADEON']: - env.Append(CPPDEFINES = ['_EGL_PIPE_R300', '_EGL_PIPE_R600']) + env.Append(CPPDEFINES = ['_EGL_PIPE_R300', '_EGL_PIPE_R600', '_EGL_PIPE_RADEONSI']) env.Prepend(LIBS = [ radeonwinsys, r300, r600, + radeonsi, ]) env.Append(CPPDEFINES = ['_EGL_PIPE_VMWGFX']) diff --git a/src/gallium/targets/egl-static/egl_pipe.c b/src/gallium/targets/egl-static/egl_pipe.c index 887bcfd12c4..407c6a8f236 100644 --- a/src/gallium/targets/egl-static/egl_pipe.c +++ b/src/gallium/targets/egl-static/egl_pipe.c @@ -40,6 +40,8 @@ #include "r300/r300_public.h" /* for r600 */ #include "r600/r600_public.h" +/* for radeonsi */ +#include "radeonsi/radeonsi_public.h" /* for vmwgfx */ #include "svga/drm/svga_drm_public.h" #include "svga/svga_public.h" @@ -132,6 +134,29 @@ pipe_r600_create_screen(int fd) } static struct pipe_screen * +pipe_radeonsi_create_screen(int fd) +{ +#if _EGL_PIPE_RADEONSI + struct radeon_winsys *rw; + struct pipe_screen *screen; + + rw = radeon_drm_winsys_create(fd); + if (!rw) + return NULL; + + screen = radeonsi_screen_create(rw); + if (!screen) + return NULL; + + screen = debug_screen_wrap(screen); + + return screen; +#else + return NULL; +#endif +} + +static struct pipe_screen * pipe_vmwgfx_create_screen(int fd) { #if _EGL_PIPE_VMWGFX @@ -165,6 +190,8 @@ egl_pipe_create_drm_screen(const char *name, int fd) return pipe_r300_create_screen(fd); else if (strcmp(name, "r600") == 0) return pipe_r600_create_screen(fd); + else if (strcmp(name, "radeonsi") == 0) + return 
pipe_radeonsi_create_screen(fd); else if (strcmp(name, "vmwgfx") == 0) return pipe_vmwgfx_create_screen(fd); else diff --git a/src/gallium/targets/gbm/Makefile b/src/gallium/targets/gbm/Makefile index 2737b7986cb..50970f9058e 100644 --- a/src/gallium/targets/gbm/Makefile +++ b/src/gallium/targets/gbm/Makefile @@ -80,6 +80,12 @@ r600_LIBS = \ $(TOP)/src/gallium/drivers/r600/libr600.a r600_SYS += -ldrm_radeon +# radeonsi pipe driver +radeonsi_LIBS = \ + $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \ + $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a +radeonsi_SYS += -ldrm_radeon + # vmwgfx pipe driver vmwgfx_LIBS = \ $(TOP)/src/gallium/winsys/svga/drm/libsvgadrm.a \ @@ -126,6 +132,13 @@ pipe_SOURCES += pipe_r600.c endif endif +ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),) +ifneq ($(findstring radeonsi,$(GALLIUM_DRIVERS_DIRS)),) +_pipe_TARGETS_CC += $(PIPE_PREFIX)radeonsi.so +pipe_SOURCES += pipe_radeonsi.c +endif +endif + ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),) _pipe_TARGETS_CC += $(PIPE_PREFIX)vmwgfx.so pipe_SOURCES += pipe_vmwgfx.c diff --git a/src/gallium/targets/gbm/pipe_radeonsi.c b/src/gallium/targets/gbm/pipe_radeonsi.c new file mode 100644 index 00000000000..bb57118b7b0 --- /dev/null +++ b/src/gallium/targets/gbm/pipe_radeonsi.c @@ -0,0 +1,26 @@ +#include "state_tracker/drm_driver.h" +#include "target-helpers/inline_debug_helper.h" +#include "radeon/drm/radeon_drm_public.h" +#include "radeonsi/radeonsi_public.h" + +static struct pipe_screen * +create_screen(int fd) +{ + struct radeon_winsys *rw; + struct pipe_screen *screen; + + rw = radeon_drm_winsys_create(fd); + if (!rw) + return NULL; + + screen = radeonsi_screen_create(rw); + if (!screen) + return NULL; + + screen = debug_screen_wrap(screen); + + return screen; +} + +PUBLIC +DRM_DRIVER_DESCRIPTOR("radeonsi", "radeon", create_screen, NULL) diff --git a/src/gallium/targets/xorg-radeonsi/Makefile b/src/gallium/targets/xorg-radeonsi/Makefile new file mode 100644 index 
00000000000..af5cf88ea7d --- /dev/null +++ b/src/gallium/targets/xorg-radeonsi/Makefile @@ -0,0 +1,24 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = radeonsi_drv.so + +C_SOURCES = \ + target.c \ + xorg.c + +DRIVER_DEFINES = \ + -DHAVE_CONFIG_H -DGALLIUM_RBUG -DGALLIUM_TRACE -DGALLIUM_GALAHAD + +DRIVER_PIPES = \ + $(TOP)/src/gallium/state_trackers/xorg/libxorgtracker.a \ + $(TOP)/src/gallium/drivers/radeonsi/libradeonsi.a \ + $(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \ + $(TOP)/src/gallium/drivers/galahad/libgalahad.a \ + $(TOP)/src/gallium/drivers/trace/libtrace.a \ + $(TOP)/src/gallium/drivers/rbug/librbug.a + +DRIVER_LINKS = \ + $(shell $(PKG_CONFIG) --libs libdrm) + +include ../Makefile.xorg diff --git a/src/gallium/targets/xorg-radeonsi/target.c b/src/gallium/targets/xorg-radeonsi/target.c new file mode 100644 index 00000000000..c023c687a93 --- /dev/null +++ b/src/gallium/targets/xorg-radeonsi/target.c @@ -0,0 +1,26 @@ + +#include "target-helpers/inline_debug_helper.h" +#include "state_tracker/drm_driver.h" +#include "radeon/drm/radeon_drm_public.h" +#include "radeonsi/radeonsi_public.h" + +static struct pipe_screen * +create_screen(int fd) +{ + struct radeon_winsys *sws; + struct pipe_screen *screen; + + sws = radeon_drm_winsys_create(fd); + if (!sws) + return NULL; + + screen = radeonsi_screen_create(sws); + if (!screen) + return NULL; + + screen = debug_screen_wrap(screen); + + return screen; +} + +DRM_DRIVER_DESCRIPTOR("radeonsi", "radeon", create_screen, NULL) diff --git a/src/gallium/targets/xorg-radeonsi/xorg.c b/src/gallium/targets/xorg-radeonsi/xorg.c new file mode 100644 index 00000000000..3db9f315db1 --- /dev/null +++ b/src/gallium/targets/xorg-radeonsi/xorg.c @@ -0,0 +1,148 @@ +/* + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * + * Author: Alan Hourihane <[email protected]> + * Author: Jakob Bornecrantz <[email protected]> + * Author: Corbin Simpson <[email protected]> + * + */ + +#include "../../state_trackers/xorg/xorg_winsys.h" + +static void radeonsi_xorg_identify(int flags); +static Bool radeonsi_xorg_pci_probe(DriverPtr driver, + int entity_num, + struct pci_device *device, + intptr_t match_data); + +static const struct pci_id_match radeonsi_xorg_device_match[] = { + {0x1002, PCI_MATCH_ANY, PCI_MATCH_ANY, PCI_MATCH_ANY, 0, 0, 0}, + {0, 0, 0}, +}; + +static SymTabRec radeonsi_xorg_chipsets[] = { + {PCI_MATCH_ANY, "AMD Southern Islands Graphics Chipset"}, + {-1, NULL} +}; + +static PciChipsets radeonsi_xorg_pci_devices[] = { + {PCI_MATCH_ANY, PCI_MATCH_ANY, NULL}, + {-1, -1, NULL} +}; + +static XF86ModuleVersionInfo radeonsi_xorg_version = { + "radeonsi", + MODULEVENDORSTRING, + MODINFOSTRING1, + MODINFOSTRING2, + XORG_VERSION_CURRENT, + 0, 1, 0, /* major, minor, patch */ + ABI_CLASS_VIDEODRV, + ABI_VIDEODRV_VERSION, + MOD_CLASS_VIDEODRV, + {0, 0, 0, 0} +}; + +/* + * Xorg driver exported structures + */ + +_X_EXPORT DriverRec radeonsi_driver = { + 1, + "radeonsi", + radeonsi_xorg_identify, + NULL, + xorg_tracker_available_options, + NULL, + 0, + NULL, + radeonsi_xorg_device_match, + radeonsi_xorg_pci_probe +}; + +static MODULESETUPPROTO(radeonsi_xorg_setup); + +_X_EXPORT XF86ModuleData radeonsiModuleData = { + &radeonsi_xorg_version, + radeonsi_xorg_setup, + NULL +}; + +/* + * Xorg driver functions + */ + +static pointer +radeonsi_xorg_setup(pointer module, pointer opts, int *errmaj, int *errmin) +{ + static Bool setupDone = 0; + + /* This module should be loaded only once, but check to be sure. + */ + if (!setupDone) { + setupDone = 1; + xf86AddDriver(&radeonsi_driver, module, HaveDriverFuncs); + + /* + * The return value must be non-NULL on success even though there + * is no TearDownProc. 
+ */ + return (pointer) 1; + } else { + if (errmaj) + *errmaj = LDR_ONCEONLY; + return NULL; + } +} + +static void +radeonsi_xorg_identify(int flags) +{ + xf86PrintChipsets("radeonsi", "Driver for AMD Radeon SI Gallium with KMS", + radeonsi_xorg_chipsets); +} + +static Bool +radeonsi_xorg_pci_probe(DriverPtr driver, + int entity_num, struct pci_device *device, intptr_t match_data) +{ + ScrnInfoPtr scrn = NULL; + EntityInfoPtr entity; + + scrn = xf86ConfigPciEntity(scrn, 0, entity_num, radeonsi_xorg_pci_devices, + NULL, NULL, NULL, NULL, NULL); + if (scrn != NULL) { + scrn->driverVersion = 1; + scrn->driverName = "radeonsi"; + scrn->name = "RADEONSI"; + scrn->Probe = NULL; + + entity = xf86GetEntityInfo(entity_num); + + /* Use all the functions from the xorg tracker */ + xorg_tracker_set_functions(scrn); + } + return scrn != NULL; +} diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index c29dca394f0..4d343b8489b 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -226,6 +226,12 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws) ws->gen = R600; break; +#define CHIPSET(pci_id, name, family) case pci_id: +#include "pci_ids/radeonsi_pci_ids.h" +#undef CHIPSET + ws->gen = SI; + break; + default: fprintf(stderr, "radeon: Invalid PCI ID.\n"); return FALSE; @@ -256,7 +262,7 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws) &ws->info.r300_num_z_pipes)) return FALSE; } - else if (ws->gen == R600) { + else if (ws->gen >= R600) { if (ws->info.drm_minor >= 9 && !radeon_get_drm_value(ws->fd, RADEON_INFO_NUM_BACKENDS, "num backends", diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h index 6ac86bcfabb..22983072fbb 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h @@ -35,7 +35,8 @@ enum 
radeon_generation { R300, - R600 + R600, + SI }; struct radeon_drm_winsys { |