281 files changed, 4973 insertions, 2798 deletions
diff --git a/src/compiler/glsl/Android.gen.mk b/src/compiler/Android.glsl.gen.mk
index de5cd0f474c..b0df8a146c0 100644
--- a/src/compiler/glsl/Android.gen.mk
+++ b/src/compiler/Android.glsl.gen.mk
@@ -32,8 +32,9 @@ intermediates := $(call local-generated-sources-dir)
 LOCAL_SRC_FILES := $(LOCAL_SRC_FILES)
 
 LOCAL_C_INCLUDES += \
-	$(intermediates)/glcpp \
-	$(LOCAL_PATH)/glcpp \
+	$(intermediates)/glsl \
+	$(LOCAL_PATH)/glsl \
+	$(LOCAL_PATH)/glsl/glcpp \
 
 LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \
 	$(LIBGLCPP_GENERATED_FILES) \
@@ -65,14 +66,14 @@ define local-yy-to-cpp-and-h
 	rm -f $(@:$1=$(YACC_HEADER_SUFFIX))
 endef
 
-$(intermediates)/glsl_lexer.cpp: $(LOCAL_PATH)/glsl_lexer.ll
+$(intermediates)/glsl/glsl_lexer.cpp: $(LOCAL_PATH)/glsl/glsl_lexer.ll
 	$(call local-l-or-ll-to-c-or-cpp)
 
-$(intermediates)/glsl_parser.cpp: $(LOCAL_PATH)/glsl_parser.yy
+$(intermediates)/glsl/glsl_parser.cpp: $(LOCAL_PATH)/glsl/glsl_parser.yy
 	$(call local-yy-to-cpp-and-h,.cpp)
 
-$(intermediates)/glcpp/glcpp-lex.c: $(LOCAL_PATH)/glcpp/glcpp-lex.l
+$(intermediates)/glsl/glcpp/glcpp-lex.c: $(LOCAL_PATH)/glsl/glcpp/glcpp-lex.l
 	$(call local-l-or-ll-to-c-or-cpp)
 
-$(intermediates)/glcpp/glcpp-parse.c: $(LOCAL_PATH)/glcpp/glcpp-parse.y
+$(intermediates)/glsl/glcpp/glcpp-parse.c: $(LOCAL_PATH)/glsl/glcpp/glcpp-parse.y
 	$(call glsl_local-y-to-c-and-h)
diff --git a/src/compiler/glsl/Android.mk b/src/compiler/Android.glsl.mk
index f5d96b300f0..d9cf06d208f 100644
--- a/src/compiler/glsl/Android.mk
+++ b/src/compiler/Android.glsl.mk
@@ -47,7 +47,7 @@ LOCAL_STATIC_LIBRARIES := libmesa_compiler
 
 LOCAL_MODULE := libmesa_glsl
 
-include $(LOCAL_PATH)/Android.gen.mk
+include $(LOCAL_PATH)/Android.glsl.gen.mk
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
 
diff --git a/src/compiler/Android.mk b/src/compiler/Android.mk
index 888780ba3fb..ac0ced58334 100644
--- a/src/compiler/Android.mk
+++ b/src/compiler/Android.mk
@@ -43,25 +43,6 @@ LOCAL_MODULE := libmesa_compiler
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
 
-# ---------------------------------------
-# Build libmesa_nir
-# ---------------------------------------
-
-include $(CLEAR_VARS)
+include $(LOCAL_PATH)/Android.glsl.mk
 
-LOCAL_SRC_FILES := \
-	$(NIR_FILES)
-
-LOCAL_C_INCLUDES := \
-	$(MESA_TOP)/src/mapi \
-	$(MESA_TOP)/src/mesa \
-	$(MESA_TOP)/src/gallium/include \
-	$(MESA_TOP)/src/gallium/auxiliary
-
-LOCAL_STATIC_LIBRARIES := libmesa_compiler
-
-LOCAL_MODULE := libmesa_nir
-
-include $(LOCAL_PATH)/Android.gen.mk
-include $(MESA_COMMON_MK)
-include $(BUILD_STATIC_LIBRARY)
+include $(LOCAL_PATH)/Android.nir.mk
diff --git a/src/compiler/Android.gen.mk b/src/compiler/Android.nir.gen.mk
index 96fc750ec64..96fc750ec64 100644
--- a/src/compiler/Android.gen.mk
+++ b/src/compiler/Android.nir.gen.mk
diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk
new file mode 100644
index 00000000000..e89a21c03ac
--- /dev/null
+++ b/src/compiler/Android.nir.mk
@@ -0,0 +1,51 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# Shared source lists; NIR_FILES used below is defined in Makefile.sources.
+include $(LOCAL_PATH)/Makefile.sources
+
+# ---------------------------------------
+# Build libmesa_nir
+# ---------------------------------------
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := \
+	$(NIR_FILES)
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/mapi \
+	$(MESA_TOP)/src/mesa \
+	$(MESA_TOP)/src/gallium/include \
+	$(MESA_TOP)/src/gallium/auxiliary
+
+LOCAL_STATIC_LIBRARIES := libmesa_compiler
+
+LOCAL_MODULE := libmesa_nir
+
+# Android.nir.gen.mk presumably supplies the rules for the generated NIR
+# sources (its contents are not part of this change -- confirm).
+include $(LOCAL_PATH)/Android.nir.gen.mk
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/compiler/Makefile.am b/src/compiler/Makefile.am
index 5032890e73d..dc30f908d8a 100644
--- a/src/compiler/Makefile.am
+++ b/src/compiler/Makefile.am
@@ -54,273 +54,8 @@ BUILT_SOURCES =
 CLEANFILES =
 EXTRA_DIST = SConscript
 
-
-EXTRA_DIST += glsl/tests glsl/glcpp/tests glsl/README	\
-	glsl/TODO glsl/glcpp/README			\
-	glsl/glsl_lexer.ll				\
-	glsl/glsl_parser.yy				\
-	glsl/glcpp/glcpp-lex.l				\
-	glsl/glcpp/glcpp-parse.y			\
-	glsl/Makefile.sources				\
-	glsl/SConscript
-
-TESTS += glsl/glcpp/tests/glcpp-test			\
-	glsl/glcpp/tests/glcpp-test-cr-lf		\
-	glsl/tests/blob-test				\
-	glsl/tests/general-ir-test			\
-	glsl/tests/optimization-test			\
-	glsl/tests/sampler-types-test			\
-	glsl/tests/uniform-initializer-test
-
-TESTS_ENVIRONMENT= \
-	export PYTHON2=$(PYTHON2); \
-	export PYTHON_FLAGS=$(PYTHON_FLAGS);
-
-check_PROGRAMS +=					\
-	glsl/glcpp/glcpp				\
-	glsl/glsl_test					\
-	glsl/tests/blob-test				\
-	glsl/tests/general-ir-test			\
-	glsl/tests/sampler-types-test			\
-	glsl/tests/uniform-initializer-test
-
-noinst_PROGRAMS = glsl_compiler
-
-glsl_tests_blob_test_SOURCES =				\
-	glsl/tests/blob_test.c
-glsl_tests_blob_test_LDADD =				\
-	glsl/libglsl.la
-
-glsl_tests_general_ir_test_SOURCES =			\
-	glsl/standalone_scaffolding.cpp			\
-	glsl/tests/builtin_variable_test.cpp		\
-	glsl/tests/invalidate_locations_test.cpp	\
-	glsl/tests/general_ir_test.cpp			\
-	glsl/tests/varyings_test.cpp
-glsl_tests_general_ir_test_CFLAGS =			\
-	$(PTHREAD_CFLAGS)
-glsl_tests_general_ir_test_LDADD =			\
-	$(top_builddir)/src/gtest/libgtest.la		\
-	glsl/libglsl.la		\
-	$(top_builddir)/src/libglsl_util.la		\
-	$(PTHREAD_LIBS)
-
-glsl_tests_uniform_initializer_test_SOURCES =		\
-	glsl/tests/copy_constant_to_storage_tests.cpp	\
-	glsl/tests/set_uniform_initializer_tests.cpp	\
-	glsl/tests/uniform_initializer_utils.cpp	\
-	glsl/tests/uniform_initializer_utils.h
-glsl_tests_uniform_initializer_test_CFLAGS =		\
-	$(PTHREAD_CFLAGS)
-glsl_tests_uniform_initializer_test_LDADD =		\
-	$(top_builddir)/src/gtest/libgtest.la		\
-	glsl/libglsl.la		\
-	$(top_builddir)/src/libglsl_util.la		\
-	$(PTHREAD_LIBS)
-
-glsl_tests_sampler_types_test_SOURCES =			\
-	glsl/tests/sampler_types_test.cpp
-glsl_tests_sampler_types_test_CFLAGS =			\
-	$(PTHREAD_CFLAGS)
-glsl_tests_sampler_types_test_LDADD =			\
-	$(top_builddir)/src/gtest/libgtest.la		\
-	glsl/libglsl.la					\
-	$(top_builddir)/src/libglsl_util.la		\
-	$(PTHREAD_LIBS)
-
-noinst_LTLIBRARIES += glsl/libglsl.la glsl/libglcpp.la
-
-glsl_libglcpp_la_LIBADD =				\
-	$(top_builddir)/src/util/libmesautil.la
-glsl_libglcpp_la_SOURCES =				\
-	glsl/glcpp/glcpp-lex.c				\
-	glsl/glcpp/glcpp-parse.c			\
-	glsl/glcpp/glcpp-parse.h			\
-	$(LIBGLCPP_FILES)
-
-glsl_glcpp_glcpp_SOURCES =				\
-	glsl/glcpp/glcpp.c
-glsl_glcpp_glcpp_LDADD =				\
-	glsl/libglcpp.la	\
-	$(top_builddir)/src/libglsl_util.la		\
-	-lm
-
-glsl_libglsl_la_LIBADD = \
-	nir/libnir.la \
-	glsl/libglcpp.la
-
-glsl_libglsl_la_SOURCES =				\
-	glsl/glsl_lexer.cpp				\
-	glsl/glsl_parser.cpp				\
-	glsl/glsl_parser.h				\
-	$(LIBGLSL_FILES)
-
-
-glsl_compiler_SOURCES = \
-	$(GLSL_COMPILER_CXX_FILES)
-
-glsl_compiler_LDADD =					\
-	glsl/libglsl.la					\
-	$(top_builddir)/src/libglsl_util.la		\
-	$(top_builddir)/src/util/libmesautil.la		\
-	$(PTHREAD_LIBS)
-
-glsl_glsl_test_SOURCES = \
-	glsl/standalone_scaffolding.cpp \
-	glsl/test.cpp \
-	glsl/test_optpass.cpp \
-	glsl/test_optpass.h
-
-glsl_glsl_test_LDADD =					\
-	glsl/libglsl.la					\
-	$(top_builddir)/src/libglsl_util.la		\
-	$(PTHREAD_LIBS)
-
-# We write our own rules for yacc and lex below. We'd rather use automake,
-# but automake makes it especially difficult for a number of reasons:
-#
-#  * < automake-1.12 generates .h files from .yy and .ypp files, but
-#    >=automake-1.12 generates .hh and .hpp files respectively. There's no
-#    good way of making a project that uses C++ yacc files compatible with
-#    both versions of automake. Strong work automake developers.
-#
-#  * Since we're generating code from .l/.y files in a subdirectory (glcpp/)
-#    we'd like the resulting generated code to also go in glcpp/ for purposes
-#    of distribution. Automake gives no way to do this.
-#
-#  * Since we're building multiple yacc parsers into one library (and via one
-#    Makefile) we have to use per-target YFLAGS. Using per-target YFLAGS causes
-#    automake to name the resulting generated code as <library-name>_filename.c.
-#    Frankly, that's ugly and we don't want a libglcpp_glcpp_parser.h file.
-
-# In order to make build output print "LEX" and "YACC", we reproduce the
-# automake variables below.
-
-AM_V_LEX = $(am__v_LEX_$(V))
-am__v_LEX_ = $(am__v_LEX_$(AM_DEFAULT_VERBOSITY))
-am__v_LEX_0 = @echo "  LEX     " $@;
-am__v_LEX_1 =
-
-AM_V_YACC = $(am__v_YACC_$(V))
-am__v_YACC_ = $(am__v_YACC_$(AM_DEFAULT_VERBOSITY))
-am__v_YACC_0 = @echo "  YACC    " $@;
-am__v_YACC_1 =
-
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
-YACC_GEN = $(AM_V_YACC)$(YACC) $(YFLAGS)
-LEX_GEN = $(AM_V_LEX)$(LEX) $(LFLAGS)
-
-glsl/glsl_parser.cpp glsl/glsl_parser.h: glsl/glsl_parser.yy
-	$(MKDIR_GEN)
-	$(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl/glsl_parser.h $(srcdir)/glsl/glsl_parser.yy
-
-glsl/glsl_lexer.cpp: glsl/glsl_lexer.ll
-	$(MKDIR_GEN)
-	$(LEX_GEN) -o $@ $(srcdir)/glsl/glsl_lexer.ll
-
-glsl/glcpp/glcpp-parse.c glsl/glcpp/glcpp-parse.h: glsl/glcpp/glcpp-parse.y
-	$(MKDIR_GEN)
-	$(YACC_GEN) -o $@ -p "glcpp_parser_" --defines=$(builddir)/glsl/glcpp/glcpp-parse.h $(srcdir)/glsl/glcpp/glcpp-parse.y
-
-glsl/glcpp/glcpp-lex.c: glsl/glcpp/glcpp-lex.l
-	$(MKDIR_GEN)
-	$(LEX_GEN) -o $@ $(srcdir)/glsl/glcpp/glcpp-lex.l
-
-# Only the parsers (specifically the header files generated at the same time)
-# need to be in BUILT_SOURCES. Though if we list the parser headers YACC is
-# called for the .c/.cpp file and the .h files. By listing the .c/.cpp files
-# YACC is only executed once for each parser. The rest of the generated code
-# will be created at the appropriate times according to standard automake
-# dependency rules.
-BUILT_SOURCES +=					\
-	glsl/glsl_parser.cpp				\
-	glsl/glsl_lexer.cpp				\
-	glsl/glcpp/glcpp-parse.c			\
-	glsl/glcpp/glcpp-lex.c
-CLEANFILES +=						\
-	glsl/glcpp/glcpp-parse.h			\
-	glsl/glsl_parser.h				\
-	glsl/glsl_parser.cpp				\
-	glsl/glsl_lexer.cpp				\
-	glsl/glcpp/glcpp-parse.c			\
-	glsl/glcpp/glcpp-lex.c
-
-clean-local:
-	$(RM) -r subtest-cr subtest-cr-lf subtest-lf subtest-lf-cr
-
-dist-hook:
-	$(RM) glsl/glcpp/tests/*.out
-	$(RM) glsl/glcpp/tests/subtest*/*.out
-
-noinst_LTLIBRARIES += nir/libnir.la
-
-nir_libnir_la_CPPFLAGS = \
-	$(AM_CPPFLAGS) \
-	-I$(top_builddir)/src/compiler/nir \
-	-I$(top_srcdir)/src/compiler/nir
-
-nir_libnir_la_LIBADD = \
-	libcompiler.la
-
-nir_libnir_la_SOURCES =					\
-	$(NIR_FILES)					\
-	$(SPIRV_FILES)					\
-	$(NIR_GENERATED_FILES)
-
-PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
-
-nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py
-	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@ || ($(RM) $@; false)
-
-nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py
-	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false)
-
-nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
-	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@ || ($(RM) $@; false)
-
-nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py
-	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@ || ($(RM) $@; false)
-
-nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py
-	$(MKDIR_GEN)
-	$(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false)
-
-
-check_PROGRAMS += nir/tests/control_flow_tests
-
-nir_tests_control_flow_tests_CPPFLAGS = \
-	$(AM_CPPFLAGS) \
-	-I$(top_builddir)/src/compiler/nir \
-	-I$(top_srcdir)/src/compiler/nir
-
-nir_tests_control_flow_tests_SOURCES =			\
-	nir/tests/control_flow_tests.cpp
-nir_tests_control_flow_tests_CFLAGS =			\
-	$(PTHREAD_CFLAGS)
-nir_tests_control_flow_tests_LDADD =			\
-	$(top_builddir)/src/gtest/libgtest.la		\
-	nir/libnir.la	\
-	$(top_builddir)/src/util/libmesautil.la		\
-	$(PTHREAD_LIBS)
-
-
-TESTS += nir/tests/control_flow_tests
-
 
-BUILT_SOURCES += $(NIR_GENERATED_FILES)
-CLEANFILES += $(NIR_GENERATED_FILES)
+include Makefile.glsl.am
 
-EXTRA_DIST += \
-	nir/nir_algebraic.py				\
-	nir/nir_builder_opcodes_h.py			\
-	nir/nir_constant_expressions.py			\
-	nir/nir_opcodes.py				\
-	nir/nir_opcodes_c.py				\
-	nir/nir_opcodes_h.py				\
-	nir/nir_opt_algebraic.py			\
-	nir/tests					\
-	nir/Makefile.sources
+include Makefile.nir.am
diff --git a/src/compiler/glsl/Makefile.am b/src/compiler/Makefile.glsl.am
index 9954b812403..daf98f61244 100644
--- a/src/compiler/glsl/Makefile.am
+++ b/src/compiler/Makefile.glsl.am
@@ -1,4 +1,6 @@
+#
 # Copyright © 2012 Jon TURNEY
+# Copyright (C) 2015 Intel Corporation
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -19,120 +21,103 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-AM_CPPFLAGS = \
-	-I$(top_srcdir)/include \
-	-I$(top_srcdir)/src \
-	-I$(top_srcdir)/src/mapi \
-	-I$(top_srcdir)/src/mesa/ \
-	-I$(top_srcdir)/src/gallium/include \
-	-I$(top_srcdir)/src/gallium/auxiliary \
-	-I$(top_srcdir)/src/glsl/glcpp \
-	-I$(top_srcdir)/src/gtest/include \
-	$(DEFINES)
-AM_CFLAGS = \
-	$(VISIBILITY_CFLAGS) \
-	$(MSVC2013_COMPAT_CFLAGS)
-AM_CXXFLAGS = \
-	$(VISIBILITY_CXXFLAGS) \
-	$(MSVC2013_COMPAT_CXXFLAGS)
-
-EXTRA_DIST = tests glcpp/tests README TODO glcpp/README	\
-	glsl_lexer.ll					\
-	glsl_parser.yy					\
-	glcpp/glcpp-lex.l				\
-	glcpp/glcpp-parse.y				\
-	SConscript
-
-include Makefile.sources
-
-TESTS = glcpp/tests/glcpp-test				\
-	glcpp/tests/glcpp-test-cr-lf			\
-	tests/blob-test					\
-	tests/general-ir-test				\
-	tests/optimization-test				\
-	tests/sampler-types-test                        \
-	tests/uniform-initializer-test
+EXTRA_DIST += glsl/tests glsl/glcpp/tests glsl/README	\
+	glsl/TODO glsl/glcpp/README			\
+	glsl/glsl_lexer.ll				\
+	glsl/glsl_parser.yy				\
+	glsl/glcpp/glcpp-lex.l				\
+	glsl/glcpp/glcpp-parse.y			\
+	SConscript.glsl
+
+TESTS += glsl/glcpp/tests/glcpp-test			\
+	glsl/glcpp/tests/glcpp-test-cr-lf		\
+	glsl/tests/blob-test				\
+	glsl/tests/general-ir-test			\
+	glsl/tests/optimization-test			\
+	glsl/tests/sampler-types-test			\
+	glsl/tests/uniform-initializer-test
 
 TESTS_ENVIRONMENT= \
 	export PYTHON2=$(PYTHON2); \
 	export PYTHON_FLAGS=$(PYTHON_FLAGS);
 
-noinst_LTLIBRARIES = libglsl.la libglcpp.la
-check_PROGRAMS =					\
-	glcpp/glcpp					\
-	glsl_test					\
-	tests/blob-test					\
-	tests/general-ir-test				\
-	tests/sampler-types-test			\
-	tests/uniform-initializer-test
+check_PROGRAMS +=					\
+	glsl/glcpp/glcpp				\
+	glsl/glsl_test					\
+	glsl/tests/blob-test				\
+	glsl/tests/general-ir-test			\
+	glsl/tests/sampler-types-test			\
+	glsl/tests/uniform-initializer-test
 
 noinst_PROGRAMS = glsl_compiler
 
-tests_blob_test_SOURCES =				\
-	tests/blob_test.c
-tests_blob_test_LDADD =					\
-	$(top_builddir)/src/glsl/libglsl.la
-
-tests_general_ir_test_SOURCES =		\
-	standalone_scaffolding.cpp			\
-	tests/builtin_variable_test.cpp			\
-	tests/invalidate_locations_test.cpp		\
-	tests/general_ir_test.cpp			\
-	tests/varyings_test.cpp
-tests_general_ir_test_CFLAGS =				\
+glsl_tests_blob_test_SOURCES =				\
+	glsl/tests/blob_test.c
+glsl_tests_blob_test_LDADD =				\
+	glsl/libglsl.la
+
+glsl_tests_general_ir_test_SOURCES =			\
+	glsl/standalone_scaffolding.cpp			\
+	glsl/tests/builtin_variable_test.cpp		\
+	glsl/tests/invalidate_locations_test.cpp	\
+	glsl/tests/general_ir_test.cpp			\
+	glsl/tests/varyings_test.cpp
+glsl_tests_general_ir_test_CFLAGS =			\
 	$(PTHREAD_CFLAGS)
-tests_general_ir_test_LDADD =				\
+glsl_tests_general_ir_test_LDADD =			\
 	$(top_builddir)/src/gtest/libgtest.la		\
-	$(top_builddir)/src/glsl/libglsl.la		\
+	glsl/libglsl.la		\
 	$(top_builddir)/src/libglsl_util.la		\
 	$(PTHREAD_LIBS)
 
-tests_uniform_initializer_test_SOURCES =		\
-	tests/copy_constant_to_storage_tests.cpp	\
-	tests/set_uniform_initializer_tests.cpp		\
-	tests/uniform_initializer_utils.cpp		\
-	tests/uniform_initializer_utils.h
-tests_uniform_initializer_test_CFLAGS =			\
+glsl_tests_uniform_initializer_test_SOURCES =		\
+	glsl/tests/copy_constant_to_storage_tests.cpp	\
+	glsl/tests/set_uniform_initializer_tests.cpp	\
+	glsl/tests/uniform_initializer_utils.cpp	\
+	glsl/tests/uniform_initializer_utils.h
+glsl_tests_uniform_initializer_test_CFLAGS =		\
 	$(PTHREAD_CFLAGS)
-tests_uniform_initializer_test_LDADD =			\
+glsl_tests_uniform_initializer_test_LDADD =		\
 	$(top_builddir)/src/gtest/libgtest.la		\
-	$(top_builddir)/src/glsl/libglsl.la		\
+	glsl/libglsl.la		\
 	$(top_builddir)/src/libglsl_util.la		\
 	$(PTHREAD_LIBS)
 
-tests_sampler_types_test_SOURCES =			\
-	tests/sampler_types_test.cpp
-tests_sampler_types_test_CFLAGS =			\
+glsl_tests_sampler_types_test_SOURCES =			\
+	glsl/tests/sampler_types_test.cpp
+glsl_tests_sampler_types_test_CFLAGS =			\
 	$(PTHREAD_CFLAGS)
-tests_sampler_types_test_LDADD =			\
+glsl_tests_sampler_types_test_LDADD =			\
 	$(top_builddir)/src/gtest/libgtest.la		\
-	$(top_builddir)/src/glsl/libglsl.la		\
+	glsl/libglsl.la					\
 	$(top_builddir)/src/libglsl_util.la		\
 	$(PTHREAD_LIBS)
 
-libglcpp_la_LIBADD =					\
+noinst_LTLIBRARIES += glsl/libglsl.la glsl/libglcpp.la
+
+glsl_libglcpp_la_LIBADD =				\
 	$(top_builddir)/src/util/libmesautil.la
-libglcpp_la_SOURCES =					\
-	glcpp/glcpp-lex.c				\
-	glcpp/glcpp-parse.c				\
-	glcpp/glcpp-parse.h				\
+glsl_libglcpp_la_SOURCES =				\
+	glsl/glcpp/glcpp-lex.c				\
+	glsl/glcpp/glcpp-parse.c			\
+	glsl/glcpp/glcpp-parse.h			\
 	$(LIBGLCPP_FILES)
 
-glcpp_glcpp_SOURCES =					\
-	glcpp/glcpp.c
-glcpp_glcpp_LDADD =					\
-	libglcpp.la					\
+glsl_glcpp_glcpp_SOURCES =				\
+	glsl/glcpp/glcpp.c
+glsl_glcpp_glcpp_LDADD =				\
+	glsl/libglcpp.la	\
 	$(top_builddir)/src/libglsl_util.la		\
 	-lm
 
-libglsl_la_LIBADD = \
-	$(top_builddir)/src/compiler/nir/libnir.la \
-	libglcpp.la
+glsl_libglsl_la_LIBADD = \
+	nir/libnir.la \
+	glsl/libglcpp.la
 
-libglsl_la_SOURCES =					\
-	glsl_lexer.cpp					\
-	glsl_parser.cpp					\
-	glsl_parser.h					\
+glsl_libglsl_la_SOURCES =				\
+	glsl/glsl_lexer.cpp				\
+	glsl/glsl_parser.cpp				\
+	glsl/glsl_parser.h				\
 	$(LIBGLSL_FILES)
 
 
@@ -140,19 +125,19 @@ glsl_compiler_SOURCES = \
 	$(GLSL_COMPILER_CXX_FILES)
 
 glsl_compiler_LDADD =					\
-	libglsl.la					\
+	glsl/libglsl.la					\
 	$(top_builddir)/src/libglsl_util.la		\
 	$(top_builddir)/src/util/libmesautil.la		\
 	$(PTHREAD_LIBS)
 
-glsl_test_SOURCES = \
-	standalone_scaffolding.cpp \
-	test.cpp \
-	test_optpass.cpp \
-	test_optpass.h
+glsl_glsl_test_SOURCES = \
+	glsl/standalone_scaffolding.cpp \
+	glsl/test.cpp \
+	glsl/test_optpass.cpp \
+	glsl/test_optpass.h
 
-glsl_test_LDADD =					\
-	libglsl.la					\
+glsl_glsl_test_LDADD =					\
+	glsl/libglsl.la					\
 	$(top_builddir)/src/libglsl_util.la		\
 	$(PTHREAD_LIBS)
 
@@ -186,23 +171,24 @@ am__v_YACC_ = $(am__v_YACC_$(AM_DEFAULT_VERBOSITY))
 am__v_YACC_0 = @echo "  YACC    " $@;
 am__v_YACC_1 =
 
-MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 YACC_GEN = $(AM_V_YACC)$(YACC) $(YFLAGS)
 LEX_GEN = $(AM_V_LEX)$(LEX) $(LFLAGS)
 
-glsl_parser.cpp glsl_parser.h: glsl_parser.yy
-	$(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl_parser.h $(srcdir)/glsl_parser.yy
+glsl/glsl_parser.cpp glsl/glsl_parser.h: glsl/glsl_parser.yy
+	$(MKDIR_GEN)
+	$(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl/glsl_parser.h $(srcdir)/glsl/glsl_parser.yy
 
-glsl_lexer.cpp: glsl_lexer.ll
-	$(LEX_GEN) -o $@ $(srcdir)/glsl_lexer.ll
+glsl/glsl_lexer.cpp: glsl/glsl_lexer.ll
+	$(MKDIR_GEN)
+	$(LEX_GEN) -o $@ $(srcdir)/glsl/glsl_lexer.ll
 
-glcpp/glcpp-parse.c glcpp/glcpp-parse.h: glcpp/glcpp-parse.y
+glsl/glcpp/glcpp-parse.c glsl/glcpp/glcpp-parse.h: glsl/glcpp/glcpp-parse.y
 	$(MKDIR_GEN)
-	$(YACC_GEN) -o $@ -p "glcpp_parser_" --defines=$(builddir)/glcpp/glcpp-parse.h $(srcdir)/glcpp/glcpp-parse.y
+	$(YACC_GEN) -o $@ -p "glcpp_parser_" --defines=$(builddir)/glsl/glcpp/glcpp-parse.h $(srcdir)/glsl/glcpp/glcpp-parse.y
 
-glcpp/glcpp-lex.c: glcpp/glcpp-lex.l
+glsl/glcpp/glcpp-lex.c: glsl/glcpp/glcpp-lex.l
 	$(MKDIR_GEN)
-	$(LEX_GEN) -o $@ $(srcdir)/glcpp/glcpp-lex.l
+	$(LEX_GEN) -o $@ $(srcdir)/glsl/glcpp/glcpp-lex.l
 
 # Only the parsers (specifically the header files generated at the same time)
 # need to be in BUILT_SOURCES. Though if we list the parser headers YACC is
@@ -210,19 +196,22 @@ glcpp/glcpp-lex.c: glcpp/glcpp-lex.l
 # YACC is only executed once for each parser. The rest of the generated code
 # will be created at the appropriate times according to standard automake
 # dependency rules.
-BUILT_SOURCES =						\
-	glsl_parser.cpp					\
-	glsl_lexer.cpp					\
-	glcpp/glcpp-parse.c				\
-	glcpp/glcpp-lex.c
-CLEANFILES =						\
-	glcpp/glcpp-parse.h				\
-	glsl_parser.h					\
-	$(BUILT_SOURCES)
+BUILT_SOURCES +=					\
+	glsl/glsl_parser.cpp				\
+	glsl/glsl_lexer.cpp				\
+	glsl/glcpp/glcpp-parse.c			\
+	glsl/glcpp/glcpp-lex.c
+CLEANFILES +=						\
+	glsl/glcpp/glcpp-parse.h			\
+	glsl/glsl_parser.h				\
+	glsl/glsl_parser.cpp				\
+	glsl/glsl_lexer.cpp				\
+	glsl/glcpp/glcpp-parse.c			\
+	glsl/glcpp/glcpp-lex.c
 
 clean-local:
 	$(RM) -r subtest-cr subtest-cr-lf subtest-lf subtest-lf-cr
 
 dist-hook:
-	$(RM) glcpp/tests/*.out
-	$(RM) glcpp/tests/subtest*/*.out
+	$(RM) glsl/glcpp/tests/*.out
+	$(RM) glsl/glcpp/tests/subtest*/*.out
diff --git a/src/compiler/Makefile.nir.am b/src/compiler/Makefile.nir.am
new file mode 100644
index 00000000000..45a9c809bf8
--- /dev/null
+++ b/src/compiler/Makefile.nir.am
@@ -0,0 +1,102 @@
+#
+# Copyright © 2012 Jon TURNEY
+# Copyright (C) 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# libnir is a convenience library: it is built but not installed.
+noinst_LTLIBRARIES += nir/libnir.la
+
+nir_libnir_la_CPPFLAGS = \
+	$(AM_CPPFLAGS) \
+	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/src/compiler/nir
+
+nir_libnir_la_LIBADD = \
+	libcompiler.la
+
+nir_libnir_la_SOURCES =					\
+	$(NIR_FILES)					\
+	$(SPIRV_FILES)					\
+	$(NIR_GENERATED_FILES)
+
+# Generated NIR sources.  Each rule creates the output directory (MKDIR_GEN
+# comes from the including Makefile.am), runs the generator with its output
+# redirected to the target, and removes the target if the generator fails.
+PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS)
+
+nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@ || ($(RM) $@; false)
+
+nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false)
+
+nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@ || ($(RM) $@; false)
+
+nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@ || ($(RM) $@; false)
+
+nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py
+	$(MKDIR_GEN)
+	$(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false)
+
+
+check_PROGRAMS += nir/tests/control_flow_tests
+
+nir_tests_control_flow_tests_CPPFLAGS = \
+	$(AM_CPPFLAGS) \
+	-I$(top_builddir)/src/compiler/nir \
+	-I$(top_srcdir)/src/compiler/nir
+
+nir_tests_control_flow_tests_SOURCES =			\
+	nir/tests/control_flow_tests.cpp
+# The test is built only from C++ sources, so the pthread flags must be given
+# through CXXFLAGS: automake never applies a per-target _CFLAGS to .cpp files.
+# AM_CXXFLAGS is listed explicitly since a per-target variable replaces it.
+nir_tests_control_flow_tests_CXXFLAGS =			\
+	$(AM_CXXFLAGS)					\
+	$(PTHREAD_CFLAGS)
+nir_tests_control_flow_tests_LDADD =			\
+	$(top_builddir)/src/gtest/libgtest.la		\
+	nir/libnir.la	\
+	$(top_builddir)/src/util/libmesautil.la		\
+	$(PTHREAD_LIBS)
+
+
+TESTS += nir/tests/control_flow_tests
+
+
+BUILT_SOURCES += $(NIR_GENERATED_FILES)
+CLEANFILES += $(NIR_GENERATED_FILES)
+
+EXTRA_DIST += \
+	nir/nir_algebraic.py				\
+	nir/nir_builder_opcodes_h.py			\
+	nir/nir_constant_expressions.py			\
+	nir/nir_opcodes.py				\
+	nir/nir_opcodes_c.py				\
+	nir/nir_opcodes_h.py				\
+	nir/nir_opt_algebraic.py			\
+	nir/tests
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 120ef2935a7..adc7a428469 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -187,6 +187,7 @@ NIR_FILES = \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
 	nir/nir_lower_clip.c \
+	nir/nir_lower_double_packing.c \
 	nir/nir_lower_global_vars_to_local.c \
 	nir/nir_lower_gs_intrinsics.c \
 	nir/nir_lower_load_const_to_scalar.c \
diff --git a/src/compiler/SConscript b/src/compiler/SConscript
index 8d71b82bee0..8969d821984 100644
--- a/src/compiler/SConscript
+++ b/src/compiler/SConscript
@@ -21,4 +21,4 @@ compiler = env.ConvenienceLibrary(
 )
 Export('compiler')
 
-SConscript('glsl/SConscript')
+SConscript('SConscript.glsl')
diff --git a/src/compiler/glsl/SConscript b/src/compiler/SConscript.glsl
index ef82a9d317a..43a11d105d4 100644
--- a/src/compiler/glsl/SConscript
+++ b/src/compiler/SConscript.glsl
@@ -15,14 +15,14 @@ env.Prepend(CPPPATH = [
     '#src/mesa',
     '#src/gallium/include',
     '#src/gallium/auxiliary',
-    '#src/glsl',
-    '#src/glsl/glcpp',
+    '#src/compiler/glsl',
+    '#src/compiler/glsl/glcpp',
 ])
 
 env.Prepend(LIBS = [mesautil])
 
 # Make glcpp-parse.h and glsl_parser.h reachable from the include path.
-env.Append(CPPPATH = [Dir('.').abspath, Dir('glcpp').abspath])
+env.Prepend(CPPPATH = [Dir('.').abspath, Dir('glsl').abspath])
 
 glcpp_env = env.Clone()
 glcpp_env.Append(YACCFLAGS = [
@@ -32,7 +32,7 @@ glcpp_env.Append(YACCFLAGS = [
 
 glsl_env = env.Clone()
 glsl_env.Append(YACCFLAGS = [
-    '--defines=%s' % File('glsl_parser.h').abspath,
+    '--defines=%s' % File('glsl/glsl_parser.h').abspath,
     '-p', '_mesa_glsl_',
 ])
 
@@ -40,10 +40,10 @@ glsl_env.Append(YACCFLAGS = [
 # "glsl_parser.h", causing glsl_parser.cpp to be regenerated every time
 glsl_env['YACCHXXFILESUFFIX'] = '.h'
 
-glcpp_lexer = glcpp_env.CFile('glcpp/glcpp-lex.c', 'glcpp/glcpp-lex.l')
-glcpp_parser = glcpp_env.CFile('glcpp/glcpp-parse.c', 'glcpp/glcpp-parse.y')
-glsl_lexer = glsl_env.CXXFile('glsl_lexer.cpp', 'glsl_lexer.ll')
-glsl_parser = glsl_env.CXXFile('glsl_parser.cpp', 'glsl_parser.yy')
+glcpp_lexer = glcpp_env.CFile('glsl/glcpp/glcpp-lex.c', 'glsl/glcpp/glcpp-lex.l')
+glcpp_parser = glcpp_env.CFile('glsl/glcpp/glcpp-parse.c', 'glsl/glcpp/glcpp-parse.y')
+glsl_lexer = glsl_env.CXXFile('glsl/glsl_lexer.cpp', 'glsl/glsl_lexer.ll')
+glsl_parser = glsl_env.CXXFile('glsl/glsl_parser.cpp', 'glsl/glsl_parser.yy')
 
 # common generated sources
 glsl_sources = [
@@ -66,20 +66,20 @@ if env['msvc']:
 
 # Copy these files to avoid generation object files into src/mesa/program
 env.Prepend(CPPPATH = ['#src/mesa/main'])
-env.Command('imports.c', '#src/mesa/main/imports.c', Copy('$TARGET', '$SOURCE'))
+env.Command('glsl/imports.c', '#src/mesa/main/imports.c', Copy('$TARGET', '$SOURCE'))
 # Copy these files to avoid generation object files into src/mesa/program
 env.Prepend(CPPPATH = ['#src/mesa/program'])
-env.Command('prog_hash_table.c', '#src/mesa/program/prog_hash_table.c', Copy('$TARGET', '$SOURCE'))
-env.Command('symbol_table.c', '#src/mesa/program/symbol_table.c', Copy('$TARGET', '$SOURCE'))
-env.Command('dummy_errors.c', '#src/mesa/program/dummy_errors.c', Copy('$TARGET', '$SOURCE'))
+env.Command('glsl/prog_hash_table.c', '#src/mesa/program/prog_hash_table.c', Copy('$TARGET', '$SOURCE'))
+env.Command('glsl/symbol_table.c', '#src/mesa/program/symbol_table.c', Copy('$TARGET', '$SOURCE'))
+env.Command('glsl/dummy_errors.c', '#src/mesa/program/dummy_errors.c', Copy('$TARGET', '$SOURCE'))
 
 compiler_objs = env.StaticObject(source_lists['GLSL_COMPILER_CXX_FILES'])
 
 mesa_objs = env.StaticObject([
-    'imports.c',
-    'prog_hash_table.c',
-    'symbol_table.c',
-    'dummy_errors.c',
+    'glsl/imports.c',
+    'glsl/prog_hash_table.c',
+    'glsl/symbol_table.c',
+    'glsl/dummy_errors.c',
 ])
 
 compiler_objs += mesa_objs
@@ -116,7 +116,7 @@ glsl_compiler = env.Program(
 env.Alias('glsl_compiler', glsl_compiler)
 
 glcpp = env.Program(
-    target = 'glcpp/glcpp',
-    source = ['glcpp/glcpp.c'] + mesa_objs,
+    target = 'glsl/glcpp/glcpp',
+    source = ['glsl/glcpp/glcpp.c'] + mesa_objs,
 )
 env.Alias('glcpp', glcpp)
diff --git a/src/compiler/glsl/Makefile.sources b/src/compiler/glsl/Makefile.sources
deleted file mode 100644
index 538196a79a9..00000000000
--- a/src/compiler/glsl/Makefile.sources
+++ /dev/null
@@ -1,223 +0,0 @@
-# shared source lists for Makefile, SConscript, and Android.mk
-
-# libglcpp
-
-LIBGLCPP_FILES = \
-	glcpp/glcpp.h \
-	glcpp/pp.c
-
-LIBGLCPP_GENERATED_FILES = \
-	glcpp/glcpp-lex.c \
-	glcpp/glcpp-parse.c
-
-NIR_GENERATED_FILES = \
-	nir/nir_builder_opcodes.h \
-	nir/nir_constant_expressions.c \
-	nir/nir_opcodes.c \
-	nir/nir_opcodes.h \
-	nir/nir_opt_algebraic.c
-
-NIR_FILES = \
-	nir/nir.c \
-	nir/nir.h \
-	nir/nir_array.h \
-	nir/nir_builder.h \
-	nir/nir_clone.c \
-	nir/nir_constant_expressions.h \
-	nir/nir_control_flow.c \
-	nir/nir_control_flow.h \
-	nir/nir_control_flow_private.h \
-	nir/nir_dominance.c \
-	nir/nir_from_ssa.c \
-	nir/nir_gs_count_vertices.c \
-	nir/nir_intrinsics.c \
-	nir/nir_intrinsics.h \
-	nir/nir_instr_set.c \
-	nir/nir_instr_set.h \
-	nir/nir_liveness.c \
-	nir/nir_lower_alu_to_scalar.c \
-	nir/nir_lower_atomics.c \
-	nir/nir_lower_clip.c \
-	nir/nir_lower_global_vars_to_local.c \
-	nir/nir_lower_gs_intrinsics.c \
-	nir/nir_lower_load_const_to_scalar.c \
-	nir/nir_lower_locals_to_regs.c \
-	nir/nir_lower_idiv.c \
-	nir/nir_lower_io.c \
-	nir/nir_lower_outputs_to_temporaries.c \
-	nir/nir_lower_phis_to_scalar.c \
-	nir/nir_lower_samplers.c \
-	nir/nir_lower_system_values.c \
-	nir/nir_lower_tex.c \
-	nir/nir_lower_to_source_mods.c \
-	nir/nir_lower_two_sided_color.c \
-	nir/nir_lower_vars_to_ssa.c \
-	nir/nir_lower_var_copies.c \
-	nir/nir_lower_vec_to_movs.c \
-	nir/nir_metadata.c \
-	nir/nir_move_vec_src_uses_to_dest.c \
-	nir/nir_normalize_cubemap_coords.c \
-	nir/nir_opt_constant_folding.c \
-	nir/nir_opt_copy_propagate.c \
-	nir/nir_opt_cse.c \
-	nir/nir_opt_dce.c \
-	nir/nir_opt_dead_cf.c \
-	nir/nir_opt_gcm.c \
-	nir/nir_opt_global_to_local.c \
-	nir/nir_opt_peephole_select.c \
-	nir/nir_opt_remove_phis.c \
-	nir/nir_opt_undef.c \
-	nir/nir_print.c \
-	nir/nir_remove_dead_variables.c \
-	nir/nir_search.c \
-	nir/nir_search.h \
-	nir/nir_split_var_copies.c \
-	nir/nir_sweep.c \
-	nir/nir_to_ssa.c \
-	nir/nir_validate.c \
-	nir/nir_vla.h \
-	nir/nir_worklist.c \
-	nir/nir_worklist.h
-
-# libglsl
-
-LIBGLSL_FILES = \
-	ast.h \
-	ast_array_index.cpp \
-	ast_expr.cpp \
-	ast_function.cpp \
-	ast_to_hir.cpp \
-	ast_type.cpp \
-	blob.c \
-	blob.h \
-	builtin_functions.cpp \
-	builtin_types.cpp \
-	builtin_variables.cpp \
-	glsl_parser_extras.cpp \
-	glsl_parser_extras.h \
-	glsl_symbol_table.cpp \
-	glsl_symbol_table.h \
-	hir_field_selection.cpp \
-	ir_basic_block.cpp \
-	ir_basic_block.h \
-	ir_builder.cpp \
-	ir_builder.h \
-	ir_clone.cpp \
-	ir_constant_expression.cpp \
-	ir.cpp \
-	ir.h \
-	ir_equals.cpp \
-	ir_expression_flattening.cpp \
-	ir_expression_flattening.h \
-	ir_function_can_inline.cpp \
-	ir_function_detect_recursion.cpp \
-	ir_function_inlining.h \
-	ir_function.cpp \
-	ir_hierarchical_visitor.cpp \
-	ir_hierarchical_visitor.h \
-	ir_hv_accept.cpp \
-	ir_import_prototypes.cpp \
-	ir_optimization.h \
-	ir_print_visitor.cpp \
-	ir_print_visitor.h \
-	ir_reader.cpp \
-	ir_reader.h \
-	ir_rvalue_visitor.cpp \
-	ir_rvalue_visitor.h \
-	ir_set_program_inouts.cpp \
-	ir_uniform.h \
-	ir_validate.cpp \
-	ir_variable_refcount.cpp \
-	ir_variable_refcount.h \
-	ir_visitor.h \
-	linker.cpp \
-	linker.h \
-	link_atomics.cpp \
-	link_functions.cpp \
-	link_interface_blocks.cpp \
-	link_uniforms.cpp \
-	link_uniform_initializers.cpp \
-	link_uniform_block_active_visitor.cpp \
-	link_uniform_block_active_visitor.h \
-	link_uniform_blocks.cpp \
-	link_varyings.cpp \
-	link_varyings.h \
-	list.h \
-	loop_analysis.cpp \
-	loop_analysis.h \
-	loop_controls.cpp \
-	loop_unroll.cpp \
-	lower_buffer_access.cpp \
-	lower_buffer_access.h \
-	lower_clip_distance.cpp \
-	lower_const_arrays_to_uniforms.cpp \
-	lower_discard.cpp \
-	lower_discard_flow.cpp \
-	lower_if_to_cond_assign.cpp \
-	lower_instructions.cpp \
-	lower_jumps.cpp \
-	lower_mat_op_to_vec.cpp \
-	lower_noise.cpp \
-	lower_offset_array.cpp \
-	lower_packed_varyings.cpp \
-	lower_named_interface_blocks.cpp \
-	lower_packing_builtins.cpp \
-	lower_subroutine.cpp \
-	lower_tess_level.cpp \
-	lower_texture_projection.cpp \
-	lower_variable_index_to_cond_assign.cpp \
-	lower_vec_index_to_cond_assign.cpp \
-	lower_vec_index_to_swizzle.cpp \
-	lower_vector.cpp \
-	lower_vector_derefs.cpp \
-	lower_vector_insert.cpp \
-	lower_vertex_id.cpp \
-	lower_output_reads.cpp \
-	lower_shared_reference.cpp \
-	lower_ubo_reference.cpp \
-	opt_algebraic.cpp \
-	opt_array_splitting.cpp \
-	opt_conditional_discard.cpp \
-	opt_constant_folding.cpp \
-	opt_constant_propagation.cpp \
-	opt_constant_variable.cpp \
-	opt_copy_propagation.cpp \
-	opt_copy_propagation_elements.cpp \
-	opt_dead_builtin_variables.cpp \
-	opt_dead_builtin_varyings.cpp \
-	opt_dead_code.cpp \
-	opt_dead_code_local.cpp \
-	opt_dead_functions.cpp \
-	opt_flatten_nested_if_blocks.cpp \
-	opt_flip_matrices.cpp \
-	opt_function_inlining.cpp \
-	opt_if_simplification.cpp \
-	opt_minmax.cpp \
-	opt_noop_swizzle.cpp \
-	opt_rebalance_tree.cpp \
-	opt_redundant_jumps.cpp \
-	opt_structure_splitting.cpp \
-	opt_swizzle_swizzle.cpp \
-	opt_tree_grafting.cpp \
-	opt_vectorize.cpp \
-	program.h \
-	propagate_invariance.cpp \
-	s_expression.cpp \
-	s_expression.h
-
-# glsl to nir pass
-GLSL_TO_NIR_FILES = \
-	nir/glsl_to_nir.cpp \
-	nir/glsl_to_nir.h
-
-# glsl_compiler
-
-GLSL_COMPILER_CXX_FILES = \
-	standalone_scaffolding.cpp \
-	standalone_scaffolding.h \
-	main.cpp
-
-# libglsl generated sources
-LIBGLSL_GENERATED_CXX_FILES = \
-	glsl_lexer.cpp \
-	glsl_parser.cpp
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 7436edce88a..92aa39e64b8 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -736,6 +736,11 @@ struct ast_type_qualifier {
                            const ast_type_qualifier &q,
                            ast_node* &node, bool create_node);
 
+   bool validate_flags(YYLTYPE *loc,
+                       _mesa_glsl_parse_state *state,
+                       const char *message,
+                       const ast_type_qualifier &allowed_flags);
+
    ast_subroutine_list *subroutine_list;
 };
 
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 7c9be8171b6..82eb22a82c6 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -4300,6 +4300,17 @@ ast_declarator_list::hir(exec_list *instructions,
             state->atomic_counter_offsets[qual_binding] = qual_offset;
          }
       }
+
+      ast_type_qualifier allowed_atomic_qual_mask;
+      allowed_atomic_qual_mask.flags.i = 0;
+      allowed_atomic_qual_mask.flags.q.explicit_binding = 1;
+      allowed_atomic_qual_mask.flags.q.explicit_offset = 1;
+      allowed_atomic_qual_mask.flags.q.uniform = 1;
+
+      type->qualifier.validate_flags(&loc, state,
+                                     "invalid layout qualifier for "
+                                     "atomic_uint",
+                                     allowed_atomic_qual_mask);
    }
 
    if (this->declarations.is_empty()) {
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index c3d38cbbf8a..7a0014b5d7f 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -581,6 +581,91 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc,
    return true;
 }
 
+/**
+ * Check if the current type qualifier has any illegal flags; if so, report
+ * \p message followed by the names of the illegal flags.
+ *
+ * \param message        The error message to print.
+ * \param allowed_flags  Mask of the flags that are valid in this context.
+ * \return true if no illegal flag is set, false if an error was reported.
+ */
+bool
+ast_type_qualifier::validate_flags(YYLTYPE *loc,
+                                   _mesa_glsl_parse_state *state,
+                                   const char *message,
+                                   const ast_type_qualifier &allowed_flags)
+{
+   ast_type_qualifier bad;
+   bad.flags.i = this->flags.i & ~allowed_flags.flags.i;
+   if (bad.flags.i == 0)
+      return true;
+
+   /* One "%s" per flag argument below: 57 flags, in groups of ten. */
+   _mesa_glsl_error(loc, state, "%s:"
+                    "%s%s%s%s%s%s%s%s%s%s" "%s%s%s%s%s%s%s%s%s%s"
+                    "%s%s%s%s%s%s%s%s%s%s" "%s%s%s%s%s%s%s%s%s%s"
+                    "%s%s%s%s%s%s%s%s%s%s" "%s%s%s%s%s%s%s\n",
+                    message,
+                    bad.flags.q.invariant ? " invariant" : "",
+                    bad.flags.q.precise ? " precise" : "",
+                    bad.flags.q.constant ? " constant" : "",
+                    bad.flags.q.attribute ? " attribute" : "",
+                    bad.flags.q.varying ? " varying" : "",
+                    bad.flags.q.in ? " in" : "",
+                    bad.flags.q.out ? " out" : "",
+                    bad.flags.q.centroid ? " centroid" : "",
+                    bad.flags.q.sample ? " sample" : "",
+                    bad.flags.q.patch ? " patch" : "",
+                    bad.flags.q.uniform ? " uniform" : "",
+                    bad.flags.q.buffer ? " buffer" : "",
+                    bad.flags.q.shared_storage ? " shared_storage" : "",
+                    bad.flags.q.smooth ? " smooth" : "",
+                    bad.flags.q.flat ? " flat" : "",
+                    bad.flags.q.noperspective ? " noperspective" : "",
+                    bad.flags.q.origin_upper_left ? " origin_upper_left" : "",
+                    bad.flags.q.pixel_center_integer ? " pixel_center_integer" : "",
+                    bad.flags.q.explicit_align ? " align" : "",
+                    bad.flags.q.explicit_location ? " location" : "",
+                    bad.flags.q.explicit_index ? " index" : "",
+                    bad.flags.q.explicit_binding ? " binding" : "",
+                    bad.flags.q.explicit_offset ? " offset" : "",
+                    bad.flags.q.depth_any ? " depth_any" : "",
+                    bad.flags.q.depth_greater ? " depth_greater" : "",
+                    bad.flags.q.depth_less ? " depth_less" : "",
+                    bad.flags.q.depth_unchanged ? " depth_unchanged" : "",
+                    bad.flags.q.std140 ? " std140" : "",
+                    bad.flags.q.std430 ? " std430" : "",
+                    bad.flags.q.shared ? " shared" : "",
+                    bad.flags.q.packed ? " packed" : "",
+                    bad.flags.q.column_major ? " column_major" : "",
+                    bad.flags.q.row_major ? " row_major" : "",
+                    bad.flags.q.prim_type ? " prim_type" : "",
+                    bad.flags.q.max_vertices ? " max_vertices" : "",
+                    bad.flags.q.local_size ? " local_size" : "",
+                    bad.flags.q.early_fragment_tests ? " early_fragment_tests" : "",
+                    bad.flags.q.explicit_image_format ? " image_format" : "",
+                    bad.flags.q.coherent ? " coherent" : "",
+                    bad.flags.q._volatile ? " _volatile" : "",
+                    bad.flags.q.restrict_flag ? " restrict_flag" : "",
+                    bad.flags.q.read_only ? " read_only" : "",
+                    bad.flags.q.write_only ? " write_only" : "",
+                    bad.flags.q.invocations ? " invocations" : "",
+                    bad.flags.q.stream ? " stream" : "",
+                    bad.flags.q.explicit_stream ? " stream" : "",
+                    bad.flags.q.explicit_xfb_offset ? " xfb_offset" : "",
+                    bad.flags.q.xfb_buffer ? " xfb_buffer" : "",
+                    bad.flags.q.explicit_xfb_buffer ? " xfb_buffer" : "",
+                    bad.flags.q.xfb_stride ? " xfb_stride" : "",
+                    bad.flags.q.explicit_xfb_stride ? " xfb_stride" : "",
+                    bad.flags.q.vertex_spacing ? " vertex_spacing" : "",
+                    bad.flags.q.ordering ? " ordering" : "",
+                    bad.flags.q.point_mode ? " point_mode" : "",
+                    bad.flags.q.vertices ? " vertices" : "",
+                    bad.flags.q.subroutine ? " subroutine" : "",
+                    bad.flags.q.subroutine_def ? " subroutine_def" : "");
+   return false;
+}
+
 bool
 ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state *state,
                                                   const char *qual_indentifier,
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 0b7695f8d3e..6b1ef1717e5 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -304,7 +304,7 @@ in		return IN_TOK;
 out		return OUT_TOK;
 inout		return INOUT_TOK;
 uniform		return UNIFORM;
-buffer		return BUFFER;
+buffer		KEYWORD_WITH_ALT(0, 0, 430, 310, yyextra->ARB_shader_storage_buffer_object_enable, BUFFER);
 varying		DEPRECATED_ES_KEYWORD(VARYING);
 centroid	KEYWORD(120, 300, 120, 300, CENTROID);
 invariant	KEYWORD(120, 100, 120, 100, INVARIANT);
diff --git a/src/compiler/glsl/ir_set_program_inouts.cpp b/src/compiler/glsl/ir_set_program_inouts.cpp
index df06923b870..6768d82f338 100644
--- a/src/compiler/glsl/ir_set_program_inouts.cpp
+++ b/src/compiler/glsl/ir_set_program_inouts.cpp
@@ -149,7 +149,7 @@ void
 ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
 {
    const glsl_type *type = var->type;
-   bool vertex_input = false;
+   bool is_vertex_input = false;
    if (this->shader_stage == MESA_SHADER_GEOMETRY &&
        var->data.mode == ir_var_shader_in && type->is_array()) {
       type = type->fields.array;
@@ -175,9 +175,9 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var)
 
    if (this->shader_stage == MESA_SHADER_VERTEX &&
        var->data.mode == ir_var_shader_in)
-      vertex_input = true;
+      is_vertex_input = true;
 
-   mark(this->prog, var, 0, type->count_attribute_slots(vertex_input),
+   mark(this->prog, var, 0, type->count_attribute_slots(is_vertex_input),
         this->shader_stage);
 }
 
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index e9d0067459a..87606be9337 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -488,7 +488,7 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
        * its value is used by other shader stages. This will cause the
        * variable to have a location assigned.
        */
-      if (var->data.is_unmatched_generic_inout) {
+      if (var->data.is_unmatched_generic_inout && !var->data.is_xfb_only) {
          assert(var->data.mode != ir_var_temporary);
          var->data.mode = ir_var_auto;
       }
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 957efe5b55d..dcc8a57b6be 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -2618,7 +2618,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog,
          return false;
       }
 
-      const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX ? true : false);
+      const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX);
 
       /* If the variable is not a built-in and has a location statically
        * assigned in the shader (presumably via a layout qualifier), make sure
@@ -3249,12 +3249,12 @@ reserve_subroutine_explicit_locations(struct gl_shader_program *prog,
  * any optimizations happen to handle also inactive uniforms and
  * inactive array elements that may get trimmed away.
  */
-static int
+static unsigned
 check_explicit_uniform_locations(struct gl_context *ctx,
                                  struct gl_shader_program *prog)
 {
    if (!ctx->Extensions.ARB_explicit_uniform_location)
-      return -1;
+      return 0;
 
    /* This map is used to detect if overlapping explicit locations
     * occur with the same uniform (from different stage) or a different one.
@@ -3263,7 +3263,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
 
    if (!uniform_map) {
       linker_error(prog, "Out of memory during linking.\n");
-      return -1;
+      return 0;
    }
 
    unsigned entries_total = 0;
@@ -3292,7 +3292,7 @@ check_explicit_uniform_locations(struct gl_context *ctx,
             }
             if (!ret) {
                delete uniform_map;
-               return -1;
+               return 0;
             }
          }
       }
@@ -3518,8 +3518,9 @@ build_stageref(struct gl_shader_program *shProg, const char *name,
  */
 static gl_shader_variable *
 create_shader_variable(struct gl_shader_program *shProg,
-                       const ir_variable *in, bool use_implicit_location,
-                       int location_bias)
+                       const ir_variable *in,
+                       const char *name, const glsl_type *type,
+                       bool use_implicit_location, int location)
 {
    gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable);
    if (!out)
@@ -3532,7 +3533,7 @@ create_shader_variable(struct gl_shader_program *shProg,
        in->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
       out->name = ralloc_strdup(shProg, "gl_VertexID");
    } else {
-      out->name = ralloc_strdup(shProg, in->name);
+      out->name = ralloc_strdup(shProg, name);
    }
 
    if (!out->name)
@@ -3557,10 +3558,10 @@ create_shader_variable(struct gl_shader_program *shProg,
        !(in->data.explicit_location || use_implicit_location)) {
       out->location = -1;
    } else {
-      out->location = in->data.location - location_bias;
+      out->location = location;
    }
 
-   out->type = in->type;
+   out->type = type;
    out->index = in->data.index;
    out->patch = in->data.patch;
    out->mode = in->data.mode;
@@ -3569,6 +3570,60 @@ create_shader_variable(struct gl_shader_program *shProg,
 }
 
 static bool
+add_shader_variable(struct gl_shader_program *shProg, unsigned stage_mask,
+                    GLenum programInterface, ir_variable *var,
+                    const char *name, const glsl_type *type,
+                    bool use_implicit_location, int location)
+{
+   /* stage_mask is a bitmask of (1 << stage), not a gl_shader_stage value. */
+   const bool is_vertex_input = programInterface == GL_PROGRAM_INPUT &&
+                                stage_mask == (1 << MESA_SHADER_VERTEX);
+
+   switch (type->base_type) {
+   case GLSL_TYPE_STRUCT: {
+      /* From the ARB_program_interface_query specification:
+       *
+       *  "For an active variable declared as a structure, a separate entry
+       *   will be generated for each active structure member.  The name of
+       *   each entry is formed by concatenating the name of the structure,
+       *   the "."  character, and the name of the structure member.  If a
+       *   structure member to enumerate is itself a structure or array, these
+       *   enumeration rules are applied recursively."
+       */
+      unsigned field_location = location;
+      for (unsigned i = 0; i < type->length; i++) {
+         const struct glsl_struct_field *field = &type->fields.structure[i];
+         char *field_name = ralloc_asprintf(shProg, "%s.%s", name, field->name);
+         if (!add_shader_variable(shProg, stage_mask, programInterface,
+                                  var, field_name, field->type,
+                                  use_implicit_location, field_location))
+            return false;
+
+         field_location +=
+            field->type->count_attribute_slots(is_vertex_input);
+      }
+      return true;
+   }
+
+   default: {
+      /* From the ARB_program_interface_query specification:
+       *
+       *  "For an active variable declared as a single instance of a basic
+       *   type, a single entry will be generated, using the variable name
+       *   from the shader source."
+       */
+      gl_shader_variable *sha_v =
+         create_shader_variable(shProg, var, name, type,
+                                use_implicit_location, location);
+      if (!sha_v)
+         return false;
+
+      return add_program_resource(shProg, programInterface, sha_v, stage_mask);
+   }
+   }
+}
+
+static bool
 add_interface_variables(struct gl_shader_program *shProg,
                         unsigned stage, GLenum programInterface)
 {
@@ -3616,12 +3671,9 @@ add_interface_variables(struct gl_shader_program *shProg,
          (stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
          (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out);
 
-      gl_shader_variable *sha_v =
-         create_shader_variable(shProg, var, vs_input_or_fs_output, loc_bias);
-      if (!sha_v)
-         return false;
-
-      if (!add_program_resource(shProg, programInterface, sha_v, 1 << stage))
+      if (!add_shader_variable(shProg, 1 << stage, programInterface,
+                               var, var->name, var->type, vs_input_or_fs_output,
+                               var->data.location - loc_bias))
          return false;
    }
    return true;
@@ -3651,13 +3703,11 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage, GLenum type)
          }
 
          if (type == iface) {
-            gl_shader_variable *sha_v =
-               create_shader_variable(shProg, var, false, VARYING_SLOT_VAR0);
-            if (!sha_v)
-               return false;
-            if (!add_program_resource(shProg, iface, sha_v,
-                                      build_stageref(shProg, sha_v->name,
-                                                     sha_v->mode)))
+            const int stage_mask =
+               build_stageref(shProg, var->name, var->data.mode);
+            if (!add_shader_variable(shProg, stage_mask,
+                                     iface, var, var->name, var->type, false,
+                                     var->data.location - VARYING_SLOT_VAR0))
                return false;
          }
       }
@@ -3677,12 +3727,11 @@ add_fragdata_arrays(struct gl_shader_program *shProg)
       ir_variable *var = node->as_variable();
       if (var) {
          assert(var->data.mode == ir_var_shader_out);
-         gl_shader_variable *sha_v =
-            create_shader_variable(shProg, var, true, FRAG_RESULT_DATA0);
-         if (!sha_v)
-            return false;
-         if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v,
-                                   1 << MESA_SHADER_FRAGMENT))
+
+         if (!add_shader_variable(shProg,
+                                  1 << MESA_SHADER_FRAGMENT,
+                                  GL_PROGRAM_OUTPUT, var, var->name, var->type,
+                                  true, var->data.location - FRAG_RESULT_DATA0))
             return false;
       }
    }
diff --git a/src/compiler/glsl/opt_constant_propagation.cpp b/src/compiler/glsl/opt_constant_propagation.cpp
index 416ba16a3c5..4764d16de6d 100644
--- a/src/compiler/glsl/opt_constant_propagation.cpp
+++ b/src/compiler/glsl/opt_constant_propagation.cpp
@@ -122,7 +122,7 @@ public:
    exec_list *acp;
 
    /**
-    * List of kill_entry: The masks of variables whose values were
+    * Hash table of kill_entry: The masks of variables whose values were
     * killed in this block.
     */
    hash_table *kills;
@@ -454,7 +454,7 @@ ir_constant_propagation_visitor::kill(ir_variable *var, unsigned write_mask)
       }
    }
 
-   /* Add this writemask of the variable to the list of killed
+   /* Add this writemask of the variable to the hash table of killed
     * variables in this block.
     */
    hash_entry *kill_hash_entry = _mesa_hash_table_search(this->kills, var);
@@ -463,7 +463,7 @@ ir_constant_propagation_visitor::kill(ir_variable *var, unsigned write_mask)
       entry->write_mask |= write_mask;
       return;
    }
-   /* Not already in the list.  Make new entry. */
+   /* Not already in the hash table.  Make new entry. */
    _mesa_hash_table_insert(this->kills, var,
                            new(this->mem_ctx) kill_entry(var, write_mask));
 }
diff --git a/src/compiler/glsl/opt_copy_propagation.cpp b/src/compiler/glsl/opt_copy_propagation.cpp
index 310708db868..ae62921a0df 100644
--- a/src/compiler/glsl/opt_copy_propagation.cpp
+++ b/src/compiler/glsl/opt_copy_propagation.cpp
@@ -331,7 +331,8 @@ ir_copy_propagation_visitor::add_copy(ir_assignment *ir)
 	 ir->condition = new(ralloc_parent(ir)) ir_constant(false);
 	 this->progress = true;
       } else if (lhs_var->data.mode != ir_var_shader_storage &&
-                 lhs_var->data.mode != ir_var_shader_shared) {
+                 lhs_var->data.mode != ir_var_shader_shared &&
+                 lhs_var->data.precise == rhs_var->data.precise) {
 	 entry = new(this->acp) acp_entry(lhs_var, rhs_var);
 	 this->acp->push_tail(entry);
       }
diff --git a/src/compiler/glsl/opt_copy_propagation_elements.cpp b/src/compiler/glsl/opt_copy_propagation_elements.cpp
index a6791801943..e9e7c53505c 100644
--- a/src/compiler/glsl/opt_copy_propagation_elements.cpp
+++ b/src/compiler/glsl/opt_copy_propagation_elements.cpp
@@ -493,6 +493,9 @@ ir_copy_propagation_elements_visitor::add_copy(ir_assignment *ir)
       }
    }
 
+   if (lhs->var->data.precise != rhs->var->data.precise)
+      return;
+
    entry = new(this->mem_ctx) acp_entry(lhs->var, rhs->var, write_mask,
 					swizzle);
    this->acp->push_tail(entry);
diff --git a/src/compiler/glsl/opt_tree_grafting.cpp b/src/compiler/glsl/opt_tree_grafting.cpp
index 812f996fb81..a40e5f71609 100644
--- a/src/compiler/glsl/opt_tree_grafting.cpp
+++ b/src/compiler/glsl/opt_tree_grafting.cpp
@@ -368,6 +368,9 @@ tree_grafting_basic_block(ir_instruction *bb_first,
           lhs_var->data.mode == ir_var_shader_shared)
          continue;
 
+      if (lhs_var->data.precise)
+         continue;
+
       ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var);
 
       if (!entry->declaration ||
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 39585bff3b9..c058283c48d 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -1897,7 +1897,7 @@ glsl_type::std430_size(bool row_major) const
 }
 
 unsigned
-glsl_type::count_attribute_slots(bool vertex_input_slots) const
+glsl_type::count_attribute_slots(bool is_vertex_input) const
 {
    /* From page 31 (page 37 of the PDF) of the GLSL 1.50 spec:
     *
@@ -1931,7 +1931,7 @@ glsl_type::count_attribute_slots(bool vertex_input_slots) const
    case GLSL_TYPE_BOOL:
       return this->matrix_columns;
    case GLSL_TYPE_DOUBLE:
-      if (this->vector_elements > 2 && !vertex_input_slots)
+      if (this->vector_elements > 2 && !is_vertex_input)
          return this->matrix_columns * 2;
       else
          return this->matrix_columns;
@@ -1940,13 +1940,13 @@ glsl_type::count_attribute_slots(bool vertex_input_slots) const
       unsigned size = 0;
 
       for (unsigned i = 0; i < this->length; i++)
-         size += this->fields.structure[i].type->count_attribute_slots(vertex_input_slots);
+         size += this->fields.structure[i].type->count_attribute_slots(is_vertex_input);
 
       return size;
    }
 
    case GLSL_TYPE_ARRAY:
-      return this->length * this->fields.array->count_attribute_slots(vertex_input_slots);
+      return this->length * this->fields.array->count_attribute_slots(is_vertex_input);
 
    case GLSL_TYPE_FUNCTION:
    case GLSL_TYPE_SAMPLER:
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index dd46479755a..a47b0ffe5a2 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -344,7 +344,7 @@ struct glsl_type {
     * For vertex shader attributes - doubles only take one slot.
     * For inter-shader varyings - dvec3/dvec4 take two slots.
     */
-   unsigned count_attribute_slots(bool vertex_input_slots) const;
+   unsigned count_attribute_slots(bool is_vertex_input) const;
 
    /**
     * Alignment in bytes of the start of this type in a std140 uniform
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
deleted file mode 100644
index e6367d9c282..00000000000
--- a/src/compiler/nir/Makefile.sources
+++ /dev/null
@@ -1,87 +0,0 @@
-NIR_GENERATED_FILES = \
-	nir_builder_opcodes.h \
-	nir_constant_expressions.c \
-	nir_opcodes.c \
-	nir_opcodes.h \
-	nir_opt_algebraic.c
-
-NIR_FILES = \
-	glsl_to_nir.cpp \
-	glsl_to_nir.h \
-	nir.c \
-	nir.h \
-	nir_array.h \
-	nir_builder.h \
-	nir_clone.c \
-	nir_constant_expressions.h \
-	nir_control_flow.c \
-	nir_control_flow.h \
-	nir_control_flow_private.h \
-	nir_dominance.c \
-	nir_from_ssa.c \
-	nir_gather_info.c \
-	nir_gs_count_vertices.c \
-	nir_inline_functions.c \
-	nir_instr_set.c \
-	nir_instr_set.h \
-	nir_intrinsics.c \
-	nir_intrinsics.h \
-	nir_liveness.c \
-	nir_lower_alu_to_scalar.c \
-	nir_lower_atomics.c \
-	nir_lower_clip.c \
-	nir_lower_global_vars_to_local.c \
-	nir_lower_gs_intrinsics.c \
-	nir_lower_load_const_to_scalar.c \
-	nir_lower_locals_to_regs.c \
-	nir_lower_idiv.c \
-	nir_lower_indirect_derefs.c \
-	nir_lower_io.c \
-	nir_lower_outputs_to_temporaries.c \
-	nir_lower_phis_to_scalar.c \
-	nir_lower_returns.c \
-	nir_lower_samplers.c \
-	nir_lower_system_values.c \
-	nir_lower_tex.c \
-	nir_lower_to_source_mods.c \
-	nir_lower_two_sided_color.c \
-	nir_lower_vars_to_ssa.c \
-	nir_lower_var_copies.c \
-	nir_lower_vec_to_movs.c \
-	nir_metadata.c \
-	nir_move_vec_src_uses_to_dest.c \
-	nir_normalize_cubemap_coords.c \
-	nir_opt_constant_folding.c \
-	nir_opt_copy_propagate.c \
-	nir_opt_cse.c \
-	nir_opt_dce.c \
-	nir_opt_dead_cf.c \
-	nir_opt_gcm.c \
-	nir_opt_global_to_local.c \
-	nir_opt_peephole_select.c \
-	nir_opt_remove_phis.c \
-	nir_opt_undef.c \
-	nir_phi_builder.c \
-	nir_phi_builder.h \
-	nir_print.c \
-	nir_remove_dead_variables.c \
-	nir_repair_ssa.c \
-	nir_search.c \
-	nir_search.h \
-	nir_split_var_copies.c \
-	nir_sweep.c \
-	nir_to_ssa.c \
-	nir_validate.c \
-	nir_vla.h \
-	nir_worklist.c \
-	nir_worklist.h
-
-SPIRV_FILES = \
-	spirv/nir_spirv.h \
-	spirv/spirv_to_nir.c \
-	spirv/vtn_alu.c \
-	spirv/vtn_cfg.c \
-	spirv/vtn_glsl450.c \
-	spirv/vtn_private.h \
-	spirv/vtn_variables.c
-
diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index 14affeee8ac..d4c58a9ba2e 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -73,7 +73,7 @@ public:
    void create_function(ir_function_signature *ir);
 
 private:
-   void add_instr(nir_instr *instr, unsigned num_components);
+   void add_instr(nir_instr *instr, unsigned num_components, unsigned bit_size);
    nir_ssa_def *evaluate_rvalue(ir_rvalue *ir);
 
    nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def **srcs);
@@ -257,6 +257,11 @@ constant_copy(ir_constant *ir, void *mem_ctx)
          ret->value.f[i] = ir->value.f[i];
       break;
 
+   case GLSL_TYPE_DOUBLE:
+      for (i = 0; i < total_elems; i++)
+         ret->value.d[i] = ir->value.d[i];
+      break;
+
    case GLSL_TYPE_BOOL:
       for (i = 0; i < total_elems; i++)
          ret->value.b[i] = ir->value.b[i];
@@ -736,7 +741,7 @@ nir_visitor::visit(ir_call *ir)
       case nir_intrinsic_image_samples:
       case nir_intrinsic_image_size: {
          nir_ssa_undef_instr *instr_undef =
-            nir_ssa_undef_instr_create(shader, 1);
+            nir_ssa_undef_instr_create(shader, 1, 32);
          nir_builder_instr_insert(&b, &instr_undef->instr);
 
          /* Set the image variable dereference. */
@@ -854,8 +859,9 @@ nir_visitor::visit(ir_call *ir)
          instr->num_components = type->vector_elements;
 
          /* Setup destination register */
+         unsigned bit_size = glsl_get_bit_size(type->base_type);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, 32, NULL);
+                           type->vector_elements, bit_size, NULL);
 
          /* Insert the created nir instruction now since in the case of boolean
           * result we will need to emit another instruction after it
@@ -878,7 +884,7 @@ nir_visitor::visit(ir_call *ir)
                load_ssbo_compare->src[1].swizzle[i] = 0;
             nir_ssa_dest_init(&load_ssbo_compare->instr,
                               &load_ssbo_compare->dest.dest,
-                              type->vector_elements, 32, NULL);
+                              type->vector_elements, bit_size, NULL);
             load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1;
             nir_builder_instr_insert(&b, &load_ssbo_compare->instr);
             dest = &load_ssbo_compare->dest.dest;
@@ -1152,12 +1158,13 @@ get_instr_dest(nir_instr *instr)
 }
 
 void
-nir_visitor::add_instr(nir_instr *instr, unsigned num_components)
+nir_visitor::add_instr(nir_instr *instr, unsigned num_components,
+                       unsigned bit_size)
 {
    nir_dest *dest = get_instr_dest(instr);
 
    if (dest)
-      nir_ssa_dest_init(instr, dest, num_components, 32, NULL);
+      nir_ssa_dest_init(instr, dest, num_components, bit_size, NULL);
 
    nir_builder_instr_insert(&b, instr);
 
@@ -1182,12 +1189,19 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir)
       load_instr->num_components = ir->type->vector_elements;
       load_instr->variables[0] = this->deref_head;
       ralloc_steal(load_instr, load_instr->variables[0]);
-      add_instr(&load_instr->instr, ir->type->vector_elements);
+      unsigned bit_size = glsl_get_bit_size(ir->type->base_type);
+      add_instr(&load_instr->instr, ir->type->vector_elements, bit_size);
    }
 
    return this->result;
 }
 
+static bool
+type_is_float(glsl_base_type type)
+{
+   return type == GLSL_TYPE_FLOAT || type == GLSL_TYPE_DOUBLE;
+}
+
 void
 nir_visitor::visit(ir_expression *ir)
 {
@@ -1196,11 +1210,11 @@ nir_visitor::visit(ir_expression *ir)
    case ir_binop_ubo_load: {
       nir_intrinsic_instr *load =
          nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo);
+      unsigned bit_size = glsl_get_bit_size(ir->type->base_type);
       load->num_components = ir->type->vector_elements;
-      load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type);
       load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
       load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
-      add_instr(&load->instr, ir->type->vector_elements);
+      add_instr(&load->instr, ir->type->vector_elements, bit_size);
 
       /*
        * In UBO's, a true boolean value is any non-zero value, but we consider
@@ -1265,7 +1279,8 @@ nir_visitor::visit(ir_expression *ir)
           intrin->intrinsic == nir_intrinsic_interp_var_at_sample)
          intrin->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
 
-      add_instr(&intrin->instr, deref->type->vector_elements);
+      unsigned bit_size =  glsl_get_bit_size(deref->type->base_type);
+      add_instr(&intrin->instr, deref->type->vector_elements, bit_size);
 
       if (swizzle) {
          unsigned swiz[4] = {
@@ -1306,20 +1321,20 @@ nir_visitor::visit(ir_expression *ir)
       result = supports_ints ? nir_inot(&b, srcs[0]) : nir_fnot(&b, srcs[0]);
       break;
    case ir_unop_neg:
-      result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fneg(&b, srcs[0])
-                                             : nir_ineg(&b, srcs[0]);
+      result = type_is_float(types[0]) ? nir_fneg(&b, srcs[0])
+                                       : nir_ineg(&b, srcs[0]);
       break;
    case ir_unop_abs:
-      result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fabs(&b, srcs[0])
-                                             : nir_iabs(&b, srcs[0]);
+      result = type_is_float(types[0]) ? nir_fabs(&b, srcs[0])
+                                       : nir_iabs(&b, srcs[0]);
       break;
    case ir_unop_saturate:
-      assert(types[0] == GLSL_TYPE_FLOAT);
+      assert(type_is_float(types[0]));
       result = nir_fsat(&b, srcs[0]);
       break;
    case ir_unop_sign:
-      result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fsign(&b, srcs[0])
-                                             : nir_isign(&b, srcs[0]);
+      result = type_is_float(types[0]) ? nir_fsign(&b, srcs[0])
+                                       : nir_isign(&b, srcs[0]);
       break;
    case ir_unop_rcp:  result = nir_frcp(&b, srcs[0]);  break;
    case ir_unop_rsq:  result = nir_frsq(&b, srcs[0]);  break;
@@ -1342,6 +1357,19 @@ nir_visitor::visit(ir_expression *ir)
    case ir_unop_f2b:  result = nir_f2b(&b, srcs[0]);   break;
    case ir_unop_i2b:  result = nir_i2b(&b, srcs[0]);   break;
    case ir_unop_b2i:  result = nir_b2i(&b, srcs[0]);   break;
+   case ir_unop_d2f:  result = nir_d2f(&b, srcs[0]);   break;
+   case ir_unop_f2d:  result = nir_f2d(&b, srcs[0]);   break;
+   case ir_unop_d2i:  result = nir_d2i(&b, srcs[0]);   break;
+   case ir_unop_d2u:  result = nir_d2u(&b, srcs[0]);   break;
+   case ir_unop_d2b:  result = nir_d2b(&b, srcs[0]);   break;
+   case ir_unop_i2d:
+      assert(supports_ints);
+      result = nir_i2d(&b, srcs[0]);
+      break;
+   case ir_unop_u2d:
+      assert(supports_ints);
+      result = nir_u2d(&b, srcs[0]);
+      break;
    case ir_unop_i2u:
    case ir_unop_u2i:
    case ir_unop_bitcast_i2f:
@@ -1395,6 +1423,12 @@ nir_visitor::visit(ir_expression *ir)
    case ir_unop_unpack_half_2x16:
       result = nir_unpack_half_2x16(&b, srcs[0]);
       break;
+   case ir_unop_pack_double_2x32:
+      result = nir_pack_double_2x32(&b, srcs[0]);
+      break;
+   case ir_unop_unpack_double_2x32:
+      result = nir_unpack_double_2x32(&b, srcs[0]);
+      break;
    case ir_unop_bitfield_reverse:
       result = nir_bitfield_reverse(&b, srcs[0]);
       break;
@@ -1465,24 +1499,25 @@ nir_visitor::visit(ir_expression *ir)
          nir_intrinsic_get_buffer_size);
       load->num_components = ir->type->vector_elements;
       load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
-      add_instr(&load->instr, ir->type->vector_elements);
+      unsigned bit_size = glsl_get_bit_size(ir->type->base_type);
+      add_instr(&load->instr, ir->type->vector_elements, bit_size);
       return;
    }
 
    case ir_binop_add:
-      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fadd(&b, srcs[0], srcs[1])
-                                             : nir_iadd(&b, srcs[0], srcs[1]);
+      result = type_is_float(out_type) ? nir_fadd(&b, srcs[0], srcs[1])
+                                       : nir_iadd(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_sub:
-      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fsub(&b, srcs[0], srcs[1])
-                                             : nir_isub(&b, srcs[0], srcs[1]);
+      result = type_is_float(out_type) ? nir_fsub(&b, srcs[0], srcs[1])
+                                       : nir_isub(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_mul:
-      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmul(&b, srcs[0], srcs[1])
-                                             : nir_imul(&b, srcs[0], srcs[1]);
+      result = type_is_float(out_type) ? nir_fmul(&b, srcs[0], srcs[1])
+                                       : nir_imul(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_div:
-      if (out_type == GLSL_TYPE_FLOAT)
+      if (type_is_float(out_type))
          result = nir_fdiv(&b, srcs[0], srcs[1]);
       else if (out_type == GLSL_TYPE_INT)
          result = nir_idiv(&b, srcs[0], srcs[1]);
@@ -1490,11 +1525,11 @@ nir_visitor::visit(ir_expression *ir)
          result = nir_udiv(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_mod:
-      result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmod(&b, srcs[0], srcs[1])
-                                             : nir_umod(&b, srcs[0], srcs[1]);
+      result = type_is_float(out_type) ? nir_fmod(&b, srcs[0], srcs[1])
+                                       : nir_umod(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_min:
-      if (out_type == GLSL_TYPE_FLOAT)
+      if (type_is_float(out_type))
          result = nir_fmin(&b, srcs[0], srcs[1]);
       else if (out_type == GLSL_TYPE_INT)
          result = nir_imin(&b, srcs[0], srcs[1]);
@@ -1502,7 +1537,7 @@ nir_visitor::visit(ir_expression *ir)
          result = nir_umin(&b, srcs[0], srcs[1]);
       break;
    case ir_binop_max:
-      if (out_type == GLSL_TYPE_FLOAT)
+      if (type_is_float(out_type))
          result = nir_fmax(&b, srcs[0], srcs[1]);
       else if (out_type == GLSL_TYPE_INT)
          result = nir_imax(&b, srcs[0], srcs[1]);
@@ -1538,7 +1573,7 @@ nir_visitor::visit(ir_expression *ir)
    case ir_binop_borrow: result = nir_usub_borrow(&b, srcs[0], srcs[1]); break;
    case ir_binop_less:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT)
+         if (type_is_float(types[0]))
             result = nir_flt(&b, srcs[0], srcs[1]);
          else if (types[0] == GLSL_TYPE_INT)
             result = nir_ilt(&b, srcs[0], srcs[1]);
@@ -1550,7 +1585,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_greater:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT)
+         if (type_is_float(types[0]))
             result = nir_flt(&b, srcs[1], srcs[0]);
          else if (types[0] == GLSL_TYPE_INT)
             result = nir_ilt(&b, srcs[1], srcs[0]);
@@ -1562,7 +1597,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_lequal:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT)
+         if (type_is_float(types[0]))
             result = nir_fge(&b, srcs[1], srcs[0]);
          else if (types[0] == GLSL_TYPE_INT)
             result = nir_ige(&b, srcs[1], srcs[0]);
@@ -1574,7 +1609,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_gequal:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT)
+         if (type_is_float(types[0]))
             result = nir_fge(&b, srcs[0], srcs[1]);
          else if (types[0] == GLSL_TYPE_INT)
             result = nir_ige(&b, srcs[0], srcs[1]);
@@ -1586,7 +1621,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_equal:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT)
+         if (type_is_float(types[0]))
             result = nir_feq(&b, srcs[0], srcs[1]);
          else
             result = nir_ieq(&b, srcs[0], srcs[1]);
@@ -1596,7 +1631,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_nequal:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT)
+         if (type_is_float(types[0]))
             result = nir_fne(&b, srcs[0], srcs[1]);
          else
             result = nir_ine(&b, srcs[0], srcs[1]);
@@ -1606,7 +1641,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_all_equal:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT) {
+         if (type_is_float(types[0])) {
             switch (ir->operands[0]->type->vector_elements) {
                case 1: result = nir_feq(&b, srcs[0], srcs[1]); break;
                case 2: result = nir_ball_fequal2(&b, srcs[0], srcs[1]); break;
@@ -1638,7 +1673,7 @@ nir_visitor::visit(ir_expression *ir)
       break;
    case ir_binop_any_nequal:
       if (supports_ints) {
-         if (types[0] == GLSL_TYPE_FLOAT) {
+         if (type_is_float(types[0])) {
             switch (ir->operands[0]->type->vector_elements) {
                case 1: result = nir_fne(&b, srcs[0], srcs[1]); break;
                case 2: result = nir_bany_fnequal2(&b, srcs[0], srcs[1]); break;
@@ -1902,7 +1937,8 @@ nir_visitor::visit(ir_texture *ir)
 
    assert(src_number == num_srcs);
 
-   add_instr(&instr->instr, nir_tex_instr_dest_size(instr));
+   unsigned bit_size = glsl_get_bit_size(ir->type->base_type);
+   add_instr(&instr->instr, nir_tex_instr_dest_size(instr), bit_size);
 }
 
 void
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index b67916dc86b..8d38d3384d8 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -469,12 +469,13 @@ nir_jump_instr_create(nir_shader *shader, nir_jump_type type)
 }
 
 nir_load_const_instr *
-nir_load_const_instr_create(nir_shader *shader, unsigned num_components)
+nir_load_const_instr_create(nir_shader *shader, unsigned num_components,
+                            unsigned bit_size)
 {
    nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr);
    instr_init(&instr->instr, nir_instr_type_load_const);
 
-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, bit_size, NULL);
 
    return instr;
 }
@@ -558,12 +559,14 @@ nir_parallel_copy_instr_create(nir_shader *shader)
 }
 
 nir_ssa_undef_instr *
-nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components)
+nir_ssa_undef_instr_create(nir_shader *shader,
+                           unsigned num_components,
+                           unsigned bit_size)
 {
    nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr);
    instr_init(&instr->instr, nir_instr_type_ssa_undef);
 
-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, bit_size, NULL);
 
    return instr;
 }
@@ -691,8 +694,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
       tail = tail->child;
    }
 
+   unsigned bit_size = glsl_get_bit_size(glsl_get_base_type(tail->type));
    nir_load_const_instr *load =
-      nir_load_const_instr_create(shader, glsl_get_vector_elements(tail->type));
+      nir_load_const_instr_create(shader, glsl_get_vector_elements(tail->type),
+                                  bit_size);
 
    matrix_offset *= load->def.num_components;
    for (unsigned i = 0; i < load->def.num_components; i++) {
@@ -702,6 +707,9 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
       case GLSL_TYPE_UINT:
          load->value.u32[i] = constant->value.u[matrix_offset + i];
          break;
+      case GLSL_TYPE_DOUBLE:
+         load->value.f64[i] = constant->value.d[matrix_offset + i];
+         break;
       case GLSL_TYPE_BOOL:
          load->value.u32[i] = constant->value.b[matrix_offset + i] ?
                              NIR_TRUE : NIR_FALSE;
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 8e45cba5a16..c3a33431239 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -81,16 +81,16 @@ typedef struct {
 } nir_state_slot;
 
 typedef enum {
-   nir_var_all = -1,
-   nir_var_shader_in,
-   nir_var_shader_out,
-   nir_var_global,
-   nir_var_local,
-   nir_var_uniform,
-   nir_var_shader_storage,
-   nir_var_system_value,
-   nir_var_param,
-   nir_var_shared,
+   nir_var_shader_in       = (1 << 0),
+   nir_var_shader_out      = (1 << 1),
+   nir_var_global          = (1 << 2),
+   nir_var_local           = (1 << 3),
+   nir_var_uniform         = (1 << 4),
+   nir_var_shader_storage  = (1 << 5),
+   nir_var_system_value    = (1 << 6),
+   nir_var_param           = (1 << 7),
+   nir_var_shared          = (1 << 8),
+   nir_var_all             = ~0,
 } nir_variable_mode;
 
 /**
@@ -156,6 +156,12 @@ typedef struct nir_variable {
    char *name;
 
    struct nir_variable_data {
+      /**
+       * Storage class of the variable.
+       *
+       * \sa nir_variable_mode
+       */
+      nir_variable_mode mode;
 
       /**
        * Is the variable read-only?
@@ -170,13 +176,6 @@ typedef struct nir_variable {
       unsigned invariant:1;
 
       /**
-       * Storage class of the variable.
-       *
-       * \sa nir_variable_mode
-       */
-      nir_variable_mode mode:5;
-
-      /**
        * Interpolation mode for shader inputs / outputs
        *
        * \sa glsl_interp_qualifier
@@ -1857,7 +1856,8 @@ nir_alu_instr *nir_alu_instr_create(nir_shader *shader, nir_op op);
 nir_jump_instr *nir_jump_instr_create(nir_shader *shader, nir_jump_type type);
 
 nir_load_const_instr *nir_load_const_instr_create(nir_shader *shader,
-                                                  unsigned num_components);
+                                                  unsigned num_components,
+                                                  unsigned bit_size);
 
 nir_intrinsic_instr *nir_intrinsic_instr_create(nir_shader *shader,
                                                 nir_intrinsic_op op);
@@ -1872,7 +1872,8 @@ nir_phi_instr *nir_phi_instr_create(nir_shader *shader);
 nir_parallel_copy_instr *nir_parallel_copy_instr_create(nir_shader *shader);
 
 nir_ssa_undef_instr *nir_ssa_undef_instr_create(nir_shader *shader,
-                                                unsigned num_components);
+                                                unsigned num_components,
+                                                unsigned bit_size);
 
 nir_deref_var *nir_deref_var_create(void *mem_ctx, nir_variable *var);
 nir_deref_array *nir_deref_array_create(void *mem_ctx);
@@ -2208,12 +2209,12 @@ void nir_lower_var_copies(nir_shader *shader);
 
 bool nir_lower_global_vars_to_local(nir_shader *shader);
 
-bool nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask);
+bool nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes);
 
 bool nir_lower_locals_to_regs(nir_shader *shader);
 
 void nir_lower_outputs_to_temporaries(nir_shader *shader,
                                       nir_function *entrypoint);
 
 void nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint);
 
@@ -2222,14 +2223,14 @@ void nir_assign_var_locations(struct exec_list *var_list,
                               int (*type_size)(const struct glsl_type *));
 
 void nir_lower_io(nir_shader *shader,
-                  nir_variable_mode mode,
+                  nir_variable_mode modes,
                   int (*type_size)(const struct glsl_type *));
 nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr);
 nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr);
 
 void nir_lower_vars_to_ssa(nir_shader *shader);
 
-bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode);
+bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode modes);
 
 void nir_move_vec_src_uses_to_dest(nir_shader *shader);
 bool nir_lower_vec_to_movs(nir_shader *shader);
@@ -2305,6 +2306,8 @@ void nir_lower_to_source_mods(nir_shader *shader);
 
 bool nir_lower_gs_intrinsics(nir_shader *shader);
 
+void nir_lower_double_pack(nir_shader *shader);
+
 bool nir_normalize_cubemap_coords(nir_shader *shader);
 
 void nir_live_ssa_defs_impl(nir_function_impl *impl);
diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py
index d05564f779c..53a79073a44 100644
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -291,6 +291,7 @@ ${pass_name}(nir_shader *shader)
    bool progress = false;
    bool condition_flags[${len(condition_list)}];
    const nir_shader_compiler_options *options = shader->options;
+   (void) options;
 
    % for index, condition in enumerate(condition_list):
    condition_flags[${index}] = ${condition};
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 3dc7c25ec28..29b13fb222f 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -78,7 +78,6 @@ static inline nir_ssa_def *
 nir_ssa_undef(nir_builder *build, unsigned num_components, unsigned bit_size)
 {
    nir_ssa_undef_instr *undef =
-      nir_ssa_undef_instr_create(build->shader, num_components);
-   undef->def.bit_size = bit_size;
+      nir_ssa_undef_instr_create(build->shader, num_components, bit_size);
    if (!undef)
       return NULL;
@@ -92,7 +91,7 @@ static inline nir_ssa_def *
 nir_build_imm(nir_builder *build, unsigned num_components, nir_const_value value)
 {
    nir_load_const_instr *load_const =
-      nir_load_const_instr_create(build->shader, num_components);
+      nir_load_const_instr_create(build->shader, num_components, 32);
    if (!load_const)
       return NULL;
 
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 7d2e3835258..e231387c889 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -179,6 +179,7 @@ clone_register(clone_state *state, const nir_register *reg)
    add_remap(state, nreg, reg);
 
    nreg->num_components = reg->num_components;
+   nreg->bit_size = reg->bit_size;
    nreg->num_array_elems = reg->num_array_elems;
    nreg->index = reg->index;
    nreg->name = ralloc_strdup(nreg, reg->name);
@@ -359,7 +360,8 @@ static nir_load_const_instr *
 clone_load_const(clone_state *state, const nir_load_const_instr *lc)
 {
    nir_load_const_instr *nlc =
-      nir_load_const_instr_create(state->ns, lc->def.num_components);
+      nir_load_const_instr_create(state->ns, lc->def.num_components,
+                                  lc->def.bit_size);
 
    memcpy(&nlc->value, &lc->value, sizeof(nlc->value));
 
@@ -372,7 +374,8 @@ static nir_ssa_undef_instr *
 clone_ssa_undef(clone_state *state, const nir_ssa_undef_instr *sa)
 {
    nir_ssa_undef_instr *nsa =
-      nir_ssa_undef_instr_create(state->ns, sa->def.num_components);
+      nir_ssa_undef_instr_create(state->ns, sa->def.num_components,
+                                 sa->def.bit_size);
 
    add_remap(state, &nsa->def, &sa->def);
 
diff --git a/src/compiler/nir/nir_control_flow.c b/src/compiler/nir/nir_control_flow.c
index 33b06d0cc84..ea5741288ce 100644
--- a/src/compiler/nir/nir_control_flow.c
+++ b/src/compiler/nir/nir_control_flow.c
@@ -281,7 +281,8 @@ insert_phi_undef(nir_block *block, nir_block *pred)
       nir_phi_instr *phi = nir_instr_as_phi(instr);
       nir_ssa_undef_instr *undef =
          nir_ssa_undef_instr_create(ralloc_parent(phi),
-                                    phi->dest.ssa.num_components);
+                                    phi->dest.ssa.num_components,
+                                    phi->dest.ssa.bit_size);
       nir_instr_insert_before_cf_list(&impl->body, &undef->instr);
       nir_phi_src *src = ralloc(phi, nir_phi_src);
       src->pred = pred;
@@ -691,7 +692,8 @@ replace_ssa_def_uses(nir_ssa_def *def, void *void_impl)
    void *mem_ctx = ralloc_parent(impl);
 
    nir_ssa_undef_instr *undef =
-      nir_ssa_undef_instr_create(mem_ctx, def->num_components);
+      nir_ssa_undef_instr_create(mem_ctx, def->num_components,
+                                 def->bit_size);
    nir_instr_insert_before_cf_list(&impl->body, &undef->instr);
    nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(&undef->def));
    return true;
diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c
index 82317c21b62..7bbc2c0f299 100644
--- a/src/compiler/nir/nir_from_ssa.c
+++ b/src/compiler/nir/nir_from_ssa.c
@@ -474,6 +474,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
          node->set->reg = nir_local_reg_create(state->impl);
          node->set->reg->name = def->name;
          node->set->reg->num_components = def->num_components;
+         node->set->reg->bit_size = def->bit_size;
          node->set->reg->num_array_elems = 0;
       }
 
@@ -491,6 +492,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state)
       reg = nir_local_reg_create(state->impl);
       reg->name = def->name;
       reg->num_components = def->num_components;
+      reg->bit_size = def->bit_size;
       reg->num_array_elems = 0;
    }
 
diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c
index e244122e466..c6161433516 100644
--- a/src/compiler/nir/nir_instr_set.c
+++ b/src/compiler/nir/nir_instr_set.c
@@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr)
 {
    hash = HASH(hash, instr->op);
    hash = HASH(hash, instr->dest.dest.ssa.num_components);
+   hash = HASH(hash, instr->dest.dest.ssa.bit_size);
    /* We explicitly don't hash instr->dest.dest.exact */
 
    if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
@@ -82,9 +83,8 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
 {
    hash = HASH(hash, instr->def.num_components);
 
-   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32,
-                                          instr->def.num_components
-                                             * sizeof(instr->value.f32[0]));
+   unsigned size = instr->def.num_components * (instr->def.bit_size / 8);
+   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32, size);
 
    return hash;
 }
@@ -126,8 +126,10 @@ hash_intrinsic(uint32_t hash, const nir_intrinsic_instr *instr)
    const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
    hash = HASH(hash, instr->intrinsic);
 
-   if (info->has_dest)
+   if (info->has_dest) {
       hash = HASH(hash, instr->dest.ssa.num_components);
+      hash = HASH(hash, instr->dest.ssa.bit_size);
+   }
 
    assert(info->num_variables == 0);
 
@@ -268,6 +270,9 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
       if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
          return false;
 
+      if (alu1->dest.dest.ssa.bit_size != alu2->dest.dest.ssa.bit_size)
+         return false;
+
       /* We explicitly don't hash instr->dest.dest.exact */
 
       if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
@@ -325,8 +330,11 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
       if (load1->def.num_components != load2->def.num_components)
          return false;
 
+      if (load1->def.bit_size != load2->def.bit_size)
+         return false;
+
       return memcmp(load1->value.f32, load2->value.f32,
-                    load1->def.num_components * sizeof(*load2->value.f32)) == 0;
+                    load1->def.num_components * (load1->def.bit_size / 8)) == 0;
    }
    case nir_instr_type_phi: {
       nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
@@ -362,6 +370,10 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
                             intrinsic2->dest.ssa.num_components)
          return false;
 
+      if (info->has_dest && intrinsic1->dest.ssa.bit_size !=
+                            intrinsic2->dest.ssa.bit_size)
+         return false;
+
       for (unsigned i = 0; i < info->num_srcs; i++) {
          if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i]))
             return false;
diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index e8ba640fe0b..1548abbd558 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -187,6 +187,9 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
       return;
    }
 
+   case nir_op_unpack_double_2x32:
+      return;
+
       LOWER_REDUCTION(nir_op_fdot, nir_op_fmul, nir_op_fadd);
       LOWER_REDUCTION(nir_op_ball_fequal, nir_op_feq, nir_op_iand);
       LOWER_REDUCTION(nir_op_ball_iequal, nir_op_ieq, nir_op_iand);
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index 70381a7968a..b2ea31888f8 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -74,7 +74,8 @@ lower_instr(nir_intrinsic_instr *instr,
    nir_intrinsic_set_base(new_instr,
       state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index);
 
-   nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
+   nir_load_const_instr *offset_const =
+      nir_load_const_instr_create(mem_ctx, 1, 32);
    offset_const->value.u32[0] = instr->variables[0]->var->data.offset;
 
    nir_instr_insert_before(&instr->instr, &offset_const->instr);
@@ -95,7 +96,7 @@ lower_instr(nir_intrinsic_instr *instr,
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
          nir_load_const_instr *atomic_counter_size =
-               nir_load_const_instr_create(mem_ctx, 1);
+            nir_load_const_instr_create(mem_ctx, 1, 32);
          atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
          nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
 
diff --git a/src/compiler/nir/nir_lower_double_packing.c b/src/compiler/nir/nir_lower_double_packing.c
new file mode 100644
index 00000000000..d43683d2007
--- /dev/null
+++ b/src/compiler/nir/nir_lower_double_packing.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/*
+ * Lowers the vector forms of the double pack/unpack opcodes to their split
+ * (per-component) forms:
+ *
+ * pack_double_2x32(foo) ->
+ *    pack_double_2x32_split(foo.x, foo.y)
+ * unpack_double_2x32(foo) ->
+ *    vec2(unpack_double_2x32_split_x(foo), unpack_double_2x32_split_y(foo))
+ */
+
+/* Builds pack_double_2x32_split from channels 0 and 1 of src. */
+static nir_ssa_def *
+lower_pack_double(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_pack_double_2x32_split(b, nir_channel(b, src, 0),
+                                        nir_channel(b, src, 1));
+}
+
+/* Builds a vec2 from the split_x and split_y unpack results of src. */
+static nir_ssa_def *
+lower_unpack_double(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_vec2(b, nir_unpack_double_2x32_split_x(b, src),
+                      nir_unpack_double_2x32_split_y(b, src));
+}
+
+/*
+ * Block callback: replaces every pack_double_2x32 / unpack_double_2x32 ALU
+ * instruction in the block with its lowered form.  ctx is the nir_builder
+ * used to emit the replacement.  Always returns true.
+ */
+static bool
+lower_double_pack_block(nir_block *block, void *ctx)
+{
+   nir_builder *b = (nir_builder *) ctx;
+
+   /* Use the _safe iterator: matching instructions are removed in the loop. */
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_alu)
+         continue;
+
+      nir_alu_instr *alu_instr = nir_instr_as_alu(instr);
+
+      if (alu_instr->op != nir_op_pack_double_2x32 &&
+          alu_instr->op != nir_op_unpack_double_2x32)
+         continue;
+
+      b->cursor = nir_before_instr(&alu_instr->instr);
+
+      nir_ssa_def *src = nir_ssa_for_alu_src(b, alu_instr, 0);
+      nir_ssa_def *dest =
+         alu_instr->op == nir_op_pack_double_2x32 ?
+         lower_pack_double(b, src) :
+         lower_unpack_double(b, src);
+
+      nir_ssa_def_rewrite_uses(&alu_instr->dest.dest.ssa, nir_src_for_ssa(dest));
+      nir_instr_remove(&alu_instr->instr);
+   }
+
+   return true;
+}
+
+/* Runs the lowering over every block of one function implementation. */
+static void
+lower_double_pack_impl(nir_function_impl *impl)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+   nir_foreach_block(impl, lower_double_pack_block, &b);
+
+   /* Only instructions are replaced here; no blocks are added or removed. */
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+}
+
+/* Entry point: applies the lowering to every function that has an impl. */
+void
+nir_lower_double_pack(nir_shader *shader)
+{
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         lower_double_pack_impl(function->impl);
+   }
+}
diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c
index 62b8c84a956..a69dd612565 100644
--- a/src/compiler/nir/nir_lower_indirect_derefs.c
+++ b/src/compiler/nir/nir_lower_indirect_derefs.c
@@ -161,7 +161,7 @@ deref_has_indirect(nir_deref_var *deref)
 
 struct lower_indirect_state {
    nir_builder builder;
-   uint32_t mode_mask;
+   nir_variable_mode modes;
    bool progress;
 };
 
@@ -183,7 +183,7 @@ lower_indirect_block(nir_block *block, void *void_state)
          continue;
 
       /* Only lower variables whose mode is in the mask */
-      if (!(state->mode_mask & (1 << intrin->variables[0]->var->data.mode)))
+      if (!(state->modes & intrin->variables[0]->var->data.mode))
          continue;
 
       state->builder.cursor = nir_before_instr(&intrin->instr);
@@ -206,12 +206,12 @@ lower_indirect_block(nir_block *block, void *void_state)
 }
 
 static bool
-lower_indirects_impl(nir_function_impl *impl, uint32_t mode_mask)
+lower_indirects_impl(nir_function_impl *impl, nir_variable_mode modes)
 {
    struct lower_indirect_state state;
 
    state.progress = false;
-   state.mode_mask = mode_mask;
+   state.modes = modes;
    nir_builder_init(&state.builder, impl);
 
    nir_foreach_block(impl, lower_indirect_block, &state);
@@ -228,13 +228,13 @@ lower_indirects_impl(nir_function_impl *impl, uint32_t mode_mask)
  * that does a binary search on the array index.
  */
 bool
-nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask)
+nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes)
 {
    bool progress = false;
 
    nir_foreach_function(shader, function) {
       if (function->impl)
-         progress = lower_indirects_impl(function->impl, mode_mask) || progress;
+         progress = lower_indirects_impl(function->impl, modes) || progress;
    }
 
    return progress;
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index a30061d3bf0..369a8ee537e 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -38,7 +38,7 @@ struct lower_io_state {
    nir_builder builder;
    void *mem_ctx;
    int (*type_size)(const struct glsl_type *type);
-   nir_variable_mode mode;
+   nir_variable_mode modes;
 };
 
 void
@@ -245,7 +245,7 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
       nir_variable_mode mode = intrin->variables[0]->var->data.mode;
 
-      if (state->mode != nir_var_all && state->mode != mode)
+      if ((state->modes & mode) == 0)
          continue;
 
       if (mode != nir_var_shader_in &&
@@ -393,14 +393,14 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
 static void
 nir_lower_io_impl(nir_function_impl *impl,
-                  nir_variable_mode mode,
+                  nir_variable_mode modes,
                   int (*type_size)(const struct glsl_type *))
 {
    struct lower_io_state state;
 
    nir_builder_init(&state.builder, impl);
    state.mem_ctx = ralloc_parent(impl);
-   state.mode = mode;
+   state.modes = modes;
    state.type_size = type_size;
 
    nir_foreach_block(impl, nir_lower_io_block, &state);
@@ -410,12 +410,12 @@ nir_lower_io_impl(nir_function_impl *impl,
 }
 
 void
-nir_lower_io(nir_shader *shader, nir_variable_mode mode,
+nir_lower_io(nir_shader *shader, nir_variable_mode modes,
              int (*type_size)(const struct glsl_type *))
 {
    nir_foreach_function(shader, function) {
       if (function->impl)
-         nir_lower_io_impl(function->impl, mode, type_size);
+         nir_lower_io_impl(function->impl, modes, type_size);
    }
 }
 
diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c
index b5df46413f1..db5865fb0c0 100644
--- a/src/compiler/nir/nir_lower_load_const_to_scalar.c
+++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c
@@ -48,8 +48,13 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower)
    /* Emit the individual loads. */
    nir_ssa_def *loads[4];
    for (unsigned i = 0; i < lower->def.num_components; i++) {
-      nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1);
-      load_comp->value.u32[0] = lower->value.u32[i];
+      assert(lower->def.bit_size == 64 || lower->def.bit_size == 32);
+      nir_load_const_instr *load_comp =
+         nir_load_const_instr_create(b.shader, 1, lower->def.bit_size);
+      if (lower->def.bit_size == 64) /* copy the raw bits, not a double */
+         load_comp->value.u64[0] = lower->value.u64[i];
+      else
+         load_comp->value.u32[0] = lower->value.u32[i];
       nir_builder_instr_insert(&b, &load_comp->instr);
       loads[i] = &load_comp->def;
    }
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index 0438802d3b2..111bfdd2e33 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -119,6 +119,7 @@ get_reg_for_deref(nir_deref_var *deref, struct locals_to_regs_state *state)
    nir_register *reg = nir_local_reg_create(state->impl);
    reg->num_components = glsl_get_vector_elements(tail->type);
    reg->num_array_elems = array_size > 1 ? array_size : 0;
+   reg->bit_size = glsl_get_bit_size(glsl_get_base_type(tail->type));
 
    _mesa_hash_table_insert_pre_hashed(state->regs_table, hash, deref, reg);
    nir_array_add(&state->derefs_array, nir_deref_var *, deref);
@@ -160,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
 
       if (src.reg.indirect) {
          nir_load_const_instr *load_const =
-            nir_load_const_instr_create(state->shader, 1);
+            nir_load_const_instr_create(state->shader, 1, 32);
          load_const->value.u32[0] = glsl_get_length(parent_type);
          nir_instr_insert_before(instr, &load_const->instr);
 
diff --git a/src/compiler/nir/nir_lower_to_source_mods.c b/src/compiler/nir/nir_lower_to_source_mods.c
index 6c4e1f0d3f3..1e8c3c2a130 100644
--- a/src/compiler/nir/nir_lower_to_source_mods.c
+++ b/src/compiler/nir/nir_lower_to_source_mods.c
@@ -54,7 +54,7 @@ nir_lower_to_source_mods_block(nir_block *block, void *state)
          if (parent->dest.saturate)
             continue;
 
-         switch (nir_op_infos[alu->op].input_types[i]) {
+         switch (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i])) {
          case nir_type_float:
             if (parent->op != nir_op_fmov)
                continue;
@@ -128,7 +128,8 @@ nir_lower_to_source_mods_block(nir_block *block, void *state)
          continue;
 
       /* We can only saturate float destinations */
-      if (nir_op_infos[alu->op].output_type != nir_type_float)
+      if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) !=
+          nir_type_float)
          continue;
 
       if (!list_empty(&alu->dest.dest.ssa.if_uses))
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index 9f9e454c198..249c3892335 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -504,8 +504,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
              */
             nir_ssa_undef_instr *undef =
                nir_ssa_undef_instr_create(state->shader,
-                                          intrin->num_components);
-            undef->def.bit_size = intrin->dest.ssa.bit_size;
+                                          intrin->num_components,
+                                          intrin->dest.ssa.bit_size);
 
             nir_instr_insert_before(&intrin->instr, &undef->instr);
             nir_instr_remove(&intrin->instr);
diff --git a/src/compiler/nir/nir_lower_vec_to_movs.c b/src/compiler/nir/nir_lower_vec_to_movs.c
index f51cede3920..9e40b84e6e3 100644
--- a/src/compiler/nir/nir_lower_vec_to_movs.c
+++ b/src/compiler/nir/nir_lower_vec_to_movs.c
@@ -240,6 +240,7 @@ lower_vec_to_movs_block(nir_block *block, void *void_state)
          /* Since we insert multiple MOVs, we have a register destination. */
          nir_register *reg = nir_local_reg_create(impl);
          reg->num_components = vec->dest.dest.ssa.num_components;
+         reg->bit_size = vec->dest.dest.ssa.bit_size;
 
          nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg));
 
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index d6b658dbfc8..e75ca28cf0e 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -95,6 +95,7 @@ tuint = "uint"
 tfloat32 = "float32"
 tint32 = "int32"
 tuint32 = "uint32"
+tuint64 = "uint64"
 tfloat64 = "float64"
 
 commutative = "commutative "
@@ -161,15 +162,23 @@ unop("fexp2", tfloat, "exp2f(src0)")
 unop("flog2", tfloat, "log2f(src0)")
 unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
 unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
+unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion.
+unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion.
 unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
+unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion.
 # Float-to-boolean conversion
 unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
+unop_convert("d2b", tbool, tfloat64, "src0 != 0.0")
 # Boolean-to-float conversion
 unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 # Int-to-boolean conversion
 unop_convert("i2b", tbool, tint32, "src0 != 0")
 unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
 unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
+unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion.
+# Conversions between single and double precision
+unop_convert("d2f", tfloat32, tfloat64, "src0") # Double to single precision
+unop_convert("f2d", tfloat64, tfloat32, "src0") # Single to double precision
 
 # Unary floating-point rounding operations.
 
@@ -253,6 +262,34 @@ dst.x = (src0.x <<  0) |
         (src0.w << 24);
 """)
 
+unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32, """
+union {
+    uint64_t u64;
+    struct {
+        uint32_t i1;
+        uint32_t i2;
+    };
+} di;
+
+di.i1 = src0.x;
+di.i2 = src0.y;
+dst.x = di.u64;
+""")
+
+unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64, """
+union {
+    uint64_t u64;
+    struct {
+        uint32_t i1;
+        uint32_t i2;
+    };
+} di;
+
+di.u64 = src0.x;
+dst.x = di.i1;
+dst.y = di.i2;
+""")
+
 # Lowered floating point unpacking operations.
 
 
@@ -261,6 +298,29 @@ unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
 unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
            "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 
+unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, """
+union {
+    uint64_t u64;
+    struct {
+        uint32_t x;
+        uint32_t y;
+    };
+} di;
+di.u64 = src0;
+dst = di.x;
+""")
+
+unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, """
+union {
+    uint64_t u64;
+    struct {
+        uint32_t x;
+        uint32_t y;
+    };
+} di;
+di.u64 = src0;
+dst = di.y;
+""")
 
 # Bit operations, part of ARB_gpu_shader5.
 
@@ -540,6 +600,19 @@ binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)")
 binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 
+binop_convert("pack_double_2x32_split", tuint64, tuint32, "", """
+union {
+    uint64_t u64;
+    struct {
+        uint32_t x;
+        uint32_t y;
+    };
+} di;
+di.x = src0;
+di.y = src1;
+dst = di.u64;
+""")
+
 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 # if either of its arguments are 32.
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index ddfe94d9e73..dd41931b345 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -138,7 +138,10 @@ optimizations = [
    (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
    (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
    (('fsat', ('fsat', a)), ('fsat', a)),
-   (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
+   (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)),
+   (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)),
+   (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)),
+   (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)),
    (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
    (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
    (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
@@ -275,6 +278,14 @@ optimizations = [
    (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
    (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
 
+   # Reassociate constants in add/mul chains so they can be folded together.
+   # For now, we only handle cases where the constants are separated by
+   # a single non-constant.  We could do better eventually.
+   (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)),
+   (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)),
+   (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)),
+   (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)),
+
    # Misc. lowering
    (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
    (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
@@ -362,26 +373,30 @@ optimizations = [
 ]
 
 def fexp2i(exp):
-   # We assume that exp is already in range.
+   # We assume that exp is already in the range [-126, 127].
    return ('ishl', ('iadd', exp, 127), 23)
 
 def ldexp32(f, exp):
-   # First, we clamp exp to a reasonable range.  The maximum range that we
-   # need is the largest range for an exponent, ([-127, 128] if you include
-   # inf and 0) plus the number of mantissa bits in either direction to
-   # account for denormals.  This means that we need at least a range of
-   # [-150, 151].  For our implementation, however, what we really care
-   # about is that neither exp/2 nor exp-exp/2 go out of the regular range
-   # for floating-point exponents.
+   # First, we clamp exp to a reasonable range.  The maximum possible range
+   # for a normal exponent is [-126, 127] and, throwing in denormals, you get
+   # a maximum range of [-149, 127].  This means that we can potentially have
+   # a swing of +-276.  If you start with FLT_MAX, you actually have to do
+   # ldexp(FLT_MAX, -278) to get it to flush all the way to zero.  The GLSL
+   # spec, on the other hand, only requires that we handle an exponent value
+   # in the range [-126, 128].  This implementation is *mostly* correct; it
+   # handles a range on exp of [-252, 254] which allows you to create any
+   # value (including denorms if the hardware supports it) and to adjust the
+   # exponent of any normal value to anything you want.
    exp = ('imin', ('imax', exp, -252), 254)
 
    # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2.
-   # While the spec technically defines ldexp as f * 2.0^exp, simply
-   # multiplying once doesn't work when denormals are involved because
-   # 2.0^exp may not be representable even though ldexp(f, exp) is (see
-   # comments above about range).  Instead, we create two powers of two and
-   # multiply by them each in turn.  That way the effective range of our
-   # exponent is doubled.
+   # (We use ishr which isn't the same for -1, but the -1 case still works
+   # since we use exp-exp/2 as the second exponent.)  While the spec
+   # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't
+   # work with denormals and doesn't allow for the full swing in exponents
+   # that you can get with normalized values.  Instead, we create two powers
+   # of two and multiply by them each in turn.  That way the effective range
+   # of our exponent is doubled.
    pow2_1 = fexp2i(('ishr', exp, 1))
    pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1)))
    return ('fmul', ('fmul', f, pow2_1), pow2_2)
diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index e64ca369bbc..caa4231b188 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -98,9 +98,9 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
 
    nir_load_const_instr *new_instr =
       nir_load_const_instr_create(mem_ctx,
-                                  instr->dest.dest.ssa.num_components);
+                                  instr->dest.dest.ssa.num_components,
+                                  instr->dest.dest.ssa.bit_size);
 
-   new_instr->def.bit_size = instr->dest.dest.ssa.bit_size;
    new_instr->value = dest;
 
    nir_instr_insert_before(&instr->instr, &new_instr->instr);
diff --git a/src/compiler/nir/nir_opt_dce.c b/src/compiler/nir/nir_opt_dce.c
index 32436c18b60..cab09dfffc3 100644
--- a/src/compiler/nir/nir_opt_dce.c
+++ b/src/compiler/nir/nir_opt_dce.c
@@ -71,7 +71,7 @@ init_instr(nir_instr *instr, struct exec_list *worklist)
    nir_tex_instr *tex_instr;
 
    /* We use the pass_flags to store the live/dead information.  In DCE, we
-    * just treat it as a zero/non-zerl boolean for whether or not the
+    * just treat it as a zero/non-zero boolean for whether or not the
     * instruction is live.
     */
    instr->pass_flags = 0;
diff --git a/src/compiler/nir/nir_phi_builder.c b/src/compiler/nir/nir_phi_builder.c
index a39e3606fd5..1f1388a73dd 100644
--- a/src/compiler/nir/nir_phi_builder.c
+++ b/src/compiler/nir/nir_phi_builder.c
@@ -195,7 +195,8 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
           */
          nir_ssa_undef_instr *undef =
             nir_ssa_undef_instr_create(val->builder->shader,
-                                       val->num_components);
+                                       val->num_components,
+                                       val->bit_size);
          nir_instr_insert(nir_before_cf_list(&val->builder->impl->body),
                           &undef->instr);
          val->defs[block->index] = &undef->def;
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index 17ae3681e21..2793020953e 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -29,6 +29,7 @@
 #include "compiler/shader_enums.h"
 #include <stdio.h>
 #include <stdlib.h>
+#include <inttypes.h> /* for PRIx64 macro */
 
 static void
 print_tabs(unsigned num_tabs, FILE *fp)
@@ -68,7 +69,7 @@ static void
 print_register_decl(nir_register *reg, print_state *state)
 {
    FILE *fp = state->fp;
-   fprintf(fp, "decl_reg %s ", sizes[reg->num_components]);
+   fprintf(fp, "decl_reg %s %u ", sizes[reg->num_components], reg->bit_size);
    if (reg->is_packed)
       fprintf(fp, "(packed) ");
    print_register(reg, state);
@@ -83,7 +84,8 @@ print_ssa_def(nir_ssa_def *def, print_state *state)
    FILE *fp = state->fp;
    if (def->name != NULL)
       fprintf(fp, "/* %s */ ", def->name);
-   fprintf(fp, "%s ssa_%u", sizes[def->num_components], def->index);
+   fprintf(fp, "%s %u ssa_%u", sizes[def->num_components], def->bit_size,
+           def->index);
 }
 
 static void
@@ -279,6 +281,13 @@ print_constant(nir_constant *c, const struct glsl_type *type, print_state *state
       }
       break;
 
+   case GLSL_TYPE_DOUBLE:
+      for (i = 0; i < total_elems; i++) {
+         if (i > 0) fprintf(fp, ", ");
+         fprintf(fp, "%f", c->value.d[i]);
+      }
+      break;
+
    case GLSL_TYPE_STRUCT:
       for (i = 0; i < c->num_elements; i++) {
          if (i > 0) fprintf(fp, ", ");
@@ -716,7 +725,11 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state)
        * and then print the float in a comment for readability.
        */
 
-      fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
+      if (instr->def.bit_size == 64) /* zero-pad to all 16 hex digits */
+         fprintf(fp, "0x%016" PRIx64 " /* %f */", instr->value.u64[i],
+                 instr->value.f64[i]);
+      else
+         fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
    }
 
    fprintf(fp, ")");
diff --git a/src/compiler/nir/nir_remove_dead_variables.c b/src/compiler/nir/nir_remove_dead_variables.c
index ad69de85b97..7395805d7a2 100644
--- a/src/compiler/nir/nir_remove_dead_variables.c
+++ b/src/compiler/nir/nir_remove_dead_variables.c
@@ -120,7 +120,7 @@ remove_dead_vars(struct exec_list *var_list, struct set *live)
 }
 
 bool
-nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode)
+nir_remove_dead_variables(nir_shader *shader, nir_variable_mode modes)
 {
    bool progress = false;
    struct set *live =
@@ -128,22 +128,22 @@ nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode)
 
    add_var_use_shader(shader, live);
 
-   if (mode == nir_var_uniform || mode == nir_var_all)
+   if (modes & nir_var_uniform)
       progress = remove_dead_vars(&shader->uniforms, live) || progress;
 
-   if (mode == nir_var_shader_in || mode == nir_var_all)
+   if (modes & nir_var_shader_in)
       progress = remove_dead_vars(&shader->inputs, live) || progress;
 
-   if (mode == nir_var_shader_out || mode == nir_var_all)
+   if (modes & nir_var_shader_out)
       progress = remove_dead_vars(&shader->outputs, live) || progress;
 
-   if (mode == nir_var_global || mode == nir_var_all)
+   if (modes & nir_var_global)
       progress = remove_dead_vars(&shader->globals, live) || progress;
 
-   if (mode == nir_var_system_value || mode == nir_var_all)
+   if (modes & nir_var_system_value)
       progress = remove_dead_vars(&shader->system_values, live) || progress;
 
-   if (mode == nir_var_local || mode == nir_var_all) {
+   if (modes & nir_var_local) {
       nir_foreach_function(shader, function) {
          if (function->impl) {
             if (remove_dead_vars(&function->impl->locals, live)) {
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 3a65ab18928..dc53a9063c4 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -477,7 +477,8 @@ construct_value(const nir_search_value *value,
 
    case nir_search_value_constant: {
       const nir_search_constant *c = nir_search_value_as_constant(value);
-      nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1);
+      nir_load_const_instr *load =
+         nir_load_const_instr_create(mem_ctx, 1, bitsize->dest_size);
 
       switch (c->type) {
       case nir_type_float:
@@ -528,8 +529,6 @@ construct_value(const nir_search_value *value,
          unreachable("Invalid alu source type");
       }
 
-      load->def.bit_size = bitsize->dest_size;
-
       nir_instr_insert_before(instr, &load->instr);
 
       nir_alu_src val;
diff --git a/src/compiler/nir/nir_split_var_copies.c b/src/compiler/nir/nir_split_var_copies.c
index 6fdaefa32c8..2b011077a7c 100644
--- a/src/compiler/nir/nir_split_var_copies.c
+++ b/src/compiler/nir/nir_split_var_copies.c
@@ -149,6 +149,7 @@ split_var_copy_instr(nir_intrinsic_instr *old_copy,
    case GLSL_TYPE_UINT:
    case GLSL_TYPE_INT:
    case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
    case GLSL_TYPE_BOOL:
       if (glsl_type_is_matrix(src_tail->type)) {
          nir_deref_array *deref = nir_deref_array_create(state->dead_ctx);
@@ -231,6 +232,7 @@ split_var_copies_block(nir_block *block, void *void_state)
          ralloc_steal(state->dead_ctx, instr);
          break;
       case GLSL_TYPE_FLOAT:
+      case GLSL_TYPE_DOUBLE:
       case GLSL_TYPE_INT:
       case GLSL_TYPE_UINT:
       case GLSL_TYPE_BOOL:
diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c
index d588d7d2df3..23d709a218a 100644
--- a/src/compiler/nir/nir_to_ssa.c
+++ b/src/compiler/nir/nir_to_ssa.c
@@ -160,7 +160,8 @@ static nir_ssa_def *get_ssa_src(nir_register *reg, rewrite_state *state)
        * to preserve the information that this source is undefined
        */
       nir_ssa_undef_instr *instr =
-         nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components);
+         nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components,
+                                    reg->bit_size);
 
       /*
        * We could just insert the undefined instruction before the instruction
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 9f18d1c33e4..3c3306c75fb 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -903,6 +903,9 @@ validate_var_decl(nir_variable *var, bool is_global, validate_state *state)
 {
    assert(is_global == nir_variable_is_global(var));
 
+   /* Must have exactly one mode set */
+   assert(util_bitcount(var->data.mode) == 1);
+
    /*
     * TODO validate some things ir_validate.cpp does (requires more GLSL type
     * support)
diff --git a/src/compiler/nir/spirv/spirv_to_nir.c b/src/compiler/nir/spirv/spirv_to_nir.c
index 948454494fa..99514b49650 100644
--- a/src/compiler/nir/spirv/spirv_to_nir.c
+++ b/src/compiler/nir/spirv/spirv_to_nir.c
@@ -86,7 +86,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
       if (glsl_type_is_vector_or_scalar(type)) {
          unsigned num_components = glsl_get_vector_elements(val->type);
          nir_load_const_instr *load =
-            nir_load_const_instr_create(b->shader, num_components);
+            nir_load_const_instr_create(b->shader, num_components, 32);
 
          for (unsigned i = 0; i < num_components; i++)
             load->value.u32[i] = constant->value.u[i];
@@ -103,7 +103,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
             struct vtn_ssa_value *col_val = rzalloc(b, struct vtn_ssa_value);
             col_val->type = glsl_get_column_type(val->type);
             nir_load_const_instr *load =
-               nir_load_const_instr_create(b->shader, rows);
+               nir_load_const_instr_create(b->shader, rows, 32);
 
             for (unsigned j = 0; j < rows; j++)
                load->value.u32[j] = constant->value.u[rows * i + j];
diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp
index 70e9cd397fc..62a1071e444 100644
--- a/src/compiler/nir_types.cpp
+++ b/src/compiler/nir_types.cpp
@@ -126,9 +126,9 @@ glsl_get_aoa_size(const struct glsl_type *type)
 
 unsigned
 glsl_count_attribute_slots(const struct glsl_type *type,
-                           bool vertex_input_slots)
+                           bool is_vertex_input)
 {
-   return type->count_attribute_slots(vertex_input_slots);
+   return type->count_attribute_slots(is_vertex_input);
 }
 
 const char *
diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index 5efdd85dea5..851096f9cc0 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -69,7 +69,7 @@ unsigned glsl_get_length(const struct glsl_type *type);
 unsigned glsl_get_aoa_size(const struct glsl_type *type);
 
 unsigned glsl_count_attribute_slots(const struct glsl_type *type,
-                                    bool vertex_input_slots);
+                                    bool is_vertex_input);
 
 const char *glsl_get_struct_elem_name(const struct glsl_type *type,
                                       unsigned index);
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am
index 82c2869b99b..296ed59317b 100644
--- a/src/gallium/auxiliary/Makefile.am
+++ b/src/gallium/auxiliary/Makefile.am
@@ -1,10 +1,11 @@
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
-noinst_LTLIBRARIES = libgallium_nir.la
+noinst_LTLIBRARIES = libgallium.la
 
 AM_CFLAGS = \
 	-I$(top_srcdir)/src/loader \
+	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_srcdir)/src/gallium/auxiliary/util \
 	$(GALLIUM_CFLAGS) \
 	$(VISIBILITY_CFLAGS) \
@@ -14,24 +15,11 @@ AM_CXXFLAGS = \
 	$(VISIBILITY_CXXFLAGS) \
 	$(MSVC2013_COMPAT_CXXFLAGS)
 
-libgallium_nir_la_SOURCES = \
-	$(NIR_SOURCES)
-
-libgallium_nir_la_CFLAGS = \
-	-I$(top_builddir)/src/compiler/nir \
-	$(GALLIUM_CFLAGS) \
-	$(VISIBILITY_CFLAGS) \
-	$(MSVC2013_COMPAT_CFLAGS)
-
-noinst_LTLIBRARIES += libgallium.la
-
 libgallium_la_SOURCES = \
 	$(C_SOURCES) \
+	$(NIR_SOURCES) \
 	$(GENERATED_SOURCES)
 
-libgallium_la_LIBADD = \
-	libgallium_nir.la
-
 if HAVE_MESA_LLVM
 
 AM_CFLAGS += \
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 790e1211898..4e0cbdd8f9a 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -1539,6 +1539,8 @@ cso_save_state(struct cso_context *cso, unsigned state_mask)
       cso_save_vertex_shader(cso);
    if (state_mask & CSO_BIT_VIEWPORT)
       cso_save_viewport(cso);
+   if (state_mask & CSO_BIT_PAUSE_QUERIES)
+      cso->pipe->set_active_query_state(cso->pipe, false);
 }
 
 
@@ -1590,6 +1592,8 @@ cso_restore_state(struct cso_context *cso)
       cso_restore_vertex_shader(cso);
    if (state_mask & CSO_BIT_VIEWPORT)
       cso_restore_viewport(cso);
+   if (state_mask & CSO_BIT_PAUSE_QUERIES)
+      cso->pipe->set_active_query_state(cso->pipe, true);
 
    cso->saved_state = 0;
 }
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index a3563d83a02..e27cbe9f721 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -170,6 +170,7 @@ void cso_set_render_condition(struct cso_context *cso,
 #define CSO_BIT_VERTEX_ELEMENTS       0x10000
 #define CSO_BIT_VERTEX_SHADER         0x20000
 #define CSO_BIT_VIEWPORT              0x40000
+#define CSO_BIT_PAUSE_QUERIES         0x80000
 
 #define CSO_BITS_ALL_SHADERS (CSO_BIT_VERTEX_SHADER | \
                               CSO_BIT_FRAGMENT_SHADER | \
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 2ba9b099664..75551fbe2dd 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -749,7 +749,23 @@ draw_image(struct draw_context *draw,
    }
 }
 
-
+/**
+ * Provide TGSI buffer objects for vertex/geometry shaders that use
+ * load/store/atomic ops.  This state only needs to be set once per context.
+ * This might only be used by software drivers for the time being.
+ */
+void
+draw_buffer(struct draw_context *draw,
+            uint shader,
+            struct tgsi_buffer *buffer)
+{
+   if (shader == PIPE_SHADER_VERTEX) {
+      draw->vs.tgsi.buffer = buffer;
+   } else {
+      debug_assert(shader == PIPE_SHADER_GEOMETRY);
+      draw->gs.tgsi.buffer = buffer;
+   }
+}
 
 
 void draw_set_render( struct draw_context *draw, 
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 5d9870b115c..3e6722fcb7e 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -49,6 +49,7 @@ struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
 struct tgsi_image;
+struct tgsi_buffer;
 
 /*
  * structure to contain driver internal information 
@@ -161,6 +162,11 @@ draw_image(struct draw_context *draw,
            struct tgsi_image *image);
 
 void
+draw_buffer(struct draw_context *draw,
+           uint shader_type,
+           struct tgsi_buffer *buffer);
+
+void
 draw_set_sampler_views(struct draw_context *draw,
                        unsigned shader_stage,
                        struct pipe_sampler_view **views,
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 14db2d6f39d..ef217fa5ceb 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -681,7 +681,9 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
    if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(shader->machine,
                                     shader->state.tokens,
-                                    draw->gs.tgsi.sampler, draw->gs.tgsi.image);
+                                    draw->gs.tgsi.sampler,
+                                    draw->gs.tgsi.image,
+                                    draw->gs.tgsi.buffer);
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 211bd6f7e70..a18f6632124 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -67,6 +67,7 @@ struct vbuf_render;
 struct tgsi_exec_machine;
 struct tgsi_sampler;
 struct tgsi_image;
+struct tgsi_buffer;
 struct draw_pt_front_end;
 struct draw_assembler;
 struct draw_llvm;
@@ -269,6 +270,7 @@ struct draw_context
 
          struct tgsi_sampler *sampler;
          struct tgsi_image *image;
+         struct tgsi_buffer *buffer;
       } tgsi;
 
       struct translate *fetch;
@@ -289,6 +291,7 @@ struct draw_context
 
          struct tgsi_sampler *sampler;
          struct tgsi_image *image;
+         struct tgsi_buffer *buffer;
       } tgsi;
 
    } gs;
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 5b53cff29f0..da0d1a7f9a8 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -70,7 +70,9 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
    if (evs->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(evs->machine,
                                     shader->state.tokens,
-                                    draw->vs.tgsi.sampler, draw->vs.tgsi.image);
+                                    draw->vs.tgsi.sampler,
+                                    draw->vs.tgsi.image,
+                                    draw->vs.tgsi.buffer);
    }
 }
 
@@ -159,6 +161,7 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
          input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
+      machine->NonHelperMask = (1 << max_vertices) - 1;
       /* run interpreter */
       tgsi_exec_machine_run( machine );
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index 0c43617d531..beff4143fd8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1492,9 +1492,20 @@ lp_build_abs(struct lp_build_context *bld,
       return a;
 
    if(type.floating) {
-      char intrinsic[32];
-      lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
-      return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) {
+         /* Workaround llvm.org/PR27332 */
+         LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
+         unsigned long long absMask = ~(1ULL << (type.width - 1));
+         LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask));
+         a = LLVMBuildBitCast(builder, a, int_vec_type, "");
+         a = LLVMBuildAnd(builder, a, mask, "");
+         a = LLVMBuildBitCast(builder, a, vec_type, "");
+         return a;
+      } else {
+         char intrinsic[32];
+         lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type);
+         return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a);
+      }
    }
 
    if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
@@ -1663,99 +1674,6 @@ enum lp_build_round_mode
    LP_BUILD_ROUND_TRUNCATE = 3
 };
 
-/**
- * Helper for SSE4.1's ROUNDxx instructions.
- *
- * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the
- * result is the even value.  That is, rounding 2.5 will be 2.0, and not 3.0.
- */
-static inline LLVMValueRef
-lp_build_nearest_sse41(struct lp_build_context *bld,
-                       LLVMValueRef a)
-{
-   LLVMBuilderRef builder = bld->gallivm->builder;
-   const struct lp_type type = bld->type;
-   LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
-   LLVMValueRef mode = LLVMConstNull(i32t);
-   const char *intrinsic;
-   LLVMValueRef res;
-
-   assert(type.floating);
-
-   assert(lp_check_value(type, a));
-   assert(util_cpu_caps.has_sse4_1);
-
-   if (type.length == 1) {
-      LLVMTypeRef vec_type;
-      LLVMValueRef undef;
-      LLVMValueRef args[3];
-      LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0);
-
-      switch(type.width) {
-      case 32:
-         intrinsic = "llvm.x86.sse41.round.ss";
-         break;
-      case 64:
-         intrinsic = "llvm.x86.sse41.round.sd";
-         break;
-      default:
-         assert(0);
-         return bld->undef;
-      }
-
-      vec_type = LLVMVectorType(bld->elem_type, 4);
-
-      undef = LLVMGetUndef(vec_type);
-
-      args[0] = undef;
-      args[1] = LLVMBuildInsertElement(builder, undef, a, index0, "");
-      args[2] = mode;
-
-      res = lp_build_intrinsic(builder, intrinsic,
-                               vec_type, args, Elements(args), 0);
-
-      res = LLVMBuildExtractElement(builder, res, index0, "");
-   }
-   else {
-      if (type.width * type.length == 128) {
-         switch(type.width) {
-         case 32:
-            intrinsic = "llvm.x86.sse41.round.ps";
-            break;
-         case 64:
-            intrinsic = "llvm.x86.sse41.round.pd";
-            break;
-         default:
-            assert(0);
-            return bld->undef;
-         }
-      }
-      else {
-         assert(type.width * type.length == 256);
-         assert(util_cpu_caps.has_avx);
-
-         switch(type.width) {
-         case 32:
-            intrinsic = "llvm.x86.avx.round.ps.256";
-            break;
-         case 64:
-            intrinsic = "llvm.x86.avx.round.pd.256";
-            break;
-         default:
-            assert(0);
-            return bld->undef;
-         }
-      }
-
-      res = lp_build_intrinsic_binary(builder, intrinsic,
-                                      bld->vec_type, a,
-                                      mode);
-   }
-
-   return res;
-}
-
-
 static inline LLVMValueRef
 lp_build_iround_nearest_sse2(struct lp_build_context *bld,
                              LLVMValueRef a)
@@ -1863,11 +1781,7 @@ lp_build_round_arch(struct lp_build_context *bld,
 
       switch (mode) {
       case LP_BUILD_ROUND_NEAREST:
-         if (HAVE_LLVM >= 0x0304) {
-            intrinsic_root = "llvm.round";
-         } else {
-            return lp_build_nearest_sse41(bld, a);
-         }
+         intrinsic_root = "llvm.nearbyint";
          break;
       case LP_BUILD_ROUND_FLOOR:
          intrinsic_root = "llvm.floor";
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 4673458171e..40017c8614f 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -477,6 +477,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
                         CSO_BIT_VERTEX_SHADER |
                         CSO_BIT_VERTEX_ELEMENTS |
                         CSO_BIT_AUX_VERTEX_BUFFER_SLOT |
+                        CSO_BIT_PAUSE_QUERIES |
                         CSO_BIT_RENDER_CONDITION));
    cso_save_constant_buffer_slot0(cso, PIPE_SHADER_VERTEX);
 
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 7ec8b662200..d76b6d900ce 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -454,7 +454,7 @@ ttn_emit_immediate(struct ttn_compile *c)
    nir_load_const_instr *load_const;
    int i;
 
-   load_const = nir_load_const_instr_create(b->shader, 4);
+   load_const = nir_load_const_instr_create(b->shader, 4, 32);
    c->imm_defs[c->next_imm] = &load_const->def;
    c->next_imm++;
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 803c1d39192..33c23068c27 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -87,9 +87,9 @@ struct pb_desc
 
 
 /**
- * Size. Regular (32bit) unsigned for now.
+ * 64-bit type for GPU buffer sizes and offsets.
  */
-typedef unsigned pb_size;
+typedef uint64_t pb_size;
 
 
 /**
@@ -98,8 +98,8 @@ typedef unsigned pb_size;
 struct pb_buffer 
 {
    struct pipe_reference  reference;
-   unsigned               size;
    unsigned               alignment;
+   pb_size                size;
    unsigned               usage;
 
    /**
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index fbbe8d11eb0..64af321558e 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -40,6 +40,7 @@
 #include <unistd.h>
 #include <sched.h>
 #endif
+#include <inttypes.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_defines.h"
@@ -208,7 +209,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
    while (curr != &fenced_mgr->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(!fenced_buf->fence);
-      debug_printf("%10p %7u %8u %7s\n",
+      debug_printf("%10p %"PRIu64" %8u %7s\n",
                    (void *) fenced_buf,
                    fenced_buf->base.size,
                    p_atomic_read(&fenced_buf->base.reference.count),
@@ -224,7 +225,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(fenced_buf->buffer);
       signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-      debug_printf("%10p %7u %8u %7s %10p %s\n",
+      debug_printf("%10p %"PRIu64" %8u %7s %10p %s\n",
                    (void *) fenced_buf,
                    fenced_buf->base.size,
                    p_atomic_read(&fenced_buf->base.reference.count),
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 3d3a7aba7fb..4e36866e08c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -41,6 +41,7 @@
 #include "util/list.h"
 #include "util/u_time.h"
 #include "util/u_debug_stack.h"
+#include <inttypes.h>
 
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
@@ -190,7 +191,7 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf)
       underflow = !check_random_pattern(map, buf->underflow_size, 
                                         &min_ofs, &max_ofs);
       if(underflow) {
-         debug_printf("buffer underflow (offset -%u%s to -%u bytes) detected\n",
+         debug_printf("buffer underflow (offset -%"PRIu64"%s to -%"PRIu64" bytes) detected\n",
                       buf->underflow_size - min_ofs,
                       min_ofs == 0 ? "+" : "",
                       buf->underflow_size - max_ofs);
@@ -200,7 +201,7 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf)
                                        buf->overflow_size, 
                                        &min_ofs, &max_ofs);
       if(overflow) {
-         debug_printf("buffer overflow (size %u plus offset %u to %u%s bytes) detected\n",
+         debug_printf("buffer overflow (size %"PRIu64" plus offset %"PRIu64" to %"PRIu64"%s bytes) detected\n",
                       buf->base.size,
                       min_ofs,
                       max_ofs,
@@ -349,7 +350,7 @@ pb_debug_manager_dump_locked(struct pb_debug_manager *mgr)
       buf = LIST_ENTRY(struct pb_debug_buffer, curr, head);
 
       debug_printf("buffer = %p\n", (void *) buf);
-      debug_printf("    .size = 0x%x\n", buf->base.size);
+      debug_printf("    .size = 0x%"PRIx64"\n", buf->base.size);
       debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE);
       
       curr = next; 
diff --git a/src/gallium/auxiliary/postprocess/pp_run.c b/src/gallium/auxiliary/postprocess/pp_run.c
index 9dc8fb51ae2..bc79c5aab6e 100644
--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -133,6 +133,7 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
                         CSO_BIT_VERTEX_SHADER |
                         CSO_BIT_VIEWPORT |
                         CSO_BIT_AUX_VERTEX_BUFFER_SLOT |
+                        CSO_BIT_PAUSE_QUERIES |
                         CSO_BIT_RENDER_CONDITION));
    cso_save_constant_buffer_slot0(cso, PIPE_SHADER_VERTEX);
    cso_save_constant_buffer_slot0(cso, PIPE_SHADER_FRAGMENT);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 27ee8f1242a..3e7d699627a 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -2203,7 +2203,7 @@ voidptr_to_x86_func(void *v)
       void *v;
       x86_func f;
    } u;
-   assert(sizeof(u.v) == sizeof(u.f));
+   STATIC_ASSERT(sizeof(u.v) == sizeof(u.f));
    u.v = v;
    return u.f;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index a595bbbc6d3..41dd0f0466a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -854,7 +854,8 @@ tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
    struct tgsi_sampler *sampler,
-   struct tgsi_image *image)
+   struct tgsi_image *image,
+   struct tgsi_buffer *buffer)
 {
    uint k;
    struct tgsi_parse_context parse;
@@ -873,6 +874,7 @@ tgsi_exec_machine_bind_shader(
    mach->Tokens = tokens;
    mach->Sampler = sampler;
    mach->Image = image;
+   mach->Buffer = buffer;
 
    if (!tokens) {
       /* unbind and free all */
@@ -3758,8 +3760,8 @@ get_image_coord_sample(unsigned tgsi_tex)
 }
 
 static void
-exec_load(struct tgsi_exec_machine *mach,
-          const struct tgsi_full_instruction *inst)
+exec_load_img(struct tgsi_exec_machine *mach,
+              const struct tgsi_full_instruction *inst)
 {
    union tgsi_exec_channel r[4], sample_r;
    uint unit;
@@ -3805,8 +3807,51 @@ exec_load(struct tgsi_exec_machine *mach,
 }
 
 static void
-exec_store(struct tgsi_exec_machine *mach,
-           const struct tgsi_full_instruction *inst)
+exec_load_buf(struct tgsi_exec_machine *mach,
+              const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[4];
+   uint unit;
+   int j;
+   uint chan;
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_buffer_params params;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   IFETCH(&r[0], 1, TGSI_CHAN_X);
+
+   mach->Buffer->load(mach->Buffer, &params,
+                      r[0].i, rgba);
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      r[0].f[j] = rgba[0][j];
+      r[1].f[j] = rgba[1][j];
+      r[2].f[j] = rgba[2][j];
+      r[3].f[j] = rgba[3][j];
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_load(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
+      exec_load_img(mach, inst);
+   else
+      exec_load_buf(mach, inst);
+}
+
+static void
+exec_store_img(struct tgsi_exec_machine *mach,
+               const struct tgsi_full_instruction *inst)
 {
    union tgsi_exec_channel r[3], sample_r;
    union tgsi_exec_channel value[4];
@@ -3850,8 +3895,53 @@ exec_store(struct tgsi_exec_machine *mach,
 }
 
 static void
-exec_atomop(struct tgsi_exec_machine *mach,
-            const struct tgsi_full_instruction *inst)
+exec_store_buf(struct tgsi_exec_machine *mach,
+               const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[3];
+   union tgsi_exec_channel value[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_buffer_params params;
+   int i, j;
+   uint unit;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = inst->Dst[0].Register.Index;
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.writemask = inst->Dst[0].Register.WriteMask;
+
+   IFETCH(&r[0], 0, TGSI_CHAN_X);
+   for (i = 0; i < 4; i++) {
+      FETCH(&value[i], 1, TGSI_CHAN_X + i);
+   }
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      rgba[0][j] = value[0].f[j];
+      rgba[1][j] = value[1].f[j];
+      rgba[2][j] = value[2].f[j];
+      rgba[3][j] = value[3].f[j];
+   }
+
+   mach->Buffer->store(mach->Buffer, &params,
+                      r[0].i,
+                      rgba);
+}
+
+static void
+exec_store(struct tgsi_exec_machine *mach,
+           const struct tgsi_full_instruction *inst)
+{
+   if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE)
+      exec_store_img(mach, inst);
+   else
+      exec_store_buf(mach, inst);
+}
+
+static void
+exec_atomop_img(struct tgsi_exec_machine *mach,
+                const struct tgsi_full_instruction *inst)
 {
    union tgsi_exec_channel r[4], sample_r;
    union tgsi_exec_channel value[4], value2[4];
@@ -3918,8 +4008,77 @@ exec_atomop(struct tgsi_exec_machine *mach,
 }
 
 static void
-exec_resq(struct tgsi_exec_machine *mach,
-          const struct tgsi_full_instruction *inst)
+exec_atomop_buf(struct tgsi_exec_machine *mach,
+                const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[4];
+   union tgsi_exec_channel value[4], value2[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_buffer_params params;
+   int i, j;
+   uint unit, chan;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.writemask = inst->Dst[0].Register.WriteMask;
+
+   IFETCH(&r[0], 1, TGSI_CHAN_X);
+
+   for (i = 0; i < 4; i++) {
+      FETCH(&value[i], 2, TGSI_CHAN_X + i);
+      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
+   }
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      rgba[0][j] = value[0].f[j];
+      rgba[1][j] = value[1].f[j];
+      rgba[2][j] = value[2].f[j];
+      rgba[3][j] = value[3].f[j];
+   }
+   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         rgba2[0][j] = value2[0].f[j];
+         rgba2[1][j] = value2[1].f[j];
+         rgba2[2][j] = value2[2].f[j];
+         rgba2[3][j] = value2[3].f[j];
+      }
+   }
+
+   mach->Buffer->op(mach->Buffer, &params, inst->Instruction.Opcode,
+                   r[0].i,
+                   rgba, rgba2);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      r[0].f[j] = rgba[0][j];
+      r[1].f[j] = rgba[1][j];
+      r[2].f[j] = rgba[2][j];
+      r[3].f[j] = rgba[3][j];
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_atomop(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
+      exec_atomop_img(mach, inst);
+   else
+      exec_atomop_buf(mach, inst);
+}
+
+static void
+exec_resq_img(struct tgsi_exec_machine *mach,
+              const struct tgsi_full_instruction *inst)
 {
    int result[4];
    union tgsi_exec_channel r[4];
@@ -3952,6 +4111,63 @@ exec_resq(struct tgsi_exec_machine *mach,
 }
 
+/**
+ * Execute RESQ for a non-image resource.
+ *
+ * Asks the bound tgsi_buffer for its dimension through get_dims() and stores
+ * it in the destination register.  get_dims() yields one integer, so only
+ * channel X carries a result; the other channels are zero-filled so that a
+ * write mask selecting Y/Z/W stores a defined value instead of uninitialised
+ * stack contents.
+ */
 static void
+exec_resq_buf(struct tgsi_exec_machine *mach,
+              const struct tgsi_full_instruction *inst)
+{
+   int result;
+   union tgsi_exec_channel r[4];
+   uint unit;
+   int i, chan;
+   struct tgsi_buffer_params params;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+
+   mach->Buffer->get_dims(mach->Buffer, &params, &result);
+
+   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+      r[0].i[i] = result;
+      /* Y/Z/W have no result of their own; keep them deterministic. */
+      r[1].i[i] = 0;
+      r[2].i[i] = 0;
+      r[3].i[i] = 0;
+   }
+
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
+                    TGSI_EXEC_DATA_INT);
+      }
+   }
+}
+
+/**
+ * RESQ entry point: dispatch on the register file of the queried resource.
+ * Anything that is not TGSI_FILE_IMAGE is handled as a buffer.
+ */
+static void
+exec_resq(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
+      exec_resq_img(mach, inst);
+   else
+      exec_resq_buf(mach, inst);
+}
+
+static void
 micro_i2f(union tgsi_exec_channel *dst,
           const union tgsi_exec_channel *src)
 {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 45fb8d43c88..42fb922baa5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -138,6 +138,55 @@ struct tgsi_image {
                     int dims[4]);
 };
 
+/**
+ * Per-instruction parameters passed to every tgsi_buffer callback.
+ */
+struct tgsi_buffer_params {
+   unsigned unit;       /**< index of the buffer being accessed */
+   unsigned execmask;   /**< ExecMask & NonHelperMask & ~kill mask */
+   unsigned writemask;  /**< dst write mask; filled in for stores/atomics */
+};
+
+/**
+ * Shader buffer access callbacks, invoked by the interpreter through the
+ * Buffer member of struct tgsi_exec_machine.
+ *
+ * In load/store/op, s carries one integer per lane, fetched from the X
+ * channel of the instruction's address operand (presumably an offset into
+ * the buffer -- TODO confirm against an implementation).
+ */
+struct tgsi_buffer {
+   /* buffer interfaces */
+   /** LOAD: fill rgba; the interpreter copies it to the destination. */
+   void (*load)(const struct tgsi_buffer *buffer,
+                const struct tgsi_buffer_params *params,
+                const int s[TGSI_QUAD_SIZE],
+                float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   /** STORE: rgba holds the four channels fetched from the value operand. */
+   void (*store)(const struct tgsi_buffer *buffer,
+                 const struct tgsi_buffer_params *params,
+                 const int s[TGSI_QUAD_SIZE],
+                 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   /**
+    * Atomic operation selected by opcode.  rgba carries the operand in and
+    * the value written to the destination out; rgba2 is only filled in by
+    * the interpreter for TGSI_OPCODE_ATOMCAS.
+    */
+   void (*op)(const struct tgsi_buffer *buffer,
+              const struct tgsi_buffer_params *params,
+              unsigned opcode,
+              const int s[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   /** RESQ: report a single integer through dim. */
+   void (*get_dims)(const struct tgsi_buffer *buffer,
+                    const struct tgsi_buffer_params *params,
+                    int *dim);
+};
+
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
@@ -334,6 +383,7 @@ struct tgsi_exec_machine
    struct tgsi_sampler           *Sampler;
 
    struct tgsi_image             *Image;
+   struct tgsi_buffer            *Buffer;
    unsigned                      ImmLimit;
 
    const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
@@ -424,7 +474,8 @@ tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
    struct tgsi_sampler *sampler,
-   struct tgsi_image *image);
+   struct tgsi_image *image,
+   struct tgsi_buffer *buffer);
 
 uint
 tgsi_exec_machine_run(
@@ -496,8 +547,9 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
       return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
-   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
       return 0;
+   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
+      return PIPE_MAX_SHADER_BUFFERS;
    case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       return PIPE_MAX_SHADER_IMAGES;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
index a3b90bdb509..0ffd855793a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
@@ -1430,7 +1430,7 @@ tgsi_transform_lowering(const struct tgsi_lowering_config *config,
    int newlen, numtmp;
 
    /* sanity check in case limit is ever increased: */
-   assert((sizeof(config->saturate_s) * 8) >= PIPE_MAX_SAMPLERS);
+   STATIC_ASSERT((sizeof(config->saturate_s) * 8) >= PIPE_MAX_SAMPLERS);
 
    memset(&ctx, 0, sizeof(ctx));
    ctx.base.transform_instruction = transform_instr;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index ae95ebd82a4..16564ddf301 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -313,7 +313,7 @@ tgsi_dump_tokens(const struct tgsi_token *tokens)
    int nr = tgsi_num_tokens(tokens);
    int i;
    
-   assert(sizeof(*tokens) == sizeof(unsigned));
+   STATIC_ASSERT(sizeof(*tokens) == sizeof(unsigned));
 
    debug_printf("const unsigned tokens[%d] = {\n", nr);
    for (i = 0; i < nr; i++)
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 22c40d1382d..3677515423c 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -551,6 +551,7 @@ util_blit_pixels_tex(struct blit_state *ctx,
                              CSO_BIT_STREAM_OUTPUTS |
                              CSO_BIT_VIEWPORT |
                              CSO_BIT_FRAMEBUFFER |
+                             CSO_BIT_PAUSE_QUERIES |
                              CSO_BIT_FRAGMENT_SHADER |
                              CSO_BIT_VERTEX_SHADER |
                              CSO_BIT_TESSCTRL_SHADER |
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 43fbd8e6452..3ca2c48c4c7 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -529,6 +529,8 @@ static void blitter_set_running_flag(struct blitter_context_priv *ctx)
                     __LINE__);
    }
    ctx->base.running = TRUE;
+
+   ctx->base.pipe->set_active_query_state(ctx->base.pipe, false);
 }
 
 static void blitter_unset_running_flag(struct blitter_context_priv *ctx)
@@ -538,6 +540,8 @@ static void blitter_unset_running_flag(struct blitter_context_priv *ctx)
                     __LINE__);
    }
    ctx->base.running = FALSE;
+
+   ctx->base.pipe->set_active_query_state(ctx->base.pipe, true);
 }
 
 static void blitter_check_saved_vertex_states(struct blitter_context_priv *ctx)
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index e92f83a8109..b4ac0db3c50 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -792,6 +792,19 @@ align(int value, int alignment)
    return (value + alignment - 1) & ~(alignment - 1);
 }
 
+/**
+ * Align a 64-bit value up to a multiple of alignment.
+ *
+ * The mask is widened to 64 bits before it is inverted; inverting it as a
+ * 32-bit unsigned would zero-extend and clear the upper half of the result.
+ * NOTE(review): like align(), this assumes alignment is a power of two.
+ */
+static inline uint64_t
+align64(uint64_t value, unsigned alignment)
+{
+   return (value + alignment - 1) & ~((uint64_t)alignment - 1);
+}
+
 /**
  * Works like align but on npot alignments.
  */
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 904e1ff04e7..3a45f402cd8 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -330,6 +330,9 @@ a resource without synchronizing with the CPU. This write will optionally
 wait for the query to complete, and will optionally write whether the value
 is available instead of the value itself.
 
+``set_active_query_state`` Set whether all current non-driver queries except
+TIME_ELAPSED are active or paused.
+
 The interface currently includes the following types of queries:
 
 ``PIPE_QUERY_OCCLUSION_COUNTER`` counts the number of fragments which
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 824f580ed44..94510757254 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -331,6 +331,11 @@ The integer capabilities:
   primitive on a layer is obtained from ``PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS``
   even though it can be larger than the number of layers supported by either
   rendering or textures.
+* ``PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR``: Implementation uses bounds
+  checking on resource accesses by shaders if the context is created with
+  ``PIPE_CONTEXT_ROBUST_BUFFER_ACCESS``. See the
+  ARB_robust_buffer_access_behavior extension for information on the required
+  behavior for out-of-bounds accesses and accesses to unbound resources.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index ac6052a244a..85c302f0dc3 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2710,7 +2710,7 @@ TGSI_SEMANTIC_COLOR
 """""""""""""""""""
 
 For vertex shader outputs or fragment shader inputs/outputs, this
-label indicates that the resister contains an R,G,B,A color.
+label indicates that the register contains an R,G,B,A color.
 
 Several shader inputs/outputs may contain colors so the semantic index
 is used to distinguish them.  For example, color[0] may be the diffuse
diff --git a/src/gallium/drivers/ddebug/dd_context.c b/src/gallium/drivers/ddebug/dd_context.c
index 9dfaa0af289..72a950a456a 100644
--- a/src/gallium/drivers/ddebug/dd_context.c
+++ b/src/gallium/drivers/ddebug/dd_context.c
@@ -124,6 +124,14 @@ dd_context_get_query_result(struct pipe_context *_pipe,
 }
 
 static void
+dd_context_set_active_query_state(struct pipe_context *_pipe, boolean enable)
+{
+   struct pipe_context *pipe = dd_context(_pipe)->pipe;
+
+   pipe->set_active_query_state(pipe, enable);
+}
+
+static void
 dd_context_render_condition(struct pipe_context *_pipe,
                             struct pipe_query *query, boolean condition,
                             uint mode)
@@ -667,6 +675,7 @@ dd_context_create(struct dd_screen *dscreen, struct pipe_context *pipe)
    CTX_INIT(begin_query);
    CTX_INIT(end_query);
    CTX_INIT(get_query_result);
+   CTX_INIT(set_active_query_state);
    CTX_INIT(create_blend_state);
    CTX_INIT(bind_blend_state);
    CTX_INIT(delete_blend_state);
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index c34f9441c7b..e874d223187 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -157,7 +157,24 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 	emit.dirty = dirty;
 	emit.vp = NULL;   /* we changed key so need to refetch vp */
 	emit.fp = NULL;
+
+	if (ctx->rasterizer->rasterizer_discard) {
+		fd_wfi(ctx, ctx->ring);
+		OUT_PKT3(ctx->ring, CP_REG_RMW, 3);
+		OUT_RING(ctx->ring, REG_A4XX_RB_RENDER_CONTROL);
+		OUT_RING(ctx->ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE);
+		OUT_RING(ctx->ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE);
+	}
+
 	draw_impl(ctx, ctx->ring, &emit);
+
+	if (ctx->rasterizer->rasterizer_discard) {
+		fd_wfi(ctx, ctx->ring);
+		OUT_PKT3(ctx->ring, CP_REG_RMW, 3);
+		OUT_RING(ctx->ring, REG_A4XX_RB_RENDER_CONTROL);
+		OUT_RING(ctx->ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE);
+		OUT_RING(ctx->ring, 0);
+	}
 }
 
 /* clear operations ignore viewport state, so we need to reset it
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
index 77e203f6c56..69decbcb251 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c
@@ -82,12 +82,7 @@ static uint64_t
 count_samples(const struct fd_rb_samp_ctrs *start,
 		const struct fd_rb_samp_ctrs *end)
 {
-	uint64_t n = 0;
-
-	for (unsigned i = 0; i < 16; i += 4)
-		n += end->ctr[i] - start->ctr[i];
-
-	return n / 2;
+	return end->ctr[0] - start->ctr[0];
 }
 
 static void
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 85ce97c16b7..86992960960 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -241,6 +241,7 @@ struct fd_context {
 	 */
 	struct {
 		uint64_t prims_emitted;
+		uint64_t prims_generated;
 		uint64_t draw_calls;
 		uint64_t batch_total, batch_sysmem, batch_gmem, batch_restore;
 	} stats;
diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c
index bf803cc77bc..66bb1163df2 100644
--- a/src/gallium/drivers/freedreno/freedreno_draw.c
+++ b/src/gallium/drivers/freedreno/freedreno_draw.c
@@ -174,7 +174,16 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	prims = u_reduced_prims_for_vertices(info->mode, info->count);
 
 	ctx->stats.draw_calls++;
-	ctx->stats.prims_emitted += prims;
+
+	/* TODO: prims_emitted should be clipped when the stream-out buffer
+	 * is not large enough.  See max_tf_vtx(); that probably needs to
+	 * move into common code.  This is a bit more annoying because a2xx
+	 * doesn't use ir3, so there is no common way to get at the
+	 * pipe_stream_output_info which is needed for this calculation.
+	 */
+	if (ctx->streamout.num_targets > 0)
+		ctx->stats.prims_emitted += prims;
+	ctx->stats.prims_generated += prims;
 
 	/* any buffers that haven't been cleared yet, we need to restore: */
 	ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared);
@@ -189,7 +198,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 	ctx->draw_vbo(ctx, info);
 
 	for (i = 0; i < ctx->streamout.num_targets; i++)
-		ctx->streamout.offsets[i] += prims;
+		ctx->streamout.offsets[i] += info->count;
 
 	if (fd_mesa_debug & FD_DBG_DDRAW)
 		ctx->dirty = 0xffffffff;
diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c
index b87e8250719..a9427058579 100644
--- a/src/gallium/drivers/freedreno/freedreno_query.c
+++ b/src/gallium/drivers/freedreno/freedreno_query.c
@@ -114,6 +114,11 @@ fd_get_driver_query_info(struct pipe_screen *pscreen,
 	return 1;
 }
 
+static void
+fd_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void
 fd_query_screen_init(struct pipe_screen *pscreen)
 {
@@ -128,5 +133,6 @@ fd_query_context_init(struct pipe_context *pctx)
 	pctx->begin_query = fd_begin_query;
 	pctx->end_query = fd_end_query;
 	pctx->get_query_result = fd_get_query_result;
+	pctx->set_active_query_state = fd_set_active_query_state;
 	pctx->render_condition = fd_render_condition;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.c b/src/gallium/drivers/freedreno/freedreno_query_sw.c
index 514df145fa8..4af6a125e03 100644
--- a/src/gallium/drivers/freedreno/freedreno_query_sw.c
+++ b/src/gallium/drivers/freedreno/freedreno_query_sw.c
@@ -54,7 +54,7 @@ read_counter(struct fd_context *ctx, int type)
 {
 	switch (type) {
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
-		/* for now same thing as _PRIMITIVES_EMITTED */
+		return ctx->stats.prims_generated;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 		return ctx->stats.prims_emitted;
 	case FD_QUERY_DRAW_CALLS:
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 707be17513b..05100186495 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -71,6 +71,7 @@ static const struct debug_named_value debug_options[] = {
 		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"},
 		{"shaderdb",  FD_DBG_SHADERDB, "Enable shaderdb output"},
 		{"flush",     FD_DBG_FLUSH,  "Force flush after every draw"},
+		{"deqp",      FD_DBG_DEQP,   "Enable dEQP hacks"},
 		DEBUG_NAMED_VALUE_END
 };
 
@@ -256,6 +257,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
 	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+	case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -352,6 +354,16 @@ fd_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
 	switch (param) {
 	case PIPE_CAPF_MAX_LINE_WIDTH:
 	case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+		/* NOTE: the actual value is 127.0f, but 63.0f works around a dEQP
+		 * bug: dEQP-GLES3.functional.rasterization.primitives.lines_wide
+		 * uses too small a render target, and gets confused when the
+		 * lines start going offscreen.
+		 *
+		 * See: https://code.google.com/p/android/issues/detail?id=206513
+		 */
+		if (fd_mesa_debug & FD_DBG_DEQP)
+			return 63.0f;
+		return 127.0f;
 	case PIPE_CAPF_MAX_POINT_WIDTH:
 	case PIPE_CAPF_MAX_POINT_WIDTH_AA:
 		return 4092.0f;
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 685d3a75659..6c472d19815 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -359,7 +359,8 @@ fd_set_stream_output_targets(struct pipe_context *pctx,
 		if (!changed && append)
 			continue;
 
-		so->offsets[i] = 0;
+		if (!append)
+			so->offsets[i] = offsets[i];
 
 		pipe_so_target_reference(&so->targets[i], targets[i]);
 	}
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 47dd467f498..85dac982314 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -73,6 +73,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_GLSL120  0x0400
 #define FD_DBG_SHADERDB 0x0800
 #define FD_DBG_FLUSH    0x1000
+#define FD_DBG_DEQP     0x2000
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index 3859f6a39f3..f68275e568c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -308,8 +308,14 @@ struct ir3_instruction {
 static inline struct ir3_instruction *
 ir3_neighbor_first(struct ir3_instruction *instr)
 {
-	while (instr->cp.left)
+	int cnt = 0;
+	while (instr->cp.left) {
 		instr = instr->cp.left;
+		if (++cnt > 0xffff) {
+			debug_assert(0);
+			break;
+		}
+	}
 	return instr;
 }
 
@@ -322,6 +328,10 @@ static inline int ir3_neighbor_count(struct ir3_instruction *instr)
 	while (instr->cp.right) {
 		num++;
 		instr = instr->cp.right;
+		if (num > 0xffff) {
+			debug_assert(0);
+			break;
+		}
 	}
 
 	return num;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 245b61f31e5..940ca7744a2 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -103,6 +103,11 @@ struct ir3_compile {
 	 */
 	bool unminify_coords;
 
+	/* on a4xx, for array textures we need to add 0.5 to the array
+	 * index coordinate:
+	 */
+	bool array_index_add_half;
+
 	/* for looking up which system value is which */
 	unsigned sysval_semantics[8];
 
@@ -128,11 +133,13 @@ compile_init(struct ir3_compiler *compiler,
 		ctx->flat_bypass = true;
 		ctx->levels_add_one = false;
 		ctx->unminify_coords = false;
+		ctx->array_index_add_half = true;
 	} else {
 		/* no special handling for "flat" */
 		ctx->flat_bypass = false;
 		ctx->levels_add_one = true;
 		ctx->unminify_coords = true;
+		ctx->array_index_add_half = false;
 	}
 
 	ctx->compiler = compiler;
@@ -1447,9 +1454,8 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	}
 
-	/* the array coord for cube arrays needs 0.5 added to it */
-	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array &&
-		opc != OPC_ISAML)
-		coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0);
+	/* if array_index_add_half, array textures need 0.5 added to the array coord */
+	if (ctx->array_index_add_half && tex->is_array && (opc != OPC_ISAML))
+		coord[coords] = ir3_ADD_F(b, coord[coords], 0, create_immed(b, fui(0.5)), 0);
 
 	/*
 	 * lay out the first argument in the proper order:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index 6037becf22f..e8a2f099391 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -442,6 +442,37 @@ instr_cp(struct ir3_instruction *instr)
 		instr_cp(instr->address);
 		ir3_instr_set_address(instr, eliminate_output_mov(instr->address));
 	}
+
+	/* we can end up with extra cmps.s from frontend, which uses a
+	 *
+	 *    cmps.s p0.x, cond, 0
+	 *
+	 * as a way to mov into the predicate register.  But frequently 'cond'
+	 * is itself a cmps.s/cmps.f/cmps.u.  So detect this special case and
+	 * just re-write the instruction writing predicate register to get rid
+	 * of the double cmps.
+	 */
+	if ((instr->opc == OPC_CMPS_S) &&
+			(instr->regs[0]->num == regid(REG_P0, 0)) &&
+			ssa(instr->regs[1]) &&
+			(instr->regs[2]->flags & IR3_REG_IMMED) &&
+			(instr->regs[2]->iim_val == 0)) {
+		struct ir3_instruction *cond = ssa(instr->regs[1]);
+		switch (cond->opc) {
+		case OPC_CMPS_S:
+		case OPC_CMPS_F:
+		case OPC_CMPS_U:
+			instr->opc   = cond->opc;
+			instr->flags = cond->flags;
+			instr->cat2  = cond->cat2;
+			instr->address = cond->address;
+			instr->regs[1] = cond->regs[1];
+			instr->regs[2] = cond->regs[2];
+			break;
+		default:
+			break;
+		}
+	}
 }
 
 void
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 77cd0e622f0..7a49f4c371c 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -183,7 +183,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 			ctx->has_samp = true;
 			regmask_set(&needs_sy, n->regs[0]);
 		} else if (is_load(n)) {
-			regmask_set(&needs_sy, n->regs[0]);
+			/* ldlv seems to need the (ss) bit rather than (sy), which is odd
+			 * but makes a bunch of flat-varying tests start working on a4xx.
+			 */
+			if (n->opc == OPC_LDLV)
+				regmask_set(&needs_ss, n->regs[0]);
+			else
+				regmask_set(&needs_sy, n->regs[0]);
 		}
 
 		/* both tex/sfu appear to not always immediately consume
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index 73c65d6ad27..897b3b963be 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -45,6 +45,7 @@ ir3_tgsi_to_nir(const struct tgsi_token *tokens)
 			.lower_flrp = true,
 			.lower_ffract = true,
 			.native_integers = true,
+			.vertex_id_zero_based = true,
 			.lower_extract_byte = true,
 			.lower_extract_word = true,
 	};
@@ -141,7 +142,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
 
 	} while (progress);
 
-	OPT_V(s, nir_remove_dead_variables);
+	OPT_V(s, nir_remove_dead_variables, nir_var_local);
 
 	if (fd_mesa_debug & FD_DBG_DISASM) {
 		debug_printf("----------------------\n");
diff --git a/src/gallium/drivers/i915/i915_query.c b/src/gallium/drivers/i915/i915_query.c
index 78d67cea2c9..fa1b01d1804 100644
--- a/src/gallium/drivers/i915/i915_query.c
+++ b/src/gallium/drivers/i915/i915_query.c
@@ -76,6 +76,11 @@ static boolean i915_get_query_result(struct pipe_context *ctx,
    return TRUE;
 }
 
+static void
+i915_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void
 i915_init_query_functions(struct i915_context *i915)
 {
@@ -84,5 +89,6 @@ i915_init_query_functions(struct i915_context *i915)
    i915->base.begin_query = i915_begin_query;
    i915->base.end_query = i915_end_query;
    i915->base.get_query_result = i915_get_query_result;
+   i915->base.set_active_query_state = i915_set_active_query_state;
 }
 
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 68e32e51c34..9b6a6604965 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -270,6 +270,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_query.c b/src/gallium/drivers/ilo/ilo_query.c
index 106bd42a335..8a42f58a87f 100644
--- a/src/gallium/drivers/ilo/ilo_query.c
+++ b/src/gallium/drivers/ilo/ilo_query.c
@@ -222,6 +222,11 @@ ilo_get_query_result(struct pipe_context *pipe, struct pipe_query *query,
    return true;
 }
 
+static void
+ilo_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 /**
  * Initialize query-related functions.
  */
@@ -233,4 +238,5 @@ ilo_init_query_functions(struct ilo_context *ilo)
    ilo->base.begin_query = ilo_begin_query;
    ilo->base.end_query = ilo_end_query;
    ilo->base.get_query_result = ilo_get_query_result;
+   ilo->base.set_active_query_state = ilo_set_active_query_state;
 }
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 142d6f1fa21..538f817e242 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -499,6 +499,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index fc593670671..2fddc90503f 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -320,6 +320,11 @@ llvmpipe_check_render_cond(struct llvmpipe_context *lp)
       return TRUE;
 }
 
+static void
+llvmpipe_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe )
 {
    llvmpipe->pipe.create_query = llvmpipe_create_query;
@@ -327,6 +332,7 @@ void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe )
    llvmpipe->pipe.begin_query = llvmpipe_begin_query;
    llvmpipe->pipe.end_query = llvmpipe_end_query;
    llvmpipe->pipe.get_query_result = llvmpipe_get_query_result;
+   llvmpipe->pipe.set_active_query_state = llvmpipe_set_active_query_state;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 6a5f906adc6..cb681bac939 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -320,6 +320,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c
index a0f2db780bb..ba831f37c05 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c
@@ -218,6 +218,7 @@ const float round_values[] = {
       -10.0, -1, 0.0, 12.0,
       -1.49, -0.25, 1.25, 2.51,
       -0.99, -0.01, 0.01, 0.99,
+      -1.5, -0.5, 0.5, 1.5,
       1.401298464324817e-45f, // smallest denormal
       -1.401298464324817e-45f,
       1.62981451e-08f,
@@ -283,7 +284,7 @@ unary_tests[] = {
    {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 },
    {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 },
    {"sgn", &lp_build_sgn, &sgnf, exp2_values, Elements(exp2_values), 20.0 },
-   {"round", &lp_build_round, &roundf, round_values, Elements(round_values), 24.0 },
+   {"round", &lp_build_round, &nearbyintf, round_values, Elements(round_values), 24.0 },
    {"trunc", &lp_build_trunc, &truncf, round_values, Elements(round_values), 24.0 },
    {"floor", &lp_build_floor, &floorf, round_values, Elements(round_values), 24.0 },
    {"ceil", &lp_build_ceil, &ceilf, round_values, Elements(round_values), 24.0 },
diff --git a/src/gallium/drivers/noop/noop_pipe.c b/src/gallium/drivers/noop/noop_pipe.c
index fd0a5d0f830..55aca74628e 100644
--- a/src/gallium/drivers/noop/noop_pipe.c
+++ b/src/gallium/drivers/noop/noop_pipe.c
@@ -78,6 +78,11 @@ static boolean noop_get_query_result(struct pipe_context *ctx,
 	return TRUE;
 }
 
+static void
+noop_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 
 /*
  * resource
@@ -284,6 +289,7 @@ static struct pipe_context *noop_create_context(struct pipe_screen *screen,
 	ctx->begin_query = noop_begin_query;
 	ctx->end_query = noop_end_query;
 	ctx->get_query_result = noop_get_query_result;
+	ctx->set_active_query_state = noop_set_active_query_state;
 	ctx->transfer_map = noop_transfer_map;
 	ctx->transfer_flush_region = noop_transfer_flush_region;
 	ctx->transfer_unmap = noop_transfer_unmap;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 500ab8915de..1b595aec364 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1327,7 +1327,11 @@ GCRA::simplify()
                bestScore = score;
             }
          }
+#if __cplusplus >= 201103L
+         if (std::isinf(bestScore)) {
+#else
          if (isinf(bestScore)) {
+#endif
             ERROR("no viable spill candidates left\n");
             break;
          }
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 1695553d793..ba43a614b90 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -843,6 +843,39 @@ nouveau_user_buffer_upload(struct nouveau_context *nv,
    return true;
 }
 
+/* Invalidate underlying buffer storage, reset fences, reallocate to non-busy
+ * buffer.
+ */
+void
+nouveau_buffer_invalidate(struct pipe_context *pipe,
+                          struct pipe_resource *resource)
+{
+   struct nouveau_context *nv = nouveau_context(pipe);
+   struct nv04_resource *buf = nv04_resource(resource);
+   int ref = buf->base.reference.count - 1;
+
+   /* Shared buffers shouldn't get reallocated */
+   if (unlikely(buf->base.bind & PIPE_BIND_SHARED))
+      return;
+
+   /* We can't touch persistent/coherent buffers */
+   if (buf->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
+                          PIPE_RESOURCE_FLAG_MAP_COHERENT))
+      return;
+
+   /* If the buffer is sub-allocated and not currently being written, just
+    * wipe the valid buffer range. Otherwise we have to create fresh
+    * storage. (We don't keep track of fences for non-sub-allocated BOs.)
+    */
+   if (buf->mm && !nouveau_buffer_busy(buf, PIPE_TRANSFER_WRITE)) {
+      util_range_set_empty(&buf->valid_buffer_range);
+   } else {
+      nouveau_buffer_reallocate(nv->screen, buf, buf->domain);
+      if (ref > 0) /* any references inside context possible ? */
+         nv->invalidate_resource_storage(nv, &buf->base, ref);
+   }
+}
+
 
 /* Scratch data allocation. */
 
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h
index d45bf7aebcf..3a33fae9ce2 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.h
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.h
@@ -99,6 +99,10 @@ bool
 nouveau_user_buffer_upload(struct nouveau_context *, struct nv04_resource *,
                            unsigned base, unsigned size);
 
+void
+nouveau_buffer_invalidate(struct pipe_context *pipe,
+                          struct pipe_resource *resource);
+
 /* Copy data to a scratch buffer and return address & bo the data resides in.
  * Returns 0 on failure.
  */
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c
index 75a4b0446fe..cb53a3663e5 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_query.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c
@@ -263,6 +263,11 @@ nv40_query_render_condition(struct pipe_context *pipe,
    PUSH_DATA (push, 0x02000000 | q->qo[1]->hw->start);
 }
 
+static void
+nv30_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void
 nv30_query_init(struct pipe_context *pipe)
 {
@@ -273,6 +278,7 @@ nv30_query_init(struct pipe_context *pipe)
    pipe->begin_query = nv30_query_begin;
    pipe->end_query = nv30_query_end;
    pipe->get_query_result = nv30_query_result;
+   pipe->set_active_query_state = nv30_set_active_query_state;
    if (eng3d->oclass >= NV40_3D_CLASS)
       pipe->render_condition = nv40_query_render_condition;
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index db7c2d15fb1..400e9f5c04d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -193,6 +193,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -324,6 +325,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
       case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
       case PIPE_SHADER_CAP_SUBROUTINES:
+      case PIPE_SHADER_CAP_INTEGERS:
       case PIPE_SHADER_CAP_DOUBLES:
       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 61a52c4b366..5af0e9b3a27 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -93,6 +93,30 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags)
    }
 }
 
+static void
+nv50_emit_string_marker(struct pipe_context *pipe, const char *str, int len)
+{
+   struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+   int string_words = len / 4;
+   int data_words;
+
+   if (len <= 0)
+      return;
+   string_words = MIN2(string_words, NV04_PFIFO_MAX_PACKET_LEN);
+   if (string_words == NV04_PFIFO_MAX_PACKET_LEN)
+      data_words = string_words;
+   else
+      data_words = string_words + !!(len & 3);
+   BEGIN_NI04(push, SUBC_3D(NV04_GRAPH_NOP), data_words);
+   if (string_words)
+      PUSH_DATAp(push, str, string_words);
+   if (string_words != data_words) {
+      int data = 0;
+      memcpy(&data, &str[string_words * 4], len & 3);
+      PUSH_DATA (push, data);
+   }
+}
+
 void
 nv50_default_kick_notify(struct nouveau_pushbuf *push)
 {
@@ -309,6 +333,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    pipe->texture_barrier = nv50_texture_barrier;
    pipe->memory_barrier = nv50_memory_barrier;
    pipe->get_sample_position = nv50_context_get_sample_position;
+   pipe->emit_string_marker = nv50_emit_string_marker;
 
    if (!screen->cur_ctx) {
       /* Restore the last context's state here, normally handled during
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index 4cd3b615606..fa70fb6950e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -143,6 +143,11 @@ nv50_render_condition(struct pipe_context *pipe,
    PUSH_DATA (push, hq->bo->offset + hq->offset);
 }
 
+static void
+nv50_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void
 nv50_init_query_functions(struct nv50_context *nv50)
 {
@@ -153,6 +158,7 @@ nv50_init_query_functions(struct nv50_context *nv50)
    pipe->begin_query = nv50_begin_query;
    pipe->end_query = nv50_end_query;
    pipe->get_query_result = nv50_get_query_result;
+   pipe->set_active_query_state = nv50_set_active_query_state;
    pipe->render_condition = nv50_render_condition;
    nv50->cond_condmode = NV50_3D_COND_MODE_ALWAYS;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.c b/src/gallium/drivers/nouveau/nv50/nv50_resource.c
index ad5f3b814db..b090a30aed6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.c
@@ -85,6 +85,13 @@ nv50_surface_destroy(struct pipe_context *pipe, struct pipe_surface *ps)
 }
 
 void
+nv50_invalidate_resource(struct pipe_context *pipe, struct pipe_resource *res)
+{
+   if (res->target == PIPE_BUFFER)
+      nouveau_buffer_invalidate(pipe, res);
+}
+
+void
 nv50_init_resource_functions(struct pipe_context *pcontext)
 {
    pcontext->transfer_map = u_transfer_map_vtbl;
@@ -93,6 +100,7 @@ nv50_init_resource_functions(struct pipe_context *pcontext)
    pcontext->transfer_inline_write = u_transfer_inline_write_vtbl;
    pcontext->create_surface = nv50_surface_create;
    pcontext->surface_destroy = nv50_surface_destroy;
+   pcontext->invalidate_resource = nv50_invalidate_resource;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index b40370a1d78..5d03925b0d0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -152,6 +152,9 @@ void
 nv50_surface_destroy(struct pipe_context *, struct pipe_surface *);
 
 void
+nv50_invalidate_resource(struct pipe_context *, struct pipe_resource *);
+
+void
 nv50_clear_texture(struct pipe_context *pipe,
                    struct pipe_resource *res,
                    unsigned level,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 20fb61b51f4..ef114e50529 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -193,6 +193,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_CLEAR_TEXTURE:
    case PIPE_CAP_COMPUTE:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_STRING_MARKER:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -234,9 +236,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
-   case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
-   case PIPE_CAP_STRING_MARKER:
    case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
    case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
    case PIPE_CAP_QUERY_BUFFER_OBJECT:
@@ -246,6 +246,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 007cccfd10b..fcb8289beda 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -97,6 +97,30 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags)
 }
 
 static void
+nvc0_emit_string_marker(struct pipe_context *pipe, const char *str, int len)
+{
+   struct nouveau_pushbuf *push = nvc0_context(pipe)->base.pushbuf;
+   int string_words = len / 4;
+   int data_words;
+
+   if (len <= 0)
+      return;
+   string_words = MIN2(string_words, NV04_PFIFO_MAX_PACKET_LEN);
+   if (string_words == NV04_PFIFO_MAX_PACKET_LEN)
+      data_words = string_words;
+   else
+      data_words = string_words + !!(len & 3);
+   BEGIN_NIC0(push, SUBC_3D(NV04_GRAPH_NOP), data_words);
+   if (string_words)
+      PUSH_DATAp(push, str, string_words);
+   if (string_words != data_words) {
+      int data = 0;
+      memcpy(&data, &str[string_words * 4], len & 3);
+      PUSH_DATA (push, data);
+   }
+}
+
+static void
 nvc0_context_unreference_resources(struct nvc0_context *nvc0)
 {
    unsigned s, i;
@@ -333,6 +357,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    pipe->texture_barrier = nvc0_texture_barrier;
    pipe->memory_barrier = nvc0_memory_barrier;
    pipe->get_sample_position = nvc0_context_get_sample_position;
+   pipe->emit_string_marker = nvc0_emit_string_marker;
 
    nouveau_context_init(&nvc0->base);
    nvc0_init_query_functions(nvc0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index db02fa2df5c..d3024f9fa06 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -456,6 +456,13 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info)
          fp->hdr[18] |= 0xf << info->out[i].slot[0];
    }
 
+   /* There are no "regular" attachments, but the shader still needs to be
+    * executed. It seems like it wants to think that it has some color
+    * outputs in order to actually run.
+    */
+   if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth)
+      fp->hdr[18] |= 0xf;
+
    fp->fp.early_z = info->prop.fp.earlyFragTests;
 
    return 0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 92ca613cda1..b34271c4911 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -254,6 +254,11 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    return 0;
 }
 
+static void
+nvc0_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void
 nvc0_init_query_functions(struct nvc0_context *nvc0)
 {
@@ -265,6 +270,7 @@ nvc0_init_query_functions(struct nvc0_context *nvc0)
    pipe->end_query = nvc0_end_query;
    pipe->get_query_result = nvc0_get_query_result;
    pipe->get_query_result_resource = nvc0_get_query_result_resource;
+   pipe->set_active_query_state = nvc0_set_active_query_state;
    pipe->render_condition = nvc0_render_condition;
    nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
index c034d0fd011..0aee5890fd8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c
@@ -52,6 +52,7 @@ nvc0_init_resource_functions(struct pipe_context *pcontext)
    pcontext->transfer_inline_write = u_transfer_inline_write_vtbl;
    pcontext->create_surface = nvc0_surface_create;
    pcontext->surface_destroy = nv50_surface_destroy;
+   pcontext->invalidate_resource = nv50_invalidate_resource;
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index c41912a6037..9a34007c6e5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -52,6 +52,12 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
    if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */
       return false;
 
+   /* Short-circuit the rest of the logic -- this is used by the state tracker
+    * to determine valid MS levels in a no-attachments scenario.
+    */
+   if (format == PIPE_FORMAT_NONE && bindings & PIPE_BIND_RENDER_TARGET)
+      return true;
+
    if (!util_format_is_supported(format, bindings))
       return false;
 
@@ -216,6 +222,9 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
    case PIPE_CAP_QUERY_BUFFER_OBJECT:
+   case PIPE_CAP_INVALIDATE_BUFFER:
+   case PIPE_CAP_STRING_MARKER:
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -241,9 +250,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
    case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
-   case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
-   case PIPE_CAP_STRING_MARKER:
    case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
    case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
    case PIPE_CAP_QUERY_MEMORY_INFO:
@@ -251,7 +258,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
-   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 9c64482f2e2..d0d9315dd2b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -56,15 +56,18 @@ nvc0_validate_zcull(struct nvc0_context *nvc0)
 #endif
 
 static inline void
-nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i)
+nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i, unsigned layers)
 {
-   BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6);
-   PUSH_DATA (push, 0);
-   PUSH_DATA (push, 0);
-   PUSH_DATA (push, 64);
-   PUSH_DATA (push, 0);
+   BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 9); /* 9 data words */
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
+   PUSH_DATA (push, 64);     /* width */
+   PUSH_DATA (push, 0);      /* height */
+   PUSH_DATA (push, 0);      /* format */
+   PUSH_DATA (push, 0);      /* tile mode */
+   PUSH_DATA (push, layers); /* layers */
+   PUSH_DATA (push, 0);      /* layer stride */
+   PUSH_DATA (push, 0);      /* base layer */
 }
 
 static void
@@ -75,12 +78,11 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
     struct nvc0_screen *screen = nvc0->screen;
     unsigned i, ms;
     unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
+    unsigned nr_cbufs = fb->nr_cbufs;
     bool serialize = false;
 
     nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB);
 
-    BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
-    PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
     BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2);
     PUSH_DATA (push, fb->width << 16);
     PUSH_DATA (push, fb->height << 16);
@@ -91,7 +93,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
         struct nouveau_bo *bo;
 
         if (!fb->cbufs[i]) {
-           nvc0_fb_set_null_rt(push, i);
+           nvc0_fb_set_null_rt(push, i, 0);
            continue;
         }
 
@@ -179,6 +181,19 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
         PUSH_DATA (push, 0);
     }
 
+    if (nr_cbufs == 0 && !fb->zsbuf) {
+       assert(util_is_power_of_two(fb->samples));
+       assert(fb->samples <= 8);
+       /* No cbufs and no zsbuf: bind a null RT 0 carrying fb->layers. */
+       nvc0_fb_set_null_rt(push, 0, fb->layers);
+       /* samples is a power of two, so ffs() - 1 yields its log2. */
+       if (fb->samples > 1)
+          ms_mode = ffs(fb->samples) - 1;
+       nr_cbufs = 1; /* reported via RT_CONTROL below */
+    }
+
+    BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
+    PUSH_DATA (push, (076543210 << 4) | nr_cbufs);
     IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), ms_mode);
 
     ms = 1 << ms_mode;
@@ -592,8 +607,9 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 
    if (nvc0->zsa && nvc0->zsa->pipe.alpha.enabled &&
+       nvc0->framebuffer.zsbuf &&
        nvc0->framebuffer.nr_cbufs == 0) {
-      nvc0_fb_set_null_rt(push, 0);
+      nvc0_fb_set_null_rt(push, 0, 0);
       BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
       PUSH_DATA (push, (076543210 << 4) | 1);
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index e657204128e..e108590e215 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1043,6 +1043,8 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx)
 
    ctx->saved.fb.width = nvc0->framebuffer.width;
    ctx->saved.fb.height = nvc0->framebuffer.height;
+   ctx->saved.fb.samples = nvc0->framebuffer.samples;
+   ctx->saved.fb.layers = nvc0->framebuffer.layers;
    ctx->saved.fb.nr_cbufs = nvc0->framebuffer.nr_cbufs;
    ctx->saved.fb.cbufs[0] = nvc0->framebuffer.cbufs[0];
    ctx->saved.fb.zsbuf = nvc0->framebuffer.zsbuf;
@@ -1110,6 +1112,8 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
 
    nvc0->framebuffer.width = blit->saved.fb.width;
    nvc0->framebuffer.height = blit->saved.fb.height;
+   nvc0->framebuffer.samples = blit->saved.fb.samples;
+   nvc0->framebuffer.layers = blit->saved.fb.layers;
    nvc0->framebuffer.nr_cbufs = blit->saved.fb.nr_cbufs;
    nvc0->framebuffer.cbufs[0] = blit->saved.fb.cbufs[0];
    nvc0->framebuffer.zsbuf = blit->saved.fb.zsbuf;
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 6414e80828e..7603985b14b 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -200,6 +200,11 @@ static void r300_render_condition(struct pipe_context *pipe,
     }
 }
 
+/* Stub for the context's set_active_query_state hook; does nothing. */
+static void r300_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void r300_init_query_functions(struct r300_context* r300)
 {
     r300->context.create_query = r300_create_query;
@@ -207,5 +212,6 @@ void r300_init_query_functions(struct r300_context* r300)
     r300->context.begin_query = r300_begin_query;
     r300->context.end_query = r300_end_query;
     r300->context.get_query_result = r300_get_query_result;
+    r300->context.set_active_query_state = r300_set_active_query_state;
     r300->context.render_condition = r300_render_condition;
 }
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index b3a7f049e10..eae53e16a54 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -215,6 +215,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_QUERY_BUFFER_OBJECT:
         case PIPE_CAP_QUERY_MEMORY_INFO:
         case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+        case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r300/r300_texture_desc.c b/src/gallium/drivers/r300/r300_texture_desc.c
index 8fa98c5804e..2442d726cd1 100644
--- a/src/gallium/drivers/r300/r300_texture_desc.c
+++ b/src/gallium/drivers/r300/r300_texture_desc.c
@@ -25,6 +25,7 @@
 #include "r300_context.h"
 
 #include "util/u_format.h"
+#include <inttypes.h>
 
 /* Returns the number of pixels that the texture should be aligned to
  * in the given dimension. */
@@ -614,7 +615,7 @@ void r300_texture_desc_init(struct r300_screen *rscreen,
                 "r300: I got a pre-allocated buffer to use it as a texture "
                 "storage, but the buffer is too small. I'll use the buffer "
                 "anyway, because I can't crash here, but it's dangerous. "
-                "This can be a DDX bug. Got: %iB, Need: %iB, Info:\n",
+                "This can be a DDX bug. Got: %"PRIu64"B, Need: %uB, Info:\n",
                 tex->buf->size, tex->tex.size_in_bytes);
             r300_tex_print_info(tex, "texture_desc_init");
             /* Ooops, what now. Apps will break if we fail this,
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 65952676987..2ad9e3eb1ab 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -472,6 +472,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 
 	r600_init_command_buffer(&rs->buffer, 30);
 
+	rs->scissor_enable = state->scissor;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 	rs->two_side = state->light_twoside;
@@ -528,7 +529,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 	r600_store_context_reg(&rs->buffer, R_0286D4_SPI_INTERP_CONTROL_0, spi_interp);
 	r600_store_context_reg(&rs->buffer, R_028A48_PA_SC_MODE_CNTL_0,
 			       S_028A48_MSAA_ENABLE(state->multisample) |
-			       S_028A48_VPORT_SCISSOR_ENABLE(state->scissor) |
+			       S_028A48_VPORT_SCISSOR_ENABLE(1) |
 			       S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable));
 
 	if (rctx->b.chip_class == CAYMAN) {
@@ -560,8 +561,11 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx,
 static void *evergreen_create_sampler_state(struct pipe_context *ctx,
 					const struct pipe_sampler_state *state)
 {
+	struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
 	struct r600_pipe_sampler_state *ss = CALLOC_STRUCT(r600_pipe_sampler_state);
-	unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 2 : 0;
+	unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso
+						       : state->max_anisotropy;
+	unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso);
 
 	if (!ss) {
 		return NULL;
@@ -574,10 +578,10 @@ static void *evergreen_create_sampler_state(struct pipe_context *ctx,
 		S_03C000_CLAMP_X(r600_tex_wrap(state->wrap_s)) |
 		S_03C000_CLAMP_Y(r600_tex_wrap(state->wrap_t)) |
 		S_03C000_CLAMP_Z(r600_tex_wrap(state->wrap_r)) |
-		S_03C000_XY_MAG_FILTER(r600_tex_filter(state->mag_img_filter) | aniso_flag_offset) |
-		S_03C000_XY_MIN_FILTER(r600_tex_filter(state->min_img_filter) | aniso_flag_offset) |
+		S_03C000_XY_MAG_FILTER(eg_tex_filter(state->mag_img_filter, max_aniso)) |
+		S_03C000_XY_MIN_FILTER(eg_tex_filter(state->min_img_filter, max_aniso)) |
 		S_03C000_MIP_FILTER(r600_tex_mipfilter(state->min_mip_filter)) |
-		S_03C000_MAX_ANISO(r600_tex_aniso_filter(state->max_anisotropy)) |
+		S_03C000_MAX_ANISO_RATIO(max_aniso_ratio) |
 		S_03C000_DEPTH_COMPARE_FUNCTION(r600_tex_compare(state->compare_func)) |
 		S_03C000_BORDER_COLOR_TYPE(ss->border_color_use ? V_03C000_SQ_TEX_BORDER_COLOR_REGISTER : 0);
 	/* R_03C004_SQ_TEX_SAMPLER_WORD1_0 */
@@ -849,10 +853,12 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 		view->tex_resource_words[5] |= S_030014_LAST_LEVEL(log_samples);
 		view->tex_resource_words[6] |= S_030018_FMASK_BANK_HEIGHT(fmask_bankh);
 	} else {
+		bool no_mip = first_level == last_level;
+
 		view->tex_resource_words[4] |= S_030010_BASE_LEVEL(first_level);
 		view->tex_resource_words[5] |= S_030014_LAST_LEVEL(last_level);
 		/* aniso max 16 samples */
-		view->tex_resource_words[6] |= S_030018_MAX_ANISO(4);
+		view->tex_resource_words[6] |= S_030018_MAX_ANISO_RATIO(no_mip ? 0 : 4);
 	}
 
 	view->tex_resource_words[7] = S_03001C_DATA_FORMAT(format) |
@@ -919,60 +925,12 @@ static void evergreen_get_scissor_rect(struct r600_context *rctx,
 				       unsigned tl_x, unsigned tl_y, unsigned br_x, unsigned br_y,
 				       uint32_t *tl, uint32_t *br)
 {
-	/* EG hw workaround */
-	if (br_x == 0)
-		tl_x = 1;
-	if (br_y == 0)
-		tl_y = 1;
+	struct pipe_scissor_state scissor = {tl_x, tl_y, br_x, br_y};
 
-	/* cayman hw workaround */
-	if (rctx->b.chip_class == CAYMAN) {
-		if (br_x == 1 && br_y == 1)
-			br_x = 2;
-	}
+	evergreen_apply_scissor_bug_workaround(&rctx->b, &scissor);
 
-	*tl = S_028240_TL_X(tl_x) | S_028240_TL_Y(tl_y);
-	*br = S_028244_BR_X(br_x) | S_028244_BR_Y(br_y);
-}
-
-static void evergreen_set_scissor_states(struct pipe_context *ctx,
-                                         unsigned start_slot,
-                                         unsigned num_scissors,
-					const struct pipe_scissor_state *state)
-{
-	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct r600_scissor_state *rstate = &rctx->scissor;
-	int i;
-
-	for (i = start_slot; i < start_slot + num_scissors; i++)
-		rstate->scissor[i] = state[i - start_slot];
-	rstate->dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
-	rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 4;
-	r600_mark_atom_dirty(rctx, &rstate->atom);
-}
-
-static void evergreen_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom)
-{
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-	struct r600_scissor_state *rstate = &rctx->scissor;
-	struct pipe_scissor_state *state;
-	uint32_t dirty_mask;
-	unsigned i, offset;
-	uint32_t tl, br;
-
-	dirty_mask = rstate->dirty_mask;
-	while (dirty_mask != 0) {
-		i = u_bit_scan(&dirty_mask);
-		state = &rstate->scissor[i];
-		evergreen_get_scissor_rect(rctx, state->minx, state->miny, state->maxx, state->maxy, &tl, &br);
-
-		offset = i * 4 * 2;
-		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset, 2);
-		radeon_emit(cs, tl);
-		radeon_emit(cs, br);
-	}
-	rstate->dirty_mask = 0;
-	rstate->atom.num_dw = 0;
+	*tl = S_028240_TL_X(scissor.minx) | S_028240_TL_Y(scissor.miny);
+	*br = S_028244_BR_X(scissor.maxx) | S_028244_BR_Y(scissor.maxy);
 }
 
 /**
@@ -1802,12 +1760,15 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
 		S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
 		S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
 
-	if (a->occlusion_query_enabled) {
+	if (rctx->b.num_occlusion_queries > 0 &&
+	    !a->occlusion_queries_disabled) {
 		db_count_control |= S_028004_PERFECT_ZPASS_COUNTS(1);
 		if (rctx->b.chip_class == CAYMAN) {
 			db_count_control |= S_028004_SAMPLE_RATE(a->log_samples);
 		}
 		db_render_override |= S_02800C_NOOP_CULL_DISABLE(1);
+	} else {
+		db_count_control |= S_028004_ZPASS_INCREMENT_DISABLE(1);
 	}
 
 	/* This is to fix a lockup when hyperz and alpha test are enabled at
@@ -2392,6 +2353,12 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 
+	/* This enables pipeline stat & streamout queries.
+	 * They are only disabled by blits.
+	 */
+	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0));
+
 	cayman_init_common_regs(cb, rctx->b.chip_class,
 				rctx->b.family, rctx->screen->b.info.drm_minor);
 
@@ -2474,12 +2441,6 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_context_reg(cb, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA);
 	r600_store_context_reg(cb, R_028820_PA_CL_NANINF_CNTL, 0);
 
-	r600_store_context_reg_seq(cb, CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
-	r600_store_value(cb, fui(1.0)); /* CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
-	r600_store_value(cb, fui(1.0)); /* CM_R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
-	r600_store_value(cb, fui(1.0)); /* CM_R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
-	r600_store_value(cb, fui(1.0)); /* CM_R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
-
 	r600_store_context_reg_seq(cb, R_028240_PA_SC_GENERIC_SCISSOR_TL, 2);
 	r600_store_value(cb, 0); /* R_028240_PA_SC_GENERIC_SCISSOR_TL */
 	r600_store_value(cb, S_028244_BR_X(16384) | S_028244_BR_Y(16384)); /* R_028244_PA_SC_GENERIC_SCISSOR_BR */
@@ -2645,6 +2606,12 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 
+	/* This enables pipeline stat & streamout queries.
+	 * They are only disabled by blits.
+	 */
+	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0));
+
 	evergreen_init_common_regs(rctx, cb, rctx->b.chip_class,
 				   rctx->b.family, rctx->screen->b.info.drm_minor);
 
@@ -2889,12 +2856,6 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_value(cb, 0); /* R_028AC4_DB_SRESULTS_COMPARE_STATE1 */
 	r600_store_value(cb, 0); /* R_028AC8_DB_PRELOAD_CONTROL */
 
-	r600_store_context_reg_seq(cb, R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4);
-	r600_store_value(cb, fui(1.0)); /* R_028C0C_PA_CL_GB_VERT_CLIP_ADJ */
-	r600_store_value(cb, fui(1.0)); /* R_028C10_PA_CL_GB_VERT_DISC_ADJ */
-	r600_store_value(cb, fui(1.0)); /* R_028C14_PA_CL_GB_HORZ_CLIP_ADJ */
-	r600_store_value(cb, fui(1.0)); /* R_028C18_PA_CL_GB_HORZ_DISC_ADJ */
-
 	r600_store_context_reg_seq(cb, R_028240_PA_SC_GENERIC_SCISSOR_TL, 2);
 	r600_store_value(cb, 0); /* R_028240_PA_SC_GENERIC_SCISSOR_TL */
 	r600_store_value(cb, S_028244_BR_X(16384) | S_028244_BR_Y(16384)); /* R_028244_PA_SC_GENERIC_SCISSOR_BR */
@@ -3696,8 +3657,8 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->dsa_state.atom, id++, r600_emit_cso_state, 0);
 	r600_init_atom(rctx, &rctx->poly_offset_state.atom, id++, evergreen_emit_polygon_offset, 6);
 	r600_init_atom(rctx, &rctx->rasterizer_state.atom, id++, r600_emit_cso_state, 0);
-	r600_init_atom(rctx, &rctx->scissor.atom, id++, evergreen_emit_scissor_state, 0);
-	r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0);
+	r600_add_atom(rctx, &rctx->b.scissors.atom, id++);
+	r600_add_atom(rctx, &rctx->b.viewports.atom, id++);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5);
 	r600_add_atom(rctx, &rctx->b.render_cond_atom, id++);
@@ -3716,7 +3677,6 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	rctx->b.b.set_framebuffer_state = evergreen_set_framebuffer_state;
 	rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple;
 	rctx->b.b.set_min_samples = evergreen_set_min_samples;
-	rctx->b.b.set_scissor_states = evergreen_set_scissor_states;
 	rctx->b.b.set_tess_state = evergreen_set_tess_state;
 	if (rctx->b.chip_class == EVERGREEN)
                 rctx->b.b.get_sample_position = evergreen_get_sample_position;
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index ebe8c4a65ba..ece421e3d33 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -1202,11 +1202,11 @@
 #define   G_030014_LAST_ARRAY(x)                       (((x) >> 17) & 0x1FFF)
 #define   C_030014_LAST_ARRAY                          0xC001FFFF
 #define R_030018_SQ_TEX_RESOURCE_WORD6_0             0x030018
-/* FMASK_BANK_HEIGHT and MAX_ANISO share the first two bits.
+/* FMASK_BANK_HEIGHT and MAX_ANISO_RATIO share the first two bits.
  * The former is only used with MSAA textures. */
-#define   S_030018_MAX_ANISO(x)                        (((x) & 0x7) << 0)
-#define   G_030018_MAX_ANISO(x)                        (((x) >> 0) & 0x7)
-#define   C_030018_MAX_ANISO                           0xFFFFFFF8
+#define   S_030018_MAX_ANISO_RATIO(x)                  (((x) & 0x7) << 0)
+#define   G_030018_MAX_ANISO_RATIO(x)                  (((x) >> 0) & 0x7)
+#define   C_030018_MAX_ANISO_RATIO                     0xFFFFFFF8
 #define   S_030018_FMASK_BANK_HEIGHT(x)                (((x) & 0x3) << 0)
 #define   S_030018_PERF_MODULATION(x)                  (((x) & 0x7) << 3)
 #define   G_030018_PERF_MODULATION(x)                  (((x) >> 3) & 0x7)
@@ -1344,9 +1344,9 @@
 #define   S_03C000_MIP_FILTER(x)                       (((x) & 0x3) << 15)
 #define   G_03C000_MIP_FILTER(x)                       (((x) >> 15) & 0x3)
 #define   C_03C000_MIP_FILTER                          0xFFFE7FFF
-#define   S_03C000_MAX_ANISO(x)                        (((x) & 0x7) << 17)
-#define   G_03C000_MAX_ANISO(x)                        (((x) >> 17) & 0x7)
-#define   C_03C000_MAX_ANISO                           0xFFF1FFFF
+#define   S_03C000_MAX_ANISO_RATIO(x)                  (((x) & 0x7) << 17)
+#define   G_03C000_MAX_ANISO_RATIO(x)                  (((x) >> 17) & 0x7)
+#define   C_03C000_MAX_ANISO_RATIO                     0xFFF1FFFF
 #define   S_03C000_BORDER_COLOR_TYPE(x)                (((x) & 0x3) << 20)
 #define   G_03C000_BORDER_COLOR_TYPE(x)                (((x) >> 20) & 0x3)
 #define   C_03C000_BORDER_COLOR_TYPE                   0xFFCFFFFF
@@ -1735,7 +1735,7 @@
 #define   S_028000_COPY_SAMPLE(x)                      (((x) & 0x7) << 8)
 #define   S_028000_COLOR_DISABLE(x)                    (((x) & 0x1) << 12)
 #define R_028004_DB_COUNT_CONTROL                    0x00028004
-#define   S_028004_ZPASS_INCREMENT_DISABLE        (((x) & 0x1) << 0)
+#define   S_028004_ZPASS_INCREMENT_DISABLE(x)     (((x) & 0x1) << 0)
 #define   S_028004_PERFECT_ZPASS_COUNTS(x)        (((x) & 0x1) << 1)
 #define   S_028004_SAMPLE_RATE(x)                 (((x) & 0x7) << 4) /* cayman only */
 #define R_028008_DB_DEPTH_VIEW                       0x00028008
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index c52d5a9bad0..1a4cc425394 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -54,8 +54,6 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
-	r600_suspend_nontimer_queries(&rctx->b);
-
 	util_blitter_save_vertex_buffer_slot(rctx->blitter, rctx->vertex_buffer_state.vb);
 	util_blitter_save_vertex_elements(rctx->blitter, rctx->vertex_fetch_shader.cso);
 	util_blitter_save_vertex_shader(rctx->blitter, rctx->vs_shader);
@@ -67,8 +65,8 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
 	util_blitter_save_rasterizer(rctx->blitter, rctx->rasterizer_state.cso);
 
 	if (op & R600_SAVE_FRAGMENT_STATE) {
-		util_blitter_save_viewport(rctx->blitter, &rctx->viewport.state[0]);
-		util_blitter_save_scissor(rctx->blitter, &rctx->scissor.scissor[0]);
+		util_blitter_save_viewport(rctx->blitter, &rctx->b.viewports.states[0]);
+		util_blitter_save_scissor(rctx->blitter, &rctx->b.scissors.states[0]);
 		util_blitter_save_fragment_shader(rctx->blitter, rctx->ps_shader);
 		util_blitter_save_blend(rctx->blitter, rctx->blend_state.cso);
 		util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->dsa_state.cso);
@@ -98,7 +96,6 @@ static void r600_blitter_end(struct pipe_context *ctx)
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->b.render_cond_force_off = false;
-	r600_resume_nontimer_queries(&rctx->b);
 }
 
 static unsigned u_max_sample(struct pipe_resource *r)
@@ -584,7 +581,7 @@ static void r600_copy_global_buffer(struct pipe_context *ctx,
 }
 
 static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
-			      unsigned offset, unsigned size, unsigned value,
+			      uint64_t offset, uint64_t size, unsigned value,
 			      bool is_framebuffer)
 {
 	struct r600_context *rctx = (struct r600_context*)ctx;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 7a6f957945b..2bc6d3ffce4 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -64,9 +64,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 		num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
 	}
 
-	/* Count in queries_suspend. */
-	num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend +
-		  ctx->b.num_cs_dw_timer_queries_suspend;
+	/* Count in r600_suspend_queries. */
+	num_dw += ctx->b.num_cs_dw_queries_suspend;
 
 	/* Count in streamout_end at the end of CS. */
 	if (ctx->b.streamout.begin_emitted) {
@@ -223,6 +222,16 @@ void r600_flush_emit(struct r600_context *rctx)
 		cs->buf[cs->cdw++] = 0x0000000A;      /* POLL_INTERVAL */
 	}
 
+	if (rctx->b.flags & R600_CONTEXT_START_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) |
+			        EVENT_INDEX(0));
+	} else if (rctx->b.flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) |
+			        EVENT_INDEX(0));
+	}
+
 	if (wait_until) {
 		/* Use of WAIT_UNTIL is deprecated on Cayman+ */
 		if (rctx->b.family < CHIP_CAYMAN) {
@@ -295,12 +304,10 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom);
 	r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom);
 	r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
-	ctx->scissor.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
-	ctx->scissor.atom.num_dw = R600_MAX_VIEWPORTS * 4;
-	r600_mark_atom_dirty(ctx, &ctx->scissor.atom);
-	ctx->viewport.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
-	ctx->viewport.atom.num_dw = R600_MAX_VIEWPORTS * 8;
-	r600_mark_atom_dirty(ctx, &ctx->viewport.atom);
+	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+	r600_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
+	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+	r600_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
 	if (ctx->b.chip_class <= EVERGREEN) {
 		r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
 	}
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 36b808fbbca..c594f5cb18b 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -365,6 +365,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_STRING_MARKER:
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
+	case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index cd0052a519f..6c2a48ca412 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -38,8 +38,6 @@
 
 #define R600_NUM_ATOMS 52
 
-#define R600_MAX_VIEWPORTS 16
-
 /* read caches */
 #define R600_CONTEXT_INV_VERTEX_CACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
 #define R600_CONTEXT_INV_TEX_CACHE		(R600_CONTEXT_PRIVATE_FLAG << 1)
@@ -56,7 +54,7 @@
 #define R600_CONTEXT_WAIT_CP_DMA_IDLE		(R600_CONTEXT_PRIVATE_FLAG << 10)
 
 /* the number of CS dwords for flushing and drawing */
-#define R600_MAX_FLUSH_CS_DWORDS	16
+#define R600_MAX_FLUSH_CS_DWORDS	18
 #define R600_MAX_DRAW_CS_DWORDS		58
 
 #define R600_MAX_USER_CONST_BUFFERS 13
@@ -120,7 +118,7 @@ struct r600_db_state {
 
 struct r600_db_misc_state {
 	struct r600_atom		atom;
-	bool				occlusion_query_enabled;
+	bool				occlusion_queries_disabled;
 	bool				flush_depthstencil_through_cb;
 	bool				flush_depth_inplace;
 	bool				flush_stencil_inplace;
@@ -221,12 +219,6 @@ struct r600_stencil_ref_state {
 	struct pipe_stencil_ref pipe_state;
 };
 
-struct r600_viewport_state {
-	struct r600_atom atom;
-	struct pipe_viewport_state state[R600_MAX_VIEWPORTS];
-	uint32_t dirty_mask;
-};
-
 struct r600_shader_stages_state {
 	struct r600_atom atom;
 	unsigned geom_enable;
@@ -412,14 +404,6 @@ struct r600_cso_state
 	struct r600_command_buffer *cb;
 };
 
-struct r600_scissor_state
-{
-	struct r600_atom		atom;
-	struct pipe_scissor_state	scissor[R600_MAX_VIEWPORTS];
-	uint32_t			dirty_mask;
-	bool				enable; /* r6xx only */
-};
-
 struct r600_fetch_shader {
 	struct r600_resource		*buffer;
 	unsigned			offset;
@@ -480,12 +464,10 @@ struct r600_context {
 	struct r600_poly_offset_state	poly_offset_state;
 	struct r600_cso_state		rasterizer_state;
 	struct r600_sample_mask		sample_mask;
-	struct r600_scissor_state	scissor;
 	struct r600_seamless_cube_map	seamless_cube_map;
 	struct r600_config_state	config_state;
 	struct r600_stencil_ref_state	stencil_ref;
 	struct r600_vgt_state		vgt_state;
-	struct r600_viewport_state	viewport;
 	/* Shaders and shader resources. */
 	struct r600_cso_state		vertex_fetch_shader;
 	struct r600_shader_state        hw_shader_stages[EG_NUM_HW_STAGES];
@@ -730,7 +712,6 @@ void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom);
-void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom);
 void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a);
 void r600_add_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id);
 void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id,
@@ -746,7 +727,6 @@ void r600_set_sample_locations_constant_buffer(struct r600_context *rctx);
 uint32_t r600_translate_stencil_op(int s_op);
 uint32_t r600_translate_fill(uint32_t func);
 unsigned r600_tex_wrap(unsigned wrap);
-unsigned r600_tex_filter(unsigned filter);
 unsigned r600_tex_mipfilter(unsigned filter);
 unsigned r600_tex_compare(unsigned compare);
 bool sampler_state_needs_border_color(const struct pipe_sampler_state *state);
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 3189a1360b1..91e747fa937 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -457,6 +457,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 
 	r600_init_command_buffer(&rs->buffer, 30);
 
+	rs->scissor_enable = state->scissor;
 	rs->flatshade = state->flatshade;
 	rs->sprite_coord_enable = state->sprite_coord_enable;
 	rs->two_side = state->light_twoside;
@@ -501,10 +502,9 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	if (rctx->b.chip_class >= R700) {
 		sc_mode_cntl |= S_028A4C_FORCE_EOV_REZ_ENABLE(1) |
 				S_028A4C_R700_ZMM_LINE_OFFSET(1) |
-				S_028A4C_R700_VPORT_SCISSOR_ENABLE(state->scissor);
+				S_028A4C_R700_VPORT_SCISSOR_ENABLE(1);
 	} else {
 		sc_mode_cntl |= S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1);
-		rs->scissor_enable = state->scissor;
 	}
 
 	spi_interp = S_0286D4_FLAT_SHADE_ENA(1);
@@ -558,11 +558,24 @@ static void *r600_create_rs_state(struct pipe_context *ctx,
 	return rs;
 }
 
+/* LINEAR -> bilinear, otherwise point; ANISO variants when max_aniso > 1. */
+static unsigned r600_tex_filter(unsigned filter, unsigned max_aniso)
+{
+	if (filter == PIPE_TEX_FILTER_LINEAR)
+		return max_aniso > 1 ? V_03C000_SQ_TEX_XY_FILTER_ANISO_BILINEAR
+				     : V_03C000_SQ_TEX_XY_FILTER_BILINEAR;
+	return max_aniso > 1 ? V_03C000_SQ_TEX_XY_FILTER_ANISO_POINT
+			     : V_03C000_SQ_TEX_XY_FILTER_POINT;
+}
+
 static void *r600_create_sampler_state(struct pipe_context *ctx,
 					const struct pipe_sampler_state *state)
 {
+	struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
 	struct r600_pipe_sampler_state *ss = CALLOC_STRUCT(r600_pipe_sampler_state);
-	unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 4 : 0;
+	unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso
+						       : state->max_anisotropy;
+	unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso);
 
 	if (!ss) {
 		return NULL;
@@ -576,10 +589,10 @@ static void *r600_create_sampler_state(struct pipe_context *ctx,
 		S_03C000_CLAMP_X(r600_tex_wrap(state->wrap_s)) |
 		S_03C000_CLAMP_Y(r600_tex_wrap(state->wrap_t)) |
 		S_03C000_CLAMP_Z(r600_tex_wrap(state->wrap_r)) |
-		S_03C000_XY_MAG_FILTER(r600_tex_filter(state->mag_img_filter) | aniso_flag_offset) |
-		S_03C000_XY_MIN_FILTER(r600_tex_filter(state->min_img_filter) | aniso_flag_offset) |
+		S_03C000_XY_MAG_FILTER(r600_tex_filter(state->mag_img_filter, max_aniso)) |
+		S_03C000_XY_MIN_FILTER(r600_tex_filter(state->min_img_filter, max_aniso)) |
 		S_03C000_MIP_FILTER(r600_tex_mipfilter(state->min_mip_filter)) |
-		S_03C000_MAX_ANISO(r600_tex_aniso_filter(state->max_anisotropy)) |
+		S_03C000_MAX_ANISO_RATIO(max_aniso_ratio) |
 		S_03C000_DEPTH_COMPARE_FUNCTION(r600_tex_compare(state->compare_func)) |
 		S_03C000_BORDER_COLOR_TYPE(ss->border_color_use ? V_03C000_SQ_TEX_BORDER_COLOR_REGISTER : 0);
 	/* R_03C004_SQ_TEX_SAMPLER_WORD1_0 */
@@ -777,61 +790,6 @@ static void r600_set_polygon_stipple(struct pipe_context *ctx,
 {
 }
 
-static void r600_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom)
-{
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-	struct r600_scissor_state *rstate = &rctx->scissor;
-	struct pipe_scissor_state *state;
-	bool do_disable_workaround = false;
-	uint32_t dirty_mask;
-	unsigned i, offset;
-	uint32_t tl, br;
-
-	if (rctx->b.chip_class == R600 && !rctx->scissor.enable) {
-		tl = S_028240_TL_X(0) | S_028240_TL_Y(0) | S_028240_WINDOW_OFFSET_DISABLE(1);
-		br = S_028244_BR_X(8192) | S_028244_BR_Y(8192);
-		do_disable_workaround = true;
-	}
-
-	dirty_mask = rstate->dirty_mask;
-	while (dirty_mask != 0)
-	{
-		i = u_bit_scan(&dirty_mask);
-		offset = i * 4 * 2;
-		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset, 2);
-		if (!do_disable_workaround) {
-			state = &rstate->scissor[i];
-			tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny) |
-				S_028240_WINDOW_OFFSET_DISABLE(1);
-			br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy);
-		}
-		radeon_emit(cs, tl);
-		radeon_emit(cs, br);
-	}
-	rstate->dirty_mask = 0;
-	rstate->atom.num_dw = 0;
-}
-
-static void r600_set_scissor_states(struct pipe_context *ctx,
-                                    unsigned start_slot,
-                                    unsigned num_scissors,
-                                    const struct pipe_scissor_state *state)
-{
-	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct r600_scissor_state *rstate = &rctx->scissor;
-	int i;
-
-	for (i = start_slot ; i < start_slot + num_scissors; i++)
-		rstate->scissor[i] = state[i - start_slot];
-	rstate->dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
-	rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 4;
-
-	if (rctx->b.chip_class == R600 && !rstate->enable)
-		return;
-
-	r600_mark_atom_dirty(rctx, &rstate->atom);
-}
-
 static struct r600_resource *r600_buffer_create_helper(struct r600_screen *rscreen,
 						       unsigned size, unsigned alignment)
 {
@@ -1644,12 +1602,16 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom
 		}
 	}
 
-	if (a->occlusion_query_enabled) {
+	if (rctx->b.num_occlusion_queries > 0 &&
+	    !a->occlusion_queries_disabled) {
 		if (rctx->b.chip_class >= R700) {
 			db_render_control |= S_028D0C_R700_PERFECT_ZPASS_COUNTS(1);
 		}
 		db_render_override |= S_028D10_NOOP_CULL_DISABLE(1);
+	} else {
+		db_render_control |= S_028D0C_ZPASS_INCREMENT_DISABLE(1);
 	}
+
 	if (rctx->db_state.rsurf && rctx->db_state.rsurf->db_htile_surface) {
 		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
 		db_render_override |= S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_OFF);
@@ -2173,6 +2135,12 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
 	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
 
+	/* This enables pipeline stat & streamout queries.
+	 * They are only disabled by blits.
+	 */
+	r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0));
+	r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0));
+
 	family = rctx->b.family;
 	ps_prio = 0;
 	vs_prio = 1;
@@ -2424,12 +2392,6 @@ void r600_init_atom_start_cs(struct r600_context *rctx)
 	r600_store_context_reg(cb, R_028820_PA_CL_NANINF_CNTL, 0);
 	r600_store_context_reg(cb, R_028A48_PA_SC_MPASS_PS_CNTL, 0);
 
-	r600_store_context_reg_seq(cb, R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4);
-	r600_store_value(cb, fui(1.0)); /* R_028C0C_PA_CL_GB_VERT_CLIP_ADJ */
-	r600_store_value(cb, fui(1.0)); /* R_028C10_PA_CL_GB_VERT_DISC_ADJ */
-	r600_store_value(cb, fui(1.0)); /* R_028C14_PA_CL_GB_HORZ_CLIP_ADJ */
-	r600_store_value(cb, fui(1.0)); /* R_028C18_PA_CL_GB_HORZ_DISC_ADJ */
-
 	r600_store_context_reg_seq(cb, R_0282D0_PA_SC_VPORT_ZMIN_0, 2 * R600_MAX_VIEWPORTS);
 	for (tmp = 0; tmp < R600_MAX_VIEWPORTS; tmp++) {
 		r600_store_value(cb, 0); /* R_0282D0_PA_SC_VPORT_ZMIN_0 */
@@ -3132,8 +3094,8 @@ void r600_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->dsa_state.atom, id++, r600_emit_cso_state, 0);
 	r600_init_atom(rctx, &rctx->poly_offset_state.atom, id++, r600_emit_polygon_offset, 6);
 	r600_init_atom(rctx, &rctx->rasterizer_state.atom, id++, r600_emit_cso_state, 0);
-	r600_init_atom(rctx, &rctx->scissor.atom, id++, r600_emit_scissor_state, 0);
-	r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0);
+	r600_add_atom(rctx, &rctx->b.scissors.atom, id++);
+	r600_add_atom(rctx, &rctx->b.viewports.atom, id++);
 	r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5);
@@ -3153,7 +3115,6 @@ void r600_init_state_functions(struct r600_context *rctx)
 	rctx->b.b.set_framebuffer_state = r600_set_framebuffer_state;
 	rctx->b.b.set_polygon_stipple = r600_set_polygon_stipple;
 	rctx->b.b.set_min_samples = r600_set_min_samples;
-	rctx->b.b.set_scissor_states = r600_set_scissor_states;
 	rctx->b.b.get_sample_position = r600_get_sample_position;
 	rctx->b.dma_copy = r600_dma_copy;
 }
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index df41d3f028d..cb40c20a7dd 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -364,14 +364,7 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state)
 		r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom);
 	}
 
-	/* Workaround for a missing scissor enable on r600. */
-	if (rctx->b.chip_class == R600 &&
-	    rs->scissor_enable != rctx->scissor.enable) {
-		rctx->scissor.enable = rs->scissor_enable;
-		rctx->scissor.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
-		rctx->scissor.atom.num_dw = R600_MAX_VIEWPORTS * 4;
-		r600_mark_atom_dirty(rctx, &rctx->scissor.atom);
-	}
+	r600_set_scissor_enable(&rctx->b, rs->scissor_enable);
 
 	/* Re-emit PA_SC_LINE_STIPPLE. */
 	rctx->last_primitive_type = -1;
@@ -713,47 +706,6 @@ static void r600_update_compressed_colortex_mask(struct r600_samplerview_state *
 	}
 }
 
-static void r600_set_viewport_states(struct pipe_context *ctx,
-                                     unsigned start_slot,
-                                     unsigned num_viewports,
-                                     const struct pipe_viewport_state *state)
-{
-	struct r600_context *rctx = (struct r600_context *)ctx;
-	struct r600_viewport_state *rstate = &rctx->viewport;
-	int i;
-
-	for (i = start_slot; i < start_slot + num_viewports; i++)
-		rstate->state[i] = state[i - start_slot];
-	rstate->dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
-	rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 8;
-	r600_mark_atom_dirty(rctx, &rctx->viewport.atom);
-}
-
-void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
-{
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-	struct r600_viewport_state *rstate = &rctx->viewport;
-	struct pipe_viewport_state *state;
-	uint32_t dirty_mask;
-	unsigned i, offset;
-
-	dirty_mask = rstate->dirty_mask;
-	while (dirty_mask != 0) {
-		i = u_bit_scan(&dirty_mask);
-		offset = i * 6 * 4;
-		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE_0 + offset, 6);
-		state = &rstate->state[i];
-		radeon_emit(cs, fui(state->scale[0]));     /* R_02843C_PA_CL_VPORT_XSCALE_0  */
-		radeon_emit(cs, fui(state->translate[0])); /* R_028440_PA_CL_VPORT_XOFFSET_0 */
-		radeon_emit(cs, fui(state->scale[1]));     /* R_028444_PA_CL_VPORT_YSCALE_0  */
-		radeon_emit(cs, fui(state->translate[1])); /* R_028448_PA_CL_VPORT_YOFFSET_0 */
-		radeon_emit(cs, fui(state->scale[2]));     /* R_02844C_PA_CL_VPORT_ZSCALE_0  */
-		radeon_emit(cs, fui(state->translate[2])); /* R_028450_PA_CL_VPORT_ZOFFSET_0 */
-	}
-	rstate->dirty_mask = 0;
-	rstate->atom.num_dw = 0;
-}
-
 /* Compute the key for the hw shader variant */
 static inline union r600_shader_key r600_shader_selector_key(struct pipe_context * ctx,
 		struct r600_pipe_shader_selector * sel)
@@ -961,6 +913,18 @@ static void r600_bind_ps_state(struct pipe_context *ctx, void *state)
 	rctx->ps_shader = (struct r600_pipe_shader_selector *)state;
 }
 
+/* Return the info of the first bound selector among GS, TES and VS
+ * (checked in that order), or NULL when none of them is bound. */
+static struct tgsi_shader_info *r600_get_vs_info(struct r600_context *rctx)
+{
+	struct r600_pipe_shader_selector *sel = rctx->gs_shader;
+
+	if (!sel)
+		sel = rctx->tes_shader ? rctx->tes_shader : rctx->vs_shader;
+
+	return sel ? &sel->info : NULL;
+}
+
 static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
@@ -969,6 +933,7 @@ static void r600_bind_vs_state(struct pipe_context *ctx, void *state)
 		return;
 
 	rctx->vs_shader = (struct r600_pipe_shader_selector *)state;
+	r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx));
 	rctx->b.streamout.stride_in_dw = rctx->vs_shader->so.stride;
 }
 
@@ -977,6 +942,7 @@ static void r600_bind_gs_state(struct pipe_context *ctx, void *state)
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->gs_shader = (struct r600_pipe_shader_selector *)state;
+	r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx));
 
 	if (!state)
 		return;
@@ -995,6 +961,7 @@ static void r600_bind_tes_state(struct pipe_context *ctx, void *state)
 	struct r600_context *rctx = (struct r600_context *)ctx;
 
 	rctx->tes_shader = (struct r600_pipe_shader_selector *)state;
+	r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx));
 
 	if (!state)
 		return;
@@ -1841,8 +1808,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			ia_switch_on_eop = true;
 		}
 
-		if (rctx->b.streamout.streamout_enabled ||
-		    rctx->b.streamout.prims_gen_query_enabled)
+		if (r600_get_strmout_en(&rctx->b))
 			partial_vs_wave = true;
 
 		radeon_set_context_reg(cs, CM_R_028AA8_IA_MULTI_VGT_PARAM,
@@ -2018,7 +1984,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 	    rctx->b.family == CHIP_RV635) {
 		/* if we have gs shader or streamout
 		   we need to do a wait idle after every draw */
-		if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) {
+		if (rctx->gs_shader || r600_get_strmout_en(&rctx->b)) {
 			radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1));
 		}
 	}
@@ -2123,17 +2089,6 @@ unsigned r600_tex_wrap(unsigned wrap)
 	}
 }
 
-unsigned r600_tex_filter(unsigned filter)
-{
-	switch (filter) {
-	default:
-	case PIPE_TEX_FILTER_NEAREST:
-		return V_03C000_SQ_TEX_XY_FILTER_POINT;
-	case PIPE_TEX_FILTER_LINEAR:
-		return V_03C000_SQ_TEX_XY_FILTER_BILINEAR;
-	}
-}
-
 unsigned r600_tex_mipfilter(unsigned filter)
 {
 	switch (filter) {
@@ -2861,16 +2816,33 @@ static void r600_invalidate_buffer(struct pipe_context *ctx, struct pipe_resourc
 	}
 }
 
-static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
+static void r600_set_active_query_state(struct pipe_context *ctx, boolean enable)
 {
 	struct r600_context *rctx = (struct r600_context*)ctx;
 
-	if (rctx->db_misc_state.occlusion_query_enabled != enable) {
-		rctx->db_misc_state.occlusion_query_enabled = enable;
+	/* Pipeline stat & streamout queries. */
+	if (enable) {
+		rctx->b.flags &= ~R600_CONTEXT_STOP_PIPELINE_STATS;
+		rctx->b.flags |= R600_CONTEXT_START_PIPELINE_STATS;
+	} else {
+		rctx->b.flags &= ~R600_CONTEXT_START_PIPELINE_STATS;
+		rctx->b.flags |= R600_CONTEXT_STOP_PIPELINE_STATS;
+	}
+
+	/* Occlusion queries. */
+	if (rctx->db_misc_state.occlusion_queries_disabled != !enable) {
+		rctx->db_misc_state.occlusion_queries_disabled = !enable;
 		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
 
+static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
+{
+	struct r600_context *rctx = (struct r600_context*)ctx;
+
+	r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
+}
+
 static void r600_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw,
                                    bool include_draw_vbo)
 {
@@ -2911,13 +2883,13 @@ void r600_init_common_state_functions(struct r600_context *rctx)
 	rctx->b.b.set_constant_buffer = r600_set_constant_buffer;
 	rctx->b.b.set_sample_mask = r600_set_sample_mask;
 	rctx->b.b.set_stencil_ref = r600_set_pipe_stencil_ref;
-	rctx->b.b.set_viewport_states = r600_set_viewport_states;
 	rctx->b.b.set_vertex_buffers = r600_set_vertex_buffers;
 	rctx->b.b.set_index_buffer = r600_set_index_buffer;
 	rctx->b.b.set_sampler_views = r600_set_sampler_views;
 	rctx->b.b.sampler_view_destroy = r600_sampler_view_destroy;
 	rctx->b.b.texture_barrier = r600_texture_barrier;
 	rctx->b.b.set_stream_output_targets = r600_set_streamout_targets;
+	rctx->b.b.set_active_query_state = r600_set_active_query_state;
 	rctx->b.b.draw_vbo = r600_draw_vbo;
 	rctx->b.invalidate_buffer = r600_invalidate_buffer;
 	rctx->b.set_occlusion_query_state = r600_set_occlusion_query_state;
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 3d223edb5f4..ecabb340a9c 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -780,7 +780,8 @@
 #define   S_028D0C_STENCIL_COMPRESS_DISABLE(x)         (((x) & 0x1) << 5)
 #define   S_028D0C_DEPTH_COMPRESS_DISABLE(x)           (((x) & 0x1) << 6)
 #define   S_028D0C_COPY_CENTROID(x)                    (((x) & 0x1) << 7)
-#define   S_028D0C_COPY_SAMPLE(x)                      (((x) & 0x1) << 8)
+#define   S_028D0C_COPY_SAMPLE(x)                      (((x) & 0x03) << 8)
+#define   S_028D0C_ZPASS_INCREMENT_DISABLE(x)          (((x) & 0x1) << 11)
 #define   S_028D0C_R700_PERFECT_ZPASS_COUNTS(x)        (((x) & 0x1) << 15)
 #define   S_028D0C_CONSERVATIVE_Z_EXPORT(x)            (((x) & 0x03) << 13)
 #define   G_028D0C_CONSERVATIVE_Z_EXPORT(x)            (((x) >> 13) & 0x03)
@@ -1266,6 +1267,8 @@
 #define     V_03C000_SQ_TEX_XY_FILTER_POINT            0x00000000
 #define     V_03C000_SQ_TEX_XY_FILTER_BILINEAR         0x00000001
 #define     V_03C000_SQ_TEX_XY_FILTER_BICUBIC          0x00000002
+#define     V_03C000_SQ_TEX_XY_FILTER_ANISO_POINT      0x00000004
+#define     V_03C000_SQ_TEX_XY_FILTER_ANISO_BILINEAR   0x00000005
 #define   S_03C000_XY_MIN_FILTER(x)                    (((x) & 0x7) << 12)
 #define   G_03C000_XY_MIN_FILTER(x)                    (((x) >> 12) & 0x7)
 #define   C_03C000_XY_MIN_FILTER                       0xFFFF8FFF
@@ -1278,9 +1281,9 @@
 #define   S_03C000_MIP_FILTER(x)                       (((x) & 0x3) << 17)
 #define   G_03C000_MIP_FILTER(x)                       (((x) >> 17) & 0x3)
 #define   C_03C000_MIP_FILTER                          0xFFF9FFFF
-#define   S_03C000_MAX_ANISO(x)                        (((x) & 0x7) << 19)
-#define   G_03C000_MAX_ANISO(x)                        (((x) >> 19) & 0x7)
-#define   C_03C000_MAX_ANISO                           0xFFB7FFFF
+#define   S_03C000_MAX_ANISO_RATIO(x)                  (((x) & 0x7) << 19) /* set: 3-bit field at bit 19 */
+#define   G_03C000_MAX_ANISO_RATIO(x)                  (((x) >> 19) & 0x7) /* get: extract the field */
+#define   C_03C000_MAX_ANISO_RATIO                     0xFFC7FFFF /* clear mask: ~(0x7 << 19) */
 #define   S_03C000_BORDER_COLOR_TYPE(x)                (((x) & 0x3) << 22)
 #define   G_03C000_BORDER_COLOR_TYPE(x)                (((x) >> 22) & 0x3)
 #define   C_03C000_BORDER_COLOR_TYPE                   0xFF3FFFFF
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index eb171f7da5f..f993d75a6ad 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -11,6 +11,7 @@ C_SOURCES := \
 	r600_query.h \
 	r600_streamout.c \
 	r600_texture.c \
+	r600_viewport.c \
 	radeon_uvd.c \
 	radeon_uvd.h \
 	radeon_vce_40_2_2.c \
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 33ba0fbca9b..47514e91d23 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -102,7 +102,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 
 bool r600_init_resource(struct r600_common_screen *rscreen,
 			struct r600_resource *res,
-			unsigned size, unsigned alignment,
+			uint64_t size, unsigned alignment,
 			bool use_reusable_pool)
 {
 	struct r600_texture *rtex = (struct r600_texture*)res;
@@ -160,9 +160,18 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	    rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) {
 		res->domains = RADEON_DOMAIN_VRAM;
 		flags &= ~RADEON_FLAG_CPU_ACCESS;
-		flags |= RADEON_FLAG_NO_CPU_ACCESS;
+		flags |= RADEON_FLAG_NO_CPU_ACCESS |
+			 RADEON_FLAG_GTT_WC;
 	}
 
+	/* If VRAM is just stolen system memory, allow both VRAM and GTT,
+	 * whichever has free space. If a buffer is evicted from VRAM to GTT,
+	 * it will stay there.
+	 */
+	if (!rscreen->info.has_dedicated_vram &&
+	    res->domains == RADEON_DOMAIN_VRAM)
+		res->domains = RADEON_DOMAIN_VRAM_GTT;
+
 	if (rscreen->debug_flags & DBG_NO_WC)
 		flags &= ~RADEON_FLAG_GTT_WC;
 
@@ -192,7 +201,7 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	res->TC_L2_dirty = false;
 
 	if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) {
-		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Buffer %u bytes\n",
+		fprintf(stderr, "VM start=0x%"PRIX64"  end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n",
 			res->gpu_address, res->gpu_address + res->buf->size,
 			res->buf->size);
 	}
diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c
index f3529a1fe0f..9ab17d9e04c 100644
--- a/src/gallium/drivers/radeon/r600_perfcounter.c
+++ b/src/gallium/drivers/radeon/r600_perfcounter.c
@@ -310,7 +310,6 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx,
 
 	query->b.b.ops = &batch_query_ops;
 	query->b.ops = &batch_query_hw_ops;
-	query->b.flags = R600_QUERY_HW_FLAG_TIMER;
 
 	query->num_counters = num_queries;
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 32bd6e40d32..a7477abea34 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -156,14 +156,8 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
 	/* suspend queries */
-	if (ctx->num_cs_dw_nontimer_queries_suspend) {
-		/* Since non-timer queries are suspended during blits,
-		 * we have to guard against double-suspends. */
-		r600_suspend_nontimer_queries(ctx);
-		ctx->nontimer_queries_suspended_by_flush = true;
-	}
-	if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
-		r600_suspend_timer_queries(ctx);
+	if (!LIST_IS_EMPTY(&ctx->active_queries))
+		r600_suspend_queries(ctx);
 
 	ctx->streamout.suspended = false;
 	if (ctx->streamout.begin_emitted) {
@@ -180,12 +174,8 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 	}
 
 	/* resume queries */
-	if (!LIST_IS_EMPTY(&ctx->active_timer_queries))
-		r600_resume_timer_queries(ctx);
-	if (ctx->nontimer_queries_suspended_by_flush) {
-		ctx->nontimer_queries_suspended_by_flush = false;
-		r600_resume_nontimer_queries(ctx);
-	}
+	if (!LIST_IS_EMPTY(&ctx->active_queries))
+		r600_resume_queries(ctx);
 }
 
 static void r600_flush_from_st(struct pipe_context *ctx,
@@ -296,6 +286,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	LIST_INITHEAD(&rctx->texture_buffers);
 
 	r600_init_context_texture_functions(rctx);
+	r600_init_viewport_functions(rctx);
 	r600_streamout_init(rctx);
 	r600_query_init(rctx);
 	cayman_init_msaa(&rctx->b);
@@ -898,6 +889,13 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	rscreen->chip_class = rscreen->info.chip_class;
 	rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0);
 
+	rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1));
+	if (rscreen->force_aniso >= 0) {
+		printf("radeon: Forcing anisotropy filter to %ix\n",
+		       /* round down to a power of two */
+		       1 << util_logbase2(rscreen->force_aniso));
+	}
+
 	util_format_s3tc_init();
 	pipe_mutex_init(rscreen->aux_context_lock);
 	pipe_mutex_init(rscreen->gpu_load_mutex);
@@ -973,7 +971,7 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen,
 }
 
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
-			      unsigned offset, unsigned size, unsigned value,
+			      uint64_t offset, uint64_t size, unsigned value,
 			      bool is_framebuffer)
 {
 	struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context;
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 062c3193947..a6abe09d438 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -50,7 +50,10 @@
 #define R600_RESOURCE_FLAG_FORCE_TILING		(PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
 
 #define R600_CONTEXT_STREAMOUT_FLUSH		(1u << 0)
-#define R600_CONTEXT_PRIVATE_FLAG		(1u << 1)
+/* Pipeline & streamout query controls. */
+#define R600_CONTEXT_START_PIPELINE_STATS	(1u << 1)
+#define R600_CONTEXT_STOP_PIPELINE_STATS	(1u << 2)
+#define R600_CONTEXT_PRIVATE_FLAG		(1u << 3)
 
 /* special primitive types */
 #define R600_PRIM_RECTANGLE_LIST	PIPE_PRIM_MAX
@@ -94,9 +97,11 @@
 #define DBG_MONOLITHIC_SHADERS	(1llu << 47)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
+#define R600_MAX_VIEWPORTS        16
 
 struct r600_common_context;
 struct r600_perfcounters;
+struct tgsi_shader_info;
 
 struct radeon_shader_reloc {
 	char name[32];
@@ -137,6 +142,9 @@ struct radeon_shader_binary {
 void radeon_shader_binary_init(struct radeon_shader_binary *b);
 void radeon_shader_binary_clean(struct radeon_shader_binary *b);
 
+/* Only 32-bit buffer allocations are supported, gallium doesn't support more
+ * at the moment.
+ */
 struct r600_resource {
 	struct u_resource		b;
 
@@ -181,8 +189,8 @@ struct r600_transfer {
 };
 
 struct r600_fmask_info {
-	unsigned offset;
-	unsigned size;
+	uint64_t offset;
+	uint64_t size;
 	unsigned alignment;
 	unsigned pitch_in_pixels;
 	unsigned bank_height;
@@ -191,8 +199,8 @@ struct r600_fmask_info {
 };
 
 struct r600_cmask_info {
-	unsigned offset;
-	unsigned size;
+	uint64_t offset;
+	uint64_t size;
 	unsigned alignment;
 	unsigned pitch;
 	unsigned height;
@@ -212,7 +220,7 @@ struct r600_htile_info {
 struct r600_texture {
 	struct r600_resource		resource;
 
-	unsigned			size;
+	uint64_t			size;
 	bool				is_depth;
 	unsigned			dirty_level_mask; /* each bit says if that mipmap is compressed */
 	unsigned			stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */
@@ -224,7 +232,7 @@ struct r600_texture {
 	struct r600_fmask_info		fmask;
 	struct r600_cmask_info		cmask;
 	struct r600_resource		*cmask_buffer;
-	unsigned			dcc_offset; /* 0 = disabled */
+	uint64_t			dcc_offset; /* 0 = disabled */
 	unsigned			cb_color_info; /* fast clear enable bit */
 	unsigned			color_clear_value[2];
 
@@ -298,6 +306,9 @@ struct r600_common_screen {
 	bool				has_cp_dma;
 	bool				has_streamout;
 
+	/* Texture filter settings. */
+	int				force_aniso; /* -1 = disabled */
+
 	/* Auxiliary context. Mainly used to initialize resources.
 	 * It must be locked prior to using and flushed before unlocking. */
 	struct pipe_context		*aux_context;
@@ -388,6 +399,26 @@ struct r600_streamout {
 	int				num_prims_gen_queries;
 };
 
+struct r600_signed_scissor { /* rectangle bounds kept as signed ints */
+	int minx;
+	int miny;
+	int maxx;
+	int maxy;
+};
+
+struct r600_scissors { /* scissor state embedded in r600_common_context */
+	struct r600_atom		atom;
+	unsigned			dirty_mask; /* presumably one bit per states[] slot - TODO confirm */
+	struct pipe_scissor_state	states[R600_MAX_VIEWPORTS]; /* R600_MAX_VIEWPORTS is 16 */
+};
+
+struct r600_viewports { /* viewport state embedded in r600_common_context */
+	struct r600_atom		atom;
+	unsigned			dirty_mask; /* presumably one bit per states[] slot - TODO confirm */
+	struct pipe_viewport_state	states[R600_MAX_VIEWPORTS]; /* R600_MAX_VIEWPORTS is 16 */
+	struct r600_signed_scissor	as_scissor[R600_MAX_VIEWPORTS]; /* NOTE(review): likely each viewport as a scissor rect - confirm */
+};
+
 struct r600_ring {
 	struct radeon_winsys_cs		*cs;
 	void (*flush)(void *ctx, unsigned flags,
@@ -420,23 +451,20 @@ struct r600_common_context {
 
 	/* States. */
 	struct r600_streamout		streamout;
+	struct r600_scissors		scissors;
+	struct r600_viewports		viewports;
+	bool				scissor_enabled;
+	bool				vs_writes_viewport_index;
 
 	/* Additional context states. */
 	unsigned flags; /* flush flags */
 
 	/* Queries. */
-	/* The list of active queries. */
+	/* Maintain the list of active queries for pausing between IBs. */
 	int				num_occlusion_queries;
 	int				num_perfect_occlusion_queries;
-	/* Keep track of non-timer queries, because they should be suspended
-	 * during context flushing.
-	 * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits,
-	 * but they should be suspended between IBs. */
-	struct list_head		active_nontimer_queries;
-	struct list_head		active_timer_queries;
-	unsigned			num_cs_dw_nontimer_queries_suspend;
-	bool				nontimer_queries_suspended_by_flush;
-	unsigned			num_cs_dw_timer_queries_suspend;
+	struct list_head		active_queries;
+	unsigned			num_cs_dw_queries_suspend;
 	/* Additional hardware info. */
 	unsigned			backend_mask;
 	unsigned			max_db; /* for OQ */
@@ -476,7 +504,7 @@ struct r600_common_context {
 			 const struct pipe_box *src_box);
 
 	void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst,
-			     unsigned offset, unsigned size, unsigned value,
+			     uint64_t offset, uint64_t size, unsigned value,
 			     bool is_framebuffer);
 
 	void (*blit_decompress_depth)(struct pipe_context *ctx,
@@ -513,7 +541,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
                                       unsigned usage);
 bool r600_init_resource(struct r600_common_screen *rscreen,
 			struct r600_resource *res,
-			unsigned size, unsigned alignment,
+			uint64_t size, unsigned alignment,
 			bool use_reusable_pool);
 struct pipe_resource *r600_buffer_create(struct pipe_screen *screen,
 					 const struct pipe_resource *templ,
@@ -548,7 +576,7 @@ void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resour
 bool r600_can_dump_shader(struct r600_common_screen *rscreen,
 			  unsigned processor);
 void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst,
-			      unsigned offset, unsigned size, unsigned value,
+			      uint64_t offset, uint64_t size, unsigned value,
 			      bool is_framebuffer);
 struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen,
 						  const struct pipe_resource *templ);
@@ -566,10 +594,8 @@ void r600_perfcounters_destroy(struct r600_common_screen *rscreen);
 /* r600_query.c */
 void r600_init_screen_query_functions(struct r600_common_screen *rscreen);
 void r600_query_init(struct r600_common_context *rctx);
-void r600_suspend_nontimer_queries(struct r600_common_context *ctx);
-void r600_resume_nontimer_queries(struct r600_common_context *ctx);
-void r600_suspend_timer_queries(struct r600_common_context *ctx);
-void r600_resume_timer_queries(struct r600_common_context *ctx);
+void r600_suspend_queries(struct r600_common_context *ctx);
+void r600_resume_queries(struct r600_common_context *ctx);
 void r600_query_init_backend_mask(struct r600_common_context *ctx);
 
 /* r600_streamout.c */
@@ -612,6 +638,14 @@ void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
 void r600_init_screen_texture_functions(struct r600_common_screen *rscreen);
 void r600_init_context_texture_functions(struct r600_common_context *rctx);
 
+/* r600_viewport.c */
+void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx,
+					    struct pipe_scissor_state *scissor);
+void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable);
+void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx,
+					  struct tgsi_shader_info *info);
+void r600_init_viewport_functions(struct r600_common_context *rctx);
+
 /* cayman_msaa.c */
 extern const uint32_t eg_sample_locs_2x[4];
 extern const unsigned eg_max_dist_2x;
@@ -639,13 +673,38 @@ r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res)
 				(struct pipe_resource *)res);
 }
 
+static inline bool r600_get_strmout_en(struct r600_common_context *rctx)
+{
+	const struct r600_streamout *so = &rctx->streamout; /* streamout state */
+	return so->streamout_enabled || so->prims_gen_query_enabled;
+}
+
+#define     SQ_TEX_XY_FILTER_POINT                         0x00
+#define     SQ_TEX_XY_FILTER_BILINEAR                      0x01
+#define     SQ_TEX_XY_FILTER_ANISO_POINT                   0x02
+#define     SQ_TEX_XY_FILTER_ANISO_BILINEAR                0x03
+
+/* Map a gallium texture filter to SQ_TEX_XY_FILTER_*; ANISO when max_aniso > 1. */
+static inline unsigned eg_tex_filter(unsigned filter, unsigned max_aniso)
+{
+	bool linear = filter == PIPE_TEX_FILTER_LINEAR;
+
+	if (max_aniso > 1)
+		return linear ? SQ_TEX_XY_FILTER_ANISO_BILINEAR : SQ_TEX_XY_FILTER_ANISO_POINT;
+	return linear ? SQ_TEX_XY_FILTER_BILINEAR : SQ_TEX_XY_FILTER_POINT;
+}
+
 static inline unsigned r600_tex_aniso_filter(unsigned filter)
 {
-	if (filter <= 1)   return 0;
-	if (filter <= 2)   return 1;
-	if (filter <= 4)   return 2;
-	if (filter <= 8)   return 3;
-	 /* else */        return 4;
+	if (filter < 2)
+		return 0;
+	if (filter < 4)
+		return 1;
+	if (filter < 8)
+		return 2;
+	if (filter < 16)
+		return 3;
+	return 4;
 }
 
 static inline unsigned r600_wavefront_size(enum radeon_family family)
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 7a2d2ee7f31..de6e37b9f62 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -369,13 +369,11 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx,
 		query->result_size = 16;
 		query->num_cs_dw_begin = 8;
 		query->num_cs_dw_end = 8;
-		query->flags = R600_QUERY_HW_FLAG_TIMER;
 		break;
 	case PIPE_QUERY_TIMESTAMP:
 		query->result_size = 8;
 		query->num_cs_dw_end = 8;
-		query->flags = R600_QUERY_HW_FLAG_TIMER |
-			       R600_QUERY_HW_FLAG_NO_START;
+		query->flags = R600_QUERY_HW_FLAG_NO_START;
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -516,10 +514,7 @@ static void r600_query_hw_emit_start(struct r600_common_context *ctx,
 
 	query->ops->emit_start(ctx, query, query->buffer.buf, va);
 
-	if (query->flags & R600_QUERY_HW_FLAG_TIMER)
-		ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw_end;
-	else
-		ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw_end;
+	ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end;
 }
 
 static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx,
@@ -590,12 +585,8 @@ static void r600_query_hw_emit_stop(struct r600_common_context *ctx,
 
 	query->buffer.results_end += query->result_size;
 
-	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) {
-		if (query->flags & R600_QUERY_HW_FLAG_TIMER)
-			ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw_end;
-		else
-			ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw_end;
-	}
+	if (!(query->flags & R600_QUERY_HW_FLAG_NO_START))
+		ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end;
 
 	r600_update_occlusion_query_state(ctx, query->b.type, -1);
 	r600_update_prims_generated_query_state(ctx, query->b.type, -1);
@@ -730,11 +721,8 @@ boolean r600_query_hw_begin(struct r600_common_context *rctx,
 
 	r600_query_hw_emit_start(rctx, query);
 
-	if (query->flags & R600_QUERY_HW_FLAG_TIMER)
-		LIST_ADDTAIL(&query->list, &rctx->active_timer_queries);
-	else
-		LIST_ADDTAIL(&query->list, &rctx->active_nontimer_queries);
-   return true;
+	LIST_ADDTAIL(&query->list, &rctx->active_queries);
+	return true;
 }
 
 static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
@@ -973,28 +961,14 @@ static void r600_render_condition(struct pipe_context *ctx,
 	rctx->set_atom_dirty(rctx, atom, query != NULL);
 }
 
-static void r600_suspend_queries(struct r600_common_context *ctx,
-				 struct list_head *query_list,
-				 unsigned *num_cs_dw_queries_suspend)
+void r600_suspend_queries(struct r600_common_context *ctx)
 {
 	struct r600_query_hw *query;
 
-	LIST_FOR_EACH_ENTRY(query, query_list, list) {
+	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
 		r600_query_hw_emit_stop(ctx, query);
 	}
-	assert(*num_cs_dw_queries_suspend == 0);
-}
-
-void r600_suspend_nontimer_queries(struct r600_common_context *ctx)
-{
-	r600_suspend_queries(ctx, &ctx->active_nontimer_queries,
-			     &ctx->num_cs_dw_nontimer_queries_suspend);
-}
-
-void r600_suspend_timer_queries(struct r600_common_context *ctx)
-{
-	r600_suspend_queries(ctx, &ctx->active_timer_queries,
-			     &ctx->num_cs_dw_timer_queries_suspend);
+	assert(ctx->num_cs_dw_queries_suspend == 0);
 }
 
 static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx,
@@ -1022,35 +996,21 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *
 	return num_dw;
 }
 
-static void r600_resume_queries(struct r600_common_context *ctx,
-				struct list_head *query_list,
-				unsigned *num_cs_dw_queries_suspend)
+void r600_resume_queries(struct r600_common_context *ctx)
 {
 	struct r600_query_hw *query;
-	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list);
+	unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries);
 
-	assert(*num_cs_dw_queries_suspend == 0);
+	assert(ctx->num_cs_dw_queries_suspend == 0);
 
 	/* Check CS space here. Resuming must not be interrupted by flushes. */
 	ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE);
 
-	LIST_FOR_EACH_ENTRY(query, query_list, list) {
+	LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) {
 		r600_query_hw_emit_start(ctx, query);
 	}
 }
 
-void r600_resume_nontimer_queries(struct r600_common_context *ctx)
-{
-	r600_resume_queries(ctx, &ctx->active_nontimer_queries,
-			    &ctx->num_cs_dw_nontimer_queries_suspend);
-}
-
-void r600_resume_timer_queries(struct r600_common_context *ctx)
-{
-	r600_resume_queries(ctx, &ctx->active_timer_queries,
-			    &ctx->num_cs_dw_timer_queries_suspend);
-}
-
 /* Get backends mask */
 void r600_query_init_backend_mask(struct r600_common_context *ctx)
 {
@@ -1274,8 +1234,7 @@ void r600_query_init(struct r600_common_context *rctx)
 	if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0)
 	    rctx->b.render_condition = r600_render_condition;
 
-	LIST_INITHEAD(&rctx->active_nontimer_queries);
-	LIST_INITHEAD(&rctx->active_timer_queries);
+	LIST_INITHEAD(&rctx->active_queries);
 }
 
 void r600_init_screen_query_functions(struct r600_common_screen *rscreen)
diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h
index 8b2c4e3fe93..9f3a917d727 100644
--- a/src/gallium/drivers/radeon/r600_query.h
+++ b/src/gallium/drivers/radeon/r600_query.h
@@ -84,8 +84,7 @@ struct r600_query {
 
 enum {
 	R600_QUERY_HW_FLAG_NO_START = (1 << 0),
-	R600_QUERY_HW_FLAG_TIMER = (1 << 1),
-	R600_QUERY_HW_FLAG_PREDICATE = (1 << 2),
+	R600_QUERY_HW_FLAG_PREDICATE = (1 << 1),
 };
 
 struct r600_query_hw_ops {
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index e977ed9fa10..fc9ec4859f6 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -311,12 +311,6 @@ void r600_emit_streamout_end(struct r600_common_context *rctx)
  * are no buffers bound.
  */
 
-static bool r600_get_strmout_en(struct r600_common_context *rctx)
-{
-	return rctx->streamout.streamout_enabled ||
-	       rctx->streamout.prims_gen_query_enabled;
-}
-
 static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 				       struct r600_atom *atom)
 {
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 4850b73f291..72af5344b70 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -222,10 +222,6 @@ static int r600_setup_surface(struct pipe_screen *screen,
 		rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe;
 		rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override;
 		rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y;
-		if (rtex->surface.flags & RADEON_SURF_SBUFFER) {
-			rtex->surface.stencil_offset =
-			rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size;
-		}
 	}
 
 	if (offset) {
@@ -482,7 +478,7 @@ static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen,
 	r600_texture_get_fmask_info(rscreen, rtex,
 				    rtex->resource.b.b.nr_samples, &rtex->fmask);
 
-	rtex->fmask.offset = align(rtex->size, rtex->fmask.alignment);
+	rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment);
 	rtex->size = rtex->fmask.offset + rtex->fmask.size;
 }
 
@@ -585,7 +581,7 @@ static void r600_texture_allocate_cmask(struct r600_common_screen *rscreen,
 		r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask);
 	}
 
-	rtex->cmask.offset = align(rtex->size, rtex->cmask.alignment);
+	rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment);
 	rtex->size = rtex->cmask.offset + rtex->cmask.size;
 
 	if (rscreen->chip_class >= SI)
@@ -747,14 +743,14 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
 		(rtex->surface.flags & RADEON_SURF_SCANOUT) != 0);
 
 	if (rtex->fmask.size)
-		fprintf(f, "  FMask: offset=%u, size=%u, alignment=%u, pitch_in_pixels=%u, "
+		fprintf(f, "  FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, "
 			"bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n",
 			rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment,
 			rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height,
 			rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index);
 
 	if (rtex->cmask.size)
-		fprintf(f, "  CMask: offset=%u, size=%u, alignment=%u, pitch=%u, "
+		fprintf(f, "  CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, "
 			"height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n",
 			rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment,
 			rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign,
@@ -768,7 +764,7 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f)
 			rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign);
 
 	if (rtex->dcc_offset) {
-		fprintf(f, "  DCC: offset=%u, size=%"PRIu64", alignment=%"PRIu64"\n",
+		fprintf(f, "  DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n",
 			rtex->dcc_offset, rtex->surface.dcc_size,
 			rtex->surface.dcc_alignment);
 		for (i = 0; i <= rtex->surface.last_level; i++)
@@ -873,7 +869,7 @@ r600_texture_create_object(struct pipe_screen *screen,
 		if (!buf && rtex->surface.dcc_size &&
 		    !(rscreen->debug_flags & DBG_NO_DCC)) {
 			/* Reserve space for the DCC buffer. */
-			rtex->dcc_offset = align(rtex->size, rtex->surface.dcc_alignment);
+			rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment);
 			rtex->size = rtex->dcc_offset + rtex->surface.dcc_size;
 			rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1);
 		}
@@ -947,13 +943,12 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen,
 		force_tiling = true;
 
 	/* Handle common candidates for the linear mode.
-	 * Compressed textures must always be tiled. */
-	if (!force_tiling && !util_format_is_compressed(templ->format)) {
-		/* Not everything can be linear, so we cannot enforce it
-		 * for all textures. */
-		if ((rscreen->debug_flags & DBG_NO_TILING) &&
-		    (!util_format_is_depth_or_stencil(templ->format) ||
-		     !(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)))
+	 * Compressed textures and DB surfaces must always be tiled.
+	 */
+	if (!force_tiling && !util_format_is_compressed(templ->format) &&
+	    (!util_format_is_depth_or_stencil(templ->format) ||
+	     templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)) {
+		if (rscreen->debug_flags & DBG_NO_TILING)
 			return RADEON_SURF_MODE_LINEAR_ALIGNED;
 
 		/* Tiling doesn't work with the 422 (SUBSAMPLED) formats on R600+. */
diff --git a/src/gallium/drivers/radeon/r600_viewport.c b/src/gallium/drivers/radeon/r600_viewport.c
new file mode 100644
index 00000000000..ea558cd22de
--- /dev/null
+++ b/src/gallium/drivers/radeon/r600_viewport.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright 2012 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "r600_cs.h"
+#include "tgsi/tgsi_scan.h"
+
+#define GET_MAX_SCISSOR(rctx) ((rctx)->chip_class >= EVERGREEN ? 16384 : 8192) /* scissor clamp limit */
+
+/* pipe_context::set_scissor_states hook: store the states, mark them dirty. */
+static void r600_set_scissor_states(struct pipe_context *ctx,
+				    unsigned start_slot, unsigned num_scissors,
+				    const struct pipe_scissor_state *state)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	unsigned i; /* same type as num_scissors: no signed/unsigned compare */
+
+	for (i = 0; i < num_scissors; i++)
+		rctx->scissors.states[start_slot + i] = state[i];
+
+	if (!rctx->scissor_enabled)
+		return; /* stored states are not applied while scissoring is off */
+
+	rctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
+	rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
+}
+
+/* Derive the window-space rectangle a viewport covers. The guard band
+ * disables clipping, so this rectangle is later used to clip per-pixel.
+ */
+static void r600_get_scissor_from_viewport(struct r600_common_context *rctx,
+					   const struct pipe_viewport_state *vp,
+					   struct r600_signed_scissor *scissor)
+{
+	int swap;
+
+	/* Map the clip-space corners (-1, -1) and (1, 1) into window space. */
+	scissor->minx = vp->translate[0] - vp->scale[0];
+	scissor->miny = vp->translate[1] - vp->scale[1];
+	scissor->maxx = vp->translate[0] + vp->scale[0];
+	scissor->maxy = vp->translate[1] + vp->scale[1];
+
+	/* r600_draw_rectangle sets this. Disable the scissor. */
+	if (scissor->minx == -1 && scissor->maxx == 1 &&
+	    scissor->miny == -1 && scissor->maxy == 1) {
+		scissor->minx = scissor->miny = 0;
+		scissor->maxx = scissor->maxy = GET_MAX_SCISSOR(rctx);
+	}
+
+	/* An inverted viewport yields min > max; put the bounds back in order. */
+	if (scissor->maxx < scissor->minx) {
+		swap = scissor->maxx;
+		scissor->maxx = scissor->minx;
+		scissor->minx = swap;
+	}
+	if (scissor->maxy < scissor->miny) {
+		swap = scissor->maxy;
+		scissor->maxy = scissor->miny;
+		scissor->miny = swap;
+	}
+}
+
+static void r600_clamp_scissor(struct r600_common_context *rctx,
+			       struct pipe_scissor_state *out,
+			       struct r600_signed_scissor *scissor)
+{
+	unsigned limit = GET_MAX_SCISSOR(rctx); /* chip-dependent upper bound */
+	out->minx = CLAMP(scissor->minx, 0, limit); /* negatives become 0 */
+	out->miny = CLAMP(scissor->miny, 0, limit);
+	out->maxx = CLAMP(scissor->maxx, 0, limit);
+	out->maxy = CLAMP(scissor->maxy, 0, limit);
+}
+
+static void r600_clip_scissor(struct pipe_scissor_state *out,
+			      struct pipe_scissor_state *clip)
+{
+	out->minx = MAX2(clip->minx, out->minx); /* shrink "out" to fit "clip" */
+	out->miny = MAX2(clip->miny, out->miny);
+	out->maxx = MIN2(clip->maxx, out->maxx);
+	out->maxy = MIN2(clip->maxy, out->maxy);
+}
+
+static void r600_scissor_make_union(struct r600_signed_scissor *out,
+				    struct r600_signed_scissor *in)
+{
+	out->minx = MIN2(in->minx, out->minx); /* grow "out" to also cover "in" */
+	out->miny = MIN2(in->miny, out->miny);
+	out->maxx = MAX2(in->maxx, out->maxx);
+	out->maxy = MAX2(in->maxy, out->maxy);
+}
+
+/* Hardware scissor-bug workaround; only Evergreen and Cayman are touched. */
+void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx,
+					    struct pipe_scissor_state *scissor)
+{
+	if (rctx->chip_class != EVERGREEN && rctx->chip_class != CAYMAN)
+		return;
+
+	if (scissor->maxx == 0)
+		scissor->minx = 1;
+	if (scissor->maxy == 0)
+		scissor->miny = 1;
+	if (rctx->chip_class == CAYMAN && scissor->maxx == 1 && scissor->maxy == 1)
+		scissor->maxx = 2;
+}
+
+static void r600_emit_one_scissor(struct r600_common_context *rctx,
+				  struct radeon_winsys_cs *cs,
+				  struct r600_signed_scissor *vp_scissor,
+				  struct pipe_scissor_state *scissor)
+{
+	struct pipe_scissor_state final;
+
+	r600_clamp_scissor(rctx, &final, vp_scissor); /* start from the viewport rectangle */
+
+	if (scissor) /* callers in this file pass NULL while scissoring is disabled */
+		r600_clip_scissor(&final, scissor);
+
+	evergreen_apply_scissor_bug_workaround(rctx, &final);
+
+	radeon_emit(cs, S_028250_TL_X(final.minx) | /* first dword: min corner */
+			S_028250_TL_Y(final.miny) |
+			S_028250_WINDOW_OFFSET_DISABLE(1));
+	radeon_emit(cs, S_028254_BR_X(final.maxx) | /* second dword: max corner */
+			S_028254_BR_Y(final.maxy));
+}
+
+/* the range is [-MAX, MAX]; MAX is larger on Evergreen and newer chips */
+#define GET_MAX_VIEWPORT_RANGE(rctx) ((rctx)->chip_class >= EVERGREEN ? 32768 : 16384)
+
+static void r600_emit_guardband(struct r600_common_context *rctx,
+				struct r600_signed_scissor *vp_as_scissor)
+{
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct pipe_viewport_state vp; /* only translate[0..1] and scale[0..1] are filled in */
+	float left, top, right, bottom, max_range, guardband_x, guardband_y;
+
+	/* Reconstruct the viewport transformation from the scissor. */
+	vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0;
+	vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0;
+	vp.scale[0] = vp_as_scissor->maxx - vp.translate[0];
+	vp.scale[1] = vp_as_scissor->maxy - vp.translate[1];
+
+	/* Treat a 0x0 viewport as 1x1 to prevent division by zero. */
+	if (vp_as_scissor->minx == vp_as_scissor->maxx)
+		vp.scale[0] = 0.5;
+	if (vp_as_scissor->miny == vp_as_scissor->maxy)
+		vp.scale[1] = 0.5;
+
+	/* Find the biggest guard band that is inside the supported viewport
+	 * range. The guard band is specified as a horizontal and vertical
+	 * distance from (0,0) in clip space.
+	 *
+	 * This is done by applying the inverse viewport transformation
+	 * on the viewport limits to get those limits in clip space.
+	 *
+	 * Use a limit one pixel smaller to allow for some precision error.
+	 */
+	max_range = GET_MAX_VIEWPORT_RANGE(rctx) - 1;
+	left   = (-max_range - vp.translate[0]) / vp.scale[0];
+	right  = ( max_range - vp.translate[0]) / vp.scale[0];
+	top    = (-max_range - vp.translate[1]) / vp.scale[1];
+	bottom = ( max_range - vp.translate[1]) / vp.scale[1];
+
+	assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); /* the viewport must lie inside the range */
+
+	guardband_x = MIN2(-left, right); /* one distance per axis: use the nearer limit */
+	guardband_y = MIN2(-top, bottom);
+
+	/* If any of the GB registers is updated, all of them must be updated. */
+	if (rctx->chip_class >= CAYMAN)
+		radeon_set_context_reg_seq(cs, CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
+	else
+		radeon_set_context_reg_seq(cs, R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4);
+
+	radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */
+	radeon_emit(cs, fui(1.0));         /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */
+	radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */
+	radeon_emit(cs, fui(1.0));         /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */
+}
+
+/* Atom emit callback: write the dirty viewport scissors and the guard band. */
+static void r600_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom)
+{
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct pipe_scissor_state *states = rctx->scissors.states;
+	unsigned mask = rctx->scissors.dirty_mask;
+	bool scissor_enabled = rctx->scissor_enabled;
+	struct r600_signed_scissor max_vp_scissor;
+	int i;
+
+	/* The simple case: Only 1 viewport is active. */
+	if (!rctx->vs_writes_viewport_index) {
+		struct r600_signed_scissor *vp = &rctx->viewports.as_scissor[0];
+
+		if (!(mask & 1))
+			return;
+
+		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
+		r600_emit_one_scissor(rctx, cs, vp, scissor_enabled ? &states[0] : NULL);
+		r600_emit_guardband(rctx, vp);
+		rctx->scissors.dirty_mask &= ~1; /* clear one bit */
+		return;
+	}
+
+	/* Shaders can draw to any viewport. Make a union of all viewports. */
+	max_vp_scissor = rctx->viewports.as_scissor[0];
+	for (i = 1; i < R600_MAX_VIEWPORTS; i++)
+		r600_scissor_make_union(&max_vp_scissor, &rctx->viewports.as_scissor[i]);
+
+	while (mask) {
+		int start, count; /* reuse the function-level "i" instead of shadowing it */
+
+		u_bit_scan_consecutive_range(&mask, &start, &count);
+
+		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL +
+					       start * 4 * 2, count * 2);
+		for (i = start; i < start+count; i++) {
+			r600_emit_one_scissor(rctx, cs, &rctx->viewports.as_scissor[i],
+					      scissor_enabled ? &states[i] : NULL);
+		}
+	}
+	r600_emit_guardband(rctx, &max_vp_scissor);
+	rctx->scissors.dirty_mask = 0;
+}
+
+/* pipe_context::set_viewport_states hook: store viewports and their scissors. */
+static void r600_set_viewport_states(struct pipe_context *ctx,
+				     unsigned start_slot, unsigned num_viewports,
+				     const struct pipe_viewport_state *state)
+{
+	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
+	unsigned i; /* same type as num_viewports: no signed/unsigned compare */
+
+	for (i = 0; i < num_viewports; i++) {
+		unsigned index = start_slot + i;
+
+		rctx->viewports.states[index] = state[i];
+		r600_get_scissor_from_viewport(rctx, &state[i],
+					       &rctx->viewports.as_scissor[index]);
+	}
+
+	rctx->viewports.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
+	rctx->scissors.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
+	rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true);
+	rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
+}
+
+static void r600_emit_viewports(struct r600_common_context *rctx, struct r600_atom *atom)
+{
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
+	struct pipe_viewport_state *states = rctx->viewports.states;
+	unsigned mask = rctx->viewports.dirty_mask;
+
+	/* The simple case: Only 1 viewport is active. */
+	if (!rctx->vs_writes_viewport_index) {
+		if (!(mask & 1))
+			return;
+
+		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); /* 6 dwords: scale, translate for x, y, z */
+		radeon_emit(cs, fui(states[0].scale[0]));
+		radeon_emit(cs, fui(states[0].translate[0]));
+		radeon_emit(cs, fui(states[0].scale[1]));
+		radeon_emit(cs, fui(states[0].translate[1]));
+		radeon_emit(cs, fui(states[0].scale[2]));
+		radeon_emit(cs, fui(states[0].translate[2]));
+		rctx->viewports.dirty_mask &= ~1; /* clear one bit */
+		return;
+	}
+
+	while (mask) {
+		int start, count, i;
+
+		u_bit_scan_consecutive_range(&mask, &start, &count); /* next run of adjacent dirty slots */
+
+		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
+					       start * 4 * 6, count * 6); /* 6 regs per viewport; x4 is presumably bytes per reg */
+		for (i = start; i < start+count; i++) {
+			radeon_emit(cs, fui(states[i].scale[0]));
+			radeon_emit(cs, fui(states[i].translate[0]));
+			radeon_emit(cs, fui(states[i].scale[1]));
+			radeon_emit(cs, fui(states[i].translate[1]));
+			radeon_emit(cs, fui(states[i].scale[2]));
+			radeon_emit(cs, fui(states[i].translate[2]));
+		}
+	}
+	rctx->viewports.dirty_mask = 0;
+}
+
+void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable)
+{
+	if (rctx->scissor_enabled == enable)
+		return; /* no change: nothing to re-emit */
+	rctx->scissor_enabled = enable;
+	rctx->scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; /* all slots */
+	rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
+}
+
+/**
+ * While no shader writes the VIEWPORT_INDEX output, only viewport 0 and
+ * scissor 0 are emitted and the remaining slots stay flagged as dirty.
+ * Call this when the shader changes: once a shader that writes
+ * VIEWPORT_INDEX is bound, the deferred slots are scheduled for emission.
+ */
+void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx,
+					  struct tgsi_shader_info *info)
+{
+	if (!info)
+		return;
+
+	rctx->vs_writes_viewport_index = info->writes_viewport_index;
+	if (rctx->vs_writes_viewport_index) {
+		/* Re-arm the atoms for whatever slots are still pending. */
+		if (rctx->scissors.dirty_mask)
+			rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true);
+		if (rctx->viewports.dirty_mask)
+			rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true);
+	}
+}
+
+void r600_init_viewport_functions(struct r600_common_context *rctx)
+{
+	rctx->scissors.atom.emit = r600_emit_scissors;
+	rctx->viewports.atom.emit = r600_emit_viewports;
+
+	rctx->scissors.atom.num_dw = (2 + 16 * 2) + 6; /* 16 scissors x 2 dw, plus 6 for the guard band; leading 2 presumably the reg-write header */
+	rctx->viewports.atom.num_dw = 2 + 16 * 6; /* 16 viewports x 6 dw; 16 presumably R600_MAX_VIEWPORTS - confirm */
+
+	rctx->b.set_scissor_states = r600_set_scissor_states;
+	rctx->b.set_viewport_states = r600_set_viewport_states;
+}
diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index eeec6ef7385..c8cb5e217e8 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -220,4 +220,25 @@
 /*CIK+*/
 #define R_0300FC_CP_STRMOUT_CNTL		     0x0300FC
 
+#define R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ         0x028C0C /* guard band base when chip_class < CAYMAN */
+#define CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ           0x28be8 /* guard band base when chip_class >= CAYMAN */
+#define R_02843C_PA_CL_VPORT_XSCALE                  0x02843C /* first of 6 transform registers per viewport */
+
+#define R_028250_PA_SC_VPORT_SCISSOR_0_TL                               0x028250 /* S_ = set field, G_ = get field, C_ = mask with the field cleared */
+#define   S_028250_TL_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028250_TL_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028250_TL_X                                               0xFFFF8000
+#define   S_028250_TL_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028250_TL_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028250_TL_Y                                               0x8000FFFF
+#define   S_028250_WINDOW_OFFSET_DISABLE(x)                           (((unsigned)(x) & 0x1) << 31) /* unsigned: 1 << 31 overflows int */
+#define   G_028250_WINDOW_OFFSET_DISABLE(x)                           (((x) >> 31) & 0x1)
+#define   C_028250_WINDOW_OFFSET_DISABLE                              0x7FFFFFFF
+#define   S_028254_BR_X(x)                                            (((x) & 0x7FFF) << 0)
+#define   G_028254_BR_X(x)                                            (((x) >> 0) & 0x7FFF)
+#define   C_028254_BR_X                                               0xFFFF8000
+#define   S_028254_BR_Y(x)                                            (((x) & 0x7FFF) << 16)
+#define   G_028254_BR_Y(x)                                            (((x) >> 16) & 0x7FFF)
+#define   C_028254_BR_Y                                               0x8000FFFF
+
 #endif
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 233f46091a4..098baf20797 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -1003,7 +1003,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 
 	dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
 	dec->msg->body.decode.bsd_size = bs_size;
-	dec->msg->body.decode.db_pitch = dec->base.width;
+	dec->msg->body.decode.db_pitch = align(dec->base.width, 16);
 
 	dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target);
 	if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY)
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index baecca72383..0c03652081c 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -245,6 +245,7 @@ struct radeon_info {
     enum chip_class             chip_class;
     uint64_t                    gart_size;
     uint64_t                    vram_size;
+    bool                        has_dedicated_vram;
     boolean                     has_virtual_memory;
     bool                        gfx_ib_pad_with_type2;
     boolean                     has_sdma;
@@ -449,7 +450,7 @@ struct radeon_winsys {
      * \return          The created buffer object.
      */
     struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws,
-                                       unsigned size,
+                                       uint64_t size,
                                        unsigned alignment,
                                        boolean use_reusable_pool,
                                        enum radeon_bo_domain domain,
@@ -528,7 +529,7 @@ struct radeon_winsys {
      * \param Size      Size in bytes for the new buffer.
      */
     struct pb_buffer *(*buffer_from_ptr)(struct radeon_winsys *ws,
-                                         void *pointer, unsigned size);
+                                         void *pointer, uint64_t size);
 
     /**
      * Whether the buffer was created from a user pointer.
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index c5ea8b17119..54da7a20203 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -52,8 +52,6 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	r600_suspend_nontimer_queries(&sctx->b);
-
 	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
 	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
 	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
@@ -70,8 +68,8 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 		util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
 		util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
 		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
-		util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
-		util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
+		util_blitter_save_viewport(sctx->blitter, &sctx->b.viewports.states[0]);
+		util_blitter_save_scissor(sctx->blitter, &sctx->b.scissors.states[0]);
 	}
 
 	if (op & SI_SAVE_FRAMEBUFFER)
@@ -95,7 +93,6 @@ static void si_blitter_end(struct pipe_context *ctx)
 	struct si_context *sctx = (struct si_context *)ctx;
 
 	sctx->b.render_cond_force_off = false;
-	r600_resume_nontimer_queries(&sctx->b);
 }
 
 static unsigned u_max_sample(struct pipe_resource *r)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index dc62415823e..001ddd4bfae 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -124,7 +124,7 @@ static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
 
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
 			      struct pipe_resource *src, unsigned byte_count,
-			      unsigned remaining_size, unsigned *flags)
+			      uint64_t remaining_size, unsigned *flags)
 {
 	si_need_cs_space(sctx);
 
@@ -158,7 +158,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
 #define CP_DMA_MAX_BYTE_COUNT	((1 << 21) - CP_DMA_ALIGNMENT)
 
 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
-			    unsigned offset, unsigned size, unsigned value,
+			    uint64_t offset, uint64_t size, unsigned value,
 			    bool is_framebuffer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
@@ -180,7 +180,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 						      sctx->b.gfx.cs,
 						      PIPE_TRANSFER_WRITE);
 		map += offset;
-		for (unsigned i = 0; i < size; i++) {
+		for (uint64_t i = 0; i < size; i++) {
 			unsigned byte_within_dword = (offset + i) % 4;
 			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
 		}
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 6dd2e4fd89d..b5557d800c7 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -746,6 +746,55 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	buffers->desc.list_dirty = true;
 }
 
+/* SHADER BUFFERS */
+
+/* Bind or unbind shader buffers for slots [start_slot, start_slot + count). */
+static void si_set_shader_buffers(struct pipe_context *ctx, unsigned shader,
+				  unsigned start_slot, unsigned count,
+				  struct pipe_shader_buffer *sbuffers)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_buffer_resources *buffers = &sctx->shader_buffers[shader];
+	unsigned n;
+
+	assert(start_slot + count <= SI_NUM_SHADER_BUFFERS);
+
+	for (n = 0; n < count; ++n) {
+		struct pipe_shader_buffer *sb = sbuffers ? &sbuffers[n] : NULL;
+		struct r600_resource *res;
+		unsigned slot = start_slot + n;
+		uint32_t *desc = buffers->desc.list + slot * 4;
+		uint64_t va;
+
+		if (!sb || !sb->buffer) {
+			pipe_resource_reference(&buffers->buffers[slot], NULL);
+			memset(desc, 0, 4 * sizeof(*desc));
+			buffers->desc.enabled_mask &= ~(1llu << slot);
+			continue; /* slot is now unbound */
+		}
+
+		res = (struct r600_resource *)sb->buffer;
+		va = res->gpu_address + sb->buffer_offset;
+
+		desc[0] = va; /* low 32 bits; the high bits go into desc[1] */
+		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(0);
+		desc[2] = sb->buffer_size;
+		desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+			  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+		pipe_resource_reference(&buffers->buffers[slot], &res->b.b);
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, res,
+				      buffers->shader_usage, buffers->priority);
+		buffers->desc.enabled_mask |= 1llu << slot;
+	}
+
+	buffers->desc.list_dirty = true;
+}
+
 /* RING BUFFERS */
 
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
@@ -883,6 +932,12 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 				 SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
 
+	/* All readers of the streamout targets need to be finished before we can
+	 * start writing to the targets.
+	 */
+	if (num_targets)
+		sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
 	/* Streamout buffers must be bound in 2 places:
 	 * 1) in VGT by setting the VGT_STRMOUT registers
 	 * 2) as shader resources
@@ -977,6 +1032,30 @@ void si_update_compressed_colortex_masks(struct si_context *sctx)
 
 /* BUFFER DISCARD/INVALIDATION */
 
+/** After \p buf has been invalidated, reset each enabled descriptor bound to it. */
+static void si_reset_buffer_resources(struct si_context *sctx,
+				      struct si_buffer_resources *buffers,
+				      struct pipe_resource *buf,
+				      uint64_t old_va)
+{
+	uint64_t pending = buffers->desc.enabled_mask;
+
+	while (pending) {
+		unsigned slot = u_bit_scan64(&pending);
+		if (buffers->buffers[slot] != buf)
+			continue;
+		si_desc_reset_buffer_offset(&sctx->b.b,
+					    buffers->desc.list + slot * 4,
+					    old_va, buf);
+		buffers->desc.list_dirty = true;
+
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  (struct r600_resource *)buf,
+					  buffers->shader_usage,
+					  buffers->priority);
+	}
+}
+
 /* Reallocate a buffer a update all resource bindings where the buffer is
  * bound.
  *
@@ -1048,23 +1127,12 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 		}
 	}
 
-	/* Constant buffers. */
+	/* Constant and shader buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
-		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
-		uint64_t mask = buffers->desc.enabled_mask;
-
-		while (mask) {
-			unsigned i = u_bit_scan64(&mask);
-			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
-							    old_va, buf);
-				buffers->desc.list_dirty = true;
-
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
-						      rbuffer, buffers->shader_usage,
-						      buffers->priority);
-			}
-		}
+		si_reset_buffer_resources(sctx, &sctx->const_buffers[shader],
+					  buf, old_va);
+		si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader],
+					  buf, old_va);
 	}
 
 	/* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1244,6 +1312,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 			si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
 
 		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->shader_buffers[i].desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false);
 	}
@@ -1263,6 +1332,9 @@ void si_init_all_descriptors(struct si_context *sctx)
 		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
+		si_init_buffer_resources(&sctx->shader_buffers[i],
+					 SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS,
+					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
 				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
@@ -1280,6 +1352,7 @@ void si_init_all_descriptors(struct si_context *sctx)
 	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
 	sctx->b.b.set_shader_images = si_set_shader_images;
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
+	sctx->b.b.set_shader_buffers = si_set_shader_buffers;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
 	sctx->b.invalidate_buffer = si_invalidate_buffer;
@@ -1302,6 +1375,7 @@ bool si_upload_shader_descriptors(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
 		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->shader_buffers[i].desc) ||
 		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
 		    !si_upload_descriptors(sctx, &sctx->images[i].desc))
 			return false;
@@ -1316,6 +1390,7 @@ void si_release_all_descriptors(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_release_buffer_resources(&sctx->const_buffers[i]);
 		si_release_buffer_resources(&sctx->rw_buffers[i]);
+		si_release_buffer_resources(&sctx->shader_buffers[i]);
 		si_release_sampler_views(&sctx->samplers[i].views);
 		si_release_image_views(&sctx->images[i]);
 	}
@@ -1329,6 +1404,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
 		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
+		si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]);
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
 		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
 	}
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 8c900a4ecb6..b621b55abd3 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -155,7 +155,8 @@ void si_begin_new_cs(struct si_context *ctx)
 			SI_CONTEXT_INV_VMEM_L1 |
 			SI_CONTEXT_INV_GLOBAL_L2 |
 			SI_CONTEXT_INV_SMEM_L1 |
-			SI_CONTEXT_INV_ICACHE;
+			SI_CONTEXT_INV_ICACHE |
+			R600_CONTEXT_START_PIPELINE_STATS;
 
 	/* set all valid group as dirty so they get reemited on
 	 * next draw command
@@ -185,10 +186,10 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
-	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-	ctx->viewports.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-	si_mark_atom_dirty(ctx, &ctx->scissors.atom);
-	si_mark_atom_dirty(ctx, &ctx->viewports.atom);
+	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
+	si_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
+	si_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
 
 	r600_postflush_resume_features(&ctx->b);
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 41bb84d68df..6a990ed64c3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -308,6 +308,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_QUERY_MEMORY_INFO:
 	case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
 	case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+	case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -332,9 +333,12 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
 	case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
 		return 4;
+	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+		return HAVE_LLVM >= 0x0309 ? 4 : 0;
 
 	case PIPE_CAP_GLSL_FEATURE_LEVEL:
-		return HAVE_LLVM >= 0x0307 ? 410 : 330;
+		return HAVE_LLVM >= 0x0309 ? 420 :
+		       HAVE_LLVM >= 0x0307 ? 410 : 330;
 
 	case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE:
 		return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF);
@@ -353,7 +357,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_DRAW_PARAMETERS:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT:
 	case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
-	case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
 	case PIPE_CAP_GENERATE_MIPMAP:
 	case PIPE_CAP_STRING_MARKER:
 	case PIPE_CAP_QUERY_BUFFER_OBJECT:
@@ -401,7 +404,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 		return 8;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
-		return SI_MAX_VIEWPORTS;
+		return R600_MAX_VIEWPORTS;
 
 	/* Timer queries, present when the clock frequency is non zero. */
 	case PIPE_CAP_QUERY_TIMESTAMP:
@@ -539,7 +542,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-		return 0;
+		return HAVE_LLVM >= 0x0309 ? SI_NUM_SHADER_BUFFERS : 0;
 	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 		return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0;
 	}
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 4158fc5461e..0398b1df61e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -76,7 +76,6 @@
 #define SI_IS_TRACE_POINT(x)		(((x) & 0xcafe0000) == 0xcafe0000)
 #define SI_GET_TRACE_POINT_ID(x)	((x) & 0xffff)
 
-#define SI_MAX_VIEWPORTS	16
 #define SI_MAX_BORDER_COLORS	4096
 
 struct si_compute;
@@ -173,18 +172,6 @@ struct si_sample_mask {
 	uint16_t		sample_mask;
 };
 
-struct si_scissors {
-	struct r600_atom		atom;
-	unsigned			dirty_mask;
-	struct pipe_scissor_state	states[SI_MAX_VIEWPORTS];
-};
-
-struct si_viewports {
-	struct r600_atom		atom;
-	unsigned			dirty_mask;
-	struct pipe_viewport_state	states[SI_MAX_VIEWPORTS];
-};
-
 /* A shader state consists of the shader selector, which is a constant state
  * object shared by multiple contexts and shouldn't be modified, and
  * the current shader variant selected for this context.
@@ -228,8 +215,6 @@ struct si_context {
 	struct r600_atom		clip_regs;
 	struct si_clip_state		clip_state;
 	struct si_shader_data		shader_userdata;
-	struct si_scissors		scissors;
-	struct si_viewports		viewports;
 	struct si_stencil_ref		stencil_ref;
 	struct r600_atom		spi_map;
 
@@ -256,6 +241,7 @@ struct si_context {
 	struct si_descriptors		vertex_buffers;
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
 	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
+	struct si_buffer_resources	shader_buffers[SI_NUM_SHADERS];
 	struct si_textures_info		samplers[SI_NUM_SHADERS];
 	struct si_images_info		images[SI_NUM_SHADERS];
 
@@ -289,6 +275,7 @@ struct si_context {
 	bool			db_stencil_clear;
 	bool			db_stencil_disable_expclear;
 	unsigned		ps_db_shader_control;
+	bool			occlusion_queries_disabled;
 
 	/* Emitted draw state. */
 	int			last_base_vertex;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 08da3e37550..c58467ddcb0 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -98,6 +98,7 @@ struct si_shader_context
 	LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS];
 	LLVMValueRef lds;
 	LLVMValueRef *constants[SI_NUM_CONST_BUFFERS];
+	LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS];
 	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
 	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
 	LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS];
@@ -2775,6 +2776,24 @@ static void membar_emit(
 	emit_optimization_barrier(ctx);
 }
 
+/* Return the descriptor of the shader buffer addressed by \p reg. */
+static LLVMValueRef
+shader_buffer_fetch_rsrc(struct si_shader_context *ctx,
+			 const struct tgsi_full_src_register *reg)
+{
+	if (reg->Register.Indirect) {
+		/* Indirect: index the descriptor array parameter at run time. */
+		LLVMValueRef slot = get_bounded_indirect_index(
+			ctx, &reg->Indirect, reg->Register.Index,
+			SI_NUM_SHADER_BUFFERS);
+		LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+						 SI_PARAM_SHADER_BUFFERS);
+
+		return build_indexed_load_const(ctx, list, slot);
+	}
+	return ctx->shader_buffers[reg->Register.Index];
+}
+
 static bool tgsi_is_array_sampler(unsigned target)
 {
 	return target == TGSI_TEXTURE_1D_ARRAY ||
@@ -2924,32 +2943,46 @@ static void image_append_args(
 }
 
 /**
+ * Given a 256 bit resource, extract the top half (which stores the buffer
+ * resource in the case of textures and images).
+ */
+static LLVMValueRef extract_rsrc_top_half(
+		struct si_shader_context *ctx,
+		LLVMValueRef rsrc)
+{
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+	LLVMValueRef one = ctx->radeon_bld.soa.bld_base.uint_bld.one;
+	LLVMValueRef halves, top;
+
+	/* View as two i128 elements, pick element "one", return as v4i32. */
+	halves = LLVMBuildBitCast(builder, rsrc,
+				  LLVMVectorType(ctx->i128, 2), "");
+	top = LLVMBuildExtractElement(builder, halves, one, "");
+	return LLVMBuildBitCast(builder, top, ctx->v4i32, "");
+}
+
+/**
  * Append the resource and indexing arguments for buffer intrinsics.
  *
- * \param rsrc the 256 bit resource
- * \param index index into the buffer
+ * \param rsrc the v4i32 buffer resource
+ * \param index index into the buffer (stride-based)
+ * \param offset byte offset into the buffer
  */
 static void buffer_append_args(
 		struct si_shader_context *ctx,
 		struct lp_build_emit_data *emit_data,
 		LLVMValueRef rsrc,
 		LLVMValueRef index,
+		LLVMValueRef offset,
 		bool atomic)
 {
-	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
-	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
-	LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
 	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
 	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
 
-	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
-	rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
-	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
-
 	emit_data->args[emit_data->arg_count++] = rsrc;
 	emit_data->args[emit_data->arg_count++] = index; /* vindex */
-	emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */
+	emit_data->args[emit_data->arg_count++] = offset; /* voffset */
 	if (!atomic) {
 		emit_data->args[emit_data->arg_count++] =
 			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
@@ -2966,24 +2999,73 @@ static void load_fetch_args(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
 	unsigned target = inst->Memory.Texture;
-	LLVMValueRef coords;
 	LLVMValueRef rsrc;
 
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 
-	image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
-	coords = image_fetch_coords(bld_base, inst, 1);
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		LLVMBuilderRef builder = gallivm->builder;
+		LLVMValueRef offset;
+		LLVMValueRef tmp;
 
-	if (target == TGSI_TEXTURE_BUFFER) {
-		buffer_append_args(ctx, emit_data, rsrc, coords, false);
+		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
+
+		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+
+		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
+				   offset, false);
 	} else {
-		emit_data->args[0] = coords;
-		emit_data->args[1] = rsrc;
-		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
-		emit_data->arg_count = 3;
+		LLVMValueRef coords;
 
-		image_append_args(ctx, emit_data, target, false);
+		image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
+		coords = image_fetch_coords(bld_base, inst, 1);
+
+		if (target == TGSI_TEXTURE_BUFFER) {
+			rsrc = extract_rsrc_top_half(ctx, rsrc);
+			buffer_append_args(ctx, emit_data, rsrc, coords,
+					bld_base->uint_bld.zero, false);
+		} else {
+			emit_data->args[0] = coords;
+			emit_data->args[1] = rsrc;
+			emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+			emit_data->arg_count = 3;
+
+			image_append_args(ctx, emit_data, target, false);
+		}
+	}
+}
+
+/* Emit a raw buffer load whose width (f32, v2f32 or v4f32) is chosen from
+ * util_last_bit() of the destination write mask; uses emit_data->args as-is. */
+static void load_emit_buffer(struct si_shader_context *ctx,
+			     struct lp_build_emit_data *emit_data)
+{
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+	uint count = util_last_bit(inst->Dst[0].Register.WriteMask);
+	const char *intrinsic_name;
+	LLVMTypeRef dst_type;
+
+	switch (count) {
+	case 1:
+		intrinsic_name = "llvm.amdgcn.buffer.load.f32";
+		dst_type = ctx->f32;
+		break;
+	case 2:
+		intrinsic_name = "llvm.amdgcn.buffer.load.v2f32";
+		dst_type = LLVMVectorType(ctx->f32, 2);
+		break;
+	default: /* 3 & 4 */
+		intrinsic_name = "llvm.amdgcn.buffer.load.v4f32";
+		dst_type = ctx->v4f32;
+		break;
 	}
+
+	emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, intrinsic_name, dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
 }
 
 static void load_emit(
@@ -2995,18 +3077,23 @@ static void load_emit(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
-	unsigned target = inst->Memory.Texture;
 	char intrinsic_name[32];
 	char coords_type[8];
 
 	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
 		emit_optimization_barrier(ctx);
 
-	if (target == TGSI_TEXTURE_BUFFER) {
-		emit_data->output[emit_data->chan] = lp_build_intrinsic(
-			builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
-			emit_data->args, emit_data->arg_count,
-			LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		load_emit_buffer(ctx, emit_data);
+		return;
+	}
+
+	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
 	} else {
 		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
 				    coords_type, sizeof(coords_type));
@@ -3028,39 +3115,129 @@ static void store_fetch_args(
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
-	struct tgsi_full_src_register image;
-	unsigned target = inst->Memory.Texture;
+	struct tgsi_full_src_register memory;
 	LLVMValueRef chans[4];
 	LLVMValueRef data;
-	LLVMValueRef coords;
 	LLVMValueRef rsrc;
 	unsigned chan;
 
 	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
 
-	image = tgsi_full_src_register_from_dst(&inst->Dst[0]);
-	coords = image_fetch_coords(bld_base, inst, 0);
-
 	for (chan = 0; chan < 4; ++chan) {
 		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
 	}
 	data = lp_build_gather_values(gallivm, chans, 4);
 
-	if (target == TGSI_TEXTURE_BUFFER) {
-		image_fetch_rsrc(bld_base, &image, false, &rsrc);
-		emit_data->args[0] = data;
-		emit_data->arg_count = 1;
+	emit_data->args[emit_data->arg_count++] = data;
+
+	memory = tgsi_full_src_register_from_dst(&inst->Dst[0]);
 
-		buffer_append_args(ctx, emit_data, rsrc, coords, false);
+	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
+		LLVMValueRef offset;
+		LLVMValueRef tmp;
+
+		rsrc = shader_buffer_fetch_rsrc(ctx, &memory);
+
+		tmp = lp_build_emit_fetch(bld_base, inst, 0, 0);
+		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+
+		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
+				   offset, false);
 	} else {
+		unsigned target = inst->Memory.Texture;
+		LLVMValueRef coords;
+
+		coords = image_fetch_coords(bld_base, inst, 0);
+
+		if (target == TGSI_TEXTURE_BUFFER) {
+			image_fetch_rsrc(bld_base, &memory, false, &rsrc);
+
+			rsrc = extract_rsrc_top_half(ctx, rsrc);
+			buffer_append_args(ctx, emit_data, rsrc, coords,
+					bld_base->uint_bld.zero, false);
+		} else {
+			emit_data->args[1] = coords;
+			image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]);
+			emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
+			emit_data->arg_count = 4;
+
+			image_append_args(ctx, emit_data, target, false);
+		}
+	}
+}
+
+static void store_emit_buffer(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data)
+{
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld;
+	LLVMValueRef base_data = emit_data->args[0]; /* store data (first argument) */
+	LLVMValueRef base_offset = emit_data->args[3]; /* voffset argument */
+	unsigned writemask = inst->Dst[0].Register.WriteMask;
+
+	while (writemask) { /* one store per range of enabled channels */
+		int start, count;
+		const char *intrinsic_name;
+		LLVMValueRef data;
+		LLVMValueRef offset;
+		LLVMValueRef tmp;
+
+		u_bit_scan_consecutive_range(&writemask, &start, &count);
+
+		/* Due to an LLVM limitation, split 3-element writes
+		 * into a 2-element and a 1-element write. */
+		if (count == 3) {
+			writemask |= 1 << (start + 2); /* re-queue the third channel */
+			count = 2;
+		}
+
+		if (count == 4) {
+			data = base_data;
+			intrinsic_name = "llvm.amdgcn.buffer.store.v4f32";
+		} else if (count == 2) {
+			LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2);
+
+			tmp = LLVMBuildExtractElement(
+				builder, base_data,
+				lp_build_const_int32(gallivm, start), "");
+			data = LLVMBuildInsertElement(
+				builder, LLVMGetUndef(v2f32), tmp,
+				uint_bld->zero, "");
+
+			tmp = LLVMBuildExtractElement(
+				builder, base_data,
+				lp_build_const_int32(gallivm, start + 1), "");
+			data = LLVMBuildInsertElement(
+				builder, data, tmp, uint_bld->one, "");
+
+			intrinsic_name = "llvm.amdgcn.buffer.store.v2f32";
+		} else {
+			assert(count == 1);
+			data = LLVMBuildExtractElement(
+				builder, base_data,
+				lp_build_const_int32(gallivm, start), "");
+			intrinsic_name = "llvm.amdgcn.buffer.store.f32";
+		}
+
+		offset = base_offset;
+		if (start != 0) {
+			offset = LLVMBuildAdd(
+				builder, offset,
+				lp_build_const_int32(gallivm, start * 4), ""); /* 4 bytes per channel */
+		}
+
 		emit_data->args[0] = data;
-		emit_data->args[1] = coords;
-		image_fetch_rsrc(bld_base, &image, true, &emit_data->args[2]);
-		emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
-		emit_data->arg_count = 4;
+		emit_data->args[3] = offset; /* voffset for this range */
 
-		image_append_args(ctx, emit_data, target, false);
+		lp_build_intrinsic(
+			builder, intrinsic_name, emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMNoUnwindAttribute);
 	}
 }
 
@@ -3076,6 +3253,11 @@ static void store_emit(
 	char intrinsic_name[32];
 	char coords_type[8];
 
+	if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) {
+		store_emit_buffer(si_shader_context(bld_base), emit_data);
+		return;
+	}
+
 	if (target == TGSI_TEXTURE_BUFFER) {
 		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			builder, "llvm.amdgcn.buffer.store.format.v4f32",
@@ -3103,18 +3285,12 @@ static void atomic_fetch_args(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
-	unsigned target = inst->Memory.Texture;
 	LLVMValueRef data1, data2;
-	LLVMValueRef coords;
 	LLVMValueRef rsrc;
 	LLVMValueRef tmp;
 
 	emit_data->dst_type = bld_base->base.elem_type;
 
-	image_fetch_rsrc(bld_base, &inst->Src[0], target != TGSI_TEXTURE_BUFFER,
-			 &rsrc);
-	coords = image_fetch_coords(bld_base, inst, 1);
-
 	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
 	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
 
@@ -3130,13 +3306,34 @@ static void atomic_fetch_args(
 		emit_data->args[emit_data->arg_count++] = data2;
 	emit_data->args[emit_data->arg_count++] = data1;
 
-	if (target == TGSI_TEXTURE_BUFFER) {
-		buffer_append_args(ctx, emit_data, rsrc, coords, true);
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		LLVMValueRef offset;
+
+		rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]);
+
+		tmp = lp_build_emit_fetch(bld_base, inst, 1, 0);
+		offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+
+		buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero,
+				   offset, true);
 	} else {
-		emit_data->args[emit_data->arg_count++] = coords;
-		emit_data->args[emit_data->arg_count++] = rsrc;
+		unsigned target = inst->Memory.Texture;
+		LLVMValueRef coords;
+
+		image_fetch_rsrc(bld_base, &inst->Src[0],
+				 target != TGSI_TEXTURE_BUFFER, &rsrc);
+		coords = image_fetch_coords(bld_base, inst, 1);
+
+		if (target == TGSI_TEXTURE_BUFFER) {
+			rsrc = extract_rsrc_top_half(ctx, rsrc);
+			buffer_append_args(ctx, emit_data, rsrc, coords,
+					   bld_base->uint_bld.zero, true);
+		} else {
+			emit_data->args[emit_data->arg_count++] = coords;
+			emit_data->args[emit_data->arg_count++] = rsrc;
 
-		image_append_args(ctx, emit_data, target, true);
+			image_append_args(ctx, emit_data, target, true);
+		}
 	}
 }
 
@@ -3148,11 +3345,11 @@ static void atomic_emit(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
-	unsigned target = inst->Memory.Texture;
 	char intrinsic_name[40];
 	LLVMValueRef tmp;
 
-	if (target == TGSI_TEXTURE_BUFFER) {
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
+	    inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 		snprintf(intrinsic_name, sizeof(intrinsic_name),
 			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
 	} else {
@@ -3177,14 +3374,17 @@ static void resq_fetch_args(
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data)
 {
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
 	const struct tgsi_full_src_register *reg = &inst->Src[0];
-	unsigned tex_target = inst->Memory.Texture;
 
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 
-	if (tex_target == TGSI_TEXTURE_BUFFER) {
+	if (reg->Register.File == TGSI_FILE_BUFFER) {
+		emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg);
+		emit_data->arg_count = 1;
+	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
 		emit_data->arg_count = 1;
 	} else {
@@ -3193,7 +3393,7 @@ static void resq_fetch_args(
 		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
 		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
 		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
-		emit_data->args[5] = tgsi_is_array_image(tex_target) ?
+		emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ?
 			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
 		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
 		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
@@ -3211,10 +3411,12 @@ static void resq_emit(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	const struct tgsi_full_instruction *inst = emit_data->inst;
-	unsigned target = inst->Memory.Texture;
 	LLVMValueRef out;
 
-	if (target == TGSI_TEXTURE_BUFFER) {
+	if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
+		out = LLVMBuildExtractElement(builder, emit_data->args[0],
+					      lp_build_const_int32(gallivm, 2), "");
+	} else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
 		out = get_buffer_size(bld_base, emit_data->args[0]);
 	} else {
 		out = lp_build_intrinsic(
@@ -3223,7 +3425,7 @@ static void resq_emit(
 			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
 
 		/* Divide the number of layers by 6 to get the number of cubes. */
-		if (target == TGSI_TEXTURE_CUBE_ARRAY) {
+		if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) {
 			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
 			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
 
@@ -3339,6 +3541,35 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx,
 	return get_sampler_desc_custom(ctx, list, index, type);
 }
 
+/* Disable anisotropic filtering when BASE_LEVEL == LAST_LEVEL.
+ *
+ * SI-CI: the shader has to do this manually. The driver sets img7 to a
+ *   mask that clears MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL, and the
+ *   shader must apply it:
+ *     s_and_b32 samp0, samp0, img7
+ *
+ * VI: the ANISO_OVERRIDE sampler field enables this fix in TA, so the
+ *   sampler is returned unchanged.
+ */
+static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx,
+					   LLVMValueRef res, LLVMValueRef samp)
+{
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+
+	if (ctx->screen->b.chip_class < VI) {
+		LLVMValueRef idx0 = LLVMConstInt(ctx->i32, 0, 0);
+		LLVMValueRef mask, word0;
+
+		/* samp[0] &= res[7] */
+		mask = LLVMBuildExtractElement(builder, res,
+					       LLVMConstInt(ctx->i32, 7, 0), "");
+		word0 = LLVMBuildExtractElement(builder, samp, idx0, "");
+		word0 = LLVMBuildAnd(builder, word0, mask, "");
+		samp = LLVMBuildInsertElement(builder, samp, word0, idx0, "");
+	}
+	return samp;
+}
+
 static void tex_fetch_ptrs(
 	struct lp_build_tgsi_context *bld_base,
 	struct lp_build_emit_data *emit_data,
@@ -3370,6 +3601,7 @@ static void tex_fetch_ptrs(
 			*fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK);
 		} else {
 			*samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER);
+			*samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
 			*fmask_ptr = NULL;
 		}
 	} else {
@@ -4420,7 +4652,8 @@ static void create_function(struct si_shader_context *ctx)
 	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
 	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
 	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
-	last_array_pointer = SI_PARAM_IMAGES;
+	params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS);
+	last_array_pointer = SI_PARAM_SHADER_BUFFERS;
 
 	switch (ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
@@ -4679,6 +4912,21 @@ static void preload_constants(struct si_shader_context *ctx)
 	}
 }
 
+/* Load descriptors 0..file_max[TGSI_FILE_BUFFER] into ctx->shader_buffers. */
+static void preload_shader_buffers(struct si_shader_context *ctx)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMValueRef list = LLVMGetParam(ctx->radeon_bld.main_fn,
+					 SI_PARAM_SHADER_BUFFERS);
+	int last = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER],
+			SI_NUM_SHADER_BUFFERS - 1);
+	int slot;
+
+	for (slot = 0; slot <= last; ++slot)
+		ctx->shader_buffers[slot] = build_indexed_load_const(
+			ctx, list, lp_build_const_int32(gallivm, slot));
+}
+
 static void preload_samplers(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
@@ -4701,9 +4949,13 @@ static void preload_samplers(struct si_shader_context *ctx)
 		if (info->is_msaa_sampler[i])
 			ctx->fmasks[i] =
 				get_sampler_desc(ctx, offset, DESC_FMASK);
-		else
+		else {
 			ctx->sampler_states[i] =
 				get_sampler_desc(ctx, offset, DESC_SAMPLER);
+			ctx->sampler_states[i] =
+				sici_fix_sampler_aniso(ctx, ctx->sampler_views[i],
+						       ctx->sampler_states[i]);
+		}
 	}
 }
 
@@ -5540,6 +5792,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	create_meta_data(&ctx);
 	create_function(&ctx);
 	preload_constants(&ctx);
+	preload_shader_buffers(&ctx);
 	preload_samplers(&ctx);
 	preload_images(&ctx);
 	preload_streamout_buffers(&ctx);
@@ -6000,6 +6253,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen,
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
 	params[SI_PARAM_IMAGES] = ctx.i64;
+	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
 	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
 	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
 	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
@@ -6250,6 +6504,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen,
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
 	params[SI_PARAM_IMAGES] = ctx.i64;
+	params[SI_PARAM_SHADER_BUFFERS] = ctx.i64;
 	params[SI_PARAM_ALPHA_REF] = ctx.f32;
 	last_array_pointer = -1;
 	last_sgpr = SI_PARAM_ALPHA_REF;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 8059edf6395..013c8a2c114 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -81,95 +81,97 @@ struct radeon_shader_reloc;
 #define SI_SGPR_CONST_BUFFERS	2
 #define SI_SGPR_SAMPLERS	4  /* images & sampler states interleaved */
 #define SI_SGPR_IMAGES		6
-#define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
-#define SI_SGPR_BASE_VERTEX	10 /* VS only */
-#define SI_SGPR_START_INSTANCE	11 /* VS only */
-#define SI_SGPR_VS_STATE_BITS	12 /* VS(VS) only */
-#define SI_SGPR_LS_OUT_LAYOUT	12 /* VS(LS) only */
-#define SI_SGPR_TCS_OUT_OFFSETS	8  /* TCS & TES only */
-#define SI_SGPR_TCS_OUT_LAYOUT	9  /* TCS & TES only */
-#define SI_SGPR_TCS_IN_LAYOUT	10 /* TCS only */
-#define SI_SGPR_ALPHA_REF	8  /* PS only */
-
-#define SI_VS_NUM_USER_SGPR	13 /* API VS */
-#define SI_ES_NUM_USER_SGPR	12 /* API VS */
-#define SI_LS_NUM_USER_SGPR	13 /* API VS */
-#define SI_TCS_NUM_USER_SGPR	11
-#define SI_TES_NUM_USER_SGPR	10
-#define SI_GS_NUM_USER_SGPR	8
+#define SI_SGPR_SHADER_BUFFERS	8
+#define SI_SGPR_VERTEX_BUFFERS	10  /* VS only */
+#define SI_SGPR_BASE_VERTEX	12 /* VS only */
+#define SI_SGPR_START_INSTANCE	13 /* VS only */
+#define SI_SGPR_VS_STATE_BITS	14 /* VS(VS) only */
+#define SI_SGPR_LS_OUT_LAYOUT	14 /* VS(LS) only */
+#define SI_SGPR_TCS_OUT_OFFSETS	10 /* TCS & TES only */
+#define SI_SGPR_TCS_OUT_LAYOUT	11 /* TCS & TES only */
+#define SI_SGPR_TCS_IN_LAYOUT	12 /* TCS only */
+#define SI_SGPR_ALPHA_REF	10 /* PS only */
+
+#define SI_VS_NUM_USER_SGPR	15 /* API VS */
+#define SI_ES_NUM_USER_SGPR	14 /* API VS */
+#define SI_LS_NUM_USER_SGPR	15 /* API VS */
+#define SI_TCS_NUM_USER_SGPR	13
+#define SI_TES_NUM_USER_SGPR	12
+#define SI_GS_NUM_USER_SGPR	10
 #define SI_GSCOPY_NUM_USER_SGPR	4
-#define SI_PS_NUM_USER_SGPR	9
+#define SI_PS_NUM_USER_SGPR	11
 
 /* LLVM function parameter indices */
 #define SI_PARAM_RW_BUFFERS	0
 #define SI_PARAM_CONST_BUFFERS	1
 #define SI_PARAM_SAMPLERS	2
 #define SI_PARAM_IMAGES		3
+#define SI_PARAM_SHADER_BUFFERS	4
 
 /* VS only parameters */
-#define SI_PARAM_VERTEX_BUFFERS	4
-#define SI_PARAM_BASE_VERTEX	5
-#define SI_PARAM_START_INSTANCE	6
+#define SI_PARAM_VERTEX_BUFFERS	5
+#define SI_PARAM_BASE_VERTEX	6
+#define SI_PARAM_START_INSTANCE	7
 /* [0] = clamp vertex color */
-#define SI_PARAM_VS_STATE_BITS	7
+#define SI_PARAM_VS_STATE_BITS	8
 /* the other VS parameters are assigned dynamically */
 
 /* Offsets where TCS outputs and TCS patch outputs live in LDS:
  *   [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32
  *   [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32
  */
-#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */
+#define SI_PARAM_TCS_OUT_OFFSETS 5 /* for TCS & TES */
 
 /* Layout of TCS outputs / TES inputs:
  *   [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4
  *   [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4
  *   [26:31] = gl_PatchVerticesIn, max = 32
  */
-#define SI_PARAM_TCS_OUT_LAYOUT	5 /* for TCS & TES */
+#define SI_PARAM_TCS_OUT_LAYOUT	6 /* for TCS & TES */
 
 /* Layout of LS outputs / TCS inputs
  *   [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4
  *   [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4
  */
-#define SI_PARAM_TCS_IN_LAYOUT	6 /* TCS only */
-#define SI_PARAM_LS_OUT_LAYOUT	7 /* same value as TCS_IN_LAYOUT, LS only */
+#define SI_PARAM_TCS_IN_LAYOUT	7 /* TCS only */
+#define SI_PARAM_LS_OUT_LAYOUT	8 /* same value as TCS_IN_LAYOUT, LS only */
 
 /* TCS only parameters. */
-#define SI_PARAM_TESS_FACTOR_OFFSET 7
-#define SI_PARAM_PATCH_ID	8
-#define SI_PARAM_REL_IDS	9
+#define SI_PARAM_TESS_FACTOR_OFFSET 8
+#define SI_PARAM_PATCH_ID	9
+#define SI_PARAM_REL_IDS	10
 
 /* GS only parameters */
-#define SI_PARAM_GS2VS_OFFSET	4
-#define SI_PARAM_GS_WAVE_ID	5
-#define SI_PARAM_VTX0_OFFSET	6
-#define SI_PARAM_VTX1_OFFSET	7
-#define SI_PARAM_PRIMITIVE_ID	8
-#define SI_PARAM_VTX2_OFFSET	9
-#define SI_PARAM_VTX3_OFFSET	10
-#define SI_PARAM_VTX4_OFFSET	11
-#define SI_PARAM_VTX5_OFFSET	12
-#define SI_PARAM_GS_INSTANCE_ID	13
+#define SI_PARAM_GS2VS_OFFSET	5
+#define SI_PARAM_GS_WAVE_ID	6
+#define SI_PARAM_VTX0_OFFSET	7
+#define SI_PARAM_VTX1_OFFSET	8
+#define SI_PARAM_PRIMITIVE_ID	9
+#define SI_PARAM_VTX2_OFFSET	10
+#define SI_PARAM_VTX3_OFFSET	11
+#define SI_PARAM_VTX4_OFFSET	12
+#define SI_PARAM_VTX5_OFFSET	13
+#define SI_PARAM_GS_INSTANCE_ID	14
 
 /* PS only parameters */
-#define SI_PARAM_ALPHA_REF		4
-#define SI_PARAM_PRIM_MASK		5
-#define SI_PARAM_PERSP_SAMPLE		6
-#define SI_PARAM_PERSP_CENTER		7
-#define SI_PARAM_PERSP_CENTROID		8
-#define SI_PARAM_PERSP_PULL_MODEL	9
-#define SI_PARAM_LINEAR_SAMPLE		10
-#define SI_PARAM_LINEAR_CENTER		11
-#define SI_PARAM_LINEAR_CENTROID	12
-#define SI_PARAM_LINE_STIPPLE_TEX	13
-#define SI_PARAM_POS_X_FLOAT		14
-#define SI_PARAM_POS_Y_FLOAT		15
-#define SI_PARAM_POS_Z_FLOAT		16
-#define SI_PARAM_POS_W_FLOAT		17
-#define SI_PARAM_FRONT_FACE		18
-#define SI_PARAM_ANCILLARY		19
-#define SI_PARAM_SAMPLE_COVERAGE	20
-#define SI_PARAM_POS_FIXED_PT		21
+#define SI_PARAM_ALPHA_REF		5
+#define SI_PARAM_PRIM_MASK		6
+#define SI_PARAM_PERSP_SAMPLE		7
+#define SI_PARAM_PERSP_CENTER		8
+#define SI_PARAM_PERSP_CENTROID		9
+#define SI_PARAM_PERSP_PULL_MODEL	10
+#define SI_PARAM_LINEAR_SAMPLE		11
+#define SI_PARAM_LINEAR_CENTER		12
+#define SI_PARAM_LINEAR_CENTROID	13
+#define SI_PARAM_LINE_STIPPLE_TEX	14
+#define SI_PARAM_POS_X_FLOAT		15
+#define SI_PARAM_POS_Y_FLOAT		16
+#define SI_PARAM_POS_Z_FLOAT		17
+#define SI_PARAM_POS_W_FLOAT		18
+#define SI_PARAM_FRONT_FACE		19
+#define SI_PARAM_ANCILLARY		20
+#define SI_PARAM_SAMPLE_COVERAGE	21
+#define SI_PARAM_POS_FIXED_PT		22
 
 #define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9) /* +8 for COLOR[0..1] */
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 8087d2331ff..82ae4c43245 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -752,7 +752,7 @@ static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom)
 }
 
 /*
- * Clipping, scissors and viewport
+ * Clipping
  */
 
 static void si_set_clip_state(struct pipe_context *ctx,
@@ -819,179 +819,6 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 			       S_028AB4_REUSE_OFF(info->writes_viewport_index));
 }
 
-static void si_set_scissor_states(struct pipe_context *ctx,
-                                  unsigned start_slot,
-                                  unsigned num_scissors,
-                                  const struct pipe_scissor_state *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	int i;
-
-	for (i = 0; i < num_scissors; i++)
-		sctx->scissors.states[start_slot + i] = state[i];
-
-	if (!sctx->queued.named.rasterizer ||
-	    !sctx->queued.named.rasterizer->scissor_enable)
-		return;
-
-	sctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot;
-	si_mark_atom_dirty(sctx, &sctx->scissors.atom);
-}
-
-static void si_get_scissor_from_viewport(struct pipe_viewport_state *vp,
-					 struct pipe_scissor_state *scissor)
-{
-	/* These must be signed, unlike pipe_scissor_state. */
-	int minx, miny, maxx, maxy, tmp;
-
-	/* Convert (-1, -1) and (1, 1) from clip space into window space. */
-	minx = -vp->scale[0] + vp->translate[0];
-	miny = -vp->scale[1] + vp->translate[1];
-	maxx = vp->scale[0] + vp->translate[0];
-	maxy = vp->scale[1] + vp->translate[1];
-
-	/* r600_draw_rectangle sets this. Disable the scissor. */
-	if (minx == -1 && miny == -1 && maxx == 1 && maxy == 1) {
-		minx = miny = 0;
-		maxx = maxy = 16384;
-	}
-
-	/* Handle inverted viewports. */
-	if (minx > maxx) {
-		tmp = minx;
-		minx = maxx;
-		maxx = tmp;
-	}
-	if (miny > maxy) {
-		tmp = miny;
-		miny = maxy;
-		maxy = tmp;
-	}
-
-	scissor->minx = CLAMP(minx, 0, 16384);
-	scissor->miny = CLAMP(miny, 0, 16384);
-	scissor->maxx = CLAMP(maxx, 0, 16384);
-	scissor->maxy = CLAMP(maxy, 0, 16384);
-}
-
-static void si_clip_scissor(struct pipe_scissor_state *out,
-			    struct pipe_scissor_state *clip)
-{
-	out->minx = MAX2(out->minx, clip->minx);
-	out->miny = MAX2(out->miny, clip->miny);
-	out->maxx = MIN2(out->maxx, clip->maxx);
-	out->maxy = MIN2(out->maxy, clip->maxy);
-}
-
-static void si_emit_one_scissor(struct radeon_winsys_cs *cs,
-				struct pipe_viewport_state *vp,
-				struct pipe_scissor_state *scissor)
-{
-	struct pipe_scissor_state final;
-
-	/* Since the guard band disables clipping, we have to clip per-pixel
-	 * using a scissor.
-	 */
-	si_get_scissor_from_viewport(vp, &final);
-
-	if (scissor)
-		si_clip_scissor(&final, scissor);
-
-	radeon_emit(cs, S_028250_TL_X(final.minx) |
-			S_028250_TL_Y(final.miny) |
-			S_028250_WINDOW_OFFSET_DISABLE(1));
-	radeon_emit(cs, S_028254_BR_X(final.maxx) |
-			S_028254_BR_Y(final.maxy));
-}
-
-static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-	struct pipe_scissor_state *states = sctx->scissors.states;
-	unsigned mask = sctx->scissors.dirty_mask;
-	bool scissor_enable = sctx->queued.named.rasterizer->scissor_enable;
-
-	/* The simple case: Only 1 viewport is active. */
-	if (mask & 1 &&
-	    !si_get_vs_info(sctx)->writes_viewport_index) {
-		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2);
-		si_emit_one_scissor(cs, &sctx->viewports.states[0],
-				    scissor_enable ? &states[0] : NULL);
-		sctx->scissors.dirty_mask &= ~1; /* clear one bit */
-		return;
-	}
-
-	while (mask) {
-		int start, count, i;
-
-		u_bit_scan_consecutive_range(&mask, &start, &count);
-
-		radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL +
-					       start * 4 * 2, count * 2);
-		for (i = start; i < start+count; i++) {
-			si_emit_one_scissor(cs, &sctx->viewports.states[i],
-					    scissor_enable ? &states[i] : NULL);
-		}
-	}
-	sctx->scissors.dirty_mask = 0;
-}
-
-static void si_set_viewport_states(struct pipe_context *ctx,
-                                   unsigned start_slot,
-                                   unsigned num_viewports,
-                                   const struct pipe_viewport_state *state)
-{
-	struct si_context *sctx = (struct si_context *)ctx;
-	int i;
-
-	for (i = 0; i < num_viewports; i++)
-		sctx->viewports.states[start_slot + i] = state[i];
-
-	sctx->viewports.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
-	sctx->scissors.dirty_mask |= ((1 << num_viewports) - 1) << start_slot;
-	si_mark_atom_dirty(sctx, &sctx->viewports.atom);
-	si_mark_atom_dirty(sctx, &sctx->scissors.atom);
-}
-
-static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
-	struct pipe_viewport_state *states = sctx->viewports.states;
-	unsigned mask = sctx->viewports.dirty_mask;
-
-	/* The simple case: Only 1 viewport is active. */
-	if (mask & 1 &&
-	    !si_get_vs_info(sctx)->writes_viewport_index) {
-		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6);
-		radeon_emit(cs, fui(states[0].scale[0]));
-		radeon_emit(cs, fui(states[0].translate[0]));
-		radeon_emit(cs, fui(states[0].scale[1]));
-		radeon_emit(cs, fui(states[0].translate[1]));
-		radeon_emit(cs, fui(states[0].scale[2]));
-		radeon_emit(cs, fui(states[0].translate[2]));
-		sctx->viewports.dirty_mask &= ~1; /* clear one bit */
-		return;
-	}
-
-	while (mask) {
-		int start, count, i;
-
-		u_bit_scan_consecutive_range(&mask, &start, &count);
-
-		radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE +
-					       start * 4 * 6, count * 6);
-		for (i = start; i < start+count; i++) {
-			radeon_emit(cs, fui(states[i].scale[0]));
-			radeon_emit(cs, fui(states[i].translate[0]));
-			radeon_emit(cs, fui(states[i].scale[1]));
-			radeon_emit(cs, fui(states[i].translate[1]));
-			radeon_emit(cs, fui(states[i].scale[2]));
-			radeon_emit(cs, fui(states[i].translate[2]));
-		}
-	}
-	sctx->viewports.dirty_mask = 0;
-}
-
 /*
  * inferred state between framebuffer and rasterizer
  */
@@ -1173,10 +1000,7 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 	    (!old_rs || old_rs->multisample_enable != rs->multisample_enable))
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
-	if (!old_rs || old_rs->scissor_enable != rs->scissor_enable) {
-		sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
-		si_mark_atom_dirty(sctx, &sctx->scissors.atom);
-	}
+	r600_set_scissor_enable(&sctx->b, rs->scissor_enable);
 
 	si_pm4_bind_state(sctx, rasterizer, rs);
 	si_update_poly_offset_state(sctx);
@@ -1348,6 +1172,26 @@ static void *si_create_db_flush_dsa(struct si_context *sctx)
 
 /* DB RENDER STATE */
 
+static void si_set_active_query_state(struct pipe_context *ctx, boolean enable)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+
+	/* Pipeline stat & streamout queries: keep exactly one request flag set. */
+	if (!enable) {
+		sctx->b.flags &= ~R600_CONTEXT_START_PIPELINE_STATS;
+		sctx->b.flags |= R600_CONTEXT_STOP_PIPELINE_STATS;
+	} else {
+		sctx->b.flags &= ~R600_CONTEXT_STOP_PIPELINE_STATS;
+		sctx->b.flags |= R600_CONTEXT_START_PIPELINE_STATS;
+	}
+
+	/* Occlusion queries: dirty the DB render state only when the toggle flips. */
+	if (sctx->occlusion_queries_disabled == !enable)
+		return;
+	sctx->occlusion_queries_disabled = !enable;
+	si_mark_atom_dirty(sctx, &sctx->db_render_state);
+}
+
 static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
@@ -1382,7 +1226,8 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
 	}
 
 	/* DB_COUNT_CONTROL (occlusion queries) */
-	if (sctx->b.num_occlusion_queries > 0) {
+	if (sctx->b.num_occlusion_queries > 0 &&
+	    !sctx->occlusion_queries_disabled) {
 		bool perfect = sctx->b.num_perfect_occlusion_queries > 0;
 
 		if (sctx->b.chip_class >= CIK) {
@@ -1838,17 +1683,6 @@ static unsigned si_tex_wrap(unsigned wrap)
 	}
 }
 
-static unsigned si_tex_filter(unsigned filter)
-{
-	switch (filter) {
-	default:
-	case PIPE_TEX_FILTER_NEAREST:
-		return V_008F38_SQ_TEX_XY_FILTER_POINT;
-	case PIPE_TEX_FILTER_LINEAR:
-		return V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
-	}
-}
-
 static unsigned si_tex_mipfilter(unsigned filter)
 {
 	switch (filter) {
@@ -3122,6 +2956,16 @@ si_make_texture_descriptor(struct si_screen *screen,
 	} else {
 		state[6] = 0;
 		state[7] = 0;
+
+		/* The last dword is unused by hw. The shader uses it to clear
+		 * bits in the first dword of sampler state.
+		 */
+		if (screen->b.chip_class <= CIK && res->nr_samples <= 1) {
+			if (first_level == last_level)
+				state[7] = C_008F30_MAX_ANISO_RATIO;
+			else
+				state[7] = 0xffffffff;
+		}
 	}
 
 	/* Initialize the sampler view for FMASK. */
@@ -3318,9 +3162,12 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
 				     const struct pipe_sampler_state *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct r600_common_screen *rscreen = sctx->b.screen;
 	struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
-	unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 2 : 0;
 	unsigned border_color_type, border_color_index = 0;
+	unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso
+						       : state->max_anisotropy;
+	unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso);
 
 	if (!rstate) {
 		return NULL;
@@ -3378,16 +3225,21 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
 	rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) |
 			  S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
 			  S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) |
-			  r600_tex_aniso_filter(state->max_anisotropy) << 9 |
+			  S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
 			  S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
 			  S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
-			  S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map));
+			  S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
+			  S_008F30_COMPAT_MODE(sctx->b.chip_class >= VI));
 	rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
 			  S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)));
 	rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
-			  S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter) | aniso_flag_offset) |
-			  S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter) | aniso_flag_offset) |
-			  S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)));
+			  S_008F38_XY_MAG_FILTER(eg_tex_filter(state->mag_img_filter, max_aniso)) |
+			  S_008F38_XY_MIN_FILTER(eg_tex_filter(state->min_img_filter, max_aniso)) |
+			  S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
+			  S_008F38_MIP_POINT_PRECLAMP(1) |
+			  S_008F38_DISABLE_LSB_CEIL(1) |
+			  S_008F38_FILTER_PREC_FIX(1) |
+			  S_008F38_ANISO_OVERRIDE(sctx->b.chip_class >= VI));
 	rstate->val[3] = S_008F3C_BORDER_COLOR_PTR(border_color_index) |
 			 S_008F3C_BORDER_COLOR_TYPE(border_color_type);
 	return rstate;
@@ -3430,7 +3282,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 	struct si_vertex_element *v = CALLOC_STRUCT(si_vertex_element);
 	int i;
 
-	assert(count < SI_MAX_ATTRIBS);
+	assert(count <= SI_MAX_ATTRIBS);
 	if (!v)
 		return NULL;
 
@@ -3678,6 +3530,8 @@ void si_init_state_functions(struct si_context *sctx)
 	si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond);
 	si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin);
 	si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable);
+	si_init_external_atom(sctx, &sctx->b.scissors.atom, &sctx->atoms.s.scissors);
+	si_init_external_atom(sctx, &sctx->b.viewports.atom, &sctx->atoms.s.viewports);
 
 	si_init_atom(sctx, &sctx->cache_flush, &sctx->atoms.s.cache_flush, si_emit_cache_flush);
 	si_init_atom(sctx, &sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state);
@@ -3689,8 +3543,6 @@ void si_init_state_functions(struct si_context *sctx)
 	si_init_atom(sctx, &sctx->blend_color.atom, &sctx->atoms.s.blend_color, si_emit_blend_color);
 	si_init_atom(sctx, &sctx->clip_regs, &sctx->atoms.s.clip_regs, si_emit_clip_regs);
 	si_init_atom(sctx, &sctx->clip_state.atom, &sctx->atoms.s.clip_state, si_emit_clip_state);
-	si_init_atom(sctx, &sctx->scissors.atom, &sctx->atoms.s.scissors, si_emit_scissors);
-	si_init_atom(sctx, &sctx->viewports.atom, &sctx->atoms.s.viewports, si_emit_viewports);
 	si_init_atom(sctx, &sctx->stencil_ref.atom, &sctx->atoms.s.stencil_ref, si_emit_stencil_ref);
 
 	sctx->b.b.create_blend_state = si_create_blend_state;
@@ -3713,8 +3565,6 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
 
 	sctx->b.b.set_clip_state = si_set_clip_state;
-	sctx->b.b.set_scissor_states = si_set_scissor_states;
-	sctx->b.b.set_viewport_states = si_set_viewport_states;
 	sctx->b.b.set_stencil_ref = si_set_stencil_ref;
 
 	sctx->b.b.set_framebuffer_state = si_set_framebuffer_state;
@@ -3740,6 +3590,7 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.set_min_samples = si_set_min_samples;
 	sctx->b.b.set_tess_state = si_set_tess_state;
 
+	sctx->b.b.set_active_query_state = si_set_active_query_state;
 	sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
 	sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
 
@@ -4098,10 +3949,6 @@ static void si_init_config(struct si_context *sctx)
 	/* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */
 	si_pm4_set_reg(pm4, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0);
 	si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
-	si_pm4_set_reg(pm4, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, fui(1.0));
-	si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0));
-	si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0));
-	si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
 	si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
 	si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index f55f19e2918..6748f802c7d 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -161,6 +161,8 @@ struct si_shader_data {
 
 #define SI_NUM_IMAGES			16
 
+#define SI_NUM_SHADER_BUFFERS		16
+
 /* Read-write buffer slots.
  *
  * Ring buffers:        0..1
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 84b850a2992..40cad504e09 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -722,6 +722,16 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 		}
 	}
 
+	if (sctx->flags & R600_CONTEXT_START_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
+			        EVENT_INDEX(0));
+	} else if (sctx->flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
+		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
+		radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) |
+			        EVENT_INDEX(0));
+	}
+
 	sctx->flags = 0;
 }
 
@@ -882,8 +892,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	if ((sctx->b.family == CHIP_HAWAII ||
 	     sctx->b.family == CHIP_TONGA ||
 	     sctx->b.family == CHIP_FIJI) &&
-	    (sctx->b.streamout.streamout_enabled ||
-	     sctx->b.streamout.prims_gen_query_enabled)) {
+	    r600_get_strmout_en(&sctx->b)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 02489583423..b7ebb48e6a9 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1270,25 +1270,6 @@ error:
 	return NULL;
 }
 
-/**
- * Normally, we only emit 1 viewport and 1 scissor if no shader is using
- * the VIEWPORT_INDEX output, and emitting the other viewports and scissors
- * is delayed. When a shader with VIEWPORT_INDEX appears, this should be
- * called to emit the rest.
- */
-static void si_update_viewports_and_scissors(struct si_context *sctx)
-{
-	struct tgsi_shader_info *info = si_get_vs_info(sctx);
-
-	if (!info || !info->writes_viewport_index)
-		return;
-
-	if (sctx->scissors.dirty_mask)
-	    si_mark_atom_dirty(sctx, &sctx->scissors.atom);
-	if (sctx->viewports.dirty_mask)
-	    si_mark_atom_dirty(sctx, &sctx->viewports.atom);
-}
-
 static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
@@ -1300,7 +1281,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
 	sctx->vs_shader.cso = sel;
 	sctx->vs_shader.current = sel ? sel->first_variant : NULL;
 	si_mark_atom_dirty(sctx, &sctx->clip_regs);
-	si_update_viewports_and_scissors(sctx);
+	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
 }
 
 static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
@@ -1319,7 +1300,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 
 	if (enable_changed)
 		si_shader_change_notify(sctx);
-	si_update_viewports_and_scissors(sctx);
+	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
 }
 
 static void si_bind_tcs_shader(struct pipe_context *ctx, void *state)
@@ -1356,7 +1337,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
 		si_shader_change_notify(sctx);
 		sctx->last_tes_sh_base = -1; /* invalidate derived tess state */
 	}
-	si_update_viewports_and_scissors(sctx);
+	r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx));
 }
 
 static void si_bind_ps_shader(struct pipe_context *ctx, void *state)
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 892084707d2..f0aa605c2d9 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -2307,6 +2307,9 @@
 #define     V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER                 0x05
 #define     V_008F30_SQ_TEX_CLAMP_BORDER                            0x06
 #define     V_008F30_SQ_TEX_MIRROR_ONCE_BORDER                      0x07
+#define   S_008F30_MAX_ANISO_RATIO(x)                                 (((x) & 0x07) << 9)
+#define   G_008F30_MAX_ANISO_RATIO(x)                                 (((x) >> 9) & 0x07)
+#define   C_008F30_MAX_ANISO_RATIO                                    0xFFFFF1FF
 #define   S_008F30_DEPTH_COMPARE_FUNC(x)                              (((x) & 0x07) << 12)
 #define   G_008F30_DEPTH_COMPARE_FUNC(x)                              (((x) >> 12) & 0x07)
 #define   C_008F30_DEPTH_COMPARE_FUNC                                 0xFFFF8FFF
@@ -2371,6 +2374,8 @@
 #define   C_008F38_XY_MIN_FILTER                                      0xFF3FFFFF
 #define     V_008F38_SQ_TEX_XY_FILTER_POINT                         0x00
 #define     V_008F38_SQ_TEX_XY_FILTER_BILINEAR                      0x01
+#define     V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT                   0x02
+#define     V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR                0x03
 #define   S_008F38_Z_FILTER(x)                                        (((x) & 0x03) << 24)
 #define   G_008F38_Z_FILTER(x)                                        (((x) >> 24) & 0x03)
 #define   C_008F38_Z_FILTER                                           0xFCFFFFFF
@@ -2392,6 +2397,9 @@
 #define   S_008F38_FILTER_PREC_FIX(x)                                 (((x) & 0x1) << 30)
 #define   G_008F38_FILTER_PREC_FIX(x)                                 (((x) >> 30) & 0x1)
 #define   C_008F38_FILTER_PREC_FIX                                    0xBFFFFFFF
+#define   S_008F38_ANISO_OVERRIDE(x)                                  (((x) & 0x1) << 31)
+#define   G_008F38_ANISO_OVERRIDE(x)                                  (((x) >> 31) & 0x1)
+#define   C_008F38_ANISO_OVERRIDE                                     0x7FFFFFFF
 #define R_008F3C_SQ_IMG_SAMP_WORD3                                      0x008F3C
 #define   S_008F3C_BORDER_COLOR_PTR(x)                                (((x) & 0xFFF) << 0)
 #define   G_008F3C_BORDER_COLOR_PTR(x)                                (((x) >> 0) & 0xFFF)
diff --git a/src/gallium/drivers/rbug/rbug_context.c b/src/gallium/drivers/rbug/rbug_context.c
index 9ecddad05ec..1280c45b539 100644
--- a/src/gallium/drivers/rbug/rbug_context.c
+++ b/src/gallium/drivers/rbug/rbug_context.c
@@ -211,6 +211,17 @@ rbug_get_query_result(struct pipe_context *_pipe,
    return ret;
 }
 
+static void
+rbug_set_active_query_state(struct pipe_context *_pipe, boolean enable)
+{
+   struct rbug_context *rbug = rbug_context(_pipe);
+   struct pipe_context *target = rbug->pipe; /* context the call is forwarded to */
+
+   pipe_mutex_lock(rbug->call_mutex);
+   target->set_active_query_state(target, enable);
+   pipe_mutex_unlock(rbug->call_mutex);
+}
+
 static void *
 rbug_create_blend_state(struct pipe_context *_pipe,
                         const struct pipe_blend_state *blend)
@@ -1184,6 +1195,7 @@ rbug_context_create(struct pipe_screen *_screen, struct pipe_context *pipe)
    rb_pipe->base.begin_query = rbug_begin_query;
    rb_pipe->base.end_query = rbug_end_query;
    rb_pipe->base.get_query_result = rbug_get_query_result;
+   rb_pipe->base.set_active_query_state = rbug_set_active_query_state;
    rb_pipe->base.create_blend_state = rbug_create_blend_state;
    rb_pipe->base.bind_blend_state = rbug_bind_blend_state;
    rb_pipe->base.delete_blend_state = rbug_delete_blend_state;
diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources
index efe88468e3f..1d42351f975 100644
--- a/src/gallium/drivers/softpipe/Makefile.sources
+++ b/src/gallium/drivers/softpipe/Makefile.sources
@@ -1,4 +1,5 @@
 C_SOURCES := \
+	sp_buffer.c \
 	sp_clear.c \
 	sp_clear.h \
 	sp_context.c \
@@ -11,6 +12,7 @@ C_SOURCES := \
 	sp_fs_exec.c \
 	sp_fs.h \
 	sp_image.c \
+	sp_image.h \
 	sp_limits.h \
 	sp_prim_vbuf.c \
 	sp_prim_vbuf.h \
diff --git a/src/gallium/drivers/softpipe/sp_buffer.c b/src/gallium/drivers/softpipe/sp_buffer.c
new file mode 100644
index 00000000000..69a6bd18c3b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_buffer.c
@@ -0,0 +1,371 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_buffer.h"
+#include "sp_texture.h"
+
+#include "util/u_format.h"
+
+/* Store the view's size in *width; return false if the view does not fit. */
+static bool
+get_dimensions(const struct pipe_shader_buffer *bview,
+               const struct softpipe_resource *spr,
+               unsigned *width)
+{
+   *width = bview->buffer_size;
+
+   /* the [buffer_offset, buffer_offset + buffer_size) range must lie inside
+    * the underlying buffer; written so the unsigned math cannot overflow */
+   if (bview->buffer_offset > spr->base.width0)
+      return false;
+   return *width <= spr->base.width0 - bview->buffer_offset;
+}
+
+/*
+ * Implement the buffer LOAD operation: four dwords per quad lane, 0 on failure.
+ */
+static void
+sp_tgsi_load(const struct tgsi_buffer *buffer,
+             const struct tgsi_buffer_params *params,
+             const int s[TGSI_QUAD_SIZE],
+             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer;
+   struct pipe_shader_buffer *bview;
+   struct softpipe_resource *spr;
+   unsigned width;
+   int c, j;
+   unsigned char *data_ptr;
+   const struct util_format_description *format_desc = util_format_description(PIPE_FORMAT_R32_UINT);
+
+   if (params->unit >= PIPE_MAX_SHADER_BUFFERS)
+      goto fail_write_all_zero;
+
+   bview = &sp_buf->sp_bview[params->unit];
+   spr = softpipe_resource(bview->buffer);
+   if (!spr)
+      goto fail_write_all_zero;
+   /* a rejected view must still yield defined (zero) results */
+   if (!get_dimensions(bview, spr, &width))
+      goto fail_write_all_zero;
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord;
+      bool fill_zero = false;
+      uint32_t sdata[4];
+
+      if (!(params->execmask & (1 << j)))
+         fill_zero = true;
+      /* s_coord is compared as unsigned, so negative offsets are rejected too */
+      s_coord = s[j];
+      if (s_coord >= width)
+         fill_zero = true;
+
+      if (fill_zero) {
+         for (c = 0; c < 4; c++)
+            rgba[c][j] = 0;
+         continue;
+      }
+      data_ptr = (unsigned char *)spr->data + bview->buffer_offset + s_coord;
+      for (c = 0; c < 4; c++) {
+         format_desc->fetch_rgba_uint(sdata, data_ptr, 0, 0);
+         ((uint32_t *)rgba[c])[j] = sdata[0];
+         data_ptr += 4;
+      }
+   }
+   return;
+fail_write_all_zero:
+   memset(rgba, 0, TGSI_NUM_CHANNELS * TGSI_QUAD_SIZE * sizeof(float));
+   return;
+}
+
+/*
+ * Buffer STORE: write the writemask-selected dwords of every active lane.
+ */
+static void
+sp_tgsi_store(const struct tgsi_buffer *buffer,
+              const struct tgsi_buffer_params *params,
+              const int s[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   const struct util_format_description *r32_desc = util_format_description(PIPE_FORMAT_R32_UINT);
+   struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer;
+   struct pipe_shader_buffer *view;
+   struct softpipe_resource *res;
+   unsigned char *base;
+   unsigned size;
+   int lane, chan;
+
+   /* the store is silently dropped when the unit or its binding is unusable */
+   if (params->unit >= PIPE_MAX_SHADER_BUFFERS)
+      return;
+
+   view = &sp_buf->sp_bview[params->unit];
+   res = softpipe_resource(view->buffer);
+   if (!res)
+      return;
+
+   if (!get_dimensions(view, res, &size))
+      return;
+
+   for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+      int offset = s[lane];
+
+      /* skip inactive lanes and offsets outside the view */
+      if (!(params->execmask & (1 << lane)) || offset >= size)
+         continue;
+
+      base = (unsigned char *)res->data + view->buffer_offset + offset;
+      for (chan = 0; chan < 4; chan++) {
+         unsigned texel[4];
+         unsigned char *dst = base + (chan * 4);
+
+         if (!(params->writemask & (1 << chan)))
+            continue;
+
+         /* only texel[0] is initialized before packing */
+         texel[0] = ((uint32_t *)rgba[chan])[lane];
+         r32_desc->pack_rgba_uint(dst, 0, texel, 0, 1, 1);
+      }
+   }
+}
+
+/* Atomic op on the 4 dwords at data_ptr for quad lane qi: rgba[c][qi] supplies
+ * the operand (the compare value for ATOMCAS, whose store value comes from
+ * rgba2) and receives the pre-op value. NOTE(review): bview is unused here. */
+static void
+handle_op_uint(const struct pipe_shader_buffer *bview,
+               bool just_read,
+               unsigned char *data_ptr,
+               uint qi,
+               unsigned opcode,
+               unsigned writemask,
+               float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+               float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   const struct util_format_description *format_desc = util_format_description(PIPE_FORMAT_R32_UINT);
+   unsigned sdata[4];
+   /* read the current contents of all four dwords into sdata[] */
+   for (c = 0; c < 4; c++) {
+      unsigned temp[4];
+      unsigned char *dptr = data_ptr + (c * 4);
+      format_desc->fetch_rgba_uint(temp, dptr, 0, 0);
+      sdata[c] = temp[0];
+   }
+   /* read-only lane: hand back the current values and leave memory untouched */
+   if (just_read) {
+      for (c = 0; c < 4; c++) {
+         ((uint32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   /* compute the new value in sdata[] and return the old one through rgba[] */
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < 4; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] += ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < 4; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] = ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < 4; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned cmp_x = ((uint32_t *)rgba[c])[qi];
+         unsigned src_x = ((uint32_t *)rgba2[c])[qi];
+         unsigned temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < 4; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] &= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < 4; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] |= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < 4; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] ^= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      for (c = 0; c < 4; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      for (c = 0; c < 4; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < 4; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < 4; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;
+   }
+   /* write back only the channels selected by writemask */
+   for (c = 0; c < 4; c++) {
+      if (writemask & (1 << c)) {
+         unsigned temp[4];
+         unsigned char *dptr = data_ptr + (c * 4);
+         temp[0] = sdata[c];
+         format_desc->pack_rgba_uint(dptr, 0, temp, 0, 1, 1);
+      }
+   }
+}
+
+/*
+ * Implement atomic buffer operations; rgba is zeroed if the unit or view is invalid.
+ */
+static void
+sp_tgsi_op(const struct tgsi_buffer *buffer,
+           const struct tgsi_buffer_params *params,
+           unsigned opcode,
+           const int s[TGSI_QUAD_SIZE],
+           float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+           float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer;
+   struct pipe_shader_buffer *bview;
+   struct softpipe_resource *spr;
+   unsigned width;
+   int j, c;
+   unsigned char *data_ptr;
+
+   if (params->unit >= PIPE_MAX_SHADER_BUFFERS)
+      goto fail_write_all_zero;
+
+   bview = &sp_buf->sp_bview[params->unit];
+   spr = softpipe_resource(bview->buffer);
+   if (!spr)
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(bview, spr, &width))
+      goto fail_write_all_zero;
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord;
+      bool just_read = false;
+
+      s_coord = s[j];
+      if (s_coord >= width) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+         }
+         continue;
+      }
+
+      /* just readback the value for atomic if execmask isn't set */
+      if (!(params->execmask & (1 << j))) {
+         just_read = true;
+      }
+
+      data_ptr = (unsigned char *)spr->data + bview->buffer_offset + s_coord;
+      /* we should see atomic operations on r32 formats */
+
+      handle_op_uint(bview, just_read, data_ptr, j,
+                     opcode, params->writemask, rgba, rgba2);
+   }
+   return;
+fail_write_all_zero:
+   memset(rgba, 0, TGSI_NUM_CHANNELS * TGSI_QUAD_SIZE * sizeof(float));
+   return;
+}
+
+/*
+ * Return the size of the attached buffer for the RESQ opcode (0 if none is bound).
+ */
+static void
+sp_tgsi_get_dims(const struct tgsi_buffer *buffer,
+                 const struct tgsi_buffer_params *params,
+                 int *dim)
+{
+   struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer;
+   struct pipe_shader_buffer *bview;
+   struct softpipe_resource *spr;
+
+   /* default to 0 so *dim is always written, even on the early returns */
+   *dim = 0;
+   if (params->unit >= PIPE_MAX_SHADER_BUFFERS)
+      return;
+   bview = &sp_buf->sp_bview[params->unit];
+   spr = softpipe_resource(bview->buffer);
+   if (!spr)
+      return;
+   *dim = bview->buffer_size;
+}
+
+struct sp_tgsi_buffer *
+sp_create_tgsi_buffer(void)
+{
+   struct sp_tgsi_buffer *buf = CALLOC_STRUCT(sp_tgsi_buffer);
+   if (!buf)
+      return NULL;
+   /* hook up this file's implementations of the tgsi_buffer callbacks */
+   buf->base.load = sp_tgsi_load;
+   buf->base.store = sp_tgsi_store;
+   buf->base.op = sp_tgsi_op;
+   buf->base.get_dims = sp_tgsi_get_dims;
+   return buf;
+}
diff --git a/src/gallium/drivers/softpipe/sp_buffer.h b/src/gallium/drivers/softpipe/sp_buffer.h
new file mode 100644
index 00000000000..1822fe709fe
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_buffer.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SP_BUFFER_H
+#define SP_BUFFER_H
+#include "tgsi/tgsi_exec.h"
+
+struct sp_tgsi_buffer
+{
+   struct tgsi_buffer base; /* must stay first: sp_buffer.c casts tgsi_buffer pointers to this type */
+   struct pipe_shader_buffer sp_bview[PIPE_MAX_SHADER_BUFFERS]; /* bound views, indexed by params->unit */
+};
+
+struct sp_tgsi_buffer *
+sp_create_tgsi_buffer(void);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 30b0276cfe0..0342fc6f8cd 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -38,6 +38,7 @@
 #include "util/u_pstipple.h"
 #include "util/u_inlines.h"
 #include "tgsi/tgsi_exec.h"
+#include "sp_buffer.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
@@ -116,6 +117,8 @@ softpipe_destroy( struct pipe_context *pipe )
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       FREE(softpipe->tgsi.sampler[i]);
+      FREE(softpipe->tgsi.image[i]);
+      FREE(softpipe->tgsi.buffer[i]);
    }
 
    FREE( softpipe );
@@ -203,6 +206,10 @@ softpipe_create_context(struct pipe_screen *screen,
       softpipe->tgsi.image[i] = sp_create_tgsi_image();
    }
 
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      softpipe->tgsi.buffer[i] = sp_create_tgsi_buffer();
+   }
+
    softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
    softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
 
@@ -288,6 +295,16 @@ softpipe_create_context(struct pipe_screen *screen,
               (struct tgsi_image *)
               softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]);
 
+   draw_buffer(softpipe->draw,
+              PIPE_SHADER_VERTEX,
+              (struct tgsi_buffer *)
+              softpipe->tgsi.buffer[PIPE_SHADER_VERTEX]);
+
+   draw_buffer(softpipe->draw,
+              PIPE_SHADER_GEOMETRY,
+              (struct tgsi_buffer *)
+              softpipe->tgsi.buffer[PIPE_SHADER_GEOMETRY]);
+
    if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 20a12353b38..70d00c88b6e 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -84,6 +84,7 @@ struct softpipe_context {
    struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
    struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
+   struct pipe_shader_buffer buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS];
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct pipe_index_buffer index_buffer;
@@ -174,6 +175,7 @@ struct softpipe_context {
    struct {
       struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES];
       struct sp_tgsi_image *image[PIPE_SHADER_TYPES];
+      struct sp_tgsi_buffer *buffer[PIPE_SHADER_TYPES];
    } tgsi;
 
    struct tgsi_exec_machine *fs_machine;
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index bfd9a4b7496..155382af825 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -63,14 +63,15 @@ static void
 exec_prepare( const struct sp_fragment_shader_variant *var,
               struct tgsi_exec_machine *machine,
               struct tgsi_sampler *sampler,
-              struct tgsi_image *image )
+              struct tgsi_image *image,
+              struct tgsi_buffer *buffer )
 {
    /*
     * Bind tokens/shader to the interpreter's machine state.
     */
    tgsi_exec_machine_bind_shader(machine,
                                  var->tokens,
-                                 sampler, image);
+                                 sampler, image, buffer);
 }
 
 
@@ -186,7 +187,7 @@ exec_delete(struct sp_fragment_shader_variant *var,
             struct tgsi_exec_machine *machine)
 {
    if (machine->Tokens == var->tokens) {
-      tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL);
+      tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL, NULL);
    }
 
    FREE( (void *) var->tokens );
diff --git a/src/gallium/drivers/softpipe/sp_image.c b/src/gallium/drivers/softpipe/sp_image.c
index 3488fa83185..a7c73280a80 100644
--- a/src/gallium/drivers/softpipe/sp_image.c
+++ b/src/gallium/drivers/softpipe/sp_image.c
@@ -217,7 +217,7 @@ sp_tgsi_load(const struct tgsi_image *image,
    char *data_ptr;
    unsigned offset = 0;
 
-   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
       goto fail_write_all_zero;
    iview = &sp_img->sp_iview[params->unit];
    spr = (struct softpipe_resource *)iview->resource;
@@ -320,7 +320,7 @@ sp_tgsi_store(const struct tgsi_image *image,
    unsigned offset = 0;
    unsigned pformat = params->format;
 
-   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
       return;
    iview = &sp_img->sp_iview[params->unit];
    spr = (struct softpipe_resource *)iview->resource;
@@ -630,7 +630,7 @@ sp_tgsi_op(const struct tgsi_image *image,
    unsigned offset;
    char *data_ptr;
 
-   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
       return;
    iview = &sp_img->sp_iview[params->unit];
    spr = (struct softpipe_resource *)iview->resource;
@@ -704,7 +704,7 @@ sp_tgsi_get_dims(const struct tgsi_image *image,
    struct softpipe_resource *spr;
    int level;
 
-   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
       return;
    iview = &sp_img->sp_iview[params->unit];
    spr = (struct softpipe_resource *)iview->resource;
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index c28d28d5f5d..81e97107d59 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -283,6 +283,12 @@ softpipe_check_render_cond(struct softpipe_context *sp)
 }
 
 
+static void
+softpipe_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{ /* No-op: this implementation ignores both arguments. */
+}
+
+
 void softpipe_init_query_funcs(struct softpipe_context *softpipe )
 {
    softpipe->pipe.create_query = softpipe_create_query;
@@ -290,6 +296,7 @@ void softpipe_init_query_funcs(struct softpipe_context *softpipe )
    softpipe->pipe.begin_query = softpipe_begin_query;
    softpipe->pipe.end_query = softpipe_end_query;
    softpipe->pipe.get_query_result = softpipe_get_query_result;
+   softpipe->pipe.set_active_query_state = softpipe_set_active_query_state;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 90f29d6e52a..d89d95c884c 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -239,6 +239,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
    case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
       return 1;
+   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+      return 1;
    case PIPE_CAP_VERTEXID_NOBASE:
       return 0;
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
@@ -259,7 +261,6 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
    case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
    case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
-   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
    case PIPE_CAP_INVALIDATE_BUFFER:
    case PIPE_CAP_GENERATE_MIPMAP:
    case PIPE_CAP_STRING_MARKER:
@@ -270,8 +271,10 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
-   case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
+   case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+      return 4;
    }
    /* should only get here on unhandled cases */
    debug_printf("Unexpected PIPE_CAP %d query\n", param);
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 570bc549cc4..2fc48ab13d8 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -57,6 +57,7 @@
 
 struct tgsi_sampler;
 struct tgsi_image;
+struct tgsi_buffer;
 struct tgsi_exec_machine;
 struct vertex_info;
 
@@ -83,7 +84,8 @@ struct sp_fragment_shader_variant
    void (*prepare)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
 		   struct tgsi_sampler *sampler,
-		   struct tgsi_image *image);
+		   struct tgsi_image *image,
+		   struct tgsi_buffer *buffer);
 
    unsigned (*run)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 65679e73515..4ce9d95bc6e 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -344,7 +344,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim)
                                     softpipe->fs_machine,
                                     (struct tgsi_sampler *) softpipe->
                                     tgsi.sampler[PIPE_SHADER_FRAGMENT],
-                                    (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]);
+                                    (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT],
+                                    (struct tgsi_buffer *)softpipe->tgsi.buffer[PIPE_SHADER_FRAGMENT]);
    }
    else {
       softpipe->fs_variant = NULL;
diff --git a/src/gallium/drivers/softpipe/sp_state_image.c b/src/gallium/drivers/softpipe/sp_state_image.c
index 8909fa26864..5947c934e86 100644
--- a/src/gallium/drivers/softpipe/sp_state_image.c
+++ b/src/gallium/drivers/softpipe/sp_state_image.c
@@ -24,6 +24,7 @@
 #include "sp_context.h"
 #include "sp_state.h"
 #include "sp_image.h"
+#include "sp_buffer.h"
 
 static void softpipe_set_shader_images(struct pipe_context *pipe,
                                        unsigned shader,
@@ -51,7 +52,34 @@ static void softpipe_set_shader_images(struct pipe_context *pipe,
    }
 }
 
+static void softpipe_set_shader_buffers(struct pipe_context *pipe,
+                                        unsigned shader,
+                                        unsigned start,
+                                        unsigned num,
+                                        struct pipe_shader_buffer *buffers)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned n;
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(start + num <= Elements(softpipe->buffers[shader]));
+
+   /* bind buffers[0..num) to slots [start, start + num); NULL unbinds them */
+   for (n = 0; n < num; n++) {
+      struct pipe_shader_buffer *slot =
+         &softpipe->tgsi.buffer[shader]->sp_bview[start + n];
+
+      if (!buffers) {
+         pipe_resource_reference(&slot->buffer, NULL);
+         memset(slot, 0, sizeof(struct pipe_shader_buffer));
+         continue;
+      }
+      pipe_resource_reference(&slot->buffer, buffers[n].buffer);
+      *slot = buffers[n];
+   }
+}
+
 void softpipe_init_image_funcs(struct pipe_context *pipe)
 {
    pipe->set_shader_images = softpipe_set_shader_images;
+   pipe->set_shader_buffers = softpipe_set_shader_buffers;
 }
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 9cc8ac12525..c6233261938 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -99,9 +99,9 @@ sp_create_tile_cache( struct pipe_context *pipe )
    maxTexSize = 1 << (maxLevels - 1);
    assert(MAX_WIDTH >= maxTexSize);
 
-   assert(sizeof(union tile_address) == 4);
+   STATIC_ASSERT(sizeof(union tile_address) == 4);
 
-   assert((TILE_SIZE << TILE_ADDR_BITS) >= MAX_WIDTH);
+   STATIC_ASSERT((TILE_SIZE << TILE_ADDR_BITS) >= MAX_WIDTH);
 
    tc = CALLOC_STRUCT( softpipe_tile_cache );
    if (tc) {
diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
index 83fcdc3d80b..c5d83c33f29 100644
--- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c
+++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c
@@ -86,9 +86,9 @@ define_depth_stencil_state_object(struct svga_context *svga,
    ds->id = util_bitmask_add(svga->ds_object_id_bm);
 
    /* spot check that these comparision tokens are the same */
-   assert(SVGA3D_COMPARISON_NEVER == SVGA3D_CMP_NEVER);
-   assert(SVGA3D_COMPARISON_LESS == SVGA3D_CMP_LESS);
-   assert(SVGA3D_COMPARISON_NOT_EQUAL == SVGA3D_CMP_NOTEQUAL);
+   STATIC_ASSERT(SVGA3D_COMPARISON_NEVER == SVGA3D_CMP_NEVER);
+   STATIC_ASSERT(SVGA3D_COMPARISON_LESS == SVGA3D_CMP_LESS);
+   STATIC_ASSERT(SVGA3D_COMPARISON_NOT_EQUAL == SVGA3D_CMP_NOTEQUAL);
 
    /* Loop in case command buffer is full and we need to flush and retry */
    for (try = 0; try < 2; try++) {
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 88f41eadc1d..75bc9ce092b 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -1246,6 +1246,12 @@ svga_get_timestamp(struct pipe_context *pipe)
 }
 
 
+static void
+svga_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{ /* No-op: this implementation ignores both arguments. */
+}
+
+
 void
 svga_init_query_functions(struct svga_context *svga)
 {
@@ -1254,6 +1260,7 @@ svga_init_query_functions(struct svga_context *svga)
    svga->pipe.begin_query = svga_begin_query;
    svga->pipe.end_query = svga_end_query;
    svga->pipe.get_query_result = svga_get_query_result;
+   svga->pipe.set_active_query_state = svga_set_active_query_state;
    svga->pipe.render_condition = svga_render_condition;
    svga->pipe.get_timestamp = svga_get_timestamp;
 }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 536fb6f786f..010d94b1b58 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -364,6 +364,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_BUS:
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
    case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT:
       return 64;
diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c
index d43894d71b1..317b44eef00 100644
--- a/src/gallium/drivers/svga/svga_state_rss.c
+++ b/src/gallium/drivers/svga/svga_state_rss.c
@@ -47,7 +47,7 @@ struct rs_queue {
 
 #define EMIT_RS(svga, value, token, fail)                       \
 do {                                                            \
-   assert(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \
+   STATIC_ASSERT(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \
    if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) {    \
       svga_queue_rs( &queue, SVGA3D_RS_##token, value );        \
       svga->state.hw_draw.rs[SVGA3D_RS_##token] = value;        \
@@ -57,7 +57,7 @@ do {                                                            \
 #define EMIT_RS_FLOAT(svga, fvalue, token, fail)                \
 do {                                                            \
    unsigned value = fui(fvalue);                                \
-   assert(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \
+   STATIC_ASSERT(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \
    if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) {    \
       svga_queue_rs( &queue, SVGA3D_RS_##token, value );        \
       svga->state.hw_draw.rs[SVGA3D_RS_##token] = value;        \
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index 4debbf1669b..fd6d1ce84b4 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -327,7 +327,7 @@ svga_queue_tss( struct ts_queue *q,
 #define EMIT_TS(svga, unit, val, token)                                 \
 do {                                                                    \
    assert(unit < Elements(svga->state.hw_draw.ts));                     \
-   assert(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit]));  \
+   STATIC_ASSERT(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit])); \
    if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) {        \
       svga_queue_tss( queue, unit, SVGA3D_TS_##token, val );            \
       svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val;            \
@@ -338,7 +338,7 @@ do {                                                                    \
 do {                                                                    \
    unsigned val = fui(fvalue);                                          \
    assert(unit < Elements(svga->state.hw_draw.ts));                     \
-   assert(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit]));  \
+   STATIC_ASSERT(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit])); \
    if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) {        \
       svga_queue_tss( queue, unit, SVGA3D_TS_##token, val );            \
       svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val;            \
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 5794f3f625a..180a0560822 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -30,10 +30,6 @@
 
 #define SWR_API __cdecl
 
-#ifndef _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_WARNINGS
-#endif
-
 #ifndef NOMINMAX
 #define NOMINMAX
 #endif
@@ -52,7 +48,6 @@
 
 #define PRAGMA_WARNING_POP() __pragma(warning(pop))
 
-#if defined(_WIN32)
 #if defined(_WIN64)
 #define BitScanReverseSizeT BitScanReverse64
 #define BitScanForwardSizeT BitScanForward64
@@ -62,7 +57,6 @@
 #define BitScanForwardSizeT BitScanForward
 #define _mm_popcount_sizeT _mm_popcnt_u32
 #endif
-#endif
 
 #elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
 
@@ -199,9 +193,7 @@ typedef KILOBYTE    MEGABYTE[1024];
 typedef MEGABYTE    GIGABYTE[1024];
 
 #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
-#if KNOB_SIMD_WIDTH == 8
-#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32)
-#endif
+#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
 
 #include "common/swr_assert.h"
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index f0f7956b590..ca9cfdb629e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -29,10 +29,12 @@
 #include <cfloat>
 #include <cmath>
 #include <cstdio>
+#include <new>
 
 #include "core/api.h"
 #include "core/backend.h"
 #include "core/context.h"
+#include "core/depthstencil.h"
 #include "core/frontend.h"
 #include "core/rasterizer.h"
 #include "core/rdtsc_core.h"
@@ -64,11 +66,14 @@ HANDLE SwrCreateContext(
     pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
     pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
+    pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-        pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
-        pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
 
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
@@ -86,15 +91,26 @@ HANDLE SwrCreateContext(
     // Calling createThreadPool() above can set SINGLE_THREADED
     if (KNOB_SINGLE_THREADED)
     {
+        SET_KNOB(HYPERTHREADED_FE, false);
         pContext->NumWorkerThreads = 1;
+        pContext->NumFEThreads = 1;
+        pContext->NumBEThreads = 1;
     }
 
     // Allocate scratch space for workers.
     ///@note We could lazily allocate this but its rather small amount of memory.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
-        ///@todo Use numa API for allocations using numa information from thread data (if exists).
-        pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
+#if defined(_WIN32)
+        uint32_t numaNode = pContext->threadPool.pThreadData ?
+            pContext->threadPool.pThreadData[i].numaId : 0;
+        pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+            GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
+            MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
+            numaNode);
+#else
+        pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+#endif
     }
 
     // State setup AFTER context is fully initialized
@@ -131,14 +147,21 @@ void SwrDestroyContext(HANDLE hContext)
     {
         delete pContext->dcRing[i].pArena;
         delete pContext->dsRing[i].pArena;
-        delete(pContext->dcRing[i].pTileMgr);
-        delete(pContext->dcRing[i].pDispatch);
+        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+        pContext->pDispatchQueueArray[i].~DispatchQueue();
     }
 
+    _aligned_free(pContext->pDispatchQueueArray);
+    _aligned_free(pContext->pMacroTileManagerArray);
+
     // Free scratch space.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
+#if defined(_WIN32)
+        VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
+#else
         _aligned_free(pContext->pScratch[i]);
+#endif
     }
 
     delete(pContext->pHotTileMgr);
@@ -160,12 +183,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
 template<bool IsDraw>
 void QueueWork(SWR_CONTEXT *pContext)
 {
+    DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+    if (IsDraw)
+    {
+        pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+        pDC->pTileMgr->initialize();
+    }
+
     // Each worker thread looks at a DC for both FE and BE work at different times and so we
     // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
     // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
     // then moved on if all work is done.)
-    pContext->pCurDrawContext->threadsDone =
-        pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
+    pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
 
     _ReadWriteBarrier();
     {
@@ -183,7 +214,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         {
             static TileSet lockedTiles;
             uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
-            WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+            WorkOnFifoFE(pContext, 0, curDraw[0]);
             WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
         }
         else
@@ -232,7 +263,20 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
             _mm_pause();
         }
 
-        uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+        uint64_t curDraw = pContext->dcRing.GetHead();
+        uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+        static uint64_t lastDrawChecked;
+        static uint32_t lastFrameChecked;
+        if ((pContext->frameCount - lastFrameChecked) > 2 ||
+            (curDraw - lastDrawChecked) > 0x10000)
+        {
+            // Take this opportunity to clean-up old arena allocations
+            pContext->cachingArenaAllocator.FreeOldBlocks();
+
+            lastFrameChecked = pContext->frameCount;
+            lastDrawChecked = curDraw;
+        }
 
         DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
         pContext->pCurDrawContext = pCurDrawContext;
@@ -284,8 +328,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         pCurDrawContext->FeLock = 0;
         pCurDrawContext->threadsDone = 0;
 
-        pCurDrawContext->pTileMgr->initialize();
-
         // Assign unique drawId for this DC
         pCurDrawContext->drawId = pContext->dcRing.GetHead();
 
@@ -872,6 +914,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
                  !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
         }
     }
+
+    // Setup depth quantization function
+    if (pState->state.depthHottileEnable)
+    {
+        switch (pState->state.rastState.depthFormat)
+        {
+        case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
+        case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
+        case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
+        case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
+        default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
+            pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
+        }
+    }
+    else
+    {
+        // set up pass-through quantize if depth isn't enabled
+        pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1029,9 +1090,9 @@ void DrawInstanced(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
+    uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
-    int32_t remainingVerts = numVertices;
+    uint32_t remainingVerts = numVertices;
 
     API_STATE    *pState = &pDC->pState->state;
     pState->topology = topology;
@@ -1149,9 +1210,9 @@ void DrawIndexedInstance(
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     API_STATE* pState = &pDC->pState->state;
 
-    int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
+    uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
-    int32_t remainingIndices = numIndices;
+    uint32_t remainingIndices = numIndices;
 
     uint32_t indexSize = 0;
     switch (pState->indexBuffer.format)
@@ -1334,9 +1395,6 @@ void SwrDispatch(
 
     pDC->isCompute = true;      // This is a compute context.
 
-    // Ensure spill fill pointers are initialized to nullptr.
-    memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
-
     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
 
     pTaskData->threadGroupCountX = threadGroupCountX;
@@ -1344,6 +1402,8 @@ void SwrDispatch(
     pTaskData->threadGroupCountZ = threadGroupCountZ;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
 
     QueueDispatch(pContext);
@@ -1497,4 +1557,6 @@ void SWR_API SwrEndFrame(
     HANDLE hContext)
 {
     RDTSC_ENDFRAME();
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    pContext->frameCount++;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 67d81a44347..64184e16865 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -65,69 +65,41 @@ static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
 template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
 struct CachingAllocatorT : DefaultAllocator
 {
-    static uint32_t GetBucketId(size_t blockSize)
-    {
-        uint32_t bucketId = 0;
-
-#if defined(BitScanReverseSizeT)
-        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
-        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
-#endif
-
-        return bucketId;
-    }
-
     void* AllocateAligned(size_t size, size_t align)
     {
         SWR_ASSERT(size >= sizeof(ArenaBlock));
         SWR_ASSERT(size <= uint32_t(-1));
 
         size_t blockSize = size - ARENA_BLOCK_ALIGN;
+        uint32_t bucket = GetBucketId(blockSize);
 
         {
             // search cached blocks
             std::lock_guard<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
-            ArenaBlock* pBlock = pPrevBlock->pNext;
-            ArenaBlock* pPotentialBlock = nullptr;
-            ArenaBlock* pPotentialPrev = nullptr;
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
+            ArenaBlock* pBlock = SearchBlocks(pPrevBlock, blockSize, align);
 
-            while (pBlock)
+            if (pBlock)
             {
-                if (pBlock->blockSize >= blockSize)
-                {
-                    if (pBlock == AlignUp(pBlock, align))
-                    {
-                        if (pBlock->blockSize == blockSize)
-                        {
-                            // Won't find a better match
-                            break;
-                        }
-
-                        // We could use this as it is larger than we wanted, but
-                        // continue to search for a better match
-                        pPotentialBlock = pBlock;
-                        pPotentialPrev = pPrevBlock;
-                    }
-                }
-                else
+                m_cachedSize -= pBlock->blockSize;
+                if (pBlock == m_pLastCachedBlocks[bucket])
                 {
-                    // Blocks are sorted by size (biggest first)
-                    // So, if we get here, there are no blocks 
-                    // large enough, fall through to allocation.
-                    pBlock = nullptr;
-                    break;
+                    m_pLastCachedBlocks[bucket] = pPrevBlock;
                 }
-
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
             }
-
-            if (!pBlock)
+            else
             {
-                // Couldn't find an exact match, use next biggest size
-                pBlock = pPotentialBlock;
-                pPrevBlock = pPotentialPrev;
+                // Nothing reusable in the current list: fall back to the aged ("old") list.
+                pPrevBlock = &m_oldCachedBlocks[bucket];
+                pBlock = SearchBlocks(pPrevBlock, blockSize, align);
+                if (pBlock)
+                {
+                    m_oldCachedSize -= pBlock->blockSize;
+                    if (pBlock == m_pOldLastCachedBlocks[bucket])
+                    {
+                        m_pOldLastCachedBlocks[bucket] = pPrevBlock; // the old list lost its tail; the current list's tail is untouched
+                    }
+                }
             }
 
             if (pBlock)
@@ -154,7 +126,7 @@ struct CachingAllocatorT : DefaultAllocator
         return this->DefaultAllocator::AllocateAligned(size, align);
     }
 
-    void  Free(void* pMem)
+    void Free(void* pMem)
     {
         if (pMem)
         {
@@ -162,24 +134,57 @@ struct CachingAllocatorT : DefaultAllocator
             SWR_ASSERT(pNewBlock->blockSize >= 0);
 
             std::unique_lock<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
-            ArenaBlock* pBlock = pPrevBlock->pNext;
+            InsertCachedBlock(GetBucketId(pNewBlock->blockSize), pNewBlock);
+        }
+    }
 
-            while (pBlock)
+    void FreeOldBlocks() // Ages the cache: frees the old lists when they exceed MAX_UNUSED_SIZE, then moves the current lists to old.
+    {
+        if (!m_cachedSize) { return; } // NOTE(review): read before m_mutex is taken — confirm the unlocked early-out is intended
+        std::lock_guard<std::mutex> l(m_mutex);
+
+        bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE); // decided once, before any bucket is modified
+
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            if (doFree)
             {
-                if (pNewBlock->blockSize >= pBlock->blockSize)
+                ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
+                while (pBlock)
                 {
-                    // Insert here
-                    break;
+                    ArenaBlock* pNext = pBlock->pNext; // read the link before the block is freed
+                    m_oldCachedSize -= pBlock->blockSize;
+                    m_totalAllocated -= (pBlock->blockSize + ARENA_BLOCK_ALIGN);
+                    this->DefaultAllocator::Free(pBlock);
+                    pBlock = pNext;
                 }
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
+                m_oldCachedBlocks[i].pNext = nullptr;
+                m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; // old list is empty again: its tail is the list-head entry
             }
 
-            // Insert into list
-            SWR_ASSERT(pPrevBlock);
-            pPrevBlock->pNext = pNewBlock;
-            pNewBlock->pNext = pBlock;
+            if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i]) // tail moved off the list-head entry, i.e. the current list has blocks
+            {
+                m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext; // chain what remains of the old list behind the current tail
+                m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext; // the old list now starts with the former current blocks
+                m_cachedBlocks[i].pNext = nullptr;
+                if (m_pOldLastCachedBlocks[i]->pNext) // non-null only when the old tail is still the list-head entry, i.e. the old list was empty
+                {
+                    m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
+                }
+                m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; // current list is empty again
+            }
+        }
+
+        m_oldCachedSize += m_cachedSize; // the bytes counted as current are now accounted to the old lists
+        m_cachedSize = 0;
+    }
+
+    CachingAllocatorT() // Points every bucket's tail pointers at that bucket's list-head entries.
+    {
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; // FreeOldBlocks treats "tail == list-head entry" as an empty current list
+            m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; // same convention for the old list
         }
     }
 
@@ -195,21 +200,126 @@ struct CachingAllocatorT : DefaultAllocator
                 this->DefaultAllocator::Free(pBlock);
                 pBlock = pNext;
             }
+            pBlock = m_oldCachedBlocks[i].pNext;
+            while (pBlock)
+            {
+                ArenaBlock* pNext = pBlock->pNext;
+                this->DefaultAllocator::Free(pBlock);
+                pBlock = pNext;
+            }
         }
     }
 
+private:
+    static uint32_t GetBucketId(size_t blockSize) // Maps a block size to the index of the cache bucket used for it.
+    {
+        uint32_t bucketId = 0; // also the result when BitScanReverseSizeT is not defined
+
+#if defined(BitScanReverseSizeT)
+        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); // presumably stores the index of the highest set bit (MSVC _BitScanReverse semantics) — confirm
+        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); // clamp: larger sizes all land in the last bucket
+#endif
+
+        return bucketId;
+    }
+
+    void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock) // Links pNewBlock into the current list of bucketId; Free() calls this with m_mutex held.
+    {
+        SWR_ASSERT(bucketId < CACHE_NUM_BUCKETS);
+
+        ArenaBlock* pPrevBlock = &m_cachedBlocks[bucketId]; // start at the bucket's list-head entry
+        ArenaBlock* pBlock = pPrevBlock->pNext;
+
+        while (pBlock)
+        {
+            if (pNewBlock->blockSize >= pBlock->blockSize)
+            {
+                // First entry that is not larger than the new block: insert in front of it
+                break;
+            }
+            pPrevBlock = pBlock;
+            pBlock = pBlock->pNext;
+        }
+
+        // Splice the new block between pPrevBlock and pBlock (pBlock is nullptr at the end of the list)
+        SWR_ASSERT(pPrevBlock);
+        pPrevBlock->pNext = pNewBlock;
+        pNewBlock->pNext = pBlock;
+
+        if (m_pLastCachedBlocks[bucketId] == pPrevBlock) // inserted directly after the previous tail
+        {
+            m_pLastCachedBlocks[bucketId] = pNewBlock;
+        }
+
+        m_cachedSize += pNewBlock->blockSize; // running byte total of the current lists
+    }
+
+    static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align) // Finds an aligned block of at least blockSize in one list; pPrevBlock is in/out.
+    {
+        ArenaBlock* pBlock = pPrevBlock->pNext; // on entry pPrevBlock is the entry to start searching after (callers pass the list-head entry)
+        ArenaBlock* pPotentialBlock = nullptr;
+        ArenaBlock* pPotentialPrev = nullptr;
+
+        while (pBlock)
+        {
+            if (pBlock->blockSize >= blockSize)
+            {
+                if (pBlock == AlignUp(pBlock, align)) // only blocks whose address already satisfies align qualify
+                {
+                    if (pBlock->blockSize == blockSize)
+                    {
+                        // Exact size: stop here, pPrevBlock is this block's predecessor
+                        break;
+                    }
+
+                    // Larger than requested but usable: remember it and its predecessor,
+                    // then keep walking in case a closer fit follows
+                    pPotentialBlock = pBlock;
+                    pPotentialPrev = pPrevBlock;
+                }
+            }
+            else
+            {
+                // The list is expected to be sorted by size (biggest first),
+                // so once a block is too small the rest are assumed too small as well:
+                // stop scanning and fall back to the remembered candidate, if any.
+                pBlock = nullptr;
+                break;
+            }
+
+            pPrevBlock = pBlock;
+            pBlock = pBlock->pNext;
+        }
+
+        if (!pBlock)
+        {
+            // No exact match: return the last larger candidate and its predecessor (both nullptr if none)
+            pBlock = pPotentialBlock;
+            pPrevBlock = pPotentialPrev;
+        }
+
+        return pBlock;
+    }
+
     // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
     static const uint32_t   CACHE_NUM_BUCKETS       = NumBucketsT;
     static const uint32_t   CACHE_START_BUCKET_BIT  = StartBucketBitT;
+    static const size_t     MAX_UNUSED_SIZE         = 20 * sizeof(MEGABYTE);
 
     ArenaBlock              m_cachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock*             m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock              m_oldCachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock*             m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
     std::mutex              m_mutex;
 
     size_t                  m_totalAllocated = 0;
+
+    size_t                  m_cachedSize = 0;
+    size_t                  m_oldCachedSize = 0;
 };
 typedef CachingAllocatorT<> CachingAllocator;
 
-template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
+template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
 class TArena
 {
 public:
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 7fb83edf169..b2d3d9ef4f4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
 {
     RDTSC_START(BEDispatch);
 
@@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     SWR_ASSERT(pTaskData != nullptr);
 
     // Ensure spill fill memory has been allocated.
-    if (pDC->pSpillFill[workerId] == nullptr)
+    if (pSpillFillBuffer == nullptr)
     {
         ///@todo Add state which indicates the spill fill size.
-        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
+        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
     }
 
     const API_STATE& state = GetApiState(pDC);
@@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     csContext.pTGSM = pContext->pScratch[workerId];
-    csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
 
     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
 
@@ -772,8 +772,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     psContext.vOneOverW.centroid = psContext.vOneOverW.center;
                 }
 
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 simdmask clipCoverageMask = coverageMask & MASK;
@@ -793,7 +795,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 if(CanEarlyZ(pPSState))
                 {
                     RDTSC_START(BEEarlyDepthTest);
-                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -825,7 +827,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 if(!CanEarlyZ(pPSState))
                 {
                     RDTSC_START(BELateDepthTest);
-                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BELateDepthTest, 0, 0);
 
@@ -977,8 +979,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
@@ -1000,7 +1003,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     if (CanEarlyZ(pPSState))
                     {
                         RDTSC_START(BEEarlyDepthTest);
-                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -1033,7 +1036,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     if (!CanEarlyZ(pPSState))
                     {
                         RDTSC_START(BELateDepthTest);
-                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BELateDepthTest, 0, 0);
 
@@ -1200,8 +1203,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 RDTSC_START(BEBarycentric);
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
 
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 // execute pixel shader
@@ -1263,10 +1267,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     // calc I & J per sample
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     if (!pPSState->writesODepth)
                     {
                         vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                        vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
                     }
                     
                     ///@todo: perspective correct vs non-perspective correct clipping?
@@ -1292,7 +1297,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 // ZTest for this sample
                 RDTSC_START(BEEarlyDepthTest);
                 stencilPassMask[sample] = vCoverageMask[sample];
-                depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
                                         vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
                 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -1308,8 +1313,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             {
                 RDTSC_START(BEBarycentric);
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 // execute pixel shader
@@ -1463,8 +1469,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
@@ -1483,7 +1490,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
 
                     RDTSC_START(BEEarlyDepthTest);
-                    simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                     DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 2fa18953cad..d0626b997af 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -32,7 +32,7 @@
 #include "core/context.h"
 #include "core/multisample.h"
 
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 3a2a8b35be8..e624fd8f674 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -162,8 +162,8 @@ int ClipTriToPlane( const float *pInPts, int numInPts,
 void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
 {
     // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping
-    OSALIGN(float, 16) tempPts[6 * 4];
-    OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
+    OSALIGNSIMD(float) tempPts[6 * 4];
+    OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
 
     // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
     int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index ba5870a92bb..67a4c4f47bb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -265,8 +265,8 @@ public:
     // clip a single primitive
     int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
     {
-        OSALIGN(float, 16) inVerts[3 * 4];
-        OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
+        OSALIGNSIMD(float) inVerts[3 * 4];
+        OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
 
         // transpose primitive position
         __m128 verts[3];
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 39f23372a18..6464aa20af7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -308,6 +308,8 @@ OSALIGNLINE(struct) API_STATE
         uint32_t depthHottileEnable: 1;
         uint32_t stencilHottileEnable : 1;
     };
+
+    PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
 };
 
 class MacroTileMgr;
@@ -380,32 +382,29 @@ struct DRAW_STATE
 //    This draw context maintains all of the state needed for the draw operation.
 struct DRAW_CONTEXT
 {
-    SWR_CONTEXT *pContext;
-
-    uint64_t drawId;
-
-    bool isCompute;    // Is this DC a compute context?
-
-    FE_WORK FeWork;
-    volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(bool) doneFE;    // Is FE work done for this draw?
-    volatile OSALIGNLINE(int64_t) threadsDone;
-
-    uint64_t dependency;
-
-    MacroTileMgr* pTileMgr;
-
-    // The following fields are valid if isCompute is true.
-    DispatchQueue* pDispatch;               // Queue for thread groups. (isCompute)
+    SWR_CONTEXT*    pContext;
+    uint64_t        drawId;
+    union                               // both members occupy the same storage; readers pick one by isCompute
+    {
+        MacroTileMgr*   pTileMgr;       // read when isCompute is false
+        DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    };
+    uint64_t        dependency;
+    DRAW_STATE*     pState;
+    CachingArena*   pArena;
 
-    DRAW_STATE* pState;
-    CachingArena* pArena;
+    bool            isCompute;      // Is this DC a compute context?
+    bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
+    volatile bool   doneFE;         // Is FE work done for this draw?
 
-    uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
+    FE_WORK         FeWork;
 
-    bool  cleanupState; // True if this is the last draw using an entry in the state ring.
+    volatile OSALIGNLINE(uint32_t)   FeLock;
+    volatile int64_t    threadsDone;
 };
 
+static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
+
 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
 {
     SWR_ASSERT(pDC != nullptr);
@@ -447,6 +446,9 @@ struct SWR_CONTEXT
     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
 
+    MacroTileMgr* pMacroTileManagerArray;
+    DispatchQueue* pDispatchQueueArray;
+
     // Draw State Ring
     //  When draw are very large (lots of primitives) then the API thread will break these up.
     //  These split draws all have identical state. So instead of storing the state directly
@@ -457,6 +459,8 @@ struct SWR_CONTEXT
     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
 
     uint32_t NumWorkerThreads;
+    uint32_t NumFEThreads;
+    uint32_t NumBEThreads;
 
     THREAD_POOL threadPool; // Thread pool associated with this context
 
@@ -481,6 +485,7 @@ struct SWR_CONTEXT
     uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
 
     CachingAllocator cachingArenaAllocator;
+    uint32_t frameCount;
 };
 
 void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 2cc9d4054ac..7b55580bf0a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -80,14 +80,52 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
 }
 
 
+template<SWR_FORMAT depthFormatT>
+simdscalar QuantizeDepth(simdscalar depth)    // returns depth snapped to the precision of depthFormatT's component 0
+{
+    SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);    // numeric type of component 0
+    uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);      // bit count of component 0
+
+    if (depthType == SWR_TYPE_FLOAT)
+    {
+        // assume only 32bit float depth supported
+        SWR_ASSERT(depthBpc == 32);
+
+        // matches shader precision, no quantizing needed
+        return depth;
+    }
+
+    // should be unorm depth if not float
+    SWR_ASSERT(depthType == SWR_TYPE_UNORM);
+
+    float quantize = (float)((1 << depthBpc) - 1);    // largest unorm code, 2^bpc - 1; the shift is done in int, so it is only well-defined for depthBpc < 31
+    simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));    // scale normalized depth up to the code range
+    result = _simd_add_ps(result, _simd_set1_ps(0.5f));
+    result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);    // +0.5 then truncate: nearest code for non-negative input
+    // scale the rounded code back down by the same factor
+    if (depthBpc > 16)
+    {
+        result = _simd_div_ps(result, _simd_set1_ps(quantize));    // NOTE(review): true divide for wide formats, presumably for precision -- confirm
+    }
+    else
+    {
+        result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));    // reciprocal multiply for formats of 16 bits or fewer
+    }
+
+    return result;
+}
+
 INLINE
-simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
+simdscalar DepthStencilTest(const API_STATE* pState,
                  bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
                  simdscalar* pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
     static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
 
+    const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
+    const SWR_VIEWPORT* pViewport = &pState->vp[0];
+
     simdscalar depthResult = _simd_set1_ps(-1.0f);
     simdscalar zbuf;
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 36721e00beb..93869610ff9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -793,8 +793,14 @@ static void GeometryShaderStage(
             uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
             
             DWORD numAttribs;
-            _BitScanReverse(&numAttribs, state.feAttribMask);
-            numAttribs++;
+            if (_BitScanReverse(&numAttribs, state.feAttribMask))
+            {
+                numAttribs++;
+            }
+            else
+            {
+                numAttribs = 0;
+            }
 
             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
             {
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index d7feb86273d..55a22a67f4c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -45,14 +45,17 @@
 #define KNOB_ARCH_ISA AVX
 #define KNOB_ARCH_STR "AVX"
 #define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
 #elif (KNOB_ARCH == KNOB_ARCH_AVX2)
 #define KNOB_ARCH_ISA AVX2
 #define KNOB_ARCH_STR "AVX2"
 #define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
 #elif (KNOB_ARCH == KNOB_ARCH_AVX512)
 #define KNOB_ARCH_ISA AVX512F
 #define KNOB_ARCH_STR "AVX512"
 #define KNOB_SIMD_WIDTH 16
+#define KNOB_SIMD_BYTES 64
 #error "AVX512 not yet supported"
 #else
 #error "Unknown architecture"
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index f8f1a33b7e3..17f488538d6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -1017,13 +1017,13 @@ struct PA_TESS : PA_STATE
     {
         SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
 #if KNOB_SIMD_WIDTH == 8
-        static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
+        static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
         {
             -1, -1, -1, -1, -1, -1, -1, -1,
              0,  0,  0,  0,  0,  0,  0,  0
         };
 #elif KNOB_SIMD_WIDTH == 16
-        static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
+        static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
         {
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
              0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
@@ -1167,8 +1167,14 @@ struct PA_FACTORY
         {
             memset(&indexStore, 0, sizeof(indexStore));
             DWORD numAttribs;
-            _BitScanReverse(&numAttribs, state.feAttribMask);
-            numAttribs++;
+            if (_BitScanReverse(&numAttribs, state.feAttribMask))
+            {
+                numAttribs++;
+            }
+            else
+            {
+                numAttribs = 0;
+            }
             new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, 
                 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
             cutPA = true;
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 52fb7c88cdd..3144a901c91 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -383,7 +383,7 @@ __declspec(thread) volatile uint64_t gToss;
 
 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
 // try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
+static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
 
 INLINE
 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
@@ -439,7 +439,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     const SWR_RASTSTATE &rastState = state.rastState;
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
-    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
 
     __m128 vX, vY, vZ, vRecipW;
@@ -502,7 +502,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
     _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
 
-    OSALIGN(float, 16) oneOverW[4];
+    OSALIGNSIMD(float) oneOverW[4];
     _mm_store_ps(oneOverW, vRecipW);
     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
@@ -537,7 +537,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     // compute bary Z
     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
-    OSALIGN(float, 16) a[4];
+    OSALIGNSIMD(float) a[4];
     _mm_store_ps(a, vZ);
     triDesc.Z[0] = a[0] - a[2];
     triDesc.Z[1] = a[1] - a[2];
@@ -575,7 +575,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     }
 
     // Calc bounding box of triangle
-    OSALIGN(BBOX, 16) bbox;
+    OSALIGNSIMD(BBOX) bbox;
     calcBoundingBoxInt(vXi, vYi, bbox);
 
     // Intersect with scissor/viewport
@@ -594,7 +594,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
 
-    OSALIGN(BBOX, 16) intersect;
+    OSALIGNSIMD(BBOX) intersect;
     intersect.left   = std::max(bbox.left, macroBoxLeft);
     intersect.top    = std::max(bbox.top, macroBoxTop);
     intersect.right  = std::min(bbox.right, macroBoxRight);
@@ -1047,7 +1047,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi
         { 50, 51, 54, 55, 58, 59, 62, 63 }
     };
 
-    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
 
     // pull point information from triangle buffer
     // @todo use structs for readability
@@ -1286,7 +1286,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     // make sure this macrotile intersects the triangle
     __m128i vXai = fpToFixedPoint(vXa);
     __m128i vYai = fpToFixedPoint(vYa);
-    OSALIGN(BBOX, 16) bboxA;
+    OSALIGNSIMD(BBOX) bboxA;
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
     if (!(bboxA.left > macroBoxRight ||
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 5752094ca10..50361068025 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -790,6 +790,7 @@ typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
+typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar);
 
 //////////////////////////////////////////////////////////////////////////
 /// FRONTEND_STATE
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 07bc94a1a54..4b7a207f366 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -68,7 +68,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
 
 #if defined(_WIN32)
 
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
+    static std::mutex m;
+    std::lock_guard<std::mutex> l(m);
+
+    static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
     DWORD bufSize = sizeof(buffer);
 
     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
@@ -288,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     {
         // Cleanup memory allocations
         pDC->pArena->Reset(true);
-        pDC->pTileMgr->initialize();
+        if (!pDC->isCompute)    // pTileMgr shares a union with pDispatch, so it is only dereferenced for non-compute DCs
+        {
+            pDC->pTileMgr->initialize();
+        }
         if (pDC->cleanupState)
         {
             pDC->pState->pArena->Reset(true);
@@ -302,10 +308,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     return result;
 }
 
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
 {
     // increment our current draw id to the first incomplete draw
-    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    drawEnqueued = GetEnqueuedDraw(pContext);
     while (curDrawBE < drawEnqueued)
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -313,8 +319,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
         // If its not compute and FE is not done then break out of loop.
         if (!pDC->doneFE && !pDC->isCompute) break;
 
-        bool isWorkComplete = (pDC->isCompute) ?
-            pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+        bool isWorkComplete = pDC->isCompute ?
+            pDC->pDispatch->isWorkComplete() :
+            pDC->pTileMgr->isWorkComplete();
 
         if (isWorkComplete)
         {
@@ -355,7 +362,8 @@ void WorkOnFifoBE(
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
-    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    uint64_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
     {
         return;
     }
@@ -370,7 +378,7 @@ void WorkOnFifoBE(
     //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
     //      working on those macrotiles that are known to be complete in the prior draw to
     //      maintain order. The locked tiles provides the history to ensures this.
-    for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
 
@@ -463,7 +471,7 @@ void WorkOnFifoBE(
     }
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
 {
     // Try to grab the next DC from the ring
     uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -516,38 +524,44 @@ void WorkOnCompute(
     uint32_t workerId,
     uint64_t& curDrawBE)
 {
-    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    uint64_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
     {
         return;
     }
 
     uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
 
-    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
-    if (pDC->isCompute == false) return;
-
-    // check dependencies
-    if (CheckDependency(pContext, pDC, lastRetiredDraw))
+    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)    // bound on i: curDrawBE is not modified inside this loop
     {
-        return;
-    }
+        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];    // ring slot for draw i
+        if (pDC->isCompute == false) return;    // stop at the first non-compute draw
 
-    SWR_ASSERT(pDC->pDispatch != nullptr);
-    DispatchQueue& queue = *pDC->pDispatch;
+        // check dependencies
+        if (CheckDependency(pContext, pDC, lastRetiredDraw))
+        {
+            return;
+        }
 
-    // Is there any work remaining?
-    if (queue.getNumQueued() > 0)
-    {
-        uint32_t threadGroupId = 0;
-        while (queue.getWork(threadGroupId))
+        SWR_ASSERT(pDC->pDispatch != nullptr);
+        DispatchQueue& queue = *pDC->pDispatch;
+
+        // Is there any work remaining?
+        if (queue.getNumQueued() > 0)
         {
-            ProcessComputeBE(pDC, workerId, threadGroupId);
+            void* pSpillFillBuffer = nullptr;
+            uint32_t threadGroupId = 0;
+            while (queue.getWork(threadGroupId))
+            {
+                ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
 
-            queue.finishedWork();
+                queue.finishedWork();
+            }
         }
     }
 }
 
+template<bool IsFEThread, bool IsBEThread>
 DWORD workerThreadMain(LPVOID pData)
 {
     THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
@@ -631,25 +645,38 @@ DWORD workerThreadMain(LPVOID pData)
             }
         }
 
-        RDTSC_START(WorkerWorkOnFifoBE);
-        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+        if (IsBEThread)
+        {
+            RDTSC_START(WorkerWorkOnFifoBE);
+            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
 
-        WorkOnCompute(pContext, workerId, curDrawBE);
+            WorkOnCompute(pContext, workerId, curDrawBE);
+        }
 
-        WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
+        if (IsFEThread)
+        {
+            WorkOnFifoFE(pContext, workerId, curDrawFE);
+
+            if (!IsBEThread)
+            {
+                curDrawBE = curDrawFE;    // FE-only threads skip the BE calls above, so curDrawBE is set from curDrawFE here
+            }
+        }
     }
 
     return 0;
 }
+template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
 
+template <bool IsFEThread, bool IsBEThread>
 DWORD workerThreadInit(LPVOID pData)
 {
 #if defined(_WIN32)
     __try
 #endif // _WIN32
     {
-        return workerThreadMain(pData);
+        return workerThreadMain<IsFEThread, IsBEThread>(pData);
     }
 
 #if defined(_WIN32)
@@ -661,6 +688,7 @@ DWORD workerThreadInit(LPVOID pData)
 
     return 1;
 }
+template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
@@ -678,6 +706,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     uint32_t numCoresPerNode    = numHWCoresPerNode;
     uint32_t numHyperThreads    = numHWHyperThreads;
 
+    if (KNOB_MAX_WORKER_THREADS)
+    {
+        SET_KNOB(HYPERTHREADED_FE, false);    // an explicit worker-thread cap turns hyperthreaded FE off
+    }
+
+    if (KNOB_HYPERTHREADED_FE)
+    {
+        SET_KNOB(MAX_THREADS_PER_CORE, 0);    // clear the per-core thread limit (0 presumably means "no limit" -- confirm)
+    }
+
     if (KNOB_MAX_NUMA_NODES)
     {
         numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
@@ -693,6 +731,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
     }
 
+    if (numHyperThreads < 2)
+    {
+        SET_KNOB(HYPERTHREADED_FE, false);
+    }
+
     // Calculate numThreads
     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
 
@@ -767,9 +810,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
             pPool->pThreadData[workerId].threadId = 0;
             pPool->pThreadData[workerId].numaId = 0;
+            pPool->pThreadData[workerId].coreId = 0;
+            pPool->pThreadData[workerId].htId = 0;
             pPool->pThreadData[workerId].pContext = pContext;
             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-            pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+            pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+
+            pContext->NumBEThreads++;
+            pContext->NumFEThreads++;
         }
     }
     else
@@ -780,6 +828,10 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         for (uint32_t n = 0; n < numNodes; ++n)
         {
             auto& node = nodes[n];
+            if (node.cores.size() == 0)
+            {
+               continue;
+            }
 
             uint32_t numCores = numCoresPerNode;
             for (uint32_t c = 0; c < numCores; ++c)
@@ -797,8 +849,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
                     pPool->pThreadData[workerId].threadId = core.threadIds[t];
                     pPool->pThreadData[workerId].numaId = n;
+                    pPool->pThreadData[workerId].coreId = c;
+                    pPool->pThreadData[workerId].htId = t;
                     pPool->pThreadData[workerId].pContext = pContext;
-                    pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+
+                    if (KNOB_HYPERTHREADED_FE)
+                    {
+                        if (t == 0)    // hyperthread index 0 on this core: back-end only
+                        {
+                            pContext->NumBEThreads++;
+                            pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
+                        }
+                        else    // every other hyperthread on this core: front-end only
+                        {
+                            pContext->NumFEThreads++;
+                            pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
+                        }
+                    }
+                    else    // knob off: each worker runs both FE and BE work and is counted in both totals
+                    {
+                        pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+                        pContext->NumBEThreads++;
+                        pContext->NumFEThreads++;
+                    }
 
                     ++workerId;
                 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 821d7dcb16e..3aba6323a95 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -41,6 +41,8 @@ struct THREAD_DATA
     uint32_t procGroupId;   // Will always be 0 for non-Windows OS
     uint32_t threadId;      // within the procGroup for Windows
     uint32_t numaId;        // NUMA node id
+    uint32_t coreId;        // Core id
+    uint32_t htId;          // Hyperthread id
     uint32_t workerId;
     SWR_CONTEXT *pContext;
     bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
@@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
 void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
 int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
 \ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 794577270cf..87d9f42c032 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -35,27 +35,6 @@
 
 #define TILE_ID(x,y) ((x << 16 | y))
 
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void MacroTileMgr::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
-void* DispatchQueue::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void DispatchQueue::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
@@ -304,7 +283,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
 {
     const API_STATE& state = GetApiState(pDC);
-    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
 
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index aa561badc1c..82a15e16a33 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -140,9 +140,6 @@ public:
         x = (tileID >> 16) & 0xffff;
     }
 
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
 private:
     CachingArena& mArena;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
@@ -229,9 +226,6 @@ public:
         return mpTaskData;
     }
 
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
     void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
 
     OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
@@ -272,7 +266,7 @@ class HotTileMgr
 public:
     HotTileMgr()
     {
-        memset(&mHotTiles[0][0], 0, sizeof(mHotTiles));
+        memset(mHotTiles, 0, sizeof(mHotTiles));
 
         // cache hottile size
         for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 0f3ded68544..3832b91d93e 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -30,6 +30,18 @@ KNOBS = [
         'category'  : 'debug',
     }],
 
+    ['HYPERTHREADED_FE', {
+        'type'      : 'bool',
+        'default'   : 'false',
+        'desc'      : ['EXPERIMENTAL!!',
+                       'If enabled will attempt to use secondary threads per core to perform',
+                       'front-end (VS/GS) work.',
+                       '',
+                       'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
+        'category'  : 'perf',
+        'advanced'  : 'true',
+    }],
+
     ['DUMP_SHADER_IR', {
         'type'      : 'bool',
         'default'   : 'false',
@@ -166,6 +178,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_FETCH', {
@@ -175,6 +188,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_IA', {
@@ -184,6 +198,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_VS', {
@@ -193,6 +208,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_SETUP_TRIS', {
@@ -202,6 +218,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_BIN_TRIS', {
@@ -211,6 +228,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_RS', {
@@ -220,4 +238,5 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],]
diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp
index 810c50b2f8f..e4b8b683278 100644
--- a/src/gallium/drivers/swr/swr_query.cpp
+++ b/src/gallium/drivers/swr/swr_query.cpp
@@ -319,6 +319,12 @@ swr_check_render_cond(struct pipe_context *pipe)
       return TRUE;
 }
 
+
+static void
+swr_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{  /* empty body: neither argument is used */
+}
+
 void
 swr_query_init(struct pipe_context *pipe)
 {
@@ -329,6 +335,7 @@ swr_query_init(struct pipe_context *pipe)
    pipe->begin_query = swr_begin_query;
    pipe->end_query = swr_end_query;
    pipe->get_query_result = swr_get_query_result;
+   pipe->set_active_query_state = swr_set_active_query_state;
 
    ctx->active_queries = 0;
 }
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index f9e52be2367..a0a6324f334 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -337,6 +337,11 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
    case PIPE_CAP_QUERY_BUFFER_OBJECT:
    case PIPE_CAP_QUERY_MEMORY_INFO:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
+   case PIPE_CAP_PCI_GROUP:
+   case PIPE_CAP_PCI_BUS:
+   case PIPE_CAP_PCI_DEVICE:
+   case PIPE_CAP_PCI_FUNCTION:
       return 0;
    }
 
diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp
index ff16d0f2f11..83e32163ecc 100644
--- a/src/gallium/drivers/swr/swr_shader.cpp
+++ b/src/gallium/drivers/swr/swr_shader.cpp
@@ -40,32 +40,29 @@
 #include "swr_state.h"
 #include "swr_screen.h"
 
-bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs)
+bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs)
 {
    return !memcmp(&lhs, &rhs, sizeof(lhs));
 }
 
-void
-swr_generate_fs_key(struct swr_jit_key &key,
-                    struct swr_context *ctx,
-                    swr_fragment_shader *swr_fs)
+bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs)
 {
-   key.nr_cbufs = ctx->framebuffer.nr_cbufs;
-   key.light_twoside = ctx->rasterizer->light_twoside;
-   memcpy(&key.vs_output_semantic_name,
-          &ctx->vs->info.base.output_semantic_name,
-          sizeof(key.vs_output_semantic_name));
-   memcpy(&key.vs_output_semantic_idx,
-          &ctx->vs->info.base.output_semantic_index,
-          sizeof(key.vs_output_semantic_idx));
+   return !memcmp(&lhs, &rhs, sizeof(lhs));
+}
 
-   key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
+static void
+swr_generate_sampler_key(const struct lp_tgsi_info &info,
+                         struct swr_context *ctx,
+                         unsigned shader_type,
+                         struct swr_jit_sampler_key &key)
+{
+   key.nr_samplers = info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    for (unsigned i = 0; i < key.nr_samplers; i++) {
-      if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+      if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
          lp_sampler_static_sampler_state(
             &key.sampler[i].sampler_state,
-            ctx->samplers[PIPE_SHADER_FRAGMENT][i]);
+            ctx->samplers[shader_type][i]);
       }
    }
 
@@ -74,28 +71,58 @@ swr_generate_fs_key(struct swr_jit_key &key,
     * are dx10-style? Can't really have mixed opcodes, at least not
     * if we want to skip the holes here (without rescanning tgsi).
     */
-   if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
+   if (info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
       key.nr_sampler_views =
-         swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
+         info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
       for (unsigned i = 0; i < key.nr_sampler_views; i++) {
-         if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
+         if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) {
             lp_sampler_static_texture_state(
                &key.sampler[i].texture_state,
-               ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]);
+               ctx->sampler_views[shader_type][i]);
          }
       }
    } else {
       key.nr_sampler_views = key.nr_samplers;
       for (unsigned i = 0; i < key.nr_sampler_views; i++) {
-         if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+         if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
             lp_sampler_static_texture_state(
                &key.sampler[i].texture_state,
-               ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]);
+               ctx->sampler_views[shader_type][i]);
          }
       }
    }
 }
 
+void
+swr_generate_fs_key(struct swr_jit_fs_key &key,
+                    struct swr_context *ctx,
+                    swr_fragment_shader *swr_fs)
+{
+   memset(&key, 0, sizeof(key));
+
+   key.nr_cbufs = ctx->framebuffer.nr_cbufs;
+   key.light_twoside = ctx->rasterizer->light_twoside;
+   key.flatshade = ctx->rasterizer->flatshade;
+   memcpy(&key.vs_output_semantic_name,
+          &ctx->vs->info.base.output_semantic_name,
+          sizeof(key.vs_output_semantic_name));
+   memcpy(&key.vs_output_semantic_idx,
+          &ctx->vs->info.base.output_semantic_index,
+          sizeof(key.vs_output_semantic_idx));
+
+   swr_generate_sampler_key(swr_fs->info, ctx, PIPE_SHADER_FRAGMENT, key);
+}
+
+void
+swr_generate_vs_key(struct swr_jit_vs_key &key,
+                    struct swr_context *ctx,
+                    swr_vertex_shader *swr_vs)
+{
+   memset(&key, 0, sizeof(key));
+
+   swr_generate_sampler_key(swr_vs->info, ctx, PIPE_SHADER_VERTEX, key);
+}
+
 struct BuilderSWR : public Builder {
    BuilderSWR(JitManager *pJitMgr)
       : Builder(pJitMgr)
@@ -103,14 +130,15 @@ struct BuilderSWR : public Builder {
       pJitMgr->SetupNewModule();
    }
 
-   PFN_VERTEX_FUNC
-   CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs);
-   PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key);
+   PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key);
+   PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key);
 };
 
 PFN_VERTEX_FUNC
-BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs)
+BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key)
 {
+   struct swr_vertex_shader *swr_vs = ctx->vs;
+
    swr_vs->linkageMask = 0;
 
    for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) {
@@ -180,6 +208,9 @@ BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs)
       }
    }
 
+   struct lp_build_sampler_soa *sampler =
+      swr_sampler_soa_create(key.sampler, PIPE_SHADER_VERTEX);
+
    struct lp_bld_tgsi_system_values system_values;
    memset(&system_values, 0, sizeof(system_values));
    system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID}));
@@ -194,9 +225,9 @@ BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs)
                      &system_values,
                      inputs,
                      outputs,
-                     NULL, // wrap(hPrivateData), (sampler context)
+                     wrap(hPrivateData), // (sampler context)
                      NULL, // thread data
-                     NULL, // sampler
+                     sampler, // sampler
                      &swr_vs->info.base,
                      NULL); // geometry shader face
 
@@ -239,11 +270,11 @@ BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs)
 }
 
 PFN_VERTEX_FUNC
-swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs)
+swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key)
 {
    BuilderSWR builder(
-      reinterpret_cast<JitManager *>(swr_screen(ctx->screen)->hJitMgr));
-   return builder.CompileVS(ctx, swr_vs);
+      reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr));
+   return builder.CompileVS(ctx, key);
 }
 
 static unsigned
@@ -269,7 +300,7 @@ locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info)
 }
 
 PFN_PIXEL_KERNEL
-BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key)
+BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key)
 {
    struct swr_fragment_shader *swr_fs = ctx->fs;
 
@@ -461,6 +492,9 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key)
 
             if (interpMode == TGSI_INTERPOLATE_CONSTANT) {
                inputs[attrib][channel] = wrap(va);
+            } else if ((interpMode == TGSI_INTERPOLATE_COLOR) &&
+                       (key.flatshade == true)) {
+               inputs[attrib][channel] = wrap(vc);
             } else {
                Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj);
 
@@ -478,7 +512,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key)
       }
    }
 
-   sampler = swr_sampler_soa_create(key.sampler);
+   sampler = swr_sampler_soa_create(key.sampler, PIPE_SHADER_FRAGMENT);
 
    struct lp_bld_tgsi_system_values system_values;
    memset(&system_values, 0, sizeof(system_values));
@@ -583,7 +617,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key)
 }
 
 PFN_PIXEL_KERNEL
-swr_compile_fs(struct swr_context *ctx, swr_jit_key &key)
+swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key)
 {
    BuilderSWR builder(
       reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr));
diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h
index e22a7c48c2a..3f79570bbd9 100644
--- a/src/gallium/drivers/swr/swr_shader.h
+++ b/src/gallium/drivers/swr/swr_shader.h
@@ -25,36 +25,56 @@
 
 class swr_vertex_shader;
 class swr_fragment_shader;
-class swr_jit_key;
+class swr_jit_fs_key;
+class swr_jit_vs_key;
 
 PFN_VERTEX_FUNC
-swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs);
+swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key);
 
 PFN_PIXEL_KERNEL
-swr_compile_fs(struct swr_context *ctx, swr_jit_key &key);
+swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key);
 
-void swr_generate_fs_key(struct swr_jit_key &key,
+void swr_generate_fs_key(struct swr_jit_fs_key &key,
                          struct swr_context *ctx,
                          swr_fragment_shader *swr_fs);
 
-struct swr_jit_key {
+void swr_generate_vs_key(struct swr_jit_vs_key &key,
+                         struct swr_context *ctx,
+                         swr_vertex_shader *swr_vs);
+
+struct swr_jit_sampler_key {
+   unsigned nr_samplers;
+   unsigned nr_sampler_views;
+   struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+};
+
+struct swr_jit_fs_key : swr_jit_sampler_key {
    unsigned nr_cbufs;
    unsigned light_twoside;
+   unsigned flatshade;
    ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS];
    ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS];
-   unsigned nr_samplers;
-   unsigned nr_sampler_views;
-   struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS];
+};
+
+struct swr_jit_vs_key : swr_jit_sampler_key {
 };
 
 namespace std
 {
-template <> struct hash<swr_jit_key> {
-   std::size_t operator()(const swr_jit_key &k) const
+template <> struct hash<swr_jit_fs_key> {
+   std::size_t operator()(const swr_jit_fs_key &k) const
+   {
+      return util_hash_crc32(&k, sizeof(k));
+   }
+};
+
+template <> struct hash<swr_jit_vs_key> {
+   std::size_t operator()(const swr_jit_vs_key &k) const
    {
       return util_hash_crc32(&k, sizeof(k));
    }
 };
 };
 
-bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs);
+bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs);
+bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs);
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
index e7bf3618a7d..ded51a9b196 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -317,8 +317,7 @@ static void *
 swr_create_vs_state(struct pipe_context *pipe,
                     const struct pipe_shader_state *vs)
 {
-   struct swr_vertex_shader *swr_vs =
-      (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader);
+   struct swr_vertex_shader *swr_vs = new swr_vertex_shader();
    if (!swr_vs)
       return NULL;
 
@@ -327,8 +326,6 @@ swr_create_vs_state(struct pipe_context *pipe,
 
    lp_build_tgsi_info(vs->tokens, &swr_vs->info);
 
-   swr_vs->func = swr_compile_vs(pipe, swr_vs);
-
    swr_vs->soState = {0};
 
    if (swr_vs->pipe.stream_output.num_outputs) {
@@ -368,7 +365,7 @@ swr_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs;
    FREE((void *)swr_vs->pipe.tokens);
-   FREE(vs);
+   delete swr_vs;
 }
 
 static void *
@@ -675,6 +672,58 @@ swr_update_resource_status(struct pipe_context *pipe,
    }
 }
 
+static void
+swr_update_texture_state(struct swr_context *ctx,
+                         unsigned shader_type,
+                         unsigned num_sampler_views,
+                         swr_jit_texture *textures)
+{
+   for (unsigned i = 0; i < num_sampler_views; i++) {
+      struct pipe_sampler_view *view =
+         ctx->sampler_views[shader_type][i];
+
+      if (view) {
+         struct pipe_resource *res = view->texture;
+         struct swr_resource *swr_res = swr_resource(res);
+         struct swr_jit_texture *jit_tex = &textures[i];
+         memset(jit_tex, 0, sizeof(*jit_tex));
+         jit_tex->width = res->width0;
+         jit_tex->height = res->height0;
+         jit_tex->depth = res->depth0;
+         jit_tex->first_level = view->u.tex.first_level;
+         jit_tex->last_level = view->u.tex.last_level;
+         jit_tex->base_ptr = swr_res->swr.pBaseAddress;
+
+         for (unsigned level = jit_tex->first_level;
+              level <= jit_tex->last_level;
+              level++) {
+            jit_tex->row_stride[level] = swr_res->row_stride[level];
+            jit_tex->img_stride[level] = swr_res->img_stride[level];
+            jit_tex->mip_offsets[level] = swr_res->mip_offsets[level];
+         }
+      }
+   }
+}
+
+static void
+swr_update_sampler_state(struct swr_context *ctx,
+                         unsigned shader_type,
+                         unsigned num_samplers,
+                         swr_jit_sampler *samplers)
+{
+   for (unsigned i = 0; i < num_samplers; i++) {
+      const struct pipe_sampler_state *sampler =
+         ctx->samplers[shader_type][i];
+
+      if (sampler) {
+         samplers[i].min_lod = sampler->min_lod;
+         samplers[i].max_lod = sampler->max_lod;
+         samplers[i].lod_bias = sampler->lod_bias;
+         COPY_4V(samplers[i].border_color, sampler->border_color.f);
+      }
+   }
+}
+
 void
 swr_update_derived(struct pipe_context *pipe,
                    const struct pipe_draw_info *p_draw_info)
@@ -974,14 +1023,43 @@ swr_update_derived(struct pipe_context *pipe,
    }
 
    /* VertexShader */
-   if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_FRAMEBUFFER)) {
-      SwrSetVertexFunc(ctx->swrContext, ctx->vs->func);
+   if (ctx->dirty & (SWR_NEW_VS |
+                     SWR_NEW_SAMPLER |
+                     SWR_NEW_SAMPLER_VIEW |
+                     SWR_NEW_FRAMEBUFFER)) {
+      swr_jit_vs_key key;
+      swr_generate_vs_key(key, ctx, ctx->vs);
+      auto search = ctx->vs->map.find(key);
+      PFN_VERTEX_FUNC func;
+      if (search != ctx->vs->map.end()) {
+         func = search->second;
+      } else {
+         func = swr_compile_vs(ctx, key);
+         ctx->vs->map.insert(std::make_pair(key, func));
+      }
+      SwrSetVertexFunc(ctx->swrContext, func);
+
+      /* JIT sampler state */
+      if (ctx->dirty & SWR_NEW_SAMPLER) {
+         swr_update_sampler_state(ctx,
+                                  PIPE_SHADER_VERTEX,
+                                  key.nr_samplers,
+                                  ctx->swrDC.samplersVS);
+      }
+
+      /* JIT sampler view state */
+      if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
+         swr_update_texture_state(ctx,
+                                  PIPE_SHADER_VERTEX,
+                                  key.nr_sampler_views,
+                                  ctx->swrDC.texturesVS);
+      }
    }
 
-   swr_jit_key key;
+   /* FragmentShader */
    if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW
                      | SWR_NEW_RASTERIZER | SWR_NEW_FRAMEBUFFER)) {
-      memset(&key, 0, sizeof(key));
+      swr_jit_fs_key key;
       swr_generate_fs_key(key, ctx, ctx->fs);
       auto search = ctx->fs->map.find(key);
       PFN_PIXEL_KERNEL func;
@@ -1031,56 +1109,25 @@ swr_update_derived(struct pipe_context *pipe,
       psState.usesUAV = false; // XXX
       psState.forceEarlyZ = false;
       SwrSetPixelShaderState(ctx->swrContext, &psState);
-   }
-
-   /* JIT sampler state */
-   if (ctx->dirty & SWR_NEW_SAMPLER) {
-      swr_draw_context *pDC = &ctx->swrDC;
 
-      for (unsigned i = 0; i < key.nr_samplers; i++) {
-         const struct pipe_sampler_state *sampler =
-            ctx->samplers[PIPE_SHADER_FRAGMENT][i];
-
-         if (sampler) {
-            pDC->samplersFS[i].min_lod = sampler->min_lod;
-            pDC->samplersFS[i].max_lod = sampler->max_lod;
-            pDC->samplersFS[i].lod_bias = sampler->lod_bias;
-            COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f);
-         }
+      /* JIT sampler state */
+      if (ctx->dirty & SWR_NEW_SAMPLER) {
+         swr_update_sampler_state(ctx,
+                                  PIPE_SHADER_FRAGMENT,
+                                  key.nr_samplers,
+                                  ctx->swrDC.samplersFS);
       }
-   }
-
-   /* JIT sampler view state */
-   if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
-      swr_draw_context *pDC = &ctx->swrDC;
 
-      for (unsigned i = 0; i < key.nr_sampler_views; i++) {
-         struct pipe_sampler_view *view =
-            ctx->sampler_views[PIPE_SHADER_FRAGMENT][i];
-
-         if (view) {
-            struct pipe_resource *res = view->texture;
-            struct swr_resource *swr_res = swr_resource(res);
-            struct swr_jit_texture *jit_tex = &pDC->texturesFS[i];
-            memset(jit_tex, 0, sizeof(*jit_tex));
-            jit_tex->width = res->width0;
-            jit_tex->height = res->height0;
-            jit_tex->depth = res->depth0;
-            jit_tex->first_level = view->u.tex.first_level;
-            jit_tex->last_level = view->u.tex.last_level;
-            jit_tex->base_ptr = swr_res->swr.pBaseAddress;
-
-            for (unsigned level = jit_tex->first_level;
-                 level <= jit_tex->last_level;
-                 level++) {
-               jit_tex->row_stride[level] = swr_res->row_stride[level];
-               jit_tex->img_stride[level] = swr_res->img_stride[level];
-               jit_tex->mip_offsets[level] = swr_res->mip_offsets[level];
-            }
-         }
+      /* JIT sampler view state */
+      if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) {
+         swr_update_texture_state(ctx,
+                                  PIPE_SHADER_FRAGMENT,
+                                  key.nr_sampler_views,
+                                  ctx->swrDC.texturesFS);
       }
    }
 
+
    /* VertexShader Constants */
    if (ctx->dirty & SWR_NEW_VSCONSTANTS) {
       swr_draw_context *pDC = &ctx->swrDC;
diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h
index f0a7ff3b185..32a5441295b 100644
--- a/src/gallium/drivers/swr/swr_state.h
+++ b/src/gallium/drivers/swr/swr_state.h
@@ -40,9 +40,9 @@ struct swr_vertex_shader {
    struct pipe_shader_state pipe;
    struct lp_tgsi_info info;
    unsigned linkageMask;
-   PFN_VERTEX_FUNC func;
+   std::unordered_map<swr_jit_vs_key, PFN_VERTEX_FUNC> map;
    SWR_STREAMOUT_STATE soState;
-   PFN_SO_FUNC soFunc[PIPE_PRIM_MAX];
+   PFN_SO_FUNC soFunc[PIPE_PRIM_MAX] {0};
 };
 
 struct swr_fragment_shader {
@@ -50,7 +50,7 @@ struct swr_fragment_shader {
    struct lp_tgsi_info info;
    uint32_t constantMask;
    uint32_t pointSpriteMask;
-   std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> map;
+   std::unordered_map<swr_jit_fs_key, PFN_PIXEL_KERNEL> map;
 };
 
 /* Vertex element state */
diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp
index 8e01e32e280..8172c820f22 100644
--- a/src/gallium/drivers/swr/swr_tex_sample.cpp
+++ b/src/gallium/drivers/swr/swr_tex_sample.cpp
@@ -72,6 +72,8 @@ struct swr_sampler_dynamic_state {
    struct lp_sampler_dynamic_state base;
 
    const struct swr_sampler_static_state *static_state;
+
+   unsigned shader_type;
 };
 
 
@@ -112,7 +114,18 @@ swr_texture_member(const struct lp_sampler_dynamic_state *base,
    /* context[0] */
    indices[0] = lp_build_const_int32(gallivm, 0);
    /* context[0].textures */
-   indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS);
+   auto dynamic = (const struct swr_sampler_dynamic_state *)base;
+   switch (dynamic->shader_type) {
+   case PIPE_SHADER_FRAGMENT:
+      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS);
+      break;
+   case PIPE_SHADER_VERTEX:
+      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesVS);
+      break;
+   default:
+      assert(0 && "unsupported shader type");
+      break;
+   }
    /* context[0].textures[unit] */
    indices[2] = lp_build_const_int32(gallivm, texture_unit);
    /* context[0].textures[unit].member */
@@ -195,7 +208,18 @@ swr_sampler_member(const struct lp_sampler_dynamic_state *base,
    /* context[0] */
    indices[0] = lp_build_const_int32(gallivm, 0);
    /* context[0].samplers */
-   indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS);
+   auto dynamic = (const struct swr_sampler_dynamic_state *)base;
+   switch (dynamic->shader_type) {
+   case PIPE_SHADER_FRAGMENT:
+      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS);
+      break;
+   case PIPE_SHADER_VERTEX:
+      indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersVS);
+      break;
+   default:
+      assert(0 && "unsupported shader type");
+      break;
+   }
    /* context[0].samplers[unit] */
    indices[2] = lp_build_const_int32(gallivm, sampler_unit);
    /* context[0].samplers[unit].member */
@@ -307,7 +331,8 @@ swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base,
 
 
 struct lp_build_sampler_soa *
-swr_sampler_soa_create(const struct swr_sampler_static_state *static_state)
+swr_sampler_soa_create(const struct swr_sampler_static_state *static_state,
+                       unsigned shader_type)
 {
    struct swr_sampler_soa *sampler;
 
@@ -334,5 +359,7 @@ swr_sampler_soa_create(const struct swr_sampler_static_state *static_state)
 
    sampler->dynamic_state.static_state = static_state;
 
+   sampler->dynamic_state.shader_type = shader_type;
+
    return &sampler->base;
 }
diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h
index f5c368c108d..cb7e83d1c39 100644
--- a/src/gallium/drivers/swr/swr_tex_sample.h
+++ b/src/gallium/drivers/swr/swr_tex_sample.h
@@ -44,4 +44,4 @@ struct swr_sampler_static_state {
  *
  */
 struct lp_build_sampler_soa *
-swr_sampler_soa_create(const struct swr_sampler_static_state *key);
+swr_sampler_soa_create(const struct swr_sampler_static_state *key, unsigned shader_type);
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 08b1d32afb0..b575f2cdb34 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -273,6 +273,24 @@ trace_context_get_query_result(struct pipe_context *_pipe,
 }
 
 
+static void
+trace_context_set_active_query_state(struct pipe_context *_pipe,
+                                     boolean enable)
+{
+   struct trace_context *tr_ctx = trace_context(_pipe);
+   struct pipe_context *pipe = tr_ctx->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_active_query_state");
+
+   trace_dump_arg(ptr, pipe);
+   trace_dump_arg(bool, enable);
+
+   pipe->set_active_query_state(pipe, enable);
+
+   trace_dump_call_end();
+}
+
+
 static void *
 trace_context_create_blend_state(struct pipe_context *_pipe,
                                  const struct pipe_blend_state *state)
@@ -1781,6 +1799,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(begin_query);
    TR_CTX_INIT(end_query);
    TR_CTX_INIT(get_query_result);
+   TR_CTX_INIT(set_active_query_state);
    TR_CTX_INIT(create_blend_state);
    TR_CTX_INIT(bind_blend_state);
    TR_CTX_INIT(delete_blend_state);
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 9b0b540d3fc..68b85737628 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -32,12 +32,19 @@
 #include "vc4_resource.h"
 
 static void
-vc4_get_draw_cl_space(struct vc4_context *vc4)
+vc4_get_draw_cl_space(struct vc4_context *vc4, int vert_count)
 {
+        /* The SW-5891 workaround may emit multiple shader recs and draw
+         * packets, advancing by as few as 65535 - 2 verts each (tristrips).
+         */
+        int num_draws = DIV_ROUND_UP(vert_count, 65535 - 2) + 1;
+
         /* Binner gets our packet state -- vc4_emit.c contents,
          * and the primitive itself.
          */
-        cl_ensure_space(&vc4->bcl, 256);
+        cl_ensure_space(&vc4->bcl,
+                        256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE +
+                               VC4_PACKET_GL_SHADER_STATE_SIZE) * num_draws);
 
         /* Nothing for rcl -- that's covered by vc4_context.c */
 
@@ -45,7 +52,8 @@ vc4_get_draw_cl_space(struct vc4_context *vc4)
          * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
          * vattr stride).
          */
-        cl_ensure_space(&vc4->shader_rec, 12 * sizeof(uint32_t) + 104 + 8 * 32);
+        cl_ensure_space(&vc4->shader_rec,
+                        (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws);
 
         /* Uniforms are covered by vc4_write_uniforms(). */
 
@@ -61,12 +69,12 @@ vc4_get_draw_cl_space(struct vc4_context *vc4)
  * Does the initial bining command list setup for drawing to a given FBO.
  */
 static void
-vc4_start_draw(struct vc4_context *vc4)
+vc4_start_draw(struct vc4_context *vc4, int vert_count)
 {
         if (vc4->needs_flush)
                 return;
 
-        vc4_get_draw_cl_space(vc4);
+        vc4_get_draw_cl_space(vc4, vert_count);
 
         struct vc4_cl_out *bcl = cl_start(&vc4->bcl);
         //   Tile state data is 48 bytes per tile, I think it can be thrown away
@@ -119,7 +127,8 @@ vc4_update_shadow_textures(struct pipe_context *pctx,
 }
 
 static void
-vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info)
+vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info,
+                         uint32_t extra_index_bias)
 {
         /* VC4_DIRTY_VTXSTATE */
         struct vc4_vertex_stateobj *vtx = vc4->vtx;
@@ -170,7 +179,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *i
                 /* not vc4->dirty tracked: vc4->last_index_bias */
                 uint32_t offset = (vb->buffer_offset +
                                    elem->src_offset +
-                                   vb->stride * info->index_bias);
+                                   vb->stride * (info->index_bias +
+                                                 extra_index_bias));
                 uint32_t vb_size = rsc->bo->size - offset;
                 uint32_t elem_size =
                         util_format_get_blocksize(elem->src_format);
@@ -219,8 +229,9 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *i
                            &vc4->constbuf[PIPE_SHADER_VERTEX],
                            &vc4->verttex);
 
-        vc4->last_index_bias = info->index_bias;
+        vc4->last_index_bias = info->index_bias + extra_index_bias;
         vc4->max_index = max_index;
+        vc4->shader_rec_count++;
 }
 
 /**
@@ -275,14 +286,14 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
 
         vc4_hw_2116_workaround(pctx);
 
-        vc4_get_draw_cl_space(vc4);
+        vc4_get_draw_cl_space(vc4, info->count);
 
         if (vc4->prim_mode != info->mode) {
                 vc4->prim_mode = info->mode;
                 vc4->dirty |= VC4_DIRTY_PRIM_MODE;
         }
 
-        vc4_start_draw(vc4);
+        vc4_start_draw(vc4, info->count);
         vc4_update_compiled_shaders(vc4, info->mode);
 
         vc4_emit_state(pctx);
@@ -298,7 +309,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                            vc4->prog.vs->uniform_dirty_bits |
                            vc4->prog.fs->uniform_dirty_bits)) ||
             vc4->last_index_bias != info->index_bias) {
-                vc4_emit_gl_shader_state(vc4, info);
+                vc4_emit_gl_shader_state(vc4, info, 0);
         }
 
         vc4->dirty = 0;
@@ -342,10 +353,75 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 if (vc4->indexbuf.index_size == 4 || vc4->indexbuf.user_buffer)
                         pipe_resource_reference(&prsc, NULL);
         } else {
-                cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
-                cl_u8(&bcl, info->mode);
-                cl_u32(&bcl, info->count);
-                cl_u32(&bcl, info->start);
+                uint32_t count = info->count;
+                uint32_t start = info->start;
+                uint32_t extra_index_bias = 0;
+                static const uint32_t max_verts = 65535;
+
+                /* GFXH-515 / SW-5891: The binner emits 16 bit indices for
+                 * drawarrays, so if start + count > 64k it would truncate the
+                 * top bits.  Work around this by folding start into the index
+                 * bias and emitting a limited number of vertices at a time,
+                 * reemitting the shader state pointing farther down the vertex
+                 * arrays.  Line loops and trifans would need a new VB instead.
+                 */
+                if (start + count > max_verts) {
+                        extra_index_bias = start;
+                        start = 0;
+                }
+
+                while (count) {
+                        uint32_t this_count = count;
+                        uint32_t step = count;
+
+                        if (extra_index_bias) {
+                                cl_end(&vc4->bcl, bcl);
+                                vc4_emit_gl_shader_state(vc4, info,
+                                                         extra_index_bias);
+                                bcl = cl_start(&vc4->bcl);
+                        }
+
+                        if (count > max_verts) {
+                                switch (info->mode) {
+                                case PIPE_PRIM_POINTS:
+                                        this_count = step = max_verts;
+                                        break;
+                                case PIPE_PRIM_LINES:
+                                        this_count = step = max_verts - (max_verts % 2);
+                                        break;
+                                case PIPE_PRIM_LINE_STRIP:
+                                        this_count = max_verts;
+                                        step = max_verts - 1;
+                                        break;
+                                case PIPE_PRIM_LINE_LOOP:
+                                        this_count = max_verts;
+                                        step = max_verts - 1;
+                                        debug_warn_once("unhandled line loop "
+                                                        "looping behavior with "
+                                                        ">65535 verts\n");
+                                        break;
+                                case PIPE_PRIM_TRIANGLES:
+                                        this_count = step = max_verts - (max_verts % 3);
+                                        break;
+                                case PIPE_PRIM_TRIANGLE_STRIP:
+                                        this_count = max_verts;
+                                        step = max_verts - 2;
+                                        break;
+                                default:
+                                        debug_warn_once("unhandled primitive "
+                                                        "max vert count, truncating\n");
+                                        this_count = step = max_verts;
+                                }
+                        }
+
+                        cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
+                        cl_u8(&bcl, info->mode);
+                        cl_u32(&bcl, this_count);
+                        cl_u32(&bcl, start);
+
+                        count -= step;
+                        extra_index_bias += step;
+                }
         }
         cl_end(&vc4->bcl, bcl);
 
@@ -356,8 +432,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 vc4->resolve |= PIPE_CLEAR_STENCIL;
         vc4->resolve |= PIPE_CLEAR_COLOR0;
 
-        vc4->shader_rec_count++;
-
         if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH)
                 vc4_flush(pctx);
 }
@@ -410,7 +484,7 @@ vc4_clear(struct pipe_context *pctx, unsigned buffers,
         vc4->cleared |= buffers;
         vc4->resolve |= buffers;
 
-        vc4_start_draw(vc4);
+        vc4_start_draw(vc4, 0);
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 49a314cdb25..cf6d2896f7d 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -710,9 +710,9 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
 }
 
 void
-vc4_nir_lower_blend(struct vc4_compile *c)
+vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c)
 {
-        nir_foreach_function(c->s, function) {
+        nir_foreach_function(s, function) {
                 if (function->impl) {
                         nir_foreach_block(function->impl,
                                           vc4_nir_lower_blend_block, c);
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index d08ad588e5b..22c602adb54 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -380,24 +380,14 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                 intr_comp->num_components = 1;
                 nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
 
-                /* Convert the uniform (not user_clip_plane) offset to bytes.
-                 * If it happens to be a constant, constant-folding will clean
-                 * up the shift for us.
+                /* Convert the uniform offset to bytes.  If it happens to be a
+                 * constant, constant-folding will clean up the shift for us.
                  */
-                if (intr->intrinsic == nir_intrinsic_load_uniform) {
-                        /* Convert the base offset to bytes and add the
-                         * component
-                         */
-                        intr_comp->const_index[0] = (intr->const_index[0] * 16 + i * 4);
-
-                        intr_comp->src[0] =
-                                nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
-                                                         nir_imm_int(b, 4)));
-                } else {
-                        assert(intr->intrinsic ==
-                               nir_intrinsic_load_user_clip_plane);
-                        intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
-                }
+                intr_comp->const_index[0] = (intr->const_index[0] * 16 + i * 4);
+
+                intr_comp->src[0] =
+                        nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
+                                                 nir_imm_int(b, 4)));
 
                 dests[i] = &intr_comp->dest.ssa;
 
@@ -428,10 +418,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
                 break;
 
         case nir_intrinsic_load_uniform:
-        case nir_intrinsic_load_user_clip_plane:
                 vc4_nir_lower_uniform(c, b, intr);
                 break;
 
+        case nir_intrinsic_load_user_clip_plane:
         default:
                 break;
         }
@@ -465,9 +455,9 @@ vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl)
 }
 
 void
-vc4_nir_lower_io(struct vc4_compile *c)
+vc4_nir_lower_io(nir_shader *s, struct vc4_compile *c)
 {
-        nir_foreach_function(c->s, function) {
+        nir_foreach_function(s, function) {
                 if (function->impl)
                         vc4_nir_lower_io_impl(c, function->impl);
         }
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index 8b65cac5084..6b8830743eb 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -162,9 +162,9 @@ vc4_nir_lower_txf_ms_impl(struct vc4_compile *c, nir_function_impl *impl)
 }
 
 void
-vc4_nir_lower_txf_ms(struct vc4_compile *c)
+vc4_nir_lower_txf_ms(nir_shader *s, struct vc4_compile *c)
 {
-        nir_foreach_function(c->s, function) {
+        nir_foreach_function(s, function) {
                 if (function->impl)
                         vc4_nir_lower_txf_ms_impl(c, function->impl);
         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c
index d15b0c1a39f..d31b673bd63 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c
@@ -65,7 +65,7 @@ qir_opt_vpm(struct vc4_compile *c)
          * result, try to move the instruction up in place of the VPM read.
          */
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
-                if (!inst || qir_is_multi_instruction(inst))
+                if (!inst)
                         continue;
 
                 if (qir_depends_on_flags(inst) || inst->sf)
@@ -132,7 +132,7 @@ qir_opt_vpm(struct vc4_compile *c)
                         continue;
 
                 struct qinst *inst = c->defs[temp];
-                if (!inst || qir_is_multi_instruction(inst))
+                if (!inst)
                         continue;
 
                 if (qir_depends_on_flags(inst) || inst->sf)
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 71a1ebbb313..eccc7ab413f 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -30,7 +30,6 @@
 #include "util/ralloc.h"
 #include "util/hash_table.h"
 #include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_lowering.h"
 #include "tgsi/tgsi_parse.h"
 #include "compiler/nir/nir.h"
 #include "compiler/nir/nir_builder.h"
@@ -638,8 +637,8 @@ emit_vertex_input(struct vc4_compile *c, int attr)
 
         c->vattr_sizes[attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
-                struct qreg vpm = { QFILE_VPM, attr * 4 + i };
-                c->inputs[attr * 4 + i] = qir_MOV(c, vpm);
+                c->inputs[attr * 4 + i] =
+                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
                 c->num_inputs++;
         }
 }
@@ -647,8 +646,8 @@ emit_vertex_input(struct vc4_compile *c, int attr)
 static void
 emit_fragcoord_input(struct vc4_compile *c, int attr)
 {
-        c->inputs[attr * 4 + 0] = qir_FRAG_X(c);
-        c->inputs[attr * 4 + 1] = qir_FRAG_Y(c);
+        c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
+        c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
         c->inputs[attr * 4 + 2] =
                 qir_FMUL(c,
                          qir_ITOF(c, qir_FRAG_Z(c)),
@@ -1193,12 +1192,15 @@ emit_frag_end(struct vc4_compile *c)
         }
 
         if (c->fs_key->stencil_enabled) {
-                qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 0));
+                qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
+                             qir_uniform(c, QUNIFORM_STENCIL, 0));
                 if (c->fs_key->stencil_twoside) {
-                        qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 1));
+                        qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
+                                     qir_uniform(c, QUNIFORM_STENCIL, 1));
                 }
                 if (c->fs_key->stencil_full_writemasks) {
-                        qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 2));
+                        qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
+                                     qir_uniform(c, QUNIFORM_STENCIL, 2));
                 }
         }
 
@@ -1207,24 +1209,24 @@ emit_frag_end(struct vc4_compile *c)
         }
 
         if (c->fs_key->depth_enabled) {
-                struct qreg z;
                 if (c->output_position_index != -1) {
-                        z = qir_FTOI(c, qir_FMUL(c, c->outputs[c->output_position_index + 2],
-                                                 qir_uniform_f(c, 0xffffff)));
+                        qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
+                                      qir_FMUL(c,
+                                               c->outputs[c->output_position_index + 2],
+                                               qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
                 } else {
-                        z = qir_FRAG_Z(c);
+                        qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
+                                     qir_FRAG_Z(c))->cond = discard_cond;
                 }
-                struct qinst *inst = qir_TLB_Z_WRITE(c, z);
-                inst->cond = discard_cond;
         }
 
         if (!c->msaa_per_sample_output) {
-                struct qinst *inst = qir_TLB_COLOR_WRITE(c, color);
-                inst->cond = discard_cond;
+                qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
+                             color)->cond = discard_cond;
         } else {
                 for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
-                        struct qinst *inst = qir_TLB_COLOR_WRITE_MS(c, c->sample_colors[i]);
-                        inst->cond = discard_cond;
+                        qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
+                                     c->sample_colors[i])->cond = discard_cond;
                 }
         }
 }
@@ -1304,8 +1306,7 @@ emit_stub_vpm_read(struct vc4_compile *c)
                 return;
 
         c->vattr_sizes[0] = 4;
-        struct qreg vpm = { QFILE_VPM, 0 };
-        (void)qir_MOV(c, vpm);
+        (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
         c->num_inputs++;
 }
 
@@ -1371,16 +1372,16 @@ vc4_optimize_nir(struct nir_shader *s)
         do {
                 progress = false;
 
-                nir_lower_vars_to_ssa(s);
-                nir_lower_alu_to_scalar(s);
+                NIR_PASS_V(s, nir_lower_vars_to_ssa);
+                NIR_PASS_V(s, nir_lower_alu_to_scalar);
 
-                progress = nir_copy_prop(s) || progress;
-                progress = nir_opt_dce(s) || progress;
-                progress = nir_opt_cse(s) || progress;
-                progress = nir_opt_peephole_select(s) || progress;
-                progress = nir_opt_algebraic(s) || progress;
-                progress = nir_opt_constant_folding(s) || progress;
-                progress = nir_opt_undef(s) || progress;
+                NIR_PASS(progress, s, nir_copy_prop);
+                NIR_PASS(progress, s, nir_opt_dce);
+                NIR_PASS(progress, s, nir_opt_cse);
+                NIR_PASS(progress, s, nir_opt_peephole_select);
+                NIR_PASS(progress, s, nir_opt_algebraic);
+                NIR_PASS(progress, s, nir_opt_constant_folding);
+                NIR_PASS(progress, s, nir_opt_undef);
         } while (progress);
 }
 
@@ -1427,7 +1428,9 @@ ntq_setup_inputs(struct vc4_compile *c)
                         if (var->data.location == VARYING_SLOT_POS) {
                                 emit_fragcoord_input(c, loc);
                         } else if (var->data.location == VARYING_SLOT_FACE) {
-                                c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c);
+                                c->inputs[loc * 4 + 0] =
+                                        qir_ITOF(c, qir_reg(QFILE_FRAG_REV_FLAG,
+                                                            0));
                         } else if (var->data.location >= VARYING_SLOT_VAR0 &&
                                    (c->fs_key->point_sprite_mask &
                                     (1 << (var->data.location -
@@ -1573,8 +1576,10 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_user_clip_plane:
-                *dest = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
-                                    instr->const_index[0]);
+                for (int i = 0; i < instr->num_components; i++) {
+                        dest[i] = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+                                              instr->const_index[0] * 4 + i);
+                }
                 break;
 
         case nir_intrinsic_load_sample_mask_in:
@@ -1694,12 +1699,27 @@ ntq_emit_block(struct vc4_compile *c, nir_block *block)
         }
 }
 
+static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list);
+
+static void
+ntq_emit_loop(struct vc4_compile *c, nir_loop *nloop)
+{
+        fprintf(stderr, "LOOPS not fully handled. Rendering errors likely.\n");
+        ntq_emit_cf_list(c, &nloop->body);
+}
+
+static void
+ntq_emit_function(struct vc4_compile *c, nir_function_impl *func)
+{
+        fprintf(stderr, "FUNCTIONS not handled.\n");
+        abort();
+}
+
 static void
 ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
 {
         foreach_list_typed(nir_cf_node, node, node, list) {
                 switch (node->type) {
-                        /* case nir_cf_node_loop: */
                 case nir_cf_node_block:
                         ntq_emit_block(c, nir_cf_node_as_block(node));
                         break;
@@ -1708,8 +1728,17 @@ ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list)
                         ntq_emit_if(c, nir_cf_node_as_if(node));
                         break;
 
+                case nir_cf_node_loop:
+                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
+                        break;
+
+                case nir_cf_node_function:
+                        ntq_emit_function(c, nir_cf_node_as_function(node));
+                        break;
+
                 default:
-                        assert(0);
+                        fprintf(stderr, "Unknown NIR node type\n");
+                        abort();
                 }
         }
 }
@@ -1810,11 +1839,11 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
         }
 
         c->s = tgsi_to_nir(tokens, &nir_options);
-        nir_opt_global_to_local(c->s);
-        nir_convert_to_ssa(c->s);
+        NIR_PASS_V(c->s, nir_opt_global_to_local);
+        NIR_PASS_V(c->s, nir_convert_to_ssa);
 
         if (stage == QSTAGE_FRAG)
-                vc4_nir_lower_blend(c);
+                NIR_PASS_V(c->s, vc4_nir_lower_blend, c);
 
         struct nir_lower_tex_options tex_options = {
                 /* We would need to implement txs, but we don't want the
@@ -1864,26 +1893,25 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage,
                 }
         }
 
-        nir_lower_tex(c->s, &tex_options);
+        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
 
         if (c->fs_key && c->fs_key->light_twoside)
-                nir_lower_two_sided_color(c->s);
+                NIR_PASS_V(c->s, nir_lower_two_sided_color);
 
         if (stage == QSTAGE_FRAG)
-                nir_lower_clip_fs(c->s, c->key->ucp_enables);
+                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables);
         else
-                nir_lower_clip_vs(c->s, c->key->ucp_enables);
+                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables);
 
-        vc4_nir_lower_io(c);
-        vc4_nir_lower_txf_ms(c);
-        nir_lower_idiv(c->s);
-        nir_lower_load_const_to_scalar(c->s);
+        NIR_PASS_V(c->s, vc4_nir_lower_io, c);
+        NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c);
+        NIR_PASS_V(c->s, nir_lower_idiv);
+        NIR_PASS_V(c->s, nir_lower_load_const_to_scalar);
 
         vc4_optimize_nir(c->s);
 
-        nir_remove_dead_variables(c->s);
-
-        nir_convert_from_ssa(c->s, true);
+        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_local);
+        NIR_PASS_V(c->s, nir_convert_from_ssa, true);
 
         if (vc4_debug & VC4_DEBUG_SHADERDB) {
                 fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n",
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index e73e3899410..293eb01adab 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -31,7 +31,6 @@ struct qir_op_info {
         const char *name;
         uint8_t ndst, nsrc;
         bool has_side_effects;
-        bool multi_instruction;
 };
 
 static const struct qir_op_info qir_op_info[] = {
@@ -65,23 +64,16 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_XOR] = { "xor", 1, 2 },
         [QOP_NOT] = { "not", 1, 1 },
 
-        [QOP_RCP] = { "rcp", 1, 1, false, true },
-        [QOP_RSQ] = { "rsq", 1, 1, false, true },
-        [QOP_EXP2] = { "exp2", 1, 2, false, true },
-        [QOP_LOG2] = { "log2", 1, 2, false, true },
-        [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true },
-        [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true },
-        [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1, true },
-        [QOP_TLB_COLOR_WRITE_MS] = { "tlb_color_ms", 0, 1, true },
+        [QOP_RCP] = { "rcp", 1, 1 },
+        [QOP_RSQ] = { "rsq", 1, 1 },
+        [QOP_EXP2] = { "exp2", 1, 2 },
+        [QOP_LOG2] = { "log2", 1, 2 },
         [QOP_TLB_COLOR_READ] = { "tlb_color_read", 1, 0 },
         [QOP_MS_MASK] = { "ms_mask", 0, 1, true },
         [QOP_VARY_ADD_C] = { "vary_add_c", 1, 1 },
 
-        [QOP_FRAG_X] = { "frag_x", 1, 0 },
-        [QOP_FRAG_Y] = { "frag_y", 1, 0 },
         [QOP_FRAG_Z] = { "frag_z", 1, 0 },
         [QOP_FRAG_W] = { "frag_w", 1, 0 },
-        [QOP_FRAG_REV_FLAG] = { "frag_rev_flag", 1, 0 },
 
         [QOP_TEX_S] = { "tex_s", 0, 2 },
         [QOP_TEX_T] = { "tex_t", 0, 2 },
@@ -116,6 +108,16 @@ qir_get_op_nsrc(enum qop qop)
 bool
 qir_has_side_effects(struct vc4_compile *c, struct qinst *inst)
 {
+        switch (inst->dst.file) {
+        case QFILE_TLB_Z_WRITE:
+        case QFILE_TLB_COLOR_WRITE:
+        case QFILE_TLB_COLOR_WRITE_MS:
+        case QFILE_TLB_STENCIL_SETUP:
+                return true;
+        default:
+                break;
+        }
+
         return qir_op_info[inst->op].has_side_effects;
 }
 
@@ -144,12 +146,6 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst)
 }
 
 bool
-qir_is_multi_instruction(struct qinst *inst)
-{
-        return qir_op_info[inst->op].multi_instruction;
-}
-
-bool
 qir_is_mul(struct qinst *inst)
 {
         switch (inst->op) {
@@ -233,24 +229,47 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write)
                 [QFILE_TEMP] = "t",
                 [QFILE_VARY] = "v",
                 [QFILE_UNIF] = "u",
+                [QFILE_TLB_COLOR_WRITE] = "tlb_c",
+                [QFILE_TLB_COLOR_WRITE_MS] = "tlb_c_ms",
+                [QFILE_TLB_Z_WRITE] = "tlb_z",
+                [QFILE_TLB_STENCIL_SETUP] = "tlb_stencil",
+                [QFILE_FRAG_X] = "frag_x",
+                [QFILE_FRAG_Y] = "frag_y",
+                [QFILE_FRAG_REV_FLAG] = "frag_rev_flag",
         };
 
-        if (reg.file == QFILE_NULL) {
+        switch (reg.file) {
+
+        case QFILE_NULL:
                 fprintf(stderr, "null");
-        } else if (reg.file == QFILE_SMALL_IMM) {
+                break;
+
+        case QFILE_SMALL_IMM:
                 if ((int)reg.index >= -16 && (int)reg.index <= 15)
                         fprintf(stderr, "%d", reg.index);
                 else
                         fprintf(stderr, "%f", uif(reg.index));
-        } else if (reg.file == QFILE_VPM) {
+                break;
+
+        case QFILE_VPM:
                 if (write) {
                         fprintf(stderr, "vpm");
                 } else {
                         fprintf(stderr, "vpm%d.%d",
                                 reg.index / 4, reg.index % 4);
                 }
-        } else {
+                break;
+
+        case QFILE_TLB_COLOR_WRITE:
+        case QFILE_TLB_COLOR_WRITE_MS:
+        case QFILE_TLB_Z_WRITE:
+        case QFILE_TLB_STENCIL_SETUP:
+                fprintf(stderr, "%s", files[reg.file]);
+                break;
+
+        default:
                 fprintf(stderr, "%s%d", files[reg.file], reg.index);
+                break;
         }
 
         if (reg.file == QFILE_UNIF &&
@@ -455,12 +474,11 @@ qir_uniform(struct vc4_compile *c,
         for (int i = 0; i < c->num_uniforms; i++) {
                 if (c->uniform_contents[i] == contents &&
                     c->uniform_data[i] == data) {
-                        return (struct qreg) { QFILE_UNIF, i };
+                        return qir_reg(QFILE_UNIF, i);
                 }
         }
 
         uint32_t uniform = c->num_uniforms++;
-        struct qreg u = { QFILE_UNIF, uniform };
 
         if (uniform >= c->uniform_array_size) {
                 c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
@@ -477,7 +495,7 @@ qir_uniform(struct vc4_compile *c,
         c->uniform_contents[uniform] = contents;
         c->uniform_data[uniform] = data;
 
-        return u;
+        return qir_reg(QFILE_UNIF, uniform);
 }
 
 void
@@ -492,10 +510,8 @@ qir_SF(struct vc4_compile *c, struct qreg src)
 
         if (src.file != QFILE_TEMP ||
             !c->defs[src.index] ||
-            last_inst != c->defs[src.index] ||
-            qir_is_multi_instruction(last_inst)) {
-                struct qreg null = { QFILE_NULL, 0 };
-                last_inst = qir_MOV_dest(c, null, src);
+            last_inst != c->defs[src.index]) {
+                last_inst = qir_MOV_dest(c, qir_reg(QFILE_NULL, 0), src);
                 last_inst = (struct qinst *)c->instructions.prev;
         }
         last_inst->sf = true;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 3fbf5d749e7..e8ba74b9a4d 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -49,6 +49,17 @@ enum qfile {
         QFILE_VARY,
         QFILE_UNIF,
         QFILE_VPM,
+        QFILE_TLB_COLOR_WRITE,
+        QFILE_TLB_COLOR_WRITE_MS,
+        QFILE_TLB_Z_WRITE,
+        QFILE_TLB_STENCIL_SETUP,
+
+        /* Payload registers that aren't in the physical register file, so we
+         * can just use the corresponding qpu_reg at qpu_emit time.
+         */
+        QFILE_FRAG_X,
+        QFILE_FRAG_Y,
+        QFILE_FRAG_REV_FLAG,
 
         /**
          * Stores an immediate value in the index field that can be turned
@@ -63,6 +74,11 @@ struct qreg {
         int pack;
 };
 
+static inline struct qreg qir_reg(enum qfile file, uint32_t index)
+{
+        return (struct qreg){file, index};
+}
+
 enum qop {
         QOP_UNDEF,
         QOP_MOV,
@@ -101,19 +117,12 @@ enum qop {
         QOP_LOG2,
         QOP_VW_SETUP,
         QOP_VR_SETUP,
-        QOP_TLB_STENCIL_SETUP,
-        QOP_TLB_Z_WRITE,
-        QOP_TLB_COLOR_WRITE,
-        QOP_TLB_COLOR_WRITE_MS,
         QOP_TLB_COLOR_READ,
         QOP_MS_MASK,
         QOP_VARY_ADD_C,
 
-        QOP_FRAG_X,
-        QOP_FRAG_Y,
         QOP_FRAG_Z,
         QOP_FRAG_W,
-        QOP_FRAG_REV_FLAG,
 
         /** Texture x coordinate parameter write */
         QOP_TEX_S,
@@ -463,7 +472,6 @@ int qir_get_op_nsrc(enum qop qop);
 bool qir_reg_equals(struct qreg a, struct qreg b);
 bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst);
 bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst);
-bool qir_is_multi_instruction(struct qinst *inst);
 bool qir_is_mul(struct qinst *inst);
 bool qir_is_raw_mov(struct qinst *inst);
 bool qir_is_tex(struct qinst *inst);
@@ -484,13 +492,13 @@ bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
 bool qir_opt_vpm(struct vc4_compile *c);
-void vc4_nir_lower_blend(struct vc4_compile *c);
-void vc4_nir_lower_io(struct vc4_compile *c);
+void vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c);
+void vc4_nir_lower_io(nir_shader *s, struct vc4_compile *c);
 nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
                                        enum quniform_contents contents);
 nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
                                           nir_ssa_def **srcs, int swiz);
-void vc4_nir_lower_txf_ms(struct vc4_compile *c);
+void vc4_nir_lower_txf_ms(nir_shader *s, struct vc4_compile *c);
 void qir_lower_uniforms(struct vc4_compile *c);
 
 uint32_t qpu_schedule_instructions(struct vc4_compile *c);
@@ -618,17 +626,10 @@ QIR_NODST_2(TEX_T)
 QIR_NODST_2(TEX_R)
 QIR_NODST_2(TEX_B)
 QIR_NODST_2(TEX_DIRECT)
-QIR_ALU0(FRAG_X)
-QIR_ALU0(FRAG_Y)
 QIR_ALU0(FRAG_Z)
 QIR_ALU0(FRAG_W)
-QIR_ALU0(FRAG_REV_FLAG)
 QIR_ALU0(TEX_RESULT)
 QIR_ALU0(TLB_COLOR_READ)
-QIR_NODST_1(TLB_COLOR_WRITE)
-QIR_NODST_1(TLB_COLOR_WRITE_MS)
-QIR_NODST_1(TLB_Z_WRITE)
-QIR_NODST_1(TLB_STENCIL_SETUP)
 QIR_NODST_1(MS_MASK)
 
 static inline struct qreg
@@ -703,8 +704,7 @@ qir_POW(struct vc4_compile *c, struct qreg x, struct qreg y)
 static inline void
 qir_VPM_WRITE(struct vc4_compile *c, struct qreg val)
 {
-        static const struct qreg vpm = { QFILE_VPM, 0 };
-        qir_emit(c, qir_inst(QOP_MOV, vpm, val, c->undef));
+        qir_MOV_dest(c, qir_reg(QFILE_VPM, 0), val);
 }
 
 #endif /* VC4_QIR_H */
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index a57e100593c..927268d71ef 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -150,7 +150,7 @@ qir_lower_uniforms(struct vc4_compile *c)
                  * reference a temp instead.
                  */
                 struct qreg temp = qir_get_temp(c);
-                struct qreg unif = { QFILE_UNIF, max_index };
+                struct qreg unif = qir_reg(QFILE_UNIF, max_index);
                 struct qinst *mov = qir_inst(QOP_MOV, temp, unif, c->undef);
                 list_add(&mov->link, &c->instructions);
                 c->defs[temp.index] = mov;
diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c
index 186e81be750..8b843a3a158 100644
--- a/src/gallium/drivers/vc4/vc4_qir_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c
@@ -228,10 +228,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 add_write_dep(dir, &state->last_tex_result, n);
                 break;
 
-        case QOP_TLB_COLOR_WRITE:
         case QOP_TLB_COLOR_READ:
-        case QOP_TLB_Z_WRITE:
-        case QOP_TLB_STENCIL_SETUP:
         case QOP_MS_MASK:
                 add_write_dep(dir, &state->last_tlb, n);
                 break;
@@ -240,10 +237,25 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n)
                 break;
         }
 
-        if (inst->dst.file == QFILE_VPM)
+        switch (inst->dst.file) {
+        case QFILE_VPM:
                 add_write_dep(dir, &state->last_vpm_write, n);
-        else if (inst->dst.file == QFILE_TEMP)
+                break;
+
+        case QFILE_TEMP:
                 add_write_dep(dir, &state->last_temp_write[inst->dst.index], n);
+                break;
+
+        case QFILE_TLB_COLOR_WRITE:
+        case QFILE_TLB_COLOR_WRITE_MS:
+        case QFILE_TLB_Z_WRITE:
+        case QFILE_TLB_STENCIL_SETUP:
+                add_write_dep(dir, &state->last_tlb, n);
+                break;
+
+        default:
+                break;
+        }
 
         if (qir_depends_on_flags(inst))
                 add_dep(dir, state->last_sf, n);
@@ -357,11 +369,13 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst)
 static bool
 locks_scoreboard(struct qinst *inst)
 {
-        switch (inst->op) {
-        case QOP_TLB_Z_WRITE:
-        case QOP_TLB_COLOR_WRITE:
-        case QOP_TLB_COLOR_WRITE_MS:
-        case QOP_TLB_COLOR_READ:
+        if (inst->op == QOP_TLB_COLOR_READ)
+                return true;
+
+        switch (inst->dst.file) {
+        case QFILE_TLB_Z_WRITE:
+        case QFILE_TLB_COLOR_WRITE:
+        case QFILE_TLB_COLOR_WRITE_MS:
                 return true;
         default:
                 return false;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index b507e370683..ae3590854b2 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -167,6 +167,16 @@ set_last_dst_pack(struct vc4_compile *c, struct qinst *inst)
         }
 }
 
+static void
+handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst,
+                    struct qpu_reg dst)
+{
+        if (dst.mux != QPU_MUX_R4)
+                queue(c, qpu_a_MOV(dst, qpu_r4()));
+        else if (qinst->sf)
+                queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
+}
+
 void
 vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
 {
@@ -290,6 +300,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 last_vpm_read_index = qinst->src[i].index;
                                 src[i] = qpu_ra(QPU_R_VPM);
                                 break;
+
+                        case QFILE_FRAG_X:
+                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
+                                break;
+                        case QFILE_FRAG_Y:
+                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
+                                break;
+                        case QFILE_FRAG_REV_FLAG:
+                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
+                                break;
+
+                        case QFILE_TLB_COLOR_WRITE:
+                        case QFILE_TLB_COLOR_WRITE_MS:
+                        case QFILE_TLB_Z_WRITE:
+                        case QFILE_TLB_STENCIL_SETUP:
+                                unreachable("bad qir src file");
                         }
                 }
 
@@ -304,9 +330,29 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 case QFILE_VPM:
                         dst = qpu_ra(QPU_W_VPM);
                         break;
+
+                case QFILE_TLB_COLOR_WRITE:
+                        dst = qpu_tlbc();
+                        break;
+
+                case QFILE_TLB_COLOR_WRITE_MS:
+                        dst = qpu_tlbc_ms();
+                        break;
+
+                case QFILE_TLB_Z_WRITE:
+                        dst = qpu_ra(QPU_W_TLB_Z);
+                        break;
+
+                case QFILE_TLB_STENCIL_SETUP:
+                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
+                        break;
+
                 case QFILE_VARY:
                 case QFILE_UNIF:
                 case QFILE_SMALL_IMM:
+                case QFILE_FRAG_X:
+                case QFILE_FRAG_Y:
+                case QFILE_FRAG_REV_FLAG:
                         assert(!"not reached");
                         break;
                 }
@@ -339,24 +385,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                                 abort();
                         }
 
-                        if (dst.mux != QPU_MUX_R4)
-                                queue(c, qpu_a_MOV(dst, qpu_r4()));
-
-                        break;
+                        handle_r4_qpu_write(c, qinst, dst);
 
-                case QOP_FRAG_X:
-                        queue(c, qpu_a_ITOF(dst,
-                                            qpu_ra(QPU_R_XY_PIXEL_COORD)));
-                        break;
-
-                case QOP_FRAG_Y:
-                        queue(c, qpu_a_ITOF(dst,
-                                            qpu_rb(QPU_R_XY_PIXEL_COORD)));
-                        break;
-
-                case QOP_FRAG_REV_FLAG:
-                        queue(c, qpu_a_ITOF(dst,
-                                            qpu_rb(QPU_R_MS_REV_FLAGS)));
                         break;
 
                 case QOP_MS_MASK:
@@ -374,38 +404,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                          */
                         break;
 
-                case QOP_TLB_STENCIL_SETUP:
-                        assert(!unpack);
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP),
-                                           src[0]) | unpack);
-                        break;
-
-                case QOP_TLB_Z_WRITE:
-                        queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
-                                           src[0]) | unpack);
-                        set_last_cond_add(c, qinst->cond);
-                        handled_qinst_cond = true;
-                        break;
-
                 case QOP_TLB_COLOR_READ:
                         queue(c, qpu_NOP());
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_COLOR_LOAD);
-
-                        if (dst.mux != QPU_MUX_R4)
-                                queue(c, qpu_a_MOV(dst, qpu_r4()));
-                        break;
-
-                case QOP_TLB_COLOR_WRITE:
-                        queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack);
-                        set_last_cond_add(c, qinst->cond);
-                        handled_qinst_cond = true;
-                        break;
-
-                case QOP_TLB_COLOR_WRITE_MS:
-                        queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0]));
-                        set_last_cond_add(c, qinst->cond);
-                        handled_qinst_cond = true;
+                        handle_r4_qpu_write(c, qinst, dst);
                         break;
 
                 case QOP_VARY_ADD_C:
@@ -432,8 +435,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                         queue(c, qpu_NOP());
                         *last_inst(c) = qpu_set_sig(*last_inst(c),
                                                     QPU_SIG_LOAD_TMU0);
-                        if (dst.mux != QPU_MUX_R4)
-                                queue(c, qpu_a_MOV(dst, qpu_r4()));
+                        handle_r4_qpu_write(c, qinst, dst);
                         break;
 
                 default:
@@ -476,10 +478,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 assert(qinst->cond == QPU_COND_ALWAYS ||
                        handled_qinst_cond);
 
-                if (qinst->sf) {
-                        assert(!qir_is_multi_instruction(qinst));
+                if (qinst->sf)
                         *last_inst(c) |= QPU_SF;
-                }
         }
 
         uint32_t cycles = qpu_schedule_instructions(c);
diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c
index 270832eae3a..17400a37ca3 100644
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -72,6 +72,11 @@ vc4_get_query_result(struct pipe_context *ctx, struct pipe_query *query,
         return true;
 }
 
+static void
+vc4_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void
 vc4_query_init(struct pipe_context *pctx)
 {
@@ -80,5 +85,6 @@ vc4_query_init(struct pipe_context *pctx)
         pctx->begin_query = vc4_begin_query;
         pctx->end_query = vc4_end_query;
         pctx->get_query_result = vc4_get_query_result;
+        pctx->set_active_query_state = vc4_set_active_query_state;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 167a2f5bd8e..1da4db2ebb7 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -208,6 +208,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_PCI_DEVICE:
         case PIPE_CAP_PCI_FUNCTION:
         case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+        case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/virgl/virgl_query.c b/src/gallium/drivers/virgl/virgl_query.c
index b0200556342..5173bd39a45 100644
--- a/src/gallium/drivers/virgl/virgl_query.c
+++ b/src/gallium/drivers/virgl/virgl_query.c
@@ -164,6 +164,11 @@ static boolean virgl_get_query_result(struct pipe_context *ctx,
    return TRUE;
 }
 
+static void
+virgl_set_active_query_state(struct pipe_context *pipe, boolean enable)
+{
+}
+
 void virgl_init_query_functions(struct virgl_context *vctx)
 {
    vctx->base.render_condition = virgl_render_condition;
@@ -172,4 +177,5 @@ void virgl_init_query_functions(struct virgl_context *vctx)
    vctx->base.begin_query = virgl_begin_query;
    vctx->base.end_query = virgl_end_query;
    vctx->base.get_query_result = virgl_get_query_result;
+   vctx->base.set_active_query_state = virgl_set_active_query_state;
 }
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index 5a5afc1712f..14c91105a04 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -240,6 +240,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PCI_DEVICE:
    case PIPE_CAP_PCI_FUNCTION:
    case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT:
+   case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 1c97e82ece5..82efaf5d8a9 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -173,6 +173,12 @@ struct pipe_context {
                                      struct pipe_resource *resource,
                                      unsigned offset);
 
+   /**
+    * Set whether all current non-driver queries except TIME_ELAPSED are
+    * active or paused.
+    */
+   void (*set_active_query_state)(struct pipe_context *pipe, boolean enable);
+
    /*@}*/
 
    /**
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 5e204a3e5ea..1aef21d6292 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -691,6 +691,7 @@ enum pipe_cap
    PIPE_CAP_PCI_DEVICE,
    PIPE_CAP_PCI_FUNCTION,
    PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT,
+   PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 4d11c2477c7..fb757886381 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -322,6 +322,15 @@ namespace {
       // list of kernel functions to the internalizer.  The internalizer will
       // treat the functions in the list as "main" functions and internalize
       // all of the other functions.
+#if HAVE_LLVM >= 0x0309
+      auto preserve_kernels = [=](const llvm::GlobalValue &GV) {
+         for (const auto &kernel : kernels) {
+            if (GV.getName() == kernel->getName())
+               return true;
+         }
+         return false;
+      };
+#else
       std::vector<const char*> export_list;
       for (std::vector<llvm::Function *>::const_iterator I = kernels.begin(),
                                                          E = kernels.end();
@@ -329,12 +338,17 @@ namespace {
          llvm::Function *kernel = *I;
          export_list.push_back(kernel->getName().data());
       }
+#endif
 #if HAVE_LLVM < 0x0306
       PM.add(new llvm::DataLayoutPass(mod));
 #elif HAVE_LLVM < 0x0307
       PM.add(new llvm::DataLayoutPass());
 #endif
+#if HAVE_LLVM >= 0x0309
+      PM.add(llvm::createInternalizePass(preserve_kernels));
+#else
       PM.add(llvm::createInternalizePass(export_list));
+#endif
 
       llvm::PassManagerBuilder PMB;
       PMB.OptLevel = optimization_level;
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index b25c381d968..25d587af46f 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -283,6 +283,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id)
    drv = VL_VA_DRIVER(ctx);
    pipe_mutex_lock(drv->mutex);
    context = handle_table_get(drv->htab, context_id);
+   if (!context) {
+      pipe_mutex_unlock(drv->mutex);
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+   }
 
    if (context->decoder) {
       if (u_reduce_video_profile(context->decoder->profile) ==
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index 2c42a985823..92d014c3d44 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -280,6 +280,7 @@ vlVaDestroyImage(VADriverContextP ctx, VAImageID image)
 {
    vlVaDriver *drv;
    VAImage  *vaimage;
+   VAStatus status;
 
    if (!ctx)
       return VA_STATUS_ERROR_INVALID_CONTEXT;
@@ -294,8 +295,9 @@ vlVaDestroyImage(VADriverContextP ctx, VAImageID image)
 
    handle_table_remove(VL_VA_DRIVER(ctx)->htab, image);
    pipe_mutex_unlock(drv->mutex);
+   status = vlVaDestroyBuffer(ctx, vaimage->buf);
    FREE(vaimage);
-   return vlVaDestroyBuffer(ctx, vaimage->buf);
+   return status;
 }
 
 VAStatus
diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript
index e1c78dd06a0..1c816ff7762 100644
--- a/src/gallium/targets/libgl-xlib/SConscript
+++ b/src/gallium/targets/libgl-xlib/SConscript
@@ -48,11 +48,15 @@ if env['llvm']:
     env.Prepend(LIBS = [llvmpipe])
 
 if env['platform'] != 'darwin':
+    # Disallow undefined symbols, except with Address Sanitizer, since libasan
+    # is not linked on shared libs, as it should be LD_PRELOAD'ed instead
+    if not env['asan']:
+        env.Append(SHLINKFLAGS = [
+            '-Wl,-z,defs',
+        ])
     env.Append(SHLINKFLAGS = [
-       # Disallow undefined symbols
-       '-Wl,-z,defs',
-       # Restrict exported symbols
-       '-Wl,--version-script=%s' % File("libgl-xlib.sym").srcnode().path,
+        # Restrict exported symbols
+        '-Wl,--version-script=%s' % File("libgl-xlib.sym").srcnode().path,
     ])
 
 # libGL.so.1.5
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index c79bed45753..1b2793a5d6b 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -36,6 +36,7 @@
 #include <amdgpu_drm.h>
 #include <xf86drm.h>
 #include <stdio.h>
+#include <inttypes.h>
 
 static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo)
 {
@@ -141,9 +142,9 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf)
       amdgpu_fence_reference(&bo->fence[i], NULL);
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-      bo->ws->allocated_vram -= align(bo->base.size, bo->ws->gart_page_size);
+      bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-      bo->ws->allocated_gtt -= align(bo->base.size, bo->ws->gart_page_size);
+      bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->gart_page_size);
    FREE(bo);
 }
 
@@ -265,7 +266,7 @@ static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo)
 }
 
 static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
-                                                 unsigned size,
+                                                 uint64_t size,
                                                  unsigned alignment,
                                                  unsigned usage,
                                                  enum radeon_bo_domain initial_domain,
@@ -303,9 +304,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
    r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle);
    if (r) {
       fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n");
-      fprintf(stderr, "amdgpu:    size      : %d bytes\n", size);
-      fprintf(stderr, "amdgpu:    alignment : %d bytes\n", alignment);
-      fprintf(stderr, "amdgpu:    domains   : %d\n", initial_domain);
+      fprintf(stderr, "amdgpu:    size      : %"PRIu64" bytes\n", size);
+      fprintf(stderr, "amdgpu:    alignment : %u bytes\n", alignment);
+      fprintf(stderr, "amdgpu:    domains   : %u\n", initial_domain);
       goto error_bo_alloc;
    }
 
@@ -331,9 +332,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws,
    bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
 
    if (initial_domain & RADEON_DOMAIN_VRAM)
-      ws->allocated_vram += align(size, ws->gart_page_size);
+      ws->allocated_vram += align64(size, ws->gart_page_size);
    else if (initial_domain & RADEON_DOMAIN_GTT)
-      ws->allocated_gtt += align(size, ws->gart_page_size);
+      ws->allocated_gtt += align64(size, ws->gart_page_size);
 
    amdgpu_add_buffer_to_global_list(bo);
 
@@ -458,7 +459,7 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf,
 
 static struct pb_buffer *
 amdgpu_bo_create(struct radeon_winsys *rws,
-                 unsigned size,
+                 uint64_t size,
                  unsigned alignment,
                  boolean use_reusable_pool,
                  enum radeon_bo_domain domain,
@@ -468,21 +469,11 @@ amdgpu_bo_create(struct radeon_winsys *rws,
    struct amdgpu_winsys_bo *bo;
    unsigned usage = 0;
 
-   /* Don't use VRAM if the GPU doesn't have much. This is only the initial
-    * domain. The kernel is free to move the buffer if it wants to.
-    *
-    * 64MB means no VRAM by todays standards.
-    */
-   if (domain & RADEON_DOMAIN_VRAM && ws->info.vram_size <= 64*1024*1024) {
-      domain = RADEON_DOMAIN_GTT;
-      flags = RADEON_FLAG_GTT_WC;
-   }
-
    /* Align size to page size. This is the minimum alignment for normal
     * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
     * like constant/uniform buffers, can benefit from better and more reuse.
     */
-   size = align(size, ws->gart_page_size);
+   size = align64(size, ws->gart_page_size);
 
    /* Only set one usage bit each for domains and flags, or the cache manager
     * might consider different sets of domains / flags compatible
@@ -592,9 +583,9 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
       *offset = whandle->offset;
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-      ws->allocated_vram += align(bo->base.size, ws->gart_page_size);
+      ws->allocated_vram += align64(bo->base.size, ws->gart_page_size);
    else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-      ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+      ws->allocated_gtt += align64(bo->base.size, ws->gart_page_size);
 
    amdgpu_add_buffer_to_global_list(bo);
 
@@ -648,7 +639,7 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
 }
 
 static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
-					    void *pointer, unsigned size)
+					    void *pointer, uint64_t size)
 {
     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
     amdgpu_bo_handle buf_handle;
@@ -684,7 +675,7 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
     bo->initial_domain = RADEON_DOMAIN_GTT;
     bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1);
 
-    ws->allocated_gtt += align(bo->base.size, ws->gart_page_size);
+    ws->allocated_gtt += align64(bo->base.size, ws->gart_page_size);
 
     amdgpu_add_buffer_to_global_list(bo);
 
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index 4c837a8e20f..1164a3058c5 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -212,7 +212,7 @@ static int compute_level(struct amdgpu_winsys *ws,
    }
 
    surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level];
-   surf_level->offset = align(surf->bo_size, AddrSurfInfoOut->baseAlign);
+   surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign);
    surf_level->slice_size = AddrSurfInfoOut->sliceSize;
    surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe);
    surf_level->npix_x = u_minify(surf->npix_x, level);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 87d9a6aebec..1177d3e3c3a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -256,6 +256,10 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd)
       goto fail;
    }
 
+   /* Set which chips have dedicated VRAM. */
+   ws->info.has_dedicated_vram =
+      !(ws->amdinfo.ids_flags & AMDGPU_IDS_FLAGS_FUSION);
+
    /* Set hardware information. */
    ws->info.gart_size = gtt.heap_size;
    ws->info.vram_size = vram.heap_size;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 08856dff430..dd6555c9502 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -40,6 +40,7 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
+#include <inttypes.h>
 
 static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo)
 {
@@ -297,8 +298,8 @@ void radeon_bo_destroy(struct pb_buffer *_buf)
 				    sizeof(va)) != 0 &&
 		va.operation == RADEON_VA_RESULT_ERROR) {
                 fprintf(stderr, "radeon: Failed to deallocate virtual address for buffer:\n");
-                fprintf(stderr, "radeon:    size      : %d bytes\n", bo->base.size);
-                fprintf(stderr, "radeon:    va        : 0x%016llx\n", (unsigned long long)bo->va);
+                fprintf(stderr, "radeon:    size      : %"PRIu64" bytes\n", bo->base.size);
+                fprintf(stderr, "radeon:    va        : 0x%"PRIx64"\n", bo->va);
             }
 	}
 
@@ -529,10 +530,10 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws,
     if (drmCommandWriteRead(rws->fd, DRM_RADEON_GEM_CREATE,
                             &args, sizeof(args))) {
         fprintf(stderr, "radeon: Failed to allocate a buffer:\n");
-        fprintf(stderr, "radeon:    size      : %d bytes\n", size);
-        fprintf(stderr, "radeon:    alignment : %d bytes\n", alignment);
-        fprintf(stderr, "radeon:    domains   : %d\n", args.initial_domain);
-        fprintf(stderr, "radeon:    flags     : %d\n", args.flags);
+        fprintf(stderr, "radeon:    size      : %u bytes\n", size);
+        fprintf(stderr, "radeon:    alignment : %u bytes\n", alignment);
+        fprintf(stderr, "radeon:    domains   : %u\n", args.initial_domain);
+        fprintf(stderr, "radeon:    flags     : %u\n", args.flags);
         return NULL;
     }
 
@@ -717,7 +718,7 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf,
 
 static struct pb_buffer *
 radeon_winsys_bo_create(struct radeon_winsys *rws,
-                        unsigned size,
+                        uint64_t size,
                         unsigned alignment,
                         boolean use_reusable_pool,
                         enum radeon_bo_domain domain,
@@ -727,6 +728,10 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
     struct radeon_bo *bo;
     unsigned usage = 0;
 
+    /* Only 32-bit sizes are supported. */
+    if (size > UINT_MAX)
+        return NULL;
+
     /* Align size to page size. This is the minimum alignment for normal
      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
      * like constant/uniform buffers, can benefit from better and more reuse.
@@ -768,7 +773,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
 }
 
 static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
-                                                   void *pointer, unsigned size)
+                                                   void *pointer, uint64_t size)
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
     struct drm_radeon_gem_userptr args;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 1accc6a1863..2d9ec8cee09 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -297,6 +297,30 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws)
         break;
     }
 
+    /* Set which chips don't have dedicated VRAM. */
+    switch (ws->info.family) {
+    case CHIP_RS400:
+    case CHIP_RC410:
+    case CHIP_RS480:
+    case CHIP_RS600:
+    case CHIP_RS690:
+    case CHIP_RS740:
+    case CHIP_RS780:
+    case CHIP_RS880:
+    case CHIP_PALM:
+    case CHIP_SUMO:
+    case CHIP_SUMO2:
+    case CHIP_ARUBA:
+    case CHIP_KAVERI:
+    case CHIP_KABINI:
+    case CHIP_MULLINS:
+        ws->info.has_dedicated_vram = false;
+        break;
+
+    default:
+        ws->info.has_dedicated_vram = true;
+    }
+
     /* Check for dma */
     ws->info.has_sdma = FALSE;
     /* DMA is disabled on R700. There is IB corruption and hangs. */
diff --git a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
index c1b9eb95c52..d049d1dbc46 100644
--- a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
+++ b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c
@@ -40,6 +40,7 @@
 #include <unistd.h>
 #include <sched.h>
 #endif
+#include <inttypes.h>
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_defines.h"
@@ -172,7 +173,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
    while(curr != &fenced_mgr->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(!fenced_buf->fence);
-      debug_printf("%10p %7u %8u %7s\n",
+      debug_printf("%10p %7"PRIu64" %8u %7s\n",
                    (void *) fenced_buf,
                    fenced_buf->base.size,
                    p_atomic_read(&fenced_buf->base.reference.count),
@@ -188,7 +189,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(fenced_buf->buffer);
       signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-      debug_printf("%10p %7u %8u %7s %10p %s\n",
+      debug_printf("%10p %7"PRIu64" %8u %7s %10p %s\n",
                    (void *) fenced_buf,
                    fenced_buf->base.size,
                    p_atomic_read(&fenced_buf->base.reference.count),
diff --git a/src/gallium/winsys/svga/drm/vmw_buffer.c b/src/gallium/winsys/svga/drm/vmw_buffer.c
index c082dcc34e9..3ac80c7caf5 100644
--- a/src/gallium/winsys/svga/drm/vmw_buffer.c
+++ b/src/gallium/winsys/svga/drm/vmw_buffer.c
@@ -154,7 +154,7 @@ vmw_gmr_buffer_unmap(struct pb_buffer *_buf)
 static void
 vmw_gmr_buffer_get_base_buffer(struct pb_buffer *buf,
                            struct pb_buffer **base_buf,
-                           unsigned *offset)
+                           pb_size *offset)
 {
    *base_buf = buf;
    *offset = 0;
@@ -266,7 +266,7 @@ vmw_gmr_bufmgr_region_ptr(struct pb_buffer *buf,
                           struct SVGAGuestPtr *ptr)
 {
    struct pb_buffer *base_buf;
-   unsigned offset = 0;
+   pb_size offset = 0;
    struct vmw_gmr_buffer *gmr_buf;
    
    pb_get_base_buffer( buf, &base_buf, &offset );
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 56d79a02d79..52748a0619a 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -170,7 +170,7 @@ anv_shader_compile_to_nir(struct anv_device *device,
    /* Vulkan uses the separate-shader linking model */
    nir->info.separate_shader = true;
 
-   nir = brw_preprocess_nir(nir, compiler->scalar_stage[stage]);
+   nir = brw_preprocess_nir(compiler, nir);
 
    nir_shader_gather_info(nir, entry_point->impl);
 
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 8dc44fda0f2..390381828e9 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -179,10 +179,8 @@ libmesagallium_la_LIBADD = \
 	$(ARCH_LIBS)
 
 libmesa_sse41_la_SOURCES = \
-	main/streaming-load-memcpy.c \
-	main/streaming-load-memcpy.h \
-	main/sse_minmax.c \
-	main/sse_minmax.h
+	$(X86_SSE41_FILES)
+
 libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_CFLAGS)
 
 pkgconfigdir = $(libdir)/pkgconfig
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 7425f01273d..2ffbb152e3c 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -396,6 +396,7 @@ VBO_FILES = \
 
 STATETRACKER_FILES = \
 	state_tracker/st_atifs_to_tgsi.c \
+	state_tracker/st_atifs_to_tgsi.h \
 	state_tracker/st_atom_array.c \
 	state_tracker/st_atom_atomicbuf.c \
 	state_tracker/st_atom_blend.c \
@@ -589,7 +590,9 @@ X86_64_FILES =		\
 
 X86_SSE41_FILES = \
 	main/streaming-load-memcpy.c \
-	main/sse_minmax.c
+	main/streaming-load-memcpy.h \
+	main/sse_minmax.c \
+	main/sse_minmax.h
 
 SPARC_FILES =			\
 	sparc/sparc.h		\
diff --git a/src/mesa/drivers/dri/i965/.gitignore b/src/mesa/drivers/dri/i965/.gitignore
index 8eb9f4e1598..70aae3f4d4c 100644
--- a/src/mesa/drivers/dri/i965/.gitignore
+++ b/src/mesa/drivers/dri/i965/.gitignore
@@ -1,3 +1,4 @@
+brw_nir_trig_workarounds.c
 i965_symbols_test
 test_eu_compact
 test_vec4_copy_propagation
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 0db5a51e725..a41c8305a80 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -33,6 +33,7 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/mesa/drivers/dri/common \
 	-I$(top_srcdir)/src/mesa/drivers/dri/intel/server \
 	-I$(top_srcdir)/src/gtest/include \
+	-I$(top_srcdir)/src/compiler/nir \
 	-I$(top_builddir)/src/compiler/nir \
 	-I$(top_builddir)/src/mesa/drivers/dri/common \
 	$(DEFINES) \
@@ -41,6 +42,10 @@ AM_CFLAGS = \
 
 AM_CXXFLAGS = $(AM_CFLAGS)
 
+brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py
+	$(MKDIR_GEN)
+	$(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@; false)
+
 noinst_LTLIBRARIES = libi965_dri.la libi965_compiler.la
 libi965_dri_la_SOURCES = $(i965_FILES)
 libi965_dri_la_LIBADD = libi965_compiler.la $(INTEL_LIBS)
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 2802ec9887c..c314d7470bb 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -44,6 +44,7 @@ i965_compiler_FILES = \
 	brw_nir.c \
 	brw_nir_analyze_boolean_resolves.c \
 	brw_nir_attribute_workarounds.c \
+	brw_nir_trig_workarounds.c \
 	brw_nir_opt_peephole_ffma.c \
 	brw_nir_uniforms.cpp \
 	brw_packed_float.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 2d480d02366..63ac3bc31ed 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -1151,10 +1151,9 @@ intel_gles3_srgb_workaround(struct brw_context *brw,
     */
    fb->Visual.sRGBCapable = false;
    for (int i = 0; i < BUFFER_COUNT; i++) {
-      if (fb->Attachment[i].Renderbuffer &&
-          fb->Attachment[i].Renderbuffer->Format == MESA_FORMAT_B8G8R8A8_SRGB) {
-         fb->Attachment[i].Renderbuffer->Format = MESA_FORMAT_B8G8R8A8_UNORM;
-      }
+      struct gl_renderbuffer *rb = fb->Attachment[i].Renderbuffer;
+      if (rb)
+         rb->Format = _mesa_get_srgb_format_linear(rb->Format);
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 09eb2392836..88bd7a499a7 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -1505,19 +1505,33 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo,
             break;
          }
 
-         case BRW_SFID_URB:
+         case BRW_SFID_URB: {
+            unsigned opcode = brw_inst_urb_opcode(devinfo, inst);
+
             format(file, " %ld", brw_inst_urb_global_offset(devinfo, inst));
 
             space = 1;
-            if (devinfo->gen >= 7) {
-               err |= control(file, "urb opcode", gen7_urb_opcode,
-                              brw_inst_urb_opcode(devinfo, inst), &space);
-            } else if (devinfo->gen >= 5) {
-               err |= control(file, "urb opcode", gen5_urb_opcode,
-                              brw_inst_urb_opcode(devinfo, inst), &space);
+
+            err |= control(file, "urb opcode",
+                           devinfo->gen >= 7 ? gen7_urb_opcode
+                                             : gen5_urb_opcode,
+                           opcode, &space);
+
+            if (devinfo->gen >= 7 &&
+                brw_inst_urb_per_slot_offset(devinfo, inst)) {
+               string(file, " per-slot");
+            }
+
+            if (opcode == GEN8_URB_OPCODE_SIMD8_WRITE ||
+                opcode == GEN8_URB_OPCODE_SIMD8_READ) {
+               if (brw_inst_urb_channel_mask_present(devinfo, inst))
+                  string(file, " masked");
+            } else {
+               err |= control(file, "urb swizzle", urb_swizzle,
+                              brw_inst_urb_swizzle_control(devinfo, inst),
+                              &space);
             }
-            err |= control(file, "urb swizzle", urb_swizzle,
-                           brw_inst_urb_swizzle_control(devinfo, inst), &space);
+
             if (devinfo->gen < 7) {
                err |= control(file, "urb allocate", urb_allocate,
                               brw_inst_urb_allocate(devinfo, inst), &space);
@@ -1529,6 +1543,7 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo,
                               brw_inst_urb_complete(devinfo, inst), &space);
             }
             break;
+         }
          case BRW_SFID_THREAD_SPAWNER:
             break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 4c2e360edf9..7ae7b2ecdf6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -289,6 +289,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_binop_gequal:
    case ir_binop_equal:
    case ir_binop_nequal:
+   case ir_binop_ldexp:
       for (i = 0; i < vector_elements; i++) {
 	 ir_rvalue *op0 = get_element(op_var[0], i);
 	 ir_rvalue *op1 = get_element(op_var[1], i);
@@ -404,7 +405,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_unpack_unorm_2x16:
    case ir_unop_unpack_unorm_4x8:
    case ir_unop_unpack_half_2x16:
-   case ir_binop_ldexp:
    case ir_binop_vector_extract:
    case ir_triop_vector_insert:
    case ir_quadop_vector:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index b27b170ebc3..ab564bbcb9e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -766,24 +766,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       break;
 
    case nir_op_fsin:
-      if (!compiler->precise_trig) {
-         inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
-      } else {
-         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
-         inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]);
-         inst = bld.MUL(result, tmp, brw_imm_f(0.99997));
-      }
+      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fcos:
-      if (!compiler->precise_trig) {
-         inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
-      } else {
-         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
-         inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]);
-         inst = bld.MUL(result, tmp, brw_imm_f(0.99997));
-      }
+      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -876,6 +864,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
        * When we XOR the sources, the top bit is 0 if they are the same and 1
        * if they are different.  We can then use a conditional modifier to
        * turn that into a predicate.  This leads us to an XOR.l instruction.
+       *
+       * Technically, according to the PRM, you're not allowed to use .l on a
+       * XOR instruction.  However, empirical experiments and Curro's reading
+       * of the simulator source both indicate that it's safe.
        */
       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
       inst = bld.XOR(tmp, op[0], op[1]);
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index c7d6fb8c79b..bb7e1eb128c 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -110,7 +110,6 @@ process_glsl_ir(gl_shader_stage stage,
                       SUB_TO_ADD_NEG |
                       EXP_TO_EXP2 |
                       LOG_TO_LOG2 |
-                      LDEXP_TO_ARITH |
                       CARRY_TO_ARITH |
                       BORROW_TO_ARITH);
 
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 83921891d1c..fb7fa235861 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -437,14 +437,19 @@ nir_optimize(nir_shader *nir, bool is_scalar)
  * is_scalar = true to scalarize everything prior to code gen.
  */
 nir_shader *
-brw_preprocess_nir(nir_shader *nir, bool is_scalar)
+brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
 {
    bool progress; /* Written by OPT and OPT_V */
    (void)progress;
 
+   const bool is_scalar = compiler->scalar_stage[nir->stage];
+
    if (nir->stage == MESA_SHADER_GEOMETRY)
       OPT(nir_lower_gs_intrinsics);
 
+   if (compiler->precise_trig)
+      OPT(brw_nir_apply_trig_workarounds);
+
    static const nir_lower_tex_options tex_options = {
       .lower_txp = ~0,
    };
@@ -568,7 +573,7 @@ brw_create_nir(struct brw_context *brw,
 
    (void)progress;
 
-   nir = brw_preprocess_nir(nir, is_scalar);
+   nir = brw_preprocess_nir(brw->intelScreen->compiler, nir);
 
    OPT(nir_lower_system_values);
    OPT_V(brw_nir_lower_uniforms, is_scalar);
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 440b4ceb669..2711606511d 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -81,7 +81,8 @@ nir_shader *brw_create_nir(struct brw_context *brw,
                            gl_shader_stage stage,
                            bool is_scalar);
 
-nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar);
+nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
+                               nir_shader *nir);
 
 void brw_nir_lower_vs_inputs(nir_shader *nir,
                              const struct brw_device_info *devinfo,
@@ -105,6 +106,8 @@ bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
                                          bool use_legacy_snorm_formula,
                                          const uint8_t *attrib_wa_flags);
 
+bool brw_nir_apply_trig_workarounds(nir_shader *nir);
+
 nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
                                       const struct brw_device_info *devinfo,
                                       const struct brw_sampler_prog_key_data *key,
diff --git a/src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py b/src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py
new file mode 100755
index 00000000000..67dab9ab326
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py
@@ -0,0 +1,43 @@
+#! /usr/bin/env python
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import nir_algebraic
+
+# The SIN and COS instructions on Intel hardware can produce values
+# slightly outside of the [-1.0, 1.0] range for a small set of values.
+# Obviously, this can break everyone's expectations about trig functions.
+#
+# According to an internal presentation, the COS instruction can produce
+# a value up to 1.000027 for inputs in the range (0.08296, 0.09888).  One
+# suggested workaround is to multiply by 0.99997, scaling down the
+# amplitude slightly.  Apparently this also minimizes the error function,
+# reducing the maximum error from 0.00006 to about 0.00003.
+
+trig_workarounds = [
+   (('fsin', 'x'), ('fmul', ('fsin', 'x'), 0.99997)),
+   (('fcos', 'x'), ('fmul', ('fcos', 'x'), 0.99997)),
+]
+
+print '#include "brw_nir.h"'
+print nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
+                                  trig_workarounds).render()
diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
index f3361d69786..636340add35 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
+++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp
@@ -75,7 +75,7 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var,
    int namelen = strlen(var->name);
 
    /* The data for our (non-builtin) uniforms is stored in a series of
-    * gl_uniform_driver_storage structs for each subcomponent that
+    * gl_uniform_storage structs for each subcomponent that
     * glGetUniformLocation() could name.  We know it's been set up in the same
     * order we'd walk the type, so walk the list of storage and find anything
     * with our name, or the prefix of a component that starts with our name.
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index b63e44a3bfb..032fdaa4d23 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1094,24 +1094,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_fsin:
-      if (!compiler->precise_trig) {
-         inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
-      } else {
-         src_reg tmp = src_reg(this, glsl_type::vec4_type);
-         inst = emit_math(SHADER_OPCODE_SIN, dst_reg(tmp), op[0]);
-         inst = emit(MUL(dst, tmp, brw_imm_f(0.99997)));
-      }
+      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fcos:
-      if (!compiler->precise_trig) {
-         inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
-      } else {
-         src_reg tmp = src_reg(this, glsl_type::vec4_type);
-         inst = emit_math(SHADER_OPCODE_COS, dst_reg(tmp), op[0]);
-         inst = emit(MUL(dst, tmp, brw_imm_f(0.99997)));
-      }
+      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1141,6 +1129,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
        * When we XOR the sources, the top bit is 0 if they are the same and 1
        * if they are different.  We can then use a conditional modifier to
        * turn that into a predicate.  This leads us to an XOR.l instruction.
+       *
+       * Technically, according to the PRM, you're not allowed to use .l on a
+       * XOR instruction.  However, empirical experiments and Curro's reading
+       * of the simulator source both indicate that it's safe.
        */
       src_reg tmp = src_reg(this, glsl_type::ivec4_type);
       inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index 60ac124ecd0..6a20bd6d925 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -340,6 +340,7 @@ intelInitExtensions(struct gl_context *ctx)
       ctx->Extensions.ARB_texture_view = true;
       ctx->Extensions.ARB_shader_storage_buffer_object = true;
       ctx->Extensions.EXT_shader_samples_identical = true;
+      ctx->Extensions.OES_texture_buffer = true;
 
       if (brw->can_do_pipelined_register_writes) {
          ctx->Extensions.ARB_draw_indirect = true;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index 31030b1b4ea..a486d6e1ab9 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -141,8 +141,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
    if (rb->_BaseFormat == GL_RGB)
       return false;
 
-   if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp,
-                         INTEL_DOWNLOAD))
+   if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp))
       return false;
 
    if (!irb->mt ||
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index c6eb50aaba8..dbec82fbd44 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1000,14 +1000,18 @@ intelCreateBuffer(__DRIscreen * driScrnPriv,
       fb->Visual.samples = num_samples;
    }
 
-   if (mesaVis->redBits == 5)
-      rgbFormat = MESA_FORMAT_B5G6R5_UNORM;
-   else if (mesaVis->sRGBCapable)
-      rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB;
-   else if (mesaVis->alphaBits == 0)
-      rgbFormat = MESA_FORMAT_B8G8R8X8_UNORM;
-   else {
-      rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB;
+   if (mesaVis->redBits == 5) {
+      rgbFormat = mesaVis->redMask == 0x1f ? MESA_FORMAT_R5G6B5_UNORM
+                                           : MESA_FORMAT_B5G6R5_UNORM;
+   } else if (mesaVis->sRGBCapable) {
+      rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8A8_SRGB
+                                           : MESA_FORMAT_B8G8R8A8_SRGB;
+   } else if (mesaVis->alphaBits == 0) {
+      rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8X8_UNORM
+                                           : MESA_FORMAT_B8G8R8X8_UNORM;
+   } else {
+      rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8A8_SRGB
+                                           : MESA_FORMAT_B8G8R8A8_SRGB;
       fb->Visual.sRGBCapable = true;
    }
 
@@ -1078,11 +1082,45 @@ intelDestroyBuffer(__DRIdrawable * driDrawPriv)
     _mesa_reference_framebuffer(&fb, NULL);
 }
 
+/* Fill in subslice_total and eu_total from the kernel; requires gen >= 8. */
+static void
+intel_detect_sseu(struct intel_screen *intelScreen)
+{
+   assert(intelScreen->devinfo->gen >= 8);
+   int ret;
+   intelScreen->subslice_total = -1;
+   intelScreen->eu_total = -1;
+
+   /* Success must fall through: only a negative result other than -EINVAL
+    * is a hard failure; -EINVAL is left to the softer warning below.
+    */
+   ret = intel_get_param(intelScreen->driScrnPriv, I915_PARAM_SUBSLICE_TOTAL,
+                         &intelScreen->subslice_total);
+   if (ret < 0 && ret != -EINVAL)
+      goto err_out;
+   ret = intel_get_param(intelScreen->driScrnPriv,
+                         I915_PARAM_EU_TOTAL, &intelScreen->eu_total);
+   if (ret < 0 && ret != -EINVAL)
+      goto err_out;
+
+   /* Without this information, we cannot get the right Braswell brandstrings,
+    * and we have to use conservative numbers for GPGPU on many platforms, but
+    * otherwise, things will just work.
+    */
+   if (intelScreen->subslice_total < 1 || intelScreen->eu_total < 1)
+      _mesa_warning(NULL,
+                    "Kernel 4.1 required to properly query GPU properties.\n");
+   return;
+
+err_out:
+   intelScreen->subslice_total = -1;
+   intelScreen->eu_total = -1;
+   _mesa_warning(NULL, "Failed to query GPU properties.\n");
+}
+
 static bool
 intel_init_bufmgr(struct intel_screen *intelScreen)
 {
    __DRIscreen *spriv = intelScreen->driScrnPriv;
-   bool devid_override = getenv("INTEL_DEVID_OVERRIDE") != NULL;
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
@@ -1100,25 +1138,6 @@ intel_init_bufmgr(struct intel_screen *intelScreen)
       return false;
    }
 
-   intelScreen->subslice_total = -1;
-   intelScreen->eu_total = -1;
-
-   /* Everything below this is for real hardware only */
-   if (intelScreen->no_hw || devid_override)
-      return true;
-
-   intel_get_param(spriv, I915_PARAM_SUBSLICE_TOTAL,
-                   &intelScreen->subslice_total);
-   intel_get_param(spriv, I915_PARAM_EU_TOTAL, &intelScreen->eu_total);
-
-   /* Without this information, we cannot get the right Braswell brandstrings,
-    * and we have to use conservative numbers for GPGPU on many platforms, but
-    * otherwise, things will just work.
-    */
-   if (intelScreen->subslice_total == -1 || intelScreen->eu_total == -1)
-      _mesa_warning(NULL,
-                    "Kernel 4.1 required to properly query GPU properties.\n");
-
    return true;
 }
 
@@ -1473,6 +1492,10 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
    intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen);
 
+   /* GENs prior to 8 do not support EU/Subslice info */
+   if (intelScreen->devinfo->gen >= 8)
+      intel_detect_sseu(intelScreen);
+
    const char *force_msaa = getenv("INTEL_FORCE_MSAA");
    if (force_msaa) {
       intelScreen->winsys_msaa_samples_override =
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 1601edddef6..bee8be1fd27 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -404,8 +404,7 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx,
    if (texImage->_BaseFormat == GL_RGB)
       return false;
 
-   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,
-                         INTEL_DOWNLOAD))
+   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))
       return false;
 
    /* If this is a nontrivial texture view, let another path handle it instead. */
diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
index 4849a4151e2..9561968d2d6 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c
@@ -119,8 +119,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx,
    if (ctx->_ImageTransferState)
       return false;
 
-   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp,
-                         INTEL_UPLOAD))
+   if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp))
       return false;
 
    /* If this is a nontrivial texture view, let another path handle it instead. */
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 31354582964..a549854dce6 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,8 +36,10 @@
 #include "brw_context.h"
 #include "intel_tiled_memcpy.h"
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__)
 #include <tmmintrin.h>
+#elif defined(__SSE2__)
+#include <emmintrin.h>
 #endif
 
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
@@ -56,21 +58,86 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
+static inline uint32_t
+ror(uint32_t n, uint32_t d)
+{
+   return (n >> d) | (n << (32 - d));
+}
+
+/**
+ * Copy RGBA to BGRA - swap R and B.
+ */
+static inline void *
+rgba8_copy(void *dst, const void *src, size_t bytes)
+{
+   uint32_t *d = dst;
+   uint32_t const *s = src;
+
+   assert(bytes % 4 == 0);
+
+   while (bytes >= 4) {
+      *d = ror(__builtin_bswap32(*s), 8);
+      d += 1;
+      s += 1;
+      bytes -= 4;
+   }
+   return dst;
+}
+
 #ifdef __SSSE3__
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
 
-/* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
-
-/* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
+static inline void
+rgba8_copy_16_aligned_dst(void *dst, const void *src)
+{
+   _mm_store_si128(dst,
+                   _mm_shuffle_epi8(_mm_loadu_si128(src),
+                                    *(__m128i *)rgba8_permutation));
+}
+
+static inline void
+rgba8_copy_16_aligned_src(void *dst, const void *src)
+{
+   _mm_storeu_si128(dst,
+                    _mm_shuffle_epi8(_mm_load_si128(src),
+                                     *(__m128i *)rgba8_permutation));
+}
+
+#elif defined(__SSE2__)
+static inline void
+rgba8_copy_16_aligned_dst(void *dst, const void *src)
+{
+   __m128i srcreg, dstreg, agmask, ag, rb, br;
+
+   agmask = _mm_set1_epi32(0xFF00FF00);
+   srcreg = _mm_loadu_si128((__m128i *)src);
+
+   rb = _mm_andnot_si128(agmask, srcreg);
+   ag = _mm_and_si128(agmask, srcreg);
+   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
+                            _MM_SHUFFLE(2, 3, 0, 1));
+   dstreg = _mm_or_si128(ag, br);
+
+   _mm_store_si128((__m128i *)dst, dstreg);
+}
+
+static inline void
+rgba8_copy_16_aligned_src(void *dst, const void *src)
+{
+   __m128i srcreg, dstreg, agmask, ag, rb, br;
+
+   agmask = _mm_set1_epi32(0xFF00FF00);
+   srcreg = _mm_load_si128((__m128i *)src);
+
+   rb = _mm_andnot_si128(agmask, srcreg);
+   ag = _mm_and_si128(agmask, srcreg);
+   br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
+                            _MM_SHUFFLE(2, 3, 0, 1));
+   dstreg = _mm_or_si128(ag, br);
+
+   _mm_storeu_si128((__m128i *)dst, dstreg);
+}
 #endif
 
 /**
@@ -79,35 +146,27 @@ static const uint8_t rgba8_permutation[16] =
 static inline void *
 rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
 {
-   uint8_t *d = dst;
-   uint8_t const *s = src;
+   assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
+#if defined(__SSSE3__) || defined(__SSE2__)
+   if (bytes == 64) {
+      rgba8_copy_16_aligned_dst(dst +  0, src +  0);
+      rgba8_copy_16_aligned_dst(dst + 16, src + 16);
+      rgba8_copy_16_aligned_dst(dst + 32, src + 32);
+      rgba8_copy_16_aligned_dst(dst + 48, src + 48);
       return dst;
    }
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)dst) & 0xf));
-      rgba8_copy_16_aligned_dst(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_dst(d+16, s+16);
-      rgba8_copy_16_aligned_dst(d+32, s+32);
-      rgba8_copy_16_aligned_dst(d+48, s+48);
-      return dst;
+   while (bytes >= 16) {
+      rgba8_copy_16_aligned_dst(dst, src);
+      src += 16;
+      dst += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(dst, src, bytes);
+
    return dst;
 }
 
@@ -117,35 +176,27 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
 static inline void *
 rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
 {
-   uint8_t *d = dst;
-   uint8_t const *s = src;
+   assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
 
-#ifdef __SSSE3__
-   if (bytes == 16) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
+#if defined(__SSSE3__) || defined(__SSE2__)
+   if (bytes == 64) {
+      rgba8_copy_16_aligned_src(dst +  0, src +  0);
+      rgba8_copy_16_aligned_src(dst + 16, src + 16);
+      rgba8_copy_16_aligned_src(dst + 32, src + 32);
+      rgba8_copy_16_aligned_src(dst + 48, src + 48);
       return dst;
    }
 
-   if (bytes == 64) {
-      assert(!(((uintptr_t)src) & 0xf));
-      rgba8_copy_16_aligned_src(d+ 0, s+ 0);
-      rgba8_copy_16_aligned_src(d+16, s+16);
-      rgba8_copy_16_aligned_src(d+32, s+32);
-      rgba8_copy_16_aligned_src(d+48, s+48);
-      return dst;
+   while (bytes >= 16) {
+      rgba8_copy_16_aligned_src(dst, src);
+      src += 16;
+      dst += 16;
+      bytes -= 16;
    }
 #endif
 
-   while (bytes >= 4) {
-      d[0] = s[2];
-      d[1] = s[1];
-      d[2] = s[0];
-      d[3] = s[3];
-      d += 4;
-      s += 4;
-      bytes -= 4;
-   }
+   rgba8_copy(dst, src, bytes);
+
    return dst;
 }
 
@@ -172,6 +223,12 @@ typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
  * Copy texture data from linear to X tile layout.
  *
  * \copydoc tile_copy_fn
+ *
+ * The mem_copy parameters allow the user to specify an alternative mem_copy
+ * function that, for instance, may do RGBA -> BGRA swizzling.  The first
+ * function must handle any memory alignment, while the second function need
+ * only handle the case where whichever side (source or destination) is tiled
+ * is 16-byte aligned.
  */
 static inline void
 linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
@@ -179,7 +236,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t src_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* The copy destination offset for each range copied is the sum of
     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
@@ -200,10 +258,10 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
 
       for (xo = x1; xo < x2; xo += xtile_span) {
-         mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
+         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
       }
 
-      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
 
       src += src_pitch;
    }
@@ -220,7 +278,8 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t src_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
     * as the tile).  Thus the destination offset for (x,y) is the sum of:
@@ -259,12 +318,12 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
+         mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
+      mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
 
       src += src_pitch;
    }
@@ -281,7 +340,8 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t dst_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* The copy destination offset for each range copied is the sum of
     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
@@ -302,10 +362,10 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
       mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
 
       for (xo = x1; xo < x2; xo += xtile_span) {
-         mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
+         mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
       }
 
-      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
 
       dst += dst_pitch;
    }
@@ -322,7 +382,8 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
                  char *dst, const char *src,
                  int32_t dst_pitch,
                  uint32_t swizzle_bit,
-                 mem_copy_fn mem_copy)
+                 mem_copy_fn mem_copy,
+                 mem_copy_fn mem_copy_align16)
 {
    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
     * as the tile).  Thus the destination offset for (x,y) is the sum of:
@@ -361,12 +422,12 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
        * at each step so we don't need to calculate it explicitly.
        */
       for (x = x1; x < x2; x += ytile_span) {
-         mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
+         mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
          xo += bytes_per_column;
          swizzle ^= swizzle_bit;
       }
 
-      mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
+      mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
 
       dst += dst_pitch;
    }
@@ -393,26 +454,27 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
       if (mem_copy == memcpy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+                                 dst, src, src_pitch, swizzle_bit,
+                                 memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
    linear_to_xtiled(x0, x1, x2, x3, y0, y1,
-                    dst, src, src_pitch, swizzle_bit, mem_copy);
+                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -435,26 +497,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
       if (mem_copy == memcpy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
-                                 dst, src, src_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_dst)
+                                 dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
                                  dst, src, src_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_dst);
+                                 rgba8_copy, rgba8_copy_aligned_dst);
       else
          unreachable("not reached");
    }
    linear_to_ytiled(x0, x1, x2, x3, y0, y1,
-                    dst, src, src_pitch, swizzle_bit, mem_copy);
+                    dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -477,26 +539,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
       if (mem_copy == memcpy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
    xtiled_to_linear(x0, x1, x2, x3, y0, y1,
-                    dst, src, dst_pitch, swizzle_bit, mem_copy);
+                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -519,26 +581,26 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
       if (mem_copy == memcpy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    } else {
       if (mem_copy == memcpy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
-                                 dst, src, dst_pitch, swizzle_bit, memcpy);
-      else if (mem_copy == rgba8_copy_aligned_src)
+                                 dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
+      else if (mem_copy == rgba8_copy)
          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
                                  dst, src, dst_pitch, swizzle_bit,
-                                 rgba8_copy_aligned_src);
+                                 rgba8_copy, rgba8_copy_aligned_src);
       else
          unreachable("not reached");
    }
    ytiled_to_linear(x0, x1, x2, x3, y0, y1,
-                    dst, src, dst_pitch, swizzle_bit, mem_copy);
+                    dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
 }
 
 /**
@@ -745,8 +807,7 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
  * \return true if the format and type combination are valid
  */
 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
-                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
-                      enum intel_memcpy_direction direction)
+                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp)
 {
    if (type == GL_UNSIGNED_INT_8_8_8_8_REV &&
        !(format == GL_RGBA || format == GL_BGRA))
@@ -764,8 +825,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
       if (format == GL_BGRA) {
          *mem_copy = memcpy;
       } else if (format == GL_RGBA) {
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       }
    } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) ||
               (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) ||
@@ -776,8 +836,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
          /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can
           * use the same function.
           */
-         *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst
-                                               : rgba8_copy_aligned_src;
+         *mem_copy = rgba8_copy;
       } else if (format == GL_RGBA) {
          *mem_copy = memcpy;
       }
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
index 01543bf298d..d9148bb6239 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h
@@ -55,20 +55,7 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
                 uint32_t tiling,
                 mem_copy_fn mem_copy);
 
-/* Tells intel_get_memcpy() whether the memcpy() is
- *
- *  - an upload to the GPU with an aligned destination and a potentially
- *    unaligned source; or
- *  - a download from the GPU with an aligned source and a potentially
- *    unaligned destination.
- */
-enum intel_memcpy_direction {
-   INTEL_UPLOAD,
-   INTEL_DOWNLOAD
-};
-
 bool intel_get_memcpy(mesa_format tiledFormat, GLenum format,
-                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp,
-                      enum intel_memcpy_direction direction);
+                      GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp);
 
 #endif /* INTEL_TILED_MEMCPY */
diff --git a/src/mesa/drivers/x11/SConscript b/src/mesa/drivers/x11/SConscript
index 45419973d39..59c8df4b3c2 100644
--- a/src/mesa/drivers/x11/SConscript
+++ b/src/mesa/drivers/x11/SConscript
@@ -34,9 +34,13 @@ sources = [
 	'xm_tri.c',
 ]
 
-# Disallow undefined symbols
 if env['platform'] != 'darwin':
-    env.Append(SHLINKFLAGS = ['-Wl,-z,defs'])
+    # Disallow undefined symbols, except with Address Sanitizer, since libasan
+    # is not linked into shared libs; it should be LD_PRELOAD'ed instead.
+    if not env['asan']:
+        env.Append(SHLINKFLAGS = [
+            '-Wl,-z,defs',
+        ])
 
 # libGL.so.1.6
 libgl_1_6 = env.SharedLibrary(
diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c
index 80b71765e6c..2f4d966973e 100644
--- a/src/mesa/drivers/x11/fakeglx.c
+++ b/src/mesa/drivers/x11/fakeglx.c
@@ -794,7 +794,7 @@ destroy_visuals_on_display(Display *dpy)
       if (VisualTable[i]->display == dpy) {
          /* remove this visual */
          int j;
-         free(VisualTable[i]);
+         XMesaDestroyVisual(VisualTable[i]);
          for (j = i; j < NumVisuals - 1; j++)
             VisualTable[j] = VisualTable[j + 1];
          NumVisuals--;
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 65e7ca89d32..82c4d188d5a 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -856,6 +856,7 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
                                 accum_red_size, accum_green_size,
                                 accum_blue_size, accum_alpha_size,
                                 0)) {
+      free(v->visinfo);
       free(v);
       return NULL;
    }
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index dbba136f526..6af02d1c3dc 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -1525,10 +1525,6 @@ _mesa_copy_context( const struct gl_context *src, struct gl_context *dst,
  * Check if the given context can render into the given framebuffer
  * by checking visual attributes.
  *
- * Most of these tests could go away because Mesa is now pretty flexible
- * in terms of mixing rendering contexts with framebuffers.  As long
- * as RGB vs. CI mode agree, we're probably good.
- *
  * \return GL_TRUE if compatible, GL_FALSE otherwise.
  */
 static GLboolean 
@@ -1541,32 +1537,18 @@ check_compatible(const struct gl_context *ctx,
    if (buffer == _mesa_get_incomplete_framebuffer())
       return GL_TRUE;
 
-#if 0
-   /* disabling this fixes the fgl_glxgears pbuffer demo */
-   if (ctxvis->doubleBufferMode && !bufvis->doubleBufferMode)
-      return GL_FALSE;
-#endif
-   if (ctxvis->stereoMode && !bufvis->stereoMode)
-      return GL_FALSE;
-   if (ctxvis->haveAccumBuffer && !bufvis->haveAccumBuffer)
-      return GL_FALSE;
-   if (ctxvis->haveDepthBuffer && !bufvis->haveDepthBuffer)
-      return GL_FALSE;
-   if (ctxvis->haveStencilBuffer && !bufvis->haveStencilBuffer)
-      return GL_FALSE;
-   if (ctxvis->redMask && ctxvis->redMask != bufvis->redMask)
-      return GL_FALSE;
-   if (ctxvis->greenMask && ctxvis->greenMask != bufvis->greenMask)
-      return GL_FALSE;
-   if (ctxvis->blueMask && ctxvis->blueMask != bufvis->blueMask)
-      return GL_FALSE;
-#if 0
-   /* disabled (see bug 11161) */
-   if (ctxvis->depthBits && ctxvis->depthBits != bufvis->depthBits)
-      return GL_FALSE;
-#endif
-   if (ctxvis->stencilBits && ctxvis->stencilBits != bufvis->stencilBits)
-      return GL_FALSE;
+#define check_component(foo)           \
+   if (ctxvis->foo && bufvis->foo &&   \
+       ctxvis->foo != bufvis->foo)     \
+      return GL_FALSE
+
+   check_component(redMask);
+   check_component(greenMask);
+   check_component(blueMask);
+   check_component(depthBits);
+   check_component(stencilBits);
+
+#undef check_component
 
    return GL_TRUE;
 }
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index ddc25d812c7..78899ecccad 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -91,6 +91,7 @@ EXT(ARB_point_sprite                        , ARB_point_sprite
 EXT(ARB_program_interface_query             , dummy_true                             , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_provoking_vertex                    , EXT_provoking_vertex                   , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_query_buffer_object                 , ARB_query_buffer_object                , GLL, GLC,  x ,  x , 2013)
+EXT(ARB_robust_buffer_access_behavior       , ARB_robust_buffer_access_behavior      , GLL, GLC,  x ,  x , 2012)
 EXT(ARB_robustness                          , dummy_true                             , GLL, GLC,  x ,  x , 2010)
 EXT(ARB_sample_shading                      , ARB_sample_shading                     , GLL, GLC,  x ,  x , 2009)
 EXT(ARB_sampler_objects                     , dummy_true                             , GLL, GLC,  x ,  x , 2009)
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index 14cd58870f7..fe54109322d 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -262,7 +262,7 @@ ffsll(long long int val)
 {
    int bit;
 
-   assert(sizeof(val) == 8);
+   STATIC_ASSERT(sizeof(val) == 8);
 
    bit = ffs((int) val);
    if (bit != 0)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 6c09948af04..eec057e0137 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3771,6 +3771,7 @@ struct gl_extensions
    GLboolean ARB_pipeline_statistics_query;
    GLboolean ARB_point_sprite;
    GLboolean ARB_query_buffer_object;
+   GLboolean ARB_robust_buffer_access_behavior;
    GLboolean ARB_sample_shading;
    GLboolean ARB_seamless_cube_map;
    GLboolean ARB_shader_atomic_counter_ops;
diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c
index ca366d967ab..74761953044 100644
--- a/src/mesa/main/samplerobj.c
+++ b/src/mesa/main/samplerobj.c
@@ -1171,8 +1171,9 @@ _mesa_SamplerParameterIiv(GLuint sampler, GLenum pname, const GLint *params)
 
    sampObj = _mesa_lookup_samplerobj(ctx, sampler);
    if (!sampObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glSamplerParameterIiv(sampler %u)",
-                  sampler);
+      _mesa_error(ctx, (_mesa_is_gles(ctx) ?
+                        GL_INVALID_OPERATION : GL_INVALID_VALUE),
+                  "glSamplerParameterIiv(sampler %u)", sampler);
       return;
    }
 
@@ -1257,8 +1258,9 @@ _mesa_SamplerParameterIuiv(GLuint sampler, GLenum pname, const GLuint *params)
 
    sampObj = _mesa_lookup_samplerobj(ctx, sampler);
    if (!sampObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glSamplerParameterIuiv(sampler %u)",
-                  sampler);
+      _mesa_error(ctx, (_mesa_is_gles(ctx) ?
+                        GL_INVALID_OPERATION : GL_INVALID_VALUE),
+                  "glSamplerParameterIuiv(sampler %u)", sampler);
       return;
    }
 
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 2af3653f7bb..b9c1bcbbc6e 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -361,7 +361,7 @@ compute_version(const struct gl_extensions *extensions,
                          extensions->ARB_fragment_layer_viewport &&
                          extensions->ARB_framebuffer_no_attachments &&
                          extensions->ARB_internalformat_query2 &&
-                         /* extensions->ARB_robust_buffer_access_behavior */ 0 &&
+                         extensions->ARB_robust_buffer_access_behavior &&
                          extensions->ARB_shader_image_size &&
                          extensions->ARB_shader_storage_buffer_object &&
                          extensions->ARB_stencil_texturing &&
diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 202b4eeeefa..8bbc2f0af4b 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -98,7 +98,7 @@ static void
 st_bufferobj_subdata(struct gl_context *ctx,
 		     GLintptrARB offset,
 		     GLsizeiptrARB size,
-		     const GLvoid * data, struct gl_buffer_object *obj)
+		     const void * data, struct gl_buffer_object *obj)
 {
    struct st_buffer_object *st_obj = st_buffer_object(obj);
 
@@ -142,7 +142,7 @@ static void
 st_bufferobj_get_subdata(struct gl_context *ctx,
                          GLintptrARB offset,
                          GLsizeiptrARB size,
-                         GLvoid * data, struct gl_buffer_object *obj)
+                         void * data, struct gl_buffer_object *obj)
 {
    struct st_buffer_object *st_obj = st_buffer_object(obj);
 
@@ -175,7 +175,7 @@ static GLboolean
 st_bufferobj_data(struct gl_context *ctx,
 		  GLenum target,
 		  GLsizeiptrARB size,
-		  const GLvoid * data,
+		  const void * data,
 		  GLenum usage,
                   GLbitfield storageFlags,
 		  struct gl_buffer_object *obj)
@@ -513,7 +513,7 @@ st_copy_buffer_subdata(struct gl_context *ctx,
 static void
 st_clear_buffer_subdata(struct gl_context *ctx,
                         GLintptr offset, GLsizeiptr size,
-                        const GLvoid *clearValue,
+                        const void *clearValue,
                         GLsizeiptr clearValueSize,
                         struct gl_buffer_object *bufObj)
 {
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 55801469f23..362cef46286 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -206,6 +206,7 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
                         CSO_BIT_STREAM_OUTPUTS |
                         CSO_BIT_VERTEX_ELEMENTS |
                         CSO_BIT_AUX_VERTEX_BUFFER_SLOT |
+                        CSO_BIT_PAUSE_QUERIES |
                         CSO_BITS_ALL_SHADERS));
 
    /* blend state: RGBA masking */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 01ed5441d11..c3e05bbb7ce 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -379,12 +379,12 @@ static struct pipe_resource *
 make_texture(struct st_context *st,
 	     GLsizei width, GLsizei height, GLenum format, GLenum type,
 	     const struct gl_pixelstore_attrib *unpack,
-	     const GLvoid *pixels)
+	     const void *pixels)
 {
    struct gl_context *ctx = st->ctx;
    struct pipe_context *pipe = st->pipe;
    mesa_format mformat;
-   struct pipe_resource *pt;
+   struct pipe_resource *pt = NULL;
    enum pipe_format pipeFormat;
    GLenum baseInternalFormat;
 
@@ -403,10 +403,18 @@ make_texture(struct st_context *st,
        unpack->SkipRows == 0 &&
        unpack->SwapBytes == GL_FALSE &&
        st->drawpix_cache.image) {
+      assert(st->drawpix_cache.texture);
+
       /* check if the pixel data is the same */
       if (memcmp(pixels, st->drawpix_cache.image, width * height * bpp) == 0) {
          /* OK, re-use the cached texture */
-         return st->drawpix_cache.texture;
+         pipe_resource_reference(&pt, st->drawpix_cache.texture);
+         /* refcount of returned texture should be at least two here.  One
+          * reference for the cache to hold on to, one for the caller (which
+          * it will release), and possibly more held by the driver.
+          */
+         assert(pt->reference.count >= 2);
+         return pt;
       }
    }
 
@@ -525,8 +533,14 @@ make_texture(struct st_context *st,
       st->drawpix_cache.image = malloc(width * height * bpp);
       if (st->drawpix_cache.image) {
          memcpy(st->drawpix_cache.image, pixels, width * height * bpp);
+         pipe_resource_reference(&st->drawpix_cache.texture, pt);
+      }
+      else {
+         /* out of memory, free/disable cached texture */
+         st->drawpix_cache.width = 0;
+         st->drawpix_cache.height = 0;
+         pipe_resource_reference(&st->drawpix_cache.texture, NULL);
       }
-      st->drawpix_cache.texture = pt;
    }
 #endif
 
@@ -744,7 +758,7 @@ static void
 draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y,
                     GLsizei width, GLsizei height, GLenum format, GLenum type,
                     const struct gl_pixelstore_attrib *unpack,
-                    const GLvoid *pixels)
+                    const void *pixels)
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
@@ -798,7 +812,7 @@ draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y,
       for (row = 0; row < height; row++) {
          GLfloat *zValuesFloat = (GLfloat*)zValues;
          GLenum destType = GL_UNSIGNED_BYTE;
-         const GLvoid *source = _mesa_image_address2d(&clippedUnpack, pixels,
+         const void *source = _mesa_image_address2d(&clippedUnpack, pixels,
                                                       width, height,
                                                       format, type,
                                                       row, 0);
@@ -1041,7 +1055,7 @@ static void
 st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
               GLsizei width, GLsizei height,
               GLenum format, GLenum type,
-              const struct gl_pixelstore_attrib *unpack, const GLvoid *pixels)
+              const struct gl_pixelstore_attrib *unpack, const void *pixels)
 {
    void *driver_vp, *driver_fp;
    struct st_context *st = st_context(ctx);
@@ -1160,9 +1174,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y,
    if (num_sampler_view > 1)
       pipe_sampler_view_reference(&sv[1], NULL);
 
-#if !USE_DRAWPIXELS_CACHE
+   /* release our reference to the texture (it may persist in the cache) */
    pipe_resource_reference(&pt, NULL);
-#endif
 }
 
 
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index ff570e0e444..456ad83818b 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -40,6 +40,7 @@
 #include "main/glformats.h"
 #include "main/macros.h"
 #include "main/renderbuffer.h"
+#include "main/state.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
@@ -729,6 +730,7 @@ st_ReadBuffer(struct gl_context *ctx, GLenum buffer)
        fb->Attachment[fb->_ColorReadBufferIndex].Type == GL_NONE) {
       /* add the buffer */
       st_manager_add_color_renderbuffer(st, fb, fb->_ColorReadBufferIndex);
+      _mesa_update_state(ctx);
       st_validate_state(st, ST_PIPELINE_RENDER);
    }
 }
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 5153c4bbba1..393b881ea4c 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -85,7 +85,7 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y,
               GLsizei width, GLsizei height,
               GLenum format, GLenum type,
               const struct gl_pixelstore_attrib *pack,
-              GLvoid *pixels)
+              void *pixels)
 {
    struct st_context *st = st_context(ctx);
    struct gl_renderbuffer *rb =
@@ -238,7 +238,7 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y,
       GLuint row;
 
       for (row = 0; row < (unsigned) height; row++) {
-         GLvoid *dest = _mesa_image_address2d(pack, pixels,
+         void *dest = _mesa_image_address2d(pack, pixels,
                                               width, height, format,
                                               type, row, 0);
          memcpy(dest, map, bytesPerRow);
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 3980f5d2f51..a18b08b3226 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -1345,6 +1345,7 @@ try_pbo_upload_common(struct gl_context *ctx,
                         CSO_BIT_DEPTH_STENCIL_ALPHA |
                         CSO_BIT_RASTERIZER |
                         CSO_BIT_STREAM_OUTPUTS |
+                        CSO_BIT_PAUSE_QUERIES |
                         CSO_BITS_ALL_SHADERS));
    cso_save_constant_buffer_slot0(cso, PIPE_SHADER_FRAGMENT);
 
@@ -1845,7 +1846,7 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
             /* 1D array textures.
              * We need to convert gallium coords to GL coords.
              */
-            GLvoid *src = _mesa_image_address2d(unpack, pixels,
+            void *src = _mesa_image_address2d(unpack, pixels,
                                                 width, depth, format,
                                                 type, slice, 0);
             memcpy(map, src, bytesPerRow);
@@ -1854,7 +1855,7 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims,
             ubyte *slice_map = map;
 
             for (row = 0; row < (unsigned) height; row++) {
-               GLvoid *src = _mesa_image_address(dims, unpack, pixels,
+               void *src = _mesa_image_address(dims, unpack, pixels,
                                                  width, height, format,
                                                  type, slice, row, 0);
                memcpy(slice_map, src, bytesPerRow);
@@ -1928,7 +1929,7 @@ st_CompressedTexSubImage(struct gl_context *ctx, GLuint dims,
                          struct gl_texture_image *texImage,
                          GLint x, GLint y, GLint z,
                          GLsizei w, GLsizei h, GLsizei d,
-                         GLenum format, GLsizei imageSize, const GLvoid *data)
+                         GLenum format, GLsizei imageSize, const void *data)
 {
    struct st_context *st = st_context(ctx);
    struct st_texture_image *stImage = st_texture_image(texImage);
@@ -2053,7 +2054,7 @@ fallback:
 static void
 st_CompressedTexImage(struct gl_context *ctx, GLuint dims,
                       struct gl_texture_image *texImage,
-                      GLsizei imageSize, const GLvoid *data)
+                      GLsizei imageSize, const void *data)
 {
    prep_teximage(ctx, texImage, GL_NONE, GL_NONE);
 
@@ -2106,7 +2107,7 @@ static void
 st_GetTexSubImage(struct gl_context * ctx,
                   GLint xoffset, GLint yoffset, GLint zoffset,
                   GLsizei width, GLsizei height, GLint depth,
-                  GLenum format, GLenum type, GLvoid * pixels,
+                  GLenum format, GLenum type, void * pixels,
                   struct gl_texture_image *texImage)
 {
    struct st_context *st = st_context(ctx);
@@ -2319,7 +2320,7 @@ st_GetTexSubImage(struct gl_context * ctx,
             /* 1D array textures.
              * We need to convert gallium coords to GL coords.
              */
-            GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels,
+            void *dest = _mesa_image_address3d(&ctx->Pack, pixels,
                                                  width, depth, format,
                                                  type, 0, slice, 0);
             memcpy(dest, map, bytesPerRow);
@@ -2328,7 +2329,7 @@ st_GetTexSubImage(struct gl_context * ctx,
             ubyte *slice_map = map;
 
             for (row = 0; row < height; row++) {
-               GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels,
+               void *dest = _mesa_image_address3d(&ctx->Pack, pixels,
                                                     width, height, format,
                                                     type, slice, row, 0);
                memcpy(dest, slice_map, bytesPerRow);
@@ -2363,7 +2364,7 @@ st_GetTexSubImage(struct gl_context * ctx,
             /* 1D array textures.
              * We need to convert gallium coords to GL coords.
              */
-            GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels,
+            void *dest = _mesa_image_address3d(&ctx->Pack, pixels,
                                                  width, depth, format,
                                                  type, 0, slice, 0);
 
@@ -2377,7 +2378,7 @@ st_GetTexSubImage(struct gl_context * ctx,
          }
          else {
             for (row = 0; row < height; row++) {
-               GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels,
+               void *dest = _mesa_image_address3d(&ctx->Pack, pixels,
                                                     width, height, format,
                                                     type, slice, row, 0);
 
@@ -3085,7 +3086,7 @@ st_ClearTexSubImage(struct gl_context *ctx,
                     struct gl_texture_image *texImage,
                     GLint xoffset, GLint yoffset, GLint zoffset,
                     GLsizei width, GLsizei height, GLsizei depth,
-                    const GLvoid *clearValue)
+                    const void *clearValue)
 {
    static const char zeros[16] = {0};
    struct st_texture_image *stImage = st_texture_image(texImage);
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 287894317df..6d407d33eff 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -589,6 +589,7 @@ void st_init_extensions(struct pipe_screen *screen,
       { o(ARB_pipeline_statistics_query),    PIPE_CAP_QUERY_PIPELINE_STATISTICS        },
       { o(ARB_point_sprite),                 PIPE_CAP_POINT_SPRITE                     },
       { o(ARB_query_buffer_object),          PIPE_CAP_QUERY_BUFFER_OBJECT              },
+      { o(ARB_robust_buffer_access_behavior), PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR   },
       { o(ARB_sample_shading),               PIPE_CAP_SAMPLE_SHADING                   },
       { o(ARB_seamless_cube_map),            PIPE_CAP_SEAMLESS_CUBE_MAP                },
       { o(ARB_shader_draw_parameters),       PIPE_CAP_DRAW_PARAMETERS                  },
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index b9ab7ae9919..5f037daea76 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -5192,43 +5192,72 @@ struct st_translate {
 };
 
 /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */
-const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = {
-   /* Vertex shader
-    */
-   TGSI_SEMANTIC_VERTEXID,
-   TGSI_SEMANTIC_INSTANCEID,
-   TGSI_SEMANTIC_VERTEXID_NOBASE,
-   TGSI_SEMANTIC_BASEVERTEX,
-   TGSI_SEMANTIC_BASEINSTANCE,
-   TGSI_SEMANTIC_DRAWID,
-
-   /* Geometry shader
-    */
-   TGSI_SEMANTIC_INVOCATIONID,
-
-   /* Fragment shader
-    */
-   TGSI_SEMANTIC_POSITION,
-   TGSI_SEMANTIC_FACE,
-   TGSI_SEMANTIC_SAMPLEID,
-   TGSI_SEMANTIC_SAMPLEPOS,
-   TGSI_SEMANTIC_SAMPLEMASK,
-   TGSI_SEMANTIC_HELPER_INVOCATION,
-
-   /* Tessellation shaders
-    */
-   TGSI_SEMANTIC_TESSCOORD,
-   TGSI_SEMANTIC_VERTICESIN,
-   TGSI_SEMANTIC_PRIMID,
-   TGSI_SEMANTIC_TESSOUTER,
-   TGSI_SEMANTIC_TESSINNER,
+unsigned
+_mesa_sysval_to_semantic(unsigned sysval) /* sysval: SYSTEM_VALUE_* enum */
+{
+   switch (sysval) {
+   /* Vertex shader */
+   case SYSTEM_VALUE_VERTEX_ID:
+      return TGSI_SEMANTIC_VERTEXID;
+   case SYSTEM_VALUE_INSTANCE_ID:
+      return TGSI_SEMANTIC_INSTANCEID;
+   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      return TGSI_SEMANTIC_VERTEXID_NOBASE;
+   case SYSTEM_VALUE_BASE_VERTEX:
+      return TGSI_SEMANTIC_BASEVERTEX;
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      return TGSI_SEMANTIC_BASEINSTANCE;
+   case SYSTEM_VALUE_DRAW_ID:
+      return TGSI_SEMANTIC_DRAWID;
+
+   /* Geometry shader */
+   case SYSTEM_VALUE_INVOCATION_ID:
+      return TGSI_SEMANTIC_INVOCATIONID;
+
+   /* Fragment shader */
+   case SYSTEM_VALUE_FRAG_COORD:
+      return TGSI_SEMANTIC_POSITION;
+   case SYSTEM_VALUE_FRONT_FACE:
+      return TGSI_SEMANTIC_FACE;
+   case SYSTEM_VALUE_SAMPLE_ID:
+      return TGSI_SEMANTIC_SAMPLEID;
+   case SYSTEM_VALUE_SAMPLE_POS:
+      return TGSI_SEMANTIC_SAMPLEPOS;
+   case SYSTEM_VALUE_SAMPLE_MASK_IN:
+      return TGSI_SEMANTIC_SAMPLEMASK;
+   case SYSTEM_VALUE_HELPER_INVOCATION:
+      return TGSI_SEMANTIC_HELPER_INVOCATION;
+
+   /* Tessellation shader */
+   case SYSTEM_VALUE_TESS_COORD:
+      return TGSI_SEMANTIC_TESSCOORD;
+   case SYSTEM_VALUE_VERTICES_IN:
+      return TGSI_SEMANTIC_VERTICESIN;
+   case SYSTEM_VALUE_PRIMITIVE_ID:
+      return TGSI_SEMANTIC_PRIMID;
+   case SYSTEM_VALUE_TESS_LEVEL_OUTER:
+      return TGSI_SEMANTIC_TESSOUTER;
+   case SYSTEM_VALUE_TESS_LEVEL_INNER:
+      return TGSI_SEMANTIC_TESSINNER;
+
+   /* Compute shader */
+   case SYSTEM_VALUE_LOCAL_INVOCATION_ID:
+      return TGSI_SEMANTIC_THREAD_ID;
+   case SYSTEM_VALUE_WORK_GROUP_ID:
+      return TGSI_SEMANTIC_BLOCK_ID;
+   case SYSTEM_VALUE_NUM_WORK_GROUPS:
+      return TGSI_SEMANTIC_GRID_SIZE;
+
+   /* Unhandled: asserts; returns TGSI_SEMANTIC_COUNT if asserts are off */
+   case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX:
+   case SYSTEM_VALUE_GLOBAL_INVOCATION_ID:
+   case SYSTEM_VALUE_VERTEX_CNT:
+   default:
+      assert(!"Unexpected SYSTEM_VALUE_ enum");
+      return TGSI_SEMANTIC_COUNT;
+   }
+}
 
-   /* Compute shaders
-    */
-   TGSI_SEMANTIC_THREAD_ID,
-   TGSI_SEMANTIC_BLOCK_ID,
-   TGSI_SEMANTIC_GRID_SIZE,
-};
 
 /**
  * Make note of a branch to a label in the TGSI code.
@@ -6000,35 +6029,6 @@ st_translate_program(
    assert(numInputs <= ARRAY_SIZE(t->inputs));
    assert(numOutputs <= ARRAY_SIZE(t->outputs));
 
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_FRONT_FACE] ==
-          TGSI_SEMANTIC_FACE);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_VERTEX_ID] ==
-          TGSI_SEMANTIC_VERTEXID);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_INSTANCE_ID] ==
-          TGSI_SEMANTIC_INSTANCEID);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_ID] ==
-          TGSI_SEMANTIC_SAMPLEID);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_POS] ==
-          TGSI_SEMANTIC_SAMPLEPOS);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_MASK_IN] ==
-          TGSI_SEMANTIC_SAMPLEMASK);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_INVOCATION_ID] ==
-          TGSI_SEMANTIC_INVOCATIONID);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE] ==
-          TGSI_SEMANTIC_VERTEXID_NOBASE);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_BASE_VERTEX] ==
-          TGSI_SEMANTIC_BASEVERTEX);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] ==
-          TGSI_SEMANTIC_TESSCOORD);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_HELPER_INVOCATION] ==
-          TGSI_SEMANTIC_HELPER_INVOCATION);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_LOCAL_INVOCATION_ID] ==
-          TGSI_SEMANTIC_THREAD_ID);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_WORK_GROUP_ID] ==
-          TGSI_SEMANTIC_BLOCK_ID);
-   assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_NUM_WORK_GROUPS] ==
-          TGSI_SEMANTIC_GRID_SIZE);
-
    t = CALLOC_STRUCT(st_translate);
    if (!t) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
@@ -6215,7 +6215,7 @@ st_translate_program(
 
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
-            unsigned semName = _mesa_sysval_to_semantic[i];
+            unsigned semName = _mesa_sysval_to_semantic(i);
 
             t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h
index 729295bcb52..774588a111b 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h
@@ -63,7 +63,8 @@ st_translate_stream_output_info(struct glsl_to_tgsi_visitor *glsl_to_tgsi,
                                 const GLuint outputMapping[],
                                 struct pipe_stream_output_info *so);
 
-extern const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX];
+unsigned
+_mesa_sysval_to_semantic(unsigned sysval);
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 7a686b199d5..e1c79a57b0a 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -1074,7 +1074,7 @@ st_translate_mesa_program(
 
       for (i = 0; sysInputs; i++) {
          if (sysInputs & (1 << i)) {
-            unsigned semName = _mesa_sysval_to_semantic[i];
+            unsigned semName = _mesa_sysval_to_semantic(i);
 
             t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0);
 
diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c
index b9abebfc7bf..08f25535ae1 100644
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -55,7 +55,7 @@
 #include "state_tracker/drm_driver.h"
 
 static struct pipe_resource *
-st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface,
+st_vdpau_video_surface_gallium(struct gl_context *ctx, const void *vdpSurface,
                                GLuint index)
 {
    int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
@@ -86,7 +86,7 @@ st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface,
 }
 
 static struct pipe_resource *
-st_vdpau_output_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface)
+st_vdpau_output_surface_gallium(struct gl_context *ctx, const void *vdpSurface)
 {
    int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
    uint32_t device = (uintptr_t)ctx->vdpDevice;
@@ -135,7 +135,7 @@ st_vdpau_resource_from_description(struct gl_context *ctx,
 }
 
 static struct pipe_resource *
-st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface)
+st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const void *vdpSurface)
 {
    int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
    uint32_t device = (uintptr_t)ctx->vdpDevice;
@@ -154,7 +154,7 @@ st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface
 }
 
 static struct pipe_resource *
-st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface,
+st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const void *vdpSurface,
                                GLuint index)
 {
    int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
@@ -177,7 +177,7 @@ static void
 st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
                      GLboolean output, struct gl_texture_object *texObj,
                      struct gl_texture_image *texImage,
-                     const GLvoid *vdpSurface, GLuint index)
+                     const void *vdpSurface, GLuint index)
 {
    struct st_context *st = st_context(ctx);
    struct st_texture_object *stObj = st_texture_object(texObj);
@@ -250,7 +250,7 @@ static void
 st_vdpau_unmap_surface(struct gl_context *ctx, GLenum target, GLenum access,
                        GLboolean output, struct gl_texture_object *texObj,
                        struct gl_texture_image *texImage,
-                       const GLvoid *vdpSurface, GLuint index)
+                       const void *vdpSurface, GLuint index)
 {
    struct st_context *st = st_context(ctx);
    struct st_texture_object *stObj = st_texture_object(texObj);