diff options
Diffstat (limited to 'src')
281 files changed, 4973 insertions, 2798 deletions
diff --git a/src/compiler/glsl/Android.gen.mk b/src/compiler/Android.glsl.gen.mk index de5cd0f474c..b0df8a146c0 100644 --- a/src/compiler/glsl/Android.gen.mk +++ b/src/compiler/Android.glsl.gen.mk @@ -32,8 +32,9 @@ intermediates := $(call local-generated-sources-dir) LOCAL_SRC_FILES := $(LOCAL_SRC_FILES) LOCAL_C_INCLUDES += \ - $(intermediates)/glcpp \ - $(LOCAL_PATH)/glcpp \ + $(intermediates)/glsl \ + $(LOCAL_PATH)/glsl \ + $(LOCAL_PATH)/glsl/glcpp \ LOCAL_GENERATED_SOURCES += $(addprefix $(intermediates)/, \ $(LIBGLCPP_GENERATED_FILES) \ @@ -65,14 +66,14 @@ define local-yy-to-cpp-and-h rm -f $(@:$1=$(YACC_HEADER_SUFFIX)) endef -$(intermediates)/glsl_lexer.cpp: $(LOCAL_PATH)/glsl_lexer.ll +$(intermediates)/glsl/glsl_lexer.cpp: $(LOCAL_PATH)/glsl/glsl_lexer.ll $(call local-l-or-ll-to-c-or-cpp) -$(intermediates)/glsl_parser.cpp: $(LOCAL_PATH)/glsl_parser.yy +$(intermediates)/glsl/glsl_parser.cpp: $(LOCAL_PATH)/glsl/glsl_parser.yy $(call local-yy-to-cpp-and-h,.cpp) -$(intermediates)/glcpp/glcpp-lex.c: $(LOCAL_PATH)/glcpp/glcpp-lex.l +$(intermediates)/glsl/glcpp/glcpp-lex.c: $(LOCAL_PATH)/glsl/glcpp/glcpp-lex.l $(call local-l-or-ll-to-c-or-cpp) -$(intermediates)/glcpp/glcpp-parse.c: $(LOCAL_PATH)/glcpp/glcpp-parse.y +$(intermediates)/glsl/glcpp/glcpp-parse.c: $(LOCAL_PATH)/glsl/glcpp/glcpp-parse.y $(call glsl_local-y-to-c-and-h) diff --git a/src/compiler/glsl/Android.mk b/src/compiler/Android.glsl.mk index f5d96b300f0..d9cf06d208f 100644 --- a/src/compiler/glsl/Android.mk +++ b/src/compiler/Android.glsl.mk @@ -47,7 +47,7 @@ LOCAL_STATIC_LIBRARIES := libmesa_compiler LOCAL_MODULE := libmesa_glsl -include $(LOCAL_PATH)/Android.gen.mk +include $(LOCAL_PATH)/Android.glsl.gen.mk include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) diff --git a/src/compiler/Android.mk b/src/compiler/Android.mk index 888780ba3fb..ac0ced58334 100644 --- a/src/compiler/Android.mk +++ b/src/compiler/Android.mk @@ -43,25 +43,6 @@ LOCAL_MODULE := libmesa_compiler include $(MESA_COMMON_MK) include $(BUILD_STATIC_LIBRARY) -# --------------------------------------- -# Build libmesa_nir -# --------------------------------------- - -include $(CLEAR_VARS) +include $(LOCAL_PATH)/Android.glsl.mk -LOCAL_SRC_FILES := \ - $(NIR_FILES) - -LOCAL_C_INCLUDES := \ - $(MESA_TOP)/src/mapi \ - $(MESA_TOP)/src/mesa \ - $(MESA_TOP)/src/gallium/include \ - $(MESA_TOP)/src/gallium/auxiliary - -LOCAL_STATIC_LIBRARIES := libmesa_compiler - -LOCAL_MODULE := libmesa_nir - -include $(LOCAL_PATH)/Android.gen.mk -include $(MESA_COMMON_MK) -include $(BUILD_STATIC_LIBRARY) +include $(LOCAL_PATH)/Android.nir.mk diff --git a/src/compiler/Android.gen.mk b/src/compiler/Android.nir.gen.mk index 96fc750ec64..96fc750ec64 100644 --- a/src/compiler/Android.gen.mk +++ b/src/compiler/Android.nir.gen.mk diff --git a/src/compiler/Android.nir.mk b/src/compiler/Android.nir.mk new file mode 100644 index 00000000000..e89a21c03ac --- /dev/null +++ b/src/compiler/Android.nir.mk @@ -0,0 +1,49 @@ +# Mesa 3-D graphics library +# +# Copyright (C) 2015 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +LOCAL_PATH := $(call my-dir) + +include $(LOCAL_PATH)/Makefile.sources + +# --------------------------------------- +# Build libmesa_nir +# --------------------------------------- + +include $(CLEAR_VARS) + +LOCAL_SRC_FILES := \ + $(NIR_FILES) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/src/mapi \ + $(MESA_TOP)/src/mesa \ + $(MESA_TOP)/src/gallium/include \ + $(MESA_TOP)/src/gallium/auxiliary + +LOCAL_STATIC_LIBRARIES := libmesa_compiler + +LOCAL_MODULE := libmesa_nir + +include $(LOCAL_PATH)/Android.nir.gen.mk +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + diff --git a/src/compiler/Makefile.am b/src/compiler/Makefile.am index 5032890e73d..dc30f908d8a 100644 --- a/src/compiler/Makefile.am +++ b/src/compiler/Makefile.am @@ -54,273 +54,8 @@ BUILT_SOURCES = CLEANFILES = EXTRA_DIST = SConscript - -EXTRA_DIST += glsl/tests glsl/glcpp/tests glsl/README \ - glsl/TODO glsl/glcpp/README \ - glsl/glsl_lexer.ll \ - glsl/glsl_parser.yy \ - glsl/glcpp/glcpp-lex.l \ - glsl/glcpp/glcpp-parse.y \ - glsl/Makefile.sources \ - glsl/SConscript - -TESTS += glsl/glcpp/tests/glcpp-test \ - glsl/glcpp/tests/glcpp-test-cr-lf \ - glsl/tests/blob-test \ - glsl/tests/general-ir-test \ - glsl/tests/optimization-test \ - glsl/tests/sampler-types-test \ - glsl/tests/uniform-initializer-test - -TESTS_ENVIRONMENT= \ - export PYTHON2=$(PYTHON2); \ - export PYTHON_FLAGS=$(PYTHON_FLAGS); - -check_PROGRAMS += \ - glsl/glcpp/glcpp \ - glsl/glsl_test \ - glsl/tests/blob-test \ - glsl/tests/general-ir-test \ - glsl/tests/sampler-types-test \ - glsl/tests/uniform-initializer-test - -noinst_PROGRAMS = glsl_compiler - -glsl_tests_blob_test_SOURCES = \ - glsl/tests/blob_test.c -glsl_tests_blob_test_LDADD = \ - glsl/libglsl.la - -glsl_tests_general_ir_test_SOURCES = \ - glsl/standalone_scaffolding.cpp \ - glsl/tests/builtin_variable_test.cpp \ - glsl/tests/invalidate_locations_test.cpp \ - glsl/tests/general_ir_test.cpp \ - glsl/tests/varyings_test.cpp -glsl_tests_general_ir_test_CFLAGS = \ - $(PTHREAD_CFLAGS) -glsl_tests_general_ir_test_LDADD = \ - $(top_builddir)/src/gtest/libgtest.la \ - glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ - $(PTHREAD_LIBS) - -glsl_tests_uniform_initializer_test_SOURCES = \ - glsl/tests/copy_constant_to_storage_tests.cpp \ - glsl/tests/set_uniform_initializer_tests.cpp \ - glsl/tests/uniform_initializer_utils.cpp \ - glsl/tests/uniform_initializer_utils.h -glsl_tests_uniform_initializer_test_CFLAGS = \ - $(PTHREAD_CFLAGS) -glsl_tests_uniform_initializer_test_LDADD = \ - $(top_builddir)/src/gtest/libgtest.la \ - glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ - $(PTHREAD_LIBS) - -glsl_tests_sampler_types_test_SOURCES = \ - glsl/tests/sampler_types_test.cpp -glsl_tests_sampler_types_test_CFLAGS = \ - $(PTHREAD_CFLAGS) -glsl_tests_sampler_types_test_LDADD = \ - $(top_builddir)/src/gtest/libgtest.la \ - glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ - $(PTHREAD_LIBS) - -noinst_LTLIBRARIES += glsl/libglsl.la glsl/libglcpp.la - -glsl_libglcpp_la_LIBADD = \ - $(top_builddir)/src/util/libmesautil.la -glsl_libglcpp_la_SOURCES = \ - glsl/glcpp/glcpp-lex.c \ - glsl/glcpp/glcpp-parse.c \ - glsl/glcpp/glcpp-parse.h \ - $(LIBGLCPP_FILES) - -glsl_glcpp_glcpp_SOURCES = \ - glsl/glcpp/glcpp.c -glsl_glcpp_glcpp_LDADD = \ - glsl/libglcpp.la \ - $(top_builddir)/src/libglsl_util.la \ - -lm - -glsl_libglsl_la_LIBADD = \ - nir/libnir.la \ - glsl/libglcpp.la - -glsl_libglsl_la_SOURCES = \ - glsl/glsl_lexer.cpp \ - glsl/glsl_parser.cpp \ - glsl/glsl_parser.h \ - $(LIBGLSL_FILES) - - -glsl_compiler_SOURCES = \ - $(GLSL_COMPILER_CXX_FILES) - -glsl_compiler_LDADD = \ - glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ - $(top_builddir)/src/util/libmesautil.la \ - $(PTHREAD_LIBS) - -glsl_glsl_test_SOURCES = \ - glsl/standalone_scaffolding.cpp \ - glsl/test.cpp \ - glsl/test_optpass.cpp \ - glsl/test_optpass.h - -glsl_glsl_test_LDADD = \ - glsl/libglsl.la \ - $(top_builddir)/src/libglsl_util.la \ - $(PTHREAD_LIBS) - -# We write our own rules for yacc and lex below. We'd rather use automake, -# but automake makes it especially difficult for a number of reasons: -# -# * < automake-1.12 generates .h files from .yy and .ypp files, but -# >=automake-1.12 generates .hh and .hpp files respectively. There's no -# good way of making a project that uses C++ yacc files compatible with -# both versions of automake. Strong work automake developers. -# -# * Since we're generating code from .l/.y files in a subdirectory (glcpp/) -# we'd like the resulting generated code to also go in glcpp/ for purposes -# of distribution. Automake gives no way to do this. -# -# * Since we're building multiple yacc parsers into one library (and via one -# Makefile) we have to use per-target YFLAGS. Using per-target YFLAGS causes -# automake to name the resulting generated code as <library-name>_filename.c. -# Frankly, that's ugly and we don't want a libglcpp_glcpp_parser.h file. - -# In order to make build output print "LEX" and "YACC", we reproduce the -# automake variables below. - -AM_V_LEX = $(am__v_LEX_$(V)) -am__v_LEX_ = $(am__v_LEX_$(AM_DEFAULT_VERBOSITY)) -am__v_LEX_0 = @echo " LEX " $@; -am__v_LEX_1 = - -AM_V_YACC = $(am__v_YACC_$(V)) -am__v_YACC_ = $(am__v_YACC_$(AM_DEFAULT_VERBOSITY)) -am__v_YACC_0 = @echo " YACC " $@; -am__v_YACC_1 = - MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) -YACC_GEN = $(AM_V_YACC)$(YACC) $(YFLAGS) -LEX_GEN = $(AM_V_LEX)$(LEX) $(LFLAGS) - -glsl/glsl_parser.cpp glsl/glsl_parser.h: glsl/glsl_parser.yy - $(MKDIR_GEN) - $(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl/glsl_parser.h $(srcdir)/glsl/glsl_parser.yy - -glsl/glsl_lexer.cpp: glsl/glsl_lexer.ll - $(MKDIR_GEN) - $(LEX_GEN) -o $@ $(srcdir)/glsl/glsl_lexer.ll - -glsl/glcpp/glcpp-parse.c glsl/glcpp/glcpp-parse.h: glsl/glcpp/glcpp-parse.y - $(MKDIR_GEN) - $(YACC_GEN) -o $@ -p "glcpp_parser_" --defines=$(builddir)/glsl/glcpp/glcpp-parse.h $(srcdir)/glsl/glcpp/glcpp-parse.y - -glsl/glcpp/glcpp-lex.c: glsl/glcpp/glcpp-lex.l - $(MKDIR_GEN) - $(LEX_GEN) -o $@ $(srcdir)/glsl/glcpp/glcpp-lex.l - -# Only the parsers (specifically the header files generated at the same time) -# need to be in BUILT_SOURCES. Though if we list the parser headers YACC is -# called for the .c/.cpp file and the .h files. By listing the .c/.cpp files -# YACC is only executed once for each parser. The rest of the generated code -# will be created at the appropriate times according to standard automake -# dependency rules. -BUILT_SOURCES += \ - glsl/glsl_parser.cpp \ - glsl/glsl_lexer.cpp \ - glsl/glcpp/glcpp-parse.c \ - glsl/glcpp/glcpp-lex.c -CLEANFILES += \ - glsl/glcpp/glcpp-parse.h \ - glsl/glsl_parser.h \ - glsl/glsl_parser.cpp \ - glsl/glsl_lexer.cpp \ - glsl/glcpp/glcpp-parse.c \ - glsl/glcpp/glcpp-lex.c - -clean-local: - $(RM) -r subtest-cr subtest-cr-lf subtest-lf subtest-lf-cr - -dist-hook: - $(RM) glsl/glcpp/tests/*.out - $(RM) glsl/glcpp/tests/subtest*/*.out - -noinst_LTLIBRARIES += nir/libnir.la - -nir_libnir_la_CPPFLAGS = \ - $(AM_CPPFLAGS) \ - -I$(top_builddir)/src/compiler/nir \ - -I$(top_srcdir)/src/compiler/nir - -nir_libnir_la_LIBADD = \ - libcompiler.la - -nir_libnir_la_SOURCES = \ - $(NIR_FILES) \ - $(SPIRV_FILES) \ - $(NIR_GENERATED_FILES) - -PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) - -nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py - $(MKDIR_GEN) - $(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@ || ($(RM) $@; false) - -nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py - $(MKDIR_GEN) - $(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false) - -nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py - $(MKDIR_GEN) - $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@ || ($(RM) $@; false) - -nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py - $(MKDIR_GEN) - $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@ || ($(RM) $@; false) - -nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py - $(MKDIR_GEN) - $(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false) - - -check_PROGRAMS += nir/tests/control_flow_tests - -nir_tests_control_flow_tests_CPPFLAGS = \ - $(AM_CPPFLAGS) \ - -I$(top_builddir)/src/compiler/nir \ - -I$(top_srcdir)/src/compiler/nir - -nir_tests_control_flow_tests_SOURCES = \ - nir/tests/control_flow_tests.cpp -nir_tests_control_flow_tests_CFLAGS = \ - $(PTHREAD_CFLAGS) -nir_tests_control_flow_tests_LDADD = \ - $(top_builddir)/src/gtest/libgtest.la \ - nir/libnir.la \ - $(top_builddir)/src/util/libmesautil.la \ - $(PTHREAD_LIBS) - - -TESTS += nir/tests/control_flow_tests - -BUILT_SOURCES += $(NIR_GENERATED_FILES) -CLEANFILES += $(NIR_GENERATED_FILES) +include Makefile.glsl.am -EXTRA_DIST += \ - nir/nir_algebraic.py \ - nir/nir_builder_opcodes_h.py \ - nir/nir_constant_expressions.py \ - nir/nir_opcodes.py \ - nir/nir_opcodes_c.py \ - nir/nir_opcodes_h.py \ - nir/nir_opt_algebraic.py \ - nir/tests \ - nir/Makefile.sources +include Makefile.nir.am diff --git a/src/compiler/glsl/Makefile.am b/src/compiler/Makefile.glsl.am index 9954b812403..daf98f61244 100644 --- a/src/compiler/glsl/Makefile.am +++ b/src/compiler/Makefile.glsl.am @@ -1,4 +1,6 @@ +# # Copyright © 2012 Jon TURNEY +# Copyright (C) 2015 Intel Corporation # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -19,120 +21,103 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. -AM_CPPFLAGS = \ - -I$(top_srcdir)/include \ - -I$(top_srcdir)/src \ - -I$(top_srcdir)/src/mapi \ - -I$(top_srcdir)/src/mesa/ \ - -I$(top_srcdir)/src/gallium/include \ - -I$(top_srcdir)/src/gallium/auxiliary \ - -I$(top_srcdir)/src/glsl/glcpp \ - -I$(top_srcdir)/src/gtest/include \ - $(DEFINES) -AM_CFLAGS = \ - $(VISIBILITY_CFLAGS) \ - $(MSVC2013_COMPAT_CFLAGS) -AM_CXXFLAGS = \ - $(VISIBILITY_CXXFLAGS) \ - $(MSVC2013_COMPAT_CXXFLAGS) - -EXTRA_DIST = tests glcpp/tests README TODO glcpp/README \ - glsl_lexer.ll \ - glsl_parser.yy \ - glcpp/glcpp-lex.l \ - glcpp/glcpp-parse.y \ - SConscript - -include Makefile.sources - -TESTS = glcpp/tests/glcpp-test \ - glcpp/tests/glcpp-test-cr-lf \ - tests/blob-test \ - tests/general-ir-test \ - tests/optimization-test \ - tests/sampler-types-test \ - tests/uniform-initializer-test +EXTRA_DIST += glsl/tests glsl/glcpp/tests glsl/README \ + glsl/TODO glsl/glcpp/README \ + glsl/glsl_lexer.ll \ + glsl/glsl_parser.yy \ + glsl/glcpp/glcpp-lex.l \ + glsl/glcpp/glcpp-parse.y \ + SConscript.glsl + +TESTS += glsl/glcpp/tests/glcpp-test \ + glsl/glcpp/tests/glcpp-test-cr-lf \ + glsl/tests/blob-test \ + glsl/tests/general-ir-test \ + glsl/tests/optimization-test \ + glsl/tests/sampler-types-test \ + glsl/tests/uniform-initializer-test TESTS_ENVIRONMENT= \ export PYTHON2=$(PYTHON2); \ export PYTHON_FLAGS=$(PYTHON_FLAGS); -noinst_LTLIBRARIES = libglsl.la libglcpp.la -check_PROGRAMS = \ - glcpp/glcpp \ - glsl_test \ - tests/blob-test \ - tests/general-ir-test \ - tests/sampler-types-test \ - tests/uniform-initializer-test +check_PROGRAMS += \ + glsl/glcpp/glcpp \ + glsl/glsl_test \ + glsl/tests/blob-test \ + glsl/tests/general-ir-test \ + glsl/tests/sampler-types-test \ + glsl/tests/uniform-initializer-test noinst_PROGRAMS = glsl_compiler -tests_blob_test_SOURCES = \ - tests/blob_test.c -tests_blob_test_LDADD = \ - $(top_builddir)/src/glsl/libglsl.la - -tests_general_ir_test_SOURCES = \ - standalone_scaffolding.cpp \ - tests/builtin_variable_test.cpp \ - tests/invalidate_locations_test.cpp \ - tests/general_ir_test.cpp \ - tests/varyings_test.cpp -tests_general_ir_test_CFLAGS = \ +glsl_tests_blob_test_SOURCES = \ + glsl/tests/blob_test.c +glsl_tests_blob_test_LDADD = \ + glsl/libglsl.la + +glsl_tests_general_ir_test_SOURCES = \ + glsl/standalone_scaffolding.cpp \ + glsl/tests/builtin_variable_test.cpp \ + glsl/tests/invalidate_locations_test.cpp \ + glsl/tests/general_ir_test.cpp \ + glsl/tests/varyings_test.cpp +glsl_tests_general_ir_test_CFLAGS = \ $(PTHREAD_CFLAGS) -tests_general_ir_test_LDADD = \ +glsl_tests_general_ir_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ - $(top_builddir)/src/glsl/libglsl.la \ + glsl/libglsl.la \ $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) -tests_uniform_initializer_test_SOURCES = \ - tests/copy_constant_to_storage_tests.cpp \ - tests/set_uniform_initializer_tests.cpp \ - tests/uniform_initializer_utils.cpp \ - tests/uniform_initializer_utils.h -tests_uniform_initializer_test_CFLAGS = \ +glsl_tests_uniform_initializer_test_SOURCES = \ + glsl/tests/copy_constant_to_storage_tests.cpp \ + glsl/tests/set_uniform_initializer_tests.cpp \ + glsl/tests/uniform_initializer_utils.cpp \ + glsl/tests/uniform_initializer_utils.h +glsl_tests_uniform_initializer_test_CFLAGS = \ $(PTHREAD_CFLAGS) -tests_uniform_initializer_test_LDADD = \ +glsl_tests_uniform_initializer_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ - $(top_builddir)/src/glsl/libglsl.la \ + glsl/libglsl.la \ $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) -tests_sampler_types_test_SOURCES = \ - tests/sampler_types_test.cpp -tests_sampler_types_test_CFLAGS = \ +glsl_tests_sampler_types_test_SOURCES = \ + glsl/tests/sampler_types_test.cpp +glsl_tests_sampler_types_test_CFLAGS = \ $(PTHREAD_CFLAGS) -tests_sampler_types_test_LDADD = \ +glsl_tests_sampler_types_test_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ - $(top_builddir)/src/glsl/libglsl.la \ + glsl/libglsl.la \ $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) -libglcpp_la_LIBADD = \ +noinst_LTLIBRARIES += glsl/libglsl.la glsl/libglcpp.la + +glsl_libglcpp_la_LIBADD = \ $(top_builddir)/src/util/libmesautil.la -libglcpp_la_SOURCES = \ - glcpp/glcpp-lex.c \ - glcpp/glcpp-parse.c \ - glcpp/glcpp-parse.h \ +glsl_libglcpp_la_SOURCES = \ + glsl/glcpp/glcpp-lex.c \ + glsl/glcpp/glcpp-parse.c \ + glsl/glcpp/glcpp-parse.h \ $(LIBGLCPP_FILES) -glcpp_glcpp_SOURCES = \ - glcpp/glcpp.c -glcpp_glcpp_LDADD = \ - libglcpp.la \ +glsl_glcpp_glcpp_SOURCES = \ + glsl/glcpp/glcpp.c +glsl_glcpp_glcpp_LDADD = \ + glsl/libglcpp.la \ $(top_builddir)/src/libglsl_util.la \ -lm -libglsl_la_LIBADD = \ - $(top_builddir)/src/compiler/nir/libnir.la \ - libglcpp.la +glsl_libglsl_la_LIBADD = \ + nir/libnir.la \ + glsl/libglcpp.la -libglsl_la_SOURCES = \ - glsl_lexer.cpp \ - glsl_parser.cpp \ - glsl_parser.h \ +glsl_libglsl_la_SOURCES = \ + glsl/glsl_lexer.cpp \ + glsl/glsl_parser.cpp \ + glsl/glsl_parser.h \ $(LIBGLSL_FILES) @@ -140,19 +125,19 @@ glsl_compiler_SOURCES = \ $(GLSL_COMPILER_CXX_FILES) glsl_compiler_LDADD = \ - libglsl.la \ + glsl/libglsl.la \ $(top_builddir)/src/libglsl_util.la \ $(top_builddir)/src/util/libmesautil.la \ $(PTHREAD_LIBS) -glsl_test_SOURCES = \ - standalone_scaffolding.cpp \ - test.cpp \ - test_optpass.cpp \ - test_optpass.h +glsl_glsl_test_SOURCES = \ + glsl/standalone_scaffolding.cpp \ + glsl/test.cpp \ + glsl/test_optpass.cpp \ + glsl/test_optpass.h -glsl_test_LDADD = \ - libglsl.la \ +glsl_glsl_test_LDADD = \ + glsl/libglsl.la \ $(top_builddir)/src/libglsl_util.la \ $(PTHREAD_LIBS) @@ -186,23 +171,24 @@ am__v_YACC_ = $(am__v_YACC_$(AM_DEFAULT_VERBOSITY)) am__v_YACC_0 = @echo " YACC " $@; am__v_YACC_1 = -MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D) YACC_GEN = $(AM_V_YACC)$(YACC) $(YFLAGS) LEX_GEN = $(AM_V_LEX)$(LEX) $(LFLAGS) -glsl_parser.cpp glsl_parser.h: glsl_parser.yy - $(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl_parser.h $(srcdir)/glsl_parser.yy +glsl/glsl_parser.cpp glsl/glsl_parser.h: glsl/glsl_parser.yy + $(MKDIR_GEN) + $(YACC_GEN) -o $@ -p "_mesa_glsl_" --defines=$(builddir)/glsl/glsl_parser.h $(srcdir)/glsl/glsl_parser.yy -glsl_lexer.cpp: glsl_lexer.ll - $(LEX_GEN) -o $@ $(srcdir)/glsl_lexer.ll +glsl/glsl_lexer.cpp: glsl/glsl_lexer.ll + $(MKDIR_GEN) + $(LEX_GEN) -o $@ $(srcdir)/glsl/glsl_lexer.ll -glcpp/glcpp-parse.c glcpp/glcpp-parse.h: glcpp/glcpp-parse.y +glsl/glcpp/glcpp-parse.c glsl/glcpp/glcpp-parse.h: glsl/glcpp/glcpp-parse.y $(MKDIR_GEN) - $(YACC_GEN) -o $@ -p "glcpp_parser_" --defines=$(builddir)/glcpp/glcpp-parse.h $(srcdir)/glcpp/glcpp-parse.y + $(YACC_GEN) -o $@ -p "glcpp_parser_" --defines=$(builddir)/glsl/glcpp/glcpp-parse.h $(srcdir)/glsl/glcpp/glcpp-parse.y -glcpp/glcpp-lex.c: glcpp/glcpp-lex.l +glsl/glcpp/glcpp-lex.c: glsl/glcpp/glcpp-lex.l $(MKDIR_GEN) - $(LEX_GEN) -o $@ $(srcdir)/glcpp/glcpp-lex.l + $(LEX_GEN) -o $@ $(srcdir)/glsl/glcpp/glcpp-lex.l # Only the parsers (specifically the header files generated at the same time) # need to be in BUILT_SOURCES. Though if we list the parser headers YACC is @@ -210,19 +196,22 @@ glcpp/glcpp-lex.c: glcpp/glcpp-lex.l # YACC is only executed once for each parser. The rest of the generated code # will be created at the appropriate times according to standard automake # dependency rules. -BUILT_SOURCES = \ - glsl_parser.cpp \ - glsl_lexer.cpp \ - glcpp/glcpp-parse.c \ - glcpp/glcpp-lex.c -CLEANFILES = \ - glcpp/glcpp-parse.h \ - glsl_parser.h \ - $(BUILT_SOURCES) +BUILT_SOURCES += \ + glsl/glsl_parser.cpp \ + glsl/glsl_lexer.cpp \ + glsl/glcpp/glcpp-parse.c \ + glsl/glcpp/glcpp-lex.c +CLEANFILES += \ + glsl/glcpp/glcpp-parse.h \ + glsl/glsl_parser.h \ + glsl/glsl_parser.cpp \ + glsl/glsl_lexer.cpp \ + glsl/glcpp/glcpp-parse.c \ + glsl/glcpp/glcpp-lex.c clean-local: $(RM) -r subtest-cr subtest-cr-lf subtest-lf subtest-lf-cr dist-hook: - $(RM) glcpp/tests/*.out - $(RM) glcpp/tests/subtest*/*.out + $(RM) glsl/glcpp/tests/*.out + $(RM) glsl/glcpp/tests/subtest*/*.out diff --git a/src/compiler/Makefile.nir.am b/src/compiler/Makefile.nir.am new file mode 100644 index 00000000000..45a9c809bf8 --- /dev/null +++ b/src/compiler/Makefile.nir.am @@ -0,0 +1,94 @@ +# +# Copyright © 2012 Jon TURNEY +# Copyright (C) 2015 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +noinst_LTLIBRARIES += nir/libnir.la + +nir_libnir_la_CPPFLAGS = \ + $(AM_CPPFLAGS) \ + -I$(top_builddir)/src/compiler/nir \ + -I$(top_srcdir)/src/compiler/nir + +nir_libnir_la_LIBADD = \ + libcompiler.la + +nir_libnir_la_SOURCES = \ + $(NIR_FILES) \ + $(SPIRV_FILES) \ + $(NIR_GENERATED_FILES) + +PYTHON_GEN = $(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) + +nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py + $(MKDIR_GEN) + $(PYTHON_GEN) $(srcdir)/nir/nir_builder_opcodes_h.py > $@ || ($(RM) $@; false) + +nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py + $(MKDIR_GEN) + $(PYTHON_GEN) $(srcdir)/nir/nir_constant_expressions.py > $@ || ($(RM) $@; false) + +nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py + $(MKDIR_GEN) + $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_h.py > $@ || ($(RM) $@; false) + +nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py + $(MKDIR_GEN) + $(PYTHON_GEN) $(srcdir)/nir/nir_opcodes_c.py > $@ || ($(RM) $@; false) + +nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py + $(MKDIR_GEN) + $(PYTHON_GEN) $(srcdir)/nir/nir_opt_algebraic.py > $@ || ($(RM) $@; false) + + +check_PROGRAMS += nir/tests/control_flow_tests + +nir_tests_control_flow_tests_CPPFLAGS = \ + $(AM_CPPFLAGS) \ + -I$(top_builddir)/src/compiler/nir \ + -I$(top_srcdir)/src/compiler/nir + +nir_tests_control_flow_tests_SOURCES = \ + nir/tests/control_flow_tests.cpp +nir_tests_control_flow_tests_CFLAGS = \ + $(PTHREAD_CFLAGS) +nir_tests_control_flow_tests_LDADD = \ + $(top_builddir)/src/gtest/libgtest.la \ + nir/libnir.la \ + $(top_builddir)/src/util/libmesautil.la \ + $(PTHREAD_LIBS) + + +TESTS += nir/tests/control_flow_tests + + +BUILT_SOURCES += $(NIR_GENERATED_FILES) +CLEANFILES += $(NIR_GENERATED_FILES) + +EXTRA_DIST += \ + nir/nir_algebraic.py \ + nir/nir_builder_opcodes_h.py \ + nir/nir_constant_expressions.py \ + nir/nir_opcodes.py \ + nir/nir_opcodes_c.py \ + nir/nir_opcodes_h.py \ + nir/nir_opt_algebraic.py \ + nir/tests diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 120ef2935a7..adc7a428469 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -187,6 +187,7 @@ NIR_FILES = \ nir/nir_lower_alu_to_scalar.c \ nir/nir_lower_atomics.c \ nir/nir_lower_clip.c \ + nir/nir_lower_double_packing.c \ nir/nir_lower_global_vars_to_local.c \ nir/nir_lower_gs_intrinsics.c \ nir/nir_lower_load_const_to_scalar.c \ diff --git a/src/compiler/SConscript b/src/compiler/SConscript index 8d71b82bee0..8969d821984 100644 --- a/src/compiler/SConscript +++ b/src/compiler/SConscript @@ -21,4 +21,4 @@ compiler = env.ConvenienceLibrary( ) Export('compiler') -SConscript('glsl/SConscript') +SConscript('SConscript.glsl') diff --git a/src/compiler/glsl/SConscript b/src/compiler/SConscript.glsl index ef82a9d317a..43a11d105d4 100644 --- a/src/compiler/glsl/SConscript +++ b/src/compiler/SConscript.glsl @@ -15,14 +15,14 @@ env.Prepend(CPPPATH = [ '#src/mesa', '#src/gallium/include', '#src/gallium/auxiliary', - '#src/glsl', - '#src/glsl/glcpp', + '#src/compiler/glsl', + '#src/compiler/glsl/glcpp', ]) env.Prepend(LIBS = [mesautil]) # Make glcpp-parse.h and glsl_parser.h reachable from the include path. -env.Append(CPPPATH = [Dir('.').abspath, Dir('glcpp').abspath]) +env.Prepend(CPPPATH = [Dir('.').abspath, Dir('glsl').abspath]) glcpp_env = env.Clone() glcpp_env.Append(YACCFLAGS = [ @@ -32,7 +32,7 @@ glcpp_env.Append(YACCFLAGS = [ glsl_env = env.Clone() glsl_env.Append(YACCFLAGS = [ - '--defines=%s' % File('glsl_parser.h').abspath, + '--defines=%s' % File('glsl/glsl_parser.h').abspath, '-p', '_mesa_glsl_', ]) @@ -40,10 +40,10 @@ glsl_env.Append(YACCFLAGS = [ # "glsl_parser.h", causing glsl_parser.cpp to be regenerated every time glsl_env['YACCHXXFILESUFFIX'] = '.h' -glcpp_lexer = glcpp_env.CFile('glcpp/glcpp-lex.c', 'glcpp/glcpp-lex.l') -glcpp_parser = glcpp_env.CFile('glcpp/glcpp-parse.c', 'glcpp/glcpp-parse.y') -glsl_lexer = glsl_env.CXXFile('glsl_lexer.cpp', 'glsl_lexer.ll') -glsl_parser = glsl_env.CXXFile('glsl_parser.cpp', 'glsl_parser.yy') +glcpp_lexer = glcpp_env.CFile('glsl/glcpp/glcpp-lex.c', 'glsl/glcpp/glcpp-lex.l') +glcpp_parser = glcpp_env.CFile('glsl/glcpp/glcpp-parse.c', 'glsl/glcpp/glcpp-parse.y') +glsl_lexer = glsl_env.CXXFile('glsl/glsl_lexer.cpp', 'glsl/glsl_lexer.ll') +glsl_parser = glsl_env.CXXFile('glsl/glsl_parser.cpp', 'glsl/glsl_parser.yy') # common generated sources glsl_sources = [ @@ -66,20 +66,20 @@ if env['msvc']: # Copy these files to avoid generation object files into src/mesa/program env.Prepend(CPPPATH = ['#src/mesa/main']) -env.Command('imports.c', '#src/mesa/main/imports.c', Copy('$TARGET', '$SOURCE')) +env.Command('glsl/imports.c', '#src/mesa/main/imports.c', Copy('$TARGET', '$SOURCE')) # Copy these files to avoid generation object files into src/mesa/program env.Prepend(CPPPATH = ['#src/mesa/program']) -env.Command('prog_hash_table.c', '#src/mesa/program/prog_hash_table.c', Copy('$TARGET', '$SOURCE')) -env.Command('symbol_table.c', '#src/mesa/program/symbol_table.c', Copy('$TARGET', '$SOURCE')) -env.Command('dummy_errors.c', '#src/mesa/program/dummy_errors.c', Copy('$TARGET', '$SOURCE')) +env.Command('glsl/prog_hash_table.c', '#src/mesa/program/prog_hash_table.c', Copy('$TARGET', '$SOURCE')) +env.Command('glsl/symbol_table.c', '#src/mesa/program/symbol_table.c', Copy('$TARGET', '$SOURCE')) +env.Command('glsl/dummy_errors.c', '#src/mesa/program/dummy_errors.c', Copy('$TARGET', '$SOURCE')) compiler_objs = env.StaticObject(source_lists['GLSL_COMPILER_CXX_FILES']) mesa_objs = env.StaticObject([ - 'imports.c', - 'prog_hash_table.c', - 'symbol_table.c', - 'dummy_errors.c', + 'glsl/imports.c', + 'glsl/prog_hash_table.c', + 'glsl/symbol_table.c', + 'glsl/dummy_errors.c', ]) compiler_objs += mesa_objs @@ -116,7 +116,7 @@ glsl_compiler = env.Program( env.Alias('glsl_compiler', glsl_compiler) glcpp = env.Program( - target = 'glcpp/glcpp', - source = ['glcpp/glcpp.c'] + mesa_objs, + target = 'glsl/glcpp/glcpp', + source = ['glsl/glcpp/glcpp.c'] + mesa_objs, ) env.Alias('glcpp', glcpp) diff --git a/src/compiler/glsl/Makefile.sources b/src/compiler/glsl/Makefile.sources deleted file mode 100644 index 538196a79a9..00000000000 --- a/src/compiler/glsl/Makefile.sources +++ /dev/null @@ -1,223 +0,0 @@ -# shared source lists for Makefile, SConscript, and Android.mk - -# libglcpp - -LIBGLCPP_FILES = \ - glcpp/glcpp.h \ - glcpp/pp.c - -LIBGLCPP_GENERATED_FILES = \ - glcpp/glcpp-lex.c \ - glcpp/glcpp-parse.c - -NIR_GENERATED_FILES = \ - nir/nir_builder_opcodes.h \ - nir/nir_constant_expressions.c \ - nir/nir_opcodes.c \ - nir/nir_opcodes.h \ - nir/nir_opt_algebraic.c - -NIR_FILES = \ - nir/nir.c \ - nir/nir.h \ - nir/nir_array.h \ - nir/nir_builder.h \ - nir/nir_clone.c \ - nir/nir_constant_expressions.h \ - nir/nir_control_flow.c \ - nir/nir_control_flow.h \ - nir/nir_control_flow_private.h \ - nir/nir_dominance.c \ - nir/nir_from_ssa.c \ - nir/nir_gs_count_vertices.c \ - nir/nir_intrinsics.c \ - nir/nir_intrinsics.h \ - nir/nir_instr_set.c \ - nir/nir_instr_set.h \ - nir/nir_liveness.c \ - nir/nir_lower_alu_to_scalar.c \ - nir/nir_lower_atomics.c \ - nir/nir_lower_clip.c \ - nir/nir_lower_global_vars_to_local.c \ - nir/nir_lower_gs_intrinsics.c \ - nir/nir_lower_load_const_to_scalar.c \ - nir/nir_lower_locals_to_regs.c \ - nir/nir_lower_idiv.c \ - nir/nir_lower_io.c \ - nir/nir_lower_outputs_to_temporaries.c \ - nir/nir_lower_phis_to_scalar.c \ - nir/nir_lower_samplers.c \ - nir/nir_lower_system_values.c \ - nir/nir_lower_tex.c \ - nir/nir_lower_to_source_mods.c \ - nir/nir_lower_two_sided_color.c \ - nir/nir_lower_vars_to_ssa.c \ - nir/nir_lower_var_copies.c \ - nir/nir_lower_vec_to_movs.c \ - nir/nir_metadata.c \ - nir/nir_move_vec_src_uses_to_dest.c \ - nir/nir_normalize_cubemap_coords.c \ - nir/nir_opt_constant_folding.c \ - nir/nir_opt_copy_propagate.c \ - nir/nir_opt_cse.c \ - nir/nir_opt_dce.c \ - nir/nir_opt_dead_cf.c \ - nir/nir_opt_gcm.c \ - nir/nir_opt_global_to_local.c \ - nir/nir_opt_peephole_select.c \ - nir/nir_opt_remove_phis.c \ - nir/nir_opt_undef.c \ - nir/nir_print.c \ - nir/nir_remove_dead_variables.c \ - nir/nir_search.c \ - nir/nir_search.h \ - nir/nir_split_var_copies.c \ - nir/nir_sweep.c \ - nir/nir_to_ssa.c \ - nir/nir_validate.c \ - nir/nir_vla.h \ - nir/nir_worklist.c \ - nir/nir_worklist.h - -# libglsl - -LIBGLSL_FILES = \ - ast.h \ - ast_array_index.cpp \ - ast_expr.cpp \ - ast_function.cpp \ - ast_to_hir.cpp \ - ast_type.cpp \ - blob.c \ - blob.h \ - builtin_functions.cpp \ - builtin_types.cpp \ - builtin_variables.cpp \ - glsl_parser_extras.cpp \ - glsl_parser_extras.h \ - glsl_symbol_table.cpp \ - glsl_symbol_table.h \ - hir_field_selection.cpp \ - ir_basic_block.cpp \ - ir_basic_block.h \ - ir_builder.cpp \ - ir_builder.h \ - ir_clone.cpp \ - ir_constant_expression.cpp \ - ir.cpp \ - ir.h \ - ir_equals.cpp \ - ir_expression_flattening.cpp \ - ir_expression_flattening.h \ - ir_function_can_inline.cpp \ - ir_function_detect_recursion.cpp \ - ir_function_inlining.h \ - ir_function.cpp \ - ir_hierarchical_visitor.cpp \ - ir_hierarchical_visitor.h \ - ir_hv_accept.cpp \ - ir_import_prototypes.cpp \ - ir_optimization.h \ - ir_print_visitor.cpp \ - ir_print_visitor.h \ - ir_reader.cpp \ - ir_reader.h \ - ir_rvalue_visitor.cpp \ - ir_rvalue_visitor.h \ - ir_set_program_inouts.cpp \ - ir_uniform.h \ - ir_validate.cpp \ - ir_variable_refcount.cpp \ - ir_variable_refcount.h \ - ir_visitor.h \ - linker.cpp \ - linker.h \ - link_atomics.cpp \ - link_functions.cpp \ - link_interface_blocks.cpp \ - link_uniforms.cpp \ - link_uniform_initializers.cpp \ - link_uniform_block_active_visitor.cpp \ - link_uniform_block_active_visitor.h \ - link_uniform_blocks.cpp \ - link_varyings.cpp \ - link_varyings.h \ - list.h \ - loop_analysis.cpp \ - loop_analysis.h \ - loop_controls.cpp \ - loop_unroll.cpp \ - lower_buffer_access.cpp \ - lower_buffer_access.h \ - lower_clip_distance.cpp \ - lower_const_arrays_to_uniforms.cpp \ - lower_discard.cpp \ - lower_discard_flow.cpp \ - lower_if_to_cond_assign.cpp \ - lower_instructions.cpp \ - lower_jumps.cpp \ - lower_mat_op_to_vec.cpp \ - lower_noise.cpp \ - lower_offset_array.cpp \ - lower_packed_varyings.cpp \ - lower_named_interface_blocks.cpp \ - lower_packing_builtins.cpp \ - lower_subroutine.cpp \ - lower_tess_level.cpp \ - lower_texture_projection.cpp \ - lower_variable_index_to_cond_assign.cpp \ - lower_vec_index_to_cond_assign.cpp \ - lower_vec_index_to_swizzle.cpp \ - lower_vector.cpp \ - lower_vector_derefs.cpp \ - lower_vector_insert.cpp \ - lower_vertex_id.cpp \ - lower_output_reads.cpp \ - lower_shared_reference.cpp \ - lower_ubo_reference.cpp \ - opt_algebraic.cpp \ - opt_array_splitting.cpp \ - opt_conditional_discard.cpp \ - opt_constant_folding.cpp \ - opt_constant_propagation.cpp \ - opt_constant_variable.cpp \ - opt_copy_propagation.cpp \ - opt_copy_propagation_elements.cpp \ - opt_dead_builtin_variables.cpp \ - opt_dead_builtin_varyings.cpp \ - opt_dead_code.cpp \ - opt_dead_code_local.cpp \ - opt_dead_functions.cpp \ - opt_flatten_nested_if_blocks.cpp \ - opt_flip_matrices.cpp \ - opt_function_inlining.cpp \ - opt_if_simplification.cpp \ - opt_minmax.cpp \ - opt_noop_swizzle.cpp \ - opt_rebalance_tree.cpp \ - opt_redundant_jumps.cpp \ - opt_structure_splitting.cpp \ - opt_swizzle_swizzle.cpp \ - opt_tree_grafting.cpp \ - opt_vectorize.cpp \ - program.h \ - propagate_invariance.cpp \ - s_expression.cpp \ - s_expression.h - -# glsl to nir pass -GLSL_TO_NIR_FILES = \ - nir/glsl_to_nir.cpp \ - nir/glsl_to_nir.h - -# glsl_compiler - -GLSL_COMPILER_CXX_FILES = \ - standalone_scaffolding.cpp \ - standalone_scaffolding.h \ - main.cpp - -# libglsl generated sources -LIBGLSL_GENERATED_CXX_FILES = \ - glsl_lexer.cpp \ - glsl_parser.cpp diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h index 7436edce88a..92aa39e64b8 100644 --- a/src/compiler/glsl/ast.h +++ b/src/compiler/glsl/ast.h @@ -736,6 +736,11 @@ struct ast_type_qualifier { const ast_type_qualifier &q, ast_node* &node, bool create_node); + bool validate_flags(YYLTYPE *loc, + _mesa_glsl_parse_state *state, + const char *message, + const ast_type_qualifier &allowed_flags); + ast_subroutine_list *subroutine_list; }; diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 7c9be8171b6..82eb22a82c6 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -4300,6 +4300,17 @@ ast_declarator_list::hir(exec_list *instructions, state->atomic_counter_offsets[qual_binding] = qual_offset; } } + + ast_type_qualifier allowed_atomic_qual_mask; + allowed_atomic_qual_mask.flags.i = 0; + allowed_atomic_qual_mask.flags.q.explicit_binding = 1; + allowed_atomic_qual_mask.flags.q.explicit_offset = 1; + allowed_atomic_qual_mask.flags.q.uniform = 1; + + type->qualifier.validate_flags(&loc, state, + "invalid layout qualifier for " + "atomic_uint", + allowed_atomic_qual_mask); } if (this->declarations.is_empty()) { diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp index c3d38cbbf8a..7a0014b5d7f 100644 --- a/src/compiler/glsl/ast_type.cpp +++ b/src/compiler/glsl/ast_type.cpp @@ -581,6 +581,91 @@ ast_type_qualifier::merge_in_qualifier(YYLTYPE *loc, return true; } +/** + * Check if the current type qualifier has any illegal flags. + * + * If so, print an error message, followed by a list of illegal flags. + * + * \param message The error message to print. + * \param allowed_flags A list of valid flags. + */ +bool +ast_type_qualifier::validate_flags(YYLTYPE *loc, + _mesa_glsl_parse_state *state, + const char *message, + const ast_type_qualifier &allowed_flags) +{ + ast_type_qualifier bad; + bad.flags.i = this->flags.i & ~allowed_flags.flags.i; + if (bad.flags.i == 0) + return true; + + _mesa_glsl_error(loc, state, + "%s:" + "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s" + "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s" + "%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + message, + bad.flags.q.invariant ? " invariant" : "", + bad.flags.q.precise ? " precise" : "", + bad.flags.q.constant ? " constant" : "", + bad.flags.q.attribute ? " attribute" : "", + bad.flags.q.varying ? " varying" : "", + bad.flags.q.in ? " in" : "", + bad.flags.q.out ? " out" : "", + bad.flags.q.centroid ? " centroid" : "", + bad.flags.q.sample ? " sample" : "", + bad.flags.q.patch ? " patch" : "", + bad.flags.q.uniform ? " uniform" : "", + bad.flags.q.buffer ? " buffer" : "", + bad.flags.q.shared_storage ? " shared_storage" : "", + bad.flags.q.smooth ? " smooth" : "", + bad.flags.q.flat ? " flat" : "", + bad.flags.q.noperspective ? " noperspective" : "", + bad.flags.q.origin_upper_left ? " origin_upper_left" : "", + bad.flags.q.pixel_center_integer ? " pixel_center_integer" : "", + bad.flags.q.explicit_align ? " align" : "", + bad.flags.q.explicit_location ? " location" : "", + bad.flags.q.explicit_index ? " index" : "", + bad.flags.q.explicit_binding ? " binding" : "", + bad.flags.q.explicit_offset ? " offset" : "", + bad.flags.q.depth_any ? " depth_any" : "", + bad.flags.q.depth_greater ? " depth_greater" : "", + bad.flags.q.depth_less ? " depth_less" : "", + bad.flags.q.depth_unchanged ? " depth_unchanged" : "", + bad.flags.q.std140 ? " std140" : "", + bad.flags.q.std430 ? " std430" : "", + bad.flags.q.shared ? " shared" : "", + bad.flags.q.packed ? " packed" : "", + bad.flags.q.column_major ? " column_major" : "", + bad.flags.q.row_major ? " row_major" : "", + bad.flags.q.prim_type ? " prim_type" : "", + bad.flags.q.max_vertices ? " max_vertices" : "", + bad.flags.q.local_size ? " local_size" : "", + bad.flags.q.early_fragment_tests ? " early_fragment_tests" : "", + bad.flags.q.explicit_image_format ? " image_format" : "", + bad.flags.q.coherent ? " coherent" : "", + bad.flags.q._volatile ? " _volatile" : "", + bad.flags.q.restrict_flag ? " restrict_flag" : "", + bad.flags.q.read_only ? " read_only" : "", + bad.flags.q.write_only ? " write_only" : "", + bad.flags.q.invocations ? " invocations" : "", + bad.flags.q.stream ? " stream" : "", + bad.flags.q.explicit_stream ? " stream" : "", + bad.flags.q.explicit_xfb_offset ? " xfb_offset" : "", + bad.flags.q.xfb_buffer ? " xfb_buffer" : "", + bad.flags.q.explicit_xfb_buffer ? " xfb_buffer" : "", + bad.flags.q.xfb_stride ? " xfb_stride" : "", + bad.flags.q.explicit_xfb_stride ? " xfb_stride" : "", + bad.flags.q.vertex_spacing ? " vertex_spacing" : "", + bad.flags.q.ordering ? " ordering" : "", + bad.flags.q.point_mode ? " point_mode" : "", + bad.flags.q.vertices ? " vertices" : "", + bad.flags.q.subroutine ? " subroutine" : "", + bad.flags.q.subroutine_def ? " subroutine_def" : ""); + return false; +} + bool ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state *state, const char *qual_indentifier, diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll index 0b7695f8d3e..6b1ef1717e5 100644 --- a/src/compiler/glsl/glsl_lexer.ll +++ b/src/compiler/glsl/glsl_lexer.ll @@ -304,7 +304,7 @@ in return IN_TOK; out return OUT_TOK; inout return INOUT_TOK; uniform return UNIFORM; -buffer return BUFFER; +buffer KEYWORD_WITH_ALT(0, 0, 430, 310, yyextra->ARB_shader_storage_buffer_object_enable, BUFFER); varying DEPRECATED_ES_KEYWORD(VARYING); centroid KEYWORD(120, 300, 120, 300, CENTROID); invariant KEYWORD(120, 100, 120, 100, INVARIANT); diff --git a/src/compiler/glsl/ir_set_program_inouts.cpp b/src/compiler/glsl/ir_set_program_inouts.cpp index df06923b870..6768d82f338 100644 --- a/src/compiler/glsl/ir_set_program_inouts.cpp +++ b/src/compiler/glsl/ir_set_program_inouts.cpp @@ -149,7 +149,7 @@ void ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var) { const glsl_type *type = var->type; - bool vertex_input = false; + bool is_vertex_input = false; if (this->shader_stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in && type->is_array()) { type = type->fields.array; @@ -175,9 +175,9 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var) if (this->shader_stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) - vertex_input = true; + is_vertex_input = true; - mark(this->prog, var, 0, type->count_attribute_slots(vertex_input), + mark(this->prog, var, 0, type->count_attribute_slots(is_vertex_input), this->shader_stage); } diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index e9d0067459a..87606be9337 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -488,7 +488,7 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, * its value is used by other shader stages. This will cause the * variable to have a location assigned. */ - if (var->data.is_unmatched_generic_inout) { + if (var->data.is_unmatched_generic_inout && !var->data.is_xfb_only) { assert(var->data.mode != ir_var_temporary); var->data.mode = ir_var_auto; } diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index 957efe5b55d..dcc8a57b6be 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -2618,7 +2618,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog, return false; } - const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX ? true : false); + const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX); /* If the variable is not a built-in and has a location statically * assigned in the shader (presumably via a layout qualifier), make sure @@ -3249,12 +3249,12 @@ reserve_subroutine_explicit_locations(struct gl_shader_program *prog, * any optimizations happen to handle also inactive uniforms and * inactive array elements that may get trimmed away. */ -static int +static unsigned check_explicit_uniform_locations(struct gl_context *ctx, struct gl_shader_program *prog) { if (!ctx->Extensions.ARB_explicit_uniform_location) - return -1; + return 0; /* This map is used to detect if overlapping explicit locations * occur with the same uniform (from different stage) or a different one. @@ -3263,7 +3263,7 @@ check_explicit_uniform_locations(struct gl_context *ctx, if (!uniform_map) { linker_error(prog, "Out of memory during linking.\n"); - return -1; + return 0; } unsigned entries_total = 0; @@ -3292,7 +3292,7 @@ check_explicit_uniform_locations(struct gl_context *ctx, } if (!ret) { delete uniform_map; - return -1; + return 0; } } } @@ -3518,8 +3518,9 @@ build_stageref(struct gl_shader_program *shProg, const char *name, */ static gl_shader_variable * create_shader_variable(struct gl_shader_program *shProg, - const ir_variable *in, bool use_implicit_location, - int location_bias) + const ir_variable *in, + const char *name, const glsl_type *type, + bool use_implicit_location, int location) { gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable); if (!out) @@ -3532,7 +3533,7 @@ create_shader_variable(struct gl_shader_program *shProg, in->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) { out->name = ralloc_strdup(shProg, "gl_VertexID"); } else { - out->name = ralloc_strdup(shProg, in->name); + out->name = ralloc_strdup(shProg, name); } if (!out->name) @@ -3557,10 +3558,10 @@ create_shader_variable(struct gl_shader_program *shProg, !(in->data.explicit_location || use_implicit_location)) { out->location = -1; } else { - out->location = in->data.location - location_bias; + out->location = location; } - out->type = in->type; + out->type = type; out->index = in->data.index; out->patch = in->data.patch; out->mode = in->data.mode; @@ -3569,6 +3570,60 @@ create_shader_variable(struct gl_shader_program *shProg, } static bool +add_shader_variable(struct gl_shader_program *shProg, unsigned stage_mask, + GLenum programInterface, ir_variable *var, + const char *name, const glsl_type *type, + bool use_implicit_location, int location) +{ + const bool is_vertex_input = + programInterface == GL_PROGRAM_INPUT && + stage_mask == MESA_SHADER_VERTEX; + + switch (type->base_type) { + case GLSL_TYPE_STRUCT: { + /* From the ARB_program_interface_query specification: + * + * "For an active variable declared as a structure, a separate entry + * will be generated for each active structure member. The name of + * each entry is formed by concatenating the name of the structure, + * the "." character, and the name of the structure member. If a + * structure member to enumerate is itself a structure or array, these + * enumeration rules are applied recursively." + */ + unsigned field_location = location; + for (unsigned i = 0; i < type->length; i++) { + const struct glsl_struct_field *field = &type->fields.structure[i]; + char *field_name = ralloc_asprintf(shProg, "%s.%s", name, field->name); + if (!add_shader_variable(shProg, stage_mask, programInterface, + var, field_name, field->type, + use_implicit_location, field_location)) + return false; + + field_location += + field->type->count_attribute_slots(is_vertex_input); + } + return true; + } + + default: { + /* From the ARB_program_interface_query specification: + * + * "For an active variable declared as a single instance of a basic + * type, a single entry will be generated, using the variable name + * from the shader source." + */ + gl_shader_variable *sha_v = + create_shader_variable(shProg, var, name, type, + use_implicit_location, location); + if (!sha_v) + return false; + + return add_program_resource(shProg, programInterface, sha_v, stage_mask); + } + } +} + +static bool add_interface_variables(struct gl_shader_program *shProg, unsigned stage, GLenum programInterface) { @@ -3616,12 +3671,9 @@ add_interface_variables(struct gl_shader_program *shProg, (stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) || (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out); - gl_shader_variable *sha_v = - create_shader_variable(shProg, var, vs_input_or_fs_output, loc_bias); - if (!sha_v) - return false; - - if (!add_program_resource(shProg, programInterface, sha_v, 1 << stage)) + if (!add_shader_variable(shProg, 1 << stage, programInterface, + var, var->name, var->type, vs_input_or_fs_output, + var->data.location - loc_bias)) return false; } return true; @@ -3651,13 +3703,11 @@ add_packed_varyings(struct gl_shader_program *shProg, int stage, GLenum type) } if (type == iface) { - gl_shader_variable *sha_v = - create_shader_variable(shProg, var, false, VARYING_SLOT_VAR0); - if (!sha_v) - return false; - if (!add_program_resource(shProg, iface, sha_v, - build_stageref(shProg, sha_v->name, - sha_v->mode))) + const int stage_mask = + build_stageref(shProg, var->name, var->data.mode); + if (!add_shader_variable(shProg, stage_mask, + iface, var, var->name, var->type, false, + var->data.location - VARYING_SLOT_VAR0)) return false; } } @@ -3677,12 +3727,11 @@ add_fragdata_arrays(struct gl_shader_program *shProg) ir_variable *var = node->as_variable(); if (var) { assert(var->data.mode == ir_var_shader_out); - gl_shader_variable *sha_v = - create_shader_variable(shProg, var, true, FRAG_RESULT_DATA0); - if (!sha_v) - return false; - if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v, - 1 << MESA_SHADER_FRAGMENT)) + + if (!add_shader_variable(shProg, + 1 << MESA_SHADER_FRAGMENT, + GL_PROGRAM_OUTPUT, var, var->name, var->type, + true, var->data.location - FRAG_RESULT_DATA0)) return false; } } diff --git a/src/compiler/glsl/opt_constant_propagation.cpp b/src/compiler/glsl/opt_constant_propagation.cpp index 416ba16a3c5..4764d16de6d 100644 --- a/src/compiler/glsl/opt_constant_propagation.cpp +++ b/src/compiler/glsl/opt_constant_propagation.cpp @@ -122,7 +122,7 @@ public: exec_list *acp; /** - * List of kill_entry: The masks of variables whose values were + * Hash table of kill_entry: The masks of variables whose values were * killed in this block. */ hash_table *kills; @@ -454,7 +454,7 @@ ir_constant_propagation_visitor::kill(ir_variable *var, unsigned write_mask) } } - /* Add this writemask of the variable to the list of killed + /* Add this writemask of the variable to the hash table of killed * variables in this block. */ hash_entry *kill_hash_entry = _mesa_hash_table_search(this->kills, var); @@ -463,7 +463,7 @@ ir_constant_propagation_visitor::kill(ir_variable *var, unsigned write_mask) entry->write_mask |= write_mask; return; } - /* Not already in the list. Make new entry. */ + /* Not already in the hash table. Make new entry. */ _mesa_hash_table_insert(this->kills, var, new(this->mem_ctx) kill_entry(var, write_mask)); } diff --git a/src/compiler/glsl/opt_copy_propagation.cpp b/src/compiler/glsl/opt_copy_propagation.cpp index 310708db868..ae62921a0df 100644 --- a/src/compiler/glsl/opt_copy_propagation.cpp +++ b/src/compiler/glsl/opt_copy_propagation.cpp @@ -331,7 +331,8 @@ ir_copy_propagation_visitor::add_copy(ir_assignment *ir) ir->condition = new(ralloc_parent(ir)) ir_constant(false); this->progress = true; } else if (lhs_var->data.mode != ir_var_shader_storage && - lhs_var->data.mode != ir_var_shader_shared) { + lhs_var->data.mode != ir_var_shader_shared && + lhs_var->data.precise == rhs_var->data.precise) { entry = new(this->acp) acp_entry(lhs_var, rhs_var); this->acp->push_tail(entry); } diff --git a/src/compiler/glsl/opt_copy_propagation_elements.cpp b/src/compiler/glsl/opt_copy_propagation_elements.cpp index a6791801943..e9e7c53505c 100644 --- a/src/compiler/glsl/opt_copy_propagation_elements.cpp +++ b/src/compiler/glsl/opt_copy_propagation_elements.cpp @@ -493,6 +493,9 @@ ir_copy_propagation_elements_visitor::add_copy(ir_assignment *ir) } } + if (lhs->var->data.precise != rhs->var->data.precise) + return; + entry = new(this->mem_ctx) acp_entry(lhs->var, rhs->var, write_mask, swizzle); this->acp->push_tail(entry); diff --git a/src/compiler/glsl/opt_tree_grafting.cpp b/src/compiler/glsl/opt_tree_grafting.cpp index 812f996fb81..a40e5f71609 100644 --- a/src/compiler/glsl/opt_tree_grafting.cpp +++ b/src/compiler/glsl/opt_tree_grafting.cpp @@ -368,6 +368,9 @@ tree_grafting_basic_block(ir_instruction *bb_first, lhs_var->data.mode == ir_var_shader_shared) continue; + if (lhs_var->data.precise) + continue; + ir_variable_refcount_entry *entry = info->refs->get_variable_entry(lhs_var); if (!entry->declaration || diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp index 39585bff3b9..c058283c48d 100644 --- a/src/compiler/glsl_types.cpp +++ b/src/compiler/glsl_types.cpp @@ -1897,7 +1897,7 @@ glsl_type::std430_size(bool row_major) const } unsigned -glsl_type::count_attribute_slots(bool vertex_input_slots) const +glsl_type::count_attribute_slots(bool is_vertex_input) const { /* From page 31 (page 37 of the PDF) of the GLSL 1.50 spec: * @@ -1931,7 +1931,7 @@ glsl_type::count_attribute_slots(bool vertex_input_slots) const case GLSL_TYPE_BOOL: return this->matrix_columns; case GLSL_TYPE_DOUBLE: - if (this->vector_elements > 2 && !vertex_input_slots) + if (this->vector_elements > 2 && !is_vertex_input) return this->matrix_columns * 2; else return this->matrix_columns; @@ -1940,13 +1940,13 @@ glsl_type::count_attribute_slots(bool vertex_input_slots) const unsigned size = 0; for (unsigned i = 0; i < this->length; i++) - size += this->fields.structure[i].type->count_attribute_slots(vertex_input_slots); + size += this->fields.structure[i].type->count_attribute_slots(is_vertex_input); return size; } case GLSL_TYPE_ARRAY: - return this->length * this->fields.array->count_attribute_slots(vertex_input_slots); + return this->length * this->fields.array->count_attribute_slots(is_vertex_input); case GLSL_TYPE_FUNCTION: case GLSL_TYPE_SAMPLER: diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index dd46479755a..a47b0ffe5a2 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -344,7 +344,7 @@ struct glsl_type { * For vertex shader attributes - doubles only take one slot. * For inter-shader varyings - dvec3/dvec4 take two slots. */ - unsigned count_attribute_slots(bool vertex_input_slots) const; + unsigned count_attribute_slots(bool is_vertex_input) const; /** * Alignment in bytes of the start of this type in a std140 uniform diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources deleted file mode 100644 index e6367d9c282..00000000000 --- a/src/compiler/nir/Makefile.sources +++ /dev/null @@ -1,87 +0,0 @@ -NIR_GENERATED_FILES = \ - nir_builder_opcodes.h \ - nir_constant_expressions.c \ - nir_opcodes.c \ - nir_opcodes.h \ - nir_opt_algebraic.c - -NIR_FILES = \ - glsl_to_nir.cpp \ - glsl_to_nir.h \ - nir.c \ - nir.h \ - nir_array.h \ - nir_builder.h \ - nir_clone.c \ - nir_constant_expressions.h \ - nir_control_flow.c \ - nir_control_flow.h \ - nir_control_flow_private.h \ - nir_dominance.c \ - nir_from_ssa.c \ - nir_gather_info.c \ - nir_gs_count_vertices.c \ - nir_inline_functions.c \ - nir_instr_set.c \ - nir_instr_set.h \ - nir_intrinsics.c \ - nir_intrinsics.h \ - nir_liveness.c \ - nir_lower_alu_to_scalar.c \ - nir_lower_atomics.c \ - nir_lower_clip.c \ - nir_lower_global_vars_to_local.c \ - nir_lower_gs_intrinsics.c \ - nir_lower_load_const_to_scalar.c \ - nir_lower_locals_to_regs.c \ - nir_lower_idiv.c \ - nir_lower_indirect_derefs.c \ - nir_lower_io.c \ - nir_lower_outputs_to_temporaries.c \ - nir_lower_phis_to_scalar.c \ - nir_lower_returns.c \ - nir_lower_samplers.c \ - nir_lower_system_values.c \ - nir_lower_tex.c \ - nir_lower_to_source_mods.c \ - nir_lower_two_sided_color.c \ - nir_lower_vars_to_ssa.c \ - nir_lower_var_copies.c \ - nir_lower_vec_to_movs.c \ - nir_metadata.c \ - nir_move_vec_src_uses_to_dest.c \ - nir_normalize_cubemap_coords.c \ - nir_opt_constant_folding.c \ - nir_opt_copy_propagate.c \ - nir_opt_cse.c \ - nir_opt_dce.c \ - nir_opt_dead_cf.c \ - nir_opt_gcm.c \ - nir_opt_global_to_local.c \ - nir_opt_peephole_select.c \ - nir_opt_remove_phis.c \ - nir_opt_undef.c \ - nir_phi_builder.c \ - nir_phi_builder.h \ - nir_print.c \ - nir_remove_dead_variables.c \ - nir_repair_ssa.c \ - nir_search.c \ - nir_search.h \ - nir_split_var_copies.c \ - nir_sweep.c \ - nir_to_ssa.c \ - nir_validate.c \ - nir_vla.h \ - nir_worklist.c \ - nir_worklist.h - -SPIRV_FILES = \ - spirv/nir_spirv.h \ - spirv/spirv_to_nir.c \ - spirv/vtn_alu.c \ - spirv/vtn_cfg.c \ - spirv/vtn_glsl450.c \ - spirv/vtn_private.h \ - spirv/vtn_variables.c - diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp index 14affeee8ac..d4c58a9ba2e 100644 --- a/src/compiler/nir/glsl_to_nir.cpp +++ b/src/compiler/nir/glsl_to_nir.cpp @@ -73,7 +73,7 @@ public: void create_function(ir_function_signature *ir); private: - void add_instr(nir_instr *instr, unsigned num_components); + void add_instr(nir_instr *instr, unsigned num_components, unsigned bit_size); nir_ssa_def *evaluate_rvalue(ir_rvalue *ir); nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def **srcs); @@ -257,6 +257,11 @@ constant_copy(ir_constant *ir, void *mem_ctx) ret->value.f[i] = ir->value.f[i]; break; + case GLSL_TYPE_DOUBLE: + for (i = 0; i < total_elems; i++) + ret->value.d[i] = ir->value.d[i]; + break; + case GLSL_TYPE_BOOL: for (i = 0; i < total_elems; i++) ret->value.b[i] = ir->value.b[i]; @@ -736,7 +741,7 @@ nir_visitor::visit(ir_call *ir) case nir_intrinsic_image_samples: case nir_intrinsic_image_size: { nir_ssa_undef_instr *instr_undef = - nir_ssa_undef_instr_create(shader, 1); + nir_ssa_undef_instr_create(shader, 1, 32); nir_builder_instr_insert(&b, &instr_undef->instr); /* Set the image variable dereference. */ @@ -854,8 +859,9 @@ nir_visitor::visit(ir_call *ir) instr->num_components = type->vector_elements; /* Setup destination register */ + unsigned bit_size = glsl_get_bit_size(type->base_type); nir_ssa_dest_init(&instr->instr, &instr->dest, - type->vector_elements, 32, NULL); + type->vector_elements, bit_size, NULL); /* Insert the created nir instruction now since in the case of boolean * result we will need to emit another instruction after it @@ -878,7 +884,7 @@ nir_visitor::visit(ir_call *ir) load_ssbo_compare->src[1].swizzle[i] = 0; nir_ssa_dest_init(&load_ssbo_compare->instr, &load_ssbo_compare->dest.dest, - type->vector_elements, 32, NULL); + type->vector_elements, bit_size, NULL); load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1; nir_builder_instr_insert(&b, &load_ssbo_compare->instr); dest = &load_ssbo_compare->dest.dest; @@ -1152,12 +1158,13 @@ get_instr_dest(nir_instr *instr) } void -nir_visitor::add_instr(nir_instr *instr, unsigned num_components) +nir_visitor::add_instr(nir_instr *instr, unsigned num_components, + unsigned bit_size) { nir_dest *dest = get_instr_dest(instr); if (dest) - nir_ssa_dest_init(instr, dest, num_components, 32, NULL); + nir_ssa_dest_init(instr, dest, num_components, bit_size, NULL); nir_builder_instr_insert(&b, instr); @@ -1182,12 +1189,19 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir) load_instr->num_components = ir->type->vector_elements; load_instr->variables[0] = this->deref_head; ralloc_steal(load_instr, load_instr->variables[0]); - add_instr(&load_instr->instr, ir->type->vector_elements); + unsigned bit_size = glsl_get_bit_size(ir->type->base_type); + add_instr(&load_instr->instr, ir->type->vector_elements, bit_size); } return this->result; } +static bool +type_is_float(glsl_base_type type) +{ + return type == GLSL_TYPE_FLOAT || type == GLSL_TYPE_DOUBLE; +} + void nir_visitor::visit(ir_expression *ir) { @@ -1196,11 +1210,11 @@ nir_visitor::visit(ir_expression *ir) case ir_binop_ubo_load: { nir_intrinsic_instr *load = nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo); + unsigned bit_size = glsl_get_bit_size(ir->type->base_type); load->num_components = ir->type->vector_elements; - load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type); load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0])); load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1])); - add_instr(&load->instr, ir->type->vector_elements); + add_instr(&load->instr, ir->type->vector_elements, bit_size); /* * In UBO's, a true boolean value is any non-zero value, but we consider @@ -1265,7 +1279,8 @@ nir_visitor::visit(ir_expression *ir) intrin->intrinsic == nir_intrinsic_interp_var_at_sample) intrin->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1])); - add_instr(&intrin->instr, deref->type->vector_elements); + unsigned bit_size = glsl_get_bit_size(deref->type->base_type); + add_instr(&intrin->instr, deref->type->vector_elements, bit_size); if (swizzle) { unsigned swiz[4] = { @@ -1306,20 +1321,20 @@ nir_visitor::visit(ir_expression *ir) result = supports_ints ? nir_inot(&b, srcs[0]) : nir_fnot(&b, srcs[0]); break; case ir_unop_neg: - result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fneg(&b, srcs[0]) - : nir_ineg(&b, srcs[0]); + result = type_is_float(types[0]) ? nir_fneg(&b, srcs[0]) + : nir_ineg(&b, srcs[0]); break; case ir_unop_abs: - result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fabs(&b, srcs[0]) - : nir_iabs(&b, srcs[0]); + result = type_is_float(types[0]) ? nir_fabs(&b, srcs[0]) + : nir_iabs(&b, srcs[0]); break; case ir_unop_saturate: - assert(types[0] == GLSL_TYPE_FLOAT); + assert(type_is_float(types[0])); result = nir_fsat(&b, srcs[0]); break; case ir_unop_sign: - result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fsign(&b, srcs[0]) - : nir_isign(&b, srcs[0]); + result = type_is_float(types[0]) ? nir_fsign(&b, srcs[0]) + : nir_isign(&b, srcs[0]); break; case ir_unop_rcp: result = nir_frcp(&b, srcs[0]); break; case ir_unop_rsq: result = nir_frsq(&b, srcs[0]); break; @@ -1342,6 +1357,19 @@ nir_visitor::visit(ir_expression *ir) case ir_unop_f2b: result = nir_f2b(&b, srcs[0]); break; case ir_unop_i2b: result = nir_i2b(&b, srcs[0]); break; case ir_unop_b2i: result = nir_b2i(&b, srcs[0]); break; + case ir_unop_d2f: result = nir_d2f(&b, srcs[0]); break; + case ir_unop_f2d: result = nir_f2d(&b, srcs[0]); break; + case ir_unop_d2i: result = nir_d2i(&b, srcs[0]); break; + case ir_unop_d2u: result = nir_d2u(&b, srcs[0]); break; + case ir_unop_d2b: result = nir_d2b(&b, srcs[0]); break; + case ir_unop_i2d: + assert(supports_ints); + result = nir_i2d(&b, srcs[0]); + break; + case ir_unop_u2d: + assert(supports_ints); + result = nir_u2d(&b, srcs[0]); + break; case ir_unop_i2u: case ir_unop_u2i: case ir_unop_bitcast_i2f: @@ -1395,6 +1423,12 @@ nir_visitor::visit(ir_expression *ir) case ir_unop_unpack_half_2x16: result = nir_unpack_half_2x16(&b, srcs[0]); break; + case ir_unop_pack_double_2x32: + result = nir_pack_double_2x32(&b, srcs[0]); + break; + case ir_unop_unpack_double_2x32: + result = nir_unpack_double_2x32(&b, srcs[0]); + break; case ir_unop_bitfield_reverse: result = nir_bitfield_reverse(&b, srcs[0]); break; @@ -1465,24 +1499,25 @@ nir_visitor::visit(ir_expression *ir) nir_intrinsic_get_buffer_size); load->num_components = ir->type->vector_elements; load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0])); - add_instr(&load->instr, ir->type->vector_elements); + unsigned bit_size = glsl_get_bit_size(ir->type->base_type); + add_instr(&load->instr, ir->type->vector_elements, bit_size); return; } case ir_binop_add: - result = (out_type == GLSL_TYPE_FLOAT) ? nir_fadd(&b, srcs[0], srcs[1]) - : nir_iadd(&b, srcs[0], srcs[1]); + result = type_is_float(out_type) ? nir_fadd(&b, srcs[0], srcs[1]) + : nir_iadd(&b, srcs[0], srcs[1]); break; case ir_binop_sub: - result = (out_type == GLSL_TYPE_FLOAT) ? nir_fsub(&b, srcs[0], srcs[1]) - : nir_isub(&b, srcs[0], srcs[1]); + result = type_is_float(out_type) ? nir_fsub(&b, srcs[0], srcs[1]) + : nir_isub(&b, srcs[0], srcs[1]); break; case ir_binop_mul: - result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmul(&b, srcs[0], srcs[1]) - : nir_imul(&b, srcs[0], srcs[1]); + result = type_is_float(out_type) ? nir_fmul(&b, srcs[0], srcs[1]) + : nir_imul(&b, srcs[0], srcs[1]); break; case ir_binop_div: - if (out_type == GLSL_TYPE_FLOAT) + if (type_is_float(out_type)) result = nir_fdiv(&b, srcs[0], srcs[1]); else if (out_type == GLSL_TYPE_INT) result = nir_idiv(&b, srcs[0], srcs[1]); @@ -1490,11 +1525,11 @@ nir_visitor::visit(ir_expression *ir) result = nir_udiv(&b, srcs[0], srcs[1]); break; case ir_binop_mod: - result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmod(&b, srcs[0], srcs[1]) - : nir_umod(&b, srcs[0], srcs[1]); + result = type_is_float(out_type) ? nir_fmod(&b, srcs[0], srcs[1]) + : nir_umod(&b, srcs[0], srcs[1]); break; case ir_binop_min: - if (out_type == GLSL_TYPE_FLOAT) + if (type_is_float(out_type)) result = nir_fmin(&b, srcs[0], srcs[1]); else if (out_type == GLSL_TYPE_INT) result = nir_imin(&b, srcs[0], srcs[1]); @@ -1502,7 +1537,7 @@ nir_visitor::visit(ir_expression *ir) result = nir_umin(&b, srcs[0], srcs[1]); break; case ir_binop_max: - if (out_type == GLSL_TYPE_FLOAT) + if (type_is_float(out_type)) result = nir_fmax(&b, srcs[0], srcs[1]); else if (out_type == GLSL_TYPE_INT) result = nir_imax(&b, srcs[0], srcs[1]); @@ -1538,7 +1573,7 @@ nir_visitor::visit(ir_expression *ir) case ir_binop_borrow: result = nir_usub_borrow(&b, srcs[0], srcs[1]); break; case ir_binop_less: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) + if (type_is_float(types[0])) result = nir_flt(&b, srcs[0], srcs[1]); else if (types[0] == GLSL_TYPE_INT) result = nir_ilt(&b, srcs[0], srcs[1]); @@ -1550,7 +1585,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_greater: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) + if (type_is_float(types[0])) result = nir_flt(&b, srcs[1], srcs[0]); else if (types[0] == GLSL_TYPE_INT) result = nir_ilt(&b, srcs[1], srcs[0]); @@ -1562,7 +1597,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_lequal: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) + if (type_is_float(types[0])) result = nir_fge(&b, srcs[1], srcs[0]); else if (types[0] == GLSL_TYPE_INT) result = nir_ige(&b, srcs[1], srcs[0]); @@ -1574,7 +1609,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_gequal: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) + if (type_is_float(types[0])) result = nir_fge(&b, srcs[0], srcs[1]); else if (types[0] == GLSL_TYPE_INT) result = nir_ige(&b, srcs[0], srcs[1]); @@ -1586,7 +1621,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_equal: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) + if (type_is_float(types[0])) result = nir_feq(&b, srcs[0], srcs[1]); else result = nir_ieq(&b, srcs[0], srcs[1]); @@ -1596,7 +1631,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_nequal: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) + if (type_is_float(types[0])) result = nir_fne(&b, srcs[0], srcs[1]); else result = nir_ine(&b, srcs[0], srcs[1]); @@ -1606,7 +1641,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_all_equal: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) { + if (type_is_float(types[0])) { switch (ir->operands[0]->type->vector_elements) { case 1: result = nir_feq(&b, srcs[0], srcs[1]); break; case 2: result = nir_ball_fequal2(&b, srcs[0], srcs[1]); break; @@ -1638,7 +1673,7 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_any_nequal: if (supports_ints) { - if (types[0] == GLSL_TYPE_FLOAT) { + if (type_is_float(types[0])) { switch (ir->operands[0]->type->vector_elements) { case 1: result = nir_fne(&b, srcs[0], srcs[1]); break; case 2: result = nir_bany_fnequal2(&b, srcs[0], srcs[1]); break; @@ -1902,7 +1937,8 @@ nir_visitor::visit(ir_texture *ir) assert(src_number == num_srcs); - add_instr(&instr->instr, nir_tex_instr_dest_size(instr)); + unsigned bit_size = glsl_get_bit_size(ir->type->base_type); + add_instr(&instr->instr, nir_tex_instr_dest_size(instr), bit_size); } void diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c index b67916dc86b..8d38d3384d8 100644 --- a/src/compiler/nir/nir.c +++ b/src/compiler/nir/nir.c @@ -469,12 +469,13 @@ nir_jump_instr_create(nir_shader *shader, nir_jump_type type) } nir_load_const_instr * -nir_load_const_instr_create(nir_shader *shader, unsigned num_components) +nir_load_const_instr_create(nir_shader *shader, unsigned num_components, + unsigned bit_size) { nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr); instr_init(&instr->instr, nir_instr_type_load_const); - nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL); + nir_ssa_def_init(&instr->instr, &instr->def, num_components, bit_size, NULL); return instr; } @@ -558,12 +559,14 @@ nir_parallel_copy_instr_create(nir_shader *shader) } nir_ssa_undef_instr * -nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components) +nir_ssa_undef_instr_create(nir_shader *shader, + unsigned num_components, + unsigned bit_size) { nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr); instr_init(&instr->instr, nir_instr_type_ssa_undef); - nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL); + nir_ssa_def_init(&instr->instr, &instr->def, num_components, bit_size, NULL); return instr; } @@ -691,8 +694,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref) tail = tail->child; } + unsigned bit_size = glsl_get_bit_size(glsl_get_base_type(tail->type)); nir_load_const_instr *load = - nir_load_const_instr_create(shader, glsl_get_vector_elements(tail->type)); + nir_load_const_instr_create(shader, glsl_get_vector_elements(tail->type), + bit_size); matrix_offset *= load->def.num_components; for (unsigned i = 0; i < load->def.num_components; i++) { @@ -702,6 +707,9 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref) case GLSL_TYPE_UINT: load->value.u32[i] = constant->value.u[matrix_offset + i]; break; + case GLSL_TYPE_DOUBLE: + load->value.f64[i] = constant->value.d[matrix_offset + i]; + break; case GLSL_TYPE_BOOL: load->value.u32[i] = constant->value.b[matrix_offset + i] ? NIR_TRUE : NIR_FALSE; diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 8e45cba5a16..c3a33431239 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -81,16 +81,16 @@ typedef struct { } nir_state_slot; typedef enum { - nir_var_all = -1, - nir_var_shader_in, - nir_var_shader_out, - nir_var_global, - nir_var_local, - nir_var_uniform, - nir_var_shader_storage, - nir_var_system_value, - nir_var_param, - nir_var_shared, + nir_var_shader_in = (1 << 0), + nir_var_shader_out = (1 << 1), + nir_var_global = (1 << 2), + nir_var_local = (1 << 3), + nir_var_uniform = (1 << 4), + nir_var_shader_storage = (1 << 5), + nir_var_system_value = (1 << 6), + nir_var_param = (1 << 7), + nir_var_shared = (1 << 8), + nir_var_all = ~0, } nir_variable_mode; /** @@ -156,6 +156,12 @@ typedef struct nir_variable { char *name; struct nir_variable_data { + /** + * Storage class of the variable. + * + * \sa nir_variable_mode + */ + nir_variable_mode mode; /** * Is the variable read-only? @@ -170,13 +176,6 @@ typedef struct nir_variable { unsigned invariant:1; /** - * Storage class of the variable. - * - * \sa nir_variable_mode - */ - nir_variable_mode mode:5; - - /** * Interpolation mode for shader inputs / outputs * * \sa glsl_interp_qualifier @@ -1857,7 +1856,8 @@ nir_alu_instr *nir_alu_instr_create(nir_shader *shader, nir_op op); nir_jump_instr *nir_jump_instr_create(nir_shader *shader, nir_jump_type type); nir_load_const_instr *nir_load_const_instr_create(nir_shader *shader, - unsigned num_components); + unsigned num_components, + unsigned bit_size); nir_intrinsic_instr *nir_intrinsic_instr_create(nir_shader *shader, nir_intrinsic_op op); @@ -1872,7 +1872,8 @@ nir_phi_instr *nir_phi_instr_create(nir_shader *shader); nir_parallel_copy_instr *nir_parallel_copy_instr_create(nir_shader *shader); nir_ssa_undef_instr *nir_ssa_undef_instr_create(nir_shader *shader, - unsigned num_components); + unsigned num_components, + unsigned bit_size); nir_deref_var *nir_deref_var_create(void *mem_ctx, nir_variable *var); nir_deref_array *nir_deref_array_create(void *mem_ctx); @@ -2208,12 +2209,13 @@ void nir_lower_var_copies(nir_shader *shader); bool nir_lower_global_vars_to_local(nir_shader *shader); -bool nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask); +bool nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes); bool nir_lower_locals_to_regs(nir_shader *shader); void nir_lower_outputs_to_temporaries(nir_shader *shader, nir_function *entrypoint); +void nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint); void nir_shader_gather_info(nir_shader *shader, nir_function_impl *entrypoint); @@ -2222,14 +2224,14 @@ void nir_assign_var_locations(struct exec_list *var_list, int (*type_size)(const struct glsl_type *)); void nir_lower_io(nir_shader *shader, - nir_variable_mode mode, + nir_variable_mode modes, int (*type_size)(const struct glsl_type *)); nir_src *nir_get_io_offset_src(nir_intrinsic_instr *instr); nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr); void nir_lower_vars_to_ssa(nir_shader *shader); -bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode); +bool nir_remove_dead_variables(nir_shader *shader, nir_variable_mode modes); void nir_move_vec_src_uses_to_dest(nir_shader *shader); bool nir_lower_vec_to_movs(nir_shader *shader); @@ -2305,6 +2307,8 @@ void nir_lower_to_source_mods(nir_shader *shader); bool nir_lower_gs_intrinsics(nir_shader *shader); +void nir_lower_double_pack(nir_shader *shader); + bool nir_normalize_cubemap_coords(nir_shader *shader); void nir_live_ssa_defs_impl(nir_function_impl *impl); diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py index d05564f779c..53a79073a44 100644 --- a/src/compiler/nir/nir_algebraic.py +++ b/src/compiler/nir/nir_algebraic.py @@ -291,6 +291,7 @@ ${pass_name}(nir_shader *shader) bool progress = false; bool condition_flags[${len(condition_list)}]; const nir_shader_compiler_options *options = shader->options; + (void) options; % for index, condition in enumerate(condition_list): condition_flags[${index}] = ${condition}; diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index 3dc7c25ec28..29b13fb222f 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -78,7 +78,7 @@ static inline nir_ssa_def * nir_ssa_undef(nir_builder *build, unsigned num_components, unsigned bit_size) { nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(build->shader, num_components); + nir_ssa_undef_instr_create(build->shader, num_components, bit_size); undef->def.bit_size = bit_size; if (!undef) return NULL; @@ -92,7 +92,7 @@ static inline nir_ssa_def * nir_build_imm(nir_builder *build, unsigned num_components, nir_const_value value) { nir_load_const_instr *load_const = - nir_load_const_instr_create(build->shader, num_components); + nir_load_const_instr_create(build->shader, num_components, 32); if (!load_const) return NULL; diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c index 7d2e3835258..e231387c889 100644 --- a/src/compiler/nir/nir_clone.c +++ b/src/compiler/nir/nir_clone.c @@ -179,6 +179,7 @@ clone_register(clone_state *state, const nir_register *reg) add_remap(state, nreg, reg); nreg->num_components = reg->num_components; + nreg->bit_size = reg->bit_size; nreg->num_array_elems = reg->num_array_elems; nreg->index = reg->index; nreg->name = ralloc_strdup(nreg, reg->name); @@ -359,7 +360,8 @@ static nir_load_const_instr * clone_load_const(clone_state *state, const nir_load_const_instr *lc) { nir_load_const_instr *nlc = - nir_load_const_instr_create(state->ns, lc->def.num_components); + nir_load_const_instr_create(state->ns, lc->def.num_components, + lc->def.bit_size); memcpy(&nlc->value, &lc->value, sizeof(nlc->value)); @@ -372,7 +374,8 @@ static nir_ssa_undef_instr * clone_ssa_undef(clone_state *state, const nir_ssa_undef_instr *sa) { nir_ssa_undef_instr *nsa = - nir_ssa_undef_instr_create(state->ns, sa->def.num_components); + nir_ssa_undef_instr_create(state->ns, sa->def.num_components, + sa->def.bit_size); add_remap(state, &nsa->def, &sa->def); diff --git a/src/compiler/nir/nir_control_flow.c b/src/compiler/nir/nir_control_flow.c index 33b06d0cc84..ea5741288ce 100644 --- a/src/compiler/nir/nir_control_flow.c +++ b/src/compiler/nir/nir_control_flow.c @@ -281,7 +281,8 @@ insert_phi_undef(nir_block *block, nir_block *pred) nir_phi_instr *phi = nir_instr_as_phi(instr); nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(ralloc_parent(phi), - phi->dest.ssa.num_components); + phi->dest.ssa.num_components, + phi->dest.ssa.bit_size); nir_instr_insert_before_cf_list(&impl->body, &undef->instr); nir_phi_src *src = ralloc(phi, nir_phi_src); src->pred = pred; @@ -691,7 +692,8 @@ replace_ssa_def_uses(nir_ssa_def *def, void *void_impl) void *mem_ctx = ralloc_parent(impl); nir_ssa_undef_instr *undef = - nir_ssa_undef_instr_create(mem_ctx, def->num_components); + nir_ssa_undef_instr_create(mem_ctx, def->num_components, + def->bit_size); nir_instr_insert_before_cf_list(&impl->body, &undef->instr); nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(&undef->def)); return true; diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c index 82317c21b62..7bbc2c0f299 100644 --- a/src/compiler/nir/nir_from_ssa.c +++ b/src/compiler/nir/nir_from_ssa.c @@ -474,6 +474,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state) node->set->reg = nir_local_reg_create(state->impl); node->set->reg->name = def->name; node->set->reg->num_components = def->num_components; + node->set->reg->bit_size = def->bit_size; node->set->reg->num_array_elems = 0; } @@ -491,6 +492,7 @@ rewrite_ssa_def(nir_ssa_def *def, void *void_state) reg = nir_local_reg_create(state->impl); reg->name = def->name; reg->num_components = def->num_components; + reg->bit_size = def->bit_size; reg->num_array_elems = 0; } diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c index e244122e466..c6161433516 100644 --- a/src/compiler/nir/nir_instr_set.c +++ b/src/compiler/nir/nir_instr_set.c @@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr) { hash = HASH(hash, instr->op); hash = HASH(hash, instr->dest.dest.ssa.num_components); + hash = HASH(hash, instr->dest.dest.ssa.bit_size); /* We explicitly don't hash instr->dest.dest.exact */ if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { @@ -82,9 +83,8 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr) { hash = HASH(hash, instr->def.num_components); - hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32, - instr->def.num_components - * sizeof(instr->value.f32[0])); + unsigned size = instr->def.num_components * (instr->def.bit_size / 8); + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32, size); return hash; } @@ -126,8 +126,10 @@ hash_intrinsic(uint32_t hash, const nir_intrinsic_instr *instr) const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; hash = HASH(hash, instr->intrinsic); - if (info->has_dest) + if (info->has_dest) { hash = HASH(hash, instr->dest.ssa.num_components); + hash = HASH(hash, instr->dest.ssa.bit_size); + } assert(info->num_variables == 0); @@ -268,6 +270,9 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) return false; + if (alu1->dest.dest.ssa.bit_size != alu2->dest.dest.ssa.bit_size) + return false; + /* We explicitly don't hash instr->dest.dest.exact */ if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { @@ -325,8 +330,11 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) if (load1->def.num_components != load2->def.num_components) return false; + if (load1->def.bit_size != load2->def.bit_size) + return false; + return memcmp(load1->value.f32, load2->value.f32, - load1->def.num_components * sizeof(*load2->value.f32)) == 0; + load1->def.num_components * (load1->def.bit_size / 8)) == 0; } case nir_instr_type_phi: { nir_phi_instr *phi1 = nir_instr_as_phi(instr1); @@ -362,6 +370,10 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) intrinsic2->dest.ssa.num_components) return false; + if (info->has_dest && intrinsic1->dest.ssa.bit_size != + intrinsic2->dest.ssa.bit_size) + return false; + for (unsigned i = 0; i < info->num_srcs; i++) { if (!nir_srcs_equal(intrinsic1->src[i], intrinsic2->src[i])) return false; diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c index e8ba640fe0b..1548abbd558 100644 --- a/src/compiler/nir/nir_lower_alu_to_scalar.c +++ b/src/compiler/nir/nir_lower_alu_to_scalar.c @@ -187,6 +187,9 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) return; } + case nir_op_unpack_double_2x32: + return; + LOWER_REDUCTION(nir_op_fdot, nir_op_fmul, nir_op_fadd); LOWER_REDUCTION(nir_op_ball_fequal, nir_op_feq, nir_op_iand); LOWER_REDUCTION(nir_op_ball_iequal, nir_op_ieq, nir_op_iand); diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c index 70381a7968a..b2ea31888f8 100644 --- a/src/compiler/nir/nir_lower_atomics.c +++ b/src/compiler/nir/nir_lower_atomics.c @@ -74,7 +74,8 @@ lower_instr(nir_intrinsic_instr *instr, nir_intrinsic_set_base(new_instr, state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index); - nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1); + nir_load_const_instr *offset_const = + nir_load_const_instr_create(mem_ctx, 1, 32); offset_const->value.u32[0] = instr->variables[0]->var->data.offset; nir_instr_insert_before(&instr->instr, &offset_const->instr); @@ -95,7 +96,7 @@ lower_instr(nir_intrinsic_instr *instr, if (deref_array->deref_array_type == nir_deref_array_type_indirect) { nir_load_const_instr *atomic_counter_size = - nir_load_const_instr_create(mem_ctx, 1); + nir_load_const_instr_create(mem_ctx, 1, 32); atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE; nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr); diff --git a/src/compiler/nir/nir_lower_double_packing.c b/src/compiler/nir/nir_lower_double_packing.c new file mode 100644 index 00000000000..d43683d2007 --- /dev/null +++ b/src/compiler/nir/nir_lower_double_packing.c @@ -0,0 +1,95 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "nir.h" +#include "nir_builder.h" + +/* + * lowers: + * + * packDouble2x32(foo) -> packDouble2x32Split(foo.x, foo.y) + * unpackDouble2x32(foo) -> vec2(unpackDouble2x32_x(foo), unpackDouble2x32_y(foo)) + */ + +static nir_ssa_def * +lower_pack_double(nir_builder *b, nir_ssa_def *src) +{ + return nir_pack_double_2x32_split(b, nir_channel(b, src, 0), + nir_channel(b, src, 1)); +} + +static nir_ssa_def * +lower_unpack_double(nir_builder *b, nir_ssa_def *src) +{ + return nir_vec2(b, nir_unpack_double_2x32_split_x(b, src), + nir_unpack_double_2x32_split_y(b, src)); +} + +static bool +lower_double_pack_block(nir_block *block, void *ctx) +{ + nir_builder *b = (nir_builder *) ctx; + + nir_foreach_instr_safe(block, instr) { + if (instr->type != nir_instr_type_alu) + continue; + + nir_alu_instr *alu_instr = (nir_alu_instr *) instr; + + if (alu_instr->op != nir_op_pack_double_2x32 && + alu_instr->op != nir_op_unpack_double_2x32) + continue; + + b->cursor = nir_before_instr(&alu_instr->instr); + + nir_ssa_def *src = nir_ssa_for_alu_src(b, alu_instr, 0); + nir_ssa_def *dest = + alu_instr->op == nir_op_pack_double_2x32 ? + lower_pack_double(b, src) : + lower_unpack_double(b, src); + + nir_ssa_def_rewrite_uses(&alu_instr->dest.dest.ssa, nir_src_for_ssa(dest)); + nir_instr_remove(&alu_instr->instr); + } + + return true; +} + +static void +lower_double_pack_impl(nir_function_impl *impl) +{ + nir_builder b; + nir_builder_init(&b, impl); + nir_foreach_block(impl, lower_double_pack_block, &b); +} + +void +nir_lower_double_pack(nir_shader *shader) +{ + nir_foreach_function(shader, function) { + if (function->impl) + lower_double_pack_impl(function->impl); + } +} + diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c index 62b8c84a956..a69dd612565 100644 --- a/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/src/compiler/nir/nir_lower_indirect_derefs.c @@ -161,7 +161,7 @@ deref_has_indirect(nir_deref_var *deref) struct lower_indirect_state { nir_builder builder; - uint32_t mode_mask; + nir_variable_mode modes; bool progress; }; @@ -183,7 +183,7 @@ lower_indirect_block(nir_block *block, void *void_state) continue; /* Only lower variables whose mode is in the mask */ - if (!(state->mode_mask & (1 << intrin->variables[0]->var->data.mode))) + if (!(state->modes & intrin->variables[0]->var->data.mode)) continue; state->builder.cursor = nir_before_instr(&intrin->instr); @@ -206,12 +206,12 @@ lower_indirect_block(nir_block *block, void *void_state) } static bool -lower_indirects_impl(nir_function_impl *impl, uint32_t mode_mask) +lower_indirects_impl(nir_function_impl *impl, nir_variable_mode modes) { struct lower_indirect_state state; state.progress = false; - state.mode_mask = mode_mask; + state.modes = modes; nir_builder_init(&state.builder, impl); nir_foreach_block(impl, lower_indirect_block, &state); @@ -228,13 +228,13 @@ lower_indirects_impl(nir_function_impl *impl, uint32_t mode_mask) * that does a binary search on the array index. */ bool -nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask) +nir_lower_indirect_derefs(nir_shader *shader, nir_variable_mode modes) { bool progress = false; nir_foreach_function(shader, function) { if (function->impl) - progress = lower_indirects_impl(function->impl, mode_mask) || progress; + progress = lower_indirects_impl(function->impl, modes) || progress; } return progress; diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index a30061d3bf0..369a8ee537e 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -38,7 +38,7 @@ struct lower_io_state { nir_builder builder; void *mem_ctx; int (*type_size)(const struct glsl_type *type); - nir_variable_mode mode; + nir_variable_mode modes; }; void @@ -245,7 +245,7 @@ nir_lower_io_block(nir_block *block, void *void_state) nir_variable_mode mode = intrin->variables[0]->var->data.mode; - if (state->mode != nir_var_all && state->mode != mode) + if ((state->modes & mode) == 0) continue; if (mode != nir_var_shader_in && @@ -393,14 +393,14 @@ nir_lower_io_block(nir_block *block, void *void_state) static void nir_lower_io_impl(nir_function_impl *impl, - nir_variable_mode mode, + nir_variable_mode modes, int (*type_size)(const struct glsl_type *)) { struct lower_io_state state; nir_builder_init(&state.builder, impl); state.mem_ctx = ralloc_parent(impl); - state.mode = mode; + state.modes = modes; state.type_size = type_size; nir_foreach_block(impl, nir_lower_io_block, &state); @@ -410,12 +410,12 @@ nir_lower_io_impl(nir_function_impl *impl, } void -nir_lower_io(nir_shader *shader, nir_variable_mode mode, +nir_lower_io(nir_shader *shader, nir_variable_mode modes, int (*type_size)(const struct glsl_type *)) { nir_foreach_function(shader, function) { if (function->impl) - nir_lower_io_impl(function->impl, mode, type_size); + nir_lower_io_impl(function->impl, modes, type_size); } } diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c index b5df46413f1..db5865fb0c0 100644 --- a/src/compiler/nir/nir_lower_load_const_to_scalar.c +++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c @@ -48,8 +48,13 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower) /* Emit the individual loads. */ nir_ssa_def *loads[4]; for (unsigned i = 0; i < lower->def.num_components; i++) { - nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1); - load_comp->value.u32[0] = lower->value.u32[i]; + nir_load_const_instr *load_comp = + nir_load_const_instr_create(b.shader, 1, lower->def.bit_size); + if (lower->def.bit_size == 64) + load_comp->value.f64[0] = lower->value.f64[i]; + else + load_comp->value.u32[0] = lower->value.u32[i]; + assert(lower->def.bit_size == 64 || lower->def.bit_size == 32); nir_builder_instr_insert(&b, &load_comp->instr); loads[i] = &load_comp->def; } diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c index 0438802d3b2..111bfdd2e33 100644 --- a/src/compiler/nir/nir_lower_locals_to_regs.c +++ b/src/compiler/nir/nir_lower_locals_to_regs.c @@ -119,6 +119,7 @@ get_reg_for_deref(nir_deref_var *deref, struct locals_to_regs_state *state) nir_register *reg = nir_local_reg_create(state->impl); reg->num_components = glsl_get_vector_elements(tail->type); reg->num_array_elems = array_size > 1 ? array_size : 0; + reg->bit_size = glsl_get_bit_size(glsl_get_base_type(tail->type)); _mesa_hash_table_insert_pre_hashed(state->regs_table, hash, deref, reg); nir_array_add(&state->derefs_array, nir_deref_var *, deref); @@ -160,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, if (src.reg.indirect) { nir_load_const_instr *load_const = - nir_load_const_instr_create(state->shader, 1); + nir_load_const_instr_create(state->shader, 1, 32); load_const->value.u32[0] = glsl_get_length(parent_type); nir_instr_insert_before(instr, &load_const->instr); diff --git a/src/compiler/nir/nir_lower_to_source_mods.c b/src/compiler/nir/nir_lower_to_source_mods.c index 6c4e1f0d3f3..1e8c3c2a130 100644 --- a/src/compiler/nir/nir_lower_to_source_mods.c +++ b/src/compiler/nir/nir_lower_to_source_mods.c @@ -54,7 +54,7 @@ nir_lower_to_source_mods_block(nir_block *block, void *state) if (parent->dest.saturate) continue; - switch (nir_op_infos[alu->op].input_types[i]) { + switch (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i])) { case nir_type_float: if (parent->op != nir_op_fmov) continue; @@ -128,7 +128,8 @@ nir_lower_to_source_mods_block(nir_block *block, void *state) continue; /* We can only saturate float destinations */ - if (nir_op_infos[alu->op].output_type != nir_type_float) + if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) != + nir_type_float) continue; if (!list_empty(&alu->dest.dest.ssa.if_uses)) diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c index 9f9e454c198..249c3892335 100644 --- a/src/compiler/nir/nir_lower_vars_to_ssa.c +++ b/src/compiler/nir/nir_lower_vars_to_ssa.c @@ -504,8 +504,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state) */ nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(state->shader, - intrin->num_components); - undef->def.bit_size = intrin->dest.ssa.bit_size; + intrin->num_components, + intrin->dest.ssa.bit_size); nir_instr_insert_before(&intrin->instr, &undef->instr); nir_instr_remove(&intrin->instr); diff --git a/src/compiler/nir/nir_lower_vec_to_movs.c b/src/compiler/nir/nir_lower_vec_to_movs.c index f51cede3920..9e40b84e6e3 100644 --- a/src/compiler/nir/nir_lower_vec_to_movs.c +++ b/src/compiler/nir/nir_lower_vec_to_movs.c @@ -240,6 +240,7 @@ lower_vec_to_movs_block(nir_block *block, void *void_state) /* Since we insert multiple MOVs, we have a register destination. */ nir_register *reg = nir_local_reg_create(impl); reg->num_components = vec->dest.dest.ssa.num_components; + reg->bit_size = vec->dest.dest.ssa.bit_size; nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg)); diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index d6b658dbfc8..e75ca28cf0e 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -95,6 +95,7 @@ tuint = "uint" tfloat32 = "float32" tint32 = "int32" tuint32 = "uint32" +tuint64 = "uint64" tfloat64 = "float64" commutative = "commutative " @@ -161,15 +162,23 @@ unop("fexp2", tfloat, "exp2f(src0)") unop("flog2", tfloat, "log2f(src0)") unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion. unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion +unop_convert("d2i", tint32, tfloat64, "src0") # Double-to-integer conversion. +unop_convert("d2u", tuint32, tfloat64, "src0") # Double-to-unsigned conversion. unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion. +unop_convert("i2d", tfloat64, tint32, "src0") # Integer-to-double conversion. # Float-to-boolean conversion unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f") +unop_convert("d2b", tbool, tfloat64, "src0 != 0.0") # Boolean-to-float conversion unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f") # Int-to-boolean conversion unop_convert("i2b", tbool, tint32, "src0 != 0") unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion. +unop_convert("u2d", tfloat64, tuint32, "src0") # Unsigned-to-double conversion. +# double-to-float conversion +unop_convert("d2f", tfloat32, tfloat64, "src0") # Single to double precision +unop_convert("f2d", tfloat64, tfloat32, "src0") # Double to single precision # Unary floating-point rounding operations. @@ -253,6 +262,34 @@ dst.x = (src0.x << 0) | (src0.w << 24); """) +unop_horiz("pack_double_2x32", 1, tuint64, 2, tuint32, """ +union { + uint64_t u64; + struct { + uint32_t i1; + uint32_t i2; + }; +} di; + +di.i1 = src0.x; +di.i2 = src0.y; +dst.x = di.u64; +""") + +unop_horiz("unpack_double_2x32", 2, tuint32, 1, tuint64, """ +union { + uint64_t u64; + struct { + uint32_t i1; + uint32_t i2; + }; +} di; + +di.u64 = src0.x; +dst.x = di.i1; +dst.y = di.i2; +""") + # Lowered floating point unpacking operations. @@ -261,6 +298,29 @@ unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32, unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32, "unpack_half_1x16((uint16_t)(src0.x >> 16))") +unop_convert("unpack_double_2x32_split_x", tuint32, tuint64, """ +union { + uint64_t u64; + struct { + uint32_t x; + uint32_t y; + }; +} di; +di.u64 = src0; +dst = di.x; +""") + +unop_convert("unpack_double_2x32_split_y", tuint32, tuint64, """ +union { + uint64_t u64; + struct { + uint32_t x; + uint32_t y; + }; +} di; +di.u64 = src0; +dst = di.y; +""") # Bit operations, part of ARB_gpu_shader5. @@ -540,6 +600,19 @@ binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)") binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32, "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)") +binop_convert("pack_double_2x32_split", tuint64, tuint32, "", """ +union { + uint64_t u64; + struct { + uint32_t x; + uint32_t y; + }; +} di; +di.x = src0; +di.y = src1; +dst = di.u64; +""") + # bfm implements the behavior of the first operation of the SM5 "bfi" assembly # and that of the "bfi1" i965 instruction. That is, it has undefined behavior # if either of its arguments are 32. diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index ddfe94d9e73..dd41931b345 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -138,7 +138,10 @@ optimizations = [ (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), (('fsat', ('fsat', a)), ('fsat', a)), - (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)), + (('fmin', ('fmax', ('fmin', ('fmax', a, b), c), b), c), ('fmin', ('fmax', a, b), c)), + (('imin', ('imax', ('imin', ('imax', a, b), c), b), c), ('imin', ('imax', a, b), c)), + (('umin', ('umax', ('umin', ('umax', a, b), c), b), c), ('umin', ('umax', a, b), c)), + (('extract_u8', ('imin', ('imax', a, 0), 0xff), 0), ('imin', ('imax', a, 0), 0xff)), (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))), (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)), (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))), @@ -275,6 +278,14 @@ optimizations = [ (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))), (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), + # Reassociate constants in add/mul chains so they can be folded together. + # For now, we only handle cases where the constants are separated by + # a single non-constant. We could do better eventually. + (('~fmul', '#a', ('fmul', b, '#c')), ('fmul', ('fmul', a, c), b)), + (('imul', '#a', ('imul', b, '#c')), ('imul', ('imul', a, c), b)), + (('~fadd', '#a', ('fadd', b, '#c')), ('fadd', ('fadd', a, c), b)), + (('iadd', '#a', ('iadd', b, '#c')), ('iadd', ('iadd', a, c), b)), + # Misc. lowering (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), @@ -362,26 +373,30 @@ optimizations = [ ] def fexp2i(exp): - # We assume that exp is already in range. + # We assume that exp is already in the range [-126, 127]. return ('ishl', ('iadd', exp, 127), 23) def ldexp32(f, exp): - # First, we clamp exp to a reasonable range. The maximum range that we - # need is the largest range for an exponent, ([-127, 128] if you include - # inf and 0) plus the number of mantissa bits in either direction to - # account for denormals. This means that we need at least a range of - # [-150, 151]. For our implementation, however, what we really care - # about is that neither exp/2 nor exp-exp/2 go out of the regular range - # for floating-point exponents. + # First, we clamp exp to a reasonable range. The maximum possible range + # for a normal exponent is [-126, 127] and, throwing in denormals, you get + # a maximum range of [-149, 127]. This means that we can potentially have + # a swing of +-276. If you start with FLT_MAX, you actually have to do + # ldexp(FLT_MAX, -278) to get it to flush all the way to zero. The GLSL + # spec, on the other hand, only requires that we handle an exponent value + # in the range [-126, 128]. This implementation is *mostly* correct; it + # handles a range on exp of [-252, 254] which allows you to create any + # value (including denorms if the hardware supports it) and to adjust the + # exponent of any normal value to anything you want. exp = ('imin', ('imax', exp, -252), 254) # Now we compute two powers of 2, one for exp/2 and one for exp-exp/2. - # While the spec technically defines ldexp as f * 2.0^exp, simply - # multiplying once doesn't work when denormals are involved because - # 2.0^exp may not be representable even though ldexp(f, exp) is (see - # comments above about range). Instead, we create two powers of two and - # multiply by them each in turn. That way the effective range of our - # exponent is doubled. + # (We use ishr which isn't the same for -1, but the -1 case still works + # since we use exp-exp/2 as the second exponent.) While the spec + # technically defines ldexp as f * 2.0^exp, simply multiplying once doesn't + # work with denormals and doesn't allow for the full swing in exponents + # that you can get with normalized values. Instead, we create two powers + # of two and multiply by them each in turn. That way the effective range + # of our exponent is doubled. pow2_1 = fexp2i(('ishr', exp, 1)) pow2_2 = fexp2i(('isub', exp, ('ishr', exp, 1))) return ('fmul', ('fmul', f, pow2_1), pow2_2) diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c index e64ca369bbc..caa4231b188 100644 --- a/src/compiler/nir/nir_opt_constant_folding.c +++ b/src/compiler/nir/nir_opt_constant_folding.c @@ -98,9 +98,9 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) nir_load_const_instr *new_instr = nir_load_const_instr_create(mem_ctx, - instr->dest.dest.ssa.num_components); + instr->dest.dest.ssa.num_components, + instr->dest.dest.ssa.bit_size); - new_instr->def.bit_size = instr->dest.dest.ssa.bit_size; new_instr->value = dest; nir_instr_insert_before(&instr->instr, &new_instr->instr); diff --git a/src/compiler/nir/nir_opt_dce.c b/src/compiler/nir/nir_opt_dce.c index 32436c18b60..cab09dfffc3 100644 --- a/src/compiler/nir/nir_opt_dce.c +++ b/src/compiler/nir/nir_opt_dce.c @@ -71,7 +71,7 @@ init_instr(nir_instr *instr, struct exec_list *worklist) nir_tex_instr *tex_instr; /* We use the pass_flags to store the live/dead information. In DCE, we - * just treat it as a zero/non-zerl boolean for whether or not the + * just treat it as a zero/non-zero boolean for whether or not the * instruction is live. */ instr->pass_flags = 0; diff --git a/src/compiler/nir/nir_phi_builder.c b/src/compiler/nir/nir_phi_builder.c index a39e3606fd5..1f1388a73dd 100644 --- a/src/compiler/nir/nir_phi_builder.c +++ b/src/compiler/nir/nir_phi_builder.c @@ -195,7 +195,8 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val, */ nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(val->builder->shader, - val->num_components); + val->num_components, + val->bit_size); nir_instr_insert(nir_before_cf_list(&val->builder->impl->body), &undef->instr); val->defs[block->index] = &undef->def; diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 17ae3681e21..2793020953e 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -29,6 +29,7 @@ #include "compiler/shader_enums.h" #include <stdio.h> #include <stdlib.h> +#include <inttypes.h> /* for PRIx64 macro */ static void print_tabs(unsigned num_tabs, FILE *fp) @@ -68,7 +69,7 @@ static void print_register_decl(nir_register *reg, print_state *state) { FILE *fp = state->fp; - fprintf(fp, "decl_reg %s ", sizes[reg->num_components]); + fprintf(fp, "decl_reg %s %u ", sizes[reg->num_components], reg->bit_size); if (reg->is_packed) fprintf(fp, "(packed) "); print_register(reg, state); @@ -83,7 +84,8 @@ print_ssa_def(nir_ssa_def *def, print_state *state) FILE *fp = state->fp; if (def->name != NULL) fprintf(fp, "/* %s */ ", def->name); - fprintf(fp, "%s ssa_%u", sizes[def->num_components], def->index); + fprintf(fp, "%s %u ssa_%u", sizes[def->num_components], def->bit_size, + def->index); } static void @@ -279,6 +281,13 @@ print_constant(nir_constant *c, const struct glsl_type *type, print_state *state } break; + case GLSL_TYPE_DOUBLE: + for (i = 0; i < total_elems; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "%f", c->value.d[i]); + } + break; + case GLSL_TYPE_STRUCT: for (i = 0; i < c->num_elements; i++) { if (i > 0) fprintf(fp, ", "); @@ -716,7 +725,11 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state) * and then print the float in a comment for readability. */ - fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]); + if (instr->def.bit_size == 64) + fprintf(fp, "0x%16" PRIx64 " /* %f */", instr->value.u64[i], + instr->value.f64[i]); + else + fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]); } fprintf(fp, ")"); diff --git a/src/compiler/nir/nir_remove_dead_variables.c b/src/compiler/nir/nir_remove_dead_variables.c index ad69de85b97..7395805d7a2 100644 --- a/src/compiler/nir/nir_remove_dead_variables.c +++ b/src/compiler/nir/nir_remove_dead_variables.c @@ -120,7 +120,7 @@ remove_dead_vars(struct exec_list *var_list, struct set *live) } bool -nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode) +nir_remove_dead_variables(nir_shader *shader, nir_variable_mode modes) { bool progress = false; struct set *live = @@ -128,22 +128,22 @@ nir_remove_dead_variables(nir_shader *shader, nir_variable_mode mode) add_var_use_shader(shader, live); - if (mode == nir_var_uniform || mode == nir_var_all) + if (modes & nir_var_uniform) progress = remove_dead_vars(&shader->uniforms, live) || progress; - if (mode == nir_var_shader_in || mode == nir_var_all) + if (modes & nir_var_shader_in) progress = remove_dead_vars(&shader->inputs, live) || progress; - if (mode == nir_var_shader_out || mode == nir_var_all) + if (modes & nir_var_shader_out) progress = remove_dead_vars(&shader->outputs, live) || progress; - if (mode == nir_var_global || mode == nir_var_all) + if (modes & nir_var_global) progress = remove_dead_vars(&shader->globals, live) || progress; - if (mode == nir_var_system_value || mode == nir_var_all) + if (modes & nir_var_system_value) progress = remove_dead_vars(&shader->system_values, live) || progress; - if (mode == nir_var_local || mode == nir_var_all) { + if (modes & nir_var_local) { nir_foreach_function(shader, function) { if (function->impl) { if (remove_dead_vars(&function->impl->locals, live)) { diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c index 3a65ab18928..dc53a9063c4 100644 --- a/src/compiler/nir/nir_search.c +++ b/src/compiler/nir/nir_search.c @@ -477,7 +477,8 @@ construct_value(const nir_search_value *value, case nir_search_value_constant: { const nir_search_constant *c = nir_search_value_as_constant(value); - nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1); + nir_load_const_instr *load = + nir_load_const_instr_create(mem_ctx, 1, bitsize->dest_size); switch (c->type) { case nir_type_float: @@ -528,8 +529,6 @@ construct_value(const nir_search_value *value, unreachable("Invalid alu source type"); } - load->def.bit_size = bitsize->dest_size; - nir_instr_insert_before(instr, &load->instr); nir_alu_src val; diff --git a/src/compiler/nir/nir_split_var_copies.c b/src/compiler/nir/nir_split_var_copies.c index 6fdaefa32c8..2b011077a7c 100644 --- a/src/compiler/nir/nir_split_var_copies.c +++ b/src/compiler/nir/nir_split_var_copies.c @@ -149,6 +149,7 @@ split_var_copy_instr(nir_intrinsic_instr *old_copy, case GLSL_TYPE_UINT: case GLSL_TYPE_INT: case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: case GLSL_TYPE_BOOL: if (glsl_type_is_matrix(src_tail->type)) { nir_deref_array *deref = nir_deref_array_create(state->dead_ctx); @@ -231,6 +232,7 @@ split_var_copies_block(nir_block *block, void *void_state) ralloc_steal(state->dead_ctx, instr); break; case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: case GLSL_TYPE_INT: case GLSL_TYPE_UINT: case GLSL_TYPE_BOOL: diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c index d588d7d2df3..23d709a218a 100644 --- a/src/compiler/nir/nir_to_ssa.c +++ b/src/compiler/nir/nir_to_ssa.c @@ -160,7 +160,8 @@ static nir_ssa_def *get_ssa_src(nir_register *reg, rewrite_state *state) * to preserve the information that this source is undefined */ nir_ssa_undef_instr *instr = - nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components); + nir_ssa_undef_instr_create(state->mem_ctx, reg->num_components, + reg->bit_size); /* * We could just insert the undefined instruction before the instruction diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 9f18d1c33e4..3c3306c75fb 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -903,6 +903,9 @@ validate_var_decl(nir_variable *var, bool is_global, validate_state *state) { assert(is_global == nir_variable_is_global(var)); + /* Must have exactly one mode set */ + assert(util_bitcount(var->data.mode) == 1); + /* * TODO validate some things ir_validate.cpp does (requires more GLSL type * support) diff --git a/src/compiler/nir/spirv/spirv_to_nir.c b/src/compiler/nir/spirv/spirv_to_nir.c index 948454494fa..99514b49650 100644 --- a/src/compiler/nir/spirv/spirv_to_nir.c +++ b/src/compiler/nir/spirv/spirv_to_nir.c @@ -86,7 +86,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, if (glsl_type_is_vector_or_scalar(type)) { unsigned num_components = glsl_get_vector_elements(val->type); nir_load_const_instr *load = - nir_load_const_instr_create(b->shader, num_components); + nir_load_const_instr_create(b->shader, num_components, 32); for (unsigned i = 0; i < num_components; i++) load->value.u32[i] = constant->value.u[i]; @@ -103,7 +103,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, struct vtn_ssa_value *col_val = rzalloc(b, struct vtn_ssa_value); col_val->type = glsl_get_column_type(val->type); nir_load_const_instr *load = - nir_load_const_instr_create(b->shader, rows); + nir_load_const_instr_create(b->shader, rows, 32); for (unsigned j = 0; j < rows; j++) load->value.u32[j] = constant->value.u[rows * i + j]; diff --git a/src/compiler/nir_types.cpp b/src/compiler/nir_types.cpp index 70e9cd397fc..62a1071e444 100644 --- a/src/compiler/nir_types.cpp +++ b/src/compiler/nir_types.cpp @@ -126,9 +126,9 @@ glsl_get_aoa_size(const struct glsl_type *type) unsigned glsl_count_attribute_slots(const struct glsl_type *type, - bool vertex_input_slots) + bool is_vertex_input) { - return type->count_attribute_slots(vertex_input_slots); + return type->count_attribute_slots(is_vertex_input); } const char * diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h index 5efdd85dea5..851096f9cc0 100644 --- a/src/compiler/nir_types.h +++ b/src/compiler/nir_types.h @@ -69,7 +69,7 @@ unsigned glsl_get_length(const struct glsl_type *type); unsigned glsl_get_aoa_size(const struct glsl_type *type); unsigned glsl_count_attribute_slots(const struct glsl_type *type, - bool vertex_input_slots); + bool is_vertex_input); const char *glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index); diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am index 82c2869b99b..296ed59317b 100644 --- a/src/gallium/auxiliary/Makefile.am +++ b/src/gallium/auxiliary/Makefile.am @@ -1,10 +1,11 @@ include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc -noinst_LTLIBRARIES = libgallium_nir.la +noinst_LTLIBRARIES = libgallium.la AM_CFLAGS = \ -I$(top_srcdir)/src/loader \ + -I$(top_builddir)/src/compiler/nir \ -I$(top_srcdir)/src/gallium/auxiliary/util \ $(GALLIUM_CFLAGS) \ $(VISIBILITY_CFLAGS) \ @@ -14,24 +15,11 @@ AM_CXXFLAGS = \ $(VISIBILITY_CXXFLAGS) \ $(MSVC2013_COMPAT_CXXFLAGS) -libgallium_nir_la_SOURCES = \ - $(NIR_SOURCES) - -libgallium_nir_la_CFLAGS = \ - -I$(top_builddir)/src/compiler/nir \ - $(GALLIUM_CFLAGS) \ - $(VISIBILITY_CFLAGS) \ - $(MSVC2013_COMPAT_CFLAGS) - -noinst_LTLIBRARIES += libgallium.la - libgallium_la_SOURCES = \ $(C_SOURCES) \ + $(NIR_SOURCES) \ $(GENERATED_SOURCES) -libgallium_la_LIBADD = \ - libgallium_nir.la - if HAVE_MESA_LLVM AM_CFLAGS += \ diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index 790e1211898..4e0cbdd8f9a 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -1539,6 +1539,8 @@ cso_save_state(struct cso_context *cso, unsigned state_mask) cso_save_vertex_shader(cso); if (state_mask & CSO_BIT_VIEWPORT) cso_save_viewport(cso); + if (state_mask & CSO_BIT_PAUSE_QUERIES) + cso->pipe->set_active_query_state(cso->pipe, false); } @@ -1590,6 +1592,8 @@ cso_restore_state(struct cso_context *cso) cso_restore_vertex_shader(cso); if (state_mask & CSO_BIT_VIEWPORT) cso_restore_viewport(cso); + if (state_mask & CSO_BIT_PAUSE_QUERIES) + cso->pipe->set_active_query_state(cso->pipe, true); cso->saved_state = 0; } diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h index a3563d83a02..e27cbe9f721 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.h +++ b/src/gallium/auxiliary/cso_cache/cso_context.h @@ -170,6 +170,7 @@ void cso_set_render_condition(struct cso_context *cso, #define CSO_BIT_VERTEX_ELEMENTS 0x10000 #define CSO_BIT_VERTEX_SHADER 0x20000 #define CSO_BIT_VIEWPORT 0x40000 +#define CSO_BIT_PAUSE_QUERIES 0x80000 #define CSO_BITS_ALL_SHADERS (CSO_BIT_VERTEX_SHADER | \ CSO_BIT_FRAGMENT_SHADER | \ diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 2ba9b099664..75551fbe2dd 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -749,7 +749,23 @@ draw_image(struct draw_context *draw, } } - +/** + * Provide TGSI buffer objects for vertex/geometry shaders that use + * load/store/atomic ops. This state only needs to be set once per context. + * This might only be used by software drivers for the time being. + */ +void +draw_buffer(struct draw_context *draw, + uint shader, + struct tgsi_buffer *buffer) +{ + if (shader == PIPE_SHADER_VERTEX) { + draw->vs.tgsi.buffer = buffer; + } else { + debug_assert(shader == PIPE_SHADER_GEOMETRY); + draw->gs.tgsi.buffer = buffer; + } +} void draw_set_render( struct draw_context *draw, diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 5d9870b115c..3e6722fcb7e 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -49,6 +49,7 @@ struct draw_geometry_shader; struct draw_fragment_shader; struct tgsi_sampler; struct tgsi_image; +struct tgsi_buffer; /* * structure to contain driver internal information @@ -161,6 +162,11 @@ draw_image(struct draw_context *draw, struct tgsi_image *image); void +draw_buffer(struct draw_context *draw, + uint shader_type, + struct tgsi_buffer *buffer); + +void draw_set_sampler_views(struct draw_context *draw, unsigned shader_stage, struct pipe_sampler_view **views, diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index 14db2d6f39d..ef217fa5ceb 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -681,7 +681,9 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader, if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) { tgsi_exec_machine_bind_shader(shader->machine, shader->state.tokens, - draw->gs.tgsi.sampler, draw->gs.tgsi.image); + draw->gs.tgsi.sampler, + draw->gs.tgsi.image, + draw->gs.tgsi.buffer); } } diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 211bd6f7e70..a18f6632124 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -67,6 +67,7 @@ struct vbuf_render; struct tgsi_exec_machine; struct tgsi_sampler; struct tgsi_image; +struct tgsi_buffer; struct draw_pt_front_end; struct draw_assembler; struct draw_llvm; @@ -269,6 +270,7 @@ struct draw_context struct tgsi_sampler *sampler; struct tgsi_image *image; + struct tgsi_buffer *buffer; } tgsi; struct translate *fetch; @@ -289,6 +291,7 @@ struct draw_context struct tgsi_sampler *sampler; struct tgsi_image *image; + struct tgsi_buffer *buffer; } tgsi; } gs; diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 5b53cff29f0..da0d1a7f9a8 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -70,7 +70,9 @@ vs_exec_prepare( struct draw_vertex_shader *shader, if (evs->machine->Tokens != shader->state.tokens) { tgsi_exec_machine_bind_shader(evs->machine, shader->state.tokens, - draw->vs.tgsi.sampler, draw->vs.tgsi.image); + draw->vs.tgsi.sampler, + draw->vs.tgsi.image, + draw->vs.tgsi.buffer); } } @@ -159,6 +161,7 @@ vs_exec_run_linear( struct draw_vertex_shader *shader, input = (const float (*)[4])((const char *)input + input_stride); } + machine->NonHelperMask = (1 << max_vertices) - 1; /* run interpreter */ tgsi_exec_machine_run( machine ); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 0c43617d531..beff4143fd8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1492,9 +1492,20 @@ lp_build_abs(struct lp_build_context *bld, return a; if(type.floating) { - char intrinsic[32]; - lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); - return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); + if (0x0306 <= HAVE_LLVM && HAVE_LLVM < 0x0309) { + /* Workaround llvm.org/PR27332 */ + LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type); + unsigned long long absMask = ~(1ULL << (type.width - 1)); + LLVMValueRef mask = lp_build_const_int_vec(bld->gallivm, type, ((unsigned long long) absMask)); + a = LLVMBuildBitCast(builder, a, int_vec_type, ""); + a = LLVMBuildAnd(builder, a, mask, ""); + a = LLVMBuildBitCast(builder, a, vec_type, ""); + return a; + } else { + char intrinsic[32]; + lp_format_intrinsic(intrinsic, sizeof intrinsic, "llvm.fabs", vec_type); + return lp_build_intrinsic_unary(builder, intrinsic, vec_type, a); + } } if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { @@ -1663,99 +1674,6 @@ enum lp_build_round_mode LP_BUILD_ROUND_TRUNCATE = 3 }; -/** - * Helper for SSE4.1's ROUNDxx instructions. - * - * NOTE: In the SSE4.1's nearest mode, if two values are equally close, the - * result is the even value. That is, rounding 2.5 will be 2.0, and not 3.0. - */ -static inline LLVMValueRef -lp_build_nearest_sse41(struct lp_build_context *bld, - LLVMValueRef a) -{ - LLVMBuilderRef builder = bld->gallivm->builder; - const struct lp_type type = bld->type; - LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context); - LLVMValueRef mode = LLVMConstNull(i32t); - const char *intrinsic; - LLVMValueRef res; - - assert(type.floating); - - assert(lp_check_value(type, a)); - assert(util_cpu_caps.has_sse4_1); - - if (type.length == 1) { - LLVMTypeRef vec_type; - LLVMValueRef undef; - LLVMValueRef args[3]; - LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); - - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ss"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.sd"; - break; - default: - assert(0); - return bld->undef; - } - - vec_type = LLVMVectorType(bld->elem_type, 4); - - undef = LLVMGetUndef(vec_type); - - args[0] = undef; - args[1] = LLVMBuildInsertElement(builder, undef, a, index0, ""); - args[2] = mode; - - res = lp_build_intrinsic(builder, intrinsic, - vec_type, args, Elements(args), 0); - - res = LLVMBuildExtractElement(builder, res, index0, ""); - } - else { - if (type.width * type.length == 128) { - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; - } - } - else { - assert(type.width * type.length == 256); - assert(util_cpu_caps.has_avx); - - switch(type.width) { - case 32: - intrinsic = "llvm.x86.avx.round.ps.256"; - break; - case 64: - intrinsic = "llvm.x86.avx.round.pd.256"; - break; - default: - assert(0); - return bld->undef; - } - } - - res = lp_build_intrinsic_binary(builder, intrinsic, - bld->vec_type, a, - mode); - } - - return res; -} - - static inline LLVMValueRef lp_build_iround_nearest_sse2(struct lp_build_context *bld, LLVMValueRef a) @@ -1863,11 +1781,7 @@ lp_build_round_arch(struct lp_build_context *bld, switch (mode) { case LP_BUILD_ROUND_NEAREST: - if (HAVE_LLVM >= 0x0304) { - intrinsic_root = "llvm.round"; - } else { - return lp_build_nearest_sse41(bld, a); - } + intrinsic_root = "llvm.nearbyint"; break; case LP_BUILD_ROUND_FLOOR: intrinsic_root = "llvm.floor"; diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c index 4673458171e..40017c8614f 100644 --- a/src/gallium/auxiliary/hud/hud_context.c +++ b/src/gallium/auxiliary/hud/hud_context.c @@ -477,6 +477,7 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex) CSO_BIT_VERTEX_SHADER | CSO_BIT_VERTEX_ELEMENTS | CSO_BIT_AUX_VERTEX_BUFFER_SLOT | + CSO_BIT_PAUSE_QUERIES | CSO_BIT_RENDER_CONDITION)); cso_save_constant_buffer_slot0(cso, PIPE_SHADER_VERTEX); diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index 7ec8b662200..d76b6d900ce 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -454,7 +454,7 @@ ttn_emit_immediate(struct ttn_compile *c) nir_load_const_instr *load_const; int i; - load_const = nir_load_const_instr_create(b->shader, 4); + load_const = nir_load_const_instr_create(b->shader, 4, 32); c->imm_defs[c->next_imm] = &load_const->def; c->next_imm++; diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h index 803c1d39192..33c23068c27 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h @@ -87,9 +87,9 @@ struct pb_desc /** - * Size. Regular (32bit) unsigned for now. + * 64-bit type for GPU buffer sizes and offsets. */ -typedef unsigned pb_size; +typedef uint64_t pb_size; /** @@ -98,8 +98,8 @@ typedef unsigned pb_size; struct pb_buffer { struct pipe_reference reference; - unsigned size; unsigned alignment; + pb_size size; unsigned usage; /** diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c index fbbe8d11eb0..64af321558e 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c @@ -40,6 +40,7 @@ #include <unistd.h> #include <sched.h> #endif +#include <inttypes.h> #include "pipe/p_compiler.h" #include "pipe/p_defines.h" @@ -208,7 +209,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) while (curr != &fenced_mgr->unfenced) { fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); assert(!fenced_buf->fence); - debug_printf("%10p %7u %8u %7s\n", + debug_printf("%10p %"PRIu64" %8u %7s\n", (void *) fenced_buf, fenced_buf->base.size, p_atomic_read(&fenced_buf->base.reference.count), @@ -224,7 +225,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); assert(fenced_buf->buffer); signaled = ops->fence_signalled(ops, fenced_buf->fence, 0); - debug_printf("%10p %7u %8u %7s %10p %s\n", + debug_printf("%10p %"PRIu64" %8u %7s %10p %s\n", (void *) fenced_buf, fenced_buf->base.size, p_atomic_read(&fenced_buf->base.reference.count), diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c index 3d3a7aba7fb..4e36866e08c 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c @@ -41,6 +41,7 @@ #include "util/list.h" #include "util/u_time.h" #include "util/u_debug_stack.h" +#include <inttypes.h> #include "pb_buffer.h" #include "pb_bufmgr.h" @@ -190,7 +191,7 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf) underflow = !check_random_pattern(map, buf->underflow_size, &min_ofs, &max_ofs); if(underflow) { - debug_printf("buffer underflow (offset -%u%s to -%u bytes) detected\n", + debug_printf("buffer underflow (offset -%"PRIu64"%s to -%"PRIu64" bytes) detected\n", buf->underflow_size - min_ofs, min_ofs == 0 ? "+" : "", buf->underflow_size - max_ofs); @@ -200,7 +201,7 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf) buf->overflow_size, &min_ofs, &max_ofs); if(overflow) { - debug_printf("buffer overflow (size %u plus offset %u to %u%s bytes) detected\n", + debug_printf("buffer overflow (size %"PRIu64" plus offset %"PRIu64" to %"PRIu64"%s bytes) detected\n", buf->base.size, min_ofs, max_ofs, @@ -349,7 +350,7 @@ pb_debug_manager_dump_locked(struct pb_debug_manager *mgr) buf = LIST_ENTRY(struct pb_debug_buffer, curr, head); debug_printf("buffer = %p\n", (void *) buf); - debug_printf(" .size = 0x%x\n", buf->base.size); + debug_printf(" .size = 0x%"PRIx64"\n", buf->base.size); debug_backtrace_dump(buf->create_backtrace, PB_DEBUG_CREATE_BACKTRACE); curr = next; diff --git a/src/gallium/auxiliary/postprocess/pp_run.c b/src/gallium/auxiliary/postprocess/pp_run.c index 9dc8fb51ae2..bc79c5aab6e 100644 --- a/src/gallium/auxiliary/postprocess/pp_run.c +++ b/src/gallium/auxiliary/postprocess/pp_run.c @@ -133,6 +133,7 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in, CSO_BIT_VERTEX_SHADER | CSO_BIT_VIEWPORT | CSO_BIT_AUX_VERTEX_BUFFER_SLOT | + CSO_BIT_PAUSE_QUERIES | CSO_BIT_RENDER_CONDITION)); cso_save_constant_buffer_slot0(cso, PIPE_SHADER_VERTEX); cso_save_constant_buffer_slot0(cso, PIPE_SHADER_FRAGMENT); diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 27ee8f1242a..3e7d699627a 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -2203,7 +2203,7 @@ voidptr_to_x86_func(void *v) void *v; x86_func f; } u; - assert(sizeof(u.v) == sizeof(u.f)); + STATIC_ASSERT(sizeof(u.v) == sizeof(u.f)); u.v = v; return u.f; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index a595bbbc6d3..41dd0f0466a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -854,7 +854,8 @@ tgsi_exec_machine_bind_shader( struct tgsi_exec_machine *mach, const struct tgsi_token *tokens, struct tgsi_sampler *sampler, - struct tgsi_image *image) + struct tgsi_image *image, + struct tgsi_buffer *buffer) { uint k; struct tgsi_parse_context parse; @@ -873,6 +874,7 @@ tgsi_exec_machine_bind_shader( mach->Tokens = tokens; mach->Sampler = sampler; mach->Image = image; + mach->Buffer = buffer; if (!tokens) { /* unbind and free all */ @@ -3758,8 +3760,8 @@ get_image_coord_sample(unsigned tgsi_tex) } static void -exec_load(struct tgsi_exec_machine *mach, - const struct tgsi_full_instruction *inst) +exec_load_img(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) { union tgsi_exec_channel r[4], sample_r; uint unit; @@ -3805,8 +3807,51 @@ exec_load(struct tgsi_exec_machine *mach, } static void -exec_store(struct tgsi_exec_machine *mach, - const struct tgsi_full_instruction *inst) +exec_load_buf(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + union tgsi_exec_channel r[4]; + uint unit; + int j; + uint chan; + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + struct tgsi_buffer_params params; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + + unit = fetch_sampler_unit(mach, inst, 0); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + IFETCH(&r[0], 1, TGSI_CHAN_X); + + mach->Buffer->load(mach->Buffer, ¶ms, + r[0].i, rgba); + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + r[0].f[j] = rgba[0][j]; + r[1].f[j] = rgba[1][j]; + r[2].f[j] = rgba[2][j]; + r[3].f[j] = rgba[3][j]; + } + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); + } + } +} + +static void +exec_load(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) + exec_load_img(mach, inst); + else + exec_load_buf(mach, inst); +} + +static void +exec_store_img(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) { union tgsi_exec_channel r[3], sample_r; union tgsi_exec_channel value[4]; @@ -3850,8 +3895,53 @@ exec_store(struct tgsi_exec_machine *mach, } static void -exec_atomop(struct tgsi_exec_machine *mach, - const struct tgsi_full_instruction *inst) +exec_store_buf(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + union tgsi_exec_channel r[3]; + union tgsi_exec_channel value[4]; + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + struct tgsi_buffer_params params; + int i, j; + uint unit; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + + unit = inst->Dst[0].Register.Index; + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + params.writemask = inst->Dst[0].Register.WriteMask; + + IFETCH(&r[0], 0, TGSI_CHAN_X); + for (i = 0; i < 4; i++) { + FETCH(&value[i], 1, TGSI_CHAN_X + i); + } + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba[0][j] = value[0].f[j]; + rgba[1][j] = value[1].f[j]; + rgba[2][j] = value[2].f[j]; + rgba[3][j] = value[3].f[j]; + } + + mach->Buffer->store(mach->Buffer, ¶ms, + r[0].i, + rgba); +} + +static void +exec_store(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) + exec_store_img(mach, inst); + else + exec_store_buf(mach, inst); +} + +static void +exec_atomop_img(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) { union tgsi_exec_channel r[4], sample_r; union tgsi_exec_channel value[4], value2[4]; @@ -3918,8 +4008,77 @@ exec_atomop(struct tgsi_exec_machine *mach, } static void -exec_resq(struct tgsi_exec_machine *mach, - const struct tgsi_full_instruction *inst) +exec_atomop_buf(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + union tgsi_exec_channel r[4]; + union tgsi_exec_channel value[4], value2[4]; + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + struct tgsi_buffer_params params; + int i, j; + uint unit, chan; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + + unit = fetch_sampler_unit(mach, inst, 0); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + params.writemask = inst->Dst[0].Register.WriteMask; + + IFETCH(&r[0], 1, TGSI_CHAN_X); + + for (i = 0; i < 4; i++) { + FETCH(&value[i], 2, TGSI_CHAN_X + i); + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) + FETCH(&value2[i], 3, TGSI_CHAN_X + i); + } + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba[0][j] = value[0].f[j]; + rgba[1][j] = value[1].f[j]; + rgba[2][j] = value[2].f[j]; + rgba[3][j] = value[3].f[j]; + } + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba2[0][j] = value2[0].f[j]; + rgba2[1][j] = value2[1].f[j]; + rgba2[2][j] = value2[2].f[j]; + rgba2[3][j] = value2[3].f[j]; + } + } + + mach->Buffer->op(mach->Buffer, ¶ms, inst->Instruction.Opcode, + r[0].i, + rgba, rgba2); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + r[0].f[j] = rgba[0][j]; + r[1].f[j] = rgba[1][j]; + r[2].f[j] = rgba[2][j]; + r[3].f[j] = rgba[3][j]; + } + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); + } + } +} + +static void +exec_atomop(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) + exec_atomop_img(mach, inst); + else + exec_atomop_buf(mach, inst); +} + +static void +exec_resq_img(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) { int result[4]; union tgsi_exec_channel r[4]; @@ -3952,6 +4111,46 @@ exec_resq(struct tgsi_exec_machine *mach, } static void +exec_resq_buf(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + int result; + union tgsi_exec_channel r[4]; + uint unit; + int i, chan; + struct tgsi_buffer_params params; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + + unit = fetch_sampler_unit(mach, inst, 0); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + + mach->Buffer->get_dims(mach->Buffer, ¶ms, &result); + + for (i = 0; i < TGSI_QUAD_SIZE; i++) { + r[0].i[i] = result; + } + + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, + TGSI_EXEC_DATA_INT); + } + } +} + +static void +exec_resq(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) + exec_resq_img(mach, inst); + else + exec_resq_buf(mach, inst); +} + +static void micro_i2f(union tgsi_exec_channel *dst, const union tgsi_exec_channel *src) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h index 45fb8d43c88..42fb922baa5 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h @@ -138,6 +138,36 @@ struct tgsi_image { int dims[4]); }; +struct tgsi_buffer_params { + unsigned unit; + unsigned execmask; + unsigned writemask; +}; + +struct tgsi_buffer { + /* buffer interfaces */ + void (*load)(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + const int s[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); + + void (*store)(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + const int s[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); + + void (*op)(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + unsigned opcode, + const int s[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); + + void (*get_dims)(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + int *dim); +}; + /** * Information for sampling textures, which must be implemented * by code outside the TGSI executor. @@ -334,6 +364,7 @@ struct tgsi_exec_machine struct tgsi_sampler *Sampler; struct tgsi_image *Image; + struct tgsi_buffer *Buffer; unsigned ImmLimit; const void *Consts[PIPE_MAX_CONSTANT_BUFFERS]; @@ -424,7 +455,8 @@ tgsi_exec_machine_bind_shader( struct tgsi_exec_machine *mach, const struct tgsi_token *tokens, struct tgsi_sampler *sampler, - struct tgsi_image *image); + struct tgsi_image *image, + struct tgsi_buffer *buffer); uint tgsi_exec_machine_run( @@ -496,8 +528,9 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param) return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: - case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: return 0; + case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: + return PIPE_MAX_SHADER_BUFFERS; case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return PIPE_MAX_SHADER_IMAGES; diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/src/gallium/auxiliary/tgsi/tgsi_lowering.c index a3b90bdb509..0ffd855793a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c +++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c @@ -1430,7 +1430,7 @@ tgsi_transform_lowering(const struct tgsi_lowering_config *config, int newlen, numtmp; /* sanity check in case limit is ever increased: */ - assert((sizeof(config->saturate_s) * 8) >= PIPE_MAX_SAMPLERS); + STATIC_ASSERT((sizeof(config->saturate_s) * 8) >= PIPE_MAX_SAMPLERS); memset(&ctx, 0, sizeof(ctx)); ctx.base.transform_instruction = transform_instr; diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index ae95ebd82a4..16564ddf301 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -313,7 +313,7 @@ tgsi_dump_tokens(const struct tgsi_token *tokens) int nr = tgsi_num_tokens(tokens); int i; - assert(sizeof(*tokens) == sizeof(unsigned)); + STATIC_ASSERT(sizeof(*tokens) == sizeof(unsigned)); debug_printf("const unsigned tokens[%d] = {\n", nr); for (i = 0; i < nr; i++) diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index 22c40d1382d..3677515423c 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -551,6 +551,7 @@ util_blit_pixels_tex(struct blit_state *ctx, CSO_BIT_STREAM_OUTPUTS | CSO_BIT_VIEWPORT | CSO_BIT_FRAMEBUFFER | + CSO_BIT_PAUSE_QUERIES | CSO_BIT_FRAGMENT_SHADER | CSO_BIT_VERTEX_SHADER | CSO_BIT_TESSCTRL_SHADER | diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index 43fbd8e6452..3ca2c48c4c7 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -529,6 +529,8 @@ static void blitter_set_running_flag(struct blitter_context_priv *ctx) __LINE__); } ctx->base.running = TRUE; + + ctx->base.pipe->set_active_query_state(ctx->base.pipe, false); } static void blitter_unset_running_flag(struct blitter_context_priv *ctx) @@ -538,6 +540,8 @@ static void blitter_unset_running_flag(struct blitter_context_priv *ctx) __LINE__); } ctx->base.running = FALSE; + + ctx->base.pipe->set_active_query_state(ctx->base.pipe, true); } static void blitter_check_saved_vertex_states(struct blitter_context_priv *ctx) diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index e92f83a8109..b4ac0db3c50 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -792,6 +792,12 @@ align(int value, int alignment) return (value + alignment - 1) & ~(alignment - 1); } +static inline uint64_t +align64(uint64_t value, unsigned alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + /** * Works like align but on npot alignments. */ diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst index 904e1ff04e7..3a45f402cd8 100644 --- a/src/gallium/docs/source/context.rst +++ b/src/gallium/docs/source/context.rst @@ -330,6 +330,9 @@ a resource without synchronizing with the CPU. This write will optionally wait for the query to complete, and will optionally write whether the value is available instead of the value itself. +``set_active_query_state`` Set whether all current non-driver queries except +TIME_ELAPSED are active or paused. + The interface currently includes the following types of queries: ``PIPE_QUERY_OCCLUSION_COUNTER`` counts the number of fragments which diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index 824f580ed44..94510757254 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -331,6 +331,11 @@ The integer capabilities: primitive on a layer is obtained from ``PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS`` even though it can be larger than the number of layers supported by either rendering or textures. +* ``PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR``: Implementation uses bounds + checking on resource accesses by shader if the context is created with + PIPE_CONTEXT_ROBUST_BUFFER_ACCESS. See the ARB_robust_buffer_access_behavior + extension for information on the required behavior for out of bounds accesses + and accesses to unbound resources. .. _pipe_capf: diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index ac6052a244a..85c302f0dc3 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -2710,7 +2710,7 @@ TGSI_SEMANTIC_COLOR """"""""""""""""""" For vertex shader outputs or fragment shader inputs/outputs, this -label indicates that the resister contains an R,G,B,A color. +label indicates that the register contains an R,G,B,A color. Several shader inputs/outputs may contain colors so the semantic index is used to distinguish them. For example, color[0] may be the diffuse diff --git a/src/gallium/drivers/ddebug/dd_context.c b/src/gallium/drivers/ddebug/dd_context.c index 9dfaa0af289..72a950a456a 100644 --- a/src/gallium/drivers/ddebug/dd_context.c +++ b/src/gallium/drivers/ddebug/dd_context.c @@ -124,6 +124,14 @@ dd_context_get_query_result(struct pipe_context *_pipe, } static void +dd_context_set_active_query_state(struct pipe_context *_pipe, boolean enable) +{ + struct pipe_context *pipe = dd_context(_pipe)->pipe; + + pipe->set_active_query_state(pipe, enable); +} + +static void dd_context_render_condition(struct pipe_context *_pipe, struct pipe_query *query, boolean condition, uint mode) @@ -667,6 +675,7 @@ dd_context_create(struct dd_screen *dscreen, struct pipe_context *pipe) CTX_INIT(begin_query); CTX_INIT(end_query); CTX_INIT(get_query_result); + CTX_INIT(set_active_query_state); CTX_INIT(create_blend_state); CTX_INIT(bind_blend_state); CTX_INIT(delete_blend_state); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index c34f9441c7b..e874d223187 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -157,7 +157,24 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info) emit.dirty = dirty; emit.vp = NULL; /* we changed key so need to refetch vp */ emit.fp = NULL; + + if (ctx->rasterizer->rasterizer_discard) { + fd_wfi(ctx, ctx->ring); + OUT_PKT3(ctx->ring, CP_REG_RMW, 3); + OUT_RING(ctx->ring, REG_A4XX_RB_RENDER_CONTROL); + OUT_RING(ctx->ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + OUT_RING(ctx->ring, A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + } + draw_impl(ctx, ctx->ring, &emit); + + if (ctx->rasterizer->rasterizer_discard) { + fd_wfi(ctx, ctx->ring); + OUT_PKT3(ctx->ring, CP_REG_RMW, 3); + OUT_RING(ctx->ring, REG_A4XX_RB_RENDER_CONTROL); + OUT_RING(ctx->ring, ~A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE); + OUT_RING(ctx->ring, 0); + } } /* clear operations ignore viewport state, so we need to reset it diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index 77e203f6c56..69decbcb251 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -82,12 +82,7 @@ static uint64_t count_samples(const struct fd_rb_samp_ctrs *start, const struct fd_rb_samp_ctrs *end) { - uint64_t n = 0; - - for (unsigned i = 0; i < 16; i += 4) - n += end->ctr[i] - start->ctr[i]; - - return n / 2; + return end->ctr[0] - start->ctr[0]; } static void diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h index 85ce97c16b7..86992960960 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.h +++ b/src/gallium/drivers/freedreno/freedreno_context.h @@ -241,6 +241,7 @@ struct fd_context { */ struct { uint64_t prims_emitted; + uint64_t prims_generated; uint64_t draw_calls; uint64_t batch_total, batch_sysmem, batch_gmem, batch_restore; } stats; diff --git a/src/gallium/drivers/freedreno/freedreno_draw.c b/src/gallium/drivers/freedreno/freedreno_draw.c index bf803cc77bc..66bb1163df2 100644 --- a/src/gallium/drivers/freedreno/freedreno_draw.c +++ b/src/gallium/drivers/freedreno/freedreno_draw.c @@ -174,7 +174,16 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) prims = u_reduced_prims_for_vertices(info->mode, info->count); ctx->stats.draw_calls++; - ctx->stats.prims_emitted += prims; + + /* TODO prims_emitted should be clipped when the stream-out buffer is + * not large enough. See max_tf_vtx().. probably need to move that + * into common code. Although a bit more annoying since a2xx doesn't + * use ir3 so no common way to get at the pipe_stream_output_info + * which is needed for this calculation. + */ + if (ctx->streamout.num_targets > 0) + ctx->stats.prims_emitted += prims; + ctx->stats.prims_generated += prims; /* any buffers that haven't been cleared yet, we need to restore: */ ctx->restore |= buffers & (FD_BUFFER_ALL & ~ctx->cleared); @@ -189,7 +198,7 @@ fd_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) ctx->draw_vbo(ctx, info); for (i = 0; i < ctx->streamout.num_targets; i++) - ctx->streamout.offsets[i] += prims; + ctx->streamout.offsets[i] += info->count; if (fd_mesa_debug & FD_DBG_DDRAW) ctx->dirty = 0xffffffff; diff --git a/src/gallium/drivers/freedreno/freedreno_query.c b/src/gallium/drivers/freedreno/freedreno_query.c index b87e8250719..a9427058579 100644 --- a/src/gallium/drivers/freedreno/freedreno_query.c +++ b/src/gallium/drivers/freedreno/freedreno_query.c @@ -114,6 +114,11 @@ fd_get_driver_query_info(struct pipe_screen *pscreen, return 1; } +static void +fd_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void fd_query_screen_init(struct pipe_screen *pscreen) { @@ -128,5 +133,6 @@ fd_query_context_init(struct pipe_context *pctx) pctx->begin_query = fd_begin_query; pctx->end_query = fd_end_query; pctx->get_query_result = fd_get_query_result; + pctx->set_active_query_state = fd_set_active_query_state; pctx->render_condition = fd_render_condition; } diff --git a/src/gallium/drivers/freedreno/freedreno_query_sw.c b/src/gallium/drivers/freedreno/freedreno_query_sw.c index 514df145fa8..4af6a125e03 100644 --- a/src/gallium/drivers/freedreno/freedreno_query_sw.c +++ b/src/gallium/drivers/freedreno/freedreno_query_sw.c @@ -54,7 +54,7 @@ read_counter(struct fd_context *ctx, int type) { switch (type) { case PIPE_QUERY_PRIMITIVES_GENERATED: - /* for now same thing as _PRIMITIVES_EMITTED */ + return ctx->stats.prims_generated; case PIPE_QUERY_PRIMITIVES_EMITTED: return ctx->stats.prims_emitted; case FD_QUERY_DRAW_CALLS: diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 707be17513b..05100186495 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -71,6 +71,7 @@ static const struct debug_named_value debug_options[] = { {"glsl120", FD_DBG_GLSL120,"Temporary flag to force GLSL 1.20 (rather than 1.30) on a3xx+"}, {"shaderdb", FD_DBG_SHADERDB, "Enable shaderdb output"}, {"flush", FD_DBG_FLUSH, "Force flush after every draw"}, + {"deqp", FD_DBG_DEQP, "Enable dEQP hacks"}, DEBUG_NAMED_VALUE_END }; @@ -256,6 +257,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -352,6 +354,16 @@ fd_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param) switch (param) { case PIPE_CAPF_MAX_LINE_WIDTH: case PIPE_CAPF_MAX_LINE_WIDTH_AA: + /* NOTE: actual value is 127.0f, but this is working around a deqp + * bug.. dEQP-GLES3.functional.rasterization.primitives.lines_wide + * uses too small of a render target size, and gets confused when + * the lines start going offscreen. + * + * See: https://code.google.com/p/android/issues/detail?id=206513 + */ + if (fd_mesa_debug & FD_DBG_DEQP) + return 63.0f; + return 127.0f; case PIPE_CAPF_MAX_POINT_WIDTH: case PIPE_CAPF_MAX_POINT_WIDTH_AA: return 4092.0f; diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c index 685d3a75659..6c472d19815 100644 --- a/src/gallium/drivers/freedreno/freedreno_state.c +++ b/src/gallium/drivers/freedreno/freedreno_state.c @@ -359,7 +359,8 @@ fd_set_stream_output_targets(struct pipe_context *pctx, if (!changed && append) continue; - so->offsets[i] = 0; + if (!append) + so->offsets[i] = offsets[i]; pipe_so_target_reference(&so->targets[i], targets[i]); } diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h index 47dd467f498..85dac982314 100644 --- a/src/gallium/drivers/freedreno/freedreno_util.h +++ b/src/gallium/drivers/freedreno/freedreno_util.h @@ -73,6 +73,7 @@ enum adreno_stencil_op fd_stencil_op(unsigned op); #define FD_DBG_GLSL120 0x0400 #define FD_DBG_SHADERDB 0x0800 #define FD_DBG_FLUSH 0x1000 +#define FD_DBG_DEQP 0x2000 extern int fd_mesa_debug; extern bool fd_binning_enabled; diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h index 3859f6a39f3..f68275e568c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3.h +++ b/src/gallium/drivers/freedreno/ir3/ir3.h @@ -308,8 +308,14 @@ struct ir3_instruction { static inline struct ir3_instruction * ir3_neighbor_first(struct ir3_instruction *instr) { - while (instr->cp.left) + int cnt = 0; + while (instr->cp.left) { instr = instr->cp.left; + if (++cnt > 0xffff) { + debug_assert(0); + break; + } + } return instr; } @@ -322,6 +328,10 @@ static inline int ir3_neighbor_count(struct ir3_instruction *instr) while (instr->cp.right) { num++; instr = instr->cp.right; + if (num > 0xffff) { + debug_assert(0); + break; + } } return num; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 245b61f31e5..940ca7744a2 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -103,6 +103,11 @@ struct ir3_compile { */ bool unminify_coords; + /* on a4xx, for array textures we need to add 0.5 to the array + * index coordinate: + */ + bool array_index_add_half; + /* for looking up which system value is which */ unsigned sysval_semantics[8]; @@ -128,11 +133,13 @@ compile_init(struct ir3_compiler *compiler, ctx->flat_bypass = true; ctx->levels_add_one = false; ctx->unminify_coords = false; + ctx->array_index_add_half = true; } else { /* no special handling for "flat" */ ctx->flat_bypass = false; ctx->levels_add_one = true; ctx->unminify_coords = true; + ctx->array_index_add_half = false; } ctx->compiler = compiler; @@ -1447,9 +1454,8 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) } /* the array coord for cube arrays needs 0.5 added to it */ - if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && tex->is_array && - opc != OPC_ISAML) - coord[3] = ir3_ADD_F(b, coord[3], 0, create_immed(b, fui(0.5)), 0); + if (ctx->array_index_add_half && tex->is_array && (opc != OPC_ISAML)) + coord[coords] = ir3_ADD_F(b, coord[coords], 0, create_immed(b, fui(0.5)), 0); /* * lay out the first argument in the proper order: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c index 6037becf22f..e8a2f099391 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c @@ -442,6 +442,37 @@ instr_cp(struct ir3_instruction *instr) instr_cp(instr->address); ir3_instr_set_address(instr, eliminate_output_mov(instr->address)); } + + /* we can end up with extra cmps.s from frontend, which uses a + * + * cmps.s p0.x, cond, 0 + * + * as a way to mov into the predicate register. But frequently 'cond' + * is itself a cmps.s/cmps.f/cmps.u. So detect this special case and + * just re-write the instruction writing predicate register to get rid + * of the double cmps. + */ + if ((instr->opc == OPC_CMPS_S) && + (instr->regs[0]->num == regid(REG_P0, 0)) && + ssa(instr->regs[1]) && + (instr->regs[2]->flags & IR3_REG_IMMED) && + (instr->regs[2]->iim_val == 0)) { + struct ir3_instruction *cond = ssa(instr->regs[1]); + switch (cond->opc) { + case OPC_CMPS_S: + case OPC_CMPS_F: + case OPC_CMPS_U: + instr->opc = cond->opc; + instr->flags = cond->flags; + instr->cat2 = cond->cat2; + instr->address = cond->address; + instr->regs[1] = cond->regs[1]; + instr->regs[2] = cond->regs[2]; + break; + default: + break; + } + } } void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c index 77cd0e622f0..7a49f4c371c 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c @@ -183,7 +183,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block) ctx->has_samp = true; regmask_set(&needs_sy, n->regs[0]); } else if (is_load(n)) { - regmask_set(&needs_sy, n->regs[0]); + /* seems like ldlv needs (ss) bit instead?? which is odd but + * makes a bunch of flat-varying tests start working on a4xx. + */ + if (n->opc == OPC_LDLV) + regmask_set(&needs_ss, n->regs[0]); + else + regmask_set(&needs_sy, n->regs[0]); } /* both tex/sfu appear to not always immediately consume diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c index 73c65d6ad27..897b3b963be 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c @@ -45,6 +45,7 @@ ir3_tgsi_to_nir(const struct tgsi_token *tokens) .lower_flrp = true, .lower_ffract = true, .native_integers = true, + .vertex_id_zero_based = true, .lower_extract_byte = true, .lower_extract_word = true, }; @@ -141,7 +142,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s, } while (progress); - OPT_V(s, nir_remove_dead_variables); + OPT_V(s, nir_remove_dead_variables, nir_var_local); if (fd_mesa_debug & FD_DBG_DISASM) { debug_printf("----------------------\n"); diff --git a/src/gallium/drivers/i915/i915_query.c b/src/gallium/drivers/i915/i915_query.c index 78d67cea2c9..fa1b01d1804 100644 --- a/src/gallium/drivers/i915/i915_query.c +++ b/src/gallium/drivers/i915/i915_query.c @@ -76,6 +76,11 @@ static boolean i915_get_query_result(struct pipe_context *ctx, return TRUE; } +static void +i915_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void i915_init_query_functions(struct i915_context *i915) { @@ -84,5 +89,6 @@ i915_init_query_functions(struct i915_context *i915) i915->base.begin_query = i915_begin_query; i915->base.end_query = i915_end_query; i915->base.get_query_result = i915_get_query_result; + i915->base.set_active_query_state = i915_set_active_query_state; } diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 68e32e51c34..9b6a6604965 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -270,6 +270,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/ilo/ilo_query.c b/src/gallium/drivers/ilo/ilo_query.c index 106bd42a335..8a42f58a87f 100644 --- a/src/gallium/drivers/ilo/ilo_query.c +++ b/src/gallium/drivers/ilo/ilo_query.c @@ -222,6 +222,11 @@ ilo_get_query_result(struct pipe_context *pipe, struct pipe_query *query, return true; } +static void +ilo_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + /** * Initialize query-related functions. */ @@ -233,4 +238,5 @@ ilo_init_query_functions(struct ilo_context *ilo) ilo->base.begin_query = ilo_begin_query; ilo->base.end_query = ilo_end_query; ilo->base.get_query_result = ilo_get_query_result; + ilo->base.set_active_query_state = ilo_set_active_query_state; } diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index 142d6f1fa21..538f817e242 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -499,6 +499,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c index fc593670671..2fddc90503f 100644 --- a/src/gallium/drivers/llvmpipe/lp_query.c +++ b/src/gallium/drivers/llvmpipe/lp_query.c @@ -320,6 +320,11 @@ llvmpipe_check_render_cond(struct llvmpipe_context *lp) return TRUE; } +static void +llvmpipe_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe ) { llvmpipe->pipe.create_query = llvmpipe_create_query; @@ -327,6 +332,7 @@ void llvmpipe_init_query_funcs(struct llvmpipe_context *llvmpipe ) llvmpipe->pipe.begin_query = llvmpipe_begin_query; llvmpipe->pipe.end_query = llvmpipe_end_query; llvmpipe->pipe.get_query_result = llvmpipe_get_query_result; + llvmpipe->pipe.set_active_query_state = llvmpipe_set_active_query_state; } diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 6a5f906adc6..cb681bac939 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -320,6 +320,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_test_arit.c b/src/gallium/drivers/llvmpipe/lp_test_arit.c index a0f2db780bb..ba831f37c05 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_arit.c +++ b/src/gallium/drivers/llvmpipe/lp_test_arit.c @@ -218,6 +218,7 @@ const float round_values[] = { -10.0, -1, 0.0, 12.0, -1.49, -0.25, 1.25, 2.51, -0.99, -0.01, 0.01, 0.99, + -1.5, -0.5, 0.5, 1.5, 1.401298464324817e-45f, // smallest denormal -1.401298464324817e-45f, 1.62981451e-08f, @@ -283,7 +284,7 @@ unary_tests[] = { {"sin", &lp_build_sin, &sinf, sincos_values, Elements(sincos_values), 20.0 }, {"cos", &lp_build_cos, &cosf, sincos_values, Elements(sincos_values), 20.0 }, {"sgn", &lp_build_sgn, &sgnf, exp2_values, Elements(exp2_values), 20.0 }, - {"round", &lp_build_round, &roundf, round_values, Elements(round_values), 24.0 }, + {"round", &lp_build_round, &nearbyintf, round_values, Elements(round_values), 24.0 }, {"trunc", &lp_build_trunc, &truncf, round_values, Elements(round_values), 24.0 }, {"floor", &lp_build_floor, &floorf, round_values, Elements(round_values), 24.0 }, {"ceil", &lp_build_ceil, &ceilf, round_values, Elements(round_values), 24.0 }, diff --git a/src/gallium/drivers/noop/noop_pipe.c b/src/gallium/drivers/noop/noop_pipe.c index fd0a5d0f830..55aca74628e 100644 --- a/src/gallium/drivers/noop/noop_pipe.c +++ b/src/gallium/drivers/noop/noop_pipe.c @@ -78,6 +78,11 @@ static boolean noop_get_query_result(struct pipe_context *ctx, return TRUE; } +static void +noop_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + /* * resource @@ -284,6 +289,7 @@ static struct pipe_context *noop_create_context(struct pipe_screen *screen, ctx->begin_query = noop_begin_query; ctx->end_query = noop_end_query; ctx->get_query_result = noop_get_query_result; + ctx->set_active_query_state = noop_set_active_query_state; ctx->transfer_map = noop_transfer_map; ctx->transfer_flush_region = noop_transfer_flush_region; ctx->transfer_unmap = noop_transfer_unmap; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index 500ab8915de..1b595aec364 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -1327,7 +1327,11 @@ GCRA::simplify() bestScore = score; } } +#if __cplusplus >= 201103L + if (std::isinf(bestScore)) { +#else if (isinf(bestScore)) { +#endif ERROR("no viable spill candidates left\n"); break; } diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c index 1695553d793..ba43a614b90 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.c +++ b/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -843,6 +843,39 @@ nouveau_user_buffer_upload(struct nouveau_context *nv, return true; } +/* Invalidate underlying buffer storage, reset fences, reallocate to non-busy + * buffer. + */ +void +nouveau_buffer_invalidate(struct pipe_context *pipe, + struct pipe_resource *resource) +{ + struct nouveau_context *nv = nouveau_context(pipe); + struct nv04_resource *buf = nv04_resource(resource); + int ref = buf->base.reference.count - 1; + + /* Shared buffers shouldn't get reallocated */ + if (unlikely(buf->base.bind & PIPE_BIND_SHARED)) + return; + + /* We can't touch persistent/coherent buffers */ + if (buf->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT | + PIPE_RESOURCE_FLAG_MAP_COHERENT)) + return; + + /* If the buffer is sub-allocated and not currently being written, just + * wipe the valid buffer range. Otherwise we have to create fresh + * storage. (We don't keep track of fences for non-sub-allocated BO's.) + */ + if (buf->mm && !nouveau_buffer_busy(buf, PIPE_TRANSFER_WRITE)) { + util_range_set_empty(&buf->valid_buffer_range); + } else { + nouveau_buffer_reallocate(nv->screen, buf, buf->domain); + if (ref > 0) /* any references inside context possible ? */ + nv->invalidate_resource_storage(nv, &buf->base, ref); + } +} + /* Scratch data allocation. */ diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.h b/src/gallium/drivers/nouveau/nouveau_buffer.h index d45bf7aebcf..3a33fae9ce2 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.h +++ b/src/gallium/drivers/nouveau/nouveau_buffer.h @@ -99,6 +99,10 @@ bool nouveau_user_buffer_upload(struct nouveau_context *, struct nv04_resource *, unsigned base, unsigned size); +void +nouveau_buffer_invalidate(struct pipe_context *pipe, + struct pipe_resource *resource); + /* Copy data to a scratch buffer and return address & bo the data resides in. * Returns 0 on failure. */ diff --git a/src/gallium/drivers/nouveau/nv30/nv30_query.c b/src/gallium/drivers/nouveau/nv30/nv30_query.c index 75a4b0446fe..cb53a3663e5 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_query.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_query.c @@ -263,6 +263,11 @@ nv40_query_render_condition(struct pipe_context *pipe, PUSH_DATA (push, 0x02000000 | q->qo[1]->hw->start); } +static void +nv30_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void nv30_query_init(struct pipe_context *pipe) { @@ -273,6 +278,7 @@ nv30_query_init(struct pipe_context *pipe) pipe->begin_query = nv30_query_begin; pipe->end_query = nv30_query_end; pipe->get_query_result = nv30_query_result; + pipe->set_active_query_state = nv30_set_active_query_state; if (eng3d->oclass >= NV40_3D_CLASS) pipe->render_condition = nv40_query_render_condition; } diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index db7c2d15fb1..400e9f5c04d 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -193,6 +193,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_VENDOR_ID: @@ -324,6 +325,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR: case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR: case PIPE_SHADER_CAP_SUBROUTINES: + case PIPE_SHADER_CAP_INTEGERS: case PIPE_SHADER_CAP_DOUBLES: case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 61a52c4b366..5af0e9b3a27 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -93,6 +93,30 @@ nv50_memory_barrier(struct pipe_context *pipe, unsigned flags) } } +static void +nv50_emit_string_marker(struct pipe_context *pipe, const char *str, int len) +{ + struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf; + int string_words = len / 4; + int data_words; + + if (len <= 0) + return; + string_words = MIN2(string_words, NV04_PFIFO_MAX_PACKET_LEN); + if (string_words == NV04_PFIFO_MAX_PACKET_LEN) + data_words = string_words; + else + data_words = string_words + !!(len & 3); + BEGIN_NI04(push, SUBC_3D(NV04_GRAPH_NOP), data_words); + if (string_words) + PUSH_DATAp(push, str, string_words); + if (string_words != data_words) { + int data = 0; + memcpy(&data, &str[string_words * 4], len & 3); + PUSH_DATA (push, data); + } +} + void nv50_default_kick_notify(struct nouveau_pushbuf *push) { @@ -309,6 +333,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) pipe->texture_barrier = nv50_texture_barrier; pipe->memory_barrier = nv50_memory_barrier; pipe->get_sample_position = nv50_context_get_sample_position; + pipe->emit_string_marker = nv50_emit_string_marker; if (!screen->cur_ctx) { /* Restore the last context's state here, normally handled during diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c index 4cd3b615606..fa70fb6950e 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c @@ -143,6 +143,11 @@ nv50_render_condition(struct pipe_context *pipe, PUSH_DATA (push, hq->bo->offset + hq->offset); } +static void +nv50_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void nv50_init_query_functions(struct nv50_context *nv50) { @@ -153,6 +158,7 @@ nv50_init_query_functions(struct nv50_context *nv50) pipe->begin_query = nv50_begin_query; pipe->end_query = nv50_end_query; pipe->get_query_result = nv50_get_query_result; + pipe->set_active_query_state = nv50_set_active_query_state; pipe->render_condition = nv50_render_condition; nv50->cond_condmode = NV50_3D_COND_MODE_ALWAYS; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.c b/src/gallium/drivers/nouveau/nv50/nv50_resource.c index ad5f3b814db..b090a30aed6 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.c @@ -85,6 +85,13 @@ nv50_surface_destroy(struct pipe_context *pipe, struct pipe_surface *ps) } void +nv50_invalidate_resource(struct pipe_context *pipe, struct pipe_resource *res) +{ + if (res->target == PIPE_BUFFER) + nouveau_buffer_invalidate(pipe, res); +} + +void nv50_init_resource_functions(struct pipe_context *pcontext) { pcontext->transfer_map = u_transfer_map_vtbl; @@ -93,6 +100,7 @@ nv50_init_resource_functions(struct pipe_context *pcontext) pcontext->transfer_inline_write = u_transfer_inline_write_vtbl; pcontext->create_surface = nv50_surface_create; pcontext->surface_destroy = nv50_surface_destroy; + pcontext->invalidate_resource = nv50_invalidate_resource; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h index b40370a1d78..5d03925b0d0 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h @@ -152,6 +152,9 @@ void nv50_surface_destroy(struct pipe_context *, struct pipe_surface *); void +nv50_invalidate_resource(struct pipe_context *, struct pipe_resource *); + +void nv50_clear_texture(struct pipe_context *pipe, struct pipe_resource *res, unsigned level, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 20fb61b51f4..ef114e50529 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -193,6 +193,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_CLEAR_TEXTURE: case PIPE_CAP_COMPUTE: case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_STRING_MARKER: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -234,9 +236,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_PACK_HALF_FLOAT: case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: - case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: @@ -246,6 +246,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index 007cccfd10b..fcb8289beda 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -97,6 +97,30 @@ nvc0_memory_barrier(struct pipe_context *pipe, unsigned flags) } static void +nvc0_emit_string_marker(struct pipe_context *pipe, const char *str, int len) +{ + struct nouveau_pushbuf *push = nvc0_context(pipe)->base.pushbuf; + int string_words = len / 4; + int data_words; + + if (len <= 0) + return; + string_words = MIN2(string_words, NV04_PFIFO_MAX_PACKET_LEN); + if (string_words == NV04_PFIFO_MAX_PACKET_LEN) + data_words = string_words; + else + data_words = string_words + !!(len & 3); + BEGIN_NIC0(push, SUBC_3D(NV04_GRAPH_NOP), data_words); + if (string_words) + PUSH_DATAp(push, str, string_words); + if (string_words != data_words) { + int data = 0; + memcpy(&data, &str[string_words * 4], len & 3); + PUSH_DATA (push, data); + } +} + +static void nvc0_context_unreference_resources(struct nvc0_context *nvc0) { unsigned s, i; @@ -333,6 +357,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) pipe->texture_barrier = nvc0_texture_barrier; pipe->memory_barrier = nvc0_memory_barrier; pipe->get_sample_position = nvc0_context_get_sample_position; + pipe->emit_string_marker = nvc0_emit_string_marker; nouveau_context_init(&nvc0->base); nvc0_init_query_functions(nvc0); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index db02fa2df5c..d3024f9fa06 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -456,6 +456,13 @@ nvc0_fp_gen_header(struct nvc0_program *fp, struct nv50_ir_prog_info *info) fp->hdr[18] |= 0xf << info->out[i].slot[0]; } + /* There are no "regular" attachments, but the shader still needs to be + * executed. It seems like it wants to think that it has some color + * outputs in order to actually run. + */ + if (info->prop.fp.numColourResults == 0 && !info->prop.fp.writesDepth) + fp->hdr[18] |= 0xf; + fp->fp.early_z = info->prop.fp.earlyFragTests; return 0; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 92ca613cda1..b34271c4911 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -254,6 +254,11 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, return 0; } +static void +nvc0_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void nvc0_init_query_functions(struct nvc0_context *nvc0) { @@ -265,6 +270,7 @@ nvc0_init_query_functions(struct nvc0_context *nvc0) pipe->end_query = nvc0_end_query; pipe->get_query_result = nvc0_get_query_result; pipe->get_query_result_resource = nvc0_get_query_result_resource; + pipe->set_active_query_state = nvc0_set_active_query_state; pipe->render_condition = nvc0_render_condition; nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c index c034d0fd011..0aee5890fd8 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c @@ -52,6 +52,7 @@ nvc0_init_resource_functions(struct pipe_context *pcontext) pcontext->transfer_inline_write = u_transfer_inline_write_vtbl; pcontext->create_surface = nvc0_surface_create; pcontext->surface_destroy = nv50_surface_destroy; + pcontext->invalidate_resource = nv50_invalidate_resource; } void diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index c41912a6037..9a34007c6e5 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -52,6 +52,12 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen, if (!(0x117 & (1 << sample_count))) /* 0, 1, 2, 4 or 8 */ return false; + /* Short-circuit the rest of the logic -- this is used by the state tracker + * to determine valid MS levels in a no-attachments scenario. + */ + if (format == PIPE_FORMAT_NONE && bindings & PIPE_BIND_RENDER_TARGET) + return true; + if (!util_format_is_supported(format, bindings)) return false; @@ -216,6 +222,9 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_INVALIDATE_BUFFER: + case PIPE_CAP_STRING_MARKER: + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; @@ -241,9 +250,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: case PIPE_CAP_DEVICE_RESET_STATUS_QUERY: case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: - case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: - case PIPE_CAP_STRING_MARKER: case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: @@ -251,7 +258,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index 9c64482f2e2..d0d9315dd2b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -56,15 +56,18 @@ nvc0_validate_zcull(struct nvc0_context *nvc0) #endif static inline void -nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i) +nvc0_fb_set_null_rt(struct nouveau_pushbuf *push, unsigned i, unsigned layers) { - BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 6); - PUSH_DATA (push, 0); - PUSH_DATA (push, 0); - PUSH_DATA (push, 64); - PUSH_DATA (push, 0); + BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(i)), 9); PUSH_DATA (push, 0); PUSH_DATA (push, 0); + PUSH_DATA (push, 64); // width + PUSH_DATA (push, 0); // height + PUSH_DATA (push, 0); // format + PUSH_DATA (push, 0); // tile mode + PUSH_DATA (push, layers); // layers + PUSH_DATA (push, 0); // layer stride + PUSH_DATA (push, 0); // base layer } static void @@ -75,12 +78,11 @@ nvc0_validate_fb(struct nvc0_context *nvc0) struct nvc0_screen *screen = nvc0->screen; unsigned i, ms; unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1; + unsigned nr_cbufs = fb->nr_cbufs; bool serialize = false; nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_FB); - BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); - PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs); BEGIN_NVC0(push, NVC0_3D(SCREEN_SCISSOR_HORIZ), 2); PUSH_DATA (push, fb->width << 16); PUSH_DATA (push, fb->height << 16); @@ -91,7 +93,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) struct nouveau_bo *bo; if (!fb->cbufs[i]) { - nvc0_fb_set_null_rt(push, i); + nvc0_fb_set_null_rt(push, i, 0); continue; } @@ -179,6 +181,19 @@ nvc0_validate_fb(struct nvc0_context *nvc0) PUSH_DATA (push, 0); } + if (nr_cbufs == 0 && !fb->zsbuf) { + assert(util_is_power_of_two(fb->samples)); + assert(fb->samples <= 8); + + nvc0_fb_set_null_rt(push, 0, fb->layers); + + if (fb->samples > 1) + ms_mode = ffs(fb->samples) - 1; + nr_cbufs = 1; + } + + BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); + PUSH_DATA (push, (076543210 << 4) | nr_cbufs); IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), ms_mode); ms = 1 << ms_mode; @@ -592,8 +607,9 @@ nvc0_validate_derived_2(struct nvc0_context *nvc0) struct nouveau_pushbuf *push = nvc0->base.pushbuf; if (nvc0->zsa && nvc0->zsa->pipe.alpha.enabled && + nvc0->framebuffer.zsbuf && nvc0->framebuffer.nr_cbufs == 0) { - nvc0_fb_set_null_rt(push, 0); + nvc0_fb_set_null_rt(push, 0, 0); BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); PUSH_DATA (push, (076543210 << 4) | 1); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index e657204128e..e108590e215 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -1043,6 +1043,8 @@ nvc0_blitctx_pre_blit(struct nvc0_blitctx *ctx) ctx->saved.fb.width = nvc0->framebuffer.width; ctx->saved.fb.height = nvc0->framebuffer.height; + ctx->saved.fb.samples = nvc0->framebuffer.samples; + ctx->saved.fb.layers = nvc0->framebuffer.layers; ctx->saved.fb.nr_cbufs = nvc0->framebuffer.nr_cbufs; ctx->saved.fb.cbufs[0] = nvc0->framebuffer.cbufs[0]; ctx->saved.fb.zsbuf = nvc0->framebuffer.zsbuf; @@ -1110,6 +1112,8 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit) nvc0->framebuffer.width = blit->saved.fb.width; nvc0->framebuffer.height = blit->saved.fb.height; + nvc0->framebuffer.samples = blit->saved.fb.samples; + nvc0->framebuffer.layers = blit->saved.fb.layers; nvc0->framebuffer.nr_cbufs = blit->saved.fb.nr_cbufs; nvc0->framebuffer.cbufs[0] = blit->saved.fb.cbufs[0]; nvc0->framebuffer.zsbuf = blit->saved.fb.zsbuf; diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c index 6414e80828e..7603985b14b 100644 --- a/src/gallium/drivers/r300/r300_query.c +++ b/src/gallium/drivers/r300/r300_query.c @@ -200,6 +200,11 @@ static void r300_render_condition(struct pipe_context *pipe, } } +static void +r300_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void r300_init_query_functions(struct r300_context* r300) { r300->context.create_query = r300_create_query; @@ -207,5 +212,6 @@ void r300_init_query_functions(struct r300_context* r300) r300->context.begin_query = r300_begin_query; r300->context.end_query = r300_end_query; r300->context.get_query_result = r300_get_query_result; + r300->context.set_active_query_state = r300_set_active_query_state; r300->context.render_condition = r300_render_condition; } diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index b3a7f049e10..eae53e16a54 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -215,6 +215,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; /* SWTCL-only features. */ diff --git a/src/gallium/drivers/r300/r300_texture_desc.c b/src/gallium/drivers/r300/r300_texture_desc.c index 8fa98c5804e..2442d726cd1 100644 --- a/src/gallium/drivers/r300/r300_texture_desc.c +++ b/src/gallium/drivers/r300/r300_texture_desc.c @@ -25,6 +25,7 @@ #include "r300_context.h" #include "util/u_format.h" +#include <inttypes.h> /* Returns the number of pixels that the texture should be aligned to * in the given dimension. */ @@ -614,7 +615,7 @@ void r300_texture_desc_init(struct r300_screen *rscreen, "r300: I got a pre-allocated buffer to use it as a texture " "storage, but the buffer is too small. I'll use the buffer " "anyway, because I can't crash here, but it's dangerous. " - "This can be a DDX bug. Got: %iB, Need: %iB, Info:\n", + "This can be a DDX bug. Got: %"PRIu64"B, Need: %uB, Info:\n", tex->buf->size, tex->tex.size_in_bytes); r300_tex_print_info(tex, "texture_desc_init"); /* Ooops, what now. Apps will break if we fail this, diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 65952676987..2ad9e3eb1ab 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -472,6 +472,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx, r600_init_command_buffer(&rs->buffer, 30); + rs->scissor_enable = state->scissor; rs->flatshade = state->flatshade; rs->sprite_coord_enable = state->sprite_coord_enable; rs->two_side = state->light_twoside; @@ -528,7 +529,7 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx, r600_store_context_reg(&rs->buffer, R_0286D4_SPI_INTERP_CONTROL_0, spi_interp); r600_store_context_reg(&rs->buffer, R_028A48_PA_SC_MODE_CNTL_0, S_028A48_MSAA_ENABLE(state->multisample) | - S_028A48_VPORT_SCISSOR_ENABLE(state->scissor) | + S_028A48_VPORT_SCISSOR_ENABLE(1) | S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable)); if (rctx->b.chip_class == CAYMAN) { @@ -560,8 +561,11 @@ static void *evergreen_create_rs_state(struct pipe_context *ctx, static void *evergreen_create_sampler_state(struct pipe_context *ctx, const struct pipe_sampler_state *state) { + struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen; struct r600_pipe_sampler_state *ss = CALLOC_STRUCT(r600_pipe_sampler_state); - unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 2 : 0; + unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso + : state->max_anisotropy; + unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso); if (!ss) { return NULL; @@ -574,10 +578,10 @@ static void *evergreen_create_sampler_state(struct pipe_context *ctx, S_03C000_CLAMP_X(r600_tex_wrap(state->wrap_s)) | S_03C000_CLAMP_Y(r600_tex_wrap(state->wrap_t)) | S_03C000_CLAMP_Z(r600_tex_wrap(state->wrap_r)) | - S_03C000_XY_MAG_FILTER(r600_tex_filter(state->mag_img_filter) | aniso_flag_offset) | - S_03C000_XY_MIN_FILTER(r600_tex_filter(state->min_img_filter) | aniso_flag_offset) | + S_03C000_XY_MAG_FILTER(eg_tex_filter(state->mag_img_filter, max_aniso)) | + S_03C000_XY_MIN_FILTER(eg_tex_filter(state->min_img_filter, max_aniso)) | S_03C000_MIP_FILTER(r600_tex_mipfilter(state->min_mip_filter)) | - S_03C000_MAX_ANISO(r600_tex_aniso_filter(state->max_anisotropy)) | + S_03C000_MAX_ANISO_RATIO(max_aniso_ratio) | S_03C000_DEPTH_COMPARE_FUNCTION(r600_tex_compare(state->compare_func)) | S_03C000_BORDER_COLOR_TYPE(ss->border_color_use ? V_03C000_SQ_TEX_BORDER_COLOR_REGISTER : 0); /* R_03C004_SQ_TEX_SAMPLER_WORD1_0 */ @@ -849,10 +853,12 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->tex_resource_words[5] |= S_030014_LAST_LEVEL(log_samples); view->tex_resource_words[6] |= S_030018_FMASK_BANK_HEIGHT(fmask_bankh); } else { + bool no_mip = first_level == last_level; + view->tex_resource_words[4] |= S_030010_BASE_LEVEL(first_level); view->tex_resource_words[5] |= S_030014_LAST_LEVEL(last_level); /* aniso max 16 samples */ - view->tex_resource_words[6] |= S_030018_MAX_ANISO(4); + view->tex_resource_words[6] |= S_030018_MAX_ANISO_RATIO(no_mip ? 0 : 4); } view->tex_resource_words[7] = S_03001C_DATA_FORMAT(format) | @@ -919,60 +925,12 @@ static void evergreen_get_scissor_rect(struct r600_context *rctx, unsigned tl_x, unsigned tl_y, unsigned br_x, unsigned br_y, uint32_t *tl, uint32_t *br) { - /* EG hw workaround */ - if (br_x == 0) - tl_x = 1; - if (br_y == 0) - tl_y = 1; + struct pipe_scissor_state scissor = {tl_x, tl_y, br_x, br_y}; - /* cayman hw workaround */ - if (rctx->b.chip_class == CAYMAN) { - if (br_x == 1 && br_y == 1) - br_x = 2; - } + evergreen_apply_scissor_bug_workaround(&rctx->b, &scissor); - *tl = S_028240_TL_X(tl_x) | S_028240_TL_Y(tl_y); - *br = S_028244_BR_X(br_x) | S_028244_BR_Y(br_y); -} - -static void evergreen_set_scissor_states(struct pipe_context *ctx, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *state) -{ - struct r600_context *rctx = (struct r600_context *)ctx; - struct r600_scissor_state *rstate = &rctx->scissor; - int i; - - for (i = start_slot; i < start_slot + num_scissors; i++) - rstate->scissor[i] = state[i - start_slot]; - rstate->dirty_mask |= ((1 << num_scissors) - 1) << start_slot; - rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 4; - r600_mark_atom_dirty(rctx, &rstate->atom); -} - -static void evergreen_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom) -{ - struct radeon_winsys_cs *cs = rctx->b.gfx.cs; - struct r600_scissor_state *rstate = &rctx->scissor; - struct pipe_scissor_state *state; - uint32_t dirty_mask; - unsigned i, offset; - uint32_t tl, br; - - dirty_mask = rstate->dirty_mask; - while (dirty_mask != 0) { - i = u_bit_scan(&dirty_mask); - state = &rstate->scissor[i]; - evergreen_get_scissor_rect(rctx, state->minx, state->miny, state->maxx, state->maxy, &tl, &br); - - offset = i * 4 * 2; - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset, 2); - radeon_emit(cs, tl); - radeon_emit(cs, br); - } - rstate->dirty_mask = 0; - rstate->atom.num_dw = 0; + *tl = S_028240_TL_X(scissor.minx) | S_028240_TL_Y(scissor.miny); + *br = S_028244_BR_X(scissor.maxx) | S_028244_BR_Y(scissor.maxy); } /** @@ -1802,12 +1760,15 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_ S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE); - if (a->occlusion_query_enabled) { + if (rctx->b.num_occlusion_queries > 0 && + !a->occlusion_queries_disabled) { db_count_control |= S_028004_PERFECT_ZPASS_COUNTS(1); if (rctx->b.chip_class == CAYMAN) { db_count_control |= S_028004_SAMPLE_RATE(a->log_samples); } db_render_override |= S_02800C_NOOP_CULL_DISABLE(1); + } else { + db_count_control |= S_028004_ZPASS_INCREMENT_DISABLE(1); } /* This is to fix a lockup when hyperz and alpha test are enabled at @@ -2392,6 +2353,12 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* This enables pipeline stat & streamout queries. + * They are only disabled by blits. + */ + r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); + r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0)); + cayman_init_common_regs(cb, rctx->b.chip_class, rctx->b.family, rctx->screen->b.info.drm_minor); @@ -2474,12 +2441,6 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028230_PA_SC_EDGERULE, 0xAAAAAAAA); r600_store_context_reg(cb, R_028820_PA_CL_NANINF_CNTL, 0); - r600_store_context_reg_seq(cb, CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4); - r600_store_value(cb, fui(1.0)); /* CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */ - r600_store_value(cb, fui(1.0)); /* CM_R_028BEC_PA_CL_GB_VERT_DISC_ADJ */ - r600_store_value(cb, fui(1.0)); /* CM_R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */ - r600_store_value(cb, fui(1.0)); /* CM_R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ - r600_store_context_reg_seq(cb, R_028240_PA_SC_GENERIC_SCISSOR_TL, 2); r600_store_value(cb, 0); /* R_028240_PA_SC_GENERIC_SCISSOR_TL */ r600_store_value(cb, S_028244_BR_X(16384) | S_028244_BR_Y(16384)); /* R_028244_PA_SC_GENERIC_SCISSOR_BR */ @@ -2645,6 +2606,12 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* This enables pipeline stat & streamout queries. + * They are only disabled by blits. + */ + r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); + r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0)); + evergreen_init_common_regs(rctx, cb, rctx->b.chip_class, rctx->b.family, rctx->screen->b.info.drm_minor); @@ -2889,12 +2856,6 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) r600_store_value(cb, 0); /* R_028AC4_DB_SRESULTS_COMPARE_STATE1 */ r600_store_value(cb, 0); /* R_028AC8_DB_PRELOAD_CONTROL */ - r600_store_context_reg_seq(cb, R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4); - r600_store_value(cb, fui(1.0)); /* R_028C0C_PA_CL_GB_VERT_CLIP_ADJ */ - r600_store_value(cb, fui(1.0)); /* R_028C10_PA_CL_GB_VERT_DISC_ADJ */ - r600_store_value(cb, fui(1.0)); /* R_028C14_PA_CL_GB_HORZ_CLIP_ADJ */ - r600_store_value(cb, fui(1.0)); /* R_028C18_PA_CL_GB_HORZ_DISC_ADJ */ - r600_store_context_reg_seq(cb, R_028240_PA_SC_GENERIC_SCISSOR_TL, 2); r600_store_value(cb, 0); /* R_028240_PA_SC_GENERIC_SCISSOR_TL */ r600_store_value(cb, S_028244_BR_X(16384) | S_028244_BR_Y(16384)); /* R_028244_PA_SC_GENERIC_SCISSOR_BR */ @@ -3696,8 +3657,8 @@ void evergreen_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->dsa_state.atom, id++, r600_emit_cso_state, 0); r600_init_atom(rctx, &rctx->poly_offset_state.atom, id++, evergreen_emit_polygon_offset, 6); r600_init_atom(rctx, &rctx->rasterizer_state.atom, id++, r600_emit_cso_state, 0); - r600_init_atom(rctx, &rctx->scissor.atom, id++, evergreen_emit_scissor_state, 0); - r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0); + r600_add_atom(rctx, &rctx->b.scissors.atom, id++); + r600_add_atom(rctx, &rctx->b.viewports.atom, id++); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5); r600_add_atom(rctx, &rctx->b.render_cond_atom, id++); @@ -3716,7 +3677,6 @@ void evergreen_init_state_functions(struct r600_context *rctx) rctx->b.b.set_framebuffer_state = evergreen_set_framebuffer_state; rctx->b.b.set_polygon_stipple = evergreen_set_polygon_stipple; rctx->b.b.set_min_samples = evergreen_set_min_samples; - rctx->b.b.set_scissor_states = evergreen_set_scissor_states; rctx->b.b.set_tess_state = evergreen_set_tess_state; if (rctx->b.chip_class == EVERGREEN) rctx->b.b.get_sample_position = evergreen_get_sample_position; diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index ebe8c4a65ba..ece421e3d33 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -1202,11 +1202,11 @@ #define G_030014_LAST_ARRAY(x) (((x) >> 17) & 0x1FFF) #define C_030014_LAST_ARRAY 0xC001FFFF #define R_030018_SQ_TEX_RESOURCE_WORD6_0 0x030018 -/* FMASK_BANK_HEIGHT and MAX_ANISO share the first two bits. +/* FMASK_BANK_HEIGHT and MAX_ANISO_RATIO share the first two bits. * The former is only used with MSAA textures. */ -#define S_030018_MAX_ANISO(x) (((x) & 0x7) << 0) -#define G_030018_MAX_ANISO(x) (((x) >> 0) & 0x7) -#define C_030018_MAX_ANISO 0xFFFFFFF8 +#define S_030018_MAX_ANISO_RATIO(x) (((x) & 0x7) << 0) +#define G_030018_MAX_ANISO_RATIO(x) (((x) >> 0) & 0x7) +#define C_030018_MAX_ANISO_RATIO 0xFFFFFFF8 #define S_030018_FMASK_BANK_HEIGHT(x) (((x) & 0x3) << 0) #define S_030018_PERF_MODULATION(x) (((x) & 0x7) << 3) #define G_030018_PERF_MODULATION(x) (((x) >> 3) & 0x7) @@ -1344,9 +1344,9 @@ #define S_03C000_MIP_FILTER(x) (((x) & 0x3) << 15) #define G_03C000_MIP_FILTER(x) (((x) >> 15) & 0x3) #define C_03C000_MIP_FILTER 0xFFFE7FFF -#define S_03C000_MAX_ANISO(x) (((x) & 0x7) << 17) -#define G_03C000_MAX_ANISO(x) (((x) >> 17) & 0x7) -#define C_03C000_MAX_ANISO 0xFFF1FFFF +#define S_03C000_MAX_ANISO_RATIO(x) (((x) & 0x7) << 17) +#define G_03C000_MAX_ANISO_RATIO(x) (((x) >> 17) & 0x7) +#define C_03C000_MAX_ANISO_RATIO 0xFFF1FFFF #define S_03C000_BORDER_COLOR_TYPE(x) (((x) & 0x3) << 20) #define G_03C000_BORDER_COLOR_TYPE(x) (((x) >> 20) & 0x3) #define C_03C000_BORDER_COLOR_TYPE 0xFFCFFFFF @@ -1735,7 +1735,7 @@ #define S_028000_COPY_SAMPLE(x) (((x) & 0x7) << 8) #define S_028000_COLOR_DISABLE(x) (((x) & 0x1) << 12) #define R_028004_DB_COUNT_CONTROL 0x00028004 -#define S_028004_ZPASS_INCREMENT_DISABLE (((x) & 0x1) << 0) +#define S_028004_ZPASS_INCREMENT_DISABLE(x) (((x) & 0x1) << 0) #define S_028004_PERFECT_ZPASS_COUNTS(x) (((x) & 0x1) << 1) #define S_028004_SAMPLE_RATE(x) (((x) & 0x7) << 4) /* cayman only */ #define R_028008_DB_DEPTH_VIEW 0x00028008 diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index c52d5a9bad0..1a4cc425394 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -54,8 +54,6 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op { struct r600_context *rctx = (struct r600_context *)ctx; - r600_suspend_nontimer_queries(&rctx->b); - util_blitter_save_vertex_buffer_slot(rctx->blitter, rctx->vertex_buffer_state.vb); util_blitter_save_vertex_elements(rctx->blitter, rctx->vertex_fetch_shader.cso); util_blitter_save_vertex_shader(rctx->blitter, rctx->vs_shader); @@ -67,8 +65,8 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op util_blitter_save_rasterizer(rctx->blitter, rctx->rasterizer_state.cso); if (op & R600_SAVE_FRAGMENT_STATE) { - util_blitter_save_viewport(rctx->blitter, &rctx->viewport.state[0]); - util_blitter_save_scissor(rctx->blitter, &rctx->scissor.scissor[0]); + util_blitter_save_viewport(rctx->blitter, &rctx->b.viewports.states[0]); + util_blitter_save_scissor(rctx->blitter, &rctx->b.scissors.states[0]); util_blitter_save_fragment_shader(rctx->blitter, rctx->ps_shader); util_blitter_save_blend(rctx->blitter, rctx->blend_state.cso); util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->dsa_state.cso); @@ -98,7 +96,6 @@ static void r600_blitter_end(struct pipe_context *ctx) struct r600_context *rctx = (struct r600_context *)ctx; rctx->b.render_cond_force_off = false; - r600_resume_nontimer_queries(&rctx->b); } static unsigned u_max_sample(struct pipe_resource *r) @@ -584,7 +581,7 @@ static void r600_copy_global_buffer(struct pipe_context *ctx, } static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, + uint64_t offset, uint64_t size, unsigned value, bool is_framebuffer) { struct r600_context *rctx = (struct r600_context*)ctx; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 7a6f957945b..2bc6d3ffce4 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -64,9 +64,8 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS; } - /* Count in queries_suspend. */ - num_dw += ctx->b.num_cs_dw_nontimer_queries_suspend + - ctx->b.num_cs_dw_timer_queries_suspend; + /* Count in r600_suspend_queries. */ + num_dw += ctx->b.num_cs_dw_queries_suspend; /* Count in streamout_end at the end of CS. */ if (ctx->b.streamout.begin_emitted) { @@ -223,6 +222,16 @@ void r600_flush_emit(struct r600_context *rctx) cs->buf[cs->cdw++] = 0x0000000A; /* POLL_INTERVAL */ } + if (rctx->b.flags & R600_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | + EVENT_INDEX(0)); + } else if (rctx->b.flags & R600_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) | + EVENT_INDEX(0)); + } + if (wait_until) { /* Use of WAIT_UNTIL is deprecated on Cayman+ */ if (rctx->b.family < CHIP_CAYMAN) { @@ -295,12 +304,10 @@ void r600_begin_new_cs(struct r600_context *ctx) r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom); r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom); r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom); - ctx->scissor.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; - ctx->scissor.atom.num_dw = R600_MAX_VIEWPORTS * 4; - r600_mark_atom_dirty(ctx, &ctx->scissor.atom); - ctx->viewport.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; - ctx->viewport.atom.num_dw = R600_MAX_VIEWPORTS * 8; - r600_mark_atom_dirty(ctx, &ctx->viewport.atom); + ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + r600_mark_atom_dirty(ctx, &ctx->b.scissors.atom); + ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + r600_mark_atom_dirty(ctx, &ctx->b.viewports.atom); if (ctx->b.chip_class <= EVERGREEN) { r600_mark_atom_dirty(ctx, &ctx->config_state.atom); } diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 36b808fbbca..c594f5cb18b 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -365,6 +365,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_STRING_MARKER: case PIPE_CAP_QUERY_BUFFER_OBJECT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index cd0052a519f..6c2a48ca412 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -38,8 +38,6 @@ #define R600_NUM_ATOMS 52 -#define R600_MAX_VIEWPORTS 16 - /* read caches */ #define R600_CONTEXT_INV_VERTEX_CACHE (R600_CONTEXT_PRIVATE_FLAG << 0) #define R600_CONTEXT_INV_TEX_CACHE (R600_CONTEXT_PRIVATE_FLAG << 1) @@ -56,7 +54,7 @@ #define R600_CONTEXT_WAIT_CP_DMA_IDLE (R600_CONTEXT_PRIVATE_FLAG << 10) /* the number of CS dwords for flushing and drawing */ -#define R600_MAX_FLUSH_CS_DWORDS 16 +#define R600_MAX_FLUSH_CS_DWORDS 18 #define R600_MAX_DRAW_CS_DWORDS 58 #define R600_MAX_USER_CONST_BUFFERS 13 @@ -120,7 +118,7 @@ struct r600_db_state { struct r600_db_misc_state { struct r600_atom atom; - bool occlusion_query_enabled; + bool occlusion_queries_disabled; bool flush_depthstencil_through_cb; bool flush_depth_inplace; bool flush_stencil_inplace; @@ -221,12 +219,6 @@ struct r600_stencil_ref_state { struct pipe_stencil_ref pipe_state; }; -struct r600_viewport_state { - struct r600_atom atom; - struct pipe_viewport_state state[R600_MAX_VIEWPORTS]; - uint32_t dirty_mask; -}; - struct r600_shader_stages_state { struct r600_atom atom; unsigned geom_enable; @@ -412,14 +404,6 @@ struct r600_cso_state struct r600_command_buffer *cb; }; -struct r600_scissor_state -{ - struct r600_atom atom; - struct pipe_scissor_state scissor[R600_MAX_VIEWPORTS]; - uint32_t dirty_mask; - bool enable; /* r6xx only */ -}; - struct r600_fetch_shader { struct r600_resource *buffer; unsigned offset; @@ -480,12 +464,10 @@ struct r600_context { struct r600_poly_offset_state poly_offset_state; struct r600_cso_state rasterizer_state; struct r600_sample_mask sample_mask; - struct r600_scissor_state scissor; struct r600_seamless_cube_map seamless_cube_map; struct r600_config_state config_state; struct r600_stencil_ref_state stencil_ref; struct r600_vgt_state vgt_state; - struct r600_viewport_state viewport; /* Shaders and shader resources. */ struct r600_cso_state vertex_fetch_shader; struct r600_shader_state hw_shader_stages[EG_NUM_HW_STAGES]; @@ -730,7 +712,6 @@ void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom); -void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom); void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a); void r600_add_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id); void r600_init_atom(struct r600_context *rctx, struct r600_atom *atom, unsigned id, @@ -746,7 +727,6 @@ void r600_set_sample_locations_constant_buffer(struct r600_context *rctx); uint32_t r600_translate_stencil_op(int s_op); uint32_t r600_translate_fill(uint32_t func); unsigned r600_tex_wrap(unsigned wrap); -unsigned r600_tex_filter(unsigned filter); unsigned r600_tex_mipfilter(unsigned filter); unsigned r600_tex_compare(unsigned compare); bool sampler_state_needs_border_color(const struct pipe_sampler_state *state); diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 3189a1360b1..91e747fa937 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -457,6 +457,7 @@ static void *r600_create_rs_state(struct pipe_context *ctx, r600_init_command_buffer(&rs->buffer, 30); + rs->scissor_enable = state->scissor; rs->flatshade = state->flatshade; rs->sprite_coord_enable = state->sprite_coord_enable; rs->two_side = state->light_twoside; @@ -501,10 +502,9 @@ static void *r600_create_rs_state(struct pipe_context *ctx, if (rctx->b.chip_class >= R700) { sc_mode_cntl |= S_028A4C_FORCE_EOV_REZ_ENABLE(1) | S_028A4C_R700_ZMM_LINE_OFFSET(1) | - S_028A4C_R700_VPORT_SCISSOR_ENABLE(state->scissor); + S_028A4C_R700_VPORT_SCISSOR_ENABLE(1); } else { sc_mode_cntl |= S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1); - rs->scissor_enable = state->scissor; } spi_interp = S_0286D4_FLAT_SHADE_ENA(1); @@ -558,11 +558,24 @@ static void *r600_create_rs_state(struct pipe_context *ctx, return rs; } +static unsigned r600_tex_filter(unsigned filter, unsigned max_aniso) +{ + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? V_03C000_SQ_TEX_XY_FILTER_ANISO_BILINEAR + : V_03C000_SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? V_03C000_SQ_TEX_XY_FILTER_ANISO_POINT + : V_03C000_SQ_TEX_XY_FILTER_POINT; +} + static void *r600_create_sampler_state(struct pipe_context *ctx, const struct pipe_sampler_state *state) { + struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen; struct r600_pipe_sampler_state *ss = CALLOC_STRUCT(r600_pipe_sampler_state); - unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 4 : 0; + unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso + : state->max_anisotropy; + unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso); if (!ss) { return NULL; @@ -576,10 +589,10 @@ static void *r600_create_sampler_state(struct pipe_context *ctx, S_03C000_CLAMP_X(r600_tex_wrap(state->wrap_s)) | S_03C000_CLAMP_Y(r600_tex_wrap(state->wrap_t)) | S_03C000_CLAMP_Z(r600_tex_wrap(state->wrap_r)) | - S_03C000_XY_MAG_FILTER(r600_tex_filter(state->mag_img_filter) | aniso_flag_offset) | - S_03C000_XY_MIN_FILTER(r600_tex_filter(state->min_img_filter) | aniso_flag_offset) | + S_03C000_XY_MAG_FILTER(r600_tex_filter(state->mag_img_filter, max_aniso)) | + S_03C000_XY_MIN_FILTER(r600_tex_filter(state->min_img_filter, max_aniso)) | S_03C000_MIP_FILTER(r600_tex_mipfilter(state->min_mip_filter)) | - S_03C000_MAX_ANISO(r600_tex_aniso_filter(state->max_anisotropy)) | + S_03C000_MAX_ANISO_RATIO(max_aniso_ratio) | S_03C000_DEPTH_COMPARE_FUNCTION(r600_tex_compare(state->compare_func)) | S_03C000_BORDER_COLOR_TYPE(ss->border_color_use ? V_03C000_SQ_TEX_BORDER_COLOR_REGISTER : 0); /* R_03C004_SQ_TEX_SAMPLER_WORD1_0 */ @@ -777,61 +790,6 @@ static void r600_set_polygon_stipple(struct pipe_context *ctx, { } -static void r600_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom) -{ - struct radeon_winsys_cs *cs = rctx->b.gfx.cs; - struct r600_scissor_state *rstate = &rctx->scissor; - struct pipe_scissor_state *state; - bool do_disable_workaround = false; - uint32_t dirty_mask; - unsigned i, offset; - uint32_t tl, br; - - if (rctx->b.chip_class == R600 && !rctx->scissor.enable) { - tl = S_028240_TL_X(0) | S_028240_TL_Y(0) | S_028240_WINDOW_OFFSET_DISABLE(1); - br = S_028244_BR_X(8192) | S_028244_BR_Y(8192); - do_disable_workaround = true; - } - - dirty_mask = rstate->dirty_mask; - while (dirty_mask != 0) - { - i = u_bit_scan(&dirty_mask); - offset = i * 4 * 2; - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + offset, 2); - if (!do_disable_workaround) { - state = &rstate->scissor[i]; - tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny) | - S_028240_WINDOW_OFFSET_DISABLE(1); - br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy); - } - radeon_emit(cs, tl); - radeon_emit(cs, br); - } - rstate->dirty_mask = 0; - rstate->atom.num_dw = 0; -} - -static void r600_set_scissor_states(struct pipe_context *ctx, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *state) -{ - struct r600_context *rctx = (struct r600_context *)ctx; - struct r600_scissor_state *rstate = &rctx->scissor; - int i; - - for (i = start_slot ; i < start_slot + num_scissors; i++) - rstate->scissor[i] = state[i - start_slot]; - rstate->dirty_mask |= ((1 << num_scissors) - 1) << start_slot; - rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 4; - - if (rctx->b.chip_class == R600 && !rstate->enable) - return; - - r600_mark_atom_dirty(rctx, &rstate->atom); -} - static struct r600_resource *r600_buffer_create_helper(struct r600_screen *rscreen, unsigned size, unsigned alignment) { @@ -1644,12 +1602,16 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom } } - if (a->occlusion_query_enabled) { + if (rctx->b.num_occlusion_queries > 0 && + !a->occlusion_queries_disabled) { if (rctx->b.chip_class >= R700) { db_render_control |= S_028D0C_R700_PERFECT_ZPASS_COUNTS(1); } db_render_override |= S_028D10_NOOP_CULL_DISABLE(1); + } else { + db_render_control |= S_028D0C_ZPASS_INCREMENT_DISABLE(1); } + if (rctx->db_state.rsurf && rctx->db_state.rsurf->db_htile_surface) { /* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */ db_render_override |= S_028D10_FORCE_HIZ_ENABLE(V_028D10_FORCE_OFF); @@ -2173,6 +2135,12 @@ void r600_init_atom_start_cs(struct r600_context *rctx) r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4)); + /* This enables pipeline stat & streamout queries. + * They are only disabled by blits. + */ + r600_store_value(cb, PKT3(PKT3_EVENT_WRITE, 0, 0)); + r600_store_value(cb, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) | EVENT_INDEX(0)); + family = rctx->b.family; ps_prio = 0; vs_prio = 1; @@ -2424,12 +2392,6 @@ void r600_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028820_PA_CL_NANINF_CNTL, 0); r600_store_context_reg(cb, R_028A48_PA_SC_MPASS_PS_CNTL, 0); - r600_store_context_reg_seq(cb, R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4); - r600_store_value(cb, fui(1.0)); /* R_028C0C_PA_CL_GB_VERT_CLIP_ADJ */ - r600_store_value(cb, fui(1.0)); /* R_028C10_PA_CL_GB_VERT_DISC_ADJ */ - r600_store_value(cb, fui(1.0)); /* R_028C14_PA_CL_GB_HORZ_CLIP_ADJ */ - r600_store_value(cb, fui(1.0)); /* R_028C18_PA_CL_GB_HORZ_DISC_ADJ */ - r600_store_context_reg_seq(cb, R_0282D0_PA_SC_VPORT_ZMIN_0, 2 * R600_MAX_VIEWPORTS); for (tmp = 0; tmp < R600_MAX_VIEWPORTS; tmp++) { r600_store_value(cb, 0); /* R_0282D0_PA_SC_VPORT_ZMIN_0 */ @@ -3132,8 +3094,8 @@ void r600_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->dsa_state.atom, id++, r600_emit_cso_state, 0); r600_init_atom(rctx, &rctx->poly_offset_state.atom, id++, r600_emit_polygon_offset, 6); r600_init_atom(rctx, &rctx->rasterizer_state.atom, id++, r600_emit_cso_state, 0); - r600_init_atom(rctx, &rctx->scissor.atom, id++, r600_emit_scissor_state, 0); - r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0); + r600_add_atom(rctx, &rctx->b.scissors.atom, id++); + r600_add_atom(rctx, &rctx->b.viewports.atom, id++); r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5); @@ -3153,7 +3115,6 @@ void r600_init_state_functions(struct r600_context *rctx) rctx->b.b.set_framebuffer_state = r600_set_framebuffer_state; rctx->b.b.set_polygon_stipple = r600_set_polygon_stipple; rctx->b.b.set_min_samples = r600_set_min_samples; - rctx->b.b.set_scissor_states = r600_set_scissor_states; rctx->b.b.get_sample_position = r600_get_sample_position; rctx->b.dma_copy = r600_dma_copy; } diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index df41d3f028d..cb40c20a7dd 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -364,14 +364,7 @@ static void r600_bind_rs_state(struct pipe_context *ctx, void *state) r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } - /* Workaround for a missing scissor enable on r600. */ - if (rctx->b.chip_class == R600 && - rs->scissor_enable != rctx->scissor.enable) { - rctx->scissor.enable = rs->scissor_enable; - rctx->scissor.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; - rctx->scissor.atom.num_dw = R600_MAX_VIEWPORTS * 4; - r600_mark_atom_dirty(rctx, &rctx->scissor.atom); - } + r600_set_scissor_enable(&rctx->b, rs->scissor_enable); /* Re-emit PA_SC_LINE_STIPPLE. */ rctx->last_primitive_type = -1; @@ -713,47 +706,6 @@ static void r600_update_compressed_colortex_mask(struct r600_samplerview_state * } } -static void r600_set_viewport_states(struct pipe_context *ctx, - unsigned start_slot, - unsigned num_viewports, - const struct pipe_viewport_state *state) -{ - struct r600_context *rctx = (struct r600_context *)ctx; - struct r600_viewport_state *rstate = &rctx->viewport; - int i; - - for (i = start_slot; i < start_slot + num_viewports; i++) - rstate->state[i] = state[i - start_slot]; - rstate->dirty_mask |= ((1 << num_viewports) - 1) << start_slot; - rstate->atom.num_dw = util_bitcount(rstate->dirty_mask) * 8; - r600_mark_atom_dirty(rctx, &rctx->viewport.atom); -} - -void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom) -{ - struct radeon_winsys_cs *cs = rctx->b.gfx.cs; - struct r600_viewport_state *rstate = &rctx->viewport; - struct pipe_viewport_state *state; - uint32_t dirty_mask; - unsigned i, offset; - - dirty_mask = rstate->dirty_mask; - while (dirty_mask != 0) { - i = u_bit_scan(&dirty_mask); - offset = i * 6 * 4; - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE_0 + offset, 6); - state = &rstate->state[i]; - radeon_emit(cs, fui(state->scale[0])); /* R_02843C_PA_CL_VPORT_XSCALE_0 */ - radeon_emit(cs, fui(state->translate[0])); /* R_028440_PA_CL_VPORT_XOFFSET_0 */ - radeon_emit(cs, fui(state->scale[1])); /* R_028444_PA_CL_VPORT_YSCALE_0 */ - radeon_emit(cs, fui(state->translate[1])); /* R_028448_PA_CL_VPORT_YOFFSET_0 */ - radeon_emit(cs, fui(state->scale[2])); /* R_02844C_PA_CL_VPORT_ZSCALE_0 */ - radeon_emit(cs, fui(state->translate[2])); /* R_028450_PA_CL_VPORT_ZOFFSET_0 */ - } - rstate->dirty_mask = 0; - rstate->atom.num_dw = 0; -} - /* Compute the key for the hw shader variant */ static inline union r600_shader_key r600_shader_selector_key(struct pipe_context * ctx, struct r600_pipe_shader_selector * sel) @@ -961,6 +913,18 @@ static void r600_bind_ps_state(struct pipe_context *ctx, void *state) rctx->ps_shader = (struct r600_pipe_shader_selector *)state; } +static struct tgsi_shader_info *r600_get_vs_info(struct r600_context *rctx) +{ + if (rctx->gs_shader) + return &rctx->gs_shader->info; + else if (rctx->tes_shader) + return &rctx->tes_shader->info; + else if (rctx->vs_shader) + return &rctx->vs_shader->info; + else + return NULL; +} + static void r600_bind_vs_state(struct pipe_context *ctx, void *state) { struct r600_context *rctx = (struct r600_context *)ctx; @@ -969,6 +933,7 @@ static void r600_bind_vs_state(struct pipe_context *ctx, void *state) return; rctx->vs_shader = (struct r600_pipe_shader_selector *)state; + r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx)); rctx->b.streamout.stride_in_dw = rctx->vs_shader->so.stride; } @@ -977,6 +942,7 @@ static void r600_bind_gs_state(struct pipe_context *ctx, void *state) struct r600_context *rctx = (struct r600_context *)ctx; rctx->gs_shader = (struct r600_pipe_shader_selector *)state; + r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx)); if (!state) return; @@ -995,6 +961,7 @@ static void r600_bind_tes_state(struct pipe_context *ctx, void *state) struct r600_context *rctx = (struct r600_context *)ctx; rctx->tes_shader = (struct r600_pipe_shader_selector *)state; + r600_update_vs_writes_viewport_index(&rctx->b, r600_get_vs_info(rctx)); if (!state) return; @@ -1841,8 +1808,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info ia_switch_on_eop = true; } - if (rctx->b.streamout.streamout_enabled || - rctx->b.streamout.prims_gen_query_enabled) + if (r600_get_strmout_en(&rctx->b)) partial_vs_wave = true; radeon_set_context_reg(cs, CM_R_028AA8_IA_MULTI_VGT_PARAM, @@ -2018,7 +1984,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->b.family == CHIP_RV635) { /* if we have gs shader or streamout we need to do a wait idle after every draw */ - if (rctx->gs_shader || rctx->b.streamout.streamout_enabled) { + if (rctx->gs_shader || r600_get_strmout_en(&rctx->b)) { radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, S_008040_WAIT_3D_IDLE(1)); } } @@ -2123,17 +2089,6 @@ unsigned r600_tex_wrap(unsigned wrap) } } -unsigned r600_tex_filter(unsigned filter) -{ - switch (filter) { - default: - case PIPE_TEX_FILTER_NEAREST: - return V_03C000_SQ_TEX_XY_FILTER_POINT; - case PIPE_TEX_FILTER_LINEAR: - return V_03C000_SQ_TEX_XY_FILTER_BILINEAR; - } -} - unsigned r600_tex_mipfilter(unsigned filter) { switch (filter) { @@ -2861,16 +2816,33 @@ static void r600_invalidate_buffer(struct pipe_context *ctx, struct pipe_resourc } } -static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable) +static void r600_set_active_query_state(struct pipe_context *ctx, boolean enable) { struct r600_context *rctx = (struct r600_context*)ctx; - if (rctx->db_misc_state.occlusion_query_enabled != enable) { - rctx->db_misc_state.occlusion_query_enabled = enable; + /* Pipeline stat & streamout queries. */ + if (enable) { + rctx->b.flags &= ~R600_CONTEXT_STOP_PIPELINE_STATS; + rctx->b.flags |= R600_CONTEXT_START_PIPELINE_STATS; + } else { + rctx->b.flags &= ~R600_CONTEXT_START_PIPELINE_STATS; + rctx->b.flags |= R600_CONTEXT_STOP_PIPELINE_STATS; + } + + /* Occlusion queries. */ + if (rctx->db_misc_state.occlusion_queries_disabled != !enable) { + rctx->db_misc_state.occlusion_queries_disabled = !enable; r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } +static void r600_set_occlusion_query_state(struct pipe_context *ctx, bool enable) +{ + struct r600_context *rctx = (struct r600_context*)ctx; + + r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); +} + static void r600_need_gfx_cs_space(struct pipe_context *ctx, unsigned num_dw, bool include_draw_vbo) { @@ -2911,13 +2883,13 @@ void r600_init_common_state_functions(struct r600_context *rctx) rctx->b.b.set_constant_buffer = r600_set_constant_buffer; rctx->b.b.set_sample_mask = r600_set_sample_mask; rctx->b.b.set_stencil_ref = r600_set_pipe_stencil_ref; - rctx->b.b.set_viewport_states = r600_set_viewport_states; rctx->b.b.set_vertex_buffers = r600_set_vertex_buffers; rctx->b.b.set_index_buffer = r600_set_index_buffer; rctx->b.b.set_sampler_views = r600_set_sampler_views; rctx->b.b.sampler_view_destroy = r600_sampler_view_destroy; rctx->b.b.texture_barrier = r600_texture_barrier; rctx->b.b.set_stream_output_targets = r600_set_streamout_targets; + rctx->b.b.set_active_query_state = r600_set_active_query_state; rctx->b.b.draw_vbo = r600_draw_vbo; rctx->b.invalidate_buffer = r600_invalidate_buffer; rctx->b.set_occlusion_query_state = r600_set_occlusion_query_state; diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 3d223edb5f4..ecabb340a9c 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -780,7 +780,8 @@ #define S_028D0C_STENCIL_COMPRESS_DISABLE(x) (((x) & 0x1) << 5) #define S_028D0C_DEPTH_COMPRESS_DISABLE(x) (((x) & 0x1) << 6) #define S_028D0C_COPY_CENTROID(x) (((x) & 0x1) << 7) -#define S_028D0C_COPY_SAMPLE(x) (((x) & 0x1) << 8) +#define S_028D0C_COPY_SAMPLE(x) (((x) & 0x03) << 8) +#define S_028D0C_ZPASS_INCREMENT_DISABLE(x) (((x) & 0x1) << 11) #define S_028D0C_R700_PERFECT_ZPASS_COUNTS(x) (((x) & 0x1) << 15) #define S_028D0C_CONSERVATIVE_Z_EXPORT(x) (((x) & 0x03) << 13) #define G_028D0C_CONSERVATIVE_Z_EXPORT(x) (((x) >> 13) & 0x03) @@ -1266,6 +1267,8 @@ #define V_03C000_SQ_TEX_XY_FILTER_POINT 0x00000000 #define V_03C000_SQ_TEX_XY_FILTER_BILINEAR 0x00000001 #define V_03C000_SQ_TEX_XY_FILTER_BICUBIC 0x00000002 +#define V_03C000_SQ_TEX_XY_FILTER_ANISO_POINT 0x00000004 +#define V_03C000_SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x00000005 #define S_03C000_XY_MIN_FILTER(x) (((x) & 0x7) << 12) #define G_03C000_XY_MIN_FILTER(x) (((x) >> 12) & 0x7) #define C_03C000_XY_MIN_FILTER 0xFFFF8FFF @@ -1278,9 +1281,9 @@ #define S_03C000_MIP_FILTER(x) (((x) & 0x3) << 17) #define G_03C000_MIP_FILTER(x) (((x) >> 17) & 0x3) #define C_03C000_MIP_FILTER 0xFFF9FFFF -#define S_03C000_MAX_ANISO(x) (((x) & 0x7) << 19) -#define G_03C000_MAX_ANISO(x) (((x) >> 19) & 0x7) -#define C_03C000_MAX_ANISO 0xFFB7FFFF +#define S_03C000_MAX_ANISO_RATIO(x) (((x) & 0x7) << 19) +#define G_03C000_MAX_ANISO_RATIO(x) (((x) >> 19) & 0x7) +#define C_03C000_MAX_ANISO_RATIO 0xFFB7FFFF #define S_03C000_BORDER_COLOR_TYPE(x) (((x) & 0x3) << 22) #define G_03C000_BORDER_COLOR_TYPE(x) (((x) >> 22) & 0x3) #define C_03C000_BORDER_COLOR_TYPE 0xFF3FFFFF diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources index eb171f7da5f..f993d75a6ad 100644 --- a/src/gallium/drivers/radeon/Makefile.sources +++ b/src/gallium/drivers/radeon/Makefile.sources @@ -11,6 +11,7 @@ C_SOURCES := \ r600_query.h \ r600_streamout.c \ r600_texture.c \ + r600_viewport.c \ radeon_uvd.c \ radeon_uvd.h \ radeon_vce_40_2_2.c \ diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 33ba0fbca9b..47514e91d23 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -102,7 +102,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, bool r600_init_resource(struct r600_common_screen *rscreen, struct r600_resource *res, - unsigned size, unsigned alignment, + uint64_t size, unsigned alignment, bool use_reusable_pool) { struct r600_texture *rtex = (struct r600_texture*)res; @@ -160,9 +160,18 @@ bool r600_init_resource(struct r600_common_screen *rscreen, rtex->surface.level[0].mode >= RADEON_SURF_MODE_1D) { res->domains = RADEON_DOMAIN_VRAM; flags &= ~RADEON_FLAG_CPU_ACCESS; - flags |= RADEON_FLAG_NO_CPU_ACCESS; + flags |= RADEON_FLAG_NO_CPU_ACCESS | + RADEON_FLAG_GTT_WC; } + /* If VRAM is just stolen system memory, allow both VRAM and GTT, + * whichever has free space. If a buffer is evicted from VRAM to GTT, + * it will stay there. + */ + if (!rscreen->info.has_dedicated_vram && + res->domains == RADEON_DOMAIN_VRAM) + res->domains = RADEON_DOMAIN_VRAM_GTT; + if (rscreen->debug_flags & DBG_NO_WC) flags &= ~RADEON_FLAG_GTT_WC; @@ -192,7 +201,7 @@ bool r600_init_resource(struct r600_common_screen *rscreen, res->TC_L2_dirty = false; if (rscreen->debug_flags & DBG_VM && res->b.b.target == PIPE_BUFFER) { - fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %u bytes\n", + fprintf(stderr, "VM start=0x%"PRIX64" end=0x%"PRIX64" | Buffer %"PRIu64" bytes\n", res->gpu_address, res->gpu_address + res->buf->size, res->buf->size); } diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c index f3529a1fe0f..9ab17d9e04c 100644 --- a/src/gallium/drivers/radeon/r600_perfcounter.c +++ b/src/gallium/drivers/radeon/r600_perfcounter.c @@ -310,7 +310,6 @@ struct pipe_query *r600_create_batch_query(struct pipe_context *ctx, query->b.b.ops = &batch_query_ops; query->b.ops = &batch_query_hw_ops; - query->b.flags = R600_QUERY_HW_FLAG_TIMER; query->num_counters = num_queries; diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 32bd6e40d32..a7477abea34 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -156,14 +156,8 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) void r600_preflush_suspend_features(struct r600_common_context *ctx) { /* suspend queries */ - if (ctx->num_cs_dw_nontimer_queries_suspend) { - /* Since non-timer queries are suspended during blits, - * we have to guard against double-suspends. */ - r600_suspend_nontimer_queries(ctx); - ctx->nontimer_queries_suspended_by_flush = true; - } - if (!LIST_IS_EMPTY(&ctx->active_timer_queries)) - r600_suspend_timer_queries(ctx); + if (!LIST_IS_EMPTY(&ctx->active_queries)) + r600_suspend_queries(ctx); ctx->streamout.suspended = false; if (ctx->streamout.begin_emitted) { @@ -180,12 +174,8 @@ void r600_postflush_resume_features(struct r600_common_context *ctx) } /* resume queries */ - if (!LIST_IS_EMPTY(&ctx->active_timer_queries)) - r600_resume_timer_queries(ctx); - if (ctx->nontimer_queries_suspended_by_flush) { - ctx->nontimer_queries_suspended_by_flush = false; - r600_resume_nontimer_queries(ctx); - } + if (!LIST_IS_EMPTY(&ctx->active_queries)) + r600_resume_queries(ctx); } static void r600_flush_from_st(struct pipe_context *ctx, @@ -296,6 +286,7 @@ bool r600_common_context_init(struct r600_common_context *rctx, LIST_INITHEAD(&rctx->texture_buffers); r600_init_context_texture_functions(rctx); + r600_init_viewport_functions(rctx); r600_streamout_init(rctx); r600_query_init(rctx); cayman_init_msaa(&rctx->b); @@ -898,6 +889,13 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, rscreen->chip_class = rscreen->info.chip_class; rscreen->debug_flags = debug_get_flags_option("R600_DEBUG", common_debug_options, 0); + rscreen->force_aniso = MIN2(16, debug_get_num_option("R600_TEX_ANISO", -1)); + if (rscreen->force_aniso >= 0) { + printf("radeon: Forcing anisotropy filter to %ix\n", + /* round down to a power of two */ + 1 << util_logbase2(rscreen->force_aniso)); + } + util_format_s3tc_init(); pipe_mutex_init(rscreen->aux_context_lock); pipe_mutex_init(rscreen->gpu_load_mutex); @@ -973,7 +971,7 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen, } void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, + uint64_t offset, uint64_t size, unsigned value, bool is_framebuffer) { struct r600_common_context *rctx = (struct r600_common_context*)rscreen->aux_context; diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 062c3193947..a6abe09d438 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -50,7 +50,10 @@ #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) #define R600_CONTEXT_STREAMOUT_FLUSH (1u << 0) -#define R600_CONTEXT_PRIVATE_FLAG (1u << 1) +/* Pipeline & streamout query controls. */ +#define R600_CONTEXT_START_PIPELINE_STATS (1u << 1) +#define R600_CONTEXT_STOP_PIPELINE_STATS (1u << 2) +#define R600_CONTEXT_PRIVATE_FLAG (1u << 3) /* special primitive types */ #define R600_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX @@ -94,9 +97,11 @@ #define DBG_MONOLITHIC_SHADERS (1llu << 47) #define R600_MAP_BUFFER_ALIGNMENT 64 +#define R600_MAX_VIEWPORTS 16 struct r600_common_context; struct r600_perfcounters; +struct tgsi_shader_info; struct radeon_shader_reloc { char name[32]; @@ -137,6 +142,9 @@ struct radeon_shader_binary { void radeon_shader_binary_init(struct radeon_shader_binary *b); void radeon_shader_binary_clean(struct radeon_shader_binary *b); +/* Only 32-bit buffer allocations are supported, gallium doesn't support more + * at the moment. + */ struct r600_resource { struct u_resource b; @@ -181,8 +189,8 @@ struct r600_transfer { }; struct r600_fmask_info { - unsigned offset; - unsigned size; + uint64_t offset; + uint64_t size; unsigned alignment; unsigned pitch_in_pixels; unsigned bank_height; @@ -191,8 +199,8 @@ struct r600_fmask_info { }; struct r600_cmask_info { - unsigned offset; - unsigned size; + uint64_t offset; + uint64_t size; unsigned alignment; unsigned pitch; unsigned height; @@ -212,7 +220,7 @@ struct r600_htile_info { struct r600_texture { struct r600_resource resource; - unsigned size; + uint64_t size; bool is_depth; unsigned dirty_level_mask; /* each bit says if that mipmap is compressed */ unsigned stencil_dirty_level_mask; /* each bit says if that mipmap is compressed */ @@ -224,7 +232,7 @@ struct r600_texture { struct r600_fmask_info fmask; struct r600_cmask_info cmask; struct r600_resource *cmask_buffer; - unsigned dcc_offset; /* 0 = disabled */ + uint64_t dcc_offset; /* 0 = disabled */ unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; @@ -298,6 +306,9 @@ struct r600_common_screen { bool has_cp_dma; bool has_streamout; + /* Texture filter settings. */ + int force_aniso; /* -1 = disabled */ + /* Auxiliary context. Mainly used to initialize resources. * It must be locked prior to using and flushed before unlocking. */ struct pipe_context *aux_context; @@ -388,6 +399,26 @@ struct r600_streamout { int num_prims_gen_queries; }; +struct r600_signed_scissor { + int minx; + int miny; + int maxx; + int maxy; +}; + +struct r600_scissors { + struct r600_atom atom; + unsigned dirty_mask; + struct pipe_scissor_state states[R600_MAX_VIEWPORTS]; +}; + +struct r600_viewports { + struct r600_atom atom; + unsigned dirty_mask; + struct pipe_viewport_state states[R600_MAX_VIEWPORTS]; + struct r600_signed_scissor as_scissor[R600_MAX_VIEWPORTS]; +}; + struct r600_ring { struct radeon_winsys_cs *cs; void (*flush)(void *ctx, unsigned flags, @@ -420,23 +451,20 @@ struct r600_common_context { /* States. */ struct r600_streamout streamout; + struct r600_scissors scissors; + struct r600_viewports viewports; + bool scissor_enabled; + bool vs_writes_viewport_index; /* Additional context states. */ unsigned flags; /* flush flags */ /* Queries. */ - /* The list of active queries. */ + /* Maintain the list of active queries for pausing between IBs. */ int num_occlusion_queries; int num_perfect_occlusion_queries; - /* Keep track of non-timer queries, because they should be suspended - * during context flushing. - * The timer queries (TIME_ELAPSED) shouldn't be suspended for blits, - * but they should be suspended between IBs. */ - struct list_head active_nontimer_queries; - struct list_head active_timer_queries; - unsigned num_cs_dw_nontimer_queries_suspend; - bool nontimer_queries_suspended_by_flush; - unsigned num_cs_dw_timer_queries_suspend; + struct list_head active_queries; + unsigned num_cs_dw_queries_suspend; /* Additional hardware info. */ unsigned backend_mask; unsigned max_db; /* for OQ */ @@ -476,7 +504,7 @@ struct r600_common_context { const struct pipe_box *src_box); void (*clear_buffer)(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, + uint64_t offset, uint64_t size, unsigned value, bool is_framebuffer); void (*blit_decompress_depth)(struct pipe_context *ctx, @@ -513,7 +541,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, unsigned usage); bool r600_init_resource(struct r600_common_screen *rscreen, struct r600_resource *res, - unsigned size, unsigned alignment, + uint64_t size, unsigned alignment, bool use_reusable_pool); struct pipe_resource *r600_buffer_create(struct pipe_screen *screen, const struct pipe_resource *templ, @@ -548,7 +576,7 @@ void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resour bool r600_can_dump_shader(struct r600_common_screen *rscreen, unsigned processor); void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, + uint64_t offset, uint64_t size, unsigned value, bool is_framebuffer); struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ); @@ -566,10 +594,8 @@ void r600_perfcounters_destroy(struct r600_common_screen *rscreen); /* r600_query.c */ void r600_init_screen_query_functions(struct r600_common_screen *rscreen); void r600_query_init(struct r600_common_context *rctx); -void r600_suspend_nontimer_queries(struct r600_common_context *ctx); -void r600_resume_nontimer_queries(struct r600_common_context *ctx); -void r600_suspend_timer_queries(struct r600_common_context *ctx); -void r600_resume_timer_queries(struct r600_common_context *ctx); +void r600_suspend_queries(struct r600_common_context *ctx); +void r600_resume_queries(struct r600_common_context *ctx); void r600_query_init_backend_mask(struct r600_common_context *ctx); /* r600_streamout.c */ @@ -612,6 +638,14 @@ void r600_texture_disable_dcc(struct r600_common_screen *rscreen, void r600_init_screen_texture_functions(struct r600_common_screen *rscreen); void r600_init_context_texture_functions(struct r600_common_context *rctx); +/* r600_viewport.c */ +void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx, + struct pipe_scissor_state *scissor); +void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable); +void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx, + struct tgsi_shader_info *info); +void r600_init_viewport_functions(struct r600_common_context *rctx); + /* cayman_msaa.c */ extern const uint32_t eg_sample_locs_2x[4]; extern const unsigned eg_max_dist_2x; @@ -639,13 +673,38 @@ r600_resource_reference(struct r600_resource **ptr, struct r600_resource *res) (struct pipe_resource *)res); } +static inline bool r600_get_strmout_en(struct r600_common_context *rctx) +{ + return rctx->streamout.streamout_enabled || + rctx->streamout.prims_gen_query_enabled; +} + +#define SQ_TEX_XY_FILTER_POINT 0x00 +#define SQ_TEX_XY_FILTER_BILINEAR 0x01 +#define SQ_TEX_XY_FILTER_ANISO_POINT 0x02 +#define SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x03 + +static inline unsigned eg_tex_filter(unsigned filter, unsigned max_aniso) +{ + if (filter == PIPE_TEX_FILTER_LINEAR) + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_BILINEAR + : SQ_TEX_XY_FILTER_BILINEAR; + else + return max_aniso > 1 ? SQ_TEX_XY_FILTER_ANISO_POINT + : SQ_TEX_XY_FILTER_POINT; +} + static inline unsigned r600_tex_aniso_filter(unsigned filter) { - if (filter <= 1) return 0; - if (filter <= 2) return 1; - if (filter <= 4) return 2; - if (filter <= 8) return 3; - /* else */ return 4; + if (filter < 2) + return 0; + if (filter < 4) + return 1; + if (filter < 8) + return 2; + if (filter < 16) + return 3; + return 4; } static inline unsigned r600_wavefront_size(enum radeon_family family) diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index 7a2d2ee7f31..de6e37b9f62 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -369,13 +369,11 @@ static struct pipe_query *r600_query_hw_create(struct r600_common_context *rctx, query->result_size = 16; query->num_cs_dw_begin = 8; query->num_cs_dw_end = 8; - query->flags = R600_QUERY_HW_FLAG_TIMER; break; case PIPE_QUERY_TIMESTAMP: query->result_size = 8; query->num_cs_dw_end = 8; - query->flags = R600_QUERY_HW_FLAG_TIMER | - R600_QUERY_HW_FLAG_NO_START; + query->flags = R600_QUERY_HW_FLAG_NO_START; break; case PIPE_QUERY_PRIMITIVES_EMITTED: case PIPE_QUERY_PRIMITIVES_GENERATED: @@ -516,10 +514,7 @@ static void r600_query_hw_emit_start(struct r600_common_context *ctx, query->ops->emit_start(ctx, query, query->buffer.buf, va); - if (query->flags & R600_QUERY_HW_FLAG_TIMER) - ctx->num_cs_dw_timer_queries_suspend += query->num_cs_dw_end; - else - ctx->num_cs_dw_nontimer_queries_suspend += query->num_cs_dw_end; + ctx->num_cs_dw_queries_suspend += query->num_cs_dw_end; } static void r600_query_hw_do_emit_stop(struct r600_common_context *ctx, @@ -590,12 +585,8 @@ static void r600_query_hw_emit_stop(struct r600_common_context *ctx, query->buffer.results_end += query->result_size; - if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) { - if (query->flags & R600_QUERY_HW_FLAG_TIMER) - ctx->num_cs_dw_timer_queries_suspend -= query->num_cs_dw_end; - else - ctx->num_cs_dw_nontimer_queries_suspend -= query->num_cs_dw_end; - } + if (!(query->flags & R600_QUERY_HW_FLAG_NO_START)) + ctx->num_cs_dw_queries_suspend -= query->num_cs_dw_end; r600_update_occlusion_query_state(ctx, query->b.type, -1); r600_update_prims_generated_query_state(ctx, query->b.type, -1); @@ -730,11 +721,8 @@ boolean r600_query_hw_begin(struct r600_common_context *rctx, r600_query_hw_emit_start(rctx, query); - if (query->flags & R600_QUERY_HW_FLAG_TIMER) - LIST_ADDTAIL(&query->list, &rctx->active_timer_queries); - else - LIST_ADDTAIL(&query->list, &rctx->active_nontimer_queries); - return true; + LIST_ADDTAIL(&query->list, &rctx->active_queries); + return true; } static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) @@ -973,28 +961,14 @@ static void r600_render_condition(struct pipe_context *ctx, rctx->set_atom_dirty(rctx, atom, query != NULL); } -static void r600_suspend_queries(struct r600_common_context *ctx, - struct list_head *query_list, - unsigned *num_cs_dw_queries_suspend) +void r600_suspend_queries(struct r600_common_context *ctx) { struct r600_query_hw *query; - LIST_FOR_EACH_ENTRY(query, query_list, list) { + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { r600_query_hw_emit_stop(ctx, query); } - assert(*num_cs_dw_queries_suspend == 0); -} - -void r600_suspend_nontimer_queries(struct r600_common_context *ctx) -{ - r600_suspend_queries(ctx, &ctx->active_nontimer_queries, - &ctx->num_cs_dw_nontimer_queries_suspend); -} - -void r600_suspend_timer_queries(struct r600_common_context *ctx) -{ - r600_suspend_queries(ctx, &ctx->active_timer_queries, - &ctx->num_cs_dw_timer_queries_suspend); + assert(ctx->num_cs_dw_queries_suspend == 0); } static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context *ctx, @@ -1022,35 +996,21 @@ static unsigned r600_queries_num_cs_dw_for_resuming(struct r600_common_context * return num_dw; } -static void r600_resume_queries(struct r600_common_context *ctx, - struct list_head *query_list, - unsigned *num_cs_dw_queries_suspend) +void r600_resume_queries(struct r600_common_context *ctx) { struct r600_query_hw *query; - unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, query_list); + unsigned num_cs_dw = r600_queries_num_cs_dw_for_resuming(ctx, &ctx->active_queries); - assert(*num_cs_dw_queries_suspend == 0); + assert(ctx->num_cs_dw_queries_suspend == 0); /* Check CS space here. Resuming must not be interrupted by flushes. */ ctx->need_gfx_cs_space(&ctx->b, num_cs_dw, TRUE); - LIST_FOR_EACH_ENTRY(query, query_list, list) { + LIST_FOR_EACH_ENTRY(query, &ctx->active_queries, list) { r600_query_hw_emit_start(ctx, query); } } -void r600_resume_nontimer_queries(struct r600_common_context *ctx) -{ - r600_resume_queries(ctx, &ctx->active_nontimer_queries, - &ctx->num_cs_dw_nontimer_queries_suspend); -} - -void r600_resume_timer_queries(struct r600_common_context *ctx) -{ - r600_resume_queries(ctx, &ctx->active_timer_queries, - &ctx->num_cs_dw_timer_queries_suspend); -} - /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { @@ -1274,8 +1234,7 @@ void r600_query_init(struct r600_common_context *rctx) if (((struct r600_common_screen*)rctx->b.screen)->info.num_render_backends > 0) rctx->b.render_condition = r600_render_condition; - LIST_INITHEAD(&rctx->active_nontimer_queries); - LIST_INITHEAD(&rctx->active_timer_queries); + LIST_INITHEAD(&rctx->active_queries); } void r600_init_screen_query_functions(struct r600_common_screen *rscreen) diff --git a/src/gallium/drivers/radeon/r600_query.h b/src/gallium/drivers/radeon/r600_query.h index 8b2c4e3fe93..9f3a917d727 100644 --- a/src/gallium/drivers/radeon/r600_query.h +++ b/src/gallium/drivers/radeon/r600_query.h @@ -84,8 +84,7 @@ struct r600_query { enum { R600_QUERY_HW_FLAG_NO_START = (1 << 0), - R600_QUERY_HW_FLAG_TIMER = (1 << 1), - R600_QUERY_HW_FLAG_PREDICATE = (1 << 2), + R600_QUERY_HW_FLAG_PREDICATE = (1 << 1), }; struct r600_query_hw_ops { diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c index e977ed9fa10..fc9ec4859f6 100644 --- a/src/gallium/drivers/radeon/r600_streamout.c +++ b/src/gallium/drivers/radeon/r600_streamout.c @@ -311,12 +311,6 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) * are no buffers bound. */ -static bool r600_get_strmout_en(struct r600_common_context *rctx) -{ - return rctx->streamout.streamout_enabled || - rctx->streamout.prims_gen_query_enabled; -} - static void r600_emit_streamout_enable(struct r600_common_context *rctx, struct r600_atom *atom) { diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index 4850b73f291..72af5344b70 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -222,10 +222,6 @@ static int r600_setup_surface(struct pipe_screen *screen, rtex->surface.level[0].nblk_x = pitch_in_bytes_override / rtex->surface.bpe; rtex->surface.level[0].pitch_bytes = pitch_in_bytes_override; rtex->surface.level[0].slice_size = pitch_in_bytes_override * rtex->surface.level[0].nblk_y; - if (rtex->surface.flags & RADEON_SURF_SBUFFER) { - rtex->surface.stencil_offset = - rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size; - } } if (offset) { @@ -482,7 +478,7 @@ static void r600_texture_allocate_fmask(struct r600_common_screen *rscreen, r600_texture_get_fmask_info(rscreen, rtex, rtex->resource.b.b.nr_samples, &rtex->fmask); - rtex->fmask.offset = align(rtex->size, rtex->fmask.alignment); + rtex->fmask.offset = align64(rtex->size, rtex->fmask.alignment); rtex->size = rtex->fmask.offset + rtex->fmask.size; } @@ -585,7 +581,7 @@ static void r600_texture_allocate_cmask(struct r600_common_screen *rscreen, r600_texture_get_cmask_info(rscreen, rtex, &rtex->cmask); } - rtex->cmask.offset = align(rtex->size, rtex->cmask.alignment); + rtex->cmask.offset = align64(rtex->size, rtex->cmask.alignment); rtex->size = rtex->cmask.offset + rtex->cmask.size; if (rscreen->chip_class >= SI) @@ -747,14 +743,14 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f) (rtex->surface.flags & RADEON_SURF_SCANOUT) != 0); if (rtex->fmask.size) - fprintf(f, " FMask: offset=%u, size=%u, alignment=%u, pitch_in_pixels=%u, " + fprintf(f, " FMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch_in_pixels=%u, " "bankh=%u, slice_tile_max=%u, tile_mode_index=%u\n", rtex->fmask.offset, rtex->fmask.size, rtex->fmask.alignment, rtex->fmask.pitch_in_pixels, rtex->fmask.bank_height, rtex->fmask.slice_tile_max, rtex->fmask.tile_mode_index); if (rtex->cmask.size) - fprintf(f, " CMask: offset=%u, size=%u, alignment=%u, pitch=%u, " + fprintf(f, " CMask: offset=%"PRIu64", size=%"PRIu64", alignment=%u, pitch=%u, " "height=%u, xalign=%u, yalign=%u, slice_tile_max=%u\n", rtex->cmask.offset, rtex->cmask.size, rtex->cmask.alignment, rtex->cmask.pitch, rtex->cmask.height, rtex->cmask.xalign, @@ -768,7 +764,7 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f) rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign); if (rtex->dcc_offset) { - fprintf(f, " DCC: offset=%u, size=%"PRIu64", alignment=%"PRIu64"\n", + fprintf(f, " DCC: offset=%"PRIu64", size=%"PRIu64", alignment=%"PRIu64"\n", rtex->dcc_offset, rtex->surface.dcc_size, rtex->surface.dcc_alignment); for (i = 0; i <= rtex->surface.last_level; i++) @@ -873,7 +869,7 @@ r600_texture_create_object(struct pipe_screen *screen, if (!buf && rtex->surface.dcc_size && !(rscreen->debug_flags & DBG_NO_DCC)) { /* Reserve space for the DCC buffer. */ - rtex->dcc_offset = align(rtex->size, rtex->surface.dcc_alignment); + rtex->dcc_offset = align64(rtex->size, rtex->surface.dcc_alignment); rtex->size = rtex->dcc_offset + rtex->surface.dcc_size; rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1); } @@ -947,13 +943,12 @@ static unsigned r600_choose_tiling(struct r600_common_screen *rscreen, force_tiling = true; /* Handle common candidates for the linear mode. - * Compressed textures must always be tiled. */ - if (!force_tiling && !util_format_is_compressed(templ->format)) { - /* Not everything can be linear, so we cannot enforce it - * for all textures. */ - if ((rscreen->debug_flags & DBG_NO_TILING) && - (!util_format_is_depth_or_stencil(templ->format) || - !(templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH))) + * Compressed textures and DB surfaces must always be tiled. + */ + if (!force_tiling && !util_format_is_compressed(templ->format) && + (!util_format_is_depth_or_stencil(templ->format) || + templ->flags & R600_RESOURCE_FLAG_FLUSHED_DEPTH)) { + if (rscreen->debug_flags & DBG_NO_TILING) return RADEON_SURF_MODE_LINEAR_ALIGNED; /* Tiling doesn't work with the 422 (SUBSAMPLED) formats on R600+. */ diff --git a/src/gallium/drivers/radeon/r600_viewport.c b/src/gallium/drivers/radeon/r600_viewport.c new file mode 100644 index 00000000000..ea558cd22de --- /dev/null +++ b/src/gallium/drivers/radeon/r600_viewport.c @@ -0,0 +1,350 @@ +/* + * Copyright 2012 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "r600_cs.h" +#include "tgsi/tgsi_scan.h" + +#define GET_MAX_SCISSOR(rctx) (rctx->chip_class >= EVERGREEN ? 16384 : 8192) + +static void r600_set_scissor_states(struct pipe_context *ctx, + unsigned start_slot, + unsigned num_scissors, + const struct pipe_scissor_state *state) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + int i; + + for (i = 0; i < num_scissors; i++) + rctx->scissors.states[start_slot + i] = state[i]; + + if (!rctx->scissor_enabled) + return; + + rctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot; + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); +} + +/* Since the guard band disables clipping, we have to clip per-pixel + * using a scissor. + */ +static void r600_get_scissor_from_viewport(struct r600_common_context *rctx, + const struct pipe_viewport_state *vp, + struct r600_signed_scissor *scissor) +{ + int tmp; + + /* Convert (-1, -1) and (1, 1) from clip space into window space. */ + scissor->minx = -vp->scale[0] + vp->translate[0]; + scissor->miny = -vp->scale[1] + vp->translate[1]; + scissor->maxx = vp->scale[0] + vp->translate[0]; + scissor->maxy = vp->scale[1] + vp->translate[1]; + + /* r600_draw_rectangle sets this. Disable the scissor. */ + if (scissor->minx == -1 && scissor->miny == -1 && + scissor->maxx == 1 && scissor->maxy == 1) { + scissor->minx = scissor->miny = 0; + scissor->maxx = scissor->maxy = GET_MAX_SCISSOR(rctx); + } + + /* Handle inverted viewports. */ + if (scissor->minx > scissor->maxx) { + tmp = scissor->minx; + scissor->minx = scissor->maxx; + scissor->maxx = tmp; + } + if (scissor->miny > scissor->maxy) { + tmp = scissor->miny; + scissor->miny = scissor->maxy; + scissor->maxy = tmp; + } +} + +static void r600_clamp_scissor(struct r600_common_context *rctx, + struct pipe_scissor_state *out, + struct r600_signed_scissor *scissor) +{ + unsigned max_scissor = GET_MAX_SCISSOR(rctx); + out->minx = CLAMP(scissor->minx, 0, max_scissor); + out->miny = CLAMP(scissor->miny, 0, max_scissor); + out->maxx = CLAMP(scissor->maxx, 0, max_scissor); + out->maxy = CLAMP(scissor->maxy, 0, max_scissor); +} + +static void r600_clip_scissor(struct pipe_scissor_state *out, + struct pipe_scissor_state *clip) +{ + out->minx = MAX2(out->minx, clip->minx); + out->miny = MAX2(out->miny, clip->miny); + out->maxx = MIN2(out->maxx, clip->maxx); + out->maxy = MIN2(out->maxy, clip->maxy); +} + +static void r600_scissor_make_union(struct r600_signed_scissor *out, + struct r600_signed_scissor *in) +{ + out->minx = MIN2(out->minx, in->minx); + out->miny = MIN2(out->miny, in->miny); + out->maxx = MAX2(out->maxx, in->maxx); + out->maxy = MAX2(out->maxy, in->maxy); +} + +void evergreen_apply_scissor_bug_workaround(struct r600_common_context *rctx, + struct pipe_scissor_state *scissor) +{ + if (rctx->chip_class == EVERGREEN || rctx->chip_class == CAYMAN) { + if (scissor->maxx == 0) + scissor->minx = 1; + if (scissor->maxy == 0) + scissor->miny = 1; + + if (rctx->chip_class == CAYMAN && + scissor->maxx == 1 && scissor->maxy == 1) + scissor->maxx = 2; + } +} + +static void r600_emit_one_scissor(struct r600_common_context *rctx, + struct radeon_winsys_cs *cs, + struct r600_signed_scissor *vp_scissor, + struct pipe_scissor_state *scissor) +{ + struct pipe_scissor_state final; + + r600_clamp_scissor(rctx, &final, vp_scissor); + + if (scissor) + r600_clip_scissor(&final, scissor); + + evergreen_apply_scissor_bug_workaround(rctx, &final); + + radeon_emit(cs, S_028250_TL_X(final.minx) | + S_028250_TL_Y(final.miny) | + S_028250_WINDOW_OFFSET_DISABLE(1)); + radeon_emit(cs, S_028254_BR_X(final.maxx) | + S_028254_BR_Y(final.maxy)); +} + +/* the range is [-MAX, MAX] */ +#define GET_MAX_VIEWPORT_RANGE(rctx) (rctx->chip_class >= EVERGREEN ? 32768 : 16384) + +static void r600_emit_guardband(struct r600_common_context *rctx, + struct r600_signed_scissor *vp_as_scissor) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_viewport_state vp; + float left, top, right, bottom, max_range, guardband_x, guardband_y; + + /* Reconstruct the viewport transformation from the scissor. */ + vp.translate[0] = (vp_as_scissor->minx + vp_as_scissor->maxx) / 2.0; + vp.translate[1] = (vp_as_scissor->miny + vp_as_scissor->maxy) / 2.0; + vp.scale[0] = vp_as_scissor->maxx - vp.translate[0]; + vp.scale[1] = vp_as_scissor->maxy - vp.translate[1]; + + /* Treat a 0x0 viewport as 1x1 to prevent division by zero. */ + if (vp_as_scissor->minx == vp_as_scissor->maxx) + vp.scale[0] = 0.5; + if (vp_as_scissor->miny == vp_as_scissor->maxy) + vp.scale[1] = 0.5; + + /* Find the biggest guard band that is inside the supported viewport + * range. The guard band is specified as a horizontal and vertical + * distance from (0,0) in clip space. + * + * This is done by applying the inverse viewport transformation + * on the viewport limits to get those limits in clip space. + * + * Use a limit one pixel smaller to allow for some precision error. + */ + max_range = GET_MAX_VIEWPORT_RANGE(rctx) - 1; + left = (-max_range - vp.translate[0]) / vp.scale[0]; + right = ( max_range - vp.translate[0]) / vp.scale[0]; + top = (-max_range - vp.translate[1]) / vp.scale[1]; + bottom = ( max_range - vp.translate[1]) / vp.scale[1]; + + assert(left <= -1 && top <= -1 && right >= 1 && bottom >= 1); + + guardband_x = MIN2(-left, right); + guardband_y = MIN2(-top, bottom); + + /* If any of the GB registers is updated, all of them must be updated. */ + if (rctx->chip_class >= CAYMAN) + radeon_set_context_reg_seq(cs, CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4); + else + radeon_set_context_reg_seq(cs, R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ, 4); + + radeon_emit(cs, fui(guardband_y)); /* R_028BE8_PA_CL_GB_VERT_CLIP_ADJ */ + radeon_emit(cs, fui(1.0)); /* R_028BEC_PA_CL_GB_VERT_DISC_ADJ */ + radeon_emit(cs, fui(guardband_x)); /* R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ */ + radeon_emit(cs, fui(1.0)); /* R_028BF4_PA_CL_GB_HORZ_DISC_ADJ */ +} + +static void r600_emit_scissors(struct r600_common_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_scissor_state *states = rctx->scissors.states; + unsigned mask = rctx->scissors.dirty_mask; + bool scissor_enabled = rctx->scissor_enabled; + struct r600_signed_scissor max_vp_scissor; + int i; + + /* The simple case: Only 1 viewport is active. */ + if (!rctx->vs_writes_viewport_index) { + struct r600_signed_scissor *vp = &rctx->viewports.as_scissor[0]; + + if (!(mask & 1)) + return; + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); + r600_emit_one_scissor(rctx, cs, vp, scissor_enabled ? &states[0] : NULL); + r600_emit_guardband(rctx, vp); + rctx->scissors.dirty_mask &= ~1; /* clear one bit */ + return; + } + + /* Shaders can draw to any viewport. Make a union of all viewports. */ + max_vp_scissor = rctx->viewports.as_scissor[0]; + for (i = 1; i < R600_MAX_VIEWPORTS; i++) + r600_scissor_make_union(&max_vp_scissor, + &rctx->viewports.as_scissor[i]); + + while (mask) { + int start, count, i; + + u_bit_scan_consecutive_range(&mask, &start, &count); + + radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + + start * 4 * 2, count * 2); + for (i = start; i < start+count; i++) { + r600_emit_one_scissor(rctx, cs, &rctx->viewports.as_scissor[i], + scissor_enabled ? &states[i] : NULL); + } + } + r600_emit_guardband(rctx, &max_vp_scissor); + rctx->scissors.dirty_mask = 0; +} + +static void r600_set_viewport_states(struct pipe_context *ctx, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *state) +{ + struct r600_common_context *rctx = (struct r600_common_context *)ctx; + int i; + + for (i = 0; i < num_viewports; i++) { + unsigned index = start_slot + i; + + rctx->viewports.states[index] = state[i]; + r600_get_scissor_from_viewport(rctx, &state[i], + &rctx->viewports.as_scissor[index]); + } + + rctx->viewports.dirty_mask |= ((1 << num_viewports) - 1) << start_slot; + rctx->scissors.dirty_mask |= ((1 << num_viewports) - 1) << start_slot; + rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true); + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); +} + +static void r600_emit_viewports(struct r600_common_context *rctx, struct r600_atom *atom) +{ + struct radeon_winsys_cs *cs = rctx->gfx.cs; + struct pipe_viewport_state *states = rctx->viewports.states; + unsigned mask = rctx->viewports.dirty_mask; + + /* The simple case: Only 1 viewport is active. */ + if (!rctx->vs_writes_viewport_index) { + if (!(mask & 1)) + return; + + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); + radeon_emit(cs, fui(states[0].scale[0])); + radeon_emit(cs, fui(states[0].translate[0])); + radeon_emit(cs, fui(states[0].scale[1])); + radeon_emit(cs, fui(states[0].translate[1])); + radeon_emit(cs, fui(states[0].scale[2])); + radeon_emit(cs, fui(states[0].translate[2])); + rctx->viewports.dirty_mask &= ~1; /* clear one bit */ + return; + } + + while (mask) { + int start, count, i; + + u_bit_scan_consecutive_range(&mask, &start, &count); + + radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + + start * 4 * 6, count * 6); + for (i = start; i < start+count; i++) { + radeon_emit(cs, fui(states[i].scale[0])); + radeon_emit(cs, fui(states[i].translate[0])); + radeon_emit(cs, fui(states[i].scale[1])); + radeon_emit(cs, fui(states[i].translate[1])); + radeon_emit(cs, fui(states[i].scale[2])); + radeon_emit(cs, fui(states[i].translate[2])); + } + } + rctx->viewports.dirty_mask = 0; +} + +void r600_set_scissor_enable(struct r600_common_context *rctx, bool enable) +{ + if (rctx->scissor_enabled != enable) { + rctx->scissor_enabled = enable; + rctx->scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); + } +} + +/** + * Normally, we only emit 1 viewport and 1 scissor if no shader is using + * the VIEWPORT_INDEX output, and emitting the other viewports and scissors + * is delayed. When a shader with VIEWPORT_INDEX appears, this should be + * called to emit the rest. + */ +void r600_update_vs_writes_viewport_index(struct r600_common_context *rctx, + struct tgsi_shader_info *info) +{ + if (!info) + return; + + rctx->vs_writes_viewport_index = info->writes_viewport_index; + if (!rctx->vs_writes_viewport_index) + return; + + if (rctx->scissors.dirty_mask) + rctx->set_atom_dirty(rctx, &rctx->scissors.atom, true); + if (rctx->viewports.dirty_mask) + rctx->set_atom_dirty(rctx, &rctx->viewports.atom, true); +} + +void r600_init_viewport_functions(struct r600_common_context *rctx) +{ + rctx->scissors.atom.emit = r600_emit_scissors; + rctx->viewports.atom.emit = r600_emit_viewports; + + rctx->scissors.atom.num_dw = (2 + 16 * 2) + 6; + rctx->viewports.atom.num_dw = 2 + 16 * 6; + + rctx->b.set_scissor_states = r600_set_scissor_states; + rctx->b.set_viewport_states = r600_set_viewport_states; +} diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h index eeec6ef7385..c8cb5e217e8 100644 --- a/src/gallium/drivers/radeon/r600d_common.h +++ b/src/gallium/drivers/radeon/r600d_common.h @@ -220,4 +220,25 @@ /*CIK+*/ #define R_0300FC_CP_STRMOUT_CNTL 0x0300FC +#define R600_R_028C0C_PA_CL_GB_VERT_CLIP_ADJ 0x028C0C +#define CM_R_028BE8_PA_CL_GB_VERT_CLIP_ADJ 0x28be8 +#define R_02843C_PA_CL_VPORT_XSCALE 0x02843C + +#define R_028250_PA_SC_VPORT_SCISSOR_0_TL 0x028250 +#define S_028250_TL_X(x) (((x) & 0x7FFF) << 0) +#define G_028250_TL_X(x) (((x) >> 0) & 0x7FFF) +#define C_028250_TL_X 0xFFFF8000 +#define S_028250_TL_Y(x) (((x) & 0x7FFF) << 16) +#define G_028250_TL_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028250_TL_Y 0x8000FFFF +#define S_028250_WINDOW_OFFSET_DISABLE(x) (((x) & 0x1) << 31) +#define G_028250_WINDOW_OFFSET_DISABLE(x) (((x) >> 31) & 0x1) +#define C_028250_WINDOW_OFFSET_DISABLE 0x7FFFFFFF +#define S_028254_BR_X(x) (((x) & 0x7FFF) << 0) +#define G_028254_BR_X(x) (((x) >> 0) & 0x7FFF) +#define C_028254_BR_X 0xFFFF8000 +#define S_028254_BR_Y(x) (((x) & 0x7FFF) << 16) +#define G_028254_BR_Y(x) (((x) >> 16) & 0x7FFF) +#define C_028254_BR_Y 0x8000FFFF + #endif diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index 233f46091a4..098baf20797 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -1003,7 +1003,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; - dec->msg->body.decode.db_pitch = dec->base.width; + dec->msg->body.decode.db_pitch = align(dec->base.width, 16); dt = dec->set_dtb(dec->msg, (struct vl_video_buffer *)target); if (((struct r600_common_screen*)dec->screen)->family >= CHIP_STONEY) diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index baecca72383..0c03652081c 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -245,6 +245,7 @@ struct radeon_info { enum chip_class chip_class; uint64_t gart_size; uint64_t vram_size; + bool has_dedicated_vram; boolean has_virtual_memory; bool gfx_ib_pad_with_type2; boolean has_sdma; @@ -449,7 +450,7 @@ struct radeon_winsys { * \return The created buffer object. */ struct pb_buffer *(*buffer_create)(struct radeon_winsys *ws, - unsigned size, + uint64_t size, unsigned alignment, boolean use_reusable_pool, enum radeon_bo_domain domain, @@ -528,7 +529,7 @@ struct radeon_winsys { * \param Size Size in bytes for the new buffer. */ struct pb_buffer *(*buffer_from_ptr)(struct radeon_winsys *ws, - void *pointer, unsigned size); + void *pointer, uint64_t size); /** * Whether the buffer was created from a user pointer. diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index c5ea8b17119..54da7a20203 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -52,8 +52,6 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) { struct si_context *sctx = (struct si_context *)ctx; - r600_suspend_nontimer_queries(&sctx->b); - util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); @@ -70,8 +68,8 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask); - util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]); - util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]); + util_blitter_save_viewport(sctx->blitter, &sctx->b.viewports.states[0]); + util_blitter_save_scissor(sctx->blitter, &sctx->b.scissors.states[0]); } if (op & SI_SAVE_FRAMEBUFFER) @@ -95,7 +93,6 @@ static void si_blitter_end(struct pipe_context *ctx) struct si_context *sctx = (struct si_context *)ctx; sctx->b.render_cond_force_off = false; - r600_resume_nontimer_queries(&sctx->b); } static unsigned u_max_sample(struct pipe_resource *r) diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index dc62415823e..001ddd4bfae 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -124,7 +124,7 @@ static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer) static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, unsigned byte_count, - unsigned remaining_size, unsigned *flags) + uint64_t remaining_size, unsigned *flags) { si_need_cs_space(sctx); @@ -158,7 +158,7 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT) static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, - unsigned offset, unsigned size, unsigned value, + uint64_t offset, uint64_t size, unsigned value, bool is_framebuffer) { struct si_context *sctx = (struct si_context*)ctx; @@ -180,7 +180,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); map += offset; - for (unsigned i = 0; i < size; i++) { + for (uint64_t i = 0; i < size; i++) { unsigned byte_within_dword = (offset + i) % 4; *map++ = (value >> (byte_within_dword * 8)) & 0xff; } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 6dd2e4fd89d..b5557d800c7 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -746,6 +746,55 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s buffers->desc.list_dirty = true; } +/* SHADER BUFFERS */ + +static void si_set_shader_buffers(struct pipe_context *ctx, unsigned shader, + unsigned start_slot, unsigned count, + struct pipe_shader_buffer *sbuffers) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct si_buffer_resources *buffers = &sctx->shader_buffers[shader]; + unsigned i; + + assert(start_slot + count <= SI_NUM_SHADER_BUFFERS); + + for (i = 0; i < count; ++i) { + struct pipe_shader_buffer *sbuffer = sbuffers ? &sbuffers[i] : NULL; + struct r600_resource *buf; + unsigned slot = start_slot + i; + uint32_t *desc = buffers->desc.list + slot * 4; + uint64_t va; + + if (!sbuffer || !sbuffer->buffer) { + pipe_resource_reference(&buffers->buffers[slot], NULL); + memset(desc, 0, sizeof(uint32_t) * 4); + buffers->desc.enabled_mask &= ~(1llu << slot); + continue; + } + + buf = (struct r600_resource *)sbuffer->buffer; + va = buf->gpu_address + sbuffer->buffer_offset; + + desc[0] = va; + desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(0); + desc[2] = sbuffer->buffer_size; + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + + pipe_resource_reference(&buffers->buffers[slot], &buf->b.b); + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buf, + buffers->shader_usage, buffers->priority); + buffers->desc.enabled_mask |= 1llu << slot; + } + + buffers->desc.list_dirty = true; +} + /* RING BUFFERS */ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, @@ -883,6 +932,12 @@ static void si_set_streamout_targets(struct pipe_context *ctx, SI_CONTEXT_VS_PARTIAL_FLUSH; } + /* All readers of the streamout targets need to be finished before we can + * start writing to the targets. + */ + if (num_targets) + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + /* Streamout buffers must be bound in 2 places: * 1) in VGT by setting the VGT_STRMOUT registers * 2) as shader resources @@ -977,6 +1032,30 @@ void si_update_compressed_colortex_masks(struct si_context *sctx) /* BUFFER DISCARD/INVALIDATION */ +/** Reset descriptors of buffer resources after \p buf has been invalidated. */ +static void si_reset_buffer_resources(struct si_context *sctx, + struct si_buffer_resources *buffers, + struct pipe_resource *buf, + uint64_t old_va) +{ + uint64_t mask = buffers->desc.enabled_mask; + + while (mask) { + unsigned i = u_bit_scan64(&mask); + if (buffers->buffers[i] == buf) { + si_desc_reset_buffer_offset(&sctx->b.b, + buffers->desc.list + i*4, + old_va, buf); + buffers->desc.list_dirty = true; + + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + (struct r600_resource *)buf, + buffers->shader_usage, + buffers->priority); + } + } +} + /* Reallocate a buffer a update all resource bindings where the buffer is * bound. * @@ -1048,23 +1127,12 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource } } - /* Constant buffers. */ + /* Constant and shader buffers. */ for (shader = 0; shader < SI_NUM_SHADERS; shader++) { - struct si_buffer_resources *buffers = &sctx->const_buffers[shader]; - uint64_t mask = buffers->desc.enabled_mask; - - while (mask) { - unsigned i = u_bit_scan64(&mask); - if (buffers->buffers[i] == buf) { - si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4, - old_va, buf); - buffers->desc.list_dirty = true; - - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rbuffer, buffers->shader_usage, - buffers->priority); - } - } + si_reset_buffer_resources(sctx, &sctx->const_buffers[shader], + buf, old_va); + si_reset_buffer_resources(sctx, &sctx->shader_buffers[shader], + buf, old_va); } /* Texture buffers - update virtual addresses in sampler view descriptors. */ @@ -1244,6 +1312,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom) si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false); si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false); + si_emit_shader_pointer(sctx, &sctx->shader_buffers[i].desc, base, false); si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false); si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false); } @@ -1263,6 +1332,9 @@ void si_init_all_descriptors(struct si_context *sctx) si_init_buffer_resources(&sctx->rw_buffers[i], SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT); + si_init_buffer_resources(&sctx->shader_buffers[i], + SI_NUM_SHADER_BUFFERS, SI_SGPR_SHADER_BUFFERS, + RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER); si_init_descriptors(&sctx->samplers[i].views.desc, SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS, @@ -1280,6 +1352,7 @@ void si_init_all_descriptors(struct si_context *sctx) sctx->b.b.bind_sampler_states = si_bind_sampler_states; sctx->b.b.set_shader_images = si_set_shader_images; sctx->b.b.set_constant_buffer = si_set_constant_buffer; + sctx->b.b.set_shader_buffers = si_set_shader_buffers; sctx->b.b.set_sampler_views = si_set_sampler_views; sctx->b.b.set_stream_output_targets = si_set_streamout_targets; sctx->b.invalidate_buffer = si_invalidate_buffer; @@ -1302,6 +1375,7 @@ bool si_upload_shader_descriptors(struct si_context *sctx) for (i = 0; i < SI_NUM_SHADERS; i++) { if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) || !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) || + !si_upload_descriptors(sctx, &sctx->shader_buffers[i].desc) || !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) || !si_upload_descriptors(sctx, &sctx->images[i].desc)) return false; @@ -1316,6 +1390,7 @@ void si_release_all_descriptors(struct si_context *sctx) for (i = 0; i < SI_NUM_SHADERS; i++) { si_release_buffer_resources(&sctx->const_buffers[i]); si_release_buffer_resources(&sctx->rw_buffers[i]); + si_release_buffer_resources(&sctx->shader_buffers[i]); si_release_sampler_views(&sctx->samplers[i].views); si_release_image_views(&sctx->images[i]); } @@ -1329,6 +1404,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) for (i = 0; i < SI_NUM_SHADERS; i++) { si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]); si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]); + si_buffer_resources_begin_new_cs(sctx, &sctx->shader_buffers[i]); si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views); si_image_views_begin_new_cs(sctx, &sctx->images[i]); } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 8c900a4ecb6..b621b55abd3 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -155,7 +155,8 @@ void si_begin_new_cs(struct si_context *ctx) SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_INV_SMEM_L1 | - SI_CONTEXT_INV_ICACHE; + SI_CONTEXT_INV_ICACHE | + R600_CONTEXT_START_PIPELINE_STATS; /* set all valid group as dirty so they get reemited on * next draw command @@ -185,10 +186,10 @@ void si_begin_new_cs(struct si_context *ctx) si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom); si_all_descriptors_begin_new_cs(ctx); - ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - ctx->viewports.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - si_mark_atom_dirty(ctx, &ctx->scissors.atom); - si_mark_atom_dirty(ctx, &ctx->viewports.atom); + ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1; + si_mark_atom_dirty(ctx, &ctx->b.scissors.atom); + si_mark_atom_dirty(ctx, &ctx->b.viewports.atom); r600_postflush_resume_features(&ctx->b); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 41bb84d68df..6a990ed64c3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -308,6 +308,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_QUERY_MEMORY_INFO: case PIPE_CAP_TGSI_PACK_HALF_FLOAT: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 1; case PIPE_CAP_RESOURCE_FROM_USER_MEMORY: @@ -332,9 +333,12 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return 4; + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + return HAVE_LLVM >= 0x0309 ? 4 : 0; case PIPE_CAP_GLSL_FEATURE_LEVEL: - return HAVE_LLVM >= 0x0307 ? 410 : 330; + return HAVE_LLVM >= 0x0309 ? 420 : + HAVE_LLVM >= 0x0307 ? 410 : 330; case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: return MIN2(sscreen->b.info.vram_size, 0xFFFFFFFF); @@ -353,7 +357,6 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_DRAW_PARAMETERS: case PIPE_CAP_MULTI_DRAW_INDIRECT: case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_STRING_MARKER: case PIPE_CAP_QUERY_BUFFER_OBJECT: @@ -401,7 +404,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 8; case PIPE_CAP_MAX_VIEWPORTS: - return SI_MAX_VIEWPORTS; + return R600_MAX_VIEWPORTS; /* Timer queries, present when the clock frequency is non zero. */ case PIPE_CAP_QUERY_TIMESTAMP: @@ -539,7 +542,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - return 0; + return HAVE_LLVM >= 0x0309 ? SI_NUM_SHADER_BUFFERS : 0; case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 4158fc5461e..0398b1df61e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -76,7 +76,6 @@ #define SI_IS_TRACE_POINT(x) (((x) & 0xcafe0000) == 0xcafe0000) #define SI_GET_TRACE_POINT_ID(x) ((x) & 0xffff) -#define SI_MAX_VIEWPORTS 16 #define SI_MAX_BORDER_COLORS 4096 struct si_compute; @@ -173,18 +172,6 @@ struct si_sample_mask { uint16_t sample_mask; }; -struct si_scissors { - struct r600_atom atom; - unsigned dirty_mask; - struct pipe_scissor_state states[SI_MAX_VIEWPORTS]; -}; - -struct si_viewports { - struct r600_atom atom; - unsigned dirty_mask; - struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; -}; - /* A shader state consists of the shader selector, which is a constant state * object shared by multiple contexts and shouldn't be modified, and * the current shader variant selected for this context. @@ -228,8 +215,6 @@ struct si_context { struct r600_atom clip_regs; struct si_clip_state clip_state; struct si_shader_data shader_userdata; - struct si_scissors scissors; - struct si_viewports viewports; struct si_stencil_ref stencil_ref; struct r600_atom spi_map; @@ -256,6 +241,7 @@ struct si_context { struct si_descriptors vertex_buffers; struct si_buffer_resources const_buffers[SI_NUM_SHADERS]; struct si_buffer_resources rw_buffers[SI_NUM_SHADERS]; + struct si_buffer_resources shader_buffers[SI_NUM_SHADERS]; struct si_textures_info samplers[SI_NUM_SHADERS]; struct si_images_info images[SI_NUM_SHADERS]; @@ -289,6 +275,7 @@ struct si_context { bool db_stencil_clear; bool db_stencil_disable_expclear; unsigned ps_db_shader_control; + bool occlusion_queries_disabled; /* Emitted draw state. */ int last_base_vertex; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 08da3e37550..c58467ddcb0 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -98,6 +98,7 @@ struct si_shader_context LLVMValueRef const_buffers[SI_NUM_CONST_BUFFERS]; LLVMValueRef lds; LLVMValueRef *constants[SI_NUM_CONST_BUFFERS]; + LLVMValueRef shader_buffers[SI_NUM_SHADER_BUFFERS]; LLVMValueRef sampler_views[SI_NUM_SAMPLERS]; LLVMValueRef sampler_states[SI_NUM_SAMPLERS]; LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS]; @@ -2775,6 +2776,24 @@ static void membar_emit( emit_optimization_barrier(ctx); } +static LLVMValueRef +shader_buffer_fetch_rsrc(struct si_shader_context *ctx, + const struct tgsi_full_src_register *reg) +{ + LLVMValueRef ind_index; + LLVMValueRef rsrc_ptr; + + if (!reg->Register.Indirect) + return ctx->shader_buffers[reg->Register.Index]; + + ind_index = get_bounded_indirect_index(ctx, ®->Indirect, + reg->Register.Index, + SI_NUM_SHADER_BUFFERS); + + rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS); + return build_indexed_load_const(ctx, rsrc_ptr, ind_index); +} + static bool tgsi_is_array_sampler(unsigned target) { return target == TGSI_TEXTURE_1D_ARRAY || @@ -2924,32 +2943,46 @@ static void image_append_args( } /** + * Given a 256 bit resource, extract the top half (which stores the buffer + * resource in the case of textures and images). + */ +static LLVMValueRef extract_rsrc_top_half( + struct si_shader_context *ctx, + LLVMValueRef rsrc) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; + LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2); + + rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, ""); + rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, ""); + rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""); + + return rsrc; +} + +/** * Append the resource and indexing arguments for buffer intrinsics. * - * \param rsrc the 256 bit resource - * \param index index into the buffer + * \param rsrc the v4i32 buffer resource + * \param index index into the buffer (stride-based) + * \param offset byte offset into the buffer */ static void buffer_append_args( struct si_shader_context *ctx, struct lp_build_emit_data *emit_data, LLVMValueRef rsrc, LLVMValueRef index, + LLVMValueRef offset, bool atomic) { - struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; - struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; const struct tgsi_full_instruction *inst = emit_data->inst; - LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2); LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); - rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, ""); - rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, ""); - rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""); - emit_data->args[emit_data->arg_count++] = rsrc; emit_data->args[emit_data->arg_count++] = index; /* vindex */ - emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */ + emit_data->args[emit_data->arg_count++] = offset; /* voffset */ if (!atomic) { emit_data->args[emit_data->arg_count++] = inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? @@ -2966,24 +2999,73 @@ static void load_fetch_args( struct gallivm_state *gallivm = bld_base->base.gallivm; const struct tgsi_full_instruction * inst = emit_data->inst; unsigned target = inst->Memory.Texture; - LLVMValueRef coords; LLVMValueRef rsrc; emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); - image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc); - coords = image_fetch_coords(bld_base, inst, 1); + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef offset; + LLVMValueRef tmp; - if (target == TGSI_TEXTURE_BUFFER) { - buffer_append_args(ctx, emit_data, rsrc, coords, false); + rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]); + + tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); + offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); + + buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, + offset, false); } else { - emit_data->args[0] = coords; - emit_data->args[1] = rsrc; - emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ - emit_data->arg_count = 3; + LLVMValueRef coords; - image_append_args(ctx, emit_data, target, false); + image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc); + coords = image_fetch_coords(bld_base, inst, 1); + + if (target == TGSI_TEXTURE_BUFFER) { + rsrc = extract_rsrc_top_half(ctx, rsrc); + buffer_append_args(ctx, emit_data, rsrc, coords, + bld_base->uint_bld.zero, false); + } else { + emit_data->args[0] = coords; + emit_data->args[1] = rsrc; + emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ + emit_data->arg_count = 3; + + image_append_args(ctx, emit_data, target, false); + } + } +} + +static void load_emit_buffer(struct si_shader_context *ctx, + struct lp_build_emit_data *emit_data) +{ + const struct tgsi_full_instruction *inst = emit_data->inst; + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMBuilderRef builder = gallivm->builder; + uint writemask = inst->Dst[0].Register.WriteMask; + uint count = util_last_bit(writemask); + const char *intrinsic_name; + LLVMTypeRef dst_type; + + switch (count) { + case 1: + intrinsic_name = "llvm.amdgcn.buffer.load.f32"; + dst_type = ctx->f32; + break; + case 2: + intrinsic_name = "llvm.amdgcn.buffer.load.v2f32"; + dst_type = LLVMVectorType(ctx->f32, 2); + break; + default: // 3 & 4 + intrinsic_name = "llvm.amdgcn.buffer.load.v4f32"; + dst_type = ctx->v4f32; + count = 4; } + + emit_data->output[emit_data->chan] = lp_build_intrinsic( + builder, intrinsic_name, dst_type, + emit_data->args, emit_data->arg_count, + LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); } static void load_emit( @@ -2995,18 +3077,23 @@ static void load_emit( struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_instruction * inst = emit_data->inst; - unsigned target = inst->Memory.Texture; char intrinsic_name[32]; char coords_type[8]; if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) emit_optimization_barrier(ctx); - if (target == TGSI_TEXTURE_BUFFER) { - emit_data->output[emit_data->chan] = lp_build_intrinsic( - builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type, - emit_data->args, emit_data->arg_count, - LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { + load_emit_buffer(ctx, emit_data); + return; + } + + if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { + emit_data->output[emit_data->chan] = + lp_build_intrinsic( + builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type, + emit_data->args, emit_data->arg_count, + LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); } else { build_int_type_name(LLVMTypeOf(emit_data->args[0]), coords_type, sizeof(coords_type)); @@ -3028,39 +3115,129 @@ static void store_fetch_args( { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_instruction * inst = emit_data->inst; - struct tgsi_full_src_register image; - unsigned target = inst->Memory.Texture; + struct tgsi_full_src_register memory; LLVMValueRef chans[4]; LLVMValueRef data; - LLVMValueRef coords; LLVMValueRef rsrc; unsigned chan; emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context); - image = tgsi_full_src_register_from_dst(&inst->Dst[0]); - coords = image_fetch_coords(bld_base, inst, 0); - for (chan = 0; chan < 4; ++chan) { chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan); } data = lp_build_gather_values(gallivm, chans, 4); - if (target == TGSI_TEXTURE_BUFFER) { - image_fetch_rsrc(bld_base, &image, false, &rsrc); - emit_data->args[0] = data; - emit_data->arg_count = 1; + emit_data->args[emit_data->arg_count++] = data; + + memory = tgsi_full_src_register_from_dst(&inst->Dst[0]); - buffer_append_args(ctx, emit_data, rsrc, coords, false); + if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { + LLVMValueRef offset; + LLVMValueRef tmp; + + rsrc = shader_buffer_fetch_rsrc(ctx, &memory); + + tmp = lp_build_emit_fetch(bld_base, inst, 0, 0); + offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); + + buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, + offset, false); } else { + unsigned target = inst->Memory.Texture; + LLVMValueRef coords; + + coords = image_fetch_coords(bld_base, inst, 0); + + if (target == TGSI_TEXTURE_BUFFER) { + image_fetch_rsrc(bld_base, &memory, false, &rsrc); + + rsrc = extract_rsrc_top_half(ctx, rsrc); + buffer_append_args(ctx, emit_data, rsrc, coords, + bld_base->uint_bld.zero, false); + } else { + emit_data->args[1] = coords; + image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]); + emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */ + emit_data->arg_count = 4; + + image_append_args(ctx, emit_data, target, false); + } + } +} + +static void store_emit_buffer( + struct si_shader_context *ctx, + struct lp_build_emit_data *emit_data) +{ + const struct tgsi_full_instruction *inst = emit_data->inst; + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMBuilderRef builder = gallivm->builder; + struct lp_build_context *uint_bld = &ctx->radeon_bld.soa.bld_base.uint_bld; + LLVMValueRef base_data = emit_data->args[0]; + LLVMValueRef base_offset = emit_data->args[3]; + unsigned writemask = inst->Dst[0].Register.WriteMask; + + while (writemask) { + int start, count; + const char *intrinsic_name; + LLVMValueRef data; + LLVMValueRef offset; + LLVMValueRef tmp; + + u_bit_scan_consecutive_range(&writemask, &start, &count); + + /* Due to an LLVM limitation, split 3-element writes + * into a 2-element and a 1-element write. */ + if (count == 3) { + writemask |= 1 << (start + 2); + count = 2; + } + + if (count == 4) { + data = base_data; + intrinsic_name = "llvm.amdgcn.buffer.store.v4f32"; + } else if (count == 2) { + LLVMTypeRef v2f32 = LLVMVectorType(ctx->f32, 2); + + tmp = LLVMBuildExtractElement( + builder, base_data, + lp_build_const_int32(gallivm, start), ""); + data = LLVMBuildInsertElement( + builder, LLVMGetUndef(v2f32), tmp, + uint_bld->zero, ""); + + tmp = LLVMBuildExtractElement( + builder, base_data, + lp_build_const_int32(gallivm, start + 1), ""); + data = LLVMBuildInsertElement( + builder, data, tmp, uint_bld->one, ""); + + intrinsic_name = "llvm.amdgcn.buffer.store.v2f32"; + } else { + assert(count == 1); + data = LLVMBuildExtractElement( + builder, base_data, + lp_build_const_int32(gallivm, start), ""); + intrinsic_name = "llvm.amdgcn.buffer.store.f32"; + } + + offset = base_offset; + if (start != 0) { + offset = LLVMBuildAdd( + builder, offset, + lp_build_const_int32(gallivm, start * 4), ""); + } + emit_data->args[0] = data; - emit_data->args[1] = coords; - image_fetch_rsrc(bld_base, &image, true, &emit_data->args[2]); - emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */ - emit_data->arg_count = 4; + emit_data->args[3] = offset; - image_append_args(ctx, emit_data, target, false); + lp_build_intrinsic( + builder, intrinsic_name, emit_data->dst_type, + emit_data->args, emit_data->arg_count, + LLVMNoUnwindAttribute); } } @@ -3076,6 +3253,11 @@ static void store_emit( char intrinsic_name[32]; char coords_type[8]; + if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { + store_emit_buffer(si_shader_context(bld_base), emit_data); + return; + } + if (target == TGSI_TEXTURE_BUFFER) { emit_data->output[emit_data->chan] = lp_build_intrinsic( builder, "llvm.amdgcn.buffer.store.format.v4f32", @@ -3103,18 +3285,12 @@ static void atomic_fetch_args( struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_instruction * inst = emit_data->inst; - unsigned target = inst->Memory.Texture; LLVMValueRef data1, data2; - LLVMValueRef coords; LLVMValueRef rsrc; LLVMValueRef tmp; emit_data->dst_type = bld_base->base.elem_type; - image_fetch_rsrc(bld_base, &inst->Src[0], target != TGSI_TEXTURE_BUFFER, - &rsrc); - coords = image_fetch_coords(bld_base, inst, 1); - tmp = lp_build_emit_fetch(bld_base, inst, 2, 0); data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); @@ -3130,13 +3306,34 @@ static void atomic_fetch_args( emit_data->args[emit_data->arg_count++] = data2; emit_data->args[emit_data->arg_count++] = data1; - if (target == TGSI_TEXTURE_BUFFER) { - buffer_append_args(ctx, emit_data, rsrc, coords, true); + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { + LLVMValueRef offset; + + rsrc = shader_buffer_fetch_rsrc(ctx, &inst->Src[0]); + + tmp = lp_build_emit_fetch(bld_base, inst, 1, 0); + offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); + + buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, + offset, true); } else { - emit_data->args[emit_data->arg_count++] = coords; - emit_data->args[emit_data->arg_count++] = rsrc; + unsigned target = inst->Memory.Texture; + LLVMValueRef coords; + + image_fetch_rsrc(bld_base, &inst->Src[0], + target != TGSI_TEXTURE_BUFFER, &rsrc); + coords = image_fetch_coords(bld_base, inst, 1); + + if (target == TGSI_TEXTURE_BUFFER) { + rsrc = extract_rsrc_top_half(ctx, rsrc); + buffer_append_args(ctx, emit_data, rsrc, coords, + bld_base->uint_bld.zero, true); + } else { + emit_data->args[emit_data->arg_count++] = coords; + emit_data->args[emit_data->arg_count++] = rsrc; - image_append_args(ctx, emit_data, target, true); + image_append_args(ctx, emit_data, target, true); + } } } @@ -3148,11 +3345,11 @@ static void atomic_emit( struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_instruction * inst = emit_data->inst; - unsigned target = inst->Memory.Texture; char intrinsic_name[40]; LLVMValueRef tmp; - if (target == TGSI_TEXTURE_BUFFER) { + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER || + inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { snprintf(intrinsic_name, sizeof(intrinsic_name), "llvm.amdgcn.buffer.atomic.%s", action->intr_name); } else { @@ -3177,14 +3374,17 @@ static void resq_fetch_args( struct lp_build_tgsi_context * bld_base, struct lp_build_emit_data * emit_data) { + struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; const struct tgsi_full_instruction *inst = emit_data->inst; const struct tgsi_full_src_register *reg = &inst->Src[0]; - unsigned tex_target = inst->Memory.Texture; emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); - if (tex_target == TGSI_TEXTURE_BUFFER) { + if (reg->Register.File == TGSI_FILE_BUFFER) { + emit_data->args[0] = shader_buffer_fetch_rsrc(ctx, reg); + emit_data->arg_count = 1; + } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]); emit_data->arg_count = 1; } else { @@ -3193,7 +3393,7 @@ static void resq_fetch_args( emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */ emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */ - emit_data->args[5] = tgsi_is_array_image(tex_target) ? + emit_data->args[5] = tgsi_is_array_image(inst->Memory.Texture) ? bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */ emit_data->args[6] = bld_base->uint_bld.zero; /* glc */ emit_data->args[7] = bld_base->uint_bld.zero; /* slc */ @@ -3211,10 +3411,12 @@ static void resq_emit( struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; const struct tgsi_full_instruction *inst = emit_data->inst; - unsigned target = inst->Memory.Texture; LLVMValueRef out; - if (target == TGSI_TEXTURE_BUFFER) { + if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { + out = LLVMBuildExtractElement(builder, emit_data->args[0], + lp_build_const_int32(gallivm, 2), ""); + } else if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) { out = get_buffer_size(bld_base, emit_data->args[0]); } else { out = lp_build_intrinsic( @@ -3223,7 +3425,7 @@ static void resq_emit( LLVMReadNoneAttribute | LLVMNoUnwindAttribute); /* Divide the number of layers by 6 to get the number of cubes. */ - if (target == TGSI_TEXTURE_CUBE_ARRAY) { + if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY) { LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2); LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6); @@ -3339,6 +3541,35 @@ static LLVMValueRef get_sampler_desc(struct si_shader_context *ctx, return get_sampler_desc_custom(ctx, list, index, type); } +/* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL. + * + * SI-CI: + * If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic + * filtering manually. The driver sets img7 to a mask clearing + * MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do: + * s_and_b32 samp0, samp0, img7 + * + * VI: + * The ANISO_OVERRIDE sampler field enables this fix in TA. + */ +static LLVMValueRef sici_fix_sampler_aniso(struct si_shader_context *ctx, + LLVMValueRef res, LLVMValueRef samp) +{ + LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder; + LLVMValueRef img7, samp0; + + if (ctx->screen->b.chip_class >= VI) + return samp; + + img7 = LLVMBuildExtractElement(builder, res, + LLVMConstInt(ctx->i32, 7, 0), ""); + samp0 = LLVMBuildExtractElement(builder, samp, + LLVMConstInt(ctx->i32, 0, 0), ""); + samp0 = LLVMBuildAnd(builder, samp0, img7, ""); + return LLVMBuildInsertElement(builder, samp, samp0, + LLVMConstInt(ctx->i32, 0, 0), ""); +} + static void tex_fetch_ptrs( struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data, @@ -3370,6 +3601,7 @@ static void tex_fetch_ptrs( *fmask_ptr = get_sampler_desc(ctx, ind_index, DESC_FMASK); } else { *samp_ptr = get_sampler_desc(ctx, ind_index, DESC_SAMPLER); + *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr); *fmask_ptr = NULL; } } else { @@ -4420,7 +4652,8 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS); params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS); params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES); - last_array_pointer = SI_PARAM_IMAGES; + params[SI_PARAM_SHADER_BUFFERS] = const_array(ctx->v4i32, SI_NUM_SHADER_BUFFERS); + last_array_pointer = SI_PARAM_SHADER_BUFFERS; switch (ctx->type) { case TGSI_PROCESSOR_VERTEX: @@ -4679,6 +4912,21 @@ static void preload_constants(struct si_shader_context *ctx) } } +static void preload_shader_buffers(struct si_shader_context *ctx) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMValueRef ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_SHADER_BUFFERS); + int buf, maxbuf; + + maxbuf = MIN2(ctx->shader->selector->info.file_max[TGSI_FILE_BUFFER], + SI_NUM_SHADER_BUFFERS - 1); + for (buf = 0; buf <= maxbuf; ++buf) { + ctx->shader_buffers[buf] = + build_indexed_load_const( + ctx, ptr, lp_build_const_int32(gallivm, buf)); + } +} + static void preload_samplers(struct si_shader_context *ctx) { struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; @@ -4701,9 +4949,13 @@ static void preload_samplers(struct si_shader_context *ctx) if (info->is_msaa_sampler[i]) ctx->fmasks[i] = get_sampler_desc(ctx, offset, DESC_FMASK); - else + else { ctx->sampler_states[i] = get_sampler_desc(ctx, offset, DESC_SAMPLER); + ctx->sampler_states[i] = + sici_fix_sampler_aniso(ctx, ctx->sampler_views[i], + ctx->sampler_states[i]); + } } } @@ -5540,6 +5792,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, create_meta_data(&ctx); create_function(&ctx); preload_constants(&ctx); + preload_shader_buffers(&ctx); preload_samplers(&ctx); preload_images(&ctx); preload_streamout_buffers(&ctx); @@ -6000,6 +6253,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen, params[SI_PARAM_CONST_BUFFERS] = ctx.i64; params[SI_PARAM_SAMPLERS] = ctx.i64; params[SI_PARAM_IMAGES] = ctx.i64; + params[SI_PARAM_SHADER_BUFFERS] = ctx.i64; params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; @@ -6250,6 +6504,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen, params[SI_PARAM_CONST_BUFFERS] = ctx.i64; params[SI_PARAM_SAMPLERS] = ctx.i64; params[SI_PARAM_IMAGES] = ctx.i64; + params[SI_PARAM_SHADER_BUFFERS] = ctx.i64; params[SI_PARAM_ALPHA_REF] = ctx.f32; last_array_pointer = -1; last_sgpr = SI_PARAM_ALPHA_REF; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 8059edf6395..013c8a2c114 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -81,95 +81,97 @@ struct radeon_shader_reloc; #define SI_SGPR_CONST_BUFFERS 2 #define SI_SGPR_SAMPLERS 4 /* images & sampler states interleaved */ #define SI_SGPR_IMAGES 6 -#define SI_SGPR_VERTEX_BUFFERS 8 /* VS only */ -#define SI_SGPR_BASE_VERTEX 10 /* VS only */ -#define SI_SGPR_START_INSTANCE 11 /* VS only */ -#define SI_SGPR_VS_STATE_BITS 12 /* VS(VS) only */ -#define SI_SGPR_LS_OUT_LAYOUT 12 /* VS(LS) only */ -#define SI_SGPR_TCS_OUT_OFFSETS 8 /* TCS & TES only */ -#define SI_SGPR_TCS_OUT_LAYOUT 9 /* TCS & TES only */ -#define SI_SGPR_TCS_IN_LAYOUT 10 /* TCS only */ -#define SI_SGPR_ALPHA_REF 8 /* PS only */ - -#define SI_VS_NUM_USER_SGPR 13 /* API VS */ -#define SI_ES_NUM_USER_SGPR 12 /* API VS */ -#define SI_LS_NUM_USER_SGPR 13 /* API VS */ -#define SI_TCS_NUM_USER_SGPR 11 -#define SI_TES_NUM_USER_SGPR 10 -#define SI_GS_NUM_USER_SGPR 8 +#define SI_SGPR_SHADER_BUFFERS 8 +#define SI_SGPR_VERTEX_BUFFERS 10 /* VS only */ +#define SI_SGPR_BASE_VERTEX 12 /* VS only */ +#define SI_SGPR_START_INSTANCE 13 /* VS only */ +#define SI_SGPR_VS_STATE_BITS 14 /* VS(VS) only */ +#define SI_SGPR_LS_OUT_LAYOUT 14 /* VS(LS) only */ +#define SI_SGPR_TCS_OUT_OFFSETS 10 /* TCS & TES only */ +#define SI_SGPR_TCS_OUT_LAYOUT 11 /* TCS & TES only */ +#define SI_SGPR_TCS_IN_LAYOUT 12 /* TCS only */ +#define SI_SGPR_ALPHA_REF 10 /* PS only */ + +#define SI_VS_NUM_USER_SGPR 15 /* API VS */ +#define SI_ES_NUM_USER_SGPR 14 /* API VS */ +#define SI_LS_NUM_USER_SGPR 15 /* API VS */ +#define SI_TCS_NUM_USER_SGPR 13 +#define SI_TES_NUM_USER_SGPR 12 +#define SI_GS_NUM_USER_SGPR 10 #define SI_GSCOPY_NUM_USER_SGPR 4 -#define SI_PS_NUM_USER_SGPR 9 +#define SI_PS_NUM_USER_SGPR 11 /* LLVM function parameter indices */ #define SI_PARAM_RW_BUFFERS 0 #define SI_PARAM_CONST_BUFFERS 1 #define SI_PARAM_SAMPLERS 2 #define SI_PARAM_IMAGES 3 +#define SI_PARAM_SHADER_BUFFERS 4 /* VS only parameters */ -#define SI_PARAM_VERTEX_BUFFERS 4 -#define SI_PARAM_BASE_VERTEX 5 -#define SI_PARAM_START_INSTANCE 6 +#define SI_PARAM_VERTEX_BUFFERS 5 +#define SI_PARAM_BASE_VERTEX 6 +#define SI_PARAM_START_INSTANCE 7 /* [0] = clamp vertex color */ -#define SI_PARAM_VS_STATE_BITS 7 +#define SI_PARAM_VS_STATE_BITS 8 /* the other VS parameters are assigned dynamically */ /* Offsets where TCS outputs and TCS patch outputs live in LDS: * [0:15] = TCS output patch0 offset / 16, max = NUM_PATCHES * 32 * 32 * [16:31] = TCS output patch0 offset for per-patch / 16, max = NUM_PATCHES*32*32* + 32*32 */ -#define SI_PARAM_TCS_OUT_OFFSETS 4 /* for TCS & TES */ +#define SI_PARAM_TCS_OUT_OFFSETS 5 /* for TCS & TES */ /* Layout of TCS outputs / TES inputs: * [0:12] = stride between output patches in dwords, num_outputs * num_vertices * 4, max = 32*32*4 * [13:20] = stride between output vertices in dwords = num_inputs * 4, max = 32*4 * [26:31] = gl_PatchVerticesIn, max = 32 */ -#define SI_PARAM_TCS_OUT_LAYOUT 5 /* for TCS & TES */ +#define SI_PARAM_TCS_OUT_LAYOUT 6 /* for TCS & TES */ /* Layout of LS outputs / TCS inputs * [0:12] = stride between patches in dwords = num_inputs * num_vertices * 4, max = 32*32*4 * [13:20] = stride between vertices in dwords = num_inputs * 4, max = 32*4 */ -#define SI_PARAM_TCS_IN_LAYOUT 6 /* TCS only */ -#define SI_PARAM_LS_OUT_LAYOUT 7 /* same value as TCS_IN_LAYOUT, LS only */ +#define SI_PARAM_TCS_IN_LAYOUT 7 /* TCS only */ +#define SI_PARAM_LS_OUT_LAYOUT 8 /* same value as TCS_IN_LAYOUT, LS only */ /* TCS only parameters. */ -#define SI_PARAM_TESS_FACTOR_OFFSET 7 -#define SI_PARAM_PATCH_ID 8 -#define SI_PARAM_REL_IDS 9 +#define SI_PARAM_TESS_FACTOR_OFFSET 8 +#define SI_PARAM_PATCH_ID 9 +#define SI_PARAM_REL_IDS 10 /* GS only parameters */ -#define SI_PARAM_GS2VS_OFFSET 4 -#define SI_PARAM_GS_WAVE_ID 5 -#define SI_PARAM_VTX0_OFFSET 6 -#define SI_PARAM_VTX1_OFFSET 7 -#define SI_PARAM_PRIMITIVE_ID 8 -#define SI_PARAM_VTX2_OFFSET 9 -#define SI_PARAM_VTX3_OFFSET 10 -#define SI_PARAM_VTX4_OFFSET 11 -#define SI_PARAM_VTX5_OFFSET 12 -#define SI_PARAM_GS_INSTANCE_ID 13 +#define SI_PARAM_GS2VS_OFFSET 5 +#define SI_PARAM_GS_WAVE_ID 6 +#define SI_PARAM_VTX0_OFFSET 7 +#define SI_PARAM_VTX1_OFFSET 8 +#define SI_PARAM_PRIMITIVE_ID 9 +#define SI_PARAM_VTX2_OFFSET 10 +#define SI_PARAM_VTX3_OFFSET 11 +#define SI_PARAM_VTX4_OFFSET 12 +#define SI_PARAM_VTX5_OFFSET 13 +#define SI_PARAM_GS_INSTANCE_ID 14 /* PS only parameters */ -#define SI_PARAM_ALPHA_REF 4 -#define SI_PARAM_PRIM_MASK 5 -#define SI_PARAM_PERSP_SAMPLE 6 -#define SI_PARAM_PERSP_CENTER 7 -#define SI_PARAM_PERSP_CENTROID 8 -#define SI_PARAM_PERSP_PULL_MODEL 9 -#define SI_PARAM_LINEAR_SAMPLE 10 -#define SI_PARAM_LINEAR_CENTER 11 -#define SI_PARAM_LINEAR_CENTROID 12 -#define SI_PARAM_LINE_STIPPLE_TEX 13 -#define SI_PARAM_POS_X_FLOAT 14 -#define SI_PARAM_POS_Y_FLOAT 15 -#define SI_PARAM_POS_Z_FLOAT 16 -#define SI_PARAM_POS_W_FLOAT 17 -#define SI_PARAM_FRONT_FACE 18 -#define SI_PARAM_ANCILLARY 19 -#define SI_PARAM_SAMPLE_COVERAGE 20 -#define SI_PARAM_POS_FIXED_PT 21 +#define SI_PARAM_ALPHA_REF 5 +#define SI_PARAM_PRIM_MASK 6 +#define SI_PARAM_PERSP_SAMPLE 7 +#define SI_PARAM_PERSP_CENTER 8 +#define SI_PARAM_PERSP_CENTROID 9 +#define SI_PARAM_PERSP_PULL_MODEL 10 +#define SI_PARAM_LINEAR_SAMPLE 11 +#define SI_PARAM_LINEAR_CENTER 12 +#define SI_PARAM_LINEAR_CENTROID 13 +#define SI_PARAM_LINE_STIPPLE_TEX 14 +#define SI_PARAM_POS_X_FLOAT 15 +#define SI_PARAM_POS_Y_FLOAT 16 +#define SI_PARAM_POS_Z_FLOAT 17 +#define SI_PARAM_POS_W_FLOAT 18 +#define SI_PARAM_FRONT_FACE 19 +#define SI_PARAM_ANCILLARY 20 +#define SI_PARAM_SAMPLE_COVERAGE 21 +#define SI_PARAM_POS_FIXED_PT 22 #define SI_NUM_PARAMS (SI_PARAM_POS_FIXED_PT + 9) /* +8 for COLOR[0..1] */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 8087d2331ff..82ae4c43245 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -752,7 +752,7 @@ static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom) } /* - * Clipping, scissors and viewport + * Clipping */ static void si_set_clip_state(struct pipe_context *ctx, @@ -819,179 +819,6 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) S_028AB4_REUSE_OFF(info->writes_viewport_index)); } -static void si_set_scissor_states(struct pipe_context *ctx, - unsigned start_slot, - unsigned num_scissors, - const struct pipe_scissor_state *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - int i; - - for (i = 0; i < num_scissors; i++) - sctx->scissors.states[start_slot + i] = state[i]; - - if (!sctx->queued.named.rasterizer || - !sctx->queued.named.rasterizer->scissor_enable) - return; - - sctx->scissors.dirty_mask |= ((1 << num_scissors) - 1) << start_slot; - si_mark_atom_dirty(sctx, &sctx->scissors.atom); -} - -static void si_get_scissor_from_viewport(struct pipe_viewport_state *vp, - struct pipe_scissor_state *scissor) -{ - /* These must be signed, unlike pipe_scissor_state. */ - int minx, miny, maxx, maxy, tmp; - - /* Convert (-1, -1) and (1, 1) from clip space into window space. */ - minx = -vp->scale[0] + vp->translate[0]; - miny = -vp->scale[1] + vp->translate[1]; - maxx = vp->scale[0] + vp->translate[0]; - maxy = vp->scale[1] + vp->translate[1]; - - /* r600_draw_rectangle sets this. Disable the scissor. */ - if (minx == -1 && miny == -1 && maxx == 1 && maxy == 1) { - minx = miny = 0; - maxx = maxy = 16384; - } - - /* Handle inverted viewports. */ - if (minx > maxx) { - tmp = minx; - minx = maxx; - maxx = tmp; - } - if (miny > maxy) { - tmp = miny; - miny = maxy; - maxy = tmp; - } - - scissor->minx = CLAMP(minx, 0, 16384); - scissor->miny = CLAMP(miny, 0, 16384); - scissor->maxx = CLAMP(maxx, 0, 16384); - scissor->maxy = CLAMP(maxy, 0, 16384); -} - -static void si_clip_scissor(struct pipe_scissor_state *out, - struct pipe_scissor_state *clip) -{ - out->minx = MAX2(out->minx, clip->minx); - out->miny = MAX2(out->miny, clip->miny); - out->maxx = MIN2(out->maxx, clip->maxx); - out->maxy = MIN2(out->maxy, clip->maxy); -} - -static void si_emit_one_scissor(struct radeon_winsys_cs *cs, - struct pipe_viewport_state *vp, - struct pipe_scissor_state *scissor) -{ - struct pipe_scissor_state final; - - /* Since the guard band disables clipping, we have to clip per-pixel - * using a scissor. - */ - si_get_scissor_from_viewport(vp, &final); - - if (scissor) - si_clip_scissor(&final, scissor); - - radeon_emit(cs, S_028250_TL_X(final.minx) | - S_028250_TL_Y(final.miny) | - S_028250_WINDOW_OFFSET_DISABLE(1)); - radeon_emit(cs, S_028254_BR_X(final.maxx) | - S_028254_BR_Y(final.maxy)); -} - -static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom) -{ - struct radeon_winsys_cs *cs = sctx->b.gfx.cs; - struct pipe_scissor_state *states = sctx->scissors.states; - unsigned mask = sctx->scissors.dirty_mask; - bool scissor_enable = sctx->queued.named.rasterizer->scissor_enable; - - /* The simple case: Only 1 viewport is active. */ - if (mask & 1 && - !si_get_vs_info(sctx)->writes_viewport_index) { - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, 2); - si_emit_one_scissor(cs, &sctx->viewports.states[0], - scissor_enable ? &states[0] : NULL); - sctx->scissors.dirty_mask &= ~1; /* clear one bit */ - return; - } - - while (mask) { - int start, count, i; - - u_bit_scan_consecutive_range(&mask, &start, &count); - - radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL + - start * 4 * 2, count * 2); - for (i = start; i < start+count; i++) { - si_emit_one_scissor(cs, &sctx->viewports.states[i], - scissor_enable ? &states[i] : NULL); - } - } - sctx->scissors.dirty_mask = 0; -} - -static void si_set_viewport_states(struct pipe_context *ctx, - unsigned start_slot, - unsigned num_viewports, - const struct pipe_viewport_state *state) -{ - struct si_context *sctx = (struct si_context *)ctx; - int i; - - for (i = 0; i < num_viewports; i++) - sctx->viewports.states[start_slot + i] = state[i]; - - sctx->viewports.dirty_mask |= ((1 << num_viewports) - 1) << start_slot; - sctx->scissors.dirty_mask |= ((1 << num_viewports) - 1) << start_slot; - si_mark_atom_dirty(sctx, &sctx->viewports.atom); - si_mark_atom_dirty(sctx, &sctx->scissors.atom); -} - -static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom) -{ - struct radeon_winsys_cs *cs = sctx->b.gfx.cs; - struct pipe_viewport_state *states = sctx->viewports.states; - unsigned mask = sctx->viewports.dirty_mask; - - /* The simple case: Only 1 viewport is active. */ - if (mask & 1 && - !si_get_vs_info(sctx)->writes_viewport_index) { - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE, 6); - radeon_emit(cs, fui(states[0].scale[0])); - radeon_emit(cs, fui(states[0].translate[0])); - radeon_emit(cs, fui(states[0].scale[1])); - radeon_emit(cs, fui(states[0].translate[1])); - radeon_emit(cs, fui(states[0].scale[2])); - radeon_emit(cs, fui(states[0].translate[2])); - sctx->viewports.dirty_mask &= ~1; /* clear one bit */ - return; - } - - while (mask) { - int start, count, i; - - u_bit_scan_consecutive_range(&mask, &start, &count); - - radeon_set_context_reg_seq(cs, R_02843C_PA_CL_VPORT_XSCALE + - start * 4 * 6, count * 6); - for (i = start; i < start+count; i++) { - radeon_emit(cs, fui(states[i].scale[0])); - radeon_emit(cs, fui(states[i].translate[0])); - radeon_emit(cs, fui(states[i].scale[1])); - radeon_emit(cs, fui(states[i].translate[1])); - radeon_emit(cs, fui(states[i].scale[2])); - radeon_emit(cs, fui(states[i].translate[2])); - } - } - sctx->viewports.dirty_mask = 0; -} - /* * inferred state between framebuffer and rasterizer */ @@ -1173,10 +1000,7 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state) (!old_rs || old_rs->multisample_enable != rs->multisample_enable)) si_mark_atom_dirty(sctx, &sctx->db_render_state); - if (!old_rs || old_rs->scissor_enable != rs->scissor_enable) { - sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; - si_mark_atom_dirty(sctx, &sctx->scissors.atom); - } + r600_set_scissor_enable(&sctx->b, rs->scissor_enable); si_pm4_bind_state(sctx, rasterizer, rs); si_update_poly_offset_state(sctx); @@ -1348,6 +1172,26 @@ static void *si_create_db_flush_dsa(struct si_context *sctx) /* DB RENDER STATE */ +static void si_set_active_query_state(struct pipe_context *ctx, boolean enable) +{ + struct si_context *sctx = (struct si_context*)ctx; + + /* Pipeline stat & streamout queries. */ + if (enable) { + sctx->b.flags &= ~R600_CONTEXT_STOP_PIPELINE_STATS; + sctx->b.flags |= R600_CONTEXT_START_PIPELINE_STATS; + } else { + sctx->b.flags &= ~R600_CONTEXT_START_PIPELINE_STATS; + sctx->b.flags |= R600_CONTEXT_STOP_PIPELINE_STATS; + } + + /* Occlusion queries. */ + if (sctx->occlusion_queries_disabled != !enable) { + sctx->occlusion_queries_disabled = !enable; + si_mark_atom_dirty(sctx, &sctx->db_render_state); + } +} + static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) { struct si_context *sctx = (struct si_context*)ctx; @@ -1382,7 +1226,8 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s } /* DB_COUNT_CONTROL (occlusion queries) */ - if (sctx->b.num_occlusion_queries > 0) { + if (sctx->b.num_occlusion_queries > 0 && + !sctx->occlusion_queries_disabled) { bool perfect = sctx->b.num_perfect_occlusion_queries > 0; if (sctx->b.chip_class >= CIK) { @@ -1838,17 +1683,6 @@ static unsigned si_tex_wrap(unsigned wrap) } } -static unsigned si_tex_filter(unsigned filter) -{ - switch (filter) { - default: - case PIPE_TEX_FILTER_NEAREST: - return V_008F38_SQ_TEX_XY_FILTER_POINT; - case PIPE_TEX_FILTER_LINEAR: - return V_008F38_SQ_TEX_XY_FILTER_BILINEAR; - } -} - static unsigned si_tex_mipfilter(unsigned filter) { switch (filter) { @@ -3122,6 +2956,16 @@ si_make_texture_descriptor(struct si_screen *screen, } else { state[6] = 0; state[7] = 0; + + /* The last dword is unused by hw. The shader uses it to clear + * bits in the first dword of sampler state. + */ + if (screen->b.chip_class <= CIK && res->nr_samples <= 1) { + if (first_level == last_level) + state[7] = C_008F30_MAX_ANISO_RATIO; + else + state[7] = 0xffffffff; + } } /* Initialize the sampler view for FMASK. */ @@ -3318,9 +3162,12 @@ static void *si_create_sampler_state(struct pipe_context *ctx, const struct pipe_sampler_state *state) { struct si_context *sctx = (struct si_context *)ctx; + struct r600_common_screen *rscreen = sctx->b.screen; struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state); - unsigned aniso_flag_offset = state->max_anisotropy > 1 ? 2 : 0; unsigned border_color_type, border_color_index = 0; + unsigned max_aniso = rscreen->force_aniso >= 0 ? rscreen->force_aniso + : state->max_anisotropy; + unsigned max_aniso_ratio = r600_tex_aniso_filter(max_aniso); if (!rstate) { return NULL; @@ -3378,16 +3225,21 @@ static void *si_create_sampler_state(struct pipe_context *ctx, rstate->val[0] = (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) | S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | - r600_tex_aniso_filter(state->max_anisotropy) << 9 | + S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) | S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) | S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) | - S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map)); + S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) | + S_008F30_COMPAT_MODE(sctx->b.chip_class >= VI)); rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) | S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8))); rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) | - S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter) | aniso_flag_offset) | - S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter) | aniso_flag_offset) | - S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter))); + S_008F38_XY_MAG_FILTER(eg_tex_filter(state->mag_img_filter, max_aniso)) | + S_008F38_XY_MIN_FILTER(eg_tex_filter(state->min_img_filter, max_aniso)) | + S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) | + S_008F38_MIP_POINT_PRECLAMP(1) | + S_008F38_DISABLE_LSB_CEIL(1) | + S_008F38_FILTER_PREC_FIX(1) | + S_008F38_ANISO_OVERRIDE(sctx->b.chip_class >= VI)); rstate->val[3] = S_008F3C_BORDER_COLOR_PTR(border_color_index) | S_008F3C_BORDER_COLOR_TYPE(border_color_type); return rstate; @@ -3430,7 +3282,7 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, struct si_vertex_element *v = CALLOC_STRUCT(si_vertex_element); int i; - assert(count < SI_MAX_ATTRIBS); + assert(count <= SI_MAX_ATTRIBS); if (!v) return NULL; @@ -3678,6 +3530,8 @@ void si_init_state_functions(struct si_context *sctx) si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond); si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin); si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable); + si_init_external_atom(sctx, &sctx->b.scissors.atom, &sctx->atoms.s.scissors); + si_init_external_atom(sctx, &sctx->b.viewports.atom, &sctx->atoms.s.viewports); si_init_atom(sctx, &sctx->cache_flush, &sctx->atoms.s.cache_flush, si_emit_cache_flush); si_init_atom(sctx, &sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state); @@ -3689,8 +3543,6 @@ void si_init_state_functions(struct si_context *sctx) si_init_atom(sctx, &sctx->blend_color.atom, &sctx->atoms.s.blend_color, si_emit_blend_color); si_init_atom(sctx, &sctx->clip_regs, &sctx->atoms.s.clip_regs, si_emit_clip_regs); si_init_atom(sctx, &sctx->clip_state.atom, &sctx->atoms.s.clip_state, si_emit_clip_state); - si_init_atom(sctx, &sctx->scissors.atom, &sctx->atoms.s.scissors, si_emit_scissors); - si_init_atom(sctx, &sctx->viewports.atom, &sctx->atoms.s.viewports, si_emit_viewports); si_init_atom(sctx, &sctx->stencil_ref.atom, &sctx->atoms.s.stencil_ref, si_emit_stencil_ref); sctx->b.b.create_blend_state = si_create_blend_state; @@ -3713,8 +3565,6 @@ void si_init_state_functions(struct si_context *sctx) sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); sctx->b.b.set_clip_state = si_set_clip_state; - sctx->b.b.set_scissor_states = si_set_scissor_states; - sctx->b.b.set_viewport_states = si_set_viewport_states; sctx->b.b.set_stencil_ref = si_set_stencil_ref; sctx->b.b.set_framebuffer_state = si_set_framebuffer_state; @@ -3740,6 +3590,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.b.set_min_samples = si_set_min_samples; sctx->b.b.set_tess_state = si_set_tess_state; + sctx->b.b.set_active_query_state = si_set_active_query_state; sctx->b.set_occlusion_query_state = si_set_occlusion_query_state; sctx->b.need_gfx_cs_space = si_need_gfx_cs_space; @@ -4098,10 +3949,6 @@ static void si_init_config(struct si_context *sctx) /* PA_SU_HARDWARE_SCREEN_OFFSET must be 0 due to hw bug on SI */ si_pm4_set_reg(pm4, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET, 0); si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0); - si_pm4_set_reg(pm4, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, fui(1.0)); - si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0)); - si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0)); - si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index f55f19e2918..6748f802c7d 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -161,6 +161,8 @@ struct si_shader_data { #define SI_NUM_IMAGES 16 +#define SI_NUM_SHADER_BUFFERS 16 + /* Read-write buffer slots. * * Ring buffers: 0..1 diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 84b850a2992..40cad504e09 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -722,6 +722,16 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom) } } + if (sctx->flags & R600_CONTEXT_START_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) | + EVENT_INDEX(0)); + } else if (sctx->flags & R600_CONTEXT_STOP_PIPELINE_STATS) { + radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_STOP) | + EVENT_INDEX(0)); + } + sctx->flags = 0; } @@ -882,8 +892,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA || sctx->b.family == CHIP_FIJI) && - (sctx->b.streamout.streamout_enabled || - sctx->b.streamout.prims_gen_query_enabled)) { + r600_get_strmout_en(&sctx->b)) { sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; } diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 02489583423..b7ebb48e6a9 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1270,25 +1270,6 @@ error: return NULL; } -/** - * Normally, we only emit 1 viewport and 1 scissor if no shader is using - * the VIEWPORT_INDEX output, and emitting the other viewports and scissors - * is delayed. When a shader with VIEWPORT_INDEX appears, this should be - * called to emit the rest. - */ -static void si_update_viewports_and_scissors(struct si_context *sctx) -{ - struct tgsi_shader_info *info = si_get_vs_info(sctx); - - if (!info || !info->writes_viewport_index) - return; - - if (sctx->scissors.dirty_mask) - si_mark_atom_dirty(sctx, &sctx->scissors.atom); - if (sctx->viewports.dirty_mask) - si_mark_atom_dirty(sctx, &sctx->viewports.atom); -} - static void si_bind_vs_shader(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; @@ -1300,7 +1281,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state) sctx->vs_shader.cso = sel; sctx->vs_shader.current = sel ? sel->first_variant : NULL; si_mark_atom_dirty(sctx, &sctx->clip_regs); - si_update_viewports_and_scissors(sctx); + r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); } static void si_bind_gs_shader(struct pipe_context *ctx, void *state) @@ -1319,7 +1300,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state) if (enable_changed) si_shader_change_notify(sctx); - si_update_viewports_and_scissors(sctx); + r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); } static void si_bind_tcs_shader(struct pipe_context *ctx, void *state) @@ -1356,7 +1337,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state) si_shader_change_notify(sctx); sctx->last_tes_sh_base = -1; /* invalidate derived tess state */ } - si_update_viewports_and_scissors(sctx); + r600_update_vs_writes_viewport_index(&sctx->b, si_get_vs_info(sctx)); } static void si_bind_ps_shader(struct pipe_context *ctx, void *state) diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 892084707d2..f0aa605c2d9 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -2307,6 +2307,9 @@ #define V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER 0x05 #define V_008F30_SQ_TEX_CLAMP_BORDER 0x06 #define V_008F30_SQ_TEX_MIRROR_ONCE_BORDER 0x07 +#define S_008F30_MAX_ANISO_RATIO(x) (((x) & 0x07) << 9) +#define G_008F30_MAX_ANISO_RATIO(x) (((x) >> 9) & 0x07) +#define C_008F30_MAX_ANISO_RATIO 0xFFFFF1FF #define S_008F30_DEPTH_COMPARE_FUNC(x) (((x) & 0x07) << 12) #define G_008F30_DEPTH_COMPARE_FUNC(x) (((x) >> 12) & 0x07) #define C_008F30_DEPTH_COMPARE_FUNC 0xFFFF8FFF @@ -2371,6 +2374,8 @@ #define C_008F38_XY_MIN_FILTER 0xFF3FFFFF #define V_008F38_SQ_TEX_XY_FILTER_POINT 0x00 #define V_008F38_SQ_TEX_XY_FILTER_BILINEAR 0x01 +#define V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT 0x02 +#define V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR 0x03 #define S_008F38_Z_FILTER(x) (((x) & 0x03) << 24) #define G_008F38_Z_FILTER(x) (((x) >> 24) & 0x03) #define C_008F38_Z_FILTER 0xFCFFFFFF @@ -2392,6 +2397,9 @@ #define S_008F38_FILTER_PREC_FIX(x) (((x) & 0x1) << 30) #define G_008F38_FILTER_PREC_FIX(x) (((x) >> 30) & 0x1) #define C_008F38_FILTER_PREC_FIX 0xBFFFFFFF +#define S_008F38_ANISO_OVERRIDE(x) (((x) & 0x1) << 31) +#define G_008F38_ANISO_OVERRIDE(x) (((x) >> 31) & 0x1) +#define C_008F38_ANISO_OVERRIDE 0x7FFFFFFF #define R_008F3C_SQ_IMG_SAMP_WORD3 0x008F3C #define S_008F3C_BORDER_COLOR_PTR(x) (((x) & 0xFFF) << 0) #define G_008F3C_BORDER_COLOR_PTR(x) (((x) >> 0) & 0xFFF) diff --git a/src/gallium/drivers/rbug/rbug_context.c b/src/gallium/drivers/rbug/rbug_context.c index 9ecddad05ec..1280c45b539 100644 --- a/src/gallium/drivers/rbug/rbug_context.c +++ b/src/gallium/drivers/rbug/rbug_context.c @@ -211,6 +211,17 @@ rbug_get_query_result(struct pipe_context *_pipe, return ret; } +static void +rbug_set_active_query_state(struct pipe_context *_pipe, boolean enable) +{ + struct rbug_context *rb_pipe = rbug_context(_pipe); + struct pipe_context *pipe = rb_pipe->pipe; + + pipe_mutex_lock(rb_pipe->call_mutex); + pipe->set_active_query_state(pipe, enable); + pipe_mutex_unlock(rb_pipe->call_mutex); +} + static void * rbug_create_blend_state(struct pipe_context *_pipe, const struct pipe_blend_state *blend) @@ -1184,6 +1195,7 @@ rbug_context_create(struct pipe_screen *_screen, struct pipe_context *pipe) rb_pipe->base.begin_query = rbug_begin_query; rb_pipe->base.end_query = rbug_end_query; rb_pipe->base.get_query_result = rbug_get_query_result; + rb_pipe->base.set_active_query_state = rbug_set_active_query_state; rb_pipe->base.create_blend_state = rbug_create_blend_state; rb_pipe->base.bind_blend_state = rbug_bind_blend_state; rb_pipe->base.delete_blend_state = rbug_delete_blend_state; diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources index efe88468e3f..1d42351f975 100644 --- a/src/gallium/drivers/softpipe/Makefile.sources +++ b/src/gallium/drivers/softpipe/Makefile.sources @@ -1,4 +1,5 @@ C_SOURCES := \ + sp_buffer.c \ sp_clear.c \ sp_clear.h \ sp_context.c \ @@ -11,6 +12,7 @@ C_SOURCES := \ sp_fs_exec.c \ sp_fs.h \ sp_image.c \ + sp_image.h \ sp_limits.h \ sp_prim_vbuf.c \ sp_prim_vbuf.h \ diff --git a/src/gallium/drivers/softpipe/sp_buffer.c b/src/gallium/drivers/softpipe/sp_buffer.c new file mode 100644 index 00000000000..69a6bd18c3b --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_buffer.c @@ -0,0 +1,371 @@ +/* + * Copyright 2016 Red Hat. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "sp_context.h" +#include "sp_buffer.h" +#include "sp_texture.h" + +#include "util/u_format.h" + +static bool +get_dimensions(const struct pipe_shader_buffer *bview, + const struct softpipe_resource *spr, + unsigned *width) +{ + *width = bview->buffer_size; + /* + * Bounds check the buffer size from the view + * and the buffer size from the underlying buffer. + */ + if (*width > spr->base.width0) + return false; + return true; +} + +/* + * Implement the image LOAD operation. + */ +static void +sp_tgsi_load(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + const int s[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer; + struct pipe_shader_buffer *bview; + struct softpipe_resource *spr; + unsigned width; + int c, j; + unsigned char *data_ptr; + const struct util_format_description *format_desc = util_format_description(PIPE_FORMAT_R32_UINT); + + if (params->unit >= PIPE_MAX_SHADER_BUFFERS) + goto fail_write_all_zero; + + bview = &sp_buf->sp_bview[params->unit]; + spr = softpipe_resource(bview->buffer); + if (!spr) + goto fail_write_all_zero; + + if (!get_dimensions(bview, spr, &width)) + return; + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + int s_coord; + bool fill_zero = false; + uint32_t sdata[4]; + + if (!(params->execmask & (1 << j))) + fill_zero = true; + + s_coord = s[j]; + if (s_coord >= width) + fill_zero = true; + + if (fill_zero) { + for (c = 0; c < 4; c++) + rgba[c][j] = 0; + continue; + } + data_ptr = (unsigned char *)spr->data + bview->buffer_offset + s_coord; + for (c = 0; c < 4; c++) { + format_desc->fetch_rgba_uint(sdata, data_ptr, 0, 0); + ((uint32_t *)rgba[c])[j] = sdata[0]; + data_ptr += 4; + } + } + return; +fail_write_all_zero: + memset(rgba, 0, TGSI_NUM_CHANNELS * TGSI_QUAD_SIZE * 4); + return; +} + +/* + * Implement the buffer STORE operation. + */ +static void +sp_tgsi_store(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + const int s[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer; + struct pipe_shader_buffer *bview; + struct softpipe_resource *spr; + unsigned width; + unsigned char *data_ptr; + int j, c; + const struct util_format_description *format_desc = util_format_description(PIPE_FORMAT_R32_UINT); + + if (params->unit >= PIPE_MAX_SHADER_BUFFERS) + return; + + bview = &sp_buf->sp_bview[params->unit]; + spr = softpipe_resource(bview->buffer); + if (!spr) + return; + + if (!get_dimensions(bview, spr, &width)) + return; + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + int s_coord; + + if (!(params->execmask & (1 << j))) + continue; + + s_coord = s[j]; + if (s_coord >= width) + continue; + + data_ptr = (unsigned char *)spr->data + bview->buffer_offset + s_coord; + + for (c = 0; c < 4; c++) { + if (params->writemask & (1 << c)) { + unsigned temp[4]; + unsigned char *dptr = data_ptr + (c * 4); + temp[0] = ((uint32_t *)rgba[c])[j]; + format_desc->pack_rgba_uint(dptr, 0, temp, 0, 1, 1); + } + } + } +} + +/* + * Implement atomic operations on unsigned integers. + */ +static void +handle_op_uint(const struct pipe_shader_buffer *bview, + bool just_read, + unsigned char *data_ptr, + uint qi, + unsigned opcode, + unsigned writemask, + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + uint c; + const struct util_format_description *format_desc = util_format_description(PIPE_FORMAT_R32_UINT); + unsigned sdata[4]; + + for (c = 0; c < 4; c++) { + unsigned temp[4]; + unsigned char *dptr = data_ptr + (c * 4); + format_desc->fetch_rgba_uint(temp, dptr, 0, 0); + sdata[c] = temp[0]; + } + + if (just_read) { + for (c = 0; c < 4; c++) { + ((uint32_t *)rgba[c])[qi] = sdata[c]; + } + return; + } + + switch (opcode) { + case TGSI_OPCODE_ATOMUADD: + for (c = 0; c < 4; c++) { + unsigned temp = sdata[c]; + sdata[c] += ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMXCHG: + for (c = 0; c < 4; c++) { + unsigned temp = sdata[c]; + sdata[c] = ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMCAS: + for (c = 0; c < 4; c++) { + unsigned dst_x = sdata[c]; + unsigned cmp_x = ((uint32_t *)rgba[c])[qi]; + unsigned src_x = ((uint32_t *)rgba2[c])[qi]; + unsigned temp = sdata[c]; + sdata[c] = (dst_x == cmp_x) ? src_x : dst_x; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMAND: + for (c = 0; c < 4; c++) { + unsigned temp = sdata[c]; + sdata[c] &= ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMOR: + for (c = 0; c < 4; c++) { + unsigned temp = sdata[c]; + sdata[c] |= ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMXOR: + for (c = 0; c < 4; c++) { + unsigned temp = sdata[c]; + sdata[c] ^= ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMUMIN: + for (c = 0; c < 4; c++) { + unsigned dst_x = sdata[c]; + unsigned src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MIN2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMUMAX: + for (c = 0; c < 4; c++) { + unsigned dst_x = sdata[c]; + unsigned src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MAX2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMIMIN: + for (c = 0; c < 4; c++) { + int dst_x = sdata[c]; + int src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MIN2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMIMAX: + for (c = 0; c < 4; c++) { + int dst_x = sdata[c]; + int src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MAX2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + default: + assert(!"Unexpected TGSI opcode in sp_tgsi_op"); + break; + } + + for (c = 0; c < 4; c++) { + if (writemask & (1 << c)) { + unsigned temp[4]; + unsigned char *dptr = data_ptr + (c * 4); + temp[0] = sdata[c]; + format_desc->pack_rgba_uint(dptr, 0, temp, 0, 1, 1); + } + } +} + +/* + * Implement atomic buffer operations. + */ +static void +sp_tgsi_op(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + unsigned opcode, + const int s[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer; + struct pipe_shader_buffer *bview; + struct softpipe_resource *spr; + unsigned width; + int j, c; + unsigned char *data_ptr; + + if (params->unit >= PIPE_MAX_SHADER_BUFFERS) + return; + + bview = &sp_buf->sp_bview[params->unit]; + spr = softpipe_resource(bview->buffer); + if (!spr) + goto fail_write_all_zero; + + if (!get_dimensions(bview, spr, &width)) + goto fail_write_all_zero; + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + int s_coord; + bool just_read = false; + + s_coord = s[j]; + if (s_coord >= width) { + for (c = 0; c < 4; c++) { + rgba[c][j] = 0; + } + continue; + } + + /* just readback the value for atomic if execmask isn't set */ + if (!(params->execmask & (1 << j))) { + just_read = true; + } + + data_ptr = (unsigned char *)spr->data + bview->buffer_offset + s_coord; + /* we should see atomic operations on r32 formats */ + + handle_op_uint(bview, just_read, data_ptr, j, + opcode, params->writemask, rgba, rgba2); + } + return; +fail_write_all_zero: + memset(rgba, 0, TGSI_NUM_CHANNELS * TGSI_QUAD_SIZE * 4); + return; +} + +/* + * return size of the attached buffer for RESQ opcode. + */ +static void +sp_tgsi_get_dims(const struct tgsi_buffer *buffer, + const struct tgsi_buffer_params *params, + int *dim) +{ + struct sp_tgsi_buffer *sp_buf = (struct sp_tgsi_buffer *)buffer; + struct pipe_shader_buffer *bview; + struct softpipe_resource *spr; + + if (params->unit >= PIPE_MAX_SHADER_BUFFERS) + return; + + bview = &sp_buf->sp_bview[params->unit]; + spr = softpipe_resource(bview->buffer); + if (!spr) + return; + + *dim = bview->buffer_size; +} + +struct sp_tgsi_buffer * +sp_create_tgsi_buffer(void) +{ + struct sp_tgsi_buffer *buf = CALLOC_STRUCT(sp_tgsi_buffer); + if (!buf) + return NULL; + + buf->base.load = sp_tgsi_load; + buf->base.store = sp_tgsi_store; + buf->base.op = sp_tgsi_op; + buf->base.get_dims = sp_tgsi_get_dims; + return buf; +}; diff --git a/src/gallium/drivers/softpipe/sp_buffer.h b/src/gallium/drivers/softpipe/sp_buffer.h new file mode 100644 index 00000000000..1822fe709fe --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_buffer.h @@ -0,0 +1,37 @@ +/* + * Copyright 2016 Red Hat. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SP_BUFFER_H +#define SP_BUFFER_H +#include "tgsi/tgsi_exec.h" + +struct sp_tgsi_buffer +{ + struct tgsi_buffer base; + struct pipe_shader_buffer sp_bview[PIPE_MAX_SHADER_BUFFERS]; +}; + +struct sp_tgsi_buffer * +sp_create_tgsi_buffer(void); + +#endif diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c index 30b0276cfe0..0342fc6f8cd 100644 --- a/src/gallium/drivers/softpipe/sp_context.c +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -38,6 +38,7 @@ #include "util/u_pstipple.h" #include "util/u_inlines.h" #include "tgsi/tgsi_exec.h" +#include "sp_buffer.h" #include "sp_clear.h" #include "sp_context.h" #include "sp_flush.h" @@ -116,6 +117,8 @@ softpipe_destroy( struct pipe_context *pipe ) for (i = 0; i < PIPE_SHADER_TYPES; i++) { FREE(softpipe->tgsi.sampler[i]); + FREE(softpipe->tgsi.image[i]); + FREE(softpipe->tgsi.buffer[i]); } FREE( softpipe ); @@ -203,6 +206,10 @@ softpipe_create_context(struct pipe_screen *screen, softpipe->tgsi.image[i] = sp_create_tgsi_image(); } + for (i = 0; i < PIPE_SHADER_TYPES; i++) { + softpipe->tgsi.buffer[i] = sp_create_tgsi_buffer(); + } + softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE ); softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE ); @@ -288,6 +295,16 @@ softpipe_create_context(struct pipe_screen *screen, (struct tgsi_image *) softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]); + draw_buffer(softpipe->draw, + PIPE_SHADER_VERTEX, + (struct tgsi_buffer *) + softpipe->tgsi.buffer[PIPE_SHADER_VERTEX]); + + draw_buffer(softpipe->draw, + PIPE_SHADER_GEOMETRY, + (struct tgsi_buffer *) + softpipe->tgsi.buffer[PIPE_SHADER_GEOMETRY]); + if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE )) softpipe->no_rast = TRUE; diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 20a12353b38..70d00c88b6e 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -84,6 +84,7 @@ struct softpipe_context { struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; + struct pipe_shader_buffer buffers[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_BUFFERS]; struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_index_buffer index_buffer; @@ -174,6 +175,7 @@ struct softpipe_context { struct { struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES]; struct sp_tgsi_image *image[PIPE_SHADER_TYPES]; + struct sp_tgsi_buffer *buffer[PIPE_SHADER_TYPES]; } tgsi; struct tgsi_exec_machine *fs_machine; diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index bfd9a4b7496..155382af825 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -63,14 +63,15 @@ static void exec_prepare( const struct sp_fragment_shader_variant *var, struct tgsi_exec_machine *machine, struct tgsi_sampler *sampler, - struct tgsi_image *image ) + struct tgsi_image *image, + struct tgsi_buffer *buffer ) { /* * Bind tokens/shader to the interpreter's machine state. */ tgsi_exec_machine_bind_shader(machine, var->tokens, - sampler, image); + sampler, image, buffer); } @@ -186,7 +187,7 @@ exec_delete(struct sp_fragment_shader_variant *var, struct tgsi_exec_machine *machine) { if (machine->Tokens == var->tokens) { - tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL); + tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL, NULL); } FREE( (void *) var->tokens ); diff --git a/src/gallium/drivers/softpipe/sp_image.c b/src/gallium/drivers/softpipe/sp_image.c index 3488fa83185..a7c73280a80 100644 --- a/src/gallium/drivers/softpipe/sp_image.c +++ b/src/gallium/drivers/softpipe/sp_image.c @@ -217,7 +217,7 @@ sp_tgsi_load(const struct tgsi_image *image, char *data_ptr; unsigned offset = 0; - if (params->unit > PIPE_MAX_SHADER_IMAGES) + if (params->unit >= PIPE_MAX_SHADER_IMAGES) goto fail_write_all_zero; iview = &sp_img->sp_iview[params->unit]; spr = (struct softpipe_resource *)iview->resource; @@ -320,7 +320,7 @@ sp_tgsi_store(const struct tgsi_image *image, unsigned offset = 0; unsigned pformat = params->format; - if (params->unit > PIPE_MAX_SHADER_IMAGES) + if (params->unit >= PIPE_MAX_SHADER_IMAGES) return; iview = &sp_img->sp_iview[params->unit]; spr = (struct softpipe_resource *)iview->resource; @@ -630,7 +630,7 @@ sp_tgsi_op(const struct tgsi_image *image, unsigned offset; char *data_ptr; - if (params->unit > PIPE_MAX_SHADER_IMAGES) + if (params->unit >= PIPE_MAX_SHADER_IMAGES) return; iview = &sp_img->sp_iview[params->unit]; spr = (struct softpipe_resource *)iview->resource; @@ -704,7 +704,7 @@ sp_tgsi_get_dims(const struct tgsi_image *image, struct softpipe_resource *spr; int level; - if (params->unit > PIPE_MAX_SHADER_IMAGES) + if (params->unit >= PIPE_MAX_SHADER_IMAGES) return; iview = &sp_img->sp_iview[params->unit]; spr = (struct softpipe_resource *)iview->resource; diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c index c28d28d5f5d..81e97107d59 100644 --- a/src/gallium/drivers/softpipe/sp_query.c +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -283,6 +283,12 @@ softpipe_check_render_cond(struct softpipe_context *sp) } +static void +softpipe_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + + void softpipe_init_query_funcs(struct softpipe_context *softpipe ) { softpipe->pipe.create_query = softpipe_create_query; @@ -290,6 +296,7 @@ void softpipe_init_query_funcs(struct softpipe_context *softpipe ) softpipe->pipe.begin_query = softpipe_begin_query; softpipe->pipe.end_query = softpipe_end_query; softpipe->pipe.get_query_result = softpipe_get_query_result; + softpipe->pipe.set_active_query_state = softpipe_set_active_query_state; } diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 90f29d6e52a..d89d95c884c 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -239,6 +239,8 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TEXTURE_FLOAT_LINEAR: case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR: return 1; + case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + return 1; case PIPE_CAP_VERTEXID_NOBASE: return 0; case PIPE_CAP_POLYGON_OFFSET_CLAMP: @@ -259,7 +261,6 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL: case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL: - case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: case PIPE_CAP_INVALIDATE_BUFFER: case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_STRING_MARKER: @@ -270,8 +271,10 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: - case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; + case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT: + return 4; } /* should only get here on unhandled cases */ debug_printf("Unexpected PIPE_CAP %d query\n", param); diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h index 570bc549cc4..2fc48ab13d8 100644 --- a/src/gallium/drivers/softpipe/sp_state.h +++ b/src/gallium/drivers/softpipe/sp_state.h @@ -57,6 +57,7 @@ struct tgsi_sampler; struct tgsi_image; +struct tgsi_buffer; struct tgsi_exec_machine; struct vertex_info; @@ -83,7 +84,8 @@ struct sp_fragment_shader_variant void (*prepare)(const struct sp_fragment_shader_variant *shader, struct tgsi_exec_machine *machine, struct tgsi_sampler *sampler, - struct tgsi_image *image); + struct tgsi_image *image, + struct tgsi_buffer *buffer); unsigned (*run)(const struct sp_fragment_shader_variant *shader, struct tgsi_exec_machine *machine, diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c index 65679e73515..4ce9d95bc6e 100644 --- a/src/gallium/drivers/softpipe/sp_state_derived.c +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -344,7 +344,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim) softpipe->fs_machine, (struct tgsi_sampler *) softpipe-> tgsi.sampler[PIPE_SHADER_FRAGMENT], - (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]); + (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT], + (struct tgsi_buffer *)softpipe->tgsi.buffer[PIPE_SHADER_FRAGMENT]); } else { softpipe->fs_variant = NULL; diff --git a/src/gallium/drivers/softpipe/sp_state_image.c b/src/gallium/drivers/softpipe/sp_state_image.c index 8909fa26864..5947c934e86 100644 --- a/src/gallium/drivers/softpipe/sp_state_image.c +++ b/src/gallium/drivers/softpipe/sp_state_image.c @@ -24,6 +24,7 @@ #include "sp_context.h" #include "sp_state.h" #include "sp_image.h" +#include "sp_buffer.h" static void softpipe_set_shader_images(struct pipe_context *pipe, unsigned shader, @@ -51,7 +52,34 @@ static void softpipe_set_shader_images(struct pipe_context *pipe, } } +static void softpipe_set_shader_buffers(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + struct pipe_shader_buffer *buffers) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + unsigned i; + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(softpipe->buffers[shader])); + + /* set the new images */ + for (i = 0; i < num; i++) { + int idx = start + i; + + if (buffers) { + pipe_resource_reference(&softpipe->tgsi.buffer[shader]->sp_bview[idx].buffer, buffers[i].buffer); + softpipe->tgsi.buffer[shader]->sp_bview[idx] = buffers[i]; + } + else { + pipe_resource_reference(&softpipe->tgsi.buffer[shader]->sp_bview[idx].buffer, NULL); + memset(&softpipe->tgsi.buffer[shader]->sp_bview[idx], 0, sizeof(struct pipe_shader_buffer)); + } + } +} + void softpipe_init_image_funcs(struct pipe_context *pipe) { pipe->set_shader_images = softpipe_set_shader_images; + pipe->set_shader_buffers = softpipe_set_shader_buffers; } diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c index 9cc8ac12525..c6233261938 100644 --- a/src/gallium/drivers/softpipe/sp_tile_cache.c +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -99,9 +99,9 @@ sp_create_tile_cache( struct pipe_context *pipe ) maxTexSize = 1 << (maxLevels - 1); assert(MAX_WIDTH >= maxTexSize); - assert(sizeof(union tile_address) == 4); + STATIC_ASSERT(sizeof(union tile_address) == 4); - assert((TILE_SIZE << TILE_ADDR_BITS) >= MAX_WIDTH); + STATIC_ASSERT((TILE_SIZE << TILE_ADDR_BITS) >= MAX_WIDTH); tc = CALLOC_STRUCT( softpipe_tile_cache ); if (tc) { diff --git a/src/gallium/drivers/svga/svga_pipe_depthstencil.c b/src/gallium/drivers/svga/svga_pipe_depthstencil.c index 83fcdc3d80b..c5d83c33f29 100644 --- a/src/gallium/drivers/svga/svga_pipe_depthstencil.c +++ b/src/gallium/drivers/svga/svga_pipe_depthstencil.c @@ -86,9 +86,9 @@ define_depth_stencil_state_object(struct svga_context *svga, ds->id = util_bitmask_add(svga->ds_object_id_bm); /* spot check that these comparision tokens are the same */ - assert(SVGA3D_COMPARISON_NEVER == SVGA3D_CMP_NEVER); - assert(SVGA3D_COMPARISON_LESS == SVGA3D_CMP_LESS); - assert(SVGA3D_COMPARISON_NOT_EQUAL == SVGA3D_CMP_NOTEQUAL); + STATIC_ASSERT(SVGA3D_COMPARISON_NEVER == SVGA3D_CMP_NEVER); + STATIC_ASSERT(SVGA3D_COMPARISON_LESS == SVGA3D_CMP_LESS); + STATIC_ASSERT(SVGA3D_COMPARISON_NOT_EQUAL == SVGA3D_CMP_NOTEQUAL); /* Loop in case command buffer is full and we need to flush and retry */ for (try = 0; try < 2; try++) { diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index 88f41eadc1d..75bc9ce092b 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -1246,6 +1246,12 @@ svga_get_timestamp(struct pipe_context *pipe) } +static void +svga_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + + void svga_init_query_functions(struct svga_context *svga) { @@ -1254,6 +1260,7 @@ svga_init_query_functions(struct svga_context *svga) svga->pipe.begin_query = svga_begin_query; svga->pipe.end_query = svga_end_query; svga->pipe.get_query_result = svga_get_query_result; + svga->pipe.set_active_query_state = svga_set_active_query_state; svga->pipe.render_condition = svga_render_condition; svga->pipe.get_timestamp = svga_get_timestamp; } diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 536fb6f786f..010d94b1b58 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -364,6 +364,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_PCI_BUS: case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return 64; diff --git a/src/gallium/drivers/svga/svga_state_rss.c b/src/gallium/drivers/svga/svga_state_rss.c index d43894d71b1..317b44eef00 100644 --- a/src/gallium/drivers/svga/svga_state_rss.c +++ b/src/gallium/drivers/svga/svga_state_rss.c @@ -47,7 +47,7 @@ struct rs_queue { #define EMIT_RS(svga, value, token, fail) \ do { \ - assert(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \ + STATIC_ASSERT(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \ if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) { \ svga_queue_rs( &queue, SVGA3D_RS_##token, value ); \ svga->state.hw_draw.rs[SVGA3D_RS_##token] = value; \ @@ -57,7 +57,7 @@ do { \ #define EMIT_RS_FLOAT(svga, fvalue, token, fail) \ do { \ unsigned value = fui(fvalue); \ - assert(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \ + STATIC_ASSERT(SVGA3D_RS_##token < Elements(svga->state.hw_draw.rs)); \ if (svga->state.hw_draw.rs[SVGA3D_RS_##token] != value) { \ svga_queue_rs( &queue, SVGA3D_RS_##token, value ); \ svga->state.hw_draw.rs[SVGA3D_RS_##token] = value; \ diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c index 4debbf1669b..fd6d1ce84b4 100644 --- a/src/gallium/drivers/svga/svga_state_tss.c +++ b/src/gallium/drivers/svga/svga_state_tss.c @@ -327,7 +327,7 @@ svga_queue_tss( struct ts_queue *q, #define EMIT_TS(svga, unit, val, token) \ do { \ assert(unit < Elements(svga->state.hw_draw.ts)); \ - assert(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit])); \ + STATIC_ASSERT(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit])); \ if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) { \ svga_queue_tss( queue, unit, SVGA3D_TS_##token, val ); \ svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val; \ @@ -338,7 +338,7 @@ do { \ do { \ unsigned val = fui(fvalue); \ assert(unit < Elements(svga->state.hw_draw.ts)); \ - assert(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit])); \ + STATIC_ASSERT(SVGA3D_TS_##token < Elements(svga->state.hw_draw.ts[unit])); \ if (svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] != val) { \ svga_queue_tss( queue, unit, SVGA3D_TS_##token, val ); \ svga->state.hw_draw.ts[unit][SVGA3D_TS_##token] = val; \ diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 5794f3f625a..180a0560822 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -30,10 +30,6 @@ #define SWR_API __cdecl -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - #ifndef NOMINMAX #define NOMINMAX #endif @@ -52,7 +48,6 @@ #define PRAGMA_WARNING_POP() __pragma(warning(pop)) -#if defined(_WIN32) #if defined(_WIN64) #define BitScanReverseSizeT BitScanReverse64 #define BitScanForwardSizeT BitScanForward64 @@ -62,7 +57,6 @@ #define BitScanForwardSizeT BitScanForward #define _mm_popcount_sizeT _mm_popcnt_u32 #endif -#endif #elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) @@ -199,9 +193,7 @@ typedef KILOBYTE MEGABYTE[1024]; typedef MEGABYTE GIGABYTE[1024]; #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) -#if KNOB_SIMD_WIDTH == 8 -#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32) -#endif +#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES) #include "common/swr_assert.h" diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index f0f7956b590..ca9cfdb629e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -29,10 +29,12 @@ #include <cfloat> #include <cmath> #include <cstdio> +#include <new> #include "core/api.h" #include "core/backend.h" #include "core/context.h" +#include "core/depthstencil.h" #include "core/frontend.h" #include "core/rasterizer.h" #include "core/rdtsc_core.h" @@ -64,11 +66,14 @@ HANDLE SwrCreateContext( pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); - pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); - pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. + new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); + new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } @@ -86,15 +91,26 @@ HANDLE SwrCreateContext( // Calling createThreadPool() above can set SINGLE_THREADED if (KNOB_SINGLE_THREADED) { + SET_KNOB(HYPERTHREADED_FE, false); pContext->NumWorkerThreads = 1; + pContext->NumFEThreads = 1; + pContext->NumBEThreads = 1; } // Allocate scratch space for workers. ///@note We could lazily allocate this but its rather small amount of memory. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { - ///@todo Use numa API for allocations using numa information from thread data (if exists). - pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); +#if defined(_WIN32) + uint32_t numaNode = pContext->threadPool.pThreadData ? + pContext->threadPool.pThreadData[i].numaId : 0; + pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma( + GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), + MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, + numaNode); +#else + pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); +#endif } // State setup AFTER context is fully initialized @@ -131,14 +147,21 @@ void SwrDestroyContext(HANDLE hContext) { delete pContext->dcRing[i].pArena; delete pContext->dsRing[i].pArena; - delete(pContext->dcRing[i].pTileMgr); - delete(pContext->dcRing[i].pDispatch); + pContext->pMacroTileManagerArray[i].~MacroTileMgr(); + pContext->pDispatchQueueArray[i].~DispatchQueue(); } + _aligned_free(pContext->pDispatchQueueArray); + _aligned_free(pContext->pMacroTileManagerArray); + // Free scratch space. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { +#if defined(_WIN32) + VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE); +#else _aligned_free(pContext->pScratch[i]); +#endif } delete(pContext->pHotTileMgr); @@ -160,12 +183,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext) template<bool IsDraw> void QueueWork(SWR_CONTEXT *pContext) { + DRAW_CONTEXT* pDC = pContext->pCurDrawContext; + uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; + + if (IsDraw) + { + pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex]; + pDC->pTileMgr->initialize(); + } + // Each worker thread looks at a DC for both FE and BE work at different times and so we // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and // then moved on if all work is done.) - pContext->pCurDrawContext->threadsDone = - pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2; + pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; _ReadWriteBarrier(); { @@ -183,7 +214,7 @@ void QueueWork(SWR_CONTEXT *pContext) { static TileSet lockedTiles; uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoFE(pContext, 0, curDraw[0]); WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); } else @@ -232,7 +263,20 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) _mm_pause(); } - uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT; + uint64_t curDraw = pContext->dcRing.GetHead(); + uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; + + static uint64_t lastDrawChecked; + static uint32_t lastFrameChecked; + if ((pContext->frameCount - lastFrameChecked) > 2 || + (curDraw - lastDrawChecked) > 0x10000) + { + // Take this opportunity to clean-up old arena allocations + pContext->cachingArenaAllocator.FreeOldBlocks(); + + lastFrameChecked = pContext->frameCount; + lastDrawChecked = curDraw; + } DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; pContext->pCurDrawContext = pCurDrawContext; @@ -284,8 +328,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pCurDrawContext->FeLock = 0; pCurDrawContext->threadsDone = 0; - pCurDrawContext->pTileMgr->initialize(); - // Assign unique drawId for this DC pCurDrawContext->drawId = pContext->dcRing.GetHead(); @@ -872,6 +914,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC) !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; } } + + // Setup depth quantization function + if (pState->state.depthHottileEnable) + { + switch (pState->state.rastState.depthFormat) + { + case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break; + case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break; + case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break; + case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break; + default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion."); + pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; + } + } + else + { + // set up pass-through quantize if depth isn't enabled + pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; + } } ////////////////////////////////////////////////////////////////////////// @@ -1029,9 +1090,9 @@ void DrawInstanced( SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); - int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); + uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); - int32_t remainingVerts = numVertices; + uint32_t remainingVerts = numVertices; API_STATE *pState = &pDC->pState->state; pState->topology = topology; @@ -1149,9 +1210,9 @@ void DrawIndexedInstance( DRAW_CONTEXT* pDC = GetDrawContext(pContext); API_STATE* pState = &pDC->pState->state; - int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); + uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); - int32_t remainingIndices = numIndices; + uint32_t remainingIndices = numIndices; uint32_t indexSize = 0; switch (pState->indexBuffer.format) @@ -1334,9 +1395,6 @@ void SwrDispatch( pDC->isCompute = true; // This is a compute context. - // Ensure spill fill pointers are initialized to nullptr. - memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill)); - COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); pTaskData->threadGroupCountX = threadGroupCountX; @@ -1344,6 +1402,8 @@ void SwrDispatch( pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; + uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; + pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; pDC->pDispatch->initialize(totalThreadGroups, pTaskData); QueueDispatch(pContext); @@ -1497,4 +1557,6 @@ void SWR_API SwrEndFrame( HANDLE hContext) { RDTSC_ENDFRAME(); + SWR_CONTEXT *pContext = GetContext(hContext); + pContext->frameCount++; } diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h index 67d81a44347..64184e16865 100644 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -65,69 +65,41 @@ static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16> struct CachingAllocatorT : DefaultAllocator { - static uint32_t GetBucketId(size_t blockSize) - { - uint32_t bucketId = 0; - -#if defined(BitScanReverseSizeT) - BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); - bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); -#endif - - return bucketId; - } - void* AllocateAligned(size_t size, size_t align) { SWR_ASSERT(size >= sizeof(ArenaBlock)); SWR_ASSERT(size <= uint32_t(-1)); size_t blockSize = size - ARENA_BLOCK_ALIGN; + uint32_t bucket = GetBucketId(blockSize); { // search cached blocks std::lock_guard<std::mutex> l(m_mutex); - ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)]; - ArenaBlock* pBlock = pPrevBlock->pNext; - ArenaBlock* pPotentialBlock = nullptr; - ArenaBlock* pPotentialPrev = nullptr; + ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket]; + ArenaBlock* pBlock = SearchBlocks(pPrevBlock, blockSize, align); - while (pBlock) + if (pBlock) { - if (pBlock->blockSize >= blockSize) - { - if (pBlock == AlignUp(pBlock, align)) - { - if (pBlock->blockSize == blockSize) - { - // Won't find a better match - break; - } - - // We could use this as it is larger than we wanted, but - // continue to search for a better match - pPotentialBlock = pBlock; - pPotentialPrev = pPrevBlock; - } - } - else + m_cachedSize -= pBlock->blockSize; + if (pBlock == m_pLastCachedBlocks[bucket]) { - // Blocks are sorted by size (biggest first) - // So, if we get here, there are no blocks - // large enough, fall through to allocation. - pBlock = nullptr; - break; + m_pLastCachedBlocks[bucket] = pPrevBlock; } - - pPrevBlock = pBlock; - pBlock = pBlock->pNext; } - - if (!pBlock) + else { - // Couldn't find an exact match, use next biggest size - pBlock = pPotentialBlock; - pPrevBlock = pPotentialPrev; + pPrevBlock = &m_oldCachedBlocks[GetBucketId(blockSize)]; + pBlock = SearchBlocks(pPrevBlock, blockSize, align); + + if (pBlock) + { + m_oldCachedSize -= pBlock->blockSize; + if (pBlock == m_pOldLastCachedBlocks[bucket]) + { + m_pLastCachedBlocks[bucket] = pPrevBlock; + } + } } if (pBlock) @@ -154,7 +126,7 @@ struct CachingAllocatorT : DefaultAllocator return this->DefaultAllocator::AllocateAligned(size, align); } - void Free(void* pMem) + void Free(void* pMem) { if (pMem) { @@ -162,24 +134,57 @@ struct CachingAllocatorT : DefaultAllocator SWR_ASSERT(pNewBlock->blockSize >= 0); std::unique_lock<std::mutex> l(m_mutex); - ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)]; - ArenaBlock* pBlock = pPrevBlock->pNext; + InsertCachedBlock(GetBucketId(pNewBlock->blockSize), pNewBlock); + } + } - while (pBlock) + void FreeOldBlocks() + { + if (!m_cachedSize) { return; } + std::lock_guard<std::mutex> l(m_mutex); + + bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE); + + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + if (doFree) { - if (pNewBlock->blockSize >= pBlock->blockSize) + ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext; + while (pBlock) { - // Insert here - break; + ArenaBlock* pNext = pBlock->pNext; + m_oldCachedSize -= pBlock->blockSize; + m_totalAllocated -= (pBlock->blockSize + ARENA_BLOCK_ALIGN); + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; } - pPrevBlock = pBlock; - pBlock = pBlock->pNext; + m_oldCachedBlocks[i].pNext = nullptr; + m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; } - // Insert into list - SWR_ASSERT(pPrevBlock); - pPrevBlock->pNext = pNewBlock; - pNewBlock->pNext = pBlock; + if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i]) + { + m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext; + m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext; + m_cachedBlocks[i].pNext = nullptr; + if (m_pOldLastCachedBlocks[i]->pNext) + { + m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i]; + } + m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; + } + } + + m_oldCachedSize += m_cachedSize; + m_cachedSize = 0; + } + + CachingAllocatorT() + { + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; + m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; } } @@ -195,21 +200,126 @@ struct CachingAllocatorT : DefaultAllocator this->DefaultAllocator::Free(pBlock); pBlock = pNext; } + pBlock = m_oldCachedBlocks[i].pNext; + while (pBlock) + { + ArenaBlock* pNext = pBlock->pNext; + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; + } } } +private: + static uint32_t GetBucketId(size_t blockSize) + { + uint32_t bucketId = 0; + +#if defined(BitScanReverseSizeT) + BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); + bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); +#endif + + return bucketId; + } + + void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock) + { + SWR_ASSERT(bucketId < CACHE_NUM_BUCKETS); + + ArenaBlock* pPrevBlock = &m_cachedBlocks[bucketId]; + ArenaBlock* pBlock = pPrevBlock->pNext; + + while (pBlock) + { + if (pNewBlock->blockSize >= pBlock->blockSize) + { + // Insert here + break; + } + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + // Insert into list + SWR_ASSERT(pPrevBlock); + pPrevBlock->pNext = pNewBlock; + pNewBlock->pNext = pBlock; + + if (m_pLastCachedBlocks[bucketId] == pPrevBlock) + { + m_pLastCachedBlocks[bucketId] = pNewBlock; + } + + m_cachedSize += pNewBlock->blockSize; + } + + static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align) + { + ArenaBlock* pBlock = pPrevBlock->pNext; + ArenaBlock* pPotentialBlock = nullptr; + ArenaBlock* pPotentialPrev = nullptr; + + while (pBlock) + { + if (pBlock->blockSize >= blockSize) + { + if (pBlock == AlignUp(pBlock, align)) + { + if (pBlock->blockSize == blockSize) + { + // Won't find a better match + break; + } + + // We could use this as it is larger than we wanted, but + // continue to search for a better match + pPotentialBlock = pBlock; + pPotentialPrev = pPrevBlock; + } + } + else + { + // Blocks are sorted by size (biggest first) + // So, if we get here, there are no blocks + // large enough, fall through to allocation. + pBlock = nullptr; + break; + } + + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + if (!pBlock) + { + // Couldn't find an exact match, use next biggest size + pBlock = pPotentialBlock; + pPrevBlock = pPotentialPrev; + } + + return pBlock; + } + // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; + static const size_t MAX_UNUSED_SIZE = 20 * sizeof(MEGABYTE); ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS]; std::mutex m_mutex; size_t m_totalAllocated = 0; + + size_t m_cachedSize = 0; + size_t m_oldCachedSize = 0; }; typedef CachingAllocatorT<> CachingAllocator; -template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)> +template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)> class TArena { public: diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 7fb83edf169..b2d3d9ef4f4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer) { RDTSC_START(BEDispatch); @@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup SWR_ASSERT(pTaskData != nullptr); // Ensure spill fill memory has been allocated. - if (pDC->pSpillFill[workerId] == nullptr) + if (pSpillFillBuffer == nullptr) { ///@todo Add state which indicates the spill fill size. - pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8); + pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8); } const API_STATE& state = GetApiState(pDC); @@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup csContext.dispatchDims[1] = pTaskData->threadGroupCountY; csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; csContext.pTGSM = pContext->pScratch[workerId]; - csContext.pSpillFillBuffer = pDC->pSpillFill[workerId]; + csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; state.pfnCsFunc(GetPrivateState(pDC), &csContext); @@ -772,8 +772,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 psContext.vOneOverW.centroid = psContext.vOneOverW.center; } - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); + RDTSC_STOP(BEBarycentric, 0, 0); simdmask clipCoverageMask = coverageMask & MASK; @@ -793,7 +795,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(CanEarlyZ(pPSState)) { RDTSC_START(BEEarlyDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); @@ -825,7 +827,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(!CanEarlyZ(pPSState)) { RDTSC_START(BELateDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); RDTSC_STOP(BELateDepthTest, 0, 0); @@ -977,8 +979,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); @@ -1000,7 +1003,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (CanEarlyZ(pPSState)) { RDTSC_START(BEEarlyDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); @@ -1033,7 +1036,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (!CanEarlyZ(pPSState)) { RDTSC_START(BELateDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); RDTSC_STOP(BELateDepthTest, 0, 0); @@ -1200,8 +1203,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); // execute pixel shader @@ -1263,10 +1267,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // calc I & J per sample backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z if (!pPSState->writesODepth) { vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); } ///@todo: perspective correct vs non-perspective correct clipping? @@ -1292,7 +1297,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // ZTest for this sample RDTSC_START(BEEarlyDepthTest); stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]); RDTSC_STOP(BEEarlyDepthTest, 0, 0); @@ -1308,8 +1313,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t { RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); // execute pixel shader @@ -1463,8 +1469,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); @@ -1483,7 +1490,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample); RDTSC_START(BEEarlyDepthTest); - simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 2fa18953cad..d0626b997af 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -32,7 +32,7 @@ #include "core/context.h" #include "core/multisample.h" -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index 3a2a8b35be8..e624fd8f674 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -162,8 +162,8 @@ int ClipTriToPlane( const float *pInPts, int numInPts, void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) { // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping - OSALIGN(float, 16) tempPts[6 * 4]; - OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; + OSALIGNSIMD(float) tempPts[6 * 4]; + OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index ba5870a92bb..67a4c4f47bb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -265,8 +265,8 @@ public: // clip a single primitive int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) { - OSALIGN(float, 16) inVerts[3 * 4]; - OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; + OSALIGNSIMD(float) inVerts[3 * 4]; + OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; // transpose primitive position __m128 verts[3]; diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 39f23372a18..6464aa20af7 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -308,6 +308,8 @@ OSALIGNLINE(struct) API_STATE uint32_t depthHottileEnable: 1; uint32_t stencilHottileEnable : 1; }; + + PFN_QUANTIZE_DEPTH pfnQuantizeDepth; }; class MacroTileMgr; @@ -380,32 +382,29 @@ struct DRAW_STATE // This draw context maintains all of the state needed for the draw operation. struct DRAW_CONTEXT { - SWR_CONTEXT *pContext; - - uint64_t drawId; - - bool isCompute; // Is this DC a compute context? - - FE_WORK FeWork; - volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - volatile OSALIGNLINE(int64_t) threadsDone; - - uint64_t dependency; - - MacroTileMgr* pTileMgr; - - // The following fields are valid if isCompute is true. - DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + SWR_CONTEXT* pContext; + uint64_t drawId; + union + { + MacroTileMgr* pTileMgr; + DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + }; + uint64_t dependency; + DRAW_STATE* pState; + CachingArena* pArena; - DRAW_STATE* pState; - CachingArena* pArena; + bool isCompute; // Is this DC a compute context? + bool cleanupState; // True if this is the last draw using an entry in the state ring. + volatile bool doneFE; // Is FE work done for this draw? - uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. + FE_WORK FeWork; - bool cleanupState; // True if this is the last draw using an entry in the state ring. + volatile OSALIGNLINE(uint32_t) FeLock; + volatile int64_t threadsDone; }; +static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); + INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) { SWR_ASSERT(pDC != nullptr); @@ -447,6 +446,9 @@ struct SWR_CONTEXT DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. + MacroTileMgr* pMacroTileManagerArray; + DispatchQueue* pDispatchQueueArray; + // Draw State Ring // When draw are very large (lots of primitives) then the API thread will break these up. // These split draws all have identical state. So instead of storing the state directly @@ -457,6 +459,8 @@ struct SWR_CONTEXT uint32_t curStateId; // Current index to the next available entry in the DS ring. uint32_t NumWorkerThreads; + uint32_t NumFEThreads; + uint32_t NumBEThreads; THREAD_POOL threadPool; // Thread pool associated with this context @@ -481,6 +485,7 @@ struct SWR_CONTEXT uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; CachingAllocator cachingArenaAllocator; + uint32_t frameCount; }; void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index 2cc9d4054ac..7b55580bf0a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -80,14 +80,52 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds } +template<SWR_FORMAT depthFormatT> +simdscalar QuantizeDepth(simdscalar depth) +{ + SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0); + uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0); + + if (depthType == SWR_TYPE_FLOAT) + { + // assume only 32bit float depth supported + SWR_ASSERT(depthBpc == 32); + + // matches shader precision, no quantizing needed + return depth; + } + + // should be unorm depth if not float + SWR_ASSERT(depthType == SWR_TYPE_UNORM); + + float quantize = (float)((1 << depthBpc) - 1); + simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize)); + result = _simd_add_ps(result, _simd_set1_ps(0.5f)); + result = _simd_round_ps(result, _MM_FROUND_TO_ZERO); + + if (depthBpc > 16) + { + result = _simd_div_ps(result, _simd_set1_ps(quantize)); + } + else + { + result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize)); + } + + return result; +} + INLINE -simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, +simdscalar DepthStencilTest(const API_STATE* pState, bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase, simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); + const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; + const SWR_VIEWPORT* pViewport = &pState->vp[0]; + simdscalar depthResult = _simd_set1_ps(-1.0f); simdscalar zbuf; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 36721e00beb..93869610ff9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -793,8 +793,14 @@ static void GeometryShaderStage( uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; DWORD numAttribs; - _BitScanReverse(&numAttribs, state.feAttribMask); - numAttribs++; + if (_BitScanReverse(&numAttribs, state.feAttribMask)) + { + numAttribs++; + } + else + { + numAttribs = 0; + } for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) { diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index d7feb86273d..55a22a67f4c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -45,14 +45,17 @@ #define KNOB_ARCH_ISA AVX #define KNOB_ARCH_STR "AVX" #define KNOB_SIMD_WIDTH 8 +#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX2) #define KNOB_ARCH_ISA AVX2 #define KNOB_ARCH_STR "AVX2" #define KNOB_SIMD_WIDTH 8 +#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX512) #define KNOB_ARCH_ISA AVX512F #define KNOB_ARCH_STR "AVX512" #define KNOB_SIMD_WIDTH 16 +#define KNOB_SIMD_BYTES 64 #error "AVX512 not yet supported" #else #error "Unknown architecture" diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index f8f1a33b7e3..17f488538d6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -1017,13 +1017,13 @@ struct PA_TESS : PA_STATE { SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); #if KNOB_SIMD_WIDTH == 8 - static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = + static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; #elif KNOB_SIMD_WIDTH == 16 - static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = + static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 @@ -1167,8 +1167,14 @@ struct PA_FACTORY { memset(&indexStore, 0, sizeof(indexStore)); DWORD numAttribs; - _BitScanReverse(&numAttribs, state.feAttribMask); - numAttribs++; + if (_BitScanReverse(&numAttribs, state.feAttribMask)) + { + numAttribs++; + } + else + { + numAttribs = 0; + } new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, &this->indexStore[0], numVerts, numAttribs, state.topology, false); cutPA = true; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 52fb7c88cdd..3144a901c91 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -383,7 +383,7 @@ __declspec(thread) volatile uint64_t gToss; static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; // try to avoid _chkstk insertions; make this thread local -static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; +static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; INLINE void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) @@ -439,7 +439,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, const SWR_RASTSTATE &rastState = state.rastState; const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; __m128 vX, vY, vZ, vRecipW; @@ -502,7 +502,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); - OSALIGN(float, 16) oneOverW[4]; + OSALIGNSIMD(float) oneOverW[4]; _mm_store_ps(oneOverW, vRecipW); triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; @@ -537,7 +537,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // compute bary Z // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) - OSALIGN(float, 16) a[4]; + OSALIGNSIMD(float) a[4]; _mm_store_ps(a, vZ); triDesc.Z[0] = a[0] - a[2]; triDesc.Z[1] = a[1] - a[2]; @@ -575,7 +575,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } // Calc bounding box of triangle - OSALIGN(BBOX, 16) bbox; + OSALIGNSIMD(BBOX) bbox; calcBoundingBoxInt(vXi, vYi, bbox); // Intersect with scissor/viewport @@ -594,7 +594,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - OSALIGN(BBOX, 16) intersect; + OSALIGNSIMD(BBOX) intersect; intersect.left = std::max(bbox.left, macroBoxLeft); intersect.top = std::max(bbox.top, macroBoxTop); intersect.right = std::min(bbox.right, macroBoxRight); @@ -1047,7 +1047,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi { 50, 51, 54, 55, 58, 59, 62, 63 } }; - OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; // pull point information from triangle buffer // @todo use structs for readability @@ -1286,7 +1286,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi // make sure this macrotile intersects the triangle __m128i vXai = fpToFixedPoint(vXa); __m128i vYai = fpToFixedPoint(vYa); - OSALIGN(BBOX, 16) bboxA; + OSALIGNSIMD(BBOX) bboxA; calcBoundingBoxInt(vXai, vYai, bboxA); if (!(bboxA.left > macroBoxRight || diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 5752094ca10..50361068025 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -790,6 +790,7 @@ typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*); +typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar); ////////////////////////////////////////////////////////////////////////// /// FRONTEND_STATE diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 07bc94a1a54..4b7a207f366 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -68,7 +68,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread #if defined(_WIN32) - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; + static std::mutex m; + std::lock_guard<std::mutex> l(m); + + static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; DWORD bufSize = sizeof(buffer); BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); @@ -288,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { // Cleanup memory allocations pDC->pArena->Reset(true); - pDC->pTileMgr->initialize(); + if (!pDC->isCompute) + { + pDC->pTileMgr->initialize(); + } if (pDC->cleanupState) { pDC->pState->pArena->Reset(true); @@ -302,10 +308,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) return result; } -INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) +INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued) { // increment our current draw id to the first incomplete draw - uint64_t drawEnqueued = GetEnqueuedDraw(pContext); + drawEnqueued = GetEnqueuedDraw(pContext); while (curDrawBE < drawEnqueued) { DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -313,8 +319,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) // If its not compute and FE is not done then break out of loop. if (!pDC->doneFE && !pDC->isCompute) break; - bool isWorkComplete = (pDC->isCompute) ? - pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); + bool isWorkComplete = pDC->isCompute ? + pDC->pDispatch->isWorkComplete() : + pDC->pTileMgr->isWorkComplete(); if (isWorkComplete) { @@ -355,7 +362,8 @@ void WorkOnFifoBE( { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. - if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) + uint64_t drawEnqueued = 0; + if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } @@ -370,7 +378,7 @@ void WorkOnFifoBE( // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. The locked tiles provides the history to ensures this. - for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i) + for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -463,7 +471,7 @@ void WorkOnFifoBE( } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -516,38 +524,44 @@ void WorkOnCompute( uint32_t workerId, uint64_t& curDrawBE) { - if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) + uint64_t drawEnqueued = 0; + if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; - DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; - if (pDC->isCompute == false) return; - - // check dependencies - if (CheckDependency(pContext, pDC, lastRetiredDraw)) + for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i) { - return; - } + DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; + if (pDC->isCompute == false) return; - SWR_ASSERT(pDC->pDispatch != nullptr); - DispatchQueue& queue = *pDC->pDispatch; + // check dependencies + if (CheckDependency(pContext, pDC, lastRetiredDraw)) + { + return; + } - // Is there any work remaining? - if (queue.getNumQueued() > 0) - { - uint32_t threadGroupId = 0; - while (queue.getWork(threadGroupId)) + SWR_ASSERT(pDC->pDispatch != nullptr); + DispatchQueue& queue = *pDC->pDispatch; + + // Is there any work remaining? + if (queue.getNumQueued() > 0) { - ProcessComputeBE(pDC, workerId, threadGroupId); + void* pSpillFillBuffer = nullptr; + uint32_t threadGroupId = 0; + while (queue.getWork(threadGroupId)) + { + ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer); - queue.finishedWork(); + queue.finishedWork(); + } } } } +template<bool IsFEThread, bool IsBEThread> DWORD workerThreadMain(LPVOID pData) { THREAD_DATA *pThreadData = (THREAD_DATA*)pData; @@ -631,25 +645,38 @@ DWORD workerThreadMain(LPVOID pData) } } - RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); - RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); + if (IsBEThread) + { + RDTSC_START(WorkerWorkOnFifoBE); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); + RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); - WorkOnCompute(pContext, workerId, curDrawBE); + WorkOnCompute(pContext, workerId, curDrawBE); + } - WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode); + if (IsFEThread) + { + WorkOnFifoFE(pContext, workerId, curDrawFE); + + if (!IsBEThread) + { + curDrawBE = curDrawFE; + } + } } return 0; } +template<> DWORD workerThreadMain<false, false>(LPVOID) = delete; +template <bool IsFEThread, bool IsBEThread> DWORD workerThreadInit(LPVOID pData) { #if defined(_WIN32) __try #endif // _WIN32 { - return workerThreadMain(pData); + return workerThreadMain<IsFEThread, IsBEThread>(pData); } #if defined(_WIN32) @@ -661,6 +688,7 @@ DWORD workerThreadInit(LPVOID pData) return 1; } +template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete; void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) { @@ -678,6 +706,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) uint32_t numCoresPerNode = numHWCoresPerNode; uint32_t numHyperThreads = numHWHyperThreads; + if (KNOB_MAX_WORKER_THREADS) + { + SET_KNOB(HYPERTHREADED_FE, false); + } + + if (KNOB_HYPERTHREADED_FE) + { + SET_KNOB(MAX_THREADS_PER_CORE, 0); + } + if (KNOB_MAX_NUMA_NODES) { numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); @@ -693,6 +731,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); } + if (numHyperThreads < 2) + { + SET_KNOB(HYPERTHREADED_FE, false); + } + // Calculate numThreads uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; @@ -767,9 +810,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; pPool->pThreadData[workerId].threadId = 0; pPool->pThreadData[workerId].numaId = 0; + pPool->pThreadData[workerId].coreId = 0; + pPool->pThreadData[workerId].htId = 0; pPool->pThreadData[workerId].pContext = pContext; pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); + + pContext->NumBEThreads++; + pContext->NumFEThreads++; } } else @@ -780,6 +828,10 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) for (uint32_t n = 0; n < numNodes; ++n) { auto& node = nodes[n]; + if (node.cores.size() == 0) + { + continue; + } uint32_t numCores = numCoresPerNode; for (uint32_t c = 0; c < numCores; ++c) @@ -797,8 +849,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].procGroupId = core.procGroup; pPool->pThreadData[workerId].threadId = core.threadIds[t]; pPool->pThreadData[workerId].numaId = n; + pPool->pThreadData[workerId].coreId = c; + pPool->pThreadData[workerId].htId = t; pPool->pThreadData[workerId].pContext = pContext; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + + if (KNOB_HYPERTHREADED_FE) + { + if (t == 0) + { + pContext->NumBEThreads++; + pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]); + } + else + { + pContext->NumFEThreads++; + pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]); + } + } + else + { + pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); + pContext->NumBEThreads++; + pContext->NumFEThreads++; + } ++workerId; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 821d7dcb16e..3aba6323a95 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -41,6 +41,8 @@ struct THREAD_DATA uint32_t procGroupId; // Will always be 0 for non-Windows OS uint32_t threadId; // within the procGroup for Windows uint32_t numaId; // NUMA node id + uint32_t coreId; // Core id + uint32_t htId; // Hyperthread id uint32_t workerId; SWR_CONTEXT *pContext; bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set. @@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE); void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 794577270cf..87d9f42c032 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -35,27 +35,6 @@ #define TILE_ID(x,y) ((x << 16 | y)) -// override new/delete for alignment -void *MacroTileMgr::operator new(size_t size) -{ - return _aligned_malloc(size, 64); -} - -void MacroTileMgr::operator delete(void *p) -{ - _aligned_free(p); -} - -void* DispatchQueue::operator new(size_t size) -{ - return _aligned_malloc(size, 64); -} - -void DispatchQueue::operator delete(void *p) -{ - _aligned_free(p); -} - MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } @@ -304,7 +283,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID) { const API_STATE& state = GetApiState(pDC); - HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index aa561badc1c..82a15e16a33 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -140,9 +140,6 @@ public: x = (tileID >> 16) & 0xffff; } - void *operator new(size_t size); - void operator delete (void *p); - private: CachingArena& mArena; std::unordered_map<uint32_t, MacroTileQueue> mTiles; @@ -229,9 +226,6 @@ public: return mpTaskData; } - void *operator new(size_t size); - void operator delete (void *p); - void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this. OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; @@ -272,7 +266,7 @@ class HotTileMgr public: HotTileMgr() { - memset(&mHotTiles[0][0], 0, sizeof(mHotTiles)); + memset(mHotTiles, 0, sizeof(mHotTiles)); // cache hottile size for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py index 0f3ded68544..3832b91d93e 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -30,6 +30,18 @@ KNOBS = [ 'category' : 'debug', }], + ['HYPERTHREADED_FE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['EXPERIMENTAL!!', + 'If enabled will attempt to use secondary threads per core to perform', + 'front-end (VS/GS) work.', + '', + 'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'], + 'category' : 'perf', + 'advanced' : 'true', + }], + ['DUMP_SHADER_IR', { 'type' : 'bool', 'default' : 'false', @@ -166,6 +178,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_FETCH', { @@ -175,6 +188,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_IA', { @@ -184,6 +198,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_VS', { @@ -193,6 +208,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_SETUP_TRIS', { @@ -202,6 +218,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_BIN_TRIS', { @@ -211,6 +228,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_RS', { @@ -220,4 +238,5 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }],] diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp index 810c50b2f8f..e4b8b683278 100644 --- a/src/gallium/drivers/swr/swr_query.cpp +++ b/src/gallium/drivers/swr/swr_query.cpp @@ -319,6 +319,12 @@ swr_check_render_cond(struct pipe_context *pipe) return TRUE; } + +static void +swr_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void swr_query_init(struct pipe_context *pipe) { @@ -329,6 +335,7 @@ swr_query_init(struct pipe_context *pipe) pipe->begin_query = swr_begin_query; pipe->end_query = swr_end_query; pipe->get_query_result = swr_get_query_result; + pipe->set_active_query_state = swr_set_active_query_state; ctx->active_queries = 0; } diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index f9e52be2367..a0a6324f334 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -337,6 +337,11 @@ swr_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; } diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp index ff16d0f2f11..83e32163ecc 100644 --- a/src/gallium/drivers/swr/swr_shader.cpp +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -40,32 +40,29 @@ #include "swr_state.h" #include "swr_screen.h" -bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs) +bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs) { return !memcmp(&lhs, &rhs, sizeof(lhs)); } -void -swr_generate_fs_key(struct swr_jit_key &key, - struct swr_context *ctx, - swr_fragment_shader *swr_fs) +bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs) { - key.nr_cbufs = ctx->framebuffer.nr_cbufs; - key.light_twoside = ctx->rasterizer->light_twoside; - memcpy(&key.vs_output_semantic_name, - &ctx->vs->info.base.output_semantic_name, - sizeof(key.vs_output_semantic_name)); - memcpy(&key.vs_output_semantic_idx, - &ctx->vs->info.base.output_semantic_index, - sizeof(key.vs_output_semantic_idx)); + return !memcmp(&lhs, &rhs, sizeof(lhs)); +} - key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1; +static void +swr_generate_sampler_key(const struct lp_tgsi_info &info, + struct swr_context *ctx, + unsigned shader_type, + struct swr_jit_sampler_key &key) +{ + key.nr_samplers = info.base.file_max[TGSI_FILE_SAMPLER] + 1; for (unsigned i = 0; i < key.nr_samplers; i++) { - if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { lp_sampler_static_sampler_state( &key.sampler[i].sampler_state, - ctx->samplers[PIPE_SHADER_FRAGMENT][i]); + ctx->samplers[shader_type][i]); } } @@ -74,28 +71,58 @@ swr_generate_fs_key(struct swr_jit_key &key, * are dx10-style? Can't really have mixed opcodes, at least not * if we want to skip the holes here (without rescanning tgsi). */ - if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { + if (info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { key.nr_sampler_views = - swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; for (unsigned i = 0; i < key.nr_sampler_views; i++) { - if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + if (info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { lp_sampler_static_texture_state( &key.sampler[i].texture_state, - ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); + ctx->sampler_views[shader_type][i]); } } } else { key.nr_sampler_views = key.nr_samplers; for (unsigned i = 0; i < key.nr_sampler_views; i++) { - if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + if (info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { lp_sampler_static_texture_state( &key.sampler[i].texture_state, - ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); + ctx->sampler_views[shader_type][i]); } } } } +void +swr_generate_fs_key(struct swr_jit_fs_key &key, + struct swr_context *ctx, + swr_fragment_shader *swr_fs) +{ + memset(&key, 0, sizeof(key)); + + key.nr_cbufs = ctx->framebuffer.nr_cbufs; + key.light_twoside = ctx->rasterizer->light_twoside; + key.flatshade = ctx->rasterizer->flatshade; + memcpy(&key.vs_output_semantic_name, + &ctx->vs->info.base.output_semantic_name, + sizeof(key.vs_output_semantic_name)); + memcpy(&key.vs_output_semantic_idx, + &ctx->vs->info.base.output_semantic_index, + sizeof(key.vs_output_semantic_idx)); + + swr_generate_sampler_key(swr_fs->info, ctx, PIPE_SHADER_FRAGMENT, key); +} + +void +swr_generate_vs_key(struct swr_jit_vs_key &key, + struct swr_context *ctx, + swr_vertex_shader *swr_vs) +{ + memset(&key, 0, sizeof(key)); + + swr_generate_sampler_key(swr_vs->info, ctx, PIPE_SHADER_VERTEX, key); +} + struct BuilderSWR : public Builder { BuilderSWR(JitManager *pJitMgr) : Builder(pJitMgr) @@ -103,14 +130,15 @@ struct BuilderSWR : public Builder { pJitMgr->SetupNewModule(); } - PFN_VERTEX_FUNC - CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs); - PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key); + PFN_VERTEX_FUNC CompileVS(struct swr_context *ctx, swr_jit_vs_key &key); + PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_fs_key &key); }; PFN_VERTEX_FUNC -BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) +BuilderSWR::CompileVS(struct swr_context *ctx, swr_jit_vs_key &key) { + struct swr_vertex_shader *swr_vs = ctx->vs; + swr_vs->linkageMask = 0; for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) { @@ -180,6 +208,9 @@ BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) } } + struct lp_build_sampler_soa *sampler = + swr_sampler_soa_create(key.sampler, PIPE_SHADER_VERTEX); + struct lp_bld_tgsi_system_values system_values; memset(&system_values, 0, sizeof(system_values)); system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); @@ -194,9 +225,9 @@ BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) &system_values, inputs, outputs, - NULL, // wrap(hPrivateData), (sampler context) + wrap(hPrivateData), // (sampler context) NULL, // thread data - NULL, // sampler + sampler, // sampler &swr_vs->info.base, NULL); // geometry shader face @@ -239,11 +270,11 @@ BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) } PFN_VERTEX_FUNC -swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs) +swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key) { BuilderSWR builder( - reinterpret_cast<JitManager *>(swr_screen(ctx->screen)->hJitMgr)); - return builder.CompileVS(ctx, swr_vs); + reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr)); + return builder.CompileVS(ctx, key); } static unsigned @@ -269,7 +300,7 @@ locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) } PFN_PIXEL_KERNEL -BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) +BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_fs_key &key) { struct swr_fragment_shader *swr_fs = ctx->fs; @@ -461,6 +492,9 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) if (interpMode == TGSI_INTERPOLATE_CONSTANT) { inputs[attrib][channel] = wrap(va); + } else if ((interpMode == TGSI_INTERPOLATE_COLOR) && + (key.flatshade == true)) { + inputs[attrib][channel] = wrap(vc); } else { Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), vj); @@ -478,7 +512,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) } } - sampler = swr_sampler_soa_create(key.sampler); + sampler = swr_sampler_soa_create(key.sampler, PIPE_SHADER_FRAGMENT); struct lp_bld_tgsi_system_values system_values; memset(&system_values, 0, sizeof(system_values)); @@ -583,7 +617,7 @@ BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) } PFN_PIXEL_KERNEL -swr_compile_fs(struct swr_context *ctx, swr_jit_key &key) +swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key) { BuilderSWR builder( reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr)); diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h index e22a7c48c2a..3f79570bbd9 100644 --- a/src/gallium/drivers/swr/swr_shader.h +++ b/src/gallium/drivers/swr/swr_shader.h @@ -25,36 +25,56 @@ class swr_vertex_shader; class swr_fragment_shader; -class swr_jit_key; +class swr_jit_fs_key; +class swr_jit_vs_key; PFN_VERTEX_FUNC -swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs); +swr_compile_vs(struct swr_context *ctx, swr_jit_vs_key &key); PFN_PIXEL_KERNEL -swr_compile_fs(struct swr_context *ctx, swr_jit_key &key); +swr_compile_fs(struct swr_context *ctx, swr_jit_fs_key &key); -void swr_generate_fs_key(struct swr_jit_key &key, +void swr_generate_fs_key(struct swr_jit_fs_key &key, struct swr_context *ctx, swr_fragment_shader *swr_fs); -struct swr_jit_key { +void swr_generate_vs_key(struct swr_jit_vs_key &key, + struct swr_context *ctx, + swr_vertex_shader *swr_vs); + +struct swr_jit_sampler_key { + unsigned nr_samplers; + unsigned nr_sampler_views; + struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; +}; + +struct swr_jit_fs_key : swr_jit_sampler_key { unsigned nr_cbufs; unsigned light_twoside; + unsigned flatshade; ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; - unsigned nr_samplers; - unsigned nr_sampler_views; - struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; +}; + +struct swr_jit_vs_key : swr_jit_sampler_key { }; namespace std { -template <> struct hash<swr_jit_key> { - std::size_t operator()(const swr_jit_key &k) const +template <> struct hash<swr_jit_fs_key> { + std::size_t operator()(const swr_jit_fs_key &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; + +template <> struct hash<swr_jit_vs_key> { + std::size_t operator()(const swr_jit_vs_key &k) const { return util_hash_crc32(&k, sizeof(k)); } }; }; -bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs); +bool operator==(const swr_jit_fs_key &lhs, const swr_jit_fs_key &rhs); +bool operator==(const swr_jit_vs_key &lhs, const swr_jit_vs_key &rhs); diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index e7bf3618a7d..ded51a9b196 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -317,8 +317,7 @@ static void * swr_create_vs_state(struct pipe_context *pipe, const struct pipe_shader_state *vs) { - struct swr_vertex_shader *swr_vs = - (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader); + struct swr_vertex_shader *swr_vs = new swr_vertex_shader; if (!swr_vs) return NULL; @@ -327,8 +326,6 @@ swr_create_vs_state(struct pipe_context *pipe, lp_build_tgsi_info(vs->tokens, &swr_vs->info); - swr_vs->func = swr_compile_vs(pipe, swr_vs); - swr_vs->soState = {0}; if (swr_vs->pipe.stream_output.num_outputs) { @@ -368,7 +365,7 @@ swr_delete_vs_state(struct pipe_context *pipe, void *vs) { struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; FREE((void *)swr_vs->pipe.tokens); - FREE(vs); + delete swr_vs; } static void * @@ -675,6 +672,58 @@ swr_update_resource_status(struct pipe_context *pipe, } } +static void +swr_update_texture_state(struct swr_context *ctx, + unsigned shader_type, + unsigned num_sampler_views, + swr_jit_texture *textures) +{ + for (unsigned i = 0; i < num_sampler_views; i++) { + struct pipe_sampler_view *view = + ctx->sampler_views[shader_type][i]; + + if (view) { + struct pipe_resource *res = view->texture; + struct swr_resource *swr_res = swr_resource(res); + struct swr_jit_texture *jit_tex = &textures[i]; + memset(jit_tex, 0, sizeof(*jit_tex)); + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = view->u.tex.first_level; + jit_tex->last_level = view->u.tex.last_level; + jit_tex->base_ptr = swr_res->swr.pBaseAddress; + + for (unsigned level = jit_tex->first_level; + level <= jit_tex->last_level; + level++) { + jit_tex->row_stride[level] = swr_res->row_stride[level]; + jit_tex->img_stride[level] = swr_res->img_stride[level]; + jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; + } + } + } +} + +static void +swr_update_sampler_state(struct swr_context *ctx, + unsigned shader_type, + unsigned num_samplers, + swr_jit_sampler *samplers) +{ + for (unsigned i = 0; i < num_samplers; i++) { + const struct pipe_sampler_state *sampler = + ctx->samplers[shader_type][i]; + + if (sampler) { + samplers[i].min_lod = sampler->min_lod; + samplers[i].max_lod = sampler->max_lod; + samplers[i].lod_bias = sampler->lod_bias; + COPY_4V(samplers[i].border_color, sampler->border_color.f); + } + } +} + void swr_update_derived(struct pipe_context *pipe, const struct pipe_draw_info *p_draw_info) @@ -974,14 +1023,43 @@ swr_update_derived(struct pipe_context *pipe, } /* VertexShader */ - if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_FRAMEBUFFER)) { - SwrSetVertexFunc(ctx->swrContext, ctx->vs->func); + if (ctx->dirty & (SWR_NEW_VS | + SWR_NEW_SAMPLER | + SWR_NEW_SAMPLER_VIEW | + SWR_NEW_FRAMEBUFFER)) { + swr_jit_vs_key key; + swr_generate_vs_key(key, ctx, ctx->vs); + auto search = ctx->vs->map.find(key); + PFN_VERTEX_FUNC func; + if (search != ctx->vs->map.end()) { + func = search->second; + } else { + func = swr_compile_vs(ctx, key); + ctx->vs->map.insert(std::make_pair(key, func)); + } + SwrSetVertexFunc(ctx->swrContext, func); + + /* JIT sampler state */ + if (ctx->dirty & SWR_NEW_SAMPLER) { + swr_update_sampler_state(ctx, + PIPE_SHADER_VERTEX, + key.nr_samplers, + ctx->swrDC.samplersVS); + } + + /* JIT sampler view state */ + if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { + swr_update_texture_state(ctx, + PIPE_SHADER_VERTEX, + key.nr_sampler_views, + ctx->swrDC.texturesVS); + } } - swr_jit_key key; + /* FragmentShader */ if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW | SWR_NEW_RASTERIZER | SWR_NEW_FRAMEBUFFER)) { - memset(&key, 0, sizeof(key)); + swr_jit_fs_key key; swr_generate_fs_key(key, ctx, ctx->fs); auto search = ctx->fs->map.find(key); PFN_PIXEL_KERNEL func; @@ -1031,56 +1109,25 @@ swr_update_derived(struct pipe_context *pipe, psState.usesUAV = false; // XXX psState.forceEarlyZ = false; SwrSetPixelShaderState(ctx->swrContext, &psState); - } - - /* JIT sampler state */ - if (ctx->dirty & SWR_NEW_SAMPLER) { - swr_draw_context *pDC = &ctx->swrDC; - for (unsigned i = 0; i < key.nr_samplers; i++) { - const struct pipe_sampler_state *sampler = - ctx->samplers[PIPE_SHADER_FRAGMENT][i]; - - if (sampler) { - pDC->samplersFS[i].min_lod = sampler->min_lod; - pDC->samplersFS[i].max_lod = sampler->max_lod; - pDC->samplersFS[i].lod_bias = sampler->lod_bias; - COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f); - } + /* JIT sampler state */ + if (ctx->dirty & SWR_NEW_SAMPLER) { + swr_update_sampler_state(ctx, + PIPE_SHADER_FRAGMENT, + key.nr_samplers, + ctx->swrDC.samplersFS); } - } - - /* JIT sampler view state */ - if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { - swr_draw_context *pDC = &ctx->swrDC; - for (unsigned i = 0; i < key.nr_sampler_views; i++) { - struct pipe_sampler_view *view = - ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; - - if (view) { - struct pipe_resource *res = view->texture; - struct swr_resource *swr_res = swr_resource(res); - struct swr_jit_texture *jit_tex = &pDC->texturesFS[i]; - memset(jit_tex, 0, sizeof(*jit_tex)); - jit_tex->width = res->width0; - jit_tex->height = res->height0; - jit_tex->depth = res->depth0; - jit_tex->first_level = view->u.tex.first_level; - jit_tex->last_level = view->u.tex.last_level; - jit_tex->base_ptr = swr_res->swr.pBaseAddress; - - for (unsigned level = jit_tex->first_level; - level <= jit_tex->last_level; - level++) { - jit_tex->row_stride[level] = swr_res->row_stride[level]; - jit_tex->img_stride[level] = swr_res->img_stride[level]; - jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; - } - } + /* JIT sampler view state */ + if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { + swr_update_texture_state(ctx, + PIPE_SHADER_FRAGMENT, + key.nr_sampler_views, + ctx->swrDC.texturesFS); } } + /* VertexShader Constants */ if (ctx->dirty & SWR_NEW_VSCONSTANTS) { swr_draw_context *pDC = &ctx->swrDC; diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h index f0a7ff3b185..32a5441295b 100644 --- a/src/gallium/drivers/swr/swr_state.h +++ b/src/gallium/drivers/swr/swr_state.h @@ -40,9 +40,9 @@ struct swr_vertex_shader { struct pipe_shader_state pipe; struct lp_tgsi_info info; unsigned linkageMask; - PFN_VERTEX_FUNC func; + std::unordered_map<swr_jit_vs_key, PFN_VERTEX_FUNC> map; SWR_STREAMOUT_STATE soState; - PFN_SO_FUNC soFunc[PIPE_PRIM_MAX]; + PFN_SO_FUNC soFunc[PIPE_PRIM_MAX] {0}; }; struct swr_fragment_shader { @@ -50,7 +50,7 @@ struct swr_fragment_shader { struct lp_tgsi_info info; uint32_t constantMask; uint32_t pointSpriteMask; - std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> map; + std::unordered_map<swr_jit_fs_key, PFN_PIXEL_KERNEL> map; }; /* Vertex element state */ diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp index 8e01e32e280..8172c820f22 100644 --- a/src/gallium/drivers/swr/swr_tex_sample.cpp +++ b/src/gallium/drivers/swr/swr_tex_sample.cpp @@ -72,6 +72,8 @@ struct swr_sampler_dynamic_state { struct lp_sampler_dynamic_state base; const struct swr_sampler_static_state *static_state; + + unsigned shader_type; }; @@ -112,7 +114,18 @@ swr_texture_member(const struct lp_sampler_dynamic_state *base, /* context[0] */ indices[0] = lp_build_const_int32(gallivm, 0); /* context[0].textures */ - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); + auto dynamic = (const struct swr_sampler_dynamic_state *)base; + switch (dynamic->shader_type) { + case PIPE_SHADER_FRAGMENT: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); + break; + case PIPE_SHADER_VERTEX: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesVS); + break; + default: + assert(0 && "unsupported shader type"); + break; + } /* context[0].textures[unit] */ indices[2] = lp_build_const_int32(gallivm, texture_unit); /* context[0].textures[unit].member */ @@ -195,7 +208,18 @@ swr_sampler_member(const struct lp_sampler_dynamic_state *base, /* context[0] */ indices[0] = lp_build_const_int32(gallivm, 0); /* context[0].samplers */ - indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); + auto dynamic = (const struct swr_sampler_dynamic_state *)base; + switch (dynamic->shader_type) { + case PIPE_SHADER_FRAGMENT: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); + break; + case PIPE_SHADER_VERTEX: + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersVS); + break; + default: + assert(0 && "unsupported shader type"); + break; + } /* context[0].samplers[unit] */ indices[2] = lp_build_const_int32(gallivm, sampler_unit); /* context[0].samplers[unit].member */ @@ -307,7 +331,8 @@ swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, struct lp_build_sampler_soa * -swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) +swr_sampler_soa_create(const struct swr_sampler_static_state *static_state, + unsigned shader_type) { struct swr_sampler_soa *sampler; @@ -334,5 +359,7 @@ swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) sampler->dynamic_state.static_state = static_state; + sampler->dynamic_state.shader_type = shader_type; + return &sampler->base; } diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h index f5c368c108d..cb7e83d1c39 100644 --- a/src/gallium/drivers/swr/swr_tex_sample.h +++ b/src/gallium/drivers/swr/swr_tex_sample.h @@ -44,4 +44,4 @@ struct swr_sampler_static_state { * */ struct lp_build_sampler_soa * -swr_sampler_soa_create(const struct swr_sampler_static_state *key); +swr_sampler_soa_create(const struct swr_sampler_static_state *key, unsigned shader_type); diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index 08b1d32afb0..b575f2cdb34 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -273,6 +273,24 @@ trace_context_get_query_result(struct pipe_context *_pipe, } +static void +trace_context_set_active_query_state(struct pipe_context *_pipe, + boolean enable) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_active_query_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(bool, enable); + + pipe->set_active_query_state(pipe, enable); + + trace_dump_call_end(); +} + + static void * trace_context_create_blend_state(struct pipe_context *_pipe, const struct pipe_blend_state *state) @@ -1781,6 +1799,7 @@ trace_context_create(struct trace_screen *tr_scr, TR_CTX_INIT(begin_query); TR_CTX_INIT(end_query); TR_CTX_INIT(get_query_result); + TR_CTX_INIT(set_active_query_state); TR_CTX_INIT(create_blend_state); TR_CTX_INIT(bind_blend_state); TR_CTX_INIT(delete_blend_state); diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c index 9b0b540d3fc..68b85737628 100644 --- a/src/gallium/drivers/vc4/vc4_draw.c +++ b/src/gallium/drivers/vc4/vc4_draw.c @@ -32,12 +32,19 @@ #include "vc4_resource.h" static void -vc4_get_draw_cl_space(struct vc4_context *vc4) +vc4_get_draw_cl_space(struct vc4_context *vc4, int vert_count) { + /* The SW-5891 workaround may cause us to emit multiple shader recs + * and draw packets. + */ + int num_draws = DIV_ROUND_UP(vert_count, 65535) + 1; + /* Binner gets our packet state -- vc4_emit.c contents, * and the primitive itself. */ - cl_ensure_space(&vc4->bcl, 256); + cl_ensure_space(&vc4->bcl, + 256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE + + VC4_PACKET_GL_SHADER_STATE_SIZE) * num_draws); /* Nothing for rcl -- that's covered by vc4_context.c */ @@ -45,7 +52,8 @@ vc4_get_draw_cl_space(struct vc4_context *vc4) * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of * vattr stride). */ - cl_ensure_space(&vc4->shader_rec, 12 * sizeof(uint32_t) + 104 + 8 * 32); + cl_ensure_space(&vc4->shader_rec, + (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws); /* Uniforms are covered by vc4_write_uniforms(). */ @@ -61,12 +69,12 @@ vc4_get_draw_cl_space(struct vc4_context *vc4) * Does the initial bining command list setup for drawing to a given FBO. */ static void -vc4_start_draw(struct vc4_context *vc4) +vc4_start_draw(struct vc4_context *vc4, int vert_count) { if (vc4->needs_flush) return; - vc4_get_draw_cl_space(vc4); + vc4_get_draw_cl_space(vc4, 0); struct vc4_cl_out *bcl = cl_start(&vc4->bcl); // Tile state data is 48 bytes per tile, I think it can be thrown away @@ -119,7 +127,8 @@ vc4_update_shadow_textures(struct pipe_context *pctx, } static void -vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info) +vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *info, + uint32_t extra_index_bias) { /* VC4_DIRTY_VTXSTATE */ struct vc4_vertex_stateobj *vtx = vc4->vtx; @@ -170,7 +179,8 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *i /* not vc4->dirty tracked: vc4->last_index_bias */ uint32_t offset = (vb->buffer_offset + elem->src_offset + - vb->stride * info->index_bias); + vb->stride * (info->index_bias + + extra_index_bias)); uint32_t vb_size = rsc->bo->size - offset; uint32_t elem_size = util_format_get_blocksize(elem->src_format); @@ -219,8 +229,9 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4, const struct pipe_draw_info *i &vc4->constbuf[PIPE_SHADER_VERTEX], &vc4->verttex); - vc4->last_index_bias = info->index_bias; + vc4->last_index_bias = info->index_bias + extra_index_bias; vc4->max_index = max_index; + vc4->shader_rec_count++; } /** @@ -275,14 +286,14 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) vc4_hw_2116_workaround(pctx); - vc4_get_draw_cl_space(vc4); + vc4_get_draw_cl_space(vc4, info->count); if (vc4->prim_mode != info->mode) { vc4->prim_mode = info->mode; vc4->dirty |= VC4_DIRTY_PRIM_MODE; } - vc4_start_draw(vc4); + vc4_start_draw(vc4, info->count); vc4_update_compiled_shaders(vc4, info->mode); vc4_emit_state(pctx); @@ -298,7 +309,7 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) vc4->prog.vs->uniform_dirty_bits | vc4->prog.fs->uniform_dirty_bits)) || vc4->last_index_bias != info->index_bias) { - vc4_emit_gl_shader_state(vc4, info); + vc4_emit_gl_shader_state(vc4, info, 0); } vc4->dirty = 0; @@ -342,10 +353,75 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) if (vc4->indexbuf.index_size == 4 || vc4->indexbuf.user_buffer) pipe_resource_reference(&prsc, NULL); } else { - cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); - cl_u8(&bcl, info->mode); - cl_u32(&bcl, info->count); - cl_u32(&bcl, info->start); + uint32_t count = info->count; + uint32_t start = info->start; + uint32_t extra_index_bias = 0; + + while (count) { + uint32_t this_count = count; + uint32_t step = count; + static const uint32_t max_verts = 65535; + + /* GFXH-515 / SW-5891: The binner emits 16 bit indices + * for drawarrays, which means that if start + count > + * 64k it would truncate the top bits. Work around + * this by emitting a limited number of primitives at + * a time and reemitting the shader state pointing + * farther down the vertex attribute arrays. + * + * To do this properly for line loops or trifans, we'd + * need to make a new VB containing the first vertex + * plus whatever remainder. + */ + if (extra_index_bias) { + cl_end(&vc4->bcl, bcl); + vc4_emit_gl_shader_state(vc4, info, + extra_index_bias); + bcl = cl_start(&vc4->bcl); + } + + if (start + count > max_verts) { + switch (info->mode) { + case PIPE_PRIM_POINTS: + this_count = step = max_verts; + break; + case PIPE_PRIM_LINES: + this_count = step = max_verts - (max_verts % 2); + break; + case PIPE_PRIM_LINE_STRIP: + this_count = max_verts; + step = max_verts - 1; + break; + case PIPE_PRIM_LINE_LOOP: + this_count = max_verts; + step = max_verts - 1; + debug_warn_once("unhandled line loop " + "looping behavior with " + ">65535 verts\n"); + break; + case PIPE_PRIM_TRIANGLES: + this_count = step = max_verts - (max_verts % 3); + break; + case PIPE_PRIM_TRIANGLE_STRIP: + this_count = max_verts; + step = max_verts - 2; + break; + default: + debug_warn_once("unhandled primitive " + "max vert count, truncating\n"); + this_count = step = max_verts; + } + } + + cl_u8(&bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE); + cl_u8(&bcl, info->mode); + cl_u32(&bcl, this_count); + cl_u32(&bcl, start); + + count -= step; + extra_index_bias += start + step; + start = 0; + } } cl_end(&vc4->bcl, bcl); @@ -356,8 +432,6 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info) vc4->resolve |= PIPE_CLEAR_STENCIL; vc4->resolve |= PIPE_CLEAR_COLOR0; - vc4->shader_rec_count++; - if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH) vc4_flush(pctx); } @@ -410,7 +484,7 @@ vc4_clear(struct pipe_context *pctx, unsigned buffers, vc4->cleared |= buffers; vc4->resolve |= buffers; - vc4_start_draw(vc4); + vc4_start_draw(vc4, 0); } static void diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index 49a314cdb25..cf6d2896f7d 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -710,9 +710,9 @@ vc4_nir_lower_blend_block(nir_block *block, void *state) } void -vc4_nir_lower_blend(struct vc4_compile *c) +vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c) { - nir_foreach_function(c->s, function) { + nir_foreach_function(s, function) { if (function->impl) { nir_foreach_block(function->impl, vc4_nir_lower_blend_block, c); diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c index d08ad588e5b..22c602adb54 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -380,24 +380,14 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, intr_comp->num_components = 1; nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); - /* Convert the uniform (not user_clip_plane) offset to bytes. - * If it happens to be a constant, constant-folding will clean - * up the shift for us. + /* Convert the uniform offset to bytes. If it happens to be a + * constant, constant-folding will clean up the shift for us. */ - if (intr->intrinsic == nir_intrinsic_load_uniform) { - /* Convert the base offset to bytes and add the - * component - */ - intr_comp->const_index[0] = (intr->const_index[0] * 16 + i * 4); - - intr_comp->src[0] = - nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, - nir_imm_int(b, 4))); - } else { - assert(intr->intrinsic == - nir_intrinsic_load_user_clip_plane); - intr_comp->const_index[0] = intr->const_index[0] * 4 + i; - } + intr_comp->const_index[0] = (intr->const_index[0] * 16 + i * 4); + + intr_comp->src[0] = + nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa, + nir_imm_int(b, 4))); dests[i] = &intr_comp->dest.ssa; @@ -428,10 +418,10 @@ vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b, break; case nir_intrinsic_load_uniform: - case nir_intrinsic_load_user_clip_plane: vc4_nir_lower_uniform(c, b, intr); break; + case nir_intrinsic_load_user_clip_plane: default: break; } @@ -465,9 +455,9 @@ vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl) } void -vc4_nir_lower_io(struct vc4_compile *c) +vc4_nir_lower_io(nir_shader *s, struct vc4_compile *c) { - nir_foreach_function(c->s, function) { + nir_foreach_function(s, function) { if (function->impl) vc4_nir_lower_io_impl(c, function->impl); } diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c index 8b65cac5084..6b8830743eb 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c @@ -162,9 +162,9 @@ vc4_nir_lower_txf_ms_impl(struct vc4_compile *c, nir_function_impl *impl) } void -vc4_nir_lower_txf_ms(struct vc4_compile *c) +vc4_nir_lower_txf_ms(nir_shader *s, struct vc4_compile *c) { - nir_foreach_function(c->s, function) { + nir_foreach_function(s, function) { if (function->impl) vc4_nir_lower_txf_ms_impl(c, function->impl); } diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c index d15b0c1a39f..d31b673bd63 100644 --- a/src/gallium/drivers/vc4/vc4_opt_vpm.c +++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c @@ -65,7 +65,7 @@ qir_opt_vpm(struct vc4_compile *c) * result, try to move the instruction up in place of the VPM read. */ list_for_each_entry(struct qinst, inst, &c->instructions, link) { - if (!inst || qir_is_multi_instruction(inst)) + if (!inst) continue; if (qir_depends_on_flags(inst) || inst->sf) @@ -132,7 +132,7 @@ qir_opt_vpm(struct vc4_compile *c) continue; struct qinst *inst = c->defs[temp]; - if (!inst || qir_is_multi_instruction(inst)) + if (!inst) continue; if (qir_depends_on_flags(inst) || inst->sf) diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 71a1ebbb313..eccc7ab413f 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -30,7 +30,6 @@ #include "util/ralloc.h" #include "util/hash_table.h" #include "tgsi/tgsi_dump.h" -#include "tgsi/tgsi_lowering.h" #include "tgsi/tgsi_parse.h" #include "compiler/nir/nir.h" #include "compiler/nir/nir_builder.h" @@ -638,8 +637,8 @@ emit_vertex_input(struct vc4_compile *c, int attr) c->vattr_sizes[attr] = align(attr_size, 4); for (int i = 0; i < align(attr_size, 4) / 4; i++) { - struct qreg vpm = { QFILE_VPM, attr * 4 + i }; - c->inputs[attr * 4 + i] = qir_MOV(c, vpm); + c->inputs[attr * 4 + i] = + qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i)); c->num_inputs++; } } @@ -647,8 +646,8 @@ emit_vertex_input(struct vc4_compile *c, int attr) static void emit_fragcoord_input(struct vc4_compile *c, int attr) { - c->inputs[attr * 4 + 0] = qir_FRAG_X(c); - c->inputs[attr * 4 + 1] = qir_FRAG_Y(c); + c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0)); + c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0)); c->inputs[attr * 4 + 2] = qir_FMUL(c, qir_ITOF(c, qir_FRAG_Z(c)), @@ -1193,12 +1192,15 @@ emit_frag_end(struct vc4_compile *c) } if (c->fs_key->stencil_enabled) { - qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 0)); + qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), + qir_uniform(c, QUNIFORM_STENCIL, 0)); if (c->fs_key->stencil_twoside) { - qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 1)); + qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), + qir_uniform(c, QUNIFORM_STENCIL, 1)); } if (c->fs_key->stencil_full_writemasks) { - qir_TLB_STENCIL_SETUP(c, qir_uniform(c, QUNIFORM_STENCIL, 2)); + qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0), + qir_uniform(c, QUNIFORM_STENCIL, 2)); } } @@ -1207,24 +1209,24 @@ emit_frag_end(struct vc4_compile *c) } if (c->fs_key->depth_enabled) { - struct qreg z; if (c->output_position_index != -1) { - z = qir_FTOI(c, qir_FMUL(c, c->outputs[c->output_position_index + 2], - qir_uniform_f(c, 0xffffff))); + qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0), + qir_FMUL(c, + c->outputs[c->output_position_index + 2], + qir_uniform_f(c, 0xffffff)))->cond = discard_cond; } else { - z = qir_FRAG_Z(c); + qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0), + qir_FRAG_Z(c))->cond = discard_cond; } - struct qinst *inst = qir_TLB_Z_WRITE(c, z); - inst->cond = discard_cond; } if (!c->msaa_per_sample_output) { - struct qinst *inst = qir_TLB_COLOR_WRITE(c, color); - inst->cond = discard_cond; + qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0), + color)->cond = discard_cond; } else { for (int i = 0; i < VC4_MAX_SAMPLES; i++) { - struct qinst *inst = qir_TLB_COLOR_WRITE_MS(c, c->sample_colors[i]); - inst->cond = discard_cond; + qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0), + c->sample_colors[i])->cond = discard_cond; } } } @@ -1304,8 +1306,7 @@ emit_stub_vpm_read(struct vc4_compile *c) return; c->vattr_sizes[0] = 4; - struct qreg vpm = { QFILE_VPM, 0 }; - (void)qir_MOV(c, vpm); + (void)qir_MOV(c, qir_reg(QFILE_VPM, 0)); c->num_inputs++; } @@ -1371,16 +1372,16 @@ vc4_optimize_nir(struct nir_shader *s) do { progress = false; - nir_lower_vars_to_ssa(s); - nir_lower_alu_to_scalar(s); + NIR_PASS_V(s, nir_lower_vars_to_ssa); + NIR_PASS_V(s, nir_lower_alu_to_scalar); - progress = nir_copy_prop(s) || progress; - progress = nir_opt_dce(s) || progress; - progress = nir_opt_cse(s) || progress; - progress = nir_opt_peephole_select(s) || progress; - progress = nir_opt_algebraic(s) || progress; - progress = nir_opt_constant_folding(s) || progress; - progress = nir_opt_undef(s) || progress; + NIR_PASS(progress, s, nir_copy_prop); + NIR_PASS(progress, s, nir_opt_dce); + NIR_PASS(progress, s, nir_opt_cse); + NIR_PASS(progress, s, nir_opt_peephole_select); + NIR_PASS(progress, s, nir_opt_algebraic); + NIR_PASS(progress, s, nir_opt_constant_folding); + NIR_PASS(progress, s, nir_opt_undef); } while (progress); } @@ -1427,7 +1428,9 @@ ntq_setup_inputs(struct vc4_compile *c) if (var->data.location == VARYING_SLOT_POS) { emit_fragcoord_input(c, loc); } else if (var->data.location == VARYING_SLOT_FACE) { - c->inputs[loc * 4 + 0] = qir_FRAG_REV_FLAG(c); + c->inputs[loc * 4 + 0] = + qir_ITOF(c, qir_reg(QFILE_FRAG_REV_FLAG, + 0)); } else if (var->data.location >= VARYING_SLOT_VAR0 && (c->fs_key->point_sprite_mask & (1 << (var->data.location - @@ -1573,8 +1576,10 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_user_clip_plane: - *dest = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE, - instr->const_index[0]); + for (int i = 0; i < instr->num_components; i++) { + dest[i] = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE, + instr->const_index[0] * 4 + i); + } break; case nir_intrinsic_load_sample_mask_in: @@ -1694,12 +1699,27 @@ ntq_emit_block(struct vc4_compile *c, nir_block *block) } } +static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); + +static void +ntq_emit_loop(struct vc4_compile *c, nir_loop *nloop) +{ + fprintf(stderr, "LOOPS not fully handled. Rendering errors likely.\n"); + ntq_emit_cf_list(c, &nloop->body); +} + +static void +ntq_emit_function(struct vc4_compile *c, nir_function_impl *func) +{ + fprintf(stderr, "FUNCTIONS not handled.\n"); + abort(); +} + static void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list) { foreach_list_typed(nir_cf_node, node, node, list) { switch (node->type) { - /* case nir_cf_node_loop: */ case nir_cf_node_block: ntq_emit_block(c, nir_cf_node_as_block(node)); break; @@ -1708,8 +1728,17 @@ ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list) ntq_emit_if(c, nir_cf_node_as_if(node)); break; + case nir_cf_node_loop: + ntq_emit_loop(c, nir_cf_node_as_loop(node)); + break; + + case nir_cf_node_function: + ntq_emit_function(c, nir_cf_node_as_function(node)); + break; + default: - assert(0); + fprintf(stderr, "Unknown NIR node type\n"); + abort(); } } } @@ -1810,11 +1839,11 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, } c->s = tgsi_to_nir(tokens, &nir_options); - nir_opt_global_to_local(c->s); - nir_convert_to_ssa(c->s); + NIR_PASS_V(c->s, nir_opt_global_to_local); + NIR_PASS_V(c->s, nir_convert_to_ssa); if (stage == QSTAGE_FRAG) - vc4_nir_lower_blend(c); + NIR_PASS_V(c->s, vc4_nir_lower_blend, c); struct nir_lower_tex_options tex_options = { /* We would need to implement txs, but we don't want the @@ -1864,26 +1893,25 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, } } - nir_lower_tex(c->s, &tex_options); + NIR_PASS_V(c->s, nir_lower_tex, &tex_options); if (c->fs_key && c->fs_key->light_twoside) - nir_lower_two_sided_color(c->s); + NIR_PASS_V(c->s, nir_lower_two_sided_color); if (stage == QSTAGE_FRAG) - nir_lower_clip_fs(c->s, c->key->ucp_enables); + NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables); else - nir_lower_clip_vs(c->s, c->key->ucp_enables); + NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables); - vc4_nir_lower_io(c); - vc4_nir_lower_txf_ms(c); - nir_lower_idiv(c->s); - nir_lower_load_const_to_scalar(c->s); + NIR_PASS_V(c->s, vc4_nir_lower_io, c); + NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c); + NIR_PASS_V(c->s, nir_lower_idiv); + NIR_PASS_V(c->s, nir_lower_load_const_to_scalar); vc4_optimize_nir(c->s); - nir_remove_dead_variables(c->s); - - nir_convert_from_ssa(c->s, true); + NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_local); + NIR_PASS_V(c->s, nir_convert_from_ssa, true); if (vc4_debug & VC4_DEBUG_SHADERDB) { fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index e73e3899410..293eb01adab 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -31,7 +31,6 @@ struct qir_op_info { const char *name; uint8_t ndst, nsrc; bool has_side_effects; - bool multi_instruction; }; static const struct qir_op_info qir_op_info[] = { @@ -65,23 +64,16 @@ static const struct qir_op_info qir_op_info[] = { [QOP_XOR] = { "xor", 1, 2 }, [QOP_NOT] = { "not", 1, 1 }, - [QOP_RCP] = { "rcp", 1, 1, false, true }, - [QOP_RSQ] = { "rsq", 1, 1, false, true }, - [QOP_EXP2] = { "exp2", 1, 2, false, true }, - [QOP_LOG2] = { "log2", 1, 2, false, true }, - [QOP_TLB_STENCIL_SETUP] = { "tlb_stencil_setup", 0, 1, true }, - [QOP_TLB_Z_WRITE] = { "tlb_z", 0, 1, true }, - [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1, true }, - [QOP_TLB_COLOR_WRITE_MS] = { "tlb_color_ms", 0, 1, true }, + [QOP_RCP] = { "rcp", 1, 1 }, + [QOP_RSQ] = { "rsq", 1, 1 }, + [QOP_EXP2] = { "exp2", 1, 2 }, + [QOP_LOG2] = { "log2", 1, 2 }, [QOP_TLB_COLOR_READ] = { "tlb_color_read", 1, 0 }, [QOP_MS_MASK] = { "ms_mask", 0, 1, true }, [QOP_VARY_ADD_C] = { "vary_add_c", 1, 1 }, - [QOP_FRAG_X] = { "frag_x", 1, 0 }, - [QOP_FRAG_Y] = { "frag_y", 1, 0 }, [QOP_FRAG_Z] = { "frag_z", 1, 0 }, [QOP_FRAG_W] = { "frag_w", 1, 0 }, - [QOP_FRAG_REV_FLAG] = { "frag_rev_flag", 1, 0 }, [QOP_TEX_S] = { "tex_s", 0, 2 }, [QOP_TEX_T] = { "tex_t", 0, 2 }, @@ -116,6 +108,16 @@ qir_get_op_nsrc(enum qop qop) bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst) { + switch (inst->dst.file) { + case QFILE_TLB_Z_WRITE: + case QFILE_TLB_COLOR_WRITE: + case QFILE_TLB_COLOR_WRITE_MS: + case QFILE_TLB_STENCIL_SETUP: + return true; + default: + break; + } + return qir_op_info[inst->op].has_side_effects; } @@ -144,12 +146,6 @@ qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst) } bool -qir_is_multi_instruction(struct qinst *inst) -{ - return qir_op_info[inst->op].multi_instruction; -} - -bool qir_is_mul(struct qinst *inst) { switch (inst->op) { @@ -233,24 +229,47 @@ qir_print_reg(struct vc4_compile *c, struct qreg reg, bool write) [QFILE_TEMP] = "t", [QFILE_VARY] = "v", [QFILE_UNIF] = "u", + [QFILE_TLB_COLOR_WRITE] = "tlb_c", + [QFILE_TLB_COLOR_WRITE_MS] = "tlb_c_ms", + [QFILE_TLB_Z_WRITE] = "tlb_z", + [QFILE_TLB_STENCIL_SETUP] = "tlb_stencil", + [QFILE_FRAG_X] = "frag_x", + [QFILE_FRAG_Y] = "frag_y", + [QFILE_FRAG_REV_FLAG] = "frag_rev_flag", }; - if (reg.file == QFILE_NULL) { + switch (reg.file) { + + case QFILE_NULL: fprintf(stderr, "null"); - } else if (reg.file == QFILE_SMALL_IMM) { + break; + + case QFILE_SMALL_IMM: if ((int)reg.index >= -16 && (int)reg.index <= 15) fprintf(stderr, "%d", reg.index); else fprintf(stderr, "%f", uif(reg.index)); - } else if (reg.file == QFILE_VPM) { + break; + + case QFILE_VPM: if (write) { fprintf(stderr, "vpm"); } else { fprintf(stderr, "vpm%d.%d", reg.index / 4, reg.index % 4); } - } else { + break; + + case QFILE_TLB_COLOR_WRITE: + case QFILE_TLB_COLOR_WRITE_MS: + case QFILE_TLB_Z_WRITE: + case QFILE_TLB_STENCIL_SETUP: + fprintf(stderr, "%s", files[reg.file]); + break; + + default: fprintf(stderr, "%s%d", files[reg.file], reg.index); + break; } if (reg.file == QFILE_UNIF && @@ -455,12 +474,11 @@ qir_uniform(struct vc4_compile *c, for (int i = 0; i < c->num_uniforms; i++) { if (c->uniform_contents[i] == contents && c->uniform_data[i] == data) { - return (struct qreg) { QFILE_UNIF, i }; + return qir_reg(QFILE_UNIF, i); } } uint32_t uniform = c->num_uniforms++; - struct qreg u = { QFILE_UNIF, uniform }; if (uniform >= c->uniform_array_size) { c->uniform_array_size = MAX2(MAX2(16, uniform + 1), @@ -477,7 +495,7 @@ qir_uniform(struct vc4_compile *c, c->uniform_contents[uniform] = contents; c->uniform_data[uniform] = data; - return u; + return qir_reg(QFILE_UNIF, uniform); } void @@ -492,10 +510,8 @@ qir_SF(struct vc4_compile *c, struct qreg src) if (src.file != QFILE_TEMP || !c->defs[src.index] || - last_inst != c->defs[src.index] || - qir_is_multi_instruction(last_inst)) { - struct qreg null = { QFILE_NULL, 0 }; - last_inst = qir_MOV_dest(c, null, src); + last_inst != c->defs[src.index]) { + last_inst = qir_MOV_dest(c, qir_reg(QFILE_NULL, 0), src); last_inst = (struct qinst *)c->instructions.prev; } last_inst->sf = true; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 3fbf5d749e7..e8ba74b9a4d 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -49,6 +49,17 @@ enum qfile { QFILE_VARY, QFILE_UNIF, QFILE_VPM, + QFILE_TLB_COLOR_WRITE, + QFILE_TLB_COLOR_WRITE_MS, + QFILE_TLB_Z_WRITE, + QFILE_TLB_STENCIL_SETUP, + + /* Payload registers that aren't in the physical register file, so we + * can just use the corresponding qpu_reg at qpu_emit time. + */ + QFILE_FRAG_X, + QFILE_FRAG_Y, + QFILE_FRAG_REV_FLAG, /** * Stores an immediate value in the index field that can be turned @@ -63,6 +74,11 @@ struct qreg { int pack; }; +static inline struct qreg qir_reg(enum qfile file, uint32_t index) +{ + return (struct qreg){file, index}; +} + enum qop { QOP_UNDEF, QOP_MOV, @@ -101,19 +117,12 @@ enum qop { QOP_LOG2, QOP_VW_SETUP, QOP_VR_SETUP, - QOP_TLB_STENCIL_SETUP, - QOP_TLB_Z_WRITE, - QOP_TLB_COLOR_WRITE, - QOP_TLB_COLOR_WRITE_MS, QOP_TLB_COLOR_READ, QOP_MS_MASK, QOP_VARY_ADD_C, - QOP_FRAG_X, - QOP_FRAG_Y, QOP_FRAG_Z, QOP_FRAG_W, - QOP_FRAG_REV_FLAG, /** Texture x coordinate parameter write */ QOP_TEX_S, @@ -463,7 +472,6 @@ int qir_get_op_nsrc(enum qop qop); bool qir_reg_equals(struct qreg a, struct qreg b); bool qir_has_side_effects(struct vc4_compile *c, struct qinst *inst); bool qir_has_side_effect_reads(struct vc4_compile *c, struct qinst *inst); -bool qir_is_multi_instruction(struct qinst *inst); bool qir_is_mul(struct qinst *inst); bool qir_is_raw_mov(struct qinst *inst); bool qir_is_tex(struct qinst *inst); @@ -484,13 +492,13 @@ bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); bool qir_opt_vpm(struct vc4_compile *c); -void vc4_nir_lower_blend(struct vc4_compile *c); -void vc4_nir_lower_io(struct vc4_compile *c); +void vc4_nir_lower_blend(nir_shader *s, struct vc4_compile *c); +void vc4_nir_lower_io(nir_shader *s, struct vc4_compile *c); nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, enum quniform_contents contents); nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, nir_ssa_def **srcs, int swiz); -void vc4_nir_lower_txf_ms(struct vc4_compile *c); +void vc4_nir_lower_txf_ms(nir_shader *s, struct vc4_compile *c); void qir_lower_uniforms(struct vc4_compile *c); uint32_t qpu_schedule_instructions(struct vc4_compile *c); @@ -618,17 +626,10 @@ QIR_NODST_2(TEX_T) QIR_NODST_2(TEX_R) QIR_NODST_2(TEX_B) QIR_NODST_2(TEX_DIRECT) -QIR_ALU0(FRAG_X) -QIR_ALU0(FRAG_Y) QIR_ALU0(FRAG_Z) QIR_ALU0(FRAG_W) -QIR_ALU0(FRAG_REV_FLAG) QIR_ALU0(TEX_RESULT) QIR_ALU0(TLB_COLOR_READ) -QIR_NODST_1(TLB_COLOR_WRITE) -QIR_NODST_1(TLB_COLOR_WRITE_MS) -QIR_NODST_1(TLB_Z_WRITE) -QIR_NODST_1(TLB_STENCIL_SETUP) QIR_NODST_1(MS_MASK) static inline struct qreg @@ -703,8 +704,7 @@ qir_POW(struct vc4_compile *c, struct qreg x, struct qreg y) static inline void qir_VPM_WRITE(struct vc4_compile *c, struct qreg val) { - static const struct qreg vpm = { QFILE_VPM, 0 }; - qir_emit(c, qir_inst(QOP_MOV, vpm, val, c->undef)); + qir_MOV_dest(c, qir_reg(QFILE_VPM, 0), val); } #endif /* VC4_QIR_H */ diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c index a57e100593c..927268d71ef 100644 --- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c +++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c @@ -150,7 +150,7 @@ qir_lower_uniforms(struct vc4_compile *c) * reference a temp instead. */ struct qreg temp = qir_get_temp(c); - struct qreg unif = { QFILE_UNIF, max_index }; + struct qreg unif = qir_reg(QFILE_UNIF, max_index); struct qinst *mov = qir_inst(QOP_MOV, temp, unif, c->undef); list_add(&mov->link, &c->instructions); c->defs[temp.index] = mov; diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c index 186e81be750..8b843a3a158 100644 --- a/src/gallium/drivers/vc4/vc4_qir_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -228,10 +228,7 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) add_write_dep(dir, &state->last_tex_result, n); break; - case QOP_TLB_COLOR_WRITE: case QOP_TLB_COLOR_READ: - case QOP_TLB_Z_WRITE: - case QOP_TLB_STENCIL_SETUP: case QOP_MS_MASK: add_write_dep(dir, &state->last_tlb, n); break; @@ -240,10 +237,25 @@ calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) break; } - if (inst->dst.file == QFILE_VPM) + switch (inst->dst.file) { + case QFILE_VPM: add_write_dep(dir, &state->last_vpm_write, n); - else if (inst->dst.file == QFILE_TEMP) + break; + + case QFILE_TEMP: add_write_dep(dir, &state->last_temp_write[inst->dst.index], n); + break; + + case QFILE_TLB_COLOR_WRITE: + case QFILE_TLB_COLOR_WRITE_MS: + case QFILE_TLB_Z_WRITE: + case QFILE_TLB_STENCIL_SETUP: + add_write_dep(dir, &state->last_tlb, n); + break; + + default: + break; + } if (qir_depends_on_flags(inst)) add_dep(dir, state->last_sf, n); @@ -357,11 +369,13 @@ get_register_pressure_cost(struct schedule_state *state, struct qinst *inst) static bool locks_scoreboard(struct qinst *inst) { - switch (inst->op) { - case QOP_TLB_Z_WRITE: - case QOP_TLB_COLOR_WRITE: - case QOP_TLB_COLOR_WRITE_MS: - case QOP_TLB_COLOR_READ: + if (inst->op == QOP_TLB_COLOR_READ) + return true; + + switch (inst->dst.file) { + case QFILE_TLB_Z_WRITE: + case QFILE_TLB_COLOR_WRITE: + case QFILE_TLB_COLOR_WRITE_MS: return true; default: return false; diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index b507e370683..ae3590854b2 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -167,6 +167,16 @@ set_last_dst_pack(struct vc4_compile *c, struct qinst *inst) } } +static void +handle_r4_qpu_write(struct vc4_compile *c, struct qinst *qinst, + struct qpu_reg dst) +{ + if (dst.mux != QPU_MUX_R4) + queue(c, qpu_a_MOV(dst, qpu_r4())); + else if (qinst->sf) + queue(c, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4())); +} + void vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) { @@ -290,6 +300,22 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) last_vpm_read_index = qinst->src[i].index; src[i] = qpu_ra(QPU_R_VPM); break; + + case QFILE_FRAG_X: + src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD); + break; + case QFILE_FRAG_Y: + src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD); + break; + case QFILE_FRAG_REV_FLAG: + src[i] = qpu_rb(QPU_R_MS_REV_FLAGS); + break; + + case QFILE_TLB_COLOR_WRITE: + case QFILE_TLB_COLOR_WRITE_MS: + case QFILE_TLB_Z_WRITE: + case QFILE_TLB_STENCIL_SETUP: + unreachable("bad qir src file"); } } @@ -304,9 +330,29 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QFILE_VPM: dst = qpu_ra(QPU_W_VPM); break; + + case QFILE_TLB_COLOR_WRITE: + dst = qpu_tlbc(); + break; + + case QFILE_TLB_COLOR_WRITE_MS: + dst = qpu_tlbc_ms(); + break; + + case QFILE_TLB_Z_WRITE: + dst = qpu_ra(QPU_W_TLB_Z); + break; + + case QFILE_TLB_STENCIL_SETUP: + dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP); + break; + case QFILE_VARY: case QFILE_UNIF: case QFILE_SMALL_IMM: + case QFILE_FRAG_X: + case QFILE_FRAG_Y: + case QFILE_FRAG_REV_FLAG: assert(!"not reached"); break; } @@ -339,24 +385,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) abort(); } - if (dst.mux != QPU_MUX_R4) - queue(c, qpu_a_MOV(dst, qpu_r4())); - - break; + handle_r4_qpu_write(c, qinst, dst); - case QOP_FRAG_X: - queue(c, qpu_a_ITOF(dst, - qpu_ra(QPU_R_XY_PIXEL_COORD))); - break; - - case QOP_FRAG_Y: - queue(c, qpu_a_ITOF(dst, - qpu_rb(QPU_R_XY_PIXEL_COORD))); - break; - - case QOP_FRAG_REV_FLAG: - queue(c, qpu_a_ITOF(dst, - qpu_rb(QPU_R_MS_REV_FLAGS))); break; case QOP_MS_MASK: @@ -374,38 +404,11 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) */ break; - case QOP_TLB_STENCIL_SETUP: - assert(!unpack); - queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_STENCIL_SETUP), - src[0]) | unpack); - break; - - case QOP_TLB_Z_WRITE: - queue(c, qpu_a_MOV(qpu_ra(QPU_W_TLB_Z), - src[0]) | unpack); - set_last_cond_add(c, qinst->cond); - handled_qinst_cond = true; - break; - case QOP_TLB_COLOR_READ: queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_COLOR_LOAD); - - if (dst.mux != QPU_MUX_R4) - queue(c, qpu_a_MOV(dst, qpu_r4())); - break; - - case QOP_TLB_COLOR_WRITE: - queue(c, qpu_a_MOV(qpu_tlbc(), src[0]) | unpack); - set_last_cond_add(c, qinst->cond); - handled_qinst_cond = true; - break; - - case QOP_TLB_COLOR_WRITE_MS: - queue(c, qpu_a_MOV(qpu_tlbc_ms(), src[0])); - set_last_cond_add(c, qinst->cond); - handled_qinst_cond = true; + handle_r4_qpu_write(c, qinst, dst); break; case QOP_VARY_ADD_C: @@ -432,8 +435,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) queue(c, qpu_NOP()); *last_inst(c) = qpu_set_sig(*last_inst(c), QPU_SIG_LOAD_TMU0); - if (dst.mux != QPU_MUX_R4) - queue(c, qpu_a_MOV(dst, qpu_r4())); + handle_r4_qpu_write(c, qinst, dst); break; default: @@ -476,10 +478,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) assert(qinst->cond == QPU_COND_ALWAYS || handled_qinst_cond); - if (qinst->sf) { - assert(!qir_is_multi_instruction(qinst)); + if (qinst->sf) *last_inst(c) |= QPU_SF; - } } uint32_t cycles = qpu_schedule_instructions(c); diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c index 270832eae3a..17400a37ca3 100644 --- a/src/gallium/drivers/vc4/vc4_query.c +++ b/src/gallium/drivers/vc4/vc4_query.c @@ -72,6 +72,11 @@ vc4_get_query_result(struct pipe_context *ctx, struct pipe_query *query, return true; } +static void +vc4_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void vc4_query_init(struct pipe_context *pctx) { @@ -80,5 +85,6 @@ vc4_query_init(struct pipe_context *pctx) pctx->begin_query = vc4_begin_query; pctx->end_query = vc4_end_query; pctx->get_query_result = vc4_get_query_result; + pctx->set_active_query_state = vc4_set_active_query_state; } diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index 167a2f5bd8e..1da4db2ebb7 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -208,6 +208,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; /* Stream output. */ diff --git a/src/gallium/drivers/virgl/virgl_query.c b/src/gallium/drivers/virgl/virgl_query.c index b0200556342..5173bd39a45 100644 --- a/src/gallium/drivers/virgl/virgl_query.c +++ b/src/gallium/drivers/virgl/virgl_query.c @@ -164,6 +164,11 @@ static boolean virgl_get_query_result(struct pipe_context *ctx, return TRUE; } +static void +virgl_set_active_query_state(struct pipe_context *pipe, boolean enable) +{ +} + void virgl_init_query_functions(struct virgl_context *vctx) { vctx->base.render_condition = virgl_render_condition; @@ -172,4 +177,5 @@ void virgl_init_query_functions(struct virgl_context *vctx) vctx->base.begin_query = virgl_begin_query; vctx->base.end_query = virgl_end_query; vctx->base.get_query_result = virgl_get_query_result; + vctx->base.set_active_query_state = virgl_set_active_query_state; } diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index 5a5afc1712f..14c91105a04 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -240,6 +240,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_PCI_DEVICE: case PIPE_CAP_PCI_FUNCTION: case PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT: + case PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR: return 0; case PIPE_CAP_VENDOR_ID: return 0x1af4; diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index 1c97e82ece5..82efaf5d8a9 100644 --- a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -173,6 +173,12 @@ struct pipe_context { struct pipe_resource *resource, unsigned offset); + /** + * Set whether all current non-driver queries except TIME_ELAPSED are + * active or paused. + */ + void (*set_active_query_state)(struct pipe_context *pipe, boolean enable); + /*@}*/ /** diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 5e204a3e5ea..1aef21d6292 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -691,6 +691,7 @@ enum pipe_cap PIPE_CAP_PCI_DEVICE, PIPE_CAP_PCI_FUNCTION, PIPE_CAP_FRAMEBUFFER_NO_ATTACHMENT, + PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp index 4d11c2477c7..fb757886381 100644 --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp @@ -322,6 +322,15 @@ namespace { // list of kernel functions to the internalizer. The internalizer will // treat the functions in the list as "main" functions and internalize // all of the other functions. +#if HAVE_LLVM >= 0x0309 + auto preserve_kernels = [=](const llvm::GlobalValue &GV) { + for (const auto &kernel : kernels) { + if (GV.getName() == kernel->getName()) + return true; + } + return false; + }; +#else std::vector<const char*> export_list; for (std::vector<llvm::Function *>::const_iterator I = kernels.begin(), E = kernels.end(); @@ -329,12 +338,17 @@ namespace { llvm::Function *kernel = *I; export_list.push_back(kernel->getName().data()); } +#endif #if HAVE_LLVM < 0x0306 PM.add(new llvm::DataLayoutPass(mod)); #elif HAVE_LLVM < 0x0307 PM.add(new llvm::DataLayoutPass()); #endif +#if HAVE_LLVM >= 0x0309 + PM.add(llvm::createInternalizePass(preserve_kernels)); +#else PM.add(llvm::createInternalizePass(export_list)); +#endif llvm::PassManagerBuilder PMB; PMB.OptLevel = optimization_level; diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index b25c381d968..25d587af46f 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -283,6 +283,10 @@ vlVaDestroyContext(VADriverContextP ctx, VAContextID context_id) drv = VL_VA_DRIVER(ctx); pipe_mutex_lock(drv->mutex); context = handle_table_get(drv->htab, context_id); + if (!context) { + pipe_mutex_unlock(drv->mutex); + return VA_STATUS_ERROR_INVALID_CONTEXT; + } if (context->decoder) { if (u_reduce_video_profile(context->decoder->profile) == diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index 2c42a985823..92d014c3d44 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -280,6 +280,7 @@ vlVaDestroyImage(VADriverContextP ctx, VAImageID image) { vlVaDriver *drv; VAImage *vaimage; + VAStatus status; if (!ctx) return VA_STATUS_ERROR_INVALID_CONTEXT; @@ -294,8 +295,9 @@ vlVaDestroyImage(VADriverContextP ctx, VAImageID image) handle_table_remove(VL_VA_DRIVER(ctx)->htab, image); pipe_mutex_unlock(drv->mutex); + status = vlVaDestroyBuffer(ctx, vaimage->buf); FREE(vaimage); - return vlVaDestroyBuffer(ctx, vaimage->buf); + return status; } VAStatus diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript index e1c78dd06a0..1c816ff7762 100644 --- a/src/gallium/targets/libgl-xlib/SConscript +++ b/src/gallium/targets/libgl-xlib/SConscript @@ -48,11 +48,15 @@ if env['llvm']: env.Prepend(LIBS = [llvmpipe]) if env['platform'] != 'darwin': + # Disallow undefined symbols, except with Address Sanitizer, since libasan + # is not linked on shared libs, as it should be LD_PRELOAD'ed instead + if not env['asan']: + env.Append(SHLINKFLAGS = [ + '-Wl,-z,defs', + ]) env.Append(SHLINKFLAGS = [ - # Disallow undefined symbols - '-Wl,-z,defs', - # Restrict exported symbols - '-Wl,--version-script=%s' % File("libgl-xlib.sym").srcnode().path, + # Restrict exported symbols + '-Wl,--version-script=%s' % File("libgl-xlib.sym").srcnode().path, ]) # libGL.so.1.5 diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index c79bed45753..1b2793a5d6b 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -36,6 +36,7 @@ #include <amdgpu_drm.h> #include <xf86drm.h> #include <stdio.h> +#include <inttypes.h> static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo) { @@ -141,9 +142,9 @@ void amdgpu_bo_destroy(struct pb_buffer *_buf) amdgpu_fence_reference(&bo->fence[i], NULL); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - bo->ws->allocated_vram -= align(bo->base.size, bo->ws->gart_page_size); + bo->ws->allocated_vram -= align64(bo->base.size, bo->ws->gart_page_size); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - bo->ws->allocated_gtt -= align(bo->base.size, bo->ws->gart_page_size); + bo->ws->allocated_gtt -= align64(bo->base.size, bo->ws->gart_page_size); FREE(bo); } @@ -265,7 +266,7 @@ static void amdgpu_add_buffer_to_global_list(struct amdgpu_winsys_bo *bo) } static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, - unsigned size, + uint64_t size, unsigned alignment, unsigned usage, enum radeon_bo_domain initial_domain, @@ -303,9 +304,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, r = amdgpu_bo_alloc(ws->dev, &request, &buf_handle); if (r) { fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n"); - fprintf(stderr, "amdgpu: size : %d bytes\n", size); - fprintf(stderr, "amdgpu: alignment : %d bytes\n", alignment); - fprintf(stderr, "amdgpu: domains : %d\n", initial_domain); + fprintf(stderr, "amdgpu: size : %"PRIu64" bytes\n", size); + fprintf(stderr, "amdgpu: alignment : %u bytes\n", alignment); + fprintf(stderr, "amdgpu: domains : %u\n", initial_domain); goto error_bo_alloc; } @@ -331,9 +332,9 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); if (initial_domain & RADEON_DOMAIN_VRAM) - ws->allocated_vram += align(size, ws->gart_page_size); + ws->allocated_vram += align64(size, ws->gart_page_size); else if (initial_domain & RADEON_DOMAIN_GTT) - ws->allocated_gtt += align(size, ws->gart_page_size); + ws->allocated_gtt += align64(size, ws->gart_page_size); amdgpu_add_buffer_to_global_list(bo); @@ -458,7 +459,7 @@ static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, static struct pb_buffer * amdgpu_bo_create(struct radeon_winsys *rws, - unsigned size, + uint64_t size, unsigned alignment, boolean use_reusable_pool, enum radeon_bo_domain domain, @@ -468,21 +469,11 @@ amdgpu_bo_create(struct radeon_winsys *rws, struct amdgpu_winsys_bo *bo; unsigned usage = 0; - /* Don't use VRAM if the GPU doesn't have much. This is only the initial - * domain. The kernel is free to move the buffer if it wants to. - * - * 64MB means no VRAM by todays standards. - */ - if (domain & RADEON_DOMAIN_VRAM && ws->info.vram_size <= 64*1024*1024) { - domain = RADEON_DOMAIN_GTT; - flags = RADEON_FLAG_GTT_WC; - } - /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. */ - size = align(size, ws->gart_page_size); + size = align64(size, ws->gart_page_size); /* Only set one usage bit each for domains and flags, or the cache manager * might consider different sets of domains / flags compatible @@ -592,9 +583,9 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, *offset = whandle->offset; if (bo->initial_domain & RADEON_DOMAIN_VRAM) - ws->allocated_vram += align(bo->base.size, ws->gart_page_size); + ws->allocated_vram += align64(bo->base.size, ws->gart_page_size); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - ws->allocated_gtt += align(bo->base.size, ws->gart_page_size); + ws->allocated_gtt += align64(bo->base.size, ws->gart_page_size); amdgpu_add_buffer_to_global_list(bo); @@ -648,7 +639,7 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer, } static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, - void *pointer, unsigned size) + void *pointer, uint64_t size) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); amdgpu_bo_handle buf_handle; @@ -684,7 +675,7 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, bo->initial_domain = RADEON_DOMAIN_GTT; bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - ws->allocated_gtt += align(bo->base.size, ws->gart_page_size); + ws->allocated_gtt += align64(bo->base.size, ws->gart_page_size); amdgpu_add_buffer_to_global_list(bo); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c index 4c837a8e20f..1164a3058c5 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c @@ -212,7 +212,7 @@ static int compute_level(struct amdgpu_winsys *ws, } surf_level = is_stencil ? &surf->stencil_level[level] : &surf->level[level]; - surf_level->offset = align(surf->bo_size, AddrSurfInfoOut->baseAlign); + surf_level->offset = align64(surf->bo_size, AddrSurfInfoOut->baseAlign); surf_level->slice_size = AddrSurfInfoOut->sliceSize; surf_level->pitch_bytes = AddrSurfInfoOut->pitch * (is_stencil ? 1 : surf->bpe); surf_level->npix_x = u_minify(surf->npix_x, level); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 87d9a6aebec..1177d3e3c3a 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -256,6 +256,10 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd) goto fail; } + /* Set which chips have dedicated VRAM. */ + ws->info.has_dedicated_vram = + !(ws->amdinfo.ids_flags & AMDGPU_IDS_FLAGS_FUSION); + /* Set hardware information. */ ws->info.gart_size = gtt.heap_size; ws->info.vram_size = vram.heap_size; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 08856dff430..dd6555c9502 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -40,6 +40,7 @@ #include <errno.h> #include <fcntl.h> #include <stdio.h> +#include <inttypes.h> static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo) { @@ -297,8 +298,8 @@ void radeon_bo_destroy(struct pb_buffer *_buf) sizeof(va)) != 0 && va.operation == RADEON_VA_RESULT_ERROR) { fprintf(stderr, "radeon: Failed to deallocate virtual address for buffer:\n"); - fprintf(stderr, "radeon: size : %d bytes\n", bo->base.size); - fprintf(stderr, "radeon: va : 0x%016llx\n", (unsigned long long)bo->va); + fprintf(stderr, "radeon: size : %"PRIu64" bytes\n", bo->base.size); + fprintf(stderr, "radeon: va : 0x%"PRIx64"\n", bo->va); } } @@ -529,10 +530,10 @@ static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws, if (drmCommandWriteRead(rws->fd, DRM_RADEON_GEM_CREATE, &args, sizeof(args))) { fprintf(stderr, "radeon: Failed to allocate a buffer:\n"); - fprintf(stderr, "radeon: size : %d bytes\n", size); - fprintf(stderr, "radeon: alignment : %d bytes\n", alignment); - fprintf(stderr, "radeon: domains : %d\n", args.initial_domain); - fprintf(stderr, "radeon: flags : %d\n", args.flags); + fprintf(stderr, "radeon: size : %u bytes\n", size); + fprintf(stderr, "radeon: alignment : %u bytes\n", alignment); + fprintf(stderr, "radeon: domains : %u\n", args.initial_domain); + fprintf(stderr, "radeon: flags : %u\n", args.flags); return NULL; } @@ -717,7 +718,7 @@ static void radeon_bo_set_metadata(struct pb_buffer *_buf, static struct pb_buffer * radeon_winsys_bo_create(struct radeon_winsys *rws, - unsigned size, + uint64_t size, unsigned alignment, boolean use_reusable_pool, enum radeon_bo_domain domain, @@ -727,6 +728,10 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, struct radeon_bo *bo; unsigned usage = 0; + /* Only 32-bit sizes are supported. */ + if (size > UINT_MAX) + return NULL; + /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. @@ -768,7 +773,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, } static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, - void *pointer, unsigned size) + void *pointer, uint64_t size) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); struct drm_radeon_gem_userptr args; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index 1accc6a1863..2d9ec8cee09 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -297,6 +297,30 @@ static boolean do_winsys_init(struct radeon_drm_winsys *ws) break; } + /* Set which chips don't have dedicated VRAM. */ + switch (ws->info.family) { + case CHIP_RS400: + case CHIP_RC410: + case CHIP_RS480: + case CHIP_RS600: + case CHIP_RS690: + case CHIP_RS740: + case CHIP_RS780: + case CHIP_RS880: + case CHIP_PALM: + case CHIP_SUMO: + case CHIP_SUMO2: + case CHIP_ARUBA: + case CHIP_KAVERI: + case CHIP_KABINI: + case CHIP_MULLINS: + ws->info.has_dedicated_vram = false; + break; + + default: + ws->info.has_dedicated_vram = true; + } + /* Check for dma */ ws->info.has_sdma = FALSE; /* DMA is disabled on R700. There is IB corruption and hangs. */ diff --git a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c index c1b9eb95c52..d049d1dbc46 100644 --- a/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c +++ b/src/gallium/winsys/svga/drm/pb_buffer_simple_fenced.c @@ -40,6 +40,7 @@ #include <unistd.h> #include <sched.h> #endif +#include <inttypes.h> #include "pipe/p_compiler.h" #include "pipe/p_defines.h" @@ -172,7 +173,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) while(curr != &fenced_mgr->unfenced) { fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); assert(!fenced_buf->fence); - debug_printf("%10p %7u %8u %7s\n", + debug_printf("%10p %"PRIu64" %8u %7s\n", (void *) fenced_buf, fenced_buf->base.size, p_atomic_read(&fenced_buf->base.reference.count), @@ -188,7 +189,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); assert(fenced_buf->buffer); signaled = ops->fence_signalled(ops, fenced_buf->fence, 0); - debug_printf("%10p %7u %8u %7s %10p %s\n", + debug_printf("%10p %"PRIu64" %8u %7s %10p %s\n", (void *) fenced_buf, fenced_buf->base.size, p_atomic_read(&fenced_buf->base.reference.count), diff --git a/src/gallium/winsys/svga/drm/vmw_buffer.c b/src/gallium/winsys/svga/drm/vmw_buffer.c index c082dcc34e9..3ac80c7caf5 100644 --- a/src/gallium/winsys/svga/drm/vmw_buffer.c +++ b/src/gallium/winsys/svga/drm/vmw_buffer.c @@ -154,7 +154,7 @@ vmw_gmr_buffer_unmap(struct pb_buffer *_buf) static void vmw_gmr_buffer_get_base_buffer(struct pb_buffer *buf, struct pb_buffer **base_buf, - unsigned *offset) + pb_size *offset) { *base_buf = buf; *offset = 0; @@ -266,7 +266,7 @@ vmw_gmr_bufmgr_region_ptr(struct pb_buffer *buf, struct SVGAGuestPtr *ptr) { struct pb_buffer *base_buf; - unsigned offset = 0; + pb_size offset = 0; struct vmw_gmr_buffer *gmr_buf; pb_get_base_buffer( buf, &base_buf, &offset ); diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 56d79a02d79..52748a0619a 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -170,7 +170,7 @@ anv_shader_compile_to_nir(struct anv_device *device, /* Vulkan uses the separate-shader linking model */ nir->info.separate_shader = true; - nir = brw_preprocess_nir(nir, compiler->scalar_stage[stage]); + nir = brw_preprocess_nir(compiler, nir); nir_shader_gather_info(nir, entry_point->impl); diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am index 8dc44fda0f2..390381828e9 100644 --- a/src/mesa/Makefile.am +++ b/src/mesa/Makefile.am @@ -179,10 +179,8 @@ libmesagallium_la_LIBADD = \ $(ARCH_LIBS) libmesa_sse41_la_SOURCES = \ - main/streaming-load-memcpy.c \ - main/streaming-load-memcpy.h \ - main/sse_minmax.c \ - main/sse_minmax.h + $(X86_SSE41_FILES) + libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_CFLAGS) pkgconfigdir = $(libdir)/pkgconfig diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index 7425f01273d..2ffbb152e3c 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -396,6 +396,7 @@ VBO_FILES = \ STATETRACKER_FILES = \ state_tracker/st_atifs_to_tgsi.c \ + state_tracker/st_atifs_to_tgsi.h \ state_tracker/st_atom_array.c \ state_tracker/st_atom_atomicbuf.c \ state_tracker/st_atom_blend.c \ @@ -589,7 +590,9 @@ X86_64_FILES = \ X86_SSE41_FILES = \ main/streaming-load-memcpy.c \ - main/sse_minmax.c + main/streaming-load-memcpy.h \ + main/sse_minmax.c \ + main/sse_minmax.h SPARC_FILES = \ sparc/sparc.h \ diff --git a/src/mesa/drivers/dri/i965/.gitignore b/src/mesa/drivers/dri/i965/.gitignore index 8eb9f4e1598..70aae3f4d4c 100644 --- a/src/mesa/drivers/dri/i965/.gitignore +++ b/src/mesa/drivers/dri/i965/.gitignore @@ -1,3 +1,4 @@ +brw_nir_trig_workarounds.c i965_symbols_test test_eu_compact test_vec4_copy_propagation diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index 0db5a51e725..a41c8305a80 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -33,6 +33,7 @@ AM_CFLAGS = \ -I$(top_srcdir)/src/mesa/drivers/dri/common \ -I$(top_srcdir)/src/mesa/drivers/dri/intel/server \ -I$(top_srcdir)/src/gtest/include \ + -I$(top_srcdir)/src/compiler/nir \ -I$(top_builddir)/src/compiler/nir \ -I$(top_builddir)/src/mesa/drivers/dri/common \ $(DEFINES) \ @@ -41,6 +42,10 @@ AM_CFLAGS = \ AM_CXXFLAGS = $(AM_CFLAGS) +brw_nir_trig_workarounds.c: brw_nir_trig_workarounds.py $(top_srcdir)/src/compiler/nir/nir_algebraic.py + $(MKDIR_GEN) + $(AM_V_GEN) PYTHONPATH=$(top_srcdir)/src/compiler/nir $(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/brw_nir_trig_workarounds.py > $@ || ($(RM) $@; false) + noinst_LTLIBRARIES = libi965_dri.la libi965_compiler.la libi965_dri_la_SOURCES = $(i965_FILES) libi965_dri_la_LIBADD = libi965_compiler.la $(INTEL_LIBS) diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 2802ec9887c..c314d7470bb 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -44,6 +44,7 @@ i965_compiler_FILES = \ brw_nir.c \ brw_nir_analyze_boolean_resolves.c \ brw_nir_attribute_workarounds.c \ + brw_nir_trig_workarounds.c \ brw_nir_opt_peephole_ffma.c \ brw_nir_uniforms.cpp \ brw_packed_float.c \ diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 2d480d02366..63ac3bc31ed 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -1151,10 +1151,9 @@ intel_gles3_srgb_workaround(struct brw_context *brw, */ fb->Visual.sRGBCapable = false; for (int i = 0; i < BUFFER_COUNT; i++) { - if (fb->Attachment[i].Renderbuffer && - fb->Attachment[i].Renderbuffer->Format == MESA_FORMAT_B8G8R8A8_SRGB) { - fb->Attachment[i].Renderbuffer->Format = MESA_FORMAT_B8G8R8A8_UNORM; - } + struct gl_renderbuffer *rb = fb->Attachment[i].Renderbuffer; + if (rb) + rb->Format = _mesa_get_srgb_format_linear(rb->Format); } } diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index 09eb2392836..88bd7a499a7 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -1505,19 +1505,33 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo, break; } - case BRW_SFID_URB: + case BRW_SFID_URB: { + unsigned opcode = brw_inst_urb_opcode(devinfo, inst); + format(file, " %ld", brw_inst_urb_global_offset(devinfo, inst)); space = 1; - if (devinfo->gen >= 7) { - err |= control(file, "urb opcode", gen7_urb_opcode, - brw_inst_urb_opcode(devinfo, inst), &space); - } else if (devinfo->gen >= 5) { - err |= control(file, "urb opcode", gen5_urb_opcode, - brw_inst_urb_opcode(devinfo, inst), &space); + + err |= control(file, "urb opcode", + devinfo->gen >= 7 ? gen7_urb_opcode + : gen5_urb_opcode, + opcode, &space); + + if (devinfo->gen >= 7 && + brw_inst_urb_per_slot_offset(devinfo, inst)) { + string(file, " per-slot"); + } + + if (opcode == GEN8_URB_OPCODE_SIMD8_WRITE || + opcode == GEN8_URB_OPCODE_SIMD8_READ) { + if (brw_inst_urb_channel_mask_present(devinfo, inst)) + string(file, " masked"); + } else { + err |= control(file, "urb swizzle", urb_swizzle, + brw_inst_urb_swizzle_control(devinfo, inst), + &space); } - err |= control(file, "urb swizzle", urb_swizzle, - brw_inst_urb_swizzle_control(devinfo, inst), &space); + if (devinfo->gen < 7) { err |= control(file, "urb allocate", urb_allocate, brw_inst_urb_allocate(devinfo, inst), &space); @@ -1529,6 +1543,7 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo, brw_inst_urb_complete(devinfo, inst), &space); } break; + } case BRW_SFID_THREAD_SPAWNER: break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index 4c2e360edf9..7ae7b2ecdf6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -289,6 +289,7 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_binop_gequal: case ir_binop_equal: case ir_binop_nequal: + case ir_binop_ldexp: for (i = 0; i < vector_elements; i++) { ir_rvalue *op0 = get_element(op_var[0], i); ir_rvalue *op1 = get_element(op_var[1], i); @@ -404,7 +405,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) case ir_unop_unpack_unorm_2x16: case ir_unop_unpack_unorm_4x8: case ir_unop_unpack_half_2x16: - case ir_binop_ldexp: case ir_binop_vector_extract: case ir_triop_vector_insert: case ir_quadop_vector: diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index b27b170ebc3..ab564bbcb9e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -766,24 +766,12 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) break; case nir_op_fsin: - if (!compiler->precise_trig) { - inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); - } else { - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F); - inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]); - inst = bld.MUL(result, tmp, brw_imm_f(0.99997)); - } + inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); inst->saturate = instr->dest.saturate; break; case nir_op_fcos: - if (!compiler->precise_trig) { - inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); - } else { - fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F); - inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]); - inst = bld.MUL(result, tmp, brw_imm_f(0.99997)); - } + inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -876,6 +864,10 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) * When we XOR the sources, the top bit is 0 if they are the same and 1 * if they are different. We can then use a conditional modifier to * turn that into a predicate. This leads us to an XOR.l instruction. + * + * Technically, according to the PRM, you're not allowed to use .l on a + * XOR instruction. However, emperical experiments and Curro's reading + * of the simulator source both indicate that it's safe. */ fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D); inst = bld.XOR(tmp, op[0], op[1]); diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index c7d6fb8c79b..bb7e1eb128c 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -110,7 +110,6 @@ process_glsl_ir(gl_shader_stage stage, SUB_TO_ADD_NEG | EXP_TO_EXP2 | LOG_TO_LOG2 | - LDEXP_TO_ARITH | CARRY_TO_ARITH | BORROW_TO_ARITH); diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 83921891d1c..fb7fa235861 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -437,14 +437,19 @@ nir_optimize(nir_shader *nir, bool is_scalar) * is_scalar = true to scalarize everything prior to code gen. */ nir_shader * -brw_preprocess_nir(nir_shader *nir, bool is_scalar) +brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir) { bool progress; /* Written by OPT and OPT_V */ (void)progress; + const bool is_scalar = compiler->scalar_stage[nir->stage]; + if (nir->stage == MESA_SHADER_GEOMETRY) OPT(nir_lower_gs_intrinsics); + if (compiler->precise_trig) + OPT(brw_nir_apply_trig_workarounds); + static const nir_lower_tex_options tex_options = { .lower_txp = ~0, }; @@ -568,7 +573,7 @@ brw_create_nir(struct brw_context *brw, (void)progress; - nir = brw_preprocess_nir(nir, is_scalar); + nir = brw_preprocess_nir(brw->intelScreen->compiler, nir); OPT(nir_lower_system_values); OPT_V(brw_nir_lower_uniforms, is_scalar); diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h index 440b4ceb669..2711606511d 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.h +++ b/src/mesa/drivers/dri/i965/brw_nir.h @@ -81,7 +81,8 @@ nir_shader *brw_create_nir(struct brw_context *brw, gl_shader_stage stage, bool is_scalar); -nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar); +nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler, + nir_shader *nir); void brw_nir_lower_vs_inputs(nir_shader *nir, const struct brw_device_info *devinfo, @@ -105,6 +106,8 @@ bool brw_nir_apply_attribute_workarounds(nir_shader *nir, bool use_legacy_snorm_formula, const uint8_t *attrib_wa_flags); +bool brw_nir_apply_trig_workarounds(nir_shader *nir); + nir_shader *brw_nir_apply_sampler_key(nir_shader *nir, const struct brw_device_info *devinfo, const struct brw_sampler_prog_key_data *key, diff --git a/src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py b/src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py new file mode 100755 index 00000000000..67dab9ab326 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_nir_trig_workarounds.py @@ -0,0 +1,43 @@ +#! /usr/bin/env python +# +# Copyright (C) 2016 Intel Corporation +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +import nir_algebraic + +# The SIN and COS instructions on Intel hardware can produce values +# slightly outside of the [-1.0, 1.0] range for a small set of values. +# Obviously, this can break everyone's expectations about trig functions. +# +# According to an internal presentation, the COS instruction can produce +# a value up to 1.000027 for inputs in the range (0.08296, 0.09888). One +# suggested workaround is to multiply by 0.99997, scaling down the +# amplitude slightly. Apparently this also minimizes the error function, +# reducing the maximum error from 0.00006 to about 0.00003. + +trig_workarounds = [ + (('fsin', 'x'), ('fmul', ('fsin', 'x'), 0.99997)), + (('fcos', 'x'), ('fmul', ('fcos', 'x'), 0.99997)), +] + +print '#include "brw_nir.h"' +print nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds", + trig_workarounds).render() diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp index f3361d69786..636340add35 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp +++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp @@ -75,7 +75,7 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var, int namelen = strlen(var->name); /* The data for our (non-builtin) uniforms is stored in a series of - * gl_uniform_driver_storage structs for each subcomponent that + * gl_uniform_storage structs for each subcomponent that * glGetUniformLocation() could name. We know it's been set up in the same * order we'd walk the type, so walk the list of storage and find anything * with our name, or the prefix of a component that starts with our name. diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index b63e44a3bfb..032fdaa4d23 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -1094,24 +1094,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; case nir_op_fsin: - if (!compiler->precise_trig) { - inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); - } else { - src_reg tmp = src_reg(this, glsl_type::vec4_type); - inst = emit_math(SHADER_OPCODE_SIN, dst_reg(tmp), op[0]); - inst = emit(MUL(dst, tmp, brw_imm_f(0.99997))); - } + inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); inst->saturate = instr->dest.saturate; break; case nir_op_fcos: - if (!compiler->precise_trig) { - inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); - } else { - src_reg tmp = src_reg(this, glsl_type::vec4_type); - inst = emit_math(SHADER_OPCODE_COS, dst_reg(tmp), op[0]); - inst = emit(MUL(dst, tmp, brw_imm_f(0.99997))); - } + inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); inst->saturate = instr->dest.saturate; break; @@ -1141,6 +1129,10 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) * When we XOR the sources, the top bit is 0 if they are the same and 1 * if they are different. We can then use a conditional modifier to * turn that into a predicate. This leads us to an XOR.l instruction. + * + * Technically, according to the PRM, you're not allowed to use .l on a + * XOR instruction. However, emperical experiments and Curro's reading + * of the simulator source both indicate that it's safe. */ src_reg tmp = src_reg(this, glsl_type::ivec4_type); inst = emit(XOR(dst_reg(tmp), op[0], op[1])); diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 60ac124ecd0..6a20bd6d925 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -340,6 +340,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_texture_view = true; ctx->Extensions.ARB_shader_storage_buffer_object = true; ctx->Extensions.EXT_shader_samples_identical = true; + ctx->Extensions.OES_texture_buffer = true; if (brw->can_do_pipelined_register_writes) { ctx->Extensions.ARB_draw_indirect = true; diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c index 31030b1b4ea..a486d6e1ab9 100644 --- a/src/mesa/drivers/dri/i965/intel_pixel_read.c +++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c @@ -141,8 +141,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx, if (rb->_BaseFormat == GL_RGB) return false; - if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp, - INTEL_DOWNLOAD)) + if (!intel_get_memcpy(rb->Format, format, type, &mem_copy, &cpp)) return false; if (!irb->mt || diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index c6eb50aaba8..dbec82fbd44 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -1000,14 +1000,18 @@ intelCreateBuffer(__DRIscreen * driScrnPriv, fb->Visual.samples = num_samples; } - if (mesaVis->redBits == 5) - rgbFormat = MESA_FORMAT_B5G6R5_UNORM; - else if (mesaVis->sRGBCapable) - rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB; - else if (mesaVis->alphaBits == 0) - rgbFormat = MESA_FORMAT_B8G8R8X8_UNORM; - else { - rgbFormat = MESA_FORMAT_B8G8R8A8_SRGB; + if (mesaVis->redBits == 5) { + rgbFormat = mesaVis->redMask == 0x1f ? MESA_FORMAT_R5G6B5_UNORM + : MESA_FORMAT_B5G6R5_UNORM; + } else if (mesaVis->sRGBCapable) { + rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8A8_SRGB + : MESA_FORMAT_B8G8R8A8_SRGB; + } else if (mesaVis->alphaBits == 0) { + rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8X8_UNORM + : MESA_FORMAT_B8G8R8X8_UNORM; + } else { + rgbFormat = mesaVis->redMask == 0xff ? MESA_FORMAT_R8G8B8A8_SRGB + : MESA_FORMAT_B8G8R8A8_SRGB; fb->Visual.sRGBCapable = true; } @@ -1078,11 +1082,45 @@ intelDestroyBuffer(__DRIdrawable * driDrawPriv) _mesa_reference_framebuffer(&fb, NULL); } +static void +intel_detect_sseu(struct intel_screen *intelScreen) +{ + assert(intelScreen->devinfo->gen >= 8); + int ret; + + intelScreen->subslice_total = -1; + intelScreen->eu_total = -1; + + ret = intel_get_param(intelScreen->driScrnPriv, I915_PARAM_SUBSLICE_TOTAL, + &intelScreen->subslice_total); + if (ret != -EINVAL) + goto err_out; + + ret = intel_get_param(intelScreen->driScrnPriv, + I915_PARAM_EU_TOTAL, &intelScreen->eu_total); + if (ret != -EINVAL) + goto err_out; + + /* Without this information, we cannot get the right Braswell brandstrings, + * and we have to use conservative numbers for GPGPU on many platforms, but + * otherwise, things will just work. + */ + if (intelScreen->subslice_total < 1 || intelScreen->eu_total < 1) + _mesa_warning(NULL, + "Kernel 4.1 required to properly query GPU properties.\n"); + + return; + +err_out: + intelScreen->subslice_total = -1; + intelScreen->eu_total = -1; + _mesa_warning(NULL, "Failed to query GPU properties.\n"); +} + static bool intel_init_bufmgr(struct intel_screen *intelScreen) { __DRIscreen *spriv = intelScreen->driScrnPriv; - bool devid_override = getenv("INTEL_DEVID_OVERRIDE") != NULL; intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL; @@ -1100,25 +1138,6 @@ intel_init_bufmgr(struct intel_screen *intelScreen) return false; } - intelScreen->subslice_total = -1; - intelScreen->eu_total = -1; - - /* Everything below this is for real hardware only */ - if (intelScreen->no_hw || devid_override) - return true; - - intel_get_param(spriv, I915_PARAM_SUBSLICE_TOTAL, - &intelScreen->subslice_total); - intel_get_param(spriv, I915_PARAM_EU_TOTAL, &intelScreen->eu_total); - - /* Without this information, we cannot get the right Braswell brandstrings, - * and we have to use conservative numbers for GPGPU on many platforms, but - * otherwise, things will just work. - */ - if (intelScreen->subslice_total == -1 || intelScreen->eu_total == -1) - _mesa_warning(NULL, - "Kernel 4.1 required to properly query GPU properties.\n"); - return true; } @@ -1473,6 +1492,10 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp) intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen); intelScreen->hw_has_timestamp = intel_detect_timestamp(intelScreen); + /* GENs prior to 8 do not support EU/Subslice info */ + if (intelScreen->devinfo->gen >= 8) + intel_detect_sseu(intelScreen); + const char *force_msaa = getenv("INTEL_FORCE_MSAA"); if (force_msaa) { intelScreen->winsys_msaa_samples_override = diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index 1601edddef6..bee8be1fd27 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -404,8 +404,7 @@ intel_gettexsubimage_tiled_memcpy(struct gl_context *ctx, if (texImage->_BaseFormat == GL_RGB) return false; - if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp, - INTEL_DOWNLOAD)) + if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp)) return false; /* If this is a nontrivial texture view, let another path handle it instead. */ diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c index 4849a4151e2..9561968d2d6 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c @@ -119,8 +119,7 @@ intel_texsubimage_tiled_memcpy(struct gl_context * ctx, if (ctx->_ImageTransferState) return false; - if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp, - INTEL_UPLOAD)) + if (!intel_get_memcpy(texImage->TexFormat, format, type, &mem_copy, &cpp)) return false; /* If this is a nontrivial texture view, let another path handle it instead. */ diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c index 31354582964..a549854dce6 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c @@ -36,8 +36,10 @@ #include "brw_context.h" #include "intel_tiled_memcpy.h" -#ifdef __SSSE3__ +#if defined(__SSSE3__) #include <tmmintrin.h> +#elif defined(__SSE2__) +#include <emmintrin.h> #endif #define FILE_DEBUG_FLAG DEBUG_TEXTURE @@ -56,21 +58,86 @@ static const uint32_t ytile_width = 128; static const uint32_t ytile_height = 32; static const uint32_t ytile_span = 16; +static inline uint32_t +ror(uint32_t n, uint32_t d) +{ + return (n >> d) | (n << (32 - d)); +} + +/** + * Copy RGBA to BGRA - swap R and B. + */ +static inline void * +rgba8_copy(void *dst, const void *src, size_t bytes) +{ + uint32_t *d = dst; + uint32_t const *s = src; + + assert(bytes % 4 == 0); + + while (bytes >= 4) { + *d = ror(__builtin_bswap32(*s), 8); + d += 1; + s += 1; + bytes -= 4; + } + return dst; +} + #ifdef __SSSE3__ static const uint8_t rgba8_permutation[16] = { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 }; -/* NOTE: dst must be 16-byte aligned. src may be unaligned. */ -#define rgba8_copy_16_aligned_dst(dst, src) \ - _mm_store_si128((__m128i *)(dst), \ - _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \ - *(__m128i *) rgba8_permutation)) - -/* NOTE: src must be 16-byte aligned. dst may be unaligned. */ -#define rgba8_copy_16_aligned_src(dst, src) \ - _mm_storeu_si128((__m128i *)(dst), \ - _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \ - *(__m128i *) rgba8_permutation)) +static inline void +rgba8_copy_16_aligned_dst(void *dst, const void *src) +{ + _mm_store_si128(dst, + _mm_shuffle_epi8(_mm_loadu_si128(src), + *(__m128i *)rgba8_permutation)); +} + +static inline void +rgba8_copy_16_aligned_src(void *dst, const void *src) +{ + _mm_storeu_si128(dst, + _mm_shuffle_epi8(_mm_load_si128(src), + *(__m128i *)rgba8_permutation)); +} + +#elif defined(__SSE2__) +static inline void +rgba8_copy_16_aligned_dst(void *dst, const void *src) +{ + __m128i srcreg, dstreg, agmask, ag, rb, br; + + agmask = _mm_set1_epi32(0xFF00FF00); + srcreg = _mm_loadu_si128((__m128i *)src); + + rb = _mm_andnot_si128(agmask, srcreg); + ag = _mm_and_si128(agmask, srcreg); + br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + dstreg = _mm_or_si128(ag, br); + + _mm_store_si128((__m128i *)dst, dstreg); +} + +static inline void +rgba8_copy_16_aligned_src(void *dst, const void *src) +{ + __m128i srcreg, dstreg, agmask, ag, rb, br; + + agmask = _mm_set1_epi32(0xFF00FF00); + srcreg = _mm_load_si128((__m128i *)src); + + rb = _mm_andnot_si128(agmask, srcreg); + ag = _mm_and_si128(agmask, srcreg); + br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)), + _MM_SHUFFLE(2, 3, 0, 1)); + dstreg = _mm_or_si128(ag, br); + + _mm_storeu_si128((__m128i *)dst, dstreg); +} #endif /** @@ -79,35 +146,27 @@ static const uint8_t rgba8_permutation[16] = static inline void * rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes) { - uint8_t *d = dst; - uint8_t const *s = src; + assert(bytes == 0 || !(((uintptr_t)dst) & 0xf)); -#ifdef __SSSE3__ - if (bytes == 16) { - assert(!(((uintptr_t)dst) & 0xf)); - rgba8_copy_16_aligned_dst(d+ 0, s+ 0); +#if defined(__SSSE3__) || defined(__SSE2__) + if (bytes == 64) { + rgba8_copy_16_aligned_dst(dst + 0, src + 0); + rgba8_copy_16_aligned_dst(dst + 16, src + 16); + rgba8_copy_16_aligned_dst(dst + 32, src + 32); + rgba8_copy_16_aligned_dst(dst + 48, src + 48); return dst; } - if (bytes == 64) { - assert(!(((uintptr_t)dst) & 0xf)); - rgba8_copy_16_aligned_dst(d+ 0, s+ 0); - rgba8_copy_16_aligned_dst(d+16, s+16); - rgba8_copy_16_aligned_dst(d+32, s+32); - rgba8_copy_16_aligned_dst(d+48, s+48); - return dst; + while (bytes >= 16) { + rgba8_copy_16_aligned_dst(dst, src); + src += 16; + dst += 16; + bytes -= 16; } #endif - while (bytes >= 4) { - d[0] = s[2]; - d[1] = s[1]; - d[2] = s[0]; - d[3] = s[3]; - d += 4; - s += 4; - bytes -= 4; - } + rgba8_copy(dst, src, bytes); + return dst; } @@ -117,35 +176,27 @@ rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes) static inline void * rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes) { - uint8_t *d = dst; - uint8_t const *s = src; + assert(bytes == 0 || !(((uintptr_t)src) & 0xf)); -#ifdef __SSSE3__ - if (bytes == 16) { - assert(!(((uintptr_t)src) & 0xf)); - rgba8_copy_16_aligned_src(d+ 0, s+ 0); +#if defined(__SSSE3__) || defined(__SSE2__) + if (bytes == 64) { + rgba8_copy_16_aligned_src(dst + 0, src + 0); + rgba8_copy_16_aligned_src(dst + 16, src + 16); + rgba8_copy_16_aligned_src(dst + 32, src + 32); + rgba8_copy_16_aligned_src(dst + 48, src + 48); return dst; } - if (bytes == 64) { - assert(!(((uintptr_t)src) & 0xf)); - rgba8_copy_16_aligned_src(d+ 0, s+ 0); - rgba8_copy_16_aligned_src(d+16, s+16); - rgba8_copy_16_aligned_src(d+32, s+32); - rgba8_copy_16_aligned_src(d+48, s+48); - return dst; + while (bytes >= 16) { + rgba8_copy_16_aligned_src(dst, src); + src += 16; + dst += 16; + bytes -= 16; } #endif - while (bytes >= 4) { - d[0] = s[2]; - d[1] = s[1]; - d[2] = s[0]; - d[3] = s[3]; - d += 4; - s += 4; - bytes -= 4; - } + rgba8_copy(dst, src, bytes); + return dst; } @@ -172,6 +223,12 @@ typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * Copy texture data from linear to X tile layout. * * \copydoc tile_copy_fn + * + * The mem_copy parameters allow the user to specify an alternative mem_copy + * function that, for instance, may do RGBA -> BGRA swizzling. The first + * function must handle any memory alignment while the second function must + * only handle 16-byte alignment in whichever side (source or destination) is + * tiled. */ static inline void linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, @@ -179,7 +236,8 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t src_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* The copy destination offset for each range copied is the sum of * an X offset 'x0' or 'xo' and a Y offset 'yo.' @@ -200,10 +258,10 @@ linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0); for (xo = x1; xo < x2; xo += xtile_span) { - mem_copy(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span); } - mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); src += src_pitch; } @@ -220,7 +278,8 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t src_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* Y tiles consist of columns that are 'ytile_span' wide (and the same height * as the tile). Thus the destination offset for (x,y) is the sum of: @@ -259,12 +318,12 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * at each step so we don't need to calculate it explicitly. */ for (x = x1; x < x2; x += ytile_span) { - mem_copy(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span); xo += bytes_per_column; swizzle ^= swizzle_bit; } - mem_copy(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); + mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2); src += src_pitch; } @@ -281,7 +340,8 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t dst_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* The copy destination offset for each range copied is the sum of * an X offset 'x0' or 'xo' and a Y offset 'yo.' @@ -302,10 +362,10 @@ xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0); for (xo = x1; xo < x2; xo += xtile_span) { - mem_copy(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span); + mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span); } - mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); + mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); dst += dst_pitch; } @@ -322,7 +382,8 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, char *dst, const char *src, int32_t dst_pitch, uint32_t swizzle_bit, - mem_copy_fn mem_copy) + mem_copy_fn mem_copy, + mem_copy_fn mem_copy_align16) { /* Y tiles consist of columns that are 'ytile_span' wide (and the same height * as the tile). Thus the destination offset for (x,y) is the sum of: @@ -361,12 +422,12 @@ ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, * at each step so we don't need to calculate it explicitly. */ for (x = x1; x < x2; x += ytile_span) { - mem_copy(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); + mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span); xo += bytes_per_column; swizzle ^= swizzle_bit; } - mem_copy(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); + mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2); dst += dst_pitch; } @@ -393,26 +454,27 @@ linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { if (mem_copy == memcpy) return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, - dst, src, src_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_dst) + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy, rgba8_copy_aligned_dst); else unreachable("not reached"); } else { if (mem_copy == memcpy) return linear_to_xtiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_dst) + dst, src, src_pitch, swizzle_bit, + memcpy, memcpy); + else if (mem_copy == rgba8_copy) return linear_to_xtiled(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy, rgba8_copy_aligned_dst); else unreachable("not reached"); } linear_to_xtiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, mem_copy); + dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -435,26 +497,26 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { if (mem_copy == memcpy) return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, - dst, src, src_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_dst) + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy, rgba8_copy_aligned_dst); else unreachable("not reached"); } else { if (mem_copy == memcpy) return linear_to_ytiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_dst) + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return linear_to_ytiled(x0, x1, x2, x3, y0, y1, dst, src, src_pitch, swizzle_bit, - rgba8_copy_aligned_dst); + rgba8_copy, rgba8_copy_aligned_dst); else unreachable("not reached"); } linear_to_ytiled(x0, x1, x2, x3, y0, y1, - dst, src, src_pitch, swizzle_bit, mem_copy); + dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -477,26 +539,26 @@ xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) { if (mem_copy == memcpy) return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, - dst, src, dst_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_src) + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy, rgba8_copy_aligned_src); else unreachable("not reached"); } else { if (mem_copy == memcpy) return xtiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_src) + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return xtiled_to_linear(x0, x1, x2, x3, y0, y1, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy, rgba8_copy_aligned_src); else unreachable("not reached"); } xtiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, mem_copy); + dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -519,26 +581,26 @@ ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { if (mem_copy == memcpy) return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, - dst, src, dst_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_src) + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy, rgba8_copy_aligned_src); else unreachable("not reached"); } else { if (mem_copy == memcpy) return ytiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, memcpy); - else if (mem_copy == rgba8_copy_aligned_src) + dst, src, dst_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) return ytiled_to_linear(x0, x1, x2, x3, y0, y1, dst, src, dst_pitch, swizzle_bit, - rgba8_copy_aligned_src); + rgba8_copy, rgba8_copy_aligned_src); else unreachable("not reached"); } ytiled_to_linear(x0, x1, x2, x3, y0, y1, - dst, src, dst_pitch, swizzle_bit, mem_copy); + dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy); } /** @@ -745,8 +807,7 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2, * \return true if the format and type combination are valid */ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format, - GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp, - enum intel_memcpy_direction direction) + GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp) { if (type == GL_UNSIGNED_INT_8_8_8_8_REV && !(format == GL_RGBA || format == GL_BGRA)) @@ -764,8 +825,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format, if (format == GL_BGRA) { *mem_copy = memcpy; } else if (format == GL_RGBA) { - *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst - : rgba8_copy_aligned_src; + *mem_copy = rgba8_copy; } } else if ((tiledFormat == MESA_FORMAT_R8G8B8A8_UNORM) || (tiledFormat == MESA_FORMAT_R8G8B8X8_UNORM) || @@ -776,8 +836,7 @@ bool intel_get_memcpy(mesa_format tiledFormat, GLenum format, /* Copying from RGBA to BGRA is the same as BGRA to RGBA so we can * use the same function. */ - *mem_copy = direction == INTEL_UPLOAD ? rgba8_copy_aligned_dst - : rgba8_copy_aligned_src; + *mem_copy = rgba8_copy; } else if (format == GL_RGBA) { *mem_copy = memcpy; } diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h index 01543bf298d..d9148bb6239 100644 --- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h +++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.h @@ -55,20 +55,7 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2, uint32_t tiling, mem_copy_fn mem_copy); -/* Tells intel_get_memcpy() whether the memcpy() is - * - * - an upload to the GPU with an aligned destination and a potentially - * unaligned source; or - * - a download from the GPU with an aligned source and a potentially - * unaligned destination. - */ -enum intel_memcpy_direction { - INTEL_UPLOAD, - INTEL_DOWNLOAD -}; - bool intel_get_memcpy(mesa_format tiledFormat, GLenum format, - GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp, - enum intel_memcpy_direction direction); + GLenum type, mem_copy_fn *mem_copy, uint32_t *cpp); #endif /* INTEL_TILED_MEMCPY */ diff --git a/src/mesa/drivers/x11/SConscript b/src/mesa/drivers/x11/SConscript index 45419973d39..59c8df4b3c2 100644 --- a/src/mesa/drivers/x11/SConscript +++ b/src/mesa/drivers/x11/SConscript @@ -34,9 +34,13 @@ sources = [ 'xm_tri.c', ] -# Disallow undefined symbols if env['platform'] != 'darwin': - env.Append(SHLINKFLAGS = ['-Wl,-z,defs']) + # Disallow undefined symbols, except with Address Sanitizer, since libasan + # is not linked on shared libs, as it should be LD_PRELOAD'ed instead + if not env['asan']: + env.Append(SHLINKFLAGS = [ + '-Wl,-z,defs', + ]) # libGL.so.1.6 libgl_1_6 = env.SharedLibrary( diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c index 80b71765e6c..2f4d966973e 100644 --- a/src/mesa/drivers/x11/fakeglx.c +++ b/src/mesa/drivers/x11/fakeglx.c @@ -794,7 +794,7 @@ destroy_visuals_on_display(Display *dpy) if (VisualTable[i]->display == dpy) { /* remove this visual */ int j; - free(VisualTable[i]); + XMesaDestroyVisual(VisualTable[i]); for (j = i; j < NumVisuals - 1; j++) VisualTable[j] = VisualTable[j + 1]; NumVisuals--; diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c index 65e7ca89d32..82c4d188d5a 100644 --- a/src/mesa/drivers/x11/xm_api.c +++ b/src/mesa/drivers/x11/xm_api.c @@ -856,6 +856,7 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display, accum_red_size, accum_green_size, accum_blue_size, accum_alpha_size, 0)) { + free(v->visinfo); free(v); return NULL; } diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index dbba136f526..6af02d1c3dc 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -1525,10 +1525,6 @@ _mesa_copy_context( const struct gl_context *src, struct gl_context *dst, * Check if the given context can render into the given framebuffer * by checking visual attributes. * - * Most of these tests could go away because Mesa is now pretty flexible - * in terms of mixing rendering contexts with framebuffers. As long - * as RGB vs. CI mode agree, we're probably good. - * * \return GL_TRUE if compatible, GL_FALSE otherwise. */ static GLboolean @@ -1541,32 +1537,18 @@ check_compatible(const struct gl_context *ctx, if (buffer == _mesa_get_incomplete_framebuffer()) return GL_TRUE; -#if 0 - /* disabling this fixes the fgl_glxgears pbuffer demo */ - if (ctxvis->doubleBufferMode && !bufvis->doubleBufferMode) - return GL_FALSE; -#endif - if (ctxvis->stereoMode && !bufvis->stereoMode) - return GL_FALSE; - if (ctxvis->haveAccumBuffer && !bufvis->haveAccumBuffer) - return GL_FALSE; - if (ctxvis->haveDepthBuffer && !bufvis->haveDepthBuffer) - return GL_FALSE; - if (ctxvis->haveStencilBuffer && !bufvis->haveStencilBuffer) - return GL_FALSE; - if (ctxvis->redMask && ctxvis->redMask != bufvis->redMask) - return GL_FALSE; - if (ctxvis->greenMask && ctxvis->greenMask != bufvis->greenMask) - return GL_FALSE; - if (ctxvis->blueMask && ctxvis->blueMask != bufvis->blueMask) - return GL_FALSE; -#if 0 - /* disabled (see bug 11161) */ - if (ctxvis->depthBits && ctxvis->depthBits != bufvis->depthBits) - return GL_FALSE; -#endif - if (ctxvis->stencilBits && ctxvis->stencilBits != bufvis->stencilBits) - return GL_FALSE; +#define check_component(foo) \ + if (ctxvis->foo && bufvis->foo && \ + ctxvis->foo != bufvis->foo) \ + return GL_FALSE + + check_component(redMask); + check_component(greenMask); + check_component(blueMask); + check_component(depthBits); + check_component(stencilBits); + +#undef check_component return GL_TRUE; } diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index ddc25d812c7..78899ecccad 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -91,6 +91,7 @@ EXT(ARB_point_sprite , ARB_point_sprite EXT(ARB_program_interface_query , dummy_true , GLL, GLC, x , x , 2012) EXT(ARB_provoking_vertex , EXT_provoking_vertex , GLL, GLC, x , x , 2009) EXT(ARB_query_buffer_object , ARB_query_buffer_object , GLL, GLC, x , x , 2013) +EXT(ARB_robust_buffer_access_behavior , ARB_robust_buffer_access_behavior , GLL, GLC, x , x , 2012) EXT(ARB_robustness , dummy_true , GLL, GLC, x , x , 2010) EXT(ARB_sample_shading , ARB_sample_shading , GLL, GLC, x , x , 2009) EXT(ARB_sampler_objects , dummy_true , GLL, GLC, x , x , 2009) diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c index 14cd58870f7..fe54109322d 100644 --- a/src/mesa/main/imports.c +++ b/src/mesa/main/imports.c @@ -262,7 +262,7 @@ ffsll(long long int val) { int bit; - assert(sizeof(val) == 8); + STATIC_ASSERT(sizeof(val) == 8); bit = ffs((int) val); if (bit != 0) diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 6c09948af04..eec057e0137 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3771,6 +3771,7 @@ struct gl_extensions GLboolean ARB_pipeline_statistics_query; GLboolean ARB_point_sprite; GLboolean ARB_query_buffer_object; + GLboolean ARB_robust_buffer_access_behavior; GLboolean ARB_sample_shading; GLboolean ARB_seamless_cube_map; GLboolean ARB_shader_atomic_counter_ops; diff --git a/src/mesa/main/samplerobj.c b/src/mesa/main/samplerobj.c index ca366d967ab..74761953044 100644 --- a/src/mesa/main/samplerobj.c +++ b/src/mesa/main/samplerobj.c @@ -1171,8 +1171,9 @@ _mesa_SamplerParameterIiv(GLuint sampler, GLenum pname, const GLint *params) sampObj = _mesa_lookup_samplerobj(ctx, sampler); if (!sampObj) { - _mesa_error(ctx, GL_INVALID_VALUE, "glSamplerParameterIiv(sampler %u)", - sampler); + _mesa_error(ctx, (_mesa_is_gles(ctx) ? + GL_INVALID_OPERATION : GL_INVALID_VALUE), + "glSamplerParameterIiv(sampler %u)", sampler); return; } @@ -1257,8 +1258,9 @@ _mesa_SamplerParameterIuiv(GLuint sampler, GLenum pname, const GLuint *params) sampObj = _mesa_lookup_samplerobj(ctx, sampler); if (!sampObj) { - _mesa_error(ctx, GL_INVALID_VALUE, "glSamplerParameterIuiv(sampler %u)", - sampler); + _mesa_error(ctx, (_mesa_is_gles(ctx) ? + GL_INVALID_OPERATION : GL_INVALID_VALUE), + "glSamplerParameterIuiv(sampler %u)", sampler); return; } diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index 2af3653f7bb..b9c1bcbbc6e 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -361,7 +361,7 @@ compute_version(const struct gl_extensions *extensions, extensions->ARB_fragment_layer_viewport && extensions->ARB_framebuffer_no_attachments && extensions->ARB_internalformat_query2 && - /* extensions->ARB_robust_buffer_access_behavior */ 0 && + extensions->ARB_robust_buffer_access_behavior && extensions->ARB_shader_image_size && extensions->ARB_shader_storage_buffer_object && extensions->ARB_stencil_texturing && diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c index 202b4eeeefa..8bbc2f0af4b 100644 --- a/src/mesa/state_tracker/st_cb_bufferobjects.c +++ b/src/mesa/state_tracker/st_cb_bufferobjects.c @@ -98,7 +98,7 @@ static void st_bufferobj_subdata(struct gl_context *ctx, GLintptrARB offset, GLsizeiptrARB size, - const GLvoid * data, struct gl_buffer_object *obj) + const void * data, struct gl_buffer_object *obj) { struct st_buffer_object *st_obj = st_buffer_object(obj); @@ -142,7 +142,7 @@ static void st_bufferobj_get_subdata(struct gl_context *ctx, GLintptrARB offset, GLsizeiptrARB size, - GLvoid * data, struct gl_buffer_object *obj) + void * data, struct gl_buffer_object *obj) { struct st_buffer_object *st_obj = st_buffer_object(obj); @@ -175,7 +175,7 @@ static GLboolean st_bufferobj_data(struct gl_context *ctx, GLenum target, GLsizeiptrARB size, - const GLvoid * data, + const void * data, GLenum usage, GLbitfield storageFlags, struct gl_buffer_object *obj) @@ -513,7 +513,7 @@ st_copy_buffer_subdata(struct gl_context *ctx, static void st_clear_buffer_subdata(struct gl_context *ctx, GLintptr offset, GLsizeiptr size, - const GLvoid *clearValue, + const void *clearValue, GLsizeiptr clearValueSize, struct gl_buffer_object *bufObj) { diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c index 55801469f23..362cef46286 100644 --- a/src/mesa/state_tracker/st_cb_clear.c +++ b/src/mesa/state_tracker/st_cb_clear.c @@ -206,6 +206,7 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers) CSO_BIT_STREAM_OUTPUTS | CSO_BIT_VERTEX_ELEMENTS | CSO_BIT_AUX_VERTEX_BUFFER_SLOT | + CSO_BIT_PAUSE_QUERIES | CSO_BITS_ALL_SHADERS)); /* blend state: RGBA masking */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 01ed5441d11..c3e05bbb7ce 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -379,12 +379,12 @@ static struct pipe_resource * make_texture(struct st_context *st, GLsizei width, GLsizei height, GLenum format, GLenum type, const struct gl_pixelstore_attrib *unpack, - const GLvoid *pixels) + const void *pixels) { struct gl_context *ctx = st->ctx; struct pipe_context *pipe = st->pipe; mesa_format mformat; - struct pipe_resource *pt; + struct pipe_resource *pt = NULL; enum pipe_format pipeFormat; GLenum baseInternalFormat; @@ -403,10 +403,18 @@ make_texture(struct st_context *st, unpack->SkipRows == 0 && unpack->SwapBytes == GL_FALSE && st->drawpix_cache.image) { + assert(st->drawpix_cache.texture); + /* check if the pixel data is the same */ if (memcmp(pixels, st->drawpix_cache.image, width * height * bpp) == 0) { /* OK, re-use the cached texture */ - return st->drawpix_cache.texture; + pipe_resource_reference(&pt, st->drawpix_cache.texture); + /* refcount of returned texture should be at least two here. One + * reference for the cache to hold on to, one for the caller (which + * it will release), and possibly more held by the driver. + */ + assert(pt->reference.count >= 2); + return pt; } } @@ -525,8 +533,14 @@ make_texture(struct st_context *st, st->drawpix_cache.image = malloc(width * height * bpp); if (st->drawpix_cache.image) { memcpy(st->drawpix_cache.image, pixels, width * height * bpp); + pipe_resource_reference(&st->drawpix_cache.texture, pt); + } + else { + /* out of memory, free/disable cached texture */ + st->drawpix_cache.width = 0; + st->drawpix_cache.height = 0; + pipe_resource_reference(&st->drawpix_cache.texture, NULL); } - st->drawpix_cache.texture = pt; } #endif @@ -744,7 +758,7 @@ static void draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, const struct gl_pixelstore_attrib *unpack, - const GLvoid *pixels) + const void *pixels) { struct st_context *st = st_context(ctx); struct pipe_context *pipe = st->pipe; @@ -798,7 +812,7 @@ draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y, for (row = 0; row < height; row++) { GLfloat *zValuesFloat = (GLfloat*)zValues; GLenum destType = GL_UNSIGNED_BYTE; - const GLvoid *source = _mesa_image_address2d(&clippedUnpack, pixels, + const void *source = _mesa_image_address2d(&clippedUnpack, pixels, width, height, format, type, row, 0); @@ -1041,7 +1055,7 @@ static void st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, - const struct gl_pixelstore_attrib *unpack, const GLvoid *pixels) + const struct gl_pixelstore_attrib *unpack, const void *pixels) { void *driver_vp, *driver_fp; struct st_context *st = st_context(ctx); @@ -1160,9 +1174,8 @@ st_DrawPixels(struct gl_context *ctx, GLint x, GLint y, if (num_sampler_view > 1) pipe_sampler_view_reference(&sv[1], NULL); -#if !USE_DRAWPIXELS_CACHE + /* free the texture (but may persist in the cache) */ pipe_resource_reference(&pt, NULL); -#endif } diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c index ff570e0e444..456ad83818b 100644 --- a/src/mesa/state_tracker/st_cb_fbo.c +++ b/src/mesa/state_tracker/st_cb_fbo.c @@ -40,6 +40,7 @@ #include "main/glformats.h" #include "main/macros.h" #include "main/renderbuffer.h" +#include "main/state.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -729,6 +730,7 @@ st_ReadBuffer(struct gl_context *ctx, GLenum buffer) fb->Attachment[fb->_ColorReadBufferIndex].Type == GL_NONE) { /* add the buffer */ st_manager_add_color_renderbuffer(st, fb, fb->_ColorReadBufferIndex); + _mesa_update_state(ctx); st_validate_state(st, ST_PIPELINE_RENDER); } } diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c index 5153c4bbba1..393b881ea4c 100644 --- a/src/mesa/state_tracker/st_cb_readpixels.c +++ b/src/mesa/state_tracker/st_cb_readpixels.c @@ -85,7 +85,7 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei height, GLenum format, GLenum type, const struct gl_pixelstore_attrib *pack, - GLvoid *pixels) + void *pixels) { struct st_context *st = st_context(ctx); struct gl_renderbuffer *rb = @@ -238,7 +238,7 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y, GLuint row; for (row = 0; row < (unsigned) height; row++) { - GLvoid *dest = _mesa_image_address2d(pack, pixels, + void *dest = _mesa_image_address2d(pack, pixels, width, height, format, type, row, 0); memcpy(dest, map, bytesPerRow); diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index 3980f5d2f51..a18b08b3226 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -1345,6 +1345,7 @@ try_pbo_upload_common(struct gl_context *ctx, CSO_BIT_DEPTH_STENCIL_ALPHA | CSO_BIT_RASTERIZER | CSO_BIT_STREAM_OUTPUTS | + CSO_BIT_PAUSE_QUERIES | CSO_BITS_ALL_SHADERS)); cso_save_constant_buffer_slot0(cso, PIPE_SHADER_FRAGMENT); @@ -1845,7 +1846,7 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims, /* 1D array textures. * We need to convert gallium coords to GL coords. */ - GLvoid *src = _mesa_image_address2d(unpack, pixels, + void *src = _mesa_image_address2d(unpack, pixels, width, depth, format, type, slice, 0); memcpy(map, src, bytesPerRow); @@ -1854,7 +1855,7 @@ st_TexSubImage(struct gl_context *ctx, GLuint dims, ubyte *slice_map = map; for (row = 0; row < (unsigned) height; row++) { - GLvoid *src = _mesa_image_address(dims, unpack, pixels, + void *src = _mesa_image_address(dims, unpack, pixels, width, height, format, type, slice, row, 0); memcpy(slice_map, src, bytesPerRow); @@ -1928,7 +1929,7 @@ st_CompressedTexSubImage(struct gl_context *ctx, GLuint dims, struct gl_texture_image *texImage, GLint x, GLint y, GLint z, GLsizei w, GLsizei h, GLsizei d, - GLenum format, GLsizei imageSize, const GLvoid *data) + GLenum format, GLsizei imageSize, const void *data) { struct st_context *st = st_context(ctx); struct st_texture_image *stImage = st_texture_image(texImage); @@ -2053,7 +2054,7 @@ fallback: static void st_CompressedTexImage(struct gl_context *ctx, GLuint dims, struct gl_texture_image *texImage, - GLsizei imageSize, const GLvoid *data) + GLsizei imageSize, const void *data) { prep_teximage(ctx, texImage, GL_NONE, GL_NONE); @@ -2106,7 +2107,7 @@ static void st_GetTexSubImage(struct gl_context * ctx, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLint depth, - GLenum format, GLenum type, GLvoid * pixels, + GLenum format, GLenum type, void * pixels, struct gl_texture_image *texImage) { struct st_context *st = st_context(ctx); @@ -2319,7 +2320,7 @@ st_GetTexSubImage(struct gl_context * ctx, /* 1D array textures. * We need to convert gallium coords to GL coords. */ - GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels, + void *dest = _mesa_image_address3d(&ctx->Pack, pixels, width, depth, format, type, 0, slice, 0); memcpy(dest, map, bytesPerRow); @@ -2328,7 +2329,7 @@ st_GetTexSubImage(struct gl_context * ctx, ubyte *slice_map = map; for (row = 0; row < height; row++) { - GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels, + void *dest = _mesa_image_address3d(&ctx->Pack, pixels, width, height, format, type, slice, row, 0); memcpy(dest, slice_map, bytesPerRow); @@ -2363,7 +2364,7 @@ st_GetTexSubImage(struct gl_context * ctx, /* 1D array textures. * We need to convert gallium coords to GL coords. */ - GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels, + void *dest = _mesa_image_address3d(&ctx->Pack, pixels, width, depth, format, type, 0, slice, 0); @@ -2377,7 +2378,7 @@ st_GetTexSubImage(struct gl_context * ctx, } else { for (row = 0; row < height; row++) { - GLvoid *dest = _mesa_image_address3d(&ctx->Pack, pixels, + void *dest = _mesa_image_address3d(&ctx->Pack, pixels, width, height, format, type, slice, row, 0); @@ -3085,7 +3086,7 @@ st_ClearTexSubImage(struct gl_context *ctx, struct gl_texture_image *texImage, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, - const GLvoid *clearValue) + const void *clearValue) { static const char zeros[16] = {0}; struct st_texture_image *stImage = st_texture_image(texImage); diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 287894317df..6d407d33eff 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -589,6 +589,7 @@ void st_init_extensions(struct pipe_screen *screen, { o(ARB_pipeline_statistics_query), PIPE_CAP_QUERY_PIPELINE_STATISTICS }, { o(ARB_point_sprite), PIPE_CAP_POINT_SPRITE }, { o(ARB_query_buffer_object), PIPE_CAP_QUERY_BUFFER_OBJECT }, + { o(ARB_robust_buffer_access_behavior), PIPE_CAP_ROBUST_BUFFER_ACCESS_BEHAVIOR }, { o(ARB_sample_shading), PIPE_CAP_SAMPLE_SHADING }, { o(ARB_seamless_cube_map), PIPE_CAP_SEAMLESS_CUBE_MAP }, { o(ARB_shader_draw_parameters), PIPE_CAP_DRAW_PARAMETERS }, diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index b9ab7ae9919..5f037daea76 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -5192,43 +5192,72 @@ struct st_translate { }; /** Map Mesa's SYSTEM_VALUE_x to TGSI_SEMANTIC_x */ -const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = { - /* Vertex shader - */ - TGSI_SEMANTIC_VERTEXID, - TGSI_SEMANTIC_INSTANCEID, - TGSI_SEMANTIC_VERTEXID_NOBASE, - TGSI_SEMANTIC_BASEVERTEX, - TGSI_SEMANTIC_BASEINSTANCE, - TGSI_SEMANTIC_DRAWID, - - /* Geometry shader - */ - TGSI_SEMANTIC_INVOCATIONID, - - /* Fragment shader - */ - TGSI_SEMANTIC_POSITION, - TGSI_SEMANTIC_FACE, - TGSI_SEMANTIC_SAMPLEID, - TGSI_SEMANTIC_SAMPLEPOS, - TGSI_SEMANTIC_SAMPLEMASK, - TGSI_SEMANTIC_HELPER_INVOCATION, - - /* Tessellation shaders - */ - TGSI_SEMANTIC_TESSCOORD, - TGSI_SEMANTIC_VERTICESIN, - TGSI_SEMANTIC_PRIMID, - TGSI_SEMANTIC_TESSOUTER, - TGSI_SEMANTIC_TESSINNER, +unsigned +_mesa_sysval_to_semantic(unsigned sysval) +{ + switch (sysval) { + /* Vertex shader */ + case SYSTEM_VALUE_VERTEX_ID: + return TGSI_SEMANTIC_VERTEXID; + case SYSTEM_VALUE_INSTANCE_ID: + return TGSI_SEMANTIC_INSTANCEID; + case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE: + return TGSI_SEMANTIC_VERTEXID_NOBASE; + case SYSTEM_VALUE_BASE_VERTEX: + return TGSI_SEMANTIC_BASEVERTEX; + case SYSTEM_VALUE_BASE_INSTANCE: + return TGSI_SEMANTIC_BASEINSTANCE; + case SYSTEM_VALUE_DRAW_ID: + return TGSI_SEMANTIC_DRAWID; + + /* Geometry shader */ + case SYSTEM_VALUE_INVOCATION_ID: + return TGSI_SEMANTIC_INVOCATIONID; + + /* Fragment shader */ + case SYSTEM_VALUE_FRAG_COORD: + return TGSI_SEMANTIC_POSITION; + case SYSTEM_VALUE_FRONT_FACE: + return TGSI_SEMANTIC_FACE; + case SYSTEM_VALUE_SAMPLE_ID: + return TGSI_SEMANTIC_SAMPLEID; + case SYSTEM_VALUE_SAMPLE_POS: + return TGSI_SEMANTIC_SAMPLEPOS; + case SYSTEM_VALUE_SAMPLE_MASK_IN: + return TGSI_SEMANTIC_SAMPLEMASK; + case SYSTEM_VALUE_HELPER_INVOCATION: + return TGSI_SEMANTIC_HELPER_INVOCATION; + + /* Tessellation shader */ + case SYSTEM_VALUE_TESS_COORD: + return TGSI_SEMANTIC_TESSCOORD; + case SYSTEM_VALUE_VERTICES_IN: + return TGSI_SEMANTIC_VERTICESIN; + case SYSTEM_VALUE_PRIMITIVE_ID: + return TGSI_SEMANTIC_PRIMID; + case SYSTEM_VALUE_TESS_LEVEL_OUTER: + return TGSI_SEMANTIC_TESSOUTER; + case SYSTEM_VALUE_TESS_LEVEL_INNER: + return TGSI_SEMANTIC_TESSINNER; + + /* Compute shader */ + case SYSTEM_VALUE_LOCAL_INVOCATION_ID: + return TGSI_SEMANTIC_THREAD_ID; + case SYSTEM_VALUE_WORK_GROUP_ID: + return TGSI_SEMANTIC_BLOCK_ID; + case SYSTEM_VALUE_NUM_WORK_GROUPS: + return TGSI_SEMANTIC_GRID_SIZE; + + /* Unhandled */ + case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX: + case SYSTEM_VALUE_GLOBAL_INVOCATION_ID: + case SYSTEM_VALUE_VERTEX_CNT: + default: + assert(!"Unexpected SYSTEM_VALUE_ enum"); + return TGSI_SEMANTIC_COUNT; + } +} - /* Compute shaders - */ - TGSI_SEMANTIC_THREAD_ID, - TGSI_SEMANTIC_BLOCK_ID, - TGSI_SEMANTIC_GRID_SIZE, -}; /** * Make note of a branch to a label in the TGSI code. @@ -6000,35 +6029,6 @@ st_translate_program( assert(numInputs <= ARRAY_SIZE(t->inputs)); assert(numOutputs <= ARRAY_SIZE(t->outputs)); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_FRONT_FACE] == - TGSI_SEMANTIC_FACE); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_VERTEX_ID] == - TGSI_SEMANTIC_VERTEXID); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_INSTANCE_ID] == - TGSI_SEMANTIC_INSTANCEID); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_ID] == - TGSI_SEMANTIC_SAMPLEID); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_POS] == - TGSI_SEMANTIC_SAMPLEPOS); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_SAMPLE_MASK_IN] == - TGSI_SEMANTIC_SAMPLEMASK); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_INVOCATION_ID] == - TGSI_SEMANTIC_INVOCATIONID); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE] == - TGSI_SEMANTIC_VERTEXID_NOBASE); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_BASE_VERTEX] == - TGSI_SEMANTIC_BASEVERTEX); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] == - TGSI_SEMANTIC_TESSCOORD); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_HELPER_INVOCATION] == - TGSI_SEMANTIC_HELPER_INVOCATION); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_LOCAL_INVOCATION_ID] == - TGSI_SEMANTIC_THREAD_ID); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_WORK_GROUP_ID] == - TGSI_SEMANTIC_BLOCK_ID); - assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_NUM_WORK_GROUPS] == - TGSI_SEMANTIC_GRID_SIZE); - t = CALLOC_STRUCT(st_translate); if (!t) { ret = PIPE_ERROR_OUT_OF_MEMORY; @@ -6215,7 +6215,7 @@ st_translate_program( for (i = 0; sysInputs; i++) { if (sysInputs & (1 << i)) { - unsigned semName = _mesa_sysval_to_semantic[i]; + unsigned semName = _mesa_sysval_to_semantic(i); t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0); diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h index 729295bcb52..774588a111b 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.h +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h @@ -63,7 +63,8 @@ st_translate_stream_output_info(struct glsl_to_tgsi_visitor *glsl_to_tgsi, const GLuint outputMapping[], struct pipe_stream_output_info *so); -extern const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX]; +unsigned +_mesa_sysval_to_semantic(unsigned sysval); #ifdef __cplusplus } diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c index 7a686b199d5..e1c79a57b0a 100644 --- a/src/mesa/state_tracker/st_mesa_to_tgsi.c +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c @@ -1074,7 +1074,7 @@ st_translate_mesa_program( for (i = 0; sysInputs; i++) { if (sysInputs & (1 << i)) { - unsigned semName = _mesa_sysval_to_semantic[i]; + unsigned semName = _mesa_sysval_to_semantic(i); t->systemValues[i] = ureg_DECL_system_value(ureg, semName, 0); diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c index b9abebfc7bf..08f25535ae1 100644 --- a/src/mesa/state_tracker/st_vdpau.c +++ b/src/mesa/state_tracker/st_vdpau.c @@ -55,7 +55,7 @@ #include "state_tracker/drm_driver.h" static struct pipe_resource * -st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface, +st_vdpau_video_surface_gallium(struct gl_context *ctx, const void *vdpSurface, GLuint index) { int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); @@ -86,7 +86,7 @@ st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface, } static struct pipe_resource * -st_vdpau_output_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface) +st_vdpau_output_surface_gallium(struct gl_context *ctx, const void *vdpSurface) { int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); uint32_t device = (uintptr_t)ctx->vdpDevice; @@ -135,7 +135,7 @@ st_vdpau_resource_from_description(struct gl_context *ctx, } static struct pipe_resource * -st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface) +st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const void *vdpSurface) { int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); uint32_t device = (uintptr_t)ctx->vdpDevice; @@ -154,7 +154,7 @@ st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface } static struct pipe_resource * -st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface, +st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const void *vdpSurface, GLuint index) { int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); @@ -177,7 +177,7 @@ static void st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access, GLboolean output, struct gl_texture_object *texObj, struct gl_texture_image *texImage, - const GLvoid *vdpSurface, GLuint index) + const void *vdpSurface, GLuint index) { struct st_context *st = st_context(ctx); struct st_texture_object *stObj = st_texture_object(texObj); @@ -250,7 +250,7 @@ static void st_vdpau_unmap_surface(struct gl_context *ctx, GLenum target, GLenum access, GLboolean output, struct gl_texture_object *texObj, struct gl_texture_image *texImage, - const GLvoid *vdpSurface, GLuint index) + const void *vdpSurface, GLuint index) { struct st_context *st = st_context(ctx); struct st_texture_object *stObj = st_texture_object(texObj); |