aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorBrian Paul <[email protected]>2005-05-07 16:59:58 +0000
committerBrian Paul <[email protected]>2005-05-07 16:59:58 +0000
commit42fa81275c67d7d1ad8d255120af0ffeeb46b963 (patch)
tree1c786fb74f62263d6a1a312178bcef74daeffa5e /src
parente3f684b753c94d8657a1487655b41fdfc0119dba (diff)
x86-64 transform optimizations (Mikko T.)
Diffstat (limited to 'src')
-rw-r--r--src/mesa/Makefile2
-rw-r--r--src/mesa/math/m_debug_util.h38
-rw-r--r--src/mesa/math/m_debug_xform.c10
-rw-r--r--src/mesa/math/m_xform.c6
-rw-r--r--src/mesa/sources6
-rw-r--r--src/mesa/x86-64/Makefile29
-rw-r--r--src/mesa/x86-64/calling_convention.txt50
-rw-r--r--src/mesa/x86-64/matypes.h164
-rw-r--r--src/mesa/x86-64/x86-64.c115
-rw-r--r--src/mesa/x86-64/x86-64.h32
-rw-r--r--src/mesa/x86-64/xform4.S458
-rw-r--r--src/mesa/x86/assyntax.h12
-rw-r--r--src/mesa/x86/gen_matypes.c4
-rw-r--r--src/mesa/x86/glapi_x86.S10
14 files changed, 915 insertions, 21 deletions
diff --git a/src/mesa/Makefile b/src/mesa/Makefile
index 49fd88f1141..3ff8da7e4d6 100644
--- a/src/mesa/Makefile
+++ b/src/mesa/Makefile
@@ -146,6 +146,7 @@ osmesa-only: depend subdirs $(LIB_DIR)/$(OSMESA_LIB_NAME)
subdirs:
@ (cd x86 ; $(MAKE))
+ @ (cd x86-64 ; $(MAKE))
# Make the GL library
$(LIB_DIR)/$(GL_LIB_NAME): $(STAND_ALONE_OBJECTS)
@@ -223,5 +224,6 @@ clean:
-rm -f drivers/*/*.o
(cd drivers/dri ; $(MAKE) clean)
(cd x86 ; $(MAKE) clean)
+ (cd x86-64 ; $(MAKE) clean)
include depend
diff --git a/src/mesa/math/m_debug_util.h b/src/mesa/math/m_debug_util.h
index c07cdcf7ba7..765f54dfb5a 100644
--- a/src/mesa/math/m_debug_util.h
+++ b/src/mesa/math/m_debug_util.h
@@ -185,6 +185,44 @@ extern char *mesa_profile;
#endif
+#elif defined(__amd64__)
+/* Read the CPU time-stamp counter into (val).  rdtsc delivers the value as
+ * two 32-bit halves (eax = low half, edx = high half), which are recombined
+ * here; this relies on unsigned long being 64 bits wide.
+ */
+#define rdtscll(val) do { \
+ unsigned int a,d; \
+ __asm__ volatile("rdtsc" : "=a" (a), "=d" (d)); \
+ (val) = ((unsigned long)a) | (((unsigned long)d)<<32); \
+} while(0)
+
+/* Copied from i386 PIII version.
+ * Measures the cost of reading the cycle counter itself: 16 times, take two
+ * back-to-back rdtscll() readings and keep the smallest difference seen in
+ * counter_overhead.
+ */
+#define INIT_COUNTER() \
+ do { \
+ int cycle_i; \
+ counter_overhead = LONG_MAX; \
+ for ( cycle_i = 0 ; cycle_i < 16 ; cycle_i++ ) { \
+ unsigned long cycle_tmp1, cycle_tmp2; \
+ rdtscll(cycle_tmp1); \
+ rdtscll(cycle_tmp2); \
+ if ( counter_overhead > (cycle_tmp2 - cycle_tmp1) ) { \
+ counter_overhead = cycle_tmp2 - cycle_tmp1; \
+ } \
+ } \
+ } while (0)
+
+/* BEGIN_RACE(x) ... END_RACE(x) bracket the code to be timed.  The bracketed
+ * code runs 10 times; x ends up holding the smallest cycle count observed,
+ * minus counter_overhead.  BEGIN_RACE opens a for-loop body that END_RACE
+ * closes, so the two must always be used as a pair.  cycle_i is not declared
+ * by these macros and has to be in scope where they are used.
+ */
+#define BEGIN_RACE(x) \
+ x = LONG_MAX; \
+ for ( cycle_i = 0 ; cycle_i < 10 ; cycle_i++ ) { \
+ unsigned long cycle_tmp1, cycle_tmp2; \
+ rdtscll(cycle_tmp1); \
+
+#define END_RACE(x) \
+ rdtscll(cycle_tmp2); \
+ if ( x > (cycle_tmp2 - cycle_tmp1) ) { \
+ x = cycle_tmp2 - cycle_tmp1; \
+ } \
+ } \
+ x -= counter_overhead;
+
#elif defined(__sparc__)
#define INIT_COUNTER() \
diff --git a/src/mesa/math/m_debug_xform.c b/src/mesa/math/m_debug_xform.c
index b634527b24d..d8250f246ef 100644
--- a/src/mesa/math/m_debug_xform.c
+++ b/src/mesa/math/m_debug_xform.c
@@ -166,7 +166,7 @@ ALIGN16(static GLfloat, d[TEST_COUNT][4]);
ALIGN16(static GLfloat, r[TEST_COUNT][4]);
static int test_transform_function( transform_func func, int psize,
- int mtype, long *cycles )
+ int mtype, unsigned long *cycles )
{
GLvector4f source[1], dest[1], ref[1];
GLmatrix mat[1];
@@ -187,7 +187,7 @@ static int test_transform_function( transform_func func, int psize,
mat->type = mtypes[mtype];
m = mat->m;
- ASSERT( ((GLuint)m & 15) == 0 );
+ ASSERT( ((long)m & 15) == 0 );
init_matrix( m );
@@ -279,7 +279,7 @@ static int test_transform_function( transform_func func, int psize,
void _math_test_all_transform_functions( char *description )
{
int psize, mtype;
- long benchmark_tab[4][7];
+ unsigned long benchmark_tab[4][7];
static int first_time = 1;
if ( first_time ) {
@@ -291,7 +291,7 @@ void _math_test_all_transform_functions( char *description )
if ( mesa_profile ) {
if ( !counter_overhead ) {
INIT_COUNTER();
- _mesa_printf("counter overhead: %ld cycles\n\n", counter_overhead );
+ _mesa_printf("counter overhead: %lu cycles\n\n", counter_overhead );
}
_mesa_printf("transform results after hooking in %s functions:\n", description );
}
@@ -310,7 +310,7 @@ void _math_test_all_transform_functions( char *description )
for ( mtype = 0 ; mtype < 7 ; mtype++ ) {
for ( psize = 1 ; psize <= 4 ; psize++ ) {
transform_func func = _mesa_transform_tab[psize][mtypes[mtype]];
- long *cycles = &(benchmark_tab[psize-1][mtype]);
+ unsigned long *cycles = &(benchmark_tab[psize-1][mtype]);
if ( test_transform_function( func, psize, mtype, cycles ) == 0 ) {
char buf[100];
diff --git a/src/mesa/math/m_xform.c b/src/mesa/math/m_xform.c
index 66dc44d9541..5366e34989c 100644
--- a/src/mesa/math/m_xform.c
+++ b/src/mesa/math/m_xform.c
@@ -51,6 +51,10 @@
#include "x86/common_x86_asm.h"
#endif
+#ifdef USE_X86_64_ASM
+#include "x86-64/x86-64.h"
+#endif
+
#ifdef USE_SPARC_ASM
#include "sparc/sparc.h"
#endif
@@ -212,6 +216,8 @@ _math_init_transformation( void )
_mesa_init_all_sparc_transform_asm();
#elif defined( USE_PPC_ASM )
_mesa_init_all_ppc_transform_asm();
+#elif defined( USE_X86_64_ASM )
+ _mesa_init_all_x86_64_transform_asm();
#endif
}
diff --git a/src/mesa/sources b/src/mesa/sources
index c4249a7e491..f2f3b6b642c 100644
--- a/src/mesa/sources
+++ b/src/mesa/sources
@@ -197,7 +197,8 @@ ASM_C_SOURCES = \
x86/3dnow.c \
x86/sse.c \
sparc/sparc.c \
- ppc/common_ppc.c
+ ppc/common_ppc.c \
+ x86-64/x86-64.c
X86_SOURCES = \
x86/common_x86_asm.S \
@@ -222,6 +223,9 @@ X86_SOURCES = \
X86_API = \
x86/glapi_x86.S
+X86-64_SOURCES = \
+ x86-64/xform4.S
+
SPARC_SOURCES = \
sparc/clip.S \
sparc/norm.S \
diff --git a/src/mesa/x86-64/Makefile b/src/mesa/x86-64/Makefile
new file mode 100644
index 00000000000..252218ca86b
--- /dev/null
+++ b/src/mesa/x86-64/Makefile
@@ -0,0 +1,29 @@
+# src/mesa/x86-64/Makefile
+
+TOP = ../../..
+
+include $(TOP)/configs/current
+
+
+
+INCLUDE_DIRS = \
+ -I$(TOP)/include/GL \
+ -I$(TOP)/include \
+ -I.. \
+ -I../main \
+ -I../math \
+ -I../glapi \
+ -I../tnl
+
+
+default: matypes.h
+
+clean:
+ rm -f matypes.h
+
+
+# need some special rules here, unfortunately
+matypes.h: ../main/mtypes.h ../tnl/t_context.h ../x86/gen_matypes
+ ../x86/gen_matypes | grep -v '#include "assyntax.h' > matypes.h
+
+xform4.o: matypes.h
diff --git a/src/mesa/x86-64/calling_convention.txt b/src/mesa/x86-64/calling_convention.txt
new file mode 100644
index 00000000000..4147f7eba91
--- /dev/null
+++ b/src/mesa/x86-64/calling_convention.txt
@@ -0,0 +1,50 @@
+Register Usage
+rax temporary register; with variable arguments passes information
+ about the number of SSE registers used; 1st return register
+
+rbx* callee-saved register; optionally used as base pointer
+
+rcx used to pass 4th integer argument to functions
+
+rdx used to pass 3rd argument to functions; 2nd return register
+
+rsp* stack pointer
+
+rbp* callee-saved register; optionally used as frame pointer
+
+rsi used to pass 2nd argument to functions
+
+rdi used to pass 1st argument to functions
+
+r8 used to pass 5th argument to functions
+
+r9 used to pass 6th argument to functions
+
+r10 temporary register, used for passing a function's static chain pointer
+
+r11 temporary register
+
+r12-15* callee-saved registers
+
+xmm0-1 used to pass and return floating point arguments
+
+xmm2-7 used to pass floating point arguments
+
+xmm8-15 temporary registers
+
+mmx0-7 temporary registers
+
+st0 temporary register; used to return long double arguments
+
+st1 temporary register; used to return long double arguments
+
+st2-7 temporary registers
+
+fs Reserved for system use (as thread specific data register)
+
+
+
+*) must be preserved across function calls
+
+Integer arguments from list: rdi,rsi,rdx,rcx,r8,r9,stack
+Floating point arguments from list: xmm0-xmm7
\ No newline at end of file
diff --git a/src/mesa/x86-64/matypes.h b/src/mesa/x86-64/matypes.h
new file mode 100644
index 00000000000..cdface9d3ae
--- /dev/null
+++ b/src/mesa/x86-64/matypes.h
@@ -0,0 +1,164 @@
+/*
+ * This file is automatically generated from the Mesa internal type
+ * definitions. Do not edit directly.
+ */
+
+#ifndef __ASM_TYPES_H__
+#define __ASM_TYPES_H__
+
+
+
+/* =============================================================
+ * Offsets for GLcontext
+ */
+
+#define CTX_DRIVER_CTX 904
+
+#define CTX_LIGHT_ENABLED 38592
+#define CTX_LIGHT_SHADE_MODEL 38596
+#define CTX_LIGHT_COLOR_MAT_FACE 38600
+#define CTX_LIGHT_COLOR_MAT_MODE 38604
+#define CTX_LIGHT_COLOR_MAT_MASK 38608
+#define CTX_LIGHT_COLOR_MAT_ENABLED 38612
+#define CTX_LIGHT_ENABLED_LIST 38616
+#define CTX_LIGHT_NEED_VERTS 42973
+#define CTX_LIGHT_FLAGS 42976
+#define CTX_LIGHT_BASE_COLOR 42980
+
+
+/* =============================================================
+ * Offsets for struct vertex_buffer
+ */
+
+#define VB_SIZE 0
+#define VB_COUNT 4
+
+#define VB_ELTS 8
+#define VB_OBJ_PTR 12
+#define VB_EYE_PTR 16
+#define VB_CLIP_PTR 20
+#define VB_PROJ_CLIP_PTR 24
+#define VB_CLIP_OR_MASK 28
+#define VB_CLIP_MASK 32
+#define VB_NORMAL_PTR 36
+#define VB_EDGE_FLAG 44
+#define VB_TEX0_COORD_PTR 48
+#define VB_TEX1_COORD_PTR 52
+#define VB_TEX2_COORD_PTR 56
+#define VB_TEX3_COORD_PTR 60
+#define VB_INDEX_PTR 80
+#define VB_COLOR_PTR 88
+#define VB_SECONDARY_COLOR_PTR 96
+#define VB_FOG_COORD_PTR 108
+#define VB_POINT_SIZE_PTR 104
+#define VB_PRIMITIVE 112
+
+#define VB_LAST_CLIPPED 244
+
+/*
+ * Flags for struct vertex_buffer
+ */
+
+#define VERT_BIT_OBJ 0x1
+#define VERT_BIT_NORM 0x4
+#define VERT_BIT_RGBA 0x8
+#define VERT_BIT_SPEC_RGB 0x10
+#define VERT_BIT_FOG_COORD 0x20
+#define VERT_BIT_TEX0 0x100
+#define VERT_BIT_TEX1 0x200
+#define VERT_BIT_TEX2 0x400
+#define VERT_BIT_TEX3 0x800
+
+
+/* =============================================================
+ * Offsets for GLvector4f
+ */
+
+#define V4F_DATA 0
+#define V4F_START 4
+#define V4F_COUNT 8
+#define V4F_STRIDE 12
+#define V4F_SIZE 16
+#define V4F_FLAGS 20
+
+/*
+ * Flags for GLvector4f
+ */
+
+#define VEC_MALLOC 0x10
+#define VEC_NOT_WRITEABLE 0x40
+#define VEC_BAD_STRIDE 0x100
+
+#define VEC_SIZE_1 0x1
+#define VEC_SIZE_2 0x3
+#define VEC_SIZE_3 0x7
+#define VEC_SIZE_4 0xf
+
+
+/* =============================================================
+ * Offsets for GLmatrix
+ */
+
+#define MATRIX_DATA 0
+#define MATRIX_INV 4
+#define MATRIX_FLAGS 8
+#define MATRIX_TYPE 12
+
+
+/* =============================================================
+ * Offsets for struct gl_light
+ */
+
+#define LIGHT_NEXT 0
+#define LIGHT_PREV 4
+
+#define LIGHT_AMBIENT 8
+#define LIGHT_DIFFUSE 24
+#define LIGHT_SPECULAR 40
+#define LIGHT_EYE_POSITION 56
+#define LIGHT_EYE_DIRECTION 72
+#define LIGHT_SPOT_EXPONENT 88
+#define LIGHT_SPOT_CUTOFF 92
+#define LIGHT_COS_CUTOFF 96
+#define LIGHT_CONST_ATTEN 100
+#define LIGHT_LINEAR_ATTEN 104
+#define LIGHT_QUADRATIC_ATTEN 108
+#define LIGHT_ENABLED 112
+
+#define LIGHT_FLAGS 116
+
+#define LIGHT_POSITION 120
+#define LIGHT_VP_INF_NORM 136
+#define LIGHT_H_INF_NORM 148
+#define LIGHT_NORM_DIRECTION 160
+#define LIGHT_VP_INF_SPOT_ATTEN 176
+
+#define LIGHT_SPOT_EXP_TABLE 180
+#define LIGHT_MAT_AMBIENT 4276
+#define LIGHT_MAT_DIFFUSE 4300
+#define LIGHT_MAT_SPECULAR 4324
+
+#define SIZEOF_GL_LIGHT 4356
+
+/*
+ * Flags for struct gl_light
+ */
+
+#define LIGHT_SPOT 0x1
+#define LIGHT_LOCAL_VIEWER 0x2
+#define LIGHT_POSITIONAL 0x4
+
+#define LIGHT_NEED_VERTICES 0x6
+
+
+/* =============================================================
+ * Offsets for struct gl_lightmodel
+ */
+
+#define LIGHT_MODEL_AMBIENT 0
+#define LIGHT_MODEL_LOCAL_VIEWER 16
+#define LIGHT_MODEL_TWO_SIDE 17
+#define LIGHT_MODEL_COLOR_CONTROL 20
+
+
+#endif /* __ASM_TYPES_H__ */
diff --git a/src/mesa/x86-64/x86-64.c b/src/mesa/x86-64/x86-64.c
new file mode 100644
index 00000000000..e70bc66274d
--- /dev/null
+++ b/src/mesa/x86-64/x86-64.c
@@ -0,0 +1,115 @@
+/* $Id: x86-64.c,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 6.3
+ *
+ * Copyright (C) 1999-2003 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * x86-64 optimizations shamelessly converted from x86/sse/3dnow assembly by
+ * Mikko Tiihonen
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "glheader.h"
+#include "context.h"
+#include "math/m_xform.h"
+#include "tnl/t_context.h"
+#include "x86-64.h"
+#include "../x86/common_x86_macros.h"
+
+#ifdef DEBUG
+#include "math/m_debug.h"
+#endif
+
+DECLARE_XFORM_GROUP( x86_64, 4 )
+
+#endif
+
+/*
+extern void _mesa_x86_64_transform_points4_general( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_identity( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_perspective( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_3d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d_no_rot( XFORM_ARGS );
+extern void _mesa_x86_64_transform_points4_2d( XFORM_ARGS );
+*/
+/* message(): write msg to stderr when debug output is wanted -- always in
+ * DEBUG builds, otherwise only when _mesa_getenv( "MESA_DEBUG" ) returns
+ * non-NULL (presumably the MESA_DEBUG environment variable).  Only compiled
+ * when USE_X86_64_ASM is defined.
+ */
+#ifdef USE_X86_64_ASM
+static void message( const char *msg )
+{
+ GLboolean debug;
+#ifdef DEBUG
+ debug = GL_TRUE;
+#else
+ if ( _mesa_getenv( "MESA_DEBUG" ) ) {
+ debug = GL_TRUE;
+ } else {
+ debug = GL_FALSE;
+ }
+#endif
+ if ( debug ) {
+ fprintf( stderr, "%s", msg );
+ }
+}
+#endif
+
+/* Hook the x86-64 assembly transform functions into Mesa via
+ * ASSIGN_XFORM_GROUP( x86_64, 4 ) -- presumably the same table entries as the
+ * commented-out assignments below; confirm against common_x86_macros.h.
+ * Does nothing if _mesa_getenv( "MESA_NO_ASM" ) returns non-NULL, and
+ * compiles to an empty function when USE_X86_64_ASM is not defined.
+ * In DEBUG builds the _math_test_all_* self-tests are run afterwards.
+ */
+void _mesa_init_all_x86_64_transform_asm(void)
+{
+#ifdef USE_X86_64_ASM
+
+ if ( _mesa_getenv( "MESA_NO_ASM" ) ) {
+ return;
+ }
+
+ message("Initializing x86-64 optimizations\n");
+
+ ASSIGN_XFORM_GROUP( x86_64, 4 );
+
+ /*
+ _mesa_transform_tab[4][MATRIX_GENERAL] =
+ _mesa_x86_64_transform_points4_general;
+ _mesa_transform_tab[4][MATRIX_IDENTITY] =
+ _mesa_x86_64_transform_points4_identity;
+ _mesa_transform_tab[4][MATRIX_3D] =
+ _mesa_x86_64_transform_points4_3d;
+ _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
+ _mesa_x86_64_transform_points4_3d_no_rot;
+ _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
+ _mesa_x86_64_transform_points4_perspective;
+ _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
+ _mesa_x86_64_transform_points4_2d_no_rot;
+ _mesa_transform_tab[4][MATRIX_2D] =
+ _mesa_x86_64_transform_points4_2d;
+ */
+
+#ifdef DEBUG
+ _math_test_all_transform_functions("x86_64");
+ _math_test_all_cliptest_functions("x86_64");
+ _math_test_all_normal_transform_functions("x86_64");
+#endif
+
+#endif
+}
diff --git a/src/mesa/x86-64/x86-64.h b/src/mesa/x86-64/x86-64.h
new file mode 100644
index 00000000000..fdbd154d5d6
--- /dev/null
+++ b/src/mesa/x86-64/x86-64.h
@@ -0,0 +1,32 @@
+/* $Id: x86-64.h,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __X86_64_ASM_H__
+#define __X86_64_ASM_H__
+
+extern void _mesa_init_all_x86_64_transform_asm( void );
+
+#endif
diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S
new file mode 100644
index 00000000000..622c3f0c251
--- /dev/null
+++ b/src/mesa/x86-64/xform4.S
@@ -0,0 +1,458 @@
+/* $Id: xform4.S,v 1.1 2005/05/07 16:59:59 brianp Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef USE_X86_64_ASM
+
+#include "matypes.h"
+
+.text
+
+.align 16
+
+.globl _mesa_x86_64_transform_points4_general
+_mesa_x86_64_transform_points4_general:
+/*
+ * Transform 4-component points by a full 4x4 matrix.
+ *
+ * rdi = dest
+ * rsi = matrix
+ * rdx = source
+ *
+ * For every source vertex (ox, oy, oz, ow) the result is
+ * ox*(m0..m3) + oy*(m4..m7) + oz*(m8..m11) + ow*(m12..m15).
+ * Source vertices are read 'stride' bytes apart, results are written
+ * 16 bytes apart.  All vector loads/stores use movaps, so the matrix,
+ * source and destination data must be 16-byte aligned.
+ *
+ * NOTE(review): movzx with a memory source carries no explicit operand
+ * size -- confirm the assembler picks the intended width for the stride.
+ * NOTE(review): V4F_START is fetched with a 64-bit movq; with the offsets
+ * in the matypes.h added by this commit (V4F_START 4, V4F_COUNT 8) that
+ * load would overlap the count field, so matypes.h presumably has to be
+ * regenerated for the 64-bit struct layout -- TODO confirm.
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ .byte 0x66, 0x66, 0x66, 0x90 /* manual align += 4 */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ testl %ecx, %ecx /* verify non-zero count */
+ prefetchnta 64(%rsi)
+ jz p4_general_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ prefetch 16(%rdx)
+
+ movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
+ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
+ movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
+
+p4_general_loop:
+
+ movaps (%rdx), %xmm8 /* ow | oz | oy | ox */
+ prefetchw 16(%rdi)
+
+ pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
+ addq %rax, %rdx /* advance src by stride */
+ pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
+ mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
+ mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+ pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
+ mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+ addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
+ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
+ prefetch 16(%rdx)
+ addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+ addq $16, %rdi /* dest vertices are packed, 16 bytes each */
+
+ decl %ecx
+ jnz p4_general_loop
+
+p4_general_done:
+ .byte 0xf3 /* rep prefix; with the ret below this encodes rep ret */
+ ret
+
+.section .rodata
+/* Constants loaded by _mesa_x86_64_transform_points4_3d.
+ * First 16 bytes: AND mask that keeps the three low 32-bit lanes and clears
+ * the highest lane.  Second 16 bytes: three zero lanes followed by 1.0 in
+ * the highest lane, ORed in to set that lane to 1.0.
+ */
+.align 16
+p4_constants:
+.byte 0xff, 0xff, 0xff, 0xff
+.byte 0xff, 0xff, 0xff, 0xff
+.byte 0xff, 0xff, 0xff, 0xff
+.byte 0x00, 0x00, 0x00, 0x00
+
+.byte 0x00, 0x00, 0x00, 0x00
+.byte 0x00, 0x00, 0x00, 0x00
+.byte 0x00, 0x00, 0x00, 0x00
+.float 0f+1.0
+
+.text
+.align 16
+.globl _mesa_x86_64_transform_points4_3d
+/*
+ * Transform 4-component points by a 3D matrix.
+ * rdi = dest, rsi = matrix, rdx = source.
+ *
+ * This is slower than _mesa_x86_64_transform_points4_general because,
+ * before running the same multiply/accumulate loop, it forces matrix
+ * elements m3, m7, m11 and m15 to 0, 0, 0, 1 using the mask and the 1.0
+ * vector from p4_constants.  (Those four elements form the last row if
+ * the 16 floats are stored column by column -- TODO confirm.)
+ */
+_mesa_x86_64_transform_points4_3d:
+
+ leaq p4_constants(%rip), %rax
+
+ prefetchnta 64(%rsi)
+
+ movaps (%rax), %xmm9 /* lane mask: clears the highest lane */
+ movaps 16(%rax), %xmm10 /* 1.0 | 0.0 | 0.0 | 0.0 */
+
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ testl %ecx, %ecx /* verify non-zero count */
+ jz p4_3d_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ prefetch 16(%rdx)
+
+ movaps 0(%rsi), %xmm4 /* m3 | m2 | m1 | m0 */
+ movaps 16(%rsi), %xmm5 /* m7 | m6 | m5 | m4 */
+ andps %xmm9, %xmm4 /* 0.0 | m2 | m1 | m0 */
+ movaps 32(%rsi), %xmm6 /* m11 | m10 | m9 | m8 */
+ andps %xmm9, %xmm5 /* 0.0 | m6 | m5 | m4 */
+ movaps 48(%rsi), %xmm7 /* m15 | m14 | m13 | m12 */
+ andps %xmm9, %xmm6 /* 0.0 | m10 | m9 | m8 */
+ andps %xmm9, %xmm7 /* 0.0 | m14 | m13 | m12 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ orps %xmm10, %xmm7 /* 1.0 | m14 | m13 | m12 */
+
+p4_3d_loop:
+
+ movaps (%rdx), %xmm8 /* ow | oz | oy | ox */
+ prefetchw 16(%rdi)
+
+ pshufd $0x00, %xmm8, %xmm0 /* ox | ox | ox | ox */
+ addq %rax, %rdx /* advance src by stride */
+ pshufd $0x55, %xmm8, %xmm1 /* oy | oy | oy | oy */
+ mulps %xmm4, %xmm0 /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ pshufd $0xAA, %xmm8, %xmm2 /* oz | oz | oz | oz */
+ mulps %xmm5, %xmm1 /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+ pshufd $0xFF, %xmm8, %xmm3 /* ow | ow | ow | ow */
+ mulps %xmm6, %xmm2 /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+ addps %xmm1, %xmm0 /* ox*m3+oy*m7 | ... */
+ mulps %xmm7, %xmm3 /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+ addps %xmm2, %xmm0 /* ox*m3+oy*m7+oz*m11 | ... */
+ prefetch 16(%rdx)
+ addps %xmm3, %xmm0 /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+
+ movaps %xmm0, (%rdi) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+ addq $16, %rdi
+
+ dec %ecx
+ jnz p4_3d_loop
+
+p4_3d_done:
+ .byte 0xf3 /* rep prefix; with the ret below this encodes rep ret */
+ ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_identity
+_mesa_x86_64_transform_points4_identity:
+/*
+ * Identity transform: copy the source vertices to the destination.
+ * rdi = dest, rdx = source; the incoming rsi value is not read and rsi is
+ * reloaded with the source data pointer for movsq.
+ *
+ * The copy moves count*2 quadwords (16 bytes per vertex) in one rep movsq,
+ * so the source is treated as tightly packed.
+ * NOTE(review): the stride loaded into eax is never used -- confirm callers
+ * only pass sources with a 16-byte stride.
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride (not used below) */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ jz p4_identity_done
+
+ movq V4F_START(%rdx), %rsi /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+ prefetch 64(%rsi)
+ prefetchw 64(%rdi)
+
+ add %ecx, %ecx /* two quadwords per vertex */
+
+ rep movsq
+
+p4_identity_done:
+ .byte 0xf3 /* rep prefix; with the ret below this encodes rep ret */
+ ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_3d_no_rot
+_mesa_x86_64_transform_points4_3d_no_rot:
+/*
+ * Transform 4-component points using only the m00, m11, m22 and
+ * m30/m31/m32 matrix elements:
+ *   r0 = x0*m00 + x3*m30     r1 = x1*m11 + x3*m31
+ *   r2 = x2*m22 + x3*m32     r3 = x3
+ * rdi = dest, rsi = matrix, rdx = source.
+ *
+ * NOTE(review): uses MMX registers with 3DNow! instructions (pfmul, pfacc,
+ * pfadd, femms), an AMD extension -- confirm it is available on every CPU
+ * this code can run on.
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ jz p4_3d_no_rot_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ prefetch (%rdx)
+
+ movd (%rsi), %mm0 /* | m00 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ punpckldq 20(%rsi), %mm0 /* m11 | m00 */
+
+ movd 40(%rsi), %mm2 /* | m22 */
+ movq 48(%rsi), %mm1 /* m31 | m30 */
+
+ punpckldq 56(%rsi), %mm2 /* m32 | m22 */
+
+p4_3d_no_rot_loop:
+
+ prefetchw 32(%rdi)
+
+ movq (%rdx), %mm4 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+ movd 12(%rdx), %mm7 /* | x3 */
+
+ movq %mm5, %mm6 /* x3 | x2 */
+ pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
+
+ punpckhdq %mm6, %mm6 /* x3 | x3 */
+ pfmul %mm2, %mm5 /* x3*m32 | x2*m22 */
+
+ pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
+ pfacc %mm7, %mm5 /* x3 | x2*m22+x3*m32 */
+
+ pfadd %mm6, %mm4 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+
+ addq %rax, %rdx /* advance src by stride */
+ movq %mm4, (%rdi) /* write r0, r1 */
+ movq %mm5, 8(%rdi) /* write r2, r3 */
+
+ addq $16, %rdi
+
+ decl %ecx
+ prefetch 32(%rdx)
+ jnz p4_3d_no_rot_loop
+
+p4_3d_no_rot_done:
+ femms
+ ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_perspective
+_mesa_x86_64_transform_points4_perspective:
+/*
+ * Transform 4-component points using only the m00, m11, m20, m21, m22
+ * and m32 matrix elements:
+ *   r0 = x0*m00 + x2*m20     r1 = x1*m11 + x2*m21
+ *   r2 = x2*m22 + x3*m32     r3 = -x2
+ * rdi = dest, rsi = matrix, rdx = source.
+ *
+ * NOTE(review): uses MMX registers with 3DNow! instructions (pfmul, pfsubr,
+ * pfacc, pfadd, femms), an AMD extension -- confirm it is available on
+ * every CPU this code can run on.
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ jz p4_perspective_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ movd (%rsi), %mm0 /* | m00 */
+ pxor %mm7, %mm7 /* 0 | 0 */
+ punpckldq 20(%rsi), %mm0 /* m11 | m00 */
+
+ movq 32(%rsi), %mm2 /* m21 | m20 */
+ prefetch (%rdx)
+
+ movd 40(%rsi), %mm1 /* | m22 */
+
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ punpckldq 56(%rsi), %mm1 /* m32 | m22 */
+
+
+p4_perspective_loop:
+
+ prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+
+ movq (%rdx), %mm4 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+ movd 8(%rdx), %mm3 /* | x2 */
+
+ movq %mm5, %mm6 /* x3 | x2 */
+ pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
+
+ punpckldq %mm5, %mm5 /* x2 | x2 */
+
+ pfmul %mm2, %mm5 /* x2*m21 | x2*m20 */
+ pfsubr %mm7, %mm3 /* | -x2 */
+
+ pfmul %mm1, %mm6 /* x3*m32 | x2*m22 */
+ pfadd %mm4, %mm5 /* x1*m11+x2*m21 | x0*m00+x2*m20 */
+
+ pfacc %mm3, %mm6 /* -x2 | x2*m22+x3*m32 */
+
+ movq %mm5, (%rdi) /* write r0, r1 */
+ addq %rax, %rdx /* advance src by stride */
+ movq %mm6, 8(%rdi) /* write r2, r3 */
+
+ addq $16, %rdi
+
+ decl %ecx
+ prefetch 32(%rdx) /* hopefully stride is zero */
+ jnz p4_perspective_loop
+
+p4_perspective_done:
+ femms
+ ret
+
+.align 16
+.globl _mesa_x86_64_transform_points4_2d_no_rot
+_mesa_x86_64_transform_points4_2d_no_rot:
+/*
+ * Transform 4-component points using only the m00, m11, m30 and m31
+ * matrix elements:
+ *   r0 = x0*m00 + x3*m30     r1 = x1*m11 + x3*m31
+ *   r2 = x2                  r3 = x3
+ * rdi = dest, rsi = matrix, rdx = source.
+ *
+ * NOTE(review): uses MMX registers with 3DNow! instructions (pfmul, pfadd,
+ * femms), an AMD extension -- confirm it is available on every CPU this
+ * code can run on.
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x90 /* manual align += 1 */
+ jz p4_2d_no_rot_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ movd (%rsi), %mm0 /* | m00 */
+ prefetch (%rdx)
+ punpckldq 20(%rsi), %mm0 /* m11 | m00 */
+
+ movq 48(%rsi), %mm1 /* m31 | m30 */
+
+p4_2d_no_rot_loop:
+
+ prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+
+ movq (%rdx), %mm4 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+
+ pfmul %mm0, %mm4 /* x1*m11 | x0*m00 */
+ movq %mm5, %mm6 /* x3 | x2 */
+
+ punpckhdq %mm6, %mm6 /* x3 | x3 */
+
+ addq %rax, %rdx /* advance src by stride */
+ pfmul %mm1, %mm6 /* x3*m31 | x3*m30 */
+
+ prefetch 32(%rdx) /* hopefully stride is zero */
+ pfadd %mm4, %mm6 /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+
+ movq %mm6, (%rdi) /* write r0, r1 */
+ movq %mm5, 8(%rdi) /* write r2, r3 (x2, x3 copied unchanged) */
+
+ addq $16, %rdi
+
+ decl %ecx
+ jnz p4_2d_no_rot_loop
+
+p4_2d_no_rot_done:
+ femms
+ ret
+
+
+.align 16
+.globl _mesa_x86_64_transform_points4_2d
+_mesa_x86_64_transform_points4_2d:
+/*
+ * Transform 4-component points using only the m00, m01, m10, m11, m30
+ * and m31 matrix elements:
+ *   r0 = x0*m00 + x1*m10 + x3*m30     r1 = x0*m01 + x1*m11 + x3*m31
+ *   r2 = x2                           r3 = x3
+ * rdi = dest, rsi = matrix, rdx = source.
+ *
+ * NOTE(review): uses MMX registers with 3DNow! instructions (pfmul, pfacc,
+ * pfadd, femms), an AMD extension -- confirm it is available on every CPU
+ * this code can run on.
+ */
+ movl V4F_COUNT(%rdx), %ecx /* count */
+ movzx V4F_STRIDE(%rdx), %eax /* stride */
+
+ movl %ecx, V4F_COUNT(%rdi) /* set dest count */
+ movl $4, V4F_SIZE(%rdi) /* set dest size */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ orl $VEC_SIZE_4, V4F_FLAGS(%rdi)/* set dest flags */
+
+ test %ecx, %ecx
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ jz p4_2d_done
+
+ movq V4F_START(%rdx), %rdx /* ptr to first src vertex */
+ movq V4F_START(%rdi), %rdi /* ptr to first dest vertex */
+
+ movd (%rsi), %mm0 /* | m00 */
+ movd 4(%rsi), %mm1 /* | m01 */
+
+ prefetch (%rdx)
+
+ punpckldq 16(%rsi), %mm0 /* m10 | m00 */
+ .byte 0x66, 0x66, 0x90 /* manual align += 3 */
+ punpckldq 20(%rsi), %mm1 /* m11 | m01 */
+
+ movq 48(%rsi), %mm2 /* m31 | m30 */
+
+p4_2d_loop:
+
+ prefetchw 32(%rdi) /* prefetch 2 vertices ahead */
+
+ movq (%rdx), %mm3 /* x1 | x0 */
+ movq 8(%rdx), %mm5 /* x3 | x2 */
+
+ movq %mm3, %mm4 /* x1 | x0 */
+ movq %mm5, %mm6 /* x3 | x2 */
+
+ pfmul %mm1, %mm4 /* x1*m11 | x0*m01 */
+ punpckhdq %mm6, %mm6 /* x3 | x3 */
+
+ pfmul %mm0, %mm3 /* x1*m10 | x0*m00 */
+
+ addq %rax, %rdx /* advance src by stride */
+ pfacc %mm4, %mm3 /* x0*m01+x1*m11 | x0*m00+x1*m10 */
+
+ pfmul %mm2, %mm6 /* x3*m31 | x3*m30 */
+ prefetch 32(%rdx) /* hopefully stride is zero */
+
+ pfadd %mm6, %mm3 /* r1 | r0 */
+
+ movq %mm3, (%rdi) /* write r0, r1 */
+ movq %mm5, 8(%rdi) /* write r2, r3 (x2, x3 copied unchanged) */
+
+ addq $16, %rdi
+
+ decl %ecx
+ jnz p4_2d_loop
+
+p4_2d_done:
+ femms
+ ret
+
+#endif
diff --git a/src/mesa/x86/assyntax.h b/src/mesa/x86/assyntax.h
index f89cc6c575a..4b7317b0805 100644
--- a/src/mesa/x86/assyntax.h
+++ b/src/mesa/x86/assyntax.h
@@ -1730,11 +1730,17 @@ SECTION _DATA public align=16 class=DATA use32 flat
#define TLBL(a) CONCAT(a,$)
#endif
-/* hidden symbol visibility support */
+/* Hidden symbol visibility support.
+ * If we build with gcc's -fvisibility=hidden flag, we'll need to change
+ * the symbol visibility mode to 'default'.
+ */
#if defined(GNU_ASSEMBLER) && !defined(__DJGPP__) && !defined(__MINGW32__)
-#define HIDDEN(a) .hidden a
+# define HIDDEN(x) .hidden x
+#elif defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
+# pragma GCC visibility push(default)
+# define HIDDEN(x) .hidden x
#else
-#define HIDDEN(a)
+# define HIDDEN(x)
#endif
#endif /* __ASSYNTAX_H__ */
diff --git a/src/mesa/x86/gen_matypes.c b/src/mesa/x86/gen_matypes.c
index 30642e4b12d..d5cee5347ce 100644
--- a/src/mesa/x86/gen_matypes.c
+++ b/src/mesa/x86/gen_matypes.c
@@ -61,7 +61,7 @@ do { \
printf( "\n" ); \
} while (0)
-#if defined(__BEOS__)
+#if defined(__BEOS__) || defined(_LP64)
#define OFFSET( s, t, m ) \
printf( "#define %s\t%ld\n", s, offsetof( t, m ) );
#else
@@ -69,7 +69,7 @@ do { \
printf( "#define %s\t%d\n", s, offsetof( t, m ) );
#endif
-#if defined(__BEOS__)
+#if defined(__BEOS__) || defined(_LP64)
#define SIZEOF( s, t ) \
printf( "#define %s\t%ld\n", s, sizeof(t) );
#else
diff --git a/src/mesa/x86/glapi_x86.S b/src/mesa/x86/glapi_x86.S
index c0a971bd53b..6e8f32e373d 100644
--- a/src/mesa/x86/glapi_x86.S
+++ b/src/mesa/x86/glapi_x86.S
@@ -29,16 +29,6 @@
#include "assyntax.h"
#include "glapioffsets.h"
-/* If we build with gcc's -fvisibility=hidden flag, we'll need to change
-* the symbol visibility mode to 'default'.
-*/
-#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303
-# pragma GCC visibility push(default)
-# define HIDDEN(x) .hidden x
-#else
-# define HIDDEN(x)
-#endif
-
#ifndef __WIN32__
#if defined(STDCALL_API)