summaryrefslogtreecommitdiffstats
path: root/src/mesa/x86
diff options
context:
space:
mode:
authorGareth Hughes <[email protected]>2001-03-29 06:46:15 +0000
committerGareth Hughes <[email protected]>2001-03-29 06:46:15 +0000
commit1b2fef5c28a40cd001598071e25b876ad4fccdd1 (patch)
tree03d5dd8385ddfc5ad18b4e02f1de6533e202c3a5 /src/mesa/x86
parent8e48a232fe48e4b6855cecb1d02363fb142365ae (diff)
Consolidation of asm code in 3.5
Diffstat (limited to 'src/mesa/x86')
-rw-r--r--src/mesa/x86/3dnow.c131
-rw-r--r--src/mesa/x86/3dnow_normal.S858
-rw-r--r--src/mesa/x86/3dnow_xform1.S423
-rw-r--r--src/mesa/x86/3dnow_xform2.S464
-rw-r--r--src/mesa/x86/3dnow_xform3.S570
-rw-r--r--src/mesa/x86/3dnow_xform4.S588
-rw-r--r--src/mesa/x86/common_x86.c36
-rw-r--r--src/mesa/x86/common_x86_asm.S18
-rw-r--r--src/mesa/x86/common_x86_asm.h6
-rw-r--r--src/mesa/x86/sse.c208
-rw-r--r--src/mesa/x86/sse.h40
-rw-r--r--src/mesa/x86/sse_normal.S252
-rw-r--r--src/mesa/x86/sse_xform1.S433
-rw-r--r--src/mesa/x86/sse_xform2.S452
-rw-r--r--src/mesa/x86/sse_xform3.S498
-rw-r--r--src/mesa/x86/sse_xform4.S226
-rw-r--r--src/mesa/x86/x86.c71
-rw-r--r--src/mesa/x86/x86_xform2.S536
-rw-r--r--src/mesa/x86/x86_xform3.S606
-rw-r--r--src/mesa/x86/x86_xform4.S639
20 files changed, 6916 insertions, 139 deletions
diff --git a/src/mesa/x86/3dnow.c b/src/mesa/x86/3dnow.c
index b6ef3739361..821a33d76cd 100644
--- a/src/mesa/x86/3dnow.c
+++ b/src/mesa/x86/3dnow.c
@@ -1,4 +1,4 @@
-/* $Id: 3dnow.c,v 1.17 2001/03/28 20:44:43 gareth Exp $ */
+/* $Id: 3dnow.c,v 1.18 2001/03/29 06:46:15 gareth Exp $ */
/*
* Mesa 3-D graphics library
@@ -51,31 +51,31 @@
const GLubyte flag
-#define DECLARE_XFORM_GROUP( pfx, sz, masked ) \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_##masked( XFORM_ARGS );
+#define DECLARE_XFORM_GROUP( pfx, sz ) \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS );
-#define ASSIGN_XFORM_GROUP( pfx, cma, sz, masked ) \
- _mesa_transform_tab[cma][sz][MATRIX_GENERAL] = \
- _mesa_##pfx##_transform_points##sz##_general_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_IDENTITY] = \
- _mesa_##pfx##_transform_points##sz##_identity_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_3D_NO_ROT] = \
- _mesa_##pfx##_transform_points##sz##_3d_no_rot_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_PERSPECTIVE] = \
- _mesa_##pfx##_transform_points##sz##_perspective_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_2D] = \
- _mesa_##pfx##_transform_points##sz##_2d_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_2D_NO_ROT] = \
- _mesa_##pfx##_transform_points##sz##_2d_no_rot_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_3D] = \
- _mesa_##pfx##_transform_points##sz##_3d_##masked;
+#define ASSIGN_XFORM_GROUP( pfx, sz ) \
+ _mesa_transform_tab[0][sz][MATRIX_GENERAL] = \
+ _mesa_##pfx##_transform_points##sz##_general; \
+ _mesa_transform_tab[0][sz][MATRIX_IDENTITY] = \
+ _mesa_##pfx##_transform_points##sz##_identity; \
+ _mesa_transform_tab[0][sz][MATRIX_3D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_3d_no_rot; \
+ _mesa_transform_tab[0][sz][MATRIX_PERSPECTIVE] = \
+ _mesa_##pfx##_transform_points##sz##_perspective; \
+ _mesa_transform_tab[0][sz][MATRIX_2D] = \
+ _mesa_##pfx##_transform_points##sz##_2d; \
+ _mesa_transform_tab[0][sz][MATRIX_2D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_2d_no_rot; \
+ _mesa_transform_tab[0][sz][MATRIX_3D] = \
+ _mesa_##pfx##_transform_points##sz##_3d;
@@ -87,47 +87,42 @@ extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_##masked( XFORM_ARGS
GLvector3f *dest
-#define DECLARE_NORM_GROUP( pfx, masked ) \
-extern void _ASMAPI _mesa_##pfx##_rescale_normals_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_normalize_normals_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normals_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_##masked( NORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot_##masked( NORM_ARGS );
-
-
-#define ASSIGN_NORM_GROUP( pfx, cma, masked ) \
- _mesa_normal_tab[NORM_RESCALE][cma] = \
- _mesa_##pfx##_rescale_normals_##masked; \
- _mesa_normal_tab[NORM_NORMALIZE][cma] = \
- _mesa_##pfx##_normalize_normals_##masked; \
- _mesa_normal_tab[NORM_TRANSFORM][cma] = \
- _mesa_##pfx##_transform_normals_##masked; \
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT][cma] = \
- _mesa_##pfx##_transform_normals_no_rot_##masked; \
- _mesa_normal_tab[NORM_TRANSFORM | NORM_RESCALE][cma] = \
- _mesa_##pfx##_transform_rescale_normals_##masked; \
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_RESCALE][cma] = \
- _mesa_##pfx##_transform_rescale_normals_no_rot_##masked; \
- _mesa_normal_tab[NORM_TRANSFORM | NORM_NORMALIZE][cma] = \
- _mesa_##pfx##_transform_normalize_normals_##masked; \
- _mesa_normal_tab[NORM_TRANSFORM_NO_ROT | NORM_NORMALIZE][cma] = \
- _mesa_##pfx##_transform_normalize_normals_no_rot_##masked;
+#define DECLARE_NORM_GROUP( pfx ) \
+extern void _ASMAPI _mesa_##pfx##_rescale_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_normalize_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot( NORM_ARGS );
+
+
+#define ASSIGN_NORM_GROUP( pfx ) \
+ _mesa_normal_tab[NORM_RESCALE][0] = \
+ _mesa_##pfx##_rescale_normals; \
+ _mesa_normal_tab[NORM_NORMALIZE][0] = \
+ _mesa_##pfx##_normalize_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM][0] = \
+ _mesa_##pfx##_transform_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT][0] = \
+ _mesa_##pfx##_transform_normals_no_rot; \
+ _mesa_normal_tab[NORM_TRANSFORM|NORM_RESCALE][0] = \
+ _mesa_##pfx##_transform_rescale_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT|NORM_RESCALE][0] = \
+ _mesa_##pfx##_transform_rescale_normals_no_rot; \
+ _mesa_normal_tab[NORM_TRANSFORM|NORM_NORMALIZE][0] = \
+ _mesa_##pfx##_transform_normalize_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT|NORM_NORMALIZE][0] = \
+ _mesa_##pfx##_transform_normalize_normals_no_rot;
#ifdef USE_3DNOW_ASM
-DECLARE_XFORM_GROUP( 3dnow, 2, raw )
-DECLARE_XFORM_GROUP( 3dnow, 3, raw )
-DECLARE_XFORM_GROUP( 3dnow, 4, raw )
+DECLARE_XFORM_GROUP( 3dnow, 2 )
+DECLARE_XFORM_GROUP( 3dnow, 3 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
-DECLARE_XFORM_GROUP( 3dnow, 2, masked )
-DECLARE_XFORM_GROUP( 3dnow, 3, masked )
-DECLARE_XFORM_GROUP( 3dnow, 4, masked )
-
-DECLARE_NORM_GROUP( 3dnow, raw )
-/*DECLARE_NORM_GROUP( 3dnow, masked )*/
+DECLARE_NORM_GROUP( 3dnow )
extern void _ASMAPI
@@ -155,16 +150,11 @@ _mesa_3dnow_project_clipped_vertices( GLfloat *first,
void _mesa_init_3dnow_transform_asm( void )
{
#ifdef USE_3DNOW_ASM
- ASSIGN_XFORM_GROUP( 3dnow, 0, 2, raw );
- ASSIGN_XFORM_GROUP( 3dnow, 0, 3, raw );
- ASSIGN_XFORM_GROUP( 3dnow, 0, 4, raw );
-
-/* ASSIGN_XFORM_GROUP( 3dnow, CULL_MASK_ACTIVE, 2, masked ); */
-/* ASSIGN_XFORM_GROUP( 3dnow, CULL_MASK_ACTIVE, 3, masked ); */
-/* ASSIGN_XFORM_GROUP( 3dnow, CULL_MASK_ACTIVE, 4, masked ); */
+ ASSIGN_XFORM_GROUP( 3dnow, 2 );
+ ASSIGN_XFORM_GROUP( 3dnow, 3 );
+ ASSIGN_XFORM_GROUP( 3dnow, 4 );
- ASSIGN_NORM_GROUP( 3dnow, 0, raw );
-/* ASSIGN_NORM_GROUP( 3dnow, CULL_MASK_ACTIVE, masked ); */
+ ASSIGN_NORM_GROUP( 3dnow );
#ifdef DEBUG
_math_test_all_transform_functions( "3DNow!" );
@@ -177,6 +167,7 @@ void _mesa_init_3dnow_vertex_asm( void )
{
#ifdef USE_3DNOW_ASM
_mesa_xform_points3_v16_general = _mesa_v16_3dnow_general_xform;
+
_mesa_project_v16 = _mesa_3dnow_project_vertices;
_mesa_project_clipped_v16 = _mesa_3dnow_project_clipped_vertices;
diff --git a/src/mesa/x86/3dnow_normal.S b/src/mesa/x86/3dnow_normal.S
new file mode 100644
index 00000000000..f7cc069b165
--- /dev/null
+++ b/src/mesa/x86/3dnow_normal.S
@@ -0,0 +1,858 @@
+/* $Id: 3dnow_normal.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * 3Dnow assembly code by Holger Waechtler
+ */
+
+#include "matypes.h"
+#include "norm_args.h"
+
+ SEG_TEXT
+
+#define M(i) REGOFF(i * 4, ECX)
+#define STRIDE REGOFF(12, ESI)
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals)
+GLNAME(_mesa_3dnow_transform_normalize_normals):
+
+ #define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_LENGTHS, EDI )
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
+ JE ( LLBL (G3TN_end) )
+
+ MOV_L ( REGOFF (V3F_COUNT, ESI), EBP )
+ FEMMS
+
+ PUSH_L ( EBP )
+ PUSH_L ( EAX )
+ PUSH_L ( EDX ) /* save counter & pointer for */
+ /* the normalize pass */
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 24
+
+ MOVQ ( M(0), MM3 ) /* m1 | m0 */
+ MOVQ ( M(4), MM4 ) /* m5 | m4 */
+
+ MOVD ( M(2), MM5 ) /* | m2 */
+ PUNPCKLDQ ( M(6), MM5 ) /* m6 | m2 */
+
+ MOVQ ( M(8), MM6 ) /* m9 | m8 */
+ MOVQ ( M(10), MM7 ) /* | m10 */
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JNE ( LLBL (G3TN_scale_end ) )
+
+ MOVD ( ARG_SCALE, MM0 ) /* | scale */
+ PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */
+
+ PFMUL ( MM0, MM3 ) /* scale * m1 | scale * m0 */
+ PFMUL ( MM0, MM4 ) /* scale * m5 | scale * m4 */
+ PFMUL ( MM0, MM5 ) /* scale * m6 | scale * m2 */
+ PFMUL ( MM0, MM6 ) /* scale * m9 | scale * m8 */
+ PFMUL ( MM0, MM7 ) /* | scale * m10 */
+
+LLBL (G3TN_scale_end):
+ MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
+
+ALIGNTEXT32
+LLBL (G3TN_transform):
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
+ PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
+
+ PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
+ PFADD ( MM2, MM0 ) /* x0*m4+x1*m5+x2*m6| x0*m0+...+x2**/
+
+ MOVQ ( REGIND (EDX), MM1 ) /* x1 | x0 */
+ MOVQ ( MM0, REGOFF(-12, EAX) ) /* write r0, r1 */
+
+ PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
+ MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
+
+ PFMUL ( MM7, MM2 ) /* | x2*m10 */
+ PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
+
+ PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m*/
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ MOVD ( MM1, REGOFF(-4, EAX) ) /* write r2 */
+ MOVQ ( REGIND (EDX), MM0 ) /* x1 | x0 */
+
+ MOVD ( REGOFF (8, EDX), MM2 ) /* | x2 */
+ DEC_L ( EBP ) /* decrement normal counter */
+ JA ( LLBL (G3TN_transform) )
+
+
+ POP_L ( EDX ) /* end of transform --- */
+ POP_L ( EAX ) /* now normalizing ... */
+ POP_L ( EBP )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JE ( LLBL (G3TN_norm ) ) /* calculate lengths */
+
+
+ALIGNTEXT32
+LLBL (G3TN_norm_w_lengths):
+
+ PREFETCHW ( REGOFF(12,EAX) )
+
+ MOVD ( REGIND (EDI), MM3 ) /* | length (x) */
+ PFMUL ( MM3, MM1 ) /* | x2 (normalize*/
+
+ PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
+ PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalize*/
+
+ ADD_L ( STRIDE, EDX ) /* next normal */
+ ADD_L ( CONST(4), EDI ) /* next length */
+
+ PREFETCH ( REGIND(EDI) )
+
+ MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
+ MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
+
+ ADD_L ( CONST(12), EAX ) /* next r */
+ DEC_L ( EBP ) /* decrement normal counter */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+ JA ( LLBL (G3TN_norm_w_lengths) )
+ JMP ( LLBL (G3TN_exit_3dnow) )
+
+ALIGNTEXT32
+LLBL (G3TN_norm):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM3 ) /* x1 | x0 */
+ MOVQ ( MM1, MM4 ) /* | x2 */
+
+ PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ PFMUL ( MM1, MM4 ) /* | x2*x2 */
+ PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
+
+ PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1+x2**/
+ PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
+
+ MOVQ ( MM5, MM4 )
+ PUNPCKLDQ ( MM3, MM3 )
+
+ DEC_L ( EBP ) /* decrement normal counter */
+ PFMUL ( MM5, MM5 )
+
+ PFRSQIT1 ( MM3, MM5 )
+ PFRCPIT2 ( MM4, MM5 )
+
+ PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalize*/
+
+ MOVQ ( MM0, REGOFF(-12, EAX) ) /* write new x0, x1 */
+ PFMUL ( MM5, MM1 ) /* | x2 (normalize*/
+
+ MOVD ( MM1, REGOFF(-4, EAX) ) /* write new x2 */
+ MOVQ ( REGIND (EAX), MM0 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+ JA ( LLBL (G3TN_norm) )
+
+LLBL (G3TN_exit_3dnow):
+ FEMMS
+
+LLBL (G3TN_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_normalize_normals_no_rot):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_LENGTHS, EDI )
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+ MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+
+ CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
+ JE ( LLBL (G3TNNR_end) )
+
+ FEMMS
+
+ MOVD ( M(0), MM0 ) /* | m0 */
+ PUNPCKLDQ ( M(5), MM0 ) /* m5 | m0 */
+
+ MOVD ( M(10), MM2 ) /* | m10 */
+ PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JNE ( LLBL (G3TNNR_scale_end ) )
+
+ MOVD ( ARG_SCALE, MM7 ) /* | scale */
+ PUNPCKLDQ ( MM7, MM7 ) /* scale | scale */
+
+ PFMUL ( MM7, MM0 ) /* scale * m5 | scale * m0 */
+ PFMUL ( MM7, MM2 ) /* scale * m10 | scale * m10 */
+
+ALIGNTEXT32
+LLBL (G3TNNR_scale_end):
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+
+ CMP_L ( CONST(0), EDI ) /* lengths == 0 ? */
+ JE ( LLBL (G3TNNR_norm) ) /* need to calculate lengths */
+
+ MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
+
+
+ALIGNTEXT32
+LLBL (G3TNNR_norm_w_lengths): /* use precalculated lengths */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFMUL ( MM2, MM7 ) /* | x2*m10 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ PFMUL ( MM3, MM7 ) /* | x2 (normalized) */
+ PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
+
+ ADD_L ( CONST(4), EDI ) /* next length */
+ PFMUL ( MM3, MM6 ) /* x1 (normalized) | x0 (normalized) */
+
+ DEC_L ( EBP ) /* decrement normal counter */
+ MOVQ ( MM6, REGOFF(-12, EAX) ) /* write r0, r1 */
+
+ MOVD ( MM7, REGOFF(-4, EAX) ) /* write r2 */
+ MOVD ( REGIND(EDI), MM3 ) /* | length (x) */
+
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+
+ JA ( LLBL (G3TNNR_norm_w_lengths) )
+ JMP ( LLBL (G3TNNR_exit_3dnow) )
+
+ALIGNTEXT32
+LLBL (G3TNNR_norm): /* need to calculate lengths */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM6 ) /* x1*m5 | x0*m0 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ PFMUL ( MM2, MM7 ) /* | x2*m10 */
+ MOVQ ( MM6, MM3 ) /* x1 (transformed)| x0 (transformed) */
+
+ MOVQ ( MM7, MM4 ) /* | x2 (transformed) */
+ PFMUL ( MM6, MM3 ) /* x1*x1 | x0*x0 */
+
+
+ PFMUL ( MM7, MM4 ) /* | x2*x2 */
+ PFACC ( MM3, MM3 ) /* **not used** | x0*x0+x1*x1 */
+
+ PFADD ( MM4, MM3 ) /* | x0*x0+x1*x1+x2*x2*/
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
+ MOVQ ( MM5, MM4 )
+
+ PUNPCKLDQ ( MM3, MM3 )
+ PFMUL ( MM5, MM5 )
+
+ PFRSQIT1 ( MM3, MM5 )
+ DEC_L ( EBP ) /* decrement normal counter */
+
+ PFRCPIT2 ( MM4, MM5 )
+ PFMUL ( MM5, MM6 ) /* x1 (normalized) | x0 (normalized) */
+
+ MOVQ ( MM6, REGOFF(-12, EAX) ) /* write r0, r1 */
+ PFMUL ( MM5, MM7 ) /* | x2 (normalized) */
+
+ MOVD ( MM7, REGOFF(-4, EAX) ) /* write r2 */
+ MOVQ ( REGIND(EDX), MM6 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EDX), MM7 ) /* | x2 */
+ JA ( LLBL (G3TNNR_norm) )
+
+
+LLBL (G3TNNR_exit_3dnow):
+ FEMMS
+
+LLBL (G3TNNR_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_rescale_normals_no_rot):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_IN, EAX )
+ MOV_L ( ARG_DEST, EDX )
+ MOV_L ( REGOFF(V3F_COUNT, EAX), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V3F_COUNT, EDX) )
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+ MOV_L ( REGOFF(V3F_START, EDX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+
+ CMP_L ( CONST(0), EBP )
+ JE ( LLBL (G3TRNR_end) )
+
+ FEMMS
+
+ MOVD ( ARG_SCALE, MM6 ) /* | scale */
+ PUNPCKLDQ ( MM6, MM6 ) /* scale | scale */
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m0 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */
+
+ PFMUL ( MM6, MM0 ) /* scale*m5 | scale*m0 */
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
+
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ PFMUL ( MM6, MM2 ) /* | scale*m10 */
+
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+
+ALIGNTEXT32
+LLBL (G3TRNR_rescale):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFMUL ( MM2, MM5 ) /* | x2*m10 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ DEC_L ( EBP ) /* decrement normal counter */
+ MOVQ ( MM4, REGOFF(-12, EAX) ) /* write r0, r1 */
+
+ MOVD ( MM5, REGOFF(-4, EAX) ) /* write r2 */
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+ JA ( LLBL (G3TRNR_rescale) ) /* cnt > 0 ? -> process next normal */
+
+ FEMMS
+
+LLBL (G3TRNR_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_rescale_normals)
+GLNAME(_mesa_3dnow_transform_rescale_normals):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 8
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EDI )
+ JE ( LLBL (G3TR_end) )
+
+ FEMMS
+
+ MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */
+
+ MOVQ ( REGOFF(16,ECX), MM4 ) /* m5 | m4 */
+ MOVD ( ARG_SCALE, MM0 ) /* scale */
+
+ MOVD ( REGOFF(8,ECX), MM5 ) /* | m2 */
+ PUNPCKLDQ ( MM0, MM0 ) /* scale | scale */
+
+ PUNPCKLDQ ( REGOFF(24, ECX), MM5 )
+ PFMUL ( MM0, MM3 ) /* scale*m1 | scale*m0 */
+
+ MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8*/
+ PFMUL ( MM0, MM4 ) /* scale*m5 | scale*m4 */
+
+ MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
+ PFMUL ( MM0, MM5 ) /* scale*m6 | scale*m2 */
+
+ PFMUL ( MM0, MM6 ) /* scale*m9 | scale*m8 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ PFMUL ( MM0, MM7 ) /* | scale*m10 */
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+
+ALIGNTEXT32
+LLBL (G3TR_rescale):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
+ PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
+
+ MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
+
+ PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
+ PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
+
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ MOVQ ( MM0, REGOFF(-12, EAX) ) /* write r0, r1 */
+ PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
+
+ PFMUL ( MM7, MM2 ) /* | x2*m10 */
+ PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
+
+ PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
+ MOVD ( MM1, REGOFF(-4, EAX) ) /* write r2 */
+
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ DEC_L ( EDI ) /* decrement normal counter */
+ JA ( LLBL (G3TR_rescale) )
+
+ FEMMS
+
+LLBL (G3TR_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normals_no_rot)
+GLNAME(_mesa_3dnow_transform_normals_no_rot):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 8
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EDI )
+ JE ( LLBL (G3TNR_end) )
+
+ FEMMS
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m0 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m5 | m0 */
+
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m10 */
+ PUNPCKLDQ ( MM2, MM2 ) /* m10 | m10 */
+
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+
+ALIGNTEXT32
+LLBL (G3TNR_transform):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m5 | x0*m0 */
+ ADD_L ( STRIDE, EDX) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFMUL ( MM2, MM5 ) /* | x2*m10 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ DEC_L ( EDI ) /* decrement normal counter */
+ MOVQ ( MM4, REGOFF(-12, EAX) ) /* write r0, r1 */
+
+ MOVD ( MM5, REGOFF(-4, EAX) ) /* write r2 */
+ MOVQ ( REGIND(EDX), MM4 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EDX), MM5 ) /* | x2 */
+ JA ( LLBL (G3TNR_transform) )
+
+ FEMMS
+
+LLBL (G3TNR_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_transform_normals)
+GLNAME(_mesa_3dnow_transform_normals):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 8
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( ARG_MAT, ECX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EDI ) /* dest->count = in->count */
+ MOV_L ( EDI, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V3F_START, ESI), EDX ) /* in->start */
+ MOV_L ( REGOFF(MATRIX_INV, ECX), ECX ) /* mat->inv */
+
+ CMP_L ( CONST(0), EDI ) /* count > 0 ?? */
+ JE ( LLBL (G3T_end) )
+
+ FEMMS
+
+ MOVQ ( REGIND(ECX), MM3 ) /* m1 | m0 */
+ MOVQ ( REGOFF(16, ECX), MM4 ) /* m5 | m4 */
+
+ MOVD ( REGOFF(8, ECX), MM5 ) /* | m2 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM5 ) /* m6 | m2 */
+
+ MOVQ ( REGOFF(32, ECX), MM6 ) /* m9 | m8 */
+ MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
+
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ALIGNTEXT32
+LLBL (G3T_transform):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PFMUL ( MM3, MM0 ) /* x1*m1 | x0*m0 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ PFMUL ( MM4, MM1 ) /* x1*m5 | x0*m4 */
+ PFACC ( MM1, MM0 ) /* x0*m4+x1*m5 | x0*m0+x1*m1 */
+
+ PFMUL ( MM5, MM2 ) /* x2*m6 | x2*m2 */
+ PFADD ( MM2, MM0 ) /* x0*m4...+x2*m6| x0*m0+x1*m1+x2*m2 */
+
+ MOVQ ( REGIND(EDX), MM1 ) /* x1 | x0 */
+ MOVQ ( MM0, REGOFF(-12, EAX) ) /* write r0, r1 */
+
+ PFMUL ( MM6, MM1 ) /* x1*m9 | x0*m8 */
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+
+ PFMUL ( MM7, MM2 ) /* | x2*m10 */
+ ADD_L ( STRIDE, EDX ) /* next normal */
+
+ PREFETCH ( REGIND(EDX) )
+
+ PFACC ( MM1, MM1 ) /* *not used* | x0*m8+x1*m9 */
+ PFADD ( MM2, MM1 ) /* *not used* | x0*m8+x1*m9+x2*m10 */
+
+ MOVD ( MM1, REGOFF(-4, EAX) ) /* write r2 */
+ MOVQ ( REGIND(EDX), MM0 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EDX), MM2 ) /* | x2 */
+ DEC_L ( EDI ) /* decrement normal counter */
+ JA ( LLBL (G3T_transform) )
+
+ FEMMS
+
+LLBL (G3T_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_normalize_normals)
+GLNAME(_mesa_3dnow_normalize_normals):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 12
+
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+ PUSH_L ( EBP )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EBP ) /* dest->count = in->count */
+ MOV_L ( EBP, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V3F_START, ESI), ECX ) /* in->start */
+ MOV_L ( ARG_LENGTHS, EDX )
+
+ CMP_L ( CONST(0), EBP ) /* count > 0 ?? */
+ JE ( LLBL (G3N_end) )
+
+ FEMMS
+
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+
+ CMP_L ( CONST(0), EDX ) /* lengths == 0 ? */
+ JE ( LLBL (G3N_norm2) ) /* calculate lengths */
+
+ALIGNTEXT32
+LLBL (G3N_norm1): /* use precalculated lengths */
+
+ PREFETCH ( REGIND(EAX) )
+
+ MOVD ( REGIND(EDX), MM3 ) /* | length (x) */
+ PFMUL ( MM3, MM1 ) /* | x2 (normalized) */
+
+ PUNPCKLDQ ( MM3, MM3 ) /* length (x) | length (x) */
+ ADD_L ( STRIDE, ECX ) /* next normal */
+
+ PREFETCH ( REGIND(ECX) )
+
+ PFMUL ( MM3, MM0 ) /* x1 (normalized) | x0 (normalized) */
+ MOVQ ( MM0, REGIND(EAX) ) /* write new x0, x1 */
+
+ MOVD ( MM1, REGOFF(8, EAX) ) /* write new x2 */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ ADD_L ( CONST(4), EDX ) /* next length */
+ DEC_L ( EBP ) /* decrement normal counter */
+
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+ JA ( LLBL (G3N_norm1) )
+
+ JMP ( LLBL (G3N_end1) )
+
+ALIGNTEXT32
+LLBL (G3N_norm2): /* need to calculate lengths */
+
+ PREFETCHW ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM3 ) /* x1 | x0 */
+ ADD_L ( STRIDE, ECX ) /* next normal */
+
+ PREFETCH ( REGIND(ECX) )
+
+ PFMUL ( MM0, MM3 ) /* x1*x1 | x0*x0 */
+ MOVQ ( MM1, MM4 ) /* | x2 */
+
+ ADD_L ( CONST(12), EAX ) /* next r */
+ PFMUL ( MM1, MM4 ) /* | x2*x2 */
+
+ PFADD ( MM4, MM3 ) /* | x0*x0+x2*x2 */
+ PFACC ( MM3, MM3 ) /* x0*x0+...+x2*x2 | x0*x0+x1*x1+x2*x2*/
+
+ PFRSQRT ( MM3, MM5 ) /* 1/sqrt (x0*x0+x1*x1+x2*x2) */
+ MOVQ ( MM5, MM4 )
+
+ PUNPCKLDQ ( MM3, MM3 )
+ PFMUL ( MM5, MM5 )
+
+ PFRSQIT1 ( MM3, MM5 )
+ DEC_L ( EBP ) /* decrement normal counter */
+
+ PFRCPIT2 ( MM4, MM5 )
+
+ PFMUL ( MM5, MM0 ) /* x1 (normalized) | x0 (normalized) */
+ MOVQ ( MM0, REGOFF(-12, EAX) ) /* write new x0, x1 */
+
+ PFMUL ( MM5, MM1 ) /* | x2 (normalized) */
+ MOVD ( MM1, REGOFF(-4, EAX) ) /* write new x2 */
+
+ MOVQ ( REGIND(ECX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | x2 */
+ JA ( LLBL (G3N_norm2) )
+
+LLBL (G3N_end1):
+ FEMMS
+
+LLBL (G3N_end):
+ POP_L ( EBP )
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
+
+
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_3dnow_rescale_normals)
+GLNAME(_mesa_3dnow_rescale_normals):
+
+ #undef FRAME_OFFSET
+ #define FRAME_OFFSET 8
+ PUSH_L ( EDI )
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_IN, ESI )
+ MOV_L ( ARG_DEST, EAX )
+ MOV_L ( REGOFF(V3F_COUNT, ESI), EDX ) /* dest->count = in->count */
+ MOV_L ( EDX, REGOFF(V3F_COUNT, EAX) )
+ MOV_L ( REGOFF(V3F_START, EAX), EAX ) /* dest->start */
+ MOV_L ( REGOFF(V3F_START, ESI), ECX ) /* in->start */
+
+ CMP_L ( CONST(0), EDX )
+ JE ( LLBL (G3R_end) )
+
+ FEMMS
+
+ MOVD ( ARG_SCALE, MM0 ) /* scale */
+ PUNPCKLDQ ( MM0, MM0 )
+
+ MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
+
+ALIGNTEXT32
+LLBL (G3R_rescale):
+
+ PREFETCHW ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM1 ) /* x1*scale | x0*scale */
+ ADD_L ( STRIDE, ECX ) /* next normal */
+
+ PREFETCH ( REGIND(ECX) )
+
+ PFMUL ( MM0, MM2 ) /* | x2*scale */
+ ADD_L ( CONST(12), EAX ) /* next r */
+
+ MOVQ ( MM1, REGOFF(-12, EAX) ) /* write r0, r1 */
+ MOVD ( MM2, REGOFF(-4, EAX) ) /* write r2 */
+
+ DEC_L ( EDX ) /* decrement normal counter */
+ MOVQ ( REGIND(ECX), MM1 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | x2 */
+ JA ( LLBL (G3R_rescale) )
+
+ FEMMS
+
+LLBL (G3R_end):
+ POP_L ( ESI )
+ POP_L ( EDI )
+ RET
diff --git a/src/mesa/x86/3dnow_xform1.S b/src/mesa/x86/3dnow_xform1.S
new file mode 100644
index 00000000000..9957676ee96
--- /dev/null
+++ b/src/mesa/x86/3dnow_xform1.S
@@ -0,0 +1,423 @@
+/* $Id: 3dnow_xform1.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_general )
+GLNAME( _mesa_3dnow_transform_points1_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVQ ( REGOFF(8, ECX), MM1 ) /* m03 | m02 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVQ ( REGOFF(56, ECX), MM3 ) /* m33 | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPGR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+
+ MOVQ ( MM4, MM5 ) /* x0 | x0 */
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+
+ PFMUL ( MM1, MM5 ) /* x0*m03 | x0*m02 */
+ PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
+
+ PFADD ( MM3, MM5 ) /* x0*m03+m33 | x0*m02+m32 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVQ ( MM5, REGOFF(8, EDX) ) /* write r3, r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_identity )
+GLNAME( _mesa_3dnow_transform_points1_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(1), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPIR_4) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_3 ):
+
+ MOVD ( REGIND(EAX), MM0 ) /* | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ MOVD ( MM0, REGIND(EDX) ) /* | r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_4 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_3d_no_rot )
+GLNAME( _mesa_3dnow_transform_points1_3d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PFMUL ( MM0, MM4 ) /* | x0*m00 */
+
+ PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_perspective )
+GLNAME( _mesa_3dnow_transform_points1_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPPR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* 0 | x0 */
+ PFMUL ( MM0, MM4 ) /* 0 | x0*m00 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_2d )
+GLNAME( _mesa_3dnow_transform_points1_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+ PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_2d_no_rot )
+GLNAME( _mesa_3dnow_transform_points1_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFMUL ( MM0, MM4 ) /* | x0*m00 */
+ PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points1_3d )
+GLNAME( _mesa_3dnow_transform_points1_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(4, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVD ( REGOFF(8, ECX), MM1 ) /* | m02 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3R_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+
+ MOVQ ( MM4, MM5 ) /* | x0 */
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+
+ PFMUL ( MM1, MM5 ) /* | x0*m02 */
+ PFADD ( MM2, MM4 ) /* x0*m01+m31 | x0*m00+m30 */
+
+ PFADD ( MM3, MM5 ) /* | x0*m02+m32 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVD ( MM5, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
diff --git a/src/mesa/x86/3dnow_xform2.S b/src/mesa/x86/3dnow_xform2.S
new file mode 100644
index 00000000000..cb3a8531df7
--- /dev/null
+++ b/src/mesa/x86/3dnow_xform2.S
@@ -0,0 +1,464 @@
+/* $Id: 3dnow_xform2.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_general )
+GLNAME( _mesa_3dnow_transform_points2_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */
+
+ MOVD ( REGOFF(12, ECX), MM3 ) /* | m03 */
+ PUNPCKLDQ ( REGOFF(28, ECX), MM3 ) /* m13 | m03 */
+
+ MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */
+ MOVQ ( REGOFF(56, ECX), MM5 ) /* m33 | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPGR_2 ):
+
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+
+ PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */
+ PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */
+
+ PFACC ( MM7, MM6 ) /* x0*m01+x1*m11 | x0*x00+x1*m10 */
+ PFADD ( MM4, MM6 ) /* x0*...*m11+m31 | x0*...*m10+m30 */
+
+ MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+ PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */
+
+ PFMUL ( MM3, MM7 ) /* x1*m13 | x0*m03 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFACC ( MM7, MM6 ) /* x0*m03+x1*m13 | x0*x02+x1*m12 */
+ PFADD ( MM5, MM6 ) /* x0*...*m13+m33 | x0*...*m12+m32 */
+
+ MOVQ ( MM6, REGOFF(8, EDX) ) /* write r3, r2 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPGR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_perspective )
+GLNAME( _mesa_3dnow_transform_points2_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TPPR_2 ):
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( MM3, REGOFF(8, EDX) ) /* write r2 (=m32), r3 (=0) */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPPR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_3d )
+GLNAME( _mesa_3dnow_transform_points2_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVD ( REGOFF(8, ECX), MM2 ) /* | m02 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM2 ) /* m12 | m02 */
+
+ MOVQ ( REGOFF(48, ECX), MM4 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM5 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3R_2 ):
+
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+
+ PFMUL ( MM0, MM6 ) /* x1*m10 | x0*m00 */
+ PFMUL ( MM1, MM7 ) /* x1*m11 | x0*m01 */
+
+ PFACC ( MM7, MM6 ) /* x0*m01+x1*m11 | x0*x00+x1*m10 */
+ PFADD ( MM4, MM6 ) /* x0*...*m11+m31 | x0*...*m10+m30 */
+
+ MOVQ ( MM6, REGIND(EDX) ) /* write r1, r0 */
+ MOVQ ( REGIND(EAX), MM6 ) /* x1 | x0 */
+
+ MOVQ ( MM6, MM7 ) /* x1 | x0 */
+ PFMUL ( MM2, MM6 ) /* x1*m12 | x0*m02 */
+
+ PFACC ( MM7, MM6 ) /* ***trash*** | x0*x02+x1*m12 */
+ PFADD ( MM5, MM6 ) /* ***trash*** | x0*...*m12+m32 */
+
+ MOVD ( MM6, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_3d_no_rot )
+GLNAME( _mesa_3dnow_transform_points2_3d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3 ), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_2 ):
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PFADD ( MM2, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ MOVD ( MM3, REGOFF(8, EDX) ) /* write r2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP3NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_2d )
+GLNAME( _mesa_3dnow_transform_points2_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_3 ) )
+
+ MOVQ ( REGIND(ECX), MM0 ) /* m01 | m00 */
+ MOVQ ( REGOFF(16, ECX), MM1 ) /* m11 | m10 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+ MOVD ( REGIND(EAX), MM4 ) /* | x0 */
+ MOVD ( REGOFF(4, EAX), MM5 ) /* | x1 */
+
+ PUNPCKLDQ ( MM4, MM4 ) /* x0 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFMUL ( MM0, MM4 ) /* x0*m01 | x0*m00 */
+ PUNPCKLDQ ( MM5, MM5 ) /* x1 | x1 */
+
+ PFMUL ( MM1, MM5 ) /* x1*m11 | x1*m10 */
+ PFADD ( MM2, MM4 ) /* x...x1*m11+31 | x0*..*m10+m30 */
+
+ PFADD ( MM5, MM4 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_2d_no_rot )
+GLNAME( _mesa_3dnow_transform_points2_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ PFADD ( MM2, MM4 ) /* m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGIND(EDX) ) /* write r1, r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points2_identity )
+GLNAME( _mesa_3dnow_transform_points2_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(2), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPIR_3 ) )
+
+ALIGNTEXT16
+LLBL( G3TPIR_3 ):
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ MOVQ ( MM0, REGIND(EDX) ) /* r1 | r0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPIR_3 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_4 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
diff --git a/src/mesa/x86/3dnow_xform3.S b/src/mesa/x86/3dnow_xform3.S
new file mode 100644
index 00000000000..7d225b33bc4
--- /dev/null
+++ b/src/mesa/x86/3dnow_xform3.S
@@ -0,0 +1,570 @@
+/* $Id: 3dnow_xform3.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_general )
+GLNAME( _mesa_3dnow_transform_points3_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_2 ) )
+
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM2 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TPGR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ PUNPCKLDQ ( MM2, MM2 ) /* x2 | x2 */
+
+ PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
+ MOVQ ( MM2, MM5 ) /* x2 | x2 */
+
+ PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */
+ PFMUL ( REGOFF(32, ECX), MM2 ) /* x2*m9 | x2*m8 */
+
+ MOVQ ( MM0, MM3 ) /* x0 | x0 */
+ PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */
+
+ MOVQ ( MM1, MM4 ) /* x1 | x1 */
+ PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
+
+ PFADD ( REGOFF(48, ECX), MM2 ) /* x2*m9+m13 | x2*m8+m12 */
+ PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */
+
+ PFADD ( REGOFF(56, ECX), MM5 ) /* x2*m11+m15 | x2*m10+m14 */
+ PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+ PFMUL ( REGOFF(8, ECX), MM3 ) /* x0*m3 | x0*m2 */
+ PFADD ( MM1, MM2 ) /* r1 | r0 */
+
+ PFMUL ( REGOFF(24, ECX), MM4 ) /* x1*m7 | x1*m6 */
+ ADD_L ( CONST(16), EDX ) /* next output vertex */
+
+ PFADD ( MM3, MM4 ) /* x0*m3+x1*m7 | x0*m2+x1*m6 */
+ MOVQ ( MM2, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ PFADD ( MM4, MM5 ) /* r3 | r2 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM2 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_perspective )
+GLNAME( _mesa_3dnow_transform_points3_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(32, ECX), MM1 ) /* m21 | m20 */
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
+
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TPPR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ PXOR ( MM7, MM7 ) /* 0 | 0 */
+ MOVQ ( MM5, MM6 ) /* | x2 */
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ PFSUB ( MM5, MM7 ) /* | -x2 */
+
+ PFMUL ( MM2, MM6 ) /* | x2*m22 */
+ PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ PFMUL ( MM1, MM5 ) /* x2*m21 | x2*m20 */
+
+ PFADD ( MM3, MM6 ) /* | x2*m22+m32 */
+ PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */
+
+ MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVD ( MM6, REGOFF(-8, EDX) ) /* write r2 */
+
+ MOVD ( MM7, REGOFF(-4, EDX) ) /* write r3 */
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_3d )
+GLNAME( _mesa_3dnow_transform_points3_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCH ( REGIND(EDX) )
+
+ MOVD ( REGOFF(8, ECX), MM7 ) /* | m2 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM7 ) /* m6 | m2 */
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TP3R_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM2 ) /* x1 | x0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PUNPCKLDQ ( MM2, MM2 ) /* x0 | x0 */
+ MOVQ ( MM0, MM3 ) /* x1 | x0 */
+
+ PFMUL ( REGIND(ECX), MM2 ) /* x0*m1 | x0*m0 */
+ PUNPCKHDQ ( MM3, MM3 ) /* x1 | x1 */
+
+ MOVQ ( MM1, MM4 ) /* | x2 */
+ PFMUL ( REGOFF(16, ECX), MM3 ) /* x1*m5 | x1*m4 */
+
+ PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */
+ PFADD ( MM2, MM3 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+ PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */
+ PFADD ( REGOFF(48, ECX), MM3 ) /* x0*m1+...+m11 | x0*m0+x1*m4+m12 */
+
+ PFMUL ( MM7, MM0 ) /* x1*m6 | x0*m2 */
+ PFADD ( MM4, MM3 ) /* r1 | r0 */
+
+ PFMUL ( REGOFF(40, ECX), MM1 ) /* | x2*m10 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m14 | x2*m10 */
+
+ PFACC ( MM0, MM1 )
+
+ MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
+ PFACC ( MM1, MM1 ) /* | r2 */
+
+ MOVD ( MM1, REGOFF(-8, EDX) ) /* write r2 */
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_3d_no_rot )
+GLNAME( _mesa_3dnow_transform_points3_3d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
+ PUNPCKLDQ ( MM2, MM2 ) /* m22 | m22 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+ MOVD ( REGOFF(56, ECX), MM3 ) /* | m32 */
+
+ PUNPCKLDQ ( MM3, MM3 ) /* m32 | m32 */
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ PREFETCHW ( REGIND(EAX) )
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
+ PFMUL ( MM2, MM5 ) /* | x2*m22 */
+
+ PFADD ( MM3, MM5 ) /* | x2*m22+m32 */
+ MOVQ ( MM4, REGIND(EDX) ) /* write r0, r1 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 */
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+ JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_2d )
+GLNAME( _mesa_3dnow_transform_points3_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_3) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+ MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TP2R_2 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM3, MM4 ) /* x1 | x0 */
+ PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */
+
+ PFACC ( MM4, MM3 ) /* x0*m00+x1*m10 | x0*m01+x1*m11 */
+ MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */
+
+ PFADD ( MM2, MM3 ) /* x0*...*m10+m30 | x0*...*m11+m31 */
+ MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2R_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_2d_no_rot )
+GLNAME( _mesa_3dnow_transform_points3_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFADD ( MM1, MM4 ) /* x1*m11+m31 | x0*m00+m30 */
+
+ MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVD ( MM5, REGOFF(-8, EDX) ) /* write r2 (=x2) */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM5 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points3_identity )
+GLNAME( _mesa_3dnow_transform_points3_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(3), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPIR_2 ) )
+
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+
+ALIGNTEXT16
+LLBL( G3TPIR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) )
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */
+
+ MOVD ( MM1, REGOFF(-8, EDX) ) /* | r2 */
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+
+ MOVD ( REGOFF(8, EAX), MM1 ) /* | x2 */
+ JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
diff --git a/src/mesa/x86/3dnow_xform4.S b/src/mesa/x86/3dnow_xform4.S
new file mode 100644
index 00000000000..06d7c6950a2
--- /dev/null
+++ b/src/mesa/x86/3dnow_xform4.S
@@ -0,0 +1,588 @@
+/* $Id: 3dnow_xform4.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 4
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_general )
+GLNAME( _mesa_3dnow_transform_points4_general ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPGR_2 ) )
+
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM4 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TPGR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM0, MM2 ) /* x1 | x0 */
+ MOVQ ( MM4, MM6 ) /* x3 | x2 */
+
+ PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
+ PUNPCKHDQ ( MM2, MM2 ) /* x1 | x1 */
+
+ MOVQ ( MM0, MM1 ) /* x0 | x0 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
+ MOVQ ( MM2, MM3 ) /* x1 | x1 */
+
+ PFMUL ( REGOFF(8, ECX), MM1 ) /* x0*m3 | x0*m2 */
+ PUNPCKLDQ ( MM4, MM4 ) /* x2 | x2 */
+
+ PFMUL ( REGOFF(16, ECX), MM2 ) /* x1*m5 | x1*m4 */
+ MOVQ ( MM4, MM5 ) /* x2 | x2 */
+
+ PFMUL ( REGOFF(24, ECX), MM3 ) /* x1*m7 | x1*m6 */
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+
+ PFMUL ( REGOFF(32, ECX), MM4 ) /* x2*m9 | x2*m8 */
+ MOVQ ( MM6, MM7 ) /* x3 | x3 */
+
+ PFMUL ( REGOFF(40, ECX), MM5 ) /* x2*m11 | x2*m10 */
+ PFADD ( MM0, MM2 )
+
+ PFMUL ( REGOFF(48, ECX), MM6 ) /* x3*m13 | x3*m12 */
+ PFADD ( MM1, MM3 )
+
+ PFMUL ( REGOFF(56, ECX), MM7 ) /* x3*m15 | x3*m14 */
+ PFADD ( MM4, MM6 )
+
+ PFADD ( MM5, MM7 )
+ PFADD ( MM2, MM6 )
+
+ PFADD ( MM3, MM7 )
+ MOVQ ( MM6, REGOFF(-16, EDX) )
+
+ MOVQ ( MM7, REGOFF(-8, EDX) )
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+
+ MOVQ ( REGOFF(8, EAX), MM4 ) /* x3 | x2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPGR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPGR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_perspective )
+GLNAME( _mesa_3dnow_transform_points4_perspective ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPPR_2 ) )
+
+ PREFETCH ( REGIND(EAX) )
+ PREFETCHW ( REGIND(EDX) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(40, ECX), MM1 ) /* | m22 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM1 ) /* m32 | m22 */
+
+ MOVQ ( REGOFF(32, ECX), MM2 ) /* m21 | m20 */
+ PXOR ( MM7, MM7 ) /* 0 | 0 */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+ MOVD ( REGOFF(8, EAX), MM3 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TPPR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGOFF(32, EAX) ) /* hopefully stride is zero */
+
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PUNPCKLDQ ( MM5, MM5 ) /* x2 | x2 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFMUL ( MM2, MM5 ) /* x2*m21 | x2*m20 */
+ PFSUBR ( MM7, MM3 ) /* | -x2 */
+
+ PFMUL ( MM1, MM6 ) /* x3*m32 | x2*m22 */
+ PFADD ( MM4, MM5 ) /* x1*m11+x2*m21 | x0*m00+x2*m20 */
+
+ PFACC ( MM3, MM6 ) /* -x2 | x2*m22+x3*m32 */
+ MOVQ ( MM5, REGOFF(-16, EDX) ) /* write r0, r1 */
+
+ MOVQ ( MM6, REGOFF(-8, EDX) ) /* write r2, r3 */
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+ MOVD ( REGOFF(8, EAX), MM3 ) /* | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TPPR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPPR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_3d )
+GLNAME( _mesa_3dnow_transform_points4_3d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3R_2 ) )
+
+ MOVD ( REGOFF(8, ECX), MM6 ) /* | m2 */
+ PUNPCKLDQ ( REGOFF(24, ECX), MM6 ) /* m6 | m2 */
+
+ MOVD ( REGOFF(40, ECX), MM7 ) /* | m10 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM7 ) /* m14 | m10 */
+
+ MOVQ ( REGIND(EAX), MM2 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM3 ) /* x3 | x2 */
+
+ALIGNTEXT16
+LLBL( G3TP3R_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGOFF(32, EAX) ) /* hopefully array is tightly packed */
+
+ MOVQ ( MM2, MM0 ) /* x1 | x0 */
+ MOVQ ( MM3, MM4 ) /* x3 | x2 */
+
+ MOVQ ( MM0, MM1 ) /* x1 | x0 */
+ MOVQ ( MM4, MM5 ) /* x3 | x2 */
+
+ PUNPCKLDQ ( MM0, MM0 ) /* x0 | x0 */
+ PUNPCKHDQ ( MM1, MM1 ) /* x1 | x1 */
+
+ PFMUL ( REGIND(ECX), MM0 ) /* x0*m1 | x0*m0 */
+ PUNPCKLDQ ( MM3, MM3 ) /* x2 | x2 */
+
+ PFMUL ( REGOFF(16, ECX), MM1 ) /* x1*m5 | x1*m4 */
+ PUNPCKHDQ ( MM4, MM4 ) /* x3 | x3 */
+
+ PFMUL ( MM6, MM2 ) /* x1*m6 | x0*m2 */
+ PFADD ( MM0, MM1 ) /* x0*m1+x1*m5 | x0*m0+x1*m4 */
+
+ PFMUL ( REGOFF(32, ECX), MM3 ) /* x2*m9 | x2*m8 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFMUL ( REGOFF(48, ECX), MM4 ) /* x3*m13 | x3*m12 */
+ PFADD ( MM1, MM3 ) /* x0*m1+..+x2*m9 | x0*m0+...+x2*m8 */
+
+ PFMUL ( MM7, MM5 ) /* x3*m14 | x2*m10 */
+ PFADD ( MM3, MM4 ) /* r1 | r0 */
+
+ PFACC ( MM2, MM5 ) /* x0*m2+x1*m6 | x2*m10+x3*m14 */
+ MOVD ( REGOFF(12, EAX), MM0 ) /* | x3 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ PFACC ( MM0, MM5 ) /* r3 | r2 */
+
+ MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ MOVQ ( REGIND(EAX), MM2 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM3 ) /* x3 | x2 */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP3R_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3R_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_3d_no_rot )
+GLNAME( _mesa_3dnow_transform_points4_3d_no_rot ):
+
+ PUSH_L ( ESI )
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP3NRR_2 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVD ( REGOFF(40, ECX), MM2 ) /* | m22 */
+ PUNPCKLDQ ( REGOFF(56, ECX), MM2 ) /* m32 | m22 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+ MOVD ( REGOFF(12, EAX), MM7 ) /* | x3 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TP3NRR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGOFF(32, EAX) ) /* hopefully stride is zero */
+
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+ PFMUL ( MM2, MM5 ) /* x3*m32 | x2*m22 */
+
+ PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */
+ PFACC ( MM7, MM5 ) /* x3 | x2*m22+x3*m32 */
+
+ PFADD ( MM6, MM4 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ MOVQ ( MM4, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+
+ MOVD ( REGOFF(12, EAX), MM7 ) /* | x3 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP3NRR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP3NRR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_2d )
+GLNAME( _mesa_3dnow_transform_points4_2d ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2R_2 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(16, ECX), MM0 ) /* m10 | m00 */
+
+ MOVD ( REGOFF(4, ECX), MM1 ) /* | m01 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM1 ) /* m11 | m01 */
+
+ MOVQ ( REGOFF(48, ECX), MM2 ) /* m31 | m30 */
+
+ MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TP2R_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ MOVQ ( MM3, MM4 ) /* x1 | x0 */
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+
+ PFMUL ( MM1, MM4 ) /* x1*m11 | x0*m01 */
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+
+ PFMUL ( MM0, MM3 ) /* x1*m10 | x0*m00 */
+ ADD_L ( CONST(16), EDX ) /* next r */
+
+ PFACC ( MM4, MM3 ) /* x0*m01+x1*m11 | x0*m00+x1*m10 */
+ PFMUL ( MM2, MM6 ) /* x3*m31 | x3*m30 */
+
+ PFADD ( MM6, MM3 ) /* r1 | r0 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ MOVQ ( MM3, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( REGIND(EAX), MM3 ) /* x1 | x0 */
+
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TP2R_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2R_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_2d_no_rot )
+GLNAME( _mesa_3dnow_transform_points4_2d_no_rot ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TP2NRR_3 ) )
+
+ MOVD ( REGIND(ECX), MM0 ) /* | m00 */
+ PUNPCKLDQ ( REGOFF(20, ECX), MM0 ) /* m11 | m00 */
+
+ MOVQ ( REGOFF(48, ECX), MM1 ) /* m31 | m30 */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TP2NRR_2 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ PFMUL ( MM0, MM4 ) /* x1*m11 | x0*m00 */
+ MOVQ ( MM5, MM6 ) /* x3 | x2 */
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ PUNPCKHDQ ( MM6, MM6 ) /* x3 | x3 */
+
+ PFMUL ( MM1, MM6 ) /* x3*m31 | x3*m30 */
+ PFADD ( MM4, MM6 ) /* x1*m11+x3*m31 | x0*m00+x3*m30 */
+
+ MOVQ ( MM6, REGOFF(-16, EDX) ) /* write r0, r1 */
+ MOVQ ( MM5, REGOFF(-8, EDX) ) /* write r2, r3 */
+
+ MOVQ ( REGIND(EAX), MM4 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM5 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+ DEC_L ( ESI ) /* decrement vertex counter */
+
+ JNZ ( LLBL( G3TP2NRR_2 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TP2NRR_3 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_3dnow_transform_points4_identity )
+GLNAME( _mesa_3dnow_transform_points4_identity ):
+
+ PUSH_L ( ESI )
+
+ MOV_L ( ARG_DEST, ECX )
+ MOV_L ( ARG_MATRIX, ESI )
+ MOV_L ( ARG_SOURCE, EAX )
+ MOV_L ( CONST(4), REGOFF(V4F_SIZE, ECX) )
+ OR_B ( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, ECX) )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), EDX )
+ MOV_L ( EDX, REGOFF(V4F_COUNT, ECX) )
+
+ PUSH_L ( EDI )
+
+ MOV_L ( REGOFF(V4F_START, ECX), EDX )
+ MOV_L ( ESI, ECX )
+ MOV_L ( REGOFF(V4F_COUNT, EAX), ESI )
+ MOV_L ( REGOFF(V4F_STRIDE, EAX), EDI )
+ MOV_L ( REGOFF(V4F_START, EAX), EAX )
+
+ TEST_L ( ESI, ESI )
+ JZ ( LLBL( G3TPIR_2 ) )
+
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+ MOVQ ( REGOFF(8, EAX), MM1 ) /* x3 | x2 */
+
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ALIGNTEXT16
+LLBL( G3TPIR_1 ):
+
+ PREFETCHW ( REGOFF(32, EDX) ) /* prefetch 2 vertices ahead */
+ PREFETCH ( REGIND(EAX) )
+
+ ADD_L ( CONST(16), EDX ) /* next r */
+ MOVQ ( MM0, REGOFF(-16, EDX) ) /* r1 | r0 */
+
+ MOVQ ( MM1, REGOFF(-8, EDX) ) /* r3 | r2 */
+ MOVQ ( REGIND(EAX), MM0 ) /* x1 | x0 */
+
+ MOVQ ( REGOFF(8, EAX), MM1 ) /* x3 | x2 */
+ ADD_L ( EDI, EAX ) /* next vertex */
+
+ DEC_L ( ESI ) /* decrement vertex counter */
+ JNZ ( LLBL( G3TPIR_1 ) ) /* cnt > 0 ? -> process next vertex */
+
+LLBL( G3TPIR_2 ):
+
+ FEMMS
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
diff --git a/src/mesa/x86/common_x86.c b/src/mesa/x86/common_x86.c
index 7d893917185..15028197fb8 100644
--- a/src/mesa/x86/common_x86.c
+++ b/src/mesa/x86/common_x86.c
@@ -1,4 +1,4 @@
-/* $Id: common_x86.c,v 1.14 2001/03/28 20:44:43 gareth Exp $ */
+/* $Id: common_x86.c,v 1.15 2001/03/29 06:46:16 gareth Exp $ */
/*
* Mesa 3-D graphics library
@@ -35,7 +35,7 @@
#include <stdlib.h>
#include <stdio.h>
-#if defined(USE_KATMAI_ASM) && defined(__linux__)
+#if defined(USE_SSE_ASM) && defined(__linux__)
#include <signal.h>
#endif
@@ -67,7 +67,7 @@ static void message( const char *msg )
}
}
-#if defined(USE_KATMAI_ASM)
+#if defined(USE_SSE_ASM)
/*
* We must verify that the Streaming SIMD Extensions are truly supported
* on this processor before we go ahead and hook out the optimized code.
@@ -84,8 +84,8 @@ static void message( const char *msg )
* not good.
*/
-extern void _mesa_test_os_katmai_support( void );
-extern void _mesa_test_os_katmai_exception_support( void );
+extern void _mesa_test_os_sse_support( void );
+extern void _mesa_test_os_sse_exception_support( void );
#if defined(__linux__) && defined(_POSIX_SOURCE)
static void sigill_handler( int signal, struct sigcontext sc )
@@ -135,7 +135,7 @@ static void sigfpe_handler( int signal, struct sigcontext sc )
*
* GH: Isn't this just awful?
*/
-static void check_os_katmai_support( void )
+static void check_os_sse_support( void )
{
#if defined(__linux__)
#if defined(_POSIX_SOURCE)
@@ -159,7 +159,7 @@ static void check_os_katmai_support( void )
if ( cpu_has_xmm ) {
message( "Testing OS support for SSE... " );
- _mesa_test_os_katmai_support();
+ _mesa_test_os_sse_support();
if ( cpu_has_xmm ) {
message( "yes.\n" );
@@ -184,7 +184,7 @@ static void check_os_katmai_support( void )
if ( cpu_has_xmm ) {
message( "Testing OS support for SSE unmasked exceptions... " );
- _mesa_test_os_katmai_exception_support();
+ _mesa_test_os_sse_exception_support();
if ( cpu_has_xmm ) {
message( "yes.\n" );
@@ -220,7 +220,7 @@ static void check_os_katmai_support( void )
#endif /* __linux__ */
}
-#endif /* USE_KATMAI_ASM */
+#endif /* USE_SSE_ASM */
void _mesa_init_all_x86_transform_asm( void )
@@ -257,14 +257,14 @@ void _mesa_init_all_x86_transform_asm( void )
}
#endif
-#ifdef USE_KATMAI_ASM
- if ( cpu_has_xmm && getenv( "MESA_FORCE_KATMAI" ) == 0 ) {
- check_os_katmai_support();
+#ifdef USE_SSE_ASM
+ if ( cpu_has_xmm && getenv( "MESA_FORCE_SSE" ) == 0 ) {
+ check_os_sse_support();
}
if ( cpu_has_xmm ) {
- if ( getenv( "MESA_NO_KATMAI" ) == 0 ) {
- message( "Katmai cpu detected.\n" );
- _mesa_init_katmai_transform_asm();
+ if ( getenv( "MESA_NO_SSE" ) == 0 ) {
+ message( "SSE cpu detected.\n" );
+ _mesa_init_sse_transform_asm();
} else {
_mesa_x86_cpu_features &= ~(X86_FEATURE_XMM);
}
@@ -289,9 +289,9 @@ void _mesa_init_all_x86_vertex_asm( void )
}
#endif
-#ifdef USE_KATMAI_ASM
- if ( cpu_has_xmm && getenv( "MESA_NO_KATMAI" ) == 0 ) {
- _mesa_init_katmai_vertex_asm();
+#ifdef USE_SSE_ASM
+ if ( cpu_has_xmm && getenv( "MESA_NO_SSE" ) == 0 ) {
+ _mesa_init_sse_vertex_asm();
}
#endif
#endif
diff --git a/src/mesa/x86/common_x86_asm.S b/src/mesa/x86/common_x86_asm.S
index 69cc83f6cd2..8a0a6477afa 100644
--- a/src/mesa/x86/common_x86_asm.S
+++ b/src/mesa/x86/common_x86_asm.S
@@ -1,4 +1,4 @@
-/* $Id: common_x86_asm.S,v 1.6 2001/03/28 20:44:43 gareth Exp $ */
+/* $Id: common_x86_asm.S,v 1.7 2001/03/29 06:46:16 gareth Exp $ */
/*
* Mesa 3-D graphics library
@@ -58,8 +58,8 @@
GLNAME( found_intel ): STRING( "Genuine Intel processor found\n\0" )
GLNAME( found_amd ): STRING( "Authentic AMD processor found\n\0" )
-#ifdef USE_KATMAI_ASM
-GLNAME( katmai_test_dummy ):
+#ifdef USE_SSE_ASM
+GLNAME( sse_test_dummy ):
D_LONG 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
#endif
@@ -157,15 +157,15 @@ LLBL ( cpuid_done ):
RET
-#ifdef USE_KATMAI_ASM
+#ifdef USE_SSE_ASM
/* Execute an SSE instruction to see if the operating system correctly
* supports SSE. A signal handler for SIGILL should have been set
* before calling this function, otherwise this could kill the client
* application.
*/
ALIGNTEXT4
-GLOBL GLNAME( _mesa_test_os_katmai_support )
-GLNAME( _mesa_test_os_katmai_support ):
+GLOBL GLNAME( _mesa_test_os_sse_support )
+GLNAME( _mesa_test_os_sse_support ):
XORPS ( XMM0, XMM0 )
@@ -178,8 +178,8 @@ GLNAME( _mesa_test_os_katmai_support ):
* otherwise this could kill the client application.
*/
ALIGNTEXT4
-GLOBL GLNAME( _mesa_test_os_katmai_exception_support )
-GLNAME( _mesa_test_os_katmai_exception_support ):
+GLOBL GLNAME( _mesa_test_os_sse_exception_support )
+GLNAME( _mesa_test_os_sse_exception_support ):
PUSH_L ( EBP )
MOV_L ( ESP, EBP )
@@ -196,7 +196,7 @@ GLNAME( _mesa_test_os_katmai_exception_support ):
LDMXCSR ( REGOFF( -8, EBP ) )
XORPS ( XMM0, XMM0 )
- MOVUPS ( GLNAME( katmai_test_dummy ), XMM1 )
+ MOVUPS ( GLNAME( sse_test_dummy ), XMM1 )
DIVPS ( XMM0, XMM1 )
diff --git a/src/mesa/x86/common_x86_asm.h b/src/mesa/x86/common_x86_asm.h
index 312855dc652..ba965e6c088 100644
--- a/src/mesa/x86/common_x86_asm.h
+++ b/src/mesa/x86/common_x86_asm.h
@@ -1,4 +1,4 @@
-/* $Id: common_x86_asm.h,v 1.7 2001/03/28 20:44:44 gareth Exp $ */
+/* $Id: common_x86_asm.h,v 1.8 2001/03/29 06:46:16 gareth Exp $ */
/*
* Mesa 3-D graphics library
@@ -52,8 +52,8 @@
#ifdef USE_3DNOW_ASM
#include "3dnow.h"
#endif
-#ifdef USE_KATMAI_ASM
-#include "katmai.h"
+#ifdef USE_SSE_ASM
+#include "sse.h"
#endif
#endif
diff --git a/src/mesa/x86/sse.c b/src/mesa/x86/sse.c
new file mode 100644
index 00000000000..a986636edc7
--- /dev/null
+++ b/src/mesa/x86/sse.c
@@ -0,0 +1,208 @@
+/* $Id: sse.c,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * PentiumIII-SIMD (SSE) optimizations contributed by
+ * Andre Werthmann <[email protected]>
+ */
+
+#include "glheader.h"
+#include "context.h"
+#include "mtypes.h"
+#include "sse.h"
+
+#include "math/m_vertices.h"
+#include "math/m_xform.h"
+
+#include "tnl/t_context.h"
+
+#ifdef DEBUG
+#include "math/m_debug.h"
+#endif
+
+
+#define XFORM_ARGS GLvector4f *to_vec, \
+ const GLfloat m[16], \
+ const GLvector4f *from_vec, \
+ const GLubyte *mask, \
+ const GLubyte flag
+
+
+#define DECLARE_XFORM_GROUP( pfx, sz ) \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS );
+
+
+#define ASSIGN_XFORM_GROUP( pfx, sz ) \
+ _mesa_transform_tab[0][sz][MATRIX_GENERAL] = \
+ _mesa_##pfx##_transform_points##sz##_general; \
+ _mesa_transform_tab[0][sz][MATRIX_IDENTITY] = \
+ _mesa_##pfx##_transform_points##sz##_identity; \
+ _mesa_transform_tab[0][sz][MATRIX_3D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_3d_no_rot; \
+ _mesa_transform_tab[0][sz][MATRIX_PERSPECTIVE] = \
+ _mesa_##pfx##_transform_points##sz##_perspective; \
+ _mesa_transform_tab[0][sz][MATRIX_2D] = \
+ _mesa_##pfx##_transform_points##sz##_2d; \
+ _mesa_transform_tab[0][sz][MATRIX_2D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_2d_no_rot; \
+ _mesa_transform_tab[0][sz][MATRIX_3D] = \
+ _mesa_##pfx##_transform_points##sz##_3d;
+
+
+
+#define NORM_ARGS const GLmatrix *mat, \
+ GLfloat scale, \
+ const GLvector3f *in, \
+ const GLfloat *lengths, \
+ const GLubyte mask[], \
+ GLvector3f *dest
+
+
+#define DECLARE_NORM_GROUP( pfx ) \
+extern void _ASMAPI _mesa_##pfx##_rescale_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_normalize_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normals_no_rot( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_rescale_normals_no_rot( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals( NORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_normalize_normals_no_rot( NORM_ARGS );
+
+
+#define ASSIGN_NORM_GROUP( pfx ) \
+ _mesa_normal_tab[NORM_RESCALE][0] = \
+ _mesa_##pfx##_rescale_normals; \
+ _mesa_normal_tab[NORM_NORMALIZE][0] = \
+ _mesa_##pfx##_normalize_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM][0] = \
+ _mesa_##pfx##_transform_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT][0] = \
+ _mesa_##pfx##_transform_normals_no_rot; \
+ _mesa_normal_tab[NORM_TRANSFORM|NORM_RESCALE][0] = \
+ _mesa_##pfx##_transform_rescale_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT|NORM_RESCALE][0] = \
+ _mesa_##pfx##_transform_rescale_normals_no_rot; \
+ _mesa_normal_tab[NORM_TRANSFORM|NORM_NORMALIZE][0] = \
+ _mesa_##pfx##_transform_normalize_normals; \
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT|NORM_NORMALIZE][0] = \
+ _mesa_##pfx##_transform_normalize_normals_no_rot;
+
+
+#ifdef USE_SSE_ASM
+DECLARE_XFORM_GROUP( sse, 2 )
+DECLARE_XFORM_GROUP( sse, 3 )
+
+#if 1
+/* Some functions are not written in SSE-assembly, because the fpu ones are faster */
+extern void _mesa_sse_transform_normals_no_rot( NORM_ARGS );
+extern void _mesa_sse_transform_rescale_normals( NORM_ARGS );
+extern void _mesa_sse_transform_rescale_normals_no_rot( NORM_ARGS );
+
+extern void _mesa_sse_transform_points4_general( XFORM_ARGS );
+extern void _mesa_sse_transform_points4_3d( XFORM_ARGS );
+extern void _mesa_sse_transform_points4_identity( XFORM_ARGS );
+#else
+DECLARE_NORM_GROUP( sse )
+#endif
+
+
+extern void _ASMAPI
+_mesa_v16_sse_general_xform( GLfloat *first_vert,
+ const GLfloat *m,
+ const GLfloat *src,
+ GLuint src_stride,
+ GLuint count );
+
+extern void _ASMAPI
+_mesa_sse_project_vertices( GLfloat *first,
+ GLfloat *last,
+ const GLfloat *m,
+ GLuint stride );
+
+extern void _ASMAPI
+_mesa_sse_project_clipped_vertices( GLfloat *first,
+ GLfloat *last,
+ const GLfloat *m,
+ GLuint stride,
+ const GLubyte *clipmask );
+#endif
+
+
+void _mesa_init_sse_transform_asm( void )
+{
+#ifdef USE_SSE_ASM
+ ASSIGN_XFORM_GROUP( sse, 2 );
+ ASSIGN_XFORM_GROUP( sse, 3 );
+
+#if 1
+ /* TODO: Finish these off.
+ */
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT][0] =
+ _mesa_sse_transform_normals_no_rot;
+ _mesa_normal_tab[NORM_TRANSFORM|NORM_RESCALE][0] =
+ _mesa_sse_transform_rescale_normals;
+ _mesa_normal_tab[NORM_TRANSFORM_NO_ROT|NORM_RESCALE][0] =
+ _mesa_sse_transform_rescale_normals_no_rot;
+
+ _mesa_transform_tab[0][4][MATRIX_GENERAL] =
+ _mesa_sse_transform_points4_general;
+ _mesa_transform_tab[0][4][MATRIX_3D] =
+ _mesa_sse_transform_points4_3d;
+ _mesa_transform_tab[0][4][MATRIX_IDENTITY] =
+ _mesa_sse_transform_points4_identity;
+#else
+ ASSIGN_NORM_GROUP( sse );
+#endif
+
+#ifdef DEBUG
+ _math_test_all_transform_functions( "SSE" );
+ _math_test_all_normal_transform_functions( "SSE" );
+#endif
+#endif
+}
+
+void _mesa_init_sse_vertex_asm( void )
+{
+#ifdef USE_SSE_ASM
+ _mesa_xform_points3_v16_general = _mesa_v16_sse_general_xform;
+#if 0
+ /* GH: These are broken. I'm fixing them now.
+ */
+ _mesa_project_v16 = _mesa_sse_project_vertices;
+ _mesa_project_clipped_v16 = _mesa_sse_project_clipped_vertices;
+#endif
+
+#ifdef DEBUG_NOT
+ _math_test_all_vertex_functions( "SSE" );
+#endif
+#endif
+}
diff --git a/src/mesa/x86/sse.h b/src/mesa/x86/sse.h
new file mode 100644
index 00000000000..3ed1a95892e
--- /dev/null
+++ b/src/mesa/x86/sse.h
@@ -0,0 +1,40 @@
+/* $Id: sse.h,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * PentiumIII-SIMD (SSE) optimizations contributed by
+ * Andre Werthmann <[email protected]>
+ */
+
+#ifndef __SSE_H__
+#define __SSE_H__
+
+#include "math/m_xform.h"
+
+void _mesa_init_sse_transform_asm( void );
+void _mesa_init_sse_vertex_asm( void );
+
+#endif
diff --git a/src/mesa/x86/sse_normal.S b/src/mesa/x86/sse_normal.S
new file mode 100644
index 00000000000..8dd84967154
--- /dev/null
+++ b/src/mesa/x86/sse_normal.S
@@ -0,0 +1,252 @@
+/* $Id: sse_normal.S,v 1.1 2001/03/29 06:46:16 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache-misses !
+ * - some more optimizations are possible...
+ * - for 40-50% more performance in the SSE-functions, the
+ * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+ */
+
+#include "matypes.h"
+#include "norm_args.h"
+
+ SEG_TEXT
+
+#define M(i) REGOFF(i * 4, EDX)
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define STRIDE REGOFF(12, ESI)
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_rescale_normals_no_rot)
+GLNAME(_mesa_sse_transform_rescale_normals_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
+ MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
+
+ MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
+ MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
+
+ MOV_L ( REGOFF(V3F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L ( ECX, ECX )
+ JZ( LLBL(K_G3TRNNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L ( STRIDE, EAX ) /* stride */
+ MOV_L ( ECX, REGOFF(V3F_COUNT, EDI) ) /* set dest-count */
+
+ IMUL_L( CONST(12), ECX ) /* count *= 12 */
+ MOV_L( REGOFF(V3F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V3F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* m0 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS( XMM2, XMM1 ) /* m5 | m0 */
+ MOVSS ( ARG_SCALE, XMM0 ) /* scale */
+ SHUFPS ( CONST(0x0), XMM0, XMM0 ) /* scale | scale */
+ MULPS ( XMM0, XMM1 ) /* m5*scale | m0*scale */
+ MULSS ( M(10), XMM0 ) /* m10*scale */
+
+ALIGNTEXT32
+LLBL(K_G3TRNNRR_top):
+ MOVLPS ( S(0), XMM2 ) /* uy | ux */
+ MULPS ( XMM1, XMM2 ) /* uy*m5*scale | ux*m0*scale */
+ MOVLPS ( XMM2, D(0) ) /* ->D(1) | D(0) */
+
+ MOVSS ( S(2), XMM2 ) /* uz */
+ MULSS ( XMM0, XMM2 ) /* uz*m10*scale */
+ MOVSS ( XMM2, D(2) ) /* ->D(2) */
+
+LLBL(K_G3TRNNRR_skip):
+ ADD_L ( CONST(12), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_G3TRNNRR_top) )
+
+LLBL(K_G3TRNNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_rescale_normals)
+GLNAME(_mesa_sse_transform_rescale_normals):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
+ MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
+
+ MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
+ MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
+
+ MOV_L ( REGOFF(V3F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L ( ECX, ECX )
+ JZ( LLBL(K_G3TRNR_finish) ) /* count was zero; go to finish */
+
+ MOV_L ( STRIDE, EAX ) /* stride */
+ MOV_L ( ECX, REGOFF(V3F_COUNT, EDI) ) /* set dest-count */
+
+ IMUL_L( CONST(12), ECX ) /* count *= 12 */
+ MOV_L( REGOFF(V3F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V3F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM0 ) /* m0 */
+ MOVSS ( M(4), XMM1 ) /* m4 */
+ UNPCKLPS( XMM1, XMM0 ) /* m4 | m0 */
+
+ MOVSS ( ARG_SCALE, XMM4 ) /* scale */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* scale | scale */
+
+ MULPS ( XMM4, XMM0 ) /* m4*scale | m0*scale */
+ MOVSS ( M(1), XMM1 ) /* m1 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS( XMM2, XMM1 ) /* m5 | m1 */
+ MULPS ( XMM4, XMM1 ) /* m5*scale | m1*scale */
+ MOVSS ( M(2), XMM2 ) /* m2 */
+ MOVSS ( M(6), XMM3 ) /* m6 */
+ UNPCKLPS( XMM3, XMM2 ) /* m6 | m2 */
+ MULPS ( XMM4, XMM2 ) /* m6*scale | m2*scale */
+
+ MOVSS ( M(8), XMM6 ) /* m8 */
+ MULSS ( ARG_SCALE, XMM6 ) /* m8*scale */
+ MOVSS ( M(9), XMM7 ) /* m9 */
+ MULSS ( ARG_SCALE, XMM7 ) /* m9*scale */
+
+ALIGNTEXT32
+LLBL(K_G3TRNR_top):
+ MOVSS ( S(0), XMM3 ) /* ux */
+ SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ux | ux */
+ MULPS ( XMM0, XMM3 ) /* ux*m4 | ux*m0 */
+ MOVSS ( S(1), XMM4 ) /* uy */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* uy | uy */
+ MULPS ( XMM1, XMM4 ) /* uy*m5 | uy*m1 */
+ MOVSS ( S(2), XMM5 ) /* uz */
+ SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* uz | uz */
+ MULPS ( XMM2, XMM5 ) /* uz*m6 | uz*m2 */
+
+ ADDPS ( XMM4, XMM3 )
+ ADDPS ( XMM5, XMM3 )
+ MOVLPS ( XMM3, D(0) )
+
+ MOVSS ( M(10), XMM3 ) /* m10 */
+ MULSS ( ARG_SCALE, XMM3 ) /* m10*scale */
+ MULSS ( S(2), XMM3 ) /* m10*scale*uz */
+ MOVSS ( S(1), XMM4 ) /* uy */
+ MULSS ( XMM7, XMM4 ) /* uy*m9*scale */
+ MOVSS ( S(0), XMM5 ) /* ux */
+ MULSS ( XMM6, XMM5 ) /* ux*m8*scale */
+
+ ADDSS ( XMM4, XMM3 )
+ ADDSS ( XMM5, XMM3 )
+ MOVSS ( XMM3, D(2) )
+
+LLBL(K_G3TRNR_skip):
+ ADD_L ( CONST(12), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_G3TRNR_top) )
+
+LLBL(K_G3TRNR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_sse_transform_normals_no_rot)
+GLNAME(_mesa_sse_transform_normals_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L ( ARG_IN, ESI ) /* ptr to source GLvector3f */
+ MOV_L ( ARG_DEST, EDI ) /* ptr to dest GLvector3f */
+
+ MOV_L ( ARG_MAT, EDX ) /* ptr to matrix */
+ MOV_L ( REGOFF(MATRIX_INV, EDX), EDX) /* matrix->inv */
+
+ MOV_L ( REGOFF(V3F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L ( ECX, ECX )
+ JZ( LLBL(K_G3TNNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L ( STRIDE, EAX ) /* stride */
+ MOV_L ( ECX, REGOFF(V3F_COUNT, EDI) ) /* set dest-count */
+
+ IMUL_L( CONST(12), ECX ) /* count *= 12 */
+ MOV_L( REGOFF(V3F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V3F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS( M(0), XMM0 ) /* m0 */
+ MOVSS( M(5), XMM1 ) /* m5 */
+ UNPCKLPS( XMM1, XMM0 ) /* m5 | m0 */
+ MOVSS( M(10), XMM1 ) /* m10 */
+
+ALIGNTEXT32
+LLBL(K_G3TNNRR_top):
+ MOVLPS( S(0), XMM2 ) /* uy | ux */
+ MULPS( XMM0, XMM2 ) /* uy*m5 | ux*m0 */
+ MOVLPS( XMM2, D(0) )
+
+ MOVSS( S(2), XMM2 ) /* uz */
+ MULSS( XMM1, XMM2 ) /* uz*m10 */
+ MOVSS( XMM2, D(2) )
+
+LLBL(K_G3TNNRR_skip):
+ ADD_L ( CONST(12), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_G3TNNRR_top) )
+
+LLBL(K_G3TNNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
diff --git a/src/mesa/x86/sse_xform1.S b/src/mesa/x86/sse_xform1.S
new file mode 100644
index 00000000000..89c55673311
--- /dev/null
+++ b/src/mesa/x86/sse_xform1.S
@@ -0,0 +1,433 @@
+/* $Id: sse_xform1.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache-misses !
+ * - some more optimizations are possible...
+ * - for 40-50% more performance in the SSE-functions, the
+ * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define M(i) REGOFF(i * 4, EDX)
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_general)
+GLNAME( _mesa_sse_transform_points1_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ CMP_L( CONST(0), ECX ) /* count == 0 ? */
+ JE( LLBL(K_GTP1GR_finish) ) /* yes -> nothing to do. */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP1GR_top):
+ MOVSS( S(0), XMM2 ) /* ox */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ ADDPS( XMM1, XMM2 ) /* + | + | + | + */
+ MOVUPS( XMM2, D(0) )
+
+LLBL(K_GTP1GR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP1GR_top) )
+
+LLBL(K_GTP1GR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_identity)
+GLNAME( _mesa_sse_transform_points1_identity ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP1IR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_1), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(1), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(K_GTP1IR_finish) )
+
+
+ALIGNTEXT32
+LLBL(K_GTP1IR_top):
+ MOV_L( S(0), EDX )
+ MOV_L( EDX, D(0) )
+
+LLBL(K_GTP1IR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP1IR_top) )
+
+LLBL(K_GTP1IR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_3d_no_rot)
+GLNAME(_mesa_sse_transform_points1_3d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVSS( M(0), XMM0 ) /* m0 */
+ MOVSS( M(12), XMM1 ) /* m12 */
+ MOVSS( M(13), XMM2 ) /* m13 */
+ MOVSS( M(14), XMM3 ) /* m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP13DNRR_top):
+ MOVSS( S(0), XMM4 ) /* ox */
+ MULSS( XMM0, XMM4 ) /* ox*m0 */
+ ADDSS( XMM1, XMM4 ) /* ox*m0+m12 */
+ MOVSS( XMM4, D(0) )
+
+ MOVSS( XMM2, D(1) )
+ MOVSS( XMM3, D(2) )
+
+LLBL(K_GTP13DNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP13DNRR_top) )
+
+LLBL(K_GTP13DNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_perspective)
+GLNAME(_mesa_sse_transform_points1_perspective):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13PR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ XORPS( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */
+ MOVSS( M(0), XMM1 ) /* m0 */
+ MOVSS( M(14), XMM2 ) /* m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP13PR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ MULSS( XMM1, XMM3 ) /* ox*m0 */
+ MOVSS( XMM3, D(0) ) /* ox*m0->D(0) */
+ MOVSS( XMM2, D(2) ) /* m14->D(2) */
+
+ MOVSS( XMM0, D(1) )
+ MOVSS( XMM0, D(3) )
+
+LLBL(K_GTP13PR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP13PR_top) )
+
+LLBL(K_GTP13PR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_2d)
+GLNAME(_mesa_sse_transform_points1_2d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13P2DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVLPS( M(0), XMM0 ) /* m1 | m0 */
+ MOVLPS( M(12), XMM1 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P2DR_top):
+ MOVSS( S(0), XMM2 ) /* ox */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM2 ) /* - | - | ox*m1 | ox*m0 */
+ ADDPS( XMM1, XMM2 ) /* - | - | ox*m1+m13 | ox*m0+m12 */
+ MOVLPS( XMM2, D(0) )
+
+LLBL(K_GTP13P2DR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP13P2DR_top) )
+
+LLBL(K_GTP13P2DR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_2d_no_rot)
+GLNAME(_mesa_sse_transform_points1_2d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13P2DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS( M(0), XMM0 ) /* m0 */
+ MOVSS( M(12), XMM1 ) /* m12 */
+ MOVSS( M(13), XMM2 ) /* m13 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P2DNRR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ MULSS( XMM0, XMM3 ) /* ox*m0 */
+ ADDSS( XMM1, XMM3 ) /* ox*m0+m12 */
+ MOVSS( XMM3, D(0) )
+ MOVSS( XMM2, D(1) )
+
+LLBL(K_GTP13P2DNRR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP13P2DNRR_top) )
+
+LLBL(K_GTP13P2DNRR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points1_3d)
+GLNAME(_mesa_sse_transform_points1_3d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP13P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( M(12), XMM1 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP13P3DR_top):
+ MOVSS( S(0), XMM2 ) /* ox */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM2 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ ADDPS( XMM1, XMM2 ) /* +m15 | +m14 | +m13 | +m12 */
+ MOVLPS( XMM2, D(0) ) /* - | - | ->D(1)| ->D(0)*/
+ UNPCKHPS( XMM2, XMM2 ) /* ox*m3+m15 | ox*m3+m15 | ox*m2+m14 | ox*m2+m14 */
+ MOVSS( XMM2, D(2) )
+
+LLBL(K_GTP13P3DR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP13P3DR_top) )
+
+LLBL(K_GTP13P3DR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
diff --git a/src/mesa/x86/sse_xform2.S b/src/mesa/x86/sse_xform2.S
new file mode 100644
index 00000000000..f936088f917
--- /dev/null
+++ b/src/mesa/x86/sse_xform2.S
@@ -0,0 +1,452 @@
+/* $Id: sse_xform2.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache-misses !
+ * - some more optimizations are possible...
+ * - for 40-50% more performance in the SSE-functions, the
+ * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define M(i) REGOFF(i * 4, EDX)
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_general)
+GLNAME( _mesa_sse_transform_points2_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL(K_GTP2GR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( M(4), XMM1 ) /* m7 | m6 | m5 | m4 */
+ MOVAPS( M(12), XMM2 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP2GR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM3 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+ MOVSS( S(1), XMM4 ) /* oy */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy | oy */
+ MULPS( XMM1, XMM4 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+ ADDPS( XMM4, XMM3 )
+ ADDPS( XMM2, XMM3 )
+ MOVAPS( XMM3, D(0) )
+
+LLBL(K_GTP2GR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP2GR_top) )
+
+LLBL(K_GTP2GR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_identity)
+GLNAME( _mesa_sse_transform_points2_identity ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP2IR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(K_GTP2IR_finish) )
+
+
+ALIGNTEXT32
+LLBL(K_GTP2IR_top):
+ MOV_L ( S(0), EDX )
+ MOV_L ( EDX, D(0) )
+ MOV_L ( S(1), EDX )
+ MOV_L ( EDX, D(1) )
+
+LLBL(K_GTP2IR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP2IR_top) )
+
+LLBL(K_GTP2IR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_3d_no_rot)
+GLNAME(_mesa_sse_transform_points2_3d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */
+ MOVSS ( M(14), XMM3 ) /* - | - | - | m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP23DNRR_top):
+ MOVLPS ( S(0), XMM0 ) /* - | - | oy | ox */
+ MULPS ( XMM1, XMM0 ) /* - | - | oy*m5 | ox*m0 */
+ ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */
+ MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */
+
+ MOVSS ( XMM3, D(2) ) /* -> D(2) */
+
+LLBL(K_GTP23DNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP23DNRR_top) )
+
+LLBL(K_GTP23DNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_perspective)
+GLNAME(_mesa_sse_transform_points2_perspective):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23PR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVSS ( M(14), XMM3 ) /* m14 */
+ XORPS ( XMM0, XMM0 ) /* 0 | 0 | 0 | 0 */
+
+ALIGNTEXT32
+LLBL(K_GTP23PR_top):
+ MOVLPS( S(0), XMM4 ) /* oy | ox */
+ MULPS( XMM1, XMM4 ) /* oy*m5 | ox*m0 */
+ MOVLPS( XMM4, D(0) ) /* ->D(1) | ->D(0) */
+ MOVSS( XMM3, D(2) ) /* ->D(2) */
+ MOVSS( XMM0, D(3) ) /* ->D(3) */
+
+LLBL(K_GTP23PR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP23PR_top) )
+
+LLBL(K_GTP23PR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_2d)
+GLNAME(_mesa_sse_transform_points2_2d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23P2DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVLPS( M(0), XMM0 ) /* m1 | m0 */
+ MOVLPS( M(4), XMM1 ) /* m5 | m4 */
+ MOVLPS( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P2DR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox */
+ MULPS( XMM0, XMM3 ) /* ox*m1 | ox*m0 */
+
+ MOVSS( S(1), XMM4 ) /* oy */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy */
+ MULPS( XMM1, XMM4 ) /* oy*m5 | oy*m4 */
+
+ ADDPS( XMM4, XMM3 )
+ ADDPS( XMM2, XMM3 )
+ MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */
+
+LLBL(K_GTP23P2DR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP23P2DR_top) )
+
+LLBL(K_GTP23P2DR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_2d_no_rot)
+GLNAME(_mesa_sse_transform_points2_2d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23P2DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* m0 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P2DNRR_top):
+ MOVLPS( S(0), XMM0 ) /* oy | ox */
+ MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
+ ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */
+ MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */
+
+LLBL(K_GTP23P2DNRR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP23P2DNRR_top) )
+
+LLBL(K_GTP23P2DNRR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points2_3d)
+GLNAME(_mesa_sse_transform_points2_3d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP23P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */
+ MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */
+ MOVAPS( M(12), XMM2 ) /* m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP23P3DR_top):
+ MOVSS( S(0), XMM3 ) /* ox */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ox | ox | ox */
+ MULPS( XMM0, XMM3 ) /* ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( S(1), XMM4 ) /* oy */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* oy | oy | oy */
+ MULPS( XMM1, XMM4 ) /* oy*m6 | oy*m5 | oy*m4 */
+
+ ADDPS( XMM4, XMM3 )
+ ADDPS( XMM2, XMM3 )
+
+ MOVLPS( XMM3, D(0) ) /* ->D(1) | ->D(0) */
+ UNPCKHPS( XMM3, XMM3 )
+ MOVSS( XMM3, D(2) ) /* ->D(2) */
+
+LLBL(K_GTP23P3DR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP23P3DR_top) )
+
+LLBL(K_GTP23P3DR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
diff --git a/src/mesa/x86/sse_xform3.S b/src/mesa/x86/sse_xform3.S
new file mode 100644
index 00000000000..ff989c337df
--- /dev/null
+++ b/src/mesa/x86/sse_xform3.S
@@ -0,0 +1,498 @@
+/* $Id: sse_xform3.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/** TODO:
+ * - insert PREFETCH instructions to avoid cache-misses !
+ * - some more optimizations are possible...
+ * - for 40-50% more performance in the SSE-functions, the
+ * data (trans-matrix, src_vert, dst_vert) needs to be 16byte aligned !
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define S(i) REGOFF(i * 4, ESI)
+#define D(i) REGOFF(i * 4, EDI)
+#define M(i) REGOFF(i * 4, EDX)
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_general)
+GLNAME( _mesa_sse_transform_points3_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ CMP_L ( CONST(0), ECX ) /* count == 0 ? */
+ JE ( LLBL(K_GTPGR_finish) ) /* yes -> nothing to do. */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS ( REGOFF(0, EDX), XMM0 ) /* m0 | m1 | m2 | m3 */
+ MOVAPS ( REGOFF(16, EDX), XMM1 ) /* m4 | m5 | m6 | m7 */
+ MOVAPS ( REGOFF(32, EDX), XMM2 ) /* m8 | m9 | m10 | m11 */
+ MOVAPS ( REGOFF(48, EDX), XMM3 ) /* m12 | m13 | m14 | m15 */
+
+
+ALIGNTEXT32
+LLBL(K_GTPGR_top):
+ MOVSS ( REGOFF(0, ESI), XMM4 ) /* | | | ox */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */
+ MOVSS ( REGOFF(4, ESI), XMM5 ) /* | | | oy */
+ SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */
+ MOVSS ( REGOFF(8, ESI), XMM6 ) /* | | | oz */
+ SHUFPS ( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */
+
+ MULPS ( XMM0, XMM4 ) /* m3*ox | m2*ox | m1*ox | m0*ox */
+ MULPS ( XMM1, XMM5 ) /* m7*oy | m6*oy | m5*oy | m4*oy */
+ MULPS ( XMM2, XMM6 ) /* m11*oz | m10*oz | m9*oz | m8*oz */
+
+ ADDPS ( XMM5, XMM4 )
+ ADDPS ( XMM6, XMM4 )
+ ADDPS ( XMM3, XMM4 )
+
+ MOVAPS ( XMM4, REGOFF(0, EDI) )
+
+LLBL(K_GTPGR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTPGR_top) )
+
+LLBL(K_GTPGR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_identity)
+GLNAME( _mesa_sse_transform_points3_identity ):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTPIR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ CMP_L( ESI, EDI )
+ JE( LLBL(K_GTPIR_finish) )
+
+
+ALIGNTEXT32
+LLBL(K_GTPIR_top):
+ MOVLPS ( S(0), XMM0 )
+ MOVLPS ( XMM0, D(0) )
+ MOVSS ( S(2), XMM0 )
+ MOVSS ( XMM0, D(2) )
+
+LLBL(K_GTPIR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTPIR_top) )
+
+LLBL(K_GTPIR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_3d_no_rot)
+GLNAME(_mesa_sse_transform_points3_3d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* - | - | m13 | m12 */
+ MOVSS ( M(10), XMM3 ) /* - | - | - | m10 */
+ MOVSS ( M(14), XMM4 ) /* - | - | - | m14 */
+
+ALIGNTEXT32
+LLBL(K_GTP3DNRR_top):
+
+ MOVLPS ( S(0), XMM0 ) /* - | - | s1 | s0 */
+ MULPS ( XMM1, XMM0 ) /* - | - | s1*m5 | s0*m0 */
+ ADDPS ( XMM2, XMM0 ) /* - | - | +m13 | +m12 */
+ MOVLPS ( XMM0, D(0) ) /* -> D(1) | -> D(0) */
+
+ MOVSS ( S(2), XMM0 ) /* sz */
+ MULSS ( XMM3, XMM0 ) /* sz*m10 */
+ ADDSS ( XMM4, XMM0 ) /* +m14 */
+ MOVSS ( XMM0, D(2) ) /* -> D(2) */
+
+LLBL(K_GTP3DNRR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP3DNRR_top) )
+
+LLBL(K_GTP3DNRR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_perspective)
+GLNAME(_mesa_sse_transform_points3_perspective):
+
+#define FRAME_OFFSET 8
+ PUSH_L ( ESI )
+ PUSH_L ( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3PR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* - | - | - | m0 */
+ MOVSS ( M(5), XMM2 ) /* - | - | - | m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* - | - | m5 | m0 */
+ MOVLPS ( M(8), XMM2 ) /* - | - | m9 | m8 */
+ MOVSS ( M(10), XMM3 ) /* m10 */
+ MOVSS ( M(14), XMM4 ) /* m14 */
+ XORPS ( XMM6, XMM6 ) /* 0 */
+
+ALIGNTEXT32
+LLBL(K_GTP3PR_top):
+ MOVLPS ( S(0), XMM0 ) /* oy | ox */
+ MULPS ( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
+ MOVSS ( S(2), XMM5 ) /* oz */
+ SHUFPS ( CONST(0x0), XMM5, XMM5 ) /* oz | oz */
+ MULPS ( XMM2, XMM5 ) /* oz*m9 | oz*m8 */
+ ADDPS ( XMM5, XMM0 ) /* +oy*m5 | +ox*m0 */
+ MOVLPS ( XMM0, D(0) ) /* ->D(1) | ->D(0) */
+
+ MOVSS ( S(2), XMM0 ) /* oz */
+ MULSS ( XMM3, XMM0 ) /* oz*m10 */
+ ADDSS ( XMM4, XMM0 ) /* +m14 */
+ MOVSS ( XMM0, D(2) ) /* ->D(2) */
+
+ MOVSS ( S(2), XMM0 ) /* oz */
+ MOVSS ( XMM6, XMM5 ) /* 0 */
+ SUBPS ( XMM0, XMM5 ) /* -oz */
+ MOVSS ( XMM5, D(3) ) /* ->D(3) */
+
+LLBL(K_GTP3PR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP3PR_top) )
+
+LLBL(K_GTP3PR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_2d)
+GLNAME(_mesa_sse_transform_points3_2d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3P2DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVLPS( M(0), XMM0 ) /* m1 | m0 */
+ MOVLPS( M(4), XMM1 ) /* m5 | m4 */
+ MOVLPS( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P2DR_top):
+ MOVSS ( S(0), XMM3 ) /* ox */
+ SHUFPS ( CONST(0x0), XMM3, XMM3 ) /* ox | ox */
+ MULPS ( XMM0, XMM3 ) /* ox*m1 | ox*m0 */
+ MOVSS ( S(1), XMM4 ) /* oy */
+ SHUFPS ( CONST(0x0), XMM4, XMM4 ) /* oy | oy */
+ MULPS ( XMM1, XMM4 ) /* oy*m5 | oy*m4 */
+
+ ADDPS ( XMM4, XMM3 )
+ ADDPS ( XMM2, XMM3 )
+ MOVLPS ( XMM3, D(0) )
+
+ MOVSS ( S(2), XMM3 )
+ MOVSS ( XMM3, D(2) )
+
+LLBL(K_GTP3P2DR_skip):
+ ADD_L ( CONST(16), EDI )
+ ADD_L ( EAX, ESI )
+ CMP_L ( ECX, EDI )
+ JNE ( LLBL(K_GTP3P2DR_top) )
+
+LLBL(K_GTP3P2DR_finish):
+ POP_L ( EDI )
+ POP_L ( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_2d_no_rot)
+GLNAME(_mesa_sse_transform_points3_2d_no_rot):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3P2DNRR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ALIGNTEXT32
+ MOVSS ( M(0), XMM1 ) /* m0 */
+ MOVSS ( M(5), XMM2 ) /* m5 */
+ UNPCKLPS ( XMM2, XMM1 ) /* m5 | m0 */
+ MOVLPS ( M(12), XMM2 ) /* m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P2DNRR_top):
+ MOVLPS( S(0), XMM0 ) /* oy | ox */
+ MULPS( XMM1, XMM0 ) /* oy*m5 | ox*m0 */
+ ADDPS( XMM2, XMM0 ) /* +m13 | +m12 */
+ MOVLPS( XMM0, D(0) ) /* ->D(1) | ->D(0) */
+
+ MOVSS( S(2), XMM0 )
+ MOVSS( XMM0, D(2) )
+
+LLBL(K_GTP3P2DNRR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP3P2DNRR_top) )
+
+LLBL(K_GTP3P2DNRR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME(_mesa_sse_transform_points3_3d)
+GLNAME(_mesa_sse_transform_points3_3d):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( REGOFF(OFFSET_SOURCE+8, ESP), ESI ) /* ptr to source GLvector4f */
+ MOV_L( REGOFF(OFFSET_DEST+8, ESP), EDI ) /* ptr to dest GLvector4f */
+
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP3P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) ) /* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+
+ALIGNTEXT32
+ MOVAPS( M(0), XMM0 ) /* m2 | m1 | m0 */
+ MOVAPS( M(4), XMM1 ) /* m6 | m5 | m4 */
+ MOVAPS( M(8), XMM2 ) /* m10 | m9 | m8 */
+ MOVAPS( M(12), XMM3 ) /* m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL(K_GTP3P3DR_top):
+ MOVSS( S(0), XMM4 )
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox */
+ MULPS( XMM0, XMM4 ) /* ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( S(1), XMM5 )
+ SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy */
+ MULPS( XMM1, XMM5 ) /* oy*m6 | oy*m5 | oy*m4 */
+
+ MOVSS( S(2), XMM6 )
+ SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz */
+ MULPS( XMM2, XMM6 ) /* oz*m10 | oz*m9 | oz*m8 */
+
+ ADDPS( XMM5, XMM4 ) /* + | + | + */
+ ADDPS( XMM6, XMM4 ) /* + | + | + */
+ ADDPS( XMM3, XMM4 ) /* + | + | + */
+
+ MOVLPS( XMM4, D(0) ) /* => D(1) | => D(0) */
+ UNPCKHPS( XMM4, XMM4 )
+ MOVSS( XMM4, D(2) )
+
+LLBL(K_GTP3P3DR_skip):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP3P3DR_top) )
+
+LLBL(K_GTP3P3DR_finish):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
diff --git a/src/mesa/x86/sse_xform4.S b/src/mesa/x86/sse_xform4.S
new file mode 100644
index 00000000000..13cb2a3ecb5
--- /dev/null
+++ b/src/mesa/x86/sse_xform4.S
@@ -0,0 +1,226 @@
+/* $Id: sse_xform4.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FRAME_OFFSET 8
+
+#define SRC(i) REGOFF(i * 4, ESI)
+#define DST(i) REGOFF(i * 4, EDI)
+#define MAT(i) REGOFF(i * 4, EDX)
+
+#define SELECT(r0, r1, r2, r3) CONST( r0 * 64 + r1 * 16 + r2 * 4 + r3 )
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_sse_transform_points4_general )
+GLNAME( _mesa_sse_transform_points4_general ):
+
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX ) /* verify non-zero count */
+ JE( LLBL( sse_general_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+
+ PREFETCHT0( REGIND(ESI) )
+
+ MOVAPS( MAT(0), XMM4 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( MAT(4), XMM5 ) /* m7 | m6 | m5 | m4 */
+ MOVAPS( MAT(8), XMM6 ) /* m11 | m10 | m9 | m8 */
+ MOVAPS( MAT(12), XMM7 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT16
+LLBL( sse_general_loop ):
+
+ MOVSS( SRC(0), XMM0 ) /* ox */
+ SHUFPS( CONST(0x0), XMM0, XMM0 ) /* ox | ox | ox | ox */
+ MULPS( XMM4, XMM0 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( SRC(1), XMM1 ) /* oy */
+ SHUFPS( CONST(0x0), XMM1, XMM1 ) /* oy | oy | oy | oy */
+ MULPS( XMM5, XMM1 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+ MOVSS( SRC(2), XMM2 ) /* oz */
+ SHUFPS( CONST(0x0), XMM2, XMM2 ) /* oz | oz | oz | oz */
+ MULPS( XMM6, XMM2 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+
+ MOVSS( SRC(3), XMM3 ) /* ow */
+ SHUFPS( CONST(0x0), XMM3, XMM3 ) /* ow | ow | ow | ow */
+ MULPS( XMM7, XMM3 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+
+ ADDPS( XMM1, XMM0 ) /* ox*m3+oy*m7 | ... */
+ ADDPS( XMM2, XMM0 ) /* ox*m3+oy*m7+oz*m11 | ... */
+ ADDPS( XMM3, XMM0 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+ MOVAPS( XMM0, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+
+ DEC_L( ECX )
+ JNZ( LLBL( sse_general_loop ) )
+
+LLBL( sse_general_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_sse_transform_points4_3d )
+GLNAME( _mesa_sse_transform_points4_3d ):
+
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI ) /* ptr to source GLvector4f */
+ MOV_L( ARG_DEST, EDI ) /* ptr to dest GLvector4f */
+
+ MOV_L( ARG_MATRIX, EDX ) /* ptr to matrix */
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX ) /* source count */
+
+ TEST_L( ECX, ECX)
+ JZ( LLBL(K_GTP43P3DR_finish) ) /* count was zero; go to finish */
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+ SHL_L( CONST(4), ECX ) /* count *= 16 */
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+ ADD_L( EDI, ECX ) /* count += dest ptr */
+
+ MOVAPS( MAT(0), XMM0 ) /* m3 | m2 | m1 | m0 */
+ MOVAPS( MAT(4), XMM1 ) /* m7 | m6 | m5 | m4 */
+ MOVAPS( MAT(8), XMM2 ) /* m11 | m10 | m9 | m8 */
+ MOVAPS( MAT(12), XMM3 ) /* m15 | m14 | m13 | m12 */
+
+ALIGNTEXT32
+LLBL( K_GTP43P3DR_top ):
+ MOVSS( SRC(0), XMM4 ) /* ox */
+ SHUFPS( CONST(0x0), XMM4, XMM4 ) /* ox | ox | ox | ox */
+ MULPS( XMM0, XMM4 ) /* ox*m3 | ox*m2 | ox*m1 | ox*m0 */
+
+ MOVSS( SRC(1), XMM5 ) /* oy */
+ SHUFPS( CONST(0x0), XMM5, XMM5 ) /* oy | oy | oy | oy */
+ MULPS( XMM1, XMM5 ) /* oy*m7 | oy*m6 | oy*m5 | oy*m4 */
+
+ MOVSS( SRC(2), XMM6 ) /* oz */
+ SHUFPS( CONST(0x0), XMM6, XMM6 ) /* oz | oz | oz | oz */
+ MULPS( XMM2, XMM6 ) /* oz*m11 | oz*m10 | oz*m9 | oz*m8 */
+
+ MOVSS( SRC(3), XMM7 ) /* ow */
+ SHUFPS( CONST(0x0), XMM7, XMM7 ) /* ow | ow | ow | ow */
+ MULPS( XMM3, XMM7 ) /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
+
+ ADDPS( XMM5, XMM4 ) /* ox*m3+oy*m7 | ... */
+ ADDPS( XMM6, XMM4 ) /* ox*m3+oy*m7+oz*m11 | ... */
+ ADDPS( XMM7, XMM4 ) /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
+ MOVAPS( XMM4, DST(0) ) /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
+
+ MOVSS( SRC(3), XMM4 ) /* ow */
+ MOVSS( XMM4, DST(3) ) /* ->D(3) */
+
+LLBL( K_GTP43P3DR_skip ):
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL(K_GTP43P3DR_top) )
+
+LLBL( K_GTP43P3DR_finish ):
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_sse_transform_points4_identity )
+GLNAME( _mesa_sse_transform_points4_identity ):
+
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX ) /* verify non-zero count */
+ JE( LLBL( sse_identity_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX ) /* stride */
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) ) /* set dest flags */
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) ) /* set dest count */
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )/* set dest size */
+
+ MOV_L( REGOFF(V4F_START, ESI), ESI ) /* ptr to first source vertex */
+ MOV_L( REGOFF(V4F_START, EDI), EDI ) /* ptr to first dest vertex */
+
+ALIGNTEXT16
+LLBL( sse_identity_loop ):
+
+ PREFETCHNTA( REGOFF(32, ESI) )
+
+ MOVAPS( REGIND(ESI), XMM0 )
+ ADD_L( EAX, ESI )
+
+ MOVAPS( XMM0, REGIND(EDI) )
+ ADD_L( CONST(16), EDI )
+
+ DEC_L( ECX )
+ JNZ( LLBL( sse_identity_loop ) )
+
+LLBL( sse_identity_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
diff --git a/src/mesa/x86/x86.c b/src/mesa/x86/x86.c
index 4f35a2ffc50..a51f181916d 100644
--- a/src/mesa/x86/x86.c
+++ b/src/mesa/x86/x86.c
@@ -1,4 +1,4 @@
-/* $Id: x86.c,v 1.19 2001/03/28 20:44:44 gareth Exp $ */
+/* $Id: x86.c,v 1.20 2001/03/29 06:46:27 gareth Exp $ */
/*
* Mesa 3-D graphics library
@@ -50,40 +50,37 @@
const GLubyte flag
-#define DECLARE_XFORM_GROUP( pfx, sz, masked ) \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot_##masked( XFORM_ARGS ); \
-extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_##masked( XFORM_ARGS );
-
-
-#define ASSIGN_XFORM_GROUP( pfx, cma, sz, masked ) \
- _mesa_transform_tab[cma][sz][MATRIX_GENERAL] = \
- _mesa_##pfx##_transform_points##sz##_general_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_IDENTITY] = \
- _mesa_##pfx##_transform_points##sz##_identity_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_3D_NO_ROT] = \
- _mesa_##pfx##_transform_points##sz##_3d_no_rot_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_PERSPECTIVE] = \
- _mesa_##pfx##_transform_points##sz##_perspective_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_2D] = \
- _mesa_##pfx##_transform_points##sz##_2d_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_2D_NO_ROT] = \
- _mesa_##pfx##_transform_points##sz##_2d_no_rot_##masked; \
- _mesa_transform_tab[cma][sz][MATRIX_3D] = \
- _mesa_##pfx##_transform_points##sz##_3d_##masked;
+#define DECLARE_XFORM_GROUP( pfx, sz ) \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_general( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_identity( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_perspective( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_2d_no_rot( XFORM_ARGS ); \
+extern void _ASMAPI _mesa_##pfx##_transform_points##sz##_3d( XFORM_ARGS );
+
+
+#define ASSIGN_XFORM_GROUP( pfx, sz ) \
+ _mesa_transform_tab[0][sz][MATRIX_GENERAL] = \
+ _mesa_##pfx##_transform_points##sz##_general; \
+ _mesa_transform_tab[0][sz][MATRIX_IDENTITY] = \
+ _mesa_##pfx##_transform_points##sz##_identity; \
+ _mesa_transform_tab[0][sz][MATRIX_3D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_3d_no_rot; \
+ _mesa_transform_tab[0][sz][MATRIX_PERSPECTIVE] = \
+ _mesa_##pfx##_transform_points##sz##_perspective; \
+ _mesa_transform_tab[0][sz][MATRIX_2D] = \
+ _mesa_##pfx##_transform_points##sz##_2d; \
+ _mesa_transform_tab[0][sz][MATRIX_2D_NO_ROT] = \
+ _mesa_##pfx##_transform_points##sz##_2d_no_rot; \
+ _mesa_transform_tab[0][sz][MATRIX_3D] = \
+ _mesa_##pfx##_transform_points##sz##_3d;
#ifdef USE_X86_ASM
-DECLARE_XFORM_GROUP( x86, 2, raw )
-DECLARE_XFORM_GROUP( x86, 3, raw )
-DECLARE_XFORM_GROUP( x86, 4, raw )
-DECLARE_XFORM_GROUP( x86, 2, masked )
-DECLARE_XFORM_GROUP( x86, 3, masked )
-DECLARE_XFORM_GROUP( x86, 4, masked )
+DECLARE_XFORM_GROUP( x86, 2 )
+DECLARE_XFORM_GROUP( x86, 3 )
+DECLARE_XFORM_GROUP( x86, 4 )
extern GLvector4f * _ASMAPI
@@ -119,13 +116,9 @@ _mesa_v16_x86_general_xform( GLfloat *dest,
void _mesa_init_x86_transform_asm( void )
{
#ifdef USE_X86_ASM
- ASSIGN_XFORM_GROUP( x86, 0, 2, raw );
- ASSIGN_XFORM_GROUP( x86, 0, 3, raw );
- ASSIGN_XFORM_GROUP( x86, 0, 4, raw );
-
-/* ASSIGN_XFORM_GROUP( x86, CULL_MASK_ACTIVE, 2, masked ); */
-/* ASSIGN_XFORM_GROUP( x86, CULL_MASK_ACTIVE, 3, masked ); */
-/* ASSIGN_XFORM_GROUP( x86, CULL_MASK_ACTIVE, 4, masked ); */
+ ASSIGN_XFORM_GROUP( x86, 2 );
+ ASSIGN_XFORM_GROUP( x86, 3 );
+ ASSIGN_XFORM_GROUP( x86, 4 );
/* XXX this function has been found to cause FP overflow exceptions */
_mesa_clip_tab[4] = _mesa_x86_cliptest_points4;
diff --git a/src/mesa/x86/x86_xform2.S b/src/mesa/x86/x86_xform2.S
new file mode 100644
index 00000000000..0e46a8411ec
--- /dev/null
+++ b/src/mesa/x86/x86_xform2.S
@@ -0,0 +1,536 @@
+/* $Id: x86_xform2.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FP_ONE 1065353216
+#define FP_ZERO 0
+
+#define SRC(i) REGOFF(i * 4, ESI)
+#define DST(i) REGOFF(i * 4, EDI)
+#define MAT(i) REGOFF(i * 4, EDX)
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_general )
+GLNAME( _mesa_x86_transform_points2_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_gr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p2_gr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+ FLD_S( SRC(0) ) /* F6 F5 F4 */
+ FMUL_S( MAT(2) )
+ FLD_S( SRC(0) ) /* F7 F6 F5 F4 */
+ FMUL_S( MAT(3) )
+
+ FLD_S( SRC(1) ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(5) )
+ FLD_S( SRC(1) ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(6) )
+ FLD_S( SRC(1) ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(7) )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F7 F6 F5 F4 */
+
+ FXCH( ST(3) ) /* F4 F6 F5 F7 */
+ FADD_S( MAT(12) )
+ FXCH( ST(2) ) /* F5 F6 F4 F7 */
+ FADD_S( MAT(13) )
+ FXCH( ST(1) ) /* F6 F5 F4 F7 */
+ FADD_S( MAT(14) )
+ FXCH( ST(3) ) /* F7 F5 F4 F6 */
+ FADD_S( MAT(15) )
+
+ FXCH( ST(2) ) /* F4 F5 F7 F6 */
+ FSTP_S( DST(0) ) /* F5 F7 F6 */
+ FSTP_S( DST(1) ) /* F7 F6 */
+ FXCH( ST(1) ) /* F6 F7 */
+ FSTP_S( DST(2) ) /* F7 */
+ FSTP_S( DST(3) ) /* */
+
+LLBL( x86_p2_gr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_gr_loop ) )
+
+LLBL( x86_p2_gr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_perspective )
+GLNAME( _mesa_x86_transform_points2_perspective ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_pr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ MOV_L( MAT(14), EBX )
+
+ALIGNTEXT16
+LLBL( x86_p2_pr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F1 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FSTP_S( DST(0) ) /* F1 */
+ FSTP_S( DST(1) ) /* */
+ MOV_L( EBX, DST(2) )
+ MOV_L( CONST(FP_ZERO), DST(3) )
+
+LLBL( x86_p2_pr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_pr_loop ) )
+
+LLBL( x86_p2_pr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_3d )
+GLNAME( _mesa_x86_transform_points2_3d ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_3dr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p2_3dr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+ FLD_S( SRC(0) ) /* F6 F5 F4 */
+ FMUL_S( MAT(2) )
+
+ FLD_S( SRC(1) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(5) )
+ FLD_S( SRC(1) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(6) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FADD_S( MAT(12) )
+ FXCH( ST(1) ) /* F5 F4 F6 */
+ FADD_S( MAT(13) )
+ FXCH( ST(2) ) /* F6 F4 F5 */
+ FADD_S( MAT(14) )
+
+ FXCH( ST(1) ) /* F4 F6 F5 */
+ FSTP_S( DST(0) ) /* F6 F5 */
+ FXCH( ST(1) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+
+LLBL( x86_p2_3dr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_3dr_loop ) )
+
+LLBL( x86_p2_3dr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_3d_no_rot )
+GLNAME( _mesa_x86_transform_points2_3d_no_rot ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_3dnrr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ MOV_L( MAT(14), EBX )
+
+ALIGNTEXT16
+LLBL( x86_p2_3dnrr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F1 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FADD_S( MAT(12) )
+ FLD_S( MAT(13) ) /* F5 F4 F1 */
+ FXCH( ST(2) ) /* F1 F4 F5 */
+ FADDP( ST(0), ST(2) ) /* F4 F5 */
+
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+ MOV_L( EBX, DST(2) )
+
+LLBL( x86_p2_3dnrr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_3dnrr_loop ) )
+
+LLBL( x86_p2_3dnrr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_2d )
+GLNAME( _mesa_x86_transform_points2_2d ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_2dr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p2_2dr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+
+ FLD_S( SRC(1) ) /* F0 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F1 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F5 F4 */
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FADD_S( MAT(12) )
+ FXCH( ST(1) ) /* F5 F4 */
+ FADD_S( MAT(13) )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+
+LLBL( x86_p2_2dr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_2dr_loop ) )
+
+LLBL( x86_p2_2dr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT4
+GLOBL GLNAME( _mesa_x86_transform_points2_2d_no_rot )
+GLNAME( _mesa_x86_transform_points2_2d_no_rot ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_2dnrr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p2_2dnrr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F1 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FADD_S( MAT(12) )
+ FLD_S( MAT(13) ) /* F5 F4 F1 */
+ FXCH( ST(2) ) /* F1 F4 F5 */
+ FADDP( ST(0), ST(2) ) /* F4 F5 */
+
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+
+LLBL( x86_p2_2dnrr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_2dnrr_loop ) )
+
+LLBL( x86_p2_2dnrr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points2_identity )
+GLNAME( _mesa_x86_transform_points2_identity ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p2_ir_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_2), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(2), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ CMP_L( ESI, EDI )
+ JE( LLBL( x86_p2_ir_done ) )
+
+ALIGNTEXT16
+LLBL( x86_p2_ir_loop ):
+
+ MOV_L( SRC(0), EBX )
+ MOV_L( SRC(1), EDX )
+
+ MOV_L( EBX, DST(0) )
+ MOV_L( EDX, DST(1) )
+
+LLBL( x86_p2_ir_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p2_ir_loop ) )
+
+LLBL( x86_p2_ir_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
diff --git a/src/mesa/x86/x86_xform3.S b/src/mesa/x86/x86_xform3.S
new file mode 100644
index 00000000000..1f798744e81
--- /dev/null
+++ b/src/mesa/x86/x86_xform3.S
@@ -0,0 +1,606 @@
+/* $Id: x86_xform3.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FP_ONE 1065353216
+#define FP_ZERO 0
+
+#define SRC(i) REGOFF(i * 4, ESI)
+#define DST(i) REGOFF(i * 4, EDI)
+#define MAT(i) REGOFF(i * 4, EDX)
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_general )
+GLNAME( _mesa_x86_transform_points3_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_gr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p3_gr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+ FLD_S( SRC(0) ) /* F6 F5 F4 */
+ FMUL_S( MAT(2) )
+ FLD_S( SRC(0) ) /* F7 F6 F5 F4 */
+ FMUL_S( MAT(3) )
+
+ FLD_S( SRC(1) ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(5) )
+ FLD_S( SRC(1) ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(6) )
+ FLD_S( SRC(1) ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(7) )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F7 F6 F5 F4 */
+
+ FLD_S( SRC(2) ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(8) )
+ FLD_S( SRC(2) ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(9) )
+ FLD_S( SRC(2) ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(10) )
+ FLD_S( SRC(2) ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(11) )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F7 F6 F5 F4 */
+
+ FXCH( ST(3) ) /* F4 F6 F5 F7 */
+ FADD_S( MAT(12) )
+ FXCH( ST(2) ) /* F5 F6 F4 F7 */
+ FADD_S( MAT(13) )
+ FXCH( ST(1) ) /* F6 F5 F4 F7 */
+ FADD_S( MAT(14) )
+ FXCH( ST(3) ) /* F7 F5 F4 F6 */
+ FADD_S( MAT(15) )
+
+ FXCH( ST(2) ) /* F4 F5 F7 F6 */
+ FSTP_S( DST(0) ) /* F5 F7 F6 */
+ FSTP_S( DST(1) ) /* F7 F6 */
+ FXCH( ST(1) ) /* F6 F7 */
+ FSTP_S( DST(2) ) /* F7 */
+ FSTP_S( DST(3) ) /* */
+
+LLBL( x86_p3_gr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_gr_loop ) )
+
+LLBL( x86_p3_gr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_perspective )
+GLNAME( _mesa_x86_transform_points3_perspective ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_pr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p3_pr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FLD_S( SRC(2) ) /* F0 F5 F4 */
+ FMUL_S( MAT(8) )
+ FLD_S( SRC(2) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(9) )
+ FLD_S( SRC(2) ) /* F2 F1 F0 F5 F4 */
+ FMUL_S( MAT(10) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F5 F4 */
+ FADDP( ST(0), ST(4) ) /* F1 F2 F5 F4 */
+ FADDP( ST(0), ST(2) ) /* F2 F5 F4 */
+ FLD_S( MAT(14) ) /* F6 F2 F5 F4 */
+ FXCH( ST(1) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC(2), EBX )
+ XOR_L( CONST(-2147483648), EBX )/* change sign */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST(0) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+ MOV_L( EBX, DST(3) )
+
+LLBL( x86_p3_pr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_pr_loop ) )
+
+LLBL( x86_p3_pr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_3d )
+GLNAME( _mesa_x86_transform_points3_3d ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_3dr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p3_3dr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+ FLD_S( SRC(0) ) /* F6 F5 F4 */
+ FMUL_S( MAT(2) )
+
+ FLD_S( SRC(1) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(5) )
+ FLD_S( SRC(1) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(6) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ FLD_S( SRC(2) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(8) )
+ FLD_S( SRC(2) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(9) )
+ FLD_S( SRC(2) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(10) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FADD_S( MAT(12) )
+ FXCH( ST(1) ) /* F5 F4 F6 */
+ FADD_S( MAT(13) )
+ FXCH( ST(2) ) /* F6 F4 F5 */
+ FADD_S( MAT(14) )
+
+ FXCH( ST(1) ) /* F4 F6 F5 */
+ FSTP_S( DST(0) ) /* F6 F5 */
+ FXCH( ST(1) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+
+LLBL( x86_p3_3dr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_3dr_loop ) )
+
+LLBL( x86_p3_3dr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_3d_no_rot )
+GLNAME( _mesa_x86_transform_points3_3d_no_rot ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_3dnrr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p3_3dnrr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F1 F4 */
+ FMUL_S( MAT(5) )
+
+ FLD_S( SRC(2) ) /* F2 F1 F4 */
+ FMUL_S( MAT(10) )
+
+ FXCH( ST(2) ) /* F4 F1 F2 */
+ FADD_S( MAT(12) )
+ FLD_S( MAT(13) ) /* F5 F4 F1 F2 */
+ FXCH( ST(2) ) /* F1 F4 F5 F2 */
+ FADDP( ST(0), ST(2) ) /* F4 F5 F2 */
+ FLD_S( MAT(14) ) /* F6 F4 F5 F2 */
+ FXCH( ST(3) ) /* F2 F4 F5 F6 */
+ FADDP( ST(0), ST(3) ) /* F4 F5 F6 */
+
+ FSTP_S( DST(0) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+
+LLBL( x86_p3_3dnrr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_3dnrr_loop ) )
+
+LLBL( x86_p3_3dnrr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_2d )
+GLNAME( _mesa_x86_transform_points3_2d ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_2dr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p3_2dr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+
+ FLD_S( SRC(1) ) /* F0 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F1 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F5 F4 */
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FADD_S( MAT(12) )
+ FXCH( ST(1) ) /* F5 F4 */
+ FADD_S( MAT(13) )
+
+ MOV_L( SRC(2), EBX )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+ MOV_L( EBX, DST(2) )
+
+LLBL( x86_p3_2dr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_2dr_loop ) )
+
+LLBL( x86_p3_2dr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_2d_no_rot )
+GLNAME( _mesa_x86_transform_points3_2d_no_rot ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_2dnrr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p3_2dnrr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F1 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F4 F1 */
+ FADD_S( MAT(12) )
+ FLD_S( MAT(13) ) /* F5 F4 F1 */
+
+ FXCH( ST(2) ) /* F1 F4 F5 */
+ FADDP( ST(0), ST(2) ) /* F4 F5 */
+
+ MOV_L( SRC(2), EBX )
+
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+ MOV_L( EBX, DST(2) )
+
+LLBL( x86_p3_2dnrr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_2dnrr_loop ) )
+
+LLBL( x86_p3_2dnrr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points3_identity )
+GLNAME(_mesa_x86_transform_points3_identity ):
+
+#define FRAME_OFFSET 16
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+ PUSH_L( EBP )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p3_ir_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_3), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(3), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ CMP_L( ESI, EDI )
+ JE( LLBL( x86_p3_ir_done ) )
+
+ALIGNTEXT16
+LLBL( x86_p3_ir_loop ):
+
+#if 1
+ MOV_L( SRC(0), EBX )
+ MOV_L( SRC(1), EBP )
+ MOV_L( SRC(2), EDX )
+
+ MOV_L( EBX, DST(0) )
+ MOV_L( EBP, DST(1) )
+ MOV_L( EDX, DST(2) )
+#else
+ FLD_S( SRC(0) )
+ FLD_S( SRC(1) )
+ FLD_S( SRC(2) )
+
+ FSTP_S( DST(2) )
+ FSTP_S( DST(1) )
+ FSTP_S( DST(0) )
+#endif
+
+LLBL( x86_p3_ir_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p3_ir_loop ) )
+
+LLBL( x86_p3_ir_done ):
+
+ POP_L( EBP )
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
diff --git a/src/mesa/x86/x86_xform4.S b/src/mesa/x86/x86_xform4.S
new file mode 100644
index 00000000000..1b6a8d8aec0
--- /dev/null
+++ b/src/mesa/x86/x86_xform4.S
@@ -0,0 +1,639 @@
+/* $Id: x86_xform4.S,v 1.1 2001/03/29 06:46:27 gareth Exp $ */
+
+/*
+ * Mesa 3-D graphics library
+ * Version: 3.5
+ *
+ * Copyright (C) 1999-2001 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "matypes.h"
+#include "xform_args.h"
+
+ SEG_TEXT
+
+#define FP_ONE 1065353216
+#define FP_ZERO 0
+
+#define SRC(i) REGOFF(i * 4, ESI)
+#define DST(i) REGOFF(i * 4, EDI)
+#define MAT(i) REGOFF(i * 4, EDX)
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_general )
+GLNAME( _mesa_x86_transform_points4_general ):
+
+#define FRAME_OFFSET 8
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_gr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p4_gr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+ FLD_S( SRC(0) ) /* F6 F5 F4 */
+ FMUL_S( MAT(2) )
+ FLD_S( SRC(0) ) /* F7 F6 F5 F4 */
+ FMUL_S( MAT(3) )
+
+ FLD_S( SRC(1) ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(5) )
+ FLD_S( SRC(1) ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(6) )
+ FLD_S( SRC(1) ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(7) )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F7 F6 F5 F4 */
+
+ FLD_S( SRC(2) ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(8) )
+ FLD_S( SRC(2) ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(9) )
+ FLD_S( SRC(2) ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(10) )
+ FLD_S( SRC(2) ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(11) )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F7 F6 F5 F4 */
+
+ FLD_S( SRC(3) ) /* F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(12) )
+ FLD_S( SRC(3) ) /* F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(13) )
+ FLD_S( SRC(3) ) /* F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(14) )
+ FLD_S( SRC(3) ) /* F3 F2 F1 F0 F7 F6 F5 F4 */
+ FMUL_S( MAT(15) )
+
+ FXCH( ST(3) ) /* F0 F2 F1 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(7) ) /* F2 F1 F3 F7 F6 F5 F4 */
+ FXCH( ST(1) ) /* F1 F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F2 F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F3 F7 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F7 F6 F5 F4 */
+
+ FXCH( ST(3) ) /* F4 F6 F5 F7 */
+ FSTP_S( DST(0) ) /* F6 F5 F7 */
+ FXCH( ST(1) ) /* F5 F6 F7 */
+ FSTP_S( DST(1) ) /* F6 F7 */
+ FSTP_S( DST(2) ) /* F7 */
+ FSTP_S( DST(3) ) /* */
+
+LLBL( x86_p4_gr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_gr_loop ) )
+
+LLBL( x86_p4_gr_done ):
+
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_perspective )
+GLNAME( _mesa_x86_transform_points4_perspective ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_pr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p4_pr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FLD_S( SRC(2) ) /* F0 F5 F4 */
+ FMUL_S( MAT(8) )
+ FLD_S( SRC(2) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(9) )
+ FLD_S( SRC(2) ) /* F6 F1 F0 F5 F4 */
+ FMUL_S( MAT(10) )
+
+ FXCH( ST(2) ) /* F0 F1 F6 F5 F4 */
+ FADDP( ST(0), ST(4) ) /* F1 F6 F5 F4 */
+ FADDP( ST(0), ST(2) ) /* F6 F5 F4 */
+
+ FLD_S( SRC(3) ) /* F2 F6 F5 F4 */
+ FMUL_S( MAT(14) )
+
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC(2), EBX )
+ XOR_L( CONST(-2147483648), EBX )/* change sign */
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST(0) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+ MOV_L( EBX, DST(3) )
+
+LLBL( x86_p4_pr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_pr_loop ) )
+
+LLBL( x86_p4_pr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_3d )
+GLNAME( _mesa_x86_transform_points4_3d ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_3dr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p4_3dr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+ FLD_S( SRC(0) ) /* F6 F5 F4 */
+ FMUL_S( MAT(2) )
+
+ FLD_S( SRC(1) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(5) )
+ FLD_S( SRC(1) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(6) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ FLD_S( SRC(2) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(8) )
+ FLD_S( SRC(2) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(9) )
+ FLD_S( SRC(2) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(10) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ FLD_S( SRC(3) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(12) )
+ FLD_S( SRC(3) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(13) )
+ FLD_S( SRC(3) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(14) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC(3), EBX )
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST(0) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+ MOV_L( EBX, DST(3) )
+
+LLBL( x86_p4_3dr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_3dr_loop ) )
+
+LLBL( x86_p4_3dr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME(_mesa_x86_transform_points4_3d_no_rot)
+GLNAME(_mesa_x86_transform_points4_3d_no_rot):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_3dnrr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p4_3dnrr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FLD_S( SRC(2) ) /* F6 F5 F4 */
+ FMUL_S( MAT(10) )
+
+ FLD_S( SRC(3) ) /* F0 F6 F5 F4 */
+ FMUL_S( MAT(12) )
+ FLD_S( SRC(3) ) /* F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(13) )
+ FLD_S( SRC(3) ) /* F2 F1 F0 F6 F5 F4 */
+ FMUL_S( MAT(14) )
+
+ FXCH( ST(2) ) /* F0 F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(5) ) /* F1 F2 F6 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F2 F6 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F6 F5 F4 */
+
+ MOV_L( SRC(3), EBX )
+
+ FXCH( ST(2) ) /* F4 F5 F6 */
+ FSTP_S( DST(0) ) /* F5 F6 */
+ FSTP_S( DST(1) ) /* F6 */
+ FSTP_S( DST(2) ) /* */
+ MOV_L( EBX, DST(3) )
+
+LLBL( x86_p4_3dnrr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_3dnrr_loop ) )
+
+LLBL( x86_p4_3dnrr_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_2d )
+GLNAME( _mesa_x86_transform_points4_2d ):
+
+#define FRAME_OFFSET 16
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+ PUSH_L( EBP )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_2dr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p4_2dr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+ FLD_S( SRC(0) ) /* F5 F4 */
+ FMUL_S( MAT(1) )
+
+ FLD_S( SRC(1) ) /* F0 F5 F4 */
+ FMUL_S( MAT(4) )
+ FLD_S( SRC(1) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F1 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F5 F4 */
+
+ FLD_S( SRC(3) ) /* F0 F5 F4 */
+ FMUL_S( MAT(12) )
+ FLD_S( SRC(3) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(13) )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F1 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F5 F4 */
+
+ MOV_L( SRC(2), EBX )
+ MOV_L( SRC(3), EBP )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+ MOV_L( EBX, DST(2) )
+ MOV_L( EBP, DST(3) )
+
+LLBL( x86_p4_2dr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_2dr_loop ) )
+
+LLBL( x86_p4_2dr_done ):
+
+ POP_L( EBP )
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_2d_no_rot )
+GLNAME( _mesa_x86_transform_points4_2d_no_rot ):
+
+#define FRAME_OFFSET 16
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+ PUSH_L( EBP )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_2dnrr_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ALIGNTEXT16
+LLBL( x86_p4_2dnrr_loop ):
+
+ FLD_S( SRC(0) ) /* F4 */
+ FMUL_S( MAT(0) )
+
+ FLD_S( SRC(1) ) /* F5 F4 */
+ FMUL_S( MAT(5) )
+
+ FLD_S( SRC(3) ) /* F0 F5 F4 */
+ FMUL_S( MAT(12) )
+ FLD_S( SRC(3) ) /* F1 F0 F5 F4 */
+ FMUL_S( MAT(13) )
+
+ FXCH( ST(1) ) /* F0 F1 F5 F4 */
+ FADDP( ST(0), ST(3) ) /* F1 F5 F4 */
+ FADDP( ST(0), ST(1) ) /* F5 F4 */
+
+ MOV_L( SRC(2), EBX )
+ MOV_L( SRC(3), EBP )
+
+ FXCH( ST(1) ) /* F4 F5 */
+ FSTP_S( DST(0) ) /* F5 */
+ FSTP_S( DST(1) ) /* */
+ MOV_L( EBX, DST(2) )
+ MOV_L( EBP, DST(3) )
+
+LLBL( x86_p4_2dnrr_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_2dnrr_loop ) )
+
+LLBL( x86_p4_2dnrr_done ):
+
+ POP_L( EBP )
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET
+#undef FRAME_OFFSET
+
+
+
+
+ALIGNTEXT16
+GLOBL GLNAME( _mesa_x86_transform_points4_identity )
+GLNAME( _mesa_x86_transform_points4_identity ):
+
+#define FRAME_OFFSET 12
+ PUSH_L( ESI )
+ PUSH_L( EDI )
+ PUSH_L( EBX )
+
+ MOV_L( ARG_SOURCE, ESI )
+ MOV_L( ARG_DEST, EDI )
+
+ MOV_L( ARG_MATRIX, EDX )
+ MOV_L( REGOFF(V4F_COUNT, ESI), ECX )
+
+ TEST_L( ECX, ECX )
+ JZ( LLBL( x86_p4_ir_done ) )
+
+ MOV_L( REGOFF(V4F_STRIDE, ESI), EAX )
+ OR_L( CONST(VEC_SIZE_4), REGOFF(V4F_FLAGS, EDI) )
+
+ MOV_L( ECX, REGOFF(V4F_COUNT, EDI) )
+ MOV_L( CONST(4), REGOFF(V4F_SIZE, EDI) )
+
+ SHL_L( CONST(4), ECX )
+ MOV_L( REGOFF(V4F_START, ESI), ESI )
+
+ MOV_L( REGOFF(V4F_START, EDI), EDI )
+ ADD_L( EDI, ECX )
+
+ CMP_L( ESI, EDI )
+ JE( LLBL( x86_p4_ir_done ) )
+
+ALIGNTEXT16
+LLBL( x86_p4_ir_loop ):
+
+ MOV_L( SRC(0), EBX )
+ MOV_L( SRC(1), EDX )
+
+ MOV_L( EBX, DST(0) )
+ MOV_L( EDX, DST(1) )
+
+ MOV_L( SRC(2), EBX )
+ MOV_L( SRC(3), EDX )
+
+ MOV_L( EBX, DST(2) )
+ MOV_L( EDX, DST(3) )
+
+LLBL( x86_p4_ir_skip ):
+
+ ADD_L( CONST(16), EDI )
+ ADD_L( EAX, ESI )
+ CMP_L( ECX, EDI )
+ JNE( LLBL( x86_p4_ir_loop ) )
+
+LLBL( x86_p4_ir_done ):
+
+ POP_L( EBX )
+ POP_L( EDI )
+ POP_L( ESI )
+ RET