282 files changed, 7734 insertions, 3462 deletions
diff --git a/common.py b/common.py
index 9141699ffec..7a14a2c026d 100644
--- a/common.py
+++ b/common.py
@@ -87,7 +87,7 @@ def AddOptions(opts):
 	opts.Add(EnumOption('machine', 'use machine-specific assembly code', default_machine,
 											 allowed_values=('generic', 'ppc', 'x86', 'x86_64')))
 	opts.Add(EnumOption('platform', 'target platform', default_platform,
-											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince', 'darwin', 'embedded', 'cygwin', 'sunos5')))
+											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince', 'darwin', 'embedded', 'cygwin', 'sunos5', 'freebsd8')))
 	opts.Add('toolchain', 'compiler toolchain', 'default')
 	opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
 	opts.Add(BoolOption('dri', 'build DRI drivers', default_dri))
diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h
index 68591bdeb8c..171892c93ea 100644
--- a/include/EGL/eglext.h
+++ b/include/EGL/eglext.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif
 
 /*
-** Copyright (c) 2007-2009 The Khronos Group Inc.
+** Copyright (c) 2007-2010 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -34,8 +34,8 @@ extern "C" {
 
 /* Header file version number */
 /* Current version at http://www.khronos.org/registry/egl/ */
-/* $Revision: 10185 $ on $Date: 2010-01-22 11:38:01 -0800 (Fri, 22 Jan 2010) $ */
-#define EGL_EGLEXT_VERSION 5
+/* $Revision: 12124 $ on $Date: 2010-07-27 20:12:35 -0700 (Tue, 27 Jul 2010) $ */
+#define EGL_EGLEXT_VERSION 7
 
 #ifndef EGL_KHR_config_attribs
 #define EGL_KHR_config_attribs 1
@@ -120,6 +120,7 @@ typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYIMAGEKHRPROC) (EGLDisplay dpy, EGL
 #define EGL_GL_RENDERBUFFER_KHR			0x30B9	/* eglCreateImageKHR target */
 #endif
 
+#if KHRONOS_SUPPORT_INT64   /* EGLTimeKHR requires 64-bit uint support */
 #ifndef EGL_KHR_reusable_sync
 #define EGL_KHR_reusable_sync 1
 
@@ -149,6 +150,7 @@ typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCKHRPROC) (EGLDisplay dpy, EGLSy
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLSIGNALSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode);
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value);
 #endif
+#endif
 
 /* EGL_MESA_screen extension  >>> PRELIMINARY <<< */
 #ifndef EGL_MESA_screen_surface
@@ -238,6 +240,101 @@ typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETDRMDISPLAYMESA) (int fd);
 #define EGL_CONTEXT_PRIORITY_LOW_IMG		0x3103
 #endif
 
+#ifndef EGL_KHR_lock_surface2
+#define EGL_KHR_lock_surface2 1
+#define EGL_BITMAP_PIXEL_SIZE_KHR		0x3110
+#endif
+
+#ifndef EGL_NV_coverage_sample
+#define EGL_NV_coverage_sample 1
+#define EGL_COVERAGE_BUFFERS_NV 0x30E0
+#define EGL_COVERAGE_SAMPLES_NV 0x30E1
+#endif
+
+#ifndef EGL_NV_depth_nonlinear
+#define EGL_NV_depth_nonlinear 1
+#define EGL_DEPTH_ENCODING_NV 0x30E2
+#define EGL_DEPTH_ENCODING_NONE_NV 0
+#define EGL_DEPTH_ENCODING_NONLINEAR_NV 0x30E3
+#endif
+
+#if KHRONOS_SUPPORT_INT64   /* EGLTimeNV requires 64-bit uint support */
+#ifndef EGL_NV_sync
+#define EGL_NV_sync 1
+#define EGL_SYNC_PRIOR_COMMANDS_COMPLETE_NV	0x30E6
+#define EGL_SYNC_STATUS_NV			0x30E7
+#define EGL_SIGNALED_NV				0x30E8
+#define EGL_UNSIGNALED_NV			0x30E9
+#define EGL_SYNC_FLUSH_COMMANDS_BIT_NV		0x0001
+#define EGL_FOREVER_NV				0xFFFFFFFFFFFFFFFFull
+#define EGL_ALREADY_SIGNALED_NV			0x30EA
+#define EGL_TIMEOUT_EXPIRED_NV			0x30EB
+#define EGL_CONDITION_SATISFIED_NV		0x30EC
+#define EGL_SYNC_TYPE_NV			0x30ED
+#define EGL_SYNC_CONDITION_NV			0x30EE
+#define EGL_SYNC_FENCE_NV			0x30EF
+#define EGL_NO_SYNC_NV				((EGLSyncNV)0)
+typedef void* EGLSyncNV;
+typedef khronos_utime_nanoseconds_t EGLTimeNV;
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLSyncNV eglCreateFenceSyncNV (EGLDisplay dpy, EGLenum condition, const EGLint *attrib_list);
+EGLBoolean eglDestroySyncNV (EGLSyncNV sync);
+EGLBoolean eglFenceNV (EGLSyncNV sync);
+EGLint eglClientWaitSyncNV (EGLSyncNV sync, EGLint flags, EGLTimeNV timeout);
+EGLBoolean eglSignalSyncNV (EGLSyncNV sync, EGLenum mode);
+EGLBoolean eglGetSyncAttribNV (EGLSyncNV sync, EGLint attribute, EGLint *value);
+#endif /* EGL_EGLEXT_PROTOTYPES */
+typedef EGLSyncNV (EGLAPIENTRYP PFNEGLCREATEFENCESYNCNVPROC) (EGLDisplay dpy, EGLenum condition, const EGLint *attrib_list);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSYNCNVPROC) (EGLSyncNV sync);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLFENCENVPROC) (EGLSyncNV sync);
+typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCNVPROC) (EGLSyncNV sync, EGLint flags, EGLTimeNV timeout);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLSIGNALSYNCNVPROC) (EGLSyncNV sync, EGLenum mode);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBNVPROC) (EGLSyncNV sync, EGLint attribute, EGLint *value);
+#endif
+#endif
+
+#if KHRONOS_SUPPORT_INT64   /* Dependent on EGL_KHR_reusable_sync which requires 64-bit uint support */
+#ifndef EGL_KHR_fence_sync
+#define EGL_KHR_fence_sync 1
+/* Reuses most tokens and entry points from EGL_KHR_reusable_sync */
+#define EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR	0x30F0
+#define EGL_SYNC_CONDITION_KHR			0x30F8
+#define EGL_SYNC_FENCE_KHR			0x30F9
+#endif
+#endif
+
+#ifndef EGL_HI_clientpixmap
+#define EGL_HI_clientpixmap 1
+
+/* Surface Attribute */
+#define EGL_CLIENT_PIXMAP_POINTER_HI		0x8F74
+/*
+ * Structure representing a client pixmap
+ * (pixmap's data is in client-space memory).
+ */
+struct EGLClientPixmapHI
+{
+	void*		pData;		/* presumably the client-memory pixel data -- confirm against the EGL_HI_clientpixmap spec */
+	EGLint		iWidth;		/* presumably the pixmap width -- units not stated here */
+	EGLint		iHeight;	/* presumably the pixmap height -- units not stated here */
+	EGLint		iStride;	/* NOTE(review): stride units (bytes vs. pixels) are not stated here -- confirm */
+};
+
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurfaceHI(EGLDisplay dpy, EGLConfig config, struct EGLClientPixmapHI* pixmap);
+#endif /* EGL_EGLEXT_PROTOTYPES */
+typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATEPIXMAPSURFACEHIPROC) (EGLDisplay dpy, EGLConfig config, struct EGLClientPixmapHI* pixmap);
+#endif	/* EGL_HI_clientpixmap */
+
+#ifndef EGL_HI_colorformats
+#define EGL_HI_colorformats 1
+/* Config Attribute */
+#define EGL_COLOR_FORMAT_HI			0x8F70
+/* Color Formats */
+#define EGL_COLOR_RGB_HI			0x8F71
+#define EGL_COLOR_RGBA_HI			0x8F72
+#define EGL_COLOR_ARGB_HI			0x8F73
+#endif /* EGL_HI_colorformats */
 
 #ifndef EGL_NOK_swap_region
 #define EGL_NOK_swap_region 1
diff --git a/src/egl/main/Makefile b/src/egl/main/Makefile
index 41d301fc140..d92fbf6d9a7 100644
--- a/src/egl/main/Makefile
+++ b/src/egl/main/Makefile
@@ -26,7 +26,8 @@ HEADERS = \
 	eglmutex.h \
 	eglscreen.h \
 	eglstring.h \
-	eglsurface.h
+	eglsurface.h \
+	eglsync.h
 
 SOURCES = \
 	eglapi.c \
@@ -44,7 +45,8 @@ SOURCES = \
 	eglmode.c \
 	eglscreen.c \
 	eglstring.c \
-	eglsurface.c
+	eglsurface.c \
+	eglsync.c
 
 OBJECTS = $(SOURCES:.c=.o)
 
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 4dc8707cfbc..53a5f6ed223 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -68,6 +68,7 @@
 #include "eglscreen.h"
 #include "eglmode.h"
 #include "eglimage.h"
+#include "eglsync.h"
 
 
 /**
@@ -126,6 +127,8 @@
 #define _EGL_CHECK_MODE(disp, m, ret, drv) \
    _EGL_CHECK_OBJECT(disp, Mode, m, ret, drv)
 
+#define _EGL_CHECK_SYNC(disp, s, ret, drv) \
+   _EGL_CHECK_OBJECT(disp, Sync, s, ret, drv)
 
 
 static INLINE _EGLDriver *
@@ -185,6 +188,26 @@ _eglCheckConfig(_EGLDisplay *disp, _EGLConfig *conf, const char *msg)
 }
 
 
+#ifdef EGL_KHR_reusable_sync
+
+
+static INLINE _EGLDriver * /* returns the driver from _eglCheckDisplay, or NULL on failure */
+_eglCheckSync(_EGLDisplay *disp, _EGLSync *s, const char *msg)
+{
+   _EGLDriver *drv = _eglCheckDisplay(disp, msg); /* display is checked first */
+   if (!drv)
+      return NULL;
+   if (!s) { /* NULL sync: record EGL_BAD_PARAMETER tagged with the caller-supplied msg */
+      _eglError(EGL_BAD_PARAMETER, msg);
+      return NULL;
+   }
+   return drv;
+}
+
+
+#endif /* EGL_KHR_reusable_sync */
+
+
 #ifdef EGL_MESA_screen_surface
 
 
@@ -1245,6 +1268,90 @@ eglDestroyImageKHR(EGLDisplay dpy, EGLImageKHR image)
 #endif /* EGL_KHR_image_base */
 
 
+#ifdef EGL_KHR_reusable_sync
+
+
+EGLSyncKHR EGLAPIENTRY /* entry point: ask the driver for a sync and hand back its handle */
+eglCreateSyncKHR(EGLDisplay dpy, EGLenum type, const EGLint *attrib_list)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLDriver *drv;
+   _EGLSync *sync;
+   EGLSyncKHR ret;
+
+   _EGL_CHECK_DISPLAY(disp, EGL_NO_SYNC_KHR, drv); /* macro defined outside this view; presumably sets drv or returns EGL_NO_SYNC_KHR -- confirm */
+
+   sync = drv->API.CreateSyncKHR(drv, disp, type, attrib_list); /* driver hook; may return NULL */
+   ret = (sync) ? _eglLinkSync(sync, disp) : EGL_NO_SYNC_KHR; /* link to the display to obtain the client handle */
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
+
+EGLBoolean EGLAPIENTRY /* entry point: unlink the sync from its display, then let the driver destroy it */
+eglDestroySyncKHR(EGLDisplay dpy, EGLSyncKHR sync)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLSync *s = _eglLookupSync(sync, disp); /* NULL when the handle is not a sync linked to disp */
+   _EGLDriver *drv;
+   EGLBoolean ret;
+
+   _EGL_CHECK_SYNC(disp, s, EGL_FALSE, drv); /* expands to _EGL_CHECK_OBJECT(..., Sync, ...); presumably returns EGL_FALSE on failure -- confirm */
+   _eglUnlinkSync(s); /* unlinked before the driver hook runs */
+   ret = drv->API.DestroySyncKHR(drv, disp, s);
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
+
+EGLint EGLAPIENTRY /* entry point: forward the wait to the driver's ClientWaitSyncKHR hook */
+eglClientWaitSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLSync *s = _eglLookupSync(sync, disp); /* NULL when the handle is not a sync linked to disp */
+   _EGLDriver *drv;
+   EGLint ret;
+
+   _EGL_CHECK_SYNC(disp, s, EGL_FALSE, drv); /* EGL_FALSE is the failure value even though the return type is EGLint */
+   ret = drv->API.ClientWaitSyncKHR(drv, disp, s, flags, timeout);
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
+
+EGLBoolean EGLAPIENTRY /* entry point: forward the mode change to the driver's SignalSyncKHR hook */
+eglSignalSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLSync *s = _eglLookupSync(sync, disp); /* NULL when the handle is not a sync linked to disp */
+   _EGLDriver *drv;
+   EGLBoolean ret;
+
+   _EGL_CHECK_SYNC(disp, s, EGL_FALSE, drv); /* expands to _EGL_CHECK_OBJECT(..., Sync, ...); presumably returns EGL_FALSE on failure -- confirm */
+   ret = drv->API.SignalSyncKHR(drv, disp, s, mode);
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
+
+EGLBoolean EGLAPIENTRY /* entry point: forward the attribute query to the driver's GetSyncAttribKHR hook */
+eglGetSyncAttribKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value)
+{
+   _EGLDisplay *disp = _eglLockDisplay(dpy);
+   _EGLSync *s = _eglLookupSync(sync, disp); /* NULL when the handle is not a sync linked to disp */
+   _EGLDriver *drv;
+   EGLBoolean ret;
+
+   _EGL_CHECK_SYNC(disp, s, EGL_FALSE, drv); /* expands to _EGL_CHECK_OBJECT(..., Sync, ...); presumably returns EGL_FALSE on failure -- confirm */
+   ret = drv->API.GetSyncAttribKHR(drv, disp, s, attribute, value); /* value is not validated here; it is passed to the hook as-is */
+
+   RETURN_EGL_EVAL(disp, ret);
+}
+
+
+#endif /* EGL_KHR_reusable_sync */
+
+
 #ifdef EGL_NOK_swap_region
 
 EGLBoolean EGLAPIENTRY
diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h
index d8c8b49a49d..5045a9a272f 100644
--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -76,6 +76,16 @@ typedef _EGLImage *(*CreateImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLCo
 typedef EGLBoolean (*DestroyImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *image);
 #endif /* EGL_KHR_image_base */
 
+
+#ifdef EGL_KHR_reusable_sync
+typedef _EGLSync *(*CreateSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, const EGLint *attrib_list);
+typedef EGLBoolean (*DestroySyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync);
+typedef EGLint (*ClientWaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint flags, EGLTimeKHR timeout);
+typedef EGLBoolean (*SignalSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLenum mode);
+typedef EGLBoolean (*GetSyncAttribKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLint *value);
+#endif /* EGL_KHR_reusable_sync */
+
+
 #ifdef EGL_NOK_swap_region
 typedef EGLBoolean (*SwapBuffersRegionNOK_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf, EGLint numRects, const EGLint *rects);
 #endif
@@ -138,6 +148,14 @@ struct _egl_api
    DestroyImageKHR_t DestroyImageKHR;
 #endif /* EGL_KHR_image_base */
 
+#ifdef EGL_KHR_reusable_sync
+   CreateSyncKHR_t CreateSyncKHR;
+   DestroySyncKHR_t DestroySyncKHR;
+   ClientWaitSyncKHR_t ClientWaitSyncKHR;
+   SignalSyncKHR_t SignalSyncKHR;
+   GetSyncAttribKHR_t GetSyncAttribKHR;
+#endif /* EGL_KHR_reusable_sync */
+
 #ifdef EGL_NOK_swap_region
    SwapBuffersRegionNOK_t SwapBuffersRegionNOK;
 #endif
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index a2cee08bf6f..97c9d196ec4 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -24,6 +24,7 @@ enum _egl_resource_type {
    _EGL_RESOURCE_CONTEXT,
    _EGL_RESOURCE_SURFACE,
    _EGL_RESOURCE_IMAGE,
+   _EGL_RESOURCE_SYNC,
 
    _EGL_NUM_RESOURCES
 };
@@ -53,6 +54,7 @@ struct _egl_extensions
    EGLBoolean MESA_screen_surface;
    EGLBoolean MESA_copy_context;
    EGLBoolean MESA_drm_display;
+
    EGLBoolean KHR_image_base;
    EGLBoolean KHR_image_pixmap;
    EGLBoolean KHR_vg_parent_image;
@@ -60,9 +62,14 @@ struct _egl_extensions
    EGLBoolean KHR_gl_texture_cubemap_image;
    EGLBoolean KHR_gl_texture_3D_image;
    EGLBoolean KHR_gl_renderbuffer_image;
+
+   EGLBoolean KHR_reusable_sync;
+   EGLBoolean KHR_fence_sync;
+
    EGLBoolean KHR_surfaceless_gles1;
    EGLBoolean KHR_surfaceless_gles2;
    EGLBoolean KHR_surfaceless_opengl;
+
    EGLBoolean NOK_swap_region;
    EGLBoolean NOK_texture_from_pixmap;
 
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 8fc9e792b06..67f1d3dbaa4 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -21,6 +21,7 @@
 #include "eglstring.h"
 #include "eglsurface.h"
 #include "eglimage.h"
+#include "eglsync.h"
 #include "eglmutex.h"
 
 #if defined(_EGL_OS_UNIX)
@@ -722,6 +723,14 @@ _eglInitDriverFallbacks(_EGLDriver *drv)
    drv->API.CreateImageKHR = _eglCreateImageKHR;
    drv->API.DestroyImageKHR = _eglDestroyImageKHR;
 #endif /* EGL_KHR_image_base */
+
+#ifdef EGL_KHR_reusable_sync
+   drv->API.CreateSyncKHR = _eglCreateSyncKHR;
+   drv->API.DestroySyncKHR = _eglDestroySyncKHR;
+   drv->API.ClientWaitSyncKHR = _eglClientWaitSyncKHR;
+   drv->API.SignalSyncKHR = _eglSignalSyncKHR;
+   drv->API.GetSyncAttribKHR = _eglGetSyncAttribKHR;
+#endif /* EGL_KHR_reusable_sync */
 }
 
 
diff --git a/src/egl/main/eglmisc.c b/src/egl/main/eglmisc.c
index 985d1e0069d..b10783bcb96 100644
--- a/src/egl/main/eglmisc.c
+++ b/src/egl/main/eglmisc.c
@@ -97,6 +97,9 @@ _eglUpdateExtensionsString(_EGLDisplay *dpy)
    _EGL_CHECK_EXTENSION(KHR_gl_texture_3D_image);
    _EGL_CHECK_EXTENSION(KHR_gl_renderbuffer_image);
 
+   _EGL_CHECK_EXTENSION(KHR_reusable_sync);
+   _EGL_CHECK_EXTENSION(KHR_fence_sync);
+
    _EGL_CHECK_EXTENSION(KHR_surfaceless_gles1);
    _EGL_CHECK_EXTENSION(KHR_surfaceless_gles2);
    _EGL_CHECK_EXTENSION(KHR_surfaceless_opengl);
diff --git a/src/egl/main/eglsync.c b/src/egl/main/eglsync.c
new file mode 100644
index 00000000000..b6c62d0087d
--- /dev/null
+++ b/src/egl/main/eglsync.c
@@ -0,0 +1,128 @@
+#include <string.h>
+
+#include "eglsync.h"
+#include "eglcurrent.h"
+#include "egllog.h"
+
+
+#ifdef EGL_KHR_reusable_sync
+
+
+/**
+ * Parse the list of sync attributes and return the proper error code.  A NULL list is accepted;
+ * no attribute is recognized yet, so any entry before EGL_NONE yields EGL_BAD_ATTRIBUTE. */
+static EGLint
+_eglParseSyncAttribList(_EGLSync *sync, const EGLint *attrib_list) /* sync is currently unused */
+{
+   EGLint i, err = EGL_SUCCESS;
+
+   if (!attrib_list)
+      return EGL_SUCCESS;
+
+   for (i = 0; attrib_list[i] != EGL_NONE; i++) { /* entries are (attribute, value) pairs */
+      EGLint attr = attrib_list[i++]; /* the i++ here plus the loop's i++ step over one pair */
+      EGLint val = attrib_list[i];
+
+      switch (attr) {
+      default: /* the only case: every attribute is rejected */
+         (void) val; /* value is read but not used */
+         err = EGL_BAD_ATTRIBUTE;
+         break;
+      }
+
+      if (err != EGL_SUCCESS) { /* stop at the first bad attribute and log it */
+         _eglLog(_EGL_DEBUG, "bad sync attribute 0x%04x", attr);
+         break;
+      }
+   }
+
+   return err;
+}
+
+
+EGLBoolean /* initialize a base sync; returns EGL_TRUE on success, else the result of _eglError */
+_eglInitSync(_EGLSync *sync, _EGLDisplay *dpy, EGLenum type,
+             const EGLint *attrib_list)
+{
+   EGLint err;
+
+   if (!(type == EGL_SYNC_REUSABLE_KHR && dpy->Extensions.KHR_reusable_sync) && /* type must match an extension enabled on dpy */
+       !(type == EGL_SYNC_FENCE_KHR && dpy->Extensions.KHR_fence_sync))
+      return _eglError(EGL_BAD_ATTRIBUTE, "eglCreateSyncKHR"); /* sync is left untouched on this path */
+
+   memset(sync, 0, sizeof(*sync)); /* clears the whole base struct, including Resource */
+
+   sync->Resource.Display = dpy;
+
+   sync->Type = type;
+   sync->SyncStatus = EGL_UNSIGNALED_KHR; /* initial status */
+   sync->SyncCondition = EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR; /* set for every type; only queryable for fence syncs */
+
+   err = _eglParseSyncAttribList(sync, attrib_list);
+   if (err != EGL_SUCCESS)
+      return _eglError(err, "eglCreateSyncKHR"); /* sync has already been written at this point */
+
+   return EGL_TRUE;
+}
+
+
+_EGLSync * /* fallback installed by _eglInitDriverFallbacks: creates nothing */
+_eglCreateSyncKHR(_EGLDriver *drv, _EGLDisplay *dpy,
+                  EGLenum type, const EGLint *attrib_list)
+{
+   return NULL; /* no error is recorded here; all parameters are ignored */
+}
+
+
+EGLBoolean /* fallback installed by _eglInitDriverFallbacks: does nothing */
+_eglDestroySyncKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync)
+{
+   return EGL_TRUE; /* reports success without freeing or touching sync */
+}
+
+
+EGLint /* fallback installed by _eglInitDriverFallbacks: never waits */
+_eglClientWaitSyncKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                      EGLint flags, EGLTimeKHR timeout)
+{
+   return EGL_FALSE; /* always fails; no error is recorded here */
+}
+
+
+EGLBoolean /* fallback installed by _eglInitDriverFallbacks: never signals */
+_eglSignalSyncKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                  EGLenum mode)
+{
+   return EGL_FALSE; /* always fails; SyncStatus is not modified and no error is recorded here */
+}
+
+
+/** Default GetSyncAttribKHR hook: copy Type, SyncStatus or SyncCondition of the base sync into *value. */
+EGLBoolean
+_eglGetSyncAttribKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                     EGLint attribute, EGLint *value)
+{
+   if (!value) /* report the failure under this entry point's own name */
+      return _eglError(EGL_BAD_PARAMETER, "eglGetSyncAttribKHR");
+
+   switch (attribute) {
+   case EGL_SYNC_TYPE_KHR:
+      *value = sync->Type;
+      break;
+   case EGL_SYNC_STATUS_KHR:
+      *value = sync->SyncStatus;
+      break;
+   case EGL_SYNC_CONDITION_KHR:
+      if (sync->Type != EGL_SYNC_FENCE_KHR) /* the condition is only queryable on fence syncs */
+         return _eglError(EGL_BAD_ATTRIBUTE, "eglGetSyncAttribKHR");
+      *value = sync->SyncCondition;
+      break;
+   default: /* unknown attribute; *value is left unmodified */
+      return _eglError(EGL_BAD_ATTRIBUTE, "eglGetSyncAttribKHR");
+   }
+
+   return EGL_TRUE;
+}
+
+
+#endif /* EGL_KHR_reusable_sync */
diff --git a/src/egl/main/eglsync.h b/src/egl/main/eglsync.h
new file mode 100644
index 00000000000..25c467175e9
--- /dev/null
+++ b/src/egl/main/eglsync.h
@@ -0,0 +1,120 @@
+#ifndef EGLSYNC_INCLUDED
+#define EGLSYNC_INCLUDED
+
+
+#include "egltypedefs.h"
+#include "egldisplay.h"
+
+
+#ifdef EGL_KHR_reusable_sync
+
+
+/**
+ * "Base" class for device driver syncs.
+ */
+struct _egl_sync
+{
+   /* A sync is a display resource; kept as the first member because helpers cast _EGLSync * to _EGLResource * */
+   _EGLResource Resource;
+
+   EGLenum Type;          /* sync type given at creation (EGL_SYNC_REUSABLE_KHR or EGL_SYNC_FENCE_KHR) */
+   EGLenum SyncStatus;    /* _eglInitSync sets this to EGL_UNSIGNALED_KHR */
+   EGLenum SyncCondition; /* _eglInitSync sets this to EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR */
+};
+
+
+PUBLIC EGLBoolean
+_eglInitSync(_EGLSync *sync, _EGLDisplay *dpy, EGLenum type,
+             const EGLint *attrib_list);
+
+
+extern _EGLSync *
+_eglCreateSyncKHR(_EGLDriver *drv, _EGLDisplay *dpy,
+                  EGLenum type, const EGLint *attrib_list);
+
+
+extern EGLBoolean
+_eglDestroySyncKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync);
+
+
+extern EGLint
+_eglClientWaitSyncKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                      EGLint flags, EGLTimeKHR timeout);
+
+
+extern EGLBoolean
+_eglSignalSyncKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                  EGLenum mode);
+
+
+extern EGLBoolean
+_eglGetSyncAttribKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                     EGLint attribute, EGLint *value);
+
+
+/**
+ * Link a sync to a display and return the handle of the link.
+ * The handle can be passed to client directly.
+ */
+static INLINE EGLSyncKHR
+_eglLinkSync(_EGLSync *sync, _EGLDisplay *dpy)
+{
+   _eglLinkResource(&sync->Resource, _EGL_RESOURCE_SYNC, dpy); /* registers the sync under the display's SYNC resource type */
+   return (EGLSyncKHR) sync; /* the handle is the sync pointer itself */
+}
+
+
+/**
+ * Unlink a linked sync from its display.  The sync itself is not freed here.
+ */
+static INLINE void
+_eglUnlinkSync(_EGLSync *sync)
+{
+   _eglUnlinkResource(&sync->Resource, _EGL_RESOURCE_SYNC); /* sync must be non-NULL: it is dereferenced here */
+}
+
+
+/**
+ * Lookup a handle to find the linked sync.
+ * Return NULL if the handle has no corresponding linked sync (or if dpy is NULL).
+ */
+static INLINE _EGLSync *
+_eglLookupSync(EGLSyncKHR handle, _EGLDisplay *dpy)
+{
+   _EGLSync *sync = (_EGLSync *) handle; /* handles are sync pointers, see _eglLinkSync */
+   if (!dpy || !_eglCheckResource((void *) sync, _EGL_RESOURCE_SYNC, dpy)) /* the pointer is checked against dpy before being trusted */
+      sync = NULL;
+   return sync;
+}
+
+
+/**
+ * Return the handle of a linked sync, or EGL_NO_SYNC_KHR (also for a NULL or unlinked sync).
+ */
+static INLINE EGLSyncKHR
+_eglGetSyncHandle(_EGLSync *sync)
+{
+   _EGLResource *res = (_EGLResource *) sync; /* valid because Resource is the first member of struct _egl_sync */
+   return (res && _eglIsResourceLinked(res)) ?
+      (EGLSyncKHR) sync : EGL_NO_SYNC_KHR;
+}
+
+
+/**
+ * Return true if the sync is linked to a display (false for a NULL sync).
+ *
+ * The link is considered a reference to the sync (the display is owning the
+ * sync).  Drivers should not destroy a sync when it is linked.
+ */
+static INLINE EGLBoolean
+_eglIsSyncLinked(_EGLSync *sync)
+{
+   _EGLResource *res = (_EGLResource *) sync; /* valid because Resource is the first member of struct _egl_sync */
+   return (res && _eglIsResourceLinked(res));
+}
+
+
+#endif /* EGL_KHR_reusable_sync */
+
+
+#endif /* EGLSYNC_INCLUDED */
diff --git a/src/egl/main/egltypedefs.h b/src/egl/main/egltypedefs.h
index 0e29e9aa47e..b65f3b72ae5 100644
--- a/src/egl/main/egltypedefs.h
+++ b/src/egl/main/egltypedefs.h
@@ -32,6 +32,8 @@ typedef struct _egl_screen _EGLScreen;
 
 typedef struct _egl_surface _EGLSurface;
 
+typedef struct _egl_sync _EGLSync;
+
 typedef struct _egl_thread_info _EGLThreadInfo;
 
 #endif /* EGLTYPEDEFS_INCLUDED */
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
index 843b72bc38b..eb2a40cbaa3 100644
--- a/src/gallium/auxiliary/Makefile
+++ b/src/gallium/auxiliary/Makefile
@@ -26,7 +26,6 @@ C_SOURCES = \
 	draw/draw_pipe_wide_line.c \
 	draw/draw_pipe_wide_point.c \
 	draw/draw_pt.c \
-	draw/draw_pt_elts.c \
 	draw/draw_pt_emit.c \
 	draw/draw_pt_fetch.c \
 	draw/draw_pt_fetch_emit.c \
@@ -35,8 +34,7 @@ C_SOURCES = \
 	draw/draw_pt_post_vs.c \
 	draw/draw_pt_so_emit.c \
 	draw/draw_pt_util.c \
-	draw/draw_pt_varray.c \
-	draw/draw_pt_vcache.c \
+	draw/draw_pt_vsplit.c \
 	draw/draw_vertex.c \
 	draw/draw_vs.c \
 	draw/draw_vs_varient.c \
@@ -131,6 +129,7 @@ C_SOURCES = \
 	util/u_sampler.c \
 	util/u_simple_shaders.c \
 	util/u_snprintf.c \
+	util/u_staging.c \
 	util/u_surface.c \
 	util/u_surfaces.c \
 	util/u_texture.c \
diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript
index 1f091987218..30e5d02c9bb 100644
--- a/src/gallium/auxiliary/SConscript
+++ b/src/gallium/auxiliary/SConscript
@@ -71,7 +71,6 @@ source = [
     'draw/draw_pipe_wide_line.c',
     'draw/draw_pipe_wide_point.c',
     'draw/draw_pt.c',
-    'draw/draw_pt_elts.c',
     'draw/draw_pt_emit.c',
     'draw/draw_pt_fetch.c',
     'draw/draw_pt_fetch_emit.c',
@@ -80,8 +79,7 @@ source = [
     'draw/draw_pt_post_vs.c',
     'draw/draw_pt_so_emit.c',
     'draw/draw_pt_util.c',
-    'draw/draw_pt_varray.c',
-    'draw/draw_pt_vcache.c',
+    'draw/draw_pt_vsplit.c',
     'draw/draw_vertex.c',
     'draw/draw_vs.c',
     'draw/draw_vs_aos.c',
@@ -180,6 +178,7 @@ source = [
     'util/u_sampler.c',
     'util/u_simple_shaders.c',
     'util/u_snprintf.c',
+    'util/u_staging.c',
     'util/u_surface.c',
     'util/u_surfaces.c',
     'util/u_texture.c',
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 995b675b9a1..d118a8db52d 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -34,6 +34,7 @@
 #include "pipe/p_context.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 #include "draw_context.h"
 #include "draw_vs.h"
 #include "draw_gs.h"
@@ -41,6 +42,25 @@
 #if HAVE_LLVM
 #include "gallivm/lp_bld_init.h"
 #include "draw_llvm.h"
+
+static boolean /* whether draw should use LLVM; computed on the first call and cached */
+draw_get_option_use_llvm(void)
+{
+   static boolean first = TRUE; /* NOTE(review): these statics are not synchronized -- confirm callers are serialized */
+   static boolean value;
+   if (first) {
+      first = FALSE;
+      value = debug_get_bool_option("DRAW_USE_LLVM", TRUE); /* DRAW_USE_LLVM option, TRUE by default */
+
+#ifdef PIPE_ARCH_X86
+      util_cpu_detect();
+      /* require SSE2 due to LLVM PR6960. */
+      if (!util_cpu_caps.has_sse2)
+         value = FALSE; /* overrides the option: no SSE2 means no LLVM */
+#endif
+   }
+   return value;
+}
 #endif
 
 struct draw_context *draw_create( struct pipe_context *pipe )
@@ -50,10 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe )
       goto fail;
 
 #if HAVE_LLVM
-   lp_build_init();
-   assert(lp_build_engine);
-   draw->engine = lp_build_engine;
-   draw->llvm = draw_llvm_create(draw);
+   if(draw_get_option_use_llvm())
+   {
+      lp_build_init();
+      assert(lp_build_engine);
+      draw->engine = lp_build_engine;
+      draw->llvm = draw_llvm_create(draw);
+   }
 #endif
 
    if (!draw_init(draw))
@@ -135,7 +158,8 @@ void draw_destroy( struct draw_context *draw )
    draw_vs_destroy( draw );
    draw_gs_destroy( draw );
 #ifdef HAVE_LLVM
-   draw_llvm_destroy( draw->llvm );
+   if(draw->llvm)
+      draw_llvm_destroy( draw->llvm );
 #endif
 
    FREE( draw );
@@ -659,7 +683,8 @@ draw_set_mapped_texture(struct draw_context *draw,
                         const void *data[DRAW_MAX_TEXTURE_LEVELS])
 {
 #ifdef HAVE_LLVM
-   draw_llvm_set_mapped_texture(draw,
+   if(draw->llvm)
+      draw_llvm_set_mapped_texture(draw,
                                 sampler_idx,
                                 width, height, depth, last_level,
                                 row_stride, img_stride, data);
diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h
index a52d2b50588..a142563af97 100644
--- a/src/gallium/auxiliary/draw/draw_decompose_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h
@@ -54,10 +54,10 @@ FUNC(FUNC_VARS)
 
    FUNC_ENTER;
 
-   /* prim, count, and last_vertex_last should have been defined */
+   /* prim, prim_flags, count, and last_vertex_last should have been defined */
    if (0) {
-      debug_printf("%s: prim 0x%x, count %d, last_vertex_last %d\n",
-            __FUNCTION__, prim, count, last_vertex_last);
+      debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n",
+            __FUNCTION__, prim, prim_flags, count, last_vertex_last);
    }
 
    switch (prim) {
@@ -80,7 +80,7 @@ FUNC(FUNC_VARS)
    case PIPE_PRIM_LINE_LOOP:
    case PIPE_PRIM_LINE_STRIP:
       if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
+         flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE;
          idx[1] = GET_ELT(0);
          idx[2] = idx[1];
 
@@ -90,7 +90,7 @@ FUNC(FUNC_VARS)
             LINE(flags, idx[0], idx[1]);
          }
          /* close the loop */
-         if (prim == PIPE_PRIM_LINE_LOOP)
+         if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags)
             LINE(flags, idx[1], idx[2]);
       }
       break;
@@ -255,17 +255,23 @@ FUNC(FUNC_VARS)
 
          if (last_vertex_last) {
             flags = (DRAW_PIPE_RESET_STIPPLE |
-                     DRAW_PIPE_EDGE_FLAG_2 |
                      DRAW_PIPE_EDGE_FLAG_0);
+            if (!(prim_flags & DRAW_SPLIT_BEFORE))
+               flags |= DRAW_PIPE_EDGE_FLAG_2;
+
             edge_next = DRAW_PIPE_EDGE_FLAG_0;
-            edge_finish = DRAW_PIPE_EDGE_FLAG_1;
+            edge_finish =
+               (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_1;
          }
          else {
             flags = (DRAW_PIPE_RESET_STIPPLE |
-                     DRAW_PIPE_EDGE_FLAG_0 |
                      DRAW_PIPE_EDGE_FLAG_1);
+            if (!(prim_flags & DRAW_SPLIT_BEFORE))
+               flags |= DRAW_PIPE_EDGE_FLAG_0;
+
             edge_next = DRAW_PIPE_EDGE_FLAG_1;
-            edge_finish = DRAW_PIPE_EDGE_FLAG_2;
+            edge_finish =
+               (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2;
          }
 
          idx[0] = GET_ELT(0);
@@ -300,7 +306,7 @@ FUNC(FUNC_VARS)
 
    case PIPE_PRIM_LINE_STRIP_ADJACENCY:
       if (count >= 4) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
+         flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE;
          idx[1] = GET_ELT(0);
          idx[2] = GET_ELT(1);
          idx[3] = GET_ELT(2);
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 4a1013e79a5..50a03ac95a5 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -380,7 +380,7 @@ static void gs_tri_adj(struct draw_geometry_shader *shader,
 
 #define FUNC         gs_run_elts
 #define LOCAL_VARS   const ushort *elts = input_prims->elts;
-#define GET_ELT(idx) (elts[idx] & ~DRAW_PIPE_FLAG_MASK)
+#define GET_ELT(idx) (elts[idx])
 #include "draw_gs_tmp.h"
 
 
@@ -457,6 +457,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
    output_prims->start = 0;
    output_prims->count = shader->emitted_vertices;
    output_prims->prim = shader->output_primitive;
+   output_prims->flags = 0x0;
    output_prims->primitive_lengths = shader->primitive_lengths;
    output_prims->primitive_count = shader->emitted_primitives;
    output_verts->count = shader->emitted_vertices;
diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h
index 4a17af0dea3..de7b02655a5 100644
--- a/src/gallium/auxiliary/draw/draw_gs_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h
@@ -6,12 +6,10 @@
 
 #define FUNC_ENTER                                                \
    /* declare more local vars */                                  \
-   struct draw_context *draw = gs->draw;                          \
    const unsigned prim = input_prims->prim;                       \
+   const unsigned prim_flags = input_prims->flags;                \
    const unsigned count = input_prims->count;                     \
-   const boolean last_vertex_last =                               \
-      !(draw->rasterizer->flatshade &&                            \
-        draw->rasterizer->flatshade_first);                       \
+   const boolean last_vertex_last = TRUE;                         \
    do {                                                           \
       debug_assert(input_prims->primitive_count == 1);            \
       switch (prim) {                                             \
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index de99b00a814..58d3e345e5f 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -210,13 +210,6 @@ draw_llvm_create(struct draw_context *draw)
 {
    struct draw_llvm *llvm;
 
-#ifdef PIPE_ARCH_X86
-   util_cpu_detect();
-   /* require SSE2 due to LLVM PR6960. */
-   if (!util_cpu_caps.has_sse2)
-       return NULL;
-#endif
-
    llvm = CALLOC_STRUCT( draw_llvm );
    if (!llvm)
       return NULL;
@@ -683,7 +676,6 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    unsigned i, j;
    struct lp_build_context bld;
    struct lp_build_loop_state lp_loop;
-   struct lp_type vs_type = lp_type_float_vec(32);
    const int max_vertices = 4;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    void *code;
@@ -732,7 +724,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant)
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   lp_build_context_init(&bld, builder, vs_type);
+   lp_build_context_init(&bld, builder, lp_type_int(32));
 
    end = lp_build_add(&bld, start, count);
 
@@ -845,9 +837,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    struct draw_context *draw = llvm->draw;
    unsigned i, j;
    struct lp_build_context bld;
-   struct lp_build_context bld_int;
    struct lp_build_loop_state lp_loop;
-   struct lp_type vs_type = lp_type_float_vec(32);
    const int max_vertices = 4;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
    LLVMValueRef fetch_max;
@@ -899,8 +889,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
    builder = LLVMCreateBuilder();
    LLVMPositionBuilderAtEnd(builder, block);
 
-   lp_build_context_init(&bld, builder, vs_type);
-   lp_build_context_init(&bld_int, builder, lp_type_int(32));
+   lp_build_context_init(&bld, builder, lp_type_int(32));
 
    step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0);
 
@@ -935,7 +924,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian
          /* make sure we're not out of bounds which can happen
           * if fetch_count % 4 != 0, because on the last iteration
           * a few of the 4 vertex fetches will be out of bounds */
-         true_index = lp_build_min(&bld_int, true_index, fetch_max);
+         true_index = lp_build_min(&bld, true_index, fetch_max);
 
          fetch_ptr = LLVMBuildGEP(builder, fetch_elts,
                                   &true_index, 1, "");
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 070ac803c85..b75262a3575 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -169,27 +169,29 @@ static void do_triangle( struct draw_context *draw,
 /*
  * Set up macros for draw_pt_decompose.h template code.
  * This code uses vertex indexes / elements.
- *
- * Flags are needed by the stipple and unfilled stages.  When the two stages
- * are active, vcache_run_extras is called and the flags are stored in the
- * higher bits of i0.  Otherwise, flags do not matter.
  */
 
-#define TRIANGLE(flags,i0,i1,i2)                               \
-   do_triangle( draw,                                          \
-                i0,  /* flags */                               \
-                verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK),  \
-                verts + stride * (i1),                         \
-                verts + stride * (i2) )
-
-#define LINE(flags,i0,i1)                                      \
-   do_line( draw,                                              \
-            i0, /* flags */                                    \
-            verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK),      \
-            verts + stride * (i1) )
+#define TRIANGLE(flags,i0,i1,i2)                                  \
+   do {                                                           \
+      do_triangle( draw,                                          \
+                   flags,                                         \
+                   verts + stride * (i0),                         \
+                   verts + stride * (i1),                         \
+                   verts + stride * (i2) );                       \
+   } while (0)
+
+#define LINE(flags,i0,i1)                                         \
+   do {                                                           \
+      do_line( draw,                                              \
+               flags,                                             \
+               verts + stride * (i0),                             \
+               verts + stride * (i1) );                           \
+   } while (0)
 
 #define POINT(i0)                               \
-   do_point( draw, verts + stride * (i0) )
+   do {                                         \
+      do_point( draw, verts + stride * (i0) );  \
+   } while (0)
 
 #define GET_ELT(idx) (elts[idx])
 
@@ -197,6 +199,7 @@ static void do_triangle( struct draw_context *draw,
 #define FUNC_VARS                               \
     struct draw_context *draw,                  \
     unsigned prim,                              \
+    unsigned prim_flags,                        \
     struct vertex_header *vertices,             \
     unsigned stride,                            \
     const ushort *elts,                         \
@@ -240,8 +243,7 @@ void draw_pipeline_run( struct draw_context *draw,
          unsigned max_index = 0x0, i;
          /* find the largest element index */
          for (i = 0; i < count; i++) {
-            unsigned int index = (prim_info->elts[start + i]
-                                  & ~DRAW_PIPE_FLAG_MASK);
+            unsigned int index = prim_info->elts[start + i];
             if (index > max_index)
                max_index = index;
          }
@@ -251,6 +253,7 @@ void draw_pipeline_run( struct draw_context *draw,
 
       pipe_run_elts(draw,
                     prim_info->prim,
+                    prim_info->flags,
                     vert_info->verts,
                     vert_info->stride,
                     prim_info->elts + start,
@@ -288,6 +291,7 @@ void draw_pipeline_run( struct draw_context *draw,
 #define FUNC_VARS                      \
     struct draw_context *draw,         \
     unsigned prim,                     \
+    unsigned prim_flags,               \
     struct vertex_header *vertices,    \
     unsigned stride,                   \
     unsigned count
@@ -320,6 +324,7 @@ void draw_pipeline_run_linear( struct draw_context *draw,
 
       pipe_run_linear(draw,
                       prim_info->prim,
+                      prim_info->flags,
                       (struct vertex_header*)verts,
                       vert_info->stride,
                       count);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index 3c93c9014a6..58c5858734a 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -353,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf )
    /* Allocate a new vertex buffer */
    vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
 
-   /* even number */
-   vbuf->max_vertices = vbuf->max_vertices & ~1;
-
    if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID)
       vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1;
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 397d4bf653c..854c45f0602 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -140,8 +140,7 @@ struct draw_context
       } middle;
 
       struct {
-         struct draw_pt_front_end *vcache;
-         struct draw_pt_front_end *varray;
+         struct draw_pt_front_end *vsplit;
       } front;
 
       struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
@@ -296,6 +295,10 @@ struct draw_vertex_info {
    unsigned count;
 };
 
+/* these flags are set if the primitive is a segment of a larger one */
+#define DRAW_SPLIT_BEFORE 0x1
+#define DRAW_SPLIT_AFTER  0x2
+
 struct draw_prim_info {
    boolean linear;
    unsigned start;
@@ -304,6 +307,7 @@ struct draw_prim_info {
    unsigned count;
 
    unsigned prim;
+   unsigned flags;
    unsigned *primitive_lengths;
    unsigned primitive_count;
 };
@@ -369,21 +373,15 @@ void draw_pipeline_destroy( struct draw_context *draw );
 
 
 
-/* We use the top few bits in the elts[] parameter to convey a little
- * API information.  This limits the number of vertices we can address
- * to only 4096 -- if that becomes a problem, we can switch to 32-bit
- * draw indices.
- *
- * These flags expected at first vertex of lines & triangles when
- * unfilled and/or line stipple modes are operational.
+/*
+ * These flags are used by the pipeline when unfilled and/or line stipple modes
+ * are operational.
  */
-#define DRAW_PIPE_MAX_VERTICES  (0x1<<12)
-#define DRAW_PIPE_EDGE_FLAG_0   (0x1<<12)
-#define DRAW_PIPE_EDGE_FLAG_1   (0x2<<12)
-#define DRAW_PIPE_EDGE_FLAG_2   (0x4<<12)
-#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12)
-#define DRAW_PIPE_RESET_STIPPLE (0x8<<12)
-#define DRAW_PIPE_FLAG_MASK     (0xf<<12)
+#define DRAW_PIPE_EDGE_FLAG_0   0x1
+#define DRAW_PIPE_EDGE_FLAG_1   0x2
+#define DRAW_PIPE_EDGE_FLAG_2   0x4
+#define DRAW_PIPE_EDGE_FLAG_ALL 0x7
+#define DRAW_PIPE_RESET_STIPPLE 0x8
 
 void draw_pipeline_run( struct draw_context *draw,
                         const struct draw_vertex_info *vert,
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 248927505da..feacd8258b5 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -43,21 +43,9 @@
 
 DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE)
 DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE)
-#ifdef HAVE_LLVM
-DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE)
-#endif
-
-static unsigned trim( unsigned count, unsigned first, unsigned incr )
-{
-   if (count < first)
-      return 0;
-   return count - (count - first) % incr; 
-}
-
-
 
 /* Overall we split things into:
- *     - frontend -- prepare fetch_elts, draw_elts - eg vcache
+ *     - frontend -- prepare fetch_elts, draw_elts - eg vsplit
  *     - middle   -- fetch, shade, cliptest, viewport
  *     - pipeline -- the prim pipeline: clipping, wide lines, etc 
  *     - backend  -- the vbuf_render provided by the driver.
@@ -77,7 +65,7 @@ draw_pt_arrays(struct draw_context *draw,
    {
       unsigned first, incr;
       draw_pt_split_prim(prim, &first, &incr);
-      count = trim(count, first, incr);
+      count = draw_pt_trim_count(count, first, incr);
       if (count < first)
          return TRUE;
    }
@@ -115,22 +103,11 @@ draw_pt_arrays(struct draw_context *draw,
          middle = draw->pt.middle.general;
    }
 
-
-   /* Pick the right frontend
-    */
-   if (draw->pt.user.elts || (opt & PT_PIPELINE)) {
-      frontend = draw->pt.front.vcache;
-   } else {
-      frontend = draw->pt.front.varray;
-   }
+   frontend = draw->pt.front.vsplit;
 
    frontend->prepare( frontend, prim, middle, opt );
 
-   frontend->run(frontend,
-                 draw_pt_elt_func(draw),
-                 draw_pt_elt_ptr(draw, start),
-                 draw->pt.user.eltBias,
-                 count);
+   frontend->run(frontend, start, count);
 
    frontend->finish( frontend );
 
@@ -143,12 +120,8 @@ boolean draw_pt_init( struct draw_context *draw )
    draw->pt.test_fse = debug_get_option_draw_fse();
    draw->pt.no_fse = debug_get_option_draw_no_fse();
 
-   draw->pt.front.vcache = draw_pt_vcache( draw );
-   if (!draw->pt.front.vcache)
-      return FALSE;
-
-   draw->pt.front.varray = draw_pt_varray(draw);
-   if (!draw->pt.front.varray)
+   draw->pt.front.vsplit = draw_pt_vsplit(draw);
+   if (!draw->pt.front.vsplit)
       return FALSE;
 
    draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw );
@@ -164,7 +137,7 @@ boolean draw_pt_init( struct draw_context *draw )
       return FALSE;
 
 #if HAVE_LLVM
-   if (debug_get_option_draw_use_llvm())
+   if (draw->llvm)
       draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw );
 #endif
 
@@ -194,14 +167,9 @@ void draw_pt_destroy( struct draw_context *draw )
       draw->pt.middle.fetch_shade_emit = NULL;
    }
 
-   if (draw->pt.front.vcache) {
-      draw->pt.front.vcache->destroy( draw->pt.front.vcache );
-      draw->pt.front.vcache = NULL;
-   }
-
-   if (draw->pt.front.varray) {
-      draw->pt.front.varray->destroy( draw->pt.front.varray );
-      draw->pt.front.varray = NULL;
+   if (draw->pt.front.vsplit) {
+      draw->pt.front.vsplit->destroy( draw->pt.front.vsplit );
+      draw->pt.front.vsplit = NULL;
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 44356fba4c5..0db56665296 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -35,8 +35,6 @@
 
 #include "pipe/p_compiler.h"
 
-typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx );
-
 struct draw_pt_middle_end;
 struct draw_context;
 struct draw_prim_info;
@@ -52,13 +50,18 @@ struct draw_vertex_info;
 /* The "front end" - prepare sets of fetch, draw elements for the
  * middle end.
  *
- * Currenly one version of this:
- *    - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims
- * Later:
- *    - varray, varray_split
- *    - velement, velement_split
+ * The fetch elements are indices to the vertices.  The draw elements are
+ * indices to the fetched vertices.  When both arrays of elements are
+ * linear, middle->run_linear is called; when only the fetch elements are
+ * linear, middle->run_linear_elts is called; otherwise, middle->run is
+ * called.
+ *
+ * When the number of the draw elements exceeds max_vertex of the middle end,
+ * the draw elements (as well as the fetch elements) are split and the
+ * middle end is called multiple times.
  *
- * Currenly only using the vcache version.
+ * Currently there is:
+ *    - vsplit - catchall implementation, splits big prims
  */
 struct draw_pt_front_end {
    void (*prepare)( struct draw_pt_front_end *,
@@ -67,9 +70,7 @@ struct draw_pt_front_end {
 		    unsigned opt );
 
    void (*run)( struct draw_pt_front_end *,
-                pt_elt_func elt_func,
-                const void *elt_ptr,
-                int elt_bias,
+                unsigned start,
                 unsigned count );
 
    void (*finish)( struct draw_pt_front_end * );
@@ -80,6 +81,8 @@ struct draw_pt_front_end {
 /* The "middle end" - prepares actual hardware vertices for the
  * hardware backend.
  *
+ * prim_flags is as defined by pipe_draw_info::flags.
+ *
  * Currently two versions of this:
  *     - fetch, vertex shade, cliptest, prim-pipeline
  *     - fetch, emit (ie passthrough)
@@ -94,11 +97,13 @@ struct draw_pt_middle_end {
                 const unsigned *fetch_elts,
                 unsigned fetch_count,
                 const ushort *draw_elts,
-                unsigned draw_count );
+                unsigned draw_count,
+                unsigned prim_flags );
 
    void (*run_linear)(struct draw_pt_middle_end *,
                       unsigned start,
-                      unsigned count);
+                      unsigned count,
+                      unsigned prim_flags );
 
    /* Transform all vertices in a linear range and then draw them with
     * the supplied element list.  May fail and return FALSE.
@@ -107,7 +112,8 @@ struct draw_pt_middle_end {
                             unsigned fetch_start,
                             unsigned fetch_count,
                             const ushort *draw_elts,
-                            unsigned draw_count );
+                            unsigned draw_count,
+                            unsigned prim_flags );
 
    int (*get_max_vertex_count)( struct draw_pt_middle_end * );
 
@@ -122,19 +128,11 @@ struct vbuf_render;
 struct vertex_header;
 
 
-/* Helper functions.
- */
-pt_elt_func draw_pt_elt_func( struct draw_context *draw );
-const void *draw_pt_elt_ptr( struct draw_context *draw,
-                             unsigned start );
-
 /* Frontends: 
  *
- * Currently only the general-purpose vcache implementation, could add
- * a special case for tiny vertex buffers.
+ * Currently only the general-purpose vsplit implementation.
  */
-struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw );
-struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);
+struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw);
 
 
 /* Middle-ends:
@@ -237,6 +235,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );
  * Utils: 
  */
 void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr);
+unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr);
 
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c
deleted file mode 100644
index 88f4d9f495a..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_elts.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <[email protected]>
-  */
-
-#include "draw/draw_pt.h"
-#include "draw/draw_private.h"
-
-/* Neat get_elt func that also works for varrays drawing by encoding
- * the start value into a pointer.  
- */
-
-static unsigned elt_uint( const void *elts, unsigned idx )
-{
-   return *(((const uint *)elts) + idx);
-}
-
-static unsigned elt_ushort( const void *elts, unsigned idx )
-{
-   return *(((const ushort *)elts) + idx);
-}
-
-static unsigned elt_ubyte( const void *elts, unsigned idx )
-{
-   return *(((const ubyte *)elts) + idx);
-}
-
-static unsigned elt_vert( const void *elts, unsigned idx )
-{
-   /* unsigned index is packed in the pointer */
-   return (unsigned)(uintptr_t)elts + idx;
-}
-
-pt_elt_func draw_pt_elt_func( struct draw_context *draw )
-{
-   switch (draw->pt.user.eltSize) {
-   case 0: return &elt_vert;
-   case 1: return &elt_ubyte;
-   case 2: return &elt_ushort; 
-   case 4: return &elt_uint;
-   default: return NULL;
-   }
-}     
-
-const void *draw_pt_elt_ptr( struct draw_context *draw,
-                             unsigned start )
-{
-   const char *elts = draw->pt.user.elts;
-
-   switch (draw->pt.user.eltSize) {
-   case 0: 
-      return (const void *)(((const ubyte *)NULL) + start);
-   case 1: 
-      return (const void *)(((const ubyte *)elts) + start);
-   case 2: 
-      return (const void *)(((const ushort *)elts) + start);
-   case 4: 
-      return (const void *)(((const uint *)elts) + start);
-   default:
-      return NULL;
-   }
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index 5568fbb9f88..89d96c4235f 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -120,9 +120,6 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
-
-   /* even number */
-   *max_vertices = *max_vertices & ~1;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 5c8af17c8e3..80a89428b6d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -191,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
-
-   /* Return an even number of verts.
-    * This prevents "parity" errors when splitting long triangle strips which
-    * can lead to front/back culling mix-ups.
-    * Every other triangle in a strip has an alternate front/back orientation
-    * so splitting at an odd position can cause the orientation of subsequent
-    * triangles to get reversed.
-    */
-   *max_vertices = *max_vertices & ~1;
 }
 
 
@@ -210,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
                             const unsigned *fetch_elts,
                             unsigned fetch_count,
                             const ushort *draw_elts,
-                            unsigned draw_count )
+                            unsigned draw_count,
+                            unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -273,7 +265,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
 
 static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
                                    unsigned start,
-                                   unsigned count )
+                                   unsigned count,
+                                   unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
@@ -334,7 +327,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle,
                                         unsigned start,
                                         unsigned count,
                                         const ushort *draw_elts,
-                                        unsigned draw_count )
+                                        unsigned draw_count,
+                                        unsigned prim_flags )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index b8270280b64..a31d3feb160 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -175,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
                     (vinfo->size * 4));
 
-   /* Return an even number of verts.
-    * This prevents "parity" errors when splitting long triangle strips which
-    * can lead to front/back culling mix-ups.
-    * Every other triangle in a strip has an alternate front/back orientation
-    * so splitting at an odd position can cause the orientation of subsequent
-    * triangles to get reversed.
-    */
-   *max_vertices = *max_vertices & ~1;
-
    /* Probably need to do this somewhere (or fix exec shader not to
     * need it):
     */
@@ -197,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
 static void fse_run_linear( struct draw_pt_middle_end *middle, 
                             unsigned start, 
-                            unsigned count )
+                            unsigned count,
+                            unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -265,7 +257,8 @@ fse_run(struct draw_pt_middle_end *middle,
         const unsigned *fetch_elts,
         unsigned fetch_count,
         const ushort *draw_elts,
-        unsigned draw_count )
+        unsigned draw_count,
+        unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
@@ -327,7 +320,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle,
                                  unsigned start, 
                                  unsigned count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
    struct draw_context *draw = fse->draw;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 5b16c3788e5..96b40fb3630 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -112,16 +112,13 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 			    gs_out_prim,
                             max_vertices );
 
-      *max_vertices = MAX2( *max_vertices,
-                            DRAW_PIPE_MAX_VERTICES );
+      *max_vertices = MAX2( *max_vertices, 4096 );
    }
    else {
-      *max_vertices = DRAW_PIPE_MAX_VERTICES; 
+      /* limit max fetches by limiting max_vertices */
+      *max_vertices = 4096;
    }
 
-   /* return even number */
-   *max_vertices = *max_vertices & ~1;
-
    /* No need to prepare the shader.
     */
    vs->prepare(vs, draw);
@@ -295,7 +292,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
                                 const unsigned *fetch_elts,
                                 unsigned fetch_count,
                                 const ushort *draw_elts,
-                                unsigned draw_count )
+                                unsigned draw_count,
+                                unsigned prim_flags )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -311,6 +309,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
@@ -320,7 +319,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
 
 static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
                                        unsigned start,
-                                       unsigned count)
+                                       unsigned count,
+                                       unsigned prim_flags)
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -336,6 +336,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
    prim_info.count = count;
    prim_info.elts = NULL;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
 
@@ -348,7 +349,8 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle
                                                unsigned start,
                                                unsigned count,
                                                const ushort *draw_elts,
-                                               unsigned draw_count )
+                                               unsigned draw_count,
+                                               unsigned prim_flags )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -364,6 +366,7 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
index 4b99bee86a0..78b1bf988cf 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c
@@ -118,16 +118,13 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle,
 			    out_prim,
                             max_vertices );
 
-      *max_vertices = MAX2( *max_vertices,
-                            DRAW_PIPE_MAX_VERTICES );
+      *max_vertices = MAX2( *max_vertices, 4096 );
    }
    else {
-      *max_vertices = DRAW_PIPE_MAX_VERTICES;
+      /* limit max fetches by limiting max_vertices */
+      *max_vertices = 4096;
    }
 
-   /* return even number */
-   *max_vertices = *max_vertices & ~1;
-
    draw_llvm_make_variant_key(fpme->llvm, &key);
 
    li = first_elem(&shader->variants);
@@ -294,7 +291,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
                                  const unsigned *fetch_elts,
                                  unsigned fetch_count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -310,6 +308,7 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
@@ -319,7 +318,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle,
 
 static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle,
                                        unsigned start,
-                                       unsigned count)
+                                       unsigned count,
+                                       unsigned prim_flags)
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -335,6 +335,7 @@ static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle,
    prim_info.count = count;
    prim_info.elts = NULL;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &count;
 
@@ -348,7 +349,8 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle,
                                  unsigned start,
                                  unsigned count,
                                  const ushort *draw_elts,
-                                 unsigned draw_count )
+                                 unsigned draw_count,
+                                 unsigned prim_flags )
 {
    struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle;
    struct draw_fetch_info fetch_info;
@@ -364,6 +366,7 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle,
    prim_info.count = draw_count;
    prim_info.elts = draw_elts;
    prim_info.prim = fpme->input_prim;
+   prim_info.flags = prim_flags;
    prim_info.primitive_count = 1;
    prim_info.primitive_lengths = &draw_count;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
index f7f4f24d354..c86bdd99a33 100644
--- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c
@@ -225,7 +225,7 @@ static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2)
 
 #define FUNC         so_run_elts
 #define LOCAL_VARS   const ushort *elts = input_prims->elts;
-#define GET_ELT(idx) (elts[start + (idx)] & ~DRAW_PIPE_FLAG_MASK)
+#define GET_ELT(idx) (elts[start + (idx)])
 #include "draw_so_emit_tmp.h"
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
index 3236d38e6ab..513bbbed216 100644
--- a/src/gallium/auxiliary/draw/draw_pt_util.c
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -53,7 +53,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    case PIPE_PRIM_LINES_ADJACENCY:
       *first = 4;
-      *incr = 2;
+      *incr = 4;
       break;
    case PIPE_PRIM_LINE_STRIP_ADJACENCY:
       *first = 4;
@@ -65,7 +65,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    case PIPE_PRIM_TRIANGLES_ADJACENCY:
       *first = 6;
-      *incr = 3;
+      *incr = 6;
       break;
    case PIPE_PRIM_TRIANGLE_STRIP:
    case PIPE_PRIM_TRIANGLE_FAN:
@@ -75,7 +75,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
       *first = 6;
-      *incr = 1;
+      *incr = 2;
       break;
    case PIPE_PRIM_QUADS:
       *first = 4;
@@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
       break;
    }
 }
+
+unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr)
+{
+   if (count < first)
+      return 0; /* fewer than 'first' vertices: nothing to draw */
+   return count - (count - first) % incr; /* drop the leftover past the last whole 'incr' step */
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
deleted file mode 100644
index cd7bb7bf253..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_pt.h"
-
-#define FETCH_MAX 256
-#define DRAW_MAX (FETCH_MAX+8)
-
-struct varray_frontend {
-   struct draw_pt_front_end base;
-   struct draw_context *draw;
-
-   ushort draw_elts[DRAW_MAX];
-   unsigned fetch_elts[FETCH_MAX];
-
-   unsigned driver_fetch_max;
-   unsigned fetch_max;
-
-   struct draw_pt_middle_end *middle;
-
-   unsigned input_prim;
-   unsigned output_prim;
-};
-
-
-static void varray_flush_linear(struct varray_frontend *varray,
-                                unsigned start, unsigned count)
-{
-   if (count) {
-      assert(varray->middle->run_linear);
-      varray->middle->run_linear(varray->middle, start, count);
-   }
-}
-
-static void varray_line_loop_segment(struct varray_frontend *varray,
-                                     unsigned start,
-                                     unsigned segment_start,
-                                     unsigned segment_count,
-                                     boolean end )
-{
-   assert(segment_count < varray->fetch_max);
-   if (segment_count >= 1) {
-      unsigned nr = 0, i;
-
-      for (i = 0; i < segment_count; i++) 
-         varray->fetch_elts[nr++] = start + segment_start + i;
-
-      if (end) 
-         varray->fetch_elts[nr++] = start;
-
-      assert(nr <= FETCH_MAX);
-
-      varray->middle->run(varray->middle, 
-                          varray->fetch_elts,
-                          nr,
-                          varray->draw_elts, /* ie. linear */
-                          nr);
-   }
-}
-
-
-
-static void varray_fan_segment(struct varray_frontend *varray,
-                               unsigned start, 
-                               unsigned segment_start,
-                               unsigned segment_count )
-{
-   assert(segment_count < varray->fetch_max);
-   if (segment_count >= 2) {
-      unsigned nr = 0, i;
-
-      if (segment_start != 0)
-         varray->fetch_elts[nr++] = start;
-
-      for (i = 0 ; i < segment_count; i++) 
-         varray->fetch_elts[nr++] = start + segment_start + i;
-
-      assert(nr <= FETCH_MAX);
-
-      varray->middle->run(varray->middle, 
-                          varray->fetch_elts,
-                          nr,
-                          varray->draw_elts, /* ie. linear */
-                          nr);
-   }
-}
-
-
-
-
-#define FUNC varray_run
-#include "draw_pt_varray_tmp_linear.h"
-
-static unsigned decompose_prim[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY + 1] = {
-   PIPE_PRIM_POINTS,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINE_STRIP,        /* decomposed LINELOOP */
-   PIPE_PRIM_LINE_STRIP,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLE_STRIP,
-   PIPE_PRIM_TRIANGLE_FAN,
-   PIPE_PRIM_QUADS,
-   PIPE_PRIM_QUAD_STRIP,
-   PIPE_PRIM_POLYGON,
-   PIPE_PRIM_LINES_ADJACENCY,
-   PIPE_PRIM_LINE_STRIP_ADJACENCY,
-   PIPE_PRIM_TRIANGLES_ADJACENCY,
-   PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY
-};
-
-
-
-static void varray_prepare(struct draw_pt_front_end *frontend,
-                           unsigned in_prim,
-                           struct draw_pt_middle_end *middle,
-                           unsigned opt)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-
-   varray->base.run = varray_run;
-
-   varray->input_prim = in_prim;
-   assert(in_prim < Elements(decompose_prim));
-   varray->output_prim = decompose_prim[in_prim];
-
-   varray->middle = middle;
-   middle->prepare(middle,
-                   varray->output_prim,
-                   opt, &varray->driver_fetch_max );
-
-   /* check that the max is even */
-   assert((varray->driver_fetch_max & 1) == 0);
-
-   varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max);
-}
-
-
-
-
-static void varray_finish(struct draw_pt_front_end *frontend)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   varray->middle->finish(varray->middle);
-   varray->middle = NULL;
-}
-
-static void varray_destroy(struct draw_pt_front_end *frontend)
-{
-   FREE(frontend);
-}
-
-
-struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw)
-{
-   ushort i;
-   struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend);
-   if (varray == NULL)
-      return NULL;
-
-   varray->base.prepare = varray_prepare;
-   varray->base.run     = NULL;
-   varray->base.finish  = varray_finish;
-   varray->base.destroy = varray_destroy;
-   varray->draw = draw;
-
-   for (i = 0; i < DRAW_MAX; i++) {
-      varray->draw_elts[i] = i;
-   }
-
-   return &varray->base;
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
deleted file mode 100644
index 7c722457c3c..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
+++ /dev/null
@@ -1,238 +0,0 @@
-
-static void FUNC(struct draw_pt_front_end *frontend,
-                 pt_elt_func get_elt,
-                 const void *elts,
-                 unsigned count)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   struct draw_context *draw = varray->draw;
-   unsigned start = (unsigned)elts;
-
-   boolean flatfirst = (draw->rasterizer->flatshade &&
-                        draw->rasterizer->flatshade_first);
-   unsigned i, j;
-   ushort flags;
-   unsigned first, incr;
-
-   varray->fetch_start = start;
-
-   draw_pt_split_prim(varray->input_prim, &first, &incr);
-
-#if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
-                varray->input_prim,
-                start, count);
-#endif
-
-   switch (varray->input_prim) {
-   case PIPE_PRIM_POINTS:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i < end; i++) {
-            POINT(varray, i + 0);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+1 < end; i += 2) {
-            LINE(varray, DRAW_PIPE_RESET_STIPPLE,
-                 i + 0, i + 1);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      if (count >= 2) {
-         flags = DRAW_PIPE_RESET_STIPPLE;
-
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end % incr);
-            for (i = 1; i < end; i++, flags = 0) {
-               LINE(varray, flags, i - 1, i);
-            }
-            LINE(varray, flags, i - 1, 0);
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-         }
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      flags = DRAW_PIPE_RESET_STIPPLE;
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 1; i < end; i++, flags = 0) {
-            LINE(varray, flags, i - 1, i);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+2 < end; i += 3) {
-            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                     i + 0, i + 1, i + 2);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (flatfirst) {
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end % incr);
-            for (i = 0; i+2 < end; i++) {
-               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                        i + 0, i + 1 + (i&1), i + 2 - (i&1));
-            }
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-            if (j + first + i <= count) {
-               varray->fetch_start -= 2;
-               i -= 2;
-            }
-         }
-      }
-      else {
-         for (j = 0; j + first <= count; j += i) {
-            unsigned end = MIN2(FETCH_MAX, count - j);
-            end -= (end  % incr);
-            for (i = 0; i + 2 < end; i++) {
-               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                        i + 0 + (i&1), i + 1 - (i&1), i + 2);
-            }
-            i = end;
-            fetch_init(varray, end);
-            varray_flush(varray);
-            if (j + first + i <= count) {
-               varray->fetch_start -= 2;
-               i -= 2;
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (count >= 3) {
-         if (flatfirst) {
-            flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (j = 0; j + first <= count; j += i) {
-               unsigned end = MIN2(FETCH_MAX, count - j);
-               end -= (end % incr);
-               for (i = 0; i+2 < end; i++) {
-                  TRIANGLE(varray, flags, i + 1, i + 2, 0);
-               }
-               i = end;
-               fetch_init(varray, end);
-               varray_flush(varray);
-            }
-         }
-         else {
-            flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (j = 0; j + first <= count; j += i) {
-               unsigned end = MIN2(FETCH_MAX, count - j);
-               end -= (end % incr);
-               for (i = 0; i+2 < end; i++) {
-                  TRIANGLE(varray, flags, 0, i + 1, i + 2);
-               }
-               i = end;
-               fetch_init(varray, end);
-               varray_flush(varray);
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_QUADS:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+3 < end; i += 4) {
-            QUAD(varray, i + 0, i + 1, i + 2, i + 3);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+3 < end; i += 2) {
-            QUAD(varray, i + 2, i + 0, i + 1, i + 3);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-         if (j + first + i <= count) {
-            varray->fetch_start -= 2;
-            i -= 2;
-         }
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-   {
-      /* These bitflags look a little odd because we submit the
-       * vertices as (1,2,0) to satisfy flatshade requirements.
-       */
-      const ushort edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-      const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-      const ushort edge_last   = DRAW_PIPE_EDGE_FLAG_1;
-
-      flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
-      for (j = 0; j + first <= count; j += i) {
-         unsigned end = MIN2(FETCH_MAX, count - j);
-         end -= (end % incr);
-         for (i = 0; i+2 < end; i++, flags = edge_middle) {
-
-            if (i + 3 == count)
-               flags |= edge_last;
-
-            TRIANGLE(varray, flags, i + 1, i + 2, 0);
-         }
-         i = end;
-         fetch_init(varray, end);
-         varray_flush(varray);
-      }
-   }
-   break;
-
-   default:
-      assert(0);
-      break;
-   }
-
-   varray_flush(varray);
-}
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
deleted file mode 100644
index a292346be95..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
+++ /dev/null
@@ -1,98 +0,0 @@
-static unsigned trim( unsigned count, unsigned first, unsigned incr )
-{
-   return count - (count - first) % incr; 
-}
-
-static void FUNC(struct draw_pt_front_end *frontend,
-                 pt_elt_func get_elt,
-                 const void *elts,
-                 int elt_bias,
-                 unsigned count)
-{
-   struct varray_frontend *varray = (struct varray_frontend *)frontend;
-   unsigned start = (unsigned) ((char *) elts - (char *) NULL);
-
-   unsigned j;
-   unsigned first, incr;
-
-   assert(elt_bias == 0);
-
-   draw_pt_split_prim(varray->input_prim, &first, &incr);
-   
-   /* Sanitize primitive length:
-    */
-   count = trim(count, first, incr); 
-   if (count < first)
-      return;
-
-#if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
-                varray->input_prim,
-                start, count);
-#endif
-
-   switch (varray->input_prim) {
-   case PIPE_PRIM_POINTS:
-   case PIPE_PRIM_LINES:
-   case PIPE_PRIM_TRIANGLES:
-   case PIPE_PRIM_LINE_STRIP:
-   case PIPE_PRIM_TRIANGLE_STRIP:
-   case PIPE_PRIM_QUADS:
-   case PIPE_PRIM_QUAD_STRIP:
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      for (j = 0; j < count;) {
-         unsigned remaining = count - j;
-         unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr );
-         varray_flush_linear(varray, start + j, nr);
-         j += nr;
-         if (nr != remaining) 
-            j -= (first - incr);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:
-      /* Always have to decompose as we've stated that this will be
-       * emitted as a line-strip.
-       */
-      for (j = 0; j < count;) {
-         unsigned remaining = count - j;
-         unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr );
-         varray_line_loop_segment(varray, start, j, nr, nr == remaining);
-         j += nr;
-         if (nr != remaining) 
-            j -= (first - incr);
-      }
-      break;
-
-
-   case PIPE_PRIM_POLYGON:
-   case PIPE_PRIM_TRIANGLE_FAN: 
-      if (count < varray->driver_fetch_max) {
-         varray_flush_linear(varray, start, count);
-      }
-      else {
-         for ( j = 0; j < count;) {
-            unsigned remaining = count - j;
-            unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr );
-            varray_fan_segment(varray, start, j, nr);
-            j += nr;
-            if (nr != remaining) 
-               j -= (first - incr);
-         }
-      }
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-}
-
-#undef TRIANGLE
-#undef QUAD
-#undef POINT
-#undef LINE
-#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
deleted file mode 100644
index a848b54f7d2..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ /dev/null
@@ -1,610 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <[email protected]>
-  */
-
-#include "util/u_memory.h"
-#include "util/u_prim.h"
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_pt.h"
-
-
-#define CACHE_MAX 256
-#define FETCH_MAX 256
-#define DRAW_MAX (16*1024)
-
-
-struct vcache_frontend {
-   struct draw_pt_front_end base;
-   struct draw_context *draw;
-
-   unsigned in[CACHE_MAX];
-   ushort out[CACHE_MAX];
-
-   ushort draw_elts[DRAW_MAX];
-   unsigned fetch_elts[FETCH_MAX];
-
-   unsigned draw_count;
-   unsigned fetch_count;
-   unsigned fetch_max;
-
-   struct draw_pt_middle_end *middle;
-
-   unsigned input_prim;
-   unsigned output_prim;
-
-   unsigned middle_prim;
-   unsigned opt;
-};
-
-
-static INLINE void
-vcache_flush( struct vcache_frontend *vcache )
-{
-   if (vcache->middle_prim != vcache->output_prim) {
-      vcache->middle_prim = vcache->output_prim;
-      vcache->middle->prepare( vcache->middle,
-                               vcache->middle_prim,
-                               vcache->opt,
-                               &vcache->fetch_max );
-   }
-
-   if (vcache->draw_count) {
-      vcache->middle->run( vcache->middle,
-                           vcache->fetch_elts,
-                           vcache->fetch_count,
-                           vcache->draw_elts,
-                           vcache->draw_count );
-   }
-
-   memset(vcache->in, ~0, sizeof(vcache->in));
-   vcache->fetch_count = 0;
-   vcache->draw_count = 0;
-}
-
-
-static INLINE void 
-vcache_check_flush( struct vcache_frontend *vcache )
-{
-   if (vcache->draw_count + 6 >= DRAW_MAX ||
-       vcache->fetch_count + 6 >= FETCH_MAX) {
-      vcache_flush( vcache );
-   }
-}
-
-
-static INLINE void 
-vcache_elt( struct vcache_frontend *vcache,
-            unsigned felt,
-            ushort flags )
-{
-   unsigned idx = felt % CACHE_MAX;
-
-   if (vcache->in[idx] != felt) {
-      assert(vcache->fetch_count < FETCH_MAX);
-
-      vcache->in[idx] = felt;
-      vcache->out[idx] = (ushort)vcache->fetch_count;
-      vcache->fetch_elts[vcache->fetch_count++] = felt;
-   }
-
-   vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags;
-}
-
-
-                   
-static INLINE void 
-vcache_triangle( struct vcache_frontend *vcache,
-                 unsigned i0,
-                 unsigned i1,
-                 unsigned i2 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_check_flush(vcache);
-}
-
-			  
-static INLINE void 
-vcache_triangle_flags( struct vcache_frontend *vcache,
-                       ushort flags,
-                       unsigned i0,
-                       unsigned i1,
-                       unsigned i2 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_line( struct vcache_frontend *vcache,
-             unsigned i0,
-             unsigned i1 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_line_flags( struct vcache_frontend *vcache,
-                   ushort flags,
-                   unsigned i0,
-                   unsigned i1 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void 
-vcache_point( struct vcache_frontend *vcache,
-              unsigned i0 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_line_adj_flags( struct vcache_frontend *vcache,
-                       unsigned flags,
-                       unsigned a0, unsigned i0, unsigned i1, unsigned a1 )
-{
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_line_adj( struct vcache_frontend *vcache,
-                 unsigned a0, unsigned i0, unsigned i1, unsigned a1 )
-{
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_triangle_adj_flags( struct vcache_frontend *vcache,
-                           unsigned flags,
-                           unsigned i0, unsigned a0,
-                           unsigned i1, unsigned a1,
-                           unsigned i2, unsigned a2 )
-{
-   vcache_elt(vcache, i0, flags);
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_elt(vcache, a2, 0);
-   vcache_check_flush(vcache);
-}
-
-
-static INLINE void
-vcache_triangle_adj( struct vcache_frontend *vcache,
-                     unsigned i0, unsigned a0,
-                     unsigned i1, unsigned a1,
-                     unsigned i2, unsigned a2 )
-{
-   vcache_elt(vcache, i0, 0);
-   vcache_elt(vcache, a0, 0);
-   vcache_elt(vcache, i1, 0);
-   vcache_elt(vcache, a1, 0);
-   vcache_elt(vcache, i2, 0);
-   vcache_elt(vcache, a2, 0);
-   vcache_check_flush(vcache);
-}
-
-
-/* At least for now, we're back to using a template include file for
- * this.  The two paths aren't too different though - it may be
- * possible to reunify them.
- */
-#define TRIANGLE(flags,i0,i1,i2) vcache_triangle_flags(vcache,flags,i0,i1,i2)
-#define LINE(flags,i0,i1)        vcache_line_flags(vcache,flags,i0,i1)
-#define POINT(i0)                vcache_point(vcache,i0)
-#define LINE_ADJ(flags,a0,i0,i1,a1) \
-   vcache_line_adj_flags(vcache,flags,a0,i0,i1,a1)
-#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \
-   vcache_triangle_adj_flags(vcache,flags,i0,a0,i1,a1,i2,a2)
-#define FUNC vcache_run_extras
-#include "draw_pt_vcache_tmp.h"
-
-#define TRIANGLE(flags,i0,i1,i2) vcache_triangle(vcache,i0,i1,i2)
-#define LINE(flags,i0,i1)        vcache_line(vcache,i0,i1)
-#define POINT(i0)                vcache_point(vcache,i0)
-#define LINE_ADJ(flags,a0,i0,i1,a1) \
-   vcache_line_adj(vcache,a0,i0,i1,a1)
-#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \
-   vcache_triangle_adj(vcache,i0,a0,i1,a1,i2,a2)
-#define FUNC vcache_run
-#include "draw_pt_vcache_tmp.h"
-
-static INLINE void 
-rebase_uint_elts( const unsigned *src,
-                  unsigned count,
-                  int delta,
-                  ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-static INLINE void 
-rebase_ushort_elts( const ushort *src,
-                    unsigned count,
-                    int delta,
-                    ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-static INLINE void 
-rebase_ubyte_elts( const ubyte *src,
-                   unsigned count,
-                   int delta,
-                   ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i] + delta);
-}
-
-
-static INLINE void 
-translate_uint_elts( const unsigned *src,
-                     unsigned count,
-                     ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-static INLINE void 
-translate_ushort_elts( const ushort *src,
-                       unsigned count,
-                       ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-static INLINE void 
-translate_ubyte_elts( const ubyte *src,
-                      unsigned count,
-                      ushort *dest )
-{
-   unsigned i;
-   for (i = 0; i < count; i++) 
-      dest[i] = (ushort)(src[i]);
-}
-
-
-
-
-#if 0
-static INLINE enum pipe_format 
-format_from_get_elt( pt_elt_func get_elt )
-{
-   switch (draw->pt.user.eltSize) {
-   case 1: return PIPE_FORMAT_R8_UNORM;
-   case 2: return PIPE_FORMAT_R16_UNORM;
-   case 4: return PIPE_FORMAT_R32_UNORM;
-   default: return PIPE_FORMAT_NONE;
-   }
-}
-#endif
-
-
-/**
- * Check if any vertex attributes use instance divisors.
- * Note that instance divisors complicate vertex fetching so we need
- * to take the vcache path when they're in use.
- */
-static boolean
-any_instance_divisors(const struct draw_context *draw)
-{
-   uint i;
-
-   for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
-      uint div = draw->pt.vertex_element[i].instance_divisor;
-      if (div)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-
-static INLINE void 
-vcache_check_run( struct draw_pt_front_end *frontend, 
-                  pt_elt_func get_elt,
-                  const void *elts,
-                  int elt_bias,
-                  unsigned draw_count )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; 
-   struct draw_context *draw = vcache->draw;
-   const unsigned min_index = draw->pt.user.min_index;
-   const unsigned max_index = draw->pt.user.max_index;
-   const unsigned index_size = draw->pt.user.eltSize;
-   unsigned fetch_count;
-   const ushort *transformed_elts;
-   ushort *storage = NULL;
-   boolean ok = FALSE;
-
-   /* debug: verify indexes are in range [min_index, max_index] */
-   if (0) {
-      unsigned i;
-      for (i = 0; i < draw_count; i++) {
-         if (index_size == 1) {
-            assert( ((const ubyte *) elts)[i] >= min_index);
-            assert( ((const ubyte *) elts)[i] <= max_index);
-         }
-         else if (index_size == 2) {
-            assert( ((const ushort *) elts)[i] >= min_index);
-            assert( ((const ushort *) elts)[i] <= max_index);
-         }
-         else {
-            assert(index_size == 4);
-            assert( ((const uint *) elts)[i] >= min_index);
-            assert( ((const uint *) elts)[i] <= max_index);
-         }
-      }
-   }
-
-   /* Note: max_index is frequently 0xffffffff so we have to be sure
-    * that any arithmetic involving max_index doesn't overflow!
-    */
-   if (max_index >= (unsigned) DRAW_PIPE_MAX_VERTICES)
-      goto fail;
-
-   if (any_instance_divisors(draw))
-      goto fail;
-
-   fetch_count = max_index + 1 - min_index;
-
-   if (0)
-      debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, 
-                   vcache->fetch_max,
-                   draw_count);
-
-   if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES ||
-       fetch_count >= UNDEFINED_VERTEX_ID ||
-       fetch_count > draw_count) {
-      if (0) debug_printf("fail\n");
-      goto fail;
-   }
-
-   if (vcache->middle_prim != vcache->input_prim) {
-      vcache->middle_prim = vcache->input_prim;
-      vcache->middle->prepare( vcache->middle,
-                               vcache->middle_prim,
-                               vcache->opt,
-                               &vcache->fetch_max );
-   }
-
-   assert((elt_bias >= 0 && min_index + elt_bias >= min_index) ||
-          (elt_bias <  0 && min_index + elt_bias <  min_index));
-
-   if (min_index == 0 &&
-       index_size == 2) {
-      transformed_elts = (const ushort *)elts;
-   }
-   else {
-      storage = MALLOC( draw_count * sizeof(ushort) );
-      if (!storage)
-         goto fail;
-      
-      if (min_index == 0) {
-         switch(index_size) {
-         case 1:
-            translate_ubyte_elts( (const ubyte *)elts,
-                                  draw_count,
-                                  storage );
-            break;
-
-         case 2:
-            translate_ushort_elts( (const ushort *)elts,
-                                   draw_count,
-                                   storage );
-            break;
-
-         case 4:
-            translate_uint_elts( (const uint *)elts,
-                                 draw_count,
-                                 storage );
-            break;
-
-         default:
-            assert(0);
-            FREE(storage);
-            return;
-         }
-      }
-      else {
-         switch(index_size) {
-         case 1:
-            rebase_ubyte_elts( (const ubyte *)elts,
-                               draw_count,
-                               0 - (int)min_index,
-                               storage );
-            break;
-
-         case 2:
-            rebase_ushort_elts( (const ushort *)elts,
-                                draw_count,
-                                0 - (int)min_index,
-                                storage );
-            break;
-
-         case 4:
-            rebase_uint_elts( (const uint *)elts,
-                              draw_count,
-                              0 - (int)min_index,
-                              storage );
-            break;
-
-         default:
-            assert(0);
-            FREE(storage);
-            return;
-         }
-      }
-      transformed_elts = storage;
-   }
-
-   if (fetch_count < UNDEFINED_VERTEX_ID)
-      ok = vcache->middle->run_linear_elts( vcache->middle,
-                                            min_index + elt_bias, /* start */
-                                            fetch_count,
-                                            transformed_elts,
-                                            draw_count );
-   
-   FREE(storage);
-
-   if (ok)
-      return;
-
-   debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n",
-                fetch_count, draw_count);
-
-fail:
-   vcache_run( frontend, get_elt, elts, elt_bias, draw_count );
-}
-
-
-
-
-static void
-vcache_prepare( struct draw_pt_front_end *frontend,
-                unsigned in_prim,
-                struct draw_pt_middle_end *middle,
-                unsigned opt )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-
-   if (opt & PT_PIPELINE) {
-      vcache->base.run = vcache_run_extras;
-   }
-   else {
-      vcache->base.run = vcache_check_run;
-   }
-
-   /* VCache will always emit the reduced version of its input
-    * primitive, ie STRIP/FANS become TRIS, etc.
-    *
-    * This is not to be confused with what the GS might be up to,
-    * which is a separate issue.
-    */
-   vcache->input_prim = in_prim;
-   switch (in_prim) {
-   case PIPE_PRIM_LINES_ADJACENCY:
-   case PIPE_PRIM_LINE_STRIP_ADJACENCY:
-      vcache->output_prim = PIPE_PRIM_LINES_ADJACENCY;
-      break;
-   case PIPE_PRIM_TRIANGLES_ADJACENCY:
-   case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
-      vcache->output_prim = PIPE_PRIM_TRIANGLES_ADJACENCY;
-      break;
-   default:
-      vcache->output_prim = u_reduced_prim(in_prim);
-   }
-
-   vcache->middle = middle;
-   vcache->opt = opt;
-
-   /* Have to run prepare here, but try and guess a good prim for
-    * doing so:
-    */
-   vcache->middle_prim = (opt & PT_PIPELINE)
-      ? vcache->output_prim : vcache->input_prim;
-
-   middle->prepare( middle,
-                    vcache->middle_prim,
-                    opt, &vcache->fetch_max );
-}
-
-
-static void 
-vcache_finish( struct draw_pt_front_end *frontend )
-{
-   struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
-   vcache->middle->finish( vcache->middle );
-   vcache->middle = NULL;
-}
-
-
-static void 
-vcache_destroy( struct draw_pt_front_end *frontend )
-{
-   FREE(frontend);
-}
-
-
-struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw )
-{
-   struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend );
-   if (vcache == NULL)
-      return NULL;
- 
-   vcache->base.prepare = vcache_prepare;
-   vcache->base.run     = NULL;
-   vcache->base.finish  = vcache_finish;
-   vcache->base.destroy = vcache_destroy;
-   vcache->draw = draw;
-   
-   memset(vcache->in, ~0, sizeof(vcache->in));
-  
-   return &vcache->base;
-}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
deleted file mode 100644
index 1a3748d5f0b..00000000000
--- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#define FUNC_VARS                      \
-   struct draw_pt_front_end *frontend, \
-   pt_elt_func get_elt,                \
-   const void *elts,                   \
-   int elt_bias,                       \
-   unsigned count
-
-#define LOCAL_VARS \
-   struct vcache_frontend *vcache = (struct vcache_frontend *) frontend;   \
-   struct draw_context *draw = vcache->draw;                               \
-   const unsigned prim = vcache->input_prim;                               \
-   const boolean last_vertex_last = !(draw->rasterizer->flatshade &&       \
-                                      draw->rasterizer->flatshade_first);
-
-#define GET_ELT(idx) (get_elt(elts, idx) + elt_bias)
-
-#define FUNC_EXIT do { vcache_flush(vcache); } while (0)
-
-#include "draw_decompose_tmp.h"
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
new file mode 100644
index 00000000000..a6875253094
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c
@@ -0,0 +1,208 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_pt.h"
+
+#define SEGMENT_SIZE 1024
+#define MAP_SIZE     256
+
+struct vsplit_frontend {
+   struct draw_pt_front_end base;   /* first member: front-end pointers are cast back to this struct */
+   struct draw_context *draw;
+
+   unsigned prim;                   /* input primitive recorded by vsplit_prepare() */
+
+   struct draw_pt_middle_end *middle;   /* set in vsplit_prepare(), reset to NULL in vsplit_finish() */
+
+   unsigned max_vertices;           /* passed by address to middle->prepare() */
+   ushort segment_size;             /* MIN2(SEGMENT_SIZE, max_vertices) */
+
+   /* buffers for splitting */
+   unsigned fetch_elts[SEGMENT_SIZE];         /* fetch elements handed to middle->run() */
+   ushort draw_elts[SEGMENT_SIZE];            /* draw elements: positions within fetch_elts */
+   ushort identity_draw_elts[SEGMENT_SIZE];   /* identity_draw_elts[i] == i, filled once in draw_pt_vsplit() */
+
+   struct {
+      /* map a fetch element to a draw element; the slot is (fetch % MAP_SIZE) */
+      unsigned fetches[MAP_SIZE];   /* fetch element currently held by each slot */
+      ushort draws[MAP_SIZE];       /* position of that element in fetch_elts */
+      boolean has_max_fetch;        /* TRUE once 0xffffffff was added, see vsplit_add_cache_uint() */
+
+      ushort num_fetch_elts;        /* used entries of fetch_elts */
+      ushort num_draw_elts;         /* used entries of draw_elts */
+   } cache;
+};
+
+
+static void
+vsplit_clear_cache(struct vsplit_frontend *vsplit)
+{  /* Reset the fetch-to-draw cache and both element counters. */
+   memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches));  /* every slot becomes all-ones */
+   vsplit->cache.has_max_fetch = FALSE;
+   vsplit->cache.num_fetch_elts = 0;
+   vsplit->cache.num_draw_elts = 0;
+}
+
+static void
+vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags)
+{  /* Hand the accumulated fetch and draw elements to the middle end. */
+   vsplit->middle->run(vsplit->middle,
+         vsplit->fetch_elts, vsplit->cache.num_fetch_elts,
+         vsplit->draw_elts, vsplit->cache.num_draw_elts, flags);  /* flags are passed through unchanged */
+}
+
+/**
+ * Append a draw element for fetch, adding fetch to fetch_elts only on a cache miss.
+ */
+static INLINE void
+vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch)
+{
+   unsigned hash = fetch % MAP_SIZE;
+
+   if (vsplit->cache.fetches[hash] != fetch) {
+      /* miss: the slot now maps fetch to the next free fetch_elts position */
+      vsplit->cache.fetches[hash] = fetch;
+      vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts;
+
+      /* add fetch */
+      assert(vsplit->cache.num_fetch_elts < vsplit->segment_size);
+      vsplit->fetch_elts[vsplit->cache.num_fetch_elts++] = fetch;
+   }
+
+   vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[hash];  /* hit or miss, emit the slot's position */
+}
+
+
+/**
+ * Like vsplit_add_cache(), for full-range (uint) fetch elements: the first
+ * 0xffffffff since the cache was cleared is forced to be a cache miss.
+ */
+static INLINE void
+vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch)
+{
+   /* special care for 0xffffffff: presumably it equals the 0xff-filled pattern of a cleared slot */
+   if (fetch == 0xffffffff && !vsplit->cache.has_max_fetch) {
+      unsigned hash = fetch % MAP_SIZE;
+      vsplit->cache.fetches[hash] = fetch - 1; /* force update */
+      vsplit->cache.has_max_fetch = TRUE;      /* later 0xffffffff adds skip this block */
+   }
+
+   vsplit_add_cache(vsplit, fetch);
+}
+
+
+#define FUNC vsplit_run_linear   /* ELT_TYPE undefined: the template takes its #else (linear) branch */
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_ubyte
+#define ELT_TYPE ubyte
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"   /* the template #undefs ELT_TYPE and ADD_CACHE at its end */
+
+#define FUNC vsplit_run_ushort
+#define ELT_TYPE ushort
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch)
+#include "draw_pt_vsplit_tmp.h"
+
+#define FUNC vsplit_run_uint
+#define ELT_TYPE uint
+#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch)   /* uint elements may be 0xffffffff */
+#include "draw_pt_vsplit_tmp.h"
+
+
+static void vsplit_prepare(struct draw_pt_front_end *frontend,
+                           unsigned in_prim,
+                           struct draw_pt_middle_end *middle,
+                           unsigned opt)
+{  /* Pick the run function for the current element size and prepare the middle end. */
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;
+
+   switch (vsplit->draw->pt.user.eltSize) {  /* presumably the index size in bytes, 0 = no index buffer; confirm */
+   case 0:
+      vsplit->base.run = vsplit_run_linear;
+      break;
+   case 1:
+      vsplit->base.run = vsplit_run_ubyte;
+      break;
+   case 2:
+      vsplit->base.run = vsplit_run_ushort;
+      break;
+   case 4:
+      vsplit->base.run = vsplit_run_uint;
+      break;
+   default:
+      assert(0);   /* base.run keeps its previous value on this path */
+      break;
+   }
+
+   /* split only: the input primitive is passed on unchanged */
+   vsplit->prim = in_prim;
+
+   vsplit->middle = middle;
+   middle->prepare(middle, vsplit->prim, opt, &vsplit->max_vertices);
+
+   vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices);  /* never larger than the SEGMENT_SIZE buffers */
+}
+
+
+static void vsplit_finish(struct draw_pt_front_end *frontend)
+{  /* Finish the middle end stored by vsplit_prepare() and forget it. */
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;
+   vsplit->middle->finish(vsplit->middle);
+   vsplit->middle = NULL;
+}
+
+
+static void vsplit_destroy(struct draw_pt_front_end *frontend)
+{  /* Release the front end allocated by draw_pt_vsplit(). */
+   FREE(frontend);  /* frontend is &vsplit->base, the first member of the allocation */
+}
+
+
+struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw)
+{  /* Create a vsplit front end for draw; returns NULL when the allocation fails. */
+   struct vsplit_frontend *vsplit = CALLOC_STRUCT(vsplit_frontend);
+   ushort i;
+
+   if (!vsplit)
+      return NULL;
+
+   vsplit->base.prepare = vsplit_prepare;
+   vsplit->base.run     = NULL;   /* assigned in vsplit_prepare() */
+   vsplit->base.finish  = vsplit_finish;
+   vsplit->base.destroy = vsplit_destroy;
+   vsplit->draw = draw;
+
+   for (i = 0; i < SEGMENT_SIZE; i++)
+      vsplit->identity_draw_elts[i] = i;   /* draw element i refers to fetch element i */
+
+   return &vsplit->base;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
new file mode 100644
index 00000000000..4bb57b1493f
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h
@@ -0,0 +1,307 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#define CONCAT2(name, elt_type) name ## elt_type
+#define CONCAT(name, elt_type) CONCAT2(name, elt_type)
+
+#ifdef ELT_TYPE
+
+/**
+ * Fetch all elements in [min_index, max_index] with bias, and use the
+ * (rebased) index buffer as the draw elements.  FALSE: caller must split.
+ */
+static boolean
+CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                    unsigned istart, unsigned icount)
+{
+   struct draw_context *draw = vsplit->draw;
+   const ELT_TYPE *ib = (const ELT_TYPE *) draw->pt.user.elts;
+   const unsigned min_index = draw->pt.user.min_index;
+   const unsigned max_index = draw->pt.user.max_index;
+   const int elt_bias = draw->pt.user.eltBias;
+   unsigned fetch_start, fetch_count;
+   const ushort *draw_elts = NULL;
+   unsigned i;
+
+   /* use the ib directly: no rebasing needed and the elements are ushort-sized */
+   if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) {
+      if (icount > vsplit->max_vertices)
+         return FALSE;
+
+      for (i = 0; i < icount; i++) {
+         ELT_TYPE idx = ib[istart + i];
+         assert(idx >= min_index && idx <= max_index);
+      }
+      draw_elts = (const ushort *) (ib + istart);  /* the primitive starts at ib[istart], not ib[0] */
+   }
+   else {
+      /* have to go through vsplit->draw_elts */
+      if (icount > vsplit->segment_size)
+         return FALSE;
+   }
+
+   /* this is faster only when we fetch fewer elements than the normal path */
+   if (max_index - min_index > icount - 1)
+      return FALSE;
+
+   if (elt_bias < 0 && min_index < -elt_bias)  /* fetch_start would go below zero */
+      return FALSE;
+
+   /* why this check? */
+   for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+      if (draw->pt.vertex_element[i].instance_divisor)
+         return FALSE;
+   }
+
+   fetch_start = min_index + elt_bias;
+   fetch_count = max_index - min_index + 1;
+
+   if (!draw_elts) {
+      if (min_index == 0) {
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = ib[istart + i];
+
+            assert(idx >= min_index && idx <= max_index);
+            vsplit->draw_elts[i] = (ushort) idx;
+         }
+      }
+      else {
+         for (i = 0; i < icount; i++) {
+            ELT_TYPE idx = ib[istart + i];
+
+            assert(idx >= min_index && idx <= max_index);
+            vsplit->draw_elts[i] = (ushort) (idx - min_index);  /* rebase so min_index maps to 0 */
+         }
+      }
+
+      draw_elts = vsplit->draw_elts;
+   }
+
+   return vsplit->middle->run_linear_elts(vsplit->middle,
+                                          fetch_start, fetch_count,
+                                          draw_elts, icount, 0x0);
+}
+
+/**
+ * Use the cache to prepare the fetch and draw elements, and flush.
+ *
+ * When spoken is TRUE, ib[ispoken] replaces the first element (ib[istart]);
+ * when close is TRUE, ib[iclose] is appended after the icount elements.
+ */
+static INLINE void
+CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                        unsigned flags,
+                                        unsigned istart, unsigned icount,
+                                        boolean spoken, unsigned ispoken,
+                                        boolean close, unsigned iclose)
+{
+   struct draw_context *draw = vsplit->draw;
+   const ELT_TYPE *ib = (const ELT_TYPE *) draw->pt.user.elts;
+   const int ibias = draw->pt.user.eltBias;
+   unsigned i;
+
+   assert(icount + !!close <= vsplit->segment_size);
+
+   vsplit_clear_cache(vsplit);
+
+   spoken = !!spoken;   /* normalized to 0 or 1: also the first loop index below */
+   if (ibias == 0) {
+      if (spoken)
+         ADD_CACHE(vsplit, ib[ispoken]);
+
+      for (i = spoken; i < icount; i++)
+         ADD_CACHE(vsplit, ib[istart + i]);
+
+      if (close)
+         ADD_CACHE(vsplit, ib[iclose]);
+   }
+   else if (ibias > 0) {
+      if (spoken)
+         ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias);
+
+      for (i = spoken; i < icount; i++)
+         ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias);
+
+      if (close)
+         ADD_CACHE(vsplit, (uint) ib[iclose] + ibias);
+   }
+   else {
+      if (spoken) {
+         if (ib[ispoken] < -ibias)   /* biased index would be negative */
+            return;                  /* the segment is dropped without flushing */
+         ADD_CACHE(vsplit, ib[ispoken] + ibias);
+      }
+
+      for (i = spoken; i < icount; i++) {
+         if (ib[istart + i] < -ibias)
+            return;
+         ADD_CACHE(vsplit, ib[istart + i] + ibias);
+      }
+
+      if (close) {
+         if (ib[iclose] < -ibias)
+            return;
+         ADD_CACHE(vsplit, ib[iclose] + ibias);
+      }
+   }
+
+   vsplit_flush_cache(vsplit, flags);
+}
+
+static void
+CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                         unsigned flags,
+                                         unsigned istart,
+                                         unsigned icount)
+{  /* Flush ib[istart .. istart + icount - 1] with no replaced or appended element. */
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit,
+         flags, istart, icount, FALSE, 0, FALSE, 0);
+}
+
+static void
+CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                       unsigned flags,
+                                       unsigned istart,
+                                       unsigned icount,
+                                       unsigned i0)
+{  /* Flush one segment; ib[i0] is appended when flags is exactly DRAW_SPLIT_BEFORE. */
+   const boolean close_loop = ((flags) == DRAW_SPLIT_BEFORE);  /* split before this segment, none after */
+
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit,
+         flags, istart, icount, FALSE, 0, close_loop, i0);
+}
+
+static void
+CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit,
+                                      unsigned flags,
+                                      unsigned istart,
+                                      unsigned icount,
+                                      unsigned i0)
+{  /* Flush one segment; ib[i0] replaces its first element when DRAW_SPLIT_BEFORE is set. */
+   const boolean use_spoken = (((flags) & DRAW_SPLIT_BEFORE) != 0);
+
+   CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit,
+         flags, istart, icount, use_spoken, i0, FALSE, 0);
+}
+
+#define LOCAL_VARS                                                         \
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;   \
+   const unsigned prim = vsplit->prim;                                     \
+   const unsigned max_count_simple = vsplit->segment_size;                 \
+   const unsigned max_count_loop = vsplit->segment_size - 1;               \
+   const unsigned max_count_fan = vsplit->segment_size;
+
+#define PRIMITIVE(istart, icount)   \
+   CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount)
+
+#else /* ELT_TYPE */
+
+static void
+vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                             unsigned istart, unsigned icount)
+{  /* Pass the range starting at istart with icount vertices straight to the middle end. */
+   assert(icount <= vsplit->max_vertices);
+   vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);
+}
+
+static void
+vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                           unsigned istart, unsigned icount, unsigned i0)
+{  /* Flush one segment; vertex i0 is appended when flags is exactly DRAW_SPLIT_BEFORE. */
+   boolean close_loop = (flags == DRAW_SPLIT_BEFORE);
+   unsigned nr;
+
+   assert(icount + !!close_loop <= vsplit->segment_size);
+
+   if (close_loop) {
+      for (nr = 0; nr < icount; nr++)
+         vsplit->fetch_elts[nr] = istart + nr;
+      vsplit->fetch_elts[nr++] = i0;   /* extra vertex after the linear range */
+
+      vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr,
+            vsplit->identity_draw_elts, nr, flags);  /* draw element k is fetch element k */
+   }
+   else {
+      vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);
+   }
+}
+
+static void
+vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags,
+                          unsigned istart, unsigned icount, unsigned i0)
+{  /* Flush one segment; vertex i0 replaces its first vertex when DRAW_SPLIT_BEFORE is set. */
+   boolean use_spoken = ((flags & DRAW_SPLIT_BEFORE) != 0);
+   unsigned nr = 0, i;
+
+   assert(icount + !!use_spoken <= vsplit->segment_size);
+
+   if (use_spoken) {
+      vsplit->fetch_elts[nr++] = i0;
+      for (i = 1 ; i < icount; i++)   /* starts at 1: vertex istart itself is skipped */
+         vsplit->fetch_elts[nr++] = istart + i;
+
+      vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr,
+            vsplit->identity_draw_elts, nr, flags);  /* draw element k is fetch element k */
+   }
+   else {
+      vsplit->middle->run_linear(vsplit->middle, istart, icount, flags);
+   }
+}
+
+#define LOCAL_VARS                                                         \
+   struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend;   \
+   const unsigned prim = vsplit->prim;                                     \
+   const unsigned max_count_simple = vsplit->max_vertices;                 \
+   const unsigned max_count_loop = vsplit->segment_size - 1;               \
+   const unsigned max_count_fan = vsplit->segment_size;
+
+#define PRIMITIVE(istart, icount) FALSE
+
+#define ELT_TYPE linear
+
+#endif /* ELT_TYPE */
+
+#define FUNC_VARS                      \
+   struct draw_pt_front_end *frontend, \
+   unsigned start,                     \
+   unsigned count
+
+#define SEGMENT_SIMPLE(flags, istart, icount)   \
+   CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount)
+
+#define SEGMENT_LOOP(flags, istart, icount, i0) \
+   CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0)
+
+#define SEGMENT_FAN(flags, istart, icount, i0)  \
+   CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, icount, i0)
+
+#include "draw_split_tmp.h"
+
+#undef CONCAT2
+#undef CONCAT
+
+#undef ELT_TYPE
+#undef ADD_CACHE
diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
index 6d8937a0b41..7fafde9d5e6 100644
--- a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h
@@ -7,11 +7,9 @@
 
 #define FUNC_ENTER                                                \
    /* declare more local vars */                                  \
-   struct draw_context *draw = so->draw;                          \
    const unsigned prim = input_prims->prim;                       \
-   const boolean last_vertex_last =                               \
-      !(draw->rasterizer->flatshade &&                            \
-        draw->rasterizer->flatshade_first);                       \
+   const unsigned prim_flags = input_prims->flags;                \
+   const boolean last_vertex_last = TRUE;                         \
    do {                                                           \
       debug_assert(input_prims->primitive_count == 1);            \
       switch (prim) {                                             \
diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h
new file mode 100644
index 00000000000..47defc62b96
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_split_tmp.h
@@ -0,0 +1,176 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+static void
+FUNC(FUNC_VARS)
+{  /* Draw count vertices from start, as one piece if possible, otherwise as overlapping segments. */
+   unsigned first, incr;
+   LOCAL_VARS
+
+   /*
+    * prim, start, count, and max_count_{simple,loop,fan} must be provided
+    * by the including file through FUNC_VARS and LOCAL_VARS
+    */
+   if (0) {
+      debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, "
+                   "max_count_loop %d, max_count_fan %d\n",
+                   __FUNCTION__, prim, start, count, max_count_simple,
+                   max_count_loop, max_count_fan);
+   }
+
+   draw_pt_split_prim(prim, &first, &incr);
+   /* sanitize primitive length */
+   count = draw_pt_trim_count(count, first, incr);
+   if (count < first)
+      return;
+
+   /* try flushing the entire primitive */
+   if (PRIMITIVE(start, count))
+      return;
+
+   /* must be able to at least flush two complete primitives */
+   assert(max_count_simple >= first + incr &&
+          max_count_loop >= first + incr &&
+          max_count_fan >= first + incr);
+
+   /* no splitting required */
+   if (count <= max_count_simple) {
+      SEGMENT_SIMPLE(0x0, start, count);
+   }
+   else {
+      const unsigned rollback = first - incr;   /* vertices each segment shares with the previous one */
+      unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max;   /* the first segment has no split before it */
+
+      /*
+       * Both count and seg_max below are explicitly trimmed.  Because
+       *
+       *   seg_start = N * (seg_max - rollback) = N' * incr,
+       *
+       * we have
+       *
+       *   remaining = count - seg_start = first + N'' * incr.
+       *
+       * That is, remaining is implicitly trimmed.
+       */
+      switch (prim) {
+      case PIPE_PRIM_POINTS:
+      case PIPE_PRIM_LINES:
+      case PIPE_PRIM_LINE_STRIP:
+      case PIPE_PRIM_TRIANGLES:
+      case PIPE_PRIM_TRIANGLE_STRIP:
+      case PIPE_PRIM_QUADS:
+      case PIPE_PRIM_QUAD_STRIP:
+      case PIPE_PRIM_LINES_ADJACENCY:
+      case PIPE_PRIM_LINE_STRIP_ADJACENCY:
+      case PIPE_PRIM_TRIANGLES_ADJACENCY:
+      case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_simple, count), first, incr);
+         if (prim == PIPE_PRIM_TRIANGLE_STRIP ||
+             prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) {
+            /* make sure we flush even number of triangles at a time */
+            if (seg_max < count && !(((seg_max - first) / incr) & 1))
+               seg_max -= incr;
+         }
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_SIMPLE(flags, start + seg_start, seg_max);
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;   /* every later segment follows a split */
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;   /* last segment: nothing follows it */
+
+               SEGMENT_SIMPLE(flags, start + seg_start, remaining);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      case PIPE_PRIM_LINE_LOOP:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_loop, count), first, incr);
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_LOOP(flags, start + seg_start, seg_max, start);   /* i0 is the first vertex of the loop */
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_LOOP(flags, start + seg_start, remaining, start);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      case PIPE_PRIM_TRIANGLE_FAN:
+      case PIPE_PRIM_POLYGON:
+         seg_max =
+            draw_pt_trim_count(MIN2(max_count_fan, count), first, incr);
+
+         do {
+            const unsigned remaining = count - seg_start;
+
+            if (remaining > seg_max) {
+               SEGMENT_FAN(flags, start + seg_start, seg_max, start);   /* i0 is the first vertex of the fan */
+               seg_start += seg_max - rollback;
+
+               flags |= DRAW_SPLIT_BEFORE;
+            }
+            else {
+               flags &= ~DRAW_SPLIT_AFTER;
+
+               SEGMENT_FAN(flags, start + seg_start, remaining, start);
+               seg_start += remaining;
+            }
+         } while (seg_start < count);
+         break;
+
+      default:
+         assert(0);
+         break;
+      }
+   }
+}
+
+#undef FUNC
+#undef FUNC_VARS
+#undef LOCAL_VARS
+
+#undef PRIMITIVE
+#undef SEGMENT_SIMPLE
+#undef SEGMENT_LOOP
+#undef SEGMENT_FAN
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
index f5f2623e467..7b35dd4bb49 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -59,6 +59,19 @@
 #include "lp_bld_arit.h"
 
 
+/*
+ * XXX: Increasing eliminates some artifacts, but adds others, most
+ * noticeably corruption in the Earth halo in Google Earth.
+ */
+#define RCP_NEWTON_STEPS 0
+
+#define RSQRT_NEWTON_STEPS 0
+
+#define EXP_POLY_DEGREE 3
+
+#define LOG_POLY_DEGREE 5
+
+
 /**
  * Generate min(a, b)
  * No checks for special case values of a or b = 1 or 0 are done.
@@ -72,6 +85,9 @@ lp_build_min_simple(struct lp_build_context *bld,
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    /* TODO: optimize the constant case */
 
    if(type.width * type.length == 128) {
@@ -118,6 +134,9 @@ lp_build_max_simple(struct lp_build_context *bld,
    const char *intrinsic = NULL;
    LLVMValueRef cond;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    /* TODO: optimize the constant case */
 
    if(type.width * type.length == 128) {
@@ -160,6 +179,8 @@ lp_build_comp(struct lp_build_context *bld,
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+
    if(a == bld->one)
       return bld->zero;
    if(a == bld->zero)
@@ -173,9 +194,15 @@ lp_build_comp(struct lp_build_context *bld,
    }
 
    if(LLVMIsConstant(a))
-      return LLVMConstSub(bld->one, a);
+      if (type.floating)
+          return LLVMConstFSub(bld->one, a);
+      else
+          return LLVMConstSub(bld->one, a);
    else
-      return LLVMBuildSub(bld->builder, bld->one, a, "");
+      if (type.floating)
+         return LLVMBuildFSub(bld->builder, bld->one, a, "");
+      else
+         return LLVMBuildSub(bld->builder, bld->one, a, "");
 }
 
 
@@ -190,6 +217,9 @@ lp_build_add(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == bld->zero)
       return b;
    if(b == bld->zero)
@@ -217,9 +247,15 @@ lp_build_add(struct lp_build_context *bld,
    }
 
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      res = LLVMConstAdd(a, b);
+      if (type.floating)
+         res = LLVMConstFAdd(a, b);
+      else
+         res = LLVMConstAdd(a, b);
    else
-      res = LLVMBuildAdd(bld->builder, a, b, "");
+      if (type.floating)
+         res = LLVMBuildFAdd(bld->builder, a, b, "");
+      else
+         res = LLVMBuildAdd(bld->builder, a, b, "");
 
    /* clamp to ceiling of 1.0 */
    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
@@ -240,6 +276,8 @@ lp_build_sum_vector(struct lp_build_context *bld,
    LLVMValueRef index, res;
    unsigned i;
 
+   assert(lp_check_value(type, a));
+
    if (a == bld->zero)
       return bld->zero;
    if (a == bld->undef)
@@ -253,9 +291,16 @@ lp_build_sum_vector(struct lp_build_context *bld,
 
    for (i = 1; i < type.length; i++) {
       index = LLVMConstInt(LLVMInt32Type(), i, 0);
-      res = LLVMBuildAdd(bld->builder, res,
-                         LLVMBuildExtractElement(bld->builder, a, index, ""),
-                         "");
+      if (type.floating)
+         res = LLVMBuildFAdd(bld->builder, res,
+                            LLVMBuildExtractElement(bld->builder,
+                                                    a, index, ""),
+                            "");
+      else
+         res = LLVMBuildAdd(bld->builder, res,
+                            LLVMBuildExtractElement(bld->builder,
+                                                    a, index, ""),
+                            "");
    }
 
    return res;
@@ -273,6 +318,9 @@ lp_build_sub(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(b == bld->zero)
       return a;
    if(a == bld->undef || b == bld->undef)
@@ -300,9 +348,15 @@ lp_build_sub(struct lp_build_context *bld,
    }
 
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      res = LLVMConstSub(a, b);
+      if (type.floating)
+         res = LLVMConstFSub(a, b);
+      else
+         res = LLVMConstSub(a, b);
    else
-      res = LLVMBuildSub(bld->builder, a, b, "");
+      if (type.floating)
+         res = LLVMBuildFSub(bld->builder, a, b, "");
+      else
+         res = LLVMBuildSub(bld->builder, a, b, "");
 
    if(bld->type.norm && (bld->type.floating || bld->type.fixed))
       res = lp_build_max_simple(bld, res, bld->zero);
@@ -360,6 +414,10 @@ lp_build_mul_u8n(LLVMBuilderRef builder,
    LLVMValueRef c8;
    LLVMValueRef ab;
 
+   assert(!i16_type.floating);
+   assert(lp_check_value(i16_type, a));
+   assert(lp_check_value(i16_type, b));
+
    c8 = lp_build_const_int_vec(i16_type, 8);
    
 #if 0
@@ -395,6 +453,9 @@ lp_build_mul(struct lp_build_context *bld,
    LLVMValueRef shift;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == bld->zero)
       return bld->zero;
    if(a == bld->one)
@@ -433,7 +494,10 @@ lp_build_mul(struct lp_build_context *bld,
       shift = NULL;
 
    if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
-      res =  LLVMConstMul(a, b);
+      if (type.floating)
+         res = LLVMConstFMul(a, b);
+      else
+         res = LLVMConstMul(a, b);
       if(shift) {
          if(type.sign)
             res = LLVMConstAShr(res, shift);
@@ -442,7 +506,10 @@ lp_build_mul(struct lp_build_context *bld,
       }
    }
    else {
-      res = LLVMBuildMul(bld->builder, a, b, "");
+      if (type.floating)
+         res = LLVMBuildFMul(bld->builder, a, b, "");
+      else
+         res = LLVMBuildMul(bld->builder, a, b, "");
       if(shift) {
          if(type.sign)
             res = LLVMBuildAShr(bld->builder, res, shift, "");
@@ -465,6 +532,8 @@ lp_build_mul_imm(struct lp_build_context *bld,
 {
    LLVMValueRef factor;
 
+   assert(lp_check_value(bld->type, a));
+
    if(b == 0)
       return bld->zero;
 
@@ -472,7 +541,7 @@ lp_build_mul_imm(struct lp_build_context *bld,
       return a;
 
    if(b == -1)
-      return LLVMBuildNeg(bld->builder, a, "");
+      return lp_build_negate(bld, a);
 
    if(b == 2 && bld->type.floating)
       return lp_build_add(bld, a, a);
@@ -518,6 +587,9 @@ lp_build_div(struct lp_build_context *bld,
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == bld->zero)
       return bld->zero;
    if(a == bld->one)
@@ -529,13 +601,24 @@ lp_build_div(struct lp_build_context *bld,
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
-   if(LLVMIsConstant(a) && LLVMIsConstant(b))
-      return LLVMConstFDiv(a, b);
+   if(LLVMIsConstant(a) && LLVMIsConstant(b)) {
+      if (type.floating)
+         return LLVMConstFDiv(a, b);
+      else if (type.sign)
+         return LLVMConstSDiv(a, b);
+      else
+         return LLVMConstUDiv(a, b);
+   }
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+   if(type.floating && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) /* rcp is float-only */
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
 
-   return LLVMBuildFDiv(bld->builder, a, b, "");
+   if (type.floating)
+      return LLVMBuildFDiv(bld->builder, a, b, "");
+   else if (type.sign)
+      return LLVMBuildSDiv(bld->builder, a, b, "");
+   else
+      return LLVMBuildUDiv(bld->builder, a, b, "");
 }
 
 
@@ -555,6 +638,10 @@ lp_build_lerp(struct lp_build_context *bld,
    LLVMValueRef delta;
    LLVMValueRef res;
 
+   assert(lp_check_value(bld->type, x));
+   assert(lp_check_value(bld->type, v0));
+   assert(lp_check_value(bld->type, v1));
+
    delta = lp_build_sub(bld, v1, v0);
 
    res = lp_build_mul(bld, x, delta);
@@ -596,6 +683,9 @@ lp_build_min(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, b));
+
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
@@ -624,6 +714,9 @@ lp_build_max(struct lp_build_context *bld,
              LLVMValueRef a,
              LLVMValueRef b)
 {
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, b));
+
    if(a == bld->undef || b == bld->undef)
       return bld->undef;
 
@@ -653,6 +746,10 @@ lp_build_clamp(struct lp_build_context *bld,
                LLVMValueRef min,
                LLVMValueRef max)
 {
+   assert(lp_check_value(bld->type, a));
+   assert(lp_check_value(bld->type, min));
+   assert(lp_check_value(bld->type, max));
+
    a = lp_build_min(bld, a, max);
    a = lp_build_max(bld, a, min);
    return a;
@@ -669,6 +766,8 @@ lp_build_abs(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
    LLVMTypeRef vec_type = lp_build_vec_type(type);
 
+   assert(lp_check_value(type, a));
+
    if(!type.sign)
       return a;
 
@@ -702,7 +801,16 @@ LLVMValueRef
 lp_build_negate(struct lp_build_context *bld,
                 LLVMValueRef a)
 {
-   return LLVMBuildNeg(bld->builder, a, "");
+   assert(lp_check_value(bld->type, a));
+
+#if HAVE_LLVM >= 0x0207
+   /* LLVM >= 2.7: floating-point values get the dedicated FNeg */
+   if (bld->type.floating)
+      return LLVMBuildFNeg(bld->builder, a, "");
+#endif
+
+   /* integer values, or any value when built against older LLVM */
+   return LLVMBuildNeg(bld->builder, a, "");
 }
 
 
@@ -715,6 +823,8 @@ lp_build_sgn(struct lp_build_context *bld,
    LLVMValueRef cond;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+
    /* Handle non-zero case */
    if(!type.sign) {
       /* if not zero then sign must be positive */
@@ -773,6 +883,7 @@ lp_build_set_sign(struct lp_build_context *bld,
    LLVMValueRef val, res;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
    /* val = reinterpret_cast<int>(a) */
    val = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
@@ -1021,7 +1132,7 @@ lp_build_iround(struct lp_build_context *bld,
       half = LLVMBuildOr(bld->builder, sign, half, "");
       half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
 
-      res = LLVMBuildAdd(bld->builder, a, half, "");
+      res = LLVMBuildFAdd(bld->builder, a, half, "");
    }
 
    res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
@@ -1070,7 +1181,7 @@ lp_build_ifloor(struct lp_build_context *bld,
       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
       offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset");
 
-      res = LLVMBuildAdd(bld->builder, a, offset, "ifloor.res");
+      res = LLVMBuildFAdd(bld->builder, a, offset, "ifloor.res");
    }
 
    /* round to nearest (toward zero) */
@@ -1120,7 +1231,7 @@ lp_build_iceil(struct lp_build_context *bld,
       offset = LLVMBuildAnd(bld->builder, offset, sign, "");
       offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset");
 
-      res = LLVMBuildAdd(bld->builder, a, offset, "iceil.res");
+      res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res");
    }
 
    /* round to nearest (toward zero) */
@@ -1138,6 +1249,8 @@ lp_build_sqrt(struct lp_build_context *bld,
    LLVMTypeRef vec_type = lp_build_vec_type(type);
    char intrinsic[32];
 
+   assert(lp_check_value(type, a));
+
    /* TODO: optimize the constant case */
    /* TODO: optimize the constant case */
 
@@ -1148,12 +1261,39 @@ lp_build_sqrt(struct lp_build_context *bld,
 }
 
 
+/**
+ * Do one Newton-Raphson step to improve reciprocal precision, where x_i is
+ * the current estimate (rcp_a) of 1/a:
+ *   x_{i+1} = x_i * (2 - a * x_i)
+ *
+ * See also:
+ * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division
+ * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+static INLINE LLVMValueRef
+lp_build_rcp_refine(struct lp_build_context *bld,
+                    LLVMValueRef a,
+                    LLVMValueRef rcp_a)
+{
+   LLVMBuilderRef builder = bld->builder;
+   LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
+   LLVMValueRef a_x, correction;
+
+   a_x = LLVMBuildFMul(builder, a, rcp_a, "");         /* a * x_i */
+   correction = LLVMBuildFSub(builder, two, a_x, "");  /* 2 - a * x_i */
+
+   return LLVMBuildFMul(builder, rcp_a, correction, "");
+}
+
+
 LLVMValueRef
 lp_build_rcp(struct lp_build_context *bld,
              LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+
    if(a == bld->zero)
       return bld->undef;
    if(a == bld->one)
@@ -1167,32 +1307,16 @@ lp_build_rcp(struct lp_build_context *bld,
       return LLVMConstFDiv(bld->one, a);
 
    if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
-      /*
-       * XXX: Added precision is not always necessary, so only enable this
-       * when we have a better system in place to track minimum precision.
-       */
-
-#if 0
-      /*
-       * Do one Newton-Raphson step to improve precision:
-       *
-       *   x1 = (2 - a * rcp(a)) * rcp(a)
-       */
-
-      LLVMValueRef two = lp_build_const_vec(bld->type, 2.0);
-      LLVMValueRef rcp_a;
       LLVMValueRef res;
+      unsigned i;
 
-      rcp_a = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
+      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a);
 
-      res = LLVMBuildMul(bld->builder, a, rcp_a, "");
-      res = LLVMBuildSub(bld->builder, two, res, "");
-      res = LLVMBuildMul(bld->builder, res, rcp_a, "");
+      for (i = 0; i < RCP_NEWTON_STEPS; ++i) {
+         res = lp_build_rcp_refine(bld, a, res);
+      }
 
-      return rcp_a;
-#else
-      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
+      return res;
    }
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
@@ -1200,6 +1324,33 @@ lp_build_rcp(struct lp_build_context *bld,
 
 
 /**
+ * Do one Newton-Raphson step to improve rsqrt precision, where x_i is the
+ * current estimate (rsqrt_a) of 1/sqrt(a):
+ *   x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i)
+ *
+ * See also:
+ * - http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+static INLINE LLVMValueRef
+lp_build_rsqrt_refine(struct lp_build_context *bld,
+                      LLVMValueRef a,
+                      LLVMValueRef rsqrt_a)
+{
+   LLVMBuilderRef builder = bld->builder;
+   LLVMValueRef half = lp_build_const_vec(bld->type, 0.5);
+   LLVMValueRef three = lp_build_const_vec(bld->type, 3.0);
+   LLVMValueRef x_sq, a_x_sq, correction, scaled;
+
+   x_sq = LLVMBuildFMul(builder, rsqrt_a, rsqrt_a, "");        /* x_i^2 */
+   a_x_sq = LLVMBuildFMul(builder, a, x_sq, "");               /* a * x_i^2 */
+   correction = LLVMBuildFSub(builder, three, a_x_sq, "");     /* 3 - a * x_i^2 */
+   scaled = LLVMBuildFMul(builder, rsqrt_a, correction, "");   /* x_i * (...) */
+
+   return LLVMBuildFMul(builder, half, scaled, "");
+}
+
+
+/**
  * Generate 1/sqrt(a)
  */
 LLVMValueRef
@@ -1208,10 +1359,22 @@ lp_build_rsqrt(struct lp_build_context *bld,
 {
    const struct lp_type type = bld->type;
 
+   assert(lp_check_value(type, a));
+
    assert(type.floating);
 
-   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
-      return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) {
+      LLVMValueRef res;
+      unsigned i;
+
+      res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a);
+
+      for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) {
+         res = lp_build_rsqrt_refine(bld, a, res);
+      }
+
+      return res;
+   }
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
@@ -1270,7 +1433,7 @@ lp_build_sin(struct lp_build_context *bld,
     */
    
    LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
-   LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
+   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
 
    /*
     * store the integer part of y in mm0
@@ -1344,9 +1507,9 @@ lp_build_sin(struct lp_build_context *bld,
     * xmm2 = _mm_mul_ps(y, xmm2);
     * xmm3 = _mm_mul_ps(y, xmm3);
     */
-   LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
-   LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
-   LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
+   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
+   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
+   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
 
    /*
     * x = _mm_add_ps(x, xmm1);
@@ -1354,16 +1517,16 @@ lp_build_sin(struct lp_build_context *bld,
     * x = _mm_add_ps(x, xmm3);
     */ 
 
-   LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
-   LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
-   LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
+   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
+   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
+   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
 
    /*
     * Evaluate the first polynom  (0 <= x <= Pi/4)
     *
     * z = _mm_mul_ps(x,x);
     */
-   LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
+   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
 
    /*
     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
@@ -1378,12 +1541,12 @@ lp_build_sin(struct lp_build_context *bld,
     * y = *(v4sf*)_ps_coscof_p0;
     * y = _mm_mul_ps(y, z);
     */
-   LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
-   LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
-   LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
-   LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
-   LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
-   LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
+   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
+   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
+   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
+   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
+   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
 
 
    /*
@@ -1392,10 +1555,10 @@ lp_build_sin(struct lp_build_context *bld,
     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
     */ 
    LLVMValueRef half = lp_build_const_v4sf(0.5);
-   LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
-   LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
+   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
+   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
    LLVMValueRef one = lp_build_const_v4sf(1.0);
-   LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
+   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
 
    /*
     * _PS_CONST(sincof_p0, -1.9515295891E-4);
@@ -1419,13 +1582,13 @@ lp_build_sin(struct lp_build_context *bld,
     * y2 = _mm_add_ps(y2, x);
     */
 
-   LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
-   LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
-   LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
-   LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
-   LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
-   LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
-   LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
+   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
+   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
+   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
+   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
+   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
+   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
 
    /*
     * select the correct result from the two polynoms
@@ -1481,7 +1644,7 @@ lp_build_cos(struct lp_build_context *bld,
     */
    
    LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516);
-   LLVMValueRef scale_y = LLVMBuildMul(b, x_abs, FOPi, "scale_y");
+   LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y");
 
    /*
     * store the integer part of y in mm0
@@ -1561,9 +1724,9 @@ lp_build_cos(struct lp_build_context *bld,
     * xmm2 = _mm_mul_ps(y, xmm2);
     * xmm3 = _mm_mul_ps(y, xmm3);
     */
-   LLVMValueRef xmm1 = LLVMBuildMul(b, y_2, DP1, "xmm1");
-   LLVMValueRef xmm2 = LLVMBuildMul(b, y_2, DP2, "xmm2");
-   LLVMValueRef xmm3 = LLVMBuildMul(b, y_2, DP3, "xmm3");
+   LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1");
+   LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2");
+   LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3");
 
    /*
     * x = _mm_add_ps(x, xmm1);
@@ -1571,16 +1734,16 @@ lp_build_cos(struct lp_build_context *bld,
     * x = _mm_add_ps(x, xmm3);
     */ 
 
-   LLVMValueRef x_1 = LLVMBuildAdd(b, x_abs, xmm1, "x_1");
-   LLVMValueRef x_2 = LLVMBuildAdd(b, x_1, xmm2, "x_2");
-   LLVMValueRef x_3 = LLVMBuildAdd(b, x_2, xmm3, "x_3");
+   LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1");
+   LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2");
+   LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3");
 
    /*
     * Evaluate the first polynom  (0 <= x <= Pi/4)
     *
     * z = _mm_mul_ps(x,x);
     */
-   LLVMValueRef z = LLVMBuildMul(b, x_3, x_3, "z");
+   LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z");
 
    /*
     * _PS_CONST(coscof_p0,  2.443315711809948E-005);
@@ -1595,12 +1758,12 @@ lp_build_cos(struct lp_build_context *bld,
     * y = *(v4sf*)_ps_coscof_p0;
     * y = _mm_mul_ps(y, z);
     */
-   LLVMValueRef y_3 = LLVMBuildMul(b, z, coscof_p0, "y_3");
-   LLVMValueRef y_4 = LLVMBuildAdd(b, y_3, coscof_p1, "y_4");
-   LLVMValueRef y_5 = LLVMBuildMul(b, y_4, z, "y_5");
-   LLVMValueRef y_6 = LLVMBuildAdd(b, y_5, coscof_p2, "y_6");
-   LLVMValueRef y_7 = LLVMBuildMul(b, y_6, z, "y_7");
-   LLVMValueRef y_8 = LLVMBuildMul(b, y_7, z, "y_8");
+   LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3");
+   LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4");
+   LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5");
+   LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6");
+   LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7");
+   LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8");
 
 
    /*
@@ -1609,10 +1772,10 @@ lp_build_cos(struct lp_build_context *bld,
     * y = _mm_add_ps(y, *(v4sf*)_ps_1);
     */ 
    LLVMValueRef half = lp_build_const_v4sf(0.5);
-   LLVMValueRef tmp = LLVMBuildMul(b, z, half, "tmp");
-   LLVMValueRef y_9 = LLVMBuildSub(b, y_8, tmp, "y_8");
+   LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp");
+   LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8");
    LLVMValueRef one = lp_build_const_v4sf(1.0);
-   LLVMValueRef y_10 = LLVMBuildAdd(b, y_9, one, "y_9");
+   LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9");
 
    /*
     * _PS_CONST(sincof_p0, -1.9515295891E-4);
@@ -1636,13 +1799,13 @@ lp_build_cos(struct lp_build_context *bld,
     * y2 = _mm_add_ps(y2, x);
     */
 
-   LLVMValueRef y2_3 = LLVMBuildMul(b, z, sincof_p0, "y2_3");
-   LLVMValueRef y2_4 = LLVMBuildAdd(b, y2_3, sincof_p1, "y2_4");
-   LLVMValueRef y2_5 = LLVMBuildMul(b, y2_4, z, "y2_5");
-   LLVMValueRef y2_6 = LLVMBuildAdd(b, y2_5, sincof_p2, "y2_6");
-   LLVMValueRef y2_7 = LLVMBuildMul(b, y2_6, z, "y2_7");
-   LLVMValueRef y2_8 = LLVMBuildMul(b, y2_7, x_3, "y2_8");
-   LLVMValueRef y2_9 = LLVMBuildAdd(b, y2_8, x_3, "y2_9");
+   LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3");
+   LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4");
+   LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5");
+   LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6");
+   LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7");
+   LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8");
+   LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9");
 
    /*
     * select the correct result from the two polynoms
@@ -1695,6 +1858,8 @@ lp_build_exp(struct lp_build_context *bld,
    /* log2(e) = 1/log(2) */
    LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634);
 
+   assert(lp_check_value(bld->type, x));
+
-   return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
+   return lp_build_exp2(bld, lp_build_mul(bld, log2e, x)); /* exp(x) = 2^(x * log2(e)) */
 }
 
@@ -1709,14 +1874,12 @@ lp_build_log(struct lp_build_context *bld,
    /* log(2) */
    LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529);
 
+   assert(lp_check_value(bld->type, x));
+
-   return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
+   return lp_build_mul(bld, log2, lp_build_log2(bld, x)); /* ln(x) = ln(2) * log2(x) */
 }
 
 
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-
 /**
  * Generate polynomial.
  * Ex:  coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
@@ -1731,6 +1894,8 @@ lp_build_polynomial(struct lp_build_context *bld,
    LLVMValueRef res = NULL;
    unsigned i;
 
+   assert(lp_check_value(bld->type, x));
+
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x))
       debug_printf("%s: inefficient/imprecise constant arithmetic\n",
@@ -1802,6 +1967,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
    LLVMValueRef expfpart = NULL;
    LLVMValueRef res = NULL;
 
+   assert(lp_check_value(bld->type, x));
+
    if(p_exp2_int_part || p_frac_part || p_exp2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
@@ -1817,7 +1984,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
       ipart = lp_build_floor(bld, x);
 
       /* fpart = x - ipart */
-      fpart = LLVMBuildSub(bld->builder, x, ipart, "");
+      fpart = LLVMBuildFSub(bld->builder, x, ipart, "");
    }
 
    if(p_exp2_int_part || p_exp2) {
@@ -1832,7 +1999,7 @@ lp_build_exp2_approx(struct lp_build_context *bld,
       expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
                                      Elements(lp_build_exp2_polynomial));
 
-      res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
+      res = LLVMBuildFMul(bld->builder, expipart, expfpart, "");
    }
 
    if(p_exp2_int_part)
@@ -1915,6 +2082,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    LLVMValueRef logmant = NULL;
    LLVMValueRef res = NULL;
 
+   assert(lp_check_value(bld->type, x));
+
    if(p_exp || p_floor_log2 || p_log2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
@@ -1945,9 +2114,9 @@ lp_build_log2_approx(struct lp_build_context *bld,
                                     Elements(lp_build_log2_polynomial));
 
       /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-      logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), "");
+      logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), "");
 
-      res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
+      res = LLVMBuildFAdd(bld->builder, logmant, logexp, "");
    }
 
    if(p_exp) {
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 77012f1fac6..8b477313d48 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -117,8 +117,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
    scale = (double)mask/ubound;
    bias = (double)((unsigned long long)1 << (mantissa - n));
 
-   res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), "");
-   res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), "");
+   res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
+   res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
    res = LLVMBuildBitCast(builder, res, int_vec_type, "");
 
    if(dst_width > n) {
@@ -175,6 +175,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
    double scale;
    double bias;
 
+   assert(dst_type.floating);
+
    mantissa = lp_mantissa(dst_type);
 
    n = MIN2(mantissa, src_width);
@@ -199,8 +201,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder,
 
    res = LLVMBuildBitCast(builder, res, vec_type, "");
 
-   res = LLVMBuildSub(builder, res, bias_, "");
-   res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), "");
+   res = LLVMBuildFSub(builder, res, bias_, "");
+   res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), "");
 
    return res;
 }
@@ -296,7 +298,7 @@ lp_build_conv(LLVMBuilderRef builder,
          if (dst_scale != 1.0) {
             LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale);
             for(i = 0; i < num_tmps; ++i)
-               tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
          }
 
          /* Use an equally sized integer for intermediate computations */
@@ -391,7 +393,7 @@ lp_build_conv(LLVMBuilderRef builder,
           if (src_scale != 1.0) {
              LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale);
              for(i = 0; i < num_tmps; ++i)
-                tmp[i] = LLVMBuildMul(builder, tmp[i], scale, "");
+                tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
           }
       }
     }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 0f01fc1d75f..247cb83ce6c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -240,7 +240,7 @@ lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder,
     */
 
    if (normalized)
-      scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), "");
+      scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), "");
    else
       scaled = casted;
 
@@ -322,7 +322,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder,
    }
 
    if (normalized)
-      scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
+      scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), "");
    else
       scaled = unswizzled;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index 9f405921b0a..c724a4453e6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -197,7 +197,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
             if (format_desc->channel[chan].normalized) {
                double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
                LLVMValueRef scale_val = lp_build_const_vec(type, scale);
-               input = LLVMBuildMul(builder, input, scale_val, "");
+               input = LLVMBuildFMul(builder, input, scale_val, "");
             }
          }
          else {
@@ -227,7 +227,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder,
             double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
             LLVMValueRef scale_val = lp_build_const_vec(type, scale);
             input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), "");
-            input = LLVMBuildMul(builder, input, scale_val, "");
+            input = LLVMBuildFMul(builder, input, scale_val, "");
          }
          else {
             /* FIXME */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index ef0888079c7..60d8bcfa55e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -46,7 +46,7 @@ static const struct debug_named_value lp_bld_debug_flags[] = {
    DEBUG_NAMED_VALUE_END
 };
 
-DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, 0);
+DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, 0)
 #endif
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index ab4ddb81c40..7d7db3b0d92 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -83,6 +83,8 @@ lp_build_compare(LLVMBuilderRef builder,
 
    assert(func >= PIPE_FUNC_NEVER);
    assert(func <= PIPE_FUNC_ALWAYS);
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
 
    if(func == PIPE_FUNC_NEVER)
       return zeros;
@@ -374,6 +376,9 @@ lp_build_select_bitwise(struct lp_build_context *bld,
    struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if (a == b) {
       return a;
    }
@@ -419,6 +424,9 @@ lp_build_select(struct lp_build_context *bld,
    struct lp_type type = bld->type;
    LLVMValueRef res;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == b)
       return a;
 
@@ -484,6 +492,9 @@ lp_build_select_aos(struct lp_build_context *bld,
    const unsigned n = type.length;
    unsigned i, j;
 
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
    if(a == b)
       return a;
    if(cond[0] && cond[1] && cond[2] && cond[3])
@@ -539,7 +550,22 @@ lp_build_select_aos(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b)
 {
+   const struct lp_type type = bld->type;
+
+   assert(lp_check_value(type, a));
+   assert(lp_check_value(type, b));
+
+   /* can't do bitwise ops on floating-point values, so work on their bit pattern */
+   if(type.floating) {
+      a = LLVMBuildBitCast(bld->builder, a, bld->int_vec_type, "");
+      b = LLVMBuildBitCast(bld->builder, b, bld->int_vec_type, "");
+   }
+   /* result = a & ~b */
    b = LLVMBuildNot(bld->builder, b, "");
    b = LLVMBuildAnd(bld->builder, a, b, "");
+   /* cast the result back to the context's vector type */
+   if(type.floating) {
+      b = LLVMBuildBitCast(bld->builder, b, bld->vec_type, "");
+   }
    return b;
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 7748f8f0999..b7b630f2e8d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -171,14 +171,13 @@ lp_build_unpack2(LLVMBuilderRef builder,
       msb = lp_build_zero(src_type);
 
    /* Interleave bits */
-   if(util_cpu_caps.little_endian) {
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
       *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0);
       *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1);
-   }
-   else {
+#else
       *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0);
       *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1);
-   }
+#endif
 
    /* Cast the result into the new type (twice as wide) */
 
@@ -261,13 +260,14 @@ lp_build_pack2(LLVMBuilderRef builder,
 #endif
    LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type);
    LLVMValueRef shuffle;
-   LLVMValueRef res;
+   LLVMValueRef res = NULL;
 
    assert(!src_type.floating);
    assert(!dst_type.floating);
    assert(src_type.width == dst_type.width * 2);
    assert(src_type.length * 2 == dst_type.length);
 
+   /* Check for special cases first */
    if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
       switch(src_type.width) {
       case 32:
@@ -283,8 +283,8 @@ lp_build_pack2(LLVMBuilderRef builder,
                return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi);
             }
             else {
-               assert(0);
-               return LLVMGetUndef(dst_vec_type);
+               /* use generic shuffle below */
+               res = NULL;
             }
          }
          break;
@@ -310,10 +310,13 @@ lp_build_pack2(LLVMBuilderRef builder,
          break;
       }
 
-      res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
-      return res;
+      if (res) {
+         res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
+         return res;
+      }
    }
 
+   /* generic shuffle */
    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
index ca36046d222..7b1088939b9 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_quad.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c
@@ -85,7 +85,7 @@ lp_build_scalar_ddx(struct lp_build_context *bld,
    LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0);
    LLVMValueRef a_left  = LLVMBuildExtractElement(bld->builder, a, idx_left, "");
    LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, "");
-   return LLVMBuildSub(bld->builder, a_right, a_left, "");
+   return lp_build_sub(bld, a_right, a_left);
 }
 
 
@@ -97,5 +97,5 @@ lp_build_scalar_ddy(struct lp_build_context *bld,
    LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0);
    LLVMValueRef a_top    = LLVMBuildExtractElement(bld->builder, a, idx_top, "");
    LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, "");
-   return LLVMBuildSub(bld->builder, a_bottom, a_top, "");
+   return lp_build_sub(bld, a_bottom, a_top);
 }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 1a20d74cac8..806c7d56a87 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -40,7 +40,6 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
-#include "util/u_cpu_detect.h"
 #include "lp_bld_debug.h"
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -811,7 +810,7 @@ lp_build_minify(struct lp_build_sample_context *bld,
                 LLVMValueRef base_size,
                 LLVMValueRef level)
 {
-   LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify");
+   LLVMValueRef size = LLVMBuildLShr(bld->builder, base_size, level, "minify");
    size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one);
    return size;
 }
@@ -888,17 +887,17 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          /* Compute rho = max of all partial derivatives scaled by texture size.
           * XXX this could be vectorized somewhat
           */
-         rho = LLVMBuildMul(bld->builder,
+         rho = LLVMBuildFMul(bld->builder,
                             lp_build_max(float_bld, dsdx, dsdy),
                             lp_build_int_to_float(float_bld, width), "");
          if (dims > 1) {
             LLVMValueRef max;
-            max = LLVMBuildMul(bld->builder,
+            max = LLVMBuildFMul(bld->builder,
                                lp_build_max(float_bld, dtdx, dtdy),
                                lp_build_int_to_float(float_bld, height), "");
             rho = lp_build_max(float_bld, rho, max);
             if (dims > 2) {
-               max = LLVMBuildMul(bld->builder,
+               max = LLVMBuildFMul(bld->builder,
                                   lp_build_max(float_bld, drdx, drdy),
                                   lp_build_int_to_float(float_bld, depth), "");
                rho = lp_build_max(float_bld, rho, max);
@@ -912,12 +911,12 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
          if (lod_bias) {
             lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias,
                                                index0, "");
-            lod = LLVMBuildAdd(bld->builder, lod, lod_bias, "shader_lod_bias");
+            lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias");
          }
       }
 
       /* add sampler lod bias */
-      lod = LLVMBuildAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
+      lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias");
 
       /* clamp lod */
       lod = lp_build_clamp(float_bld, lod, min_lod, max_lod);
@@ -1219,8 +1218,7 @@ lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord)
    /* ima = -0.5 / abs(coord); */
    LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5);
    LLVMValueRef absCoord = lp_build_abs(coord_bld, coord);
-   LLVMValueRef ima = lp_build_mul(coord_bld, negHalf,
-                                   lp_build_rcp(coord_bld, absCoord));
+   LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord);
    return ima;
 }
 
@@ -1841,7 +1839,11 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld,
       unsigned i, j;
 
       for(j = 0; j < h16.type.length; j += 4) {
-         unsigned subindex = util_cpu_caps.little_endian ? 0 : 1;
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+         unsigned subindex = 0;
+#else
+         unsigned subindex = 1;
+#endif
          LLVMValueRef index;
 
          index = LLVMConstInt(elem_type, j/2 + subindex, 0);
@@ -2029,6 +2031,8 @@ lp_build_sample_soa(LLVMBuilderRef builder,
       debug_printf("Sample from %s\n", util_format_name(fmt));
    }
 
+   assert(type.floating);
+
    /* Setup our build context */
    memset(&bld, 0, sizeof bld);
    bld.builder = builder;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 21236839fb7..0aa64affacc 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -489,7 +489,7 @@ get_indirect_offsets(struct lp_build_tgsi_soa_context *bld,
                               int_vec_type, "");
 
    /* addr_vec = addr_vec * 4 */
-   addr_vec = lp_build_mul(&bld->base, addr_vec, vec4);
+   addr_vec = lp_build_mul(&bld->int_bld, addr_vec, vec4);
 
    return addr_vec;
 }
@@ -533,7 +533,7 @@ emit_fetch(
                                             reg->Register.Index * 4 + swizzle);
 
          /* index_vec = index_vec + addr_vec */
-         index_vec = lp_build_add(&bld->base, index_vec, addr_vec);
+         index_vec = lp_build_add(&bld->int_bld, index_vec, addr_vec);
 
          /* Gather values from the constant buffer */
          res = build_gather(bld, bld->consts_ptr, index_vec);
@@ -612,11 +612,9 @@ emit_fetch(
    case TGSI_UTIL_SIGN_SET:
       /* TODO: Use bitwese OR for floating point */
       res = lp_build_abs( &bld->base, res );
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
-      break;
-
+      /* fall through */
    case TGSI_UTIL_SIGN_TOGGLE:
-      res = LLVMBuildNeg( bld->base.builder, res, "" );
+      res = lp_build_negate( &bld->base, res );
       break;
 
    case TGSI_UTIL_SIGN_KEEP:
@@ -773,7 +771,9 @@ emit_store(
       addr = LLVMBuildExtractElement(bld->base.builder,
                                      addr, LLVMConstInt(LLVMInt32Type(), 0, 0),
                                      "");
-      addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0));
+      addr = LLVMBuildMul(bld->base.builder,
+                          addr, LLVMConstInt(LLVMInt32Type(), 4, 0),
+                          "");
    }
 
    switch( reg->Register.File ) {
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index 2e15751e508..0461c815504 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -30,7 +30,7 @@
 #include "rtasm_cpu.h"
 
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 static boolean rtasm_sse_enabled(void)
 {
    static boolean firsttime = 1;
@@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void)
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    return rtasm_sse_enabled();
 #else
    return 0;
@@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void)
 int rtasm_cpu_has_sse2(void) 
 {
    /* FIXME: actually detect this at run-time */
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    return rtasm_sse_enabled();
 #else
    return 0;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 9f70b73698a..0fe6ebfcb45 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -22,8 +22,9 @@
  **************************************************************************/
 
 #include "pipe/p_config.h"
+#include "util/u_cpu_detect.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 #include "pipe/p_compiler.h"
 #include "util/u_debug.h"
@@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p,
    
    assert(reg.mod == mod_REG);
    
+   /* TODO: support extended x86-64 registers */
+   assert(reg.idx < 8);
+   assert(regmem.idx < 8);
+
    val |= regmem.mod << 6;     	/* mod field */
    val |= reg.idx << 3;		/* reg field */
    val |= regmem.idx;		/* r/m field */
@@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p )
  */
 
 
+void x64_rexw(struct x86_function *p)
+{
+   if(x86_target(p) != X86_32)   /* nothing is emitted for 32-bit targets, where byte 0x48 is the one-byte DEC used by x86_dec */
+      emit_1ub(p, 0x48);         /* 0x48 = REX prefix with the W bit set: the following instruction uses 64-bit operands */
+}
+
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      int label )
@@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
    emit_1i(p, imm);
 }
 
+void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   if(dst.mod == mod_REG)
+      x86_mov_reg_imm(p, dst, imm);   /* plain register destination: reuse the register-only encoder */
+   else
+   {
+      emit_1ub(p, 0xc7);              /* MOV r/m32, imm32 (opcode C7 /0) for memory destinations */
+      emit_modrm_noreg(p, 0, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm )
+{
+   DUMP_RI( dst, imm );
+   emit_1ub(p, 0x66);                       /* operand-size override prefix: selects the 16-bit form of the MOV below */
+   if(dst.mod == mod_REG)
+   {
+      emit_1ub(p, 0xb8 + dst.idx);          /* MOV r16, imm16: register number is folded into the opcode byte */
+      emit_2ub(p, imm & 0xff, imm >> 8);    /* 16-bit immediate, low byte first */
+   }
+   else
+   {
+      emit_1ub(p, 0xc7);                    /* MOV r/m16, imm16 (opcode C7 /0) */
+      emit_modrm_noreg(p, 0, dst);
+      emit_2ub(p, imm & 0xff, imm >> 8);    /* 16-bit immediate, low byte first */
+   }
+}
+
+void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm )
+{
+   DUMP_RI( dst, imm );
+   if(dst.mod == mod_REG)
+   {
+      emit_1ub(p, 0xb0 + dst.idx);   /* MOV r8, imm8: register number is folded into the opcode byte */
+      emit_1ub(p, imm);
+   }
+   else
+   {
+      emit_1ub(p, 0xc6);             /* MOV r/m8, imm8 (opcode C6 /0) */
+      emit_modrm_noreg(p, 0, dst);
+      emit_1ub(p, imm);
+   }
+}
+
 /**
  * Immediate group 1 instructions.
  */
@@ -520,7 +577,7 @@ void x86_push( struct x86_function *p,
    }
 
 
-   p->stack_offset += 4;
+   p->stack_offset += sizeof(void*);
 }
 
 void x86_push_imm32( struct x86_function *p,
@@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p,
    emit_1ub(p, 0x68);
    emit_1i(p,  imm32);
 
-   p->stack_offset += 4;
+   p->stack_offset += sizeof(void*);
 }
 
 
@@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p,
    DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x58 + reg.idx);
-   p->stack_offset -= 4;
+   p->stack_offset -= sizeof(void*);
 }
 
 void x86_inc( struct x86_function *p,
 	      struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x40 + reg.idx);
+   if(x86_target(p) == X86_32 && reg.mod == mod_REG)
+   {
+      emit_1ub(p, 0x40 + reg.idx);   /* one-byte INC r32; 32-bit only, because 0x40-0x4f are REX prefixes in 64-bit mode */
+      return;
+   }
+   emit_1ub(p, 0xff);                /* general form INC r/m32 (FF /0); also handles non-register operands */
+   emit_modrm_noreg(p, 0, reg);
 }
 
 void x86_dec( struct x86_function *p,
 	      struct x86_reg reg )
 {
    DUMP_R( reg );
-   assert(reg.mod == mod_REG);
-   emit_1ub(p, 0x48 + reg.idx);
+   if(x86_target(p) == X86_32 && reg.mod == mod_REG)
+   {
+      emit_1ub(p, 0x48 + reg.idx);   /* one-byte DEC r32; 32-bit only, because 0x40-0x4f are REX prefixes in 64-bit mode */
+      return;
+   }
+   emit_1ub(p, 0xff);                /* general form DEC r/m32 (FF /1); also handles non-register operands */
+   emit_modrm_noreg(p, 1, reg);
 }
 
 void x86_ret( struct x86_function *p )
@@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p,
 	      struct x86_reg src )
 {
    DUMP_RR( dst, src );
+   /* special hack for reading arguments until we support x86-64 registers everywhere */
+   if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8))
+   {
+      uint8_t rex = 0x40;
+      if(dst.idx >= 8)
+      {
+         rex |= 4;
+         dst.idx -= 8;
+      }
+      if(src.idx >= 8)
+      {
+         rex |= 1;
+         src.idx -= 8;
+      }
+      emit_1ub(p, rex);
+   }
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_mov16( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_1ub(p, 0x66);   /* operand-size override prefix: turns the 32-bit MOV emitted below into its 16-bit form */
+   emit_op_modrm( p, 0x8b, 0x89, dst, src );
+}
+
+void x86_mov8( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_op_modrm( p, 0x8a, 0x88, dst, src );
+}
+
+void x64_mov64( struct x86_function *p,
+	      struct x86_reg dst,
+	      struct x86_reg src )
+{
+   uint8_t rex = 0x48;   /* REX prefix with W set: 64-bit operand size; R and B bits are OR-ed in below */
+   DUMP_RR( dst, src );
+   assert(x86_target(p) != X86_32);   /* a REX-prefixed MOV is only valid on the 64-bit targets */
+
+   /* special hack for reading arguments until we support x86-64 registers everywhere */
+   if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8))
+   {
+      if(dst.idx >= 8)
+      {
+         rex |= 4;        /* REX.R: dst is one of R8-R15 */
+         dst.idx -= 8;    /* keep only the low 3 bits for the ModRM byte */
+      }
+      if(src.idx >= 8)
+      {
+         rex |= 1;        /* REX.B: src is one of R8-R15 */
+         src.idx -= 8;    /* keep only the low 3 bits for the ModRM byte */
+      }
+   }
+   emit_1ub(p, rex);      /* unlike x86_mov, the prefix is always emitted because W is always needed */
    emit_op_modrm( p, 0x8b, 0x89, dst, src );
 }
 
+void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, 0x0f, 0xb6);   /* MOVZX r32, r/m8: zero-extend a byte source into dst */
+   emit_modrm(p, dst, src);
+}
+
+void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, 0x0f, 0xb7);   /* MOVZX r32, r/m16: zero-extend a 16-bit source into dst */
+   emit_modrm(p, dst, src);
+}
+
 void x86_xor( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
@@ -680,6 +820,61 @@ void x86_div( struct x86_function *p,
    emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src);
 }
 
+void x86_bswap( struct x86_function *p, struct x86_reg reg )
+{
+   DUMP_R(reg);
+   assert(reg.file == file_REG32);   /* general-purpose registers only */
+   assert(reg.mod == mod_REG);       /* the encoding has no ModRM byte, so memory operands cannot be expressed */
+   emit_2ub(p, 0x0f, 0xc8 + reg.idx);   /* BSWAP r32: register number is folded into the second opcode byte */
+}
+
+void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm )
+{
+   DUMP_RI(reg, imm);
+   if(imm == 1)
+   {
+      emit_1ub(p, 0xd1);              /* shift-by-one form: no immediate byte follows */
+      emit_modrm_noreg(p, 5, reg);    /* /5 selects SHR (logical shift right) */
+   }
+   else
+   {
+      emit_1ub(p, 0xc1);              /* shift-by-imm8 form */
+      emit_modrm_noreg(p, 5, reg);    /* /5 selects SHR (logical shift right) */
+      emit_1ub(p, imm);               /* the shift count is emitted as a single byte */
+   }
+}
+
+void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm )
+{
+   DUMP_RI(reg, imm);
+   if(imm == 1)
+   {
+      emit_1ub(p, 0xd1);              /* shift-by-one form: no immediate byte follows */
+      emit_modrm_noreg(p, 7, reg);    /* /7 selects SAR (arithmetic shift right) */
+   }
+   else
+   {
+      emit_1ub(p, 0xc1);              /* shift-by-imm8 form */
+      emit_modrm_noreg(p, 7, reg);    /* /7 selects SAR (arithmetic shift right) */
+      emit_1ub(p, imm);               /* the shift count is emitted as a single byte */
+   }
+}
+
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm  )
+{
+   DUMP_RI(reg, imm);
+   if(imm == 1)
+   {
+      emit_1ub(p, 0xd1);              /* shift-by-one form: no immediate byte follows */
+      emit_modrm_noreg(p, 4, reg);    /* /4 selects SHL (shift left) */
+   }
+   else
+   {
+      emit_1ub(p, 0xc1);              /* shift-by-imm8 form */
+      emit_modrm_noreg(p, 4, reg);    /* /4 selects SHL (shift left) */
+      emit_1ub(p, imm);               /* the shift count is emitted as a single byte */
+   }
+}
 
 
 /***********************************************************************
@@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p,
  * SSE2 instructions
  */
 
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   if(dst.mod == mod_REG && dst.file == file_REG32)
+   {
+      emit_1ub(p, 0x7e);         /* store form (66 0F 7E): destination is a general-purpose register */
+      emit_modrm(p, src, dst);   /* operands deliberately swapped: src is passed as the reg operand, dst as r/m */
+   }
+   else
+   {
+      emit_op_modrm(p, 0x6e, 0x7e, dst, src);   /* 6E = load form, 7E = store form; presumably chosen by emit_op_modrm from the operand kinds */
+   }
+}
+
+void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   switch (dst.mod) {
+   case mod_REG:
+      emit_3ub(p, 0xf3, 0x0f, 0x7e);   /* load form: MOVQ xmm, xmm/m64 */
+      emit_modrm(p, dst, src);
+      break;
+   case mod_INDIRECT:
+   case mod_DISP32:
+   case mod_DISP8:
+      assert(src.mod == mod_REG);      /* a memory destination requires a register source */
+      emit_3ub(p, 0x66, 0x0f, 0xd6);   /* store form: MOVQ xmm/m64, xmm */
+      emit_modrm(p, src, dst);         /* register operand first, memory operand second */
+      break;
+   default:
+      assert(0);                       /* unknown addressing mode: nothing is emitted */
+      break;
+   }
+}
+
+void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0xf3, 0x0f);
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);
+}
+
+void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   emit_op_modrm(p, 0x6f, 0x7f, dst, src);
+}
+
+void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0xf2, 0x0f);
+   emit_op_modrm(p, 0x10, 0x11, dst, src);
+}
+
+void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   emit_op_modrm(p, 0x10, 0x11, dst, src);
+}
+
+void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_2ub(p, 0x66, 0x0f);
+   emit_op_modrm(p, 0x28, 0x29, dst, src);
+}
+
 /**
  * Perform a reduced swizzle:
  */
@@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p,
    emit_1ub(p, shuf); 
 }
 
+void sse2_pshuflw( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src,
+                  unsigned char shuf)
+{
+   DUMP_RRI( dst, src, shuf );
+   emit_3ub(p, 0xf2, X86_TWOB, 0x70);
+   emit_modrm(p, dst, src);
+   emit_1ub(p, shuf);
+}
+
+void sse2_pshufhw( struct x86_function *p,
+                  struct x86_reg dst,
+                  struct x86_reg src,
+                  unsigned char shuf)
+{
+   DUMP_RRI( dst, src, shuf );
+   emit_3ub(p, 0xf3, X86_TWOB, 0x70);
+   emit_modrm(p, dst, src);
+   emit_1ub(p, shuf);
+}
+
 void sse2_cvttps2dq( struct x86_function *p,
                      struct x86_reg dst,
                      struct x86_reg src )
@@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_cvtsd2ss( struct x86_function *p,
+                    struct x86_reg dst,
+                    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0xf2, 0x0f, 0x5a);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_cvtpd2ps( struct x86_function *p,
+                    struct x86_reg dst,
+                    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x5a);
+   emit_modrm( p, dst, src );
+}
+
 void sse2_packssdw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
@@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x61);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x62);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, 0x0f, 0x6c);
+   emit_modrm( p, dst, src );
+}
+
+void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);   /* shift-by-immediate group for 16-bit lanes */
+   emit_modrm_noreg(p, 6, dst);     /* /6 selects the logical left shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);   /* shift-by-immediate group for 32-bit lanes */
+   emit_modrm_noreg(p, 6, dst);     /* /6 selects the logical left shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x73);   /* shift-by-immediate group for 64-bit lanes */
+   emit_modrm_noreg(p, 6, dst);     /* /6 selects the logical left shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);   /* shift-by-immediate group for 16-bit lanes */
+   emit_modrm_noreg(p, 2, dst);     /* /2 selects the logical right shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);   /* shift-by-immediate group for 32-bit lanes */
+   emit_modrm_noreg(p, 2, dst);     /* /2 selects the logical right shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x73);   /* shift-by-immediate group for 64-bit lanes */
+   emit_modrm_noreg(p, 2, dst);     /* /2 selects the logical right shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x71);   /* shift-by-immediate group for 16-bit lanes */
+   emit_modrm_noreg(p, 4, dst);     /* /4 selects the arithmetic right shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm )
+{
+   DUMP_RI(dst, imm);
+   emit_3ub(p, 0x66, 0x0f, 0x72);   /* shift-by-immediate group for 32-bit lanes */
+   emit_modrm_noreg(p, 4, dst);     /* /4 selects the arithmetic right shift */
+   emit_1ub(p, imm);                /* the shift count is emitted as a single byte */
+}
+
+void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
+{
+   DUMP_RR(dst, src);
+   emit_3ub(p, 0x66, 0x0f, 0xeb);
+   emit_modrm(p, dst, src);
+}
 
 void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
@@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
-void sse2_movd( struct x86_function *p,
-		struct x86_reg dst,
-		struct x86_reg src )
-{
-   DUMP_RR( dst, src );
-   emit_2ub(p, 0x66, X86_TWOB);
-   emit_op_modrm( p, 0x6e, 0x7e, dst, src );
-}
-
-
-
-
 /***********************************************************************
  * x87 instructions
  */
@@ -1702,23 +2087,79 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p )
 }
 
 
-/* Retreive a reference to one of the function arguments, taking into
- * account any push/pop activity:
- */
 struct x86_reg x86_fn_arg( struct x86_function *p,
-			   unsigned arg )
+                           unsigned arg )
 {
-   return x86_make_disp(x86_make_reg(file_REG32, reg_SP), 
+   switch(x86_target(p))
+   {
+   case X86_64_WIN64_ABI:
+      /* Microsoft uses a different calling convention than the rest of the world */
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 2:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 3:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 4:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+               p->stack_offset + arg * 8);     /* Win64 callers reserve home slots for the 4 register args too, so stack args are indexed as if all args were on the stack */
+      }
+   case X86_64_STD_ABI:
+      switch(arg)
+      {
+      case 1:
+         return x86_make_reg(file_REG32, reg_DI);
+      case 2:
+         return x86_make_reg(file_REG32, reg_SI);
+      case 3:
+         return x86_make_reg(file_REG32, reg_DX);
+      case 4:
+         return x86_make_reg(file_REG32, reg_CX);
+      case 5:
+         return x86_make_reg(file_REG32, reg_R8);
+      case 6:
+         return x86_make_reg(file_REG32, reg_R9);
+      default:
+         return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
+               p->stack_offset + (arg - 6) * 8);     /* the first 6 args are in registers; the rest are 8-byte stack slots */
+      }
+   case X86_32:
+      return x86_make_disp(x86_make_reg(file_REG32, reg_SP),
 			p->stack_offset + arg * 4);	/* ??? */
+   default:
+      abort();
+   }
 }
 
+static void x86_init_func_common( struct x86_function *p )
+{
+   util_cpu_detect();               /* presumably fills in util_cpu_caps, which is read below */
+   p->caps = 0;                     /* rebuilt from scratch as a mask of X86_* feature bits */
+   if(util_cpu_caps.has_mmx)
+      p->caps |= X86_MMX;
+   if(util_cpu_caps.has_mmx2)
+      p->caps |= X86_MMX2;
+   if(util_cpu_caps.has_sse)
+      p->caps |= X86_SSE;
+   if(util_cpu_caps.has_sse2)
+      p->caps |= X86_SSE2;
+   if(util_cpu_caps.has_sse3)
+      p->caps |= X86_SSE3;
+   if(util_cpu_caps.has_sse4_1)
+      p->caps |= X86_SSE4_1;
+   p->csr = p->store;               /* csr starts out pointing at store; callers must set p->store before calling */
+   DUMP_START();
+}
 
 void x86_init_func( struct x86_function *p )
 {
    p->size = 0;
    p->store = NULL;
-   p->csr = p->store;
-   DUMP_START();
+   x86_init_func_common(p);
 }
 
 void x86_init_func_size( struct x86_function *p, unsigned code_size )
@@ -1728,8 +2169,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size )
    if (p->store == NULL) {
       p->store = p->error_overflow;
    }
-   p->csr = p->store;
-   DUMP_START();
+   x86_init_func_common(p);
 }
 
 void x86_release_func( struct x86_function *p )
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 6208e8f707f..aa77892b2dc 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -26,20 +26,28 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 /* It is up to the caller to ensure that instructions issued are
  * suitable for the host cpu.  There are no checks made in this module
  * for mmx/sse/sse2 support on the cpu.
  */
 struct x86_reg {
-   unsigned file:3;
-   unsigned idx:3;
+   unsigned file:2;
+   unsigned idx:4;
    unsigned mod:2;		/* mod_REG if this is just a register */
    int      disp:24;		/* only +/- 23bits of offset - should be enough... */
 };
 
+#define X86_MMX 1
+#define X86_MMX2 2
+#define X86_SSE 4
+#define X86_SSE2 8
+#define X86_SSE3 0x10
+#define X86_SSE4_1 0x20
+
 struct x86_function {
+   unsigned caps;
    unsigned size;
    unsigned char *store;
    unsigned char *csr;
@@ -75,7 +83,15 @@ enum x86_reg_name {
    reg_SP,
    reg_BP,
    reg_SI,
-   reg_DI
+   reg_DI,
+   reg_R8,
+   reg_R9,
+   reg_R10,
+   reg_R11,
+   reg_R12,
+   reg_R13,
+   reg_R14,
+   reg_R15
 };
 
 
@@ -110,6 +126,29 @@ typedef void (*x86_func)(void);
 /* Begin/end/retrieve function creation:
  */
 
+enum x86_target   /* code-generation target / calling convention, as consumed by x86_fn_arg() */
+{
+   X86_32,             /* 32-bit x86: every argument is read from the stack */
+   X86_64_STD_ABI,     /* x86-64, non-Windows: args 1-6 in DI, SI, DX, CX, R8, R9 */
+   X86_64_WIN64_ABI    /* x86-64, Windows: args 1-4 in CX, DX, R8, R9 */
+};
+
+/* The target is fixed at compile time from the host's macros and p is unused; make this read a member of x86_function if target != host is desired */
+static INLINE enum x86_target x86_target( struct x86_function* p )
+{
+#ifdef PIPE_ARCH_X86
+   return X86_32;
+#elif defined(_WIN64)
+   return X86_64_WIN64_ABI;
+#elif defined(PIPE_ARCH_X86_64)
+   return X86_64_STD_ABI;
+#endif
+}
+
+static INLINE unsigned x86_target_caps( struct x86_function* p )
+{
+   return p->caps;   /* bitmask of the X86_MMX ... X86_SSE4_1 flags defined above */
+}
 
 void x86_init_func( struct x86_function *p );
 void x86_init_func_size( struct x86_function *p, unsigned code_size );
@@ -138,6 +177,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg );
  */
 int x86_get_label( struct x86_function *p );
 
+void x64_rexw(struct x86_function *p);
+
 void x86_jcc( struct x86_function *p,
 	      enum x86_cc cc,
 	      int label );
@@ -178,18 +219,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
 void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
 void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                   unsigned char shuf );
+void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
+void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
+                  unsigned char shuf );
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm );
+
+void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+
+void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
+void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
+void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm );
 
 void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
 void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
@@ -227,7 +304,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg
 void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
-void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
 
 void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -237,6 +313,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg );
 void x86_inc( struct x86_function *p, struct x86_reg reg );
 void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov8( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm );
+void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm );
+void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm );
 void x86_mul( struct x86_function *p, struct x86_reg src );
 void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -250,7 +334,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
 void x86_div( struct x86_function *p, struct x86_reg src );
-
+void x86_bswap( struct x86_function *p, struct x86_reg src );
+void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm );
+void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm );
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm  );
 
 void x86_cdecl_caller_push_regs( struct x86_function *p );
 void x86_cdecl_caller_pop_regs( struct x86_function *p );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 9e02040f6c4..acbff103efe 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -34,7 +34,7 @@
 #include "tgsi_iterate.h"
 
 
-DEBUG_GET_ONCE_BOOL_OPTION(print_sanity, "TGSI_PRINT_SANITY", TRUE);
+DEBUG_GET_ONCE_BOOL_OPTION(print_sanity, "TGSI_PRINT_SANITY", FALSE)
 
 
 typedef struct {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
index 46d8d184190..73f0f414e3f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h
@@ -36,7 +36,7 @@ extern "C" {
 
 /* Check the given token stream for errors and common mistakes.
  * Diagnostic messages are printed out to the debug output, and is
- * controlled by the debug option TGSI_PRINT_SANITY (default true).
+ * controlled by the debug option TGSI_PRINT_SANITY (default false).
  * Returns TRUE if there are no errors, even though there could be some warnings.
  */
 boolean
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
index a9b7253bf44..03a7f050aa2 100644
--- a/src/gallium/auxiliary/translate/translate.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -38,7 +38,8 @@ struct translate *translate_create( const struct translate_key *key )
 {
    struct translate *translate = NULL;
 
-#if defined(PIPE_ARCH_X86)
+/* TODO: enable Win64 once it has actually been tested */
+#if defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(_WIN64))
    translate = translate_sse2_create( key );
    if (translate)
       return translate;
@@ -48,3 +49,8 @@ struct translate *translate_create( const struct translate_key *key )
 
    return translate_generic_create( key );
 }
+
+boolean translate_is_output_format_supported(enum pipe_format format)
+{
+   return translate_generic_is_output_format_supported(format);   /* only the generic backend is consulted here */
+}
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index edd95e07882..a75380228b1 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -85,6 +85,18 @@ struct translate {
                                 unsigned instance_id,
                                 void *output_buffer);
 
+   void (PIPE_CDECL *run_elts16)( struct translate *,
+                                const uint16_t *elts,
+                                unsigned count,
+                                unsigned instance_id,
+                                void *output_buffer);
+
+   void (PIPE_CDECL *run_elts8)( struct translate *,
+                                const uint8_t *elts,
+                                unsigned count,
+                                unsigned instance_id,
+                                void *output_buffer);
+
    void (PIPE_CDECL *run)( struct translate *,
                            unsigned start,
                            unsigned count,
@@ -105,6 +117,8 @@ struct translate *translate_lookup_or_create( struct translate_context *tctx,
 
 struct translate *translate_create( const struct translate_key *key );
 
+boolean translate_is_output_format_supported(enum pipe_format format);
+
 static INLINE int translate_keysize( const struct translate_key *key )
 {
    return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element);
@@ -138,5 +152,6 @@ struct translate *translate_sse2_create( const struct translate_key *key );
 
 struct translate *translate_generic_create( const struct translate_key *key );
 
+boolean translate_generic_is_output_format_supported(enum pipe_format format);
 
 #endif
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 4d1977229e2..ad809db720d 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -64,6 +64,14 @@ struct translate_generic {
       unsigned input_stride;
       unsigned max_index;
 
+      /* this value is set to -1 if this is a normal element with output_format != input_format:
+       * in this case, u_format is used to do a full conversion
+       *
+       * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids:
+       * in this case, memcpy is used to copy this amount of bytes
+       */
+      int copy_size;
+
    } attrib[PIPE_MAX_ATTRIBS];
 
    unsigned nr_attrib;
@@ -187,9 +195,15 @@ ATTRIB( R8G8B8_SNORM,    3, char, TO_8_SNORM )
 ATTRIB( R8G8_SNORM,      2, char, TO_8_SNORM )
 ATTRIB( R8_SNORM,        1, char, TO_8_SNORM )
 
-ATTRIB( A8R8G8B8_UNORM,       4, ubyte, TO_8_UNORM )
-/*ATTRIB( R8G8B8A8_UNORM,       4, ubyte, TO_8_UNORM )*/
-
+static void
+emit_A8R8G8B8_UNORM( const float *attrib, void *ptr)
+{
+   static const unsigned chan[4] = { 3, 0, 1, 2 }; /* source channel written to each output byte */
+   ubyte *bytes = (ubyte *)ptr;
+   unsigned i;
+   for (i = 0; i < 4; i++)
+      bytes[i] = TO_8_UNORM(attrib[chan[i]]);
+}
 
 static void
 emit_B8G8R8A8_UNORM( const float *attrib, void *ptr)
@@ -348,7 +362,65 @@ static emit_func get_emit_func( enum pipe_format format )
    }
 }
 
+/* Translate a single vertex.  Each attribute is either copied verbatim
+ * (copy_size >= 0) or fetched to float[4] and re-emitted (copy_size < 0).
+ */
+static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg,
+                                         unsigned elt,
+                                         unsigned instance_id,
+                                         void *vert )
+{
+   unsigned nr_attrs = tg->nr_attrib;
+   unsigned attr;
+
+   for (attr = 0; attr < nr_attrs; attr++) {
+      float data[4];
+      uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset;
 
+      if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
+         const uint8_t *src;
+         unsigned index;
+         int copy_size;
+
+         if (tg->attrib[attr].instance_divisor) {
+            index = instance_id / tg->attrib[attr].instance_divisor;
+         }
+         else {
+            index = elt;
+         }
+
+         /* clamp to avoid going out of bounds */
+         index = MIN2(index, tg->attrib[attr].max_index);
+
+         src = tg->attrib[attr].input_ptr +
+               tg->attrib[attr].input_stride * index;
+
+         copy_size = tg->attrib[attr].copy_size;
+         if(likely(copy_size >= 0))
+            memcpy(dst, src, copy_size);
+         else {
+            tg->attrib[attr].fetch( data, src, 0, 0 );
+            if (0)
+               debug_printf("Fetch linear attr %d  from %p  stride %d  index %d: "
+                         " %f, %f, %f, %f \n",
+                         attr,
+                         tg->attrib[attr].input_ptr,
+                         tg->attrib[attr].input_stride,
+                         index,
+                         data[0], data[1],data[2], data[3]);
+
+            tg->attrib[attr].emit( data, dst );
+         }
+      } else {
+         if(likely(tg->attrib[attr].copy_size >= 0))
+            memcpy(dst, &instance_id, 4); /* raw 32-bit id goes to the vertex, not to the scratch data[] */
+         else {
+            data[0] = (float)instance_id;
+            tg->attrib[attr].emit( data, dst );
+         }
+      }
+   }
+}
 
 /**
  * Fetch vertex attributes for 'count' vertices.
@@ -361,62 +433,45 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate,
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
-   unsigned nr_attrs = tg->nr_attrib;
-   unsigned attr;
    unsigned i;
 
-   /* loop over vertex attributes (vertex shader inputs)
-    */
    for (i = 0; i < count; i++) {
-      const unsigned elt = *elts++;
-
-      for (attr = 0; attr < nr_attrs; attr++) {
-	 float data[4];
-	 char *dst = vert + tg->attrib[attr].output_offset;
-
-         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
-            const uint8_t *src;
-            unsigned index;
-
-            if (tg->attrib[attr].instance_divisor) {
-               index = instance_id / tg->attrib[attr].instance_divisor;
-            } else {
-               index = elt;
-            }
-
-            /* clamp to void going out of bounds */
-            index = MIN2(index, tg->attrib[attr].max_index);
-
-            src = tg->attrib[attr].input_ptr +
-                  tg->attrib[attr].input_stride * index;
-
-            tg->attrib[attr].fetch( data, src, 0, 0 );
-
-            if (0)
-               debug_printf("Fetch elt attr %d  from %p  stride %d  div %u  max %u  index %d:  "
-                            " %f, %f, %f, %f \n",
-                            attr,
-                            tg->attrib[attr].input_ptr,
-                            tg->attrib[attr].input_stride,
-                            tg->attrib[attr].instance_divisor,
-                            tg->attrib[attr].max_index,
-                            index,
-                            data[0], data[1],data[2], data[3]);
-         } else {
-            data[0] = (float)instance_id;
-         }
+      generic_run_one(tg, *elts++, instance_id, vert);
+      vert += tg->translate.key.output_stride;
+   }
+}
 
-         if (0)
-            debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
-                         i, elt, attr, data[0], data[1], data[2], data[3]);
+static void PIPE_CDECL generic_run_elts16( struct translate *translate,
+                                         const uint16_t *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned i;
 
-	 tg->attrib[attr].emit( data, dst );
-      }
+   for (i = 0; i < count; i++) {
+      generic_run_one(tg, *elts++, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
 
+/* Translate 'count' vertices, picked through 8-bit indices, into
+ * output_buffer; consecutive vertices are output_stride bytes apart. */
+static void PIPE_CDECL generic_run_elts8( struct translate *translate,
+                                         const uint8_t *elts,
+                                         unsigned count,
+                                         unsigned instance_id,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *out = output_buffer;
+   const uint8_t *end = elts + count;
 
+   for (; elts != end; out += tg->translate.key.output_stride)
+      generic_run_one(tg, *elts++, instance_id, out);
+}
 
 static void PIPE_CDECL generic_run( struct translate *translate,
                                     unsigned start,
@@ -426,57 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate,
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
-   unsigned nr_attrs = tg->nr_attrib;
-   unsigned attr;
    unsigned i;
 
-   /* loop over vertex attributes (vertex shader inputs)
-    */
    for (i = 0; i < count; i++) {
-      unsigned elt = start + i;
-
-      for (attr = 0; attr < nr_attrs; attr++) {
-	 float data[4];
-	 char *dst = vert + tg->attrib[attr].output_offset;
-
-         if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) {
-            const uint8_t *src;
-            unsigned index;
-
-            if (tg->attrib[attr].instance_divisor) {
-               index = instance_id / tg->attrib[attr].instance_divisor;
-            }
-            else {
-               index = elt;
-            }
-
-            /* clamp to void going out of bounds */
-            index = MIN2(index, tg->attrib[attr].max_index);
-
-            src = tg->attrib[attr].input_ptr +
-                  tg->attrib[attr].input_stride * index;
-
-            tg->attrib[attr].fetch( data, src, 0, 0 );
-
-            if (0)
-               debug_printf("Fetch linear attr %d  from %p  stride %d  index %d: "
-                            " %f, %f, %f, %f \n",
-                            attr,
-                            tg->attrib[attr].input_ptr,
-                            tg->attrib[attr].input_stride,
-                            index,
-                            data[0], data[1],data[2], data[3]);
-         } else {
-            data[0] = (float)instance_id;
-         }
-
-         if (0)
-            debug_printf("vert %d attr %d: %f %f %f %f\n",
-                         i, attr, data[0], data[1], data[2], data[3]);
-
-	 tg->attrib[attr].emit( data, dst );
-      }
-      
+      generic_run_one(tg, start + i, instance_id, vert);
       vert += tg->translate.key.output_stride;
    }
 }
@@ -522,6 +530,8 @@ struct translate *translate_generic_create( const struct translate_key *key )
    tg->translate.release = generic_release;
    tg->translate.set_buffer = generic_set_buffer;
    tg->translate.run_elts = generic_run_elts;
+   tg->translate.run_elts16 = generic_run_elts16;
+   tg->translate.run_elts8 = generic_run_elts8;
    tg->translate.run = generic_run;
 
    for (i = 0; i < key->nr_elements; i++) {
@@ -538,9 +548,28 @@ struct translate *translate_generic_create( const struct translate_key *key )
       tg->attrib[i].input_offset = key->element[i].input_offset;
       tg->attrib[i].instance_divisor = key->element[i].instance_divisor;
 
-      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
       tg->attrib[i].output_offset = key->element[i].output_offset;
 
+      tg->attrib[i].copy_size = -1;
+      if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID)
+      {
+            if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED
+                  || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED)
+               tg->attrib[i].copy_size = 4;
+      }
+      else
+      {
+         if(key->element[i].input_format == key->element[i].output_format
+               && format_desc->block.width == 1
+               && format_desc->block.height == 1
+               && !(format_desc->block.bits & 7))
+            tg->attrib[i].copy_size = format_desc->block.bits >> 3;
+      }
+
+      if(tg->attrib[i].copy_size < 0)
+	      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
+      else
+	      tg->attrib[i].emit  = NULL;
    }
 
    tg->nr_attrib = key->nr_elements;
@@ -548,3 +577,83 @@ struct translate *translate_generic_create( const struct translate_key *key )
 
    return &tg->translate;
 }
+
+/* Return TRUE for the output formats listed below, FALSE for anything else. */
+boolean translate_generic_is_output_format_supported(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+   case PIPE_FORMAT_R64G64_FLOAT:
+   case PIPE_FORMAT_R64_FLOAT:
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+   case PIPE_FORMAT_R32G32_FLOAT:
+   case PIPE_FORMAT_R32_FLOAT:
+
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+   case PIPE_FORMAT_R32G32B32_USCALED:
+   case PIPE_FORMAT_R32G32_USCALED:
+   case PIPE_FORMAT_R32_USCALED:
+
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+   case PIPE_FORMAT_R32G32_SSCALED:
+   case PIPE_FORMAT_R32_SSCALED:
+
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+   case PIPE_FORMAT_R32G32B32_UNORM:
+   case PIPE_FORMAT_R32G32_UNORM:
+   case PIPE_FORMAT_R32_UNORM:
+
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+   case PIPE_FORMAT_R32G32B32_SNORM:
+   case PIPE_FORMAT_R32G32_SNORM:
+   case PIPE_FORMAT_R32_SNORM:
+
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+   case PIPE_FORMAT_R16G16B16_USCALED:
+   case PIPE_FORMAT_R16G16_USCALED:
+   case PIPE_FORMAT_R16_USCALED:
+
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+   case PIPE_FORMAT_R16G16_SSCALED:
+   case PIPE_FORMAT_R16_SSCALED:
+
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+   case PIPE_FORMAT_R16G16B16_UNORM:
+   case PIPE_FORMAT_R16G16_UNORM:
+   case PIPE_FORMAT_R16_UNORM:
+
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+   case PIPE_FORMAT_R16G16B16_SNORM:
+   case PIPE_FORMAT_R16G16_SNORM:
+   case PIPE_FORMAT_R16_SNORM:
+
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+   case PIPE_FORMAT_R8G8B8_USCALED:
+   case PIPE_FORMAT_R8G8_USCALED:
+   case PIPE_FORMAT_R8_USCALED:
+
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+   case PIPE_FORMAT_R8G8_SSCALED:
+   case PIPE_FORMAT_R8_SSCALED:
+
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+   case PIPE_FORMAT_R8G8B8_UNORM:
+   case PIPE_FORMAT_R8G8_UNORM:
+   case PIPE_FORMAT_R8_UNORM:
+
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+   case PIPE_FORMAT_R8G8B8_SNORM:
+   case PIPE_FORMAT_R8G8_SNORM:
+   case PIPE_FORMAT_R8_SNORM:
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index ef3aa674a34..56c5b36ce28 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -30,11 +30,12 @@
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_format.h"
 
 #include "translate.h"
 
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
 
 #include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
@@ -46,21 +47,9 @@
 #define W    3
 
 
-typedef void (PIPE_CDECL *run_func)( struct translate *translate,
-                                     unsigned start,
-                                     unsigned count,
-                                     unsigned instance_id,
-                                     void *output_buffer);
-
-typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
-                                          const unsigned *elts,
-                                          unsigned count,
-                                          unsigned instance_id,
-                                          void *output_buffer);
-
 struct translate_buffer {
    const void *base_ptr;
-   unsigned stride;
+   uintptr_t stride;
    unsigned max_index;
 };
 
@@ -79,15 +68,15 @@ struct translate_sse {
 
    struct x86_function linear_func;
    struct x86_function elt_func;
+   struct x86_function elt16_func;
+   struct x86_function elt8_func;
    struct x86_function *func;
 
    boolean loaded_identity;
-   boolean loaded_255;
-   boolean loaded_inv_255;
+   boolean loaded_const[5];
 
    float identity[4];
-   float float_255[4];
-   float inv_255[4];
+   float const_value[5][4];
 
    struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
    unsigned nr_buffers;
@@ -102,17 +91,16 @@ struct translate_sse {
    boolean use_instancing;
    unsigned instance_id;
 
-   run_func      gen_run;
-   run_elts_func gen_run_elts;
-
    /* these are actually known values, but putting them in a struct
     * like this is helpful to keep them in sync across the file.
     */
    struct x86_reg tmp_EAX;
-   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
-   struct x86_reg outbuf_ECX;
-   struct x86_reg machine_EDX;
-   struct x86_reg count_ESI;    /* decrements to zero */
+   struct x86_reg tmp2_EDX;
+   struct x86_reg tmp3_ECX;
+   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
+   struct x86_reg machine_EDI;
+   struct x86_reg outbuf_EBX;
+   struct x86_reg count_EBP;    /* decrements to zero */
 };
 
 static int get_offset( const void *a, const void *b )
@@ -124,7 +112,7 @@ static int get_offset( const void *a, const void *b )
 
 static struct x86_reg get_identity( struct translate_sse *p )
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 6);
+   struct x86_reg reg = x86_make_reg(file_XMM, 7);
 
    if (!p->loaded_identity) {
       p->loaded_identity = TRUE;
@@ -134,267 +122,924 @@ static struct x86_reg get_identity( struct translate_sse *p )
       p->identity[3] = 1;
 
       sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
+		 x86_make_disp(p->machine_EDI,
 			       get_offset(p, &p->identity[0])));
    }
 
    return reg;
 }
 
-static struct x86_reg get_255( struct translate_sse *p )
+static struct x86_reg get_const( struct translate_sse *p, unsigned i, float v)
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 7);
-
-   if (!p->loaded_255) {
-      p->loaded_255 = TRUE;
-      p->float_255[0] =
-	 p->float_255[1] =
-	 p->float_255[2] =
-	 p->float_255[3] = 255.0f;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->float_255[0])));
+   struct x86_reg reg = x86_make_reg(file_XMM, 2 + i);
+
+   if (!p->loaded_const[i]) {
+      p->loaded_const[i] = TRUE;
+      p->const_value[i][0] =
+         p->const_value[i][1] =
+         p->const_value[i][2] =
+         p->const_value[i][3] = v;
+
+      sse_movups(p->func, reg,
+                 x86_make_disp(p->machine_EDI,
+                               get_offset(p, &p->const_value[i][0])));
    }
 
    return reg;
 }
 
-static struct x86_reg get_inv_255( struct translate_sse *p )
+static struct x86_reg get_inv_127( struct translate_sse *p )
 {
-   struct x86_reg reg = x86_make_reg(file_XMM, 5);
-
-   if (!p->loaded_inv_255) {
-      p->loaded_inv_255 = TRUE;
-      p->inv_255[0] =
-	 p->inv_255[1] =
-	 p->inv_255[2] =
-	 p->inv_255[3] = 1.0f / 255.0f;
-
-      sse_movups(p->func, reg, 
-		 x86_make_disp(p->machine_EDX, 
-			       get_offset(p, &p->inv_255[0])));
-   }
-
-   return reg;
+   return get_const(p, 0, 1.0f / 127.0f);
 }
 
-
-static void emit_load_R32G32B32A32( struct translate_sse *p, 			   
-				    struct x86_reg data,
-				    struct x86_reg arg0 )
+static struct x86_reg get_inv_255( struct translate_sse *p )
 {
-   sse_movups(p->func, data, arg0);
+   return get_const(p, 1, 1.0f / 255.0f);
 }
 
-static void emit_load_R32G32B32( struct translate_sse *p, 			   
-				 struct x86_reg data,
-				 struct x86_reg arg0 )
+static struct x86_reg get_inv_32767( struct translate_sse *p )
 {
-   /* Have to jump through some hoops:
-    *
-    * c 0 0 0
-    * c 0 0 1
-    * 0 0 c 1
-    * a b c 1
-    */
-   sse_movss(p->func, data, x86_make_disp(arg0, 8));
-   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
-   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
-   sse_movlps(p->func, data, arg0);
+   return get_const(p, 2, 1.0f / 32767.0f);
 }
 
-static void emit_load_R32G32( struct translate_sse *p, 
-			   struct x86_reg data,
-			   struct x86_reg arg0 )
+static struct x86_reg get_inv_65535( struct translate_sse *p )
 {
-   /* 0 0 0 1
-    * a b 0 1
-    */
-   sse_movups(p->func, data, get_identity(p) );
-   sse_movlps(p->func, data, arg0);
+   return get_const(p, 3, 1.0f / 65535.0f);
 }
 
-
-static void emit_load_R32( struct translate_sse *p, 
-			   struct x86_reg data,
-			   struct x86_reg arg0 )
+static struct x86_reg get_inv_2147483647( struct translate_sse *p )
 {
-   /* a 0 0 0
-    * a 0 0 1
-    */
-   sse_movss(p->func, data, arg0);
-   sse_orps(p->func, data, get_identity(p) );
+   return get_const(p, 4, 1.0f / 2147483647.0f);
 }
 
-
-static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
+/* load the data in a SSE2 register, padding with zeros */
+static boolean emit_load_sse2( struct translate_sse *p,
 				       struct x86_reg data,
-				       struct x86_reg src )
+				       struct x86_reg src,
+				       unsigned size)
 {
-
-   /* Load and unpack twice:
-    */
-   sse_movss(p->func, data, src);
-   sse2_punpcklbw(p->func, data, get_identity(p));
-   sse2_punpcklbw(p->func, data, get_identity(p));
-
-   /* Convert to float:
-    */
-   sse2_cvtdq2ps(p->func, data, data);
-
-
-   /* Scale by 1/255.0
-    */
-   sse_mulps(p->func, data, get_inv_255(p));
+   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+   struct x86_reg tmp = p->tmp_EAX;
+   switch(size) {
+   case 1:
+      x86_movzx8(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break;
+   case 2:
+      x86_movzx16(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break; /* must not fall through: the wider cases below would reload data from src */
+   case 3: /* byte 2 is shifted into bits 16..23 of tmp, then bytes 0-1 fill its low word */
+      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
+      x86_shl_imm(p->func, tmp, 16);
+      x86_mov16(p->func, tmp, src);
+      sse2_movd(p->func, data, tmp);
+      break; /* must not fall through: the 4-byte load below would overwrite data */
+   case 4:
+      sse2_movd(p->func, data, src);
+      break;
+   case 6:
+      sse2_movd(p->func, data, src);
+      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
+      sse2_movd(p->func, tmpXMM, tmp);
+      sse2_punpckldq(p->func, data, tmpXMM);
+      break;
+   case 8:
+      sse2_movq(p->func, data, src);
+      break;
+   case 12:
+      sse2_movq(p->func, data, src);
+      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
+      sse2_punpcklqdq(p->func, data, tmpXMM);
+      break;
+   case 16:
+      sse2_movdqu(p->func, data, src);
+      break;
+   default: return FALSE;
+   }
+   return TRUE;
 }
 
+/* this value can be passed for the out_chans argument */
+#define CHANNELS_0001 5
 
+/* Emit code loading 'chans' 32-bit floats from arg0 into data, padding
+ * the register with zeroes at least up to out_chans.
+ *
+ * If out_chans is CHANNELS_0001, the fourth value is padded with 1
+ * instead.  Only pass CHANNELS_0001 when chans < 4; with chans == 4
+ * the results are undefined.
+ */
+static void emit_load_float32( struct translate_sse *p,
+                                       struct x86_reg data,
+                                       struct x86_reg arg0,
+                                       unsigned out_chans,
+                                       unsigned chans)
+{
+   switch(chans)
+   {
+   case 1:
+      /* a 0 0 0
+       * a 0 0 1
+       */
+      sse_movss(p->func, data, arg0);
+      if(out_chans == CHANNELS_0001)
+         sse_orps(p->func, data, get_identity(p) );
+      break;
+   case 2:
+      /* 0 0 0 1
+       * a b 0 1
+       */
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) ); /* upper half comes from the identity vector */
+      else if(out_chans > 2)
+         sse_movlhps(p->func, data, get_identity(p) );
+      sse_movlps(p->func, data, arg0); /* the two loaded floats go in the low half last */
+      break;
+   case 3:
+      /* Have to jump through some hoops:
+       *
+       * c 0 0 0
+       * c 0 0 1 if out_chans == CHANNELS_0001
+       * 0 0 c 0/1
+       * a b c 0/1
+       */
+      sse_movss(p->func, data, x86_make_disp(arg0, 8));
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
+      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
+      sse_movlps(p->func, data, arg0);
+      break;
+   case 4:
+      sse_movups(p->func, data, arg0); /* out_chans is not consulted here */
+      break;
+   } /* no default: any other chans value emits nothing */
+}
 
+/* Same contract as emit_load_float32, but the source holds 64-bit
+ * floats, which are converted to 32-bit ones while loading.
+ */
+static void emit_load_float64to32( struct translate_sse *p,
+                                       struct x86_reg data,
+                                       struct x86_reg arg0,
+                                       unsigned out_chans,
+                                       unsigned chans)
+{
+   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); /* scratch register for the third/fourth value */
+   switch(chans)
+   {
+   case 1:
+      sse2_movsd(p->func, data, arg0);
+      if(out_chans > 1)
+         sse2_cvtpd2ps(p->func, data, data); /* packed convert when more than one output channel is wanted */
+      else
+         sse2_cvtsd2ss(p->func, data, data);
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W)  );
+      break;
+   case 2:
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      if(out_chans == CHANNELS_0001)
+         sse_shufps(p->func, data, get_identity(p), SHUF(X, Y, Z, W) );
+      else if(out_chans > 2)
+         sse_movlhps(p->func, data, get_identity(p) );
+       break;
+   case 3:
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); /* third double, 16 bytes into the source */
+      if(out_chans > 3)
+         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+      else
+         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
+      sse_movlhps(p->func, data, tmpXMM);
+      if(out_chans == CHANNELS_0001)
+         sse_orps(p->func, data, get_identity(p) );
+      break;
+   case 4:
+      sse2_movupd(p->func, data, arg0);
+      sse2_cvtpd2ps(p->func, data, data);
+      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); /* second pair of doubles */
+      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
+      sse_movlhps(p->func, data, tmpXMM);
+      break;
+   } /* no default: any other chans value emits nothing */
+}
 
-static void emit_store_R32G32B32A32( struct translate_sse *p, 			   
-				     struct x86_reg dest,
-				     struct x86_reg dataXMM )
+static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
 {
-   sse_movups(p->func, dest, dataXMM);
+   if(x86_target(p->func) != X86_32)
+      x64_mov64(p->func, dst_gpr, src_gpr);
+   else
+   {
+      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
+      if(x86_target_caps(p->func) & X86_SSE2)
+         sse2_movq(p->func, dst_xmm, src_xmm);
+      else
+         sse_movlps(p->func, dst_xmm, src_xmm);
+   }
 }
 
-static void emit_store_R32G32B32( struct translate_sse *p, 
-				  struct x86_reg dest,
-				  struct x86_reg dataXMM )
+static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
 {
-   /* Emit two, shuffle, emit one.
-    */
-   sse_movlps(p->func, dest, dataXMM);
-   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
-   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
+   emit_mov64(p, dst_gpr, dst_xmm, src, src);
 }
 
-static void emit_store_R32G32( struct translate_sse *p, 
-			       struct x86_reg dest,
-			       struct x86_reg dataXMM )
+static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
 {
-   sse_movlps(p->func, dest, dataXMM);
+   emit_mov64(p, dst, dst, src_gpr, src_xmm);
 }
 
-static void emit_store_R32( struct translate_sse *p, 
-			    struct x86_reg dest,
-			    struct x86_reg dataXMM )
+static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
 {
-   sse_movss(p->func, dest, dataXMM);
+   if(x86_target_caps(p->func) & X86_SSE2)
+      sse2_movdqu(p->func, dst, src);
+   else
+      sse_movups(p->func, dst, src);
 }
 
+/* Emit code copying 'size' bytes from src to dst; size is fixed when the code is generated.
+ * TODO: this uses unaligned accesses liberally, which is great on Nehalem,
+ * but may or may not be good on older processors
+ * TODO: may perhaps want to use non-temporal stores here if possible */
+static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
+{
+   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
+   struct x86_reg dataGPR = p->tmp_EAX;
+   struct x86_reg dataGPR2 = p->tmp2_EDX;
+
+   if(size < 8) /* small copies go through the two scratch general-purpose registers */
+   {
+      switch (size) /* no default: sizes 5 and 7 emit nothing */
+      {
+      case 1:
+         x86_mov8(p->func, dataGPR, src);
+         x86_mov8(p->func, dst, dataGPR);
+         break;
+      case 2:
+         x86_mov16(p->func, dataGPR, src);
+         x86_mov16(p->func, dst, dataGPR);
+         break;
+      case 3:
+         x86_mov16(p->func, dataGPR, src);
+         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
+         x86_mov16(p->func, dst, dataGPR);
+         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
+         break;
+      case 4:
+         x86_mov(p->func, dataGPR, src);
+         x86_mov(p->func, dst, dataGPR);
+         break;
+      case 6:
+         x86_mov(p->func, dataGPR, src);
+         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
+         x86_mov(p->func, dst, dataGPR);
+         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
+         break;
+      }
+   }
+   else if(!(x86_target_caps(p->func) & X86_SSE)) /* no SSE: one load/store pair per 4 bytes */
+   {
+      unsigned i = 0;
+      assert((size & 3) == 0);
+      for(i = 0; i < size; i += 4)
+      {
+         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
+         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
+      }
+   }
+   else
+   {
+      switch(size) /* each case issues its loads before its stores */
+      {
+      case 8:
+         emit_load64(p, dataGPR, dataXMM, src);
+         emit_store64(p, dst, dataGPR, dataXMM);
+         break;
+      case 12:
+         emit_load64(p, dataGPR2, dataXMM, src);
+         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
+         emit_store64(p, dst, dataGPR2, dataXMM);
+         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
+         break;
+      case 16:
+         emit_mov128(p, dataXMM, src);
+         emit_mov128(p, dst, dataXMM);
+         break;
+      case 24:
+         emit_mov128(p, dataXMM, src);
+         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
+         emit_mov128(p, dst, dataXMM);
+         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
+         break;
+      case 32:
+         emit_mov128(p, dataXMM, src);
+         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
+         emit_mov128(p, dst, dataXMM);
+         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
+         break;
+      default:
+         assert(0); /* any other size >= 8 is unsupported on this path */
+      }
+   }
+}
 
+static boolean translate_attr_convert( struct translate_sse *p,
+                               const struct translate_element *a,
+                               struct x86_reg src,
+                               struct x86_reg dst)
 
-static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
-				       struct x86_reg dest,
-				       struct x86_reg dataXMM )
 {
-   /* Scale by 255.0
-    */
-   sse_mulps(p->func, dataXMM, get_255(p));
+   const struct util_format_description* input_desc = util_format_description(a->input_format);
+   const struct util_format_description* output_desc = util_format_description(a->output_format);
+   unsigned i;
+   boolean id_swizzle = TRUE;
+   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
+   unsigned needed_chans = 0;
+   unsigned imms[2] = {0, 0x3f800000};
 
-   /* Pack and emit:
-    */
-   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
-   sse2_packssdw(p->func, dataXMM, dataXMM);
-   sse2_packuswb(p->func, dataXMM, dataXMM);
-   sse_movss(p->func, dest, dataXMM);
-}
+   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
+      return FALSE;
+
+   if(input_desc->channel[0].size & 7)
+      return FALSE;
 
+   if(input_desc->colorspace != output_desc->colorspace)
+      return FALSE;
 
+   for(i = 1; i < input_desc->nr_channels; ++i)
+   {
+      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
+         return FALSE;
+   }
 
+   for(i = 1; i < output_desc->nr_channels; ++i)
+   {
+      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
+         return FALSE;
+   }
 
+   for(i = 0; i < output_desc->nr_channels; ++i)
+   {
+      if(output_desc->swizzle[i] < 4)
+         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
+   }
 
-/* Extended swizzles?  Maybe later.
- */  
-static void emit_swizzle( struct translate_sse *p,
-			  struct x86_reg dest,
-			  struct x86_reg src,
-			  unsigned char shuffle )
-{
-   sse_shufps(p->func, dest, src, shuffle);
-}
+   if((x86_target_caps(p->func) & X86_SSE) && (0
+         || a->output_format == PIPE_FORMAT_R32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
+         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
 
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
 
-static boolean translate_attr( struct translate_sse *p,
-			       const struct translate_element *a,
-			       struct x86_reg srcECX,
-			       struct x86_reg dstEAX)
-{
-   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
 
-   switch (a->input_format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_load_R32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_load_R32G32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_load_R32G32B32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_load_R32G32B32A32(p, dataXMM, srcECX);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
-      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
-      break;
-   default:
-      return FALSE;
+      if(needed_chans > 0)
+      {
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(!(x86_target_caps(p->func) & X86_SSE2))
+               return FALSE;
+            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+            /* TODO: add support for SSE4.1 pmovzx */
+            switch(input_desc->channel[0].size)
+            {
+            case 8:
+               /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
+               sse2_punpcklbw(p->func, dataXMM, get_identity(p));
+               sse2_punpcklbw(p->func, dataXMM, get_identity(p));
+               break;
+            case 16:
+               sse2_punpcklwd(p->func, dataXMM, get_identity(p));
+               break;
+            case 32: /* we lose precision here */
+               sse2_psrld_imm(p->func, dataXMM, 1);
+               break;
+            default:
+               return FALSE;
+            }
+            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_inv_255(p);
+                  break;
+               case 16:
+                  factor = get_inv_65535(p);
+                  break;
+               case 32:
+                  factor = get_inv_2147483647(p);
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            else if(input_desc->channel[0].size == 32)
+               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(!(x86_target_caps(p->func) & X86_SSE2))
+               return FALSE;
+            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+            /* TODO: add support for SSE4.1 pmovsx */
+            switch(input_desc->channel[0].size)
+            {
+            case 8:
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psrad_imm(p->func, dataXMM, 24);
+               break;
+            case 16:
+               sse2_punpcklwd(p->func, dataXMM, dataXMM);
+               sse2_psrad_imm(p->func, dataXMM, 16);
+               break;
+            case 32: /* we lose precision here */
+               break;
+            default:
+               return FALSE;
+            }
+            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
+            if(input_desc->channel[0].normalized)
+            {
+               struct x86_reg factor;
+               switch(input_desc->channel[0].size)
+               {
+               case 8:
+                  factor = get_inv_127(p);
+                  break;
+               case 16:
+                  factor = get_inv_32767(p);
+                  break;
+               case 32:
+                  factor = get_inv_2147483647(p);
+                  break;
+               }
+               sse_mulps(p->func, dataXMM, factor);
+            }
+            break;
+
+            break;
+         case UTIL_FORMAT_TYPE_FLOAT:
+            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
+               return FALSE;
+            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
+            {
+               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
+               needed_chans = CHANNELS_0001;
+            }
+            switch(input_desc->channel[0].size)
+            {
+            case 32:
+               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            case 64: /* we lose precision here */
+               if(!(x86_target_caps(p->func) & X86_SSE2))
+                  return FALSE;
+               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
+               break;
+            default:
+               return FALSE;
+            }
+            break;
+         default:
+            return FALSE;
+         }
+
+         if(!id_swizzle)
+            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
+      }
+
+      if(output_desc->nr_channels >= 4
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+            )
+         sse_movups(p->func, dst, dataXMM);
+      else
+      {
+         if(output_desc->nr_channels >= 2
+               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+            sse_movlps(p->func, dst, dataXMM);
+         else
+         {
+            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movss(p->func, dst, dataXMM);
+            else
+               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+
+            if(output_desc->nr_channels >= 2)
+            {
+               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
+
+         if(output_desc->nr_channels >= 3)
+         {
+            if(output_desc->nr_channels >= 4
+                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
+            else
+            {
+               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
+                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
+               }
+               else
+                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+               if(output_desc->nr_channels >= 4)
+               {
+                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+                  {
+                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
+                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
+                  }
+                  else
+                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+               }
+            }
+         }
+      }
+      return TRUE;
    }
+   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
+         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
+         && (0
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+               ))
+   {
+      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned imms[2] = {0, 1};
+
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
+            swizzle[i] = i;
+      }
 
-   switch (a->output_format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_store_R32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_store_R32G32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_store_R32G32B32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
-      break;
-   default:
-      return FALSE;
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         if(swizzle[i] < 4)
+            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
+         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
+            id_swizzle = FALSE;
+      }
+
+      if(needed_chans > 0)
+      {
+         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
+
+         switch(input_desc->channel[0].type)
+         {
+         case UTIL_FORMAT_TYPE_UNSIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
+        	       sse2_psrlw_imm(p->func, dataXMM, 1);
+            }
+            else
+               sse2_punpcklbw(p->func, dataXMM, get_identity(p));
+            break;
+         case UTIL_FORMAT_TYPE_SIGNED:
+            if(input_desc->channel[0].normalized)
+            {
+               sse2_movq(p->func, tmpXMM, get_identity(p));
+               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
+               sse2_psllw_imm(p->func, dataXMM, 9);
+               sse2_psrlw_imm(p->func, dataXMM, 8);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               sse2_psrlw_imm(p->func, dataXMM, 7);
+               sse2_por(p->func, tmpXMM, dataXMM);
+               {
+                  struct x86_reg t = dataXMM;
+                  dataXMM = tmpXMM;
+                  tmpXMM = t;
+               }
+            }
+            else
+            {
+               sse2_punpcklbw(p->func, dataXMM, dataXMM);
+               sse2_psraw_imm(p->func, dataXMM, 8);
+            }
+            break;
+         default:
+            assert(0);
+         }
+
+         if(output_desc->channel[0].normalized) /* normalized outputs store 1.0 as the maximum representable value */
+            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff; /* 16-bit UNORM / SNORM one */
+
+         if(!id_swizzle)
+            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
+      }
+
+      if(output_desc->nr_channels >= 4
+            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
+            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
+            )
+         sse2_movq(p->func, dst, dataXMM);
+      else
+      {
+         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
+               sse2_movd(p->func, dst, dataXMM);
+            else
+            {
+               sse2_movd(p->func, tmp, dataXMM);
+               x86_mov16(p->func, dst, tmp);
+               if(output_desc->nr_channels >= 2)
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
+            }
+         }
+         else
+         {
+            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
+               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+            else
+            {
+               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
+               if(output_desc->nr_channels >= 2)
+               {
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_shr_imm(p->func, tmp, 16);
+                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
+               }
+            }
+         }
+
+         if(output_desc->nr_channels >= 3)
+         {
+            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
+               }
+               else
+               {
+                  sse2_psrlq_imm(p->func, dataXMM, 32);
+                  sse2_movd(p->func, tmp, dataXMM);
+                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
+                  }
+               }
+            }
+            else
+            {
+               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
+                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+               else
+               {
+                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
+
+                  if(output_desc->nr_channels >= 4)
+                  {
+                     sse2_psrlq_imm(p->func, dataXMM, 48);
+                     sse2_movd(p->func, tmp, dataXMM);
+                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
+                  }
+               }
+            }
+         }
+      }
+      return TRUE;
    }
+   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
+   {
+      struct x86_reg tmp = p->tmp_EAX;
+      unsigned i;
+      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
+                     && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
+                     && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
+                     && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
+                     && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
+      {
+         /* TODO: support movbe */
+         x86_mov(p->func, tmp, src);
+         x86_bswap(p->func, tmp);
+         x86_mov(p->func, dst, tmp);
+         return TRUE;
+      }
 
-   return TRUE;
+      for(i = 0; i < output_desc->nr_channels; ++i)
+      {
+         switch(output_desc->channel[0].size)
+         {
+         case 8:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type)
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7f : 1;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
+            }
+            else
+            {
+               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
+               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
+            }
+            break;
+         case 16:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type) /* channel[0] matches the size switch above and the 8-bit case */
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7fff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     v = 0x3c00; /* 1.0 encoded as a 16-bit half float */
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
+            }
+            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
+               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
+            else
+            {
+               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
+               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
+            }
+            break;
+         case 32:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned v = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type) /* channel[0] matches the size switch above and the 8-bit case */
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     v = output_desc->channel[0].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     v = output_desc->channel[0].normalized ? 0x7fffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     v = 0x3f800000; /* 1.0f as IEEE-754 single precision bits */
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
+            }
+            else
+            {
+               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
+               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
+            }
+            break;
+         case 64:
+            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
+            {
+               unsigned l = 0;
+               unsigned h = 0;
+               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
+               {
+                  switch(output_desc->channel[0].type) /* channel[0] matches the size switch above and the 8-bit case */
+                  {
+                  case UTIL_FORMAT_TYPE_UNSIGNED:
+                     h = output_desc->channel[0].normalized ? 0xffffffff : 0;
+                     l = output_desc->channel[0].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_SIGNED:
+                     h = output_desc->channel[0].normalized ? 0x7fffffff : 0;
+                     l = output_desc->channel[0].normalized ? 0xffffffff : 1;
+                     break;
+                  case UTIL_FORMAT_TYPE_FLOAT:
+                     h = 0x3ff00000; /* high dword of 1.0 as IEEE-754 double precision */
+                     l = 0;
+                     break;
+                  default:
+                     return FALSE;
+                  }
+               }
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
+               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
+            }
+            else
+            {
+               if(x86_target_caps(p->func) & X86_SSE)
+               {
+                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
+                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
+                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
+               }
+               else
+               {
+                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
+                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
+                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
+                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
+               }
+            }
+            break;
+         default:
+            return FALSE;
+         }
+      }
+      return TRUE;
+   }
+   return FALSE;
 }
 
+static boolean translate_attr( struct translate_sse *p,
+			       const struct translate_element *a,
+			       struct x86_reg src,
+			       struct x86_reg dst)
+{
+   /* Differing formats go through the converter, which reports whether it could emit code. */
+   if(a->input_format != a->output_format)
+      return translate_attr_convert(p, a, src, dst);
+
+   /* Same format on both sides: emit a plain copy sized by util_format_get_stride(format, 1). */
+   emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
+   return TRUE;
+}
 
 static boolean init_inputs( struct translate_sse *p,
-                            boolean linear )
+                            unsigned index_size )
 {
    unsigned i;
-   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
+   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                               get_offset(p, &p->instance_id));
 
    for (i = 0; i < p->nr_buffer_varients; i++) {
       struct translate_buffer_varient *varient = &p->buffer_varient[i];
       struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
 
-      if (linear || varient->instance_divisor) {
-         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
+      if (!index_size || varient->instance_divisor) {
+         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->stride));
-         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &varient->ptr));
-         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->base_ptr));
-         struct x86_reg elt = p->idx_EBX;
+         struct x86_reg elt = p->idx_ESI;
          struct x86_reg tmp_EAX = p->tmp_EAX;
 
          /* Calculate pointer to first attrib:
@@ -406,20 +1051,16 @@ static boolean init_inputs( struct translate_sse *p,
             x86_mov(p->func, tmp_EAX, instance_id);
 
             if (varient->instance_divisor != 1) {
-               struct x86_reg tmp_EDX = p->machine_EDX;
-               struct x86_reg tmp_ECX = p->outbuf_ECX;
+               struct x86_reg tmp_EDX = p->tmp2_EDX;
+               struct x86_reg tmp_ECX = p->tmp3_ECX;
 
                /* TODO: Add x86_shr() to rtasm and use it whenever
                 *       instance divisor is power of two.
                 */
 
-               x86_push(p->func, tmp_EDX);
-               x86_push(p->func, tmp_ECX);
                x86_xor(p->func, tmp_EDX, tmp_EDX);
                x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
                x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
-               x86_pop(p->func, tmp_ECX);
-               x86_pop(p->func, tmp_EDX);
             }
          } else {
             x86_mov(p->func, tmp_EAX, elt);
@@ -430,16 +1071,23 @@ static boolean init_inputs( struct translate_sse *p,
           */
 
          x86_imul(p->func, tmp_EAX, buf_stride);
+         x64_rexw(p->func);
          x86_add(p->func, tmp_EAX, buf_base_ptr);
 
 
          /* In the linear case, keep the buffer pointer instead of the
           * index number.
           */
-         if (linear && p->nr_buffer_varients == 1)
+         if (!index_size && p->nr_buffer_varients == 1)
+         {
+            x64_rexw(p->func);
             x86_mov(p->func, elt, tmp_EAX);
+         }
          else
+         {
+            x64_rexw(p->func);
             x86_mov(p->func, buf_ptr, tmp_EAX);
+         }
       }
    }
 
@@ -448,23 +1096,24 @@ static boolean init_inputs( struct translate_sse *p,
 
 
 static struct x86_reg get_buffer_ptr( struct translate_sse *p,
-                                      boolean linear,
+                                      unsigned index_size,
                                       unsigned var_idx,
                                       struct x86_reg elt )
 {
    if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
-      return x86_make_disp(p->machine_EDX,
+      return x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->instance_id));
    }
-   if (linear && p->nr_buffer_varients == 1) {
-      return p->idx_EBX;
+   if (!index_size && p->nr_buffer_varients == 1) {
+      return p->idx_ESI;
    }
-   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
+   else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
       struct x86_reg ptr = p->tmp_EAX;
       struct x86_reg buf_ptr = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer_varient[var_idx].ptr));
       
+      x64_rexw(p->func);
       x86_mov(p->func, ptr, buf_ptr);
       return ptr;
    }
@@ -473,19 +1122,31 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
       const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
 
       struct x86_reg buf_stride = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer[varient->buffer_index].stride));
 
       struct x86_reg buf_base_ptr = 
-         x86_make_disp(p->machine_EDX, 
+         x86_make_disp(p->machine_EDI,
                        get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
 
 
 
       /* Calculate pointer to current attrib:
        */
-      x86_mov(p->func, ptr, buf_stride);
-      x86_imul(p->func, ptr, elt);
+      switch(index_size)
+      {
+      case 1:
+         x86_movzx8(p->func, ptr, elt);
+         break;
+      case 2:
+         x86_movzx16(p->func, ptr, elt);
+         break;
+      case 4:
+         x86_mov(p->func, ptr, elt);
+         break;
+      }
+      x86_imul(p->func, ptr, buf_stride);
+      x64_rexw(p->func);
       x86_add(p->func, ptr, buf_base_ptr);
       return ptr;
    }
@@ -494,39 +1155,42 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p,
 
 
 static boolean incr_inputs( struct translate_sse *p, 
-                            boolean linear )
+                            unsigned index_size )
 {
-   if (linear && p->nr_buffer_varients == 1) {
-      struct x86_reg stride = x86_make_disp(p->machine_EDX,
+   if (!index_size && p->nr_buffer_varients == 1) {
+      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                             get_offset(p, &p->buffer[0].stride));
 
       if (p->buffer_varient[0].instance_divisor == 0) {
-         x86_add(p->func, p->idx_EBX, stride);
-         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+         x64_rexw(p->func);
+         x86_add(p->func, p->idx_ESI, stride);
+         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
       }
    }
-   else if (linear) {
+   else if (!index_size) {
       unsigned i;
 
       /* Is this worthwhile??
        */
       for (i = 0; i < p->nr_buffer_varients; i++) {
          struct translate_buffer_varient *varient = &p->buffer_varient[i];
-         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                 get_offset(p, &varient->ptr));
-         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
+         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                    get_offset(p, &p->buffer[varient->buffer_index].stride));
 
          if (varient->instance_divisor == 0) {
-            x86_mov(p->func, p->tmp_EAX, buf_ptr);
-            x86_add(p->func, p->tmp_EAX, buf_stride);
+            x86_mov(p->func, p->tmp_EAX, buf_stride);
+            x64_rexw(p->func);
+            x86_add(p->func, p->tmp_EAX, buf_ptr);
             if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+            x64_rexw(p->func);
             x86_mov(p->func, buf_ptr, p->tmp_EAX);
          }
       }
    } 
    else {
-      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
+      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
    }
    
    return TRUE;
@@ -551,35 +1215,51 @@ static boolean incr_inputs( struct translate_sse *p,
  */
 static boolean build_vertex_emit( struct translate_sse *p,
 				  struct x86_function *func,
-				  boolean linear )
+				  unsigned index_size )
 {
    int fixup, label;
    unsigned j;
 
    p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
-   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
-   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
-   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
-   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);
+   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
+   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
+   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
+   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
+   p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
+   p->tmp3_ECX     = x86_make_reg(file_REG32, reg_CX);
 
    p->func = func;
-   p->loaded_inv_255 = FALSE;
-   p->loaded_255 = FALSE;
+   memset(&p->loaded_const, 0, sizeof(p->loaded_const));
    p->loaded_identity = FALSE;
 
    x86_init_func(p->func);
 
-   /* Push a few regs?
-    */
-   x86_push(p->func, p->idx_EBX);
-   x86_push(p->func, p->count_ESI);
+   if(x86_target(p->func) == X86_64_WIN64_ABI)
+   {
+	   /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
+	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
+	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
+   }
 
-   /* Load arguments into regs:
-    */
-   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
-   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
-   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
-   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
+   x86_push(p->func, p->outbuf_EBX);
+   x86_push(p->func, p->count_EBP);
+
+/* on non-Win64 x86-64, these are already in the right registers */
+   if(x86_target(p->func) != X86_64_STD_ABI)
+   {
+      x86_push(p->func, p->machine_EDI);
+      x86_push(p->func, p->idx_ESI);
+
+      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
+      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
+   }
+
+   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
+
+   if(x86_target(p->func) != X86_32)
+      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
+   else
+      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
 
    /* Load instance ID.
     */
@@ -588,25 +1268,25 @@ static boolean build_vertex_emit( struct translate_sse *p,
               p->tmp_EAX,
               x86_fn_arg(p->func, 4));
       x86_mov(p->func,
-              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
+              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
               p->tmp_EAX);
    }
 
    /* Get vertex count, compare to zero
     */
    x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
-   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
+   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
    fixup = x86_jcc_forward(p->func, cc_E);
 
    /* always load, needed or not:
     */
-   init_inputs(p, linear);
+   init_inputs(p, index_size);
 
    /* Note address for loop jump
     */
    label = x86_get_label(p->func);
    {
-      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
+      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
       int last_varient = -1;
       struct x86_reg vb;
 
@@ -618,30 +1298,31 @@ static boolean build_vertex_emit( struct translate_sse *p,
           */
          if (varient != last_varient) {
             last_varient = varient;
-            vb = get_buffer_ptr(p, linear, varient, elt);
+            vb = get_buffer_ptr(p, index_size, varient, elt);
          }
          
          if (!translate_attr( p, a, 
                               x86_make_disp(vb, a->input_offset), 
-                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
+                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
             return FALSE;
       }
 
       /* Next output vertex:
        */
+      x64_rexw(p->func);
       x86_lea(p->func, 
-              p->outbuf_ECX, 
-              x86_make_disp(p->outbuf_ECX, 
+              p->outbuf_EBX,
+              x86_make_disp(p->outbuf_EBX,
                             p->translate.key.output_stride));
 
       /* Incr index
        */ 
-      incr_inputs( p, linear );
+      incr_inputs( p, index_size );
    }
 
    /* decr count, loop if not zero
     */
-   x86_dec(p->func, p->count_ESI);
+   x86_dec(p->func, p->count_EBP);
    x86_jcc(p->func, cc_NZ, label);
 
    /* Exit mmx state?
@@ -656,8 +1337,20 @@ static boolean build_vertex_emit( struct translate_sse *p,
    /* Pop regs and return
     */
    
-   x86_pop(p->func, p->count_ESI);
-   x86_pop(p->func, p->idx_EBX);
+   if(x86_target(p->func) != X86_64_STD_ABI)
+   {
+      x86_pop(p->func, p->idx_ESI);
+      x86_pop(p->func, p->machine_EDI);
+   }
+
+   x86_pop(p->func, p->count_EBP);
+   x86_pop(p->func, p->outbuf_EBX);
+
+   if(x86_target(p->func) == X86_64_WIN64_ABI)
+   {
+	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
+	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
+   }
    x86_ret(p->func);
 
    return TRUE;
@@ -700,43 +1393,14 @@ static void translate_sse_release( struct translate *translate )
    FREE(p);
 }
 
-static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
-			      const unsigned *elts,
-			      unsigned count,
-                              unsigned instance_id,
-			      void *output_buffer )
-{
-   struct translate_sse *p = (struct translate_sse *)translate;
-
-   p->gen_run_elts( translate,
-		    elts,
-		    count,
-                    instance_id,
-                    output_buffer);
-}
-
-static void PIPE_CDECL translate_sse_run( struct translate *translate,
-			 unsigned start,
-			 unsigned count,
-                         unsigned instance_id,
-			 void *output_buffer )
-{
-   struct translate_sse *p = (struct translate_sse *)translate;
-
-   p->gen_run( translate,
-	       start,
-	       count,
-               instance_id,
-               output_buffer);
-}
-
 
 struct translate *translate_sse2_create( const struct translate_key *key )
 {
    struct translate_sse *p = NULL;
    unsigned i;
 
-   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
+   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
+   if (!rtasm_cpu_has_sse())
       goto fail;
 
    p = CALLOC_STRUCT( translate_sse );
@@ -746,8 +1410,6 @@ struct translate *translate_sse2_create( const struct translate_key *key )
    p->translate.key = *key;
    p->translate.release = translate_sse_release;
    p->translate.set_buffer = translate_sse_set_buffer;
-   p->translate.run_elts = translate_sse_run_elts;
-   p->translate.run = translate_sse_run;
 
    for (i = 0; i < key->nr_elements; i++) {
       if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
@@ -783,18 +1445,32 @@ struct translate *translate_sse2_create( const struct translate_key *key )
 
    if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
 
-   if (!build_vertex_emit(p, &p->linear_func, TRUE))
+   if (!build_vertex_emit(p, &p->linear_func, 0))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt_func, 4))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt16_func, 2))
+      goto fail;
+
+   if (!build_vertex_emit(p, &p->elt8_func, 1))
+      goto fail;
+
+   p->translate.run = (void*)x86_get_func(&p->linear_func);
+   if (p->translate.run == NULL)
       goto fail;
 
-   if (!build_vertex_emit(p, &p->elt_func, FALSE))
+   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
+   if (p->translate.run_elts == NULL)
       goto fail;
 
-   p->gen_run = (run_func)x86_get_func(&p->linear_func);
-   if (p->gen_run == NULL)
+   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
+   if (p->translate.run_elts16 == NULL)
       goto fail;
 
-   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
-   if (p->gen_run_elts == NULL)
+   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
+   if (p->translate.run_elts8 == NULL)
       goto fail;
 
    return &p->translate;
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
index 6f38d222854..b9b9f9257af 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.c
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -73,7 +73,7 @@
 #endif
 
 
-DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", TRUE);
+DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE)
 
 
 struct util_cpu_caps util_cpu_caps;
@@ -194,123 +194,8 @@ check_os_altivec_support(void)
 }
 #endif /* PIPE_ARCH_PPC */
 
-/* If we're running on a processor that can do SSE, let's see if we
- * are allowed to or not.  This will catch 2.4.0 or later kernels that
- * haven't been configured for a Pentium III but are running on one,
- * and RedHat patched 2.2 kernels that have broken exception handling
- * support for user space apps that do SSE.
- */
-#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
-static void
-check_os_katmai_support(void)
-{
-#if defined(PIPE_ARCH_X86)
-#if defined(PIPE_OS_FREEBSD)
-   int has_sse=0, ret;
-   int len = sizeof (has_sse);
-
-   ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
-   if (ret || !has_sse)
-      util_cpu_caps.has_sse=0;
-
-#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD)
-   int has_sse, has_sse2, ret, mib[2];
-   int varlen;
-
-   mib[0] = CTL_MACHDEP;
-   mib[1] = CPU_SSE;
-   varlen = sizeof (has_sse);
-
-   ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
-   if (ret < 0 || !has_sse) {
-      util_cpu_caps.has_sse = 0;
-   } else {
-      util_cpu_caps.has_sse = 1;
-   }
-
-   mib[1] = CPU_SSE2;
-   varlen = sizeof (has_sse2);
-   ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
-   if (ret < 0 || !has_sse2) {
-      util_cpu_caps.has_sse2 = 0;
-   } else {
-      util_cpu_caps.has_sse2 = 1;
-   }
-   util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */
-
-#elif defined(PIPE_OS_WINDOWS)
-   LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
-   if (util_cpu_caps.has_sse) {
-      exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
-#if defined(PIPE_CC_GCC)
-      __asm __volatile ("xorps %xmm0, %xmm0");
-#elif defined(PIPE_CC_MSVC)
-      __asm {
-          xorps xmm0, xmm0        /* executing SSE instruction */
-      }
-#else
-#error Unsupported compiler
-#endif
-      SetUnhandledExceptionFilter(exc_fil);
-   }
-#elif defined(PIPE_OS_LINUX)
-   struct sigaction saved_sigill;
-   struct sigaction saved_sigfpe;
-
-   /* Save the original signal handlers.
-   */
-   sigaction(SIGILL, NULL, &saved_sigill);
-   sigaction(SIGFPE, NULL, &saved_sigfpe);
-
-   signal(SIGILL, (void (*)(int))sigill_handler_sse);
-   signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
-
-   /* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
-    * supports the extended FPU save and restore required for SSE.  If
-    * we execute an SSE instruction on a PIII and get a SIGILL, the OS
-    * doesn't support Streaming SIMD Exceptions, even if the processor
-    * does.
-    */
-   if (util_cpu_caps.has_sse) {
-      __asm __volatile ("xorps %xmm1, %xmm0");
-   }
-
-   /* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
-    * it supports unmasked SIMD FPU exceptions.  If we unmask the
-    * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
-    * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
-    * as expected, we're okay but we need to clean up after it.
-    *
-    * Are we being too stringent in our requirement that the OS support
-    * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
-    * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
-    * doesn't even support them.  We at least know the user-space SSE
-    * support is good in kernels that do support unmasked exceptions,
-    * and therefore to be safe I'm going to leave this test in here.
-    */
-   if (util_cpu_caps.has_sse) {
-      /* test_os_katmai_exception_support(); */
-   }
-
-   /* Restore the original signal handlers.
-   */
-   sigaction(SIGILL, &saved_sigill, NULL);
-   sigaction(SIGFPE, &saved_sigfpe, NULL);
-
-#else
-   /* We can't use POSIX signal handling to test the availability of
-    * SSE, so we disable it by default.
-    */
-   util_cpu_caps.has_sse = 0;
-#endif /* __linux__ */
-#endif
-
-#if defined(PIPE_ARCH_X86_64)
-   util_cpu_caps.has_sse = 1;
-#endif
-}
-
 
+#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64)
 static int has_cpuid(void)
 {
 #if defined(PIPE_ARCH_X86)
@@ -391,23 +276,6 @@ util_cpu_detect(void)
 
    memset(&util_cpu_caps, 0, sizeof util_cpu_caps);
 
-   /* Check for arch type */
-#if defined(PIPE_ARCH_MIPS)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS;
-#elif defined(PIPE_ARCH_ALPHA)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA;
-#elif defined(PIPE_ARCH_SPARC)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC;
-#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_X86;
-   util_cpu_caps.little_endian = 1;
-#elif defined(PIPE_ARCH_PPC)
-   util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC;
-   util_cpu_caps.little_endian = 0;
-#else
-   util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN;
-#endif
-
    /* Count the number of CPUs in system */
 #if defined(PIPE_OS_WINDOWS)
    {
@@ -486,9 +354,6 @@ util_cpu_detect(void)
          util_cpu_caps.cacheline = regs2[2] & 0xFF;
       }
 
-      if (util_cpu_caps.has_sse)
-         check_os_katmai_support();
-
       if (!util_cpu_caps.has_sse) {
          util_cpu_caps.has_sse2 = 0;
          util_cpu_caps.has_sse3 = 0;
@@ -504,7 +369,6 @@ util_cpu_detect(void)
 
 #ifdef DEBUG
    if (debug_get_option_dump_cpu()) {
-      debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch);
       debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus);
 
       debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type);
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
index 4b3dc39c342..f3bef0993c7 100644
--- a/src/gallium/auxiliary/util/u_cpu_detect.h
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -36,26 +36,15 @@
 #define _UTIL_CPU_DETECT_H
 
 #include "pipe/p_compiler.h"
-
-enum util_cpu_arch {
-   UTIL_CPU_ARCH_UNKNOWN = 0,
-   UTIL_CPU_ARCH_MIPS,
-   UTIL_CPU_ARCH_ALPHA,
-   UTIL_CPU_ARCH_SPARC,
-   UTIL_CPU_ARCH_X86,
-   UTIL_CPU_ARCH_POWERPC
-};
+#include "pipe/p_config.h"
 
 struct util_cpu_caps {
-   enum util_cpu_arch arch;
    unsigned nr_cpus;
 
    /* Feature flags */
    int x86_cpu_type;
    unsigned cacheline;
 
-   unsigned little_endian:1;
-
    unsigned has_tsc:1;
    unsigned has_mmx:1;
    unsigned has_mmx2:1;
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index ad162558bc1..504e6d2a18f 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -88,7 +88,7 @@ debug_get_option_should_print(void)
     * but its cool since we set first to false
     */
    first = FALSE;
-   value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", TRUE);
+   value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", FALSE);
    /* XXX should we print this option? Currently it wont */
    return value;
 }
diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h
index 38254b1096d..8e786a390a0 100644
--- a/src/gallium/auxiliary/util/u_format.h
+++ b/src/gallium/auxiliary/util/u_format.h
@@ -631,6 +631,44 @@ util_format_has_alpha(enum pipe_format format)
 }
 
 /**
+ * Return the matching SRGB format, or PIPE_FORMAT_NONE if none.
+ */
+static INLINE enum pipe_format
+util_format_srgb(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_L8_UNORM:
+      return PIPE_FORMAT_L8_SRGB;
+   case PIPE_FORMAT_L8A8_UNORM:
+      return PIPE_FORMAT_L8A8_SRGB;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return PIPE_FORMAT_R8G8B8_SRGB;
+   case PIPE_FORMAT_A8B8G8R8_UNORM:
+      return PIPE_FORMAT_A8B8G8R8_SRGB;
+   case PIPE_FORMAT_X8B8G8R8_UNORM:
+      return PIPE_FORMAT_X8B8G8R8_SRGB;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return PIPE_FORMAT_B8G8R8A8_SRGB;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return PIPE_FORMAT_B8G8R8X8_SRGB;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return PIPE_FORMAT_A8R8G8B8_SRGB;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      return PIPE_FORMAT_X8R8G8B8_SRGB;
+   case PIPE_FORMAT_DXT1_RGB:
+      return PIPE_FORMAT_DXT1_SRGB;
+   case PIPE_FORMAT_DXT1_RGBA:
+      return PIPE_FORMAT_DXT1_SRGBA;
+   case PIPE_FORMAT_DXT3_RGBA:
+      return PIPE_FORMAT_DXT3_SRGBA;
+   case PIPE_FORMAT_DXT5_RGBA:
+      return PIPE_FORMAT_DXT5_SRGBA;
+   default:
+      return PIPE_FORMAT_NONE;
+   }
+}
+
+/**
  * Return the number of components stored.
  * Formats with block size != 1x1 will always have 1 component (the block).
  */
diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
index 768ae9ceb5d..7803ec6a8b5 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -85,9 +85,11 @@ util_copy_framebuffer_state(struct pipe_framebuffer_state *dst,
    dst->width = src->width;
    dst->height = src->height;
 
-   for (i = 0; i < Elements(src->cbufs); i++) {
+   for (i = 0; i < src->nr_cbufs; i++)
       pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
-   }
+
+   for (i = src->nr_cbufs; i < dst->nr_cbufs; i++)
+      pipe_surface_reference(&dst->cbufs[i], NULL);
 
    dst->nr_cbufs = src->nr_cbufs;
 
diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h
new file mode 100644
index 00000000000..206e1ec3118
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_split_prim.h
@@ -0,0 +1,105 @@
+/* Originally written by Ben Skeggs for the nv50 driver */
+#include <pipe/p_defines.h>
+
+struct util_split_prim {
+   void *priv;
+   void (*emit)(void *priv, unsigned start, unsigned count);
+   void (*edge)(void *priv, boolean enabled);
+
+   unsigned mode;
+   unsigned start;
+   unsigned p_start;
+   unsigned p_end;
+
+   uint repeat_first:1;
+   uint close_first:1;
+   uint edgeflag_off:1;
+};
+
+static INLINE void
+util_split_prim_init(struct util_split_prim *s,
+                  unsigned mode, unsigned start, unsigned count)
+{
+   if (mode == PIPE_PRIM_LINE_LOOP) {
+      s->mode = PIPE_PRIM_LINE_STRIP;
+      s->close_first = 1;
+   } else {
+      s->mode = mode;
+      s->close_first = 0;
+   }
+   s->start = start;
+   s->p_start = start;
+   s->p_end = start + count;
+   s->edgeflag_off = 0;
+   s->repeat_first = 0;
+}
+
+static INLINE boolean
+util_split_prim_next(struct util_split_prim *s, unsigned max_verts)
+{
+   int repeat = 0;
+
+   if (s->repeat_first) {
+      s->emit(s->priv, s->start, 1);
+      max_verts--;
+      if (s->edgeflag_off) {
+         s->edge(s->priv, TRUE);
+         s->edgeflag_off = FALSE;
+      }
+   }
+
+   if (s->p_start + s->close_first + max_verts >= s->p_end) {
+      s->emit(s->priv, s->p_start, s->p_end - s->p_start);
+      if (s->close_first)
+         s->emit(s->priv, s->start, 1);
+      return TRUE;
+   }
+
+   switch (s->mode) {
+   case PIPE_PRIM_LINES:
+      max_verts &= ~1;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+      repeat = 1;
+      break;
+   case PIPE_PRIM_POLYGON:
+      max_verts--;
+      s->emit(s->priv, s->p_start, max_verts);
+      s->edge(s->priv, FALSE);
+      s->emit(s->priv, s->p_start + max_verts, 1);
+      s->p_start += max_verts;
+      s->repeat_first = TRUE;
+      s->edgeflag_off = TRUE;
+      return FALSE;
+   case PIPE_PRIM_TRIANGLES:
+      max_verts = max_verts - (max_verts % 3);
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      /* to ensure winding stays correct, always split
+       * on an even number of generated triangles
+       */
+      max_verts = max_verts & ~1;
+      repeat = 2;
+      break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+      s->repeat_first = TRUE;
+      repeat = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      max_verts &= ~3;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      max_verts &= ~1;
+      repeat = 2;
+      break;
+   case PIPE_PRIM_POINTS:
+      break;
+   default:
+      /* TODO: implement adjacency primitives */
+      assert(0);
+   }
+
+   s->emit (s->priv, s->p_start, max_verts);
+   s->p_start += (max_verts - repeat);
+   return FALSE;
+}
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 6145e34aa3f..87959ab0aab 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -71,6 +71,35 @@ _mm_castps_si128(__m128 a)
 
 #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+
+#if defined(PIPE_ARCH_SSSE3)
+
+#include <tmmintrin.h>
+
+#else /* !PIPE_ARCH_SSSE3 */
+
+#include <emmintrin.h>
+
+/**
+ * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases
+ * where -mssse3 is not supported/enabled.
+ *
+ * MSVC will never get in here, as its intrinsics support does not rely on
+ * compiler command line options.
+ */
+static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8(__m128i a, __m128i mask)
+{
+    __m128i result;
+    __asm__("pshufb %1, %0"
+            : "=x" (result)
+            : "xm" (mask), "0" (a));
+    return result;
+}
+
+#endif /* !PIPE_ARCH_SSSE3 */
+
+
 #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c
new file mode 100644
index 00000000000..607c31f5ee7
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_staging.c
@@ -0,0 +1,95 @@
+#include "util/u_staging.h"
+#include "pipe/p_context.h"
+#include "util/u_memory.h"
+#include "util/u_inlines.h"
+
+static void
+util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigned height, unsigned depth, struct pipe_resource *template)
+{
+   memset(template, 0, sizeof(struct pipe_resource));
+   if(pt->target != PIPE_BUFFER && depth <= 1)
+      template->target = PIPE_TEXTURE_2D;
+   else
+      template->target = pt->target;
+   template->format = pt->format;
+   template->width0 = width;
+   template->height0 = height;
+   template->depth0 = depth;
+   template->last_level = 0;
+   template->nr_samples = pt->nr_samples;
+   template->bind = 0;
+   template->usage = PIPE_USAGE_STAGING;
+   template->flags = 0;
+}
+
+struct util_staging_transfer *
+util_staging_transfer_new(struct pipe_context *pipe,
+           struct pipe_resource *pt,
+           struct pipe_subresource sr,
+           unsigned usage,
+           const struct pipe_box *box,
+           bool direct)
+{
+   struct pipe_screen *pscreen = pipe->screen;
+   struct util_staging_transfer *tx;
+   struct pipe_resource staging_resource_template;
+
+   tx = CALLOC_STRUCT(util_staging_transfer);
+   if (!tx)
+      return NULL;
+
+   pipe_resource_reference(&tx->base.resource, pt);
+   tx->base.sr = sr;
+   tx->base.usage = usage;
+   tx->base.box = *box;
+
+   if (direct)
+   {
+      tx->staging_resource = pt;
+      return tx;
+   }
+
+   util_staging_resource_template(pt, box->width, box->height, box->depth, &staging_resource_template);
+   tx->staging_resource = pscreen->resource_create(pscreen, &staging_resource_template);
+   if (!tx->staging_resource)
+   {
+      pipe_resource_reference(&tx->base.resource, NULL);
+      FREE(tx);
+      return NULL;
+   }
+
+   if (usage & PIPE_TRANSFER_READ)
+   {
+      struct pipe_subresource dstsr;
+      unsigned zi;
+      dstsr.face = 0;
+      dstsr.level = 0;
+      for(zi = 0; zi < box->depth; ++zi)
+         pipe->resource_copy_region(pipe, tx->staging_resource, dstsr, 0, 0, 0, tx->base.resource, sr, box->x, box->y, box->z + zi, box->width, box->height);
+   }
+
+   return tx;
+}
+
+void
+util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx)
+{
+   struct util_staging_transfer *tx = (struct util_staging_transfer *)ptx;
+
+   if (tx->staging_resource != tx->base.resource)
+   {
+      if(tx->base.usage & PIPE_TRANSFER_WRITE) {
+         struct pipe_subresource srcsr;
+         unsigned zi;
+         srcsr.face = 0;
+         srcsr.level = 0;
+         for(zi = 0; zi < tx->base.box.depth; ++zi)
+            pipe->resource_copy_region(pipe, tx->base.resource, tx->base.sr, tx->base.box.x, tx->base.box.y, tx->base.box.z + zi, tx->staging_resource, srcsr, 0, 0, 0, tx->base.box.width, tx->base.box.height);
+      }
+
+      pipe_resource_reference(&tx->staging_resource, NULL);
+   }
+
+   pipe_resource_reference(&ptx->resource, NULL);
+   FREE(ptx);
+}
diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h
new file mode 100644
index 00000000000..602faa2971d
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_staging.h
@@ -0,0 +1,37 @@
+/* Direct3D 10/11 has no concept of transfers. Applications instead
+ * create resources with a STAGING or DYNAMIC usage, copy between them
+ * and the real resource and use Map to map the STAGING/DYNAMIC resource.
+ *
+ * This util module lets Gallium drivers be implemented the way a Direct3D
+ * driver would be: transfers allocate a resource with
+ * PIPE_USAGE_STAGING, and copy the data between it and the real resource
+ * with resource_copy_region.
+ */
+
+#ifndef U_STAGING_H
+#define U_STAGING_H
+
+#include "pipe/p_state.h"
+
+struct util_staging_transfer {
+   struct pipe_transfer base;
+
+   /* if direct, same as base.resource, otherwise the temporary staging resource */
+   struct pipe_resource *staging_resource;
+};
+
+/* the caller must fill in stride, slice_stride and offset */
+/* pt->usage == PIPE_USAGE_DYNAMIC should be a good value to pass for direct */
+/* staging resource is currently created with PIPE_USAGE_STAGING */
+struct util_staging_transfer *
+util_staging_transfer_new(struct pipe_context *pipe,
+           struct pipe_resource *pt,
+           struct pipe_subresource sr,
+           unsigned usage,
+           const struct pipe_box *box,
+           bool direct);
+
+void
+util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx);
+
+#endif
diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c
index b5d21570d57..7733ad24d0d 100644
--- a/src/gallium/auxiliary/util/u_surfaces.c
+++ b/src/gallium/auxiliary/util/u_surfaces.c
@@ -3,40 +3,22 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 
-/* TODO: ouch, util_hash_table should do these by default when passed a null function pointer
- * this indirect function call is quite bad
- */
-static unsigned
-hash(void *key)
-{
-   return (unsigned)(uintptr_t)key;
-}
-
-static int
-compare(void *key1, void *key2)
-{
-   return (unsigned)(uintptr_t)key1 - (unsigned)(uintptr_t)key2;
-}
-
 struct pipe_surface *
 util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags)
 {
    struct pipe_surface *ps;
-   void *key = NULL;
 
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-   {	/* or 2D array */
-      if(!us->u.table)
-	 us->u.table = util_hash_table_create(hash, compare);
-      key = (void *)(uintptr_t)(((zslice + face) << 8) | level);
-      /* TODO: ouch, should have a get-reference function...
-       * also, shouldn't allocate a two-pointer structure for each item... */
-      ps = util_hash_table_get(us->u.table, key);
+   {    /* or 2D array */
+      if(!us->u.hash)
+         us->u.hash = cso_hash_create();
+
+      ps = cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level));
    }
    else
    {
       if(!us->u.array)
-	 us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *));
+         us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *));
       ps = us->u.array[level];
    }
 
@@ -54,7 +36,7 @@ util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, str
    ps->offset = ~0;
 
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-      util_hash_table_set(us->u.table, key, ps);
+      cso_hash_insert(us->u.hash, ((zslice + face) << 8) | level, ps);
    else
       us->u.array[level] = ps;
 
@@ -66,47 +48,44 @@ util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps)
 {
    struct pipe_resource *pt = ps->texture;
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-   {	/* or 2D array */
-      void* key = (void*)(uintptr_t)(((ps->zslice + ps->face) << 8) | ps->level);
-      util_hash_table_remove(us->u.table, key);
+   {    /* or 2D array */
+      cso_hash_erase(us->u.hash, cso_hash_find(us->u.hash, ((ps->zslice + ps->face) << 8) | ps->level));
    }
    else
       us->u.array[ps->level] = 0;
 }
 
-static enum pipe_error
-util_surfaces_destroy_callback(void *key, void *value, void *data)
-{
-   void (*destroy_surface) (struct pipe_surface * ps) = data;
-   destroy_surface((struct pipe_surface *)value);
-   return PIPE_OK;
-}
-
 void
 util_surfaces_destroy(struct util_surfaces *us, struct pipe_resource *pt, void (*destroy_surface) (struct pipe_surface *))
 {
    if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)
-   {	/* or 2D array */
-      if(us->u.table)
+   {    /* or 2D array */
+      if(us->u.hash)
       {
-	 util_hash_table_foreach(us->u.table, util_surfaces_destroy_callback, destroy_surface);
-	 util_hash_table_destroy(us->u.table);
-	 us->u.table = NULL;
+         struct cso_hash_iter iter;
+         iter = cso_hash_first_node(us->u.hash);
+         while (!cso_hash_iter_is_null(iter)) {
+            destroy_surface(cso_hash_iter_data(iter));
+            iter = cso_hash_iter_next(iter);
+         }
+
+         cso_hash_delete(us->u.hash);
+         us->u.hash = NULL;
       }
    }
    else
    {
       if(us->u.array)
       {
-	 unsigned i;
-	 for(i = 0; i < pt->last_level; ++i)
-	 {
-	    struct pipe_surface *ps = us->u.array[i];
-	    if(ps)
-	       destroy_surface(ps);
-	 }
-	 FREE(us->u.array);
-	 us->u.array = NULL;
+         unsigned i;
+         for(i = 0; i <= pt->last_level; ++i)
+         {
+            struct pipe_surface *ps = us->u.array[i];
+            if(ps)
+               destroy_surface(ps);
+         }
+         FREE(us->u.array);
+         us->u.array = NULL;
       }
    }
 }
diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h
index 0195bf5afba..af978c70579 100644
--- a/src/gallium/auxiliary/util/u_surfaces.h
+++ b/src/gallium/auxiliary/util/u_surfaces.h
@@ -4,15 +4,15 @@
 #include "pipe/p_compiler.h"
 #include "pipe/p_state.h"
 #include "util/u_atomic.h"
-
-struct util_hash_table;
+#include "cso_cache/cso_hash.h"
 
 struct util_surfaces
 {
    union
    {
-      struct util_hash_table *table;
+      struct cso_hash *hash;
       struct pipe_surface **array;
+      void* pv;
    } u;
 };
 
@@ -35,6 +35,18 @@ util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct
    return util_surfaces_do_get(us, surface_struct_size, pscreen, pt, face, level, zslice, flags);
 }
 
+static INLINE struct pipe_surface *
+util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice)
+{
+   if(!us->u.pv)
+      return 0;
+
+   if(unlikely(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE))
+      return cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level));
+   else
+      return us->u.array[level];
+}
+
 void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps);
 
 static INLINE void
diff --git a/src/gallium/docs/source/conf.py b/src/gallium/docs/source/conf.py
index 99e665234eb..0846e7d0ece 100644
--- a/src/gallium/docs/source/conf.py
+++ b/src/gallium/docs/source/conf.py
@@ -22,7 +22,7 @@ sys.path.append(os.path.abspath('exts'))
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.pngmath', 'tgsi']
+extensions = ['sphinx.ext.pngmath', 'formatting']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
diff --git a/src/gallium/docs/source/debugging.rst b/src/gallium/docs/source/debugging.rst
new file mode 100644
index 00000000000..42bda5aee93
--- /dev/null
+++ b/src/gallium/docs/source/debugging.rst
@@ -0,0 +1,101 @@
+Debugging
+=========
+
+Debugging utilities in gallium.
+
+Debug Variables
+^^^^^^^^^^^^^^^
+
+All drivers respond to a set of common debug environment variables, as well as
+some driver-specific variables. Set them as normal environment variables for
+the platform or operating system you are running. For example, for Linux this
+can be done by typing "export var=value" into a console and then running the
+program from that console.
+
+Common
+""""""
+
+.. envvar:: GALLIUM_PRINT_OPTIONS <bool> (false)
+
+This option controls if the debug variables should be printed to stderr. This
+is probably the most useful variable, since it allows you to find which
+variables a driver uses.
+
+.. envvar:: GALLIUM_RBUG <bool> (false)
+
+Controls whether :ref:`rbug` should be used.
+
+.. envvar:: GALLIUM_TRACE <string> ("")
+
+If set, this variable will cause the :ref:`Trace` output to be written to the
+specified file. Paths may be relative or absolute; relative paths are relative
+to the working directory.  For example, setting it to "trace.xml" will cause
+the trace to be written to a file of the same name in the working directory.
+
+.. envvar:: GALLIUM_DUMP_CPU <bool> (false)
+
+Dump information about the current CPU that the driver is running on.
+
+.. envvar:: TGSI_PRINT_SANITY <bool> (false)
+
+Gallium has a built-in shader sanity checker.  This option controls whether
+the shader sanity checker prints its warnings and errors to stderr.
+
+.. envvar:: DRAW_USE_LLVM <bool> (false)
+
+Whether the :ref:`Draw` module will attempt to use LLVM for vertex and geometry shaders.
+
+
+State tracker-specific
+""""""""""""""""""""""
+
+.. envvar:: ST_DEBUG <flags> (0x0)
+
+Debug :ref:`flags` for the GL state tracker.
+
+
+Driver-specific
+"""""""""""""""
+
+.. envvar:: I915_DEBUG <flags> (0x0)
+
+Debug :ref:`flags` for the i915 driver.
+
+.. envvar:: I915_NO_HW <bool> (false)
+
+Stop the i915 driver from submitting commands to the hardware.
+
+.. envvar:: I915_DUMP_CMD <bool> (false)
+
+Dump all commands going to the hardware.
+
+.. envvar:: LP_DEBUG <flags> (0x0)
+
+Debug :ref:`flags` for the llvmpipe driver.
+
+.. envvar:: LP_NUM_THREADS <int> (number of CPUs)
+
+Number of threads that the llvmpipe driver should use.
+
+
+.. _flags:
+
+Flags
+"""""
+
+The variables of type "flags" all take a string with comma-separated flags to
+enable different debugging for different parts of the drivers or state
+tracker. If set to "help", the driver will print a list of flags which the
+variable accepts. Order does not matter.
+
+
+.. _rbug:
+
+Remote Debugger
+^^^^^^^^^^^^^^^
+
+The remote debugger, commonly known as rbug, allows for runtime inspections of
+:ref:`Context`, :ref:`Screen`, :ref:`Resource` and :ref:`Shader` objects; and
+pausing and stepping of :ref:`Draw` calls. It is used with rbug-gui, which is
+hosted outside of the main mesa repository. rbug can be used over a network
+connection, so the debugger does not need to be on the same machine.
diff --git a/src/gallium/docs/source/distro.rst b/src/gallium/docs/source/distro.rst
index e379ad32719..70d75b51e65 100644
--- a/src/gallium/docs/source/distro.rst
+++ b/src/gallium/docs/source/distro.rst
@@ -74,6 +74,11 @@ Trace
 Wrapper driver. Trace dumps an XML record of the calls made to the
 :ref:`Context` and :ref:`Screen` objects that it wraps.
 
+Rbug
+^^^^
+
+Wrapper driver. The :ref:`rbug` driver is used with the stand-alone rbug-gui.
+
 State Trackers
 --------------
 
diff --git a/src/gallium/docs/source/exts/formatting.py b/src/gallium/docs/source/exts/formatting.py
new file mode 100644
index 00000000000..14865f36033
--- /dev/null
+++ b/src/gallium/docs/source/exts/formatting.py
@@ -0,0 +1,31 @@
+# formatting.py
+# Sphinx extension providing formatting for Gallium-specific data
+# (c) Corbin Simpson 2010
+# Public domain to the extent permitted; contact author for special licensing
+
+import docutils.nodes
+import sphinx.addnodes
+
+def parse_envvar(env, sig, signode):
+    envvar, t, default = sig.split(" ", 2)
+    envvar = envvar.strip().upper()
+    t = " Type: %s" % t.strip(" <>").lower()
+    default = " Default: %s" % default.strip(" ()")
+    signode += sphinx.addnodes.desc_name(envvar, envvar)
+    signode += sphinx.addnodes.desc_type(t, t)
+    signode += sphinx.addnodes.desc_annotation(default, default)
+    return envvar
+
+def parse_opcode(env, sig, signode):
+    opcode, desc = sig.split("-", 1)
+    opcode = opcode.strip().upper()
+    desc = " (%s)" % desc.strip()
+    signode += sphinx.addnodes.desc_name(opcode, opcode)
+    signode += sphinx.addnodes.desc_annotation(desc, desc)
+    return opcode
+
+def setup(app):
+    app.add_description_unit("envvar", "envvar", "%s (environment variable)",
+        parse_envvar)
+    app.add_description_unit("opcode", "opcode", "%s (TGSI opcode)",
+        parse_opcode)
diff --git a/src/gallium/docs/source/exts/tgsi.py b/src/gallium/docs/source/exts/tgsi.py
deleted file mode 100644
index e92cd5c4d1b..00000000000
--- a/src/gallium/docs/source/exts/tgsi.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# tgsi.py
-# Sphinx extension providing formatting for TGSI opcodes
-# (c) Corbin Simpson 2010
-
-import docutils.nodes
-import sphinx.addnodes
-
-def parse_opcode(env, sig, signode):
-    opcode, desc = sig.split("-", 1)
-    opcode = opcode.strip().upper()
-    desc = " (%s)" % desc.strip()
-    signode += sphinx.addnodes.desc_name(opcode, opcode)
-    signode += sphinx.addnodes.desc_annotation(desc, desc)
-    return opcode
-
-def setup(app):
-    app.add_description_unit("opcode", "opcode", "%s (TGSI opcode)", parse_opcode)
diff --git a/src/gallium/docs/source/index.rst b/src/gallium/docs/source/index.rst
index 54bc883fced..6c19842dac4 100644
--- a/src/gallium/docs/source/index.rst
+++ b/src/gallium/docs/source/index.rst
@@ -12,6 +12,7 @@ Contents:
    :maxdepth: 2
 
    intro
+   debugging
    tgsi
    screen
    context
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 78744da500b..2cf6f38c4b8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -141,7 +141,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
                else {
                   dadx = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dadx_ptr, &index, 1, ""), "");
                   dady = LLVMBuildLoad(builder, LLVMBuildGEP(builder, dady_ptr, &index, 1, ""), "");
-                  dadxy = LLVMBuildAdd(builder, dadx, dady, "");
+                  dadxy = LLVMBuildFAdd(builder, dadx, dady, "");
                   attrib_name(dadx, attrib, chan, ".dadx");
                   attrib_name(dady, attrib, chan, ".dady");
                   attrib_name(dadxy, attrib, chan, ".dadxy");
@@ -177,7 +177,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
              * dadq2 = 2 * dq
              */
 
-            dadq2 = LLVMBuildAdd(builder, dadq, dadq, "");
+            dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
 
             /*
              * a = a0 + x * dadx + y * dady
@@ -193,12 +193,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
                a = a0;
                if (interp != LP_INTERP_CONSTANT &&
                    interp != LP_INTERP_FACING) {
-                  a = LLVMBuildAdd(builder, a,
-                                   LLVMBuildMul(builder, bld->x, dadx, ""),
-                                   "");
-                  a = LLVMBuildAdd(builder, a,
-                                   LLVMBuildMul(builder, bld->y, dady, ""),
-                                   "");
+                  LLVMValueRef tmp;
+                  tmp = LLVMBuildFMul(builder, bld->x, dadx, "");
+                  a = LLVMBuildFAdd(builder, a, tmp, "");
+                  tmp = LLVMBuildFMul(builder, bld->y, dady, "");
+                  a = LLVMBuildFAdd(builder, a, tmp, "");
                }
             }
 
@@ -212,7 +211,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
              * Compute the attrib values on the upper-left corner of each quad.
              */
 
-            a = LLVMBuildAdd(builder, a, dadq2, "");
+            a = LLVMBuildFAdd(builder, a, dadq2, "");
 
             /*
              * a    *= 1 / w
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 28793682edf..7543bd7b2b0 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -47,7 +47,7 @@
 #include "lp_setup.h"
 
 
-DEBUG_GET_ONCE_BOOL_OPTION(lp_no_rast, "LP_NO_RAST", FALSE);
+DEBUG_GET_ONCE_BOOL_OPTION(lp_no_rast, "LP_NO_RAST", FALSE)
 
 
 static void llvmpipe_destroy( struct pipe_context *pipe )
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index eaf2a6f3345..102e902d02c 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -104,9 +104,6 @@ struct lp_rast_plane {
 
    int dcdx;
    int dcdy;
-   
-   /* edge/step info for 3 edges and 4x4 block of pixels */
-   const int *step;
 };
 
 /**
@@ -119,8 +116,6 @@ struct lp_rast_triangle {
    /* inputs for the shader */
    struct lp_rast_shader_inputs inputs;
 
-   int step[3][16];
-
 #ifdef DEBUG
    float v[3][2];
 #endif
@@ -261,5 +256,9 @@ void lp_rast_begin_query(struct lp_rasterizer_task *,
 void lp_rast_end_query(struct lp_rasterizer_task *,
                        const union lp_rast_cmd_arg );
 
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg);
+
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index ebe9a8e92b4..673f67386bc 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -37,52 +37,6 @@
 #include "lp_tile_soa.h"
 
 
-/**
- * Map an index in [0,15] to an x,y position, multiplied by 4.
- * This is used to get the position of each subtile in a 4x4
- * grid of edge step values.
- * Note: we can use some bit twiddling to compute these values instead
- * of using a look-up table, but there's no measurable performance
- * difference.
- */
-static const int pos_table4[16][2] = {
-   { 0, 0 },
-   { 4, 0 },
-   { 0, 4 },
-   { 4, 4 },
-   { 8, 0 },
-   { 12, 0 },
-   { 8, 4 },
-   { 12, 4 },
-   { 0, 8 },
-   { 4, 8 },
-   { 0, 12 },
-   { 4, 12 },
-   { 8, 8 },
-   { 12, 8 },
-   { 8, 12 },
-   { 12, 12 }
-};
-
-
-static const int pos_table16[16][2] = {
-   { 0, 0 },
-   { 16, 0 },
-   { 0, 16 },
-   { 16, 16 },
-   { 32, 0 },
-   { 48, 0 },
-   { 32, 16 },
-   { 48, 16 },
-   { 0, 32 },
-   { 16, 32 },
-   { 0, 48 },
-   { 16, 48 },
-   { 32, 32 },
-   { 48, 32 },
-   { 32, 48 },
-   { 48, 48 }
-};
 
 
 /**
@@ -113,6 +67,68 @@ block_full_16(struct lp_rasterizer_task *task,
 	 block_full_4(task, tri, x + ix, y + iy);
 }
 
+
+static INLINE unsigned
+build_mask(int c, int dcdx, int dcdy)
+{
+   int mask = 0;
+
+   int c0 = c;
+   int c1 = c0 + dcdx;
+   int c2 = c1 + dcdx;
+   int c3 = c2 + dcdx;
+
+   mask |= ((c0 + 0 * dcdy) >> 31) & (1 << 0);
+   mask |= ((c0 + 1 * dcdy) >> 31) & (1 << 2);
+   mask |= ((c0 + 2 * dcdy) >> 31) & (1 << 8);
+   mask |= ((c0 + 3 * dcdy) >> 31) & (1 << 10);
+   mask |= ((c1 + 0 * dcdy) >> 31) & (1 << 1);
+   mask |= ((c1 + 1 * dcdy) >> 31) & (1 << 3);
+   mask |= ((c1 + 2 * dcdy) >> 31) & (1 << 9);
+   mask |= ((c1 + 3 * dcdy) >> 31) & (1 << 11); 
+   mask |= ((c2 + 0 * dcdy) >> 31) & (1 << 4);
+   mask |= ((c2 + 1 * dcdy) >> 31) & (1 << 6);
+   mask |= ((c2 + 2 * dcdy) >> 31) & (1 << 12);
+   mask |= ((c2 + 3 * dcdy) >> 31) & (1 << 14);
+   mask |= ((c3 + 0 * dcdy) >> 31) & (1 << 5);
+   mask |= ((c3 + 1 * dcdy) >> 31) & (1 << 7);
+   mask |= ((c3 + 2 * dcdy) >> 31) & (1 << 13);
+   mask |= ((c3 + 3 * dcdy) >> 31) & (1 << 15);
+  
+   return mask;
+}
+
+static INLINE unsigned
+build_mask_linear(int c, int dcdx, int dcdy)
+{
+   int mask = 0;
+
+   int c0 = c;
+   int c1 = c0 + dcdy;
+   int c2 = c1 + dcdy;
+   int c3 = c2 + dcdy;
+
+   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
+   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
+   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
+   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
+   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
+   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
+   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
+   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); 
+   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
+   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
+   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
+   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
+   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
+   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
+   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
+   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);
+  
+   return mask;
+}
+
+
 #define TAG(x) x##_1
 #define NR_PLANES 1
 #include "lp_rast_tri_tmp.h"
@@ -141,3 +157,85 @@ block_full_16(struct lp_rasterizer_task *task,
 #define NR_PLANES 7
 #include "lp_rast_tri_tmp.h"
 
+
+/* Special case for 3 plane triangle which is contained entirely
+ * within a 16x16 block.
+ */
+void
+lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = tri->plane;
+   unsigned mask = arg.triangle.plane_mask;
+   const int x = task->x + (mask & 0xf) * 16;
+   const int y = task->y + (mask >> 4) * 16;
+   unsigned outmask, inmask, partmask, partial_mask;
+   unsigned j;
+   int c[3];
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   partmask = 0;                /* outside one or more trivial accept planes */
+
+   for (j = 0; j < 3; j++) {
+      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+
+      {
+	 const int dcdx = -plane[j].dcdx * 4;
+	 const int dcdy = plane[j].dcdy * 4;
+	 const int cox = c[j] + plane[j].eo * 4;
+	 const int cio = c[j] + plane[j].ei * 4 - 1;
+
+	 outmask |= build_mask_linear(cox, dcdx, dcdy);
+	 partmask |= build_mask_linear(cio, dcdx, dcdy);
+      }
+   }
+
+   if (outmask == 0xffff)
+      return;
+
+   /* Mask of sub-blocks which are inside all trivial accept planes:
+    */
+   inmask = ~partmask & 0xffff;
+
+   /* Mask of sub-blocks which are inside all trivial reject planes,
+    * but outside at least one trivial accept plane:
+    */
+   partial_mask = partmask & ~outmask;
+
+   assert((partial_mask & inmask) == 0);
+
+   /* Iterate over partials:
+    */
+   while (partial_mask) {
+      int i = ffs(partial_mask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+      int cx[3];
+
+      partial_mask &= ~(1 << i);
+
+      for (j = 0; j < 3; j++)
+         cx[j] = (c[j] 
+		  - plane[j].dcdx * ix
+		  + plane[j].dcdy * iy);
+
+      do_block_4_3(task, tri, plane, px, py, cx);
+   }
+
+   /* Iterate over fulls: 
+    */
+   while (inmask) {
+      int i = ffs(inmask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+
+      inmask &= ~(1 << i);
+
+      block_full_4(task, tri, px, py);
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index a410c611a3f..43f72d8ca8f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -46,19 +46,13 @@ TAG(do_block_4)(struct lp_rasterizer_task *task,
                 int x, int y,
                 const int *c)
 {
-   unsigned mask = 0;
-   int i;
+   unsigned mask = 0xffff;
+   int j;
 
-   for (i = 0; i < 16; i++) {
-      int any_negative = 0;
-      int j;
-
-      for (j = 0; j < NR_PLANES; j++) 
-         any_negative |= (c[j] - 1 + plane[j].step[i]);
-         
-      any_negative >>= 31;
-
-      mask |= (~any_negative) & (1 << i);
+   for (j = 0; j < NR_PLANES; j++) {
+      mask &= ~build_mask(c[j] - 1, 
+			  -plane[j].dcdx,
+			  plane[j].dcdy);
    }
 
    /* Now pass to the shader:
@@ -79,24 +73,19 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
                  const int *c)
 {
    unsigned outmask, inmask, partmask, partial_mask;
-   unsigned i, j;
+   unsigned j;
 
    outmask = 0;                 /* outside one or more trivial reject planes */
    partmask = 0;                /* outside one or more trivial accept planes */
 
    for (j = 0; j < NR_PLANES; j++) {
-      const int *step = plane[j].step;
-      const int eo = plane[j].eo * 4;
-      const int ei = plane[j].ei * 4;
-      const int cox = c[j] + eo;
-      const int cio = ei - 1 - eo;
-
-      for (i = 0; i < 16; i++) {
-         int out = cox + step[i] * 4;
-         int part = out + cio;
-         outmask  |= (out >> 31) & (1 << i);
-         partmask |= (part >> 31) & (1 << i);
-      }
+      const int dcdx = -plane[j].dcdx * 4;
+      const int dcdy = plane[j].dcdy * 4;
+      const int cox = c[j] + plane[j].eo * 4;
+      const int cio = c[j] + plane[j].ei * 4 - 1;
+
+      outmask |= build_mask_linear(cox, dcdx, dcdy);
+      partmask |= build_mask_linear(cio, dcdx, dcdy);
    }
 
    if (outmask == 0xffff)
@@ -117,15 +106,19 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
     */
    while (partial_mask) {
       int i = ffs(partial_mask) - 1;
-      int px = x + pos_table4[i][0];
-      int py = y + pos_table4[i][1];
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
       int cx[NR_PLANES];
 
-      for (j = 0; j < NR_PLANES; j++)
-         cx[j] = c[j] + plane[j].step[i] * 4;
-
       partial_mask &= ~(1 << i);
 
+      for (j = 0; j < NR_PLANES; j++)
+         cx[j] = (c[j] 
+		  - plane[j].dcdx * ix
+		  + plane[j].dcdy * iy);
+
       TAG(do_block_4)(task, tri, plane, px, py, cx);
    }
 
@@ -133,8 +126,10 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
     */
    while (inmask) {
       int i = ffs(inmask) - 1;
-      int px = x + pos_table4[i][0];
-      int py = y + pos_table4[i][1];
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
 
       inmask &= ~(1 << i);
 
@@ -157,35 +152,28 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
    struct lp_rast_plane plane[NR_PLANES];
    int c[NR_PLANES];
    unsigned outmask, inmask, partmask, partial_mask;
-   unsigned i, j, nr_planes = 0;
+   unsigned j = 0;
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   partmask = 0;                /* outside one or more trivial accept planes */
 
    while (plane_mask) {
       int i = ffs(plane_mask) - 1;
-      plane[nr_planes] = tri->plane[i];
+      plane[j] = tri->plane[i];
       plane_mask &= ~(1 << i);
-      nr_planes++;
-   };
-
-   assert(nr_planes == NR_PLANES);
-   outmask = 0;                 /* outside one or more trivial reject planes */
-   partmask = 0;                /* outside one or more trivial accept planes */
+      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
 
-   for (j = 0; j < NR_PLANES; j++) {
-      const int *step = plane[j].step;
-      const int eo = plane[j].eo * 16;
-      const int ei = plane[j].ei * 16;
-      int cox, cio;
+      {
+	 const int dcdx = -plane[j].dcdx * 16;
+	 const int dcdy = plane[j].dcdy * 16;
+	 const int cox = c[j] + plane[j].eo * 16;
+	 const int cio = c[j] + plane[j].ei * 16 - 1;
 
-      c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
-      cox = c[j] + eo;
-      cio = ei - 1 - eo;
-
-      for (i = 0; i < 16; i++) {
-         int out = cox + step[i] * 16;
-         int part = out + cio;
-         outmask  |= (out >> 31) & (1 << i);
-         partmask |= (part >> 31) & (1 << i);
+	 outmask |= build_mask_linear(cox, dcdx, dcdy);
+	 partmask |= build_mask_linear(cio, dcdx, dcdy);
       }
+
+      j++;
    }
 
    if (outmask == 0xffff)
@@ -206,12 +194,16 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
     */
    while (partial_mask) {
       int i = ffs(partial_mask) - 1;
-      int px = x + pos_table16[i][0];
-      int py = y + pos_table16[i][1];
+      int ix = (i & 3) * 16;
+      int iy = (i >> 2) * 16;
+      int px = x + ix;
+      int py = y + iy;
       int cx[NR_PLANES];
 
       for (j = 0; j < NR_PLANES; j++)
-         cx[j] = c[j] + plane[j].step[i] * 16;
+         cx[j] = (c[j]
+		  - plane[j].dcdx * ix
+		  + plane[j].dcdy * iy);
 
       partial_mask &= ~(1 << i);
 
@@ -223,8 +215,10 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
     */
    while (inmask) {
       int i = ffs(inmask) - 1;
-      int px = x + pos_table16[i][0];
-      int py = y + pos_table16[i][1];
+      int ix = (i & 3) * 16;
+      int iy = (i >> 2) * 16;
+      int px = x + ix;
+      int py = y + iy;
 
       inmask &= ~(1 << i);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 7e432503c12..614a6372b42 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -61,36 +61,6 @@ struct tri_info {
 
 
 
-static const int step_scissor_minx[16] = {
-   0, 1, 0, 1,
-   2, 3, 2, 3,
-   0, 1, 0, 1,
-   2, 3, 2, 3
-};
-
-static const int step_scissor_maxx[16] = {
-    0, -1,  0, -1,
-   -2, -3, -2, -3,
-    0, -1,  0, -1,
-   -2, -3, -2, -3
-};
-
-static const int step_scissor_miny[16] = {
-   0, 0, 1, 1,
-   0, 0, 1, 1,
-   2, 2, 3, 3,
-   2, 2, 3, 3
-};
-
-static const int step_scissor_maxy[16] = {
-    0,  0, -1, -1,
-    0,  0, -1, -1,
-   -2, -2, -3, -3,
-   -2, -2, -3, -3
-};
-
-
-
    
 static INLINE int
 subpixel_snap(float a)
@@ -260,13 +230,13 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,
 {
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
    unsigned slot;
+   unsigned i;
 
    /* setup interpolation for all the remaining attributes:
     */
    for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
       unsigned vert_attr = setup->fs.input[slot].src_index;
       unsigned usage_mask = setup->fs.input[slot].usage_mask;
-      unsigned i;
 
       switch (setup->fs.input[slot].interp) {
       case LP_INTERP_CONSTANT:
@@ -316,6 +286,34 @@ static void setup_tri_coefficients( struct lp_setup_context *setup,
    /* The internal position input is in slot zero:
     */
    setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask);
+
+   if (0) {
+      for (i = 0; i < NUM_CHANNELS; i++) {
+         float a0   = tri->inputs.a0  [0][i];
+         float dadx = tri->inputs.dadx[0][i];
+         float dady = tri->inputs.dady[0][i];
+
+         debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
+                      "xyzw"[i],
+                      a0, dadx, dady);
+      }
+
+      for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
+         unsigned usage_mask = setup->fs.input[slot].usage_mask;
+         for (i = 0; i < NUM_CHANNELS; i++) {
+            if (usage_mask & (1 << i)) {
+               float a0   = tri->inputs.a0  [1 + slot][i];
+               float dadx = tri->inputs.dadx[1 + slot][i];
+               float dady = tri->inputs.dady[1 + slot][i];
+
+               debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
+                            slot,
+                            "xyzw"[i],
+                            a0, dadx, dady);
+            }
+         }
+      }
+   }
 }
 
 
@@ -525,7 +523,7 @@ do_triangle_ccw(struct lp_setup_context *setup,
    info.dx20 = info.v2[0][0] - info.v0[0][0];
    info.dy01 = info.v0[0][1] - info.v1[0][1];
    info.dy20 = info.v2[0][1] - info.v0[0][1];
-   info.oneoverarea = 1.0 / (info.dx01 * info.dy20 - info.dx20 * info.dy01);
+   info.oneoverarea = 1.0f / (info.dx01 * info.dy20 - info.dx20 * info.dy01);
    info.frontfacing = frontfacing;
 
    /* Setup parameter interpolants:
@@ -590,35 +588,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
       /* Calculate trivial accept offsets from the above.
        */
       plane->ei = plane->dcdy - plane->dcdx - plane->eo;
-
-      plane->step = tri->step[i];
-
-      /* Fill in the inputs.step[][] arrays.
-       * We've manually unrolled some loops here.
-       */
-#define SETUP_STEP(j, x, y) \
-      tri->step[i][j] = y * plane->dcdy - x * plane->dcdx
-      
-      SETUP_STEP(0, 0, 0);
-      SETUP_STEP(1, 1, 0);
-      SETUP_STEP(2, 0, 1);
-      SETUP_STEP(3, 1, 1);
-
-      SETUP_STEP(4, 2, 0);
-      SETUP_STEP(5, 3, 0);
-      SETUP_STEP(6, 2, 1);
-      SETUP_STEP(7, 3, 1);
-
-      SETUP_STEP(8, 0, 2);
-      SETUP_STEP(9, 1, 2);
-      SETUP_STEP(10, 0, 3);
-      SETUP_STEP(11, 1, 3);
-
-      SETUP_STEP(12, 2, 2);
-      SETUP_STEP(13, 3, 2);
-      SETUP_STEP(14, 2, 3);
-      SETUP_STEP(15, 3, 3);
-#undef STEP
    }
 
 
@@ -641,28 +610,24 @@ do_triangle_ccw(struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 7) {
-      tri->plane[3].step = step_scissor_minx;
       tri->plane[3].dcdx = -1;
       tri->plane[3].dcdy = 0;
       tri->plane[3].c = 1-minx;
       tri->plane[3].ei = 0;
       tri->plane[3].eo = 1;
 
-      tri->plane[4].step = step_scissor_maxx;
       tri->plane[4].dcdx = 1;
       tri->plane[4].dcdy = 0;
       tri->plane[4].c = maxx;
       tri->plane[4].ei = -1;
       tri->plane[4].eo = 0;
 
-      tri->plane[5].step = step_scissor_miny;
       tri->plane[5].dcdx = 0;
       tri->plane[5].dcdy = 1;
       tri->plane[5].c = 1-miny;
       tri->plane[5].ei = 0;
       tri->plane[5].eo = 1;
 
-      tri->plane[6].step = step_scissor_maxy;
       tri->plane[6].dcdx = 0;
       tri->plane[6].dcdy = -1;
       tri->plane[6].c = maxy;
@@ -678,6 +643,26 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    /* Convert to tile coordinates, and inclusive ranges:
     */
+   if (nr_planes == 3) {
+      int ix0 = minx / 16;
+      int iy0 = miny / 16;
+      int ix1 = (maxx-1) / 16;
+      int iy1 = (maxy-1) / 16;
+      
+      if (iy0 == iy1 && ix0 == ix1)
+      {
+
+	 /* Triangle is contained in a single 16x16 block:
+	  */
+	 int mask = (ix0 & 3) | ((iy0 & 3) << 4);
+
+	 lp_scene_bin_command( scene, ix0/4, iy0/4,
+			       lp_rast_triangle_3_16,
+			       lp_rast_arg_triangle(tri, mask) );
+	 return;
+      }
+   }
+
    ix0 = minx / TILE_SIZE;
    iy0 = miny / TILE_SIZE;
    ix1 = (maxx-1) / TILE_SIZE;
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index c71ec8066c7..2ba39052aba 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -293,34 +293,7 @@ def generate_ssse3():
     print '''
 #if defined(PIPE_ARCH_SSE)
 
-
-#if defined(PIPE_ARCH_SSSE3)
-
-#include <tmmintrin.h>
-
-#else
-
-#include <emmintrin.h>
-
-/**
- * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases
- * where -mssse3 is not supported/enabled.
- *
- * MSVC will never get in here as its intrinsics support do not rely on
- * compiler command line options.
- */
-static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_epi8(__m128i a, __m128i mask)
-{
-    __m128i result;
-    __asm__("pshufb %1, %0"
-            : "=x" (result)
-            : "xm" (mask), "0" (a));
-    return result;
-}
-
-#endif
-
+#include "util/u_sse.h"
 
 static void
 lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h
index f5c1c5ca2c3..e920cf9f3bc 100644
--- a/src/gallium/drivers/nouveau/nouveau_stateobj.h
+++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h
@@ -151,9 +151,9 @@ so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 	if (so->start_alloc <= so->cur_start) {
 		debug_printf("exceeding num_start size\n");
 		assert(0);
-	} else
+	}
 #endif /* DEBUG_NOUVEAU_STATEOBJ */
-		start = so->start;
+	start = so->start;
 
 #ifdef DEBUG_NOUVEAU_STATEOBJ
 	if (so->cur_start > 0 && start[so->cur_start - 1].size > so->cur) {
@@ -162,7 +162,6 @@ so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr,
 	}
 #endif /* DEBUG_NOUVEAU_STATEOBJ */
 
-	so->start = start;
 	start[so->cur_start].gr = gr;
 	start[so->cur_start].mthd = mthd;
 	start[so->cur_start].size = size;
@@ -193,11 +192,10 @@ so_reloc(struct nouveau_stateobj *so, struct nouveau_bo *bo,
 	if (so->reloc_alloc <= so->cur_reloc) {
 		debug_printf("exceeding num_reloc size\n");
 		assert(0);
-	} else
+	}
 #endif /* DEBUG_NOUVEAU_STATEOBJ */
-		r = so->reloc;
+	r = so->reloc;
 
-	so->reloc = r;
 	r[so->cur_reloc].bo = NULL;
 	nouveau_bo_ref(bo, &(r[so->cur_reloc].bo));
 	r[so->cur_reloc].gr = so->start[so->cur_start-1].gr;
diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h
index a5e8537533e..b165f7a611a 100644
--- a/src/gallium/drivers/nouveau/nouveau_util.h
+++ b/src/gallium/drivers/nouveau/nouveau_util.h
@@ -88,104 +88,4 @@ static INLINE unsigned log2i(unsigned i)
 	return r;
 }
 
-struct u_split_prim {
-   void *priv;
-   void (*emit)(void *priv, unsigned start, unsigned count);
-   void (*edge)(void *priv, boolean enabled);
-
-   unsigned mode;
-   unsigned start;
-   unsigned p_start;
-   unsigned p_end;
-
-   uint repeat_first:1;
-   uint close_first:1;
-   uint edgeflag_off:1;
-};
-
-static INLINE void
-u_split_prim_init(struct u_split_prim *s,
-                  unsigned mode, unsigned start, unsigned count)
-{
-   if (mode == PIPE_PRIM_LINE_LOOP) {
-      s->mode = PIPE_PRIM_LINE_STRIP;
-      s->close_first = 1;
-   } else {
-      s->mode = mode;
-      s->close_first = 0;
-   }
-   s->start = start;
-   s->p_start = start;
-   s->p_end = start + count;
-   s->edgeflag_off = 0;
-   s->repeat_first = 0;
-}
-
-static INLINE boolean
-u_split_prim_next(struct u_split_prim *s, unsigned max_verts)
-{
-   int repeat = 0;
-
-   if (s->repeat_first) {
-      s->emit(s->priv, s->start, 1);
-      max_verts--;
-      if (s->edgeflag_off) {
-         s->edge(s->priv, TRUE);
-         s->edgeflag_off = FALSE;
-      }
-   }
-
-   if (s->p_start + s->close_first + max_verts >= s->p_end) {
-      s->emit(s->priv, s->p_start, s->p_end - s->p_start);
-      if (s->close_first)
-         s->emit(s->priv, s->start, 1);
-      return TRUE;
-   }
-
-   switch (s->mode) {
-   case PIPE_PRIM_LINES:
-      max_verts &= ~1;
-      break;
-   case PIPE_PRIM_LINE_STRIP:
-      repeat = 1;
-      break;
-   case PIPE_PRIM_POLYGON:
-      max_verts--;
-      s->emit(s->priv, s->p_start, max_verts);
-      s->edge(s->priv, FALSE);
-      s->emit(s->priv, s->p_start + max_verts, 1);
-      s->p_start += max_verts;
-      s->repeat_first = TRUE;
-      s->edgeflag_off = TRUE;
-      return FALSE;
-   case PIPE_PRIM_TRIANGLES:
-      max_verts = max_verts - (max_verts % 3);
-      break;
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      /* to ensure winding stays correct, always split
-       * on an even number of generated triangles
-       */
-      max_verts = max_verts & ~1;
-      repeat = 2;
-      break;
-   case PIPE_PRIM_TRIANGLE_FAN:
-      s->repeat_first = TRUE;
-      repeat = 1;
-      break;
-   case PIPE_PRIM_QUADS:
-      max_verts &= ~3;
-      break;
-   case PIPE_PRIM_QUAD_STRIP:
-      max_verts &= ~1;
-      repeat = 2;
-      break;
-   default:
-      break;
-   }
-
-   s->emit (s->priv, s->p_start, max_verts);
-   s->p_start += (max_verts - repeat);
-   return FALSE;
-}
-
 #endif
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index df79ca89ca1..c6c93d40b8f 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -24,11 +24,10 @@ nouveau_screen_transfer_flags(unsigned pipe)
 		flags |= NOUVEAU_BO_WR;
 	if (pipe & PIPE_TRANSFER_DISCARD)
 		flags |= NOUVEAU_BO_INVAL;
-	if (pipe & PIPE_TRANSFER_DONTBLOCK)
-		flags |= NOUVEAU_BO_NOWAIT;
-	else
 	if (pipe & PIPE_TRANSFER_UNSYNCHRONIZED)
 		flags |= NOUVEAU_BO_NOSYNC;
+	else if (pipe & PIPE_TRANSFER_DONTBLOCK)
+		flags |= NOUVEAU_BO_NOWAIT;
 
 	return flags;
 }
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index c3ac8041462..6a2ffd5a3c8 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -2,8 +2,8 @@
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "util/u_split_prim.h"
 
-#include "nouveau/nouveau_util.h"
 #include "nv50_context.h"
 #include "nv50_resource.h"
 
@@ -217,7 +217,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe,
                                4; /* potential edgeflag enable/disable */
    const unsigned v_overhead = 1 + /* VERTEX_DATA packet header */
                                2; /* potential edgeflag modification */
-   struct u_split_prim s;
+   struct util_split_prim s;
    unsigned vtx_size;
    boolean nzi = FALSE;
    int i;
@@ -335,7 +335,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe,
          ctx.attr[i].map = (uint8_t *)ctx.attr[i].map + ctx.attr[i].stride;
       }
 
-      u_split_prim_init(&s, mode, start, count);
+      util_split_prim_init(&s, mode, start, count);
       do {
          if (AVAIL_RING(chan) < p_overhead + (6 * vtx_size)) {
             FIRE_RING(chan);
@@ -351,7 +351,7 @@ nv50_push_elements_instanced(struct pipe_context *pipe,
 
          BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
          OUT_RING  (chan, nv50_prim(s.mode) | (nzi ? (1 << 28) : 0));
-         done = u_split_prim_next(&s, max_verts);
+         done = util_split_prim_next(&s, max_verts);
          BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
          OUT_RING  (chan, 0);
       } while (!done);
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index e7f8fe33edf..1f119501999 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -24,8 +24,8 @@
 #include "pipe/p_state.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
+#include "util/u_split_prim.h"
 
-#include "nouveau/nouveau_util.h"
 #include "nv50_context.h"
 #include "nv50_resource.h"
 
@@ -311,7 +311,7 @@ nv50_draw_elements_inline(struct pipe_context *pipe,
 	struct pipe_transfer *transfer;
 	struct instance a[16];
 	struct inline_ctx ctx;
-	struct u_split_prim s;
+	struct util_split_prim s;
 	boolean nzi = FALSE;
 	unsigned overhead;
 
@@ -347,7 +347,7 @@ nv50_draw_elements_inline(struct pipe_context *pipe,
 		unsigned max_verts;
 		boolean done;
 
-		u_split_prim_init(&s, mode, start, count);
+		util_split_prim_init(&s, mode, start, count);
 		do {
 			if (AVAIL_RING(chan) < (overhead + 6)) {
 				FIRE_RING(chan);
@@ -366,7 +366,7 @@ nv50_draw_elements_inline(struct pipe_context *pipe,
 
 			BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
 			OUT_RING  (chan, nv50_prim(s.mode) | (nzi ? (1<<28) : 0));
-			done = u_split_prim_next(&s, max_verts);
+			done = util_split_prim_next(&s, max_verts);
 			BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
 			OUT_RING  (chan, 0);
 		} while (!done);
diff --git a/src/gallium/drivers/r300/r300_blit.c b/src/gallium/drivers/r300/r300_blit.c
index 6f8d9abfc81..47ffc0cb3c6 100644
--- a/src/gallium/drivers/r300/r300_blit.c
+++ b/src/gallium/drivers/r300/r300_blit.c
@@ -21,6 +21,8 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
 #include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_hyperz.h"
 #include "r300_texture.h"
 #include "r300_winsys.h"
 
@@ -99,9 +101,6 @@ static boolean r300_cbzb_clear_allowed(struct r300_context *r300,
     struct pipe_framebuffer_state *fb =
         (struct pipe_framebuffer_state*)r300->fb_state.state;
 
-    if (r300->z_fastfill)
-        clear_buffers &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
-
     /* Only color clear allowed, and only one colorbuffer. */
     if (clear_buffers != PIPE_CLEAR_COLOR || fb->nr_cbufs != 1)
         return FALSE;
@@ -173,22 +172,25 @@ static void r300_clear(struct pipe_context* pipe,
         (struct pipe_framebuffer_state*)r300->fb_state.state;
     struct r300_hyperz_state *hyperz =
         (struct r300_hyperz_state*)r300->hyperz_state.state;
+    struct r300_texture *zstex =
+            fb->zsbuf ? r300_texture(fb->zsbuf->texture) : NULL;
     uint32_t width = fb->width;
     uint32_t height = fb->height;
     boolean has_hyperz = r300->rws->get_value(r300->rws, R300_CAN_HYPERZ);
-    uint32_t hyperz_dcv = 0;
+    uint32_t hyperz_dcv = hyperz->zb_depthclearvalue;
 
     /* Enable fast Z clear.
      * The zbuffer must be in micro-tiled mode, otherwise it locks up. */
-    if ((buffers & (PIPE_CLEAR_DEPTH|PIPE_CLEAR_STENCIL)) && has_hyperz) {
-      
+    if ((buffers & PIPE_CLEAR_DEPTHSTENCIL) && has_hyperz) {
         hyperz_dcv = hyperz->zb_depthclearvalue =
             r300_depth_clear_value(fb->zsbuf->format, depth, stencil);
 
         r300_mark_fb_state_dirty(r300, R300_CHANGED_ZCLEAR_FLAG);
-        if (r300->z_compression || r300->z_fastfill)
+        if (zstex->zmask_mem[fb->zsbuf->level]) {
             r300->zmask_clear.dirty = TRUE;
-        if (r300->hiz_enable)
+            buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
+        }
+        if (zstex->hiz_mem[fb->zsbuf->level])
             r300->hiz_clear.dirty = TRUE;
     }
 
@@ -207,13 +209,43 @@ static void r300_clear(struct pipe_context* pipe,
     }
 
     /* Clear. */
-    r300_blitter_begin(r300, R300_CLEAR);
-    util_blitter_clear(r300->blitter,
-                       width,
-                       height,
-                       fb->nr_cbufs,
-                       buffers, rgba, depth, stencil);
-    r300_blitter_end(r300);
+    if (buffers) {
+        /* Clear using the blitter. */
+        r300_blitter_begin(r300, R300_CLEAR);
+        util_blitter_clear(r300->blitter,
+                           width,
+                           height,
+                           fb->nr_cbufs,
+                           buffers, rgba, depth, stencil);
+        r300_blitter_end(r300);
+    } else if (r300->zmask_clear.dirty) {
+        /* Just clear zmask and hiz now; this does not use a standard draw
+         * procedure. */
+        unsigned dwords;
+
+        /* Calculate zmask_clear and hiz_clear atom sizes. */
+        r300_update_hyperz_state(r300);
+        dwords = r300->zmask_clear.size +
+                 (r300->hiz_clear.dirty ? r300->hiz_clear.size : 0) +
+                 r300_get_num_cs_end_dwords(r300);
+
+        /* Reserve CS space. */
+        if (dwords > (r300->cs->ndw - r300->cs->cdw)) {
+            r300->context.flush(&r300->context, 0, NULL);
+        }
+
+        /* Emit clear packets. */
+        r300_emit_zmask_clear(r300, r300->zmask_clear.size,
+                              r300->zmask_clear.state);
+        r300->zmask_clear.dirty = FALSE;
+        if (r300->hiz_clear.dirty) {
+            r300_emit_hiz_clear(r300, r300->hiz_clear.size,
+                                r300->hiz_clear.state);
+            r300->hiz_clear.dirty = FALSE;
+        }
+    } else {
+        assert(0);
+    }
 
     /* Disable CBZB clear. */
     if (r300->cbzb_clear) {
@@ -222,6 +254,16 @@ static void r300_clear(struct pipe_context* pipe,
         r300_mark_fb_state_dirty(r300, R300_CHANGED_CBZB_FLAG);
     }
 
+    /* Enable fastfill and/or hiz.
+     *
+     * If we cleared zmask/hiz, it's in use now. The Hyper-Z state update
+     * checks whether zmask/hiz is in use and enables fastfill accordingly. */
+    if (zstex &&
+        (zstex->zmask_in_use[fb->zsbuf->level] ||
+         zstex->hiz_in_use[fb->zsbuf->level])) {
+        r300->hyperz_state.dirty = TRUE;
+    }
+
     /* XXX this flush "fixes" a hardlock in the cubestorm xscreensaver */
     if (r300->flush_counter == 0)
         pipe->flush(pipe, 0, NULL);
@@ -259,27 +301,31 @@ static void r300_clear_depth_stencil(struct pipe_context *pipe,
     r300_blitter_end(r300);
 }
 
-/* Clear a region of a depth stencil surface. */
-static void r300_flush_depth_stencil(struct pipe_context *pipe,
-                                     struct pipe_resource *dst,
-                                     struct pipe_subresource subdst)
+/* Flush a depth stencil buffer. */
+void r300_flush_depth_stencil(struct pipe_context *pipe,
+                              struct pipe_resource *dst,
+                              struct pipe_subresource subdst,
+                              unsigned zslice)
 {
     struct r300_context *r300 = r300_context(pipe);
     struct pipe_surface *dstsurf;
     struct r300_texture *tex = r300_texture(dst);
 
-    /* only flush the zmask if we have one attached to this texture */
     if (!tex->zmask_mem[subdst.level])
         return;
+    if (!tex->zmask_in_use[subdst.level])
+        return;
 
     dstsurf = pipe->screen->get_tex_surface(pipe->screen, dst,
-                                            subdst.face, subdst.level, 0,
+                                            subdst.face, subdst.level, zslice,
                                             PIPE_BIND_DEPTH_STENCIL);
     r300->z_decomp_rd = TRUE;
     r300_blitter_begin(r300, R300_CLEAR_SURFACE);
     util_blitter_flush_depth_stencil(r300->blitter, dstsurf);
     r300_blitter_end(r300);
     r300->z_decomp_rd = FALSE;
+
+    tex->zmask_in_use[subdst.level] = FALSE;
 }
 
 /* Copy a block of pixels from one surface to another using HW. */
@@ -342,7 +388,7 @@ static void r300_resource_copy_region(struct pipe_context *pipe,
 
     is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0;
     if (is_depth) {
-        r300_flush_depth_stencil(pipe, src, subsrc);
+        r300_flush_depth_stencil(pipe, src, subsrc, srcz);
     }
     if (old_format != new_format) {
         dst->format = new_format;
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index e8b6c4f7af8..a83ad892eaa 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -99,8 +99,10 @@ static void r300_destroy_context(struct pipe_context* context)
     struct r300_context* r300 = r300_context(context);
     struct r300_atom *atom;
 
-    util_blitter_destroy(r300->blitter);
-    draw_destroy(r300->draw);
+    if (r300->blitter)
+        util_blitter_destroy(r300->blitter);
+    if (r300->draw)
+        draw_destroy(r300->draw);
 
     /* Print stats, if enabled. */
     if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) {
@@ -112,40 +114,48 @@ static void r300_destroy_context(struct pipe_context* context)
         }
     }
 
-    u_upload_destroy(r300->upload_vb);
-    u_upload_destroy(r300->upload_ib);
+    if (r300->upload_vb)
+        u_upload_destroy(r300->upload_vb);
+    if (r300->upload_ib)
+        u_upload_destroy(r300->upload_ib);
 
-    /* setup hyper-z mm */
-    if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
+    if (r300->zmask_mm)
         r300_hyperz_destroy_mm(r300);
 
-    translate_cache_destroy(r300->tran.translate_cache);
+    if (r300->tran.translate_cache)
+        translate_cache_destroy(r300->tran.translate_cache);
 
+    /* XXX: This function assumes r300->query_list was initialized */
     r300_release_referenced_objects(r300);
 
-    r300->rws->cs_destroy(r300->cs);
+    if (r300->cs)
+        r300->rws->cs_destroy(r300->cs);
 
+    /* XXX: No way to tell if this was initialized or not? */
     util_mempool_destroy(&r300->pool_transfers);
 
     r300_update_num_contexts(r300->screen, -1);
 
-    FREE(r300->aa_state.state);
-    FREE(r300->blend_color_state.state);
-    FREE(r300->clip_state.state);
-    FREE(r300->fb_state.state);
-    FREE(r300->gpu_flush.state);
-    FREE(r300->hyperz_state.state);
-    FREE(r300->invariant_state.state);
-    FREE(r300->rs_block_state.state);
-    FREE(r300->scissor_state.state);
-    FREE(r300->textures_state.state);
-    FREE(r300->vap_invariant_state.state);
-    FREE(r300->viewport_state.state);
-    FREE(r300->ztop_state.state);
-    FREE(r300->fs_constants.state);
-    FREE(r300->vs_constants.state);
-    if (!r300->screen->caps.has_tcl) {
-        FREE(r300->vertex_stream_state.state);
+    /* Free the structs allocated in r300_setup_atoms() */
+    if (r300->aa_state.state) {
+        FREE(r300->aa_state.state);
+        FREE(r300->blend_color_state.state);
+        FREE(r300->clip_state.state);
+        FREE(r300->fb_state.state);
+        FREE(r300->gpu_flush.state);
+        FREE(r300->hyperz_state.state);
+        FREE(r300->invariant_state.state);
+        FREE(r300->rs_block_state.state);
+        FREE(r300->scissor_state.state);
+        FREE(r300->textures_state.state);
+        FREE(r300->vap_invariant_state.state);
+        FREE(r300->viewport_state.state);
+        FREE(r300->ztop_state.state);
+        FREE(r300->fs_constants.state);
+        FREE(r300->vs_constants.state);
+        if (!r300->screen->caps.has_tcl) {
+            FREE(r300->vertex_stream_state.state);
+        }
     }
     FREE(r300);
 }
@@ -158,12 +168,14 @@ void r300_flush_cb(void *data)
 }
 
 #define R300_INIT_ATOM(atomname, atomsize) \
+ do { \
     r300->atomname.name = #atomname; \
     r300->atomname.state = NULL; \
     r300->atomname.size = atomsize; \
     r300->atomname.emit = r300_emit_##atomname; \
     r300->atomname.dirty = FALSE; \
-    insert_at_tail(&r300->atom_list, &r300->atomname);
+    insert_at_tail(&r300->atom_list, &r300->atomname); \
+ } while (0)
 
 static void r300_setup_atoms(struct r300_context* r300)
 {
@@ -404,12 +416,16 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     r300->context.destroy = r300_destroy_context;
 
-    r300->cs = rws->cs_create(rws);
+    make_empty_list(&r300->query_list);
 
     util_mempool_create(&r300->pool_transfers,
                         sizeof(struct pipe_transfer), 64,
                         UTIL_MEMPOOL_SINGLETHREADED);
 
+    r300->cs = rws->cs_create(rws);
+    if (r300->cs == NULL)
+        goto fail;
+
     if (!r300screen->caps.has_tcl) {
         /* Create a Draw. This is used for SW TCL. */
         r300->draw = draw_create(&r300->context);
@@ -424,8 +440,6 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     r300_setup_atoms(r300);
 
-    make_empty_list(&r300->query_list);
-
     r300_init_blit_functions(r300);
     r300_init_flush_functions(r300);
     r300_init_query_functions(r300);
@@ -433,6 +447,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     r300_init_resource_functions(r300);
 
     r300->blitter = util_blitter_create(&r300->context);
+    if (r300->blitter == NULL)
+        goto fail;
 
     /* Render functions must be initialized after blitter. */
     r300_init_render_functions(r300);
@@ -441,22 +457,25 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     /* setup hyper-z mm */
     if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-        r300_hyperz_init_mm(r300);
+        if (!r300_hyperz_init_mm(r300))
+            goto fail;
 
     r300->upload_ib = u_upload_create(&r300->context,
 				      32 * 1024, 16,
 				      PIPE_BIND_INDEX_BUFFER);
 
     if (r300->upload_ib == NULL)
-        goto no_upload_ib;
+        goto fail;
 
     r300->upload_vb = u_upload_create(&r300->context,
 				      128 * 1024, 16,
 				      PIPE_BIND_VERTEX_BUFFER);
     if (r300->upload_vb == NULL)
-        goto no_upload_vb;
+        goto fail;
 
     r300->tran.translate_cache = translate_cache_create();
+    if (r300->tran.translate_cache == NULL)
+        goto fail;
 
     r300_init_states(&r300->context);
 
@@ -486,10 +505,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
 
     return &r300->context;
 
- no_upload_ib:
-    u_upload_destroy(r300->upload_ib);
- no_upload_vb:
-    FREE(r300);
+ fail:
+    r300_destroy_context(&r300->context);
     return NULL;
 }
 
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index d86a5c8fc98..6fa7f470f98 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -397,6 +397,8 @@ struct r300_texture {
     /* hyper-z memory allocs */
     struct mem_block *hiz_mem[R300_MAX_TEXTURE_LEVELS];
     struct mem_block *zmask_mem[R300_MAX_TEXTURE_LEVELS];
+    boolean zmask_in_use[R300_MAX_TEXTURE_LEVELS];
+    boolean hiz_in_use[R300_MAX_TEXTURE_LEVELS];
 
     /* This is the level tiling flags were last time set for.
      * It's used to prevent redundant tiling-flags changes from happening.*/
@@ -564,12 +566,9 @@ struct r300_context {
     boolean two_sided_color;
     /* Incompatible vertex buffer layout? (misaligned stride or buffer_offset) */
     boolean incompatible_vb_layout;
-    /* Whether fast zclear is enabled. */
-    boolean z_fastfill;
 #define R300_Z_COMPRESS_44 1
 #define RV350_Z_COMPRESS_88 2
     int z_compression;
-    boolean hiz_enable;
     boolean cbzb_clear;
     boolean z_decomp_rd;
 
@@ -628,6 +627,12 @@ void r300_init_render_functions(struct r300_context *r300);
 void r300_init_state_functions(struct r300_context* r300);
 void r300_init_resource_functions(struct r300_context* r300);
 
+/* r300_blit.c */
+void r300_flush_depth_stencil(struct pipe_context *pipe,
+                              struct pipe_resource *dst,
+                              struct pipe_subresource subdst,
+                              unsigned zslice);
+
 /* r300_query.c */
 void r300_resume_query(struct r300_context *r300,
                        struct r300_query *query);
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 17e180a79ac..d0fd45349e3 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -393,7 +393,7 @@ void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
             /* HiZ RAM. */
             if (r300->screen->caps.hiz_ram) {
                 if (tex->hiz_mem[level]) {
-                    OUT_CS_REG(R300_ZB_HIZ_OFFSET, tex->hiz_mem[level]->ofs);
+                    OUT_CS_REG(R300_ZB_HIZ_OFFSET, tex->hiz_mem[level]->ofs << 2);
                     OUT_CS_REG(R300_ZB_HIZ_PITCH, surf_pitch);
                 } else {
                     OUT_CS_REG(R300_ZB_HIZ_OFFSET, 0);
@@ -402,7 +402,7 @@ void r300_emit_fb_state(struct r300_context* r300, unsigned size, void* state)
             }
             /* Z Mask RAM. (compressed zbuffer) */
             if (tex->zmask_mem[level]) {
-                OUT_CS_REG(R300_ZB_ZMASK_OFFSET, tex->zmask_mem[level]->ofs);
+                OUT_CS_REG(R300_ZB_ZMASK_OFFSET, tex->zmask_mem[level]->ofs << 2);
                 OUT_CS_REG(R300_ZB_ZMASK_PITCH, surf_pitch);
             } else {
                 OUT_CS_REG(R300_ZB_ZMASK_OFFSET, 0);
@@ -936,6 +936,22 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
             OUT_CS_TABLE(data, 4);
         }
     }
+
+    /* Emit flow control instructions. */
+    if (code->num_fc_ops) {
+
+        OUT_CS_REG(R300_VAP_PVS_FLOW_CNTL_OPC, code->fc_ops);
+        if (r300screen->caps.is_r500) {
+            OUT_CS_REG_SEQ(R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0, code->num_fc_ops * 2);
+            OUT_CS_TABLE(code->fc_op_addrs.r500, code->num_fc_ops * 2);
+        } else {
+            OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_ADDRS_0, code->num_fc_ops);
+            OUT_CS_TABLE(code->fc_op_addrs.r300, code->num_fc_ops);
+        }
+        OUT_CS_REG_SEQ(R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0, code->num_fc_ops);
+        OUT_CS_TABLE(code->fc_loop_index, code->num_fc_ops);
+    }
+
     END_CS;
 }
 
@@ -1008,6 +1024,8 @@ void r300_emit_hiz_clear(struct r300_context *r300, unsigned size, void *state)
     int i;
 
     tex = r300_texture(fb->zsbuf->texture);
+
+    offset = tex->hiz_mem[fb->zsbuf->level]->ofs;
     stride = tex->desc.stride_in_pixels[fb->zsbuf->level];
 
     /* convert from pixels to 4x4 blocks */
@@ -1028,6 +1046,9 @@ void r300_emit_hiz_clear(struct r300_context *r300, unsigned size, void *state)
         r300_emit_hiz_line_clear(r300, offset, stride, 0xffffffff);
     }
     z->current_func = -1;
+
+    /* Mark the current zbuffer's hiz ram as in use. */
+    tex->hiz_in_use[fb->zsbuf->level] = TRUE;
 }
 
 void r300_emit_zmask_clear(struct r300_context *r300, unsigned size, void *state)
@@ -1043,6 +1064,8 @@ void r300_emit_zmask_clear(struct r300_context *r300, unsigned size, void *state
     tex = r300_texture(fb->zsbuf->texture);
     stride = tex->desc.stride_in_pixels[fb->zsbuf->level];
 
+    offset = tex->zmask_mem[fb->zsbuf->level]->ofs;
+
     if (r300->z_compression == RV350_Z_COMPRESS_88)
         mult = 8;
     else
@@ -1065,6 +1088,9 @@ void r300_emit_zmask_clear(struct r300_context *r300, unsigned size, void *state
         offset <<= offset_shift;
         r300_emit_zmask_line_clear(r300, offset, stride, 0x0);//0xffffffff);
     }
+
+    /* Mark the current zbuffer's zmask as in use. */
+    tex->zmask_in_use[fb->zsbuf->level] = TRUE;
 }
 
 void r300_emit_ztop_state(struct r300_context* r300,
@@ -1186,6 +1212,17 @@ unsigned r300_get_num_dirty_dwords(struct r300_context *r300)
     return dwords;
 }
 
+unsigned r300_get_num_cs_end_dwords(struct r300_context *r300)
+{
+    unsigned dwords = 0;
+
+    /* Emitted in flush. */
+    dwords += 26; /* emit_query_end */
+    dwords += r300->hyperz_state.size + 2; /* emit_hyperz_end + zcache flush */
+
+    return dwords;
+}
+
 /* Emit all dirty state. */
 void r300_emit_dirty_state(struct r300_context* r300)
 {
diff --git a/src/gallium/drivers/r300/r300_emit.h b/src/gallium/drivers/r300/r300_emit.h
index 2f2c2f2dcb4..bae25256346 100644
--- a/src/gallium/drivers/r300/r300_emit.h
+++ b/src/gallium/drivers/r300/r300_emit.h
@@ -116,6 +116,7 @@ void r300_emit_hiz_clear(struct r300_context *r300, unsigned size, void *state);
 void r300_emit_zmask_clear(struct r300_context *r300, unsigned size, void *state);
 
 unsigned r300_get_num_dirty_dwords(struct r300_context *r300);
+unsigned r300_get_num_cs_end_dwords(struct r300_context *r300);
 
 /* Emit all dirty state. */
 void r300_emit_dirty_state(struct r300_context* r300);
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 7fed9b5d074..fe182b6615b 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -44,8 +44,7 @@ static void r300_flush(struct pipe_context* pipe,
     u_upload_flush(r300->upload_ib);
 
     if (r300->dirty_hw) {
-        if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-            r300_emit_hyperz_end(r300);
+        r300_emit_hyperz_end(r300);
         r300_emit_query_end(r300);
 
         r300->flush_counter++;
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 87ff49a90c7..2a0c30620ad 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -72,6 +72,11 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
                 fs_inputs->wpos = i;
                 break;
 
+            case TGSI_SEMANTIC_FACE:
+                assert(index == 0);
+                fs_inputs->face = i;
+                break;
+
             default:
                 fprintf(stderr, "r300: FP: Unknown input semantic: %i\n",
                         info->input_semantic_name[i]);
@@ -120,6 +125,9 @@ static void allocate_hardware_inputs(
             allocate(mydata, inputs->color[i], reg++);
         }
     }
+    if (inputs->face != ATTR_UNUSED) {
+        allocate(mydata, inputs->face, reg++);
+    }
     for (i = 0; i < ATTR_GENERIC_COUNT; i++) {
         if (inputs->generic[i] != ATTR_UNUSED) {
             allocate(mydata, inputs->generic[i], reg++);
@@ -360,13 +368,14 @@ static void r300_translate_fragment_shader(
 {
     struct r300_fragment_program_compiler compiler;
     struct tgsi_to_rc ttr;
-    int wpos;
+    int wpos, face;
     unsigned i;
 
     tgsi_scan_shader(tokens, &shader->info);
     r300_shader_read_fs_inputs(&shader->info, &shader->inputs);
 
     wpos = shader->inputs.wpos;
+    face = shader->inputs.face;
 
     /* Setup the compiler. */
     memset(&compiler, 0, sizeof(compiler));
@@ -383,7 +392,7 @@ static void r300_translate_fragment_shader(
     find_output_registers(&compiler, shader);
 
     if (compiler.Base.Debug) {
-        debug_printf("r300: Initial fragment program\n");
+        DBG(r300, DBG_FP, "r300: Initial fragment program\n");
         tgsi_dump(tokens, 0);
     }
 
@@ -406,6 +415,10 @@ static void r300_translate_fragment_shader(
         rc_transform_fragment_wpos(&compiler.Base, wpos, wpos, TRUE);
     }
 
+    if (face != ATTR_UNUSED) {
+        rc_transform_fragment_face(&compiler.Base, face);
+    }
+
     /* Invoke the compiler */
     r3xx_compile_fragment_program(&compiler);
 
@@ -418,7 +431,7 @@ static void r300_translate_fragment_shader(
     }
 
     if (compiler.Base.Error) {
-        fprintf(stderr, "r300 FP: Compiler Error:\n%sUsing a dummy shader"
+        DBG(r300, DBG_FP, "r300 FP: Compiler Error:\n%sUsing a dummy shader"
                 " instead.\nIf there's an 'unknown opcode' message, please"
                 " file a bug report and attach this log.\n", compiler.Base.ErrorMsg);
 
diff --git a/src/gallium/drivers/r300/r300_hyperz.c b/src/gallium/drivers/r300/r300_hyperz.c
index 10e440ce306..a471b7353bf 100644
--- a/src/gallium/drivers/r300/r300_hyperz.c
+++ b/src/gallium/drivers/r300/r300_hyperz.c
@@ -21,12 +21,14 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
-#include "util/u_format.h"
-#include "util/u_mm.h"
 #include "r300_context.h"
 #include "r300_hyperz.h"
 #include "r300_reg.h"
 #include "r300_fs.h"
+#include "r300_winsys.h"
+
+#include "util/u_format.h"
+#include "util/u_mm.h"
 
 /*
   HiZ rules - taken from various docs 
@@ -127,6 +129,12 @@ static void r300_update_hyperz(struct r300_context* r300)
 {
     struct r300_hyperz_state *z =
         (struct r300_hyperz_state*)r300->hyperz_state.state;
+    struct pipe_framebuffer_state *fb =
+        (struct pipe_framebuffer_state*)r300->fb_state.state;
+    struct r300_texture *zstex =
+            fb->zsbuf ? r300_texture(fb->zsbuf->texture) : NULL;
+    boolean zmask_in_use = FALSE;
+    boolean hiz_in_use = FALSE;
 
     z->gb_z_peq_config = 0;
     z->zb_bw_cntl = 0;
@@ -138,22 +146,32 @@ static void r300_update_hyperz(struct r300_context* r300)
         return;
     }
 
+    if (!zstex)
+        return;
+
+    if (!r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
+        return;
+
+    zmask_in_use = zstex->zmask_in_use[fb->zsbuf->level];
+    hiz_in_use = zstex->hiz_in_use[fb->zsbuf->level];
+
+    /* Z fastfill. */
+    if (zmask_in_use) {
+        z->zb_bw_cntl |= R300_FAST_FILL_ENABLE; /*  | R300_FORCE_COMPRESSED_STENCIL_VALUE_ENABLE;*/
+    }
+
     /* Zbuffer compression. */
-    if (r300->z_compression) {
+    if (zmask_in_use && r300->z_compression) {
         z->zb_bw_cntl |= R300_RD_COMP_ENABLE;
         if (r300->z_decomp_rd == false)
             z->zb_bw_cntl |= R300_WR_COMP_ENABLE;
-       /* RV350 and up optimizations. */
-       if (r300->z_compression == RV350_Z_COMPRESS_88)
-           z->gb_z_peq_config |= R300_GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8;
-    }
-
-    /* Z fastfill. */
-    if (r300->z_fastfill) {
-        z->zb_bw_cntl |= R300_FAST_FILL_ENABLE; /*  | R300_FORCE_COMPRESSED_STENCIL_VALUE_ENABLE;*/
     }
+    /* RV350 and up optimizations. */
+    /* Set even if zmask is unused; section 10.4.9 in the docs is a lie. */
+    if (r300->z_compression == RV350_Z_COMPRESS_88)
+        z->gb_z_peq_config |= R300_GB_Z_PEQ_CONFIG_Z_PEQ_SIZE_8_8;
 
-    if (r300->hiz_enable) {
+    if (hiz_in_use) {
         bool can_hiz = r300_can_hiz(r300);
         if (can_hiz) {
             z->zb_bw_cntl |= R300_HIZ_ENABLE;
@@ -163,8 +181,8 @@ static void r300_update_hyperz(struct r300_context* r300)
         }
     }
 
+    /* R500-specific features and optimizations. */
     if (r300->screen->caps.is_r500) {
-        /* XXX Are these bits really available on RV350? */
         z->zb_bw_cntl |= R500_HIZ_FP_EXP_BITS_3;
         z->zb_bw_cntl |=
                 R500_HIZ_EQUAL_REJECT_ENABLE |
@@ -333,6 +351,12 @@ void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf
 
     tex = r300_texture(surf->base.texture);
 
+    /* We currently don't handle decompression for 3D textures and cubemaps
+     * correctly. */
+    if (tex->desc.b.b.target != PIPE_TEXTURE_1D &&
+        tex->desc.b.b.target != PIPE_TEXTURE_2D)
+        return;
+
     if (tex->zmask_mem[level])
         return;
 
@@ -349,23 +373,36 @@ void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf
     return;
 }
 
-void r300_hyperz_init_mm(struct r300_context *r300)
+boolean r300_hyperz_init_mm(struct r300_context *r300)
 {
     struct r300_screen* r300screen = r300->screen;
     int frag_pipes = r300screen->caps.num_frag_pipes;
 
-    if (r300screen->caps.hiz_ram)
+    r300->zmask_mm = u_mmInit(0, r300screen->caps.zmask_ram * frag_pipes);
+    if (!r300->zmask_mm)
+      return FALSE;
+
+    if (r300screen->caps.hiz_ram) {
       r300->hiz_mm = u_mmInit(0, r300screen->caps.hiz_ram * frag_pipes);
+      if (!r300->hiz_mm) {
+        u_mmDestroy(r300->zmask_mm);
+        r300->zmask_mm = NULL;
+        return FALSE;
+      }
+    }
 
-    r300->zmask_mm = u_mmInit(0, r300screen->caps.zmask_ram * frag_pipes);
+    return TRUE;
 }
 
 void r300_hyperz_destroy_mm(struct r300_context *r300)
 {
     struct r300_screen* r300screen = r300->screen;
 
-    if (r300screen->caps.hiz_ram)
+    if (r300screen->caps.hiz_ram) {
       u_mmDestroy(r300->hiz_mm);
+      r300->hiz_mm = NULL;
+    }
 
     u_mmDestroy(r300->zmask_mm);
+    r300->zmask_mm = NULL;
 }
diff --git a/src/gallium/drivers/r300/r300_hyperz.h b/src/gallium/drivers/r300/r300_hyperz.h
index 09e1ff6625c..30a23ec6493 100644
--- a/src/gallium/drivers/r300/r300_hyperz.h
+++ b/src/gallium/drivers/r300/r300_hyperz.h
@@ -30,6 +30,6 @@ void r300_update_hyperz_state(struct r300_context* r300);
 void r300_hiz_alloc_block(struct r300_context *r300, struct r300_surface *surf);
 void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf, int compress);
 
-void r300_hyperz_init_mm(struct r300_context *r300);
+boolean r300_hyperz_init_mm(struct r300_context *r300);
 void r300_hyperz_destroy_mm(struct r300_context *r300);
 #endif
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 99a9d650551..60d3b600cb7 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -496,6 +496,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_ADDRS_0      0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -514,6 +520,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -548,6 +558,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3 << (2 * (x)))
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -564,6 +577,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 910f5f7113e..86b11ca0458 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -186,20 +186,14 @@ enum r300_prepare_flags {
  * \param cs_dwords     The number of dwords to reserve in CS.
  * \param aos_offset    The offset passed to emit_aos.
  * \param index_bias    The index bias to emit.
- * \param end_cs_dwords The number of free dwords which must be available
- *                      at the end of CS after drawing in case the CS space
- *                      management is performed by a draw_* function manually.
- *                      The parameter may be NULL.
  */
 static void r300_prepare_for_rendering(struct r300_context *r300,
                                        enum r300_prepare_flags flags,
                                        struct pipe_resource *index_buffer,
                                        unsigned cs_dwords,
                                        int aos_offset,
-                                       int index_bias,
-                                       unsigned *end_cs_dwords)
+                                       int index_bias)
 {
-    unsigned end_dwords    = 0;
     boolean flushed        = FALSE;
     boolean first_draw     = flags & PREP_FIRST_DRAW;
     boolean emit_aos       = flags & PREP_EMIT_AOS;
@@ -221,12 +215,7 @@ static void r300_prepare_for_rendering(struct r300_context *r300,
             cs_dwords += 7; /* emit_aos_swtcl */
     }
 
-    /* Emitted in flush. */
-    end_dwords += 26; /* emit_query_end */
-    if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-        end_dwords += r300->hyperz_state.size + 2; /* emit_hyperz_end + zcache flush */
-
-    cs_dwords += end_dwords;
+    cs_dwords += r300_get_num_cs_end_dwords(r300);
 
     /* Reserve requested CS space. */
     if (cs_dwords > (r300->cs->ndw - r300->cs->cdw)) {
@@ -251,9 +240,6 @@ static void r300_prepare_for_rendering(struct r300_context *r300,
         if (emit_aos_swtcl)
             r300_emit_aos_swtcl(r300, indexed);
     }
-
-    if (end_cs_dwords)
-        *end_cs_dwords = end_dwords;
 }
 
 static boolean immd_is_good_idea(struct r300_context *r300,
@@ -354,7 +340,7 @@ static void r300_emit_draw_arrays_immediate(struct r300_context *r300,
 
     dwords = 9 + count * vertex_size;
 
-    r300_prepare_for_rendering(r300, PREP_FIRST_DRAW, NULL, dwords, 0, 0, NULL);
+    r300_prepare_for_rendering(r300, PREP_FIRST_DRAW, NULL, dwords, 0, 0);
 
     BEGIN_CS(dwords);
     OUT_CS_REG(R300_GA_COLOR_CONTROL,
@@ -534,7 +520,7 @@ static void r300_draw_range_elements(struct pipe_context* pipe,
     /* 15 dwords for emit_draw_elements */
     r300_prepare_for_rendering(r300,
         PREP_FIRST_DRAW | PREP_VALIDATE_VBOS | PREP_EMIT_AOS | PREP_INDEXED,
-        indexBuffer, 15, buffer_offset, indexBias, NULL);
+        indexBuffer, 15, buffer_offset, indexBias);
 
     if (alt_num_verts || count <= 65535) {
         r300_emit_draw_elements(r300, indexBuffer, indexSize,
@@ -553,7 +539,7 @@ static void r300_draw_range_elements(struct pipe_context* pipe,
             if (count) {
                 r300_prepare_for_rendering(r300,
                     PREP_VALIDATE_VBOS | PREP_EMIT_AOS | PREP_INDEXED,
-                    indexBuffer, 15, buffer_offset, indexBias, NULL);
+                    indexBuffer, 15, buffer_offset, indexBias);
             }
         } while (count);
     }
@@ -598,7 +584,7 @@ static void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
     } else {
         /* 9 spare dwords for emit_draw_arrays. */
         r300_prepare_for_rendering(r300, PREP_FIRST_DRAW | PREP_VALIDATE_VBOS | PREP_EMIT_AOS,
-                               NULL, 9, start, 0, NULL);
+                               NULL, 9, start, 0);
 
         if (alt_num_verts || count <= 65535) {
             r300_emit_draw_arrays(r300, mode, count);
@@ -614,7 +600,7 @@ static void r300_draw_arrays(struct pipe_context* pipe, unsigned mode,
                 if (count) {
                     r300_prepare_for_rendering(r300,
                         PREP_VALIDATE_VBOS | PREP_EMIT_AOS, NULL, 9,
-                        start, 0, NULL);
+                        start, 0);
                 }
             } while (count);
         }
@@ -855,7 +841,7 @@ static void r300_render_draw_arrays(struct vbuf_render* render,
     (void) i; (void) ptr;
 
     r300_prepare_for_rendering(r300, PREP_FIRST_DRAW | PREP_EMIT_AOS_SWTCL,
-                               NULL, dwords, 0, 0, NULL);
+                               NULL, dwords, 0, 0);
 
     DBG(r300, DBG_DRAW, "r300: render_draw_arrays (count: %d)\n", count);
 
@@ -908,7 +894,8 @@ static void r300_render_draw_elements(struct vbuf_render* render,
      * indices than it can fit in CS. */
     r300_prepare_for_rendering(r300,
         PREP_FIRST_DRAW | PREP_EMIT_AOS_SWTCL | PREP_INDEXED,
-        NULL, 256, 0, 0, &end_cs_dwords);
+        NULL, 256, 0, 0);
+    end_cs_dwords = r300_get_num_cs_end_dwords(r300);
 
     while (count) {
         free_dwords = r300->cs->ndw - r300->cs->cdw;
@@ -938,7 +925,8 @@ static void r300_render_draw_elements(struct vbuf_render* render,
         if (count) {
             r300_prepare_for_rendering(r300,
                 PREP_EMIT_AOS_SWTCL | PREP_INDEXED,
-                NULL, 256, 0, 0, &end_cs_dwords);
+                NULL, 256, 0, 0);
+            end_cs_dwords = r300_get_num_cs_end_dwords(r300);
         }
     }
 }
@@ -1032,7 +1020,7 @@ static void r300_blitter_draw_rectangle(struct blitter_context *blitter,
     r300->clip_state.dirty = FALSE;
     r300->viewport_state.dirty = FALSE;
 
-    r300_prepare_for_rendering(r300, PREP_FIRST_DRAW, NULL, dwords, 0, 0, NULL);
+    r300_prepare_for_rendering(r300, PREP_FIRST_DRAW, NULL, dwords, 0, 0);
 
     DBG(r300, DBG_DRAW, "r300: draw_rectangle\n");
 
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 6268001054b..1e4edcdbc31 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -115,7 +115,6 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
         case PIPE_CAP_BLEND_EQUATION_SEPARATE:
         case PIPE_CAP_TEXTURE_SWIZZLE:
-        case PIPE_CAP_DEPTH_CLAMP:
             return 1;
 
         /* Unsupported features (boolean caps). */
@@ -124,6 +123,8 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_TGSI_CONT_SUPPORTED:
         case PIPE_CAP_INDEP_BLEND_ENABLE:
         case PIPE_CAP_INDEP_BLEND_FUNC:
+        case PIPE_CAP_DEPTH_CLAMP: /* XXX implemented, but breaks Regnum Online */
+        case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
             return 0;
 
         /* Texturing. */
@@ -150,9 +151,6 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_CONST_BUFFER_SIZE:
             return 256;
 
-        case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
-            return 1;
-
         /* Fragment coordinate conventions. */
         case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
         case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
diff --git a/src/gallium/drivers/r300/r300_shader_semantics.h b/src/gallium/drivers/r300/r300_shader_semantics.h
index cb7a37033f3..4be23e64ce7 100644
--- a/src/gallium/drivers/r300/r300_shader_semantics.h
+++ b/src/gallium/drivers/r300/r300_shader_semantics.h
@@ -38,6 +38,7 @@ struct r300_shader_semantics {
     int psize;
     int color[ATTR_COLOR_COUNT];
     int bcolor[ATTR_COLOR_COUNT];
+    int face;
     int generic[ATTR_GENERIC_COUNT];
     int fog;
     int wpos;
@@ -50,6 +51,7 @@ static INLINE void r300_shader_semantics_reset(
 
     info->pos = ATTR_UNUSED;
     info->psize = ATTR_UNUSED;
+    info->face = ATTR_UNUSED;
     info->fog = ATTR_UNUSED;
     info->wpos = ATTR_UNUSED;
 
diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c
index 1e6b81d7989..239edd98e32 100644
--- a/src/gallium/drivers/r300/r300_state.c
+++ b/src/gallium/drivers/r300/r300_state.c
@@ -689,8 +689,7 @@ void r300_mark_fb_state_dirty(struct r300_context *r300,
     /* What is marked as dirty depends on the enum r300_fb_state_change. */
     r300->gpu_flush.dirty = TRUE;
     r300->fb_state.dirty = TRUE;
-    if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-        r300->hyperz_state.dirty = TRUE;
+    r300->hyperz_state.dirty = TRUE;
 
     if (change == R300_CHANGED_FB_STATE) {
         r300->aa_state.dirty = TRUE;
@@ -753,8 +752,6 @@ static void
 
     r300_mark_fb_state_dirty(r300, R300_CHANGED_FB_STATE);
 
-    r300->hiz_enable = false;
-    r300->z_fastfill = false;
     r300->z_compression = false;
     
     if (state->zsbuf) {
@@ -781,23 +778,18 @@ static void
             /* work out whether we can support zmask features on this buffer */
             r300_zmask_alloc_block(r300, zs_surf, compress);
 
-            if (tex->hiz_mem[level]) {
-                r300->hiz_enable = 1;
-            }
-
             if (tex->zmask_mem[level]) {
-                r300->z_fastfill = 1;
                 /* compression causes hangs on 16-bit */
                 if (zbuffer_bpp == 24)
                     r300->z_compression = compress;
             }
             DBG(r300, DBG_HYPERZ,
-                "hyper-z features: hiz: %d @ %08x z-compression: %d z-fastfill: %d @ %08x\n", r300->hiz_enable,
+                "hyper-z features: hiz: %d @ %08x z-compression: %d z-fastfill: %d @ %08x\n", tex->hiz_mem[level] ? 1 : 0,
                 tex->hiz_mem[level] ? tex->hiz_mem[level]->ofs : 0xdeadbeef,
-                r300->z_compression, r300->z_fastfill,
+                r300->z_compression, tex->zmask_mem[level] ? 1 : 0,
                 tex->zmask_mem[level] ? tex->zmask_mem[level]->ofs : 0xdeadbeef);
         }
-            
+
         /* Polygon offset depends on the zbuffer bit depth. */
         if (r300->zbuffer_bpp != zbuffer_bpp) {
             r300->zbuffer_bpp = zbuffer_bpp;
@@ -1759,10 +1751,12 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader)
     r300->rs_block_state.dirty = TRUE; /* Will be updated before the emission. */
 
     if (r300->screen->caps.has_tcl) {
+        unsigned fc_op_dwords = r300->screen->caps.is_r500 ? 3 : 2;
         r300->vs_state.dirty = TRUE;
         r300->vs_state.size =
                 vs->code.length + 9 +
-                (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0);
+                (vs->immediates_count ? vs->immediates_count * 4 + 3 : 0) +
+        (vs->code.num_fc_ops ? vs->code.num_fc_ops * fc_op_dwords + 4 : 0);
 
         if (vs->externals_count) {
             r300->vs_constants.dirty = TRUE;
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index f3dad4c2923..c8de3e1c523 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -35,7 +35,6 @@
 #include "r300_state_inlines.h"
 #include "r300_texture.h"
 #include "r300_vs.h"
-#include "r300_winsys.h"
 
 /* r300_state_derived: Various bits of state which are dependent upon
  * currently bound CSO data. */
@@ -47,6 +46,11 @@ enum r300_rs_swizzle {
     SWIZ_0001,
 };
 
+enum r300_rs_col_write_type {
+    WRITE_COLOR = 0,
+    WRITE_FACE
+};
+
 static void r300_draw_emit_attrib(struct r300_context* r300,
                                   enum attrib_emit emit,
                                   enum interp_mode interp,
@@ -204,8 +208,10 @@ static void r300_rs_col(struct r300_rs_block* rs, int id, int ptr,
     rs->inst[id] |= R300_RS_INST_COL_ID(id);
 }
 
-static void r300_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset)
+static void r300_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset,
+                              enum r300_rs_col_write_type type)
 {
+    assert(type == WRITE_COLOR);
     rs->inst[id] |= R300_RS_INST_COL_CN_WRITE |
                     R300_RS_INST_COL_ADDR(fp_offset);
 }
@@ -253,10 +259,16 @@ static void r500_rs_col(struct r300_rs_block* rs, int id, int ptr,
     rs->inst[id] |= R500_RS_INST_COL_ID(id);
 }
 
-static void r500_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset)
+static void r500_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset,
+                              enum r300_rs_col_write_type type)
 {
-    rs->inst[id] |= R500_RS_INST_COL_CN_WRITE |
-                    R500_RS_INST_COL_ADDR(fp_offset);
+    if (type == WRITE_FACE)
+        rs->inst[id] |= R500_RS_INST_COL_CN_WRITE_BACKFACE |
+                        R500_RS_INST_COL_ADDR(fp_offset);
+    else
+        rs->inst[id] |= R500_RS_INST_COL_CN_WRITE |
+                        R500_RS_INST_COL_ADDR(fp_offset);
+
 }
 
 static void r500_rs_tex(struct r300_rs_block* rs, int id, int ptr,
@@ -306,7 +318,7 @@ static void r300_update_rs_block(struct r300_context *r300)
     struct r300_rs_block rs = {0};
     int i, col_count = 0, tex_count = 0, fp_offset = 0, count, loc = 0, tex_ptr = 0;
     void (*rX00_rs_col)(struct r300_rs_block*, int, int, enum r300_rs_swizzle);
-    void (*rX00_rs_col_write)(struct r300_rs_block*, int, int);
+    void (*rX00_rs_col_write)(struct r300_rs_block*, int, int, enum r300_rs_col_write_type);
     void (*rX00_rs_tex)(struct r300_rs_block*, int, int, enum r300_rs_swizzle);
     void (*rX00_rs_tex_write)(struct r300_rs_block*, int, int);
     boolean any_bcolor_used = vs_outputs->bcolor[0] != ATTR_UNUSED ||
@@ -325,6 +337,11 @@ static void r300_update_rs_block(struct r300_context *r300)
         rX00_rs_tex_write = r300_rs_tex_write;
     }
 
+    /* 0x5555 is copied from the classic driver; it means:
+     * Select user color 0 for COLOR0 up to COLOR7.
+     * XXX The exact meaning of this setting is unclear. */
+    rs.vap_vtx_state_cntl = 0x5555;
+
     /* The position is always present in VAP. */
     rs.vap_vsm_vtx_assm |= R300_INPUT_CNTL_POS;
     rs.vap_out_vtx_fmt[0] |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
@@ -351,7 +368,7 @@ static void r300_update_rs_block(struct r300_context *r300)
 
             /* Write it to the FS input register if it's needed by the FS. */
             if (fs_inputs->color[i] != ATTR_UNUSED) {
-                rX00_rs_col_write(&rs, col_count, fp_offset);
+                rX00_rs_col_write(&rs, col_count, fp_offset, WRITE_COLOR);
                 fp_offset++;
 
                 DBG(r300, DBG_RS,
@@ -399,6 +416,24 @@ static void r300_update_rs_block(struct r300_context *r300)
         }
     }
 
+    /* gl_FrontFacing.
+     * Note that we can use either the two-sided color selection based on
+     * the front and back vertex shader colors, or gl_FrontFacing,
+     * but not both! It locks up otherwise.
+     *
+     * In Direct3D 9, the two-sided color selection can be used
+     * with shaders 2.0 only, while gl_FrontFacing can be used
+     * with shaders 3.0 only. The hardware apparently hasn't been designed
+     * to support both at the same time. */
+    if (r300->screen->caps.is_r500 && fs_inputs->face != ATTR_UNUSED &&
+        !(any_bcolor_used && r300->two_sided_color)) {
+        rX00_rs_col(&rs, col_count, col_count, SWIZ_XYZW);
+        rX00_rs_col_write(&rs, col_count, fp_offset, WRITE_FACE);
+        fp_offset++;
+        col_count++;
+        DBG(r300, DBG_RS, "r300: Rasterized FACE written to FS.\n");
+    }
+
     /* Rasterize texture coordinates. */
     for (i = 0; i < ATTR_GENERIC_COUNT && tex_count < 8; i++) {
 	bool sprite_coord = !!(r300->sprite_coord_enable & (1 << i));
@@ -677,8 +712,44 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300)
     }
 }
 
+/* We can't use compressed zbuffers as samplers. */
+static void r300_flush_depth_textures(struct r300_context *r300)
+{
+    struct r300_textures_state *state =
+        (struct r300_textures_state*)r300->textures_state.state;
+    unsigned i, level;
+    unsigned count = MIN2(state->sampler_view_count,
+                          state->sampler_state_count);
+
+    if (r300->z_decomp_rd)
+        return;
+
+    for (i = 0; i < count; i++)
+        if (state->sampler_views[i] && state->sampler_states[i]) {
+            struct pipe_resource *tex = state->sampler_views[i]->base.texture;
+
+            if (tex->target == PIPE_TEXTURE_3D ||
+                tex->target == PIPE_TEXTURE_CUBE)
+                continue;
+
+            /* Ignore non-depth textures.
+             * Also ignore reinterpreted depth textures, e.g. resource_copy. */
+            if (!util_format_is_depth_or_stencil(tex->format))
+                continue;
+
+            for (level = 0; level <= tex->last_level; level++)
+                if (r300_texture(tex)->zmask_in_use[level]) {
+                    /* We don't handle 3D textures and cubemaps yet. */
+                    r300_flush_depth_stencil(&r300->context, tex,
+                                             u_subresource(0, level), 0);
+                }
+        }
+}
+
 void r300_update_derived_state(struct r300_context* r300)
 {
+    r300_flush_depth_textures(r300);
+
     if (r300->textures_state.dirty) {
         r300_merge_textures_and_samplers(r300);
     }
@@ -694,6 +765,5 @@ void r300_update_derived_state(struct r300_context* r300)
         }
     }
 
-    if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ))
-        r300_update_hyperz_state(r300);
+    r300_update_hyperz_state(r300);
 }
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 51b2c555502..a4911b9a2a6 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -97,13 +97,13 @@ static unsigned translate_opcode(unsigned opcode)
      /* case TGSI_OPCODE_BRA: return RC_OPCODE_BRA; */
      /* case TGSI_OPCODE_CAL: return RC_OPCODE_CAL; */
      /* case TGSI_OPCODE_RET: return RC_OPCODE_RET; */
-     /* case TGSI_OPCODE_SSG: return RC_OPCODE_SSG; */
+        case TGSI_OPCODE_SSG: return RC_OPCODE_SSG;
         case TGSI_OPCODE_CMP: return RC_OPCODE_CMP;
         case TGSI_OPCODE_SCS: return RC_OPCODE_SCS;
         case TGSI_OPCODE_TXB: return RC_OPCODE_TXB;
      /* case TGSI_OPCODE_NRM: return RC_OPCODE_NRM; */
      /* case TGSI_OPCODE_DIV: return RC_OPCODE_DIV; */
-     /* case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; */
+        case TGSI_OPCODE_DP2: return RC_OPCODE_DP2;
         case TGSI_OPCODE_TXL: return RC_OPCODE_TXL;
         case TGSI_OPCODE_BRK: return RC_OPCODE_BRK;
         case TGSI_OPCODE_IF: return RC_OPCODE_IF;
@@ -126,7 +126,7 @@ static unsigned translate_opcode(unsigned opcode)
      /* case TGSI_OPCODE_SAD: return RC_OPCODE_SAD; */
      /* case TGSI_OPCODE_TXF: return RC_OPCODE_TXF; */
      /* case TGSI_OPCODE_TXQ: return RC_OPCODE_TXQ; */
-     /* case TGSI_OPCODE_CONT: return RC_OPCODE_CONT; */
+        case TGSI_OPCODE_CONT: return RC_OPCODE_CONT;
      /* case TGSI_OPCODE_EMIT: return RC_OPCODE_EMIT; */
      /* case TGSI_OPCODE_ENDPRIM: return RC_OPCODE_ENDPRIM; */
      /* case TGSI_OPCODE_BGNLOOP2: return RC_OPCODE_BGNLOOP2; */
diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c
index b25c786d6b3..54c8de12419 100644
--- a/src/gallium/drivers/r300/r300_vs.c
+++ b/src/gallium/drivers/r300/r300_vs.c
@@ -207,7 +207,7 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     compiler.Base.max_temp_regs = 32;
 
     if (compiler.Base.Debug) {
-        debug_printf("r300: Initial vertex program\n");
+        DBG(r300, DBG_VP, "r300: Initial vertex program\n");
         tgsi_dump(vs->state.tokens, 0);
     }
 
@@ -227,8 +227,7 @@ void r300_translate_vertex_shader(struct r300_context *r300,
     /* Invoke the compiler */
     r3xx_compile_vertex_program(&compiler);
     if (compiler.Base.Error) {
-        /* XXX We should fallback using Draw. */
-        fprintf(stderr, "r300 VP: Compiler error:\n%sUsing a dummy shader"
+        DBG(r300, DBG_VP, "r300 VP: Compiler error:\n%sUsing a dummy shader"
                 " instead.\nIf there's an 'unknown opcode' message, please"
                 " file a bug report and attach this log.\n", compiler.Base.ErrorMsg);
 
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index f1dc3dc3a96..9ea9d4354d6 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -167,8 +167,7 @@ int r600_bc_add_literal(struct r600_bc *bc, const u32 *value)
 	struct r600_bc_alu *alu;
 
 	if (bc->cf_last == NULL) {
-		R600_ERR("no last CF\n");
-		return -EINVAL;
+		return 0;
 	}
 	if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) {
 		return 0;
@@ -179,12 +178,13 @@ int r600_bc_add_literal(struct r600_bc *bc, const u32 *value)
 		return -EINVAL;
 	}
 	alu = LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list);
-	if (!alu->last || !alu->nliteral) {
+	if (!alu->last || !alu->nliteral || alu->literal_added) {
 		return 0;
 	}
 	memcpy(alu->value, value, 4 * 4);
 	bc->cf_last->ndw += alu->nliteral;
 	bc->ndw += alu->nliteral;
+	alu->literal_added = 1;
 	return 0;
 }
 
@@ -287,7 +287,7 @@ static int r600_bc_tex_build(struct r600_bc *bc, struct r600_bc_tex *tex, unsign
 	return 0;
 }
 
-int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
+static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
 {
 	unsigned i;
 
@@ -331,7 +331,7 @@ int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsigned id)
 	return 0;
 }
 
-int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
+static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 {
 	unsigned id = cf->id;
 
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 3fd94dbda03..10d98afaf00 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -48,6 +48,7 @@ struct r600_bc_alu {
 	unsigned			last;
 	unsigned			is_op3;
 	unsigned			nliteral;
+	unsigned			literal_added;
 	u32				value[4];
 };
 
diff --git a/src/gallium/drivers/r600/r600_context.c b/src/gallium/drivers/r600/r600_context.c
index ae1780a1d40..edde80c660a 100644
--- a/src/gallium/drivers/r600/r600_context.c
+++ b/src/gallium/drivers/r600/r600_context.c
@@ -47,20 +47,25 @@ void r600_flush(struct pipe_context *ctx, unsigned flags,
 	struct r600_context *rctx = r600_context(ctx);
 	struct r600_screen *rscreen = rctx->screen;
 	static int dc = 0;
+	char dname[256];
 
 	if (radeon_ctx_pm4(rctx->ctx))
 		return;
 	/* FIXME dumping should be removed once shader support instructions
 	 * without throwing bad code
 	 */
-	if (!dc)
-		radeon_ctx_dump_bof(rctx->ctx, "gallium.bof");
+	if (!rctx->ctx->cpm4)
+		goto out;
+	sprintf(dname, "gallium-%08d.bof", dc);
+	if (dc < 1)
+		radeon_ctx_dump_bof(rctx->ctx, dname);
 #if 1
 	radeon_ctx_submit(rctx->ctx);
 #endif
+	dc++;
+out:
 	rctx->ctx = radeon_ctx_decref(rctx->ctx);
 	rctx->ctx = radeon_ctx(rscreen->rw);
-	dc++;
 }
 
 static void r600_init_config(struct r600_context *rctx)
@@ -202,24 +207,6 @@ static void r600_init_config(struct r600_context *rctx)
 		num_es_stack_entries = 0;
 		break;
 	}
-	printf("ps_prio : %d\n", ps_prio);
-	printf("vs_prio : %d\n", vs_prio);
-	printf("gs_prio : %d\n", gs_prio);
-	printf("es_prio : %d\n", es_prio);
-	printf("num_ps_gprs : %d\n", num_ps_gprs);
-	printf("num_vs_gprs : %d\n", num_vs_gprs);
-	printf("num_gs_gprs : %d\n", num_gs_gprs);
-	printf("num_es_gprs : %d\n", num_es_gprs);
-	printf("num_temp_gprs : %d\n", num_temp_gprs);
-	printf("num_ps_threads : %d\n", num_ps_threads);
-	printf("num_vs_threads : %d\n", num_vs_threads);
-	printf("num_gs_threads : %d\n", num_gs_threads);
-	printf("num_es_threads : %d\n", num_es_threads);
-	printf("num_ps_stack_entries : %d\n", num_ps_stack_entries);
-	printf("num_vs_stack_entries : %d\n", num_vs_stack_entries);
-	printf("num_gs_stack_entries : %d\n", num_gs_stack_entries);
-	printf("num_es_stack_entries : %d\n", num_es_stack_entries);
-
 	rctx->hw_states.config = radeon_state(rctx->rw, R600_CONFIG_TYPE, R600_CONFIG);
 
 	rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] = 0x00000000;
diff --git a/src/gallium/drivers/r600/r600_context.h b/src/gallium/drivers/r600/r600_context.h
index 431f8951b2a..76d5de86532 100644
--- a/src/gallium/drivers/r600/r600_context.h
+++ b/src/gallium/drivers/r600/r600_context.h
@@ -94,7 +94,7 @@ struct r600_context_hw_states {
 	struct radeon_state	*dsa;
 	struct radeon_state	*blend;
 	struct radeon_state	*viewport;
-	struct radeon_state	*cb[7];
+	struct radeon_state	*cb[8];
 	struct radeon_state	*config;
 	struct radeon_state	*cb_cntl;
 	struct radeon_state	*db;
@@ -175,4 +175,7 @@ extern int r600_pipe_shader_update(struct pipe_context *ctx,
 #define R600_ERR(fmt, args...) \
 	fprintf(stderr, "EE %s/%s:%d - "fmt, __FILE__, __func__, __LINE__, ##args)
 
+uint32_t r600_translate_texformat(enum pipe_format format,
+				  const unsigned char *swizzle_view, 
+				  uint32_t *word4_p, uint32_t *yuv_format_p);
 #endif
diff --git a/src/gallium/drivers/r600/r600_draw.c b/src/gallium/drivers/r600/r600_draw.c
index 2420b763188..f0584551620 100644
--- a/src/gallium/drivers/r600/r600_draw.c
+++ b/src/gallium/drivers/r600/r600_draw.c
@@ -127,7 +127,7 @@ static int r600_draw_common(struct r600_draw *draw)
 	draw->draw->states[R600_DRAW__VGT_NUM_INDICES] = draw->count;
 	draw->draw->states[R600_DRAW__VGT_DRAW_INITIATOR] = vgt_draw_initiator;
 	if (draw->index_buffer) {
-		rbuffer = (struct r600_buffer*)draw->index_buffer;
+		rbuffer = (struct r600_resource*)draw->index_buffer;
 		draw->draw->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo);
 		draw->draw->placement[0] = RADEON_GEM_DOMAIN_GTT;
 		draw->draw->placement[1] = RADEON_GEM_DOMAIN_GTT;
diff --git a/src/gallium/drivers/r600/r600_screen.c b/src/gallium/drivers/r600/r600_screen.c
index 4b87327a7cf..cdaca9ed7db 100644
--- a/src/gallium/drivers/r600/r600_screen.c
+++ b/src/gallium/drivers/r600/r600_screen.c
@@ -53,59 +53,100 @@ static const char* r600_get_name(struct pipe_screen* pscreen)
 static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 {
 	switch (param) {
-	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
-	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
-		return 16;
+	/* Supported features (boolean caps). */
 	case PIPE_CAP_NPOT_TEXTURES:
-		return 1;
 	case PIPE_CAP_TWO_SIDED_STENCIL:
-		return 1;
 	case PIPE_CAP_GLSL:
-		return 1;
 	case PIPE_CAP_DUAL_SOURCE_BLEND:
-		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
-		return 1;
 	case PIPE_CAP_POINT_SPRITE:
-		return 1;
-	case PIPE_CAP_MAX_RENDER_TARGETS:
-		/* FIXME some r6xx are buggy and can only do 4 */
-		return 8;
 	case PIPE_CAP_OCCLUSION_QUERY:
-		return 1;
 	case PIPE_CAP_TEXTURE_SHADOW_MAP:
-		return 1;
-	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-		/* FIXME not sure here */
-		return 13;
 	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
-		return 1;
 	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
-		return 1;
-	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
-		/* FIXME allow this once infrastructure is there */
-		return 0;
-	case PIPE_CAP_TGSI_CONT_SUPPORTED:
-		return 0;
 	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
-		return 1;
 	case PIPE_CAP_SM3:
-		return 1;
+	case PIPE_CAP_TEXTURE_SWIZZLE:
 	case PIPE_CAP_INDEP_BLEND_ENABLE:
-		return 1;
-	case PIPE_CAP_INDEP_BLEND_FUNC:
-		/* FIXME allow this */
-		return 0;
 	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
 		return 1;
+
+	/* Unsupported features (boolean caps). */
+	case PIPE_CAP_TIMER_QUERY:
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+	case PIPE_CAP_STREAM_OUTPUT:
+	case PIPE_CAP_INDEP_BLEND_FUNC: /* FIXME allow this */
+	case PIPE_CAP_GEOMETRY_SHADER4:
+	case PIPE_CAP_DEPTH_CLAMP: /* FIXME allow this */
+		return 0;
+
+	/* Texturing. */
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 14;
+	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS:
+		/* FIXME allow this once infrastructure is there */
+		return 0;
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+		return 16;
+
+	/* Render targets. */
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		/* FIXME some r6xx are buggy and can only do 4 */
+		return 8;
+
+	/* Fragment coordinate conventions. */
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
 		return 1;
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
 	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
 		return 0;
+
+	/* Shader limits. */
+	case PIPE_CAP_MAX_VS_INSTRUCTIONS:
+		return 16384;  //max native instructions, not greater than max instructions
+	case PIPE_CAP_MAX_VS_ALU_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_TEX_INSTRUCTIONS:
+	case PIPE_CAP_MAX_VS_TEX_INDIRECTIONS:
+		return 16384;
+	case PIPE_CAP_MAX_FS_INSTRUCTIONS:
+		return 16384; //max program native instructions
+	case PIPE_CAP_MAX_FS_ALU_INSTRUCTIONS:
+		return 16384; //max program native ALU instructions
+	case PIPE_CAP_MAX_FS_TEX_INSTRUCTIONS:
+		return 16384; //max program native texture instructions
+	case PIPE_CAP_MAX_FS_TEX_INDIRECTIONS:
+		return 2048; //max program native texture indirections
+	case PIPE_CAP_MAX_VS_CONTROL_FLOW_DEPTH:
+	case PIPE_CAP_MAX_FS_CONTROL_FLOW_DEPTH:
+		return 8; /* FIXME */
+	case PIPE_CAP_MAX_VS_INPUTS:
+		return 16; //max native attributes
+	case PIPE_CAP_MAX_FS_INPUTS:
+		return 10; //max native attributes
+	case PIPE_CAP_MAX_VS_TEMPS:
+		return 256; //max native temporaries
+	case PIPE_CAP_MAX_FS_TEMPS:
+		return 256; //max native temporaries
+	case PIPE_CAP_MAX_VS_ADDRS:
+	case PIPE_CAP_MAX_FS_ADDRS:
+		return 1; /* max native address registers. FIXME Isn't this equal to TEMPS? */
+	case PIPE_CAP_MAX_VS_CONSTS:
+		return 256; //max native parameters
+	case PIPE_CAP_MAX_FS_CONSTS:
+		return 256; //max program native parameters
+	case PIPE_CAP_MAX_CONST_BUFFERS:
+		return 1;
+	case PIPE_CAP_MAX_CONST_BUFFER_SIZE: /* in bytes */
+		return 4096;
+	case PIPE_CAP_MAX_PREDICATE_REGISTERS:
+	case PIPE_CAP_MAX_VS_PREDS:
+	case PIPE_CAP_MAX_FS_PREDS:
+		return 0; /* FIXME */
+
 	default:
 		R600_ERR("r600: unknown param %d\n", param);
 		return 0;
diff --git a/src/gallium/drivers/r600/r600_screen.h b/src/gallium/drivers/r600/r600_screen.h
index 9a452ecfe3f..53b560c617f 100644
--- a/src/gallium/drivers/r600/r600_screen.h
+++ b/src/gallium/drivers/r600/r600_screen.h
@@ -80,4 +80,6 @@ void r600_texture_transfer_unmap(struct pipe_context *ctx,
 int r600_conv_pipe_format(unsigned pformat, unsigned *format);
 int r600_conv_pipe_prim(unsigned pprim, unsigned *prim);
 
+void r600_init_screen_texture_functions(struct pipe_screen *screen);
+
 #endif
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index dc8d4cb3151..956c7e7930c 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -155,11 +155,14 @@ static int r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_context_sta
 
 static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_context_state *rpshader)
 {
+	const struct pipe_rasterizer_state *rasterizer;
 	struct r600_screen *rscreen = r600_screen(ctx->screen);
 	struct r600_shader *rshader = &rpshader->shader;
+	struct r600_context *rctx = r600_context(ctx);
 	struct radeon_state *state;
 	unsigned i, tmp, exports_ps, num_cout;
 
+	rasterizer = &rctx->rasterizer->state.rasterizer;
 	rpshader->rstate = radeon_state_decref(rpshader->rstate);
 	state = radeon_state(rscreen->rw, R600_PS_SHADER_TYPE, R600_PS_SHADER);
 	if (state == NULL)
@@ -171,6 +174,9 @@ static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_context_sta
 			rshader->input[i].name == TGSI_SEMANTIC_BCOLOR) {
 			tmp |= S_028644_FLAT_SHADE(rshader->flat_shade);
 		}
+		if (rasterizer->sprite_coord_enable & (1 << i)) {
+			tmp |= S_028644_PT_SPRITE_TEX(1);
+		}
 		state->states[R600_PS_SHADER__SPI_PS_INPUT_CNTL_0 + i] = tmp;
 	}
 
@@ -339,7 +345,8 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 {
 	struct tgsi_full_immediate *immediate;
 	struct r600_shader_ctx ctx;
-	struct r600_bc_output output;
+	struct r600_bc_output output[32];
+	unsigned output_done, noutput;
 	unsigned opcode;
 	int i, r = 0, pos0;
 
@@ -417,34 +424,41 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 		}
 	}
 	/* export output */
-	for (i = 0, pos0 = 0; i < shader->noutput; i++) {
-		memset(&output, 0, sizeof(struct r600_bc_output));
-		output.gpr = shader->output[i].gpr;
-		output.elem_size = 3;
-		output.swizzle_x = 0;
-		output.swizzle_y = 1;
-		output.swizzle_z = 2;
-		output.swizzle_w = 3;
-		output.barrier = 1;
-		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
-		output.array_base = i - pos0;
-		output.inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE;
-		switch (ctx.type == TGSI_PROCESSOR_VERTEX) {
+	noutput = shader->noutput;
+	for (i = 0, pos0 = 0; i < noutput; i++) {
+		memset(&output[i], 0, sizeof(struct r600_bc_output));
+		output[i].gpr = shader->output[i].gpr;
+		output[i].elem_size = 3;
+		output[i].swizzle_x = 0;
+		output[i].swizzle_y = 1;
+		output[i].swizzle_z = 2;
+		output[i].swizzle_w = 3;
+		output[i].barrier = 1;
+		output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+		output[i].array_base = i - pos0;
+		output[i].inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT;
+		switch (ctx.type) {
 		case TGSI_PROCESSOR_VERTEX:
 			if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
-				output.array_base = 60;
-				output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				output[i].array_base = 60;
+				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
 				/* position doesn't count in array_base */
-				pos0 = 1;
+				pos0++;
+			}
+			if (shader->output[i].name == TGSI_SEMANTIC_PSIZE) {
+				output[i].array_base = 61;
+				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
+				/* point size doesn't count in array_base either */
+				pos0++;
 			}
 			break;
 		case TGSI_PROCESSOR_FRAGMENT:
 			if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
-				output.array_base = 0;
-				output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[i].array_base = shader->output[i].sid;
+				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
-				output.array_base = 61;
-				output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
+				output[i].array_base = 61;
+				output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
 			} else {
 				R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
 				r = -EINVAL;
@@ -456,10 +470,58 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 			r = -EINVAL;
 			goto out_err;
 		}
-		if (i == (shader->noutput - 1)) {
-			output.end_of_program = 1;
+	}
+	/* add fake param output for vertex shader if no param is exported */
+	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
+		for (i = 0, pos0 = 0; i < noutput; i++) {
+			if (output[i].type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM) {
+				pos0 = 1;
+				break;
+			}
+		}
+		if (!pos0) {
+			memset(&output[i], 0, sizeof(struct r600_bc_output));
+			output[i].gpr = 0;
+			output[i].elem_size = 3;
+			output[i].swizzle_x = 0;
+			output[i].swizzle_y = 1;
+			output[i].swizzle_z = 2;
+			output[i].swizzle_w = 3;
+			output[i].barrier = 1;
+			output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+			output[i].array_base = 0;
+			output[i].inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT;
+			noutput++;
 		}
-		r = r600_bc_add_output(ctx.bc, &output);
+	}
+	/* add fake pixel export */
+	if (ctx.type == TGSI_PROCESSOR_FRAGMENT && !noutput) {
+		memset(&output[0], 0, sizeof(struct r600_bc_output));
+		output[0].gpr = 0;
+		output[0].elem_size = 3;
+		output[0].swizzle_x = 7;
+		output[0].swizzle_y = 7;
+		output[0].swizzle_z = 7;
+		output[0].swizzle_w = 7;
+		output[0].barrier = 1;
+		output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
+		output[0].array_base = 0;
+		output[0].inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT;
+		noutput++;
+	}
+	/* set export done on last export of each type */
+	for (i = noutput - 1, output_done = 0; i >= 0; i--) {
+		if (i == (noutput - 1)) {
+			output[i].end_of_program = 1;
+		}
+		if (!(output_done & (1 << output[i].type))) {
+			output_done |= (1 << output[i].type);
+			output[i].inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE;
+		}
+	}
+	/* add output to bytecode */
+	for (i = 0; i < noutput; i++) {
+		r = r600_bc_add_output(ctx.bc, &output[i]);
 		if (r)
 			goto out_err;
 	}
@@ -490,6 +552,7 @@ static int tgsi_src(struct r600_shader_ctx *ctx,
 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
 		r600_src->sel = 0;
 	}
+	r600_src->neg = tgsi_src->Register.Negate;
 	r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
 	return 0;
 }
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index ee0381e8bdd..2ee7780ead0 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -28,6 +28,7 @@
 struct r600_shader_io {
 	unsigned		name;
 	unsigned		gpr;
+	unsigned		done;
 	int			sid;
 	unsigned		interpolate;
 };
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index deb9bf3395a..3efd409ae0d 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -379,6 +379,8 @@ static void r600_set_scissor_state(struct pipe_context *ctx,
 
 	rstate = r600_context_state(rctx, pipe_scissor_type, state);
 	r600_bind_state(ctx, rstate);
+	/* refcount is taken care of by this */
+	r600_delete_state(ctx, rstate);
 }
 
 static void r600_set_stencil_ref(struct pipe_context *ctx,
@@ -389,6 +391,8 @@ static void r600_set_stencil_ref(struct pipe_context *ctx,
 
 	rstate = r600_context_state(rctx, pipe_stencil_ref_type, state);
 	r600_bind_state(ctx, rstate);
+	/* refcount is taken care of by this */
+	r600_delete_state(ctx, rstate);
 }
 
 static void r600_set_vertex_buffers(struct pipe_context *ctx,
@@ -433,6 +437,7 @@ static void r600_set_viewport_state(struct pipe_context *ctx,
 
 	rstate = r600_context_state(rctx, pipe_viewport_type, state);
 	r600_bind_state(ctx, rstate);
+	r600_delete_state(ctx, rstate);
 }
 
 void r600_init_state_functions(struct r600_context *rctx)
@@ -675,9 +680,8 @@ static struct radeon_state *r600_cb(struct r600_context *rctx, int cb)
 	unsigned color_info;
 	unsigned format, swap, ntype;
 	const struct util_format_description *desc;
-	int id = R600_CB0 + cb;
 
-	rstate = radeon_state(rscreen->rw, R600_CB0_TYPE, id);
+	rstate = radeon_state(rscreen->rw, R600_CB0_TYPE + cb, R600_CB0 + cb);
 	if (rstate == NULL)
 		return NULL;
 	rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture;
@@ -728,7 +732,7 @@ static struct radeon_state *r600_db(struct r600_context *rctx)
 	struct r600_resource *rbuffer;
 	struct radeon_state *rstate;
 	const struct pipe_framebuffer_state *state = &rctx->framebuffer->state.framebuffer;
-	unsigned level = state->cbufs[0]->level;
+	unsigned level;
 	unsigned pitch, slice, format;
 
 	if (state->zsbuf == NULL)
@@ -770,7 +774,8 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 	float offset_units = 0, offset_scale = 0;
 	char depth = 0;
 	unsigned offset_db_fmt_cntl = 0;
-
+	unsigned tmp;
+	unsigned prov_vtx = 1;
 	if (fb->zsbuf) {
 		offset_units = state->offset_units;
 		offset_scale = state->offset_scale * 12.0f;
@@ -796,23 +801,43 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 	}
 	offset_db_fmt_cntl |= S_028DF8_POLY_OFFSET_NEG_NUM_DB_BITS(depth);
 
+	if (state->flatshade_first)
+		prov_vtx = 0;
+
 	rctx->flat_shade = state->flatshade;
 	rstate = radeon_state(rscreen->rw, R600_RASTERIZER_TYPE, R600_RASTERIZER);
 	if (rstate == NULL)
 		return NULL;
 	rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] = 0x00000001;
+	if (state->sprite_coord_enable) {
+		rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] |=
+				S_0286D4_PNT_SPRITE_ENA(1) |
+				S_0286D4_PNT_SPRITE_OVRD_X(2) |
+				S_0286D4_PNT_SPRITE_OVRD_Y(3) |
+				S_0286D4_PNT_SPRITE_OVRD_Z(0) |
+				S_0286D4_PNT_SPRITE_OVRD_W(1);
+		if (state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT) {
+			rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] |=
+					S_0286D4_PNT_SPRITE_TOP_1(1);
+		}
+	}
 	rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = 0x00000000;
-	rstate->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] = 0x00080000 |
-			S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
-			S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
-			S_028814_FACE(!state->front_ccw) |
-			S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
-			S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
-			S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri);
-	rstate->states[R600_RASTERIZER__PA_CL_VS_OUT_CNTL] = 0x00000000;
+	rstate->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] =
+		S_028814_PROVOKING_VTX_LAST(prov_vtx) |
+		S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
+		S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
+		S_028814_FACE(!state->front_ccw) |
+		S_028814_POLY_OFFSET_FRONT_ENABLE(state->offset_tri) |
+		S_028814_POLY_OFFSET_BACK_ENABLE(state->offset_tri) |
+		S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_tri);
+	rstate->states[R600_RASTERIZER__PA_CL_VS_OUT_CNTL] =
+			S_02881C_USE_VTX_POINT_SIZE(state->point_size_per_vertex) |
+			S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex);
 	rstate->states[R600_RASTERIZER__PA_CL_NANINF_CNTL] = 0x00000000;
-	rstate->states[R600_RASTERIZER__PA_SU_POINT_SIZE] = 0x00080008;
-	rstate->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x00000000;
+	/* point size 12.4 fixed point */
+	tmp = (unsigned)(state->point_size * 8.0 / 2.0);
+	rstate->states[R600_RASTERIZER__PA_SU_POINT_SIZE] = S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp);
+	rstate->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x80000000;
 	rstate->states[R600_RASTERIZER__PA_SU_LINE_CNTL] = 0x00000008;
 	rstate->states[R600_RASTERIZER__PA_SC_LINE_STIPPLE] = 0x00000005;
 	rstate->states[R600_RASTERIZER__PA_SC_MPASS_PS_CNTL] = 0x00000000;
@@ -837,12 +862,25 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx)
 static struct radeon_state *r600_scissor(struct r600_context *rctx)
 {
 	const struct pipe_scissor_state *state = &rctx->scissor->state.scissor;
+	const struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer;
 	struct r600_screen *rscreen = rctx->screen;
 	struct radeon_state *rstate;
+	unsigned minx, maxx, miny, maxy;
 	u32 tl, br;
 
-	tl = S_028240_TL_X(state->minx) | S_028240_TL_Y(state->miny) | S_028240_WINDOW_OFFSET_DISABLE(1);
-	br = S_028244_BR_X(state->maxx) | S_028244_BR_Y(state->maxy);
+	if (state == NULL) {
+		minx = 0;
+		miny = 0;
+		maxx = fb->cbufs[0]->width;
+		maxy = fb->cbufs[0]->height;
+	} else {
+		minx = state->minx;
+		miny = state->miny;
+		maxx = state->maxx;
+		maxy = state->maxy;
+	}
+	tl = S_028240_TL_X(minx) | S_028240_TL_Y(miny) | S_028240_WINDOW_OFFSET_DISABLE(1);
+	br = S_028244_BR_X(maxx) | S_028244_BR_Y(maxy);
 	rstate = radeon_state(rscreen->rw, R600_SCISSOR_TYPE, R600_SCISSOR);
 	if (rstate == NULL)
 		return NULL;
@@ -1140,8 +1178,16 @@ static struct radeon_state *r600_resource(struct r600_context *rctx,
 	struct r600_resource *rbuffer;
 	struct radeon_state *rstate;
 	unsigned format;
-
-	format = r600_translate_colorformat(view->texture->format);
+	uint32_t word4 = 0, yuv_format = 0;
+	unsigned char swizzle[4];
+
+	swizzle[0] = view->swizzle_r;
+	swizzle[1] = view->swizzle_g;
+	swizzle[2] = view->swizzle_b;
+	swizzle[3] = view->swizzle_a;
+	format = r600_translate_texformat(view->texture->format,
+					  swizzle,
+					  &word4, &yuv_format);
 	if (format == ~0)
 		return NULL;
 	desc = util_format_description(view->texture->format);
@@ -1175,18 +1221,10 @@ static struct radeon_state *r600_resource(struct r600_context *rctx,
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = 0;
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD3] = tmp->offset[1] >> 8;
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD4] =
-			S_038010_FORMAT_COMP_X(r600_format_type(UTIL_FORMAT_TYPE_UNSIGNED)) |
-			S_038010_FORMAT_COMP_Y(r600_format_type(UTIL_FORMAT_TYPE_UNSIGNED)) |
-			S_038010_FORMAT_COMP_Z(r600_format_type(UTIL_FORMAT_TYPE_UNSIGNED)) |
-			S_038010_FORMAT_COMP_W(r600_format_type(UTIL_FORMAT_TYPE_UNSIGNED)) |
+		        word4 | 
 			S_038010_NUM_FORMAT_ALL(V_038010_SQ_NUM_FORMAT_NORM) |
 			S_038010_SRF_MODE_ALL(V_038010_SFR_MODE_NO_ZERO) |
 			S_038010_REQUEST_SIZE(1) |
-			S_038010_DST_SEL_X(r600_tex_swizzle(view->swizzle_b)) |
-			S_038010_DST_SEL_Y(r600_tex_swizzle(view->swizzle_g)) |
-			S_038010_DST_SEL_Z(r600_tex_swizzle(view->swizzle_r)) |
-			S_038010_DST_SEL_W(r600_tex_swizzle(view->swizzle_a)) |
-		        S_038010_FORCE_DEGAMMA(desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ? 1 : 0) |
 			S_038010_BASE_LEVEL(view->first_level);
 	rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD5] =
 			S_038014_LAST_LEVEL(view->last_level) |
@@ -1206,7 +1244,7 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx)
 	struct r600_screen *rscreen = rctx->screen;
 	struct radeon_state *rstate;
 	const struct pipe_blend_state *pbs = &rctx->blend->state.blend;
-	int nr_cbufs = rctx->framebuffer->state.framebuffer.nr_cbufs;	
+	int nr_cbufs = rctx->framebuffer->state.framebuffer.nr_cbufs;
 	uint32_t color_control, target_mask, shader_mask;
 	int i;
 
@@ -1215,20 +1253,29 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx)
 	color_control = S_028808_PER_MRT_BLEND(1);
 
 	for (i = 0; i < nr_cbufs; i++) {
-		shader_mask |= 0xf << i;
+		shader_mask |= 0xf << (i * 4);
 	}
 
 	if (pbs->logicop_enable) {
 		color_control |= (pbs->logicop_func) << 16;
-	} else
+	} else {
 		color_control |= (0xcc << 16);
+	}
 
-	for (i = 0; i < 8; i++) {
-		if (pbs->rt[i].blend_enable) {
-			color_control |= S_028808_TARGET_BLEND_ENABLE(1 << i);
+	if (pbs->independent_blend_enable) {
+		for (i = 0; i < nr_cbufs; i++) {
+			if (pbs->rt[i].blend_enable) {
+				color_control |= S_028808_TARGET_BLEND_ENABLE(1 << i);
+			}
+			target_mask |= (pbs->rt[i].colormask << (4 * i));
+		}
+	} else {
+		for (i = 0; i < nr_cbufs; i++) {
+			if (pbs->rt[0].blend_enable) {
+				color_control |= S_028808_TARGET_BLEND_ENABLE(1 << i);
+			}
+			target_mask |= (pbs->rt[0].colormask << (4 * i));
 		}
-		target_mask |= (pbs->rt[i].colormask << (4 * i));
-		
 	}
 	rstate = radeon_state(rscreen->rw, R600_CB_CNTL_TYPE, R600_CB_CNTL);
 	rstate->states[R600_CB_CNTL__CB_SHADER_MASK] = shader_mask;
diff --git a/src/gallium/drivers/r600/r600_state_inlines.h b/src/gallium/drivers/r600/r600_state_inlines.h
index 8271ad19fba..f93c20da35e 100644
--- a/src/gallium/drivers/r600/r600_state_inlines.h
+++ b/src/gallium/drivers/r600/r600_state_inlines.h
@@ -110,7 +110,7 @@ static INLINE uint32_t r600_translate_stencil_op(int s_op)
 	case PIPE_STENCIL_OP_DECR:
 		return V_028800_STENCIL_DECR;
 	case PIPE_STENCIL_OP_INCR_WRAP:
-		return V_028800_STENCIL_INVERT;
+		return V_028800_STENCIL_INCR_WRAP;
 	case PIPE_STENCIL_OP_DECR_WRAP:
 		return V_028800_STENCIL_DECR_WRAP;
 	case PIPE_STENCIL_OP_INVERT:
@@ -289,7 +289,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format)
 
 static INLINE boolean r600_is_sampler_format_supported(enum pipe_format format)
 {
-	return r600_translate_colorformat(format) != ~0;
+	return r600_translate_texformat(format, NULL, NULL, NULL) != ~0;
 }
 
 static INLINE boolean r600_is_colorbuffer_format_supported(enum pipe_format format)
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index 1bce9113066..30d79ebdd6f 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -33,6 +33,7 @@
 #include "r600_screen.h"
 #include "r600_context.h"
 #include "r600_resource.h"
+#include "r600d.h"
 
 extern struct u_resource_vtbl r600_texture_vtbl;
 
@@ -277,3 +278,250 @@ void r600_init_screen_texture_functions(struct pipe_screen *screen)
 	screen->get_tex_surface = r600_get_tex_surface;
 	screen->tex_surface_destroy = r600_tex_surface_destroy;
 }
+
+/*
+ * Build the four selector fields (at bits 16, 19, 22 and 25) from the format
+ * swizzle.  When swizzle_view is non-NULL it is applied on top: an entry
+ * naming a channel (X..W) is looked up in swizzle_format, any other entry
+ * (e.g. constant 0/1) is used as is.
+ */
+static unsigned r600_get_swizzle_combined(const unsigned char *swizzle_format,
+					  const unsigned char *swizzle_view)
+{
+	static const unsigned dst_sel_shift[4] = { 16, 19, 22, 25 };
+	unsigned word4_bits = 0;
+	unsigned chan;
+
+	for (chan = 0; chan < 4; chan++) {
+		unsigned char src;
+		unsigned sel;
+		if (!swizzle_view)
+			src = swizzle_format[chan];
+		else if (swizzle_view[chan] <= UTIL_FORMAT_SWIZZLE_W)
+			src = swizzle_format[swizzle_view[chan]];
+		else
+			src = swizzle_view[chan];
+
+		switch (src) {
+		case UTIL_FORMAT_SWIZZLE_Y:
+			sel = 1;
+			break;
+		case UTIL_FORMAT_SWIZZLE_Z:
+			sel = 2;
+			break;
+		case UTIL_FORMAT_SWIZZLE_W:
+			sel = 3;
+			break;
+		case UTIL_FORMAT_SWIZZLE_0:
+			sel = V_038010_SQ_SEL_0;
+			break;
+		case UTIL_FORMAT_SWIZZLE_1:
+			sel = V_038010_SQ_SEL_1;
+			break;
+		default: /* UTIL_FORMAT_SWIZZLE_X */
+			sel = 0;
+			break;
+		}
+		word4_bits |= sel << dst_sel_shift[chan];
+	}
+	return word4_bits;
+}
+
+/*
+ * Translate a gallium format into the texture format value of a sampler
+ * resource.  swizzle_view (may be NULL) is combined with the format's own
+ * swizzle.  On success the optional outputs receive the resource WORD4 bits
+ * (degamma, channel signs, destination selects) in *word4_p and 0 in
+ * *yuv_format_p; either pointer may be NULL.  Returns the format value, or
+ * ~0 if the format is unsupported, in which case the outputs are untouched.
+ */
+uint32_t r600_translate_texformat(enum pipe_format format,
+				  const unsigned char *swizzle_view,
+				  uint32_t *word4_p, uint32_t *yuv_format_p)
+{
+	uint32_t result = 0, word4 = 0, yuv_format = 0;
+	const struct util_format_description *desc;
+	boolean uniform = TRUE;
+	int i;
+	const uint32_t sign_bit[4] = {
+		S_038010_FORMAT_COMP_X(V_038010_SQ_FORMAT_COMP_SIGNED),
+		S_038010_FORMAT_COMP_Y(V_038010_SQ_FORMAT_COMP_SIGNED),
+		S_038010_FORMAT_COMP_Z(V_038010_SQ_FORMAT_COMP_SIGNED),
+		S_038010_FORMAT_COMP_W(V_038010_SQ_FORMAT_COMP_SIGNED)
+	};
+	desc = util_format_description(format);
+
+	/* Colorspace (return non-RGB formats directly). */
+	switch (desc->colorspace) {
+	case UTIL_FORMAT_COLORSPACE_ZS:
+		/* Depth stencil formats */
+		switch (format) {
+		case PIPE_FORMAT_Z16_UNORM:
+			result = V_028010_DEPTH_16;
+			goto out_word4;
+		case PIPE_FORMAT_Z24X8_UNORM:
+			result = V_028010_DEPTH_X8_24;
+			goto out_word4;
+		case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+			result = V_028010_DEPTH_8_24;
+			goto out_word4;
+		default:
+			goto out_unknown;
+		}
+
+	case UTIL_FORMAT_COLORSPACE_YUV:
+		/* TODO: YUV formats (UYVY, YUYV, ...) are not handled yet. */
+		yuv_format |= (1 << 30);
+		goto out_unknown;
+
+	case UTIL_FORMAT_COLORSPACE_SRGB:
+		word4 |= S_038010_FORCE_DEGAMMA(1);
+		if (format == PIPE_FORMAT_L8A8_SRGB || format == PIPE_FORMAT_L8_SRGB)
+			goto out_unknown; /* fails for some reason - TODO */
+		break;
+
+	default:
+		break;
+	}
+
+	word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view);
+
+	/* S3TC formats. TODO */
+	if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+		goto out_unknown;
+	}
+
+	/* Flag every signed channel in WORD4. */
+	for (i = 0; i < desc->nr_channels; i++) {
+		if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
+			word4 |= sign_bit[i];
+		}
+	}
+
+	/* TODO: R8G8Bx_SNORM (CxV8U8) and RGTC formats. */
+
+	/* See whether the components are of the same size. */
+	for (i = 1; i < desc->nr_channels; i++) {
+		uniform = uniform && desc->channel[0].size == desc->channel[i].size;
+	}
+
+	/* Non-uniform formats. */
+	if (!uniform) {
+		switch (desc->nr_channels) {
+		case 3:
+			if (desc->channel[0].size == 5 &&
+			    desc->channel[1].size == 6 &&
+			    desc->channel[2].size == 5) {
+				result |= V_0280A0_COLOR_5_6_5;
+				goto out_word4;
+			}
+			goto out_unknown;
+		case 4:
+			if (desc->channel[0].size == 5 &&
+			    desc->channel[1].size == 5 &&
+			    desc->channel[2].size == 5 &&
+			    desc->channel[3].size == 1) {
+				result |= V_0280A0_COLOR_1_5_5_5;
+				goto out_word4;
+			}
+			if (desc->channel[0].size == 10 &&
+			    desc->channel[1].size == 10 &&
+			    desc->channel[2].size == 10 &&
+			    desc->channel[3].size == 2) {
+				result |= V_0280A0_COLOR_10_10_10_2;
+				goto out_word4;
+			}
+			goto out_unknown;
+		}
+		goto out_unknown;
+	}
+
+	/* uniform formats */
+	switch (desc->channel[0].type) {
+	case UTIL_FORMAT_TYPE_UNSIGNED:
+	case UTIL_FORMAT_TYPE_SIGNED:
+		if (!desc->channel[0].normalized &&
+		    desc->colorspace != UTIL_FORMAT_COLORSPACE_SRGB) {
+			goto out_unknown;
+		}
+
+		switch (desc->channel[0].size) {
+		case 4:
+			switch (desc->nr_channels) {
+			case 2:
+				result |= V_0280A0_COLOR_4_4;
+				goto out_word4;
+			case 4:
+				result |= V_0280A0_COLOR_4_4_4_4;
+				goto out_word4;
+			}
+			goto out_unknown;
+		case 8:
+			switch (desc->nr_channels) {
+			case 1:
+				result |= V_0280A0_COLOR_8;
+				goto out_word4;
+			case 2:
+				result |= V_0280A0_COLOR_8_8;
+				goto out_word4;
+			case 4:
+				result |= V_0280A0_COLOR_8_8_8_8;
+				goto out_word4;
+			}
+			goto out_unknown;
+		case 16:
+			switch (desc->nr_channels) {
+			case 1:
+				result |= V_0280A0_COLOR_16;
+				goto out_word4;
+			case 2:
+				result |= V_0280A0_COLOR_16_16;
+				goto out_word4;
+			case 4:
+				result |= V_0280A0_COLOR_16_16_16_16;
+				goto out_word4;
+			}
+		}
+		goto out_unknown;
+
+	case UTIL_FORMAT_TYPE_FLOAT:
+		switch (desc->channel[0].size) {
+		case 16:
+			switch (desc->nr_channels) {
+			case 1:
+				result |= V_0280A0_COLOR_16_FLOAT;
+				goto out_word4;
+			case 2:
+				result |= V_0280A0_COLOR_16_16_FLOAT;
+				goto out_word4;
+			case 4:
+				result |= V_0280A0_COLOR_16_16_16_16_FLOAT;
+				goto out_word4;
+			}
+			goto out_unknown;
+		case 32:
+			switch (desc->nr_channels) {
+			case 1:
+				result |= V_0280A0_COLOR_32_FLOAT;
+				goto out_word4;
+			case 2:
+				result |= V_0280A0_COLOR_32_32_FLOAT;
+				goto out_word4;
+			case 4:
+				result |= V_0280A0_COLOR_32_32_32_32_FLOAT;
+				goto out_word4;
+			}
+		}
+		goto out_unknown; /* unsupported float size or channel count */
+	}
+	/* Any other channel type is unsupported; never report success with 0. */
+	goto out_unknown;
+out_word4:
+	if (word4_p)
+		*word4_p = word4;
+	if (yuv_format_p)
+		*yuv_format_p = yuv_format;
+	return result;
+out_unknown:
+	return ~0;
+}
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index fb71b1e5d1d..53388f822ea 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -352,6 +352,61 @@
 #define   S_028808_ROP3(x)                             (((x) & 0xFF) << 16)
 #define   G_028808_ROP3(x)                             (((x) >> 16) & 0xFF)
 #define   C_028808_ROP3                                0xFF00FFFF
+#define R_028810_PA_CL_CLIP_CNTL                     0x028810 /* per field: S_ packs a value, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_028810_UCP_ENA_0(x)                        (((x) & 0x1) << 0)
+#define   G_028810_UCP_ENA_0(x)                        (((x) >> 0) & 0x1)
+#define   C_028810_UCP_ENA_0                           0xFFFFFFFE
+#define   S_028810_UCP_ENA_1(x)                        (((x) & 0x1) << 1)
+#define   G_028810_UCP_ENA_1(x)                        (((x) >> 1) & 0x1)
+#define   C_028810_UCP_ENA_1                           0xFFFFFFFD
+#define   S_028810_UCP_ENA_2(x)                        (((x) & 0x1) << 2)
+#define   G_028810_UCP_ENA_2(x)                        (((x) >> 2) & 0x1)
+#define   C_028810_UCP_ENA_2                           0xFFFFFFFB
+#define   S_028810_UCP_ENA_3(x)                        (((x) & 0x1) << 3)
+#define   G_028810_UCP_ENA_3(x)                        (((x) >> 3) & 0x1)
+#define   C_028810_UCP_ENA_3                           0xFFFFFFF7
+#define   S_028810_UCP_ENA_4(x)                        (((x) & 0x1) << 4)
+#define   G_028810_UCP_ENA_4(x)                        (((x) >> 4) & 0x1)
+#define   C_028810_UCP_ENA_4                           0xFFFFFFEF
+#define   S_028810_UCP_ENA_5(x)                        (((x) & 0x1) << 5)
+#define   G_028810_UCP_ENA_5(x)                        (((x) >> 5) & 0x1)
+#define   C_028810_UCP_ENA_5                           0xFFFFFFDF
+#define   S_028810_PS_UCP_Y_SCALE_NEG(x)               (((x) & 0x1) << 13)
+#define   G_028810_PS_UCP_Y_SCALE_NEG(x)               (((x) >> 13) & 0x1)
+#define   C_028810_PS_UCP_Y_SCALE_NEG                  0xFFFFDFFF
+#define   S_028810_PS_UCP_MODE(x)                      (((x) & 0x3) << 14)
+#define   G_028810_PS_UCP_MODE(x)                      (((x) >> 14) & 0x3)
+#define   C_028810_PS_UCP_MODE                         0xFFFF3FFF
+#define   S_028810_CLIP_DISABLE(x)                     (((x) & 0x1) << 16)
+#define   G_028810_CLIP_DISABLE(x)                     (((x) >> 16) & 0x1)
+#define   C_028810_CLIP_DISABLE                        0xFFFEFFFF
+#define   S_028810_UCP_CULL_ONLY_ENA(x)                (((x) & 0x1) << 17)
+#define   G_028810_UCP_CULL_ONLY_ENA(x)                (((x) >> 17) & 0x1)
+#define   C_028810_UCP_CULL_ONLY_ENA                   0xFFFDFFFF
+#define   S_028810_BOUNDARY_EDGE_FLAG_ENA(x)           (((x) & 0x1) << 18)
+#define   G_028810_BOUNDARY_EDGE_FLAG_ENA(x)           (((x) >> 18) & 0x1)
+#define   C_028810_BOUNDARY_EDGE_FLAG_ENA              0xFFFBFFFF
+#define   S_028810_DX_CLIP_SPACE_DEF(x)                (((x) & 0x1) << 19)
+#define   G_028810_DX_CLIP_SPACE_DEF(x)                (((x) >> 19) & 0x1)
+#define   C_028810_DX_CLIP_SPACE_DEF                   0xFFF7FFFF
+#define   S_028810_DIS_CLIP_ERR_DETECT(x)              (((x) & 0x1) << 20)
+#define   G_028810_DIS_CLIP_ERR_DETECT(x)              (((x) >> 20) & 0x1)
+#define   C_028810_DIS_CLIP_ERR_DETECT                 0xFFEFFFFF
+#define   S_028810_VTX_KILL_OR(x)                      (((x) & 0x1) << 21)
+#define   G_028810_VTX_KILL_OR(x)                      (((x) >> 21) & 0x1)
+#define   C_028810_VTX_KILL_OR                         0xFFDFFFFF
+#define   S_028810_DX_LINEAR_ATTR_CLIP_ENA(x)          (((x) & 0x1) << 24)
+#define   G_028810_DX_LINEAR_ATTR_CLIP_ENA(x)          (((x) >> 24) & 0x1)
+#define   C_028810_DX_LINEAR_ATTR_CLIP_ENA             0xFEFFFFFF
+#define   S_028810_VTE_VPORT_PROVOKE_DISABLE(x)        (((x) & 0x1) << 25)
+#define   G_028810_VTE_VPORT_PROVOKE_DISABLE(x)        (((x) >> 25) & 0x1)
+#define   C_028810_VTE_VPORT_PROVOKE_DISABLE           0xFDFFFFFF
+#define   S_028810_ZCLIP_NEAR_DISABLE(x)               (((x) & 0x1) << 26)
+#define   G_028810_ZCLIP_NEAR_DISABLE(x)               (((x) >> 26) & 0x1)
+#define   C_028810_ZCLIP_NEAR_DISABLE                  0xFBFFFFFF
+#define   S_028810_ZCLIP_FAR_DISABLE(x)                (((x) & 0x1) << 27)
+#define   G_028810_ZCLIP_FAR_DISABLE(x)                (((x) >> 27) & 0x1)
+#define   C_028810_ZCLIP_FAR_DISABLE                   0xF7FFFFFF
 #define R_028010_DB_DEPTH_INFO                       0x028010
 #define   S_028010_FORMAT(x)                           (((x) & 0x7) << 0)
 #define   G_028010_FORMAT(x)                           (((x) >> 0) & 0x7)
@@ -599,6 +654,13 @@
 #define   S_028E0C_OFFSET(x)                           (((x) & 0xFFFFFFFF) << 0)
 #define   G_028E0C_OFFSET(x)                           (((x) >> 0) & 0xFFFFFFFF)
 #define   C_028E0C_OFFSET                              0x00000000
+#define R_028A00_PA_SU_POINT_SIZE                    0x028A00 /* per field: S_ packs a value, G_ extracts it, C_ is the AND-mask that clears it */
+#define   S_028A00_HEIGHT(x)                           (((x) & 0xFFFF) << 0) /* HEIGHT: bits 15:0 */
+#define   G_028A00_HEIGHT(x)                           (((x) >> 0) & 0xFFFF)
+#define   C_028A00_HEIGHT                              0xFFFF0000
+#define   S_028A00_WIDTH(x)                            (((x) & 0xFFFF) << 16) /* WIDTH: bits 31:16 */
+#define   G_028A00_WIDTH(x)                            (((x) >> 16) & 0xFFFF)
+#define   C_028A00_WIDTH                               0x0000FFFF
 #define R_028A40_VGT_GS_MODE                         0x028A40
 #define   S_028A40_MODE(x)                             (((x) & 0x3) << 0)
 #define   G_028A40_MODE(x)                             (((x) >> 0) & 0x3)
@@ -1098,6 +1160,79 @@
 #define     V_008958_DI_PT_2D_FILL_RECT_LIST           0x0000001A
 #define     V_008958_DI_PT_2D_LINE_STRIP               0x0000001B
 #define     V_008958_DI_PT_2D_TRI_STRIP                0x0000001C
+#define R_02881C_PA_CL_VS_OUT_CNTL                   0x02881C
+#define   S_02881C_CLIP_DIST_ENA_0(x)                  (((x) & 0x1) << 0)
+#define   G_02881C_CLIP_DIST_ENA_0(x)                  (((x) >> 0) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_0                     0xFFFFFFFE
+#define   S_02881C_CLIP_DIST_ENA_1(x)                  (((x) & 0x1) << 1)
+#define   G_02881C_CLIP_DIST_ENA_1(x)                  (((x) >> 1) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_1                     0xFFFFFFFD
+#define   S_02881C_CLIP_DIST_ENA_2(x)                  (((x) & 0x1) << 2)
+#define   G_02881C_CLIP_DIST_ENA_2(x)                  (((x) >> 2) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_2                     0xFFFFFFFB
+#define   S_02881C_CLIP_DIST_ENA_3(x)                  (((x) & 0x1) << 3)
+#define   G_02881C_CLIP_DIST_ENA_3(x)                  (((x) >> 3) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_3                     0xFFFFFFF7
+#define   S_02881C_CLIP_DIST_ENA_4(x)                  (((x) & 0x1) << 4)
+#define   G_02881C_CLIP_DIST_ENA_4(x)                  (((x) >> 4) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_4                     0xFFFFFFEF
+#define   S_02881C_CLIP_DIST_ENA_5(x)                  (((x) & 0x1) << 5)
+#define   G_02881C_CLIP_DIST_ENA_5(x)                  (((x) >> 5) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_5                     0xFFFFFFDF
+#define   S_02881C_CLIP_DIST_ENA_6(x)                  (((x) & 0x1) << 6)
+#define   G_02881C_CLIP_DIST_ENA_6(x)                  (((x) >> 6) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_6                     0xFFFFFFBF
+#define   S_02881C_CLIP_DIST_ENA_7(x)                  (((x) & 0x1) << 7)
+#define   G_02881C_CLIP_DIST_ENA_7(x)                  (((x) >> 7) & 0x1)
+#define   C_02881C_CLIP_DIST_ENA_7                     0xFFFFFF7F
+#define   S_02881C_CULL_DIST_ENA_0(x)                  (((x) & 0x1) << 8)
+#define   G_02881C_CULL_DIST_ENA_0(x)                  (((x) >> 8) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_0                     0xFFFFFEFF
+#define   S_02881C_CULL_DIST_ENA_1(x)                  (((x) & 0x1) << 9)
+#define   G_02881C_CULL_DIST_ENA_1(x)                  (((x) >> 9) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_1                     0xFFFFFDFF
+#define   S_02881C_CULL_DIST_ENA_2(x)                  (((x) & 0x1) << 10)
+#define   G_02881C_CULL_DIST_ENA_2(x)                  (((x) >> 10) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_2                     0xFFFFFBFF
+#define   S_02881C_CULL_DIST_ENA_3(x)                  (((x) & 0x1) << 11)
+#define   G_02881C_CULL_DIST_ENA_3(x)                  (((x) >> 11) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_3                     0xFFFFF7FF
+#define   S_02881C_CULL_DIST_ENA_4(x)                  (((x) & 0x1) << 12)
+#define   G_02881C_CULL_DIST_ENA_4(x)                  (((x) >> 12) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_4                     0xFFFFEFFF
+#define   S_02881C_CULL_DIST_ENA_5(x)                  (((x) & 0x1) << 13)
+#define   G_02881C_CULL_DIST_ENA_5(x)                  (((x) >> 13) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_5                     0xFFFFDFFF
+#define   S_02881C_CULL_DIST_ENA_6(x)                  (((x) & 0x1) << 14)
+#define   G_02881C_CULL_DIST_ENA_6(x)                  (((x) >> 14) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_6                     0xFFFFBFFF
+#define   S_02881C_CULL_DIST_ENA_7(x)                  (((x) & 0x1) << 15)
+#define   G_02881C_CULL_DIST_ENA_7(x)                  (((x) >> 15) & 0x1)
+#define   C_02881C_CULL_DIST_ENA_7                     0xFFFF7FFF
+#define   S_02881C_USE_VTX_POINT_SIZE(x)               (((x) & 0x1) << 16)
+#define   G_02881C_USE_VTX_POINT_SIZE(x)               (((x) >> 16) & 0x1)
+#define   C_02881C_USE_VTX_POINT_SIZE                  0xFFFEFFFF
+#define   S_02881C_USE_VTX_EDGE_FLAG(x)                (((x) & 0x1) << 17)
+#define   G_02881C_USE_VTX_EDGE_FLAG(x)                (((x) >> 17) & 0x1)
+#define   C_02881C_USE_VTX_EDGE_FLAG                   0xFFFDFFFF
+#define   S_02881C_USE_VTX_RENDER_TARGET_INDX(x)       (((x) & 0x1) << 18)
+#define   G_02881C_USE_VTX_RENDER_TARGET_INDX(x)       (((x) >> 18) & 0x1)
+#define   C_02881C_USE_VTX_RENDER_TARGET_INDX          0xFFFBFFFF
+#define   S_02881C_USE_VTX_VIEWPORT_INDX(x)            (((x) & 0x1) << 19)
+#define   G_02881C_USE_VTX_VIEWPORT_INDX(x)            (((x) >> 19) & 0x1)
+#define   C_02881C_USE_VTX_VIEWPORT_INDX               0xFFF7FFFF
+#define   S_02881C_USE_VTX_KILL_FLAG(x)                (((x) & 0x1) << 20)
+#define   G_02881C_USE_VTX_KILL_FLAG(x)                (((x) >> 20) & 0x1)
+#define   C_02881C_USE_VTX_KILL_FLAG                   0xFFEFFFFF
+#define   S_02881C_VS_OUT_MISC_VEC_ENA(x)              (((x) & 0x1) << 21)
+#define   G_02881C_VS_OUT_MISC_VEC_ENA(x)              (((x) >> 21) & 0x1)
+#define   C_02881C_VS_OUT_MISC_VEC_ENA                 0xFFDFFFFF
+#define   S_02881C_VS_OUT_CCDIST0_VEC_ENA(x)           (((x) & 0x1) << 22)
+#define   G_02881C_VS_OUT_CCDIST0_VEC_ENA(x)           (((x) >> 22) & 0x1)
+#define   C_02881C_VS_OUT_CCDIST0_VEC_ENA              0xFFBFFFFF
+#define   S_02881C_VS_OUT_CCDIST1_VEC_ENA(x)           (((x) & 0x1) << 23)
+#define   G_02881C_VS_OUT_CCDIST1_VEC_ENA(x)           (((x) >> 23) & 0x1)
+#define   C_02881C_VS_OUT_CCDIST1_VEC_ENA              0xFF7FFFFF
 #define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
 #define   S_028868_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
 #define   G_028868_NUM_GPRS(x)                         (((x) >> 0) & 0xFF)
diff --git a/src/gallium/drivers/r600/radeon.h b/src/gallium/drivers/r600/radeon.h
index 3a8405f9b40..8f00a4895a0 100644
--- a/src/gallium/drivers/r600/radeon.h
+++ b/src/gallium/drivers/r600/radeon.h
@@ -157,11 +157,42 @@ int radeon_ctx_submit(struct radeon_ctx *ctx);
 void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file);
 
 /*
+ * radeon context structures
+ */
+#pragma pack(1)
+struct radeon_cs_reloc {
+	uint32_t	handle;
+	uint32_t	read_domain;
+	uint32_t	write_domain;
+	uint32_t	flags;
+};
+#pragma pack()
+
+struct radeon_ctx {
+	int				refcount;
+	struct radeon			*radeon;
+	u32				*pm4;
+	u32				cpm4;
+	u32				draw_cpm4;
+	unsigned			id;
+	unsigned			next_id;
+	unsigned			nreloc;
+	struct radeon_cs_reloc		*reloc;
+	unsigned			nbo;
+	struct radeon_bo		**bo;
+	unsigned			ndraw;
+	struct radeon_draw		*cdraw;
+	struct radeon_draw		**draw;
+	unsigned			nstate;
+	struct radeon_state		**state;
+};
+
+/*
  * R600/R700
  */
 
-#define R600_NSTATE				1273
-#define R600_NTYPE				25
+#define R600_NSTATE				1280
+#define R600_NTYPE				32
 
 #define R600_CONFIG				0
 #define R600_CONFIG_TYPE				0
@@ -207,12 +238,26 @@ void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file);
 #define R600_GS_SAMPLER_BORDER_TYPE				20
 #define R600_CB0				1269
 #define R600_CB0_TYPE				21
-#define R600_DB				1270
-#define R600_DB_TYPE				22
-#define R600_VGT				1271
-#define R600_VGT_TYPE				23
-#define R600_DRAW				1272
-#define R600_DRAW_TYPE				24
+#define R600_CB1				1270
+#define R600_CB1_TYPE				22
+#define R600_CB2				1271
+#define R600_CB2_TYPE				23
+#define R600_CB3				1272
+#define R600_CB3_TYPE				24
+#define R600_CB4				1273
+#define R600_CB4_TYPE				25
+#define R600_CB5				1274
+#define R600_CB5_TYPE				26
+#define R600_CB6				1275
+#define R600_CB6_TYPE				27
+#define R600_CB7				1276
+#define R600_CB7_TYPE				28
+#define R600_DB				1277
+#define R600_DB_TYPE				29
+#define R600_VGT				1278
+#define R600_VGT_TYPE				30
+#define R600_DRAW				1279
+#define R600_DRAW_TYPE				31
 /* R600_CONFIG */
 #define R600_CONFIG__SQ_CONFIG			0
 #define R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1			1
diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h
index 0358c14e24b..1fa3ec8300a 100644
--- a/src/gallium/include/pipe/p_compiler.h
+++ b/src/gallium/include/pipe/p_compiler.h
@@ -102,6 +102,16 @@ typedef unsigned char boolean;
 #  endif
 #endif
 
+/* Forced function inlining */
+#ifndef ALWAYS_INLINE
+#  ifdef __GNUC__
+#    define ALWAYS_INLINE inline __attribute__((always_inline))
+#  elif defined(_MSC_VER)
+#    define ALWAYS_INLINE __forceinline
+#  else
+#    define ALWAYS_INLINE INLINE
+#  endif
+#endif
 
 /* Function visibility */
 #ifndef PUBLIC
diff --git a/src/gallium/state_trackers/dri/common/dri_drawable.c b/src/gallium/state_trackers/dri/common/dri_drawable.c
index c67ca2224d0..1bdfdccf439 100644
--- a/src/gallium/state_trackers/dri/common/dri_drawable.c
+++ b/src/gallium/state_trackers/dri/common/dri_drawable.c
@@ -30,6 +30,7 @@
  */
 
 #include "dri_screen.h"
+#include "dri_context.h"
 #include "dri_drawable.h"
 
 #include "pipe/p_screen.h"
@@ -157,9 +158,9 @@ dri_destroy_buffer(__DRIdrawable * dPriv)
 
 /**
  * Validate the texture at an attachment.  Allocate the texture if it does not
- * exist.
+ * exist.  Used by the texture-from-pixmap (TFP) extension.
  */
-void
+static void
 dri_drawable_validate_att(struct dri_drawable *drawable,
                           enum st_attachment_type statt)
 {
@@ -180,11 +181,61 @@ dri_drawable_validate_att(struct dri_drawable *drawable,
 
    drawable->texture_stamp = drawable->dPriv->lastStamp - 1;
 
-   /* this calles into the manager */
    drawable->base.validate(&drawable->base, statts, count, NULL);
 }
 
 /**
+ * These are used for GLX_EXT_texture_from_pixmap
+ */
+static void
+dri_set_tex_buffer2(__DRIcontext *pDRICtx, GLint target,
+                    GLint format, __DRIdrawable *dPriv)
+{
+   struct dri_context *ctx = dri_context(pDRICtx);
+   struct dri_drawable *drawable = dri_drawable(dPriv);
+   struct pipe_resource *pt;
+
+   dri_drawable_validate_att(drawable, ST_ATTACHMENT_FRONT_LEFT);
+
+   pt = drawable->textures[ST_ATTACHMENT_FRONT_LEFT];
+
+   if (pt) {
+      enum pipe_format internal_format = pt->format;
+
+      if (format == __DRI_TEXTURE_FORMAT_RGB)  {
+         /* only need to cover the formats recognized by dri_fill_st_visual */
+         switch (internal_format) {
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            internal_format = PIPE_FORMAT_B8G8R8X8_UNORM;
+            break;
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            internal_format = PIPE_FORMAT_X8R8G8B8_UNORM;
+            break;
+         default:
+            break;
+         }
+      }
+
+      ctx->st->teximage(ctx->st,
+            (target == GL_TEXTURE_2D) ? ST_TEXTURE_2D : ST_TEXTURE_RECT,
+            0, internal_format, pt, FALSE);
+   }
+}
+
+static void
+dri_set_tex_buffer(__DRIcontext *pDRICtx, GLint target,
+                   __DRIdrawable *dPriv)
+{
+   dri_set_tex_buffer2(pDRICtx, target, __DRI_TEXTURE_FORMAT_RGBA, dPriv);
+}
+
+const __DRItexBufferExtension driTexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   dri_set_tex_buffer,
+   dri_set_tex_buffer2,
+};
+
+/**
  * Get the format and binding of an attachment.
  */
 void
diff --git a/src/gallium/state_trackers/dri/common/dri_drawable.h b/src/gallium/state_trackers/dri/common/dri_drawable.h
index 3f2e24fc158..74e662d36c4 100644
--- a/src/gallium/state_trackers/dri/common/dri_drawable.h
+++ b/src/gallium/state_trackers/dri/common/dri_drawable.h
@@ -89,9 +89,7 @@ dri_drawable_get_format(struct dri_drawable *drawable,
                         enum pipe_format *format,
                         unsigned *bind);
 
-void
-dri_drawable_validate_att(struct dri_drawable *drawable,
-                          enum st_attachment_type statt);
+extern const __DRItexBufferExtension driTexBufferExtension;
 
 #endif
 
diff --git a/src/gallium/state_trackers/dri/drm/dri2.c b/src/gallium/state_trackers/dri/drm/dri2.c
index 1fb89963371..47005c17e2b 100644
--- a/src/gallium/state_trackers/dri/drm/dri2.c
+++ b/src/gallium/state_trackers/dri/drm/dri2.c
@@ -67,86 +67,6 @@ static const __DRI2flushExtension dri2FlushExtension = {
 };
 
 /**
- * These are used for GLX_EXT_texture_from_pixmap
- */
-static void
-dri2_set_tex_buffer2(__DRIcontext *pDRICtx, GLint target,
-                     GLint format, __DRIdrawable *dPriv)
-{
-   struct dri_context *ctx = dri_context(pDRICtx);
-   struct dri_drawable *drawable = dri_drawable(dPriv);
-   struct pipe_resource *pt;
-
-   dri_drawable_validate_att(drawable, ST_ATTACHMENT_FRONT_LEFT);
-
-   pt = drawable->textures[ST_ATTACHMENT_FRONT_LEFT];
-
-   if (pt) {
-      enum pipe_format internal_format = pt->format;
-
-      if (format == __DRI_TEXTURE_FORMAT_RGB)  {
-         /* only need to cover the formats recognized by dri_fill_st_visual */
-         switch (internal_format) {
-         case PIPE_FORMAT_B8G8R8A8_UNORM:
-            internal_format = PIPE_FORMAT_B8G8R8X8_UNORM;
-            break;
-         case PIPE_FORMAT_A8R8G8B8_UNORM:
-            internal_format = PIPE_FORMAT_X8R8G8B8_UNORM;
-            break;
-         default:
-            break;
-         }
-      }
-
-      ctx->st->teximage(ctx->st,
-            (target == GL_TEXTURE_2D) ? ST_TEXTURE_2D : ST_TEXTURE_RECT,
-            0, internal_format, pt, FALSE);
-   }
-}
-
-static void
-dri2_set_tex_buffer(__DRIcontext *pDRICtx, GLint target,
-                    __DRIdrawable *dPriv)
-{
-   dri2_set_tex_buffer2(pDRICtx, target, __DRI_TEXTURE_FORMAT_RGBA, dPriv);
-}
-
-static const __DRItexBufferExtension dri2TexBufferExtension = {
-    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
-   dri2_set_tex_buffer,
-   dri2_set_tex_buffer2,
-};
-
-/**
- * Get the format and binding of an attachment.
- */
-static INLINE void
-dri2_drawable_get_format(struct dri_drawable *drawable,
-                         enum st_attachment_type statt,
-                         enum pipe_format *format,
-                         unsigned *bind)
-{
-   switch (statt) {
-   case ST_ATTACHMENT_FRONT_LEFT:
-   case ST_ATTACHMENT_BACK_LEFT:
-   case ST_ATTACHMENT_FRONT_RIGHT:
-   case ST_ATTACHMENT_BACK_RIGHT:
-      *format = drawable->stvis.color_format;
-      *bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW;
-      break;
-   case ST_ATTACHMENT_DEPTH_STENCIL:
-      *format = drawable->stvis.depth_stencil_format;
-      *bind = PIPE_BIND_DEPTH_STENCIL; /* XXX sampler? */
-      break;
-   default:
-      *format = PIPE_FORMAT_NONE;
-      *bind = 0;
-      break;
-   }
-}
-
-
-/**
  * Retrieve __DRIbuffer from the DRI loader.
  */
 static __DRIbuffer *
@@ -176,7 +96,7 @@ dri2_drawable_get_buffers(struct dri_drawable *drawable,
       unsigned bind;
       int att, bpp;
 
-      dri2_drawable_get_format(drawable, statts[i], &format, &bind);
+      dri_drawable_get_format(drawable, statts[i], &format, &bind);
       if (format == PIPE_FORMAT_NONE)
          continue;
 
@@ -318,7 +238,7 @@ dri2_drawable_process_buffers(struct dri_drawable *drawable,
          break;
       }
 
-      dri2_drawable_get_format(drawable, statt, &format, &bind);
+      dri_drawable_get_format(drawable, statt, &format, &bind);
       if (statt == ST_ATTACHMENT_INVALID || format == PIPE_FORMAT_NONE)
          continue;
 
@@ -483,7 +403,7 @@ static const __DRIextension *dri_screen_extensions[] = {
    &driCopySubBufferExtension.base,
    &driSwapControlExtension.base,
    &driMediaStreamCounterExtension.base,
-   &dri2TexBufferExtension.base,
+   &driTexBufferExtension.base,
    &dri2FlushExtension.base,
    &dri2ImageExtension.base,
    &dri2ConfigQueryExtension.base,
diff --git a/src/gallium/state_trackers/dri/sw/drisw.c b/src/gallium/state_trackers/dri/sw/drisw.c
index ae96f1b20e2..249ccd7fcf6 100644
--- a/src/gallium/state_trackers/dri/sw/drisw.c
+++ b/src/gallium/state_trackers/dri/sw/drisw.c
@@ -201,7 +201,7 @@ drisw_allocate_textures(struct dri_drawable *drawable,
    struct pipe_resource templ;
    unsigned width, height;
    boolean resized;
-   int i;
+   unsigned i;
 
    width  = drawable->dPriv->w;
    height = drawable->dPriv->h;
@@ -222,7 +222,7 @@ drisw_allocate_textures(struct dri_drawable *drawable,
    templ.depth0 = 1;
    templ.last_level = 0;
 
-   for (i = 0; i < ST_ATTACHMENT_COUNT; i++) {
+   for (i = 0; i < count; i++) {
       enum pipe_format format;
       unsigned bind;
 
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.c b/src/gallium/state_trackers/egl/common/egl_g3d.c
index 56d575ffe08..02b9f6aec4f 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.c
@@ -530,6 +530,12 @@ egl_g3d_initialize(_EGLDriver *drv, _EGLDisplay *dpy,
    if (gdpy->native->get_param(gdpy->native, NATIVE_PARAM_USE_NATIVE_BUFFER))
       dpy->Extensions.KHR_image_pixmap = EGL_TRUE;
 
+   /* the sync entry points are only built when EGL_KHR_reusable_sync is defined */
+#ifdef EGL_KHR_reusable_sync
+   dpy->Extensions.KHR_reusable_sync = EGL_TRUE;
+   dpy->Extensions.KHR_fence_sync = EGL_TRUE;
+#endif
+
    if (egl_g3d_add_configs(drv, dpy, 1) == 1) {
       _eglError(EGL_NOT_INITIALIZED, "eglInitialize(unable to add configs)");
       goto fail;
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.h b/src/gallium/state_trackers/egl/common/egl_g3d.h
index f33dc91cf90..be450bbede3 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.h
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.h
@@ -30,12 +30,14 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_format.h"
+#include "os/os_thread.h"
 #include "egldriver.h"
 #include "egldisplay.h"
 #include "eglcontext.h"
 #include "eglsurface.h"
 #include "eglconfig.h"
 #include "eglimage.h"
+#include "eglsync.h"
 #include "eglscreen.h"
 #include "eglmode.h"
 
@@ -99,6 +101,24 @@ struct egl_g3d_image {
 _EGL_DRIVER_STANDARD_TYPECASTS(egl_g3d)
 _EGL_DRIVER_TYPECAST(egl_g3d_image, _EGLImage, obj)
 
+#ifdef EGL_KHR_reusable_sync
+
+struct egl_g3d_sync {
+   _EGLSync base;
+
+   int refs;
+
+   /* the mutex protects only the condvar, not the struct */
+   pipe_mutex mutex;
+   pipe_condvar condvar;
+
+   /* for fence sync */
+   struct pipe_fence_handle *fence;
+};
+_EGL_DRIVER_TYPECAST(egl_g3d_sync, _EGLSync, obj)
+
+#endif /* EGL_KHR_reusable_sync */
+
 #ifdef EGL_MESA_screen_surface
 
 struct egl_g3d_screen {
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_api.c b/src/gallium/state_trackers/egl/common/egl_g3d_api.c
index edac72a8223..1120945edc7 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d_api.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_api.c
@@ -34,6 +34,7 @@
 #include "egl_g3d.h"
 #include "egl_g3d_api.h"
 #include "egl_g3d_image.h"
+#include "egl_g3d_sync.h"
 #include "egl_g3d_st.h"
 #include "egl_g3d_loader.h"
 #include "native.h"
@@ -806,6 +807,13 @@ egl_g3d_init_driver_api(_EGLDriver *drv)
    drv->API.CreateImageKHR = egl_g3d_create_image;
    drv->API.DestroyImageKHR = egl_g3d_destroy_image;
 
+#ifdef EGL_KHR_reusable_sync
+   drv->API.CreateSyncKHR = egl_g3d_create_sync;
+   drv->API.DestroySyncKHR = egl_g3d_destroy_sync;
+   drv->API.ClientWaitSyncKHR = egl_g3d_client_wait_sync;
+   drv->API.SignalSyncKHR = egl_g3d_signal_sync;
+#endif
+
 #ifdef EGL_MESA_screen_surface
    drv->API.CreateScreenSurfaceMESA = egl_g3d_create_screen_surface;
    drv->API.ShowScreenSurfaceMESA = egl_g3d_show_screen_surface;
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.c b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
new file mode 100644
index 00000000000..ec74e9eb94c
--- /dev/null
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c
@@ -0,0 +1,284 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <[email protected]>
+ */
+
+#include "util/u_memory.h"
+#include "util/u_atomic.h"
+#include "os/os_thread.h"
+#include "eglsync.h"
+#include "eglcurrent.h"
+
+#include "egl_g3d.h"
+#include "egl_g3d_sync.h"
+
+#ifdef EGL_KHR_reusable_sync
+
+/**
+ * Wait for the condition variable.  dpy->Mutex is released during the wait.
+ */
+static EGLint
+egl_g3d_wait_sync_condvar(struct egl_g3d_sync *gsync, EGLTimeKHR timeout)
+{
+   _EGLDisplay *dpy = gsync->base.Resource.Display;
+
+   pipe_mutex_lock(gsync->mutex);
+
+   /* unlock display lock just before waiting */
+   _eglUnlockMutex(&dpy->Mutex);
+
+   /* No timed wait.  Always treat timeout as EGL_FOREVER_KHR */
+   pipe_condvar_wait(gsync->condvar, gsync->mutex);
+
+   /* drop the sync mutex first: signalers take it while holding dpy->Mutex */
+   pipe_mutex_unlock(gsync->mutex);
+   _eglLockMutex(&dpy->Mutex);
+
+   return EGL_CONDITION_SATISFIED_KHR;
+}
+
+/**
+ * Signal the condition variable.
+ */
+static void
+egl_g3d_signal_sync_condvar(struct egl_g3d_sync *gsync)
+{
+   pipe_mutex_lock(gsync->mutex);
+   pipe_condvar_broadcast(gsync->condvar);
+   pipe_mutex_unlock(gsync->mutex);
+}
+
+/**
+ * Insert a fence command to the command stream of the current context.
+ */
+static EGLint
+egl_g3d_insert_fence_sync(struct egl_g3d_sync *gsync)
+{
+   _EGLContext *ctx = _eglGetCurrentContext();
+   struct egl_g3d_context *gctx = egl_g3d_context(ctx);
+
+   /* already checked in egl_g3d_create_sync */
+   assert(gctx);
+
+   /* insert the fence command */
+   gctx->stctxi->flush(gctx->stctxi, 0x0, &gsync->fence);
+   if (!gsync->fence)
+      gsync->base.SyncStatus = EGL_SIGNALED_KHR;
+
+   return EGL_SUCCESS;
+}
+
+/**
+ * Wait for the fence sync to be signaled.
+ */
+static EGLint
+egl_g3d_wait_fence_sync(struct egl_g3d_sync *gsync, EGLTimeKHR timeout)
+{
+   EGLint ret;
+
+   if (gsync->fence) {
+      _EGLDisplay *dpy = gsync->base.Resource.Display;
+      struct egl_g3d_display *gdpy = egl_g3d_display(dpy);
+      struct pipe_screen *screen = gdpy->native->screen;
+      struct pipe_fence_handle *fence = gsync->fence;
+
+      gsync->fence = NULL;
+
+      _eglUnlockMutex(&dpy->Mutex);
+      /* no timed finish? */
+      screen->fence_finish(screen, fence, 0x0);
+      ret = EGL_CONDITION_SATISFIED_KHR;
+      _eglLockMutex(&dpy->Mutex);
+
+      gsync->base.SyncStatus = EGL_SIGNALED_KHR;
+
+      screen->fence_reference(screen, &fence, NULL);
+      egl_g3d_signal_sync_condvar(gsync);
+   }
+   else {
+      ret = egl_g3d_wait_sync_condvar(gsync, timeout);
+   }
+
+   return ret;
+}
+
+static INLINE void
+egl_g3d_ref_sync(struct egl_g3d_sync *gsync)
+{
+   p_atomic_inc(&gsync->refs);
+}
+
+static INLINE void
+egl_g3d_unref_sync(struct egl_g3d_sync *gsync)
+{
+   if (p_atomic_dec_zero(&gsync->refs)) {
+      pipe_condvar_destroy(gsync->condvar);
+      pipe_mutex_destroy(gsync->mutex);
+
+      if (gsync->fence) {
+         struct egl_g3d_display *gdpy =
+            egl_g3d_display(gsync->base.Resource.Display);
+         struct pipe_screen *screen = gdpy->native->screen;
+
+         screen->fence_reference(screen, &gsync->fence, NULL);
+      }
+
+      FREE(gsync);
+   }
+}
+
+_EGLSync *
+egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
+                    EGLenum type, const EGLint *attrib_list)
+{
+   _EGLContext *ctx = _eglGetCurrentContext();
+   struct egl_g3d_sync *gsync;
+   EGLint err;
+
+   if (!ctx || ctx->Resource.Display != dpy) {
+      _eglError(EGL_BAD_MATCH, "eglCreateSyncKHR");
+      return NULL;
+   }
+
+   gsync = CALLOC_STRUCT(egl_g3d_sync);
+   if (!gsync) {
+      _eglError(EGL_BAD_ALLOC, "eglCreateSyncKHR");
+      return NULL;
+   }
+
+   if (!_eglInitSync(&gsync->base, dpy, type, attrib_list)) {
+      FREE(gsync);
+      return NULL;
+   }
+
+   switch (type) {
+   case EGL_SYNC_REUSABLE_KHR:
+      err = EGL_SUCCESS;
+      break;
+   case EGL_SYNC_FENCE_KHR:
+      err = egl_g3d_insert_fence_sync(gsync);
+      break;
+   default:
+      err = EGL_BAD_ATTRIBUTE;
+      break;
+   }
+
+   if (err != EGL_SUCCESS) {
+      _eglError(err, "eglCreateSyncKHR");
+      FREE(gsync);
+      return NULL;
+   }
+
+   pipe_mutex_init(gsync->mutex);
+   pipe_condvar_init(gsync->condvar);
+   p_atomic_set(&gsync->refs, 1);
+
+   return &gsync->base;
+}
+
+EGLBoolean
+egl_g3d_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync)
+{
+   struct egl_g3d_sync *gsync = egl_g3d_sync(sync);
+
+   switch (gsync->base.Type) {
+   case EGL_SYNC_REUSABLE_KHR:
+      /* signal the waiters */
+      if (gsync->base.SyncStatus != EGL_SIGNALED_KHR) {
+         gsync->base.SyncStatus = EGL_SIGNALED_KHR;
+         egl_g3d_signal_sync_condvar(gsync);
+      }
+      break;
+   default:
+      break;
+   }
+
+   egl_g3d_unref_sync(gsync);
+
+   return EGL_TRUE;
+}
+
+EGLint
+egl_g3d_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                         EGLint flags, EGLTimeKHR timeout)
+{
+   struct egl_g3d_sync *gsync = egl_g3d_sync(sync);
+   EGLint ret = EGL_CONDITION_SATISFIED_KHR;
+
+   if (gsync->base.SyncStatus != EGL_SIGNALED_KHR) {
+      /* flush if there is a current context */
+      if (flags & EGL_SYNC_FLUSH_COMMANDS_BIT_KHR) {
+         _EGLContext *ctx = _eglGetCurrentContext();
+         struct egl_g3d_context *gctx = egl_g3d_context(ctx);
+
+         if (gctx)
+            gctx->stctxi->flush(gctx->stctxi, PIPE_FLUSH_RENDER_CACHE, NULL);
+      }
+
+      if (timeout) {
+         /* reference the sync object in case it is destroyed while waiting */
+         egl_g3d_ref_sync(gsync);
+         switch (gsync->base.Type) {
+         case EGL_SYNC_REUSABLE_KHR:
+            ret = egl_g3d_wait_sync_condvar(gsync, timeout);
+            break;
+         case EGL_SYNC_FENCE_KHR:
+            ret = egl_g3d_wait_fence_sync(gsync, timeout);
+            break;
+         default:
+            break;
+         }
+
+         egl_g3d_unref_sync(gsync);
+      }
+      else {
+         ret = EGL_TIMEOUT_EXPIRED_KHR; /* zero timeout: report without blocking */
+      }
+   }
+
+   return ret;
+}
+
+EGLBoolean
+egl_g3d_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                    EGLenum mode)
+{
+   struct egl_g3d_sync *gsync = egl_g3d_sync(sync);
+
+   /* only for reusable sync */
+   if (sync->Type != EGL_SYNC_REUSABLE_KHR)
+      return _eglError(EGL_BAD_MATCH, "eglSignalSyncKHR");
+
+   if (gsync->base.SyncStatus != mode) {
+      gsync->base.SyncStatus = mode;
+      if (mode == EGL_SIGNALED_KHR)
+         egl_g3d_signal_sync_condvar(gsync);
+   }
+
+   return EGL_TRUE;
+}
+
+#endif /* EGL_KHR_reusable_sync */
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.h b/src/gallium/state_trackers/egl/common/egl_g3d_sync.h
new file mode 100644
index 00000000000..3179ca04e1a
--- /dev/null
+++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.h
@@ -0,0 +1,53 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  7.9
+ *
+ * Copyright (C) 2010 LunarG Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <[email protected]>
+ */
+
+#ifndef _EGL_G3D_SYNC_H_
+#define _EGL_G3D_SYNC_H_
+
+#include "egl_g3d.h"
+
+#ifdef EGL_KHR_reusable_sync
+
+_EGLSync *
+egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
+                    EGLenum type, const EGLint *attrib_list);
+
+EGLBoolean
+egl_g3d_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync);
+
+EGLint
+egl_g3d_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                         EGLint flags, EGLTimeKHR timeout);
+
+EGLBoolean
+egl_g3d_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                    EGLenum mode);
+
+#endif /* EGL_KHR_reusable_sync */
+
+#endif /* _EGL_G3D_SYNC_H_ */
diff --git a/src/gallium/targets/dri-r600/Makefile b/src/gallium/targets/dri-r600/Makefile
index 932303d194e..ff886b37f26 100644
--- a/src/gallium/targets/dri-r600/Makefile
+++ b/src/gallium/targets/dri-r600/Makefile
@@ -21,6 +21,6 @@ DRIVER_DEFINES = \
 
 include ../Makefile.dri
 
-DRI_LIB_DEPS += -ldrm_radeon
+DRI_LIB_DEPS +=
 
 symlinks:
diff --git a/src/gallium/targets/dri-r600/SConscript b/src/gallium/targets/dri-r600/SConscript
index 97c5df01fe2..64d6d2a7f6f 100644
--- a/src/gallium/targets/dri-r600/SConscript
+++ b/src/gallium/targets/dri-r600/SConscript
@@ -12,7 +12,7 @@ env.Append(CPPDEFINES = ['GALLIUM_RBUG', 'GALLIUM_TRACE'])
 
 env.Prepend(LIBS = [
     st_dri,
-    r600drm,
+    r600winsys,
     r600,
     trace,
     rbug,
diff --git a/src/gallium/targets/egl/st_GLESv1_CM.c b/src/gallium/targets/egl/st_GLESv1_CM.c
index 0c8de8992f3..c1652d5131a 100644
--- a/src/gallium/targets/egl/st_GLESv1_CM.c
+++ b/src/gallium/targets/egl/st_GLESv1_CM.c
@@ -1,3 +1,4 @@
+#include "state_tracker/st_api.h"
 #include "state_tracker/st_gl_api.h"
 
 PUBLIC struct st_api *
diff --git a/src/gallium/targets/egl/st_GLESv2.c b/src/gallium/targets/egl/st_GLESv2.c
index 87b3e65e239..9c269890089 100644
--- a/src/gallium/targets/egl/st_GLESv2.c
+++ b/src/gallium/targets/egl/st_GLESv2.c
@@ -1,3 +1,4 @@
+#include "state_tracker/st_api.h"
 #include "state_tracker/st_gl_api.h"
 
 PUBLIC struct st_api *
diff --git a/src/gallium/tests/graw/SConscript b/src/gallium/tests/graw/SConscript
index 7e39ec21a41..860a17e13e7 100644
--- a/src/gallium/tests/graw/SConscript
+++ b/src/gallium/tests/graw/SConscript
@@ -11,9 +11,12 @@ env = env.Clone()
 env.Prepend(LIBPATH = [graw.dir])
 env.Prepend(LIBS = ['graw'] + gallium)
 
-if platform == 'sunos5':
+if platform in ('freebsd8', 'sunos5'):
     env.Append(LIBS = ['m'])
 
+if platform == 'freebsd8':
+    env.Append(LIBS = ['pthread'])
+
 progs = [
     'clear',
     'tri',
diff --git a/src/gallium/tests/unit/Makefile b/src/gallium/tests/unit/Makefile
index f65958dadd5..345bd1f6941 100644
--- a/src/gallium/tests/unit/Makefile
+++ b/src/gallium/tests/unit/Makefile
@@ -22,7 +22,8 @@ SOURCES = \
 	pipe_barrier_test.c \
 	u_cache_test.c \
 	u_half_test.c \
-	u_format_test.c
+	u_format_test.c \
+	translate_test.c
 
 
 OBJECTS = $(SOURCES:.c=.o)
diff --git a/src/gallium/tests/unit/SConscript b/src/gallium/tests/unit/SConscript
index a200123f446..edc68e34d9e 100644
--- a/src/gallium/tests/unit/SConscript
+++ b/src/gallium/tests/unit/SConscript
@@ -4,14 +4,18 @@ env = env.Clone()
 
 env.Prepend(LIBS = [gallium])
 
-if platform == 'sunos5':
+if platform in ('freebsd8', 'sunos5'):
     env.Append(LIBS = ['m'])
 
+if platform == 'freebsd8':
+    env.Append(LIBS = ['pthread'])
+
 progs = [
     'pipe_barrier_test',
     'u_cache_test',
     'u_format_test',
-    'u_half_test'
+    'u_half_test',
+    'translate_test'
 ]
 
 for prog in progs:
diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c
new file mode 100644
index 00000000000..d0946a91a26
--- /dev/null
+++ b/src/gallium/tests/unit/translate_test.c
@@ -0,0 +1,310 @@
+/**************************************************************************
+ *
+ * Copyright © 2010 Luca Barbieri
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdio.h>
+#include <translate/translate.h>
+#include <util/u_memory.h>
+#include <util/u_format.h>
+#include <util/u_cpu_detect.h>
+#include <rtasm/rtasm_cpu.h>
+
+/* Test-only pseudo-random double in the unit interval, from four rand() draws. */
+static double rand_double(void)
+{
+   const double rm = (double)RAND_MAX + 1;
+   double div = 1;
+   double v = 0;
+   unsigned i;
+   /* each draw supplies the next fractional "digit" in base RAND_MAX + 1 */
+   for (i = 0; i < 4; ++i) {
+      div *= rm;
+      v += (double)rand() / div;
+   }
+   return v;
+}
+
+/* Round-trip test data through every supported format pair with the translate backend named by argv[1]. */
+int main(int argc, char** argv)
+{
+   struct translate *(*create_fn)(const struct translate_key *key) = 0;
+
+   struct translate_key key;
+   unsigned output_format, input_format;
+   unsigned buffer_size = 4096;
+   unsigned char* buffer[5];
+   unsigned char* byte_buffer;
+   float* float_buffer;
+   double* double_buffer;
+   unsigned count = 4;
+   unsigned i, j, k;
+   unsigned passed = 0, total = 0;
+   const float error = 0.03125;
+
+   util_cpu_detect();
+
+   if(argc <= 1)
+   {}
+   else if (!strcmp(argv[1], "generic"))
+      create_fn = translate_generic_create;
+   else if (!strcmp(argv[1], "x86"))
+      create_fn = translate_sse2_create;
+   else if (!strcmp(argv[1], "nosse"))
+   {
+      util_cpu_caps.has_sse = 0;
+      util_cpu_caps.has_sse2 = 0;
+      util_cpu_caps.has_sse3 = 0;
+      util_cpu_caps.has_sse4_1 = 0;
+      create_fn = translate_sse2_create;
+   }
+   else if (!strcmp(argv[1], "sse"))
+   {
+      if(!util_cpu_caps.has_sse || !rtasm_cpu_has_sse())
+      {
+         printf("Error: CPU doesn't support SSE (test with qemu)\n");
+         return 2;
+      }
+      util_cpu_caps.has_sse2 = 0;
+      util_cpu_caps.has_sse3 = 0;
+      util_cpu_caps.has_sse4_1 = 0;
+      create_fn = translate_sse2_create;
+   }
+   else if (!strcmp(argv[1], "sse2"))
+   {
+      if(!util_cpu_caps.has_sse2 || !rtasm_cpu_has_sse())
+      {
+         printf("Error: CPU doesn't support SSE2 (test with qemu)\n");
+         return 2;
+      }
+      util_cpu_caps.has_sse3 = 0;
+      util_cpu_caps.has_sse4_1 = 0;
+      create_fn = translate_sse2_create;
+   }
+   else if (!strcmp(argv[1], "sse3"))
+   {
+      if(!util_cpu_caps.has_sse3 || !rtasm_cpu_has_sse())
+      {
+         printf("Error: CPU doesn't support SSE3 (test with qemu)\n");
+         return 2;
+      }
+      util_cpu_caps.has_sse4_1 = 0;
+      create_fn = translate_sse2_create;
+   }
+   else if (!strcmp(argv[1], "sse4.1"))
+   {
+      if(!util_cpu_caps.has_sse4_1 || !rtasm_cpu_has_sse())
+      {
+         printf("Error: CPU doesn't support SSE4.1 (test with qemu)\n");
+         return 2;
+      }
+      create_fn = translate_sse2_create;
+   }
+
+   if (!create_fn)
+   {
+      printf("Usage: ./translate_test [generic|x86|nosse|sse|sse2|sse3|sse4.1]\n");
+      return 2;
+   }
+
+   for (i = 1; i < Elements(buffer); ++i)
+      buffer[i] = align_malloc(buffer_size, 4096);
+
+   byte_buffer = align_malloc(buffer_size, 4096);
+   float_buffer = align_malloc(buffer_size, 4096);
+   double_buffer = align_malloc(buffer_size, 4096);
+
+   key.nr_elements = 1;
+   key.element[0].input_buffer = 0;
+   key.element[0].input_offset = 0;
+   key.element[0].output_offset = 0;
+   key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
+   key.element[0].instance_divisor = 0;
+
+   srand(4359025);
+
+   /* avoid negative values that work badly when converted to unsigned format */
+   for (i = 0; i < buffer_size; ++i)
+      byte_buffer[i] = rand() & 0x7f7f7f7f;
+
+   for (i = 0; i < buffer_size / sizeof(float); ++i)
+      float_buffer[i] = (float)rand_double();
+
+   for (i = 0; i < buffer_size / sizeof(double); ++i)
+      double_buffer[i] = rand_double();
+
+   for (output_format = 1; output_format < PIPE_FORMAT_COUNT; ++output_format)
+   {
+      const struct util_format_description* output_format_desc = util_format_description(output_format);
+      unsigned output_format_size;
+      unsigned output_normalized = 0;
+
+      if (!output_format_desc
+            || !output_format_desc->fetch_rgba_float
+            || !output_format_desc->pack_rgba_float
+            || output_format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB
+            || output_format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN
+            || !translate_is_output_format_supported(output_format))
+         continue;
+
+      for(i = 0; i < output_format_desc->nr_channels; ++i)
+      {
+         if(output_format_desc->channel[i].type != UTIL_FORMAT_TYPE_FLOAT)
+            output_normalized |= (1 << output_format_desc->channel[i].normalized);
+      }
+
+      output_format_size = util_format_get_stride(output_format, 1);
+
+      for (input_format = 1; input_format < PIPE_FORMAT_COUNT; ++input_format)
+      {
+         const struct util_format_description* input_format_desc = util_format_description(input_format);
+         unsigned input_format_size;
+         struct translate* translate[2];
+         unsigned fail = 0;
+         unsigned used_generic = 0;
+         unsigned input_normalized = 0;
+         boolean input_is_float = FALSE;
+
+         if (!input_format_desc
+               || !input_format_desc->fetch_rgba_float
+               || !input_format_desc->pack_rgba_float
+               || input_format_desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB
+               || input_format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN
+               || !translate_is_output_format_supported(input_format))
+            continue;
+
+         input_format_size = util_format_get_stride(input_format, 1);
+
+         for(i = 0; i < input_format_desc->nr_channels; ++i)
+         {
+            if(input_format_desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT)
+            {
+               input_is_float = TRUE;
+               input_normalized |= 1 << 1;
+            }
+            else
+               input_normalized |= (1 << input_format_desc->channel[i].normalized);
+         }
+
+         if(((input_normalized | output_normalized) == 3)
+               || ((input_normalized & 1) && (output_normalized & 1)
+                     && input_format_size * output_format_desc->nr_channels > output_format_size * input_format_desc->nr_channels))
+            continue;
+
+         key.element[0].input_format = input_format;
+         key.element[0].output_format = output_format;
+         key.output_stride = output_format_size;
+         translate[0] = create_fn(&key);
+         if (!translate[0])
+            continue;
+
+         key.element[0].input_format = output_format;
+         key.element[0].output_format = input_format;
+         key.output_stride = input_format_size;
+         translate[1] = create_fn(&key);
+         if(!translate[1])
+         {
+            used_generic = 1;
+            translate[1] = translate_generic_create(&key);
+            if(!translate[1])
+            {
+               /* no reverse translator at all: release the forward one before skipping */
+               translate[0]->release(translate[0]);
+               continue;
+            }
+         }
+
+         for(i = 1; i < 5; ++i)
+            memset(buffer[i], 0xcd - (0x22 * i), buffer_size);
+
+         if(input_is_float && input_format_desc->channel[0].size == 32)
+            buffer[0] = (unsigned char*)float_buffer;
+         else if(input_is_float && input_format_desc->channel[0].size == 64)
+            buffer[0] = (unsigned char*)double_buffer;
+         else if(input_is_float)
+            abort();
+         else
+            buffer[0] = byte_buffer;
+
+         translate[0]->set_buffer(translate[0], 0, buffer[0], input_format_size, ~0);
+         translate[0]->run(translate[0], 0, count, 0, buffer[1]);
+         translate[1]->set_buffer(translate[1], 0, buffer[1], output_format_size, ~0);
+         translate[1]->run(translate[1], 0, count, 0, buffer[2]);
+         translate[0]->set_buffer(translate[0], 0, buffer[2], input_format_size, ~0);
+         translate[0]->run(translate[0], 0, count, 0, buffer[3]);
+         translate[1]->set_buffer(translate[1], 0, buffer[3], output_format_size, ~0);
+         translate[1]->run(translate[1], 0, count, 0, buffer[4]);
+
+         for (i = 0; i < count; ++i)
+         {
+            float a[4];
+            float b[4];
+            input_format_desc->fetch_rgba_float(a, buffer[2] + i * input_format_size, 0, 0);
+            input_format_desc->fetch_rgba_float(b, buffer[4] + i * input_format_size, 0, 0);
+
+            for (j = 0; j < Elements(a); ++j)
+            {
+               float d = a[j] - b[j];
+               if (d > error || d < -error)
+               {
+                  fail = 1;
+                  break;
+               }
+            }
+         }
+
+         printf("%s%s: %s -> %s -> %s -> %s -> %s\n",
+               fail ? "FAIL" : "PASS",
+               used_generic ? "[GENERIC]" : "",
+               input_format_desc->name, output_format_desc->name, input_format_desc->name, output_format_desc->name, input_format_desc->name);
+
+         if (1)
+         {
+            for (i = 0; i < Elements(buffer); ++i)
+            {
+               unsigned format_size = (i & 1) ? output_format_size : input_format_size;
+               printf("%c ", (i == 2 || i == 4) ? '*' : ' ');
+               for (j = 0; j < count; ++j)
+               {
+                  for (k = 0; k < format_size; ++k)
+                  {
+                     printf("%02x", buffer[i][j * format_size + k]);
+                  }
+                  printf(" ");
+               }
+               printf("\n");
+            }
+         }
+
+         if (!fail)
+            ++passed;
+         ++total;
+
+         translate[1]->release(translate[1]);
+         translate[0]->release(translate[0]);
+      }
+   }
+
+   printf("%u/%u tests passed for translate_%s\n", passed, total, argv[1]);
+   return passed != total;
+}
diff --git a/src/gallium/winsys/r600/drm/SConscript b/src/gallium/winsys/r600/drm/SConscript
new file mode 100644
index 00000000000..2f20d9f8957
--- /dev/null
+++ b/src/gallium/winsys/r600/drm/SConscript
@@ -0,0 +1,25 @@
+Import('*')
+
+# Work on a clone so the flags added below stay local to this library.
+env = env.Clone()
+env.ParseConfig('pkg-config --cflags libdrm_radeon')
+env.Append(CPPPATH = '#/src/gallium/drivers/r600')
+
+winsys_sources = [
+    'bof.c',
+    'r600_state.c',
+    'radeon_ctx.c',
+    'radeon_draw.c',
+    'radeon_state.c',
+    'radeon_bo.c',
+    'radeon_pciid.c',
+    'radeon.c',
+    'r600_drm.c'
+]
+
+r600winsys = env.ConvenienceLibrary(
+    target = 'r600winsys',
+    source = winsys_sources,
+)
+
+Export('r600winsys')
diff --git a/src/gallium/winsys/r600/drm/r600_drm.c b/src/gallium/winsys/r600/drm/r600_drm.c
index 9520792f54d..c76e7f5fa51 100644
--- a/src/gallium/winsys/r600/drm/r600_drm.c
+++ b/src/gallium/winsys/r600/drm/r600_drm.c
@@ -31,7 +31,6 @@
 #include "radeon_priv.h"
 #include "r600_screen.h"
 #include "r600_resource.h"
-#include "r600_public.h"
 #include "r600_drm_public.h"
 #include "state_tracker/drm_driver.h"
 
@@ -45,7 +44,7 @@ boolean r600_buffer_get_handle(struct radeon *rw,
 			       struct winsys_handle *whandle)
 {
 	struct drm_gem_flink flink;
-	struct r600_resource* rbuffer = (struct r600_buffer*)buf;
+	struct r600_resource* rbuffer = (struct r600_resource*)buf;
 
 	if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) {
 		if (!rbuffer->flink) {
diff --git a/src/gallium/winsys/r600/drm/r600_states.h b/src/gallium/winsys/r600/drm/r600_states.h
index 5896df21b21..e40c77d8f6c 100644
--- a/src/gallium/winsys/r600/drm/r600_states.h
+++ b/src/gallium/winsys/r600/drm/r600_states.h
@@ -372,6 +372,76 @@ static const struct radeon_register R600_CB0_names[] = {
 	{0x00028100, 0, 0, "CB_COLOR0_MASK"},
 };
 
+static const struct radeon_register R600_CB1_names[] = { /* CB_COLOR1_* registers (colour buffer 1) */
+	{0x00028044, 1, 0, "CB_COLOR1_BASE"},
+	{0x000280A4, 0, 0, "CB_COLOR1_INFO"},
+	{0x00028064, 0, 0, "CB_COLOR1_SIZE"},
+	{0x00028084, 0, 0, "CB_COLOR1_VIEW"},
+	{0x000280E4, 1, 1, "CB_COLOR1_FRAG"},
+	{0x000280C4, 1, 2, "CB_COLOR1_TILE"},
+	{0x00028104, 0, 0, "CB_COLOR1_MASK"},
+};
+
+static const struct radeon_register R600_CB2_names[] = { /* CB_COLOR2_* registers (colour buffer 2) */
+	{0x00028048, 1, 0, "CB_COLOR2_BASE"},
+	{0x000280A8, 0, 0, "CB_COLOR2_INFO"},
+	{0x00028068, 0, 0, "CB_COLOR2_SIZE"},
+	{0x00028088, 0, 0, "CB_COLOR2_VIEW"},
+	{0x000280E8, 1, 1, "CB_COLOR2_FRAG"},
+	{0x000280C8, 1, 2, "CB_COLOR2_TILE"},
+	{0x00028108, 0, 0, "CB_COLOR2_MASK"},
+};
+
+static const struct radeon_register R600_CB3_names[] = { /* CB_COLOR3_* registers (colour buffer 3) */
+	{0x0002804C, 1, 0, "CB_COLOR3_BASE"},
+	{0x000280AC, 0, 0, "CB_COLOR3_INFO"},
+	{0x0002806C, 0, 0, "CB_COLOR3_SIZE"},
+	{0x0002808C, 0, 0, "CB_COLOR3_VIEW"},
+	{0x000280EC, 1, 1, "CB_COLOR3_FRAG"},
+	{0x000280CC, 1, 2, "CB_COLOR3_TILE"},
+	{0x0002810C, 0, 0, "CB_COLOR3_MASK"},
+};
+
+static const struct radeon_register R600_CB4_names[] = { /* CB_COLOR4_* registers (colour buffer 4) */
+	{0x00028050, 1, 0, "CB_COLOR4_BASE"},
+	{0x000280B0, 0, 0, "CB_COLOR4_INFO"},
+	{0x00028070, 0, 0, "CB_COLOR4_SIZE"},
+	{0x00028090, 0, 0, "CB_COLOR4_VIEW"},
+	{0x000280F0, 1, 1, "CB_COLOR4_FRAG"},
+	{0x000280D0, 1, 2, "CB_COLOR4_TILE"},
+	{0x00028110, 0, 0, "CB_COLOR4_MASK"},
+};
+
+static const struct radeon_register R600_CB5_names[] = { /* CB_COLOR5_* registers (colour buffer 5) */
+	{0x00028054, 1, 0, "CB_COLOR5_BASE"},
+	{0x000280B4, 0, 0, "CB_COLOR5_INFO"},
+	{0x00028074, 0, 0, "CB_COLOR5_SIZE"},
+	{0x00028094, 0, 0, "CB_COLOR5_VIEW"},
+	{0x000280F4, 1, 1, "CB_COLOR5_FRAG"},
+	{0x000280D4, 1, 2, "CB_COLOR5_TILE"},
+	{0x00028114, 0, 0, "CB_COLOR5_MASK"},
+};
+
+static const struct radeon_register R600_CB6_names[] = { /* CB_COLOR6_* registers (colour buffer 6) */
+	{0x00028058, 1, 0, "CB_COLOR6_BASE"},
+	{0x000280B8, 0, 0, "CB_COLOR6_INFO"},
+	{0x00028078, 0, 0, "CB_COLOR6_SIZE"},
+	{0x00028098, 0, 0, "CB_COLOR6_VIEW"},
+	{0x000280F8, 1, 1, "CB_COLOR6_FRAG"},
+	{0x000280D8, 1, 2, "CB_COLOR6_TILE"},
+	{0x00028118, 0, 0, "CB_COLOR6_MASK"},
+};
+
+static const struct radeon_register R600_CB7_names[] = { /* CB_COLOR7_* registers (colour buffer 7) */
+	{0x0002805C, 1, 0, "CB_COLOR7_BASE"},
+	{0x000280BC, 0, 0, "CB_COLOR7_INFO"},
+	{0x0002807C, 0, 0, "CB_COLOR7_SIZE"},
+	{0x0002809C, 0, 0, "CB_COLOR7_VIEW"},
+	{0x000280FC, 1, 1, "CB_COLOR7_FRAG"},
+	{0x000280DC, 1, 2, "CB_COLOR7_TILE"},
+	{0x0002811C, 0, 0, "CB_COLOR7_MASK"},
+};
+
 static const struct radeon_register R600_DB_names[] = {
 	{0x0002800C, 1, 0, "DB_DEPTH_BASE"},
 	{0x00028000, 0, 0, "DB_DEPTH_SIZE"},
@@ -425,9 +495,16 @@ static struct radeon_type R600_types[] = {
 	{ 128, 1233, 0x0000A600, 0x0000A720, 0x0010, 0, "R600_VS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_VS_SAMPLER_BORDER_names},
 	{ 128, 1251, 0x0000A800, 0x0000A920, 0x0010, 0, "R600_GS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_GS_SAMPLER_BORDER_names},
 	{ 128, 1269, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB0", 7, r600_state_pm4_cb0, R600_CB0_names},
-	{ 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r600_state_pm4_db, R600_DB_names},
-	{ 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names},
-	{ 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names},
+	{ 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB1", 7, r600_state_pm4_cb0, R600_CB1_names},
+	{ 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB2", 7, r600_state_pm4_cb0, R600_CB2_names},
+	{ 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB3", 7, r600_state_pm4_cb0, R600_CB3_names},
+	{ 128, 1273, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB4", 7, r600_state_pm4_cb0, R600_CB4_names},
+	{ 128, 1274, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB5", 7, r600_state_pm4_cb0, R600_CB5_names},
+	{ 128, 1275, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB6", 7, r600_state_pm4_cb0, R600_CB6_names},
+	{ 128, 1276, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB7", 7, r600_state_pm4_cb0, R600_CB7_names},
+	{ 128, 1277, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r600_state_pm4_db, R600_DB_names},
+	{ 128, 1278, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names},
+	{ 128, 1279, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names},
 };
 
 static struct radeon_type R700_types[] = {
@@ -453,9 +530,16 @@ static struct radeon_type R700_types[] = {
 	{ 128, 1233, 0x0000A600, 0x0000A720, 0x0010, 0, "R600_VS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_VS_SAMPLER_BORDER_names},
 	{ 128, 1251, 0x0000A800, 0x0000A920, 0x0010, 0, "R600_GS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_GS_SAMPLER_BORDER_names},
 	{ 128, 1269, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB0", 7, r700_state_pm4_cb0, R600_CB0_names},
-	{ 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r700_state_pm4_db, R600_DB_names},
-	{ 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names},
-	{ 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names},
+	{ 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB1", 7, r600_state_pm4_cb0, R600_CB1_names},
+	{ 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB2", 7, r600_state_pm4_cb0, R600_CB2_names},
+	{ 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB3", 7, r600_state_pm4_cb0, R600_CB3_names},
+	{ 128, 1273, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB4", 7, r600_state_pm4_cb0, R600_CB4_names},
+	{ 128, 1274, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB5", 7, r600_state_pm4_cb0, R600_CB5_names},
+	{ 128, 1275, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB6", 7, r600_state_pm4_cb0, R600_CB6_names},
+	{ 128, 1276, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB7", 7, r600_state_pm4_cb0, R600_CB7_names},
+	{ 128, 1277, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r700_state_pm4_db, R600_DB_names},
+	{ 128, 1278, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names},
+	{ 128, 1279, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names},
 };
 
 #endif
diff --git a/src/gallium/winsys/r600/drm/radeon.c b/src/gallium/winsys/r600/drm/radeon.c
index 7e656698064..80b0a1d3972 100644
--- a/src/gallium/winsys/r600/drm/radeon.c
+++ b/src/gallium/winsys/r600/drm/radeon.c
@@ -23,7 +23,6 @@
 #include "xf86drm.h"
 #include "radeon_priv.h"
 #include "radeon_drm.h"
-#include "r600d.h"
 
 enum radeon_family radeon_get_family(struct radeon *radeon)
 {
diff --git a/src/gallium/winsys/r600/drm/radeon_bo_pb.c b/src/gallium/winsys/r600/drm/radeon_bo_pb.c
new file mode 100644
index 00000000000..e8e53a971f2
--- /dev/null
+++ b/src/gallium/winsys/r600/drm/radeon_bo_pb.c
@@ -0,0 +1,186 @@
+#include "radeon_priv.h"
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_double_list.h"
+#include "pipebuffer/pb_buffer.h"
+#include "pipebuffer/pb_bufmgr.h"
+
+struct radeon_bo_pb { /* pb_buffer wrapper around a radeon_bo */
+	struct pb_buffer b;	/* must stay first: radeon_bo_pb() casts a pb_buffer pointer to this struct */
+	struct radeon_bo *bo;	/* wrapped radeon buffer object */
+
+	struct radeon_bo_pbmgr *mgr;	/* manager that created this buffer */
+	struct list_head maplist;	/* link into mgr->buffer_map_list, added when the bo gets mapped */
+};
+
+extern const struct pb_vtbl radeon_bo_pb_vtbl;
+
+static INLINE struct radeon_bo_pb *radeon_bo_pb(struct pb_buffer *buf)
+{
+	/* Checked downcast: asserts buf is non-NULL and uses radeon_bo_pb_vtbl. */
+	assert(buf && buf->vtbl == &radeon_bo_pb_vtbl);
+	return (struct radeon_bo_pb *)buf;
+}
+
+struct radeon_bo_pbmgr { /* pb_manager whose buffers are backed by radeon_bo objects */
+	struct pb_manager b;	/* must stay first: radeon_bo_pbmgr() casts a pb_manager pointer to this struct */
+	struct radeon *radeon;	/* handle passed to the radeon_bo_* calls */
+	struct list_head buffer_map_list;	/* buffers mapped so far; drained by radeon_bo_pbmgr_flush_maps() */
+};
+
+static INLINE struct radeon_bo_pbmgr *radeon_bo_pbmgr(struct pb_manager *mgr)
+{
+	assert(mgr);
+	return (struct radeon_bo_pbmgr *)mgr;	/* downcast: the pb_manager is the struct's first member */
+}
+
+static void radeon_bo_pb_destroy(struct pb_buffer *_buf)
+{
+	struct radeon_bo_pb *rpb = radeon_bo_pb(_buf);
+	struct radeon *radeon = rpb->mgr->radeon;
+	if (rpb->bo->data) {	/* still mapped: unlink from the map list and unmap first */
+		LIST_DEL(&rpb->maplist);
+		radeon_bo_unmap(radeon, rpb->bo);
+	}
+	radeon_bo_decref(radeon, rpb->bo);
+	FREE(rpb);
+}
+
+static void *
+radeon_bo_pb_map_internal(struct pb_buffer *_buf,
+			  unsigned flags)
+{
+	struct radeon_bo_pb *rpb = radeon_bo_pb(_buf);
+	struct radeon *radeon = rpb->mgr->radeon;
+	uint32_t domain;
+
+	if (rpb->bo->data)	/* already mapped: hand back the existing pointer */
+		return rpb->bo->data;
+
+	/* Non-blocking request: bail out if the bo is reported busy. */
+	if ((flags & PB_USAGE_DONTBLOCK) &&
+	    radeon_bo_busy(radeon, rpb->bo, &domain))
+		return NULL;
+
+	if (radeon_bo_map(radeon, rpb->bo))
+		return NULL;
+	LIST_ADDTAIL(&rpb->maplist, &rpb->mgr->buffer_map_list);
+	return rpb->bo->data;
+}
+
+static void radeon_bo_pb_unmap_internal(struct pb_buffer *_buf)
+{
+	(void)_buf;	/* no-op: in this file unmapping happens only in radeon_bo_pbmgr_flush_maps() and on destroy */
+}
+
+static void
+radeon_bo_pb_get_base_buffer(struct pb_buffer *buf,
+			     struct pb_buffer **base_buf,
+			     unsigned *offset)
+{
+	*offset = 0;		/* the buffer is reported as its own base ... */
+	*base_buf = buf;	/* ... so it starts at offset zero */
+}
+
+static enum pipe_error
+radeon_bo_pb_validate(struct pb_buffer *_buf, 
+		      struct pb_validate *vl,
+		      unsigned flags)
+{
+	/* Always pinned: nothing is checked, every argument is ignored and success is reported. */
+	return PIPE_OK;
+}
+
+static void
+radeon_bo_pb_fence(struct pb_buffer *buf,
+		   struct pipe_fence_handle *fence)
+{	/* no-op: the fence is ignored */
+}
+
+const struct pb_vtbl radeon_bo_pb_vtbl = { /* positional initialisers: order must follow struct pb_vtbl's members */
+    radeon_bo_pb_destroy,
+    radeon_bo_pb_map_internal,
+    radeon_bo_pb_unmap_internal,
+    radeon_bo_pb_validate,
+    radeon_bo_pb_fence,
+    radeon_bo_pb_get_base_buffer,
+};
+
+static struct pb_buffer *
+radeon_bo_pb_create_buffer(struct pb_manager *_mgr,
+			   pb_size size,
+			   const struct pb_desc *desc)
+{
+	struct radeon_bo_pbmgr *mgr = radeon_bo_pbmgr(_mgr);
+	struct radeon *radeon = mgr->radeon;
+	struct radeon_bo_pb *bo;
+
+	/* Zeroed wrapper first; the radeon_bo itself is created further down. */
+	bo = CALLOC_STRUCT(radeon_bo_pb);
+	if (!bo)
+		goto error1;
+
+	pipe_reference_init(&bo->b.base.reference, 1);
+	bo->b.base.alignment = desc->alignment;
+	bo->b.base.usage = desc->usage;
+	bo->b.base.size = size;
+	bo->b.vtbl = &radeon_bo_pb_vtbl;
+	bo->mgr = mgr;
+
+	LIST_INITHEAD(&bo->maplist);
+
+	bo->bo = radeon_bo(radeon, 0, size,
+			   desc->alignment, NULL);
+	if (bo->bo == NULL)
+		goto error2;
+	return &bo->b;
+
+error2:
+	FREE(bo);
+error1:
+	return NULL;
+}
+
+static void
+radeon_bo_pbmgr_flush(struct pb_manager *mgr)
+{
+    /* NOP: the manager argument is ignored */
+}
+
+static void
+radeon_bo_pbmgr_destroy(struct pb_manager *_mgr)
+{
+	/* Frees the manager struct only; buffer_map_list is not walked here. */
+	FREE(radeon_bo_pbmgr(_mgr));
+}
+
+struct pb_manager *radeon_bo_pbmgr_create(struct radeon *radeon)
+{
+	struct radeon_bo_pbmgr *mgr = CALLOC_STRUCT(radeon_bo_pbmgr);
+
+	if (mgr == NULL)
+		return NULL;
+
+	mgr->radeon = radeon;
+	LIST_INITHEAD(&mgr->buffer_map_list);
+
+	/* pb_manager entry points implemented in this file */
+	mgr->b.create_buffer = radeon_bo_pb_create_buffer;
+	mgr->b.flush = radeon_bo_pbmgr_flush;
+	mgr->b.destroy = radeon_bo_pbmgr_destroy;
+	return &mgr->b;
+}
+
+void radeon_bo_pbmgr_flush_maps(struct pb_manager *_mgr)
+{
+	struct radeon_bo_pbmgr *mgr = radeon_bo_pbmgr(_mgr);
+	struct radeon_bo_pb *cur, *next;
+
+	/* unmap every listed buffer; the _SAFE walk allows unlinking as we go */
+	LIST_FOR_EACH_ENTRY_SAFE(cur, next, &mgr->buffer_map_list, maplist) {
+		radeon_bo_unmap(mgr->radeon, cur->bo);
+		LIST_DEL(&cur->maplist);
+	}
+	LIST_INITHEAD(&mgr->buffer_map_list);
+}
diff --git a/src/gallium/winsys/r600/drm/radeon_ctx.c b/src/gallium/winsys/r600/drm/radeon_ctx.c
index 6b0eba0b289..45b706bb0f9 100644
--- a/src/gallium/winsys/r600/drm/radeon_ctx.c
+++ b/src/gallium/winsys/r600/drm/radeon_ctx.c
@@ -112,6 +112,7 @@ struct radeon_ctx *radeon_ctx_decref(struct radeon_ctx *ctx)
 		ctx->bo[i] = radeon_bo_decref(ctx->radeon, ctx->bo[i]);
 	}
 	ctx->radeon = radeon_decref(ctx->radeon);
+	free(ctx->state);
 	free(ctx->draw);
 	free(ctx->bo);
 	free(ctx->pm4);
@@ -151,6 +152,8 @@ int radeon_ctx_submit(struct radeon_ctx *ctx)
 	uint64_t chunk_array[2];
 	int r = 0;
 
+	if (!ctx->cpm4)
+		return 0;
 #if 0
 	for (r = 0; r < ctx->cpm4; r++) {
 		fprintf(stderr, "0x%08X\n", ctx->pm4[r]);
diff --git a/src/gallium/winsys/r600/drm/radeon_priv.h b/src/gallium/winsys/r600/drm/radeon_priv.h
index b91421f4389..96c0d060f7e 100644
--- a/src/gallium/winsys/r600/drm/radeon_priv.h
+++ b/src/gallium/winsys/r600/drm/radeon_priv.h
@@ -68,36 +68,6 @@ extern int radeon_is_family_compatible(unsigned family1, unsigned family2);
 extern int radeon_reg_id(struct radeon *radeon, unsigned offset, unsigned *typeid, unsigned *stateid, unsigned *id);
 extern unsigned radeon_type_from_id(struct radeon *radeon, unsigned id);
 
-/*
- * radeon context functions
- */
-#pragma pack(1)
-struct radeon_cs_reloc {
-	uint32_t	handle;
-	uint32_t	read_domain;
-	uint32_t	write_domain;
-	uint32_t	flags;
-};
-#pragma pack()
-
-struct radeon_ctx {
-	int				refcount;
-	struct radeon			*radeon;
-	u32				*pm4;
-	u32				cpm4;
-	u32				draw_cpm4;
-	unsigned			id;
-	unsigned			next_id;
-	unsigned			nreloc;
-	struct radeon_cs_reloc		*reloc;
-	unsigned			nbo;
-	struct radeon_bo		**bo;
-	unsigned			ndraw;
-	struct radeon_draw		*cdraw;
-	struct radeon_draw		**draw;
-	unsigned			nstate;
-	struct radeon_state		**state;
-};
 
 int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo);
 struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc);
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm.c b/src/gallium/winsys/radeon/drm/radeon_drm.c
index ecaf096dea2..86d4f949697 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm.c
@@ -39,7 +39,6 @@
 #include "util/u_memory.h"
 
 #include "xf86drm.h"
-#include <sys/ioctl.h>
 
 static struct radeon_libdrm_winsys *
 radeon_winsys_create(int fd)
@@ -55,6 +54,31 @@ radeon_winsys_create(int fd)
     return rws;
 }
 
+/* Request that Hyper-Z access be enabled/disabled for this fd.
+ * Returns TRUE on success; always FALSE unless RADEON_HYPERZ is set. */
+static boolean radeon_set_hyperz_access(int fd, boolean enable)
+{
+#ifndef RADEON_INFO_WANT_HYPERZ
+#define RADEON_INFO_WANT_HYPERZ 7
+#endif
+
+    struct drm_radeon_info info = {0};
+    unsigned value = enable ? 1 : 0;
+
+    /* Hyper-Z is opt-in through the RADEON_HYPERZ debug option. */
+    if (!debug_get_bool_option("RADEON_HYPERZ", FALSE))
+        return FALSE;
+
+    info.request = RADEON_INFO_WANT_HYPERZ;
+    info.value = (unsigned long)&value;
+
+    if (drmCommandWriteRead(fd, DRM_RADEON_INFO, &info, sizeof(info)) != 0)
+        return FALSE;
+
+    /* an enable request only counts as granted if value is still non-zero */
+    return (enable && !value) ? FALSE : TRUE;
+}
+
 /* Helper function to do the ioctls needed for setup and init. */
 static void do_ioctls(int fd, struct radeon_libdrm_winsys* winsys)
 {
@@ -134,15 +158,7 @@ static void do_ioctls(int fd, struct radeon_libdrm_winsys* winsys)
     }
     winsys->z_pipes = target;
 
-    winsys->hyperz = FALSE;
-#ifndef RADEON_INFO_WANT_HYPERZ
-#define RADEON_INFO_WANT_HYPERZ 7
-#endif
-    info.request = RADEON_INFO_WANT_HYPERZ;
-    retval = drmCommandWriteRead(fd, DRM_RADEON_INFO, &info, sizeof(info));
-    if (!retval && target == 1) {
-        winsys->hyperz = TRUE;
-    }
+    winsys->hyperz = radeon_set_hyperz_access(fd, TRUE);
 
     retval = drmCommandWriteRead(fd, DRM_RADEON_GEM_INFO,
             &gem_info, sizeof(gem_info));
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
index 1b0d10f60d6..7bd4407e9f1 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
@@ -32,8 +32,6 @@
 
 #include "vmw_screen.h"
 #include "vmw_surface.h"
-#include "vmw_fence.h"
-#include "vmw_context.h"
 #include "svga_drm_public.h"
 
 #include "state_tracker/drm_driver.h"
diff --git a/src/glsl/apps/compile.c b/src/glsl/apps/compile.c
index 3aa4fd4d53e..5114fc9d0be 100644
--- a/src/glsl/apps/compile.c
+++ b/src/glsl/apps/compile.c
@@ -30,6 +30,7 @@
 #include <string.h>
 #include <assert.h>
 #include "../pp/sl_pp_public.h"
+#include "../pp/sl_pp_purify.h"
 #include "../cl/sl_cl_parse.h"
 
 
diff --git a/src/glsl/apps/process.c b/src/glsl/apps/process.c
index caf72a71cf1..6d5ce6eea3f 100644
--- a/src/glsl/apps/process.c
+++ b/src/glsl/apps/process.c
@@ -30,6 +30,8 @@
 #include <string.h>
 #include <assert.h>
 #include "../pp/sl_pp_public.h"
+#include "../pp/sl_pp_purify.h"
+#include "../pp/sl_pp_token.h"
 
 
 int
diff --git a/src/glsl/apps/purify.c b/src/glsl/apps/purify.c
index 0f09b157efd..e3fca59ab45 100644
--- a/src/glsl/apps/purify.c
+++ b/src/glsl/apps/purify.c
@@ -30,6 +30,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "../pp/sl_pp_public.h"
+#include "../pp/sl_pp_purify.h"
 
 
 int
diff --git a/src/glsl/apps/tokenise.c b/src/glsl/apps/tokenise.c
index f89f47d0611..3d68334bed3 100644
--- a/src/glsl/apps/tokenise.c
+++ b/src/glsl/apps/tokenise.c
@@ -30,6 +30,8 @@
 #include <string.h>
 #include <assert.h>
 #include "../pp/sl_pp_public.h"
+#include "../pp/sl_pp_purify.h"
+#include "../pp/sl_pp_token.h"
 
 
 int
diff --git a/src/glsl/apps/version.c b/src/glsl/apps/version.c
index fa5c226da83..8506f35ba11 100644
--- a/src/glsl/apps/version.c
+++ b/src/glsl/apps/version.c
@@ -30,6 +30,7 @@
 #include <string.h>
 #include <assert.h>
 #include "../pp/sl_pp_public.h"
+#include "../pp/sl_pp_purify.h"
 
 
 int
diff --git a/src/glsl/cl/sl_cl_parse.c b/src/glsl/cl/sl_cl_parse.c
index 09456f5219a..c1bc6031ce6 100644
--- a/src/glsl/cl/sl_cl_parse.c
+++ b/src/glsl/cl/sl_cl_parse.c
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "../pp/sl_pp_public.h"
+#include "../pp/sl_pp_token.h"
 #include "sl_cl_parse.h"
 
 
diff --git a/src/glsl/cl/sl_cl_parse.h b/src/glsl/cl/sl_cl_parse.h
index dd5791d5901..a954d439276 100644
--- a/src/glsl/cl/sl_cl_parse.h
+++ b/src/glsl/cl/sl_cl_parse.h
@@ -28,6 +28,8 @@
 #ifndef SL_CL_PARSE_H
 #define SL_CL_PARSE_H
 
+struct sl_pp_context;
+
 int
 sl_cl_compile(struct sl_pp_context *context,
               unsigned int shader_type,
diff --git a/src/glsl/pp/sl_pp_context.c b/src/glsl/pp/sl_pp_context.c
index 74a9bdddfdc..b8e1e99fc86 100644
--- a/src/glsl/pp/sl_pp_context.c
+++ b/src/glsl/pp/sl_pp_context.c
@@ -27,6 +27,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include "sl_pp_macro.h"
 #include "sl_pp_public.h"
 #include "sl_pp_context.h"
 
diff --git a/src/glsl/pp/sl_pp_context.h b/src/glsl/pp/sl_pp_context.h
index 8abb9708b85..e6244f62575 100644
--- a/src/glsl/pp/sl_pp_context.h
+++ b/src/glsl/pp/sl_pp_context.h
@@ -29,7 +29,6 @@
 #define SL_PP_CONTEXT_H
 
 #include "sl_pp_dict.h"
-#include "sl_pp_macro.h"
 #include "sl_pp_process.h"
 #include "sl_pp_purify.h"
 #include "sl_pp_token_util.h"
diff --git a/src/glsl/pp/sl_pp_define.c b/src/glsl/pp/sl_pp_define.c
index 808a6a0d4f1..370e6aa6606 100644
--- a/src/glsl/pp/sl_pp_define.c
+++ b/src/glsl/pp/sl_pp_define.c
@@ -28,8 +28,10 @@
 #include <stdlib.h>
 #include <string.h>
 #include "sl_pp_context.h"
+#include "sl_pp_macro.h"
 #include "sl_pp_process.h"
 #include "sl_pp_public.h"
+#include "sl_pp_token.h"
 
 
 static void
diff --git a/src/glsl/pp/sl_pp_error.c b/src/glsl/pp/sl_pp_error.c
index b628e37ce83..482b67fcafb 100644
--- a/src/glsl/pp/sl_pp_error.c
+++ b/src/glsl/pp/sl_pp_error.c
@@ -30,6 +30,7 @@
 #include "sl_pp_context.h"
 #include "sl_pp_process.h"
 #include "sl_pp_public.h"
+#include "sl_pp_token.h"
 
 
 void
diff --git a/src/glsl/pp/sl_pp_expression.c b/src/glsl/pp/sl_pp_expression.c
index ec904787dd7..c3f48356b09 100644
--- a/src/glsl/pp/sl_pp_expression.c
+++ b/src/glsl/pp/sl_pp_expression.c
@@ -27,8 +27,10 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include "sl_pp_context.h"
 #include "sl_pp_expression.h"
 #include "sl_pp_public.h"
+#include "sl_pp_token.h"
 
 
 struct parse_context {
diff --git a/src/glsl/pp/sl_pp_expression.h b/src/glsl/pp/sl_pp_expression.h
index 377d5b4cbd9..522263bb259 100644
--- a/src/glsl/pp/sl_pp_expression.h
+++ b/src/glsl/pp/sl_pp_expression.h
@@ -28,8 +28,8 @@
 #ifndef SL_PP_EXPRESSION_H
 #define SL_PP_EXPRESSION_H
 
-#include "sl_pp_context.h"
-#include "sl_pp_token.h"
+struct sl_pp_context;
+struct sl_pp_token_info;
 
 
 int
diff --git a/src/glsl/pp/sl_pp_extension.c b/src/glsl/pp/sl_pp_extension.c
index d119677c268..00dbdcf22bc 100644
--- a/src/glsl/pp/sl_pp_extension.c
+++ b/src/glsl/pp/sl_pp_extension.c
@@ -25,11 +25,13 @@
  * 
  **************************************************************************/
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include "sl_pp_context.h"
 #include "sl_pp_process.h"
 #include "sl_pp_public.h"
+#include "sl_pp_token.h"
 
 
 /**
diff --git a/src/glsl/pp/sl_pp_if.c b/src/glsl/pp/sl_pp_if.c
index 25cb7a3ca11..6b7a1590b42 100644
--- a/src/glsl/pp/sl_pp_if.c
+++ b/src/glsl/pp/sl_pp_if.c
@@ -27,8 +27,11 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include "sl_pp_context.h"
 #include "sl_pp_expression.h"
+#include "sl_pp_macro.h"
 #include "sl_pp_process.h"
+#include "sl_pp_token.h"
 
 
 static int
diff --git a/src/glsl/pp/sl_pp_line.c b/src/glsl/pp/sl_pp_line.c
index 6f7e9eb562c..51581c7bb59 100644
--- a/src/glsl/pp/sl_pp_line.c
+++ b/src/glsl/pp/sl_pp_line.c
@@ -28,8 +28,10 @@
 #include <stdlib.h>
 #include <string.h>
 #include "sl_pp_context.h"
+#include "sl_pp_macro.h"
 #include "sl_pp_public.h"
 #include "sl_pp_process.h"
+#include "sl_pp_token.h"
 
 
 int
diff --git a/src/glsl/pp/sl_pp_macro.c b/src/glsl/pp/sl_pp_macro.c
index 9f520b8fc53..2cf9ea342b2 100644
--- a/src/glsl/pp/sl_pp_macro.c
+++ b/src/glsl/pp/sl_pp_macro.c
@@ -32,6 +32,7 @@
 #include "sl_pp_public.h"
 #include "sl_pp_macro.h"
 #include "sl_pp_process.h"
+#include "sl_pp_token.h"
 
 
 static void
diff --git a/src/glsl/pp/sl_pp_macro.h b/src/glsl/pp/sl_pp_macro.h
index 1d210681091..6e65a0a588d 100644
--- a/src/glsl/pp/sl_pp_macro.h
+++ b/src/glsl/pp/sl_pp_macro.h
@@ -28,9 +28,6 @@
 #ifndef SL_PP_MACRO_H
 #define SL_PP_MACRO_H
 
-#include "sl_pp_token.h"
-
-
 struct sl_pp_context;
 struct sl_pp_process_state;
 struct sl_pp_token_buffer;
diff --git a/src/glsl/pp/sl_pp_pragma.c b/src/glsl/pp/sl_pp_pragma.c
index caf4c63f657..6789704db0c 100644
--- a/src/glsl/pp/sl_pp_pragma.c
+++ b/src/glsl/pp/sl_pp_pragma.c
@@ -29,6 +29,7 @@
 #include <string.h>
 #include "sl_pp_context.h"
 #include "sl_pp_process.h"
+#include "sl_pp_token.h"
 
 
 int
diff --git a/src/glsl/pp/sl_pp_process.c b/src/glsl/pp/sl_pp_process.c
index 315ad9bf1cd..2f12393237c 100644
--- a/src/glsl/pp/sl_pp_process.c
+++ b/src/glsl/pp/sl_pp_process.c
@@ -25,11 +25,14 @@
  * 
  **************************************************************************/
 
+#include <assert.h>
 #include <stdlib.h>
 #include <string.h>
 #include "sl_pp_context.h"
+#include "sl_pp_macro.h"
 #include "sl_pp_process.h"
 #include "sl_pp_public.h"
+#include "sl_pp_token.h"
 
 
 int
diff --git a/src/glsl/pp/sl_pp_process.h b/src/glsl/pp/sl_pp_process.h
index fe6ff0d4648..04e9be43989 100644
--- a/src/glsl/pp/sl_pp_process.h
+++ b/src/glsl/pp/sl_pp_process.h
@@ -28,11 +28,8 @@
 #ifndef SL_PP_PROCESS_H
 #define SL_PP_PROCESS_H
 
-#include "sl_pp_macro.h"
-#include "sl_pp_token.h"
-
-
 struct sl_pp_context;
+struct sl_pp_token_buffer;
 
 struct sl_pp_process_state {
    struct sl_pp_token_info *out;
diff --git a/src/glsl/pp/sl_pp_public.h b/src/glsl/pp/sl_pp_public.h
index ca6f722543d..66ced6cf589 100644
--- a/src/glsl/pp/sl_pp_public.h
+++ b/src/glsl/pp/sl_pp_public.h
@@ -28,13 +28,9 @@
 #ifndef SL_PP_PUBLIC_H
 #define SL_PP_PUBLIC_H
 
-
 struct sl_pp_context;
-
-
-#include "sl_pp_purify.h"
-#include "sl_pp_token.h"
-
+struct sl_pp_purify_options;
+struct sl_pp_token_info;
 
 struct sl_pp_context *
 sl_pp_context_create(const char *input,
diff --git a/src/glsl/pp/sl_pp_token_util.c b/src/glsl/pp/sl_pp_token_util.c
index c85263d9a11..43be49670b0 100644
--- a/src/glsl/pp/sl_pp_token_util.c
+++ b/src/glsl/pp/sl_pp_token_util.c
@@ -28,6 +28,7 @@
 #include <assert.h>
 #include <stdlib.h>
 #include "sl_pp_token_util.h"
+#include "sl_pp_token.h"
 
 
 int
diff --git a/src/glsl/pp/sl_pp_token_util.h b/src/glsl/pp/sl_pp_token_util.h
index 2a668ad0a84..35d72101ca5 100644
--- a/src/glsl/pp/sl_pp_token_util.h
+++ b/src/glsl/pp/sl_pp_token_util.h
@@ -28,11 +28,6 @@
 #ifndef SL_PP_TOKEN_UTIL_H
 #define SL_PP_TOKEN_UTIL_H
 
-#include <assert.h>
-#include <stdlib.h>
-#include "sl_pp_token.h"
-
-
 struct sl_pp_context;
 
 /*
diff --git a/src/glsl/pp/sl_pp_version.c b/src/glsl/pp/sl_pp_version.c
index 3c995b77501..6735c05e8ae 100644
--- a/src/glsl/pp/sl_pp_version.c
+++ b/src/glsl/pp/sl_pp_version.c
@@ -29,6 +29,7 @@
 #include <string.h>
 #include "sl_pp_public.h"
 #include "sl_pp_context.h"
+#include "sl_pp_token.h"
 
 
 int
diff --git a/src/glut/glx/glut_dstr.c b/src/glut/glx/glut_dstr.c
index 2513af45394..319930c4b11 100644
--- a/src/glut/glx/glut_dstr.c
+++ b/src/glut/glx/glut_dstr.c
@@ -232,7 +232,7 @@ loadVisuals(int *nitems_return)
   XVisualInfo *vinfo, **vlist, template;
   FrameBufferMode *fbmodes, *mode;
   int n, i, j, rc, glcapable;
-#if defined(GLX_VERSION_1_1) && defined(GLX_SGIS_multisample)
+#if defined(GLX_VERSION_1_1) && (defined(GLX_SGIS_multisample) || defined(GLX_ARB_multisample))
   int multisample;
 #endif
 #if defined(GLX_VERSION_1_1) && defined(GLX_EXT_visual_info)
@@ -275,8 +275,9 @@ loadVisuals(int *nitems_return)
     }
   }
 
-#if defined(GLX_VERSION_1_1) && defined(GLX_SGIS_multisample)
-  multisample = __glutIsSupportedByGLX("GLX_SGIS_multisample");
+#if defined(GLX_VERSION_1_1) && (defined(GLX_SGIS_multisample) || defined(GLX_ARB_multisample))
+  multisample = __glutIsSupportedByGLX("GLX_SGIS_multisample") ||
+                __glutIsSupportedByGLX("GLX_ARB_multisample");
 #endif
 #if defined(GLX_VERSION_1_1) && defined(GLX_EXT_visual_info)
   visual_info = __glutIsSupportedByGLX("GLX_EXT_visual_info");
@@ -572,7 +573,7 @@ loadVisuals(int *nitems_return)
 #else
                 mode->cap[TRANSPARENT] = 0;
 #endif
-#if defined(GLX_VERSION_1_1) && defined(GLX_SGIS_multisample)
+#if defined(GLX_VERSION_1_1) && (defined(GLX_SGIS_multisample) || defined(GLX_ARB_multisample))
                 if (multisample) {
                   rc = __glut_glXGetFBConfigAttribSGIX(__glutDisplay,
 		    fbc, GLX_SAMPLES_SGIS, &mode->cap[SAMPLES]);
@@ -1250,8 +1251,9 @@ parseModeString(char *mode, int *ncriteria, Bool * allowDoubleAsSingle,
     word = strtok(NULL, " \t");
   }
 
-#if defined(GLX_VERSION_1_1) && defined(GLX_SGIS_multisample)
-  if (__glutIsSupportedByGLX("GLX_SGIS_multisample")) {
+#if defined(GLX_VERSION_1_1) && (defined(GLX_SGIS_multisample) || defined(GLX_ARB_multisample))
+  if (__glutIsSupportedByGLX("GLX_SGIS_multisample") ||
+      __glutIsSupportedByGLX("GLX_ARB_multisample")) {
     if (!(mask & (1 << SAMPLES))) {
       criteria[n].capability = SAMPLES;
       criteria[n].comparison = EQ;
diff --git a/src/glut/glx/glut_overlay.c b/src/glut/glx/glut_overlay.c
index 32434650ebe..ce8e68d164a 100644
--- a/src/glut/glx/glut_overlay.c
+++ b/src/glut/glx/glut_overlay.c
@@ -81,10 +81,11 @@ checkOverlayAcceptability(XVisualInfo * vi, unsigned int mode)
   if (GLUT_WIND_HAS_STENCIL(mode) && (value <= 0))
     return 1;
 
-#if defined(GLX_VERSION_1_1) && defined(GLX_SGIS_multisample)
+#if defined(GLX_VERSION_1_1) && (defined(GLX_SGIS_multisample) || defined(GLX_ARB_multisample))
   /* XXX Multisampled overlay color index??  Pretty unlikely. */
   /* Look for multisampling if requested. */
-  if (__glutIsSupportedByGLX("GLX_SGIS_multisample"))
+  if (__glutIsSupportedByGLX("GLX_SGIS_multisample") ||
+      __glutIsSupportedByGLX("GLX_ARB_multisample"))
     glXGetConfig(__glutDisplay, vi, GLX_SAMPLES_SGIS, &value);
   else
     value = 0;
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index 5200a244f26..5c02abc914f 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -82,6 +82,7 @@ if env['platform'] != 'winddk':
 		'main/pixelstore.c',
 		'main/points.c',
 		'main/polygon.c',
+		'main/querymatrix.c',
 		'main/queryobj.c',
 		'main/rastpos.c',
 		'main/readpix.c',
diff --git a/src/mesa/drivers/dri/i810/i810render.c b/src/mesa/drivers/dri/i810/i810render.c
index b543d4f012c..205f0cebc1c 100644
--- a/src/mesa/drivers/dri/i810/i810render.c
+++ b/src/mesa/drivers/dri/i810/i810render.c
@@ -37,6 +37,8 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "i810screen.h"
diff --git a/src/mesa/drivers/dri/i915/intel_render.c b/src/mesa/drivers/dri/i915/intel_render.c
index ec209391ab4..add0adacb56 100644
--- a/src/mesa/drivers/dri/i915/intel_render.c
+++ b/src/mesa/drivers/dri/i915/intel_render.c
@@ -37,6 +37,8 @@
 #include "main/mtypes.h"
 #include "main/enums.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
 #include "tnl/t_pipeline.h"
diff --git a/src/mesa/drivers/dri/mga/mgarender.c b/src/mesa/drivers/dri/mga/mgarender.c
index 8b8fc485d31..cc0cea618d1 100644
--- a/src/mesa/drivers/dri/mga/mgarender.c
+++ b/src/mesa/drivers/dri/mga/mgarender.c
@@ -44,6 +44,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "mgacontext.h"
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_driver.c b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
index 4ec864c181c..6452fe218e5 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_driver.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_driver.c
@@ -138,5 +138,7 @@ nouveau_driver_functions_init(struct dd_function_table *functions)
 	functions->DrawPixels = _mesa_meta_DrawPixels;
 	functions->CopyPixels = _mesa_meta_CopyPixels;
 	functions->Bitmap = _mesa_meta_Bitmap;
+#if FEATURE_EXT_framebuffer_blit
 	functions->BlitFramebuffer = _mesa_meta_BlitFramebuffer;
+#endif
 }
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
index bd1273beea7..32d8f2d0f9b 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
@@ -262,10 +262,12 @@ nouveau_finish_render_texture(GLcontext *ctx,
 void
 nouveau_fbo_functions_init(struct dd_function_table *functions)
 {
+#if FEATURE_EXT_framebuffer_object
 	functions->NewFramebuffer = nouveau_framebuffer_new;
 	functions->NewRenderbuffer = nouveau_renderbuffer_new;
 	functions->BindFramebuffer = nouveau_bind_framebuffer;
 	functions->FramebufferRenderbuffer = nouveau_framebuffer_renderbuffer;
 	functions->RenderTexture = nouveau_render_texture;
 	functions->FinishRenderTexture = nouveau_finish_render_texture;
+#endif
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index c6246a81a24..d2fa816894c 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -113,7 +113,7 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		debug_program_log(c, "after unroll loops");
 	}
 	else{
-		rc_transform_loops(&c->Base, &loop_state);
+		rc_transform_loops(&c->Base, &loop_state, -1);
 		debug_program_log(c, "after transform loops");
 
 		rc_emulate_branches(&c->Base);
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index e940fedec20..666c9c2a7a9 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -32,6 +32,11 @@
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
 
+/* Bookkeeping for one open loop while vertex program code is emitted. */
+struct loop {
+	int BgnLoop;	/* set at BGNLOOP to code->length / 4 */
+};
+
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
  * obtain a constant ZERO or ONE source.
@@ -332,11 +337,140 @@ static void ei_pow(struct r300_vertex_program_code *vp,
 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
+/* Write callback for rc_for_all_writes_mask(): accumulate, per temporary
+ * register, the components that the visited instruction writes.
+ * userdata must point to an unsigned int[R300_VS_MAX_TEMPS] array. */
+static void mark_write(void * userdata,	struct rc_instruction * inst,
+		rc_register_file file,	unsigned int index, unsigned int mask)
+{
+	unsigned int * written = userdata;
+
+	/* Ignore non-temporaries and indices outside the tracked range. */
+	if (file == RC_FILE_TEMPORARY && index < R300_VS_MAX_TEMPS) {
+		written[index] |= mask;
+	}
+}
+
+/* Source operand reading the predicate temporary with swizzle (0,0,0,W). */
+static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
+{
+	const int pred_index = compiler->PredicateIndex;
+
+	return PVS_SRC_OPERAND(pred_index,
+		t_swizzle(RC_SWIZZLE_ZERO), t_swizzle(RC_SWIZZLE_ZERO),
+		t_swizzle(RC_SWIZZLE_ZERO), t_swizzle(RC_SWIZZLE_W),
+		t_src_class(RC_FILE_TEMPORARY), 0);
+}
+
+/* Build the inst[0] word of a predicate instruction: hw_opcode writing only
+ * the W component of temporary compiler->PredicateIndex. */
+static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
+					unsigned int hw_opcode, int is_math)
+{
+	const int pred_index = compiler->PredicateIndex;
+
+	return PVS_OP_DST_OPERAND(hw_opcode, is_math, 0,
+				  pred_index, RC_MASK_W,
+				  t_dst_class(RC_FILE_TEMPORARY));
+}
+
+/* Emit the predicate-set instruction for an IF (r500 only): ME_PRED_SET_NEQ
+ * at branch_depth 0, VE_PRED_SET_NEQ_PUSH when nested.  On first use this
+ * reserves a temporary whose W component no instruction writes. */
+static void ei_if(struct r300_vertex_program_compiler * compiler,
+					struct rc_instruction *rci,
+					unsigned int * inst,
+					unsigned int branch_depth)
+{
+	unsigned int predicate_opcode;
+	int is_math = 0;
+
+	if (!compiler->Base.is_r500) {
+		rc_error(&compiler->Base,"Opcode IF not supported\n");
+		return;
+	}
+
+	/* Reserve a temporary to use as our predicate stack counter, if we
+	 * don't already have one. */
+	if (!compiler->PredicateMask) {
+		unsigned int writemasks[R300_VS_MAX_TEMPS];
+		struct rc_instruction * scan;
+		unsigned int i;
+		memset(writemasks, 0, sizeof(writemasks));
+		for(scan = compiler->Base.Program.Instructions.Next;
+				scan != &compiler->Base.Program.Instructions;
+							scan = scan->Next) {
+			rc_for_all_writes_mask(scan, mark_write, writemasks);
+		}
+		for(i = 0; i < R300_VS_MAX_TEMPS; i++) {
+			unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
+			/* Only the W component can be used for the predicate
+			 * stack counter. */
+			if (mask & RC_MASK_W) {
+				compiler->PredicateMask = RC_MASK_W;
+				compiler->PredicateIndex = i;
+				break;
+			}
+		}
+		if (i == R300_VS_MAX_TEMPS) {
+			rc_error(&compiler->Base, "No free temporary to use for"
+					" predicate stack counter.\n");
+			return;
+		}
+	}
+
+	rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
+	if (branch_depth == 0) {
+		is_math = 1;
+		predicate_opcode = ME_PRED_SET_NEQ;
+		inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
+		inst[2] = 0;
+	} else {
+		predicate_opcode = VE_PRED_SET_NEQ_PUSH;
+		inst[1] = t_pred_src(compiler);
+		inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
+	}
+
+	inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
+	inst[3] = 0;
+}
+
+/* Emit ME_PRED_SET_INV on the predicate register for ELSE (r500 only). */
+static void ei_else(struct r300_vertex_program_compiler * compiler,
+							unsigned int * inst)
+{
+	if (compiler->Base.is_r500) {
+		inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
+		inst[1] = t_pred_src(compiler);
+		inst[2] = inst[3] = 0;
+		return;
+	}
+	rc_error(&compiler->Base,"Opcode ELSE not supported\n");
+}
+
+/* Emit ME_PRED_SET_POP on the predicate register for ENDIF (r500 only). */
+static void ei_endif(struct r300_vertex_program_compiler *compiler,
+							unsigned int * inst)
+{
+	if (compiler->Base.is_r500) {
+		inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
+		inst[1] = t_pred_src(compiler);
+		inst[2] = inst[3] = 0;
+		return;
+	}
+	rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
+}
 
 static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
 {
 	struct rc_instruction *rci;
 
+	struct loop * loops;
+	int current_loop_depth = 0;
+	int loops_reserved = 0;
+
+	unsigned int branch_depth = 0;
+
 	compiler->code->pos_end = 0;	/* Not supported yet */
 	compiler->code->length = 0;
 
@@ -366,9 +500,12 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+		case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
+		case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+		case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
@@ -385,11 +522,86 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
+		case RC_OPCODE_BGNLOOP:
+		{
+			struct loop * l;
+
+			if ((!compiler->Base.is_r500
+				&& loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
+				|| loops_reserved >= R500_VS_MAX_FC_DEPTH) {
+				rc_error(&compiler->Base,
+						"Loops are nested too deep.");
+				return;
+			}
+			memory_pool_array_reserve(&compiler->Base.Pool,
+					struct loop, loops, current_loop_depth,
+					loops_reserved, 1);
+			l = &loops[current_loop_depth++];
+			memset(l , 0, sizeof(struct loop));
+			l->BgnLoop = (compiler->code->length / 4);
+			continue;
+		}
+		case RC_OPCODE_ENDLOOP:
+		{
+			struct loop * l = &loops[current_loop_depth - 1];
+			unsigned int act_addr = l->BgnLoop - 1;
+			unsigned int last_addr = (compiler->code->length / 4) - 1;
+			unsigned int ret_addr = l->BgnLoop;
+
+			if (loops_reserved >= R300_VS_MAX_FC_OPS) {
+				rc_error(&compiler->Base,
+					"Too many flow control instructions.");
+				return;
+			}
+			if (compiler->Base.is_r500) {
+				compiler->code->fc_op_addrs.r500
+					[compiler->code->num_fc_ops].lw =
+					R500_PVS_FC_ACT_ADRS(act_addr)
+					| R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+					;
+				compiler->code->fc_op_addrs.r500
+					[compiler->code->num_fc_ops].uw =
+					R500_PVS_FC_LAST_INST(last_addr)
+					| R500_PVS_FC_RTN_INST(ret_addr)
+					;
+			} else {
+				compiler->code->fc_op_addrs.r300
+					[compiler->code->num_fc_ops] =
+					R300_PVS_FC_ACT_ADRS(act_addr)
+					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
+					| R300_PVS_FC_LAST_INST(last_addr)
+					| R300_PVS_FC_RTN_INST(ret_addr)
+					;
+			}
+			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
+				R300_PVS_FC_LOOP_INIT_VAL(0x0)
+				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
+				;
+			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
+						compiler->code->num_fc_ops);
+			compiler->code->num_fc_ops++;
+			current_loop_depth--;
+			continue;
+		}
+
 		default:
 			rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
 			return;
 		}
 
+		/* Non-flow control instructions that are inside an if statement
+		 * need to pay attention to the predicate bit. */
+		if (branch_depth
+			&& vpi->Opcode != RC_OPCODE_IF
+			&& vpi->Opcode != RC_OPCODE_ELSE
+			&& vpi->Opcode != RC_OPCODE_ENDIF) {
+
+			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
+						<< PVS_DST_PRED_ENABLE_SHIFT);
+			inst[0] |= (PVS_DST_PRED_SENSE_MASK
+						<< PVS_DST_PRED_SENSE_SHIFT);
+		}
+
 		compiler->code->length += 4;
 
 		if (compiler->Base.Error)
@@ -406,6 +618,7 @@ struct temporary_allocation {
 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
 {
 	struct rc_instruction *inst;
+	struct rc_instruction *end_loop = NULL;
 	unsigned int num_orig_temps = 0;
 	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
@@ -440,10 +653,35 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 	/* Pass 2: Determine original temporary lifetimes */
 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		/* Instructions inside of loops need to use the ENDLOOP
+		 * instruction as their LastRead. */
+		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+			int endloops = 1;
+			struct rc_instruction * ptr;
+			for(ptr = inst->Next;
+				ptr != &compiler->Base.Program.Instructions;
+							ptr = ptr->Next){
+				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					endloops++;
+				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+					endloops--;
+					if (endloops <= 0) {
+						end_loop = ptr;
+						break;
+					}
+				}
+			}
+		}
+
+		if (inst == end_loop) {
+			end_loop = NULL;
+			continue;
+		}
 
 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
-				ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
+				ta[inst->U.I.SrcReg[i].Index].LastRead =
+						end_loop ? end_loop : inst;
 		}
 	}
 
@@ -640,22 +878,17 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
 	debug_program_log(compiler, "before compilation");
 
-	/* XXX Ideally this should be done only for r3xx, but since
-	 * we don't have branching support for r5xx, we use the emulation
-	 * on all chipsets. */
+	if (compiler->Base.is_r500)
+		rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU);
+	else
+		rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU);
 
-	if (compiler->Base.is_r500){
-		rc_transform_loops(&compiler->Base, &loop_state);
-		rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
-	} else {
-		rc_transform_loops(&compiler->Base, &loop_state);
-		rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
-	}
 	debug_program_log(compiler, "after emulate loops");
 
-	rc_emulate_branches(&compiler->Base);
-
-	debug_program_log(compiler, "after emulate branches");
+	if (!compiler->Base.is_r500) {
+		rc_emulate_branches(&compiler->Base);
+		debug_program_log(compiler, "after emulate branches");
+	}
 
 	if (compiler->Base.is_r500) {
 		struct radeon_program_transformation transformations[] = {
@@ -717,6 +950,6 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
 	if (compiler->Base.Debug) {
 		fprintf(stderr, "Final vertex program code:\n");
-		r300_vertex_program_dump(compiler->code);
+		r300_vertex_program_dump(compiler);
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
index 5800f1a78e1..e6009338e2e 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
@@ -20,7 +20,9 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "radeon_compiler.h"
 #include "radeon_code.h"
+#include "../r300_reg.h"
 
 #include <stdio.h>
 
@@ -133,6 +135,10 @@ static void r300_vs_op_dump(uint32_t op)
 {
 	fprintf(stderr, " dst: %d%s op: ",
 			(op >> 13) & 0x7f, r300_vs_dst_debug[(op >> 8) & 0x7]);
+	if ((op >> PVS_DST_PRED_ENABLE_SHIFT) & 0x1) {
+		fprintf(stderr, "PRED %u",
+				(op >> PVS_DST_PRED_SENSE_SHIFT) & 0x1);
+	}
 	if (op & 0x80) {
 		if (op & 0x1) {
 			fprintf(stderr, "PVS_MACRO_OP_2CLK_M2X_ADD\n");
@@ -160,8 +166,9 @@ static void r300_vs_src_dump(uint32_t src)
 			r300_vs_swiz_debug[(src >> 22) & 0x7]);
 }
 
-void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
+void r300_vertex_program_dump(struct r300_vertex_program_compiler * c)
 {
+	struct r300_vertex_program_code * vs = c->code;
 	unsigned instrcount = vs->length / 4;
 	unsigned i;
 
@@ -177,4 +184,21 @@ void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
 			r300_vs_src_dump(vs->body.d[offset+1+src]);
 		}
 	}
+
+	fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops);
+	for(i = 0; i < vs->num_fc_ops; i++) {
+		switch((vs->fc_ops >> (i * 2)) & 0x3 ) {
+		case 0: fprintf(stderr, "NOP"); break;
+		case 1: fprintf(stderr, "JUMP"); break;
+		case 2: fprintf(stderr, "LOOP"); break;
+		case 3: fprintf(stderr, "JSR"); break;
+		}
+		if (c->Base.is_r500) {
+			fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n",
+				vs->fc_op_addrs.r500[i].uw,
+				vs->fc_op_addrs.r500[i].lw);
+		} else {
+			fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]);
+		}
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index c3f817ad4e8..9b60e30f586 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -70,6 +70,10 @@ struct loop_info {
 	int * Brks;
 	int BrkCount;
 	int BrkReserved;
+
+	int * Conts;
+	int ContCount;
+	int ContReserved;
 };
 
 struct emit_state {
@@ -413,20 +417,22 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			;
 		break;
 
-	case RC_OPCODE_CONTINUE:
+	case RC_OPCODE_CONT:
 		loop = &s->Loops[s->CurrentLoopDepth - 1];
-		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP
+		memory_pool_array_reserve(&s->C->Pool, int, loop->Conts,
+					loop->ContCount, loop->ContReserved, 1);
+		loop->Conts[loop->ContCount++] = newip;
+		s->Code->inst[newip].inst2 = R500_FC_OP_CONTINUE
 			| R500_FC_JUMP_FUNC(0xff)
 			| R500_FC_B_OP1_DECR
 			| R500_FC_B_POP_CNT(
 				s->CurrentBranchDepth -	loop->BranchDepth)
+			| R500_FC_IGNORE_UNCOVERED
 			;
-		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->BgnLoop);
 		break;
 
 	case RC_OPCODE_ENDLOOP:
 	{
-		unsigned int i;
 		loop = &s->Loops[s->CurrentLoopDepth - 1];
 		/* Emit ENDLOOP */
 		s->Code->inst[newip].inst2 = R500_FC_OP_ENDLOOP
@@ -449,6 +455,12 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->Code->inst[loop->Brks[loop->BrkCount]].inst3 =
 						R500_FC_JUMP_ADDR(newip + 1);
 		}
+
+		/* Set jump address for CONT instructions. */
+		while(loop->ContCount--) {
+			s->Code->inst[loop->Conts[loop->ContCount]].inst3 =
+						R500_FC_JUMP_ADDR(newip);
+		}
 		s->CurrentLoopDepth--;
 		break;
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index e14a3520dd3..896246d2035 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -243,6 +243,12 @@ struct rX00_fragment_program_code {
 #define R500_VS_MAX_ALU	        1024
 #define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
 #define R300_VS_MAX_TEMPS	32
+/* This is the max for all chipsets (r300-r500) */
+#define R300_VS_MAX_FC_OPS 16
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_VS_MAX_FC_DEPTH 8
+#define R300_VS_MAX_LOOP_DEPTH 1
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -263,9 +269,18 @@ struct r300_vertex_program_code {
 
 	uint32_t InputsRead;
 	uint32_t OutputsWritten;
-};
 
-void r300_vertex_program_dump(struct r300_vertex_program_code * vs);
+	unsigned int num_fc_ops;
+	uint32_t fc_ops;
+	union {
+	        uint32_t r300[R300_VS_MAX_FC_OPS];
+		struct {
+			uint32_t lw;
+			uint32_t uw;
+		} r500[R300_VS_MAX_FC_OPS];
+	} fc_op_addrs;
+	int32_t fc_loop_index[R300_VS_MAX_FC_OPS];
+};
 
 #endif /* RADEON_CODE_H */
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
index 1c8ba864a41..935dc9b0a80 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
@@ -307,3 +307,46 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig
 	}
 }
 
+
+/**
+ * The FACE input in hardware contains 1 if it's a back face, 0 otherwise.
+ * Gallium and OpenGL define it the other way around.
+ *
+ * So let's just negate FACE at the beginning of the shader and rewrite the rest
+ * of the shader to read from the newly allocated temporary.
+ */
+void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face)
+{
+	unsigned tempregi = rc_find_free_temporary(c);
+	struct rc_instruction *inst_add;
+	struct rc_instruction *inst;
+
+	/* Invert FACE: temp.x = (all-ones swizzle source) + (-FACE.x) */
+	inst_add = rc_insert_new_instruction(c, &c->Program.Instructions);
+	inst_add->U.I.Opcode = RC_OPCODE_ADD;
+
+	inst_add->U.I.DstReg.File = RC_FILE_TEMPORARY;
+	inst_add->U.I.DstReg.Index = tempregi;
+	inst_add->U.I.DstReg.WriteMask = RC_MASK_X;
+
+	inst_add->U.I.SrcReg[0].File = RC_FILE_NONE;
+	inst_add->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111;
+
+	inst_add->U.I.SrcReg[1].File = RC_FILE_INPUT;
+	inst_add->U.I.SrcReg[1].Index = face;
+	inst_add->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XXXX;
+	inst_add->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
+
+	for (inst = inst_add->Next; inst != &c->Program.Instructions; inst = inst->Next) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		unsigned i;
+
+		for(i = 0; i < opcode->NumSrcRegs; i++) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT &&
+			    inst->U.I.SrcReg[i].Index == face) {
+				inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
+				inst->U.I.SrcReg[i].Index = tempregi;
+			}
+		}
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index f15905d79d4..7c42eb3ae57 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -81,6 +81,7 @@ void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_ou
 void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output);
 void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input,
                                 int full_vtransform);
+void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face);
 
 struct r300_fragment_program_compiler {
 	struct radeon_compiler Base;
@@ -110,8 +111,12 @@ struct r300_vertex_program_compiler {
 
 	void * UserData;
 	void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c);
+
+	int PredicateIndex;
+	unsigned int PredicateMask;
 };
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c);
+void r300_vertex_program_dump(struct r300_vertex_program_compiler * c);
 
 #endif /* RADEON_COMPILER_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
index 31566a937f4..faf531b412e 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
@@ -274,7 +274,7 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
 			}
 			break;
 		}
-		case RC_OPCODE_CONTINUE:
+		case RC_OPCODE_CONT:
 			break;
 		case RC_OPCODE_ENDIF:
 			push_branch(&s);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index 24c3ae57b6e..32d4b45dd6d 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -421,12 +421,9 @@ static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop,
  * ENDLOOP;                         	-> ENDLOOP
  *
  * @param inst A pointer to a BGNLOOP instruction.
- * @return If the loop can be unrolled, a pointer to the first instruction of
- * 		the unrolled loop.
- * 	   Otherwise, A pointer to the ENDLOOP instruction.
- * 	   Null if there is an error.
+ * @return 1 for success, 0 for failure
  */
-static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
+static int transform_loop(struct emulate_loop_state * s,
 						struct rc_instruction * inst)
 {
 	struct loop_info * loop;
@@ -437,10 +434,10 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 	loop = &s->Loops[s->LoopCount++];
 
 	if (!build_loop_info(s->C, loop, inst))
-		return NULL;
+		return 0;
 
-	if(try_unroll_loop(s->C, loop, -1)){
-		return loop->BeginLoop->Next;
+	if(try_unroll_loop(s->C, loop, s->prog_inst_limit)){
+		return 1;
 	}
 
 	/* Reverse the conditional instruction */
@@ -465,33 +462,31 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 		break;
 	default:
 		rc_error(s->C, "loop->Cond is not a conditional.\n");
-		return NULL;
+		return 0;
 	}
 
 	/* Prepare the loop to be emulated */
 	rc_remove_instruction(loop->Brk);
 	rc_remove_instruction(loop->EndIf);
 	rc_insert_instruction(loop->EndLoop->Prev, loop->EndIf);
-	return loop->EndLoop;
+	return 1;
 }
 
 void rc_transform_loops(struct radeon_compiler *c,
-						struct emulate_loop_state * s)
+			struct emulate_loop_state * s, int prog_inst_limit)
 {
 	struct rc_instruction * ptr;
 
 	memset(s, 0, sizeof(struct emulate_loop_state));
 	s->C = c;
-	ptr = s->C->Program.Instructions.Next;
-	while(ptr != &s->C->Program.Instructions) {
+	s->prog_inst_limit = prog_inst_limit;
+	for(ptr = s->C->Program.Instructions.Next;
+			ptr != &s->C->Program.Instructions; ptr = ptr->Next) {
 		if(ptr->Type == RC_INSTRUCTION_NORMAL &&
 					ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
-			ptr = transform_loop(s, ptr);
-			if(!ptr){
+			if (!transform_loop(s, ptr))
 				return;
-			}
 		}
-		ptr = ptr->Next;
 	}
 }
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index 86d91ef14bd..bba1f68e308 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -21,10 +21,11 @@ struct emulate_loop_state {
 	struct loop_info * Loops;
 	unsigned int LoopCount;
 	unsigned int LoopReserved;
+	int prog_inst_limit;
 };
 
 void rc_transform_loops(struct radeon_compiler *c,
-						struct emulate_loop_state * s);
+			struct emulate_loop_state * s, int prog_inst_limit);
 
 void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit);
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index 04f234f11d8..da495a3afaa 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -95,6 +95,12 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.IsComponentwise = 1
 	},
 	{
+		.Opcode = RC_OPCODE_DP2,
+		.Name = "DP2",
+		.NumSrcRegs = 2,
+		.HasDstReg = 1
+	},
+	{
 		.Opcode = RC_OPCODE_DP3,
 		.Name = "DP3",
 		.NumSrcRegs = 2,
@@ -295,6 +301,13 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.IsComponentwise = 1
 	},
 	{
+		.Opcode = RC_OPCODE_SSG,
+		.Name = "SSG",
+		.NumSrcRegs = 1,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
 		.Opcode = RC_OPCODE_SUB,
 		.Name = "SUB",
 		.NumSrcRegs = 2,
@@ -386,8 +399,8 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.NumSrcRegs = 0,
 	},
 	{
-		.Opcode = RC_OPCODE_CONTINUE,
-		.Name = "CONTINUE",
+		.Opcode = RC_OPCODE_CONT,
+		.Name = "CONT",
 		.IsFlowControl = 1,
 		.NumSrcRegs = 0
 	},
@@ -435,6 +448,10 @@ void rc_compute_sources_for_writemask(
 		case RC_OPCODE_ARL:
 			srcmasks[0] |= RC_MASK_X;
 			break;
+		case RC_OPCODE_DP2:
+			srcmasks[0] |= RC_MASK_XY;
+			srcmasks[1] |= RC_MASK_XY;
+			break;
 		case RC_OPCODE_DP3:
 			srcmasks[0] |= RC_MASK_XYZ;
 			srcmasks[1] |= RC_MASK_XYZ;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 8b9fa07dde2..d3f639c8701 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -64,6 +64,9 @@ typedef enum {
 	 * dst.c = d src0.c / dy */
 	RC_OPCODE_DDY,
 
+	/** scalar instruction: dst = src0.x*src1.x + src0.y*src1.y */
+	RC_OPCODE_DP2,
+
 	/** scalar instruction: dst = src0.x*src1.x + src0.y*src1.y + src0.z*src1.z */
 	RC_OPCODE_DP3,
 
@@ -154,6 +157,9 @@ typedef enum {
 	/** vec4 instruction: dst.c = (src0.c != src1.c) ? 1.0 : 0.0 */
 	RC_OPCODE_SNE,
 
+	/** vec4 instruction: dst.c = (src0.c < 0) ? -1 : ((src0.c > 0) ? 1 : 0) */
+	RC_OPCODE_SSG,
+
 	/** vec4 instruction: dst.c = src0.c - src1.c */
 	RC_OPCODE_SUB,
 
@@ -187,7 +193,7 @@ typedef enum {
 
 	RC_OPCODE_ENDLOOP,
 
-	RC_OPCODE_CONTINUE,
+	RC_OPCODE_CONT,
 
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
index 8a912da4613..ce72cd97ab2 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
@@ -65,6 +65,11 @@ struct regalloc_state {
 
 	struct hardware_register * HwTemporary;
 	unsigned int NumHwTemporaries;
+	/**
+	 * If an instruction is inside of a loop, end_loop will be the
+	 * IP of the ENDLOOP instruction, otherwise end_loop will be 0
+	 */
+	int end_loop;
 };
 
 static void print_live_intervals(struct live_intervals * src)
@@ -178,10 +183,10 @@ static void scan_callback(void * data, struct rc_instruction * inst,
 		else
 			reg->Live.Start = inst->IP;
 		reg->Live.End = inst->IP;
-	} else {
-		if (inst->IP > reg->Live.End)
-			reg->Live.End = inst->IP;
-	}
+	} else if (s->end_loop)
+		reg->Live.End = s->end_loop;
+	else if (inst->IP > reg->Live.End)
+		reg->Live.End = inst->IP;
 }
 
 static void compute_live_intervals(struct regalloc_state * s)
@@ -191,6 +196,31 @@ static void compute_live_intervals(struct regalloc_state * s)
 	for(struct rc_instruction * inst = s->C->Program.Instructions.Next;
 	    inst != &s->C->Program.Instructions;
 	    inst = inst->Next) {
+
+		/* For all instructions inside of a loop, the ENDLOOP
+		 * instruction is used as the end of the live interval. */
+		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && !s->end_loop) {
+			int loops = 1;
+			struct rc_instruction * tmp;
+			for(tmp = inst->Next;
+					tmp != &s->C->Program.Instructions;
+					tmp = tmp->Next) {
+				if (tmp->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					loops++;
+					break;
+				} else if (tmp->U.I.Opcode
+							== RC_OPCODE_ENDLOOP) {
+					if(!--loops) {
+						s->end_loop = tmp->IP;
+						break;
+					}
+				}
+			}
+		}
+
+		if (inst->IP == s->end_loop)
+			s->end_loop = 0;
+
 		rc_for_all_reads_mask(inst, scan_callback, s);
 		rc_for_all_writes_mask(inst, scan_callback, s);
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index 857aae55145..704a7bb2d23 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -216,18 +216,18 @@ static void transform_CEIL(struct radeon_compiler* c,
 	rc_remove_instruction(inst);
 }
 
-static void transform_DP3(struct radeon_compiler* c,
+static void transform_DP2(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
 	struct rc_src_register src0 = inst->U.I.SrcReg[0];
 	struct rc_src_register src1 = inst->U.I.SrcReg[1];
-	src0.Negate &= ~RC_MASK_W;
-	src0.Swizzle &= ~(7 << (3 * 3));
-	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
-	src1.Negate &= ~RC_MASK_W;
-	src1.Swizzle &= ~(7 << (3 * 3));
-	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
-	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+	src0.Negate &= ~(RC_MASK_Z | RC_MASK_W);
+	src0.Swizzle &= ~(63 << (3 * 2));
+	src0.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
+	src1.Negate &= ~(RC_MASK_Z | RC_MASK_W);
+	src1.Swizzle &= ~(63 << (3 * 2));
+	src1.Swizzle |= (RC_SWIZZLE_ZERO << (3 * 2)) | (RC_SWIZZLE_ZERO << (3 * 3));
+	emit2(c, inst->Prev, RC_OPCODE_DP3, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
 	rc_remove_instruction(inst);
 }
 
@@ -464,6 +464,43 @@ static void transform_SNE(struct radeon_compiler* c,
 	rc_remove_instruction(inst);
 }
 
+static void transform_SSG(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	/* result = sign(x)
+	 *
+	 *   CMP tmp0, -x, 1, 0
+	 *   CMP tmp1, x, 1, 0
+	 *   ADD result, tmp0, -tmp1;
+	 */
+	unsigned tmp0, tmp1;
+
+	/* 0 < x */
+	tmp0 = rc_find_free_temporary(c);
+	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
+	      dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+	      negate(inst->U.I.SrcReg[0]),
+	      builtin_one,
+	      builtin_zero);
+
+	/* x < 0 */
+	tmp1 = rc_find_free_temporary(c);
+	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
+	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
+	      inst->U.I.SrcReg[0],
+	      builtin_one,
+	      builtin_zero);
+
+	/* Either both are zero, or one of them is one and the other is zero. */
+	/* result = tmp0 - tmp1 */
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
+	      inst->U.I.DstReg,
+	      srcreg(RC_FILE_TEMPORARY, tmp0),
+	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
+
+	rc_remove_instruction(inst);
+}
+
 static void transform_SUB(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
@@ -516,6 +553,7 @@ int radeonTransformALU(
 	switch(inst->U.I.Opcode) {
 	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
+	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
 	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
 	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
@@ -530,6 +568,7 @@ int radeonTransformALU(
 	case RC_OPCODE_SLE: transform_SLE(c, inst); return 1;
 	case RC_OPCODE_SLT: transform_SLT(c, inst); return 1;
 	case RC_OPCODE_SNE: transform_SNE(c, inst); return 1;
+	case RC_OPCODE_SSG: transform_SSG(c, inst); return 1;
 	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
 	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
 	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
@@ -577,6 +616,29 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 	rc_remove_instruction(inst);
 }
 
+static void transform_r300_vertex_DP2(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	struct rc_instruction *next_inst = inst->Next;
+	transform_DP2(c, inst);
+	next_inst->Prev->U.I.Opcode = RC_OPCODE_DP4;
+}
+
+static void transform_r300_vertex_DP3(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	struct rc_src_register src0 = inst->U.I.SrcReg[0];
+	struct rc_src_register src1 = inst->U.I.SrcReg[1];
+	src0.Negate &= ~RC_MASK_W;
+	src0.Swizzle &= ~(7 << (3 * 3));
+	src0.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
+	src1.Negate &= ~RC_MASK_W;
+	src1.Swizzle &= ~(7 << (3 * 3));
+	src1.Swizzle |= RC_SWIZZLE_ZERO << (3 * 3);
+	emit2(c, inst->Prev, RC_OPCODE_DP4, inst->U.I.SaturateMode, inst->U.I.DstReg, src0, src1);
+	rc_remove_instruction(inst);
+}
+
 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
@@ -672,6 +734,41 @@ static void transform_r300_vertex_SLE(struct radeon_compiler* c,
 	inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
 }
 
+static void transform_r300_vertex_SSG(struct radeon_compiler* c,
+	struct rc_instruction* inst)
+{
+	/* result = sign(x)
+	 *
+	 *   SLT tmp0, 0, x;
+	 *   SLT tmp1, x, 0;
+	 *   ADD result, tmp0, -tmp1;
+	 */
+	unsigned tmp0, tmp1;
+
+	/* 0 < x */
+	tmp0 = rc_find_free_temporary(c);
+	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
+	      dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+	      builtin_zero,
+	      inst->U.I.SrcReg[0]);
+
+	/* x < 0 */
+	tmp1 = rc_find_free_temporary(c);
+	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
+	      dstregtmpmask(tmp1, inst->U.I.DstReg.WriteMask),
+	      inst->U.I.SrcReg[0],
+	      builtin_zero);
+
+	/* Either both are zero, or one of them is one and the other is zero. */
+	/* result = tmp0 - tmp1 */
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
+	      inst->U.I.DstReg,
+	      srcreg(RC_FILE_TEMPORARY, tmp0),
+	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
+
+	rc_remove_instruction(inst);
+}
+
 /**
  * For use with radeonLocalTransform, this transforms non-native ALU
  * instructions of the r300 up to r500 vertex engine.
@@ -685,7 +782,8 @@ int r300_transform_vertex_alu(
 	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
 	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
-	case RC_OPCODE_DP3: transform_DP3(c, inst); return 1;
+	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
+	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
 	case RC_OPCODE_FLR: transform_FLR(c, inst); return 1;
 	case RC_OPCODE_LIT: transform_r300_vertex_fix_LIT(c, inst); return 1;
@@ -705,6 +803,7 @@ int r300_transform_vertex_alu(
 			return 1;
 		}
 		return 0;
+	case RC_OPCODE_SSG: transform_r300_vertex_SSG(c, inst); return 1;
 	case RC_OPCODE_SUB: transform_SUB(c, inst); return 1;
 	case RC_OPCODE_SWZ: transform_SWZ(c, inst); return 1;
 	case RC_OPCODE_XPD: transform_XPD(c, inst); return 1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
index 9c4b65f4c00..ddce590ee66 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
@@ -117,8 +117,8 @@ int radeonTransformTEX(
 			struct rc_instruction * inst_rcp = NULL;
 			struct rc_instruction * inst_mad;
 			struct rc_instruction * inst_cmp;
-			unsigned tmp_texsample = rc_find_free_temporary(c);
-			unsigned tmp_sum = rc_find_free_temporary(c);
+			unsigned tmp_texsample;
+			unsigned tmp_sum;
 			unsigned tmp_recip_w = 0;
 			int pass, fail, tex;
 
@@ -126,6 +126,7 @@ int radeonTransformTEX(
 			struct rc_dst_register output_reg = inst->U.I.DstReg;
 
 			/* Redirect TEX to a new temp. */
+			tmp_texsample = rc_find_free_temporary(c);
 			inst->U.I.DstReg.File = RC_FILE_TEMPORARY;
 			inst->U.I.DstReg.Index = tmp_texsample;
 			inst->U.I.DstReg.WriteMask = RC_MASK_XYZW;
@@ -144,6 +145,7 @@ int radeonTransformTEX(
 			}
 
 			/* Perspective-divide r by W (if it's TXP) and add the texture sample (see below). */
+			tmp_sum = rc_find_free_temporary(c);
 			inst_mad = rc_insert_new_instruction(c, inst_rcp ? inst_rcp : inst);
 			inst_mad->U.I.DstReg.File = RC_FILE_TEMPORARY;
 			inst_mad->U.I.DstReg.Index = tmp_sum;
@@ -199,6 +201,8 @@ int radeonTransformTEX(
 			inst_cmp->U.I.SrcReg[pass].File = RC_FILE_NONE;
 			inst_cmp->U.I.SrcReg[pass].Swizzle = RC_SWIZZLE_1111;
 			inst_cmp->U.I.SrcReg[fail] = shadow_ambient(compiler, inst->U.I.TexSrcUnit);
+
+			assert(tmp_texsample != tmp_sum && tmp_sum != tmp_recip_w);
 		}
 	}
 
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index e4b302bbad9..3d2f8928fa6 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -461,7 +461,7 @@ static void r300InitGLExtensions(GLcontext *ctx)
 	if (!r300->radeon.radeonScreen->drmSupportsOcclusionQueries) {
 		_mesa_disable_extension(ctx, "GL_ARB_occlusion_query");
 	}
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV350)
+        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_R420)
   		_mesa_enable_extension(ctx, "GL_ARB_half_float_vertex");
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index f25264b6f2d..f7705b0f6fe 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -441,6 +441,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_ADDRS_0      0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -459,6 +465,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -489,6 +499,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3 << (2 * (x)))
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -505,6 +518,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index 4ba6740e3d9..94588698265 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -152,8 +152,8 @@ int32_t r300TranslateTexFormat(gl_format mesaFormat)
 		case MESA_FORMAT_Z32:
 			return R300_EASY_TX_FORMAT(X, X, X, X, X32);
 		/* EXT_texture_sRGB */
-		case MESA_FORMAT_SRGBA8:
-			return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SARGB8:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA;
 		case MESA_FORMAT_SLA8:
 			return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA;
 		case MESA_FORMAT_SL8:
diff --git a/src/mesa/drivers/dri/r600/r600_blit.c b/src/mesa/drivers/dri/r600/r600_blit.c
index 619678214f0..4fd425b8094 100644
--- a/src/mesa/drivers/dri/r600/r600_blit.c
+++ b/src/mesa/drivers/dri/r600/r600_blit.c
@@ -72,7 +72,7 @@ unsigned r600_check_blit(gl_format mesa_format)
     case MESA_FORMAT_Z24_S8:
     case MESA_FORMAT_Z16:
     case MESA_FORMAT_Z32:
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
     case MESA_FORMAT_SLA8:
     case MESA_FORMAT_SL8:
 	    break;
@@ -320,9 +320,9 @@ set_render_target(context_t *context, struct radeon_bo *bo, gl_format mesa_forma
 	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
 	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
             break;
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
             format = COLOR_8_8_8_8;
-            comp_swap = SWAP_STD_REV;
+            comp_swap = SWAP_ALT;
 	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
 	    SETfield(cb_color0_info, NUMBER_SRGB, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
             break;
@@ -1050,17 +1050,17 @@ set_tex_resource(context_t * context,
 	    SETfield(sq_tex_resource4, SQ_SEL_X,
 		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 	    break;
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
 	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
 		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-	    SETfield(sq_tex_resource4, SQ_SEL_W,
-		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 	    SETfield(sq_tex_resource4, SQ_SEL_Z,
-		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 	    SETfield(sq_tex_resource4, SQ_SEL_Y,
-		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
 	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
 		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 	    SETbit(sq_tex_resource4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
 	    break;
@@ -1454,7 +1454,7 @@ set_default_state(context_t *context)
 	    SETbit(sq_dyn_gpr_cntl_ps_flush_req, VS_PC_LIMIT_ENABLE_bit);
     }
 
-    BEGIN_BATCH_NO_AUTOSTATE(114);
+    BEGIN_BATCH_NO_AUTOSTATE(117);
     R600_OUT_BATCH_REGSEQ(SQ_CONFIG, 6);
     R600_OUT_BATCH(sq_config);
     R600_OUT_BATCH(sq_gpr_resource_mgmt_1);
@@ -1499,9 +1499,10 @@ set_default_state(context_t *context)
     R600_OUT_BATCH_REGVAL(PA_SU_VTX_CNTL, (PIX_CENTER_bit) |
         (X_ROUND_TO_EVEN << PA_SU_VTX_CNTL__ROUND_MODE_shift) |
         (X_1_256TH << QUANT_MODE_shift));
+    R600_OUT_BATCH_REGVAL(PA_SC_AA_CONFIG, 0);
 
     R600_OUT_BATCH_REGSEQ(VGT_MAX_VTX_INDX, 4);
-    R600_OUT_BATCH(2048);
+    R600_OUT_BATCH(0xffffff);
     R600_OUT_BATCH(0);
     R600_OUT_BATCH(0);
     R600_OUT_BATCH(0);
@@ -1532,6 +1533,7 @@ set_default_state(context_t *context)
     R600_OUT_BATCH(0);
 
     R600_OUT_BATCH_REGVAL(VGT_STRMOUT_BUFFER_EN, 0);
+    R600_OUT_BATCH_REGVAL(SX_ALPHA_TEST_CONTROL, 0);
 
     END_BATCH();
     COMMIT_BATCH();
@@ -1613,7 +1615,7 @@ unsigned r600_blit(GLcontext *ctx,
     /* Flush is needed to make sure that source buffer has correct data */
     radeonFlush(ctx);
 
-    rcommonEnsureCmdBufSpace(&context->radeon, 305, __FUNCTION__);
+    rcommonEnsureCmdBufSpace(&context->radeon, 311, __FUNCTION__);
 
     /* load shaders */
     load_shaders(context->radeon.glCtx);
@@ -1622,7 +1624,7 @@ unsigned r600_blit(GLcontext *ctx,
         return GL_FALSE;
 
     /* set clear state */
-    /* 114 */
+    /* 120 */
     set_default_state(context);
 
     /* shaders */
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index 84d9d423124..389b0412baa 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -72,6 +72,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R600_ENABLE_GLSL_TEST 1
 
 #define need_GL_VERSION_2_0
+#define need_GL_VERSION_2_1
+#define need_GL_ARB_draw_elements_base_vertex
 #define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_vertex_program
@@ -140,6 +142,7 @@ static const struct dri_extension card_extensions[] = {
   {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
   {"GL_SGIS_generate_mipmap",		NULL},
   {"GL_ARB_pixel_buffer_object",        NULL},
+  {"GL_ARB_draw_elements_base_vertex",	GL_ARB_draw_elements_base_vertex_functions },
   {NULL,				NULL}
   /* *INDENT-ON* */
 };
@@ -157,6 +160,7 @@ static const struct dri_extension mm_extensions[] = {
 static const struct dri_extension gl_20_extension[] = {
 #ifdef R600_ENABLE_GLSL_TEST
     {"GL_ARB_shading_language_100",			GL_VERSION_2_0_functions },
+    {"GL_ARB_shading_language_120",			GL_VERSION_2_1_functions },
 #else
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 #endif /* R600_ENABLE_GLSL_TEST */
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
index 41419f84601..512a52ede3e 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.c
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -431,7 +431,7 @@ unsigned r600IsFormatRenderable(gl_format mesa_format)
 	case MESA_FORMAT_Z24_S8:
 	case MESA_FORMAT_Z16:
 	case MESA_FORMAT_Z32:
-	case MESA_FORMAT_SRGBA8:
+	case MESA_FORMAT_SARGB8:
 	case MESA_FORMAT_SLA8:
 	case MESA_FORMAT_SL8:
 		return 1;
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
index 1600033b9bd..ba3690b70ed 100644
--- a/src/mesa/drivers/dri/r600/r600_texstate.c
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -605,17 +605,17 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 		}
 		break;
 	/* EXT_texture_sRGB */
-	case MESA_FORMAT_SRGBA8:
+	case MESA_FORMAT_SARGB8:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
 		break;
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
index 1e955b93b2b..bf8063391a2 100644
--- a/src/mesa/drivers/dri/r600/r700_chip.c
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -265,17 +265,6 @@ static void r700SendVTXState(GLcontext *ctx, struct radeon_state_atom *atom)
     if (context->radeon.tcl.aos_count == 0)
 	    return;
 
-    BEGIN_BATCH_NO_AUTOSTATE(6);
-    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
-    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
-    R600_OUT_BATCH(0);
-
-    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
-    R600_OUT_BATCH(mmSQ_VTX_START_INST_LOC - ASIC_CTL_CONST_BASE_INDEX);
-    R600_OUT_BATCH(0);
-    END_BATCH();
-    COMMIT_BATCH();
-
     for(i=0; i<VERT_ATTRIB_MAX; i++) {
 	    if(vp->mesa_program->Base.InputsRead & (1 << i))
 	    {
@@ -523,9 +512,9 @@ static void r700SetRenderTarget(context_t *context, int id)
 		     CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
 	    CLEARbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit);
             break;
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
             format = COLOR_8_8_8_8;
-            comp_swap = SWAP_STD_REV;
+            comp_swap = SWAP_ALT;
 	    number_type = NUMBER_SRGB;
 	    SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit);
             break;
@@ -1480,9 +1469,6 @@ static int check_vtx(GLcontext *ctx, struct radeon_state_atom *atom)
 	context_t *context = R700_CONTEXT(ctx);
 	int count = context->radeon.tcl.aos_count * 18;
 
-	if (count)
-		count += 6;
-
 	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
 	return count;
 }
@@ -1605,7 +1591,7 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(ps, always, 24, r700SendPSState);
 	ALLOC_STATE(vs_consts, vs_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendVSConsts);
 	ALLOC_STATE(ps_consts, ps_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendPSConsts);
-	ALLOC_STATE(vtx, vtx, (6 + (VERT_ATTRIB_MAX * 18)), r700SendVTXState);
+	ALLOC_STATE(vtx, vtx, (VERT_ATTRIB_MAX * 18), r700SendVTXState);
 	ALLOC_STATE(tx, tx, (R700_TEXTURE_NUMBERUNITS * 20), r700SendTexState);
 	ALLOC_STATE(tx_smplr, tx, (R700_TEXTURE_NUMBERUNITS * 5), r700SendTexSamplerState);
 	ALLOC_STATE(tx_brdr_clr, tx, (R700_TEXTURE_NUMBERUNITS * 6), r700SendTexBorderColorState);
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
index ba55f38e054..c5771f9fd0b 100644
--- a/src/mesa/drivers/dri/r600/r700_render.c
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -244,7 +244,8 @@ static int r700NumVerts(int num_verts, int prim)
 	return num_verts - verts_off;
 }
 
-static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
+static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end,
+				   int prim, GLint basevertex)
 {
     context_t *context = R700_CONTEXT(ctx);
     BATCH_LOCALS(&context->radeon);
@@ -282,6 +283,7 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim
     total_emit =   3  /* VGT_PRIMITIVE_TYPE */
 	         + 2  /* VGT_INDEX_TYPE */
 	         + 2  /* NUM_INSTANCES */
+		 + 4  /* VTX_BASE_VTX_LOC + VTX_START_INST_LOC */
 	         + 5 + 2; /* DRAW_INDEX */
 
     BEGIN_BATCH_NO_AUTOSTATE(total_emit);
@@ -294,6 +296,11 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim
     // num instances
     R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
     R600_OUT_BATCH(1);
+    /* offset */
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 2));
+    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(basevertex); //VTX_BASE_VTX_LOC
+    R600_OUT_BATCH(0);          //VTX_START_INST_LOC
     // draw packet
     R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX, 3));
     R600_OUT_BATCH(context->ind_buf.bo_offset);
@@ -364,6 +371,7 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end,
     total_emit +=   3 /* VGT_PRIMITIVE_TYPE */
 	          + 2 /* VGT_INDEX_TYPE */
 	          + 2 /* NUM_INSTANCES */
+		  + 4 /* VTX_BASE_VTX_LOC + VTX_START_INST_LOC */
 	          + 3; /* DRAW */
 
     BEGIN_BATCH_NO_AUTOSTATE(total_emit);
@@ -376,6 +384,11 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end,
     // num instances
     R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
     R600_OUT_BATCH(1);
+    /* offset */
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 2));
+    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0); //VTX_BASE_VTX_LOC
+    R600_OUT_BATCH(0); //VTX_START_INST_LOC
     // draw packet
     if(start == 0)
     {
@@ -433,16 +446,16 @@ static GLuint r700PredictRenderSize(GLcontext* ctx,
 
     dwords = PRE_EMIT_STATE_BUFSZ;
     if (ib)
-	    dwords += nr_prims * 14;
+	    dwords += nr_prims * 18;
     else {
 	    for (i = 0; i < nr_prims; ++i)
 	    {
 		    if (prim[i].start == 0)
-			    dwords += 10;
+			    dwords += 14;
 		    else if (prim[i].count > 0xffff)
-			    dwords += prim[i].count + 10;
+			    dwords += prim[i].count + 14;
 		    else
-			    dwords += ((prim[i].count + 1) / 2) + 10;
+			    dwords += ((prim[i].count + 1) / 2) + 14;
 	    }
     }
 
@@ -923,7 +936,8 @@ static GLboolean r700TryDrawPrims(GLcontext *ctx,
 		    r700RunRenderPrimitive(ctx,
 					   prim[i].start,
 					   prim[i].start + prim[i].count,
-					   prim[i].mode);
+					   prim[i].mode,
+					   prim[i].basevertex);
 	    else
 		    r700RunRenderPrimitiveImmediate(ctx,
 						    prim[i].start,
@@ -975,15 +989,17 @@ static void r700DrawPrims(GLcontext *ctx,
 	/* This check should get folded into just the places that
 	 * min/max index are really needed.
 	 */
-	if (!index_bounds_valid) {
-		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
-	}
 
-	if (min_index) {
+	if (!vbo_all_varyings_in_vbos(arrays)) {
+	    if (!index_bounds_valid)
+		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+	    /* Do we want to rebase?  Rebasing minimizes the
+	     * amount of data to upload. */
+	    if (min_index) {
 		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r700DrawPrims );
 		return;
+	    }
 	}
-
 	/* Make an attempt at drawing */
 	retval = r700TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index b7ee9a134bf..7d54fabebbc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -414,9 +414,9 @@ enum {
    CHIP_FAMILY_R350,
    CHIP_FAMILY_RV350,
    CHIP_FAMILY_RV380,
+   CHIP_FAMILY_RS400,
    CHIP_FAMILY_R420,
    CHIP_FAMILY_RV410,
-   CHIP_FAMILY_RS400,
    CHIP_FAMILY_RS600,
    CHIP_FAMILY_RS690,
    CHIP_FAMILY_RS740,
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 517485091a2..0597d4250de 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -609,6 +609,7 @@ radeon_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
 
 void radeon_fbo_init(struct radeon_context *radeon)
 {
+#if FEATURE_EXT_framebuffer_object
   radeon->glCtx->Driver.NewFramebuffer = radeon_new_framebuffer;
   radeon->glCtx->Driver.NewRenderbuffer = radeon_new_renderbuffer;
   radeon->glCtx->Driver.BindFramebuffer = radeon_bind_framebuffer;
@@ -617,7 +618,10 @@ void radeon_fbo_init(struct radeon_context *radeon)
   radeon->glCtx->Driver.FinishRenderTexture = radeon_finish_render_texture;
   radeon->glCtx->Driver.ResizeBuffers = radeon_resize_buffers;
   radeon->glCtx->Driver.ValidateFramebuffer = radeon_validate_framebuffer;
+#endif
+#if FEATURE_EXT_framebuffer_blit
   radeon->glCtx->Driver.BlitFramebuffer = _mesa_meta_BlitFramebuffer;
+#endif
 }
 
   
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index c877e6c1765..c6e5f110ea3 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -133,7 +133,7 @@ static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree
 	height = _mesa_next_pow_two_32(lvl->height);
 
 	lvl->rowstride = get_texture_image_row_stride(rmesa, mt->mesaFormat, lvl->width, mt->tilebits);
-	lvl->size = get_texture_image_size(mt->mesaFormat, lvl->rowstride, lvl->height, lvl->depth, mt->tilebits);
+	lvl->size = get_texture_image_size(mt->mesaFormat, lvl->rowstride, height, lvl->depth, mt->tilebits);
 
 	assert(lvl->size > 0);
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index 67be466c3f8..29defe73a70 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -40,7 +40,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/simple_list.h"
 
+#include "math/m_xform.h"
+
 #include "swrast_setup/swrast_setup.h"
+
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index d2b190e42e0..8c6a50d2f0d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -551,7 +551,7 @@ gl_format radeonChooseTextureFormat(GLcontext * ctx,
 	case GL_SRGB8_ALPHA8:
 	case GL_COMPRESSED_SRGB:
 	case GL_COMPRESSED_SRGB_ALPHA:
-		return MESA_FORMAT_SRGBA8;
+		return MESA_FORMAT_SARGB8;
 
 	case GL_SLUMINANCE:
 	case GL_SLUMINANCE8:
diff --git a/src/mesa/drivers/dri/savage/savagerender.c b/src/mesa/drivers/dri/savage/savagerender.c
index c369bb124c2..2d9e80e29c4 100644
--- a/src/mesa/drivers/dri/savage/savagerender.c
+++ b/src/mesa/drivers/dri/savage/savagerender.c
@@ -33,6 +33,8 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "savagecontext.h"
diff --git a/src/mesa/drivers/dri/unichrome/via_render.c b/src/mesa/drivers/dri/unichrome/via_render.c
index 896c43db1b0..4351f119555 100644
--- a/src/mesa/drivers/dri/unichrome/via_render.c
+++ b/src/mesa/drivers/dri/unichrome/via_render.c
@@ -33,6 +33,8 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "via_context.h"
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 32f7d969d8d..0f2d1a8f8da 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -177,7 +177,7 @@
 /**
  * Per-program constants (power of two)
  *
- * \c MAX_PROGRAM_LOCAL_PARAMS and \c MAX_UNIFORMS are just the assmebly shader
+ * \c MAX_PROGRAM_LOCAL_PARAMS and \c MAX_UNIFORMS are just the assembly shader
  * and GLSL shader names for the same thing.  They should \b always have the
  * same value.  Each refers to the number of vec4 values supplied as
  * per-program parameters.
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index a369532e99c..b01fed1781e 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -462,7 +462,7 @@ _mesa_init_current(GLcontext *ctx)
 
 
 /**
- * Init vertex/fragment program limits.
+ * Init vertex/fragment/geometry program limits.
  * Important: drivers should override these with actual limits.
  */
 static void
@@ -477,16 +477,18 @@ init_program_limits(GLenum type, struct gl_program_constants *prog)
    prog->MaxLocalParams = MAX_PROGRAM_LOCAL_PARAMS;
    prog->MaxUniformComponents = 4 * MAX_UNIFORMS;
 
-   if (type == GL_VERTEX_PROGRAM_ARB) {
+   switch (type) {
+   case GL_VERTEX_PROGRAM_ARB:
       prog->MaxParameters = MAX_VERTEX_PROGRAM_PARAMS;
       prog->MaxAttribs = MAX_NV_VERTEX_PROGRAM_INPUTS;
       prog->MaxAddressRegs = MAX_VERTEX_PROGRAM_ADDRESS_REGS;
-   }
-   else if (type == GL_FRAGMENT_PROGRAM_ARB) {
+      break;
+   case GL_FRAGMENT_PROGRAM_ARB:
       prog->MaxParameters = MAX_NV_FRAGMENT_PROGRAM_PARAMS;
       prog->MaxAttribs = MAX_NV_FRAGMENT_PROGRAM_INPUTS;
       prog->MaxAddressRegs = MAX_FRAGMENT_PROGRAM_ADDRESS_REGS;
-   } else {
+      break;
+   case MESA_GEOMETRY_PROGRAM:
       prog->MaxParameters = MAX_NV_VERTEX_PROGRAM_PARAMS;
       prog->MaxAttribs = MAX_NV_VERTEX_PROGRAM_INPUTS;
       prog->MaxAddressRegs = MAX_VERTEX_PROGRAM_ADDRESS_REGS;
@@ -497,6 +499,9 @@ init_program_limits(GLenum type, struct gl_program_constants *prog)
       prog->MaxGeometryUniformComponents = MAX_GEOMETRY_UNIFORM_COMPONENTS;
       prog->MaxGeometryOutputVertices = MAX_GEOMETRY_OUTPUT_VERTICES;
       prog->MaxGeometryTotalOutputComponents = MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS;
+      break;
+   default:
+      assert(0 && "Bad program type in init_program_limits()");
    }
 
    /* Set the native limits to zero.  This implies that there is no native
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index c3993510967..46e5c932d0f 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -756,7 +756,7 @@ _mesa_strdup( const char *s )
 float
 _mesa_strtof( const char *s, char **end )
 {
-#if defined(_GNU_SOURCE) && !defined(__CYGWIN__)
+#if defined(_GNU_SOURCE) && !defined(__CYGWIN__) && !defined(__FreeBSD__)
    static locale_t loc = NULL;
    if (!loc) {
       loc = newlocale(LC_CTYPE_MASK, "C", NULL);
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index e89f55befe4..b8bcda56bfa 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2505,29 +2505,29 @@ struct gl_framebuffer
 
 
 /**
- * Limits for vertex and fragment programs.
+ * Limits for vertex and fragment programs/shaders.
  */
 struct gl_program_constants
 {
    /* logical limits */
    GLuint MaxInstructions;
-   GLuint MaxAluInstructions; /* fragment programs only, for now */
-   GLuint MaxTexInstructions; /* fragment programs only, for now */
-   GLuint MaxTexIndirections; /* fragment programs only, for now */
+   GLuint MaxAluInstructions;
+   GLuint MaxTexInstructions;
+   GLuint MaxTexIndirections;
    GLuint MaxAttribs;
    GLuint MaxTemps;
-   GLuint MaxAddressRegs; /* vertex program only, for now */
+   GLuint MaxAddressRegs;
    GLuint MaxParameters;
    GLuint MaxLocalParams;
    GLuint MaxEnvParams;
    /* native/hardware limits */
    GLuint MaxNativeInstructions;
-   GLuint MaxNativeAluInstructions; /* fragment programs only, for now */
-   GLuint MaxNativeTexInstructions; /* fragment programs only, for now */
-   GLuint MaxNativeTexIndirections; /* fragment programs only, for now */
+   GLuint MaxNativeAluInstructions;
+   GLuint MaxNativeTexInstructions;
+   GLuint MaxNativeTexIndirections;
    GLuint MaxNativeAttribs;
    GLuint MaxNativeTemps;
-   GLuint MaxNativeAddressRegs; /* vertex program only, for now */
+   GLuint MaxNativeAddressRegs;
    GLuint MaxNativeParameters;
    /* For shaders */
    GLuint MaxUniformComponents;
diff --git a/src/mesa/main/querymatrix.c b/src/mesa/main/querymatrix.c
index 6f62415ba8c..32aaa79f7fb 100644
--- a/src/mesa/main/querymatrix.c
+++ b/src/mesa/main/querymatrix.c
@@ -36,9 +36,9 @@
 #define INT_TO_FIXED(x) ((GLfixed) ((x) << 16))
 #define FLOAT_TO_FIXED(x) ((GLfixed) ((x) * 65536.0))
 
-#if defined(WIN32) || defined(_WIN32_WCE)
+#if defined(_MSC_VER)
 /* Oddly, the fpclassify() function doesn't exist in such a form
- * on Windows.  This is an implementation using slightly different
+ * on MSVC.  This is an implementation using slightly different
  * lower-level Windows functions.
  */
 #include <float.h>
@@ -72,7 +72,7 @@ fpclassify(double x)
 
 #elif defined(__APPLE__) || defined(__CYGWIN__) || defined(__FreeBSD__) || \
      defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || \
-     (defined(__sun) && defined(__C99FEATURES__))
+     (defined(__sun) && defined(__C99FEATURES__)) || defined(__MINGW32__)
 
 /* fpclassify is available. */
 
diff --git a/src/mesa/program/hash_table.h b/src/mesa/program/hash_table.h
index ec088c7dde1..f1c4fdcd1fa 100644
--- a/src/mesa/program/hash_table.h
+++ b/src/mesa/program/hash_table.h
@@ -31,8 +31,6 @@
 #ifndef HASH_TABLE_H
 #define HASH_TABLE_H
 
-#include <string.h>
-
 struct hash_table;
 
 typedef unsigned (*hash_func_t)(const void *key);
diff --git a/src/mesa/program/nvfragparse.h b/src/mesa/program/nvfragparse.h
index 544ab80c56c..e28a6c49349 100644
--- a/src/mesa/program/nvfragparse.h
+++ b/src/mesa/program/nvfragparse.h
@@ -30,6 +30,7 @@
 #ifndef NVFRAGPARSE_H
 #define NVFRAGPARSE_H
 
+#include "main/mtypes.h"
 
 extern void
 _mesa_parse_nv_fragment_program(GLcontext *ctx, GLenum target,
diff --git a/src/mesa/program/nvvertparse.h b/src/mesa/program/nvvertparse.h
index 9919e22388d..91ef79e6c3c 100644
--- a/src/mesa/program/nvvertparse.h
+++ b/src/mesa/program/nvvertparse.h
@@ -29,6 +29,7 @@
 #ifndef NVVERTPARSE_H
 #define NVVERTPARSE_H
 
+#include "main/mtypes.h"
 
 extern void
 _mesa_parse_nv_vertex_program(GLcontext *ctx, GLenum target,
diff --git a/src/mesa/program/prog_cache.h b/src/mesa/program/prog_cache.h
index 4e1ccac03ff..bfe8f99d445 100644
--- a/src/mesa/program/prog_cache.h
+++ b/src/mesa/program/prog_cache.h
@@ -30,6 +30,9 @@
 #define PROG_CACHE_H
 
 
+#include "main/mtypes.h"
+
+
 /** Opaque type */
 struct gl_program_cache;
 
diff --git a/src/mesa/program/prog_execute.h b/src/mesa/program/prog_execute.h
index adefc5439de..f59b65176ff 100644
--- a/src/mesa/program/prog_execute.h
+++ b/src/mesa/program/prog_execute.h
@@ -26,6 +26,7 @@
 #define PROG_EXECUTE_H
 
 #include "main/config.h"
+#include "main/mtypes.h"
 
 
 typedef void (*FetchTexelLodFunc)(GLcontext *ctx, const GLfloat texcoord[4],
diff --git a/src/mesa/program/prog_instruction.h b/src/mesa/program/prog_instruction.h
index 1d850c4d5d5..ca90de7ce1c 100644
--- a/src/mesa/program/prog_instruction.h
+++ b/src/mesa/program/prog_instruction.h
@@ -38,7 +38,7 @@
 #define PROG_INSTRUCTION_H
 
 
-#include "main/mfeatures.h"
+#include "main/glheader.h"
 
 
 /**
diff --git a/src/mesa/program/prog_noise.h b/src/mesa/program/prog_noise.h
index c4779479f9b..dd7986efcdb 100644
--- a/src/mesa/program/prog_noise.h
+++ b/src/mesa/program/prog_noise.h
@@ -25,6 +25,8 @@
 #ifndef PROG_NOISE
 #define PROG_NOISE
 
+#include "main/glheader.h"
+
 extern GLfloat _mesa_noise1(GLfloat);
 extern GLfloat _mesa_noise2(GLfloat, GLfloat);
 extern GLfloat _mesa_noise3(GLfloat, GLfloat, GLfloat);
diff --git a/src/mesa/program/prog_optimize.c b/src/mesa/program/prog_optimize.c
index 457ace14c6b..ab878755e25 100644
--- a/src/mesa/program/prog_optimize.c
+++ b/src/mesa/program/prog_optimize.c
@@ -43,40 +43,117 @@
 
 static GLboolean dbg = GL_FALSE;
 
-/* Returns the mask of channels read from the given srcreg in this instruction.
+#define NO_MASK 0xf
+
+/**
+ * Returns the mask of channels (bitmask of WRITEMASK_X,Y,Z,W) which
+ * are read from the given src in this instruction.  The caller may
+ * also pass an optional mask (dst_mask) that restricts which components
+ * of the dst register are considered; NO_MASK applies no restriction.
  */
 static GLuint
-get_src_arg_mask(const struct prog_instruction *inst, int arg)
+get_src_arg_mask(const struct prog_instruction *inst,
+                 GLuint arg, GLuint dst_mask)
 {
-   int writemask = inst->DstReg.WriteMask;
+   GLuint read_mask, channel_mask;
+   GLuint comp;
 
-   if (inst->CondUpdate)
-      writemask = WRITEMASK_XYZW;
+   ASSERT(arg < _mesa_num_inst_src_regs(inst->Opcode));
 
-   switch (inst->Opcode) {
-   case OPCODE_MOV:
-   case OPCODE_ABS:
-   case OPCODE_ADD:
-   case OPCODE_MUL:
-   case OPCODE_SUB:
-      return writemask;
-   case OPCODE_RCP:
-   case OPCODE_SIN:
-   case OPCODE_COS:
-   case OPCODE_RSQ:
-   case OPCODE_POW:
-   case OPCODE_EX2:
-      return WRITEMASK_X;
-   case OPCODE_DP2:
-      return WRITEMASK_XY;
-   case OPCODE_DP3:
-   case OPCODE_XPD:
-      return WRITEMASK_XYZ;
-   default:
-      return WRITEMASK_XYZW;
+   /* From the dst register, find the written channels */
+   if (inst->CondUpdate) {
+      channel_mask = WRITEMASK_XYZW;
+   }
+   else {
+      switch (inst->Opcode) {
+      case OPCODE_MOV:
+      case OPCODE_MIN:
+      case OPCODE_MAX:
+      case OPCODE_ABS:
+      case OPCODE_ADD:
+      case OPCODE_MAD:
+      case OPCODE_MUL:
+      case OPCODE_SUB:
+         channel_mask = inst->DstReg.WriteMask & dst_mask;
+         break;
+      case OPCODE_RCP:
+      case OPCODE_SIN:
+      case OPCODE_COS:
+      case OPCODE_RSQ:
+      case OPCODE_POW:
+      case OPCODE_EX2:
+      case OPCODE_LOG:
+         channel_mask = WRITEMASK_X;
+         break;
+      case OPCODE_DP2:
+         channel_mask = WRITEMASK_XY;
+         break;
+      case OPCODE_DP3:
+      case OPCODE_XPD:
+         channel_mask = WRITEMASK_XYZ;
+         break;
+      default:
+         channel_mask = WRITEMASK_XYZW;
+         break;
+      }
    }
+
+   /* Now, given the src swizzle and the written channels, find which
+    * components are actually read
+    */
+   read_mask = 0x0;
+   for (comp = 0; comp < 4; ++comp) {
+      const GLuint coord = GET_SWZ(inst->SrcReg[arg].Swizzle, comp);
+      ASSERT(coord < 4);
+      if (channel_mask & (1 << comp) && coord <= SWIZZLE_W)
+         read_mask |= 1 << coord;
+   }
+
+   return read_mask;
+}
+
+
+/**
+ * For a MOV instruction, compute a write mask when src register also has
+ * a mask
+ */
+static GLuint
+get_dst_mask_for_mov(const struct prog_instruction *mov, GLuint src_mask)
+{
+   const GLuint mask = mov->DstReg.WriteMask;
+   GLuint comp;
+   GLuint updated_mask = 0x0;
+
+   ASSERT(mov->Opcode == OPCODE_MOV);
+
+   for (comp = 0; comp < 4; ++comp) {
+      GLuint src_comp;
+      if ((mask & (1 << comp)) == 0)
+         continue;
+      src_comp = GET_SWZ(mov->SrcReg[0].Swizzle, comp);
+      if ((src_mask & (1 << src_comp)) == 0)
+         continue;
+      updated_mask |= 1 << comp;
+   }
+
+   return updated_mask;
+}
+
+
+/**
+ * Ensure that the swizzle is regular.  That is, all of the swizzle
+ * terms are SWIZZLE_X,Y,Z,W and not SWIZZLE_ZERO or SWIZZLE_ONE.
+ */
+static GLboolean
+is_swizzle_regular(GLuint swz)
+{
+   return GET_SWZ(swz,0) <= SWIZZLE_W &&
+          GET_SWZ(swz,1) <= SWIZZLE_W &&
+          GET_SWZ(swz,2) <= SWIZZLE_W &&
+          GET_SWZ(swz,3) <= SWIZZLE_W;
 }
 
+
 /**
  * In 'prog' remove instruction[i] if removeFlags[i] == TRUE.
  * \return number of instructions removed
@@ -153,82 +230,13 @@ replace_regs(struct gl_program *prog, gl_register_file file, const GLint map[])
 
 
 /**
- * Consolidate temporary registers to use low numbers.  For example, if the
- * shader only uses temps 4, 5, 8, replace them with 0, 1, 2.
- */
-static void
-_mesa_consolidate_registers(struct gl_program *prog)
-{
-   GLboolean tempUsed[REG_ALLOCATE_MAX_PROGRAM_TEMPS];
-   GLint tempMap[REG_ALLOCATE_MAX_PROGRAM_TEMPS];
-   GLuint tempMax = 0, i;
-
-   if (dbg) {
-      printf("Optimize: Begin register consolidation\n");
-   }
-
-   memset(tempUsed, 0, sizeof(tempUsed));
-
-   for (i = 0; i < REG_ALLOCATE_MAX_PROGRAM_TEMPS; i++) {
-      tempMap[i] = -1;
-   }
-
-   /* set tempUsed[i] if temporary [i] is referenced */
-   for (i = 0; i < prog->NumInstructions; i++) {
-      const struct prog_instruction *inst = prog->Instructions + i;
-      const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
-      GLuint j;
-      for (j = 0; j < numSrc; j++) {
-         if (inst->SrcReg[j].File == PROGRAM_TEMPORARY) {
-            const GLuint index = inst->SrcReg[j].Index;
-            ASSERT(index < REG_ALLOCATE_MAX_PROGRAM_TEMPS);
-            tempUsed[index] = GL_TRUE;
-            tempMax = MAX2(tempMax, index);
-            break;
-         }
-      }
-      if (inst->DstReg.File == PROGRAM_TEMPORARY) {
-         const GLuint index = inst->DstReg.Index;
-         ASSERT(index < REG_ALLOCATE_MAX_PROGRAM_TEMPS);
-         tempUsed[index] = GL_TRUE;
-         tempMax = MAX2(tempMax, index);
-      }
-   }
-
-   /* allocate a new index for each temp that's used */
-   {
-      GLuint freeTemp = 0;
-      for (i = 0; i <= tempMax; i++) {
-         if (tempUsed[i]) {
-            tempMap[i] = freeTemp++;
-            /*printf("replace %u with %u\n", i, tempMap[i]);*/
-         }
-      }
-      if (freeTemp == tempMax + 1) {
-         /* no consolidation possible */
-         return;
-      }         
-      if (dbg) {
-         printf("Replace regs 0..%u with 0..%u\n", tempMax, freeTemp-1);
-      }
-   }
-
-   replace_regs(prog, PROGRAM_TEMPORARY, tempMap);
-
-   if (dbg) {
-      printf("Optimize: End register consolidation\n");
-   }
-}
-
-
-/**
  * Remove dead instructions from the given program.
  * This is very primitive for now.  Basically look for temp registers
  * that are written to but never read.  Remove any instructions that
  * write to such registers.  Be careful with condition code setters.
  */
-static void
-_mesa_remove_dead_code(struct gl_program *prog)
+static GLboolean
+_mesa_remove_dead_code_global(struct gl_program *prog)
 {
    GLboolean tempRead[REG_ALLOCATE_MAX_PROGRAM_TEMPS][4];
    GLboolean *removeInst; /* per-instruction removal flag */
@@ -256,7 +264,7 @@ _mesa_remove_dead_code(struct gl_program *prog)
             const GLuint index = inst->SrcReg[j].Index;
             GLuint read_mask;
             ASSERT(index < REG_ALLOCATE_MAX_PROGRAM_TEMPS);
-	    read_mask = get_src_arg_mask(inst, j);
+	    read_mask = get_src_arg_mask(inst, j, NO_MASK);
 
             if (inst->SrcReg[j].RelAddr) {
                if (dbg)
@@ -265,25 +273,12 @@ _mesa_remove_dead_code(struct gl_program *prog)
             }
 
 	    for (comp = 0; comp < 4; comp++) {
-	       GLuint swz = (inst->SrcReg[j].Swizzle >> (3 * comp)) & 0x7;
-
-	       if ((read_mask & (1 << comp)) == 0)
+	       const GLuint swz = GET_SWZ(inst->SrcReg[j].Swizzle, comp);
+	       ASSERT(swz < 4);
+               if ((read_mask & (1 << swz)) == 0)
 		  continue;
-
-	       switch (swz) {
-	       case SWIZZLE_X:
-		  tempRead[index][0] = GL_TRUE;
-		  break;
-	       case SWIZZLE_Y:
-		  tempRead[index][1] = GL_TRUE;
-		  break;
-	       case SWIZZLE_Z:
-		  tempRead[index][2] = GL_TRUE;
-		  break;
-	       case SWIZZLE_W:
-		  tempRead[index][3] = GL_TRUE;
-		  break;
-	       }
+               if (swz <= SWIZZLE_W)
+                  tempRead[index][swz] = GL_TRUE;
 	    }
          }
       }
@@ -353,10 +348,11 @@ _mesa_remove_dead_code(struct gl_program *prog)
 
 done:
    free(removeInst);
+   return rem != 0;
 }
 
 
-enum temp_use
+enum inst_use
 {
    READ,
    WRITE,
@@ -364,13 +360,19 @@ enum temp_use
    END
 };
 
+
 /**
- * Scan forward in program from 'start' for the next occurance of TEMP[index].
+ * Scan forward in program from 'start' for the next occurrences of TEMP[index].
+ * We check whether an instruction reads the components given by 'mask' and
+ * whether those components are overwritten.
  * Return READ, WRITE, FLOW or END to indicate the next usage or an indicator
  * that we can't look further.
  */
-static enum temp_use
-find_next_temp_use(const struct gl_program *prog, GLuint start, GLuint index)
+static enum inst_use
+find_next_use(const struct gl_program *prog,
+              GLuint start,
+              GLuint index,
+              GLuint mask)
 {
    GLuint i;
 
@@ -378,30 +380,50 @@ find_next_temp_use(const struct gl_program *prog, GLuint start, GLuint index)
       const struct prog_instruction *inst = prog->Instructions + i;
       switch (inst->Opcode) {
       case OPCODE_BGNLOOP:
-      case OPCODE_ENDLOOP:
       case OPCODE_BGNSUB:
+      case OPCODE_BRA:
+      case OPCODE_CAL:
+      case OPCODE_CONT:
+      case OPCODE_IF:
+      case OPCODE_ELSE:
+      case OPCODE_ENDIF:
+      case OPCODE_ENDLOOP:
       case OPCODE_ENDSUB:
+      case OPCODE_RET:
          return FLOW;
+      case OPCODE_END:
+         return END;
       default:
          {
             const GLuint numSrc = _mesa_num_inst_src_regs(inst->Opcode);
             GLuint j;
             for (j = 0; j < numSrc; j++) {
-               if (inst->SrcReg[j].File == PROGRAM_TEMPORARY &&
-                   inst->SrcReg[j].Index == index)
+               if (inst->SrcReg[j].RelAddr ||
+                   (inst->SrcReg[j].File == PROGRAM_TEMPORARY &&
+                   inst->SrcReg[j].Index == index &&
+                   (get_src_arg_mask(inst,j,NO_MASK) & mask)))
                   return READ;
             }
-            if (inst->DstReg.File == PROGRAM_TEMPORARY &&
-                inst->DstReg.Index == index)
-               return WRITE;
+            if (_mesa_num_inst_dst_regs(inst->Opcode) == 1 &&
+                inst->DstReg.File == PROGRAM_TEMPORARY &&
+                inst->DstReg.Index == index) {
+               mask &= ~inst->DstReg.WriteMask;
+               if (mask == 0)
+                  return WRITE;
+            }
          }
       }
    }
-
    return END;
 }
 
-static GLboolean _mesa_is_flow_control_opcode(enum prog_opcode opcode)
+
+/**
+ * Is the given instruction opcode a flow-control opcode?
+ * XXX maybe move this into prog_instruction.[ch]
+ */
+static GLboolean
+_mesa_is_flow_control_opcode(enum prog_opcode opcode)
 {
    switch (opcode) {
    case OPCODE_BGNLOOP:
@@ -422,6 +444,37 @@ static GLboolean _mesa_is_flow_control_opcode(enum prog_opcode opcode)
    }
 }
 
+
+/**
+ * Test if the given instruction is a simple MOV (no conditional updating,
+ * no relative addressing, no negation/abs, etc).
+ */
+static GLboolean
+can_downward_mov_be_modifed(const struct prog_instruction *mov)
+{
+   return
+      mov->Opcode == OPCODE_MOV &&
+      mov->CondUpdate == GL_FALSE &&
+      mov->SrcReg[0].RelAddr == 0 &&
+      mov->SrcReg[0].Negate == 0 &&
+      mov->SrcReg[0].Abs == 0 &&
+      mov->SrcReg[0].HasIndex2 == 0 &&
+      mov->SrcReg[0].RelAddr2 == 0 &&
+      mov->DstReg.RelAddr == 0 &&
+      mov->DstReg.CondMask == COND_TR &&
+      mov->SaturateMode == SATURATE_OFF;
+}
+
+
+static GLboolean
+can_upward_mov_be_modifed(const struct prog_instruction *mov)
+{
+   return
+      can_downward_mov_be_modifed(mov) &&
+      mov->DstReg.File == PROGRAM_TEMPORARY;
+}
+
+
 /**
  * Try to remove use of extraneous MOV instructions, to free them up for dead
  * code removal.
@@ -449,14 +502,15 @@ _mesa_remove_extra_move_use(struct gl_program *prog)
 
    for (i = 0; i + 1 < prog->NumInstructions; i++) {
       const struct prog_instruction *mov = prog->Instructions + i;
+      GLuint dst_mask, src_mask;
+      if (can_upward_mov_be_modifed(mov) == GL_FALSE)
+         continue;
 
-      if (mov->Opcode != OPCODE_MOV ||
-	  mov->DstReg.File != PROGRAM_TEMPORARY ||
-	  mov->DstReg.RelAddr ||
-	  mov->DstReg.CondMask != COND_TR ||
-	  mov->SaturateMode != SATURATE_OFF ||
-	  mov->SrcReg[0].RelAddr)
-	 continue;
+      /* Scanning the code, we maintain the components which are still active in
+       * these two masks
+       */
+      dst_mask = mov->DstReg.WriteMask;
+      src_mask = get_src_arg_mask(mov, 0, NO_MASK);
 
       /* Walk through remaining instructions until the or src reg gets
        * rewritten or we get into some flow-control, eliminating the use of
@@ -464,61 +518,60 @@ _mesa_remove_extra_move_use(struct gl_program *prog)
        */
       for (j = i + 1; j < prog->NumInstructions; j++) {
 	 struct prog_instruction *inst2 = prog->Instructions + j;
-	 GLuint arg;
+         GLuint arg;
 
 	 if (_mesa_is_flow_control_opcode(inst2->Opcode))
 	     break;
 
 	 /* First rewrite this instruction's args if appropriate. */
 	 for (arg = 0; arg < _mesa_num_inst_src_regs(inst2->Opcode); arg++) {
-	    int comp;
-	    int read_mask = get_src_arg_mask(inst2, arg);
+	    GLuint comp, read_mask;
 
 	    if (inst2->SrcReg[arg].File != mov->DstReg.File ||
 		inst2->SrcReg[arg].Index != mov->DstReg.Index ||
 		inst2->SrcReg[arg].RelAddr ||
 		inst2->SrcReg[arg].Abs)
 	       continue;
+            read_mask = get_src_arg_mask(inst2, arg, NO_MASK);
 
-	    /* Check that all the sources for this arg of inst2 come from inst1
-	     * or constants.
-	     */
-	    for (comp = 0; comp < 4; comp++) {
-	       int src_swz = GET_SWZ(inst2->SrcReg[arg].Swizzle, comp);
-
-	       /* If the MOV didn't write that channel, can't use it. */
-	       if ((read_mask & (1 << comp)) &&
-		   src_swz <= SWIZZLE_W &&
-		   (mov->DstReg.WriteMask & (1 << src_swz)) == 0)
-		  break;
-	    }
-	    if (comp != 4)
-	       continue;
-
-	    /* Adjust the swizzles of inst2 to point at MOV's source */
-	    for (comp = 0; comp < 4; comp++) {
-	       int inst2_swz = GET_SWZ(inst2->SrcReg[arg].Swizzle, comp);
-
-	       if (inst2_swz <= SWIZZLE_W) {
-		  GLuint s = GET_SWZ(mov->SrcReg[0].Swizzle, inst2_swz);
-		  inst2->SrcReg[arg].Swizzle &= ~(7 << (3 * comp));
-		  inst2->SrcReg[arg].Swizzle |= s << (3 * comp);
-		  inst2->SrcReg[arg].Negate ^= (((mov->SrcReg[0].Negate >>
-						  inst2_swz) & 0x1) << comp);
-	       }
-	    }
-	    inst2->SrcReg[arg].File = mov->SrcReg[0].File;
-	    inst2->SrcReg[arg].Index = mov->SrcReg[0].Index;
+	    /* Adjust the swizzles of inst2 to point at MOV's source if ALL the
+             * components read still come from the mov instructions
+             */
+            if (is_swizzle_regular(inst2->SrcReg[arg].Swizzle) &&
+               (read_mask & dst_mask) == read_mask) {
+               for (comp = 0; comp < 4; comp++) {
+                  const GLuint inst2_swz =
+                     GET_SWZ(inst2->SrcReg[arg].Swizzle, comp);
+                  const GLuint s = GET_SWZ(mov->SrcReg[0].Swizzle, inst2_swz);
+                  inst2->SrcReg[arg].Swizzle &= ~(7 << (3 * comp));
+                  inst2->SrcReg[arg].Swizzle |= s << (3 * comp);
+                  inst2->SrcReg[arg].Negate ^= (((mov->SrcReg[0].Negate >>
+                                                  inst2_swz) & 0x1) << comp);
+               }
+               inst2->SrcReg[arg].File = mov->SrcReg[0].File;
+               inst2->SrcReg[arg].Index = mov->SrcReg[0].Index;
+            }
 	 }
 
-	 /* If this instruction overwrote part of the move, our time is up. */
-	 if ((inst2->DstReg.File == mov->DstReg.File &&
-	      (inst2->DstReg.RelAddr ||
-	       inst2->DstReg.Index == mov->DstReg.Index)) ||
-	     (inst2->DstReg.File == mov->SrcReg[0].File &&
-	      (inst2->DstReg.RelAddr ||
-	       inst2->DstReg.Index == mov->SrcReg[0].Index)))
-	    break;
+	 /* The destination of MOV is written. This potentially deactivates some
+          * components from the src and dst of the MOV instruction
+          */
+	 if (inst2->DstReg.File == mov->DstReg.File &&
+	     (inst2->DstReg.RelAddr ||
+	      inst2->DstReg.Index == mov->DstReg.Index)) {
+            dst_mask &= ~inst2->DstReg.WriteMask;
+            src_mask = get_src_arg_mask(mov, 0, dst_mask);
+         }
+
+         /* The source of MOV is written: deactivate the affected components */
+	 if (inst2->DstReg.File == mov->SrcReg[0].File &&
+	     (inst2->DstReg.RelAddr ||
+	      inst2->DstReg.Index == mov->SrcReg[0].Index)) {
+            src_mask &= ~inst2->DstReg.WriteMask;
+            dst_mask &= get_dst_mask_for_mov(mov, src_mask);
+         }
+         if (dst_mask == 0)
+            break;
       }
    }
 
@@ -528,14 +581,151 @@ _mesa_remove_extra_move_use(struct gl_program *prog)
    }
 }
 
+
+/**
+ * Complements dead_code_global. Try to remove code in block of code by
+ * carefully monitoring the swizzles. Both functions should be merged into one
+ * with a proper control flow graph
+ */
+static GLboolean
+_mesa_remove_dead_code_local(struct gl_program *prog)
+{
+   GLboolean *removeInst;
+   GLuint i, arg, rem = 0;
+
+   removeInst = (GLboolean *)
+      calloc(1, prog->NumInstructions * sizeof(GLboolean));
+
+   for (i = 0; i < prog->NumInstructions; i++) {
+      const struct prog_instruction *inst = prog->Instructions + i;
+      const GLuint index = inst->DstReg.Index;
+      const GLuint mask = inst->DstReg.WriteMask;
+      enum inst_use use;
+
+      /* We must deactivate the pass as soon as some indirection is used */
+      if (inst->DstReg.RelAddr)
+         goto done;
+      for (arg = 0; arg < _mesa_num_inst_src_regs(inst->Opcode); arg++)
+         if (inst->SrcReg[arg].RelAddr)
+            goto done;
+
+      if (_mesa_is_flow_control_opcode(inst->Opcode) ||
+          _mesa_num_inst_dst_regs(inst->Opcode) == 0 ||
+          inst->DstReg.File != PROGRAM_TEMPORARY ||
+          inst->DstReg.RelAddr)
+         continue;
+
+      use = find_next_use(prog, i+1, index, mask);
+      if (use == WRITE || use == END)
+         removeInst[i] = GL_TRUE;
+   }
+
+   rem = remove_instructions(prog, removeInst);
+
+done:
+   free(removeInst);
+   return rem != 0;
+}
+
+
+/**
+ * Try to inject the destination of mov as the destination of inst and recompute
+ * the swizzles operators for the sources of inst if required. Return GL_TRUE
+ * if the substitution was possible, GL_FALSE otherwise
+ */
+static GLboolean
+_mesa_merge_mov_into_inst(struct prog_instruction *inst,
+                          const struct prog_instruction *mov)
+{
+   /* Indirection table which associates destination and source components for
+    * the mov instruction
+    */
+   const GLuint mask = get_src_arg_mask(mov, 0, NO_MASK);
+
+   /* Some components are not written by inst. We cannot remove the mov */
+   if (mask != (inst->DstReg.WriteMask & mask))
+      return GL_FALSE;
+
+   /* Depending on the instruction, we may need to recompute the swizzles.
+    * Also, some other instructions (like TEX) are not linear. We will only
+    * consider completely active sources and destinations
+    */
+   switch (inst->Opcode) {
+
+   /* Cartesian instructions: we compute the swizzle */
+   case OPCODE_MOV:
+   case OPCODE_MIN:
+   case OPCODE_MAX:
+   case OPCODE_ABS:
+   case OPCODE_ADD:
+   case OPCODE_MAD:
+   case OPCODE_MUL:
+   case OPCODE_SUB:
+   {
+      GLuint dst_to_src_comp[4] = {0,0,0,0};
+      GLuint dst_comp, arg;
+      for (dst_comp = 0; dst_comp < 4; ++dst_comp) {
+         if (mov->DstReg.WriteMask & (1 << dst_comp)) {
+            const GLuint src_comp = GET_SWZ(mov->SrcReg[0].Swizzle, dst_comp);
+            ASSERT(src_comp < 4);
+            dst_to_src_comp[dst_comp] = src_comp;
+         }
+      }
+
+      /* Patch each source of the instruction */
+      for (arg = 0; arg < _mesa_num_inst_src_regs(inst->Opcode); arg++) {
+         const GLuint arg_swz = inst->SrcReg[arg].Swizzle;
+         inst->SrcReg[arg].Swizzle = 0;
+
+         /* Reset each active component of the swizzle */
+         for (dst_comp = 0; dst_comp < 4; ++dst_comp) {
+            GLuint src_comp, arg_comp;
+            if ((mov->DstReg.WriteMask & (1 << dst_comp)) == 0)
+               continue;
+            src_comp = dst_to_src_comp[dst_comp];
+            ASSERT(src_comp < 4);
+            arg_comp = GET_SWZ(arg_swz, src_comp);
+            ASSERT(arg_comp < 4);
+            inst->SrcReg[arg].Swizzle |= arg_comp << (3*dst_comp);
+         }
+      }
+      inst->DstReg = mov->DstReg;
+      return GL_TRUE;
+   }
+
+   /* Dot products and scalar instructions: we only change the destination */
+   case OPCODE_RCP:
+   case OPCODE_SIN:
+   case OPCODE_COS:
+   case OPCODE_RSQ:
+   case OPCODE_POW:
+   case OPCODE_EX2:
+   case OPCODE_LOG:
+   case OPCODE_DP2:
+   case OPCODE_DP3:
+   case OPCODE_DP4:
+      inst->DstReg = mov->DstReg;
+      return GL_TRUE;
+
+   /* All other instructions require fully active components with no swizzle */
+   default:
+      if (mov->SrcReg[0].Swizzle != SWIZZLE_XYZW ||
+          inst->DstReg.WriteMask != WRITEMASK_XYZW)
+         return GL_FALSE;
+      inst->DstReg = mov->DstReg;
+      return GL_TRUE;
+   }
+}
+
+
 /**
  * Try to remove extraneous MOV instructions from the given program.
  */
-static void
+static GLboolean
 _mesa_remove_extra_moves(struct gl_program *prog)
 {
    GLboolean *removeInst; /* per-instruction removal flag */
-   GLuint i, rem, loopNesting = 0, subroutineNesting = 0;
+   GLuint i, rem = 0, nesting = 0;
 
    if (dbg) {
       printf("Optimize: Begin remove extra moves\n");
@@ -554,29 +744,24 @@ _mesa_remove_extra_moves(struct gl_program *prog)
     */
 
    for (i = 0; i < prog->NumInstructions; i++) {
-      const struct prog_instruction *inst = prog->Instructions + i;
+      const struct prog_instruction *mov = prog->Instructions + i;
 
-      switch (inst->Opcode) {
+      switch (mov->Opcode) {
       case OPCODE_BGNLOOP:
-         loopNesting++;
-         break;
-      case OPCODE_ENDLOOP:
-         loopNesting--;
-         break;
       case OPCODE_BGNSUB:
-         subroutineNesting++;
+      case OPCODE_IF:
+         nesting++;
          break;
+      case OPCODE_ENDLOOP:
       case OPCODE_ENDSUB:
-         subroutineNesting--;
+      case OPCODE_ENDIF:
+         nesting--;
          break;
       case OPCODE_MOV:
-         if (i > 0 &&
-             loopNesting == 0 &&
-             subroutineNesting == 0 &&
-             inst->SrcReg[0].File == PROGRAM_TEMPORARY &&
-             inst->SrcReg[0].Swizzle == SWIZZLE_XYZW) {
+         if (i > 0 && can_downward_mov_be_modifed(mov) && nesting == 0) {
+
             /* see if this MOV can be removed */
-            const GLuint tempIndex = inst->SrcReg[0].Index;
+            const GLuint id = mov->SrcReg[0].Index;
             struct prog_instruction *prevInst;
             GLuint prevI;
 
@@ -587,11 +772,13 @@ _mesa_remove_extra_moves(struct gl_program *prog)
             prevInst = prog->Instructions + prevI;
 
             if (prevInst->DstReg.File == PROGRAM_TEMPORARY &&
-                prevInst->DstReg.Index == tempIndex &&
-                prevInst->DstReg.WriteMask == WRITEMASK_XYZW) {
+                prevInst->DstReg.Index == id &&
+                prevInst->DstReg.RelAddr == 0 &&
+                prevInst->DstReg.CondSrc == 0 && 
+                prevInst->DstReg.CondMask == COND_TR) {
 
-               enum temp_use next_use =
-                  find_next_temp_use(prog, i + 1, tempIndex);
+               const GLuint dst_mask = prevInst->DstReg.WriteMask;
+               enum inst_use next_use = find_next_use(prog, i+1, id, dst_mask);
 
                if (next_use == WRITE || next_use == END) {
                   /* OK, we can safely remove this MOV instruction.
@@ -601,18 +788,13 @@ _mesa_remove_extra_moves(struct gl_program *prog)
                    * Into:
                    *   prevI: FOO z, x, y;
                    */
-
-                  /* patch up prev inst */
-                  prevInst->DstReg.File = inst->DstReg.File;
-                  prevInst->DstReg.Index = inst->DstReg.Index;
-
-                  /* flag this instruction for removal */
-                  removeInst[i] = GL_TRUE;
-
-                  if (dbg) {
-                     printf("Remove MOV at %u\n", i);
-                     printf("new prev inst %u: ", prevI);
-                     _mesa_print_instruction(prevInst);
+                  if (_mesa_merge_mov_into_inst(prevInst, mov)) {
+                     removeInst[i] = GL_TRUE;
+                     if (dbg) {
+                        printf("Remove MOV at %u\n", i);
+                        printf("new prev inst %u: ", prevI);
+                        _mesa_print_instruction(prevInst);
+                     }
                   }
                }
             }
@@ -632,6 +814,8 @@ _mesa_remove_extra_moves(struct gl_program *prog)
       printf("Optimize: End remove extra moves.  %u instructions removed\n", rem);
       /*_mesa_print_program(prog);*/
    }
+
+   return rem != 0;
 }
 
 
@@ -718,6 +902,7 @@ compare_start(const void *a, const void *b)
       return 0;
 }
 
+
 /** sort the interval list according to interval starts */
 static void
 sort_interval_list_by_start(struct interval_list *list)
@@ -1025,6 +1210,17 @@ _mesa_reallocate_registers(struct gl_program *prog)
 }
 
 
+#if 0
+static void
+print_it(GLcontext *ctx, struct gl_program *program, const char *txt) {
+   fprintf(stderr, "%s (%u inst):\n", txt, program->NumInstructions);
+   _mesa_print_program(program);
+   _mesa_print_program_parameters(ctx, program);
+   fprintf(stderr, "\n\n");
+}
+#endif
+
+
 /**
  * Apply optimizations to the given program to eliminate unnecessary
  * instructions, temp regs, etc.
@@ -1032,16 +1228,19 @@ _mesa_reallocate_registers(struct gl_program *prog)
 void
 _mesa_optimize_program(GLcontext *ctx, struct gl_program *program)
 {
-   _mesa_remove_extra_move_use(program);
-
-   if (1)
-      _mesa_remove_dead_code(program);
-
-   if (0) /* not tested much yet */
-      _mesa_remove_extra_moves(program);
-
-   if (0)
-      _mesa_consolidate_registers(program);
-   else
+   GLboolean any_change;
+
+   /* Repeat the passes until none of them reports a change */
+   do {
+      any_change = GL_FALSE;
+      _mesa_remove_extra_move_use(program);
+      if (_mesa_remove_dead_code_global(program))
+         any_change = GL_TRUE;
+      if (_mesa_remove_extra_moves(program))
+         any_change = GL_TRUE;
+      if (_mesa_remove_dead_code_local(program))
+         any_change = GL_TRUE;
       _mesa_reallocate_registers(program);
+   } while (any_change);
 }
+
diff --git a/src/mesa/program/prog_optimize.h b/src/mesa/program/prog_optimize.h
index 43894a27237..06cd9cb2c20 100644
--- a/src/mesa/program/prog_optimize.h
+++ b/src/mesa/program/prog_optimize.h
@@ -27,6 +27,7 @@
 
 
 #include "main/config.h"
+#include "main/mtypes.h"
 
 
 struct gl_program;
diff --git a/src/mesa/program/prog_print.h b/src/mesa/program/prog_print.h
index 4ffd5ab96c6..78b90aeb4d6 100644
--- a/src/mesa/program/prog_print.h
+++ b/src/mesa/program/prog_print.h
@@ -26,6 +26,16 @@
 #ifndef PROG_PRINT_H
 #define PROG_PRINT_H
 
+#include <stdio.h>
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+
+struct gl_program;
+struct gl_program_parameter_list;
+struct gl_shader;
+struct prog_instruction;
+
 
 /**
  * The output style to use when printing programs.
diff --git a/src/mesa/program/prog_uniform.h b/src/mesa/program/prog_uniform.h
index a671d30bfe8..7988d534a7d 100644
--- a/src/mesa/program/prog_uniform.h
+++ b/src/mesa/program/prog_uniform.h
@@ -31,8 +31,7 @@
 #ifndef PROG_UNIFORM_H
 #define PROG_UNIFORM_H
 
-#include "main/mtypes.h"
-#include "prog_statevars.h"
+#include "main/glheader.h"
 
 
 /**
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index d99584d63bf..3b6d6827446 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -55,13 +55,21 @@ _mesa_init_program(GLcontext *ctx)
 
    /*
     * If this assertion fails, we need to increase the field
-    * size for register indexes.
+    * size for register indexes (see INST_INDEX_BITS).
     */
    ASSERT(ctx->Const.VertexProgram.MaxUniformComponents / 4
           <= (1 << INST_INDEX_BITS));
    ASSERT(ctx->Const.FragmentProgram.MaxUniformComponents / 4
           <= (1 << INST_INDEX_BITS));
 
+   ASSERT(ctx->Const.VertexProgram.MaxTemps <= (1 << INST_INDEX_BITS));
+   ASSERT(ctx->Const.VertexProgram.MaxLocalParams <= (1 << INST_INDEX_BITS));
+   ASSERT(ctx->Const.FragmentProgram.MaxTemps <= (1 << INST_INDEX_BITS));
+   ASSERT(ctx->Const.FragmentProgram.MaxLocalParams <= (1 << INST_INDEX_BITS));
+
+   ASSERT(ctx->Const.VertexProgram.MaxUniformComponents <= 4 * MAX_UNIFORMS);
+   ASSERT(ctx->Const.FragmentProgram.MaxUniformComponents <= 4 * MAX_UNIFORMS);
+
    /* If this fails, increase prog_instruction::TexSrcUnit size */
    ASSERT(MAX_TEXTURE_UNITS < (1 << 5));
 
diff --git a/src/mesa/program/programopt.h b/src/mesa/program/programopt.h
index 21fac07849a..4af6357f976 100644
--- a/src/mesa/program/programopt.h
+++ b/src/mesa/program/programopt.h
@@ -26,6 +26,7 @@
 #ifndef PROGRAMOPT_H
 #define PROGRAMOPT_H 1
 
+#include "main/mtypes.h"
 
 extern void
 _mesa_insert_mvp_code(GLcontext *ctx, struct gl_vertex_program *vprog);
diff --git a/src/mesa/slang/slang_builtin.c b/src/mesa/slang/slang_builtin.c
index a7e0efcb7b5..179571fab42 100644
--- a/src/mesa/slang/slang_builtin.c
+++ b/src/mesa/slang/slang_builtin.c
@@ -35,8 +35,10 @@
 #include "program/prog_instruction.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
-#include "slang/slang_ir.h"
 #include "slang/slang_builtin.h"
+#include "slang/slang_compile_struct.h"
+#include "slang/slang_ir.h"
+#include "slang/slang_typeinfo.h"
 
 
 /** special state token (see below) */
diff --git a/src/mesa/slang/slang_builtin.h b/src/mesa/slang/slang_builtin.h
index ed9ae80b3c3..dc92f83f8ef 100644
--- a/src/mesa/slang/slang_builtin.h
+++ b/src/mesa/slang/slang_builtin.h
@@ -26,8 +26,8 @@
 #ifndef SLANG_BUILTIN_H
 #define SLANG_BUILTIN_H
 
-#include "program/prog_parameter.h"
-#include "slang_utility.h"
+#include "main/glheader.h"
+#include "main/mtypes.h"
 #include "slang_ir.h"
 
 
diff --git a/src/mesa/slang/slang_codegen.h b/src/mesa/slang/slang_codegen.h
index 461633fe346..376a8cc2647 100644
--- a/src/mesa/slang/slang_codegen.h
+++ b/src/mesa/slang/slang_codegen.h
@@ -27,9 +27,14 @@
 #define SLANG_CODEGEN_H
 
 
-#include "main/imports.h"
+#include "main/glheader.h"
 #include "slang_compile.h"
+#include "slang_compile_variable.h"
+#include "slang_typeinfo.h"
+#include "slang_utility.h"
+#include "slang_vartable.h"
 
+struct slang_function_;
 
 #define MAX_LOOP_DEPTH 30
 
diff --git a/src/mesa/slang/slang_compile.c b/src/mesa/slang/slang_compile.c
index 12ab4666aed..de1bb56cd9a 100644
--- a/src/mesa/slang/slang_compile.c
+++ b/src/mesa/slang/slang_compile.c
@@ -36,6 +36,7 @@
 #include "program/prog_print.h"
 #include "program/prog_parameter.h"
 #include "../../glsl/pp/sl_pp_public.h"
+#include "../../glsl/pp/sl_pp_purify.h"
 #include "../../glsl/cl/sl_cl_parse.h"
 #include "slang_codegen.h"
 #include "slang_compile.h"
diff --git a/src/mesa/slang/slang_compile.h b/src/mesa/slang/slang_compile.h
index 71fcaa39931..6061f878e75 100644
--- a/src/mesa/slang/slang_compile.h
+++ b/src/mesa/slang/slang_compile.h
@@ -25,13 +25,14 @@
 #if !defined SLANG_COMPILE_H
 #define SLANG_COMPILE_H
 
-#include "main/imports.h"
+#include "main/glheader.h"
 #include "main/mtypes.h"
-#include "slang_typeinfo.h"
-#include "slang_compile_variable.h"
-#include "slang_compile_struct.h"
-#include "slang_compile_operation.h"
 #include "slang_compile_function.h"
+#include "slang_compile_struct.h"
+#include "slang_compile_variable.h"
+#include "slang_utility.h"
+
+struct slang_code_object_;
 
 #if defined __cplusplus
 extern "C" {
diff --git a/src/mesa/slang/slang_compile_function.h b/src/mesa/slang/slang_compile_function.h
index a5445ec2537..0eced3ca1a1 100644
--- a/src/mesa/slang/slang_compile_function.h
+++ b/src/mesa/slang/slang_compile_function.h
@@ -25,6 +25,14 @@
 #ifndef SLANG_COMPILE_FUNCTION_H
 #define SLANG_COMPILE_FUNCTION_H
 
+#include "main/glheader.h"
+#include "slang_compile_operation.h"
+#include "slang_compile_variable.h"
+#include "slang_log.h"
+#include "slang_utility.h"
+
+struct slang_name_space_;
+struct slang_operation_;
 
 /**
  * Types of functions.
diff --git a/src/mesa/slang/slang_compile_operation.h b/src/mesa/slang/slang_compile_operation.h
index 1f15c198963..b32573e0224 100644
--- a/src/mesa/slang/slang_compile_operation.h
+++ b/src/mesa/slang/slang_compile_operation.h
@@ -26,6 +26,11 @@
 #define SLANG_COMPILE_OPERATION_H
 
 
+#include "main/compiler.h"
+#include "main/glheader.h"
+#include "slang_compile_variable.h"
+#include "slang_utility.h"
+
 /**
  * Types of slang operations.
  * These are the types of the AST (abstract syntax tree) nodes.
diff --git a/src/mesa/slang/slang_compile_struct.h b/src/mesa/slang/slang_compile_struct.h
index 90c5512f4d3..7be6f204e11 100644
--- a/src/mesa/slang/slang_compile_struct.h
+++ b/src/mesa/slang/slang_compile_struct.h
@@ -29,6 +29,9 @@
 extern "C" {
 #endif
 
+#include "main/glheader.h"
+#include "slang_utility.h"
+
 struct slang_function_;
 
 typedef struct slang_struct_scope_
diff --git a/src/mesa/slang/slang_compile_variable.h b/src/mesa/slang/slang_compile_variable.h
index 5c9d248b354..48dc6efca4b 100644
--- a/src/mesa/slang/slang_compile_variable.h
+++ b/src/mesa/slang/slang_compile_variable.h
@@ -26,7 +26,9 @@
 #define SLANG_COMPILE_VARIABLE_H
 
 
-struct slang_ir_storage_;
+#include "main/glheader.h"
+#include "slang_typeinfo.h"
+#include "slang_utility.h"
 
 
 /**
diff --git a/src/mesa/slang/slang_emit.h b/src/mesa/slang/slang_emit.h
index ab4c202d673..f93d6b00d69 100644
--- a/src/mesa/slang/slang_emit.h
+++ b/src/mesa/slang/slang_emit.h
@@ -25,11 +25,9 @@
 #ifndef SLANG_EMIT_H
 #define SLANG_EMIT_H
 
-
-#include "main/imports.h"
-#include "slang_compile.h"
+#include "main/glheader.h"
 #include "slang_ir.h"
-#include "main/mtypes.h"
+#include "slang_vartable.h"
 
 
 extern GLuint
diff --git a/src/mesa/slang/slang_ir.h b/src/mesa/slang/slang_ir.h
index b7a373746b4..a010efcb342 100644
--- a/src/mesa/slang/slang_ir.h
+++ b/src/mesa/slang/slang_ir.h
@@ -33,10 +33,11 @@
 #define SLANG_IR_H
 
 
-#include "main/imports.h"
-#include "slang_compile.h"
-#include "slang_label.h"
+#include "main/glheader.h"
 #include "main/mtypes.h"
+#include "program/prog_instruction.h"
+#include "slang_compile_variable.h"
+#include "slang_label.h"
 
 
 /**
diff --git a/src/mesa/slang/slang_label.c b/src/mesa/slang/slang_label.c
index 8e3a8ebc1aa..a1611398008 100644
--- a/src/mesa/slang/slang_label.c
+++ b/src/mesa/slang/slang_label.c
@@ -7,6 +7,9 @@
  */
 
 
+#include "main/imports.h"
+#include "main/mtypes.h"
+#include "program/prog_instruction.h"
 #include "slang_label.h"
 #include "slang_mem.h"
 
diff --git a/src/mesa/slang/slang_label.h b/src/mesa/slang/slang_label.h
index 4d04df18d25..b0cff3a8e89 100644
--- a/src/mesa/slang/slang_label.h
+++ b/src/mesa/slang/slang_label.h
@@ -1,10 +1,9 @@
 #ifndef SLANG_LABEL_H
 #define SLANG_LABEL_H 1
 
-#include "main/imports.h"
-#include "main/mtypes.h"
-#include "program/prog_instruction.h"
+#include "main/glheader.h"
 
+struct gl_program;
 
 struct slang_label_
 {
diff --git a/src/mesa/slang/slang_link.c b/src/mesa/slang/slang_link.c
index 94db6f918d7..28ad0b74e8c 100644
--- a/src/mesa/slang/slang_link.c
+++ b/src/mesa/slang/slang_link.c
@@ -42,6 +42,7 @@
 #include "program/prog_statevars.h"
 #include "program/prog_uniform.h"
 #include "slang_builtin.h"
+#include "slang_compile.h"
 #include "slang_link.h"
 
 
@@ -1206,11 +1207,11 @@ _slang_link(GLcontext *ctx,
       vertNotify = ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_PROGRAM_ARB,
                                                  &shProg->FragmentProgram->Base);
       if (ctx->Shader.Flags & GLSL_DUMP) {
-         printf("Mesa pre-link fragment program:\n");
+         fprintf(stderr, "Mesa pre-link fragment program:\n");
          _mesa_print_program(&fragProg->Base);
          _mesa_print_program_parameters(ctx, &fragProg->Base);
 
-         printf("Mesa post-link fragment program:\n");
+         fprintf(stderr, "Mesa post-link fragment program:\n");
          _mesa_print_program(&shProg->FragmentProgram->Base);
          _mesa_print_program_parameters(ctx, &shProg->FragmentProgram->Base);
       }
@@ -1229,11 +1230,11 @@ _slang_link(GLcontext *ctx,
       geomNotify = ctx->Driver.ProgramStringNotify(ctx, MESA_GEOMETRY_PROGRAM,
                                                    &shProg->GeometryProgram->Base);
       if (ctx->Shader.Flags & GLSL_DUMP) {
-         printf("Mesa pre-link geometry program:\n");
+         fprintf(stderr, "Mesa pre-link geometry program:\n");
          _mesa_print_program(&geomProg->Base);
          _mesa_print_program_parameters(ctx, &geomProg->Base);
 
-         printf("Mesa post-link geometry program:\n");
+         fprintf(stderr, "Mesa post-link geometry program:\n");
          _mesa_print_program(&shProg->GeometryProgram->Base);
          _mesa_print_program_parameters(ctx, &shProg->GeometryProgram->Base);
       }
@@ -1247,11 +1248,11 @@ _slang_link(GLcontext *ctx,
       fragNotify = ctx->Driver.ProgramStringNotify(ctx, GL_VERTEX_PROGRAM_ARB,
                                                    &shProg->VertexProgram->Base);
       if (ctx->Shader.Flags & GLSL_DUMP) {
-         printf("Mesa pre-link vertex program:\n");
+         fprintf(stderr, "Mesa pre-link vertex program:\n");
          _mesa_print_program(&vertProg->Base);
          _mesa_print_program_parameters(ctx, &vertProg->Base);
 
-         printf("Mesa post-link vertex program:\n");
+         fprintf(stderr, "Mesa post-link vertex program:\n");
          _mesa_print_program(&shProg->VertexProgram->Base);
          _mesa_print_program_parameters(ctx, &shProg->VertexProgram->Base);
       }
@@ -1266,10 +1267,10 @@ _slang_link(GLcontext *ctx,
    }
 
    if (ctx->Shader.Flags & GLSL_DUMP) {
-      printf("Varying vars:\n");
+      fprintf(stderr, "Varying vars:\n");
       _mesa_print_parameter_list(shProg->Varying);
       if (shProg->InfoLog) {
-         printf("Info Log: %s\n", shProg->InfoLog);
+         fprintf(stderr, "Info Log: %s\n", shProg->InfoLog);
       }
    }
 
diff --git a/src/mesa/slang/slang_link.h b/src/mesa/slang/slang_link.h
index 2b44d20787a..3e9fa2d743d 100644
--- a/src/mesa/slang/slang_link.h
+++ b/src/mesa/slang/slang_link.h
@@ -25,7 +25,7 @@
 #ifndef SLANG_LINK_H
 #define SLANG_LINK_H 1
 
-#include "slang_compile.h"
+#include "main/mtypes.h"
 
 
 extern void
diff --git a/src/mesa/slang/slang_log.h b/src/mesa/slang/slang_log.h
index dcaba0285a7..544a26654e7 100644
--- a/src/mesa/slang/slang_log.h
+++ b/src/mesa/slang/slang_log.h
@@ -27,6 +27,8 @@
 #define SLANG_LOG_H
 
 
+#include "main/glheader.h"
+
 typedef struct slang_info_log_
 {
    char *text;
diff --git a/src/mesa/slang/slang_mem.h b/src/mesa/slang/slang_mem.h
index b5bfae24791..0f06df3c0c0 100644
--- a/src/mesa/slang/slang_mem.h
+++ b/src/mesa/slang/slang_mem.h
@@ -27,7 +27,7 @@
 #define SLANG_MEM_H
 
 
-#include "main/imports.h"
+#include "main/glheader.h"
 
 
 typedef struct slang_mempool_ slang_mempool;
diff --git a/src/mesa/slang/slang_print.h b/src/mesa/slang/slang_print.h
index 46605c80610..99da3041437 100644
--- a/src/mesa/slang/slang_print.h
+++ b/src/mesa/slang/slang_print.h
@@ -3,6 +3,12 @@
 #ifndef SLANG_PRINT
 #define SLANG_PRINT
 
+#include "main/glheader.h"
+#include "slang_compile_function.h"
+#include "slang_compile_operation.h"
+#include "slang_compile_variable.h"
+#include "slang_typeinfo.h"
+
 extern void
 slang_print_function(const slang_function *f, GLboolean body);
 
diff --git a/src/mesa/slang/slang_simplify.h b/src/mesa/slang/slang_simplify.h
index 8689c23b1a0..37fb938d4fb 100644
--- a/src/mesa/slang/slang_simplify.h
+++ b/src/mesa/slang/slang_simplify.h
@@ -26,6 +26,13 @@
 #define SLANG_SIMPLIFY_H
 
 
+#include "main/glheader.h"
+#include "slang_compile.h"
+#include "slang_compile_function.h"
+#include "slang_compile_operation.h"
+#include "slang_log.h"
+#include "slang_utility.h"
+
 extern GLint
 _slang_lookup_constant(const char *name);
 
diff --git a/src/mesa/slang/slang_storage.h b/src/mesa/slang/slang_storage.h
index 1876a36dd63..de1f1841a35 100644
--- a/src/mesa/slang/slang_storage.h
+++ b/src/mesa/slang/slang_storage.h
@@ -25,7 +25,12 @@
 #ifndef SLANG_STORAGE_H
 #define SLANG_STORAGE_H
 
-#include "slang_compile.h"
+#include "main/glheader.h"
+#include "slang_compile_function.h"
+#include "slang_compile_struct.h"
+#include "slang_compile_variable.h"
+#include "slang_typeinfo.h"
+#include "slang_utility.h"
 
 
 /*
diff --git a/src/mesa/slang/slang_typeinfo.h b/src/mesa/slang/slang_typeinfo.h
index 2251b063253..5ddfe9612cb 100644
--- a/src/mesa/slang/slang_typeinfo.h
+++ b/src/mesa/slang/slang_typeinfo.h
@@ -25,11 +25,9 @@
 #ifndef SLANG_TYPEINFO_H
 #define SLANG_TYPEINFO_H 1
 
-#include "main/imports.h"
-#include "main/mtypes.h"
+#include "main/glheader.h"
 #include "slang_log.h"
 #include "slang_utility.h"
-#include "slang_vartable.h"
 
 
 struct slang_operation_;
diff --git a/src/mesa/slang/slang_utility.h b/src/mesa/slang/slang_utility.h
index 2c0d0bcbb2a..cb9b6d2aaaa 100644
--- a/src/mesa/slang/slang_utility.h
+++ b/src/mesa/slang/slang_utility.h
@@ -26,6 +26,8 @@
 #define SLANG_UTILITY_H
 
 
+#include "main/glheader.h"
+
 /* Compile-time assertions.  If the expression is zero, try to declare an
  * array of size [-1] to cause compilation error.
  */
diff --git a/src/mesa/slang/slang_vartable.h b/src/mesa/slang/slang_vartable.h
index 94bcd63f45a..97945b89d03 100644
--- a/src/mesa/slang/slang_vartable.h
+++ b/src/mesa/slang/slang_vartable.h
@@ -2,6 +2,9 @@
 #ifndef SLANG_VARTABLE_H
 #define SLANG_VARTABLE_H
 
+#include "main/glheader.h"
+#include "slang_utility.h"
+
 struct slang_ir_storage_;
 
 typedef struct slang_var_table_ slang_var_table;
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index cebaad5f000..05442ef91b5 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -40,7 +40,6 @@
 #include "program/program.h"
 
 #include "pipe/p_context.h"
-#include "pipe/p_shader_tokens.h"
 
 #include "util/u_simple_shaders.h"
 
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 13119ce2037..86bb7889032 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -543,6 +543,7 @@ st_ReadBuffer(GLcontext *ctx, GLenum buffer)
 
 void st_init_fbo_functions(struct dd_function_table *functions)
 {
+#if FEATURE_EXT_framebuffer_object
    functions->NewFramebuffer = st_new_framebuffer;
    functions->NewRenderbuffer = st_new_renderbuffer;
    functions->BindFramebuffer = st_bind_framebuffer;
@@ -550,6 +551,7 @@ void st_init_fbo_functions(struct dd_function_table *functions)
    functions->RenderTexture = st_render_texture;
    functions->FinishRenderTexture = st_finish_render_texture;
    functions->ValidateFramebuffer = st_validate_framebuffer;
+#endif
    /* no longer needed by core Mesa, drivers handle resizes...
    functions->ResizeBuffers = st_resize_buffers;
    */
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 3eb47086887..2ce5f087536 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -63,7 +63,7 @@
 #include "cso_cache/cso_context.h"
 
 
-DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE);
+DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE)
 
 
 /**
diff --git a/src/mesa/state_tracker/st_debug.c b/src/mesa/state_tracker/st_debug.c
index 1260fe04b1b..df32491d044 100644
--- a/src/mesa/state_tracker/st_debug.c
+++ b/src/mesa/state_tracker/st_debug.c
@@ -56,7 +56,7 @@ static const struct debug_named_value st_debug_flags[] = {
    DEBUG_NAMED_VALUE_END
 };
 
-DEBUG_GET_ONCE_FLAGS_OPTION(st_debug, "ST_DEBUG", st_debug_flags, 0);
+DEBUG_GET_ONCE_FLAGS_OPTION(st_debug, "ST_DEBUG", st_debug_flags, 0)
 #endif
 
 
diff --git a/src/mesa/swrast/s_alpha.h b/src/mesa/swrast/s_alpha.h
index 7a5b72e650a..239484a9743 100644
--- a/src/mesa/swrast/s_alpha.h
+++ b/src/mesa/swrast/s_alpha.h
@@ -28,7 +28,8 @@
 #define S_ALPHA_H
 
 
-#include "s_context.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern GLint
diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c
index 336415d57af..1338b6802d4 100644
--- a/src/mesa/swrast/s_atifragshader.c
+++ b/src/mesa/swrast/s_atifragshader.c
@@ -24,6 +24,7 @@
 #include "main/macros.h"
 #include "main/atifragshader.h"
 #include "swrast/s_atifragshader.h"
+#include "swrast/s_context.h"
 
 
 /**
diff --git a/src/mesa/swrast/s_atifragshader.h b/src/mesa/swrast/s_atifragshader.h
index 871a0c04559..cce455a0465 100644
--- a/src/mesa/swrast/s_atifragshader.h
+++ b/src/mesa/swrast/s_atifragshader.h
@@ -27,7 +27,8 @@
 #define S_ATIFRAGSHADER_H
 
 
-#include "s_context.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern void
diff --git a/src/mesa/swrast/s_blend.h b/src/mesa/swrast/s_blend.h
index 8d5a81635d5..9cedde3bf20 100644
--- a/src/mesa/swrast/s_blend.h
+++ b/src/mesa/swrast/s_blend.h
@@ -27,7 +27,8 @@
 #define S_BLEND_H
 
 
-#include "s_context.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern void
diff --git a/src/mesa/swrast/s_context.h b/src/mesa/swrast/s_context.h
index c9755e6da18..6d81f74768f 100644
--- a/src/mesa/swrast/s_context.h
+++ b/src/mesa/swrast/s_context.h
@@ -43,6 +43,7 @@
 #ifndef S_CONTEXT_H
 #define S_CONTEXT_H
 
+#include "main/compiler.h"
 #include "main/mtypes.h"
 #include "program/prog_execute.h"
 #include "swrast.h"
diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index ed637cac124..f952fd6baa7 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -30,7 +30,6 @@
 #include "main/imports.h"
 
 #include "s_depth.h"
-#include "s_context.h"
 #include "s_span.h"
 
 
diff --git a/src/mesa/swrast/s_depth.h b/src/mesa/swrast/s_depth.h
index 7eae3667428..878d242f5e5 100644
--- a/src/mesa/swrast/s_depth.h
+++ b/src/mesa/swrast/s_depth.h
@@ -27,7 +27,8 @@
 #define S_DEPTH_H
 
 
-#include "s_context.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern GLuint
diff --git a/src/mesa/swrast/s_fog.h b/src/mesa/swrast/s_fog.h
index 06107de3f9d..a496746d106 100644
--- a/src/mesa/swrast/s_fog.h
+++ b/src/mesa/swrast/s_fog.h
@@ -28,7 +28,8 @@
 #define S_FOG_H
 
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern GLfloat
diff --git a/src/mesa/swrast/s_fragprog.c b/src/mesa/swrast/s_fragprog.c
index a2c2a10c3de..9facb44d9bf 100644
--- a/src/mesa/swrast/s_fragprog.c
+++ b/src/mesa/swrast/s_fragprog.c
@@ -26,6 +26,7 @@
 #include "main/colormac.h"
 #include "program/prog_instruction.h"
 
+#include "s_context.h"
 #include "s_fragprog.h"
 #include "s_span.h"
 
diff --git a/src/mesa/swrast/s_fragprog.h b/src/mesa/swrast/s_fragprog.h
index e1b7e679185..92b9d01e173 100644
--- a/src/mesa/swrast/s_fragprog.h
+++ b/src/mesa/swrast/s_fragprog.h
@@ -27,7 +27,8 @@
 #define S_FRAGPROG_H
 
 
-#include "s_context.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern void
diff --git a/src/mesa/swrast/s_logic.h b/src/mesa/swrast/s_logic.h
index e8cfae33f23..d609513348d 100644
--- a/src/mesa/swrast/s_logic.h
+++ b/src/mesa/swrast/s_logic.h
@@ -27,7 +27,8 @@
 #define S_LOGIC_H
 
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 extern void
 _swrast_logicop_rgba_span(GLcontext *ctx, struct gl_renderbuffer *rb,
diff --git a/src/mesa/swrast/s_masking.h b/src/mesa/swrast/s_masking.h
index 3ba4f8356cb..cb000da0fd8 100644
--- a/src/mesa/swrast/s_masking.h
+++ b/src/mesa/swrast/s_masking.h
@@ -27,7 +27,8 @@
 #define S_MASKING_H
 
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern void
diff --git a/src/mesa/swrast/s_span.c b/src/mesa/swrast/s_span.c
index 3b3929a41be..8931cdec1bc 100644
--- a/src/mesa/swrast/s_span.c
+++ b/src/mesa/swrast/s_span.c
@@ -970,6 +970,10 @@ shade_texture_span(GLcontext *ctx, SWspan *span)
       if (span->primitive == GL_BITMAP && span->array->ChanType != GL_FLOAT) {
          convert_color_type(span, GL_FLOAT, 0);
       }
+      else {
+         span->array->rgba = (void *) span->array->attribs[FRAG_ATTRIB_COL0];
+      }
+
       if (span->primitive != GL_POINT ||
 	  (span->interpMask & SPAN_RGBA) ||
 	  ctx->Point.PointSprite) {
@@ -1221,9 +1225,22 @@ _swrast_write_rgba_span( GLcontext *ctx, SWspan *span)
             GLchan rgbaSave[MAX_WIDTH][4];
             const GLuint fragOutput = multiFragOutputs ? buf : 0;
 
+            /* set span->array->rgba to colors for render buffer's datatype */
             if (rb->DataType != span->array->ChanType || fragOutput > 0) {
                convert_color_type(span, rb->DataType, fragOutput);
             }
+            else {
+               if (rb->DataType == GL_UNSIGNED_BYTE) {
+                  span->array->rgba = span->array->rgba8;
+               }
+               else if (rb->DataType == GL_UNSIGNED_SHORT) {
+                  span->array->rgba = (void *) span->array->rgba16;
+               }
+               else {
+                  span->array->rgba = (void *)
+                     span->array->attribs[FRAG_ATTRIB_COL0];
+               }
+            }
 
             if (!multiFragOutputs && numBuffers > 1) {
                /* save colors for second, third renderbuffer writes */
diff --git a/src/mesa/swrast/s_stencil.h b/src/mesa/swrast/s_stencil.h
index cd6cbc57b0b..c076ebbe2a1 100644
--- a/src/mesa/swrast/s_stencil.h
+++ b/src/mesa/swrast/s_stencil.h
@@ -27,7 +27,8 @@
 #define S_STENCIL_H
 
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 
diff --git a/src/mesa/swrast/s_texcombine.h b/src/mesa/swrast/s_texcombine.h
index 9ed96efb879..4f5dfbe1afe 100644
--- a/src/mesa/swrast/s_texcombine.h
+++ b/src/mesa/swrast/s_texcombine.h
@@ -27,7 +27,8 @@
 #define S_TEXCOMBINE_H
 
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 extern void
 _swrast_texture_span( GLcontext *ctx, SWspan *span );
diff --git a/src/mesa/swrast/s_texfilter.h b/src/mesa/swrast/s_texfilter.h
index 2e265d685c5..eceab59658e 100644
--- a/src/mesa/swrast/s_texfilter.h
+++ b/src/mesa/swrast/s_texfilter.h
@@ -27,7 +27,8 @@
 #define S_TEXFILTER_H
 
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_context.h"
 
 
 extern texture_sample_func
diff --git a/src/mesa/swrast/s_zoom.h b/src/mesa/swrast/s_zoom.h
index 43917be65fc..09f624efad5 100644
--- a/src/mesa/swrast/s_zoom.h
+++ b/src/mesa/swrast/s_zoom.h
@@ -25,7 +25,8 @@
 #ifndef S_ZOOM_H
 #define S_ZOOM_H
 
-#include "swrast.h"
+#include "main/mtypes.h"
+#include "s_span.h"
 
 
 extern void
diff --git a/src/mesa/swrast_setup/ss_context.h b/src/mesa/swrast_setup/ss_context.h
index 1ec293fade1..56551ab273c 100644
--- a/src/mesa/swrast_setup/ss_context.h
+++ b/src/mesa/swrast_setup/ss_context.h
@@ -28,9 +28,8 @@
 #ifndef SS_CONTEXT_H
 #define SS_CONTEXT_H
 
-#include "main/mtypes.h"
+#include "main/glheader.h"
 #include "swrast/swrast.h"
-#include "swrast_setup.h"
 #include "tnl/t_context.h"
 
 typedef struct {
diff --git a/src/mesa/swrast_setup/ss_triangle.h b/src/mesa/swrast_setup/ss_triangle.h
index 007fa2e9141..ac553cbd018 100644
--- a/src/mesa/swrast_setup/ss_triangle.h
+++ b/src/mesa/swrast_setup/ss_triangle.h
@@ -29,7 +29,7 @@
 #ifndef SS_TRIANGLE_H
 #define SS_TRIANGLE_H
 
-#include "ss_context.h"
+#include "main/mtypes.h"
 
 
 void _swsetup_trifuncs_init( GLcontext *ctx );
diff --git a/src/mesa/swrast_setup/ss_vb.h b/src/mesa/swrast_setup/ss_vb.h
index 2ad1f56f396..944a3b78d8c 100644
--- a/src/mesa/swrast_setup/ss_vb.h
+++ b/src/mesa/swrast_setup/ss_vb.h
@@ -30,7 +30,6 @@
 #define SS_VB_H
 
 #include "main/mtypes.h"
-#include "swrast_setup.h"
 
 void _swsetup_vb_init( GLcontext *ctx );
 void _swsetup_choose_rastersetup_func( GLcontext *ctx );
diff --git a/src/mesa/tnl/t_context.h b/src/mesa/tnl/t_context.h
index ebaae6335b9..258906f7956 100644
--- a/src/mesa/tnl/t_context.h
+++ b/src/mesa/tnl/t_context.h
@@ -53,9 +53,7 @@
 #include "main/bitset.h"
 #include "main/mtypes.h"
 
-#include "math/m_matrix.h"
 #include "math/m_vector.h"
-#include "math/m_xform.h"
 
 #include "vbo/vbo.h"
 
diff --git a/src/mesa/tnl/t_vb_program.c b/src/mesa/tnl/t_vb_program.c
index 3973df9a67a..f3a338ef1ed 100644
--- a/src/mesa/tnl/t_vb_program.c
+++ b/src/mesa/tnl/t_vb_program.c
@@ -35,6 +35,7 @@
 #include "main/colormac.h"
 #include "main/macros.h"
 #include "main/imports.h"
+#include "math/m_xform.h"
 #include "program/prog_instruction.h"
 #include "program/prog_statevars.h"
 #include "program/prog_execute.h"
diff --git a/src/mesa/tnl/t_vb_render.c b/src/mesa/tnl/t_vb_render.c
index c1bebc99423..7d991009a14 100644
--- a/src/mesa/tnl/t_vb_render.c
+++ b/src/mesa/tnl/t_vb_render.c
@@ -44,6 +44,7 @@
 #include "main/macros.h"
 #include "main/imports.h"
 #include "main/mtypes.h"
+#include "math/m_xform.h"
 
 #include "t_pipeline.h"
 
diff --git a/src/mesa/vf/vf.h b/src/mesa/vf/vf.h
index 83d7547619c..5fe392bbe51 100644
--- a/src/mesa/vf/vf.h
+++ b/src/mesa/vf/vf.h
@@ -28,7 +28,7 @@
 #ifndef VF_VERTEX_H
 #define VF_VERTEX_H
 
-#include "main/mtypes.h"
+#include "main/glheader.h"
 #include "math/m_vector.h"
 
 enum {
diff --git a/src/mesa/x86/3dnow.h b/src/mesa/x86/3dnow.h
index df9f2638d76..1c1fedcd4f3 100644
--- a/src/mesa/x86/3dnow.h
+++ b/src/mesa/x86/3dnow.h
@@ -31,8 +31,6 @@
 #ifndef __3DNOW_H__
 #define __3DNOW_H__
 
-#include "math/m_xform.h"
-
 void _mesa_init_3dnow_transform_asm( void );
 
 #endif
diff --git a/src/mesa/x86/mmx.h b/src/mesa/x86/mmx.h
index 5641936bdb0..47a0d4b54dd 100644
--- a/src/mesa/x86/mmx.h
+++ b/src/mesa/x86/mmx.h
@@ -26,6 +26,9 @@
 #ifndef ASM_MMX_H
 #define ASM_MMX_H
 
+#include "main/compiler.h"
+#include "main/mtypes.h"
+
 extern void _ASMAPI
 _mesa_mmx_blend_transparency( GLcontext *ctx, GLuint n, const GLubyte mask[],
                               GLvoid *rgba, const GLvoid *dest,
diff --git a/src/mesa/x86/sse.h b/src/mesa/x86/sse.h
index 521f91e4117..e92ddc13941 100644
--- a/src/mesa/x86/sse.h
+++ b/src/mesa/x86/sse.h
@@ -31,8 +31,6 @@
 #ifndef __SSE_H__
 #define __SSE_H__
 
-#include "math/m_xform.h"
-
 void _mesa_init_sse_transform_asm( void );
 
 #endif