82 files changed, 2534 insertions, 1001 deletions
diff --git a/docs/egl.html b/docs/egl.html
index 57b1d1488a8..30cbe0eaedd 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -126,10 +126,21 @@ test your build.  For example,</p>
 runtime</p>
 
 <ul>
+<li><code>EGL_DRIVERS_PATH</code>
+
+<p>By default, the main library will look for drivers in the directory where
+the drivers are installed.  This variable specifies a list of
+colon-separated directories where the main library will look for drivers, in
+addition to the default directory.  This variable is ignored for setuid/setgid
+binaries.</p>
+
+</li>
+
 <li><code>EGL_DRIVER</code>
 
-<p>This variable forces the specified EGL driver to be loaded.  It comes in
-handy when one wants to test a specific driver.</p>
+<p>This variable specifies a full path to an EGL driver and it forces the
+specified EGL driver to be loaded.  It comes in handy when one wants to test a
+specific driver.  This variable is ignored for setuid/setgid binaries.</p>
 
 </li>
 
diff --git a/docs/envvars.html b/docs/envvars.html
index bb1c914cc73..fd1700a02f1 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -69,6 +69,10 @@ These environment variables are for the Radeon R300 driver:
 <li>R300_NO_TCL - if set, disable hardware-accelerated Transform/Clip/Lighting.
 </ul>
 
+<p>
+Mesa EGL supports different sets of environment variables.  See the
+<a href="egl.html">Mesa EGL</a> page for the details.
+</p>
 
 </BODY>
 </HTML>
diff --git a/progs/es2/xegl/tri.c b/progs/es2/xegl/tri.c
index 7729a099578..8981d8a7e21 100644
--- a/progs/es2/xegl/tri.c
+++ b/progs/es2/xegl/tri.c
@@ -334,14 +334,15 @@ make_x_window(Display *x_dpy, EGLDisplay egl_dpy,
       exit(1);
    }
 
+   /* sanity checks */
    {
       EGLint val;
       eglQuerySurface(egl_dpy, *surfRet, EGL_WIDTH, &val);
       assert(val == width);
       eglQuerySurface(egl_dpy, *surfRet, EGL_HEIGHT, &val);
       assert(val == height);
-      eglQuerySurface(egl_dpy, *surfRet, EGL_SURFACE_TYPE, &val);
-      assert(val == EGL_WINDOW_BIT);
+      assert(eglGetConfigAttrib(egl_dpy, config, EGL_SURFACE_TYPE, &val));
+      assert(val & EGL_WINDOW_BIT);
    }
 
    XFree(visInfo);
diff --git a/progs/fpglsl/.gitignore b/progs/fpglsl/.gitignore
new file mode 100644
index 00000000000..9fe73ab0678
--- /dev/null
+++ b/progs/fpglsl/.gitignore
@@ -0,0 +1 @@
+fp-tri
diff --git a/progs/fpglsl/Makefile b/progs/fpglsl/Makefile
new file mode 100644
index 00000000000..3bf14b4b709
--- /dev/null
+++ b/progs/fpglsl/Makefile
@@ -0,0 +1,52 @@
+# progs/fpglsl/Makefile
+
+
+# These programs aren't intended to be included with the normal distro.
+# They're not too interesting but they're good for testing.
+
+TOP = ../..
+include $(TOP)/configs/current
+
+LIBS = -L$(TOP)/$(LIB_DIR) -l$(GLUT_LIB)  -l$(GLEW_LIB) -l$(GLU_LIB) -l$(GL_LIB) $(APP_LIB_DEPS)
+
+SOURCES = \
+	fp-tri.c
+
+
+
+PROGS = $(SOURCES:%.c=%)
+
+INCLUDES = -I. -I$(TOP)/include -I../samples
+
+
+##### RULES #####
+
+.SUFFIXES:
+.SUFFIXES: .c
+
+.c:
+	$(CC) $(INCLUDES) $(CFLAGS) $< $(LIBS) -o $@
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DEFINES)  $< -o $@
+
+
+##### TARGETS #####
+
+default: $(PROGS)
+
+clean:
+	rm -f $(PROGS)
+	rm -f *.o
+	rm -f getproclist.h
+
+
+
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
diff --git a/progs/fpglsl/SConscript b/progs/fpglsl/SConscript
new file mode 100644
index 00000000000..e31fa320238
--- /dev/null
+++ b/progs/fpglsl/SConscript
@@ -0,0 +1,13 @@
+Import('env')
+
+if not env['GLUT']:
+    Return()
+
+env = env.Clone()
+
+env.Prepend(LIBS = ['$GLUT_LIB'])
+
+env.Program(
+        target = 'fp-tri',
+        source = ['fp-tri.c'],
+    )
diff --git a/progs/fpglsl/fp-tri.c b/progs/fpglsl/fp-tri.c
new file mode 100644
index 00000000000..c9b08fbbad7
--- /dev/null
+++ b/progs/fpglsl/fp-tri.c
@@ -0,0 +1,415 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#ifndef WIN32
+#include <unistd.h>
+#include <signal.h>
+#endif
+
+#include <GL/glew.h>
+#include <GL/glut.h>
+
+#include "readtex.c"
+
+
+#define TEXTURE_FILE "../images/bw.rgb"
+
+unsigned show_fps = 0;
+unsigned int frame_cnt = 0;
+void alarmhandler(int);
+static const char *filename = NULL;
+
+static GLuint fragShader;
+static GLuint vertShader;
+static GLuint program;
+
+
+static void usage(char *name)
+{
+   fprintf(stderr, "usage: %s [ options ] shader_filename\n", name);
+#ifndef WIN32
+   fprintf(stderr, "\n" );
+   fprintf(stderr, "options:\n");
+   fprintf(stderr, "    -fps  show frames per second\n");
+#endif
+}
+
+#ifndef WIN32
+void alarmhandler (int sig)
+{
+   if (sig == SIGALRM) {
+      printf("%d frames in 5.0 seconds = %.3f FPS\n", frame_cnt,
+             frame_cnt / 5.0);
+
+      frame_cnt = 0;
+   }
+   signal(SIGALRM, alarmhandler);
+   alarm(5);
+}
+#endif
+
+
+
+/* Compile text into shader; on failure print the info log and exit(1). */
+static void load_and_compile_shader(GLuint shader, const char *text)
+{
+   GLint stat;
+
+   glShaderSource(shader, 1, (const GLchar **) &text, NULL);
+
+   glCompileShader(shader);
+
+   glGetShaderiv(shader, GL_COMPILE_STATUS, &stat);
+   if (!stat) {
+      GLchar log[1000];
+      GLsizei len;
+      glGetShaderInfoLog(shader, 1000, &len, log);
+      fprintf(stderr, "fp-tri: problem compiling shader:\n%s\n", log);
+      exit(1);
+   }
+}
+
+/* Read up to max-1 bytes of shader source from filename and compile it into shader. */
+static void read_shader(GLuint shader, const char *filename)
+{
+   const int max = 100*1000;
+   int n;
+   char *buffer = (char*) malloc(max);
+   FILE *f = fopen(filename, "r");
+   if (!f) {
+      fprintf(stderr, "fp-tri: Unable to open shader file %s\n", filename);
+      exit(1);
+   }
+   /* leave room for the NUL stored at buffer[n] when the file fills the buffer */
+   n = fread(buffer, 1, max - 1, f);
+   printf("fp-tri: read %d bytes from shader file %s\n", n, filename);
+   if (n > 0) {
+      buffer[n] = 0;
+      load_and_compile_shader(shader, buffer);
+   }
+   fclose(f);
+   free(buffer);
+}
+
+static void check_link(GLuint prog)
+{
+   GLint stat;
+   glGetProgramiv(prog, GL_LINK_STATUS, &stat);
+   if (!stat) {
+      GLchar log[1000];
+      GLsizei len;
+      glGetProgramInfoLog(prog, 1000, &len, log);
+      fprintf(stderr, "Linker error:\n%s\n", log);
+   }
+}
+
+static void setup_uniforms()
+{
+   {
+      GLint loc1f = glGetUniformLocationARB(program, "Offset1f");
+      GLint loc2f = glGetUniformLocationARB(program, "Offset2f");
+      GLint loc4f = glGetUniformLocationARB(program, "Offset4f");
+      GLfloat vecKer[] =
+         { 1.0, 0.0, 0.0,  1.0,
+           0.0, 1.0, 0.0,  1.0,
+           1.0, 0.0, 0.0,  1.0,
+           0.0, 0.0, 0.0,  1.0
+         };
+      if (loc1f >= 0)
+         glUniform1fv(loc1f, 16, vecKer);
+
+      if (loc2f >= 0)
+         glUniform2fv(loc2f, 8, vecKer);
+
+      if (loc4f >= 0)
+         glUniform4fv(loc4f, 4, vecKer);
+
+   }
+
+   {
+      GLint loc1f = glGetUniformLocationARB(program, "KernelValue1f");
+      GLint loc2f = glGetUniformLocationARB(program, "KernelValue2f");
+      GLint loc4f = glGetUniformLocationARB(program, "KernelValue4f");
+      GLfloat vecKer[] =
+         { 1.0, 0.0, 0.0,  0.25,
+           0.0, 1.0, 0.0,  0.25,
+           0.0, 0.0, 1.0,  0.25,
+           0.0, 0.0, 0.0,  0.25,
+           0.5, 0.0, 0.0,  0.35,
+           0.0, 0.5, 0.0,  0.35,
+           0.0, 0.0, 0.5,  0.35,
+           0.0, 0.0, 0.0,  0.35
+         };
+      if (loc1f >= 0)
+         glUniform1fv(loc1f, 16, vecKer);
+
+      if (loc2f >= 0)
+         glUniform2fv(loc2f, 8, vecKer);
+
+      if (loc4f >= 0)
+         glUniform4fv(loc4f, 4, vecKer);
+   }
+}
+
+static void prepare_shaders()
+{
+   static const char *fragShaderText =
+      "void main() {\n"
+      "    gl_FragColor = gl_Color;\n"
+      "}\n";
+   static const char *vertShaderText =
+      "void main() {\n"
+      "   gl_FrontColor = gl_Color;\n"
+      "   gl_Position = gl_ModelViewProjectionMatrix * gl_Vertex;\n"
+      "}\n";
+   fragShader = glCreateShader(GL_FRAGMENT_SHADER);
+   if (filename)
+      read_shader(fragShader, filename);
+   else
+      load_and_compile_shader(fragShader, fragShaderText);
+
+
+   vertShader = glCreateShader(GL_VERTEX_SHADER);
+   load_and_compile_shader(vertShader, vertShaderText);
+
+   program = glCreateProgram();
+   glAttachShader(program, fragShader);
+   glAttachShader(program, vertShader);
+   glLinkProgram(program);
+   check_link(program);
+   glUseProgram(program);
+
+   setup_uniforms();
+}
+
+#define LEVELS 8
+#define SIZE (1<<LEVELS)
+static int TexWidth = SIZE, TexHeight = SIZE;
+
+
+static void
+ResetTextureLevel( int i )
+{
+   GLubyte tex2d[SIZE*SIZE][4];
+      
+   {
+      GLint Width = TexWidth / (1 << i);
+      GLint Height = TexHeight / (1 << i);
+      GLint s, t;
+         
+      for (s = 0; s < Width; s++) {
+         for (t = 0; t < Height; t++) {
+            tex2d[t*Width+s][0] = ((s / 16) % 2) ? 0 : 255;
+            tex2d[t*Width+s][1] = ((t / 16) % 2) ? 0 : 255;
+            tex2d[t*Width+s][2] = 128;
+            tex2d[t*Width+s][3] = 255;
+         }
+      }
+         
+      glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+         
+      glTexImage2D(GL_TEXTURE_2D, i, GL_RGB, Width, Height, 0,
+                   GL_RGBA, GL_UNSIGNED_BYTE, tex2d);
+   }
+}
+
+
+static void
+ResetTexture( void )
+{
+   int i;
+      
+   for (i = 0; i <= LEVELS; i++)
+   {
+      ResetTextureLevel(i);
+   }
+}
+
+static void Init( void )
+{
+   GLuint Texture;
+
+   /* Load texture */
+   glGenTextures(1, &Texture);
+   glBindTexture(GL_TEXTURE_2D, Texture);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+   glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+   if (!LoadRGBMipmaps(TEXTURE_FILE, GL_RGB)) {
+      printf("Error: couldn't load texture image file %s\n", TEXTURE_FILE);
+      exit(1);
+   }
+
+
+   glGenTextures(1, &Texture);
+   glActiveTextureARB(GL_TEXTURE0_ARB + 1);
+   glBindTexture(GL_TEXTURE_2D, Texture);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+   glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+   {
+      GLubyte data[32][32];
+      int width = 32;
+      int height = 32;
+      int i;
+      int j;
+
+      for (i = 0; i < 32; i++)
+         for (j = 0; j < 32; j++)
+	 {
+	    /**
+	     ** +-----------+
+	     ** |     W     |
+	     ** |  +-----+  |
+	     ** |  |     |  |
+	     ** |  |  B  |  |
+	     ** |  |     |  |
+	     ** |  +-----+  |
+	     ** |           |
+	     ** +-----------+
+	     **/
+	    int i2 = i - height / 2;
+	    int j2 = j - width / 2;
+	    int h8 = height / 8;
+	    int w8 = width / 8;
+	    if ( -h8 <= i2 && i2 <= h8 && -w8 <= j2 && j2 <= w8 ) {
+	       data[i][j] = 0x00;
+	    } else if ( -2 * h8 <= i2 && i2 <= 2 * h8 && -2 * w8 <= j2 && j2 <= 2 * w8 ) {
+	       data[i][j] = 0x55;
+	    } else if ( -3 * h8 <= i2 && i2 <= 3 * h8 && -3 * w8 <= j2 && j2 <= 3 * w8 ) {
+	       data[i][j] = 0xaa;
+	    } else {
+	       data[i][j] = 0xff;
+	    }
+	 }
+
+      glTexImage2D( GL_TEXTURE_2D, 0,
+                    GL_ALPHA8,
+                    32, 32, 0,
+                    GL_ALPHA, GL_UNSIGNED_BYTE, data );
+   }
+
+   glGenTextures(1, &Texture);
+   glActiveTextureARB(GL_TEXTURE0_ARB + 2);
+   glBindTexture(GL_TEXTURE_2D, Texture);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST_MIPMAP_NEAREST);
+   glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+   glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+   ResetTexture();
+
+   glClearColor(.1, .3, .5, 0);
+}
+
+
+
+
+static void args(int argc, char *argv[])
+{
+   GLint i;
+
+   for (i = 1; i < argc; i++) {
+      if (strcmp(argv[i], "-fps") == 0) {
+         show_fps = 1;
+      }
+      else if (i == argc - 1) {
+	 filename = argv[i];
+      }
+      else {
+	 usage(argv[0]);
+	 exit(1);
+      }
+   }
+}
+
+
+
+
+
+static void Reshape(int width, int height)
+{
+
+    glViewport(0, 0, (GLint)width, (GLint)height);
+
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    glOrtho(-1.0, 1.0, -1.0, 1.0, -0.5, 1000.0);
+    glMatrixMode(GL_MODELVIEW);
+}
+
+static void CleanUp(void)
+{
+   glDeleteShader(fragShader);
+   glDeleteShader(vertShader);
+   glDeleteProgram(program);
+}
+/* GLUT keyboard callback: ESC cleans up and exits; any other key requests a redraw. */
+static void Key(unsigned char key, int x, int y)
+{
+   (void) x; (void) y;   /* cursor position is not used */
+   switch (key) {
+   case 27:              /* ESC: release GL objects and quit normally */
+      CleanUp();
+      exit(0);
+   default:
+      break;
+   }
+
+   glutPostRedisplay();
+}
+
+static void Display(void)
+{
+   glClear(GL_COLOR_BUFFER_BIT);
+
+   glUseProgram(program);
+   glProgramLocalParameter4fARB(GL_FRAGMENT_PROGRAM_ARB, 0, 1.0, 1.0, 0.0, 0.0);
+   glProgramLocalParameter4fARB(GL_FRAGMENT_PROGRAM_ARB, 1, 0.0, 0.0, 1.0, 1.0);
+   glBegin(GL_TRIANGLES);
+
+   glColor3f(0,0,1);
+   glTexCoord3f(1,1,0);
+   glVertex3f( 0.9, -0.9, -30.0);
+
+   glColor3f(1,0,0);
+   glTexCoord3f(1,-1,0);
+   glVertex3f( 0.9,  0.9, -30.0);
+
+   glColor3f(0,1,0);
+   glTexCoord3f(-1,0,0);
+   glVertex3f(-0.9,  0.0, -30.0);
+   glEnd();
+
+   glFlush();
+   if (show_fps) {
+      ++frame_cnt;
+      glutPostRedisplay();
+   }
+}
+
+
+int main(int argc, char **argv)
+{
+   glutInit(&argc, argv);
+   glutInitWindowPosition(0, 0);
+   glutInitWindowSize(250, 250);
+   glutInitDisplayMode(GLUT_RGB | GLUT_SINGLE | GLUT_DEPTH);
+   args(argc, argv);
+   glutCreateWindow(filename ? filename : "fp-tri");
+   glewInit();
+   glutReshapeFunc(Reshape);
+   glutKeyboardFunc(Key);
+   glutDisplayFunc(Display);
+   prepare_shaders();
+   Init();
+#ifndef WIN32
+   if (show_fps) {
+      signal(SIGALRM, alarmhandler);
+      alarm(5);
+   }
+#endif
+   glutMainLoop();
+   return 0;
+}
diff --git a/progs/fpglsl/mov-imm.glsl b/progs/fpglsl/mov-imm.glsl
new file mode 100644
index 00000000000..cbb75ce342c
--- /dev/null
+++ b/progs/fpglsl/mov-imm.glsl
@@ -0,0 +1,3 @@
+void main() {
+    gl_FragColor = vec4(1,0,1,1);
+}
diff --git a/progs/fpglsl/mov.glsl b/progs/fpglsl/mov.glsl
new file mode 100644
index 00000000000..4a1f185ba44
--- /dev/null
+++ b/progs/fpglsl/mov.glsl
@@ -0,0 +1,3 @@
+void main() {
+    gl_FragColor = gl_Color;
+}
diff --git a/progs/fpglsl/tex-multi.glsl b/progs/fpglsl/tex-multi.glsl
new file mode 100644
index 00000000000..5220b7efaf2
--- /dev/null
+++ b/progs/fpglsl/tex-multi.glsl
@@ -0,0 +1,15 @@
+// Multi-texture fragment shader
+// Brian Paul
+
+// Composite second texture over first.
+// We're assuming the 2nd texture has a meaningful alpha channel.
+
+uniform sampler2D tex1;
+uniform sampler2D tex2;
+
+void main()
+{
+   vec4 t1 = texture2D(tex1, gl_Color.xy);
+   vec4 t2 = texture2D(tex2, gl_Color.yz);
+   gl_FragColor = mix(t1, t2, t2.w);
+}
diff --git a/progs/fpglsl/tex.glsl b/progs/fpglsl/tex.glsl
new file mode 100644
index 00000000000..4302fabe2d5
--- /dev/null
+++ b/progs/fpglsl/tex.glsl
@@ -0,0 +1,6 @@
+uniform sampler2D tex1;
+
+void main()
+{
+   gl_FragColor = texture2D(tex1, gl_Color.xy);
+}
diff --git a/progs/tests/Makefile b/progs/tests/Makefile
index 836396b2499..a38f411def8 100644
--- a/progs/tests/Makefile
+++ b/progs/tests/Makefile
@@ -48,6 +48,7 @@ SOURCES = \
 	floattex.c \
 	fbotest1.c \
 	fbotest2.c \
+	fbotest3.c \
 	fillrate.c \
 	fog.c \
 	fogcoord.c \
diff --git a/progs/tests/SConscript b/progs/tests/SConscript
index e2c65382887..0a11b965f79 100644
--- a/progs/tests/SConscript
+++ b/progs/tests/SConscript
@@ -51,6 +51,7 @@ progs = [
     'ext422square',
     'fbotest1',
     'fbotest2',
+    'fbotest3',
     'fillrate',
     'floattex',
     'fog',
diff --git a/progs/tests/fbotest3.c b/progs/tests/fbotest3.c
new file mode 100644
index 00000000000..8e288b38b83
--- /dev/null
+++ b/progs/tests/fbotest3.c
@@ -0,0 +1,231 @@
+/*
+ * Test GL_EXT_framebuffer_object
+ * Like fbotest2.c but use a texture for the Z buffer / renderbuffer.
+ * Note: the Z texture is never resized so that limits what can be
+ * rendered if the window is resized.
+ *
+ * This tests a bug reported by Christoph Bumiller on 1 Feb 2010
+ * on mesa3d-dev.
+ *
+ * XXX this should be made into a piglit test.
+ *
+ * Brian Paul
+ * 1 Feb 2010
+ */
+
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <GL/glew.h>
+#include <GL/glut.h>
+
+
+static int Win = 0;
+static int Width = 400, Height = 400;
+static GLuint Tex = 0;
+static GLuint MyFB, ColorRb, DepthRb;
+static GLboolean Animate = GL_FALSE;
+static GLfloat Rotation = 0.0;
+
+
+static void
+CheckError(int line)
+{
+   GLenum err = glGetError();
+   if (err) {
+      printf("fbotest3: GL Error 0x%x at line %d\n", (int) err, line);
+   }
+}
+
+/* GLUT display callback: draw the teapot into MyFB, read it back, copy it to the window. */
+static void
+Display( void )
+{
+   GLubyte *buffer = malloc(Width * Height * 4);
+   GLenum status;
+   if (!buffer) {   /* glReadPixels below must not write through NULL */
+      printf("fbotest3: Error: out of memory\n");
+      return;
+   }
+   CheckError(__LINE__);
+
+   /* draw to user framebuffer */
+   glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, MyFB);
+   glDrawBuffer(GL_COLOR_ATTACHMENT1_EXT);
+   glReadBuffer(GL_COLOR_ATTACHMENT1_EXT);
+
+   status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
+   if (status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+      printf("fbotest3: Error: Framebuffer is incomplete!!!\n");
+   }
+   CheckError(__LINE__);
+   glClearColor(0.5, 0.5, 1.0, 0.0);
+   glClear( GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT );
+
+   glEnable(GL_DEPTH_TEST);
+   glEnable(GL_LIGHTING);
+   glEnable(GL_LIGHT0);
+
+   glPushMatrix();
+   glRotatef(30.0, 1, 0, 0);
+   glRotatef(Rotation, 0, 1, 0);
+   glutSolidTeapot(2.0);
+   glPopMatrix();
+
+   /* read from user framebuffer */
+   glReadPixels(0, 0, Width, Height, GL_RGBA, GL_UNSIGNED_BYTE, buffer);
+
+   /* draw to window */
+   glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
+   glDisable(GL_DEPTH_TEST);  /* in case window has depth buffer */
+   glWindowPos2iARB(0, 0);
+   glDrawPixels(Width, Height, GL_RGBA, GL_UNSIGNED_BYTE, buffer);
+   free(buffer);
+   glutSwapBuffers();
+   CheckError(__LINE__);
+}
+
+
+static void
+Reshape( int width, int height )
+{
+   float ar = (float) width / (float) height;
+
+   glViewport( 0, 0, width, height );
+   glMatrixMode( GL_PROJECTION );
+   glLoadIdentity();
+   glFrustum( -ar, ar, -1.0, 1.0, 5.0, 25.0 );
+
+   glMatrixMode( GL_MODELVIEW );
+   glLoadIdentity();
+   glTranslatef( 0.0, 0.0, -15.0 );
+
+   glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, ColorRb);
+   glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGB, width, height);
+
+   Width = width;
+   Height = height;
+}
+
+
+static void
+CleanUp(void)
+{
+   glDeleteFramebuffersEXT(1, &MyFB);
+   glDeleteRenderbuffersEXT(1, &ColorRb);
+   glDeleteRenderbuffersEXT(1, &DepthRb);
+   glDeleteTextures(1, &Tex);
+   assert(!glIsFramebufferEXT(MyFB));
+   assert(!glIsRenderbufferEXT(ColorRb));
+   assert(!glIsRenderbufferEXT(DepthRb));
+   glutDestroyWindow(Win);
+   exit(0);
+}
+
+
+static void
+Idle(void)
+{
+   Rotation = glutGet(GLUT_ELAPSED_TIME) * 0.1;
+   glutPostRedisplay();
+}
+
+
+static void
+Key( unsigned char key, int x, int y )
+{
+   (void) x;
+   (void) y;
+   switch (key) {
+   case 'a':
+      Animate = !Animate;
+      if (Animate)
+         glutIdleFunc(Idle);
+      else
+         glutIdleFunc(NULL);
+      break;
+   case 27:
+      CleanUp();
+      break;
+   }
+   glutPostRedisplay();
+}
+
+
+static void
+Init( void )
+{
+   if (!glutExtensionSupported("GL_EXT_framebuffer_object")) {
+      printf("fbotest3: GL_EXT_framebuffer_object not found!\n");
+      exit(0);
+   }
+   printf("fbotest3: GL_RENDERER = %s\n", (char *) glGetString(GL_RENDERER));
+
+   /* create initial tex obj as an RGBA texture */
+   glGenTextures(1, &Tex);
+   glBindTexture(GL_TEXTURE_2D, Tex);
+   glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 256, 256, 0,
+                GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+   glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+   glTexParameterf(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+   glEnable(GL_TEXTURE_2D);
+
+   /* draw something to make sure the texture is used */
+   glBegin(GL_POINTS);
+   glVertex2f(0, 0);
+   glEnd();
+
+   /* done w/ texturing */
+   glDisable(GL_TEXTURE_2D);
+
+   /* Create my Framebuffer Object */
+   glGenFramebuffersEXT(1, &MyFB);
+   glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, MyFB);
+   assert(glIsFramebufferEXT(MyFB));
+
+   /* Setup color renderbuffer */
+   glGenRenderbuffersEXT(1, &ColorRb);
+   glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, ColorRb);
+   assert(glIsRenderbufferEXT(ColorRb));
+   glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT1_EXT,
+                                GL_RENDERBUFFER_EXT, ColorRb);
+   glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGB, Width, Height);
+
+   /* Setup depth renderbuffer (a texture) */
+   glGenRenderbuffersEXT(1, &DepthRb);
+   glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, DepthRb);
+   assert(glIsRenderbufferEXT(DepthRb));
+   /* replace RGBA texture with Z texture */
+   glTexImage2D(GL_TEXTURE_2D, 0, GL_DEPTH_COMPONENT, Width, Height, 0,
+                GL_DEPTH_COMPONENT, GL_UNSIGNED_INT, NULL);
+   glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_DEPTH_ATTACHMENT_EXT,
+                             GL_TEXTURE_2D, Tex, 0);
+
+   CheckError(__LINE__);
+
+   /* restore to default */
+   glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, 0);
+   CheckError(__LINE__);
+}
+
+
+int
+main( int argc, char *argv[] )
+{
+   glutInit( &argc, argv );
+   glutInitWindowPosition( 0, 0 );
+   glutInitWindowSize(Width, Height);
+   glutInitDisplayMode( GLUT_RGB | GLUT_DOUBLE );
+   Win = glutCreateWindow(argv[0]);
+   glewInit();
+   glutReshapeFunc( Reshape );
+   glutKeyboardFunc( Key );
+   glutDisplayFunc( Display );
+   if (Animate)
+      glutIdleFunc(Idle);
+   Init();
+   glutMainLoop();
+   return 0;
+}
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 1dadbf783b6..a8a8e302e48 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -25,6 +25,7 @@
 #include <dlfcn.h>
 #include <sys/types.h>
 #include <dirent.h>
+#include <unistd.h>
 #endif
 
 
@@ -55,21 +56,7 @@ close_library(HMODULE lib)
 static const char *
 library_suffix(void)
 {
-   return "dll";
-}
-
-
-static EGLBoolean
-make_library_path(char *buf, unsigned int size, const char *name)
-{
-   EGLBoolean need_suffix;
-   const char *suffix = ".dll";
-   int ret;
-
-   need_suffix = (strchr(name, '.') == NULL);
-   ret = snprintf(buf, size, "%s%s", name, (need_suffix) ? suffix : "");
-
-   return ((unsigned int) ret < size);
+   return ".dll";
 }
 
 
@@ -96,30 +83,13 @@ close_library(void *lib)
 static const char *
 library_suffix(void)
 {
-   return "so";
-}
-
-
-static EGLBoolean
-make_library_path(char *buf, unsigned int size, const char *name)
-{
-   EGLBoolean need_dir, need_suffix;
-   const char *suffix = ".so";
-   int ret;
-
-   need_dir = (strchr(name, '/') == NULL);
-   need_suffix = (strchr(name, '.') == NULL);
-
-   ret = snprintf(buf, size, "%s%s%s",
-         (need_dir) ? _EGL_DRIVER_SEARCH_DIR"/" : "", name,
-         (need_suffix) ? suffix : "");
-
-   return ((unsigned int) ret < size);
+   return ".so";
 }
 
 
 #else /* _EGL_PLATFORM_NO_OS */
 
+
 static const char DefaultDriverName[] = "builtin";
 
 typedef void *lib_handle;
@@ -143,14 +113,6 @@ library_suffix(void)
 }
 
 
-static EGLBoolean
-make_library_path(char *buf, unsigned int size, const char *name)
-{
-   int ret = snprintf(buf, size, name);
-   return ((unsigned int) ret < size);
-}
-
-
 #endif
 
 
@@ -299,122 +261,260 @@ _eglMatchDriver(_EGLDisplay *dpy)
 
 
 /**
- * Preload a user driver.
- *
- * A user driver can be specified by EGL_DRIVER.
+ * A loader function for use with _eglPreloadForEach.  The loader data is the
+ * filename of the driver.   This function stops on the first valid driver.
  */
 static EGLBoolean
-_eglPreloadUserDriver(void)
+_eglLoaderFile(const char *dir, size_t len, void *loader_data)
 {
-#if defined(_EGL_PLATFORM_POSIX) || defined(_EGL_PLATFORM_WINDOWS)
    _EGLDriver *drv;
    char path[1024];
-   char *env;
-
-   env = getenv("EGL_DRIVER");
-   if (!env)
-      return EGL_FALSE;
+   const char *filename = (const char *) loader_data;
+   size_t flen = strlen(filename);
 
-   if (!make_library_path(path, sizeof(path), env))
-      return EGL_FALSE;
+   /* build "dir/filename"; move on to the next directory if it cannot fit */
+   if (len + flen + 2 > sizeof(path))
+      return EGL_TRUE;
+   if (len) {
+      memcpy(path, dir, len);
+      path[len++] = '/';
+   }
+   memcpy(path + len, filename, flen);
+   len += flen;
+   path[len] = '\0';
 
    drv = _eglLoadDriver(path, NULL);
-   if (!drv) {
-      _eglLog(_EGL_WARNING, "EGL_DRIVER is set to an invalid driver");
-      return EGL_FALSE;
+   /* retry with the library suffix appended when the name lacks it */
+   if (!drv && library_suffix()) {
+      const char *suffix = library_suffix();
+      size_t slen = strlen(suffix);
+      const char *p;
+      EGLBoolean need_suffix;
+      /* a name shorter than the suffix cannot end with it; never point before filename */
+      p = (flen >= slen) ? filename + flen - slen : NULL;
+      need_suffix = (!p || strcmp(p, suffix) != 0);
+      if (need_suffix && len + slen + 1 <= sizeof(path)) {
+         strcpy(path + len, suffix);
+         drv = _eglLoadDriver(path, NULL);
+      }
    }
+   if (!drv)
+      return EGL_TRUE;
 
+   /* remember the driver; EGL_FALSE tells the caller to stop searching */
    _eglGlobal.Drivers[_eglGlobal.NumDrivers++] = drv;
-
-   return EGL_TRUE;
-#else /* _EGL_PLATFORM_POSIX || _EGL_PLATFORM_WINDOWS */
    return EGL_FALSE;
-#endif
 }
 
 
 /**
- * Preload display drivers.
- *
- * Display drivers are a set of drivers that support a certain display system.
- * The display system may be specified by EGL_DISPLAY.
- *
- * FIXME This makes libEGL a memory hog if an user driver is not specified and
- * there are many display drivers.
+ * A loader function for use with _eglPreloadForEach.  The loader data is the
+ * pattern (prefix) of the files to look for.
  */
 static EGLBoolean
-_eglPreloadDisplayDrivers(void)
+_eglLoaderPattern(const char *dir, size_t len, void *loader_data)
 {
 #if defined(_EGL_PLATFORM_POSIX)
-   const char *dpy, *suffix;
-   char path[1024], prefix[32];
+   const char *prefix, *suffix;
+   size_t prefix_len, suffix_len;
    DIR *dirp;
    struct dirent *dirent;
+   char path[1024];
 
-   dpy = getenv("EGL_DISPLAY");
-   if (!dpy || !dpy[0])
-      dpy = _EGL_DEFAULT_DISPLAY;
-   if (!dpy || !dpy[0])
-      return EGL_FALSE;
-
-   snprintf(prefix, sizeof(prefix), "egl_%s_", dpy);
-   suffix = library_suffix();
+   if (len + 2 > sizeof(path))
+      return EGL_TRUE;
+   if (len) {
+      memcpy(path, dir, len);
+      path[len++] = '/';
+   }
+   path[len] = '\0';
 
-   dirp = opendir(_EGL_DRIVER_SEARCH_DIR);
+   dirp = opendir(path);
    if (!dirp)
-      return EGL_FALSE;
+      return EGL_TRUE;
+
+   prefix = (const char *) loader_data;
+   prefix_len = strlen(prefix);
+   suffix = library_suffix();
+   suffix_len = (suffix) ? strlen(suffix) : 0;
 
    while ((dirent = readdir(dirp))) {
       _EGLDriver *drv;
+      size_t dirent_len = strlen(dirent->d_name);
       const char *p;
 
       /* match the prefix */
-      if (strncmp(dirent->d_name, prefix, strlen(prefix)) != 0)
+      if (strncmp(dirent->d_name, prefix, prefix_len) != 0)
          continue;
-
       /* match the suffix */
-      p = strrchr(dirent->d_name, '.');
-      if ((p && !suffix) || (!p && suffix))
-         continue;
-      else if (p && suffix && strcmp(p + 1, suffix) != 0)
-         continue;
-
-      snprintf(path, sizeof(path),
-            _EGL_DRIVER_SEARCH_DIR"/%s", dirent->d_name);
+      if (suffix) {
+         p = dirent->d_name + dirent_len - suffix_len;
+         if (p < dirent->d_name || strcmp(p, suffix) != 0)
+            continue;
+      }
 
-      drv = _eglLoadDriver(path, NULL);
-      if (drv)
-         _eglGlobal.Drivers[_eglGlobal.NumDrivers++] = drv;
+      /* make a full path and load the driver */
+      if (len + dirent_len + 1 <= sizeof(path)) {
+         strcpy(path + len, dirent->d_name);
+         drv = _eglLoadDriver(path, NULL);
+         if (drv)
+            _eglGlobal.Drivers[_eglGlobal.NumDrivers++] = drv;
+      }
    }
 
    closedir(dirp);
 
-   return (_eglGlobal.NumDrivers > 0);
+   return EGL_TRUE;
 #else /* _EGL_PLATFORM_POSIX */
+   /* stop immediately */
    return EGL_FALSE;
 #endif
 }
 
 
 /**
- * Preload the default driver.
+ * Run the preload function on each driver directory and return the number of
+ * drivers loaded.
+ *
+ * The process may end prematurely if the callback function returns false.
+ */
+static EGLint
+_eglPreloadForEach(const char *search_path,
+                   EGLBoolean (*loader)(const char *, size_t, void *),
+                   void *loader_data)
+{
+   const char *cur, *next;
+   size_t len;
+   EGLint num_drivers = _eglGlobal.NumDrivers;
+
+   cur = search_path;
+   while (cur) {
+      /* each directory runs up to the next ':' or the end of the string */
+      next = strchr(cur, ':');
+      len = (next) ? next - cur : strlen(cur);
+      /* the callback must receive its data, not the callback pointer itself */
+      if (!loader(cur, len, loader_data))
+         break;
+      cur = (next) ? next + 1 : NULL;
+   }
+
+   return (_eglGlobal.NumDrivers - num_drivers);
+}
+
+
+/**
+ * Return a list of colon-separated driver directories.
+ */
+static const char *
+_eglGetSearchPath(void)
+{
+   static const char *search_path;
+
+#if defined(_EGL_PLATFORM_POSIX) || defined(_EGL_PLATFORM_WINDOWS)
+   if (!search_path) {
+      static char buffer[1024];
+      const char *p;
+      int ret;
+      /* built only while search_path is still unset; later calls reuse it */
+      p = getenv("EGL_DRIVERS_PATH");
+#if defined(_EGL_PLATFORM_POSIX)
+      if (p && (geteuid() != getuid() || getegid() != getgid())) {
+         _eglLog(_EGL_DEBUG,
+               "ignore EGL_DRIVERS_PATH for setuid/setgid binaries");
+         p = NULL;
+      }
+#endif /* _EGL_PLATFORM_POSIX */
+      /* user directories are listed ahead of the compiled-in default directory */
+      if (p) {
+         ret = snprintf(buffer, sizeof(buffer),
+               "%s:%s", p, _EGL_DRIVER_SEARCH_DIR);
+         if (ret > 0 && ret < sizeof(buffer))
+            search_path = buffer;
+      }
+   }
+   if (!search_path)   /* EGL_DRIVERS_PATH unset, ignored, or did not fit in buffer */
+      search_path = _EGL_DRIVER_SEARCH_DIR;
+#else
+   search_path = "";
+#endif
+
+   return search_path;
+}
+
+
+/**
+ * Preload a user driver.
+ *
+ * A user driver can be specified by EGL_DRIVER.
  */
 static EGLBoolean
-_eglPreloadDefaultDriver(void)
+_eglPreloadUserDriver(void)
 {
-   _EGLDriver *drv;
-   char path[1024];
+   const char *search_path = _eglGetSearchPath();
+   char *env;
+
+   env = getenv("EGL_DRIVER");
+#if defined(_EGL_PLATFORM_POSIX)
+   if (env && strchr(env, '/')) {
+      search_path = "";
+      if ((geteuid() != getuid() || getegid() != getgid())) {
+         _eglLog(_EGL_DEBUG,
+               "ignore EGL_DRIVER for setuid/setgid binaries");
+         env = NULL;
+      }
+   }
+#endif /* _EGL_PLATFORM_POSIX */
+   if (!env)
+      return EGL_FALSE;
 
-   if (!make_library_path(path, sizeof(path), DefaultDriverName))
+   if (!_eglPreloadForEach(search_path, _eglLoaderFile, (void *) env)) {
+      _eglLog(_EGL_WARNING, "EGL_DRIVER is set to an invalid driver");
       return EGL_FALSE;
+   }
 
-   drv = _eglLoadDriver(path, NULL);
-   if (!drv)
+   return EGL_TRUE;
+}
+
+
+/**
+ * Preload display drivers.
+ *
+ * Display drivers are a set of drivers that support a certain display system.
+ * The display system may be specified by EGL_DISPLAY.
+ *
+ * FIXME This makes libEGL a memory hog if a user driver is not specified and
+ * there are many display drivers.
+ */
+static EGLBoolean
+_eglPreloadDisplayDrivers(void)
+{
+   const char *dpy;
+   char prefix[32];
+   int ret;
+
+   dpy = getenv("EGL_DISPLAY");
+   if (!dpy || !dpy[0])
+      dpy = _EGL_DEFAULT_DISPLAY;
+   if (!dpy || !dpy[0])
       return EGL_FALSE;
 
-   _eglGlobal.Drivers[_eglGlobal.NumDrivers++] = drv;
+   ret = snprintf(prefix, sizeof(prefix), "egl_%s_", dpy);
+   if (ret < 0 || ret >= sizeof(prefix))
+      return EGL_FALSE;
 
-   return EGL_TRUE;
+   return (_eglPreloadForEach(_eglGetSearchPath(),
+            _eglLoaderPattern, (void *) prefix) > 0);
+}
+
+
+/**
+ * Preload the default driver.
+ */
+static EGLBoolean
+_eglPreloadDefaultDriver(void)
+{
+   return (_eglPreloadForEach(_eglGetSearchPath(),
+            _eglLoaderFile, (void *) DefaultDriverName) > 0);
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index d3084fd4283..d5ddc4a6a92 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -352,7 +352,10 @@ draw_find_shader_output(const struct draw_context *draw,
 
 
 /**
- * Return number of the shader outputs.
+ * Return the total number of shader outputs.  This function is similar to
+ * draw_current_shader_outputs() but this function also counts any extra
+ * vertex/geometry output attributes that may be filled in by some draw
+ * stages (such as AA point, AA line).
  *
  * If geometry shader is present, its output will be returned,
  * if not vertex shader is used.
@@ -362,8 +365,9 @@ draw_num_shader_outputs(const struct draw_context *draw)
 {
    uint count = draw->vs.vertex_shader->info.num_outputs;
 
-   /* if geometry shader is present, its outputs go to te
-    * driver, not the vertex shaders */
+   /* If a geometry shader is present, its outputs go to the
+    * driver, else the vertex shader's outputs.
+    */
    if (draw->gs.geometry_shader)
       count = draw->gs.geometry_shader->info.num_outputs;
 
@@ -374,7 +378,8 @@ draw_num_shader_outputs(const struct draw_context *draw)
 
 
 /**
- * Provide TGSI sampler objects for vertex/geometry shaders that use texture fetches.
+ * Provide TGSI sampler objects for vertex/geometry shaders that use
+ * texture fetches.
  * This might only be used by software drivers for the time being.
  */
 void
@@ -454,14 +459,27 @@ void draw_do_flush( struct draw_context *draw, unsigned flags )
 }
 
 
-int draw_current_shader_outputs(struct draw_context *draw)
+/**
+ * Return the number of output attributes produced by the geometry
+ * shader, if present.  If no geometry shader, return the number of
+ * outputs from the vertex shader.
+ * \sa draw_num_shader_outputs
+ */
+uint
+draw_current_shader_outputs(const struct draw_context *draw)
 {
    if (draw->gs.geometry_shader)
       return draw->gs.num_gs_outputs;
    return draw->vs.num_vs_outputs;
 }
 
-int draw_current_shader_position_output(struct draw_context *draw)
+
+/**
+ * Return the index of the shader output which will contain the
+ * vertex position.
+ */
+uint
+draw_current_shader_position_output(const struct draw_context *draw)
 {
    if (draw->gs.geometry_shader)
       return draw->gs.position_output;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index e58129b99d8..8f6ca15dfa2 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -49,6 +49,10 @@
 #include "draw_pipe.h"
 
 
+/** Approx number of new tokens for instructions in aa_transform_inst() */
+#define NUM_NEW_TOKENS 50
+
+
 /**
  * Max texture level for the alpha texture used for antialiasing
  */
@@ -179,12 +183,7 @@ aa_transform_decl(struct tgsi_transform_context *ctx,
 static int
 free_bit(uint bitfield)
 {
-   int i;
-   for (i = 0; i < 32; i++) {
-      if ((bitfield & (1 << i)) == 0)
-         return i;
-   }
-   return -1;
+   return ffs(~bitfield) - 1;
 }
 
 
@@ -343,11 +342,10 @@ generate_aaline_fs(struct aaline_stage *aaline)
    const struct pipe_shader_state *orig_fs = &aaline->fs->state;
    struct pipe_shader_state aaline_fs;
    struct aa_transform_context transform;
-
-#define MAX 1000
+   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
 
    aaline_fs = *orig_fs; /* copy to init */
-   aaline_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   aaline_fs.tokens = tgsi_alloc_tokens(newLen);
    if (aaline_fs.tokens == NULL)
       return FALSE;
 
@@ -363,7 +361,7 @@ generate_aaline_fs(struct aaline_stage *aaline)
 
    tgsi_transform_shader(orig_fs->tokens,
                          (struct tgsi_token *) aaline_fs.tokens,
-                         MAX, &transform.base);
+                         newLen, &transform.base);
 
 #if 0 /* DEBUG */
    tgsi_dump(orig_fs->tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index d86717e5182..97f34808793 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -53,6 +53,10 @@
 #include "draw_pipe.h"
 
 
+/** Approx number of new tokens for instructions in aa_transform_inst() */
+#define NUM_NEW_TOKENS 200
+
+
 /*
  * Enabling NORMALIZE might give _slightly_ better results.
  * Basically, it controls whether we compute distance as d=sqrt(x*x+y*y) or
@@ -81,16 +85,19 @@ struct aapoint_stage
 {
    struct draw_stage stage;
 
-   int psize_slot;
+   /** half of pipe_rasterizer_state::point_size */
    float radius;
 
+   /** vertex attrib slot containing point size */
+   int psize_slot;
+
    /** this is the vertex attrib slot for the new texcoords */
    uint tex_slot;
+
+   /** vertex attrib slot containing position */
    uint pos_slot;
 
-   /*
-    * Currently bound state
-    */
+   /** Currently bound fragment shader */
    struct aapoint_fragment_shader *fs;
 
    /*
@@ -491,11 +498,10 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
    const struct pipe_shader_state *orig_fs = &aapoint->fs->state;
    struct pipe_shader_state aapoint_fs;
    struct aa_transform_context transform;
-
-#define MAX 1000
+   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
 
    aapoint_fs = *orig_fs; /* copy to init */
-   aapoint_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   aapoint_fs.tokens = tgsi_alloc_tokens(newLen);
    if (aapoint_fs.tokens == NULL)
       return FALSE;
 
@@ -511,7 +517,7 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
 
    tgsi_transform_shader(orig_fs->tokens,
                          (struct tgsi_token *) aapoint_fs.tokens,
-                         MAX, &transform.base);
+                         newLen, &transform.base);
 
 #if 0 /* DEBUG */
    printf("draw_aapoint, orig shader:\n");
@@ -575,8 +581,8 @@ aapoint_point(struct draw_stage *stage, struct prim_header *header)
    const struct aapoint_stage *aapoint = aapoint_stage(stage);
    struct prim_header tri;
    struct vertex_header *v[4];
-   uint texPos = aapoint->tex_slot;
-   uint pos_slot = aapoint->pos_slot;
+   const uint tex_slot = aapoint->tex_slot;
+   const uint pos_slot = aapoint->pos_slot;
    float radius, *pos, *tex;
    uint i;
    float k;
@@ -643,16 +649,16 @@ aapoint_point(struct draw_stage *stage, struct prim_header *header)
    pos[1] += radius;
 
    /* new texcoords */
-   tex = v[0]->data[texPos];
+   tex = v[0]->data[tex_slot];
    ASSIGN_4V(tex, -1, -1, k, 1);
 
-   tex = v[1]->data[texPos];
+   tex = v[1]->data[tex_slot];
    ASSIGN_4V(tex,  1, -1, k, 1);
 
-   tex = v[2]->data[texPos];
+   tex = v[2]->data[tex_slot];
    ASSIGN_4V(tex,  1,  1, k, 1);
 
-   tex = v[3]->data[texPos];
+   tex = v[3]->data[tex_slot];
    ASSIGN_4V(tex, -1,  1, k, 1);
 
    /* emit 2 tris for the quad strip */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index f047d8359c4..d0d99aa331a 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -50,6 +50,9 @@
 #include "draw_pipe.h"
 
 
+/** Approx number of new tokens for instructions in pstip_transform_inst() */
+#define NUM_NEW_TOKENS 50
+
 
 /**
  * Subclass of pipe_shader_state to carry extra fragment shader info.
@@ -172,12 +175,7 @@ pstip_transform_immed(struct tgsi_transform_context *ctx,
 static int
 free_bit(uint bitfield)
 {
-   int i;
-   for (i = 0; i < 32; i++) {
-      if ((bitfield & (1 << i)) == 0)
-         return i;
-   }
-   return -1;
+   return ffs(~bitfield) - 1;
 }
 
 
@@ -333,11 +331,10 @@ generate_pstip_fs(struct pstip_stage *pstip)
    /*struct draw_context *draw = pstip->stage.draw;*/
    struct pipe_shader_state pstip_fs;
    struct pstip_transform_context transform;
-
-#define MAX 1000
+   const uint newLen = tgsi_num_tokens(orig_fs->tokens) + NUM_NEW_TOKENS;
 
    pstip_fs = *orig_fs; /* copy to init */
-   pstip_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   pstip_fs.tokens = tgsi_alloc_tokens(newLen);
    if (pstip_fs.tokens == NULL)
       return FALSE;
 
@@ -352,7 +349,7 @@ generate_pstip_fs(struct pstip_stage *pstip)
 
    tgsi_transform_shader(orig_fs->tokens,
                          (struct tgsi_token *) pstip_fs.tokens,
-                         MAX, &transform.base);
+                         newLen, &transform.base);
 
 #if 0 /* DEBUG */
    tgsi_dump(orig_fs->tokens, 0);
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 6a7190e9750..69466d8749d 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -280,8 +280,8 @@ void draw_gs_destroy( struct draw_context *draw );
 /*******************************************************************************
  * Common shading code:
  */
-int draw_current_shader_outputs(struct draw_context *draw);
-int draw_current_shader_position_output(struct draw_context *draw);
+uint draw_current_shader_outputs(const struct draw_context *draw);
+uint draw_current_shader_position_output(const struct draw_context *draw);
 
 /*******************************************************************************
  * Vertex processing (was passthrough) code:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index fbb9aa0e63a..f7a1bb74a9d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -264,6 +264,12 @@ static void
 micro_rcp(union tgsi_exec_channel *dst,
           const union tgsi_exec_channel *src)
 {
+#if 0 /* for debugging */
+   assert(src->f[0] != 0.0f);
+   assert(src->f[1] != 0.0f);
+   assert(src->f[2] != 0.0f);
+   assert(src->f[3] != 0.0f);
+#endif
    dst->f[0] = 1.0f / src->f[0];
    dst->f[1] = 1.0f / src->f[1];
    dst->f[2] = 1.0f / src->f[2];
@@ -284,6 +290,12 @@ static void
 micro_rsq(union tgsi_exec_channel *dst,
           const union tgsi_exec_channel *src)
 {
+#if 0 /* for debugging */
+   assert(src->f[0] != 0.0f);
+   assert(src->f[1] != 0.0f);
+   assert(src->f[2] != 0.0f);
+   assert(src->f[3] != 0.0f);
+#endif
    dst->f[0] = 1.0f / sqrtf(fabsf(src->f[0]));
    dst->f[1] = 1.0f / sqrtf(fabsf(src->f[1]));
    dst->f[2] = 1.0f / sqrtf(fabsf(src->f[2]));
@@ -450,12 +462,20 @@ static const union tgsi_exec_channel ZeroVec =
    { { 0.0, 0.0, 0.0, 0.0 } };
 
 
-#define CHECK_INF_OR_NAN(chan) do {\
-      assert(!util_is_inf_or_nan((chan)->f[0]));\
-      assert(!util_is_inf_or_nan((chan)->f[1]));\
-      assert(!util_is_inf_or_nan((chan)->f[2]));\
-      assert(!util_is_inf_or_nan((chan)->f[3]));\
-   } while (0)
+/**
+ * Debugging aid: assert that every one of the four floats in 'chan' is
+ * finite (neither Inf nor NaN).  Such values can legitimately show up
+ * while a program executes and must not crash anything, but trapping
+ * them is handy when hunting bugs.
+ */
+static INLINE void
+check_inf_or_nan(const union tgsi_exec_channel *chan)
+{
+   unsigned i;
+   for (i = 0; i < 4; i++) {
+      assert(!util_is_inf_or_nan(chan->f[i]));
+   }
+}
 
 
 #ifdef DEBUG
@@ -1219,8 +1239,9 @@ store_dest(struct tgsi_exec_machine *mach,
    int offset = 0;  /* indirection offset */
    int index;
 
-   if (dst_datatype == TGSI_EXEC_DATA_FLOAT) {
-      CHECK_INF_OR_NAN(chan);
+   /* for debugging */
+   if (0 && dst_datatype == TGSI_EXEC_DATA_FLOAT) {
+      check_inf_or_nan(chan);
    }
 
    /* There is an extra source register that indirectly subscripts
@@ -1478,7 +1499,7 @@ emit_primitive(struct tgsi_exec_machine *mach)
 }
 
 /*
- * Fetch a four texture samples using STR texture coordinates.
+ * Fetch four texture samples using STR texture coordinates.
  */
 static void
 fetch_texel( struct tgsi_sampler *sampler,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index fd37fc3079b..7e19e1fe36f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -284,3 +284,14 @@ tgsi_dup_tokens(const struct tgsi_token *tokens)
       memcpy(new_tokens, tokens, bytes);
    return new_tokens;
 }
+
+
+/** Allocate num_tokens tokens; NULL if the byte count overflows 'unsigned'. */
+struct tgsi_token *
+tgsi_alloc_tokens(unsigned num_tokens)
+{
+   unsigned bytes = num_tokens * sizeof(struct tgsi_token);
+   if (bytes / sizeof(struct tgsi_token) != num_tokens)
+      return NULL; /* size wrapped: refuse rather than under-allocate */
+   return (struct tgsi_token *) MALLOC(bytes);
+}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h
index 8150e3cd29d..b45ccee2f63 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h
@@ -130,6 +130,10 @@ tgsi_num_tokens(const struct tgsi_token *tokens);
 struct tgsi_token *
 tgsi_dup_tokens(const struct tgsi_token *tokens);
 
+struct tgsi_token *
+tgsi_alloc_tokens(unsigned num_tokens);
+
+
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 60a1cb1af4f..27960bac221 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -104,6 +104,8 @@ struct ureg_program
 
    struct {
       unsigned index;
+      unsigned semantic_name;
+      unsigned semantic_index;
    } gs_input[UREG_MAX_INPUT];
    unsigned nr_gs_inputs;
 
@@ -326,10 +328,14 @@ ureg_DECL_vs_input( struct ureg_program *ureg,
 
 struct ureg_src
 ureg_DECL_gs_input(struct ureg_program *ureg,
-                   unsigned index)
+                   unsigned index,
+                   unsigned semantic_name,
+                   unsigned semantic_index)
 {
    if (ureg->nr_gs_inputs < UREG_MAX_INPUT) {
       ureg->gs_input[ureg->nr_gs_inputs].index = index;
+      ureg->gs_input[ureg->nr_gs_inputs].semantic_name = semantic_name;
+      ureg->gs_input[ureg->nr_gs_inputs].semantic_index = semantic_index;
       ureg->nr_gs_inputs++;
    } else {
       set_bad(ureg);
@@ -1252,10 +1258,12 @@ static void emit_decls( struct ureg_program *ureg )
       }
    } else {
       for (i = 0; i < ureg->nr_gs_inputs; i++) {
-         emit_decl_range(ureg, 
-                         TGSI_FILE_INPUT, 
-                         ureg->gs_input[i].index,
-                         1);
+         emit_decl(ureg,
+                   TGSI_FILE_INPUT,
+                   ureg->gs_input[i].index,
+                   ureg->gs_input[i].semantic_name,
+                   ureg->gs_input[i].semantic_index,
+                   TGSI_INTERPOLATE_CONSTANT);
       }
    }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 6198ca34642..6be66d0694b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -161,7 +161,9 @@ ureg_DECL_vs_input( struct ureg_program *,
 
 struct ureg_src
 ureg_DECL_gs_input(struct ureg_program *,
-                   unsigned index);
+                   unsigned index,
+                   unsigned semantic_name,
+                   unsigned semantic_index);
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *,
diff --git a/src/gallium/docs/source/conf.py b/src/gallium/docs/source/conf.py
index 9b0c86babdb..59c19ed98dd 100644
--- a/src/gallium/docs/source/conf.py
+++ b/src/gallium/docs/source/conf.py
@@ -16,13 +16,13 @@ import sys, os
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#sys.path.append(os.path.abspath('.'))
+sys.path.append(os.path.abspath('exts'))
 
 # -- General configuration -----------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be extensions
 # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = ['sphinx.ext.pngmath']
+extensions = ['sphinx.ext.pngmath', 'tgsi']
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
diff --git a/src/gallium/docs/source/exts/tgsi.py b/src/gallium/docs/source/exts/tgsi.py
new file mode 100644
index 00000000000..e92cd5c4d1b
--- /dev/null
+++ b/src/gallium/docs/source/exts/tgsi.py
@@ -0,0 +1,17 @@
+# tgsi.py
+# Sphinx extension providing formatting for TGSI opcodes
+# (c) Corbin Simpson 2010
+
+import docutils.nodes
+import sphinx.addnodes
+
+def parse_opcode(env, sig, signode):
+    """Render a "NAME - description" signature; return the upper-cased NAME."""
+    mnemonic, summary = [part.strip() for part in sig.split("-", 1)]
+    mnemonic, summary = mnemonic.upper(), " (%s)" % summary
+    signode += sphinx.addnodes.desc_name(mnemonic, mnemonic)
+    signode += sphinx.addnodes.desc_annotation(summary, summary)
+    return mnemonic
+
+def setup(app):  # hooks parse_opcode up to the "opcode" markup used in tgsi.rst
+    app.add_description_unit("opcode", "opcode", "%s (TGSI opcode)", parse_opcode)
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 3e57a282fd4..55a4c6990de 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -227,7 +227,7 @@ buffer_map
 
 Map a buffer into memory.
 
-**usage** is a bitmask of :ref:`PIPE_TEXTURE_USAGE` flags.
+**usage** is a bitmask of :ref:`PIPE_BUFFER_USAGE` flags.
 
 Returns a pointer to the map, or NULL if the mapping failed.
 
@@ -248,7 +248,7 @@ Flush a range of mapped memory into a buffer.
 
 The buffer must have been mapped with ``PIPE_BUFFER_USAGE_FLUSH_EXPLICIT``.
 
-**usage** is a bitmask of :ref:`PIPE_TEXTURE_USAGE` flags.
+**usage** is a bitmask of :ref:`PIPE_BUFFER_USAGE` flags.
 
 buffer_unmap
 ^^^^^^^^^^^^
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 3e702ceeda4..5478d866678 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -6,6 +6,23 @@ for describing shaders. Since Gallium is inherently shaderful, shaders are
 an important part of the API. TGSI is the only intermediate representation
 used by all drivers.
 
+Basics
+------
+
+All TGSI instructions, known as *opcodes*, operate on arbitrary-precision
+floating-point four-component vectors. An opcode may have up to one
+destination register, known as *dst*, and between zero and three source
+registers, called *src0* through *src2*, or simply *src* if there is only
+one.
+
+Some instructions, like :opcode:`I2F`, permit re-interpretation of vector
+components as integers. Other instructions permit using registers as
+two-component vectors with double precision; see :ref:`Double Opcodes`.
+
+When an instruction has a scalar result, the result is usually copied into
+each of the components of *dst*. When this happens, the result is said to be
+*replicated* to *dst*. :opcode:`RCP` is one such instruction.
+
 Instruction Set
 ---------------
 
@@ -13,7 +30,7 @@ From GL_NV_vertex_program
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
 
-ARL - Address Register Load
+.. opcode:: ARL - Address Register Load
 
 .. math::
 
@@ -26,7 +43,7 @@ ARL - Address Register Load
   dst.w = \lfloor src.w\rfloor
 
 
-MOV - Move
+.. opcode:: MOV - Move
 
 .. math::
 
@@ -39,7 +56,7 @@ MOV - Move
   dst.w = src.w
 
 
-LIT - Light Coefficients
+.. opcode:: LIT - Light Coefficients
 
 .. math::
 
@@ -52,33 +69,25 @@ LIT - Light Coefficients
   dst.w = 1
 
 
-RCP - Reciprocal
-
-.. math::
+.. opcode:: RCP - Reciprocal
 
-  dst.x = \frac{1}{src.x}
+This instruction replicates its result.
 
-  dst.y = \frac{1}{src.x}
+.. math::
 
-  dst.z = \frac{1}{src.x}
+  dst = \frac{1}{src.x}
 
-  dst.w = \frac{1}{src.x}
 
+.. opcode:: RSQ - Reciprocal Square Root
 
-RSQ - Reciprocal Square Root
+This instruction replicates its result.
 
 .. math::
 
-  dst.x = \frac{1}{\sqrt{|src.x|}}
-
-  dst.y = \frac{1}{\sqrt{|src.x|}}
-
-  dst.z = \frac{1}{\sqrt{|src.x|}}
+  dst = \frac{1}{\sqrt{|src.x|}}
 
-  dst.w = \frac{1}{\sqrt{|src.x|}}
 
-
-EXP - Approximate Exponential Base 2
+.. opcode:: EXP - Approximate Exponential Base 2
 
 .. math::
 
@@ -91,7 +100,7 @@ EXP - Approximate Exponential Base 2
   dst.w = 1
 
 
-LOG - Approximate Logarithm Base 2
+.. opcode:: LOG - Approximate Logarithm Base 2
 
 .. math::
 
@@ -104,7 +113,7 @@ LOG - Approximate Logarithm Base 2
   dst.w = 1
 
 
-MUL - Multiply
+.. opcode:: MUL - Multiply
 
 .. math::
 
@@ -117,7 +126,7 @@ MUL - Multiply
   dst.w = src0.w \times src1.w
 
 
-ADD - Add
+.. opcode:: ADD - Add
 
 .. math::
 
@@ -130,33 +139,25 @@ ADD - Add
   dst.w = src0.w + src1.w
 
 
-DP3 - 3-component Dot Product
-
-.. math::
+.. opcode:: DP3 - 3-component Dot Product
 
-  dst.x = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+This instruction replicates its result.
 
-  dst.y = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+.. math::
 
-  dst.z = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
+  dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
 
-  dst.w = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z
 
+.. opcode:: DP4 - 4-component Dot Product
 
-DP4 - 4-component Dot Product
+This instruction replicates its result.
 
 .. math::
 
-  dst.x = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
-
-  dst.y = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
+  dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
 
-  dst.z = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
 
-  dst.w = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src0.w \times src1.w
-
-
-DST - Distance Vector
+.. opcode:: DST - Distance Vector
 
 .. math::
 
@@ -169,7 +170,7 @@ DST - Distance Vector
   dst.w = src1.w
 
 
-MIN - Minimum
+.. opcode:: MIN - Minimum
 
 .. math::
 
@@ -182,7 +183,7 @@ MIN - Minimum
   dst.w = min(src0.w, src1.w)
 
 
-MAX - Maximum
+.. opcode:: MAX - Maximum
 
 .. math::
 
@@ -195,7 +196,7 @@ MAX - Maximum
   dst.w = max(src0.w, src1.w)
 
 
-SLT - Set On Less Than
+.. opcode:: SLT - Set On Less Than
 
 .. math::
 
@@ -208,7 +209,7 @@ SLT - Set On Less Than
   dst.w = (src0.w < src1.w) ? 1 : 0
 
 
-SGE - Set On Greater Equal Than
+.. opcode:: SGE - Set On Greater Equal Than
 
 .. math::
 
@@ -221,7 +222,7 @@ SGE - Set On Greater Equal Than
   dst.w = (src0.w >= src1.w) ? 1 : 0
 
 
-MAD - Multiply And Add
+.. opcode:: MAD - Multiply And Add
 
 .. math::
 
@@ -234,7 +235,7 @@ MAD - Multiply And Add
   dst.w = src0.w \times src1.w + src2.w
 
 
-SUB - Subtract
+.. opcode:: SUB - Subtract
 
 .. math::
 
@@ -247,7 +248,7 @@ SUB - Subtract
   dst.w = src0.w - src1.w
 
 
-LRP - Linear Interpolate
+.. opcode:: LRP - Linear Interpolate
 
 .. math::
 
@@ -260,7 +261,7 @@ LRP - Linear Interpolate
   dst.w = src0.w \times src1.w + (1 - src0.w) \times src2.w
 
 
-CND - Condition
+.. opcode:: CND - Condition
 
 .. math::
 
@@ -273,7 +274,7 @@ CND - Condition
   dst.w = (src2.w > 0.5) ? src0.w : src1.w
 
 
-DP2A - 2-component Dot Product And Add
+.. opcode:: DP2A - 2-component Dot Product And Add
 
 .. math::
 
@@ -286,7 +287,7 @@ DP2A - 2-component Dot Product And Add
   dst.w = src0.x \times src1.x + src0.y \times src1.y + src2.x
 
 
-FRAC - Fraction
+.. opcode:: FRAC - Fraction
 
 .. math::
 
@@ -299,7 +300,7 @@ FRAC - Fraction
   dst.w = src.w - \lfloor src.w\rfloor
 
 
-CLAMP - Clamp
+.. opcode:: CLAMP - Clamp
 
 .. math::
 
@@ -312,9 +313,9 @@ CLAMP - Clamp
   dst.w = clamp(src0.w, src1.w, src2.w)
 
 
-FLR - Floor
+.. opcode:: FLR - Floor
 
-This is identical to ARL.
+This is identical to :opcode:`ARL`.
 
 .. math::
 
@@ -327,7 +328,7 @@ This is identical to ARL.
   dst.w = \lfloor src.w\rfloor
 
 
-ROUND - Round
+.. opcode:: ROUND - Round
 
 .. math::
 
@@ -340,45 +341,33 @@ ROUND - Round
   dst.w = round(src.w)
 
 
-EX2 - Exponential Base 2
+.. opcode:: EX2 - Exponential Base 2
 
-.. math::
+This instruction replicates its result.
 
-  dst.x = 2^{src.x}
-
-  dst.y = 2^{src.x}
+.. math::
 
-  dst.z = 2^{src.x}
+  dst = 2^{src.x}
 
-  dst.w = 2^{src.x}
 
+.. opcode:: LG2 - Logarithm Base 2
 
-LG2 - Logarithm Base 2
+This instruction replicates its result.
 
 .. math::
 
-  dst.x = \log_2{src.x}
-
-  dst.y = \log_2{src.x}
-
-  dst.z = \log_2{src.x}
+  dst = \log_2{src.x}
 
-  dst.w = \log_2{src.x}
 
+.. opcode:: POW - Power
 
-POW - Power
+This instruction replicates its result.
 
 .. math::
 
-  dst.x = src0.x^{src1.x}
+  dst = src0.x^{src1.x}
 
-  dst.y = src0.x^{src1.x}
-
-  dst.z = src0.x^{src1.x}
-
-  dst.w = src0.x^{src1.x}
-
-XPD - Cross Product
+.. opcode:: XPD - Cross Product
 
 .. math::
 
@@ -391,7 +380,7 @@ XPD - Cross Product
   dst.w = 1
 
 
-ABS - Absolute
+.. opcode:: ABS - Absolute
 
 .. math::
 
@@ -404,48 +393,36 @@ ABS - Absolute
   dst.w = |src.w|
 
 
-RCC - Reciprocal Clamped
+.. opcode:: RCC - Reciprocal Clamped
+
+This instruction replicates its result.
 
 XXX cleanup on aisle three
 
 .. math::
 
-  dst.x = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
-
-  dst.y = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
-
-  dst.z = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
+  dst = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
 
-  dst.w = (1 / src.x) > 0 ? clamp(1 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1 / src.x, -1.884467e+019, -5.42101e-020)
 
+.. opcode:: DPH - Homogeneous Dot Product
 
-DPH - Homogeneous Dot Product
+This instruction replicates its result.
 
 .. math::
 
-  dst.x = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
+  dst = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
 
-  dst.y = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
 
-  dst.z = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
+.. opcode:: COS - Cosine
 
-  dst.w = src0.x \times src1.x + src0.y \times src1.y + src0.z \times src1.z + src1.w
-
-
-COS - Cosine
+This instruction replicates its result.
 
 .. math::
 
-  dst.x = \cos{src.x}
-
-  dst.y = \cos{src.x}
-
-  dst.z = \cos{src.x}
+  dst = \cos{src.x}
 
-  dst.w = \cos{src.x}
 
-
-DDX - Derivative Relative To X
+.. opcode:: DDX - Derivative Relative To X
 
 .. math::
 
@@ -458,7 +435,7 @@ DDX - Derivative Relative To X
   dst.w = partialx(src.w)
 
 
-DDY - Derivative Relative To Y
+.. opcode:: DDY - Derivative Relative To Y
 
 .. math::
 
@@ -471,32 +448,32 @@ DDY - Derivative Relative To Y
   dst.w = partialy(src.w)
 
 
-KILP - Predicated Discard
+.. opcode:: KILP - Predicated Discard
 
   discard
 
 
-PK2H - Pack Two 16-bit Floats
+.. opcode:: PK2H - Pack Two 16-bit Floats
 
   TBD
 
 
-PK2US - Pack Two Unsigned 16-bit Scalars
+.. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
 
   TBD
 
 
-PK4B - Pack Four Signed 8-bit Scalars
+.. opcode:: PK4B - Pack Four Signed 8-bit Scalars
 
   TBD
 
 
-PK4UB - Pack Four Unsigned 8-bit Scalars
+.. opcode:: PK4UB - Pack Four Unsigned 8-bit Scalars
 
   TBD
 
 
-RFL - Reflection Vector
+.. opcode:: RFL - Reflection Vector
 
 .. math::
 
@@ -508,10 +485,12 @@ RFL - Reflection Vector
 
   dst.w = 1
 
-Considered for removal.
+.. note::
+
+   Considered for removal.
 
 
-SEQ - Set On Equal
+.. opcode:: SEQ - Set On Equal
 
 .. math::
 
@@ -524,21 +503,20 @@ SEQ - Set On Equal
   dst.w = (src0.w == src1.w) ? 1 : 0
 
 
-SFL - Set On False
+.. opcode:: SFL - Set On False
 
-.. math::
+This instruction replicates its result.
 
-  dst.x = 0
+.. math::
 
-  dst.y = 0
+  dst = 0
 
-  dst.z = 0
+.. note::
 
-  dst.w = 0
+   Considered for removal.
 
-Considered for removal.
 
-SGT - Set On Greater Than
+.. opcode:: SGT - Set On Greater Than
 
 .. math::
 
@@ -551,20 +529,16 @@ SGT - Set On Greater Than
   dst.w = (src0.w > src1.w) ? 1 : 0
 
 
-SIN - Sine
+.. opcode:: SIN - Sine
 
-.. math::
+This instruction replicates its result.
 
-  dst.x = \sin{src.x}
-
-  dst.y = \sin{src.x}
-
-  dst.z = \sin{src.x}
+.. math::
 
-  dst.w = \sin{src.x}
+  dst = \sin{src.x}
 
 
-SLE - Set On Less Equal Than
+.. opcode:: SLE - Set On Less Equal Than
 
 .. math::
 
@@ -577,7 +551,7 @@ SLE - Set On Less Equal Than
   dst.w = (src0.w <= src1.w) ? 1 : 0
 
 
-SNE - Set On Not Equal
+.. opcode:: SNE - Set On Not Equal
 
 .. math::
 
@@ -590,59 +564,63 @@ SNE - Set On Not Equal
   dst.w = (src0.w != src1.w) ? 1 : 0
 
 
-STR - Set On True
+.. opcode:: STR - Set On True
 
-.. math::
+This instruction replicates its result.
 
-  dst.x = 1
-
-  dst.y = 1
-
-  dst.z = 1
+.. math::
 
-  dst.w = 1
+  dst = 1
 
 
-TEX - Texture Lookup
+.. opcode:: TEX - Texture Lookup
 
   TBD
 
 
-TXD - Texture Lookup with Derivatives
+.. opcode:: TXD - Texture Lookup with Derivatives
 
   TBD
 
 
-TXP - Projective Texture Lookup
+.. opcode:: TXP - Projective Texture Lookup
 
   TBD
 
 
-UP2H - Unpack Two 16-Bit Floats
+.. opcode:: UP2H - Unpack Two 16-Bit Floats
 
   TBD
 
-  Considered for removal.
+.. note::
 
-UP2US - Unpack Two Unsigned 16-Bit Scalars
+   Considered for removal.
+
+.. opcode:: UP2US - Unpack Two Unsigned 16-Bit Scalars
 
   TBD
 
-  Considered for removal.
+.. note::
+
+   Considered for removal.
 
-UP4B - Unpack Four Signed 8-Bit Values
+.. opcode:: UP4B - Unpack Four Signed 8-Bit Values
 
   TBD
 
-  Considered for removal.
+.. note::
+
+   Considered for removal.
 
-UP4UB - Unpack Four Unsigned 8-Bit Scalars
+.. opcode:: UP4UB - Unpack Four Unsigned 8-Bit Scalars
 
   TBD
 
-  Considered for removal.
+.. note::
 
-X2D - 2D Coordinate Transformation
+   Considered for removal.
+
+.. opcode:: X2D - 2D Coordinate Transformation
 
 .. math::
 
@@ -654,20 +632,24 @@ X2D - 2D Coordinate Transformation
 
   dst.w = src0.y + src1.x \times src2.z + src1.y \times src2.w
 
-Considered for removal.
+.. note::
+
+   Considered for removal.
 
 
 From GL_NV_vertex_program2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 
-ARA - Address Register Add
+.. opcode:: ARA - Address Register Add
 
   TBD
 
-  Considered for removal.
+.. note::
 
-ARR - Address Register Load With Round
+   Considered for removal.
+
+.. opcode:: ARR - Address Register Load With Round
 
 .. math::
 
@@ -680,26 +662,28 @@ ARR - Address Register Load With Round
   dst.w = round(src.w)
 
 
-BRA - Branch
+.. opcode:: BRA - Branch
 
   pc = target
 
-  Considered for removal.
+.. note::
+
+   Considered for removal.
 
-CAL - Subroutine Call
+.. opcode:: CAL - Subroutine Call
 
   push(pc)
   pc = target
 
 
-RET - Subroutine Call Return
+.. opcode:: RET - Subroutine Call Return
 
   pc = pop()
 
   Potential restrictions:  
   * Only occurs at end of function.
 
-SSG - Set Sign
+.. opcode:: SSG - Set Sign
 
 .. math::
 
@@ -712,7 +696,7 @@ SSG - Set Sign
   dst.w = (src.w > 0) ? 1 : (src.w < 0) ? -1 : 0
 
 
-CMP - Compare
+.. opcode:: CMP - Compare
 
 .. math::
 
@@ -725,7 +709,7 @@ CMP - Compare
   dst.w = (src0.w < 0) ? src1.w : src2.w
 
 
-KIL - Conditional Discard
+.. opcode:: KIL - Conditional Discard
 
 .. math::
 
@@ -734,7 +718,7 @@ KIL - Conditional Discard
   endif
 
 
-SCS - Sine Cosine
+.. opcode:: SCS - Sine Cosine
 
 .. math::
 
@@ -747,12 +731,12 @@ SCS - Sine Cosine
   dst.y = 1
 
 
-TXB - Texture Lookup With Bias
+.. opcode:: TXB - Texture Lookup With Bias
 
   TBD
 
 
-NRM - 3-component Vector Normalise
+.. opcode:: NRM - 3-component Vector Normalise
 
 .. math::
 
@@ -765,7 +749,7 @@ NRM - 3-component Vector Normalise
   dst.w = 1
 
 
-DIV - Divide
+.. opcode:: DIV - Divide
 
 .. math::
 
@@ -778,35 +762,31 @@ DIV - Divide
   dst.w = \frac{src0.w}{src1.w}
 
 
-DP2 - 2-component Dot Product
-
-.. math::
-
-  dst.x = src0.x \times src1.x + src0.y \times src1.y
+.. opcode:: DP2 - 2-component Dot Product
 
-  dst.y = src0.x \times src1.x + src0.y \times src1.y
+This instruction replicates its result.
 
-  dst.z = src0.x \times src1.x + src0.y \times src1.y
+.. math::
 
-  dst.w = src0.x \times src1.x + src0.y \times src1.y
+  dst = src0.x \times src1.x + src0.y \times src1.y
 
 
-TXL - Texture Lookup With LOD
+.. opcode:: TXL - Texture Lookup With LOD
 
   TBD
 
 
-BRK - Break
+.. opcode:: BRK - Break
 
   TBD
 
 
-IF - If
+.. opcode:: IF - If
 
   TBD
 
 
-BGNFOR - Begin a For-Loop
+.. opcode:: BGNFOR - Begin a For-Loop
 
   dst.x = floor(src.x)
   dst.y = floor(src.y)
@@ -819,25 +799,31 @@ BGNFOR - Begin a For-Loop
   Note: The destination must be a loop register.
         The source must be a constant register.
 
-  Considered for cleanup / removal.
+.. note::
 
+   Considered for cleanup.
 
-REP - Repeat
+.. note::
+
+   Considered for removal.
+
+
+.. opcode:: REP - Repeat
 
   TBD
 
 
-ELSE - Else
+.. opcode:: ELSE - Else
 
   TBD
 
 
-ENDIF - End If
+.. opcode:: ENDIF - End If
 
   TBD
 
 
-ENDFOR - End a For-Loop
+.. opcode:: ENDFOR - End a For-Loop
 
   dst.x = dst.x + dst.z
   dst.y = dst.y - 1.0
@@ -848,30 +834,48 @@ ENDFOR - End a For-Loop
 
   Note: The destination must be a loop register.
 
-  Considered for cleanup / removal.
+.. note::
+
+   Considered for cleanup.
+
+.. note::
 
-ENDREP - End Repeat
+   Considered for removal.
+
+.. opcode:: ENDREP - End Repeat
 
   TBD
 
 
-PUSHA - Push Address Register On Stack
+.. opcode:: PUSHA - Push Address Register On Stack
 
   push(src.x)
   push(src.y)
   push(src.z)
   push(src.w)
 
-  Considered for cleanup / removal.
+.. note::
+
+   Considered for cleanup.
+
+.. note::
 
-POPA - Pop Address Register From Stack
+   Considered for removal.
+
+.. opcode:: POPA - Pop Address Register From Stack
 
   dst.w = pop()
   dst.z = pop()
   dst.y = pop()
   dst.x = pop()
 
-  Considered for cleanup / removal.
+.. note::
+
+   Considered for cleanup.
+
+.. note::
+
+   Considered for removal.
 
 
 From GL_NV_gpu_program4
@@ -879,7 +883,7 @@ From GL_NV_gpu_program4
 
 Support for these opcodes indicated by a special pipe capability bit (TBD).
 
-CEIL - Ceiling
+.. opcode:: CEIL - Ceiling
 
 .. math::
 
@@ -892,7 +896,7 @@ CEIL - Ceiling
   dst.w = \lceil src.w\rceil
 
 
-I2F - Integer To Float
+.. opcode:: I2F - Integer To Float
 
 .. math::
 
@@ -905,7 +909,7 @@ I2F - Integer To Float
   dst.w = (float) src.w
 
 
-NOT - Bitwise Not
+.. opcode:: NOT - Bitwise Not
 
 .. math::
 
@@ -918,7 +922,7 @@ NOT - Bitwise Not
   dst.w = ~src.w
 
 
-TRUNC - Truncate
+.. opcode:: TRUNC - Truncate
 
 .. math::
 
@@ -931,7 +935,7 @@ TRUNC - Truncate
   dst.w = trunc(src.w)
 
 
-SHL - Shift Left
+.. opcode:: SHL - Shift Left
 
 .. math::
 
@@ -944,7 +948,7 @@ SHL - Shift Left
   dst.w = src0.w << src1.x
 
 
-SHR - Shift Right
+.. opcode:: SHR - Shift Right
 
 .. math::
 
@@ -957,7 +961,7 @@ SHR - Shift Right
   dst.w = src0.w >> src1.x
 
 
-AND - Bitwise And
+.. opcode:: AND - Bitwise And
 
 .. math::
 
@@ -970,7 +974,7 @@ AND - Bitwise And
   dst.w = src0.w & src1.w
 
 
-OR - Bitwise Or
+.. opcode:: OR - Bitwise Or
 
 .. math::
 
@@ -983,7 +987,7 @@ OR - Bitwise Or
   dst.w = src0.w | src1.w
 
 
-MOD - Modulus
+.. opcode:: MOD - Modulus
 
 .. math::
 
@@ -996,7 +1000,7 @@ MOD - Modulus
   dst.w = src0.w \bmod src1.w
 
 
-XOR - Bitwise Xor
+.. opcode:: XOR - Bitwise Xor
 
 .. math::
 
@@ -1009,7 +1013,7 @@ XOR - Bitwise Xor
   dst.w = src0.w \oplus src1.w
 
 
-SAD - Sum Of Absolute Differences
+.. opcode:: SAD - Sum Of Absolute Differences
 
 .. math::
 
@@ -1022,17 +1026,17 @@ SAD - Sum Of Absolute Differences
   dst.w = |src0.w - src1.w| + src2.w
 
 
-TXF - Texel Fetch
+.. opcode:: TXF - Texel Fetch
 
   TBD
 
 
-TXQ - Texture Size Query
+.. opcode:: TXQ - Texture Size Query
 
   TBD
 
 
-CONT - Continue
+.. opcode:: CONT - Continue
 
   TBD
 
@@ -1041,12 +1045,12 @@ From GL_NV_geometry_program4
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 
-EMIT - Emit
+.. opcode:: EMIT - Emit
 
   TBD
 
 
-ENDPRIM - End Primitive
+.. opcode:: ENDPRIM - End Primitive
 
   TBD
 
@@ -1055,66 +1059,64 @@ From GLSL
 ^^^^^^^^^^
 
 
-BGNLOOP - Begin a Loop
+.. opcode:: BGNLOOP - Begin a Loop
 
   TBD
 
 
-BGNSUB - Begin Subroutine
+.. opcode:: BGNSUB - Begin Subroutine
 
   TBD
 
 
-ENDLOOP - End a Loop
+.. opcode:: ENDLOOP - End a Loop
 
   TBD
 
 
-ENDSUB - End Subroutine
+.. opcode:: ENDSUB - End Subroutine
 
   TBD
 
 
-NOP - No Operation
+.. opcode:: NOP - No Operation
 
   Do nothing.
 
 
-NRM4 - 4-component Vector Normalise
-
-.. math::
+.. opcode:: NRM4 - 4-component Vector Normalise
 
-  dst.x = \frac{src.x}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+Every component of src is divided by the same scalar; the result is not replicated.
 
-  dst.y = \frac{src.y}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
-
-  dst.z = \frac{src.z}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+.. math::
 
-  dst.w = \frac{src.w}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
+  dst = \frac{src}{src.x \times src.x + src.y \times src.y + src.z \times src.z + src.w \times src.w}
 
 
 ps_2_x
 ^^^^^^^^^^^^
 
 
-CALLNZ - Subroutine Call If Not Zero
+.. opcode:: CALLNZ - Subroutine Call If Not Zero
 
   TBD
 
 
-IFC - If
+.. opcode:: IFC - If
 
   TBD
 
 
-BREAKC - Break Conditional
+.. opcode:: BREAKC - Break Conditional
 
   TBD
 
+.. _doubleopcodes:
+
 Double Opcodes
 ^^^^^^^^^^^^^^^
 
-DADD - Add Double
+.. opcode:: DADD - Add Double
 
 .. math::
 
@@ -1123,7 +1125,7 @@ DADD - Add Double
   dst.zw = src0.zw + src1.zw
 
 
-DDIV - Divide Double
+.. opcode:: DDIV - Divide Double
 
 .. math::
 
@@ -1131,7 +1133,7 @@ DDIV - Divide Double
 
   dst.zw = src0.zw / src1.zw
 
-DSEQ - Set Double on Equal
+.. opcode:: DSEQ - Set Double on Equal
 
 .. math::
 
@@ -1139,7 +1141,7 @@ DSEQ - Set Double on Equal
 
   dst.zw = src0.zw == src1.zw ? 1.0F : 0.0F
 
-DSLT - Set Double on Less than
+.. opcode:: DSLT - Set Double on Less than
 
 .. math::
 
@@ -1147,7 +1149,7 @@ DSLT - Set Double on Less than
 
   dst.zw = src0.zw < src1.zw ? 1.0F : 0.0F
 
-DFRAC - Double Fraction
+.. opcode:: DFRAC - Double Fraction
 
 .. math::
 
@@ -1156,7 +1158,7 @@ DFRAC - Double Fraction
   dst.zw = src.zw - \lfloor src.zw\rfloor
 
 
-DFRACEXP - Convert Double Number to Fractional and Integral Components
+.. opcode:: DFRACEXP - Convert Double Number to Fractional and Integral Components
 
 .. math::
 
@@ -1164,7 +1166,7 @@ DFRACEXP - Convert Double Number to Fractional and Integral Components
 
   dst0.zw = frexp(src.zw, dst1.zw)
 
-DLDEXP - Multiple Double Number by Integral Power of 2
+.. opcode:: DLDEXP - Multiply Double Number by Integral Power of 2
 
 .. math::
 
@@ -1172,7 +1174,7 @@ DLDEXP - Multiple Double Number by Integral Power of 2
 
   dst.zw = ldexp(src0.zw, src1.zw)
 
-DMIN - Minimum Double
+.. opcode:: DMIN - Minimum Double
 
 .. math::
 
@@ -1180,7 +1182,7 @@ DMIN - Minimum Double
 
   dst.zw = min(src0.zw, src1.zw)
 
-DMAX - Maximum Double
+.. opcode:: DMAX - Maximum Double
 
 .. math::
 
@@ -1188,7 +1190,7 @@ DMAX - Maximum Double
 
   dst.zw = max(src0.zw, src1.zw)
 
-DMUL - Multiply Double
+.. opcode:: DMUL - Multiply Double
 
 .. math::
 
@@ -1197,7 +1199,7 @@ DMUL - Multiply Double
   dst.zw = src0.zw \times src1.zw
 
 
-DMAD - Multiply And Add Doubles
+.. opcode:: DMAD - Multiply And Add Doubles
 
 .. math::
 
@@ -1206,7 +1208,7 @@ DMAD - Multiply And Add Doubles
   dst.zw = src0.zw \times src1.zw + src2.zw
 
 
-DRCP - Reciprocal Double
+.. opcode:: DRCP - Reciprocal Double
 
 .. math::
 
@@ -1214,7 +1216,7 @@ DRCP - Reciprocal Double
 
    dst.zw = \frac{1}{src.zw}
 
-DSQRT - Square root double
+.. opcode:: DSQRT - Square root double
 
 .. math::
 
@@ -1269,20 +1271,8 @@ Keywords
 
   discard           Discard fragment.
 
-  dst               First destination register.
-
-  dst0              First destination register.
-
   pc                Program counter.
 
-  src               First source register.
-
-  src0              First source register.
-
-  src1              Second source register.
-
-  src2              Third source register.
-
   target            Label of target instruction.
 
 
@@ -1441,3 +1431,43 @@ GL_ARB_fragment_coord_conventions extension.
 
 DirectX 9 uses INTEGER.
 DirectX 10 uses HALF_INTEGER.
+
+
+
+Texture Sampling and Texture Formats
+------------------------------------
+
+This table shows how texture image components are returned as (x,y,z,w) tuples
+by TGSI texture instructions, such as :opcode:`TEX`, :opcode:`TXD`, and
+:opcode:`TXP`. For reference, OpenGL and Direct3D conventions are shown as
+well.
+
++--------------------+--------------+--------------------+--------------+
+| Texture Components | Gallium      | OpenGL             | Direct3D 9   |
++====================+==============+====================+==============+
+| R                  | XXX TBD      | (r, 0, 0, 1)       | (r, 1, 1, 1) |
++--------------------+--------------+--------------------+--------------+
+| RG                 | XXX TBD      | (r, g, 0, 1)       | (r, g, 1, 1) |
++--------------------+--------------+--------------------+--------------+
+| RGB                | (r, g, b, 1) | (r, g, b, 1)       | (r, g, b, 1) |
++--------------------+--------------+--------------------+--------------+
+| RGBA               | (r, g, b, a) | (r, g, b, a)       | (r, g, b, a) |
++--------------------+--------------+--------------------+--------------+
+| A                  | (0, 0, 0, a) | (0, 0, 0, a)       | (0, 0, 0, a) |
++--------------------+--------------+--------------------+--------------+
+| L                  | (l, l, l, 1) | (l, l, l, 1)       | (l, l, l, 1) |
++--------------------+--------------+--------------------+--------------+
+| LA                 | (l, l, l, a) | (l, l, l, a)       | (l, l, l, a) |
++--------------------+--------------+--------------------+--------------+
+| I                  | (i, i, i, i) | (i, i, i, i)       | N/A          |
++--------------------+--------------+--------------------+--------------+
+| UV                 | XXX TBD      | (0, 0, 0, 1)       | (u, v, 1, 1) |
+|                    |              | [#envmap-bumpmap]_ |              |
++--------------------+--------------+--------------------+--------------+
+| Z                  | XXX TBD      | (z, z, z, 1)       | (0, z, 0, 1) |
+|                    |              | [#depth-tex-mode]_ |              |
++--------------------+--------------+--------------------+--------------+
+
+.. [#envmap-bumpmap] http://www.opengl.org/registry/specs/ATI/envmap_bumpmap.txt
+.. [#depth-tex-mode] the default is (z, z, z, 1) but may also be (0, 0, 0, z)
+   or (z, z, z, z) depending on the value of GL_DEPTH_TEXTURE_MODE.
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 0155b9be501..353ae176fdb 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -65,7 +65,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       /* compute vertex layout now */
       const struct lp_fragment_shader *lpfs = llvmpipe->fs;
       struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
-      const uint num = draw_current_shader_outputs(llvmpipe->draw);
+      const uint num = draw_num_shader_outputs(llvmpipe->draw);
       uint i;
 
       /* Tell draw_vbuf to simply emit the whole post-xform vertex
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index 60ea9c171d5..39bcdc8fe60 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -77,17 +77,21 @@ void r300_shader_read_fs_inputs(struct tgsi_shader_info* info,
 static void find_output_registers(struct r300_fragment_program_compiler * compiler,
                                   struct r300_fragment_shader * fs)
 {
-    unsigned i;
+    unsigned i, colorbuf_count = 0;
 
     /* Mark the outputs as not present initially */
-    compiler->OutputColor = fs->info.num_outputs;
+    compiler->OutputColor[0] = fs->info.num_outputs;
+    compiler->OutputColor[1] = fs->info.num_outputs;
+    compiler->OutputColor[2] = fs->info.num_outputs;
+    compiler->OutputColor[3] = fs->info.num_outputs;
     compiler->OutputDepth = fs->info.num_outputs;
 
     /* Now see where they really are. */
     for(i = 0; i < fs->info.num_outputs; ++i) {
         switch(fs->info.output_semantic_name[i]) {
             case TGSI_SEMANTIC_COLOR:
-                compiler->OutputColor = i;
+                compiler->OutputColor[colorbuf_count] = i;
+                colorbuf_count++;
                 break;
             case TGSI_SEMANTIC_POSITION:
                 compiler->OutputDepth = i;
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 453fb1accc2..b37be261337 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -37,24 +37,31 @@ unsigned r300_texture_get_stride(struct r300_screen* screen,
 unsigned r300_texture_get_offset(struct r300_texture* tex, unsigned level,
                                  unsigned zslice, unsigned face);
 
-/* Note the signature of R300_EASY_TX_FORMAT(A, R, G, B, FORMAT)... */
+/* Translate a pipe_format into a useful texture format for sampling.
+ *
+ * R300_EASY_TX_FORMAT swizzles the texture.
+ * Note the signature of R300_EASY_TX_FORMAT:
+ *   R300_EASY_TX_FORMAT(B, G, R, A, FORMAT);
+ *
+ * The FORMAT specifies how the texture sampler will treat the texture, and
+ * makes available X, Y, Z, W, ZERO, and ONE for swizzling. */
 static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
 {
     switch (format) {
         /* X8 */
         case PIPE_FORMAT_A8_UNORM:
+            return R300_EASY_TX_FORMAT(ZERO, ZERO, ZERO, X, X8);
         case PIPE_FORMAT_I8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, X8);
         case PIPE_FORMAT_L8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, ONE, X8);
         /* X16 */
         case PIPE_FORMAT_R16_UNORM:
+        case PIPE_FORMAT_Z16_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, X16);
         case PIPE_FORMAT_R16_SNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, X16) |
                 R300_TX_FORMAT_SIGNED;
-        case PIPE_FORMAT_Z16_UNORM:
-            return R300_EASY_TX_FORMAT(X, X, X, X, X16);
         /* Y8X8 */
         case PIPE_FORMAT_A8L8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8);
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index a792c2cf989..941ec17016b 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -201,6 +201,8 @@ static void transform_srcreg(
     struct rc_src_register * dst,
     struct tgsi_full_src_register * src)
 {
+    unsigned i, j;
+
     dst->File = translate_register_file(src->Register.File);
     dst->Index = translate_register_index(ttr, src->Register.File, src->Register.Index);
     dst->RelAddr = src->Register.Indirect;
@@ -210,6 +212,21 @@ static void transform_srcreg(
     dst->Swizzle |= tgsi_util_get_full_src_register_swizzle(src, 3) << 9;
     dst->Abs = src->Register.Absolute;
     dst->Negate = src->Register.Negate ? RC_MASK_XYZW : 0;
+
+    if (src->Register.File == TGSI_FILE_IMMEDIATE) {
+        for (i = 0; i < ttr->imms_to_swizzle_count; i++) {
+            if (ttr->imms_to_swizzle[i].index == src->Register.Index) {
+                dst->File = RC_FILE_TEMPORARY;
+                dst->Index = 0;
+                dst->Swizzle = 0;
+                for (j = 0; j < 4; j++) {
+                    dst->Swizzle |= GET_SWZ(ttr->imms_to_swizzle[i].swizzle,
+                        tgsi_util_get_full_src_register_swizzle(src, j)) << (j * 3);
+                }
+                break;
+            }
+        }
+    }
 }
 
 static void transform_texture(struct rc_instruction * dst, struct tgsi_instruction_texture src,
@@ -277,21 +294,45 @@ static void transform_instruction(struct tgsi_to_rc * ttr, struct tgsi_full_inst
                           &ttr->compiler->Program.ShadowSamplers);
 }
 
-static void handle_immediate(struct tgsi_to_rc * ttr, struct tgsi_full_immediate * imm)
+static void handle_immediate(struct tgsi_to_rc * ttr,
+                             struct tgsi_full_immediate * imm,
+                             unsigned index)
 {
     struct rc_constant constant;
-    int i;
+    unsigned swizzle = 0;
+    boolean can_swizzle = TRUE;
+    unsigned i;
 
-    constant.Type = RC_CONSTANT_IMMEDIATE;
-    constant.Size = 4;
-    for(i = 0; i < 4; ++i)
-        constant.u.Immediate[i] = imm->u[i].Float;
-    rc_constants_add(&ttr->compiler->Program.Constants, &constant);
+    for (i = 0; i < 4; i++) {
+        if (imm->u[i].Float == 0.0f) {
+            swizzle |= RC_SWIZZLE_ZERO << (i * 3);
+        } else if (imm->u[i].Float == 0.5f) {
+            swizzle |= RC_SWIZZLE_HALF << (i * 3);
+        } else if (imm->u[i].Float == 1.0f) {
+            swizzle |= RC_SWIZZLE_ONE << (i * 3);
+        } else {
+            can_swizzle = FALSE;
+            break;
+        }
+    }
+
+    if (can_swizzle) {
+        ttr->imms_to_swizzle[ttr->imms_to_swizzle_count].index = index;
+        ttr->imms_to_swizzle[ttr->imms_to_swizzle_count].swizzle = swizzle;
+        ttr->imms_to_swizzle_count++;
+    } else {
+        constant.Type = RC_CONSTANT_IMMEDIATE;
+        constant.Size = 4;
+        for(i = 0; i < 4; ++i)
+            constant.u.Immediate[i] = imm->u[i].Float;
+        rc_constants_add(&ttr->compiler->Program.Constants, &constant);
+    }
 }
 
 void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens)
 {
     struct tgsi_parse_context parser;
+    unsigned imm_index = 0;
     int i;
 
     /* Allocate constants placeholders.
@@ -308,6 +349,9 @@ void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens)
 
     ttr->immediate_offset = ttr->compiler->Program.Constants.Count;
 
+    ttr->imms_to_swizzle = malloc(ttr->info->immediate_count * sizeof(struct swizzled_imms));
+    ttr->imms_to_swizzle_count = 0;
+
     tgsi_parse_init(&parser, tokens);
 
     while (!tgsi_parse_end_of_tokens(&parser)) {
@@ -317,7 +361,8 @@ void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens)
             case TGSI_TOKEN_TYPE_DECLARATION:
                 break;
             case TGSI_TOKEN_TYPE_IMMEDIATE:
-                handle_immediate(ttr, &parser.FullToken.FullImmediate);
+                handle_immediate(ttr, &parser.FullToken.FullImmediate, imm_index);
+                imm_index++;
                 break;
             case TGSI_TOKEN_TYPE_INSTRUCTION:
                 transform_instruction(ttr, &parser.FullToken.FullInstruction);
@@ -327,6 +372,8 @@ void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens)
 
     tgsi_parse_free(&parser);
 
+    free(ttr->imms_to_swizzle);
+
     rc_calculate_inputs_outputs(ttr->compiler);
 }
 
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.h b/src/gallium/drivers/r300/r300_tgsi_to_rc.h
index 93e90ec6d2c..39b473c7bf5 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.h
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.h
@@ -29,11 +29,18 @@ struct tgsi_full_declaration;
 struct tgsi_shader_info;
 struct tgsi_token;
 
+struct swizzled_imms { /* one TGSI immediate whose channels are all 0, 0.5 or 1 */
+    unsigned index;    /* ordinal of the immediate, counted in token-stream order from 0 */
+    unsigned swizzle;  /* packed RC_SWIZZLE_* selectors, 3 bits per channel, x in the low bits */
+};
+
 struct tgsi_to_rc {
     struct radeon_compiler * compiler;
     const struct tgsi_shader_info * info;
 
     int immediate_offset;
+    struct swizzled_imms * imms_to_swizzle; /* scratch table, malloc'ed and freed inside r300_tgsi_to_rc() */
+    unsigned imms_to_swizzle_count;         /* number of entries filled in imms_to_swizzle */
 };
 
 void r300_tgsi_to_rc(struct tgsi_to_rc * ttr, const struct tgsi_token * tokens);
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index f6856a5f691..d2eda7324ca 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -30,7 +30,6 @@
 #include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vertex.h"
-#include "draw/draw_private.h"
 #include "sp_context.h"
 #include "sp_screen.h"
 #include "sp_state.h"
@@ -67,7 +66,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       /* compute vertex layout now */
       const struct sp_fragment_shader *spfs = softpipe->fs;
       struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-      const uint num = draw_current_shader_outputs(softpipe->draw);
+      const uint num = draw_num_shader_outputs(softpipe->draw);
       uint i;
 
       /* Tell draw_vbuf to simply emit the whole post-xform vertex
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index e88ef8d8fe3..c4181c3f5b7 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -215,7 +215,6 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
    svga->state.hw_draw.num_views = 0;
 
    svga->dirty = ~0;
-   svga->state.white_fs_id = SVGA3D_INVALID_ID;
 
    LIST_INITHEAD(&svga->dirty_buffers);
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 66259fd0103..ba86256eb26 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -327,10 +327,6 @@ struct svga_context
 
       unsigned texture_timestamp;
 
-      /* Internally generated shaders:
-       */
-      unsigned white_fs_id;
-
       /* 
        */
       struct svga_sw_state          sw;
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index adc7120217b..2973444d0ab 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -108,70 +108,6 @@ fail:
    return ret;
 }
 
-/* The blend workaround for simulating logicop xor behaviour requires
- * that the incoming fragment color be white.  This change achieves
- * that by hooking up a hard-wired fragment shader that just emits
- * color 1,1,1,1
- *   
- * This is a slightly incomplete solution as it assumes that the
- * actual bound shader has no other effects beyond generating a
- * fragment color.  In particular shaders containing TEXKIL and/or
- * depth-write will not have the correct behaviour, nor will those
- * expecting to use alphatest.
- *   
- * These are avoidable issues, but they are not much worse than the
- * unavoidable ones associated with this technique, so it's not clear
- * how much effort should be expended trying to resolve them - the
- * ultimate result will still not be correct in most cases.
- *
- * Shader below was generated with:
- *   SVGA_DEBUG=tgsi ./mesa/progs/fp/fp-tri white.txt
- */
-static int emit_white_fs( struct svga_context *svga )
-{
-   int ret = PIPE_ERROR;
-
-   /* ps_3_0
-    * def c0, 1.000000, 0.000000, 0.000000, 1.000000
-    * mov oC0, c0.x
-    * end
-    */
-   static const unsigned white_tokens[] = {
-      0xffff0300,
-      0x05000051,
-      0xa00f0000,
-      0x3f800000,
-      0x00000000,
-      0x00000000,
-      0x3f800000,
-      0x02000001,
-      0x800f0800,
-      0xa0000000,
-      0x0000ffff,
-   };
-
-   assert(SVGA3D_INVALID_ID == UTIL_BITMASK_INVALID_INDEX);
-   svga->state.white_fs_id = util_bitmask_add(svga->fs_bm);
-   if(svga->state.white_fs_id == SVGA3D_INVALID_ID)
-      goto no_fs_id;
-
-   ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.white_fs_id,
-                             SVGA3D_SHADERTYPE_PS,
-                             white_tokens, 
-                             sizeof(white_tokens));
-   if (ret)
-      goto no_definition;
-
-   return 0;
-
-no_definition:
-   util_bitmask_clear(svga->fs_bm, svga->state.white_fs_id);
-   svga->state.white_fs_id = SVGA3D_INVALID_ID;
-no_fs_id:
-   return ret;
-}
-
 
 /* SVGA_NEW_TEXTURE_BINDING
  * SVGA_NEW_RAST
@@ -199,6 +135,23 @@ static int make_fs_key( const struct svga_context *svga,
                        PIPE_WINDING_CW);
    }
 
+   /* The blend workaround for simulating logicop xor behaviour
+    * requires that the incoming fragment color be white.  We achieve
+    * that by creating a variant of the current fragment shader that
+    * overrides all output colors with 1,1,1,1.
+    *   
+    * This will work for most shaders, including those containing
+    * TEXKIL and/or depth-write.  However, it will break on the
+    * combination of xor-logicop plus alphatest.
+    *
+    * Ultimately, we could implement alphatest in the shader using
+    * texkil prior to overriding the outgoing fragment color.
+    *   
+    * SVGA_NEW_BLEND
+    */
+   if (svga->curr.blend->need_white_fragments) {
+      key->white_fragments = 1;
+   }
    
    /* XXX: want to limit this to the textures that the shader actually
     * refers to.
@@ -238,40 +191,29 @@ static int emit_hw_fs( struct svga_context *svga,
    unsigned id = SVGA3D_INVALID_ID;
    int ret = 0;
 
+   struct svga_fragment_shader *fs = svga->curr.fs;
+   struct svga_fs_compile_key key;
+
    /* SVGA_NEW_BLEND
+    * SVGA_NEW_TEXTURE_BINDING
+    * SVGA_NEW_RAST
+    * SVGA_NEW_NEED_SWTNL
+    * SVGA_NEW_SAMPLER
     */
-   if (svga->curr.blend->need_white_fragments) {
-      if (svga->state.white_fs_id == SVGA3D_INVALID_ID) {
-         ret = emit_white_fs( svga );
-         if (ret)
-            return ret;
-      }
-      id = svga->state.white_fs_id;
-   }
-   else {
-      struct svga_fragment_shader *fs = svga->curr.fs;
-      struct svga_fs_compile_key key;
-
-      /* SVGA_NEW_TEXTURE_BINDING
-       * SVGA_NEW_RAST
-       * SVGA_NEW_NEED_SWTNL
-       * SVGA_NEW_SAMPLER
-       */
-      ret = make_fs_key( svga, &key );
+   ret = make_fs_key( svga, &key );
+   if (ret)
+      return ret;
+
+   result = search_fs_key( fs, &key );
+   if (!result) {
+      ret = compile_fs( svga, fs, &key, &result );
       if (ret)
          return ret;
-
-      result = search_fs_key( fs, &key );
-      if (!result) {
-         ret = compile_fs( svga, fs, &key, &result );
-         if (ret)
-            return ret;
-      }
-
-      assert (result);
-      id = result->id;
    }
 
+   assert (result);
+   id = result->id;
+
    assert(id != SVGA3D_INVALID_ID);
 
    if (result != svga->state.hw_draw.fs) {
diff --git a/src/gallium/drivers/svga/svga_tgsi.h b/src/gallium/drivers/svga/svga_tgsi.h
index 737a2213af5..063c9cf4221 100644
--- a/src/gallium/drivers/svga/svga_tgsi.h
+++ b/src/gallium/drivers/svga/svga_tgsi.h
@@ -49,6 +49,7 @@ struct svga_fs_compile_key
 {
    unsigned light_twoside:1;
    unsigned front_cw:1;
+   unsigned white_fragments:1;
    unsigned num_textures:8;
    unsigned num_unnormalized_coords:8;
    struct {
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index 43fc0d32359..73102a72a83 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -194,8 +194,19 @@ static boolean ps30_output( struct svga_shader_emitter *emit,
 
    switch (semantic.Name) {
    case TGSI_SEMANTIC_COLOR:
-      emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 
-                                            semantic.Index );
+      if (emit->unit == PIPE_SHADER_FRAGMENT &&
+          emit->key.fkey.white_fragments) {
+
+         emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
+                                               emit->nr_hw_temp++ );
+         emit->temp_col[idx] = emit->output_map[idx];
+         emit->true_col[idx] = dst_register( SVGA3DREG_COLOROUT, 
+                                              semantic.Index );
+      }
+      else {
+         emit->output_map[idx] = dst_register( SVGA3DREG_COLOROUT, 
+                                               semantic.Index );
+      }
       break;
    case TGSI_SEMANTIC_POSITION:
       emit->output_map[idx] = dst_register( SVGA3DREG_TEMP,
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 2557824293e..e8f75485d55 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -79,6 +79,8 @@ struct svga_shader_emitter
 
    int ps30_input_count;
 
+   int dynamic_branching_level;
+
    boolean in_main_func;
 
    boolean created_zero_immediate;
@@ -199,6 +201,23 @@ static INLINE boolean emit_op3( struct svga_shader_emitter *emit,
 }
 
 
+/* Emit a four-source instruction: opcode token, destination, then src0..src3.
+ * Stops at the first emit call that returns zero; non-zero only if all succeed. */
+static INLINE boolean emit_op4( struct svga_shader_emitter *emit,
+                                SVGA3dShaderInstToken inst,
+                                SVGA3dShaderDestToken dest,
+                                struct src_register src0,
+                                struct src_register src1,
+                                struct src_register src2,
+                                struct src_register src3)
+{
+   boolean ok = emit_instruction( emit, inst ) && emit_dst( emit, dest );
+   ok = ok && emit_src( emit, src0 ) && emit_src( emit, src1 );
+   ok = ok && emit_src( emit, src2 ) && emit_src( emit, src3 );
+   return ok;
+}
+
+
 #define TRANSLATE_SWIZZLE(x,y,z,w)  ((x) | ((y) << 2) | ((z) << 4) | ((w) << 6))
 #define SWIZZLE_XYZW  \
  TRANSLATE_SWIZZLE(TGSI_SWIZZLE_X,TGSI_SWIZZLE_Y,TGSI_SWIZZLE_Z,TGSI_SWIZZLE_W)
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index dc5eb8fc606..be821e98217 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -46,8 +46,6 @@ translate_opcode(
    case TGSI_OPCODE_ABS:        return SVGA3DOP_ABS;
    case TGSI_OPCODE_ADD:        return SVGA3DOP_ADD;
    case TGSI_OPCODE_BREAKC:     return SVGA3DOP_BREAKC;
-   case TGSI_OPCODE_DDX:        return SVGA3DOP_DSX;
-   case TGSI_OPCODE_DDY:        return SVGA3DOP_DSY;
    case TGSI_OPCODE_DP2A:       return SVGA3DOP_DP2ADD;
    case TGSI_OPCODE_DP3:        return SVGA3DOP_DP3;
    case TGSI_OPCODE_DP4:        return SVGA3DOP_DP4;
@@ -415,6 +413,88 @@ static boolean submit_op3( struct svga_shader_emitter *emit,
 }
 
 
+/* Emit a four-source instruction.  SVGA shaders may not refer to >1
+ * constant register in a single instruction; the same check is applied
+ * to input registers.  src0, src2 and src3 are examined and src0 and/or
+ * src3 is moved to a temporary when a conflict is detected.  src1 must
+ * be a sampler (asserted).  Returns FALSE if any emit step fails.
+ */
+static boolean submit_op4( struct svga_shader_emitter *emit,
+                           SVGA3dShaderInstToken inst,
+                           SVGA3dShaderDestToken dest,
+                           struct src_register src0,
+                           struct src_register src1,
+                           struct src_register src2,
+                           struct src_register src3)
+{
+   SVGA3dShaderDestToken temp0;
+   SVGA3dShaderDestToken temp3;
+   boolean need_temp0 = FALSE;
+   boolean need_temp3 = FALSE;
+   SVGA3dShaderRegType type0, type1, type2, type3;
+
+   temp0.value = 0;
+   temp3.value = 0;
+   type0 = SVGA3dShaderGetRegType( src0.base.value );
+   type1 = SVGA3dShaderGetRegType( src1.base.value );
+   type2 = SVGA3dShaderGetRegType( src2.base.value );
+   type3 = SVGA3dShaderGetRegType( src3.base.value );
+
+   /* Make life a little easier - this is only used by the TXD
+    * instruction which is guaranteed not to have a constant/input reg
+    * in one slot at least:
+    */
+   assert(type1 == SVGA3DREG_SAMPLER);
+
+   if (type0 == SVGA3DREG_CONST &&
+       ((type3 == SVGA3DREG_CONST && src0.base.num != src3.base.num) ||
+        (type2 == SVGA3DREG_CONST && src0.base.num != src2.base.num)))
+      need_temp0 = TRUE;
+
+   if (type3 == SVGA3DREG_CONST &&
+       (type2 == SVGA3DREG_CONST && src3.base.num != src2.base.num))
+      need_temp3 = TRUE;
+
+   if (type0 == SVGA3DREG_INPUT &&
+       ((type3 == SVGA3DREG_INPUT && src0.base.num != src3.base.num) ||
+        (type2 == SVGA3DREG_INPUT && src0.base.num != src2.base.num)))
+      need_temp0 = TRUE;
+
+   if (type3 == SVGA3DREG_INPUT &&
+       (type2 == SVGA3DREG_INPUT && src3.base.num != src2.base.num))
+      need_temp3 = TRUE;
+
+   if (need_temp0)
+   {
+      temp0 = get_temp( emit );
+
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp0, src0 ))
+         return FALSE;
+
+      src0 = src( temp0 );
+   }
+
+   if (need_temp3)
+   {
+      temp3 = get_temp( emit );
+
+      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp3, src3 ))
+         return FALSE;
+
+      src3 = src( temp3 );
+   }
+
+   if (!emit_op4( emit, inst, dest, src0, src1, src2, src3 ))
+      return FALSE;
+
+   if (need_temp3)
+      release_temp( emit, temp3 );
+   if (need_temp0)
+      release_temp( emit, temp0 );
+   return TRUE;
+}
+
+
 static boolean emit_def_const( struct svga_shader_emitter *emit,
                                SVGA3dShaderConstType type,
                                unsigned idx,
@@ -660,6 +740,8 @@ static boolean emit_if(struct svga_shader_emitter *emit,
    if_token.control = SVGA3DOPCOMPC_NE;
    zero = scalar(zero, TGSI_SWIZZLE_X);
 
+   emit->dynamic_branching_level++;
+
    return (emit_instruction( emit, if_token ) &&
            emit_src( emit, src ) &&
            emit_src( emit, zero ) );
@@ -668,6 +750,8 @@ static boolean emit_if(struct svga_shader_emitter *emit,
 static boolean emit_endif(struct svga_shader_emitter *emit,
                        const struct tgsi_full_instruction *insn)
 {
+   emit->dynamic_branching_level--;
+
    return (emit_instruction( emit,
                              inst_token( SVGA3DOP_ENDIF )));
 }
@@ -1011,10 +1095,10 @@ static boolean emit_kilp(struct svga_shader_emitter *emit,
 {
    SVGA3dShaderInstToken inst;
    SVGA3dShaderDestToken temp;
-   struct src_register one = get_zero_immediate( emit );
+   struct src_register one = scalar( get_zero_immediate( emit ),
+                                     TGSI_SWIZZLE_W );
 
    inst = inst_token( SVGA3DOP_TEXKILL );
-   one = scalar( one, TGSI_SWIZZLE_W );
 
    /* texkill doesn't allow negation on the operand so lets move
     * negation of {1} to a temp register */
@@ -1169,41 +1253,79 @@ static boolean emit_tex2(struct svga_shader_emitter *emit,
                          SVGA3dShaderDestToken dst )
 {
    SVGA3dShaderInstToken inst;
-   struct src_register src0;
-   struct src_register src1;
-
+   struct src_register texcoord;
+   struct src_register sampler;
+   SVGA3dShaderDestToken tmp;
+   
    inst.value = 0;
-   inst.op = SVGA3DOP_TEX;
 
    switch (insn->Instruction.Opcode) {
    case TGSI_OPCODE_TEX:
+      inst.op = SVGA3DOP_TEX;
       break;
    case TGSI_OPCODE_TXP:
+      inst.op = SVGA3DOP_TEX;
       inst.control = SVGA3DOPCONT_PROJECT;
       break;
    case TGSI_OPCODE_TXB:
+      inst.op = SVGA3DOP_TEX;
       inst.control = SVGA3DOPCONT_BIAS;
       break;
+   case TGSI_OPCODE_TXL:
+      inst.op = SVGA3DOP_TEXLDL;
+      break;
    default:
       assert(0);
       return FALSE;
    }
 
-   src0 = translate_src_register( emit, &insn->Src[0] );
-   src1 = translate_src_register( emit, &insn->Src[1] );
+   texcoord = translate_src_register( emit, &insn->Src[0] );
+   sampler = translate_src_register( emit, &insn->Src[1] );
 
-   if (emit->key.fkey.tex[src1.base.num].unnormalized) {
-      struct src_register wh = get_tex_dimensions( emit, src1.base.num );
-      SVGA3dShaderDestToken tmp = get_temp( emit );
+   if (emit->key.fkey.tex[sampler.base.num].unnormalized ||
+       emit->dynamic_branching_level > 0)
+      tmp = get_temp( emit );
+
+   /* Can't do mipmapping inside dynamic branch constructs.  Force LOD
+    * zero in that case.
+    */
+   if (emit->dynamic_branching_level > 0 &&
+       inst.op == SVGA3DOP_TEX &&
+       SVGA3dShaderGetRegType(texcoord.base.value) == SVGA3DREG_TEMP) {
+      struct src_register zero = get_zero_immediate( emit );
+
+      /* MOV  tmp, texcoord */
+      if (!submit_op1( emit,
+                       inst_token( SVGA3DOP_MOV ),
+                       tmp,
+                       texcoord ))
+         return FALSE;
+
+      /* MOV  tmp.w, zero */
+      if (!submit_op1( emit, 
+                       inst_token( SVGA3DOP_MOV ),
+                       writemask( tmp, TGSI_WRITEMASK_W ), 
+                       scalar( zero, TGSI_SWIZZLE_X )))
+         return FALSE;
+      
+      texcoord = src( tmp );
+      inst.op = SVGA3DOP_TEXLDL;
+   }
+
+   /* Explicit normalization of texcoords:
+    */
+   if (emit->key.fkey.tex[sampler.base.num].unnormalized) {
+      struct src_register wh = get_tex_dimensions( emit, sampler.base.num );
 
       /* MUL  tmp, SRC0, WH */
       if (!submit_op2( emit, inst_token( SVGA3DOP_MUL ),
-                       tmp, src0, wh ))
+                       tmp, texcoord, wh ))
          return FALSE;
-      src0 = src( tmp );
+
+      texcoord = src( tmp );
    }
 
-   return submit_op2( emit, inst, dst, src0, src1 );
+   return submit_op2( emit, inst, dst, texcoord, sampler );
 }
 
 
@@ -1211,31 +1333,33 @@ static boolean emit_tex2(struct svga_shader_emitter *emit,
 
 /* Translate texture instructions to SVGA3D representation.
  */
-static boolean emit_tex3(struct svga_shader_emitter *emit,
+static boolean emit_tex4(struct svga_shader_emitter *emit,
                          const struct tgsi_full_instruction *insn,
                          SVGA3dShaderDestToken dst )
 {
    SVGA3dShaderInstToken inst;
-   struct src_register src0;
-   struct src_register src1;
-   struct src_register src2;
+   struct src_register texcoord;
+   struct src_register ddx;
+   struct src_register ddy;
+   struct src_register sampler;
+
+   texcoord = translate_src_register( emit, &insn->Src[0] );
+   ddx      = translate_src_register( emit, &insn->Src[1] );
+   ddy      = translate_src_register( emit, &insn->Src[2] );
+   sampler  = translate_src_register( emit, &insn->Src[3] );
 
    inst.value = 0;
 
    switch (insn->Instruction.Opcode) {
    case TGSI_OPCODE_TXD: 
-      inst.op = SVGA3DOP_TEXLDD;
-      break;
-   case TGSI_OPCODE_TXL:
-      inst.op = SVGA3DOP_TEXLDL;
+      inst.op = SVGA3DOP_TEXLDD; /* 4 args! */
       break;
+   default:
+      assert(0);
+      return FALSE;
    }
 
-   src0 = translate_src_register( emit, &insn->Src[0] );
-   src1 = translate_src_register( emit, &insn->Src[1] );
-   src2 = translate_src_register( emit, &insn->Src[2] );
-
-   return submit_op3( emit, inst, dst, src0, src1, src2 );
+   return submit_op4( emit, inst, dst, texcoord, sampler, ddx, ddy );
 }
 
 
@@ -1271,12 +1395,12 @@ static boolean emit_tex(struct svga_shader_emitter *emit,
    case TGSI_OPCODE_TEX:
    case TGSI_OPCODE_TXB:
    case TGSI_OPCODE_TXP:
+   case TGSI_OPCODE_TXL:
       if (!emit_tex2( emit, insn, tex_result ))
          return FALSE;
       break;
-   case TGSI_OPCODE_TXL:
    case TGSI_OPCODE_TXD:
-      if (!emit_tex3( emit, insn, tex_result ))
+      if (!emit_tex4( emit, insn, tex_result ))
          return FALSE;
       break;
    default:
@@ -1330,6 +1454,8 @@ static boolean emit_bgnloop2( struct svga_shader_emitter *emit,
    struct src_register loop_reg = src_register( SVGA3DREG_LOOP, 0 );
    struct src_register const_int = get_loop_const( emit );
 
+   emit->dynamic_branching_level++;
+
    return (emit_instruction( emit, inst ) &&
            emit_src( emit, loop_reg ) &&
            emit_src( emit, const_int ) );
@@ -1339,6 +1465,9 @@ static boolean emit_endloop2( struct svga_shader_emitter *emit,
                               const struct tgsi_full_instruction *insn )
 {
    SVGA3dShaderInstToken inst = inst_token( SVGA3DOP_ENDLOOP );
+
+   emit->dynamic_branching_level--;
+
    return emit_instruction( emit, inst );
 }
 
@@ -1398,6 +1527,46 @@ static boolean emit_simple_instruction(struct svga_shader_emitter *emit,
    }
 }
 
+/* Translate TGSI DDX/DDY into SVGA3DOP_DSX/DSY (or a zeroing MOV, see below). */
+static boolean emit_deriv(struct svga_shader_emitter *emit,
+                          const struct tgsi_full_instruction *insn )
+{
+   if (emit->dynamic_branching_level > 0 &&
+       insn->Src[0].Register.File == TGSI_FILE_TEMPORARY) 
+   {
+      struct src_register zero = get_zero_immediate( emit );
+      SVGA3dShaderDestToken dst = 
+         translate_dst_register( emit, insn, 0 );
+
+      /* Deriv opcodes are not valid inside dynamic branching (IF/loop
+       * nesting), so work around it by zeroing out the destination.
+       */
+      if (!submit_op1(emit, 
+                      inst_token( SVGA3DOP_MOV ), 
+                      dst,
+                      scalar(zero, TGSI_SWIZZLE_X)))
+         return FALSE;
+      
+      return TRUE;
+   }
+   else {
+      unsigned opcode;
+
+      switch (insn->Instruction.Opcode) {
+      case TGSI_OPCODE_DDX:
+         opcode = SVGA3DOP_DSX;
+         break;
+      case TGSI_OPCODE_DDY:
+         opcode = SVGA3DOP_DSY;
+         break;
+      default:
+         return FALSE;
+      }
+
+      return emit_simple_instruction( emit, opcode, insn );
+   }
+}
+
 static boolean emit_arl(struct svga_shader_emitter *emit,
                         const struct tgsi_full_instruction *insn)
 {
@@ -2002,6 +2171,10 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
    case TGSI_OPCODE_TXD:
       return emit_tex( emit, insn );
 
+   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDY:
+      return emit_deriv( emit, insn );
+
    case TGSI_OPCODE_BGNSUB:
       return emit_bgnsub( emit, position, insn );
 
@@ -2254,11 +2427,28 @@ static boolean emit_ps_postamble( struct svga_shader_emitter *emit )
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       if (SVGA3dShaderGetRegType(emit->true_col[i].value) != 0) {
 
-         if (!submit_op1( emit,
-                          inst_token(SVGA3DOP_MOV),
-                          emit->true_col[i],
-                          src(emit->temp_col[i]) ))
-            return FALSE;
+         /* Potentially override output colors with white for XOR
+          * logicop workaround.
+          */
+         if (emit->unit == PIPE_SHADER_FRAGMENT &&
+             emit->key.fkey.white_fragments) {
+
+            struct src_register one = scalar( get_zero_immediate( emit ),
+                                              TGSI_SWIZZLE_W );
+
+            if (!submit_op1( emit,
+                             inst_token(SVGA3DOP_MOV),
+                             emit->true_col[i],
+                             one ))
+               return FALSE;
+         }
+         else {
+            if (!submit_op1( emit,
+                             inst_token(SVGA3DOP_MOV),
+                             emit->true_col[i],
+                             src(emit->temp_col[i]) ))
+               return FALSE;
+         }
       }
    }
 
@@ -2467,6 +2657,9 @@ needs_to_create_zero( struct svga_shader_emitter *emit )
       if (emit->key.fkey.light_twoside)
          return TRUE;
 
+      if (emit->key.fkey.white_fragments)
+         return TRUE;
+
       if (emit->emit_frontface)
          return TRUE;
 
@@ -2476,6 +2669,10 @@ needs_to_create_zero( struct svga_shader_emitter *emit )
    }
 
    if (emit->info.opcode_count[TGSI_OPCODE_IF] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_BGNLOOP] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_BGNFOR] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_DDX] >= 1 ||
+       emit->info.opcode_count[TGSI_OPCODE_DDY] >= 1 ||
        emit->info.opcode_count[TGSI_OPCODE_SGE] >= 1 ||
        emit->info.opcode_count[TGSI_OPCODE_SGT] >= 1 ||
        emit->info.opcode_count[TGSI_OPCODE_SLE] >= 1 ||
@@ -2702,6 +2899,8 @@ boolean svga_shader_emit_instructions( struct svga_shader_emitter *emit,
          goto done;
    }
 
+   assert(emit->dynamic_branching_level == 0);
+
    /* Need to terminate the whole shader:
     */
    ret = emit_instruction( emit, inst_token( SVGA3DOP_END ) );
diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.c b/src/gallium/state_trackers/egl/common/egl_g3d.c
index 8a73e81d4ac..30e2c347bd4 100644
--- a/src/gallium/state_trackers/egl/common/egl_g3d.c
+++ b/src/gallium/state_trackers/egl/common/egl_g3d.c
@@ -1087,8 +1087,8 @@ egl_g3d_bind_tex_image(_EGLDriver *drv, _EGLDisplay *dpy,
                        _EGLSurface *surf, EGLint buffer)
 {
    struct egl_g3d_surface *gsurf = egl_g3d_surface(surf);
-   _EGLContext *ctx = _eglGetAPIContext(EGL_OPENGL_ES_API);
-   struct egl_g3d_context *gctx = egl_g3d_context(ctx);
+   _EGLContext *es1 = _eglGetAPIContext(EGL_OPENGL_ES_API);
+   struct egl_g3d_context *gctx;
    enum pipe_format target_format;
    int target;
 
@@ -1118,6 +1118,11 @@ egl_g3d_bind_tex_image(_EGLDriver *drv, _EGLDisplay *dpy,
       return _eglError(EGL_BAD_MATCH, "eglBindTexImage");
    }
 
+   if (!es1)
+      return EGL_TRUE;
+   if (!gsurf->render_surface)
+      return EGL_FALSE;
+
    /* flush properly if the surface is bound */
    if (gsurf->base.CurrentContext) {
       gctx = egl_g3d_context(gsurf->base.CurrentContext);
@@ -1125,14 +1130,11 @@ egl_g3d_bind_tex_image(_EGLDriver *drv, _EGLDisplay *dpy,
             PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_FRAME, NULL);
    }
 
-   if (gctx) {
-      if (!gsurf->render_surface)
-         return EGL_FALSE;
+   gctx = egl_g3d_context(es1);
+   gctx->stapi->st_bind_texture_surface(gsurf->render_surface,
+         target, gsurf->base.MipmapLevel, target_format);
 
-      gctx->stapi->st_bind_texture_surface(gsurf->render_surface,
-            target, gsurf->base.MipmapLevel, target_format);
-      gsurf->base.BoundToTexture = EGL_TRUE;
-   }
+   gsurf->base.BoundToTexture = EGL_TRUE;
 
    return EGL_TRUE;
 }
diff --git a/src/gallium/state_trackers/vega/asm_fill.h b/src/gallium/state_trackers/vega/asm_fill.h
index 2f394ad6c55..27773467fa8 100644
--- a/src/gallium/state_trackers/vega/asm_fill.h
+++ b/src/gallium/state_trackers/vega/asm_fill.h
@@ -27,166 +27,375 @@
 #ifndef ASM_FILL_H
 #define ASM_FILL_H
 
-static const char solid_fill_asm[] =
-   "MOV %s, CONST[0]\n";
-
-
-static const char linear_grad_asm[] =
-   "MOV TEMP[0].xy, IN[0]\n"
-   "MOV TEMP[0].z, CONST[1].yyyy\n"
-   "DP3 TEMP[1], CONST[2], TEMP[0]\n"
-   "DP3 TEMP[2], CONST[3], TEMP[0]\n"
-   "DP3 TEMP[3], CONST[4], TEMP[0]\n"
-   "RCP TEMP[3], TEMP[3]\n"
-   "MUL TEMP[1], TEMP[1], TEMP[3]\n"
-   "MUL TEMP[2], TEMP[2], TEMP[3]\n"
-   "MOV TEMP[4].x, TEMP[1]\n"
-   "MOV TEMP[4].y, TEMP[2]\n"
-   "MUL TEMP[0], CONST[0].yyyy, TEMP[4].yyyy\n"
-   "MAD TEMP[1], CONST[0].xxxx, TEMP[4].xxxx, TEMP[0]\n"
-   "MUL TEMP[2], TEMP[1], CONST[0].zzzz\n"
-   "TEX %s, TEMP[2], SAMP[0], 1D\n";
-
-static const char radial_grad_asm[] =
-   "MOV TEMP[0].xy, IN[0]\n"
-   "MOV TEMP[0].z, CONST[1].yyyy\n"
-   "DP3 TEMP[1], CONST[2], TEMP[0]\n"
-   "DP3 TEMP[2], CONST[3], TEMP[0]\n"
-   "DP3 TEMP[3], CONST[4], TEMP[0]\n"
-   "RCP TEMP[3], TEMP[3]\n"
-   "MUL TEMP[1], TEMP[1], TEMP[3]\n"
-   "MUL TEMP[2], TEMP[2], TEMP[3]\n"
-   "MOV TEMP[5].x, TEMP[1]\n"
-   "MOV TEMP[5].y, TEMP[2]\n"
-   "MUL TEMP[0], CONST[0].yyyy, TEMP[5].yyyy\n"
-   "MAD TEMP[1], CONST[0].xxxx, TEMP[5].xxxx, TEMP[0]\n"
-   "ADD TEMP[1], TEMP[1], TEMP[1]\n"
-   "MUL TEMP[3], TEMP[5].yyyy, TEMP[5].yyyy\n"
-   "MAD TEMP[4], TEMP[5].xxxx, TEMP[5].xxxx, TEMP[3]\n"
-   "MOV TEMP[4], -TEMP[4]\n"
-   "MUL TEMP[2], CONST[0].zzzz, TEMP[4]\n"
-   "MUL TEMP[0], CONST[1].wwww, TEMP[2]\n"
-   "MUL TEMP[3], TEMP[1], TEMP[1]\n"
-   "SUB TEMP[2], TEMP[3], TEMP[0]\n"
-   "RSQ TEMP[2], |TEMP[2]|\n"
-   "RCP TEMP[2], TEMP[2]\n"
-   "SUB TEMP[1], TEMP[2], TEMP[1]\n"
-   "ADD TEMP[0], CONST[0].zzzz, CONST[0].zzzz\n"
-   "RCP TEMP[0], TEMP[0]\n"
-   "MUL TEMP[2], TEMP[1], TEMP[0]\n"
-   "TEX %s, TEMP[2], SAMP[0], 1D\n";
-
-static const char pattern_asm[] =
-   "MOV TEMP[0].xy, IN[0]\n"
-   "MOV TEMP[0].z, CONST[1].yyyy\n"
-   "DP3 TEMP[1], CONST[2], TEMP[0]\n"
-   "DP3 TEMP[2], CONST[3], TEMP[0]\n"
-   "DP3 TEMP[3], CONST[4], TEMP[0]\n"
-   "RCP TEMP[3], TEMP[3]\n"
-   "MUL TEMP[1], TEMP[1], TEMP[3]\n"
-   "MUL TEMP[2], TEMP[2], TEMP[3]\n"
-   "MOV TEMP[4].x, TEMP[1]\n"
-   "MOV TEMP[4].y, TEMP[2]\n"
-   "RCP TEMP[0], CONST[1].zwzw\n"
-   "MOV TEMP[1], TEMP[4]\n"
-   "MUL TEMP[1].x, TEMP[1], TEMP[0]\n"
-   "MUL TEMP[1].y, TEMP[1], TEMP[0]\n"
-   "TEX %s, TEMP[1], SAMP[0], 2D\n";
-
-
-static const char mask_asm[] =
-   "TEX TEMP[1], IN[0], SAMP[1], 2D\n"
-   "MUL TEMP[0].w, TEMP[0].wwww, TEMP[1].wwww\n"
-   "MOV %s, TEMP[0]\n";
-
-
-static const char image_normal_asm[] =
-   "TEX %s, IN[1], SAMP[3], 2D\n";
-
-static const char image_multiply_asm[] =
-   "TEX TEMP[1], IN[1], SAMP[3], 2D\n"
-   "MUL %s, TEMP[0], TEMP[1]\n";
-
-static const char image_stencil_asm[] =
-   "TEX TEMP[1], IN[1], SAMP[3], 2D\n"
-   "MUL %s, TEMP[0], TEMP[1]\n";
-
-
-#define EXTENDED_BLEND_OVER                     \
-   "SUB TEMP[3], CONST[1].yyyy, TEMP[1].wwww\n" \
-   "SUB TEMP[4], CONST[1].yyyy, TEMP[0].wwww\n" \
-   "MUL TEMP[3], TEMP[0], TEMP[3]\n"            \
-   "MUL TEMP[4], TEMP[1], TEMP[4]\n"            \
-   "ADD TEMP[3], TEMP[3], TEMP[4]\n"
-
-static const char blend_multiply_asm[] =
-   "TEX TEMP[1], IN[0], SAMP[2], 2D\n"
-   EXTENDED_BLEND_OVER
-   "MUL TEMP[4], TEMP[0], TEMP[1]\n"
-   "ADD TEMP[1], TEMP[4], TEMP[3]\n"/*result.rgb*/
-   "MUL TEMP[2], TEMP[0].wwww, TEMP[1].wwww\n"
-   "ADD TEMP[3], TEMP[0].wwww, TEMP[1].wwww\n"
-   "SUB TEMP[1].w, TEMP[3], TEMP[2]\n"
-   "MOV %s, TEMP[1]\n";
-#if 1
-static const char blend_screen_asm[] =
-   "TEX TEMP[1], IN[0], SAMP[2], 2D\n"
-   "ADD TEMP[3], TEMP[0], TEMP[1]\n"
-   "MUL TEMP[2], TEMP[0], TEMP[1]\n"
-   "SUB %s, TEMP[3], TEMP[2]\n";
-#else
-static const char blend_screen_asm[] =
-   "TEX TEMP[1], IN[0], SAMP[2], 2D\n"
-   "MOV %s, TEMP[1]\n";
-#endif
-
-static const char blend_darken_asm[] =
-   "TEX TEMP[1], IN[0], SAMP[2], 2D\n"
-   EXTENDED_BLEND_OVER
-   "MUL TEMP[4], TEMP[0], TEMP[1].wwww\n"
-   "MUL TEMP[5], TEMP[1], TEMP[0].wwww\n"
-   "MIN TEMP[4], TEMP[4], TEMP[5]\n"
-   "ADD TEMP[1], TEMP[3], TEMP[4]\n"
-   "MUL TEMP[2], TEMP[0].wwww, TEMP[1].wwww\n"
-   "ADD TEMP[3], TEMP[0].wwww, TEMP[1].wwww\n"
-   "SUB TEMP[1].w, TEMP[3], TEMP[2]\n"
-   "MOV %s, TEMP[1]\n";
-
-static const char blend_lighten_asm[] =
-   "TEX TEMP[1], IN[0], SAMP[2], 2D\n"
-   EXTENDED_BLEND_OVER
-   "MUL TEMP[4], TEMP[0], TEMP[1].wwww\n"
-   "MUL TEMP[5], TEMP[1], TEMP[0].wwww\n"
-   "MAX TEMP[4], TEMP[4], TEMP[5]\n"
-   "ADD TEMP[1], TEMP[3], TEMP[4]\n"
-   "MUL TEMP[2], TEMP[0].wwww, TEMP[1].wwww\n"
-   "ADD TEMP[3], TEMP[0].wwww, TEMP[1].wwww\n"
-   "SUB TEMP[1].w, TEMP[3], TEMP[2]\n"
-   "MOV %s, TEMP[1]\n";
-
-
-static const char premultiply_asm[] =
-   "MUL TEMP[0].xyz, TEMP[0], TEMP[0].wwww\n";
-
-static const char unpremultiply_asm[] =
-   "TEX TEMP[0], IN[0], SAMP[1], 2D\n";
-
-
-static const char color_bw_asm[] =
-   "ADD TEMP[1], CONST[1].yyyy, CONST[1].yyyy\n"
-   "RCP TEMP[2], TEMP[1]\n"
-   "ADD TEMP[1], CONST[1].yyyy, TEMP[2]\n"
-   "ADD TEMP[2].x, TEMP[0].xxxx, TEMP[0].yyyy\n"
-   "ADD TEMP[2].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
-   "SGE TEMP[0].xyz, TEMP[2].xxxx, TEMP[1]\n"
-   "SGE TEMP[0].w, TEMP[0].wwww, TEMP[2].yyyy\n"
-   "MOV %s, TEMP[0]\n";
+#include "tgsi/tgsi_ureg.h"
+
+typedef void (* ureg_func)( struct ureg_program *ureg,
+                            struct ureg_dst *out,
+                            struct ureg_src *in,
+                            struct ureg_src *sampler,
+                            struct ureg_dst *temp,
+                            struct ureg_src *constant);
+
+static INLINE void
+solid_fill( struct ureg_program *ureg,
+            struct ureg_dst *out,
+            struct ureg_src *in,
+            struct ureg_src *sampler,
+            struct ureg_dst *temp,
+            struct ureg_src *constant)
+{
+   ureg_MOV(ureg, *out, constant[0]);
+}
+
+static INLINE void
+linear_grad( struct ureg_program *ureg,
+             struct ureg_dst *out,
+             struct ureg_src *in,
+             struct ureg_src *sampler,
+             struct ureg_dst *temp,
+             struct ureg_src *constant)
+{
+
+   ureg_MOV(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_XY),
+            in[0]);
+   ureg_MOV(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_Z),
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y));
+   ureg_DP3(ureg, temp[1], constant[2], ureg_src(temp[0]));
+   ureg_DP3(ureg, temp[2], constant[3], ureg_src(temp[0]));
+   ureg_DP3(ureg, temp[3], constant[4], ureg_src(temp[0]));
+   ureg_RCP(ureg, temp[3], ureg_src(temp[3]));
+   ureg_MUL(ureg, temp[1], ureg_src(temp[1]), ureg_src(temp[3]));
+   ureg_MUL(ureg, temp[2], ureg_src(temp[2]), ureg_src(temp[3]));
+   ureg_MOV(ureg, ureg_writemask(temp[4], TGSI_WRITEMASK_X), ureg_src(temp[1]));
+   ureg_MOV(ureg, ureg_writemask(temp[4], TGSI_WRITEMASK_Y), ureg_src(temp[2]));
+   ureg_MUL(ureg, temp[0],
+            ureg_scalar(constant[0], TGSI_SWIZZLE_Y),
+            ureg_scalar(ureg_src(temp[4]), TGSI_SWIZZLE_Y));
+   ureg_MAD(ureg, temp[1],
+            ureg_scalar(constant[0], TGSI_SWIZZLE_X),
+            ureg_scalar(ureg_src(temp[4]), TGSI_SWIZZLE_X),
+            ureg_src(temp[0]));
+   ureg_MUL(ureg, temp[2], ureg_src(temp[1]),
+            ureg_scalar(constant[0], TGSI_SWIZZLE_Z));
+   ureg_TEX(ureg, *out, TGSI_TEXTURE_1D, ureg_src(temp[2]), sampler[0]);
+}
+
+static INLINE void
+radial_grad( struct ureg_program *ureg,
+             struct ureg_dst *out,
+             struct ureg_src *in,
+             struct ureg_src *sampler,
+             struct ureg_dst *temp,
+             struct ureg_src *constant)
+{
+
+   ureg_MOV(ureg, ureg_writemask(temp[0], TGSI_WRITEMASK_XY), in[0]);
+   ureg_MOV(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_Z),
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y));
+   ureg_DP3(ureg, temp[1], constant[2], ureg_src(temp[0]));
+   ureg_DP3(ureg, temp[2], constant[3], ureg_src(temp[0]));
+   ureg_DP3(ureg, temp[3], constant[4], ureg_src(temp[0]));
+   ureg_RCP(ureg, temp[3], ureg_src(temp[3]));
+   ureg_MUL(ureg, temp[1], ureg_src(temp[1]), ureg_src(temp[3]));
+   ureg_MUL(ureg, temp[2], ureg_src(temp[2]), ureg_src(temp[3]));
+   ureg_MOV(ureg, ureg_writemask(temp[5], TGSI_WRITEMASK_X), ureg_src(temp[1]));
+   ureg_MOV(ureg, ureg_writemask(temp[5], TGSI_WRITEMASK_Y), ureg_src(temp[2]));
+   ureg_MUL(ureg, temp[0], ureg_scalar(constant[0], TGSI_SWIZZLE_Y),
+            ureg_scalar(ureg_src(temp[5]), TGSI_SWIZZLE_Y));
+   ureg_MAD(ureg, temp[1],
+            ureg_scalar(constant[0], TGSI_SWIZZLE_X),
+            ureg_scalar(ureg_src(temp[5]), TGSI_SWIZZLE_X), ureg_src(temp[0]));
+   ureg_ADD(ureg, temp[1], ureg_src(temp[1]), ureg_src(temp[1]));
+   ureg_MUL(ureg, temp[3],
+            ureg_scalar(ureg_src(temp[5]), TGSI_SWIZZLE_Y),
+            ureg_scalar(ureg_src(temp[5]), TGSI_SWIZZLE_Y));
+   ureg_MAD(ureg, temp[4],
+            ureg_scalar(ureg_src(temp[5]), TGSI_SWIZZLE_X),
+            ureg_scalar(ureg_src(temp[5]), TGSI_SWIZZLE_X),
+            ureg_src(temp[3]));
+   ureg_MOV(ureg, temp[4], ureg_negate(ureg_src(temp[4])));
+   ureg_MUL(ureg, temp[2],
+            ureg_scalar(constant[0], TGSI_SWIZZLE_Z),
+            ureg_src(temp[4]));
+   ureg_MUL(ureg, temp[0],
+            ureg_scalar(constant[1], TGSI_SWIZZLE_W),
+            ureg_src(temp[2]));
+   ureg_MUL(ureg, temp[3], ureg_src(temp[1]), ureg_src(temp[1]));
+
+   ureg_SUB(ureg, temp[2], ureg_src(temp[3]), ureg_src(temp[0]));
+   ureg_RSQ(ureg, temp[2], ureg_abs(ureg_src(temp[2])));
+   ureg_RCP(ureg, temp[2], ureg_src(temp[2]));
+   ureg_SUB(ureg, temp[1], ureg_src(temp[2]), ureg_src(temp[1]));
+   ureg_ADD(ureg, temp[0],
+            ureg_scalar(constant[0], TGSI_SWIZZLE_Z),
+            ureg_scalar(constant[0], TGSI_SWIZZLE_Z));
+   ureg_RCP(ureg, temp[0], ureg_src(temp[0]));
+   ureg_MUL(ureg, temp[2], ureg_src(temp[1]), ureg_src(temp[0]));
+   ureg_TEX(ureg, *out, TGSI_TEXTURE_1D, ureg_src(temp[2]), sampler[0]);
+
+}
+
+
+static INLINE void
+pattern( struct ureg_program *ureg,
+         struct ureg_dst     *out,
+         struct ureg_src     *in,
+         struct ureg_src     *sampler,
+         struct ureg_dst     *temp,
+         struct ureg_src     *constant)
+{
+   ureg_MOV(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_XY),
+            in[0]);
+   ureg_MOV(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_Z),
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y));
+   ureg_DP3(ureg, temp[1], constant[2], ureg_src(temp[0]));
+   ureg_DP3(ureg, temp[2], constant[3], ureg_src(temp[0]));
+   ureg_DP3(ureg, temp[3], constant[4], ureg_src(temp[0]));
+   ureg_RCP(ureg, temp[3], ureg_src(temp[3]));
+   ureg_MUL(ureg, temp[1], ureg_src(temp[1]), ureg_src(temp[3]));
+   ureg_MUL(ureg, temp[2], ureg_src(temp[2]), ureg_src(temp[3]));
+   ureg_MOV(ureg, ureg_writemask(temp[4], TGSI_WRITEMASK_X), ureg_src(temp[1]));
+   ureg_MOV(ureg, ureg_writemask(temp[4], TGSI_WRITEMASK_Y), ureg_src(temp[2]));
+   ureg_RCP(ureg, temp[0],
+            ureg_swizzle(constant[1],
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W,
+                         TGSI_SWIZZLE_Z,
+                         TGSI_SWIZZLE_W));
+   ureg_MOV(ureg, temp[1], ureg_src(temp[4]));
+   ureg_MUL(ureg,
+            ureg_writemask(temp[1], TGSI_WRITEMASK_X),
+            ureg_src(temp[1]),
+            ureg_src(temp[0]));
+   ureg_MUL(ureg,
+            ureg_writemask(temp[1], TGSI_WRITEMASK_Y),
+            ureg_src(temp[1]),
+            ureg_src(temp[0]));
+   ureg_TEX(ureg, *out, TGSI_TEXTURE_2D, ureg_src(temp[1]), sampler[0]);
+}
+
+static INLINE void
+mask( struct ureg_program *ureg,
+      struct ureg_dst *out,
+      struct ureg_src *in,
+      struct ureg_src *sampler,
+      struct ureg_dst *temp,
+      struct ureg_src *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[0], sampler[1]);
+   ureg_MUL(ureg, ureg_writemask(temp[0], TGSI_WRITEMASK_W),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_MOV(ureg, *out, ureg_src(temp[0]));
+}
+
+static INLINE void
+image_normal( struct ureg_program *ureg,
+              struct ureg_dst *out,
+              struct ureg_src *in,
+              struct ureg_src *sampler,
+              struct ureg_dst *temp,
+              struct ureg_src *constant)
+{
+   ureg_TEX(ureg, *out, TGSI_TEXTURE_2D, in[1], sampler[3]);
+}
+
+
+static INLINE void
+image_multiply( struct ureg_program *ureg,
+                struct ureg_dst *out,
+                struct ureg_src *in,
+                struct ureg_src *sampler,
+                struct ureg_dst *temp,
+                struct ureg_src *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[1], sampler[3]);
+   ureg_MUL(ureg, *out, ureg_src(temp[0]), ureg_src(temp[1]));
+}
+
+
+static INLINE void
+image_stencil( struct ureg_program *ureg,
+               struct ureg_dst *out,
+               struct ureg_src *in,
+               struct ureg_src *sampler,
+               struct ureg_dst *temp,
+               struct ureg_src *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[1], sampler[3]);
+   ureg_MUL(ureg, *out, ureg_src(temp[0]), ureg_src(temp[1]));
+}
+
+/* temp[3] = t0*(k - t1.w) + t1*(k - t0.w); tN = temp[N], k = constant[1].y; clobbers temp[4] */
+#define EXTENDED_BLENDER_OVER_FUNC                                      \
+   ureg_SUB(ureg, temp[3],                                              \
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y),                   \
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));            \
+   ureg_SUB(ureg, temp[4],                                              \
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y),                   \
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W));            \
+   ureg_MUL(ureg, temp[3], ureg_src(temp[0]), ureg_src(temp[3]));       \
+   ureg_MUL(ureg, temp[4], ureg_src(temp[1]), ureg_src(temp[4]));       \
+   ureg_ADD(ureg, temp[3], ureg_src(temp[3]), ureg_src(temp[4]));
+
+static INLINE void
+blend_multiply( struct ureg_program *ureg,
+                struct ureg_dst *out,
+                struct ureg_src *in,
+                struct ureg_src *sampler,
+                struct ureg_dst *temp,
+                struct ureg_src *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[0], sampler[2]);
+   EXTENDED_BLENDER_OVER_FUNC
+   ureg_MUL(ureg, temp[4], ureg_src(temp[0]), ureg_src(temp[1]));
+   ureg_ADD(ureg, temp[1], ureg_src(temp[4]), ureg_src(temp[3]));
+
+   ureg_MUL(ureg, temp[2], ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_ADD(ureg, temp[3], ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_SUB(ureg, ureg_writemask(temp[1], TGSI_WRITEMASK_W),
+            ureg_src(temp[3]), ureg_src(temp[2]));
+
+   ureg_MOV(ureg, *out, ureg_src(temp[1]));
+}
+
+static INLINE void
+blend_screen( struct ureg_program *ureg,
+              struct ureg_dst     *out,
+              struct ureg_src     *in,
+              struct ureg_src     *sampler,
+              struct ureg_dst     *temp,
+              struct ureg_src     *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[0], sampler[2]);
+   ureg_ADD(ureg, temp[3], ureg_src(temp[0]), ureg_src(temp[1]));
+   ureg_MUL(ureg, temp[2], ureg_src(temp[0]), ureg_src(temp[1]));
+   ureg_SUB(ureg, *out, ureg_src(temp[3]), ureg_src(temp[2]));
+}
+
+static INLINE void
+blend_darken( struct ureg_program *ureg,
+              struct ureg_dst     *out,
+              struct ureg_src     *in,
+              struct ureg_src     *sampler,
+              struct ureg_dst     *temp,
+              struct ureg_src     *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[0], sampler[2]);
+   EXTENDED_BLENDER_OVER_FUNC
+   ureg_MUL(ureg, temp[4], ureg_src(temp[0]),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_MUL(ureg, temp[5], ureg_src(temp[1]),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W));
+   ureg_MIN(ureg, temp[4], ureg_src(temp[4]), ureg_src(temp[5]));
+   ureg_ADD(ureg, temp[1], ureg_src(temp[3]), ureg_src(temp[4]));
+
+   ureg_MUL(ureg, temp[2], ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_ADD(ureg, temp[3], ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_SUB(ureg, ureg_writemask(temp[1], TGSI_WRITEMASK_W),
+            ureg_src(temp[3]), ureg_src(temp[2]));
+
+   ureg_MOV(ureg, *out, ureg_src(temp[1]));
+}
+
+static INLINE void
+blend_lighten( struct ureg_program *ureg,
+               struct ureg_dst     *out,
+               struct ureg_src     *in,
+               struct ureg_src     *sampler,
+               struct ureg_dst *temp,
+               struct ureg_src     *constant)
+{
+   ureg_TEX(ureg, temp[1], TGSI_TEXTURE_2D, in[0], sampler[2]);
+   EXTENDED_BLENDER_OVER_FUNC
+   ureg_MUL(ureg, temp[4], ureg_src(temp[0]),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_MUL(ureg, temp[5], ureg_src(temp[1]),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W));
+   ureg_MAX(ureg, temp[4], ureg_src(temp[4]), ureg_src(temp[5]));
+   ureg_ADD(ureg, temp[1], ureg_src(temp[3]), ureg_src(temp[4]));
+
+   ureg_MUL(ureg, temp[2], ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_ADD(ureg, temp[3], ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+            ureg_scalar(ureg_src(temp[1]), TGSI_SWIZZLE_W));
+   ureg_SUB(ureg, ureg_writemask(temp[1], TGSI_WRITEMASK_W),
+            ureg_src(temp[3]), ureg_src(temp[2]));
+
+   ureg_MOV(ureg, *out, ureg_src(temp[1]));
+}
+
+static INLINE void
+premultiply( struct ureg_program *ureg,
+                struct ureg_dst *out,
+                struct ureg_src *in,
+                struct ureg_src *sampler,
+                struct ureg_dst *temp,
+                struct ureg_src *constant)
+{
+   ureg_MUL(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_XYZ),
+            ureg_src(temp[0]),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W));
+}
+
+static INLINE void
+unpremultiply( struct ureg_program *ureg,
+                struct ureg_dst *out,
+                struct ureg_src *in,
+                struct ureg_src *sampler,
+                struct ureg_dst *temp,
+                struct ureg_src *constant)
+{
+   ureg_TEX(ureg, temp[0], TGSI_TEXTURE_2D, in[0], sampler[1]);
+}
+
+/* Apply SGE thresholds to temp[0] in place, then copy temp[0] to *out. */
+static INLINE void
+color_bw( struct ureg_program *ureg,
+                struct ureg_dst *out,
+                struct ureg_src *in,
+                struct ureg_src *sampler,
+                struct ureg_dst *temp,
+                struct ureg_src *constant)
+{
+   ureg_ADD(ureg, temp[1],
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y),
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y));
+   ureg_RCP(ureg, temp[2], ureg_src(temp[1]));
+   ureg_ADD(ureg, temp[1],
+            ureg_scalar(constant[1], TGSI_SWIZZLE_Y),
+            ureg_src(temp[2]));
+   ureg_ADD(ureg, ureg_writemask(temp[2], TGSI_WRITEMASK_X),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_X),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_Y));
+   ureg_ADD(ureg, ureg_writemask(temp[2], TGSI_WRITEMASK_X),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_Z),
+            ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_X)); /* NOTE(review): replaces the x+y sum above -- confirm */
+   ureg_SGE(ureg,
+            ureg_writemask(temp[0], TGSI_WRITEMASK_XYZ),
+            ureg_scalar(ureg_src(temp[2]), TGSI_SWIZZLE_X),
+            ureg_src(temp[1]));
+  ureg_SGE(ureg,
+           ureg_writemask(temp[0], TGSI_WRITEMASK_W),
+           ureg_scalar(ureg_src(temp[0]), TGSI_SWIZZLE_W),
+           ureg_scalar(ureg_src(temp[2]), TGSI_SWIZZLE_Y));
+  ureg_MOV(ureg, *out, ureg_src(temp[0]));
+}
 
 
 struct shader_asm_info {
    VGint id;
-   VGint num_tokens;
-   const char * txt;
+   ureg_func func;
 
    VGboolean needs_position;
 
@@ -203,44 +412,44 @@ struct shader_asm_info {
 
 static const struct shader_asm_info shaders_asm[] = {
    /* fills */
-   {VEGA_SOLID_FILL_SHADER,       40,  solid_fill_asm,
+   {VEGA_SOLID_FILL_SHADER, solid_fill,
     VG_FALSE, 0, 1, 0, 0, 0, 0},
-   {VEGA_LINEAR_GRADIENT_SHADER, 200,  linear_grad_asm,
+   {VEGA_LINEAR_GRADIENT_SHADER, linear_grad,
     VG_TRUE,  0, 5, 0, 1, 0, 5},
-   {VEGA_RADIAL_GRADIENT_SHADER, 200,  radial_grad_asm,
+   {VEGA_RADIAL_GRADIENT_SHADER, radial_grad,
     VG_TRUE,  0, 5, 0, 1, 0, 6},
-   {VEGA_PATTERN_SHADER,         100,      pattern_asm,
+   {VEGA_PATTERN_SHADER, pattern,
     VG_TRUE,  1, 4, 0, 1, 0, 5},
 
    /* image draw modes */
-   {VEGA_IMAGE_NORMAL_SHADER,    200, image_normal_asm,
+   {VEGA_IMAGE_NORMAL_SHADER, image_normal,
     VG_TRUE,  0, 0, 3, 1, 0, 0},
-   {VEGA_IMAGE_MULTIPLY_SHADER,  200, image_multiply_asm,
+   {VEGA_IMAGE_MULTIPLY_SHADER, image_multiply,
     VG_TRUE,  0, 0, 3, 1, 0, 2},
-   {VEGA_IMAGE_STENCIL_SHADER,   200, image_stencil_asm,
+   {VEGA_IMAGE_STENCIL_SHADER, image_stencil,
     VG_TRUE,  0, 0, 3, 1, 0, 2},
 
-   {VEGA_MASK_SHADER,            100,         mask_asm,
+   {VEGA_MASK_SHADER, mask,
     VG_TRUE,  0, 0, 1, 1, 0, 2},
 
    /* extra blend modes */
-   {VEGA_BLEND_MULTIPLY_SHADER,  200, blend_multiply_asm,
+   {VEGA_BLEND_MULTIPLY_SHADER, blend_multiply,
     VG_TRUE,  1, 1, 2, 1, 0, 5},
-   {VEGA_BLEND_SCREEN_SHADER,    200, blend_screen_asm,
+   {VEGA_BLEND_SCREEN_SHADER, blend_screen,
     VG_TRUE,  0, 0, 2, 1, 0, 4},
-   {VEGA_BLEND_DARKEN_SHADER,    200, blend_darken_asm,
+   {VEGA_BLEND_DARKEN_SHADER, blend_darken,
     VG_TRUE,  1, 1, 2, 1, 0, 6},
-   {VEGA_BLEND_LIGHTEN_SHADER,   200, blend_lighten_asm,
+   {VEGA_BLEND_LIGHTEN_SHADER, blend_lighten,
     VG_TRUE,  1, 1, 2, 1, 0, 6},
 
    /* premultiply */
-   {VEGA_PREMULTIPLY_SHADER,   100, premultiply_asm,
+   {VEGA_PREMULTIPLY_SHADER, premultiply,
     VG_FALSE,  0, 0, 0, 0, 0, 1},
-   {VEGA_UNPREMULTIPLY_SHADER,   100, unpremultiply_asm,
+   {VEGA_UNPREMULTIPLY_SHADER, unpremultiply,
     VG_FALSE,  0, 0, 0, 0, 0, 1},
 
    /* color transform to black and white */
-   {VEGA_BW_SHADER,   150, color_bw_asm,
+   {VEGA_BW_SHADER, color_bw,
     VG_FALSE,  1, 1, 0, 0, 0, 3},
 };
 #endif
diff --git a/src/gallium/state_trackers/vega/shaders_cache.c b/src/gallium/state_trackers/vega/shaders_cache.c
index 593e60fc63a..f43fe6ee4cb 100644
--- a/src/gallium/state_trackers/vega/shaders_cache.c
+++ b/src/gallium/state_trackers/vega/shaders_cache.c
@@ -123,17 +123,23 @@ static INLINE VGint range_max(VGint max, VGint current)
    return MAX2(max, current);
 }
 
-static void
-create_preamble(char *txt,
-                const struct shader_asm_info *shaders[SHADER_STAGES],
-                int num_shaders)
+static void *
+combine_shaders(const struct shader_asm_info *shaders[SHADER_STAGES], int num_shaders,
+                struct pipe_context *pipe,
+                struct pipe_shader_state *shader)
 {
    VGboolean declare_input = VG_FALSE;
    VGint start_const   = -1, end_const   = 0;
    VGint start_temp    = -1, end_temp    = 0;
    VGint start_sampler = -1, end_sampler = 0;
-   VGint i;
+   VGint i, current_shader = 0;
    VGint num_consts, num_temps, num_samplers;
+   struct ureg_program *ureg;
+   struct ureg_src in[2];
+   struct ureg_src *sampler = NULL;
+   struct ureg_src *constant = NULL;
+   struct ureg_dst out, *temp = NULL;
+   void *p = NULL;
 
    for (i = 0; i < num_shaders; ++i) {
       if (shaders[i]->num_consts)
@@ -158,99 +164,94 @@ create_preamble(char *txt,
    if (start_temp < 0)
       start_temp = 0;
    if (start_sampler < 0)
-      start_sampler = 0;
+       start_sampler = 0;
 
    num_consts   = end_const   - start_const;
    num_temps    = end_temp    - start_temp;
    num_samplers = end_sampler - start_sampler;
-   /* end exclusive */
-   --end_const;
-   --end_temp;
-   --end_sampler;
 
-   sprintf(txt, "FRAG\n");
+   ureg = ureg_create(TGSI_PROCESSOR_FRAGMENT);
+   if (!ureg)
+       return NULL;
 
    if (declare_input) {
-      sprintf(txt + strlen(txt), "DCL IN[0], POSITION, LINEAR\n");
-      sprintf(txt + strlen(txt), "DCL IN[1], GENERIC[0], PERSPECTIVE\n");
+      in[0] = ureg_DECL_fs_input(ureg,
+                                 TGSI_SEMANTIC_POSITION,
+                                 0,
+                                 TGSI_INTERPOLATE_LINEAR);
+      in[1] = ureg_DECL_fs_input(ureg,
+                                 TGSI_SEMANTIC_GENERIC,
+                                 0,
+                                 TGSI_INTERPOLATE_PERSPECTIVE);
    }
 
    /* we always have a color output */
-   sprintf(txt + strlen(txt), "DCL OUT[0], COLOR, CONSTANT\n");
-
-   if (num_consts > 1)
-      sprintf(txt + strlen(txt), "DCL CONST[%d..%d], CONSTANT\n", start_const, end_const);
-   else if (num_consts == 1)
-      sprintf(txt + strlen(txt), "DCL CONST[%d], CONSTANT\n", start_const);
-
-   if (num_temps > 1)
-      sprintf(txt + strlen(txt), "DCL TEMP[%d..%d], CONSTANT\n", start_temp, end_temp);
-   else if (num_temps > 1)
-      sprintf(txt + strlen(txt), "DCL TEMP[%d], CONSTANT\n", start_temp);
-
-   if (num_samplers > 1)
-      sprintf(txt + strlen(txt), "DCL SAMP[%d..%d], CONSTANT\n", start_sampler, end_sampler);
-   else if (num_samplers == 1)
-      sprintf(txt + strlen(txt), "DCL SAMP[%d], CONSTANT\n", start_sampler);
-}
+   out = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
 
-static void *
-combine_shaders(const struct shader_asm_info *shaders[SHADER_STAGES], int num_shaders,
-                struct pipe_context *pipe,
-                struct pipe_shader_state *shader)
-{
-   char *combined_txt;
-   int combined_len = MAX_PREAMBLE;
-   int combined_tokens = 0;
-   int i = 0;
-   int current_shader = 0;
-   int current_len;
+   if (num_consts >= 1) {
+      constant = (struct ureg_src *) malloc(sizeof(struct ureg_src) * end_const);
+      for (i = start_const; i < end_const; i++) {
+         constant[i] = ureg_DECL_constant(ureg, i);
+      }
 
-   for (i = 0; i < num_shaders; ++i) {
-      combined_len += strlen(shaders[i]->txt);
-      combined_tokens += shaders[i]->num_tokens;
    }
-   /* add for the %s->TEMP[0] substitutions */
-   combined_len += num_shaders * 7 /*TEMP[0]*/ + 4 /*"END\n"*/;
 
-   combined_txt = (char*)malloc(combined_len);
-   combined_txt[0] = '\0';
+   if (num_temps >= 1) {
+      temp = (struct ureg_dst *) malloc(sizeof(struct ureg_dst) * end_temp);
+      for (i = start_temp; i < end_temp; i++) {
+         temp[i] = ureg_DECL_temporary(ureg);
+      }
+   }
 
-   create_preamble(combined_txt, shaders, num_shaders);
+   if (num_samplers >= 1) {
+      sampler = (struct ureg_src *) malloc(sizeof(struct ureg_src) * end_sampler);
+      for (i = start_sampler; i < end_sampler; i++) {
+         sampler[i] = ureg_DECL_sampler(ureg, i);
+      }
+   }
 
    while (current_shader < num_shaders) {
-      const char temp[] = "TEMP[0]";
-      const char out[] = "OUT[0]";
-      const char *subst = temp;
-
-      current_len = strlen(combined_txt);
-
-      /* if the last shader then output */
-      if (current_shader + 1 == num_shaders)
-         subst = out;
-
-      snprintf(combined_txt + current_len,
-               combined_len - current_len,
-               shaders[current_shader]->txt,
-               subst);
-      ++current_shader;
+      if ((current_shader + 1) == num_shaders) {
+         shaders[current_shader]->func(ureg,
+                                       &out,
+                                       in,
+                                       sampler,
+                                       temp,
+                                       constant);
+      } else {
+         shaders[current_shader]->func(ureg,
+                                      &temp[0],
+                                      in,
+                                      sampler,
+                                      temp,
+                                      constant);
+      }
+      current_shader++;
    }
 
+   ureg_END(ureg);
 
-   current_len = strlen(combined_txt);
-   snprintf(combined_txt + current_len,
-            combined_len - current_len,
-            "END\n");
+   shader->tokens = ureg_finalize(ureg);
 
-   debug_printf("Combined shader is : \n%s\n",
-                 combined_txt);
+   /* on finalize failure p stays NULL, but the cleanup below still runs */
+   if (shader->tokens)
+      p = pipe->create_fs_state(pipe, shader);
 
-   shader->tokens = tokens_from_assembly(
-            combined_txt, combined_tokens);
+   if (num_temps >= 1) {
+      for (i = start_temp; i < end_temp; i++) {
+         ureg_release_temporary(ureg, temp[i]);
+      }
+   }
+   ureg_destroy(ureg);  /* only after the temporaries have been released */
 
-   free(combined_txt);
+   if (temp)
+      free(temp);
+   if (constant)
+      free(constant);
+   if (sampler)
+      free(sampler);
 
-   return pipe->create_fs_state(pipe, shader);
+   return p;
 }
 
 static void *
diff --git a/src/mesa/drivers/directfb/idirectfbgl_mesa.c b/src/mesa/drivers/directfb/idirectfbgl_mesa.c
index 62a3269d171..85a6f036724 100644
--- a/src/mesa/drivers/directfb/idirectfbgl_mesa.c
+++ b/src/mesa/drivers/directfb/idirectfbgl_mesa.c
@@ -813,7 +813,7 @@ directfbgl_create_context( GLcontext        *context,
 {
      struct dd_function_table functions;
      
-     _mesa_initialize_framebuffer( framebuffer, visual ); 
+     _mesa_initialize_window_framebuffer( framebuffer, visual );
      
      _mesa_init_driver_functions( &functions );
      functions.GetString     = dfbGetString;
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index e8cc202f889..c9ef1647a32 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -202,7 +202,7 @@ intelCreateBuffer(__DRIscreen * driScrnPriv,
       if (!fb)
 	 return GL_FALSE;
 
-      _mesa_initialize_framebuffer(fb, mesaVis);
+      _mesa_initialize_window_framebuffer(fb, mesaVis);
 
       if (mesaVis->redBits == 5)
 	 rgbFormat = MESA_FORMAT_RGB565;
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 249c0bbc11d..aecba7f8949 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -74,7 +74,7 @@ static void r200_emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
 			     GLvoid *data, int stride, int count)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-	uint32_t *out;
+	GLfloat *out;
 	int i;
 	int size = 1;
 
@@ -91,7 +91,7 @@ static void r200_emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
 	aos->count = count;
 
 	radeon_bo_map(aos->bo, 1);
-	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
+	out = (GLfloat*)((char*)aos->bo->ptr + aos->offset);
 	for (i = 0; i < count; i++) {
 	  out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
 	  out++;
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c
index aa69b0fc72b..928c15e1e40 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog.c
@@ -297,7 +297,7 @@ void r300FragmentProgramDump(struct rX00_fragment_program_code *c)
 			if (flags[0] != 0) {
 				sprintf(tmp, "o%i.%s",
 					(code->alu.inst[i].
-					 rgb_addr >> R300_ALU_DSTC_SHIFT) & 31,
+					 rgb_addr >> 29) & 3,
 					flags);
 				strcat(dstc, tmp);
 			}
@@ -311,7 +311,7 @@ void r300FragmentProgramDump(struct rX00_fragment_program_code *c)
 			if (code->alu.inst[i].alpha_addr & R300_ALU_DSTA_OUTPUT) {
 				sprintf(tmp, "o%i.w ",
 					(code->alu.inst[i].
-					 alpha_addr >> R300_ALU_DSTA_SHIFT) & 31);
+					 alpha_addr >> 25) & 3);
 				strcat(dsta, tmp);
 			}
 			if (code->alu.inst[i].alpha_addr & R300_ALU_DSTA_DEPTH) {
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
index 375838d98e7..cc552aee176 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
@@ -176,7 +176,9 @@ static int emit_alu(struct r300_emit_state * emit, struct rc_pair_instruction* i
 			(inst->RGB.WriteMask << R300_ALU_DSTC_REG_MASK_SHIFT);
 	}
 	if (inst->RGB.OutputWriteMask) {
-		code->alu.inst[ip].rgb_addr |= (inst->RGB.OutputWriteMask << R300_ALU_DSTC_OUTPUT_MASK_SHIFT);
+		code->alu.inst[ip].rgb_addr |=
+            (inst->RGB.OutputWriteMask << R300_ALU_DSTC_OUTPUT_MASK_SHIFT) |
+            R300_RGB_TARGET(inst->RGB.Target);
 		emit->node_flags |= R300_RGBA_OUT;
 	}
 
@@ -187,7 +189,8 @@ static int emit_alu(struct r300_emit_state * emit, struct rc_pair_instruction* i
 			R300_ALU_DSTA_REG;
 	}
 	if (inst->Alpha.OutputWriteMask) {
-		code->alu.inst[ip].alpha_addr |= R300_ALU_DSTA_OUTPUT;
+		code->alu.inst[ip].alpha_addr |= R300_ALU_DSTA_OUTPUT |
+            R300_ALPHA_TARGET(inst->Alpha.Target);
 		emit->node_flags |= R300_RGBA_OUT;
 	}
 	if (inst->Alpha.DepthWriteMask) {
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 5581f25352d..c2d5dc27b49 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -35,7 +35,10 @@ static void dataflow_outputs_mark_use(void * userdata, void * data,
 		void (*callback)(void *, unsigned int, unsigned int))
 {
 	struct r300_fragment_program_compiler * c = userdata;
-	callback(data, c->OutputColor, RC_MASK_XYZW);
+	callback(data, c->OutputColor[0], RC_MASK_XYZW);
+	callback(data, c->OutputColor[1], RC_MASK_XYZW);
+	callback(data, c->OutputColor[2], RC_MASK_XYZW);
+	callback(data, c->OutputColor[3], RC_MASK_XYZW);
 	callback(data, c->OutputDepth, RC_MASK_W);
 }
 
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index b1b14394b6e..c2eb613b23f 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -241,6 +241,9 @@ static void emit_paired(struct r300_fragment_program_compiler *c, struct rc_pair
 	code->inst[ip].inst4 |= translate_arg_alpha(inst, 1) << R500_ALPHA_SEL_B_SHIFT;
 	code->inst[ip].inst5 |= translate_arg_alpha(inst, 2) << R500_ALU_RGBA_ALPHA_SEL_C_SHIFT;
 
+    code->inst[ip].inst3 |= R500_ALU_RGB_TARGET(inst->RGB.Target);
+    code->inst[ip].inst4 |= R500_ALPHA_TARGET(inst->Alpha.Target);
+
 	if (inst->WriteALUResult) {
 		code->inst[ip].inst3 |= R500_ALU_RGB_WMASK;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index f27f858652e..6bfda0574f6 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -83,8 +83,10 @@ struct r300_fragment_program_compiler {
 	struct rX00_fragment_program_code *code;
 	struct r300_fragment_program_external_state state;
 	unsigned is_r500;
+    /* Register corresponding to the depthbuffer. */
 	unsigned OutputDepth;
-	unsigned OutputColor;
+    /* Registers corresponding to the four colorbuffers. */
+	unsigned OutputColor[4];
 
 	void * UserData;
 	void (*AllocateHwInputs)(
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
index 72117682725..fff5b0c2173 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
@@ -203,12 +203,21 @@ static void set_pair_instruction(struct r300_fragment_program_compiler *c,
 
 	/* Destination handling */
 	if (inst->DstReg.File == RC_FILE_OUTPUT) {
-		if (inst->DstReg.Index == c->OutputColor) {
-			pair->RGB.OutputWriteMask |= inst->DstReg.WriteMask & RC_MASK_XYZ;
-			pair->Alpha.OutputWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
-		} else if (inst->DstReg.Index == c->OutputDepth) {
-			pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
-		}
+        if (inst->DstReg.Index == c->OutputDepth) {
+            pair->Alpha.DepthWriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+        } else {
+            for (i = 0; i < 4; i++) {
+                if (inst->DstReg.Index == c->OutputColor[i]) {
+                    pair->RGB.Target = i;
+                    pair->Alpha.Target = i;
+                    pair->RGB.OutputWriteMask |=
+                        inst->DstReg.WriteMask & RC_MASK_XYZ;
+                    pair->Alpha.OutputWriteMask |=
+                        GET_BIT(inst->DstReg.WriteMask, 3);
+                    break;
+                }
+            }
+        }
 	} else {
 		if (needrgb) {
 			pair->RGB.DestIndex = inst->DstReg.Index;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
index 6685ade3ea8..511cc707a38 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
@@ -60,6 +60,7 @@ struct radeon_pair_instruction_rgb {
 	unsigned int Opcode:8;
 	unsigned int DestIndex:RC_REGISTER_INDEX_BITS;
 	unsigned int WriteMask:3;
+    unsigned int Target:2;
 	unsigned int OutputWriteMask:3;
 	unsigned int Saturate:1;
 
@@ -77,6 +78,7 @@ struct radeon_pair_instruction_alpha {
 	unsigned int Opcode:8;
 	unsigned int DestIndex:RC_REGISTER_INDEX_BITS;
 	unsigned int WriteMask:1;
+    unsigned int Target:2;
 	unsigned int OutputWriteMask:1;
 	unsigned int DepthWriteMask:1;
 	unsigned int Saturate:1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
index d863b82d53f..28fb9eae925 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
@@ -229,7 +229,7 @@ static void rc_print_pair_instruction(FILE * f, struct rc_instruction * fullinst
 				(inst->RGB.WriteMask & 2) ? "y" : "",
 				(inst->RGB.WriteMask & 4) ? "z" : "");
 		if (inst->RGB.OutputWriteMask)
-			fprintf(f, " color.%s%s%s",
+			fprintf(f, " color[%i].%s%s%s", inst->RGB.Target,
 				(inst->RGB.OutputWriteMask & 1) ? "x" : "",
 				(inst->RGB.OutputWriteMask & 2) ? "y" : "",
 				(inst->RGB.OutputWriteMask & 4) ? "z" : "");
@@ -255,7 +255,7 @@ static void rc_print_pair_instruction(FILE * f, struct rc_instruction * fullinst
 		if (inst->Alpha.WriteMask)
 			fprintf(f, " temp[%i].w", inst->Alpha.DestIndex);
 		if (inst->Alpha.OutputWriteMask)
-			fprintf(f, " color.w");
+			fprintf(f, " color[%i].w", inst->Alpha.Target);
 		if (inst->Alpha.DepthWriteMask)
 			fprintf(f, " depth.w");
 		if (inst->WriteALUResult == RC_ALURESULT_W)
diff --git a/src/mesa/drivers/dri/r300/r300_blit.c b/src/mesa/drivers/dri/r300/r300_blit.c
index e24c7955d4b..54ac2510e7a 100644
--- a/src/mesa/drivers/dri/r300/r300_blit.c
+++ b/src/mesa/drivers/dri/r300/r300_blit.c
@@ -114,7 +114,7 @@ static void create_fragment_program(struct r300_context *r300)
     inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_XYZW;
 
     compiler.Base.Program.InputsRead = (1 << FRAG_ATTRIB_TEX0);
-    compiler.OutputColor = FRAG_RESULT_COLOR;
+    compiler.OutputColor[0] = FRAG_RESULT_COLOR;
     compiler.OutputDepth = FRAG_RESULT_DEPTH;
     compiler.is_r500 = (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515);
     compiler.code = &r300->blit.fp_code;
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
index e41aeff91a4..a0e2dd3c09f 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_common.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -221,7 +221,8 @@ static void translate_fragment_program(GLcontext *ctx, struct r300_fragment_prog
 	compiler.state = fp->state;
 	compiler.is_r500 = (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) ? GL_TRUE : GL_FALSE;
 	compiler.OutputDepth = FRAG_RESULT_DEPTH;
-	compiler.OutputColor = FRAG_RESULT_COLOR;
+	memset(compiler.OutputColor, 0, 4 * sizeof(unsigned));
+	compiler.OutputColor[0] = FRAG_RESULT_COLOR;
 	compiler.AllocateHwInputs = &allocate_hw_inputs;
 
 	if (compiler.Base.Debug) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 6fa1a0663ba..93b6399a669 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -1479,7 +1479,7 @@ radeonCreateBuffer( __DRIscreen *driScrnPriv,
     if (!rfb)
       return GL_FALSE;
 
-    _mesa_initialize_framebuffer(&rfb->base, mesaVis);
+    _mesa_initialize_window_framebuffer(&rfb->base, mesaVis);
 
     if (mesaVis->redBits == 5)
         rgbFormat = _mesa_little_endian() ? MESA_FORMAT_RGB565 : MESA_FORMAT_RGB565_REV;
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index 8340861aff8..4e823669bfc 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -368,7 +368,7 @@ driCreateNewDrawable(__DRIscreen *screen,
     buf->row = _mesa_malloc(MAX_WIDTH * 4);
 
     /* basic framebuffer setup */
-    _mesa_initialize_framebuffer(&buf->Base, &config->modes);
+    _mesa_initialize_window_framebuffer(&buf->Base, &config->modes);
 
     /* add front renderbuffer */
     frontrb = swrast_new_renderbuffer(&config->modes, GL_TRUE);
diff --git a/src/mesa/drivers/fbdev/glfbdev.c b/src/mesa/drivers/fbdev/glfbdev.c
index 531558dc4d6..1a56b2395fa 100644
--- a/src/mesa/drivers/fbdev/glfbdev.c
+++ b/src/mesa/drivers/fbdev/glfbdev.c
@@ -626,7 +626,7 @@ glFBDevCreateBuffer( const struct fb_fix_screeninfo *fixInfo,
       return NULL;
 
    /* basic framebuffer setup */
-   _mesa_initialize_framebuffer(&buf->glframebuffer, &visual->glvisual);
+   _mesa_initialize_window_framebuffer(&buf->glframebuffer, &visual->glvisual);
    /* add front renderbuffer */
    frontrb = new_glfbdev_renderbuffer(frontBuffer, visual);
    _mesa_add_renderbuffer(&buf->glframebuffer, BUFFER_FRONT_LEFT,
diff --git a/src/mesa/drivers/glslcompiler/Makefile b/src/mesa/drivers/glslcompiler/Makefile
index fa8293d039a..080fe475c16 100644
--- a/src/mesa/drivers/glslcompiler/Makefile
+++ b/src/mesa/drivers/glslcompiler/Makefile
@@ -10,6 +10,7 @@ PROGRAM = glslcompiler
 OBJECTS = \
 	glslcompiler.o \
 	../../glapi/glapi.o \
+	../../glapi/glapi_nop.o \
 	../../glapi/glthread.o \
 	../../main/dispatch.o \
 	../common/driverfuncs.o \
diff --git a/src/mesa/drivers/windows/gdi/wmesa.c b/src/mesa/drivers/windows/gdi/wmesa.c
index ceeafd5f909..b24b758cfb2 100644
--- a/src/mesa/drivers/windows/gdi/wmesa.c
+++ b/src/mesa/drivers/windows/gdi/wmesa.c
@@ -35,7 +35,7 @@ wmesa_new_framebuffer(HDC hdc, GLvisual *visual)
     WMesaFramebuffer pwfb
         = (WMesaFramebuffer) malloc(sizeof(struct wmesa_framebuffer));
     if (pwfb) {
-        _mesa_initialize_framebuffer(&pwfb->Base, visual);
+        _mesa_initialize_window_framebuffer(&pwfb->Base, visual);
         pwfb->hDC = hdc;
         /* insert at head of list */
         pwfb->next = FirstFramebuffer;
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index bf767bceddf..1a5456e1be2 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -375,7 +375,7 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type,
    b->type = type;
    b->cmap = cmap;
 
-   _mesa_initialize_framebuffer(&b->mesa_buffer, &vis->mesa_visual);
+   _mesa_initialize_window_framebuffer(&b->mesa_buffer, &vis->mesa_visual);
    b->mesa_buffer.Delete = xmesa_delete_framebuffer;
 
    /*
diff --git a/src/mesa/main/bitset.h b/src/mesa/main/bitset.h
index 8bd4526cb6f..f2709abc9fd 100644
--- a/src/mesa/main/bitset.h
+++ b/src/mesa/main/bitset.h
@@ -27,7 +27,12 @@
  * \brief Bitset of arbitrary size definitions.
  * \author Michal Krol
  */
- 
+
+#ifndef BITSET_H
+#define BITSET_H
+
+#include "imports.h"
+
 /****************************************************************************
  * generic bitset implementation
  */
@@ -74,6 +79,23 @@
    ((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
    (assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
 
+/* Get first bit set in a bitset.
+ */
+static INLINE int
+__bitset_ffs(const BITSET_WORD *x, int n)
+{
+   int i;
+
+   for (i = 0; i < n; i++) {
+      if (x[i])
+	 return _mesa_ffs(x[i]) + BITSET_WORDBITS * i;
+   }
+
+   return 0;
+}
+
+#define BITSET_FFS(x) __bitset_ffs(x, Elements(x))
+
 /****************************************************************************
  * 64-bit bitset implementation
  */
@@ -120,3 +142,4 @@
    ((x)[BITSET64_BITWORD(b)] &= ~BITSET64_RANGE(b, e)) : \
    (assert (!"BITSET64_CLEAR_RANGE: bit range crosses word boundary"), 0))
 
+#endif
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index d958dbf7d48..96e53443836 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -88,7 +88,7 @@ _mesa_create_framebuffer(const GLvisual *visual)
    struct gl_framebuffer *fb = CALLOC_STRUCT(gl_framebuffer);
    assert(visual);
    if (fb) {
-      _mesa_initialize_framebuffer(fb, visual);
+      _mesa_initialize_window_framebuffer(fb, visual);
    }
    return fb;
 }
@@ -109,15 +109,7 @@ _mesa_new_framebuffer(GLcontext *ctx, GLuint name)
    assert(name != 0);
    fb = CALLOC_STRUCT(gl_framebuffer);
    if (fb) {
-      fb->Name = name;
-      fb->RefCount = 1;
-      fb->_NumColorDrawBuffers = 1;
-      fb->ColorDrawBuffer[0] = GL_COLOR_ATTACHMENT0_EXT;
-      fb->_ColorDrawBufferIndexes[0] = BUFFER_COLOR0;
-      fb->ColorReadBuffer = GL_COLOR_ATTACHMENT0_EXT;
-      fb->_ColorReadBufferIndex = BUFFER_COLOR0;
-      fb->Delete = _mesa_destroy_framebuffer;
-      _glthread_INIT_MUTEX(fb->Mutex);
+      _mesa_initialize_user_framebuffer(fb, name);
    }
    return fb;
 }
@@ -126,10 +118,11 @@ _mesa_new_framebuffer(GLcontext *ctx, GLuint name)
 /**
  * Initialize a gl_framebuffer object.  Typically used to initialize
  * window system-created framebuffers, not user-created framebuffers.
- * \sa _mesa_create_framebuffer
+ * \sa _mesa_initialize_user_framebuffer
  */
 void
-_mesa_initialize_framebuffer(struct gl_framebuffer *fb, const GLvisual *visual)
+_mesa_initialize_window_framebuffer(struct gl_framebuffer *fb,
+				     const GLvisual *visual)
 {
    assert(fb);
    assert(visual);
@@ -167,6 +160,30 @@ _mesa_initialize_framebuffer(struct gl_framebuffer *fb, const GLvisual *visual)
 
 
 /**
+ * Initialize a user-created gl_framebuffer object.
+ * \sa _mesa_initialize_window_framebuffer
+ */
+void
+_mesa_initialize_user_framebuffer(struct gl_framebuffer *fb, GLuint name)
+{
+   assert(fb);
+   assert(name);
+
+   _mesa_bzero(fb, sizeof(struct gl_framebuffer));
+
+   fb->Name = name;
+   fb->RefCount = 1;
+   fb->_NumColorDrawBuffers = 1;
+   fb->ColorDrawBuffer[0] = GL_COLOR_ATTACHMENT0_EXT;
+   fb->_ColorDrawBufferIndexes[0] = BUFFER_COLOR0;
+   fb->ColorReadBuffer = GL_COLOR_ATTACHMENT0_EXT;
+   fb->_ColorReadBufferIndex = BUFFER_COLOR0;
+   fb->Delete = _mesa_destroy_framebuffer;
+   _glthread_INIT_MUTEX(fb->Mutex);
+}
+
+
+/**
  * Deallocate buffer and everything attached to it.
  * Typically called via the gl_framebuffer->Delete() method.
  */
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index ef21dd98e83..960513812cf 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -34,7 +34,11 @@ extern struct gl_framebuffer *
 _mesa_new_framebuffer(GLcontext *ctx, GLuint name);
 
 extern void
-_mesa_initialize_framebuffer(struct gl_framebuffer *fb, const GLvisual *visual);
+_mesa_initialize_window_framebuffer(struct gl_framebuffer *fb,
+				     const GLvisual *visual);
+
+extern void
+_mesa_initialize_user_framebuffer(struct gl_framebuffer *fb, GLuint name);
 
 extern void
 _mesa_destroy_framebuffer(struct gl_framebuffer *buffer);
diff --git a/src/mesa/shader/prog_print.c b/src/mesa/shader/prog_print.c
index 9f9789e010c..54fd88ad4fb 100644
--- a/src/mesa/shader/prog_print.c
+++ b/src/mesa/shader/prog_print.c
@@ -150,6 +150,10 @@ arb_input_attrib_string(GLint index, GLenum progType)
       "fragment.varying[7]"
    };
 
+   /* sanity checks */
+   assert(strcmp(vertAttribs[VERT_ATTRIB_TEX0], "vertex.texcoord[0]") == 0);
+   assert(strcmp(vertAttribs[VERT_ATTRIB_GENERIC15], "vertex.attrib[15]") == 0);
+
    if (progType == GL_VERTEX_PROGRAM_ARB) {
       assert(index < sizeof(vertAttribs) / sizeof(vertAttribs[0]));
       return vertAttribs[index];
@@ -162,6 +166,43 @@ arb_input_attrib_string(GLint index, GLenum progType)
 
 
 /**
+ * Print a vertex program's InputsRead field in human-readable format.
+ * For debugging.
+ */
+void
+_mesa_print_vp_inputs(GLbitfield inputs)
+{
+   _mesa_printf("VP Inputs 0x%x: \n", inputs);
+   while (inputs) {
+      GLint attr = _mesa_ffs(inputs) - 1;
+      const char *name = arb_input_attrib_string(attr,
+                                                 GL_VERTEX_PROGRAM_ARB);
+      _mesa_printf("  %d: %s\n", attr, name);
+      inputs &= ~(1u << attr);  /* unsigned 1: a signed shift by 31 is UB */
+   }
+}
+
+
+/**
+ * Print a fragment program's InputsRead field in human-readable format.
+ * For debugging.
+ */
+void
+_mesa_print_fp_inputs(GLbitfield inputs)
+{
+   _mesa_printf("FP Inputs 0x%x: \n", inputs);
+   while (inputs) {
+      GLint attr = _mesa_ffs(inputs) - 1;
+      const char *name = arb_input_attrib_string(attr,
+                                                 GL_FRAGMENT_PROGRAM_ARB);
+      _mesa_printf("  %d: %s\n", attr, name);
+      inputs &= ~(1u << attr);  /* unsigned 1: a signed shift by 31 is UB */
+   }
+}
+
+
+
+/**
  * Return ARB_v/f_prog-style output attrib string.
  */
 static const char *
diff --git a/src/mesa/shader/prog_print.h b/src/mesa/shader/prog_print.h
index fc286ded540..9ab74560169 100644
--- a/src/mesa/shader/prog_print.h
+++ b/src/mesa/shader/prog_print.h
@@ -37,6 +37,12 @@ typedef enum {
 } gl_prog_print_mode;
 
 
+extern void
+_mesa_print_vp_inputs(GLbitfield inputs);
+
+extern void
+_mesa_print_fp_inputs(GLbitfield inputs);
+
 extern const char *
 _mesa_condcode_string(GLuint condcode);
 
diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c
index 3e86d0adad4..aaf5f96e2a5 100644
--- a/src/mesa/shader/program.c
+++ b/src/mesa/shader/program.c
@@ -677,6 +677,8 @@ _mesa_combine_programs(GLcontext *ctx,
    const GLuint lenB = progB->NumInstructions;
    const GLuint numParamsA = _mesa_num_parameters(progA->Parameters);
    const GLuint newLength = lenA + lenB;
+   GLboolean usedTemps[MAX_PROGRAM_TEMPS];
+   GLuint firstTemp = 0;
    GLbitfield inputsB;
    GLuint i;
 
@@ -698,6 +700,10 @@ _mesa_combine_programs(GLcontext *ctx,
    newProg->Instructions = newInst;
    newProg->NumInstructions = newLength;
 
+   /* find used temp regs (we may need new temps below) */
+   _mesa_find_used_registers(newProg, PROGRAM_TEMPORARY,
+                             usedTemps, MAX_PROGRAM_TEMPS);
+
    if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprogA, *fprogB, *newFprog;
       GLbitfield progB_inputsRead = progB->InputsRead;
@@ -741,12 +747,15 @@ _mesa_combine_programs(GLcontext *ctx,
        */
       if ((progA->OutputsWritten & (1 << FRAG_RESULT_COLOR)) &&
           (progB_inputsRead & FRAG_BIT_COL0)) {
-         GLint tempReg = _mesa_find_free_register(newProg, PROGRAM_TEMPORARY);
+         GLint tempReg = _mesa_find_free_register(usedTemps, MAX_PROGRAM_TEMPS,
+                                                  firstTemp);
          if (tempReg < 0) {
             _mesa_problem(ctx, "No free temp regs found in "
                           "_mesa_combine_programs(), using 31");
             tempReg = 31;
          }
+         firstTemp = tempReg + 1;
+
          /* replace writes to result.color[0] with tempReg */
          replace_registers(newInst, lenA,
                            PROGRAM_OUTPUT, FRAG_RESULT_COLOR,
@@ -784,53 +793,64 @@ _mesa_combine_programs(GLcontext *ctx,
 }
 
 
-
-
 /**
- * Scan the given program to find a free register of the given type.
- * \param regFile - PROGRAM_INPUT, PROGRAM_OUTPUT or PROGRAM_TEMPORARY
+ * Flag which registers of the given file are read or written by 'prog'.
+ * \param prog  program whose instructions are scanned
+ * \param file  type of register to scan for (TEMPs, INPUTs, OUTPUTs, etc.)
+ * \param used  returns true/false flags for in use / free (in-range only)
+ * \param usedSize  number of elements in the 'used' array
  */
-GLint
-_mesa_find_free_register(const struct gl_program *prog, GLuint regFile)
+void
+_mesa_find_used_registers(const struct gl_program *prog,
+                          gl_register_file file,
+                          GLboolean used[], GLuint usedSize)
 {
-   GLboolean used[MAX_PROGRAM_TEMPS];
-   GLuint i, k;
-
-   assert(regFile == PROGRAM_INPUT ||
-          regFile == PROGRAM_OUTPUT ||
-          regFile == PROGRAM_TEMPORARY);
+   GLuint i, j;
 
-   _mesa_memset(used, 0, sizeof(used));
+   _mesa_memset(used, 0, usedSize * sizeof(used[0]));
 
    for (i = 0; i < prog->NumInstructions; i++) {
       const struct prog_instruction *inst = prog->Instructions + i;
       const GLuint n = _mesa_num_inst_src_regs(inst->Opcode);
 
-      /* check dst reg first */
-      if (inst->DstReg.File == regFile) {
+      if (inst->DstReg.File == file && inst->DstReg.Index < usedSize) {
          used[inst->DstReg.Index] = GL_TRUE;
       }
-      else {
-         /* check src regs otherwise */
-         for (k = 0; k < n; k++) {
-            if (inst->SrcReg[k].File == regFile) {
-               used[inst->SrcReg[k].Index] = GL_TRUE;
-               break;
-            }
+      for (j = 0; j < n; j++) {
+         if (inst->SrcReg[j].File == file &&
+             (GLuint) inst->SrcReg[j].Index < usedSize) {
+            used[inst->SrcReg[j].Index] = GL_TRUE;
          }
       }
    }
+}
 
-   for (i = 0; i < MAX_PROGRAM_TEMPS; i++) {
+
+/**
+ * Scan the given 'used' register flag array for the first free (unset)
+ * entry whose index is >= firstReg.
+ * \param used  vector of flags indicating registers in use (as returned
+ *              by _mesa_find_used_registers())
+ * \param usedSize  size of the 'used' array
+ * \param firstReg  first register to start searching at (may be usedSize)
+ * \return index of unused register, or -1 if none.
+ */
+GLint
+_mesa_find_free_register(const GLboolean used[],
+                         GLuint usedSize, GLuint firstReg)
+{
+   GLuint i;
+   /* firstReg == usedSize is legal: callers pass (last result + 1) */
+   assert(firstReg <= usedSize);
+
+   for (i = firstReg; i < usedSize; i++)
       if (!used[i])
          return i;
-   }
 
    return -1;
 }
 
 
-
 /**
  * "Post-process" a GPU program.  This is intended to be used for debugging.
  * Example actions include no-op'ing instructions or changing instruction
diff --git a/src/mesa/shader/program.h b/src/mesa/shader/program.h
index 56a4191f578..0187a2c55ff 100644
--- a/src/mesa/shader/program.h
+++ b/src/mesa/shader/program.h
@@ -119,8 +119,14 @@ _mesa_combine_programs(GLcontext *ctx,
                        const struct gl_program *progA,
                        const struct gl_program *progB);
 
+extern void
+_mesa_find_used_registers(const struct gl_program *prog,
+                          gl_register_file file,
+                          GLboolean used[], GLuint usedSize);
+
 extern GLint
-_mesa_find_free_register(const struct gl_program *prog, GLuint regFile);
+_mesa_find_free_register(const GLboolean used[],
+                         GLuint usedSize, GLuint firstReg);
 
 extern void
 _mesa_postprocess_program(GLcontext *ctx, struct gl_program *prog);
diff --git a/src/mesa/shader/programopt.c b/src/mesa/shader/programopt.c
index 9514545709d..fb2ebe6338f 100644
--- a/src/mesa/shader/programopt.c
+++ b/src/mesa/shader/programopt.c
@@ -495,6 +495,11 @@ _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type)
    GLuint i;
    GLint outputMap[VERT_RESULT_MAX];
    GLuint numVaryingReads = 0;
+   GLboolean usedTemps[MAX_PROGRAM_TEMPS];
+   GLuint firstTemp = 0;
+
+   _mesa_find_used_registers(prog, PROGRAM_TEMPORARY,
+                             usedTemps, MAX_PROGRAM_TEMPS);
 
    assert(type == PROGRAM_VARYING || type == PROGRAM_OUTPUT);
    assert(prog->Target == GL_VERTEX_PROGRAM_ARB || type != PROGRAM_VARYING);
@@ -513,8 +518,10 @@ _mesa_remove_output_reads(struct gl_program *prog, gl_register_file type)
             const GLuint var = inst->SrcReg[j].Index;
             if (outputMap[var] == -1) {
                numVaryingReads++;
-               outputMap[var] = _mesa_find_free_register(prog,
-                                                         PROGRAM_TEMPORARY);
+               outputMap[var] = _mesa_find_free_register(usedTemps,
+                                                         MAX_PROGRAM_TEMPS,
+                                                         firstTemp);
+               firstTemp = outputMap[var] + 1;
             }
             inst->SrcReg[j].File = PROGRAM_TEMPORARY;
             inst->SrcReg[j].Index = outputMap[var];
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index aa0508a41fc..d9b508537d0 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -60,26 +60,10 @@ static void
 translate_fp(struct st_context *st,
              struct st_fragment_program *stfp)
 {
-   const GLbitfield fragInputsRead = stfp->Base.Base.InputsRead;
-
    if (!stfp->state.tokens) {
-      GLuint inAttr, numIn = 0;
-
-      for (inAttr = 0; inAttr < FRAG_ATTRIB_MAX; inAttr++) {
-         if (fragInputsRead & (1 << inAttr)) {
-            stfp->input_to_slot[inAttr] = numIn;
-            numIn++;
-         }
-         else {
-            stfp->input_to_slot[inAttr] = -1;
-         }
-      }
-
-      stfp->num_input_slots = numIn;
-
       assert(stfp->Base.Base.NumInstructions > 0);
 
-      st_translate_fragment_program(st, stfp, stfp->input_to_slot);
+      st_translate_fragment_program(st, stfp);
    }
 }
 
diff --git a/src/mesa/state_tracker/st_atom_viewport.c b/src/mesa/state_tracker/st_atom_viewport.c
index 27ec2eb0331..b82bbfe410f 100644
--- a/src/mesa/state_tracker/st_atom_viewport.c
+++ b/src/mesa/state_tracker/st_atom_viewport.c
@@ -62,9 +62,9 @@ update_viewport( struct st_context *st )
       GLfloat x = (GLfloat)ctx->Viewport.X;
       GLfloat y = (GLfloat)ctx->Viewport.Y;
       GLfloat z = ctx->Viewport.Near;
-      GLfloat half_width = (GLfloat)ctx->Viewport.Width / 2.0f;
-      GLfloat half_height = (GLfloat)ctx->Viewport.Height / 2.0f;
-      GLfloat half_depth = (GLfloat)(ctx->Viewport.Far - ctx->Viewport.Near) / 2.0f;
+      GLfloat half_width = (GLfloat)ctx->Viewport.Width * 0.5f;
+      GLfloat half_height = (GLfloat)ctx->Viewport.Height * 0.5f;
+      GLfloat half_depth = (GLfloat)(ctx->Viewport.Far - ctx->Viewport.Near) * 0.5f;
       
       st->state.viewport.scale[0] = half_width;
       st->state.viewport.scale[1] = half_height * yScale;
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index c70bbc880c0..85420a950f4 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -221,7 +221,7 @@ combined_bitmap_fragment_program(GLcontext *ctx)
 #endif
 
       /* translate to TGSI tokens */
-      st_translate_fragment_program(st, stfp->bitmap_program, NULL);
+      st_translate_fragment_program(st, stfp->bitmap_program);
    }
 
    return stfp->bitmap_program;
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 9b1d2103f17..2a084ca5779 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -140,7 +140,7 @@ combined_drawpix_fragment_program(GLcontext *ctx)
 #endif
 
       /* translate to TGSI tokens */
-      st_translate_fragment_program(st, stfp, NULL);
+      st_translate_fragment_program(st, stfp);
 
       /* save new program, update serial numbers */
       st->pixel_xfer.xfer_prog_sn = st->pixel_xfer.program->serialNo;
@@ -221,7 +221,7 @@ make_fragment_shader_z(struct st_context *st)
    p->SamplersUsed = 0x1;  /* sampler 0 (bit 0) is used */
 
    st->drawpix.z_shader = (struct st_fragment_program *) p;
-   st_translate_fragment_program(st, st->drawpix.z_shader, NULL);
+   st_translate_fragment_program(st, st->drawpix.z_shader);
 
    return st->drawpix.z_shader;
 }
diff --git a/src/mesa/state_tracker/st_cb_strings.c b/src/mesa/state_tracker/st_cb_strings.c
index 996e065fedc..0fcb427f30a 100644
--- a/src/mesa/state_tracker/st_cb_strings.c
+++ b/src/mesa/state_tracker/st_cb_strings.c
@@ -39,7 +39,7 @@
 #include "st_context.h"
 #include "st_cb_strings.h"
 
-#define ST_VERSION_STRING "0.3"
+#define ST_VERSION_STRING "0.4"
 
 static const GLubyte *
 st_get_string(GLcontext * ctx, GLenum name)
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 7c5664be1d8..13f050900a6 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -543,22 +543,15 @@ st_TexImage(GLcontext * ctx,
       _mesa_align_free(texImage->Data);
    }
 
-   if (width == 0 || height == 0 || depth == 0) {
-      /* stop after freeing old image */
-      return;
-   }
-
-   /* If this is the only mipmap level in the texture, could call
-    * bmBufferData with NULL data to free the old block and avoid
-    * waiting on any outstanding fences.
+   /*
+    * See if the new image is somehow incompatible with the existing
+    * mipmap.  If so, free the old mipmap.
     */
    if (stObj->pt) {
       if (stObj->teximage_realloc ||
           level > (GLint) stObj->pt->last_level ||
-          (stObj->pt->last_level == level &&
-           stObj->pt->target != PIPE_TEXTURE_CUBE &&
-           !st_texture_match_image(stObj->pt, &stImage->base,
-                                   stImage->face, stImage->level))) {
+          !st_texture_match_image(stObj->pt, &stImage->base,
+                                  stImage->face, stImage->level)) {
          DBG("release it\n");
          pipe_texture_reference(&stObj->pt, NULL);
          assert(!stObj->pt);
@@ -566,6 +559,11 @@ st_TexImage(GLcontext * ctx,
       }
    }
 
+   if (width == 0 || height == 0 || depth == 0) {
+      /* stop after freeing old image */
+      return;
+   }
+
    if (!stObj->pt) {
       guess_and_alloc_texture(ctx->st, stObj, stImage);
       if (!stObj->pt) {
diff --git a/src/mesa/state_tracker/st_framebuffer.c b/src/mesa/state_tracker/st_framebuffer.c
index ed9c0ff5b76..4e225a123c8 100644
--- a/src/mesa/state_tracker/st_framebuffer.c
+++ b/src/mesa/state_tracker/st_framebuffer.c
@@ -54,7 +54,7 @@ st_create_framebuffer( const __GLcontextModes *visual,
       if (visual->sampleBuffers)
          samples = visual->samples;
 
-      _mesa_initialize_framebuffer(&stfb->Base, visual);
+      _mesa_initialize_window_framebuffer(&stfb->Base, visual);
 
       if (visual->doubleBufferMode) {
          struct gl_renderbuffer *rb
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 5c87e47ca3d..a639003dbd0 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -269,24 +269,20 @@ fail:
 
 /**
  * Translate a Mesa fragment shader into a TGSI shader.
- * \param inputMapping  to map fragment program input registers to TGSI
- *                      input slots
  * \return  pointer to cached pipe_shader object.
  */
 void
 st_translate_fragment_program(struct st_context *st,
-                              struct st_fragment_program *stfp,
-                              const GLuint inputMapping[])
+                              struct st_fragment_program *stfp )
 {
    struct pipe_context *pipe = st->pipe;
    GLuint outputMapping[FRAG_RESULT_MAX];
-   GLuint defaultInputMapping[FRAG_ATTRIB_MAX];
+   GLuint inputMapping[FRAG_ATTRIB_MAX];
    GLuint interpMode[16];  /* XXX size? */
    GLuint attr;
    enum pipe_error error;
    const GLbitfield inputsRead = stfp->Base.Base.InputsRead;
    struct ureg_program *ureg;
-   GLuint vslot = 0;
 
    uint fs_num_inputs = 0;
 
@@ -294,24 +290,14 @@ st_translate_fragment_program(struct st_context *st,
    ubyte fs_output_semantic_index[PIPE_MAX_SHADER_OUTPUTS];
    uint fs_num_outputs = 0;
 
-   /* which vertex output goes to the first fragment input: */
-   if (inputsRead & FRAG_BIT_WPOS)
-      vslot = 0;
-   else
-      vslot = 1;
-
    /*
     * Convert Mesa program inputs to TGSI input register semantics.
     */
    for (attr = 0; attr < FRAG_ATTRIB_MAX; attr++) {
       if (inputsRead & (1 << attr)) {
-         const GLuint slot = fs_num_inputs;
-
-         defaultInputMapping[attr] = slot;
-
-         stfp->input_map[slot] = vslot++;
+         const GLuint slot = fs_num_inputs++;
 
-         fs_num_inputs++;
+         inputMapping[attr] = slot;
 
          switch (attr) {
          case FRAG_ATTRIB_WPOS:
@@ -376,6 +362,9 @@ st_translate_fragment_program(struct st_context *st,
             break;
          }
       }
+      else {
+	 inputMapping[attr] = -1;
+      }
    }
 
    /*
@@ -417,9 +406,6 @@ st_translate_fragment_program(struct st_context *st,
       }
    }
 
-   if (!inputMapping)
-      inputMapping = defaultInputMapping;
-
    ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
    if (ureg == NULL)
       return;
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index 6b9a9226df5..d9822e50f55 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -52,12 +52,6 @@ struct st_fragment_program
    struct gl_fragment_program Base;
    GLuint serialNo;
 
-   GLuint input_to_slot[FRAG_ATTRIB_MAX];  /**< Maps FRAG_ATTRIB_x to slot */
-   GLuint num_input_slots;
-
-   /** map FP input back to VP output */
-   GLuint input_map[PIPE_MAX_SHADER_INPUTS];
-
    ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS];
    ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS];
 
@@ -171,8 +165,7 @@ st_reference_fragprog(struct st_context *st,
 
 extern void
 st_translate_fragment_program(struct st_context *st,
-                              struct st_fragment_program *fp,
-                              const GLuint inputMapping[]);
+                              struct st_fragment_program *fp);
 
 
 /* Called after program string change, discard all previous
diff --git a/src/mesa/vbo/vbo_split_inplace.c b/src/mesa/vbo/vbo_split_inplace.c
index da84eaa6ead..2fc866c5773 100644
--- a/src/mesa/vbo/vbo_split_inplace.c
+++ b/src/mesa/vbo/vbo_split_inplace.c
@@ -30,12 +30,15 @@
 #include "main/mtypes.h"
 #include "main/macros.h"
 #include "main/enums.h"
+#include "main/image.h"
 #include "vbo_split.h"
 
 
 #define MAX_PRIM 32
 
-/* Used for splitting without copying.
+/* Used for splitting without copying.  No attempt is made to handle
+ * indexed vertex buffers that are too large: in general that requires
+ * copying.
  */
 struct split_context {
    GLcontext *ctx;
@@ -48,6 +51,7 @@ struct split_context {
    vbo_draw_func draw;
 
    const struct split_limits *limits;
+   GLuint limit;
 
    struct _mesa_prim dstprim[MAX_PRIM];
    GLuint dstprim_nr;
@@ -58,38 +62,38 @@ struct split_context {
 
 static void flush_vertex( struct split_context *split )
 {
-   GLuint min_index, max_index;
+   struct _mesa_index_buffer ib;
    GLuint i;
 
    if (!split->dstprim_nr) 
       return;
 
-   min_index = split->dstprim[0].start;
-   max_index = min_index + split->dstprim[0].count - 1;
+   if (split->ib) {
+      ib = *split->ib;
 
-   for (i = 1; i < split->dstprim_nr; i++) {
-      GLuint tmp_min = split->dstprim[i].start;
-      GLuint tmp_max = tmp_min + split->dstprim[i].count - 1;
+      ib.count = split->max_index - split->min_index + 1;
+      ib.ptr = (const void *)((const char *)ib.ptr + 
+                              split->min_index * _mesa_sizeof_type(ib.type));
 
-      if (tmp_min < min_index)
-	 min_index = tmp_min;
-
-      if (tmp_max > max_index)
-	 max_index = tmp_max;
+      /* Rebase the primitives to save index buffer entries. */
+      for (i = 0; i < split->dstprim_nr; i++)
+	 split->dstprim[i].start -= split->min_index;
    }
 
-   assert(max_index >= min_index);
+   assert(split->max_index >= split->min_index);
 
-   split->draw( split->ctx, 
-		split->array, 
-		split->dstprim,
-		split->dstprim_nr,
-		NULL,
-		GL_TRUE,
-		min_index,
-		max_index);
+   split->draw(split->ctx,
+	       split->array,
+	       split->dstprim,
+	       split->dstprim_nr,
+	       split->ib ? &ib : NULL,
+	       !split->ib,
+	       split->min_index,
+	       split->max_index);
 
    split->dstprim_nr = 0;
+   split->min_index = ~0;
+   split->max_index = 0;
 }
 
 
@@ -106,62 +110,67 @@ static struct _mesa_prim *next_outprim( struct split_context *split )
    }
 }
 
-static int align(int value, int alignment)
+static void update_index_bounds(struct split_context *split,
+				const struct _mesa_prim *prim)
 {
-   return (value + alignment - 1) & ~(alignment - 1);
+   split->min_index = MIN2(split->min_index, prim->start);
+   split->max_index = MAX2(split->max_index, prim->start + prim->count - 1);
 }
 
-
+/* Return how many vertices can be emitted for a primitive starting at
+ * 'prim->start', given the index bounds accumulated so far; 0 if the
+ * start is already split->limit or more away from either bound. */
+static GLuint get_max_vertices(struct split_context *split,
+			       const struct _mesa_prim *prim)
+{
+   /* Starts too far above the lowest index seen so far? */
+   if (prim->start > split->min_index &&
+       prim->start - split->min_index >= split->limit)
+      return 0;
+   /* Starts too far below the highest index seen so far? */
+   if (prim->start < split->max_index &&
+       split->max_index - prim->start >= split->limit)
+      return 0;
+   return MIN2(split->min_index, prim->start) + split->limit - prim->start;
+}
 
 /* Break large primitives into smaller ones.  If not possible, convert
  * the primitive to indexed and pass to split_elts().
  */
 static void split_prims( struct split_context *split) 
 {
-   GLuint csr = 0;
    GLuint i;
 
    for (i = 0; i < split->nr_prims; i++) {
       const struct _mesa_prim *prim = &split->prim[i];
       GLuint first, incr;
       GLboolean split_inplace = split_prim_inplace(prim->mode, &first, &incr);
-      GLuint count;
-
-      /* Always wrap on an even numbered vertex to avoid problems with
-       * triangle strips.  
-       */
-      GLuint available = align(split->limits->max_verts - csr - 1, 2); 
-      assert(split->limits->max_verts >= csr);
+      GLuint available = get_max_vertices(split, prim);
+      GLuint count = prim->count - (prim->count - first) % incr;
 
       if (prim->count < first)
 	 continue;
-      
-      count = prim->count - (prim->count - first) % incr; 
 
-
-      if ((available < count && !split_inplace) || 
+      if ((available < count && !split_inplace) ||
 	  (available < first && split_inplace)) {
 	 flush_vertex(split);
-	 csr = 0;
-	 available = align(split->limits->max_verts - csr - 1, 2);
+	 available = get_max_vertices(split, prim);
       }
       
       if (available >= count) {
 	 struct _mesa_prim *outprim = next_outprim(split);
+
 	 *outprim = *prim;
-	 csr += prim->count;
-	 available = align(split->limits->max_verts - csr - 1, 2); 
-      } 
+	 update_index_bounds(split, outprim);
+      }
       else if (split_inplace) {
 	 GLuint j, nr;
 
-
 	 for (j = 0 ; j < count ; ) {
 	    GLuint remaining = count - j;
 	    struct _mesa_prim *outprim = next_outprim(split);
 
 	    nr = MIN2( available, remaining );
-	    
 	    nr -= (nr - first) % incr;
 	    
 	    outprim->mode = prim->mode;
@@ -169,21 +178,20 @@ static void split_prims( struct split_context *split)
 	    outprim->end = (nr == remaining && prim->end);
 	    outprim->start = prim->start + j;
 	    outprim->count = nr;
-	    
+
+	    update_index_bounds(split, outprim);
+
 	    if (nr == remaining) {
 	       /* Finished. 
 		*/
-	       j += nr;		
-	       csr += nr;
-	       available = align(split->limits->max_verts - csr - 1, 2); 
+	       j += nr;
 	    }
 	    else {
 	       /* Wrapped the primitive: 
 		*/
 	       j += nr - (first - incr);
 	       flush_vertex(split);
-	       csr = 0;
-	       available = align(split->limits->max_verts - csr - 1, 2); 
+	       available = get_max_vertices(split, prim);
 	    }
 	 }
       }
@@ -260,10 +268,14 @@ void vbo_split_inplace( GLcontext *ctx,
    split.prim = prim;
    split.nr_prims = nr_prims;
    split.ib = ib;
-   split.min_index = min_index;
-   split.max_index = max_index;
+
+   /* Empty interval, makes calculations simpler. */
+   split.min_index = ~0;
+   split.max_index = 0;
+
    split.draw = draw;
    split.limits = limits;
+   split.limit = ib ? limits->max_indices : limits->max_verts;
 
    split_prims( &split );
 }