173 files changed, 4616 insertions, 4373 deletions
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index 08d731de2d5..cc4ad09fa33 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -16,7 +16,6 @@ if env['platform'] == 'windows':
     env.Append(CPPDEFINES = [
         '_GDI32_', # prevent gl* being declared __declspec(dllimport) in MS headers
         'BUILD_GL32', # declare gl* as __declspec(dllexport) in Mesa headers
-        'WIN32_THREADS', # use Win32 thread API
     ])
     env.Prepend(CPPPATH = ['#src/talloc'])
 else:
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index ba8be125718..cdb2500f7c2 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -266,13 +266,16 @@ struct gen_mipmap_state
    GLuint FBO;
 };
 
-
+#define MAX_META_OPS_DEPTH      2
 /**
  * All per-context meta state.
  */
 struct gl_meta_state
 {
-   struct save_state Save;    /**< state saved during meta-ops */
+   /** Stack of state saved during meta-ops */
+   struct save_state Save[MAX_META_OPS_DEPTH];
+   /** Save stack depth */
+   GLuint SaveStackDepth;
 
    struct temp_texture TempTex;
 
@@ -324,8 +327,13 @@ _mesa_meta_free(struct gl_context *ctx)
 static void
 _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 {
-   struct save_state *save = &ctx->Meta->Save;
+   struct save_state *save;
+
+   /* hope MAX_META_OPS_DEPTH is large enough */
+   assert(ctx->Meta->SaveStackDepth < MAX_META_OPS_DEPTH);
 
+   save = &ctx->Meta->Save[ctx->Meta->SaveStackDepth++];
+   memset(save, 0, sizeof(*save));
    save->SavedState = state;
 
    if (state & META_ALPHA_TEST) {
@@ -575,7 +583,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 static void
 _mesa_meta_end(struct gl_context *ctx)
 {
-   struct save_state *save = &ctx->Meta->Save;
+   struct save_state *save = &ctx->Meta->Save[--ctx->Meta->SaveStackDepth];
    const GLbitfield state = save->SavedState;
 
    if (state & META_ALPHA_TEST) {
@@ -1398,6 +1406,7 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
    struct vertex verts[4];
    /* save all state but scissor, pixel pack/unpack */
    GLbitfield metaSave = META_ALL - META_SCISSOR - META_PIXEL_STORE;
+   const GLuint stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
 
    if (buffers & BUFFER_BITS_COLOR) {
       /* if clearing color buffers, don't save/restore colormask */
@@ -1453,7 +1462,7 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
       _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
                               GL_REPLACE, GL_REPLACE, GL_REPLACE);
       _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
-                                ctx->Stencil.Clear & 0x7fffffff,
+                                ctx->Stencil.Clear & stencilMax,
                                 ctx->Stencil.WriteMask[0]);
    }
    else {
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index f943f81dd05..f32f3cf6020 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -176,6 +176,7 @@ i915CreateContext(int api,
    ctx->ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitCondCodes = GL_TRUE;
    ctx->ShaderCompilerOptions[MESA_SHADER_FRAGMENT].EmitNoIfs = GL_TRUE;
    ctx->ShaderCompilerOptions[MESA_SHADER_FRAGMENT].EmitNoNoise = GL_TRUE;
+   ctx->ShaderCompilerOptions[MESA_SHADER_FRAGMENT].EmitNoPow = GL_TRUE;
 
    ctx->Const.MaxDrawBuffers = 1;
 
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index c00ee415b6b..7a9fb7f088b 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -569,10 +569,14 @@ upload_program(struct i915_fragment_program *p)
 	 if (inst->DstReg.CondMask == COND_TR) {
 	    tmp = i915_get_utemp(p);
 
+	    /* The KIL instruction discards the fragment if any component of
+	     * the source is < 0.  Emit an immediate operand of {-1}.xywz.
+	     */
 	    i915_emit_texld(p, get_live_regs(p, inst),
 			    tmp, A0_DEST_CHANNEL_ALL,
 			    0, /* use a dummy dest reg */
-			    swizzle(tmp, ONE, ONE, ONE, ONE), /* always */
+			    negate(swizzle(tmp, ONE, ONE, ONE, ONE),
+				   1, 1, 1, 1),
 			    T0_TEXKILL);
 	 } else {
 	    p->error = 1;
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index e3ca863fe51..7c3ac0c14ef 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -81,7 +81,6 @@ DRIVER_SOURCES = \
 	brw_wm_emit.c \
 	brw_wm_fp.c \
 	brw_wm_iz.c \
-	brw_wm_glsl.c \
 	brw_wm_pass0.c \
 	brw_wm_pass1.c \
 	brw_wm_pass2.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index a8369b07c35..d3a1233aac0 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -232,3 +232,28 @@ const struct brw_tracked_state brw_cc_unit = {
    .prepare = prepare_cc_unit,
    .emit = upload_cc_unit,
 };
+
+static void upload_blend_constant_color(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->intel.ctx;
+   struct brw_blend_constant_color bcc;
+
+   memset(&bcc, 0, sizeof(bcc));
+   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   bcc.header.length = sizeof(bcc)/4-2;
+   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
+   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
+   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
+   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
+
+   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
+}
+
+const struct brw_tracked_state brw_blend_constant_color = {
+   .dirty = {
+      .mesa = _NEW_COLOR,
+      .brw = BRW_NEW_CONTEXT,
+      .cache = 0
+   },
+   .emit = upload_blend_constant_color
+};
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index cb0a8b96c9c..28549f2574a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -122,9 +122,6 @@ GLboolean brwCreateContext( int api,
 	 (i == MESA_SHADER_FRAGMENT);
       ctx->ShaderCompilerOptions[i].EmitNoIndirectTemp =
 	 (i == MESA_SHADER_FRAGMENT);
-
-      if (intel->gen == 6)
-	 ctx->ShaderCompilerOptions[i].EmitNoIfs = (i == MESA_SHADER_VERTEX);
    }
 
    ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 335339515a2..7069724466a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -171,7 +171,6 @@ struct brw_vertex_program {
 struct brw_fragment_program {
    struct gl_fragment_program program;
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -211,6 +210,7 @@ struct brw_wm_prog_data {
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    GLboolean error;
+   int dispatch_width;
 
    /* Pointer to tracked values (only valid once
     * _mesa_load_state_parameters has been called at runtime).
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 7b823eb201b..877b22fec19 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -242,21 +242,13 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (vp->use_const_buffer) {
-	 /* Load the subset of push constants that will get used when
-	  * we also have a pull constant buffer.
-	  */
-	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	    if (brw->vs.constant_map[i] != -1) {
-	       assert(brw->vs.constant_map[i] <= nr);
-	       memcpy(buf + offset + brw->vs.constant_map[i] * 4,
-		      vp->program.Base.Parameters->ParameterValues[i],
-		      4 * sizeof(float));
-	    }
-	 }
-      } else {
-	 for (i = 0; i < nr; i++) {
-	    memcpy(buf + offset + i * 4,
+      /* Load the subset of push constants that will get used when
+       * we also have a pull constant buffer.
+       */
+      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	 if (brw->vs.constant_map[i] != -1) {
+	    assert(brw->vs.constant_map[i] <= nr);
+	    memcpy(buf + offset + brw->vs.constant_map[i] * 4,
 		   vp->program.Base.Parameters->ParameterValues[i],
 		   4 * sizeof(float));
 	 }
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 239586a0366..7f3e4986808 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -462,6 +462,13 @@
 #define BRW_COMPRESSION_2NDHALF       1
 #define BRW_COMPRESSION_COMPRESSED    2
 
+#define GEN6_COMPRESSION_1Q		0
+#define GEN6_COMPRESSION_2Q		1
+#define GEN6_COMPRESSION_3Q		2
+#define GEN6_COMPRESSION_4Q		3
+#define GEN6_COMPRESSION_1H		0
+#define GEN6_COMPRESSION_2H		2
+
 #define BRW_CONDITIONAL_NONE  0
 #define BRW_CONDITIONAL_Z     1
 #define BRW_CONDITIONAL_NZ    2
@@ -1022,6 +1029,13 @@
 # define ATTRIBUTE_0_CONST_SOURCE_SHIFT			9
 # define ATTRIBUTE_0_SWIZZLE_SHIFT			6
 # define ATTRIBUTE_0_SOURCE_SHIFT			0
+
+# define ATTRIBUTE_SWIZZLE_INPUTATTR                    0
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING             1
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_W                  2
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING_W           3
+# define ATTRIBUTE_SWIZZLE_SHIFT                        6
+
 /* DW16: Point sprite texture coordinate enables */
 /* DW17: Constant interpolation enables */
 /* DW18: attr 0-7 wrap shortest enables */
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 962c04128b8..6b61f7af15d 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -899,7 +899,8 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	err |= dest (file, inst);
     } else if (gen >= 6 && (inst->header.opcode == BRW_OPCODE_IF ||
 			    inst->header.opcode == BRW_OPCODE_ELSE ||
-			    inst->header.opcode == BRW_OPCODE_ENDIF)) {
+			    inst->header.opcode == BRW_OPCODE_ENDIF ||
+			    inst->header.opcode == BRW_OPCODE_WHILE)) {
        format (file, " %d", inst->bits1.branch_gen6.jump_count);
     }
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index 2ff39e8e64a..3b5c4c071e3 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -72,7 +72,37 @@ void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
 
 void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control )
 {
-   p->current->header.compression_control = compression_control;
+   p->compressed = (compression_control == BRW_COMPRESSION_COMPRESSED);
+
+   if (p->brw->intel.gen >= 6) {
+      /* Since we don't use the 32-wide support in gen6, we translate
+       * the pre-gen6 compression control here.
+       */
+      switch (compression_control) {
+      case BRW_COMPRESSION_NONE:
+	 /* This is the "use the first set of bits of dmask/vmask/arf
+	  * according to execsize" option.
+	  */
+	 p->current->header.compression_control = GEN6_COMPRESSION_1Q;
+	 break;
+      case BRW_COMPRESSION_2NDHALF:
+	 /* For 8-wide, this is "use the second set of 8 bits." */
+	 p->current->header.compression_control = GEN6_COMPRESSION_2Q;
+	 break;
+      case BRW_COMPRESSION_COMPRESSED:
+	 /* For 16-wide instruction compression, use the first set of 16 bits
+	  * since we don't do 32-wide dispatch.
+	  */
+	 p->current->header.compression_control = GEN6_COMPRESSION_1H;
+	 break;
+      default:
+	 assert(!"not reached");
+	 p->current->header.compression_control = GEN6_COMPRESSION_1H;
+	 break;
+      }
+   } else {
+      p->current->header.compression_control = compression_control;
+   }
 }
 
 void brw_set_mask_control( struct brw_compile *p, GLuint value )
@@ -95,6 +125,7 @@ void brw_push_insn_state( struct brw_compile *p )
 {
    assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
    memcpy(p->current+1, p->current, sizeof(struct brw_instruction));
+   p->compressed_stack[p->current - p->stack] = p->compressed;
    p->current++;   
 }
 
@@ -102,6 +133,7 @@ void brw_pop_insn_state( struct brw_compile *p )
 {
    assert(p->current != p->stack);
    p->current--;
+   p->compressed = p->compressed_stack[p->current - p->stack];
 }
 
 
@@ -112,6 +144,7 @@ void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
    p->brw = brw;
    p->nr_insn = 0;
    p->current = p->stack;
+   p->compressed = false;
    memset(p->current, 0, sizeof(p->current[0]));
 
    /* Some defaults?
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index b4538e6e8a7..4dbdc522100 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -33,6 +33,7 @@
 #ifndef BRW_EU_H
 #define BRW_EU_H
 
+#include <stdbool.h>
 #include "brw_structs.h"
 #include "brw_defines.h"
 #include "program/prog_instruction.h"
@@ -106,10 +107,12 @@ struct brw_compile {
    /* Allow clients to push/pop instruction state:
     */
    struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
+   bool compressed_stack[BRW_EU_MAX_INSN_STACK];
    struct brw_instruction *current;
 
    GLuint flag_value;
    GLboolean single_program_flow;
+   bool compressed;
    struct brw_context *brw;
 
    struct brw_glsl_label *first_label;  /**< linked list of labels */
@@ -954,6 +957,8 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
 	       struct brw_instruction *patch_insn);
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count);
+struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
+				      struct brw_instruction *do_insn);
 struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count);
 /* Forward jumps:
  */
@@ -1009,6 +1014,7 @@ void brw_math_invert( struct brw_compile *p,
 void brw_set_src1( struct brw_instruction *insn,
                           struct brw_reg reg );
 
+void brw_set_uip_jip(struct brw_compile *p);
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 9cb941dacfd..9c764fe779d 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -41,19 +41,20 @@
  * Internal helper for constructing instructions
  */
 
-static void guess_execution_size( struct brw_instruction *insn,
-				  struct brw_reg reg )
+static void guess_execution_size(struct brw_compile *p,
+				 struct brw_instruction *insn,
+				 struct brw_reg reg)
 {
-   if (reg.width == BRW_WIDTH_8 && 
-       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) 
+   if (reg.width == BRW_WIDTH_8 && p->compressed)
       insn->header.execution_size = BRW_EXECUTE_16;
    else
       insn->header.execution_size = reg.width;	/* note - definitions are compatible */
 }
 
 
-static void brw_set_dest( struct brw_instruction *insn,
-			  struct brw_reg dest )
+static void brw_set_dest(struct brw_compile *p,
+			 struct brw_instruction *insn,
+			 struct brw_reg dest)
 {
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
@@ -100,7 +101,7 @@ static void brw_set_dest( struct brw_instruction *insn,
    /* NEW: Set the execution size based on dest.width and
     * insn->compression_control:
     */
-   guess_execution_size(insn, dest);
+   guess_execution_size(p, insn, dest);
 }
 
 extern int reg_type_size[];
@@ -629,7 +630,7 @@ static struct brw_instruction *brw_alu1( struct brw_compile *p,
 					 struct brw_reg src )
 {
    struct brw_instruction *insn = next_insn(p, opcode);
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src);   
    return insn;
 }
@@ -641,7 +642,7 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
 					struct brw_reg src1 )
 {
    struct brw_instruction *insn = next_insn(p, opcode);   
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
    return insn;
@@ -680,7 +681,7 @@ void brw_##OP(struct brw_compile *p,					      \
 {									      \
    struct brw_instruction *rnd, *add;					      \
    rnd = next_insn(p, BRW_OPCODE_##OP);					      \
-   brw_set_dest(rnd, dest);						      \
+   brw_set_dest(p, rnd, dest);						      \
    brw_set_src0(rnd, src);						      \
    rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
 									      \
@@ -779,7 +780,7 @@ struct brw_instruction *brw_MUL(struct brw_compile *p,
 void brw_NOP(struct brw_compile *p)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);   
-   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src1(insn, brw_imm_ud(0x0));
 }
@@ -840,11 +841,11 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
    /* Override the defaults for this instruction:
     */
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
+      brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(insn, brw_ip_reg());
       brw_set_src1(insn, brw_imm_d(0x0));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -870,7 +871,7 @@ brw_IF_gen6(struct brw_compile *p, uint32_t conditional,
 
    insn = next_insn(p, BRW_OPCODE_IF);
 
-   brw_set_dest(insn, brw_imm_w(0));
+   brw_set_dest(p, insn, brw_imm_w(0));
    insn->header.execution_size = BRW_EXECUTE_8;
    insn->bits1.branch_gen6.jump_count = 0;
    brw_set_src0(insn, src0);
@@ -905,11 +906,11 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    }
 
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
+      brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(insn, brw_ip_reg());
       brw_set_src1(insn, brw_imm_d(0x0));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -965,11 +966,11 @@ void brw_ENDIF(struct brw_compile *p,
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
 
       if (intel->gen < 6) {
-	 brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src1(insn, brw_imm_d(0x0));
       } else {
-	 brw_set_dest(insn, brw_imm_w(0));
+	 brw_set_dest(p, insn, brw_imm_w(0));
 	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       }
@@ -1029,16 +1030,44 @@ void brw_ENDIF(struct brw_compile *p,
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
+
    insn = next_insn(p, BRW_OPCODE_BREAK);
-   brw_set_dest(insn, brw_ip_reg());
+   if (intel->gen >= 6) {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(insn, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(insn, brw_ip_reg());
+      brw_set_src1(insn, brw_imm_d(0x0));
+      insn->bits3.if_else.pad0 = 0;
+      insn->bits3.if_else.pop_count = pop_count;
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+
+   return insn;
+}
+
+struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
+				      struct brw_instruction *do_insn)
+{
+   struct brw_instruction *insn;
+   int br = 2;
+
+   insn = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_dest(p, insn, brw_ip_reg());
    brw_set_src0(insn, brw_ip_reg());
    brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->bits3.break_cont.uip = br * (do_insn - insn);
+
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.execution_size = BRW_EXECUTE_8;
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   insn->bits3.if_else.pad0 = 0;
-   insn->bits3.if_else.pop_count = pop_count;
    return insn;
 }
 
@@ -1046,7 +1075,7 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 {
    struct brw_instruction *insn;
    insn = next_insn(p, BRW_OPCODE_CONTINUE);
-   brw_set_dest(insn, brw_ip_reg());
+   brw_set_dest(p, insn, brw_ip_reg());
    brw_set_src0(insn, brw_ip_reg());
    brw_set_src1(insn, brw_imm_d(0x0));
    insn->header.compression_control = BRW_COMPRESSION_NONE;
@@ -1058,17 +1087,33 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 }
 
 /* DO/WHILE loop:
+ *
+ * The DO/WHILE is just an unterminated loop -- break or continue are
+ * used for control within the loop.  We have a few ways they can be
+ * done.
+ *
+ * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
+ * jip and no DO instruction.
+ *
+ * For non-uniform control flow pre-gen6, there's a DO instruction to
+ * push the mask, and a WHILE to jump back, and BREAK to get out and
+ * pop the mask.
+ *
+ * For gen6, there's no more mask stack, so no need for DO.  WHILE
+ * just points back to the first instruction of the loop.
  */
 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 {
-   if (p->single_program_flow) {
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6 || p->single_program_flow) {
       return &p->store[p->nr_insn];
    } else {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
 
       /* Override the defaults for this instruction:
        */
-      brw_set_dest(insn, brw_null_reg());
+      brw_set_dest(p, insn, brw_null_reg());
       brw_set_src0(insn, brw_null_reg());
       brw_set_src1(insn, brw_null_reg());
 
@@ -1094,34 +1139,42 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
    if (intel->gen >= 5)
       br = 2;
 
-   if (p->single_program_flow)
-      insn = next_insn(p, BRW_OPCODE_ADD);
-   else
+   if (intel->gen >= 6) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
 
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+      brw_set_dest(p, insn, brw_imm_w(0));
+      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
+      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = do_insn->header.execution_size;
+      assert(insn->header.execution_size == BRW_EXECUTE_8);
+   } else {
+      if (p->single_program_flow) {
+	 insn = next_insn(p, BRW_OPCODE_ADD);
 
-   if (p->single_program_flow) {
-      insn->header.execution_size = BRW_EXECUTE_1;
+	 brw_set_dest(p, insn, brw_ip_reg());
+	 brw_set_src0(insn, brw_ip_reg());
+	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
+	 insn->header.execution_size = BRW_EXECUTE_1;
+      } else {
+	 insn = next_insn(p, BRW_OPCODE_WHILE);
 
-      insn->bits3.d = (do_insn - insn) * 16;
-   } else {
-      insn->header.execution_size = do_insn->header.execution_size;
+	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
 
-      assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
-      insn->bits3.if_else.pop_count = 0;
-      insn->bits3.if_else.pad0 = 0;
-   }
+	 brw_set_dest(p, insn, brw_ip_reg());
+	 brw_set_src0(insn, brw_ip_reg());
+	 brw_set_src1(insn, brw_imm_d(0));
 
-/*    insn->header.mask_control = BRW_MASK_ENABLE; */
+	 insn->header.execution_size = do_insn->header.execution_size;
+	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
+	 insn->bits3.if_else.pop_count = 0;
+	 insn->bits3.if_else.pad0 = 0;
+      }
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;   
    return insn;
 }
 
@@ -1159,7 +1212,7 @@ void brw_CMP(struct brw_compile *p,
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
    insn->header.destreg__conditionalmod = conditional;
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
 
@@ -1184,7 +1237,7 @@ void brw_WAIT (struct brw_compile *p)
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
    struct brw_reg src = brw_notification_1_reg();
 
-   brw_set_dest(insn, src);
+   brw_set_dest(p, insn, src);
    brw_set_src0(insn, src);
    brw_set_src1(insn, brw_null_reg());
    insn->header.execution_size = 0; /* must */
@@ -1219,6 +1272,10 @@ void brw_math( struct brw_compile *p,
       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
+
       if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
 	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
 	 assert(src.type == BRW_REGISTER_TYPE_F);
@@ -1228,8 +1285,9 @@ void brw_math( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_src1(insn, brw_null_reg());
    } else {
@@ -1242,7 +1300,7 @@ void brw_math( struct brw_compile *p,
       insn->header.predicate_control = 0;
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_math_message(p->brw,
 			   insn,
@@ -1284,12 +1342,18 @@ void brw_math2(struct brw_compile *p,
       assert(src1.type == BRW_REGISTER_TYPE_F);
    }
 
+   /* Source modifiers are ignored for extended math instructions. */
+   assert(!src0.negate);
+   assert(!src0.abs);
+   assert(!src1.negate);
+   assert(!src1.abs);
+
    /* Math is the same ISA format as other opcodes, except that CondModifier
     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
     */
    insn->header.destreg__conditionalmod = function;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
 }
@@ -1318,8 +1382,13 @@ void brw_math_16( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
+
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_src1(insn, brw_null_reg());
       return;
@@ -1334,7 +1403,7 @@ void brw_math_16( struct brw_compile *p,
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src);
    brw_set_math_message(p->brw,
 			insn, 
@@ -1351,7 +1420,7 @@ void brw_math_16( struct brw_compile *p,
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
    insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
-   brw_set_dest(insn, offset(dest,1));
+   brw_set_dest(p, insn, offset(dest,1));
    brw_set_src0(insn, src);
    brw_set_math_message(p->brw, 
 			insn, 
@@ -1446,7 +1515,7 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
 	 send_commit_msg = 1;
       }
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
       brw_set_dp_write_message(p->brw,
@@ -1516,7 +1585,7 @@ brw_oword_block_read_scratch(struct brw_compile *p,
       insn->header.compression_control = BRW_COMPRESSION_NONE;
       insn->header.destreg__conditionalmod = mrf.nr;
 
-      brw_set_dest(insn, dest);	/* UW? */
+      brw_set_dest(p, insn, dest);	/* UW? */
       brw_set_src0(insn, brw_null_reg());
 
       brw_set_dp_read_message(p->brw,
@@ -1569,7 +1638,7 @@ void brw_oword_block_read(struct brw_compile *p,
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    if (intel->gen >= 6) {
       brw_set_src0(insn, mrf);
    } else {
@@ -1614,7 +1683,7 @@ void brw_dword_scattered_read(struct brw_compile *p,
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, brw_null_reg());
 
    brw_set_dp_read_message(p->brw,
@@ -1639,29 +1708,21 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
                       GLuint location,
                       GLuint bind_table_index)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_reg_nr = 1;
-   struct brw_reg b;
 
-   /*
-   printf("vs const read msg, location %u, msg_reg_nr %d\n",
-          location, msg_reg_nr);
-   */
+   if (intel->gen >= 6)
+      location /= 16;
 
    /* Setup MRF[1] with location/offset into const buffer */
    brw_push_insn_state(p);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-    */
-   b = brw_message_reg(msg_reg_nr);
-   b = retype(b, BRW_REGISTER_TYPE_UD);
-   /*b = get_element_ud(b, 2);*/
-   brw_MOV(p, b, brw_imm_ud(location));
-
+   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
+		     BRW_REGISTER_TYPE_UD),
+	   brw_imm_ud(location));
    brw_pop_insn_state(p);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1671,8 +1732,12 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    insn->header.destreg__conditionalmod = msg_reg_nr;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, brw_null_reg());
+   brw_set_dest(p, insn, dest);
+   if (intel->gen >= 6) {
+      brw_set_src0(insn, brw_message_reg(msg_reg_nr));
+   } else {
+      brw_set_src0(insn, brw_null_reg());
+   }
 
    brw_set_dp_read_message(p->brw,
 			   insn,
@@ -1706,7 +1771,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    /* M1.0 is block offset 0, M1.4 is block offset 1, all other
     * fields ignored.
     */
-   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
 	   addr_reg, brw_imm_d(offset));
    brw_pop_insn_state(p);
 
@@ -1717,7 +1782,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    insn->header.destreg__conditionalmod = 0;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, brw_vec8_grf(0, 0));
 
    if (intel->gen == 6)
@@ -1782,7 +1847,7 @@ void brw_fb_WRITE(struct brw_compile *p,
    else
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_dp_write_message(p->brw,
 			    insn,
@@ -1860,7 +1925,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 
-	 guess_execution_size(p->current, dest);
+	 guess_execution_size(p, p->current, dest);
 	 if (p->current->header.execution_size == BRW_EXECUTE_16)
 	    dispatch_16 = GL_TRUE;
 
@@ -1895,12 +1960,15 @@ void brw_SAMPLE(struct brw_compile *p,
        * and the first message register index comes from src0.
        */
       if (intel->gen >= 6) {
-	  brw_push_insn_state(p);
-	  brw_set_mask_control( p, BRW_MASK_DISABLE );
-	  /* m1 contains header? */
-	  brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
-	  brw_pop_insn_state(p);
-	  src0 = brw_message_reg(msg_reg_nr);
+	 if (src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+	     src0.nr != BRW_ARF_NULL) {
+	    brw_push_insn_state(p);
+	    brw_set_mask_control( p, BRW_MASK_DISABLE );
+	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+	    brw_MOV(p, retype(brw_message_reg(msg_reg_nr), src0.type), src0);
+	    brw_pop_insn_state(p);
+	 }
+	 src0 = brw_message_reg(msg_reg_nr);
       }
 
       insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1909,7 +1977,7 @@ void brw_SAMPLE(struct brw_compile *p,
       if (intel->gen < 6)
 	  insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src0);
       brw_set_sampler_message(p->brw, insn,
 			      binding_table_index,
@@ -1970,7 +2038,7 @@ void brw_urb_WRITE(struct brw_compile *p,
 
    assert(msg_length < BRW_MAX_MRF);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
@@ -1989,6 +2057,80 @@ void brw_urb_WRITE(struct brw_compile *p,
 		       swizzle);
 }
 
+static int
+brw_find_next_block_end(struct brw_compile *p, int start)
+{
+   int ip;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_ELSE:
+      case BRW_OPCODE_WHILE:
+	 return ip;
+      }
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* There is no DO instruction on gen6, so to find the end of the loop
+ * we have to see if the loop is jumping back before our start
+ * instruction.
+ */
+static int
+brw_find_loop_end(struct brw_compile *p, int start)
+{
+   int ip;
+   int br = 2;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      if (insn->header.opcode == BRW_OPCODE_WHILE) {
+	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
+	    return ip;
+      }
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* After program generation, go back and update the UIP and JIP of
+ * BREAK and CONT instructions to their correct locations.
+ */
+void
+brw_set_uip_jip(struct brw_compile *p)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int ip;
+   int br = 2;
+
+   if (intel->gen < 6)
+      return;
+
+   for (ip = 0; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_BREAK:
+	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+	 /* JIP is set at CONTINUE emit time, since that's when we
+	  * know where the start of the loop is.
+	  */
+	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 assert(insn->bits3.break_cont.uip != 0);
+	 assert(insn->bits3.break_cont.jip != 0);
+	 break;
+      }
+   }
+}
+
 void brw_ff_sync(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
@@ -2013,7 +2155,7 @@ void brw_ff_sync(struct brw_compile *p,
    }
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index edb02fabb23..c3cbe0df618 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -600,8 +600,13 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
     * might be able to do better by doing execsize = 1 math and then
     * expanding that result out, but we would need to be careful with
     * masking.
+    *
+    * The hardware ignores source modifiers (negate and abs) on math
+    * instructions, so we also move to a temp to set those up.
     */
-   if (intel->gen >= 6 && src.file == UNIFORM) {
+   if (intel->gen >= 6 && (src.file == UNIFORM ||
+			   src.abs ||
+			   src.negate)) {
       fs_reg expanded = fs_reg(this, glsl_type::float_type);
       emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
       src = expanded;
@@ -933,6 +938,10 @@ fs_visitor::visit(ir_expression *ir)
       assert(!"not reached: should be handled by lower_noise");
       break;
 
+   case ir_quadop_vector:
+      assert(!"not reached: should be handled by lower_quadop_vector");
+      break;
+
    case ir_unop_sqrt:
       emit_math(FS_OPCODE_SQRT, this->result, op[0]);
       break;
@@ -1423,28 +1432,70 @@ fs_visitor::visit(ir_discard *ir)
 void
 fs_visitor::visit(ir_constant *ir)
 {
-   fs_reg reg(this, ir->type);
-   this->result = reg;
+   /* Set this->result to reg at the bottom of the function because some code
+    * paths will cause this visitor to be applied to other fields.  This will
+    * cause the value stored in this->result to be modified.
+    *
+    * Make reg constant so that it doesn't get accidentally modified along the
+    * way.  Yes, I actually had this problem. :(
+    */
+   const fs_reg reg(this, ir->type);
+   fs_reg dst_reg = reg;
 
-   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
-      switch (ir->type->base_type) {
-      case GLSL_TYPE_FLOAT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
-	 break;
-      case GLSL_TYPE_UINT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
-	 break;
-      case GLSL_TYPE_INT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
-	 break;
-      case GLSL_TYPE_BOOL:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
-	 break;
-      default:
-	 assert(!"Non-float/uint/int/bool constant");
+   if (ir->type->is_array()) {
+      const unsigned size = type_size(ir->type->fields.array);
+
+      for (unsigned i = 0; i < ir->type->length; i++) {
+	 ir->array_elements[i]->accept(this);
+	 fs_reg src_reg = this->result;
+
+	 dst_reg.type = src_reg.type;
+	 for (unsigned j = 0; j < size; j++) {
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
+	    src_reg.reg_offset++;
+	    dst_reg.reg_offset++;
+	 }
+      }
+   } else if (ir->type->is_record()) {
+      foreach_list(node, &ir->components) {
+	 ir_instruction *const field = (ir_instruction *) node;
+	 const unsigned size = type_size(field->type);
+
+	 field->accept(this);
+	 fs_reg src_reg = this->result;
+
+	 dst_reg.type = src_reg.type;
+	 for (unsigned j = 0; j < size; j++) {
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
+	    src_reg.reg_offset++;
+	    dst_reg.reg_offset++;
+	 }
+      }
+   } else {
+      const unsigned size = type_size(ir->type);
+
+      for (unsigned i = 0; i < size; i++) {
+	 switch (ir->type->base_type) {
+	 case GLSL_TYPE_FLOAT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
+	    break;
+	 case GLSL_TYPE_UINT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
+	    break;
+	 case GLSL_TYPE_INT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
+	    break;
+	 case GLSL_TYPE_BOOL:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
+	    break;
+	 default:
+	    assert(!"Non-float/uint/int/bool constant");
+	 }
+	 dst_reg.reg_offset++;
       }
-      reg.reg_offset++;
    }
+
+   this->result = reg;
 }
 
 void
@@ -1574,7 +1625,7 @@ fs_visitor::emit_if_gen6(ir_if *ir)
 
       switch (expr->operation) {
       case ir_unop_logic_not:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(1)));
+	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0)));
 	 inst->conditional_mod = BRW_CONDITIONAL_Z;
 	 return;
 
@@ -1951,7 +2002,7 @@ fs_visitor::emit_interpolation_setup_gen6()
    emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
 
    this->current_annotation = "compute 1/pos.w";
-   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
+   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
    this->pixel_w = fs_reg(this, glsl_type::float_type);
    emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
 
@@ -1979,17 +2030,17 @@ fs_visitor::emit_fb_writes()
       nr += 2;
    }
 
-   if (c->key.aa_dest_stencil_reg) {
+   if (c->aa_dest_stencil_reg) {
       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
+		   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
    }
 
    /* Reserve space for color. It'll be filled in per MRT below. */
    int color_mrf = nr;
    nr += 4;
 
-   if (c->key.source_depth_to_render_target) {
-      if (c->key.computes_depth) {
+   if (c->source_depth_to_render_target) {
+      if (c->computes_depth) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth);
 	 fs_reg depth = *(variable_storage(this->frag_depth));
@@ -1998,20 +2049,22 @@ fs_visitor::emit_fb_writes()
       } else {
 	 /* Pass through the payload depth. */
 	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
+		      fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
       }
    }
 
-   if (c->key.dest_depth_reg) {
+   if (c->dest_depth_reg) {
       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
+		   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
    }
 
    fs_reg color = reg_undef;
    if (this->frag_color)
       color = *(variable_storage(this->frag_color));
-   else if (this->frag_data)
+   else if (this->frag_data) {
       color = *(variable_storage(this->frag_data));
+      color.type = BRW_REGISTER_TYPE_F;
+   }
 
    for (int target = 0; target < c->key.nr_color_regions; target++) {
       this->current_annotation = talloc_asprintf(this->mem_ctx,
@@ -2452,7 +2505,7 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
 void
 fs_visitor::assign_curb_setup()
 {
-   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
+   c->prog_data.first_curbe_grf = c->nr_payload_regs;
    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
@@ -3227,6 +3280,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 	 break;
       default:
 	 assert(!"not reached");
+	 brw_reg = brw_null_reg();
 	 break;
       }
       break;
@@ -3241,6 +3295,10 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
       assert(!"not reached");
       brw_reg = brw_null_reg();
       break;
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
    }
    if (reg->abs)
       brw_reg = brw_abs(brw_reg);
@@ -3373,10 +3431,6 @@ fs_visitor::generate_code()
 	 break;
 
       case BRW_OPCODE_DO:
-	 /* FINISHME: We need to write the loop instruction support still. */
-	 if (intel->gen >= 6)
-	    this->fail = true;
-
 	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
 	 if_depth_in_loop[loop_stack_depth] = 0;
 	 break;
@@ -3386,7 +3440,11 @@ fs_visitor::generate_code()
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 break;
       case BRW_OPCODE_CONTINUE:
-	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+	 /* FINISHME: We need to write the loop instruction support still. */
+	 if (intel->gen >= 6)
+	    brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]);
+	 else
+	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 break;
 
@@ -3400,16 +3458,18 @@ fs_visitor::generate_code()
 	 assert(loop_stack_depth > 0);
 	 loop_stack_depth--;
 	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
-	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
-	 while (inst0 > loop_stack[loop_stack_depth]) {
-	    inst0--;
-	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-		inst0->bits3.if_else.jump_count == 0) {
-	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
+	    while (inst0 > loop_stack[loop_stack_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+		   inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
 	    }
-	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-		     inst0->bits3.if_else.jump_count == 0) {
-	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
 	    }
 	 }
       }
@@ -3486,6 +3546,26 @@ fs_visitor::generate_code()
 
       last_native_inst = p->nr_insn;
    }
+
+   brw_set_uip_jip(p);
+
+   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
+    * emit issues, it doesn't get the jump distances into the output,
+    * which is often something we want to debug.  So this is here in
+    * case you're doing that.
+    */
+   if (0) {
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+	 for (unsigned int i = 0; i < p->nr_insn; i++) {
+	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		   ((uint32_t *)&p->store[i])[3],
+		   ((uint32_t *)&p->store[i])[2],
+		   ((uint32_t *)&p->store[i])[1],
+		   ((uint32_t *)&p->store[i])[0]);
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+   }
 }
 
 GLboolean
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 3b7b03a05b8..20bfa4c3ea3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -205,6 +205,8 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_round_even:
    case ir_unop_sin:
    case ir_unop_cos:
+   case ir_unop_sin_reduced:
+   case ir_unop_cos_reduced:
    case ir_unop_dFdx:
    case ir_unop_dFdy:
       for (i = 0; i < vector_elements; i++) {
@@ -328,6 +330,9 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_noise:
       assert(!"noise should have been broken down to function call");
       break;
+   case ir_quadop_vector:
+      assert(!"should have been lowered");
+      break;
    }
 
    ir->remove();
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index b0c76f4094d..73b41fdbcef 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -166,6 +166,9 @@ static void populate_key( struct brw_context *brw,
 			  struct brw_gs_prog_key *key )
 {
    struct gl_context *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+   int prim_gs_always;
+
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -185,10 +188,14 @@ static void populate_key( struct brw_context *brw,
       key->pv_first = GL_TRUE;
    }
 
-   key->need_gs_prog = (key->hint_gs_always ||
-			brw->primitive == GL_QUADS ||
+   if (intel->gen == 6)
+       prim_gs_always = brw->primitive == GL_LINE_LOOP;
+   else
+       prim_gs_always = brw->primitive == GL_QUADS ||
 			brw->primitive == GL_QUAD_STRIP ||
-			brw->primitive == GL_LINE_LOOP);
+			brw->primitive == GL_LINE_LOOP;
+
+   key->need_gs_prog = (key->hint_gs_always || prim_gs_always);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
@@ -205,8 +212,10 @@ static void prepare_gs_prog(struct brw_context *brw)
       brw->gs.prog_active = key.need_gs_prog;
    }
 
+   drm_intel_bo_unreference(brw->gs.prog_bo);
+   brw->gs.prog_bo = NULL;
+
    if (brw->gs.prog_active) {
-      drm_intel_bo_unreference(brw->gs.prog_bo);
       brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
 					 &key, sizeof(key),
 					 NULL, 0,
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 1d350bc0413..a91b0528fac 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -38,40 +38,6 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-
-
-
-
-/***********************************************************************
- * Blend color
- */
-
-static void upload_blend_constant_color(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));      
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
-}
-
-
-const struct brw_tracked_state brw_blend_constant_color = {
-   .dirty = {
-      .mesa = _NEW_COLOR,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = 0
-   },
-   .emit = upload_blend_constant_color
-};
-
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 static void upload_drawing_rect(struct brw_context *brw)
 {
@@ -339,6 +305,9 @@ static void upload_polygon_stipple(struct brw_context *brw)
    struct brw_polygon_stipple bps;
    GLuint i;
 
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
    memset(&bps, 0, sizeof(bps));
    bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
    bps.header.length = sizeof(bps)/4-2;
@@ -381,6 +350,9 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
    struct gl_context *ctx = &brw->intel.ctx;
    struct brw_polygon_stipple_offset bpso;
 
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
    memset(&bpso, 0, sizeof(bpso));
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
@@ -409,7 +381,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
 
 const struct brw_tracked_state brw_polygon_stipple_offset = {
    .dirty = {
-      .mesa = _NEW_WINDOW_POS,
+      .mesa = _NEW_WINDOW_POS | _NEW_POLYGONSTIPPLE,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
@@ -421,9 +393,10 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
  */
 static void upload_aa_line_parameters(struct brw_context *brw)
 {
+   struct gl_context *ctx = &brw->intel.ctx;
    struct brw_aa_line_parameters balp;
 
-   if (!brw->has_aa_line_parameters)
+   if (!ctx->Line.SmoothFlag || !brw->has_aa_line_parameters)
       return;
 
    /* use legacy aa line coverage computation */
@@ -436,7 +409,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 
 const struct brw_tracked_state brw_aa_line_parameters = {
    .dirty = {
-      .mesa = 0,
+      .mesa = _NEW_LINE,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
@@ -454,6 +427,9 @@ static void upload_line_stipple(struct brw_context *brw)
    GLfloat tmp;
    GLint tmpi;
 
+   if (!ctx->Line.StippleFlag)
+      return;
+
    memset(&bls, 0, sizeof(bls));
    bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
    bls.header.length = sizeof(bls)/4 - 2;
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 1367d814696..94efa791091 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -142,7 +142,6 @@ static GLboolean brwProgramStringNotify( struct gl_context *ctx,
       if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
       newFP->id = brw->program_id++;      
-      newFP->isGLSL = brw_wm_is_glsl(fprog);
 
       /* Don't reject fragment shaders for their Mesa IR state when we're
        * using the new FS backend.
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 338f3876b31..eba4411ca70 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -129,7 +129,7 @@ const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
    &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
-   &gen6_wm_constants, /* Before wm_surfaces and constant_buffer */
+   &gen6_wm_constants, /* Before wm_state */
 
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 8ce9af9c4fe..461f27048cc 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1064,6 +1064,15 @@ struct brw_sampler_default_color {
    GLfloat color[4];
 };
 
+struct gen5_sampler_default_color {
+   uint8_t ub[4];
+   float f[4];
+   uint16_t hf[4];
+   uint16_t us[4];
+   int16_t s[4];
+   uint8_t b[4];
+};
+
 struct brw_sampler_state
 {
    
@@ -1169,7 +1178,12 @@ struct brw_surface_state
       GLuint cube_neg_y:1; 
       GLuint cube_pos_x:1; 
       GLuint cube_neg_x:1; 
-      GLuint pad:4;
+      GLuint pad:2;
+      /* Required on gen6 for surfaces accessed through render cache messages.
+       */
+      GLuint render_cache_read_write:1;
+      /* Ironlake and newer: instead of replicating one of the texels */
+      GLuint cube_corner_average:1;
       GLuint mipmap_layout_mode:1; 
       GLuint vert_line_stride_ofs:1; 
       GLuint vert_line_stride:1; 
@@ -1539,6 +1553,21 @@ struct brw_instruction
 	 GLuint  pad0:12;
       } if_else;
 
+      struct
+      {
+	 /* Signed jump distance to the ip to jump to if all channels
+	  * are disabled after the break or continue.  It should point
+	  * to the end of the innermost control flow block, as that's
+	  * where some channel could get re-enabled.
+	  */
+	 int jip:16;
+
+	 /* Signed jump distance to the location to resume execution
+	  * of this channel if it's enabled for the break or continue.
+	  */
+	 int uip:16;
+      } break_cont;
+
       struct {
 	 GLuint function:4;
 	 GLuint int_type:1;
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 4a41c7a5176..6ae75d22c14 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -99,8 +99,8 @@ static void do_vs_prog( struct brw_context *brw,
    (void) ctx;
 
    aux_size = sizeof(c.prog_data);
-   if (c.vp->use_const_buffer)
-      aux_size += c.vp->program.Base.Parameters->NumParameters;
+   /* constant_map */
+   aux_size += c.vp->program.Base.Parameters->NumParameters;
 
    drm_intel_bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG,
@@ -130,6 +130,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
    key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
 			ctx->Polygon.BackMode != GL_FILL);
+   key.two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_POINT */
    if (ctx->Point.PointSprite) {
@@ -157,7 +158,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT,
+      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT | _NEW_LIGHT,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 9338a6b7dbf..0b88cc1ec76 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -44,6 +44,7 @@ struct brw_vs_prog_key {
    GLuint nr_userclip:4;
    GLuint copy_edgeflag:1;
    GLuint point_coord_replace:8;
+   GLuint two_side_color: 1;
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 7e43324a1f9..09887dae95d 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -140,9 +140,13 @@ clear_current_const(struct brw_vs_compile *c)
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    struct intel_context *intel = &c->func.brw->intel;
-   GLuint i, reg = 0, mrf;
+   GLuint i, reg = 0, mrf, j;
    int attributes_in_vue;
    int first_reladdr_output;
+   int max_constant;
+   int constant = 0;
+   int vert_result_reoder[VERT_RESULT_MAX];
+   int bfc = 0;
 
    /* Determine whether to use a real constant buffer or use a block
     * of GRF registers for constants.  The later is faster but only
@@ -181,62 +185,81 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    }
 
-   /* Vertex program parameters from curbe:
+   /* Assign some (probably all) of the vertex program constants to
+    * the push constant buffer/CURBE.
+    *
+    * There's an obvious limit to the numer of push constants equal to
+    * the number of register available, and that number is smaller
+    * than the minimum maximum number of vertex program parameters, so
+    * support for pull constants is required if we overflow.
+    * Additionally, on gen6 the number of push constants is even
+    * lower.
+    *
+    * When there's relative addressing, we don't know what range of
+    * Mesa IR registers can be accessed.  And generally, when relative
+    * addressing is used we also have too many constants to load them
+    * all as push constants.  So, we'll just support relative
+    * addressing out of the pull constant buffers, and try to load as
+    * many statically-accessed constants into the push constant buffer
+    * as we can.
     */
-   if (c->vp->use_const_buffer) {
-      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
-      int constant = 0;
-
-      /* We've got more constants than we can load with the push
-       * mechanism.  This is often correlated with reladdr loads where
-       * we should probably be using a pull mechanism anyway to avoid
-       * excessive reading.  However, the pull mechanism is slow in
-       * general.  So, we try to allocate as many non-reladdr-loaded
-       * constants through the push buffer as we can before giving up.
-       */
-      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
-      for (i = 0;
-	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
-	   i++) {
-	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
-	 int arg;
-
-	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
-	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
-		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
-		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
-		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
-		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
-		inst->SrcReg[arg].RelAddr)
-	       continue;
-
-	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
-	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
-	    }
+   if (intel->gen >= 6) {
+      /* We can only load 32 regs of push constants. */
+      max_constant = 32 * 2 - c->key.nr_userclip;
+   } else {
+      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
+   }
+
+   /* constant_map maps from ParameterValues[] index to index in the
+    * push constant buffer, or -1 if it's only in the pull constant
+    * buffer.
+    */
+   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
+   for (i = 0;
+	i < c->vp->program.Base.NumInstructions && constant < max_constant;
+	i++) {
+      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
+      int arg;
+
+      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
+	 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
+	     inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
+	     inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
+	     inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
+	     inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
+	    continue;
 	 }
-      }
 
-      for (i = 0; i < constant; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
-							      (i%2) * 4),
-						 0, 4, 1);
+	 if (inst->SrcReg[arg].RelAddr) {
+	    c->vp->use_const_buffer = GL_TRUE;
+	    continue;
+	 }
+
+	 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
+	    c->constant_map[inst->SrcReg[arg].Index] = constant++;
+	 }
       }
-      reg += (constant + 1) / 2;
-      c->prog_data.curb_read_length = reg - 1;
-      /* XXX 0 causes a bug elsewhere... */
-      c->prog_data.nr_params = MAX2(constant * 4, 4);
    }
-   else {
-      /* use a section of the GRF for constants */
-      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
-      for (i = 0; i < nr_params; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-      }
-      reg += (nr_params + 1) / 2;
-      c->prog_data.curb_read_length = reg - 1;
 
-      c->prog_data.nr_params = nr_params * 4;
+   /* If we ran out of push constant space, then we'll also upload all
+    * constants through the pull constant buffer so that they can be
+    * accessed no matter what.  For relative addressing (the common
+    * case) we need them all in place anyway.
+    */
+   if (constant == max_constant)
+      c->vp->use_const_buffer = GL_TRUE;
+
+   for (i = 0; i < constant; i++) {
+      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
+							  (i % 2) * 4),
+					     0, 4, 1);
    }
+   reg += (constant + 1) / 2;
+   c->prog_data.curb_read_length = reg - 1;
+   c->prog_data.nr_params = constant * 4;
+   /* XXX 0 causes a bug elsewhere... */
+   if (intel->gen < 6 && c->prog_data.nr_params == 0)
+      c->prog_data.nr_params = 4;
 
    /* Allocate input regs:  
     */
@@ -270,7 +293,36 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       mrf = 4;
 
    first_reladdr_output = get_first_reladdr_output(&c->vp->program);
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
+
+   for (i = 0; i < VERT_RESULT_MAX; i++)
+       vert_result_reoder[i] = i;
+
+   /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
+   if (intel->gen >= 6 && c->key.two_side_color) {
+       if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
+           (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
+           assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
+           assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
+           bfc = 2;
+       } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
+           (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
+           bfc = 1;
+
+       if (bfc) {
+           for (i = 0; i < bfc; i++) {
+               vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
+               vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
+           }
+
+           for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
+               vert_result_reoder[i] = i - bfc;
+           }
+       }
+   }
+
+   for (j = 0; j < VERT_RESULT_MAX; j++) {
+      i = vert_result_reoder[j];
+
       if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 	 c->nr_outputs++;
          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
@@ -281,7 +333,6 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 else if (i == VERT_RESULT_PSIZ) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
-	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
 	    /* Two restrictions on our compute-to-MRF here.  The
@@ -574,9 +625,18 @@ static void emit_max( struct brw_compile *p,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
-   brw_SEL(p, dst, arg0, arg1);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   } else {
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
 }
 
 static void emit_min( struct brw_compile *p, 
@@ -584,9 +644,18 @@ static void emit_min( struct brw_compile *p,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
-   brw_SEL(p, dst, arg0, arg1);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   } else {
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
 }
 
 static void emit_math1_gen4(struct brw_vs_compile *c,
@@ -680,7 +749,7 @@ emit_math1(struct brw_vs_compile *c,
       emit_math1_gen4(c, function, dst, arg0, precision);
 }
 
-static void emit_math2( struct brw_vs_compile *c, 
+static void emit_math2_gen4( struct brw_vs_compile *c, 
 			GLuint function,
 			struct brw_reg dst,
 			struct brw_reg arg0,
@@ -688,14 +757,11 @@ static void emit_math2( struct brw_vs_compile *c,
 			GLuint precision)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = dst;
    GLboolean need_tmp = GL_FALSE;
 
-   if (dst.file != BRW_GENERAL_REGISTER_FILE)
-      need_tmp = GL_TRUE;
-
-   if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
+   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
+       dst.dw1.bits.writemask != 0xf)
       need_tmp = GL_TRUE;
 
    if (need_tmp) 
@@ -718,6 +784,53 @@ static void emit_math2( struct brw_vs_compile *c,
    }
 }
 
+static void emit_math2_gen6( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp_src0, tmp_src1, tmp_dst;
+
+   tmp_src0 = get_tmp(c);
+   tmp_src1 = get_tmp(c);
+   tmp_dst = get_tmp(c);
+
+   brw_MOV(p, tmp_src0, arg0);
+   brw_MOV(p, tmp_src1, arg1);
+   
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math2(p,
+	    tmp_dst,
+	    function,
+	    tmp_src0,
+	    tmp_src1);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+
+   brw_MOV(p, dst, tmp_dst);
+
+   release_tmp(c, tmp_src0);
+   release_tmp(c, tmp_src1);
+   release_tmp(c, tmp_dst);
+}
+
+static void emit_math2( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6)
+      emit_math2_gen6(c, function, dst, arg0, arg1, precision);
+   else
+      emit_math2_gen4(c, function, dst, arg0, arg1, precision);
+}
 
 static void emit_exp_noalias( struct brw_vs_compile *c,
 			      struct brw_reg dst,
@@ -990,8 +1103,6 @@ get_constant(struct brw_vs_compile *c,
 
    assert(argIndex < 3);
 
-   assert(c->func.brw->intel.gen < 6); /* FINISHME */
-
    if (c->current_const[argIndex].index != src->Index) {
       /* Keep track of the last constant loaded in this slot, for reuse. */
       c->current_const[argIndex].index = src->Index;
@@ -1022,14 +1133,14 @@ get_reladdr_constant(struct brw_vs_compile *c,
 {
    const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    struct brw_reg const_reg = c->current_const[argIndex].reg;
-   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
-   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   uint32_t offset;
 
    assert(argIndex < 3);
 
-   assert(c->func.brw->intel.gen < 6); /* FINISHME */
-
    /* Can't reuse a reladdr constant load. */
    c->current_const[argIndex].index = -1;
 
@@ -1038,15 +1149,21 @@ get_reladdr_constant(struct brw_vs_compile *c,
 	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
 
-   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
+   if (intel->gen >= 6) {
+      offset = src->Index;
+   } else {
+      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
+      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
+      addr_reg = byte_addr_reg;
+      offset = 16 * src->Index;
+   }
 
    /* fetch the first vec4 */
    brw_dp_READ_4_vs_relative(p,
-			     const_reg,                     /* writeback dest */
-			     byte_addr_reg,                 /* address register */
-			     16 * src->Index,               /* byte offset */
-			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
-			     );
+			     const_reg,
+			     addr_reg,
+			     offset,
+			     SURF_INDEX_VERT_CONST_BUFFER);
 
    return const_reg;
 }
@@ -1241,22 +1358,18 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_UNIFORM:
    case PROGRAM_ENV_PARAM:
    case PROGRAM_LOCAL_PARAM:
-      if (c->vp->use_const_buffer) {
-	 if (!relAddr && c->constant_map[index] != -1) {
-	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
-	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
-	 } else if (relAddr)
+      if (!relAddr && c->constant_map[index] != -1) {
+	 /* Take from the push constant buffer if possible. */
+	 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
+	 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
+      } else {
+	 /* Must be in the pull constant buffer then .*/
+	 assert(c->vp->use_const_buffer);
+	 if (relAddr)
 	    return get_reladdr_constant(c, inst, argIndex);
 	 else
 	    return get_constant(c, inst, argIndex);
       }
-      else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
-      }
-      else {
-         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-         return c->regs[PROGRAM_STATE_VAR][index];
-      }
    case PROGRAM_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
@@ -1585,6 +1698,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 	 break;
       if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
 	 continue;
+      if (i == VERT_RESULT_PSIZ)
+	 continue;
 
       if (i >= VERT_RESULT_TEX0 &&
 	  c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
@@ -1895,7 +2010,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_RSQ:
-	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
 	 break;
 
       case OPCODE_SEQ:
@@ -1969,35 +2084,42 @@ void brw_vs_emit(struct brw_vs_compile *c )
          break;
       case OPCODE_CONT:
 	 brw_set_predicate_control(p, get_predicate(inst));
-	 brw_CONT(p, if_depth_in_loop[loop_depth]);
+	 if (intel->gen >= 6) {
+	    brw_CONT_gen6(p, loop_inst[loop_depth - 1]);
+	 } else {
+	    brw_CONT(p, if_depth_in_loop[loop_depth]);
+	 }
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_ENDLOOP: 
-         {
-	    clear_current_const(c);
-            struct brw_instruction *inst0, *inst1;
-	    GLuint br = 1;
-
-            loop_depth--;
-
-	    if (intel->gen == 5)
-	       br = 2;
-
-            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-            while (inst0 > loop_inst[loop_depth]) {
-               inst0--;
-               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+
+      case OPCODE_ENDLOOP: {
+	 clear_current_const(c);
+	 struct brw_instruction *inst0, *inst1;
+	 GLuint br = 1;
+
+	 loop_depth--;
+
+	 if (intel->gen == 5)
+	    br = 2;
+
+	 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+	    while (inst0 > loop_inst[loop_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
 		   inst0->bits3.if_else.jump_count == 0) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-               }
-               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			inst0->bits3.if_else.jump_count == 0) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-               }
-            }
-         }
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	       } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			  inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
+	    }
+	 }
+      }
          break;
+
       case OPCODE_BRA:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
@@ -2088,6 +2210,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
    }
 
    brw_resolve_cals(p);
+   brw_set_uip_jip(p);
 
    brw_optimize(p);
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index ccdc18e0b8d..656501b4f79 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -119,6 +119,62 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
    brw_wm_emit(c);
 }
 
+static void
+brw_wm_payload_setup(struct brw_context *brw,
+		     struct brw_wm_compile *c)
+{
+   struct intel_context *intel = &brw->intel;
+   bool uses_depth = (c->fp->program.Base.InputsRead &
+		      (1 << FRAG_ATTRIB_WPOS)) != 0;
+
+   if (intel->gen >= 6) {
+      /* R0-1: masks, pixel X/Y coordinates. */
+      c->nr_payload_regs = 2;
+      /* R2: only for 32-pixel dispatch.*/
+      /* R3-4: perspective pixel location barycentric */
+      c->nr_payload_regs += 2;
+      /* R5-6: perspective pixel location bary for dispatch width != 8 */
+      if (c->dispatch_width == 16) {
+	 c->nr_payload_regs += 2;
+      }
+      /* R7-10: perspective centroid barycentric */
+      /* R11-14: perspective sample barycentric */
+      /* R15-18: linear pixel location barycentric */
+      /* R19-22: linear centroid barycentric */
+      /* R23-26: linear sample barycentric */
+
+      /* R27: interpolated depth if uses source depth */
+      if (uses_depth) {
+	 c->source_depth_reg = c->nr_payload_regs;
+	 c->nr_payload_regs++;
+	 if (c->dispatch_width == 16) {
+	    /* R28: interpolated depth if not 8-wide. */
+	    c->nr_payload_regs++;
+	 }
+      }
+      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
+       */
+      if (uses_depth) {
+	 c->source_w_reg = c->nr_payload_regs;
+	 c->nr_payload_regs++;
+	 if (c->dispatch_width == 16) {
+	    /* R30: interpolated W if not 8-wide. */
+	    c->nr_payload_regs++;
+	 }
+      }
+      /* R31: MSAA position offsets. */
+      /* R32-: bary for 32-pixel. */
+      /* R58-59: interp W for 32-pixel. */
+
+      if (c->fp->program.Base.OutputsWritten &
+	  BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+	 c->source_depth_to_render_target = GL_TRUE;
+	 c->computes_depth = GL_TRUE;
+      }
+   } else {
+      brw_wm_lookup_iz(intel, c);
+   }
+}
 
 /**
  * All Mesa program -> GPU code generation goes through this function.
@@ -167,23 +223,18 @@ static void do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func);
 
-   /* temporary sanity check assertion */
-   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+   brw_wm_payload_setup(brw, c);
 
    if (!brw_wm_fs_emit(brw, c)) {
       /*
        * Shader which use GLSL features such as flow control are handled
        * differently from "simple" shaders.
        */
-      if (fp->isGLSL) {
-	 c->dispatch_width = 8;
-	 brw_wm_glsl_emit(brw, c);
-      }
-      else {
-	 c->dispatch_width = 16;
-	 brw_wm_non_glsl_emit(brw, c);
-      }
+      c->dispatch_width = 16;
+      brw_wm_payload_setup(brw, c);
+      brw_wm_non_glsl_emit(brw, c);
    }
+   c->prog_data.dispatch_width = c->dispatch_width;
 
    /* Scratch space is used for register spilling */
    if (c->last_scratch) {
@@ -220,12 +271,10 @@ static void do_wm_prog( struct brw_context *brw,
 static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
-   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
-   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
    GLuint i;
@@ -285,57 +334,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
       }
    }
 
-   if (intel->gen >= 6) {
-      /* R0-1: masks, pixel X/Y coordinates. */
-      key->nr_payload_regs = 2;
-      /* R2: only for 32-pixel dispatch.*/
-      /* R3-4: perspective pixel location barycentric */
-      key->nr_payload_regs += 2;
-      /* R5-6: perspective pixel location bary for dispatch width != 8 */
-      if (!fp->isGLSL) { /* dispatch_width != 8 */
-	 key->nr_payload_regs += 2;
-      }
-      /* R7-10: perspective centroid barycentric */
-      /* R11-14: perspective sample barycentric */
-      /* R15-18: linear pixel location barycentric */
-      /* R19-22: linear centroid barycentric */
-      /* R23-26: linear sample barycentric */
-
-      /* R27: interpolated depth if uses source depth */
-      if (uses_depth) {
-	 key->source_depth_reg = key->nr_payload_regs;
-	 key->nr_payload_regs++;
-	 if (!fp->isGLSL) { /* dispatch_width != 8 */
-	    /* R28: interpolated depth if not 8-wide. */
-	    key->nr_payload_regs++;
-	 }
-      }
-      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
-       */
-      if (uses_depth) {
-	 key->source_w_reg = key->nr_payload_regs;
-	 key->nr_payload_regs++;
-	 if (!fp->isGLSL) { /* dispatch_width != 8 */
-	    /* R30: interpolated W if not 8-wide. */
-	    key->nr_payload_regs++;
-	 }
-      }
-      /* R31: MSAA position offsets. */
-      /* R32-: bary for 32-pixel. */
-      /* R58-59: interp W for 32-pixel. */
-
-      if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-	 key->source_depth_to_render_target = GL_TRUE;
-	 key->computes_depth = GL_TRUE;
-      }
-
-   } else {
-      brw_wm_lookup_iz(intel,
-	      	       line_aa,
-		       lookup,
-		       uses_depth,
-		       key);
-   }
+   key->iz_lookup = lookup;
+   key->line_aa = line_aa;
+   key->stats_wm = brw->intel.stats_wm;
 
    /* BRW_NEW_WM_INPUT_DIMENSIONS */
    key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
@@ -377,6 +378,10 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	       swizzles[2] = SWIZZLE_ZERO;
 	    } else if (t->DepthMode == GL_LUMINANCE) {
 	       swizzles[3] = SWIZZLE_ONE;
+	    } else if (t->DepthMode == GL_RED) {
+	       swizzles[1] = SWIZZLE_ZERO;
+	       swizzles[2] = SWIZZLE_ZERO;
+	       swizzles[3] = SWIZZLE_ZERO;
 	    }
 	 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 2ca685784fc..e7f3cfbb75f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -59,16 +59,9 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
-   GLuint source_depth_reg:3;
-   GLuint source_w_reg:3;
-   GLuint aa_dest_stencil_reg:3;
-   GLuint dest_depth_reg:3;
-   GLuint nr_payload_regs:4;
-   GLuint computes_depth:1;	/* could be derived from program string */
-   GLuint source_depth_to_render_target:1;
+   GLuint stats_wm:1;
    GLuint flat_shade:1;
    GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
-   GLuint runtime_check_aads_emit:1;
    GLuint nr_color_regions:5;
    GLuint render_to_fbo:1;
 
@@ -81,6 +74,8 @@ struct brw_wm_prog_key {
 
    GLushort drawable_height;
    GLbitfield64 vp_outputs_written;
+   GLuint iz_lookup;
+   GLuint line_aa;
    GLuint program_string_id:32;
 };
 
@@ -204,6 +199,15 @@ struct brw_wm_compile {
       PASS2_DONE
    } state;
 
+   GLuint source_depth_reg:3;
+   GLuint source_w_reg:3;
+   GLuint aa_dest_stencil_reg:3;
+   GLuint dest_depth_reg:3;
+   GLuint nr_payload_regs:4;
+   GLuint computes_depth:1;	/* could be derived from program string */
+   GLuint source_depth_to_render_target:1;
+   GLuint runtime_check_aads_emit:1;
+
    /* Initial pass - translate fp instructions to fp instructions,
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
@@ -306,14 +310,9 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 void brw_wm_print_program( struct brw_wm_compile *c,
 			   const char *stage );
 
-void brw_wm_lookup_iz( struct intel_context *intel,
-		       GLuint line_aa,
-		       GLuint lookup,
-		       GLboolean ps_uses_depth,
-		       struct brw_wm_prog_key *key );
+void brw_wm_lookup_iz(struct intel_context *intel,
+		      struct brw_wm_compile *c);
 
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 /* brw_wm_emit.c */
@@ -381,7 +380,6 @@ void emit_fb_write(struct brw_wm_compile *c,
 void emit_frontfacing(struct brw_compile *p,
 		      const struct brw_reg *dst,
 		      GLuint mask);
-void emit_kil_nv(struct brw_wm_compile *c);
 void emit_linterp(struct brw_compile *p,
 		  const struct brw_reg *dst,
 		  GLuint mask,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index 96fecc97ee2..a0e86034e1e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -896,10 +896,14 @@ void emit_math1(struct brw_wm_compile *c,
 		      BRW_MATH_SATURATE_NONE);
    struct brw_reg src;
 
-   if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
-			   arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
+   if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
+			    arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
+			   arg0[0].negate || arg0[0].abs)) {
       /* Gen6 math requires that source and dst horizontal stride be 1,
        * and that the argument be in the GRF.
+       *
+       * The hardware ignores source modifiers (negate and abs) on math
+       * instructions, so we also move to a temp to set those up.
        */
       src = dst[dst_chan];
       brw_MOV(p, src, arg0[0]);
@@ -1301,9 +1305,15 @@ static void emit_kil( struct brw_wm_compile *c,
 		      struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
-   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg pixelmask;
    GLuint i, j;
 
+   if (intel->gen >= 6)
+      pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+   else
+      pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
    for (i = 0; i < 4; i++) {
       /* Check if we've already done the comparison for this reg
        * -- common when someone does KIL TEMP.wwww.
@@ -1319,26 +1329,11 @@ static void emit_kil( struct brw_wm_compile *c,
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));   
       brw_set_predicate_control_flag_value(p, 0xff);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_AND(p, r0uw, brw_flag_reg(), r0uw);
+      brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
       brw_pop_insn_state(p);
    }
 }
 
-/* KIL_NV kills the pixels that are currently executing, not based on a test
- * of the arguments.
- */
-void emit_kil_nv( struct brw_wm_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
-   brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
-   brw_pop_insn_state(p);
-}
-
 static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint base_reg,
 			   GLuint nr,
@@ -1387,8 +1382,8 @@ static void emit_aa( struct brw_wm_compile *c,
 		     GLuint reg )
 {
    struct brw_compile *p = &c->func;
-   GLuint comp = c->key.aa_dest_stencil_reg / 2;
-   GLuint off = c->key.aa_dest_stencil_reg % 2;
+   GLuint comp = c->aa_dest_stencil_reg / 2;
+   GLuint off = c->aa_dest_stencil_reg % 2;
    struct brw_reg aa = offset(arg1[comp], off);
 
    brw_push_insn_state(p);
@@ -1416,11 +1411,10 @@ void emit_fb_write(struct brw_wm_compile *c,
    struct intel_context *intel = &brw->intel;
    GLuint nr = 2;
    GLuint channel;
-   int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
 
    /* Reserve a space for AA - may not be needed:
     */
-   if (c->key.aa_dest_stencil_reg)
+   if (c->aa_dest_stencil_reg)
       nr += 1;
 
    /* I don't really understand how this achieves the color interleave
@@ -1428,11 +1422,6 @@ void emit_fb_write(struct brw_wm_compile *c,
     */
    brw_push_insn_state(p);
 
-   if (intel->gen >= 6)
-	base_reg = nr;
-   else
-	base_reg = 0;
-
    for (channel = 0; channel < 4; channel++) {
       if (intel->gen >= 6) {
 	 /* gen6 SIMD16 single source DP write looks like:
@@ -1493,9 +1482,9 @@ void emit_fb_write(struct brw_wm_compile *c,
 
    brw_pop_insn_state(p);
 
-   if (c->key.source_depth_to_render_target)
+   if (c->source_depth_to_render_target)
    {
-      if (c->key.computes_depth) 
+      if (c->computes_depth)
 	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
       else 
 	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
@@ -1503,10 +1492,10 @@ void emit_fb_write(struct brw_wm_compile *c,
       nr += 2;
    }
 
-   if (c->key.dest_depth_reg)
+   if (c->dest_depth_reg)
    {
-      GLuint comp = c->key.dest_depth_reg / 2;
-      GLuint off = c->key.dest_depth_reg % 2;
+      GLuint comp = c->dest_depth_reg / 2;
+      GLuint off = c->dest_depth_reg % 2;
 
       if (off != 0) {
          brw_push_insn_state(p);
@@ -1524,15 +1513,27 @@ void emit_fb_write(struct brw_wm_compile *c,
    }
 
    if (intel->gen >= 6) {
-      /* Subtract off the message header, since we send headerless. */
-      nr -= 2;
+      /* Load the message header.  There's no implied move from src0
+       * to the base mrf on gen6.
+       */
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p, brw_message_reg(0), brw_vec8_grf(0, 0));
+      brw_pop_insn_state(p);
+
+      if (target != 0) {
+	 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+					0,
+					2), BRW_REGISTER_TYPE_UD),
+		 brw_imm_ud(target));
+      }
    }
 
-   if (!c->key.runtime_check_aads_emit) {
-      if (c->key.aa_dest_stencil_reg)
+   if (!c->runtime_check_aads_emit) {
+      if (c->aa_dest_stencil_reg)
 	 emit_aa(c, arg1, 2);
 
-      fire_fb_write(c, base_reg, nr, target, eot);
+      fire_fb_write(c, 0, nr, target, eot);
    }
    else {
       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
@@ -1897,10 +1898,6 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_kil(c, args[0]);
 	 break;
 
-      case OPCODE_KIL_NV:
-	 emit_kil_nv(c);
-	 break;
-
       default:
 	 printf("Unsupported opcode %i (%s) in fragment shader\n",
 		inst->opcode, inst->opcode < MAX_OPCODE ?
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 2cae6988804..4759b289a0c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -338,11 +338,13 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
 
 static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
 {
-   /* This is only called for producing 1/w in pre-gen6 interp.  for
-    * gen6, the interp opcodes don't use this argument.
+   /* This is called for producing 1/w in pre-gen6 interp.  for gen6,
+    * the interp opcodes don't use this argument.  But to keep the
+    * nr_args = 3 expectations of pinterp happy, just stuff delta_xy
+    * into the slot.
     */
    if (c->func.brw->intel.gen >= 6)
-      return src_undef();
+      return c->delta_xy;
 
    if (src_is_undef(c->pixel_w)) {
       struct prog_dst_register pixel_w = get_temp(c);
@@ -373,11 +375,7 @@ static void emit_interp( struct brw_wm_compile *c,
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
    struct prog_src_register deltas;
 
-   if (c->func.brw->intel.gen < 6) {
-      deltas = get_delta_xy(c);
-   } else {
-      deltas = src_undef();
-   }
+   deltas = get_delta_xy(c);
 
    /* Need to use PINTERP on attributes which have been
     * multiplied by 1/W in the SF program, and LINTERP on those
@@ -1133,6 +1131,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 precalc_lit(c, inst);
 	 break;
 
+      case OPCODE_RSQ:
+	 out = emit_scalar_insn(c, inst);
+	 out->SrcReg[0].Abs = GL_TRUE;
+	 break;
+
       case OPCODE_TEX:
 	 precalc_tex(c, inst);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
deleted file mode 100644
index 7fe8ab1f334..00000000000
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ /dev/null
@@ -1,1035 +0,0 @@
-#include "main/macros.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-#include "program/prog_optimize.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
-                                  const struct prog_instruction *inst,
-                                  GLuint component);
-
-/**
- * Determine if the given fragment program uses GLSL features such
- * as flow conditionals, loops, subroutines.
- * Some GLSL shaders may use these features, others might not.
- */
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
-{
-    int i;
-
-    if (unlikely(INTEL_DEBUG & DEBUG_GLSL_FORCE))
-       return GL_TRUE;
-
-    for (i = 0; i < fp->Base.NumInstructions; i++) {
-	const struct prog_instruction *inst = &fp->Base.Instructions[i];
-	switch (inst->Opcode) {
-	    case OPCODE_ARL:
-	    case OPCODE_IF:
-	    case OPCODE_ENDIF:
-	    case OPCODE_CAL:
-	    case OPCODE_BRK:
-	    case OPCODE_RET:
-	    case OPCODE_BGNLOOP:
-		return GL_TRUE; 
-	    default:
-		break;
-	}
-    }
-    return GL_FALSE; 
-}
-
-
-
-static void
-reclaim_temps(struct brw_wm_compile *c);
-
-
-/** Mark GRF register as used. */
-static void
-prealloc_grf(struct brw_wm_compile *c, int r)
-{
-   c->used_grf[r] = GL_TRUE;
-}
-
-
-/** Mark given GRF register as not in use. */
-static void
-release_grf(struct brw_wm_compile *c, int r)
-{
-   /*assert(c->used_grf[r]);*/
-   c->used_grf[r] = GL_FALSE;
-   c->first_free_grf = MIN2(c->first_free_grf, r);
-}
-
-
-/** Return index of a free GRF, mark it as used. */
-static int
-alloc_grf(struct brw_wm_compile *c)
-{
-   GLuint r;
-   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
-      if (!c->used_grf[r]) {
-         c->used_grf[r] = GL_TRUE;
-         c->first_free_grf = r + 1;  /* a guess */
-         return r;
-      }
-   }
-
-   /* no free temps, try to reclaim some */
-   reclaim_temps(c);
-   c->first_free_grf = 0;
-
-   /* try alloc again */
-   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
-      if (!c->used_grf[r]) {
-         c->used_grf[r] = GL_TRUE;
-         c->first_free_grf = r + 1;  /* a guess */
-         return r;
-      }
-   }
-
-   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
-      assert(c->used_grf[r]);
-   }
-
-   /* really, no free GRF regs found */
-   if (!c->out_of_regs) {
-      /* print warning once per compilation */
-      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
-      c->out_of_regs = GL_TRUE;
-   }
-
-   return -1;
-}
-
-
-/** Return number of GRF registers used */
-static int
-num_grf_used(const struct brw_wm_compile *c)
-{
-   int r;
-   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
-      if (c->used_grf[r])
-         return r + 1;
-   return 0;
-}
-
-
-
-/**
- * Record the mapping of a Mesa register to a hardware register.
- */
-static void set_reg(struct brw_wm_compile *c, int file, int index, 
-	int component, struct brw_reg reg)
-{
-    c->wm_regs[file][index][component].reg = reg;
-    c->wm_regs[file][index][component].inited = GL_TRUE;
-}
-
-static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
-{
-    struct brw_reg reg;
-
-    /* if we need to allocate another temp, grow the tmp_regs[] array */
-    if (c->tmp_index == c->tmp_max) {
-       int r = alloc_grf(c);
-       if (r < 0) {
-          /*printf("Out of temps in %s\n", __FUNCTION__);*/
-          r = 50; /* XXX random register! */
-       }
-       c->tmp_regs[ c->tmp_max++ ] = r;
-    }
-
-    /* form the GRF register */
-    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
-    /*printf("alloc_temp %d\n", reg.nr);*/
-    assert(reg.nr < BRW_WM_MAX_GRF);
-    return reg;
-
-}
-
-/**
- * Save current temp register info.
- * There must be a matching call to release_tmps().
- */
-static int mark_tmps(struct brw_wm_compile *c)
-{
-    return c->tmp_index;
-}
-
-static void release_tmps(struct brw_wm_compile *c, int mark)
-{
-    c->tmp_index = mark;
-}
-
-/**
- * Convert Mesa src register to brw register.
- *
- * Since we're running in SOA mode each Mesa register corresponds to four
- * hardware registers.  We allocate the hardware registers as needed here.
- *
- * \param file  register file, one of PROGRAM_x
- * \param index  register number
- * \param component  src component (X=0, Y=1, Z=2, W=3)
- * \param nr  not used?!?
- * \param neg  negate value?
- * \param abs  take absolute value?
- */
-static struct brw_reg 
-get_reg(struct brw_wm_compile *c, int file, int index, int component,
-        int nr, GLuint neg, GLuint abs)
-{
-    struct brw_reg reg;
-    switch (file) {
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_UNIFORM:
-	    file = PROGRAM_STATE_VAR;
-	    break;
-	case PROGRAM_UNDEFINED:
-	    return brw_null_reg();	
-	case PROGRAM_TEMPORARY:
-	case PROGRAM_INPUT:
-	case PROGRAM_OUTPUT:
-	case PROGRAM_PAYLOAD:
-	    break;
-	default:
-	    _mesa_problem(NULL, "Unexpected file in get_reg()");
-	    return brw_null_reg();
-    }
-
-    assert(index < 256);
-    assert(component < 4);
-
-    /* see if we've already allocated a HW register for this Mesa register */
-    if (c->wm_regs[file][index][component].inited) {
-       /* yes, re-use */
-       reg = c->wm_regs[file][index][component].reg;
-    }
-    else {
-	/* no, allocate new register */
-       int grf = alloc_grf(c);
-       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
-       if (grf < 0) {
-          /* totally out of temps */
-          grf = 51; /* XXX random register! */
-       }
-
-       reg = brw_vec8_grf(grf, 0);
-       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
-
-       set_reg(c, file, index, component, reg);
-    }
-
-    if (neg & (1 << component)) {
-	reg = negate(reg);
-    }
-    if (abs)
-	reg = brw_abs(reg);
-    return reg;
-}
-
-
-
-/**
- * This is called if we run out of GRF registers.  Examine the live intervals
- * of temp regs in the program and free those which won't be used again.
- */
-static void
-reclaim_temps(struct brw_wm_compile *c)
-{
-   GLint intBegin[MAX_PROGRAM_TEMPS];
-   GLint intEnd[MAX_PROGRAM_TEMPS];
-   int index;
-
-   /*printf("Reclaim temps:\n");*/
-
-   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
-                             intBegin, intEnd);
-
-   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
-      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
-         /* program temp[i] can be freed */
-         int component;
-         /*printf("  temp[%d] is dead\n", index);*/
-         for (component = 0; component < 4; component++) {
-            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
-               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
-               release_grf(c, r);
-               /*
-               printf("  Reclaim temp %d, reg %d at inst %d\n",
-                      index, r, c->cur_inst);
-               */
-               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
-            }
-         }
-      }
-   }
-}
-
-
-
-
-/**
- * Preallocate registers.  This sets up the Mesa to hardware register
- * mapping for certain registers, such as constants (uniforms/state vars)
- * and shader inputs.
- */
-static void prealloc_reg(struct brw_wm_compile *c)
-{
-    struct intel_context *intel = &c->func.brw->intel;
-    int i, j;
-    struct brw_reg reg;
-    int urb_read_length = 0;
-    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
-    GLuint reg_index = 0;
-
-    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
-    c->first_free_grf = 0;
-
-    for (i = 0; i < 4; i++) {
-	if (i < (c->key.nr_payload_regs + 1) / 2)
-            reg = brw_vec8_grf(i * 2, 0);
-        else
-            reg = brw_vec8_grf(0, 0);
-	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
-    }
-    set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_W, 0,
-	    brw_vec8_grf(c->key.source_w_reg, 0));
-    reg_index += c->key.nr_payload_regs;
-
-    /* constants */
-    {
-        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
-        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
-
-        /* use a real constant buffer, or just use a section of the GRF? */
-        /* XXX this heuristic may need adjustment... */
-        if ((nr_params + nr_temps) * 4 + reg_index > 80) {
-	   for (i = 0; i < nr_params; i++) {
-	      float *pv = c->fp->program.Base.Parameters->ParameterValues[i];
-	      for (j = 0; j < 4; j++) {
-		 c->prog_data.pull_param[c->prog_data.nr_pull_params] = &pv[j];
-		 c->prog_data.nr_pull_params++;
-	      }
-	   }
-
-	   c->prog_data.nr_params = 0;
-	}
-        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
-
-        if (!c->prog_data.nr_pull_params) {
-           const struct gl_program_parameter_list *plist = 
-              c->fp->program.Base.Parameters;
-           int index = 0;
-
-           /* number of float constants in CURBE */
-           c->prog_data.nr_params = 4 * nr_params;
-
-           /* loop over program constants (float[4]) */
-           for (i = 0; i < nr_params; i++) {
-              /* loop over XYZW channels */
-              for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
-                 /* Save pointer to parameter/constant value.
-                  * Constants will be copied in prepare_constant_buffer()
-                  */
-                 c->prog_data.param[index] = &plist->ParameterValues[i][j];
-                 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
-              }
-           }
-           /* number of constant regs used (each reg is float[8]) */
-	   c->nr_creg = ALIGN(nr_params, 2) / 2;
-	   reg_index += c->nr_creg;
-        }
-    }
-
-    /* fragment shader inputs: One 2-reg pair of interpolation
-     * coefficients for each vec4 to be set up.
-     */
-    if (intel->gen >= 6) {
-       for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	  if (!(c->fp->program.Base.InputsRead & BITFIELD64_BIT(i)))
-	     continue;
-
-	  reg = brw_vec8_grf(reg_index, 0);
-	  for (j = 0; j < 4; j++) {
-	     set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	  }
-	  reg_index += 2;
-       }
-       urb_read_length = reg_index;
-    } else {
-       for (i = 0; i < VERT_RESULT_MAX; i++) {
-	  int fp_input;
-
-	  if (i >= VERT_RESULT_VAR0)
-	     fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
-	  else if (i <= VERT_RESULT_TEX7)
-	     fp_input = i;
-	  else
-	     fp_input = -1;
-
-	  if (fp_input >= 0 && inputs & (1 << fp_input)) {
-	     urb_read_length = reg_index;
-	     reg = brw_vec8_grf(reg_index, 0);
-	     for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
-	  }
-	  if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
-	     reg_index += 2;
-	  }
-       }
-    }
-
-    c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
-    c->prog_data.urb_read_length = urb_read_length;
-    c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
-    reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
-    reg_index += 2;
-
-    /* mark GRF regs [0..reg_index-1] as in-use */
-    for (i = 0; i < reg_index; i++)
-       prealloc_grf(c, i);
-
-    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
-    prealloc_grf(c, 126);
-    prealloc_grf(c, 127);
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
-	struct brw_reg dst[4];
-
-	switch (inst->Opcode) {
-	case OPCODE_TEX:
-	case OPCODE_TXB:
-	    /* Allocate the channels of texture results contiguously,
-	     * since they are written out that way by the sampler unit.
-	     */
-	    for (j = 0; j < 4; j++) {
-		dst[j] = get_dst_reg(c, inst, j);
-		if (j != 0)
-		    assert(dst[j].nr == dst[j - 1].nr + 1);
-	    }
-	    break;
-	default:
-	    break;
-	}
-    }
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
-
-	switch (inst->Opcode) {
-	case WM_DELTAXY:
-	    /* Allocate WM_DELTAXY destination on G45/GM45 to an
-	     * even-numbered GRF if possible so that we can use the PLN
-	     * instruction.
-	     */
-	    if (inst->DstReg.WriteMask == WRITEMASK_XY &&
-		!c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
-		!c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
-		(IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
-		int grf;
-
-		for (grf = c->first_free_grf & ~1;
-		     grf < BRW_WM_MAX_GRF;
-		     grf += 2)
-		{
-		    if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
-			c->used_grf[grf] = GL_TRUE;
-			c->used_grf[grf + 1] = GL_TRUE;
-			c->first_free_grf = grf + 2;  /* a guess */
-
-			set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
-				brw_vec8_grf(grf, 0));
-			set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
-				brw_vec8_grf(grf + 1, 0));
-			break;
-		    }
-		}
-	    }
-	default:
-	    break;
-	}
-    }
-
-    /* An instruction may reference up to three constants.
-     * They'll be found in these registers.
-     * XXX alloc these on demand!
-     */
-    if (c->prog_data.nr_pull_params) {
-       for (i = 0; i < 3; i++) {
-          c->current_const[i].index = -1;
-          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
-       }
-    }
-#if 0
-    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
-    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
-#endif
-}
-
-
-/**
- * Check if any of the instruction's src registers are constants, uniforms,
- * or statevars.  If so, fetch any constants that we don't already have in
- * the three GRF slots.
- */
-static void fetch_constants(struct brw_wm_compile *c,
-                            const struct prog_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   GLuint i;
-
-   /* loop over instruction src regs */
-   for (i = 0; i < 3; i++) {
-      const struct prog_src_register *src = &inst->SrcReg[i];
-      if (src->File == PROGRAM_STATE_VAR ||
-          src->File == PROGRAM_CONSTANT ||
-          src->File == PROGRAM_UNIFORM) {
-	 c->current_const[i].index = src->Index;
-
-#if 0
-	 printf("  fetch const[%d] for arg %d into reg %d\n",
-		src->Index, i, c->current_const[i].reg.nr);
-#endif
-
-	 /* need to fetch the constant now */
-	 brw_oword_block_read(p,
-			      c->current_const[i].reg,
-			      brw_message_reg(1),
-			      16 * src->Index,
-			      SURF_INDEX_FRAG_CONST_BUFFER);
-      }
-   }
-}
-
-
-/**
- * Convert Mesa dst register to brw register.
- */
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
-                                  GLuint component)
-{
-    const int nr = 1;
-    return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
-	    0, 0);
-}
-
-
-static struct brw_reg
-get_src_reg_const(struct brw_wm_compile *c,
-                  const struct prog_instruction *inst,
-                  GLuint srcRegIndex, GLuint component)
-{
-   /* We should have already fetched the constant from the constant
-    * buffer in fetch_constants().  Now we just have to return a
-    * register description that extracts the needed component and
-    * smears it across all eight vector components.
-    */
-   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-   struct brw_reg const_reg;
-
-   assert(component < 4);
-   assert(srcRegIndex < 3);
-   assert(c->current_const[srcRegIndex].index != -1);
-   const_reg = c->current_const[srcRegIndex].reg;
-
-   /* extract desired float from the const_reg, and smear */
-   const_reg = stride(const_reg, 0, 1, 0);
-   const_reg.subnr = component * 4;
-
-   if (src->Negate & (1 << component))
-      const_reg = negate(const_reg);
-   if (src->Abs)
-      const_reg = brw_abs(const_reg);
-
-#if 0
-   printf("  form const[%d].%d for arg %d, reg %d\n",
-          c->current_const[srcRegIndex].index,
-          component,
-          srcRegIndex,
-          const_reg.nr);
-#endif
-
-   return const_reg;
-}
-
-
-/**
- * Convert Mesa src register to brw register.
- */
-static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
-                                  GLuint srcRegIndex, GLuint channel)
-{
-    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-    const GLuint nr = 1;
-    const GLuint component = GET_SWZ(src->Swizzle, channel);
-
-    /* Only one immediate value can be used per native opcode, and it
-     * has be in the src1 slot, so not all Mesa instructions will get
-     * to take advantage of immediate constants.
-     */
-    if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
-       const struct gl_program_parameter_list *params;
-
-       params = c->fp->program.Base.Parameters;
-
-       /* Extended swizzle terms */
-       if (component == SWIZZLE_ZERO) {
-	  return brw_imm_f(0.0F);
-       } else if (component == SWIZZLE_ONE) {
-	  if (src->Negate)
-	     return brw_imm_f(-1.0F);
-	  else
-	     return brw_imm_f(1.0F);
-       }
-
-       if (src->File == PROGRAM_CONSTANT) {
-	  float f = params->ParameterValues[src->Index][component];
-
-	  if (src->Abs)
-	     f = fabs(f);
-	  if (src->Negate)
-	     f = -f;
-
-	  return brw_imm_f(f);
-       }
-    }
-
-    if (c->prog_data.nr_pull_params &&
-        (src->File == PROGRAM_STATE_VAR ||
-         src->File == PROGRAM_CONSTANT ||
-         src->File == PROGRAM_UNIFORM)) {
-       return get_src_reg_const(c, inst, srcRegIndex, component);
-    }
-    else {
-       /* other type of source register */
-       return get_reg(c, src->File, src->Index, component, nr, 
-                      src->Negate, src->Abs);
-    }
-}
-
-static void emit_arl(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, addr_reg;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
-                           BRW_ARF_ADDRESS, 0);
-    src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
-    brw_MOV(p, addr_reg, src0);
-    brw_set_saturate(p, 0);
-}
-
-static INLINE struct brw_reg high_words( struct brw_reg reg )
-{
-    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
-		   0, 8, 2 );
-}
-
-static INLINE struct brw_reg low_words( struct brw_reg reg )
-{
-    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
-}
-
-static INLINE struct brw_reg even_bytes( struct brw_reg reg )
-{
-    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
-}
-
-static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
-{
-    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
-		   0, 16, 2 );
-}
-
-/**
- * Resolve subroutine calls after code emit is done.
- */
-static void post_wm_emit( struct brw_wm_compile *c )
-{
-    brw_resolve_cals(&c->func);
-}
-
-static void
-get_argument_regs(struct brw_wm_compile *c,
-		  const struct prog_instruction *inst,
-		  int index,
-		  struct brw_reg *dst,
-		  struct brw_reg *regs,
-		  int mask)
-{
-    struct brw_compile *p = &c->func;
-    int i, j;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1 << i)) {
-	    regs[i] = get_src_reg(c, inst, index, i);
-
-	    /* Unalias destination registers from our sources. */
-	    if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
-	       for (j = 0; j < 4; j++) {
-		   if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
-		       struct brw_reg tmp = alloc_tmp(c);
-		       brw_MOV(p, tmp, regs[i]);
-		       regs[i] = tmp;
-		       break;
-		   }
-	       }
-	    }
-	}
-    }
-}
-
-static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
-{
-   struct intel_context *intel = &brw->intel;
-#define MAX_IF_DEPTH 32
-#define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
-    int if_depth_in_loop[MAX_LOOP_DEPTH];
-    GLuint i, if_depth = 0, loop_depth = 0;
-    struct brw_compile *p = &c->func;
-    struct brw_indirect stack_index = brw_indirect(0, 0);
-
-    c->out_of_regs = GL_FALSE;
-
-    if_depth_in_loop[loop_depth] = 0;
-
-    prealloc_reg(c);
-    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
-
-    if (intel->gen >= 6)
-	brw_set_acc_write_control(p, 1);
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-        const struct prog_instruction *inst = &c->prog_instructions[i];
-	int dst_flags;
-	struct brw_reg args[3][4], dst[4];
-	int j;
-	int mark = mark_tmps( c );
-
-        c->cur_inst = i;
-
-#if 0
-        printf("Inst %d: ", i);
-        _mesa_print_instruction(inst);
-#endif
-
-        /* fetch any constants that this instruction needs */
-        if (c->prog_data.nr_pull_params)
-           fetch_constants(c, inst);
-
-	if (inst->Opcode != OPCODE_ARL) {
-	   for (j = 0; j < 4; j++) {
-	      if (inst->DstReg.WriteMask & (1 << j))
-		 dst[j] = get_dst_reg(c, inst, j);
-	      else
-		 dst[j] = brw_null_reg();
-	   }
-	}
-	for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
-	    get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
-
-	dst_flags = inst->DstReg.WriteMask;
-	if (inst->SaturateMode == SATURATE_ZERO_ONE)
-	    dst_flags |= SATURATE;
-
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
-
-	switch (inst->Opcode) {
-	    case WM_PIXELXY:
-		emit_pixel_xy(c, dst, dst_flags);
-		break;
-	    case WM_DELTAXY: 
-		emit_delta_xy(p, dst, dst_flags, args[0]);
-		break;
-	    case WM_PIXELW:
-		emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
-		break;	
-	    case WM_LINTERP:
-		emit_linterp(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case WM_PINTERP:
-		emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case WM_CINTERP:
-		emit_cinterp(p, dst, dst_flags, args[0]);
-		break;
-	    case WM_WPOSXY:
-		emit_wpos_xy(c, dst, dst_flags, args[0]);
-		break;
-	    case WM_FB_WRITE:
-		emit_fb_write(c, args[0], args[1], args[2],
-			      INST_AUX_GET_TARGET(inst->Aux),
-			      inst->Aux & INST_AUX_EOT);
-		break;
-	    case WM_FRONTFACING:
-		emit_frontfacing(p, dst, dst_flags);
-		break;
-	    case OPCODE_ADD:
-		emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_ARL:
-		emit_arl(c, inst);
-		break;
-	    case OPCODE_FRC:
-		emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_FLR:
-		emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_LRP:
-		emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_TRUNC:
-		emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_MOV:
-	    case OPCODE_SWZ:
-		emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_DP2:
-		emit_dp2(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DP3:
-		emit_dp3(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DP4:
-		emit_dp4(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_XPD:
-		emit_xpd(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DPH:
-		emit_dph(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_RCP:
-		emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_RSQ:
-		emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_SIN:
-		emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_COS:
-		emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_EX2:
-		emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_LG2:
-		emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_CMP:
-		emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_MIN:	
-		emit_min(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_MAX:	
-		emit_max(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DDX:
-	    case OPCODE_DDY:
-		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
-			  args[0]);
-                break;
-	    case OPCODE_SLT:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_L, args[0], args[1]);
-		break;
-	    case OPCODE_SLE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_LE, args[0], args[1]);
-		break;
-	    case OPCODE_SGT:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_G, args[0], args[1]);
-		break;
-	    case OPCODE_SGE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_GE, args[0], args[1]);
-		break;
-	    case OPCODE_SEQ:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_EQ, args[0], args[1]);
-		break;
-	    case OPCODE_SNE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_NEQ, args[0], args[1]);
-		break;
-	    case OPCODE_SSG:
-		emit_sign(p, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_MUL:
-		emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_POW:
-		emit_math2(c, BRW_MATH_FUNCTION_POW,
-			   dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_MAD:
-		emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_TEX:
-		emit_tex(c, dst, dst_flags, args[0],
-			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
-				 0, 1, 0, 0),
-			 inst->TexSrcTarget,
-			 inst->TexSrcUnit,
-			 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
-		break;
-	    case OPCODE_TXB:
-		emit_txb(c, dst, dst_flags, args[0],
-			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
-				 0, 1, 0, 0),
-			 inst->TexSrcTarget,
-			 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
-		break;
-	    case OPCODE_KIL_NV:
-		emit_kil_nv(c);
-		break;
-	    case OPCODE_IF:
-		assert(if_depth < MAX_IF_DEPTH);
-		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
-		if_depth_in_loop[loop_depth]++;
-		break;
-	    case OPCODE_ELSE:
-		assert(if_depth > 0);
-		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
-		break;
-	    case OPCODE_ENDIF:
-		assert(if_depth > 0);
-		brw_ENDIF(p, if_inst[--if_depth]);
-		if_depth_in_loop[loop_depth]--;
-		break;
-	    case OPCODE_BGNSUB:
-		brw_save_label(p, inst->Comment, p->nr_insn);
-		break;
-	    case OPCODE_ENDSUB:
-		/* no-op */
-		break;
-	    case OPCODE_CAL: 
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-                brw_ADD(p, get_addr_reg(stack_index),
-                         get_addr_reg(stack_index), brw_imm_d(4));
-		brw_save_call(&c->func, inst->Comment, p->nr_insn);
-                brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-                brw_pop_insn_state(p);
-		break;
-
-	    case OPCODE_RET:
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_ADD(p, get_addr_reg(stack_index),
-                        get_addr_reg(stack_index), brw_imm_d(-4));
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-		brw_pop_insn_state(p);
-
-		break;
-	    case OPCODE_BGNLOOP:
-                /* XXX may need to invalidate the current_constant regs */
-		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
-		if_depth_in_loop[loop_depth] = 0;
-		break;
-	    case OPCODE_BRK:
-		brw_BREAK(p, if_depth_in_loop[loop_depth]);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_CONT:
-		brw_CONT(p, if_depth_in_loop[loop_depth]);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_ENDLOOP: 
-               {
-                  struct brw_instruction *inst0, *inst1;
-                  GLuint br = 1;
-
-                  if (intel->gen == 5)
-                     br = 2;
-
-		  assert(loop_depth > 0);
-                  loop_depth--;
-                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-                  /* patch all the BREAK/CONT instructions from last BGNLOOP */
-                  while (inst0 > loop_inst[loop_depth]) {
-                     inst0--;
-                     if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-			 inst0->bits3.if_else.jump_count == 0) {
-			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-                     }
-                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			      inst0->bits3.if_else.jump_count == 0) {
-                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-                     }
-                  }
-               }
-               break;
-	    default:
-		printf("unsupported opcode %d (%s) in fragment shader\n",
-		       inst->Opcode, inst->Opcode < MAX_OPCODE ?
-		       _mesa_opcode_string(inst->Opcode) : "unknown");
-	}
-
-	/* Release temporaries containing any unaliased source regs. */
-	release_tmps( c, mark );
-
-	if (inst->CondUpdate)
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	else
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    }
-    post_wm_emit(c);
-
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
-	 brw_disasm(stdout, &p->store[i], intel->gen);
-      printf("\n");
-    }
-}
-
-/**
- * Do GPU code generation for shaders that use GLSL features such as
- * flow control.  Other shaders will be compiled with the 
- */
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
-{
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        printf("brw_wm_glsl_emit:\n");
-    }
-
-    /* initial instruction translation/simplification */
-    brw_wm_pass_fp(c);
-
-    /* actual code generation */
-    brw_wm_emit_glsl(brw, c);
-
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        brw_wm_print_program(c, "brw_wm_glsl_emit done");
-    }
-
-    c->prog_data.total_grf = num_grf_used(c);
-    c->prog_data.total_scratch = 0;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index 62e556698ba..471ea1c18d6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -120,14 +120,14 @@ const struct {
  * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
  * \param lookup  bitmask of IZ_* flags
  */
-void brw_wm_lookup_iz( struct intel_context *intel,
-		       GLuint line_aa,
-		       GLuint lookup,
-		       GLboolean ps_uses_depth,
-		       struct brw_wm_prog_key *key )
+void brw_wm_lookup_iz(struct intel_context *intel,
+		      struct brw_wm_compile *c)
 {
    GLuint reg = 2;
    GLboolean kill_stats_promoted_workaround = GL_FALSE;
+   int lookup = c->key.iz_lookup;
+   bool uses_depth = (c->fp->program.Base.InputsRead &
+		      (1 << FRAG_ATTRIB_WPOS)) != 0;
 
    assert (lookup < IZ_BIT_MAX);
 
@@ -136,36 +136,36 @@ void brw_wm_lookup_iz( struct intel_context *intel,
     * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
     * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
     */
-   if (intel->stats_wm &&
+   if (c->key.stats_wm &&
        (lookup & IZ_PS_KILL_ALPHATEST_BIT) &&
        wm_iz_table[lookup].mode == P) {
       kill_stats_promoted_workaround = GL_TRUE;
    }
 
    if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
-      key->computes_depth = 1;
+      c->computes_depth = 1;
 
-   if (wm_iz_table[lookup].sd_present || ps_uses_depth ||
+   if (wm_iz_table[lookup].sd_present || uses_depth ||
        kill_stats_promoted_workaround) {
-      key->source_depth_reg = reg;
+      c->source_depth_reg = reg;
       reg += 2;
    }
 
    if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
-      key->source_depth_to_render_target = 1;
+      c->source_depth_to_render_target = 1;
 
-   if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) {
-      key->aa_dest_stencil_reg = reg;
-      key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
-				      line_aa == AA_SOMETIMES);
+   if (wm_iz_table[lookup].ds_present || c->key.line_aa != AA_NEVER) {
+      c->aa_dest_stencil_reg = reg;
+      c->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
+				    c->key.line_aa == AA_SOMETIMES);
       reg++;
    }
 
    if (wm_iz_table[lookup].dd_present) {
-      key->dest_depth_reg = reg;
+      c->dest_depth_reg = reg;
       reg+=2;
    }
 
-   key->nr_payload_regs = reg;
+   c->nr_payload_regs = reg;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 83152526b3a..f78bdc31866 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -380,7 +380,7 @@ static void pass0_init_payload( struct brw_wm_compile *c )
    GLuint i;
 
    for (i = 0; i < 4; i++) {
-      GLuint j = i >= (c->key.nr_payload_regs + 1) / 2 ? 0 : i;
+      GLuint j = i >= (c->nr_payload_regs + 1) / 2 ? 0 : i;
       pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, 
 			     &c->payload.depth[j] );
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index 3a2874b6ddf..7d6a3fa9f12 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -128,8 +128,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       if (inst->opcode == WM_FB_WRITE) {
 	 track_arg(c, inst, 0, WRITEMASK_XYZW); 
 	 track_arg(c, inst, 1, WRITEMASK_XYZW); 
-	 if (c->key.source_depth_to_render_target &&
-	     c->key.computes_depth)
+	 if (c->source_depth_to_render_target && c->computes_depth)
 	    track_arg(c, inst, 2, WRITEMASK_Z); 
 	 else
 	    track_arg(c, inst, 2, 0); 
@@ -281,7 +280,6 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 
       case OPCODE_DST:
       case WM_FRONTFACING:
-      case OPCODE_KIL_NV:
       default:
 	 break;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 44e39538145..8c2b9e7020b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -69,6 +69,8 @@ static void prealloc_reg(struct brw_wm_compile *c,
  */
 static void init_registers( struct brw_wm_compile *c )
 {
+   struct brw_context *brw = c->func.brw;
+   struct intel_context *intel = &brw->intel;
    GLuint nr_interp_regs = 0;
    GLuint i = 0;
    GLuint j;
@@ -76,32 +78,41 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->grf_limit; j++) 
       c->pass2_grf[j].nextuse = BRW_WM_MAX_INSN;
 
-   for (j = 0; j < (c->key.nr_payload_regs + 1) / 2; j++)
+   for (j = 0; j < (c->nr_payload_regs + 1) / 2; j++)
       prealloc_reg(c, &c->payload.depth[j], i++);
 
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < VERT_RESULT_MAX; j++) {
-      if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
-	 int fp_index;
-
-	 if (j >= VERT_RESULT_VAR0)
-	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
-	 else if (j <= VERT_RESULT_TEX7)
-	    fp_index = j;
-	 else
-	    fp_index = -1;
-
-	 nr_interp_regs++;
-	 if (fp_index >= 0)
-	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
+   if (intel->gen >= 6) {
+      for (unsigned int j = 0; j < FRAG_ATTRIB_MAX; j++) {
+	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(j)) {
+	    nr_interp_regs++;
+	    prealloc_reg(c, &c->payload.input_interp[j], i++);
+	 }
+      }
+   } else {
+      for (j = 0; j < VERT_RESULT_MAX; j++) {
+	 if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
+	    int fp_index;
+
+	    if (j >= VERT_RESULT_VAR0)
+	       fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+	    else if (j <= VERT_RESULT_TEX7)
+	       fp_index = j;
+	    else
+	       fp_index = -1;
+
+	    nr_interp_regs++;
+	    if (fp_index >= 0)
+	       prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
+	 }
       }
+      assert(nr_interp_regs >= 1);
    }
 
-   assert(nr_interp_regs >= 1);
 
-   c->prog_data.first_curbe_grf = ALIGN(c->key.nr_payload_regs, 2);
+   c->prog_data.first_curbe_grf = ALIGN(c->nr_payload_regs, 2);
    c->prog_data.urb_read_length = nr_interp_regs * 2;
    c->prog_data.curb_read_length = c->nr_creg * 2;
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index fea96d35381..e7c97a1cb05 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -69,12 +69,43 @@ static GLuint translate_wrap_mode( GLenum wrap )
 static drm_intel_bo *upload_default_color( struct brw_context *brw,
 				     const GLfloat *color )
 {
-   struct brw_sampler_default_color sdc;
+   struct intel_context *intel = &brw->intel;
 
-   COPY_4V(sdc.color, color); 
-   
-   return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			 &sdc, sizeof(sdc));
+   if (intel->gen >= 5) {
+      struct gen5_sampler_default_color sdc;
+
+      memset(&sdc, 0, sizeof(sdc));
+
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[0], color[0]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[1], color[1]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[2], color[2]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[3], color[3]);
+
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[0], color[0]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[1], color[1]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[2], color[2]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[3], color[3]);
+
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[0], color[0]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[1], color[1]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[2], color[2]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[3], color[3]);
+
+      /* XXX: Fill in half floats */
+      /* XXX: Fill in signed bytes */
+
+      COPY_4V(sdc.f, color);
+
+      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			    &sdc, sizeof(sdc));
+   } else {
+      struct brw_sampler_default_color sdc;
+
+      COPY_4V(sdc.color, color);
+
+      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			    &sdc, sizeof(sdc));
+   }
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 76de7b7b6f6..e9ef635bca2 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -87,7 +87,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
    struct gl_context *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
-   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
    struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
@@ -132,7 +131,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* _NEW_COLOR */
    key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = bfp->isGLSL;
 
    /* If using the fragment shader backend, the program is always
     * 8-wide.
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 76fc94df1f6..ad744044c70 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -139,6 +139,8 @@ static GLuint translate_tex_format( gl_format mesa_format,
 	  return BRW_SURFACEFORMAT_I16_UNORM;
       else if (depth_mode == GL_ALPHA)
 	  return BRW_SURFACEFORMAT_A16_UNORM;
+      else if (depth_mode == GL_RED)
+	  return BRW_SURFACEFORMAT_R16_UNORM;
       else
 	  return BRW_SURFACEFORMAT_L16_UNORM;
 
@@ -174,6 +176,8 @@ static GLuint translate_tex_format( gl_format mesa_format,
          return BRW_SURFACEFORMAT_I24X8_UNORM;
       else if (depth_mode == GL_ALPHA)
          return BRW_SURFACEFORMAT_A24X8_UNORM;
+      else if (depth_mode == GL_RED)
+         return BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS;
       else
          return BRW_SURFACEFORMAT_L24X8_UNORM;
 
@@ -274,6 +278,7 @@ brw_create_constant_surface(struct brw_context *brw,
 			    drm_intel_bo **out_bo,
 			    uint32_t *out_offset)
 {
+   struct intel_context *intel = &brw->intel;
    const GLint w = width - 1;
    struct brw_surface_state surf;
    void *map;
@@ -284,6 +289,9 @@ brw_create_constant_surface(struct brw_context *brw,
    surf.ss0.surface_type = BRW_SURFACE_BUFFER;
    surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
 
+   if (intel->gen >= 6)
+      surf.ss0.render_cache_read_write = 1;
+
    assert(bo);
    surf.ss1.base_addr = bo->offset; /* reloc */
 
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 800a2555214..c2631a7b4df 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -35,6 +35,7 @@
 struct gen6_blend_state_key {
    GLboolean color_blend, alpha_enabled;
    GLboolean dither;
+   GLboolean color_mask[BRW_MAX_DRAW_BUFFERS][4];
 
    GLenum logic_op;
 
@@ -54,6 +55,9 @@ blend_state_populate_key(struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* _NEW_COLOR */
+   memcpy(key->color_mask, ctx->Color.ColorMask, sizeof(key->color_mask));
+
+   /* _NEW_COLOR */
    if (ctx->Color._LogicOpEnabled)
       key->logic_op = ctx->Color.LogicOp;
    else
@@ -87,54 +91,62 @@ static drm_intel_bo *
 blend_state_create_from_key(struct brw_context *brw,
 			    struct gen6_blend_state_key *key)
 {
-   struct gen6_blend_state blend;
+   struct gen6_blend_state blend[BRW_MAX_DRAW_BUFFERS];
    drm_intel_bo *bo;
+   int b;
 
    memset(&blend, 0, sizeof(blend));
 
-   if (key->logic_op != GL_COPY) {
-      blend.blend1.logic_op_enable = 1;
-      blend.blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
+   for (b = 0; b < BRW_MAX_DRAW_BUFFERS; b++) {
+      if (key->logic_op != GL_COPY) {
+	 blend[b].blend1.logic_op_enable = 1;
+	 blend[b].blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
+      } else if (key->color_blend & (1 << b)) {
+	 GLenum eqRGB = key->blend_eq_rgb;
+	 GLenum eqA = key->blend_eq_a;
+	 GLenum srcRGB = key->blend_src_rgb;
+	 GLenum dstRGB = key->blend_dst_rgb;
+	 GLenum srcA = key->blend_src_a;
+	 GLenum dstA = key->blend_dst_a;
+
+	 if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
+	    srcRGB = dstRGB = GL_ONE;
+	 }
+
+	 if (eqA == GL_MIN || eqA == GL_MAX) {
+	    srcA = dstA = GL_ONE;
+	 }
+
+	 blend[b].blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+	 blend[b].blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
+	 blend[b].blend0.blend_func = brw_translate_blend_equation(eqRGB);
+
+	 blend[b].blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+	 blend[b].blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
+	 blend[b].blend0.ia_blend_func = brw_translate_blend_equation(eqA);
+
+	 blend[b].blend0.blend_enable = 1;
+	 blend[b].blend0.ia_blend_enable = (srcA != srcRGB ||
+					 dstA != dstRGB ||
+					 eqA != eqRGB);
       }
 
-      blend.blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      blend.blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
-      blend.blend0.blend_func = brw_translate_blend_equation(eqRGB);
-
-      blend.blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      blend.blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
-      blend.blend0.ia_blend_func = brw_translate_blend_equation(eqA);
+      if (key->alpha_enabled) {
+	 blend[b].blend1.alpha_test_enable = 1;
+	 blend[b].blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
 
-      blend.blend0.blend_enable = 1;
-      blend.blend0.ia_blend_enable = (srcA != srcRGB ||
-				      dstA != dstRGB ||
-				      eqA != eqRGB);
-   }
-
-   if (key->alpha_enabled) {
-      blend.blend1.alpha_test_enable = 1;
-      blend.blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      }
 
-   }
+      if (key->dither) {
+	 blend[b].blend1.dither_enable = 1;
+	 blend[b].blend1.y_dither_offset = 0;
+	 blend[b].blend1.x_dither_offset = 0;
+      }
 
-   if (key->dither) {
-      blend.blend1.dither_enable = 1;
-      blend.blend1.y_dither_offset = 0;
-      blend.blend1.x_dither_offset = 0;
+      blend[b].blend1.write_disable_r = !key->color_mask[b][0];
+      blend[b].blend1.write_disable_g = !key->color_mask[b][1];
+      blend[b].blend1.write_disable_b = !key->color_mask[b][2];
+      blend[b].blend1.write_disable_a = !key->color_mask[b][3];
    }
 
    bo = brw_upload_cache(&brw->cache, BRW_BLEND_STATE,
@@ -172,7 +184,7 @@ const struct brw_tracked_state gen6_blend_state = {
 };
 
 struct gen6_color_calc_state_key {
-   GLubyte blend_constant_color[4];
+   float blend_constant_color[4];
    GLclampf alpha_ref;
    GLubyte stencil_ref[2];
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index c65b41e2b6b..c7c4eb1f27d 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -64,7 +64,9 @@ upload_clip_state(struct brw_context *brw)
 	     userclip << GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT |
 	     depth_clamp |
 	     provoking);
-   OUT_BATCH(GEN6_CLIP_FORCE_ZERO_RTAINDEX);
+   OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
+             U_FIXED(225.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
+             GEN6_CLIP_FORCE_ZERO_RTAINDEX);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 471067e8f02..45c148baedd 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -33,9 +33,10 @@
 #include "intel_batchbuffer.h"
 
 static uint32_t
-get_attr_override(struct brw_context *brw, int fs_attr)
+get_attr_override(struct brw_context *brw, int fs_attr, int two_side_color)
 {
    int attr_index = 0, i, vs_attr;
+   int bfc = 0;
 
    if (fs_attr <= FRAG_ATTRIB_TEX7)
       vs_attr = fs_attr;
@@ -57,6 +58,30 @@ get_attr_override(struct brw_context *brw, int fs_attr)
 	 attr_index++;
    }
 
+   assert(attr_index < 32);
+
+   if (two_side_color) {
+       if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
+           (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
+           assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
+           assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
+           bfc = 2;
+       } else if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
+                (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
+           bfc = 1;
+   }
+
+   if (bfc && (fs_attr <= FRAG_ATTRIB_TEX7 && fs_attr > FRAG_ATTRIB_WPOS)) {
+       if (fs_attr == FRAG_ATTRIB_COL0)
+           attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
+       else if (fs_attr == FRAG_ATTRIB_COL1 && bfc == 2) {
+           attr_index++;
+           attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
+       } else {
+           attr_index += bfc;
+       }
+   }
+
    return attr_index;
 }
 
@@ -67,13 +92,15 @@ upload_sf_state(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    /* CACHE_NEW_VS_PROG */
    uint32_t num_inputs = brw_count_bits(brw->vs.prog_data->outputs_written);
+   /* BRW_NEW_FRAGMENT_PROGRAM */
    uint32_t num_outputs = brw_count_bits(brw->fragment_program->Base.InputsRead);
-   uint32_t dw1, dw2, dw3, dw4, dw16;
+   uint32_t dw1, dw2, dw3, dw4, dw16, dw17;
    int i;
    /* _NEW_BUFFER */
    GLboolean render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
    int attr = 0;
    int urb_start;
+   int two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_TRANSFORM */
    if (ctx->Transform.ClipPlanesEnabled)
@@ -91,6 +118,7 @@ upload_sf_state(struct brw_context *brw)
    dw3 = 0;
    dw4 = 0;
    dw16 = 0;
+   dw17 = 0;
 
    /* _NEW_POLYGON */
    if ((ctx->Polygon.FrontFace == GL_CCW) ^ render_to_fbo)
@@ -99,6 +127,48 @@ upload_sf_state(struct brw_context *brw)
    if (ctx->Polygon.OffsetFill)
        dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID;
 
+   if (ctx->Polygon.OffsetLine)
+       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME;
+
+   if (ctx->Polygon.OffsetPoint)
+       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT;
+
+   switch (ctx->Polygon.FrontMode) {
+   case GL_FILL:
+       dw2 |= GEN6_SF_FRONT_SOLID;
+       break;
+
+   case GL_LINE:
+       dw2 |= GEN6_SF_FRONT_WIREFRAME;
+       break;
+
+   case GL_POINT:
+       dw2 |= GEN6_SF_FRONT_POINT;
+       break;
+
+   default:
+       assert(0);
+       break;
+   }
+
+   switch (ctx->Polygon.BackMode) {
+   case GL_FILL:
+       dw2 |= GEN6_SF_BACK_SOLID;
+       break;
+
+   case GL_LINE:
+       dw2 |= GEN6_SF_BACK_WIREFRAME;
+       break;
+
+   case GL_POINT:
+       dw2 |= GEN6_SF_BACK_POINT;
+       break;
+
+   default:
+       assert(0);
+       break;
+   }
+
    /* _NEW_SCISSOR */
    if (ctx->Scissor.Enabled)
       dw3 |= GEN6_SF_SCISSOR_ENABLE;
@@ -160,6 +230,12 @@ upload_sf_state(struct brw_context *brw)
        }
    }
 
+   /* flat shading */
+   if (ctx->Light.ShadeModel == GL_FLAT) {
+       dw17 |= ((brw->fragment_program->Base.InputsRead & (FRAG_BIT_COL0 | FRAG_BIT_COL1)) >>
+                ((brw->fragment_program->Base.InputsRead & FRAG_BIT_WPOS) ? 0 : 1));
+   }
+
    BEGIN_BATCH(20);
    OUT_BATCH(CMD_3D_SF_STATE << 16 | (20 - 2));
    OUT_BATCH(dw1);
@@ -174,7 +250,7 @@ upload_sf_state(struct brw_context *brw)
 
       for (; attr < 64; attr++) {
 	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) {
-	    attr_overrides |= get_attr_override(brw, attr);
+	    attr_overrides |= get_attr_override(brw, attr, two_side_color);
 	    attr++;
 	    break;
 	 }
@@ -182,7 +258,7 @@ upload_sf_state(struct brw_context *brw)
 
       for (; attr < 64; attr++) {
 	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) {
-	    attr_overrides |= get_attr_override(brw, attr) << 16;
+	    attr_overrides |= get_attr_override(brw, attr, two_side_color) << 16;
 	    attr++;
 	    break;
 	 }
@@ -190,7 +266,7 @@ upload_sf_state(struct brw_context *brw)
       OUT_BATCH(attr_overrides);
    }
    OUT_BATCH(dw16); /* point sprite texcoord bitmask */
-   OUT_BATCH(0); /* constant interp bitmask */
+   OUT_BATCH(dw17); /* constant interp bitmask */
    OUT_BATCH(0); /* wrapshortest enables 0-7 */
    OUT_BATCH(0); /* wrapshortest enables 8-15 */
    ADVANCE_BATCH();
@@ -205,7 +281,8 @@ const struct brw_tracked_state gen6_sf_state = {
 		_NEW_BUFFERS |
 		_NEW_POINT |
 		_NEW_TRANSFORM),
-      .brw   = BRW_NEW_CONTEXT,
+      .brw   = (BRW_NEW_CONTEXT |
+		BRW_NEW_FRAGMENT_PROGRAM),
       .cache = CACHE_NEW_VS_PROG
    },
    .emit = upload_sf_state,
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index a34123478fb..de97fd3783d 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -72,7 +72,7 @@ const struct brw_tracked_state gen6_urb = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_CONTEXT,
-      .cache = CACHE_NEW_VS_PROG,
+      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
    },
    .prepare = prepare_urb,
    .emit = upload_urb,
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index e94d0c0ddbb..4ef9e2e6072 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -54,7 +54,7 @@ upload_vs_state(struct brw_context *brw)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
-      int params_uploaded = 0;
+      int params_uploaded = 0, param_regs;
       float *param;
 
       if (brw->vertex_program->IsNVProgram)
@@ -88,20 +88,11 @@ upload_vs_state(struct brw_context *brw)
 	 params_uploaded++;
       }
 
-      if (vp->use_const_buffer) {
-	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	    if (brw->vs.constant_map[i] != -1) {
-	       memcpy(param + brw->vs.constant_map[i] * 4,
-		      vp->program.Base.Parameters->ParameterValues[i],
-		      4 * sizeof(float));
-	       params_uploaded++;
-	    }
-	 }
-      } else {
-	 for (i = 0; i < nr_params; i++) {
-	    memcpy(param, vp->program.Base.Parameters->ParameterValues[i],
+      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	 if (brw->vs.constant_map[i] != -1) {
+	    memcpy(param + brw->vs.constant_map[i] * 4,
+		   vp->program.Base.Parameters->ParameterValues[i],
 		   4 * sizeof(float));
-	    param += 4;
 	    params_uploaded++;
 	 }
       }
@@ -117,13 +108,16 @@ upload_vs_state(struct brw_context *brw)
 
       drm_intel_gem_bo_unmap_gtt(constant_bo);
 
+      param_regs = (params_uploaded + 1) / 2;
+      assert(param_regs <= 32);
+
       BEGIN_BATCH(5);
       OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 |
 		GEN6_CONSTANT_BUFFER_0_ENABLE |
 		(5 - 2));
       OUT_RELOC(constant_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(params_uploaded, 2) / 2 - 1);
+		param_regs - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index ea5418bacf1..d80df4e254b 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -66,6 +66,21 @@ prepare_wm_constants(struct brw_context *brw)
 	 constants[i] = convert_param(brw->wm.prog_data->param_convert[i],
 				      *brw->wm.prog_data->param[i]);
       }
+
+      if (0) {
+	 printf("WM constants:\n");
+	 for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
+	    if ((i & 7) == 0)
+	       printf("g%d: ", brw->wm.prog_data->first_curbe_grf + i / 8);
+	    printf("%8f ", constants[i]);
+	    if ((i & 7) == 7)
+	       printf("\n");
+	 }
+	 if ((i & 7) != 0)
+	    printf("\n");
+	 printf("\n");
+      }
+
       drm_intel_gem_bo_unmap_gtt(brw->wm.push_const_bo);
    }
 }
@@ -88,6 +103,7 @@ upload_wm_state(struct brw_context *brw)
       brw_fragment_program_const(brw->fragment_program);
    uint32_t dw2, dw4, dw5, dw6;
 
+   /* CACHE_NEW_WM_PROG */
    if (brw->wm.prog_data->nr_params == 0) {
       /* Disable the push constant buffers. */
       BEGIN_BATCH(5);
@@ -104,7 +120,8 @@ upload_wm_state(struct brw_context *brw)
 		(5 - 2));
       OUT_RELOC(brw->wm.push_const_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(brw->wm.prog_data->nr_params, 8) / 8 - 1);
+		ALIGN(brw->wm.prog_data->nr_params,
+		      brw->wm.prog_data->dispatch_width) / 8 - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -126,8 +143,8 @@ upload_wm_state(struct brw_context *brw)
 
    dw5 |= (40 - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   if (fp->isGLSL)
+   /* CACHE_NEW_WM_PROG */
+   if (brw->wm.prog_data->dispatch_width == 8)
       dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
    else
       dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
@@ -176,13 +193,14 @@ upload_wm_state(struct brw_context *brw)
 const struct brw_tracked_state gen6_wm_state = {
    .dirty = {
       .mesa  = (_NEW_LINE | _NEW_POLYGONSTIPPLE | _NEW_COLOR | _NEW_BUFFERS |
-		_NEW_PROGRAM_CONSTANTS),
+		_NEW_PROGRAM_CONSTANTS | _NEW_POLYGON),
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_FRAGMENT_PROGRAM |
                 BRW_NEW_NR_WM_SURFACES |
 		BRW_NEW_URB_FENCE |
 		BRW_NEW_BATCH),
-      .cache = CACHE_NEW_SAMPLER
+      .cache = (CACHE_NEW_SAMPLER |
+		CACHE_NEW_WM_PROG)
    },
    .emit = upload_wm_state,
 };
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 4b498f8c5b2..21fc9ece886 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -92,7 +92,7 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
 
    batch->ptr = NULL;
 
-   if (!intel->no_hw) {
+   if (!intel->intelScreen->no_hw) {
       drm_intel_bo_exec(batch->buf, used, NULL, 0,
 			(x_off & 0xffff) | (y_off << 16));
    }
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 152cdcaf37d..9c222c7b485 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -519,7 +519,6 @@ static const struct dri_debug_control debug_control[] = {
    { "sing",  DEBUG_SINGLE_THREAD },
    { "thre",  DEBUG_SINGLE_THREAD },
    { "wm",    DEBUG_WM },
-   { "glsl_force", DEBUG_GLSL_FORCE },
    { "urb",   DEBUG_URB },
    { "vs",    DEBUG_VS },
    { "clip",  DEBUG_CLIP },
@@ -800,11 +799,6 @@ intelInitContext(struct intel_context *intel,
    if (INTEL_DEBUG & DEBUG_BUFMGR)
       dri_bufmgr_set_debug(intel->bufmgr, GL_TRUE);
 
-   /* XXX force SIMD8 kernel for Sandybridge before we fixed
-      SIMD16 interpolation. */
-   if (intel->gen == 6)
-       INTEL_DEBUG |= DEBUG_GLSL_FORCE;
-
    intel->batch = intel_batchbuffer_alloc(intel);
 
    intel_fbo_init(intel);
@@ -838,11 +832,6 @@ intelInitContext(struct intel_context *intel,
       intel->always_flush_cache = 1;
    }
 
-   /* Disable all hardware rendering (skip emitting batches and fences/waits
-    * to the kernel)
-    */
-   intel->no_hw = getenv("INTEL_NO_HW") != NULL;
-
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 9d5139c0000..96493c0f2bb 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -207,7 +207,6 @@ struct intel_context
    GLboolean hw_stipple;
    GLboolean depth_buffer_is_float;
    GLboolean no_rast;
-   GLboolean no_hw;
    GLboolean always_flush_batch;
    GLboolean always_flush_cache;
 
@@ -362,7 +361,6 @@ extern int INTEL_DEBUG;
 #define DEBUG_WM        0x800000
 #define DEBUG_URB       0x1000000
 #define DEBUG_VS        0x2000000
-#define DEBUG_GLSL_FORCE 0x4000000
 #define DEBUG_CLIP      0x8000000
 
 #define DBG(...) do {						\
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 862a13d2ea5..18e796a1186 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -465,10 +465,12 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to XGBA8 texture OK\n");
    }
+#ifndef I915
    else if (texImage->TexFormat == MESA_FORMAT_SARGB8) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to SARGB8 texture OK\n");
    }
+#endif
    else if (texImage->TexFormat == MESA_FORMAT_RGB565) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to RGB5 texture OK\n");
@@ -481,6 +483,7 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to ARGB4444 texture OK\n");
    }
+#ifndef I915
    else if (texImage->TexFormat == MESA_FORMAT_A8) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to A8 texture OK\n");
@@ -501,6 +504,7 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
       irb->Base.DataType = GL_UNSIGNED_SHORT;
       DBG("Render to RG88 texture OK\n");
    }
+#endif
    else if (texImage->TexFormat == MESA_FORMAT_Z16) {
       irb->Base.DataType = GL_UNSIGNED_SHORT;
       DBG("Render to DEPTH16 texture OK\n");
@@ -710,15 +714,17 @@ intel_validate_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
       switch (irb->Base.Format) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_XRGB8888:
-      case MESA_FORMAT_SARGB8:
       case MESA_FORMAT_RGB565:
       case MESA_FORMAT_ARGB1555:
       case MESA_FORMAT_ARGB4444:
+#ifndef I915
+      case MESA_FORMAT_SARGB8:
       case MESA_FORMAT_A8:
       case MESA_FORMAT_R8:
       case MESA_FORMAT_R16:
       case MESA_FORMAT_RG88:
       case MESA_FORMAT_RG1616:
+#endif
 	 break;
       default:
 	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 061f0d278d6..3f13589a214 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -452,7 +452,7 @@ intelCreateContext(gl_api api,
       return brwCreateContext(api, mesaVis,
 			      driContextPriv, sharedContextPrivate);
 #endif
-   fprintf(stderr, "Unrecognized deviceID %x\n", intelScreen->deviceID);
+   fprintf(stderr, "Unrecognized deviceID 0x%x\n", intelScreen->deviceID);
    return GL_FALSE;
 }
 
@@ -462,7 +462,8 @@ intel_init_bufmgr(struct intel_screen *intelScreen)
    __DRIscreen *spriv = intelScreen->driScrnPriv;
    int num_fences = 0;
 
-   intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
+   intelScreen->no_hw = (getenv("INTEL_NO_HW") != NULL ||
+			 getenv("INTEL_DEVID_OVERRIDE") != NULL);
 
    intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, BATCH_SZ);
    if (intelScreen->bufmgr == NULL) {
@@ -497,6 +498,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    GLenum fb_format[3];
    GLenum fb_type[3];
    unsigned int api_mask;
+   char *devid_override;
 
    static const GLenum back_buffer_modes[] = {
        GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML
@@ -523,6 +525,16 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
 			&intelScreen->deviceID))
       return GL_FALSE;
 
+   /* Allow an override of the device ID for the purpose of making the
+    * driver produce dumps for debugging of new chipset enablement.
+    * This implies INTEL_NO_HW, to avoid programming your actual GPU
+    * incorrectly.
+    */
+   devid_override = getenv("INTEL_DEVID_OVERRIDE");
+   if (devid_override) {
+      intelScreen->deviceID = strtod(devid_override, NULL);
+   }
+
    api_mask = (1 << __DRI_API_OPENGL);
 #if FEATURE_ES1
    api_mask |= (1 << __DRI_API_GLES);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c
index 9d73a2fb375..f8316ae2f8d 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_format.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_format.c
@@ -204,11 +204,13 @@ intelChooseTextureFormat(struct gl_context * ctx, GLint internalFormat,
     * { R, G, 1.0, 1.0 } from a red-green texture would be useful.
     */
    case GL_RED:
+   case GL_COMPRESSED_RED:
    case GL_R8:
       return MESA_FORMAT_R8;
    case GL_R16:
       return MESA_FORMAT_R16;
    case GL_RG:
+   case GL_COMPRESSED_RG:
    case GL_RG8:
       return MESA_FORMAT_RG88;
    case GL_RG16:
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 8a047e6419b..b62290231b9 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -200,6 +200,7 @@ void r200EmitArrays( struct gl_context *ctx, GLubyte *vimap_rev )
 	    }
 	 default:
 	    assert(0);
+	    emitsize = 0;
 	 }
 	 if (!rmesa->radeon.tcl.aos[nr].bo) {
 	   rcommon_emit_vector( ctx,
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
index 8be32ea91fe..1db8678e890 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
@@ -76,6 +76,9 @@ static void use_temporary(struct r300_fragment_program_code *code, unsigned int
 
 static unsigned int use_source(struct r300_fragment_program_code* code, struct rc_pair_instruction_source src)
 {
+	if (!src.Used)
+		return 0;
+
 	if (src.File == RC_FILE_CONSTANT) {
 		return src.Index | (1 << 5);
 	} else if (src.File == RC_FILE_TEMPORARY) {
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
index 2d28b065390..05d3da8a10d 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
@@ -94,6 +94,7 @@ static const struct swizzle_data* lookup_native_swizzle(unsigned int swizzle)
  */
 static int r300_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
+	const struct swizzle_data* sd;
 	unsigned int relevant;
 	int j;
 
@@ -127,7 +128,8 @@ static int r300_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 	if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
 		return 0;
 
-	if (!lookup_native_swizzle(reg.Swizzle))
+	sd = lookup_native_swizzle(reg.Swizzle);
+	if (!sd || (reg.File == RC_FILE_PRESUB && sd->srcp_stride == 0))
 		return 0;
 
 	return 1;
@@ -201,7 +203,7 @@ unsigned int r300FPTranslateRGBSwizzle(unsigned int src, unsigned int swizzle)
 {
 	const struct swizzle_data* sd = lookup_native_swizzle(swizzle);
 
-	if (!sd) {
+	if (!sd || (src == RC_PAIR_PRESUB_SRC && sd->srcp_stride == 0)) {
 		fprintf(stderr, "Not a native swizzle: %08x\n", swizzle);
 		return 0;
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 2f130198d35..e0d349b98ce 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -24,6 +24,7 @@
 
 #include <stdio.h>
 
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
@@ -54,6 +55,8 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
 
 	for (rci = c->Base.Program.Instructions.Next; rci != &c->Base.Program.Instructions; rci = rci->Next) {
 		struct rc_sub_instruction * inst = &rci->U.I;
+		unsigned i;
+		const struct rc_opcode_info *info = rc_get_opcode_info(inst->Opcode);
 
 		if (inst->DstReg.File != RC_FILE_OUTPUT || inst->DstReg.Index != c->OutputDepth)
 			continue;
@@ -65,27 +68,12 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
 			continue;
 		}
 
-		switch (inst->Opcode) {
-			case RC_OPCODE_FRC:
-			case RC_OPCODE_MOV:
-				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				break;
-			case RC_OPCODE_ADD:
-			case RC_OPCODE_MAX:
-			case RC_OPCODE_MIN:
-			case RC_OPCODE_MUL:
-				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				inst->SrcReg[1] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[1]);
-				break;
-			case RC_OPCODE_CMP:
-			case RC_OPCODE_MAD:
-				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				inst->SrcReg[1] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[1]);
-				inst->SrcReg[2] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[2]);
-				break;
-			default:
-				// Scalar instructions needn't be reswizzled
-				break;
+		if (!info->IsComponentwise) {
+			continue;
+		}
+
+		for (i = 0; i < info->NumSrcRegs; i++) {
+			inst->SrcReg[i] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[i]);
 		}
 	}
 }
@@ -93,7 +81,6 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
 void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 {
 	int is_r500 = c->Base.is_r500;
-	int kill_consts = c->Base.remove_unused_constants;
 	int opt = !c->Base.disable_optimizations;
 
 	/* Lists of instruction transformations. */
@@ -133,11 +120,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		{"emulate loops",		1, !is_r500,	rc_emulate_loops,		NULL},
 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
 		{"dataflow swizzles",		1, 1,		rc_dataflow_swizzles,		NULL},
-		{"dead constants",		1, kill_consts, rc_remove_unused_constants,	&c->code->constants_remap_table},
+		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
 		/* This pass makes it easier for the scheduler to group TEX
 		 * instructions and reduces the chances of creating too
 		 * many texture indirections.*/
-		{"register rename",		1, !is_r500,	rc_rename_regs,			NULL},
+		{"register rename",		1, !is_r500 || opt, rc_rename_regs,		NULL},
 		{"pair translate",		1, 1,		rc_pair_translate,		NULL},
 		{"pair scheduling",		1, 1,		rc_pair_schedule,		NULL},
 		{"register allocation",		1, opt,		rc_pair_regalloc,		NULL},
@@ -150,9 +137,10 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		{NULL, 0, 0, NULL, NULL}
 	};
 
+	c->Base.type = RC_FRAGMENT_PROGRAM;
 	c->Base.SwizzleCaps = c->Base.is_r500 ? &r500_swizzle_caps : &r300_swizzle_caps;
 
-	rc_run_compiler(&c->Base, fs_list, "Fragment Program");
+	rc_run_compiler(&c->Base, fs_list);
 
 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index bf8341f0173..472029f63d0 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -26,6 +26,7 @@
 
 #include "../r300_reg.h"
 
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
@@ -790,19 +791,14 @@ static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
 						if (!hwtemps[j])
 							break;
 					}
-					if (j >= c->max_temp_regs) {
-						rc_error(c, "Too many temporaries\n");
-						return;
+					ta[orig].Allocated = 1;
+					if (last_inst_src_reladdr &&
+					    last_inst_src_reladdr->IP > inst->IP) {
+						ta[orig].HwTemp = orig;
 					} else {
-						ta[orig].Allocated = 1;
-						if (last_inst_src_reladdr &&
-						    last_inst_src_reladdr->IP > inst->IP) {
-							ta[orig].HwTemp = orig;
-						} else {
-							ta[orig].HwTemp = j;
-						}
-						hwtemps[ta[orig].HwTemp] = 1;
+						ta[orig].HwTemp = j;
 					}
+					hwtemps[ta[orig].HwTemp] = 1;
 				}
 
 				inst->U.I.DstReg.Index = ta[orig].HwTemp;
@@ -1018,7 +1014,6 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
 {
 	int is_r500 = c->Base.is_r500;
-	int kill_consts = c->Base.remove_unused_constants;
 	int opt = !c->Base.disable_optimizations;
 
 	/* Lists of instruction transformations. */
@@ -1062,18 +1057,18 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
 		/* This pass must be done after optimizations. */
 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
-		{"dataflow swizzles",		1, 1,		rc_dataflow_swizzles,		NULL},
 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
-		{"dead constants",		1, kill_consts, rc_remove_unused_constants,	&c->code->constants_remap_table},
+		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
 		{NULL, 0, 0, NULL, NULL}
 	};
 
+	c->Base.type = RC_VERTEX_PROGRAM;
 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
-	rc_run_compiler(&c->Base, vs_list, "Vertex Program");
+	rc_run_compiler(&c->Base, vs_list);
 
 	c->code->InputsRead = c->Base.Program.InputsRead;
 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index 289bb87ae59..ef81be48f77 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -29,6 +29,7 @@
 
 #include <stdio.h>
 
+#include "radeon_compiler_util.h"
 #include "../r300_reg.h"
 
 /**
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index 6f101c68eb6..5da82d90f67 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -45,9 +45,6 @@
 
 #include "radeon_program_pair.h"
 
-#define MAX_BRANCH_DEPTH_FULL 32
-#define MAX_BRANCH_DEPTH_PARTIAL 4
-
 #define PROG_CODE \
 	struct r500_fragment_program_code *code = &c->code->code.r500
 
@@ -200,6 +197,9 @@ static void use_temporary(struct r500_fragment_program_code* code, unsigned int
 
 static unsigned int use_source(struct r500_fragment_program_code* code, struct rc_pair_instruction_source src)
 {
+	if (!src.Used)
+		return 0;
+
 	if (src.File == RC_FILE_CONSTANT) {
 		return src.Index | 0x100;
 	} else if (src.File == RC_FILE_TEMPORARY) {
@@ -506,7 +506,7 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 		break;
 	}
 	case RC_OPCODE_IF:
-		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
+		if ( s->CurrentBranchDepth >= R500_PFS_MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
 			return;
 		}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index cfb6df2cd79..b69e81698ae 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -34,6 +34,8 @@
 #define R500_PFS_MAX_INST         512
 #define R500_PFS_NUM_TEMP_REGS    128
 #define R500_PFS_NUM_CONST_REGS   256
+#define R500_PFS_MAX_BRANCH_DEPTH_FULL 32
+#define R500_PFS_MAX_BRANCH_DEPTH_PARTIAL 4
 
 
 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
index 4286baed0c6..65548604bcc 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
@@ -29,6 +29,7 @@
 #include "radeon_dataflow.h"
 #include "radeon_program.h"
 #include "radeon_program_pair.h"
+#include "radeon_compiler_util.h"
 
 
 void rc_init(struct radeon_compiler * c)
@@ -356,66 +357,92 @@ void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face)
 static void reg_count_callback(void * userdata, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int mask)
 {
-	unsigned int * max_reg = userdata;
+	int *max_reg = userdata;
 	if (file == RC_FILE_TEMPORARY)
-		index > *max_reg ? *max_reg = index : 0;
+		(int)index > *max_reg ? *max_reg = index : 0;
 }
 
-static void print_stats(struct radeon_compiler * c)
+void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
 {
+	int max_reg = -1;
 	struct rc_instruction * tmp;
-	unsigned max_reg, insts, fc, tex, alpha, rgb, presub;
-	max_reg = insts = fc = tex = alpha = rgb = presub = 0;
+	memset(s, 0, sizeof(*s));
+
 	for(tmp = c->Program.Instructions.Next; tmp != &c->Program.Instructions;
 							tmp = tmp->Next){
 		const struct rc_opcode_info * info;
 		rc_for_all_reads_mask(tmp, reg_count_callback, &max_reg);
 		if (tmp->Type == RC_INSTRUCTION_NORMAL) {
 			if (tmp->U.I.PreSub.Opcode != RC_PRESUB_NONE)
-				presub++;
+				s->num_presub_ops++;
 			info = rc_get_opcode_info(tmp->U.I.Opcode);
 		} else {
 			if (tmp->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used)
-				presub++;
+				s->num_presub_ops++;
 			if (tmp->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Used)
-				presub++;
+				s->num_presub_ops++;
 			/* Assuming alpha will never be a flow control or
 			 * a tex instruction. */
 			if (tmp->U.P.Alpha.Opcode != RC_OPCODE_NOP)
-				alpha++;
+				s->num_alpha_insts++;
 			if (tmp->U.P.RGB.Opcode != RC_OPCODE_NOP)
-				rgb++;
+				s->num_rgb_insts++;
 			info = rc_get_opcode_info(tmp->U.P.RGB.Opcode);
 		}
 		if (info->IsFlowControl)
-			fc++;
+			s->num_fc_insts++;
 		if (info->HasTexture)
-			tex++;
-		insts++;
+			s->num_tex_insts++;
+		s->num_insts++;
 	}
-	if (insts < 4)
-		return;
-	fprintf(stderr,"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
-		       "~%4u Instructions\n"
-		       "~%4u Vector Instructions (RGB)\n"
-		       "~%4u Scalar Instructions (Alpha)\n"
-		       "~%4u Flow Control Instructions\n"
-		       "~%4u Texture Instructions\n"
-		       "~%4u Presub Operations\n"
-		       "~%4u Temporary Registers\n"
-		       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
-		       insts, rgb, alpha, fc, tex, presub, max_reg + 1);
+	s->num_temp_regs = max_reg + 1;
 }
 
-/* Executes a list of compiler passes given in the parameter 'list'. */
-void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list,
-		     const char *shader_name)
+static void print_stats(struct radeon_compiler * c)
 {
-	if (c->Debug & RC_DBG_LOG) {
-		fprintf(stderr, "%s: before compilation\n", shader_name);
-		rc_print_program(&c->Program);
+	struct rc_program_stats s;
+
+	rc_get_stats(c, &s);
+
+	if (s.num_insts < 4)
+		return;
+
+	switch (c->type) {
+	case RC_VERTEX_PROGRAM:
+		fprintf(stderr,"~~~~~~~~~ VERTEX PROGRAM ~~~~~~~~\n"
+			       "~%4u Instructions\n"
+			       "~%4u Flow Control Instructions\n"
+			       "~%4u Temporary Registers\n"
+			       "~~~~~~~~~~~~~~ END ~~~~~~~~~~~~~~\n",
+			       s.num_insts, s.num_fc_insts, s.num_temp_regs);
+		break;
+
+	case RC_FRAGMENT_PROGRAM:
+		fprintf(stderr,"~~~~~~~~ FRAGMENT PROGRAM ~~~~~~~\n"
+			       "~%4u Instructions\n"
+			       "~%4u Vector Instructions (RGB)\n"
+			       "~%4u Scalar Instructions (Alpha)\n"
+			       "~%4u Flow Control Instructions\n"
+			       "~%4u Texture Instructions\n"
+			       "~%4u Presub Operations\n"
+			       "~%4u Temporary Registers\n"
+			       "~~~~~~~~~~~~~~ END ~~~~~~~~~~~~~~\n",
+			       s.num_insts, s.num_rgb_insts, s.num_alpha_insts,
+			       s.num_fc_insts, s.num_tex_insts, s.num_presub_ops,
+			       s.num_temp_regs);
+		break;
+	default:
+		assert(0);
 	}
+}
 
+static const char *shader_name[RC_NUM_PROGRAM_TYPES] = {
+	"Vertex Program",
+	"Fragment Program"
+};
+
+void rc_run_compiler_passes(struct radeon_compiler *c, struct radeon_compiler_pass *list)
+{
 	for (unsigned i = 0; list[i].name; i++) {
 		if (list[i].predicate) {
 			list[i].run(c, list[i].user);
@@ -424,11 +451,23 @@ void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *lis
 				return;
 
 			if ((c->Debug & RC_DBG_LOG) && list[i].dump) {
-				fprintf(stderr, "%s: after '%s'\n", shader_name, list[i].name);
+				fprintf(stderr, "%s: after '%s'\n", shader_name[c->type], list[i].name);
 				rc_print_program(&c->Program);
 			}
 		}
 	}
+}
+
+/* Executes a list of compiler passes given in the parameter 'list'. */
+void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list)
+{
+	if (c->Debug & RC_DBG_LOG) {
+		fprintf(stderr, "%s: before compilation\n", shader_name[c->type]);
+		rc_print_program(&c->Program);
+	}
+
+	rc_run_compiler_passes(c, list);
+
 	if (c->Debug & RC_DBG_STATS)
 		print_stats(c);
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index 31fd469a04f..e6633395895 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -35,9 +35,16 @@
 
 struct rc_swizzle_caps;
 
+enum rc_program_type {
+	RC_VERTEX_PROGRAM,
+	RC_FRAGMENT_PROGRAM,
+	RC_NUM_PROGRAM_TYPES
+};
+
 struct radeon_compiler {
 	struct memory_pool Pool;
 	struct rc_program Program;
+	enum rc_program_type type;
 	unsigned Debug:2;
 	unsigned Error:1;
 	char * ErrorMsg;
@@ -140,9 +147,21 @@ struct radeon_compiler_pass {
 	void *user;		/* Optional parameter which is passed to the run function. */
 };
 
+struct rc_program_stats {
+	unsigned num_insts;
+	unsigned num_fc_insts;
+	unsigned num_tex_insts;
+	unsigned num_rgb_insts;
+	unsigned num_alpha_insts;
+	unsigned num_presub_ops;
+	unsigned num_temp_regs;
+};
+
+void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s);
+
 /* Executes a list of compiler passes given in the parameter 'list'. */
-void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list,
-		     const char *shader_name);
+void rc_run_compiler_passes(struct radeon_compiler *c, struct radeon_compiler_pass *list);
+void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list);
 void rc_validate_final_shader(struct radeon_compiler *c, void *user);
 
 #endif /* RADEON_COMPILER_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
index 97f4c758492..bf393a9fb16 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
@@ -31,6 +31,8 @@
 
 #include "radeon_compiler_util.h"
 
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
 /**
  */
 unsigned int rc_swizzle_to_writemask(unsigned int swz)
@@ -46,6 +48,91 @@ unsigned int rc_swizzle_to_writemask(unsigned int swz)
 	return mask;
 }
 
+rc_swizzle get_swz(unsigned int swz, rc_swizzle idx)
+{
+	if (idx & 0x4)
+		return idx;
+	return GET_SWZ(swz, idx);
+}
+
+unsigned int combine_swizzles4(unsigned int src,
+		rc_swizzle swz_x, rc_swizzle swz_y, rc_swizzle swz_z, rc_swizzle swz_w)
+{
+	unsigned int ret = 0;
+
+	ret |= get_swz(src, swz_x);
+	ret |= get_swz(src, swz_y) << 3;
+	ret |= get_swz(src, swz_z) << 6;
+	ret |= get_swz(src, swz_w) << 9;
+
+	return ret;
+}
+
+unsigned int combine_swizzles(unsigned int src, unsigned int swz)
+{
+	unsigned int ret = 0;
+
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_X));
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Y)) << 3;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Z)) << 6;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_W)) << 9;
+
+	return ret;
+}
+
+/**
+ * @param mask Must be either RC_MASK_X, RC_MASK_Y, RC_MASK_Z, or RC_MASK_W
+ */
+rc_swizzle rc_mask_to_swizzle(unsigned int mask)
+{
+	switch (mask) {
+	case RC_MASK_X: return RC_SWIZZLE_X;
+	case RC_MASK_Y: return RC_SWIZZLE_Y;
+	case RC_MASK_Z: return RC_SWIZZLE_Z;
+	case RC_MASK_W: return RC_SWIZZLE_W;
+	}
+	return RC_SWIZZLE_UNUSED;
+}
+
+/* Reorder mask bits according to swizzle. */
+unsigned swizzle_mask(unsigned swizzle, unsigned mask)
+{
+	unsigned ret = 0;
+	for (unsigned chan = 0; chan < 4; ++chan) {
+		unsigned swz = GET_SWZ(swizzle, chan);
+		if (swz < 4)
+			ret |= GET_BIT(mask, swz) << chan;
+	}
+	return ret;
+}
+
+/**
+ * Left multiplication of a register with a swizzle
+ */
+struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg)
+{
+	struct rc_src_register tmp = srcreg;
+	int i;
+	tmp.Swizzle = 0;
+	tmp.Negate = 0;
+	for(i = 0; i < 4; ++i) {
+		rc_swizzle swz = GET_SWZ(swizzle, i);
+		if (swz < 4) {
+			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
+			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
+		} else {
+			tmp.Swizzle |= swz << (i*3);
+		}
+	}
+	return tmp;
+}
+
+void reset_srcreg(struct rc_src_register* reg)
+{
+	memset(reg, 0, sizeof(struct rc_src_register));
+	reg->Swizzle = RC_SWIZZLE_XYZW;
+}
+
 unsigned int rc_src_reads_dst_mask(
 		rc_register_file src_file,
 		unsigned int src_idx,
@@ -59,3 +146,123 @@ unsigned int rc_src_reads_dst_mask(
 	}
 	return dst_mask & rc_swizzle_to_writemask(src_swz);
 }
+
+unsigned int rc_source_type_swz(unsigned int swizzle, unsigned int channels)
+{
+	unsigned int chan;
+	unsigned int swz = RC_SWIZZLE_UNUSED;
+	unsigned int ret = RC_SOURCE_NONE;
+
+	for(chan = 0; chan < channels; chan++) {
+		swz = GET_SWZ(swizzle, chan);
+		if (swz == RC_SWIZZLE_W) {
+			ret |= RC_SOURCE_ALPHA;
+		} else if (swz == RC_SWIZZLE_X || swz == RC_SWIZZLE_Y
+						|| swz == RC_SWIZZLE_Z) {
+			ret |= RC_SOURCE_RGB;
+		}
+	}
+	return ret;
+}
+
+unsigned int rc_source_type_mask(unsigned int mask)
+{
+	unsigned int ret = RC_SOURCE_NONE;
+
+	if (mask & RC_MASK_XYZ)
+		ret |= RC_SOURCE_RGB;
+
+	if (mask & RC_MASK_W)
+		ret |= RC_SOURCE_ALPHA;
+
+	return ret;
+}
+
+struct can_use_presub_data {
+	struct rc_src_register RemoveSrcs[3];
+	unsigned int RGBCount;
+	unsigned int AlphaCount;
+};
+
+static void can_use_presub_read_cb(
+	void * userdata,
+	struct rc_instruction * inst,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int mask)
+{
+	struct can_use_presub_data * d = userdata;
+	unsigned int src_type = rc_source_type_mask(mask);
+	unsigned int i;
+
+	if (file == RC_FILE_NONE)
+		return;
+
+	for(i = 0; i < 3; i++) {
+		if (d->RemoveSrcs[i].File == file
+		    && d->RemoveSrcs[i].Index == index) {
+			src_type &=
+				~rc_source_type_swz(d->RemoveSrcs[i].Swizzle, 4);
+		}
+	}
+
+	if (src_type & RC_SOURCE_RGB)
+		d->RGBCount++;
+
+	if (src_type & RC_SOURCE_ALPHA)
+		d->AlphaCount++;
+}
+
+unsigned int rc_inst_can_use_presub(
+	struct rc_instruction * inst,
+	rc_presubtract_op presub_op,
+	unsigned int presub_writemask,
+	struct rc_src_register replace_reg,
+	struct rc_src_register presub_src0,
+	struct rc_src_register presub_src1)
+{
+	struct can_use_presub_data d;
+	unsigned int num_presub_srcs;
+	unsigned int presub_src_type = rc_source_type_mask(presub_writemask);
+	const struct rc_opcode_info * info =
+					rc_get_opcode_info(inst->U.I.Opcode);
+
+	if (presub_op == RC_PRESUB_NONE) {
+		return 1;
+	}
+
+	if (info->HasTexture) {
+		return 0;
+	}
+
+	/* We can't use more than one presubtract value in an
+	 * instruction, unless the two prsubtract operations
+	 * are the same and read from the same registers.
+	 * XXX For now we will limit instructions to only one presubtract
+	 * value.*/
+	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE) {
+		return 0;
+	}
+
+	memset(&d, 0, sizeof(d));
+	d.RemoveSrcs[0] = replace_reg;
+	d.RemoveSrcs[1] = presub_src0;
+	d.RemoveSrcs[2] = presub_src1;
+
+	rc_for_all_reads_mask(inst, can_use_presub_read_cb, &d);
+
+	num_presub_srcs = rc_presubtract_src_reg_count(presub_op);
+
+	if ((presub_src_type & RC_SOURCE_RGB)
+					&& d.RGBCount + num_presub_srcs > 3) {
+		return 0;
+	}
+
+	if ((presub_src_type & RC_SOURCE_ALPHA)
+					&& d.AlphaCount + num_presub_srcs > 3) {
+		return 0;
+	}
+
+	return 1;
+}
+
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
index 1a14e7cb0ef..461ab9ffb10 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
@@ -3,8 +3,27 @@
 #ifndef RADEON_PROGRAM_UTIL_H
 #define RADEON_PROGRAM_UTIL_H
 
+struct rc_instruction;
+struct rc_src_register;
+
 unsigned int rc_swizzle_to_writemask(unsigned int swz);
 
+rc_swizzle get_swz(unsigned int swz, rc_swizzle idx);
+
+unsigned int combine_swizzles4(unsigned int src,
+			       rc_swizzle swz_x, rc_swizzle swz_y,
+			       rc_swizzle swz_z, rc_swizzle swz_w);
+
+unsigned int combine_swizzles(unsigned int src, unsigned int swz);
+
+rc_swizzle rc_mask_to_swizzle(unsigned int mask);
+
+unsigned swizzle_mask(unsigned swizzle, unsigned mask);
+
+struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg);
+
+void reset_srcreg(struct rc_src_register* reg);
+
 unsigned int rc_src_reads_dst_mask(
 		rc_register_file src_file,
 		unsigned int src_idx,
@@ -13,4 +32,16 @@ unsigned int rc_src_reads_dst_mask(
 		unsigned int dst_idx,
 		unsigned int dst_mask);
 
+unsigned int rc_source_type_swz(unsigned int swizzle, unsigned int channels);
+
+unsigned int rc_source_type_mask(unsigned int mask);
+
+unsigned int rc_inst_can_use_presub(
+	struct rc_instruction * inst,
+	rc_presubtract_op presub_op,
+	unsigned int presub_writemask,
+	struct rc_src_register replace_reg,
+	struct rc_src_register presub_src0,
+	struct rc_src_register presub_src1);
+
 #endif /* RADEON_PROGRAM_UTIL_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
index fd94194dc34..d0a64d936e0 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
@@ -139,7 +139,46 @@ static void pair_sub_for_all_args(
 	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
 
 	for(i = 0; i < info->NumSrcRegs; i++) {
-		cb(userdata, fullinst, &sub->Arg[i]);
+		unsigned int src_type;
+		unsigned int channels = 0;
+		if (&fullinst->U.P.RGB == sub)
+			channels = 3;
+		else if (&fullinst->U.P.Alpha == sub)
+			channels = 1;
+
+		assert(channels > 0);
+		src_type = rc_source_type_swz(sub->Arg[i].Swizzle, channels);
+
+		if (src_type == RC_SOURCE_NONE)
+			continue;
+
+		if (sub->Arg[i].Source == RC_PAIR_PRESUB_SRC) {
+			unsigned int presub_type;
+			unsigned int presub_src_count;
+			struct rc_pair_instruction_source * src_array;
+			unsigned int j;
+			if (src_type & RC_SOURCE_RGB) {
+				presub_type = fullinst->
+					U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Index;
+				src_array = fullinst->U.P.RGB.Src;
+			} else {
+				presub_type = fullinst->
+					U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Index;
+				src_array = fullinst->U.P.Alpha.Src;
+			}
+			presub_src_count
+				= rc_presubtract_src_reg_count(presub_type);
+			for(j = 0; j < presub_src_count; j++) {
+				cb(userdata, fullinst, &sub->Arg[i],
+								&src_array[j]);
+			}
+		} else {
+			struct rc_pair_instruction_source * src =
+				rc_pair_get_src(&fullinst->U.P, &sub->Arg[i]);
+			if (src) {
+				cb(userdata, fullinst, &sub->Arg[i], src);
+			}
+		}
 	}
 }
 
@@ -308,6 +347,7 @@ static void remap_normal_instruction(struct rc_instruction * fullinst,
 {
 	struct rc_sub_instruction * inst = &fullinst->U.I;
 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Opcode);
+	unsigned int remapped_presub = 0;
 
 	if (opcode->HasDstReg) {
 		rc_register_file file = inst->DstReg.File;
@@ -327,6 +367,12 @@ static void remap_normal_instruction(struct rc_instruction * fullinst,
 			unsigned int i;
 			unsigned int srcp_srcs = rc_presubtract_src_reg_count(
 						inst->PreSub.Opcode);
+			/* Make sure we only remap presubtract sources once in
+			 * case more than one source register reads the
+			 * presubtract result. */
+			if (remapped_presub)
+				continue;
+
 			for(i = 0; i < srcp_srcs; i++) {
 				file = inst->PreSub.SrcReg[i].File;
 				index = inst->PreSub.SrcReg[i].Index;
@@ -334,7 +380,7 @@ static void remap_normal_instruction(struct rc_instruction * fullinst,
 				inst->PreSub.SrcReg[i].File = file;
 				inst->PreSub.SrcReg[i].Index = index;
 			}
-
+			remapped_presub = 1;
 		}
 		else {
 			cb(userdata, fullinst, &file, &index);
@@ -430,12 +476,29 @@ static rc_opcode get_flow_control_inst(struct rc_instruction * inst)
 
 }
 
+struct branch_write_mask {
+	unsigned int IfWriteMask:4;
+	unsigned int ElseWriteMask:4;
+	unsigned int HasElse:1;
+};
+
+union get_readers_read_cb {
+	rc_read_src_fn I;
+	rc_pair_read_arg_fn P;
+};
+
 struct get_readers_callback_data {
 	struct radeon_compiler * C;
 	struct rc_reader_data * ReaderData;
-	rc_read_src_fn ReadCB;
+	rc_read_src_fn ReadNormalCB;
+	rc_pair_read_arg_fn ReadPairCB;
 	rc_read_write_mask_fn WriteCB;
+	rc_register_file DstFile;
+	unsigned int DstIndex;
+	unsigned int DstMask;
 	unsigned int AliveWriteMask;
+	/*  For convenience, this is indexed starting at 1 */
+	struct branch_write_mask BranchMasks[R500_PFS_MAX_BRANCH_DEPTH_FULL + 1];
 };
 
 static void add_reader(
@@ -443,7 +506,7 @@ static void add_reader(
 	struct rc_reader_data * data,
 	struct rc_instruction * inst,
 	unsigned int mask,
-	struct rc_src_register * src)
+	void * arg_or_src)
 {
 	struct rc_reader * new;
 	memory_pool_array_reserve(pool, struct rc_reader, data->Readers,
@@ -451,7 +514,74 @@ static void add_reader(
 	new = &data->Readers[data->ReaderCount++];
 	new->Inst = inst;
 	new->WriteMask = mask;
-	new->Src = src;
+	if (inst->Type == RC_INSTRUCTION_NORMAL) {
+		new->U.Src = arg_or_src;
+	} else {
+		new->U.Arg = arg_or_src;
+	}
+}
+
+static unsigned int get_readers_read_callback(
+	struct get_readers_callback_data * cb_data,
+	unsigned int has_rel_addr,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int swizzle)
+{
+	unsigned int shared_mask, read_mask;
+
+	if (has_rel_addr) {
+		cb_data->ReaderData->Abort = 1;
+		return RC_MASK_NONE;
+	}
+
+	shared_mask = rc_src_reads_dst_mask(file, index, swizzle,
+		cb_data->DstFile, cb_data->DstIndex, cb_data->AliveWriteMask);
+
+	if (shared_mask == RC_MASK_NONE)
+		return shared_mask;
+
+	/* If we make it this far, it means that this source reads from the
+	 * same register written to by d->ReaderData->Writer. */
+
+	read_mask = rc_swizzle_to_writemask(swizzle);
+	if (cb_data->ReaderData->AbortOnRead & read_mask) {
+		cb_data->ReaderData->Abort = 1;
+		return shared_mask;
+	}
+
+	/* XXX The behavior in this case should be configurable. */
+	if ((read_mask & cb_data->AliveWriteMask) != read_mask) {
+		cb_data->ReaderData->Abort = 1;
+		return shared_mask;
+	}
+
+	return shared_mask;
+}
+
+static void get_readers_pair_read_callback(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src)
+{
+	unsigned int shared_mask;
+	struct get_readers_callback_data * d = userdata;
+
+	shared_mask = get_readers_read_callback(d,
+				0 /*Pair Instructions don't use RelAddr*/,
+				src->File, src->Index, arg->Swizzle);
+
+	if (shared_mask == RC_MASK_NONE)
+		return;
+
+	if (d->ReadPairCB)
+		d->ReadPairCB(d->ReaderData, inst, arg, src);
+
+	if (d->ReaderData->Abort)
+		return;
+
+	add_reader(&d->C->Pool, d->ReaderData, inst, shared_mask, arg);
 }
 
 /**
@@ -464,37 +594,18 @@ static void get_readers_normal_read_callback(
 	struct rc_src_register * src)
 {
 	struct get_readers_callback_data * d = userdata;
-	unsigned int read_mask;
 	unsigned int shared_mask;
 
-	if (src->RelAddr)
-		d->ReaderData->Abort = 1;
-
-	shared_mask = rc_src_reads_dst_mask(src->File, src->Index,
-		src->Swizzle,
-		d->ReaderData->Writer->U.I.DstReg.File,
-		d->ReaderData->Writer->U.I.DstReg.Index,
-		d->AliveWriteMask);
+	shared_mask = get_readers_read_callback(d,
+			src->RelAddr, src->File, src->Index, src->Swizzle);
 
 	if (shared_mask == RC_MASK_NONE)
 		return;
+	/* The callback function could potentially clear d->ReaderData->Abort,
+	 * so we need to call it before we return. */
+	if (d->ReadNormalCB)
+		d->ReadNormalCB(d->ReaderData, inst, src);
 
-	/* If we make it this far, it means that this source reads from the
-	 * same register written to by d->ReaderData->Writer. */
-
-	if (d->ReaderData->AbortOnRead) {
-		d->ReaderData->Abort = 1;
-		return;
-	}
-
-	read_mask = rc_swizzle_to_writemask(src->Swizzle);
-	/* XXX The behavior in this case should be configurable. */
-	if ((read_mask & d->AliveWriteMask) != read_mask) {
-		d->ReaderData->Abort = 1;
-		return;
-	}
-
-	d->ReadCB(d->ReaderData, inst, src);
 	if (d->ReaderData->Abort)
 		return;
 
@@ -515,29 +626,132 @@ static void get_readers_write_callback(
 {
 	struct get_readers_callback_data * d = userdata;
 
-	if (index == d->ReaderData->Writer->U.I.DstReg.Index
-		&& file == d->ReaderData->Writer->U.I.DstReg.File) {
-			unsigned int shared_mask = mask
-				& d->ReaderData->Writer->U.I.DstReg.WriteMask;
-		if (d->ReaderData->InElse) {
-			if (shared_mask & d->AliveWriteMask) {
-				/* We set AbortOnRead here because the
-				 * destination register of d->ReaderData->Writer
-				 * is written to in both the IF and the
-				 * ELSE block of this IF/ELSE statement.
-				 * This means that readers of this
-				 * destination register that follow this IF/ELSE
-				 * statement use the value of different
-				 * instructions depending on the control flow
-				 * decisions made by the program. */
-				d->ReaderData->AbortOnRead = 1;
+	if (index == d->DstIndex && file == d->DstFile) {
+		unsigned int shared_mask = mask & d->DstMask;
+		d->ReaderData->AbortOnRead &= ~shared_mask;
+		d->AliveWriteMask &= ~shared_mask;
+	}
+
+	if(d->WriteCB)
+		d->WriteCB(d->ReaderData, inst, file, index, mask);
+}
+
+static void get_readers_for_single_write(
+	void * userdata,
+	struct rc_instruction * writer,
+	rc_register_file dst_file,
+	unsigned int dst_index,
+	unsigned int dst_mask)
+{
+	struct rc_instruction * tmp;
+	unsigned int branch_depth = 0;
+	struct get_readers_callback_data * d = userdata;
+
+	d->ReaderData->Writer = writer;
+	d->ReaderData->AbortOnRead = 0;
+	d->ReaderData->InElse = 0;
+	d->DstFile = dst_file;
+	d->DstIndex = dst_index;
+	d->DstMask = dst_mask;
+	d->AliveWriteMask = dst_mask;
+	memset(d->BranchMasks, 0, sizeof(d->BranchMasks));
+
+	if (!dst_mask)
+		return;
+
+	for(tmp = writer->Next; tmp != &d->C->Program.Instructions;
+							tmp = tmp->Next){
+		rc_opcode opcode = get_flow_control_inst(tmp);
+		switch(opcode) {
+		case RC_OPCODE_BGNLOOP:
+			/* XXX We can do better when we see a BGNLOOP if we
+			 * add a flag called AbortOnWrite to struct
+			 * rc_reader_data and leave it set until the next
+			 * ENDLOOP. */
+		case RC_OPCODE_ENDLOOP:
+			/* XXX We can do better when we see an ENDLOOP by
+			 * searching backwards from writer and looking for
+			 * readers of writer's destination index.  If we find a
+			 * reader before we get to the BGNLOOP, we must abort
+			 * unless there is another writer between that reader
+			 * and the BGNLOOP. */
+		case RC_OPCODE_BRK:
+		case RC_OPCODE_CONT:
+			d->ReaderData->Abort = 1;
+			return;
+		case RC_OPCODE_IF:
+			branch_depth++;
+			if (branch_depth > R500_PFS_MAX_BRANCH_DEPTH_FULL) {
+				d->ReaderData->Abort = 1;
+				return;
+			}
+			d->BranchMasks[branch_depth].IfWriteMask =
+							d->AliveWriteMask;
+			break;
+		case RC_OPCODE_ELSE:
+			if (branch_depth == 0) {
+				d->ReaderData->InElse = 1;
+			} else {
+				unsigned int temp_mask = d->AliveWriteMask;
+				d->AliveWriteMask =
+					d->BranchMasks[branch_depth].IfWriteMask;
+				d->BranchMasks[branch_depth].ElseWriteMask =
+								temp_mask;
+				d->BranchMasks[branch_depth].HasElse = 1;
 			}
+			break;
+		case RC_OPCODE_ENDIF:
+			if (branch_depth == 0) {
+				d->ReaderData->AbortOnRead = d->AliveWriteMask;
+				d->ReaderData->InElse = 0;
+			}
+			else {
+				struct branch_write_mask * masks =
+					&d->BranchMasks[branch_depth];
+
+				if (masks->HasElse) {
+					d->ReaderData->AbortOnRead |=
+						masks->IfWriteMask
+							& ~masks->ElseWriteMask;
+					d->AliveWriteMask = masks->IfWriteMask
+						^ ((masks->IfWriteMask ^
+							masks->ElseWriteMask)
+						& (masks->IfWriteMask
+							^ d->AliveWriteMask));
+				} else {
+					d->ReaderData->AbortOnRead |=
+						masks->IfWriteMask
+							& ~d->AliveWriteMask;
+					d->AliveWriteMask = masks->IfWriteMask;
+
+				}
+				memset(masks, 0,
+					sizeof(struct branch_write_mask));
+				branch_depth--;
+			}
+			break;
+		default:
+			break;
+		}
+
+		if (d->ReaderData->InElse)
+			continue;
+
+		if (tmp->Type == RC_INSTRUCTION_NORMAL) {
+			rc_for_all_reads_src(tmp,
+				get_readers_normal_read_callback, d);
 		} else {
-			d->AliveWriteMask &= ~shared_mask;
+			rc_pair_for_all_reads_arg(tmp,
+				get_readers_pair_read_callback, d);
 		}
-	}
+		rc_for_all_writes_mask(tmp, get_readers_write_callback, d);
 
-	d->WriteCB(d->ReaderData, inst, file, index, mask);
+		if (d->ReaderData->Abort)
+			return;
+
+		if (branch_depth == 0 && !d->AliveWriteMask)
+			return;
+	}
 }
 
 /**
@@ -578,83 +792,26 @@ static void get_readers_write_callback(
  * @param write_cb This function will be called for every instruction after
  * writer.
  */
-void  rc_get_readers_normal(
+void rc_get_readers(
 	struct radeon_compiler * c,
 	struct rc_instruction * writer,
 	struct rc_reader_data * data,
-	rc_read_src_fn read_cb,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
 	rc_read_write_mask_fn write_cb)
 {
-	struct rc_instruction * tmp;
 	struct get_readers_callback_data d;
-	unsigned int branch_depth = 0;
 
-	data->Writer = writer;
 	data->Abort = 0;
-	data->AbortOnRead = 0;
-	data->InElse = 0;
 	data->ReaderCount = 0;
 	data->ReadersReserved = 0;
 	data->Readers = NULL;
 
 	d.C = c;
-	d.AliveWriteMask = writer->U.I.DstReg.WriteMask;
 	d.ReaderData = data;
-	d.ReadCB = read_cb;
+	d.ReadNormalCB = read_normal_cb;
+	d.ReadPairCB = read_pair_cb;
 	d.WriteCB = write_cb;
 
-	if (!writer->U.I.DstReg.WriteMask)
-		return;
-
-	for(tmp = writer->Next; tmp != &c->Program.Instructions;
-							tmp = tmp->Next){
-		rc_opcode opcode = get_flow_control_inst(tmp);
-		switch(opcode) {
-		case RC_OPCODE_BGNLOOP:
-			/* XXX We can do better when we see a BGNLOOP if we
-			 * add a flag called AbortOnWrite to struct
-			 * rc_reader_data and leave it set until the next
-			 * ENDLOOP. */
-		case RC_OPCODE_ENDLOOP:
-			/* XXX We can do better when we see an ENDLOOP by
-			 * searching backwards from writer and looking for
-			 * readers of writer's destination index.  If we find a
-			 * reader before we get to the BGNLOOP, we must abort
-			 * unless there is another writer between that reader
-			 * and the BGNLOOP. */
-			data->Abort = 1;
-			return;
-		case RC_OPCODE_IF:
-			/* XXX We can do better here, but this will have to
-			 * do until this dataflow analysis is more mature. */
-			data->Abort = 1;
-			branch_depth++;
-			break;
-		case RC_OPCODE_ELSE:
-			if (branch_depth == 0)
-				data->InElse = 1;
-			break;
-		case RC_OPCODE_ENDIF:
-			if (branch_depth == 0) {
-				data->AbortOnRead = 1;
-				data->InElse = 0;
-			}
-			else {
-				branch_depth--;
-			}
-			break;
-		default:
-			break;
-		}
-
-		if (!data->InElse)
-			rc_for_all_reads_src(tmp, get_readers_normal_read_callback, &d);
-		rc_for_all_writes_mask(tmp, get_readers_write_callback, &d);
-
-		if (data->Abort)
-			return;
-
-		if (!d.AliveWriteMask)
-			return;
-	}
+	rc_for_all_writes_mask(writer, get_readers_for_single_write, &d);
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
index 7de6b98f763..ef971c5b234 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
@@ -36,6 +36,7 @@ struct rc_instruction;
 struct rc_swizzle_caps;
 struct rc_src_register;
 struct rc_pair_instruction_arg;
+struct rc_pair_instruction_source;
 struct rc_compiler;
 
 
@@ -59,7 +60,8 @@ void rc_for_all_reads_src(struct rc_instruction * inst, rc_read_src_fn cb,
 			void * userdata);
 
 typedef void (*rc_pair_read_arg_fn)(void * userdata,
-	struct rc_instruction * inst, struct rc_pair_instruction_arg * arg);
+	struct rc_instruction * inst, struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src);
 void rc_pair_for_all_reads_arg(struct rc_instruction * inst,
 					rc_pair_read_arg_fn cb, void * userdata);
 
@@ -71,7 +73,10 @@ void rc_remap_registers(struct rc_instruction * inst, rc_remap_register_fn cb, v
 struct rc_reader {
 	struct rc_instruction * Inst;
 	unsigned int WriteMask;
-	struct rc_src_register * Src;
+	union {
+		struct rc_src_register * Src;
+		struct rc_pair_instruction_arg * Arg;
+	} U;
 };
 
 struct rc_reader_data {
@@ -87,14 +92,13 @@ struct rc_reader_data {
 	void * CbData;
 };
 
-void rc_get_readers_normal(
+void rc_get_readers(
 	struct radeon_compiler * c,
-	struct rc_instruction * inst,
+	struct rc_instruction * writer,
 	struct rc_reader_data * data,
-	/*XXX: These should be their own function types. */
-	rc_read_src_fn read_cb,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
 	rc_read_write_mask_fn write_cb);
-
 /**
  * Compiler passes based on dataflow analysis.
  */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index da495a3afaa..25afd272bee 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -67,6 +67,13 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.IsComponentwise = 1
 	},
 	{
+		.Opcode = RC_OPCODE_CLAMP,
+		.Name = "CLAMP",
+		.NumSrcRegs = 3,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
 		.Opcode = RC_OPCODE_CMP,
 		.Name = "CMP",
 		.NumSrcRegs = 3,
@@ -453,6 +460,7 @@ void rc_compute_sources_for_writemask(
 			srcmasks[1] |= RC_MASK_XY;
 			break;
 		case RC_OPCODE_DP3:
+		case RC_OPCODE_XPD:
 			srcmasks[0] |= RC_MASK_XYZ;
 			srcmasks[1] |= RC_MASK_XYZ;
 			break;
@@ -460,6 +468,10 @@ void rc_compute_sources_for_writemask(
 			srcmasks[0] |= RC_MASK_XYZW;
 			srcmasks[1] |= RC_MASK_XYZW;
 			break;
+		case RC_OPCODE_DPH:
+			srcmasks[0] |= RC_MASK_XYZ;
+			srcmasks[1] |= RC_MASK_XYZW;
+			break;
 		case RC_OPCODE_TXB:
 		case RC_OPCODE_TXP:
 			srcmasks[0] |= RC_MASK_W;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index d3f639c8701..7e666101276 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -50,6 +50,9 @@ typedef enum {
 	/** vec4 instruction: dst.c = ceil(src0.c) */
 	RC_OPCODE_CEIL,
 
+	/** vec4 instruction: dst.c = clamp(src0.c, src1.c, src2.c) */
+	RC_OPCODE_CLAMP,
+
 	/** vec4 instruction: dst.c = src0.c < 0.0 ? src1.c : src2.c */
 	RC_OPCODE_CMP,
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index 15b9c5e7dc3..44f4c0fbdc7 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -54,12 +54,7 @@ static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct
 		combine.Negate = outer.Negate;
 	} else {
 		combine.Abs = inner.Abs;
-		combine.Negate = 0;
-		for(unsigned int chan = 0; chan < 4; ++chan) {
-			unsigned int swz = GET_SWZ(outer.Swizzle, chan);
-			if (swz < 4)
-				combine.Negate |= GET_BIT(inner.Negate, swz) << chan;
-		}
+		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
 		combine.Negate ^= outer.Negate;
 	}
 	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
@@ -71,12 +66,13 @@ static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
 {
 	rc_register_file file = src->File;
 	struct rc_reader_data * reader_data = data;
-	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
 
-	/* It is possible to do copy propigation in this situation,
-	 * just not right now, see peephole_add_presub_inv() */
-	if (reader_data->Writer->U.I.PreSub.Opcode != RC_PRESUB_NONE &&
-			(info->NumSrcRegs > 2 || info->HasTexture)) {
+	if(!rc_inst_can_use_presub(inst,
+				reader_data->Writer->U.I.PreSub.Opcode,
+				rc_swizzle_to_writemask(src->Swizzle),
+				*src,
+				reader_data->Writer->U.I.PreSub.SrcReg[0],
+				reader_data->Writer->U.I.PreSub.SrcReg[1])) {
 		reader_data->Abort = 1;
 		return;
 	}
@@ -112,11 +108,11 @@ static void src_clobbered_reads_cb(
 	    && src->Index == sc_data->Index
 	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
 
-		sc_data->ReaderData->AbortOnRead = 1;
+		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
 	}
 
 	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
-		sc_data->ReaderData->AbortOnRead = 1;
+		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
 	}
 }
 
@@ -149,8 +145,9 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 		return;
 
 	/* Get a list of all the readers of this MOV instruction. */
-	rc_get_readers_normal(c, inst_mov, &reader_data,
-			copy_propagate_scan_read, is_src_clobbered_scan_write);
+	rc_get_readers(c, inst_mov, &reader_data,
+		       copy_propagate_scan_read, NULL,
+		       is_src_clobbered_scan_write);
 
 	if (reader_data.Abort || reader_data.ReaderCount == 0)
 		return;
@@ -158,7 +155,7 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 	/* Propagate the MOV instruction. */
 	for (i = 0; i < reader_data.ReaderCount; i++) {
 		struct rc_instruction * inst = reader_data.Readers[i].Inst;
-		*reader_data.Readers[i].Src = chain_srcregs(*reader_data.Readers[i].Src, inst_mov->U.I.SrcReg[0]);
+		*reader_data.Readers[i].U.Src = chain_srcregs(*reader_data.Readers[i].U.Src, inst_mov->U.I.SrcReg[0]);
 
 		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
 			inst->U.I.PreSub = inst_mov->U.I.PreSub;
@@ -423,24 +420,13 @@ static void presub_scan_read(
 	struct rc_src_register * src)
 {
 	struct rc_reader_data * reader_data = data;
-	const struct rc_opcode_info * info =
-					rc_get_opcode_info(inst->U.I.Opcode);
-	/* XXX: There are some situations where instructions
-	 * with more than 2 src registers can use the
-	 * presubtract select, but to keep things simple we
-	 * will disable presubtract on these instructions for
-	 * now. */
-	if (info->NumSrcRegs > 2 || info->HasTexture) {
-		reader_data->Abort = 1;
-		return;
-	}
+	rc_presubtract_op * presub_opcode = reader_data->CbData;
 
-	/* We can't use more than one presubtract value in an
-	 * instruction, unless the two prsubtract operations
-	 * are the same and read from the same registers.
-	 * XXX For now we will limit instructions to only one presubtract
-	 * value.*/
-	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE) {
+	if (!rc_inst_can_use_presub(inst, *presub_opcode,
+			reader_data->Writer->U.I.DstReg.WriteMask,
+			*src,
+			reader_data->Writer->U.I.SrcReg[0],
+			reader_data->Writer->U.I.SrcReg[1])) {
 		reader_data->Abort = 1;
 		return;
 	}
@@ -454,8 +440,10 @@ static int presub_helper(
 {
 	struct rc_reader_data reader_data;
 	unsigned int i;
+	rc_presubtract_op cb_op = presub_opcode;
 
-	rc_get_readers_normal(c, inst_add, &reader_data, presub_scan_read,
+	reader_data.CbData = &cb_op;
+	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
 						is_src_clobbered_scan_write);
 
 	if (reader_data.Abort || reader_data.ReaderCount == 0)
@@ -468,7 +456,7 @@ static int presub_helper(
 				rc_get_opcode_info(reader.Inst->U.I.Opcode);
 
 		for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
-			if (&reader.Inst->U.I.SrcReg[src_index] == reader.Src)
+			if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.Src)
 				presub_replace(inst_add, reader.Inst, src_index);
 		}
 	}
@@ -505,7 +493,9 @@ static void presub_replace_add(
 	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
 }
 
-static int is_presub_candidate(struct rc_instruction * inst)
+static int is_presub_candidate(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst)
 {
 	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
 	unsigned int i;
@@ -514,7 +504,12 @@ static int is_presub_candidate(struct rc_instruction * inst)
 		return 0;
 
 	for(i = 0; i < info->NumSrcRegs; i++) {
-		if (src_reads_dst_mask(inst->U.I.SrcReg[i], inst->U.I.DstReg))
+		struct rc_src_register src = inst->U.I.SrcReg[i];
+		if (src_reads_dst_mask(src, inst->U.I.DstReg))
+			return 0;
+
+		src.File = RC_FILE_PRESUB;
+		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
 			return 0;
 	}
 	return 1;
@@ -528,7 +523,7 @@ static int peephole_add_presub_add(
 	struct rc_src_register * src1 = NULL;
 	unsigned int i;
 
-	if (!is_presub_candidate(inst_add))
+	if (!is_presub_candidate(c, inst_add))
 		return 0;
 
 	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
@@ -592,7 +587,7 @@ static int peephole_add_presub_inv(
 {
 	unsigned int i, swz, mask;
 
-	if (!is_presub_candidate(inst_add))
+	if (!is_presub_candidate(c, inst_add))
 		return 0;
 
 	mask = inst_add->U.I.DstReg.WriteMask;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
index 91524f5ec68..d53181e1f75 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
@@ -66,10 +66,13 @@ struct regalloc_state {
 	struct hardware_register * HwTemporary;
 	unsigned int NumHwTemporaries;
 	/**
-	 * If an instruction is inside of a loop, end_loop will be the
-	 * IP of the ENDLOOP instruction, otherwise end_loop will be 0
+	 * If an instruction is inside of a loop, EndLoop will be the
+	 * IP of the ENDLOOP instruction, and BeginLoop will be the IP
+	 * of the BGNLOOP instruction.  Otherwise, EndLoop and BeginLoop
+	 * will be -1.
 	 */
-	int end_loop;
+	int EndLoop;
+	int BeginLoop;
 };
 
 static void print_live_intervals(struct live_intervals * src)
@@ -180,11 +183,13 @@ static void scan_callback(void * data, struct rc_instruction * inst,
 		reg->Used = 1;
 		if (file == RC_FILE_INPUT)
 			reg->Live.Start = -1;
+		else if (s->BeginLoop >= 0)
+			reg->Live.Start = s->BeginLoop;
 		else
 			reg->Live.Start = inst->IP;
 		reg->Live.End = inst->IP;
-	} else if (s->end_loop)
-		reg->Live.End = s->end_loop;
+	} else if (s->EndLoop >= 0)
+		reg->Live.End = s->EndLoop;
 	else if (inst->IP > reg->Live.End)
 		reg->Live.End = inst->IP;
 }
@@ -195,6 +200,8 @@ static void compute_live_intervals(struct radeon_compiler *c,
 	memset(s, 0, sizeof(*s));
 	s->C = c;
 	s->NumHwTemporaries = c->max_temp_regs;
+	s->BeginLoop = -1;
+	s->EndLoop = -1;
 	s->HwTemporary =
 		memory_pool_malloc(&c->Pool,
 				   s->NumHwTemporaries * sizeof(struct hardware_register));
@@ -207,10 +214,12 @@ static void compute_live_intervals(struct radeon_compiler *c,
 	    inst = inst->Next) {
 
 		/* For all instructions inside of a loop, the ENDLOOP
-		 * instruction is used as the end of the live interval. */
-		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && !s->end_loop) {
+		 * instruction is used as the end of the live interval and
+		 * the BGNLOOP instruction is used as the beginning. */
+		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && s->EndLoop < 0) {
 			int loops = 1;
 			struct rc_instruction * tmp;
+			s->BeginLoop = inst->IP;
 			for(tmp = inst->Next;
 					tmp != &s->C->Program.Instructions;
 					tmp = tmp->Next) {
@@ -219,15 +228,17 @@ static void compute_live_intervals(struct radeon_compiler *c,
 				} else if (tmp->U.I.Opcode
 							== RC_OPCODE_ENDLOOP) {
 					if(!--loops) {
-						s->end_loop = tmp->IP;
+						s->EndLoop = tmp->IP;
 						break;
 					}
 				}
 			}
 		}
 
-		if (inst->IP == s->end_loop)
-			s->end_loop = 0;
+		if (inst->IP == s->EndLoop) {
+			s->EndLoop = -1;
+			s->BeginLoop = -1;
+		}
 
 		rc_for_all_reads_mask(inst, scan_callback, s);
 		rc_for_all_writes_mask(inst, scan_callback, s);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
index 553e9dcf7c1..9beb5d63579 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
 
 
@@ -54,6 +55,11 @@ struct schedule_instruction {
 	 * this instruction can be scheduled.
 	 */
 	unsigned int NumDependencies:5;
+
+	/** List of all readers (see rc_get_readers() for the definition of
+	 * "all readers"), even those outside the basic block this instruction
+	 * lives in. */
+	struct rc_reader_data GlobalReaders;
 };
 
 
@@ -94,6 +100,16 @@ struct register_state {
 	struct reg_value * Values[4];
 };
 
+struct remap_reg {
+	struct rc_instruciont * Inst;
+	unsigned int OldIndex:(RC_REGISTER_INDEX_BITS+1);
+	unsigned int OldSwizzle:3;
+	unsigned int NewIndex:(RC_REGISTER_INDEX_BITS+1);
+	unsigned int NewSwizzle:3;
+	unsigned int OnlyTexReads:1;
+	struct remap_reg * Next;
+};
+
 struct schedule_state {
 	struct radeon_compiler * C;
 	struct schedule_instruction * Current;
@@ -126,15 +142,6 @@ static struct reg_value ** get_reg_valuep(struct schedule_state * s,
 	return &s->Temporary[index].Values[chan];
 }
 
-static struct reg_value * get_reg_value(struct schedule_state * s,
-		rc_register_file file, unsigned int index, unsigned int chan)
-{
-	struct reg_value ** pv = get_reg_valuep(s, file, index, chan);
-	if (!pv)
-		return 0;
-	return *pv;
-}
-
 static void add_inst_to_list(struct schedule_instruction ** list, struct schedule_instruction * inst)
 {
 	inst->NextReady = *list;
@@ -295,12 +302,12 @@ static int merge_presub_sources(
 	assert(dst_full->Alpha.Opcode == RC_OPCODE_NOP);
 
 	switch(type) {
-	case RC_PAIR_SOURCE_RGB:
+	case RC_SOURCE_RGB:
 		is_rgb = 1;
 		is_alpha = 0;
 		dst_sub = &dst_full->RGB;
 		break;
-	case RC_PAIR_SOURCE_ALPHA:
+	case RC_SOURCE_ALPHA:
 		is_rgb = 0;
 		is_alpha = 1;
 		dst_sub = &dst_full->Alpha;
@@ -341,6 +348,8 @@ static int merge_presub_sources(
 				continue;
 			free_source = rc_pair_alloc_source(dst_full, is_rgb,
 					is_alpha, temp.File, temp.Index);
+			if (free_source < 0)
+				return 0;
 			one_way = 1;
 		} else {
 			dst_sub->Src[free_source] = temp;
@@ -356,11 +365,11 @@ static int merge_presub_sources(
 		for(arg = 0; arg < info->NumSrcRegs; arg++) {
 			/*If this arg does not read from an rgb source,
 			 * do nothing. */
-			if (!(rc_source_type_that_arg_reads(
-				dst_full->RGB.Arg[arg].Source,
-				dst_full->RGB.Arg[arg].Swizzle) & type)) {
+			if (!(rc_source_type_swz(dst_full->RGB.Arg[arg].Swizzle,
+								3) & type)) {
 				continue;
 			}
+
 			if (dst_full->RGB.Arg[arg].Source == srcp_src)
 				dst_full->RGB.Arg[arg].Source = free_source;
 			/* We need to do this just in case register
@@ -392,13 +401,13 @@ static int destructive_merge_instructions(
 
 	/* Merge the rgb presubtract registers. */
 	if (alpha->RGB.Src[RC_PAIR_PRESUB_SRC].Used) {
-		if (!merge_presub_sources(rgb, alpha->RGB, RC_PAIR_SOURCE_RGB)) {
+		if (!merge_presub_sources(rgb, alpha->RGB, RC_SOURCE_RGB)) {
 			return 0;
 		}
 	}
 	/* Merge the alpha presubtract registers */
 	if (alpha->Alpha.Src[RC_PAIR_PRESUB_SRC].Used) {
-		if(!merge_presub_sources(rgb,  alpha->Alpha, RC_PAIR_SOURCE_ALPHA)){
+		if(!merge_presub_sources(rgb,  alpha->Alpha, RC_SOURCE_ALPHA)){
 			return 0;
 		}
 	}
@@ -525,6 +534,222 @@ static void presub_nop(struct rc_instruction * emitted) {
 		}
 	}
 }
+
+static void rgb_to_alpha_remap (
+	struct rc_instruction * inst,
+	struct rc_pair_instruction_arg * arg,
+	rc_register_file old_file,
+	rc_swizzle old_swz,
+	unsigned int new_index)
+{
+	int new_src_index;
+	unsigned int i;
+	struct rc_pair_instruction_source * old_src =
+					rc_pair_get_src(&inst->U.P, arg);
+	if (!old_src) {
+		return;
+	}
+
+	for (i = 0; i < 3; i++) {
+		if (get_swz(arg->Swizzle, i) == old_swz) {
+			SET_SWZ(arg->Swizzle, i, RC_SWIZZLE_W);
+		}
+	}
+	memset(old_src, 0, sizeof(struct rc_pair_instruction_source));
+	new_src_index = rc_pair_alloc_source(&inst->U.P, 0, 1,
+							old_file, new_index);
+	/* This conversion is not possible, we must have made a mistake in
+	 * is_rgb_to_alpha_possible. */
+	if (new_src_index < 0) {
+		assert(0);
+		return;
+	}
+
+	arg->Source = new_src_index;
+}
+
+static int can_remap(unsigned int opcode)
+{
+	switch(opcode) {
+	case RC_OPCODE_DDX:
+	case RC_OPCODE_DDY:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static int can_convert_opcode_to_alpha(unsigned int opcode)
+{
+	switch(opcode) {
+	case RC_OPCODE_DDX:
+	case RC_OPCODE_DDY:
+	case RC_OPCODE_DP2:
+	case RC_OPCODE_DP3:
+	case RC_OPCODE_DP4:
+	case RC_OPCODE_DPH:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void is_rgb_to_alpha_possible(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src)
+{
+	unsigned int chan_count = 0;
+	unsigned int alpha_sources = 0;
+	unsigned int i;
+	struct rc_reader_data * reader_data = userdata;
+
+	if (!can_remap(inst->U.P.RGB.Opcode)
+	    || !can_remap(inst->U.P.Alpha.Opcode)) {
+		reader_data->Abort = 1;
+		return;
+	}
+
+	if (!src)
+		return;
+
+	/* XXX There are some cases where we can still do the conversion if
+	 * a reader reads from a presubtract source, but for now we'll prevent
+	 * it. */
+	if (arg->Source == RC_PAIR_PRESUB_SRC) {
+		reader_data->Abort = 1;
+		return;
+	}
+
+	/* Make sure the source only reads from one component.
+	 * XXX We should allow the source to read from the same component twice.
+	 * XXX If the index we will be converting to is the same as the
+	 * current index, then it is OK to read from more than one component.
+	 */
+	for (i = 0; i < 3; i++) {
+		rc_swizzle swz = get_swz(arg->Swizzle, i);
+		switch(swz) {
+		case RC_SWIZZLE_X:
+		case RC_SWIZZLE_Y:
+		case RC_SWIZZLE_Z:
+		case RC_SWIZZLE_W:
+			chan_count++;
+			break;
+		default:
+			break;
+		}
+	}
+	if (chan_count > 1) {
+		reader_data->Abort = 1;
+		return;
+	}
+
+	/* Make sure there are enough alpha sources.
+	 * XXX If we know what register all the readers are going
+	 * to be remapped to, then in some situations we can still do
+	 * the subsitution, even if all 3 alpha sources are being used.*/
+	for (i = 0; i < 3; i++) {
+		if (inst->U.P.Alpha.Src[i].Used) {
+			alpha_sources++;
+		}
+	}
+	if (alpha_sources > 2) {
+		reader_data->Abort = 1;
+		return;
+	}
+}
+
+static int convert_rgb_to_alpha(
+	struct schedule_state * s,
+	struct schedule_instruction * sched_inst)
+{
+	struct rc_pair_instruction * pair_inst = &sched_inst->Instruction->U.P;
+	unsigned int old_mask = pair_inst->RGB.WriteMask;
+	unsigned int old_swz = rc_mask_to_swizzle(old_mask);
+	const struct rc_opcode_info * info =
+				rc_get_opcode_info(pair_inst->RGB.Opcode);
+	int new_index = -1;
+	unsigned int i;
+
+	if (sched_inst->GlobalReaders.Abort)
+		return 0;
+
+	if (!pair_inst->RGB.WriteMask)
+		return 0;
+
+	if (!can_convert_opcode_to_alpha(pair_inst->RGB.Opcode)
+	    || !can_convert_opcode_to_alpha(pair_inst->Alpha.Opcode)) {
+		return 0;
+	}
+
+	assert(sched_inst->NumWriteValues == 1);
+
+	if (!sched_inst->WriteValues[0]) {
+		assert(0);
+		return 0;
+	}
+
+	/* We start at the old index, because if we can reuse the same
+	 * register and just change the swizzle then it is more likely we
+	 * will be able to convert all the readers. */
+	for (i = pair_inst->RGB.DestIndex; i < RC_REGISTER_MAX_INDEX; i++) {
+		struct reg_value ** new_regvalp = get_reg_valuep(
+						s, RC_FILE_TEMPORARY, i, 3);
+		if (!*new_regvalp) {
+			struct reg_value ** old_regvalp =
+				get_reg_valuep(s,
+					RC_FILE_TEMPORARY,
+					pair_inst->RGB.DestIndex,
+					rc_mask_to_swizzle(old_mask));
+			new_index = i;
+			*new_regvalp = *old_regvalp;
+			*old_regvalp = NULL;
+			new_regvalp = get_reg_valuep(s, RC_FILE_TEMPORARY, i, 3);
+			break;
+		}
+	}
+	if (new_index < 0) {
+		return 0;
+	}
+
+	pair_inst->Alpha.Opcode = pair_inst->RGB.Opcode;
+	pair_inst->Alpha.DestIndex = new_index;
+	pair_inst->Alpha.WriteMask = 1;
+	pair_inst->Alpha.Target = pair_inst->RGB.Target;
+	pair_inst->Alpha.OutputWriteMask = pair_inst->RGB.OutputWriteMask;
+	pair_inst->Alpha.DepthWriteMask = pair_inst->RGB.DepthWriteMask;
+	pair_inst->Alpha.Saturate = pair_inst->RGB.Saturate;
+	memcpy(pair_inst->Alpha.Arg, pair_inst->RGB.Arg,
+						sizeof(pair_inst->Alpha.Arg));
+	/* Move the swizzles into the first chan */
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		unsigned int j;
+		for (j = 0; j < 3; j++) {
+			unsigned int swz = get_swz(pair_inst->Alpha.Arg[i].Swizzle, j);
+			if (swz != RC_SWIZZLE_UNUSED) {
+				pair_inst->Alpha.Arg[i].Swizzle = swz;
+				break;
+			}
+		}
+	}
+	pair_inst->RGB.Opcode = RC_OPCODE_NOP;
+	pair_inst->RGB.DestIndex = 0;
+	pair_inst->RGB.WriteMask = 0;
+	pair_inst->RGB.Target = 0;
+	pair_inst->RGB.OutputWriteMask = 0;
+	pair_inst->RGB.DepthWriteMask = 0;
+	pair_inst->RGB.Saturate = 0;
+	memset(pair_inst->RGB.Arg, 0, sizeof(pair_inst->RGB.Arg));
+
+	for(i = 0; i < sched_inst->GlobalReaders.ReaderCount; i++) {
+		struct rc_reader reader = sched_inst->GlobalReaders.Readers[i];
+		rgb_to_alpha_remap(reader.Inst, reader.U.Arg,
+					RC_FILE_TEMPORARY, old_swz, new_index);
+	}
+	return 1;
+}
+
 /**
  * Find a good ALU instruction or pair of ALU instruction and emit it.
  *
@@ -536,24 +761,16 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 {
 	struct schedule_instruction * sinst;
 
-	if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
-		if (s->ReadyFullALU) {
-			sinst = s->ReadyFullALU;
-			s->ReadyFullALU = s->ReadyFullALU->NextReady;
-		} else if (s->ReadyRGB) {
-			sinst = s->ReadyRGB;
-			s->ReadyRGB = s->ReadyRGB->NextReady;
-		} else {
-			sinst = s->ReadyAlpha;
-			s->ReadyAlpha = s->ReadyAlpha->NextReady;
-		}
-
+	if (s->ReadyFullALU) {
+		sinst = s->ReadyFullALU;
+		s->ReadyFullALU = s->ReadyFullALU->NextReady;
 		rc_insert_instruction(before->Prev, sinst->Instruction);
 		commit_alu_instruction(s, sinst);
 	} else {
 		struct schedule_instruction **prgb;
 		struct schedule_instruction **palpha;
-
+		struct schedule_instruction *prev;
+pair:
 		/* Some pairings might fail because they require too
 		 * many source slots; try all possible pairings if necessary */
 		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
@@ -572,10 +789,43 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 				goto success;
 			}
 		}
-
-		/* No success in pairing; just take the first RGB instruction */
-		sinst = s->ReadyRGB;
-		s->ReadyRGB = s->ReadyRGB->NextReady;
+		prev = NULL;
+		/* No success in pairing, now try to convert one of the RGB
+		 * instructions to an Alpha so we can pair it with another RGB.
+		 */
+		if (s->ReadyRGB && s->ReadyRGB->NextReady) {
+		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
+			if ((*prgb)->NumWriteValues == 1) {
+				struct schedule_instruction * prgb_next;
+				if (!convert_rgb_to_alpha(s, *prgb))
+					goto cont_loop;
+				prgb_next = (*prgb)->NextReady;
+				/* Add instruction to the Alpha ready list. */
+				(*prgb)->NextReady = s->ReadyAlpha;
+				s->ReadyAlpha = *prgb;
+				/* Remove instruction from the RGB ready list.*/
+				if (prev)
+					prev->NextReady = prgb_next;
+				else
+					s->ReadyRGB = prgb_next;
+				goto pair;
+			}
+cont_loop:
+			prev = *prgb;
+		}
+		}
+		/* Still no success in pairing, just take the first RGB
+		 * or alpha instruction. */
+		if (s->ReadyRGB) {
+			sinst = s->ReadyRGB;
+			s->ReadyRGB = s->ReadyRGB->NextReady;
+		} else if (s->ReadyAlpha) {
+			sinst = s->ReadyAlpha;
+			s->ReadyAlpha = s->ReadyAlpha->NextReady;
+		} else {
+			/*XXX Something real bad has happened. */
+			assert(0);
+		}
 
 		rc_insert_instruction(before->Prev, sinst->Instruction);
 		commit_alu_instruction(s, sinst);
@@ -591,13 +841,13 @@ static void scan_read(void * data, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int chan)
 {
 	struct schedule_state * s = data;
-	struct reg_value * v = get_reg_value(s, file, index, chan);
+	struct reg_value ** v = get_reg_valuep(s, file, index, chan);
 	struct reg_value_reader * reader;
 
 	if (!v)
 		return;
 
-	if (v->Writer == s->Current) {
+	if (*v && (*v)->Writer == s->Current) {
 		/* The instruction reads and writes to a register component.
 		 * In this case, we only want to increment dependencies by one.
 		 */
@@ -608,16 +858,28 @@ static void scan_read(void * data, struct rc_instruction * inst,
 
 	reader = memory_pool_malloc(&s->C->Pool, sizeof(*reader));
 	reader->Reader = s->Current;
-	reader->Next = v->Readers;
-	v->Readers = reader;
-	v->NumReaders++;
-
-	s->Current->NumDependencies++;
+	if (!*v) {
+		/* In this situation, the instruction reads from a register
+		 * that hasn't been written to or read from in the current
+		 * block. */
+		*v = memory_pool_malloc(&s->C->Pool, sizeof(struct reg_value));
+		memset(*v, 0, sizeof(struct reg_value));
+		(*v)->Readers = reader;
+	} else {
+		reader->Next = (*v)->Readers;
+		(*v)->Readers = reader;
+		/* Only update the current instruction's dependencies if the
+		 * register it reads from has been written to in this block. */
+		if ((*v)->Writer) {
+			s->Current->NumDependencies++;
+		}
+	}
+	(*v)->NumReaders++;
 
 	if (s->Current->NumReadValues >= 12) {
 		rc_error(s->C, "%s: NumReadValues overflow\n", __FUNCTION__);
 	} else {
-		s->Current->ReadValues[s->Current->NumReadValues++] = v;
+		s->Current->ReadValues[s->Current->NumReadValues++] = *v;
 	}
 }
 
@@ -652,6 +914,16 @@ static void scan_write(void * data, struct rc_instruction * inst,
 	}
 }
 
+static void is_rgb_to_alpha_possible_normal(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_src_register * src)
+{
+	struct rc_reader_data * reader_data = userdata;
+	reader_data->Abort = 1;
+
+}
+
 static void schedule_block(struct r300_fragment_program_compiler * c,
 		struct rc_instruction * begin, struct rc_instruction * end)
 {
@@ -683,6 +955,11 @@ static void schedule_block(struct r300_fragment_program_compiler * c,
 
 		if (!s.Current->NumDependencies)
 			instruction_ready(&s, s.Current);
+
+		/* Get global readers for possible RGB->Alpha conversion. */
+		rc_get_readers(s.C, inst, &s.Current->GlobalReaders,
+				is_rgb_to_alpha_possible_normal,
+				is_rgb_to_alpha_possible, NULL);
 	}
 
 	/* Temporarily unlink all instructions */
@@ -711,8 +988,13 @@ static int is_controlflow(struct rc_instruction * inst)
 
 void rc_pair_schedule(struct radeon_compiler *cc, void *user)
 {
+	struct schedule_state s;
+
 	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc;
 	struct rc_instruction * inst = c->Base.Program.Instructions.Next;
+
+	memset(&s, 0, sizeof(s));
+	s.C = &c->Base;
 	while(inst != &c->Base.Program.Instructions) {
 		struct rc_instruction * first;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
index c549be52183..fc05366f50e 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
@@ -280,9 +280,12 @@ static void set_pair_instruction(struct r300_fragment_program_compiler *c,
 			pair->RGB.DestIndex = inst->DstReg.Index;
 			pair->RGB.WriteMask |= inst->DstReg.WriteMask & RC_MASK_XYZ;
 		}
+
 		if (needalpha) {
-			pair->Alpha.DestIndex = inst->DstReg.Index;
 			pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+			if (pair->Alpha.WriteMask) {
+				pair->Alpha.DestIndex = inst->DstReg.Index;
+			}
 		}
 	}
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.c b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
index 24b685fbeb4..fe5756ebc45 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "radeon_compiler.h"
+#include "radeon_dataflow.h"
 
 
 /**
@@ -70,58 +71,98 @@ void rc_local_transform(
 	}
 }
 
+struct get_used_temporaries_data {
+	unsigned char * Used;
+	unsigned int UsedLength;
+};
+
+static void get_used_temporaries_cb(
+	void * userdata,
+	struct rc_instruction * inst,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int mask)
+{
+	struct get_used_temporaries_data * d = userdata;
+
+	if (file != RC_FILE_TEMPORARY)
+		return;
+
+	if (index >= d->UsedLength)
+		return;
+
+	d->Used[index] |= mask;
+}
+
 /**
- * Left multiplication of a register with a swizzle
+ * This function fills in the parameter 'used' with a writemask that
+ * represent which components of each temporary register are used by the
+ * program.  This is meant to be combined with rc_find_free_temporary_list as a
+ * more efficient version of rc_find_free_temporary.
+ * @param used The function does not initialize this parameter.
  */
-struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg)
+void rc_get_used_temporaries(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length)
+{
+	struct rc_instruction * inst;
+	struct get_used_temporaries_data d;
+	d.Used = used;
+	d.UsedLength = used_length;
+
+	for(inst = c->Program.Instructions.Next;
+			inst != &c->Program.Instructions; inst = inst->Next) {
+
+		rc_for_all_reads_mask(inst, get_used_temporaries_cb, &d);
+		rc_for_all_writes_mask(inst, get_used_temporaries_cb, &d);
+	}
+}
+
+/* Search a list of used temporaries for a free one
+ * \sa rc_get_used_temporaries
+ * @note If this functions finds a free temporary, it will mark it as used
+ * in the used temporary list (param 'used')
+ * @param used list of used temporaries
+ * @param used_length number of items in param 'used'
+ * @param mask which components must be free in the temporary index that is
+ * returned.
+ * @return -1 If there are no more free temporaries, otherwise the index of
+ * a temporary register where the components specified in param 'mask' are
+ * not being used.
+ */
+int rc_find_free_temporary_list(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length,
+	unsigned int mask)
 {
-	struct rc_src_register tmp = srcreg;
 	int i;
-	tmp.Swizzle = 0;
-	tmp.Negate = 0;
-	for(i = 0; i < 4; ++i) {
-		rc_swizzle swz = GET_SWZ(swizzle, i);
-		if (swz < 4) {
-			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
-			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
-		} else {
-			tmp.Swizzle |= swz << (i*3);
+	for(i = 0; i < used_length; i++) {
+		if ((~used[i] & mask) == mask) {
+			used[i] |= mask;
+			return i;
 		}
 	}
-	return tmp;
+	return -1;
 }
 
 unsigned int rc_find_free_temporary(struct radeon_compiler * c)
 {
-	char used[RC_REGISTER_MAX_INDEX];
-	unsigned int i;
-	struct rc_instruction * rcinst;
+	unsigned char used[RC_REGISTER_MAX_INDEX];
+	int free;
 
 	memset(used, 0, sizeof(used));
 
-	for (rcinst = c->Program.Instructions.Next; rcinst != &c->Program.Instructions; rcinst = rcinst->Next) {
-		const struct rc_sub_instruction *inst = &rcinst->U.I;
-		const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->Opcode);
-		unsigned int k;
-
-		for (k = 0; k < opcode->NumSrcRegs; k++) {
-			if (inst->SrcReg[k].File == RC_FILE_TEMPORARY)
-				used[inst->SrcReg[k].Index] = 1;
-		}
-
-		if (opcode->HasDstReg) {
-			if (inst->DstReg.File == RC_FILE_TEMPORARY)
-				used[inst->DstReg.Index] = 1;
-		}
-	}
+	rc_get_used_temporaries(c, used, RC_REGISTER_MAX_INDEX);
 
-	for (i = 0; i < RC_REGISTER_MAX_INDEX; i++) {
-		if (!used[i])
-			return i;
+	free = rc_find_free_temporary_list(c, used, RC_REGISTER_MAX_INDEX,
+								RC_MASK_XYZW);
+	if (free < 0) {
+		rc_error(c, "Ran out of temporary registers\n");
+		return 0;
 	}
-
-	rc_error(c, "Ran out of temporary registers\n");
-	return 0;
+	return free;
 }
 
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.h b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
index f0a77d7b539..df6c94b35f9 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
@@ -159,47 +159,6 @@ struct rc_program {
 	struct rc_constant_list Constants;
 };
 
-static inline rc_swizzle get_swz(unsigned int swz, rc_swizzle idx)
-{
-	if (idx & 0x4)
-		return idx;
-	return GET_SWZ(swz, idx);
-}
-
-static inline unsigned int combine_swizzles4(unsigned int src,
-		rc_swizzle swz_x, rc_swizzle swz_y, rc_swizzle swz_z, rc_swizzle swz_w)
-{
-	unsigned int ret = 0;
-
-	ret |= get_swz(src, swz_x);
-	ret |= get_swz(src, swz_y) << 3;
-	ret |= get_swz(src, swz_z) << 6;
-	ret |= get_swz(src, swz_w) << 9;
-
-	return ret;
-}
-
-static inline unsigned int combine_swizzles(unsigned int src, unsigned int swz)
-{
-	unsigned int ret = 0;
-
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_X));
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Y)) << 3;
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Z)) << 6;
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_W)) << 9;
-
-	return ret;
-}
-
-struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg);
-
-static inline void reset_srcreg(struct rc_src_register* reg)
-{
-	memset(reg, 0, sizeof(struct rc_src_register));
-	reg->Swizzle = RC_SWIZZLE_XYZW;
-}
-
-
 /**
  * A transformation that can be passed to \ref rc_local_transform.
  *
@@ -222,6 +181,17 @@ void rc_local_transform(
 	struct radeon_compiler *c,
 	void *user);
 
+void rc_get_used_temporaries(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length);
+
+int rc_find_free_temporary_list(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length,
+	unsigned int mask);
+
 unsigned int rc_find_free_temporary(struct radeon_compiler * c);
 
 struct rc_instruction *rc_alloc_instruction(struct radeon_compiler * c);
@@ -233,4 +203,5 @@ unsigned int rc_recompute_ips(struct radeon_compiler * c);
 
 void rc_print_program(const struct rc_program *prog);
 
+rc_swizzle rc_mask_to_swizzle(unsigned int mask);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index 39408845d5a..58977a40c7c 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -36,6 +36,7 @@
 #include "radeon_program_alu.h"
 
 #include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
 
 
 static struct rc_instruction *emit1(
@@ -84,16 +85,6 @@ static struct rc_instruction *emit3(
 	return fpi;
 }
 
-static struct rc_dst_register dstreg(int file, int index)
-{
-	struct rc_dst_register dst;
-	dst.File = file;
-	dst.Index = index;
-	dst.WriteMask = RC_MASK_XYZW;
-	dst.RelAddr = 0;
-	return dst;
-}
-
 static struct rc_dst_register dstregtmpmask(int index, int mask)
 {
 	struct rc_dst_register dst = {0};
@@ -186,6 +177,38 @@ static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
 	return swizzle_smear(reg, RC_SWIZZLE_W);
 }
 
+static int is_dst_safe_to_reuse(struct rc_instruction *inst)
+{
+	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
+	unsigned i;
+
+	assert(info->HasDstReg);
+
+	if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
+		return 0;
+
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
+		    inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
+			return 0;
+	}
+
+	return 1;
+}
+
+static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
+					       struct rc_instruction *inst)
+{
+	unsigned tmp;
+
+	if (is_dst_safe_to_reuse(inst))
+		tmp = inst->U.I.DstReg.Index;
+	else
+		tmp = rc_find_free_temporary(c);
+
+	return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
+}
+
 static void transform_ABS(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
@@ -209,10 +232,26 @@ static void transform_CEIL(struct radeon_compiler* c,
 	 *     ceil(x) = x+frac(-x)
 	 */
 
-	int tempreg = rc_find_free_temporary(c);
-	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]));
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
 	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg));
+		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
+	rc_remove_instruction(inst);
+}
+
+static void transform_CLAMP(struct radeon_compiler *c,
+	struct rc_instruction *inst)
+{
+	/* CLAMP dst, src, min, max
+	 *    into:
+	 * MIN tmp, src, max
+	 * MAX dst, tmp, min
+	 */
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
+		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
+	emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }
 
@@ -258,10 +297,10 @@ static void transform_DST(struct radeon_compiler* c,
 static void transform_FLR(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
-	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0]);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
 	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
+		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 	rc_remove_instruction(inst);
 }
 
@@ -351,14 +390,14 @@ static void transform_LIT(struct radeon_compiler* c,
 static void transform_LRP(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
-		dstreg(RC_FILE_TEMPORARY, tempreg),
+		dst,
 		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
 	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
 		inst->U.I.DstReg,
-		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[2]);
+		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
 
 	rc_remove_instruction(inst);
 }
@@ -366,9 +405,8 @@ static void transform_LRP(struct radeon_compiler* c,
 static void transform_POW(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
-	struct rc_dst_register tempdst = dstreg(RC_FILE_TEMPORARY, tempreg);
-	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempreg);
+	struct rc_dst_register tempdst = try_to_reuse_dst(c, inst);
+	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
 	tempdst.WriteMask = RC_MASK_W;
 	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
 
@@ -388,11 +426,11 @@ static void transform_RSQ(struct radeon_compiler* c,
 static void transform_SEQ(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_zero, builtin_one);
+		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -407,11 +445,11 @@ static void transform_SFL(struct radeon_compiler* c,
 static void transform_SGE(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -419,11 +457,11 @@ static void transform_SGE(struct radeon_compiler* c,
 static void transform_SGT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -431,11 +469,11 @@ static void transform_SGT(struct radeon_compiler* c,
 static void transform_SLE(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -443,11 +481,11 @@ static void transform_SLE(struct radeon_compiler* c,
 static void transform_SLT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -455,11 +493,11 @@ static void transform_SLT(struct radeon_compiler* c,
 static void transform_SNE(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_one, builtin_zero);
+		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -473,12 +511,13 @@ static void transform_SSG(struct radeon_compiler* c,
 	 *   CMP tmp1, x, 1, 0
 	 *   ADD result, tmp0, -tmp1;
 	 */
-	unsigned tmp0, tmp1;
+	struct rc_dst_register dst0;
+	unsigned tmp1;
 
 	/* 0 < x */
-	tmp0 = rc_find_free_temporary(c);
+	dst0 = try_to_reuse_dst(c, inst);
 	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
-	      dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+	      dst0,
 	      negate(inst->U.I.SrcReg[0]),
 	      builtin_one,
 	      builtin_zero);
@@ -495,7 +534,7 @@ static void transform_SSG(struct radeon_compiler* c,
 	/* result = tmp0 - tmp1 */
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
 	      inst->U.I.DstReg,
-	      srcreg(RC_FILE_TEMPORARY, tmp0),
+	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
 
 	rc_remove_instruction(inst);
@@ -517,15 +556,15 @@ static void transform_SWZ(struct radeon_compiler* c,
 static void transform_XPD(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
 	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
-		negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
+		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 
 	rc_remove_instruction(inst);
 }
@@ -553,6 +592,7 @@ int radeonTransformALU(
 	switch(inst->U.I.Opcode) {
 	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
+	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
 	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
 	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
@@ -592,7 +632,7 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 {
 	/* There is no decent CMP available, so let's rig one up.
 	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
-	 * The following sequence consumes two temps and two extra slots
+	 * The following sequence consumes zero to two temps and two extra slots
 	 * (the second temp and the second slot is consumed by transform_LRP),
 	 * but should be equivalent:
 	 *
@@ -600,18 +640,18 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 	 * LRP dst, tmp0, src1, src2
 	 *
 	 * Yes, I know, I'm a mad scientist. ~ C. & M. */
-	int tempreg0 = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	/* SLT tmp0, src0, 0.0 */
 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
-		dstreg(RC_FILE_TEMPORARY, tempreg0),
+		dst,
 		inst->U.I.SrcReg[0], builtin_zero);
 
 	/* LRP dst, tmp0, src1, src2 */
 	transform_LRP(c,
 		emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
 		      inst->U.I.DstReg,
-		      srcreg(RC_FILE_TEMPORARY, tempreg0), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
+		      srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
 
 	rc_remove_instruction(inst);
 }
@@ -642,7 +682,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 	unsigned constant_swizzle;
 	int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
 							 0.0000000000000000001,
@@ -650,16 +690,16 @@ static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
 
 	/* MOV dst, src */
 	emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
-		dstreg(RC_FILE_TEMPORARY, tempreg),
+		dst,
 		inst->U.I.SrcReg[0]);
 
 	/* MAX dst.z, src, 0.00...001 */
 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
-		dstregtmpmask(tempreg, RC_MASK_Y),
-		srcreg(RC_FILE_TEMPORARY, tempreg),
+		dstregtmpmask(dst.Index, RC_MASK_Y),
+		srcreg(RC_FILE_TEMPORARY, dst.Index),
 		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
 
-	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, tempreg);
+	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
 }
 
 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
@@ -743,12 +783,13 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
 	 *   SLT tmp1, x, 0;
 	 *   ADD result, tmp0, -tmp1;
 	 */
-	unsigned tmp0, tmp1;
+	struct rc_dst_register dst0 = try_to_reuse_dst(c, inst);
+	unsigned tmp1;
 
 	/* 0 < x */
-	tmp0 = rc_find_free_temporary(c);
+	dst0 = try_to_reuse_dst(c, inst);
 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
-	      dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+	      dst0,
 	      builtin_zero,
 	      inst->U.I.SrcReg[0]);
 
@@ -763,7 +804,7 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
 	/* result = tmp0 - tmp1 */
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
 	      inst->U.I.DstReg,
-	      srcreg(RC_FILE_TEMPORARY, tmp0),
+	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
 
 	rc_remove_instruction(inst);
@@ -781,6 +822,7 @@ int r300_transform_vertex_alu(
 	switch(inst->U.I.Opcode) {
 	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
+	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
 	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
 	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
 	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
index 9dcd44c522d..45f79ece5ba 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
@@ -181,4 +181,9 @@ static inline int rc_presubtract_src_reg_count(rc_presubtract_op op){
 		return 0;
 	}
 }
+
+#define RC_SOURCE_NONE  0x0
+#define RC_SOURCE_RGB   0x1
+#define RC_SOURCE_ALPHA 0x2
+
 #endif /* RADEON_PROGRAM_CONSTANTS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
index a21fe8d3df8..5905d26e521 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
@@ -27,6 +27,9 @@
 
 #include "radeon_program_pair.h"
 
+#include "radeon_compiler_util.h"
+
+#include <stdlib.h>
 
 /**
  * Return the source slot where we installed the given register access,
@@ -204,24 +207,37 @@ void rc_pair_foreach_source_that_rgb_reads(
 	}
 }
 
-/*return 0 for rgb, 1 for alpha -1 for error. */
-
-unsigned int rc_source_type_that_arg_reads(
-	unsigned int source,
-	unsigned int swizzle)
+struct rc_pair_instruction_source * rc_pair_get_src(
+	struct rc_pair_instruction * pair_inst,
+	struct rc_pair_instruction_arg * arg)
 {
-	unsigned int chan;
-	unsigned int swz = RC_SWIZZLE_UNUSED;
-	unsigned int ret = RC_PAIR_SOURCE_NONE;
-
-	for(chan = 0; chan < 3; chan++) {
-		swz = GET_SWZ(swizzle, chan);
-		if (swz == RC_SWIZZLE_W) {
-			ret |= RC_PAIR_SOURCE_ALPHA;
-		} else if (swz == RC_SWIZZLE_X || swz == RC_SWIZZLE_Y
-						|| swz == RC_SWIZZLE_Z) {
-			ret |= RC_PAIR_SOURCE_RGB;
+	unsigned int i, type;
+	unsigned int channels = 0;
+
+	for(i = 0; i < 3; i++) {
+		if (arg == pair_inst->RGB.Arg + i) {
+			channels = 3;
+			break;
 		}
 	}
-	return ret;
+
+	if (channels == 0) {
+		for (i = 0; i < 3; i++) {
+			if (arg == pair_inst->Alpha.Arg + i) {
+				channels = 1;
+				break;
+			}
+		}
+	}
+
+	assert(channels > 0);
+	type = rc_source_type_swz(arg->Swizzle, channels);
+
+	if (type & RC_SOURCE_RGB) {
+		return &pair_inst->RGB.Src[arg->Source];
+	} else if (type & RC_SOURCE_ALPHA) {
+		return &pair_inst->Alpha.Src[arg->Source];
+	} else {
+		return NULL;
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
index 54d44a2098b..ccf7a0070cd 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
@@ -55,10 +55,6 @@ struct radeon_compiler;
  */
 #define RC_PAIR_PRESUB_SRC 3
 
-#define RC_PAIR_SOURCE_NONE  0x0
-#define RC_PAIR_SOURCE_RGB   0x1
-#define RC_PAIR_SOURCE_ALPHA 0x2
-
 struct rc_pair_instruction_source {
 	unsigned int Used:1;
 	unsigned int File:3;
@@ -115,9 +111,9 @@ void rc_pair_foreach_source_that_rgb_reads(
 	void * data,
 	rc_pair_foreach_src_fn cb);
 
-unsigned int rc_source_type_that_arg_reads(
-	unsigned int source,
-	unsigned int swizzle);
+struct rc_pair_instruction_source * rc_pair_get_src(
+	struct rc_pair_instruction * pair_inst,
+	struct rc_pair_instruction_arg * arg);
 /*@}*/
 
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
index 618ab5a099b..ae13f6742f8 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
@@ -129,6 +129,7 @@ static char rc_swizzle_char(unsigned int swz)
 	case RC_SWIZZLE_HALF: return 'H';
 	case RC_SWIZZLE_UNUSED: return '_';
 	}
+	fprintf(stderr, "bad swz: %u\n", swz);
 	return '?';
 }
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
index 530afa5e08e..f9d9f34b6ad 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
@@ -28,6 +28,8 @@
 
 #include "radeon_program_tex.h"
 
+#include "radeon_compiler_util.h"
+
 /* Series of transformations to be done on textures. */
 
 static struct rc_src_register shadow_ambient(struct r300_fragment_program_compiler *compiler,
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c b/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c
index 5f67f536f61..7d76585a593 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c
@@ -87,8 +87,9 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user)
 		rc_for_all_reads_src(inst, mark_used, &d);
 	}
 
-	/* Pass 2: If there is relative addressing, mark all externals as used. */
-	if (has_rel_addr) {
+	/* Pass 2: If there is relative addressing or dead constant elimination
+	 * is disabled, mark all externals as used. */
+	if (has_rel_addr || !c->remove_unused_constants) {
 		for (unsigned i = 0; i < c->Program.Constants.Count; i++)
 			if (constants[i].Type == RC_CONSTANT_EXTERNAL)
 				const_used[i] = 1;
@@ -119,7 +120,7 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user)
 	/*  is_identity ==> new_count == old_count
 	 * !is_identity ==> new_count <  old_count */
 	assert( is_identity || new_count <  c->Program.Constants.Count);
-	assert(!(has_rel_addr && are_externals_remapped));
+	assert(!((has_rel_addr || !c->remove_unused_constants) && are_externals_remapped));
 
 	/* Pass 4: Redirect reads of all constants to their new locations. */
 	if (!is_identity) {
@@ -127,7 +128,6 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user)
 		     inst != &c->Program.Instructions; inst = inst->Next) {
 			rc_remap_registers(inst, remap_regs, inv_remap_table);
 		}
-
 	}
 
 	/* Set the new constant count. Note that new_count may be less than
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
index 60e228be5bd..88165f78953 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
@@ -33,100 +33,51 @@
 
 #include "radeon_compiler.h"
 #include "radeon_dataflow.h"
-
-struct reg_rename {
-	int old_index;
-	int new_index;
-	int temp_index;
-};
-
-static void rename_reg(void * data, struct rc_instruction * inst,
-			rc_register_file * file, unsigned int * index)
-{
-	struct reg_rename *r = data;
-
-	if(r->old_index == *index && *file == RC_FILE_TEMPORARY) {
-		*index = r->new_index;
-	}
-	else if(r->new_index == *index && *file == RC_FILE_TEMPORARY) {
-		*index = r->temp_index;
-	}
-}
-
-static void rename_all(
-	struct radeon_compiler *c,
-	struct rc_instruction * start,
-	unsigned int old,
-	unsigned int new,
-	unsigned int temp)
-{
-	struct rc_instruction * inst;
-	struct reg_rename r;
-	r.old_index = old;
-	r.new_index = new;
-	r.temp_index = temp;
-	for(inst = start; inst != &c->Program.Instructions;
-						inst = inst->Next) {
-		rc_remap_registers(inst, rename_reg, &r);
-	}
-}
+#include "radeon_program.h"
 
 /**
  * This function renames registers in an attempt to get the code close to
  * SSA form.  After this function has completed, most of the register are only
- * written to one time, with a few exceptions.  For example, this block of code
- * will not be modified by this function:
- * Mov Temp[0].x Const[0].x
- * Mov Temp[0].y Const[0].y
- * Basically, destination registers will be renamed if:
- * 1. There have been no previous writes to that register
- * or
- * 2. If the instruction is writting to the exact components (no more, no less)
- * of a register that has been written to by previous instructions.
+ * written to one time, with a few exceptions.
  *
  * This function assumes all the instructions are still of type
  * RC_INSTRUCTION_NORMAL.
  */
 void rc_rename_regs(struct radeon_compiler *c, void *user)
 {
-	unsigned int cur_index = 0;
-	unsigned int icount;
+	unsigned int i, used_length;
+	int new_index;
 	struct rc_instruction * inst;
-	unsigned int * masks;
+	struct rc_reader_data reader_data;
+	unsigned char * used;
 
-	/* The number of instructions in the program is also the maximum
-	 * number of temp registers that could potentially be used. */
-	icount = rc_recompute_ips(c);
-	masks = memory_pool_malloc(&c->Pool, icount * sizeof(unsigned int));
-	memset(masks, 0, icount * sizeof(unsigned int));
+	used_length = 2 * rc_recompute_ips(c);
+	used = memory_pool_malloc(&c->Pool, sizeof(unsigned char) * used_length);
+	memset(used, 0, sizeof(unsigned char) * used_length);
 
+	rc_get_used_temporaries(c, used, used_length);
 	for(inst = c->Program.Instructions.Next;
 					inst != &c->Program.Instructions;
 					inst = inst->Next) {
-		const struct rc_opcode_info * info;
-		unsigned int old_index, temp_index;
-		struct rc_dst_register * dst;
-		if(inst->Type != RC_INSTRUCTION_NORMAL) {
-			rc_error(c, "%s only works with normal instructions.",
-								__FUNCTION__);
-			return;
-		}
-		dst = &inst->U.I.DstReg;
-		info = rc_get_opcode_info(inst->U.I.Opcode);
-		if(!info->HasDstReg || dst->File != RC_FILE_TEMPORARY) {
+
+		if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
 			continue;
+
+		rc_get_readers(c, inst, &reader_data, NULL, NULL, NULL);
+
+		if (reader_data.Abort || reader_data.ReaderCount == 0)
+			continue;
+
+		new_index = rc_find_free_temporary_list(c, used, used_length,
+						RC_MASK_XYZW);
+		if (new_index < 0) {
+			rc_error(c, "Ran out of temporary registers\n");
+			return;
 		}
-		if(dst->Index >= icount || !masks[dst->Index] ||
-					masks[dst->Index] == dst->WriteMask) {
-			old_index = dst->Index;
-			/* We need to set dst->Index here so get free temporary
-			 * will work. */
-			dst->Index = cur_index++;
-			temp_index = rc_find_free_temporary(c);
-			rename_all(c, inst->Next, old_index,
-						dst->Index, temp_index);
+
+		reader_data.Writer->U.I.DstReg.Index = new_index;
+		for(i = 0; i < reader_data.ReaderCount; i++) {
+			reader_data.Readers[i].U.Src->Index = new_index;
 		}
-		assert(dst->Index < icount);
-		masks[dst->Index] |= dst->WriteMask;
 	}
 }
diff --git a/src/mesa/drivers/dri/r600/evergreen_chip.c b/src/mesa/drivers/dri/r600/evergreen_chip.c
index 2c9e4e2b844..53dacbfdf39 100644
--- a/src/mesa/drivers/dri/r600/evergreen_chip.c
+++ b/src/mesa/drivers/dri/r600/evergreen_chip.c
@@ -286,7 +286,11 @@ static void evergreenSetupVTXConstants(struct gl_context  * ctx,
     if (!paos->bo)
 	    return;
 
-	r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_CEDAR) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_PALM))
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, TC_ACTION_ENA_bit);
+    else
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
 
     //uSQ_VTX_CONSTANT_WORD0_0
     uSQ_VTX_CONSTANT_WORD0_0 = paos->offset;
diff --git a/src/mesa/drivers/dri/r600/evergreen_state.c b/src/mesa/drivers/dri/r600/evergreen_state.c
index a77be183a12..076a608573c 100644
--- a/src/mesa/drivers/dri/r600/evergreen_state.c
+++ b/src/mesa/drivers/dri/r600/evergreen_state.c
@@ -1461,6 +1461,14 @@ static void evergreenInitSQConfig(struct gl_context * ctx)
         uMaxThreads = 248;
         uMaxStackEntries = 512;
 	    break;
+    case CHIP_FAMILY_PALM:
+	    uSqNumCfInsts       = 1;
+        bVC_ENABLE = GL_FALSE;
+        uMaxGPRs = 256;
+        uPSThreadCount = 96;
+        uMaxThreads = 192;
+        uMaxStackEntries = 256;
+	    break;
     default:
         uSqNumCfInsts       = 2;
         bVC_ENABLE = GL_TRUE;
diff --git a/src/mesa/drivers/dri/r600/evergreen_tex.c b/src/mesa/drivers/dri/r600/evergreen_tex.c
index 2f4c92d6767..3b5448a0e4e 100644
--- a/src/mesa/drivers/dri/r600/evergreen_tex.c
+++ b/src/mesa/drivers/dri/r600/evergreen_tex.c
@@ -31,7 +31,6 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/teximage.h"
-#include "main/mipmap.h"
 #include "main/simple_list.h"
 #include "main/texobj.h"
 
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index b6443bf0c53..aa1891eac32 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -259,7 +259,7 @@ static void r600InitConstValues(struct gl_context *ctx, radeonScreenPtr screen)
     R700_CHIP_CONTEXT *r700    = (R700_CHIP_CONTEXT*)(&context->hw);
 
     if(  (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_CEDAR)
-       &&(context->radeon.radeonScreen->chip_family <= CHIP_FAMILY_HEMLOCK) )
+       &&(context->radeon.radeonScreen->chip_family <= CHIP_FAMILY_PALM) )
     {
         r700->bShaderUseMemConstant = GL_TRUE;
     }
@@ -285,8 +285,13 @@ static void r600InitConstValues(struct gl_context *ctx, radeonScreenPtr screen)
 	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 	ctx->Const.MaxTextureLodBias = 16.0;
 
-	ctx->Const.MaxTextureLevels = 13; /* hw support 14 */
-	ctx->Const.MaxTextureRectSize = 4096; /* hw support 8192 */
+	if (screen->chip_family >= CHIP_FAMILY_CEDAR) {
+		ctx->Const.MaxTextureLevels = 15;
+		ctx->Const.MaxTextureRectSize = 16384;
+	} else {
+		ctx->Const.MaxTextureLevels = 14;
+		ctx->Const.MaxTextureRectSize = 8192;
+	}
 
 	ctx->Const.MinPointSize   = 0x0001 / 8.0;
 	ctx->Const.MinPointSizeAA = 0x0001 / 8.0;
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 2bf24096a0d..1fa559cec1a 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -3334,7 +3334,14 @@ GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
         return GL_FALSE;
     }
 
-    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
+    if(8 == pAsm->unAsic)
+    {
+	pAsm->D.dst.opcode = EG_OP3_INST_CNDGE;
+    }
+    else
+    {
+	pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
+    }
     pAsm->D.dst.op3     = 1;  
 
     tmp = (-1);
@@ -3416,8 +3423,14 @@ GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
     checkop1(pAsm);
 
     tmp = gethelpr(pAsm);
-
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -3457,7 +3470,14 @@ GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
     {
         return GL_FALSE;
     }
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -4742,7 +4762,14 @@ GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
 
     tmp = gethelpr(pAsm);
 
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -4782,7 +4809,14 @@ GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
     {
         return GL_FALSE;
     }
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -5010,7 +5044,14 @@ GLboolean assemble_SSG(r700_AssemblerBase *pAsm)
     
     GLuint tmp = gethelpr(pAsm);
     /* tmp = (src > 0 ? 1 : src) */
-    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_CNDGT;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    }
     pAsm->D.dst.op3    = 1;
     pAsm->D.dst.rtype = DST_REG_TEMPORARY;
     pAsm->D.dst.reg   = tmp;
@@ -5033,7 +5074,14 @@ GLboolean assemble_SSG(r700_AssemblerBase *pAsm)
     }
 
     /* dst = (-tmp > 0 ? -1 : tmp) */
-    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_CNDGT;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    }
     pAsm->D.dst.op3    = 1;
 
     if( GL_FALSE == assemble_dst(pAsm) )
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index 61106fbc43f..82789cec5ed 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -440,6 +440,11 @@
 #define PCI_CHIP_HEMLOCK_689C           0x689C
 #define PCI_CHIP_HEMLOCK_689D           0x689D
 
+#define PCI_CHIP_PALM_9802              0x9802
+#define PCI_CHIP_PALM_9803              0x9803
+#define PCI_CHIP_PALM_9804              0x9804
+#define PCI_CHIP_PALM_9805              0x9805
+
 enum {
    CHIP_FAMILY_R100,
    CHIP_FAMILY_RV100,
@@ -483,6 +488,7 @@ enum {
    CHIP_FAMILY_JUNIPER,
    CHIP_FAMILY_CYPRESS,
    CHIP_FAMILY_HEMLOCK,
+   CHIP_FAMILY_PALM,
    CHIP_FAMILY_LAST
 };
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index fecdd119059..ca6ab46ca43 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -99,6 +99,7 @@ static const char* get_chip_family_name(int chip_family)
 	case CHIP_FAMILY_JUNIPER: return "JUNIPER";
 	case CHIP_FAMILY_CYPRESS: return "CYPRESS";
 	case CHIP_FAMILY_HEMLOCK: return "HEMLOCK";
+	case CHIP_FAMILY_PALM: return "PALM";
 	default: return "unknown";
 	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
index 088f9701722..a68a9768779 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
@@ -49,7 +49,7 @@ struct _radeon_mipmap_level {
 };
 
 /* store the max possible in the miptree */
-#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 13
+#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 15
 
 /**
  * A mipmap tree contains texture images in the layout that the hardware
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index b379240579d..94e56c2ade6 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -1155,6 +1155,14 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
        screen->chip_flags = RADEON_CHIPSET_TCL;
        break;
 
+    case PCI_CHIP_PALM_9802:
+    case PCI_CHIP_PALM_9803:
+    case PCI_CHIP_PALM_9804:
+    case PCI_CHIP_PALM_9805:
+       screen->chip_family = CHIP_FAMILY_PALM;
+       screen->chip_flags = RADEON_CHIPSET_TCL;
+       break;
+
    default:
       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
 	      device_id);
diff --git a/src/mesa/drivers/dri/sis/server/sis_dri.h b/src/mesa/drivers/dri/sis/server/sis_dri.h
index f0171f3c0f8..7d8f507115d 100644
--- a/src/mesa/drivers/dri/sis/server/sis_dri.h
+++ b/src/mesa/drivers/dri/sis/server/sis_dri.h
@@ -72,13 +72,4 @@ typedef struct {
   int dummy;
 } SISDRIContextRec, *SISDRIContextPtr;
 
-#ifdef XFree86Server
-
-#include "screenint.h"
-
-Bool SISDRIScreenInit(ScreenPtr pScreen);
-void SISDRICloseScreen(ScreenPtr pScreen);
-Bool SISDRIFinishScreenInit(ScreenPtr pScreen);
-
-#endif
 #endif
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_context.h b/src/mesa/drivers/dri/tdfx/tdfx_context.h
index fb38419dcdd..7e2f0e00a8e 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_context.h
+++ b/src/mesa/drivers/dri/tdfx/tdfx_context.h
@@ -41,11 +41,7 @@
 
 #include <sys/time.h>
 #include "dri_util.h"
-#ifdef XFree86Server
-#include "GL/xf86glx.h"
-#else
 #include "main/glheader.h"
-#endif
 #if defined(__linux__)
 #include <signal.h>
 #endif
diff --git a/src/mesa/drivers/dri/unichrome/server/via_dri.h b/src/mesa/drivers/dri/unichrome/server/via_dri.h
index b47397d5728..c6eed03c1c9 100644
--- a/src/mesa/drivers/dri/unichrome/server/via_dri.h
+++ b/src/mesa/drivers/dri/unichrome/server/via_dri.h
@@ -35,9 +35,7 @@
 #define VIA_DRIDDX_VERSION_MINOR  0
 #define VIA_DRIDDX_VERSION_PATCH  0
 
-#ifndef XFree86Server
 typedef int Bool;
-#endif
 
 typedef struct {
     drm_handle_t handle;
diff --git a/src/mesa/drivers/windows/gdi/InitCritSections.cpp b/src/mesa/drivers/windows/gdi/InitCritSections.cpp
index 7145bffa510..69f03b8e47c 100644
--- a/src/mesa/drivers/windows/gdi/InitCritSections.cpp
+++ b/src/mesa/drivers/windows/gdi/InitCritSections.cpp
@@ -1,7 +1,8 @@
 #include "glapi.h"
 #include "glThread.h"
 
-#ifdef WIN32_THREADS
+#ifdef WIN32
+
 extern "C" _glthread_Mutex OneTimeLock;
 extern "C" _glthread_Mutex GenTexturesLock;
 
@@ -29,4 +30,4 @@ public:
 _CriticalSectionInit _CriticalSectionInit::m_inst;
 
 
-#endif
+#endif /* WIN32 */
diff --git a/src/mesa/drivers/x11/glxheader.h b/src/mesa/drivers/x11/glxheader.h
index d88afba20e7..ee002191bc0 100644
--- a/src/mesa/drivers/x11/glxheader.h
+++ b/src/mesa/drivers/x11/glxheader.h
@@ -32,13 +32,6 @@
 
 #include "main/glheader.h"
 
-#ifdef XFree86Server
-
-# include "xorg-server.h"
-# include "resource.h"
-# include "windowstr.h"
-
-#else
 
 # include <X11/Xlib.h>
 # include <X11/Xlibint.h>
@@ -51,7 +44,6 @@
 # include <GL/glx.h>
 # include <sys/time.h>
 
-#endif
 
 
 
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 00ceb960c62..b5eabadf486 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -158,14 +158,12 @@ static short hpcr_rgbTbl[3][256] = {
 /**
  * Return the host's byte order as LSBFirst or MSBFirst ala X.
  */
-#ifndef XFree86Server
 static int host_byte_order( void )
 {
    int i = 1;
    char *cptr = (char *) &i;
    return (*cptr==1) ? LSBFirst : MSBFirst;
 }
-#endif
 
 
 /**
@@ -176,7 +174,7 @@ static int host_byte_order( void )
  */
 static int check_for_xshm( XMesaDisplay *display )
 {
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
    int major, minor, ignore;
    Bool pixmaps;
 
@@ -227,16 +225,6 @@ gamma_adjust( GLfloat gamma, GLint value, GLint max )
 static int
 bits_per_pixel( XMesaVisual xmv )
 {
-#ifdef XFree86Server
-   const int depth = xmv->nplanes;
-   int i;
-   assert(depth > 0);
-   for (i = 0; i < screenInfo.numPixmapFormats; i++) {
-      if (screenInfo.formats[i].depth == depth)
-         return screenInfo.formats[i].bitsPerPixel;
-   }
-   return depth;  /* should never get here, but this should be safe */
-#else
    XMesaDisplay *dpy = xmv->display;
    XMesaVisualInfo visinfo = xmv->visinfo;
    XMesaImage *img;
@@ -257,7 +245,6 @@ bits_per_pixel( XMesaVisual xmv )
    img->data = NULL;
    XMesaDestroyImage( img );
    return bitsPerPixel;
-#endif
 }
 
 
@@ -271,7 +258,6 @@ bits_per_pixel( XMesaVisual xmv )
  * Return:  GL_TRUE - window exists
  *          GL_FALSE - window doesn't exist
  */
-#ifndef XFree86Server
 static GLboolean WindowExistsFlag;
 
 static int window_exists_err_handler( XMesaDisplay* dpy, XErrorEvent* xerr )
@@ -306,7 +292,6 @@ get_drawable_size( XMesaDisplay *dpy, Drawable d, GLuint *width, GLuint *height
    *height = h;
    return stat;
 }
-#endif
 
 
 /**
@@ -319,10 +304,6 @@ void
 xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b,
                       GLuint *width, GLuint *height)
 {
-#ifdef XFree86Server
-   *width = MIN2(b->frontxrb->drawable->width, MAX_WIDTH);
-   *height = MIN2(b->frontxrb->drawable->height, MAX_HEIGHT);
-#else
    Status stat;
 
    _glthread_LOCK_MUTEX(_xmesa_lock);
@@ -335,7 +316,6 @@ xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b,
       _mesa_warning(NULL, "XGetGeometry failed!\n");
       *width = *height = 1;
    }
-#endif
 }
 
 
@@ -549,16 +529,11 @@ noFaultXAllocColor( int client,
                     XMesaColor *color,
                     int *exact, int *alloced )
 {
-#ifdef XFree86Server
-   Pixel *ppixIn;
-   xrgb *ctable;
-#else
    /* we'll try to cache ctable for better remote display performance */
    static Display *prevDisplay = NULL;
    static XMesaColormap prevCmap = 0;
    static int prevCmapSize = 0;
    static XMesaColor *ctable = NULL;
-#endif
    XMesaColor subColor;
    int i, bestmatch;
    double mindist;       /* 3*2^16^2 exceeds long int precision. */
@@ -566,14 +541,7 @@ noFaultXAllocColor( int client,
    (void) client;
 
    /* First try just using XAllocColor. */
-#ifdef XFree86Server
-   if (AllocColor(cmap,
-		  &color->red, &color->green, &color->blue,
-		  &color->pixel,
-		  client) == Success)
-#else
    if (XAllocColor(dpy, cmap, color))
-#endif
    {
       *exact = 1;
       *alloced = 1;
@@ -584,14 +552,6 @@ noFaultXAllocColor( int client,
 
    /* Retrieve color table entries. */
    /* XXX alloca candidate. */
-#ifdef XFree86Server
-   ppixIn = (Pixel *) MALLOC(cmapSize * sizeof(Pixel));
-   ctable = (xrgb *) MALLOC(cmapSize * sizeof(xrgb));
-   for (i = 0; i < cmapSize; i++) {
-      ppixIn[i] = i;
-   }
-   QueryColors(cmap, cmapSize, ppixIn, ctable);
-#else
    if (prevDisplay != dpy || prevCmap != cmap
        || prevCmapSize != cmapSize || !ctable) {
       /* free previously cached color table */
@@ -608,7 +568,6 @@ noFaultXAllocColor( int client,
       prevCmap = cmap;
       prevCmapSize = cmapSize;
    }
-#endif
 
    /* Find best match. */
    bestmatch = -1;
@@ -632,14 +591,7 @@ noFaultXAllocColor( int client,
     * fail if the cell is read/write.  Otherwise, we're incrementing
     * the cell's reference count.
     */
-#ifdef XFree86Server
-   if (AllocColor(cmap,
-		  &subColor.red, &subColor.green, &subColor.blue,
-		  &subColor.pixel,
-		  client) == Success) {
-#else
    if (XAllocColor(dpy, cmap, &subColor)) {
-#endif
       *alloced = 1;
    }
    else {
@@ -651,12 +603,7 @@ noFaultXAllocColor( int client,
       subColor.flags = DoRed | DoGreen | DoBlue;
       *alloced = 0;
    }
-#ifdef XFree86Server
-   free(ppixIn);
-   free(ctable);
-#else
    /* don't free table, save it for next time */
-#endif
 
    *color = subColor;
    *exact = 0;
@@ -873,10 +820,8 @@ setup_8bit_hpcr(XMesaVisual v)
       v->hpcr_clear_pixmap = XMesaCreatePixmap(v->display,
                                                DefaultRootWindow(v->display),
                                                16, 2, 8);
-#ifndef XFree86Server
       v->hpcr_clear_ximage = XGetImage(v->display, v->hpcr_clear_pixmap,
                                        0, 0, 16, 2, AllPlanes, ZPixmap);
-#endif
    }
 }
 
@@ -1049,9 +994,6 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
    int client = 0;
    const int xclass = v->visualType;
 
-#ifdef XFree86Server
-   client = (window) ? CLIENT_ID(window->id) : 0;
-#endif
 
    ASSERT(!b || b->xm_visual == v);
 
@@ -1120,40 +1062,23 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
       }
 
       /* X11 graphics contexts */
-#ifdef XFree86Server
-      b->gc = CreateScratchGC(v->display, window->depth);
-#else
       b->gc = XCreateGC( v->display, window, 0, NULL );
-#endif
       XMesaSetFunction( v->display, b->gc, GXcopy );
 
       /* cleargc - for glClear() */
-#ifdef XFree86Server
-      b->cleargc = CreateScratchGC(v->display, window->depth);
-#else
       b->cleargc = XCreateGC( v->display, window, 0, NULL );
-#endif
       XMesaSetFunction( v->display, b->cleargc, GXcopy );
 
       /*
        * Don't generate Graphics Expose/NoExpose events in swapbuffers().
        * Patch contributed by Michael Pichler May 15, 1995.
        */
-#ifdef XFree86Server
-      b->swapgc = CreateScratchGC(v->display, window->depth);
-      {
-         CARD32 v[1];
-         v[0] = FALSE;
-         dixChangeGC(NullClient, b->swapgc, GCGraphicsExposures, v, NULL);
-      }
-#else
       {
          XGCValues gcvalues;
          gcvalues.graphics_exposures = False;
          b->swapgc = XCreateGC(v->display, window,
                                GCGraphicsExposures, &gcvalues);
       }
-#endif
       XMesaSetFunction( v->display, b->swapgc, GXcopy );
       /*
        * Set fill style and tile pixmap once for all for HPCR stuff
@@ -1175,9 +1100,6 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
 
       /* Initialize the row buffer XImage for use in write_color_span() */
       data = (char*) MALLOC(MAX_WIDTH*4);
-#ifdef XFree86Server
-      b->rowimage = XMesaCreateImage(GET_VISUAL_DEPTH(v), MAX_WIDTH, 1, data);
-#else
       b->rowimage = XCreateImage( v->display,
                                   v->visinfo->visual,
                                   v->visinfo->depth,
@@ -1186,7 +1108,6 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
                                   MAX_WIDTH, 1,         /*width, height*/
                                   32,                   /*bitmap_pad*/
                                   0                     /*bytes_per_line*/ );
-#endif
       if (!b->rowimage)
          return GL_FALSE;
    }
@@ -1334,7 +1255,6 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
    XMesaVisual v;
    GLint red_bits, green_bits, blue_bits, alpha_bits;
 
-#ifndef XFree86Server
    /* For debugging only */
    if (_mesa_getenv("MESA_XSYNC")) {
       /* This makes debugging X easier.
@@ -1343,7 +1263,6 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
        */
       XSynchronize( display, 1 );
    }
-#endif
 
    /* Color-index rendering not supported. */
    if (!rgb_flag)
@@ -1360,14 +1279,12 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
     * the struct but we may need some of the information contained in it
     * at a later time.
     */
-#ifndef XFree86Server
    v->visinfo = (XVisualInfo *) MALLOC(sizeof(*visinfo));
    if(!v->visinfo) {
       free(v);
       return NULL;
    }
    memcpy(v->visinfo, visinfo, sizeof(*visinfo));
-#endif
 
    /* check for MESA_GAMMA environment variable */
    gamma = _mesa_getenv("MESA_GAMMA");
@@ -1384,30 +1301,13 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
 
    v->ximage_flag = ximage_flag;
 
-#ifdef XFree86Server
-   /* We could calculate these values by ourselves.  nplanes is either the sum
-    * of the red, green, and blue bits or the number index bits.
-    * ColormapEntries is either (1U << index_bits) or
-    * (1U << max(redBits, greenBits, blueBits)).
-    */
-   assert(visinfo->nplanes > 0);
-   v->nplanes = visinfo->nplanes;
-   v->ColormapEntries = visinfo->ColormapEntries;
-
-   v->mesa_visual.redMask = visinfo->redMask;
-   v->mesa_visual.greenMask = visinfo->greenMask;
-   v->mesa_visual.blueMask = visinfo->blueMask;
-   v->visualID = visinfo->vid;
-   v->screen = 0; /* FIXME: What should be done here? */
-#else
    v->mesa_visual.redMask = visinfo->red_mask;
    v->mesa_visual.greenMask = visinfo->green_mask;
    v->mesa_visual.blueMask = visinfo->blue_mask;
    v->visualID = visinfo->visualid;
    v->screen = visinfo->screen;
-#endif
 
-#if defined(XFree86Server) || !(defined(__cplusplus) || defined(c_plusplus))
+#if !(defined(__cplusplus) || defined(c_plusplus))
    v->visualType = xmesa_convert_from_x_visual_type(visinfo->class);
 #else
    v->visualType = xmesa_convert_from_x_visual_type(visinfo->c_class);
@@ -1461,9 +1361,7 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
 PUBLIC
 void XMesaDestroyVisual( XMesaVisual v )
 {
-#ifndef XFree86Server
    free(v->visinfo);
-#endif
    free(v);
 }
 
@@ -1532,12 +1430,6 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
     _mesa_enable_extension(mesaCtx, "GL_EXT_timer_query");
 #endif
 
-#ifdef XFree86Server
-   /* If we're running in the X server, do bounds checking to prevent
-    * segfaults and server crashes!
-    */
-   mesaCtx->Const.CheckArrayBounds = GL_TRUE;
-#endif
 
    /* finish up xmesa context initializations */
    c->swapbytes = CHECK_BYTE_ORDER(v) ? GL_FALSE : GL_TRUE;
@@ -1602,9 +1494,7 @@ void XMesaDestroyContext( XMesaContext c )
 PUBLIC XMesaBuffer
 XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
 {
-#ifndef XFree86Server
    XWindowAttributes attr;
-#endif
    XMesaBuffer b;
    XMesaColormap cmap;
    int depth;
@@ -1613,12 +1503,8 @@ XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
    assert(w);
 
    /* Check that window depth matches visual depth */
-#ifdef XFree86Server
-   depth = ((XMesaDrawable)w)->depth;
-#else
    XGetWindowAttributes( v->display, w, &attr );
    depth = attr.depth;
-#endif
    if (GET_VISUAL_DEPTH(v) != depth) {
       _mesa_warning(NULL, "XMesaCreateWindowBuffer: depth mismatch between visual (%d) and window (%d)!\n",
                     GET_VISUAL_DEPTH(v), depth);
@@ -1626,9 +1512,6 @@ XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
    }
 
    /* Find colormap */
-#ifdef XFree86Server
-   cmap = (ColormapPtr)LookupIDByType(wColormap(w), RT_COLORMAP);
-#else
    if (attr.colormap) {
       cmap = attr.colormap;
    }
@@ -1638,7 +1521,6 @@ XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
       /* OK, let's just allocate a new one and hope for the best */
       cmap = XCreateColormap(v->display, w, attr.visual, AllocNone);
    }
-#endif
 
    b = create_xmesa_buffer((XMesaDrawable) w, WINDOW, v, cmap);
    if (!b)
@@ -1748,7 +1630,6 @@ XMesaBuffer
 XMesaCreatePBuffer(XMesaVisual v, XMesaColormap cmap,
                    unsigned int width, unsigned int height)
 {
-#ifndef XFree86Server
    XMesaWindow root;
    XMesaDrawable drawable;  /* X Pixmap Drawable */
    XMesaBuffer b;
@@ -1770,9 +1651,6 @@ XMesaCreatePBuffer(XMesaVisual v, XMesaColormap cmap,
    }
 
    return b;
-#else
-   return 0;
-#endif
 }
 
 
@@ -1931,40 +1809,6 @@ XMesaBuffer XMesaGetCurrentReadBuffer( void )
 }
 
 
-#ifdef XFree86Server
-PUBLIC
-GLboolean XMesaForceCurrent(XMesaContext c)
-{
-   if (c) {
-      _glapi_set_dispatch(c->mesa.CurrentDispatch);
-
-      if (&(c->mesa) != _mesa_get_current_context()) {
-	 _mesa_make_current(&c->mesa, c->mesa.DrawBuffer, c->mesa.ReadBuffer);
-      }
-   }
-   else {
-      _mesa_make_current(NULL, NULL, NULL);
-   }
-   return GL_TRUE;
-}
-
-
-PUBLIC
-GLboolean XMesaLoseCurrent(XMesaContext c)
-{
-   (void) c;
-   _mesa_make_current(NULL, NULL, NULL);
-   return GL_TRUE;
-}
-
-
-PUBLIC
-GLboolean XMesaCopyContext( XMesaContext xm_src, XMesaContext xm_dst, GLuint mask )
-{
-   _mesa_copy_context(&xm_src->mesa, &xm_dst->mesa, mask);
-   return GL_TRUE;
-}
-#endif /* XFree86Server */
 
 
 #ifndef FX
@@ -2004,7 +1848,7 @@ void XMesaSwapBuffers( XMesaBuffer b )
 #endif
       if (b->backxrb->ximage) {
 	 /* Copy Ximage (back buf) from client memory to server window */
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
 	 if (b->shm) {
             /*_glthread_LOCK_MUTEX(_xmesa_lock);*/
 	    XShmPutImage( b->xm_visual->display, b->frontxrb->drawable,
@@ -2041,9 +1885,7 @@ void XMesaSwapBuffers( XMesaBuffer b )
       if (b->swAlpha)
          _mesa_copy_soft_alpha_renderbuffers(ctx, &b->mesa_buffer);
    }
-#if !defined(XFree86Server)
    XSync( b->xm_visual->display, False );
-#endif
 }
 
 
@@ -2074,7 +1916,7 @@ void XMesaCopySubBuffer( XMesaBuffer b, int x, int y, int width, int height )
 #endif
       if (b->backxrb->ximage) {
          /* Copy Ximage from host's memory to server's window */
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
          if (b->shm) {
             /* XXX assuming width and height aren't too large! */
             XShmPutImage( b->xm_visual->display, b->frontxrb->drawable,
@@ -2116,7 +1958,6 @@ void XMesaCopySubBuffer( XMesaBuffer b, int x, int y, int width, int height )
  * Return:  GL_TRUE = context is double buffered
  *          GL_FALSE = context is single buffered
  */
-#ifndef XFree86Server
 GLboolean XMesaGetBackBuffer( XMesaBuffer b,
                               XMesaPixmap *pixmap,
                               XMesaImage **ximage )
@@ -2134,7 +1975,6 @@ GLboolean XMesaGetBackBuffer( XMesaBuffer b,
       return GL_FALSE;
    }
 }
-#endif /* XFree86Server */
 
 
 /*
@@ -2171,11 +2011,7 @@ GLboolean XMesaGetDepthBuffer( XMesaBuffer b, GLint *width, GLint *height,
 void XMesaFlush( XMesaContext c )
 {
    if (c && c->xm_visual) {
-#ifdef XFree86Server
-      /* NOT_NEEDED */
-#else
       XSync( c->xm_visual->display, False );
-#endif
    }
 }
 
@@ -2234,15 +2070,11 @@ void XMesaGarbageCollect( void )
    for (b=XMesaBufferList; b; b=next) {
       next = b->Next;
       if (b->display && b->frontxrb->drawable && b->type == WINDOW) {
-#ifdef XFree86Server
-	 /* NOT_NEEDED */
-#else
          XSync(b->display, False);
          if (!window_exists( b->display, b->frontxrb->drawable )) {
             /* found a dead window, free the ancillary info */
             XMesaDestroyBuffer( b );
          }
-#endif
       }
    }
 }
diff --git a/src/mesa/drivers/x11/xm_buffer.c b/src/mesa/drivers/x11/xm_buffer.c
index 2683bd44d19..10829b4284f 100644
--- a/src/mesa/drivers/x11/xm_buffer.c
+++ b/src/mesa/drivers/x11/xm_buffer.c
@@ -37,7 +37,7 @@
 #include "main/renderbuffer.h"
 
 
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
 static volatile int mesaXErrorFlag = 0;
 
 /**
@@ -170,7 +170,7 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
    if (b->db_mode == BACK_XIMAGE) {
       /* Deallocate the old backxrb->ximage, if any */
       if (b->backxrb->ximage) {
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
 	 if (b->shm) {
 	    XShmDetach(b->xm_visual->display, &b->shminfo);
 	    XDestroyImage(b->backxrb->ximage);
@@ -188,10 +188,6 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
       /* Allocate new back buffer */
       if (b->shm == 0 || !alloc_back_shm_ximage(b, width, height)) {
 	 /* Allocate a regular XImage for the back buffer. */
-#ifdef XFree86Server
-	 b->backxrb->ximage = XMesaCreateImage(b->xm_visual->BitsPerPixel,
-                                               width, height, NULL);
-#else
 	 b->backxrb->ximage = XCreateImage(b->xm_visual->display,
                                       b->xm_visual->visinfo->visual,
                                       GET_VISUAL_DEPTH(b->xm_visual),
@@ -199,7 +195,6 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
 				      NULL,
                                       width, height,
 				      8, 0);  /* pad, bytes_per_line */
-#endif
 	 if (!b->backxrb->ximage) {
 	    _mesa_warning(NULL, "alloc_back_buffer: XCreateImage failed.\n");
             return;
@@ -359,16 +354,8 @@ xmesa_delete_framebuffer(struct gl_framebuffer *fb)
    if (b->num_alloced > 0) {
       /* If no other buffer uses this X colormap then free the colors. */
       if (!xmesa_find_buffer(b->display, b->cmap, b)) {
-#ifdef XFree86Server
-         int client = 0;
-         if (b->frontxrb->drawable)
-            client = CLIENT_ID(b->frontxrb->drawable->id);
-         (void)FreeColors(b->cmap, client,
-                          b->num_alloced, b->alloced_colors, 0);
-#else
          XFreeColors(b->display, b->cmap,
                      b->alloced_colors, b->num_alloced, 0);
-#endif
       }
    }
 
@@ -382,7 +369,7 @@ xmesa_delete_framebuffer(struct gl_framebuffer *fb)
    if (fb->Visual.doubleBufferMode) {
       /* free back ximage/pixmap/shmregion */
       if (b->backxrb->ximage) {
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
          if (b->shm) {
             XShmDetach( b->display, &b->shminfo );
             XDestroyImage( b->backxrb->ximage );
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index acece2025cf..b8d9e20c426 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -93,16 +93,12 @@ const int xmesa_kernel1[16] = {
 static void
 finish_or_flush( struct gl_context *ctx )
 {
-#ifdef XFree86Server
-      /* NOT_NEEDED */
-#else
    const XMesaContext xmesa = XMESA_CONTEXT(ctx);
    if (xmesa) {
       _glthread_LOCK_MUTEX(_xmesa_lock);
       XSync( xmesa->display, False );
       _glthread_UNLOCK_MUTEX(_xmesa_lock);
    }
-#endif
 }
 
 
@@ -388,7 +384,6 @@ clear_buffers(struct gl_context *ctx, GLbitfield buffers)
 }
 
 
-#ifndef XFree86Server
 /* XXX these functions haven't been tested in the Xserver environment */
 
 
@@ -731,7 +726,6 @@ xmesa_CopyPixels( struct gl_context *ctx,
    }
 }
 
-#endif /* XFree86Server */
 
 
 
@@ -745,17 +739,9 @@ get_string( struct gl_context *ctx, GLenum name )
    (void) ctx;
    switch (name) {
       case GL_RENDERER:
-#ifdef XFree86Server
-         return (const GLubyte *) "Mesa GLX Indirect";
-#else
          return (const GLubyte *) "Mesa X11";
-#endif
       case GL_VENDOR:
-#ifdef XFree86Server
-         return (const GLubyte *) "Mesa project: www.mesa3d.org";
-#else
          return NULL;
-#endif
       default:
          return NULL;
    }
@@ -948,43 +934,6 @@ xmesa_update_state( struct gl_context *ctx, GLbitfield new_state )
 
 
 /**
- * Called via ctx->Driver.TestProxyTeximage().  Normally, we'd just use
- * the _mesa_test_proxy_teximage() fallback function, but we're going to
- * special-case the 3D texture case to allow textures up to 512x512x32
- * texels.
- */
-static GLboolean
-test_proxy_teximage(struct gl_context *ctx, GLenum target, GLint level,
-                    GLint internalFormat, GLenum format, GLenum type,
-                    GLint width, GLint height, GLint depth, GLint border)
-{
-   if (target == GL_PROXY_TEXTURE_3D) {
-      /* special case for 3D textures */
-      if (width * height * depth > 512 * 512 * 64 ||
-          width  < 2 * border ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           _mesa_bitcount(width  - 2 * border) != 1) ||
-          height < 2 * border ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           _mesa_bitcount(height - 2 * border) != 1) ||
-          depth  < 2 * border ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           _mesa_bitcount(depth  - 2 * border) != 1)) {
-         /* Bad size, or too many texels */
-         return GL_FALSE;
-      }
-      return GL_TRUE;
-   }
-   else {
-      /* use the fallback routine for 1D, 2D, cube and rect targets */
-      return _mesa_test_proxy_teximage(ctx, target, level, internalFormat,
-                                       format, type, width, height, depth,
-                                       border);
-   }
-}
-
-
-/**
  * In SW, we don't really compress GL_COMPRESSED_RGB[A] textures!
  */
 static gl_format
@@ -1124,7 +1073,6 @@ xmesa_init_driver_functions( XMesaVisual xmvisual,
    }
    else {
       driver->Clear = clear_buffers;
-#ifndef XFree86Server
       driver->CopyPixels = xmesa_CopyPixels;
       if (xmvisual->undithered_pf == PF_8R8G8B &&
           xmvisual->dithered_pf == PF_8R8G8B &&
@@ -1134,9 +1082,8 @@ xmesa_init_driver_functions( XMesaVisual xmvisual,
       else if (xmvisual->undithered_pf == PF_5R6G5B) {
          driver->DrawPixels = xmesa_DrawPixels_5R6G5B;
       }
-#endif
    }
-   driver->TestProxyTexImage = test_proxy_teximage;
+
 #if ENABLE_EXT_texure_compression_s3tc
    driver->ChooseTextureFormat = choose_tex_format;
 #else
diff --git a/src/mesa/drivers/x11/xm_glide.c b/src/mesa/drivers/x11/xm_glide.c
index cbd69b011a1..d8a0e6de6d0 100644
--- a/src/mesa/drivers/x11/xm_glide.c
+++ b/src/mesa/drivers/x11/xm_glide.c
@@ -140,16 +140,8 @@ static void FXgetImage( XMesaBuffer b )
    GLuint x, y;
    GLuint width, height;
 
-#ifdef XFree86Server
-   x = b->frontxrb->pixmap->x;
-   y = b->frontxrb->pixmap->y;
-   width = b->frontxrb->pixmap->width;
-   height = b->frontxrb->pixmap->height;
-   depth = b->frontxrb->pixmap->depth;
-#else
    xmesa_get_window_size(b->display, b, &width, &height);
    x = y = 0;
-#endif
    if (b->mesa_buffer.Width != width || b->mesa_buffer.Height != height) {
       b->mesa_buffer.Width = MIN2((int)width, b->FXctx->width);
       b->mesa_buffer.Height = MIN2((int)height, b->FXctx->height);
diff --git a/src/mesa/drivers/x11/xm_image.c b/src/mesa/drivers/x11/xm_image.c
index 087b4e4c3a7..12fef7dad34 100644
--- a/src/mesa/drivers/x11/xm_image.c
+++ b/src/mesa/drivers/x11/xm_image.c
@@ -37,97 +37,3 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "glxheader.h"
 #include "xmesaP.h"
 
-#ifdef XFree86Server
-
-#ifdef ROUNDUP
-#undef ROUNDUP
-#endif
-
-#define ROUNDUP(nbytes, pad) ((((nbytes) + ((pad)-1)) / (pad)) * ((pad)>>3))
-
-XMesaImage *XMesaCreateImage(int bitsPerPixel, int width, int height, char *data)
-{
-    XMesaImage *image;
-
-    image = (XMesaImage *)xalloc(sizeof(XMesaImage));
-
-    if (image) {
-	image->width = width;
-	image->height = height;
-	image->data = data;
-	/* Always pad to 32 bits */
-	image->bytes_per_line = ROUNDUP((bitsPerPixel * width), 32);
-	image->bits_per_pixel = bitsPerPixel;
-    }
-
-    return image;
-}
-
-void XMesaDestroyImage(XMesaImage *image)
-{
-    if (image->data)
-	free(image->data);
-    xfree(image);
-}
-
-unsigned long XMesaGetPixel(XMesaImage *image, int x, int y)
-{
-    CARD8  *row = (CARD8 *)(image->data + y*image->bytes_per_line);
-    CARD8  *i8;
-    CARD16 *i16;
-    CARD32 *i32;
-    switch (image->bits_per_pixel) {
-    case 8:
-	i8 = (CARD8 *)row;
-	return i8[x];
-	break;
-    case 15:
-    case 16:
-	i16 = (CARD16 *)row;
-	return i16[x];
-	break;
-    case 24: /* WARNING: architecture specific code */
-	i8 = (CARD8 *)row;
-	return (((CARD32)i8[x*3]) |
-		(((CARD32)i8[x*3+1])<<8) |
-		(((CARD32)i8[x*3+2])<<16));
-	break;
-    case 32:
-	i32 = (CARD32 *)row;
-	return i32[x];
-	break;
-    }
-    return 0;
-}
-
-#ifndef XMESA_USE_PUTPIXEL_MACRO
-void XMesaPutPixel(XMesaImage *image, int x, int y, unsigned long pixel)
-{
-    CARD8  *row = (CARD8 *)(image->data + y*image->bytes_per_line);
-    CARD8  *i8;
-    CARD16 *i16;
-    CARD32 *i32;
-    switch (image->bits_per_pixel) {
-    case 8:
-	i8 = (CARD8 *)row;
-	i8[x] = (CARD8)pixel;
-	break;
-    case 15:
-    case 16:
-	i16 = (CARD16 *)row;
-	i16[x] = (CARD16)pixel;
-	break;
-    case 24: /* WARNING: architecture specific code */
-	i8 = (CARD8 *)__row;
-	i8[x*3]   = (CARD8)(p);
-	i8[x*3+1] = (CARD8)(p>>8);
-	i8[x*3+2] = (CARD8)(p>>16);
-    case 32:
-	i32 = (CARD32 *)row;
-	i32[x] = (CARD32)pixel;
-	break;
-    }
-}
-#endif
-
-#endif /* XFree86Server */
diff --git a/src/mesa/drivers/x11/xm_line.c b/src/mesa/drivers/x11/xm_line.c
index f03f99f918f..04cedcd4ec0 100644
--- a/src/mesa/drivers/x11/xm_line.c
+++ b/src/mesa/drivers/x11/xm_line.c
@@ -537,7 +537,6 @@ void xmesa_choose_point( struct gl_context *ctx )
 
 
 
-#ifndef XFree86Server
 /**
  * Draw fast, XOR line with XDrawLine in front color buffer.
  * WARNING: this isn't fully OpenGL conformant because different pixels
@@ -567,7 +566,6 @@ xor_line(struct gl_context *ctx, const SWvertex *vert0, const SWvertex *vert1)
    XDrawLine(dpy, xrb->pixmap, gc, x0, y0, x1, y1);
    XMesaSetFunction(dpy, gc, GXcopy);  /* this gc is used elsewhere */
 }
-#endif /* XFree86Server */
 
 
 #endif /* CHAN_BITS == 8 */
@@ -660,7 +658,6 @@ get_line_func(struct gl_context *ctx)
       }
    }
 
-#ifndef XFree86Server
    if (ctx->DrawBuffer->_NumColorDrawBuffers == 1
        && ctx->DrawBuffer->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT
        && swrast->_RasterMask == LOGIC_OP_BIT
@@ -669,7 +666,6 @@ get_line_func(struct gl_context *ctx)
        && !ctx->Line.SmoothFlag) {
       return xor_line;
    }
-#endif /* XFree86Server */
 
 #endif /* CHAN_BITS == 8 */
    return (swrast_line_func) NULL;
diff --git a/src/mesa/drivers/x11/xm_span.c b/src/mesa/drivers/x11/xm_span.c
index ab66c5e1f12..294b93a57cc 100644
--- a/src/mesa/drivers/x11/xm_span.c
+++ b/src/mesa/drivers/x11/xm_span.c
@@ -42,7 +42,6 @@
  * generate BadMatch errors if the drawable isn't mapped.
  */
 
-#ifndef XFree86Server
 static int caught_xgetimage_error = 0;
 static int (*old_xerror_handler)( XMesaDisplay *dpy, XErrorEvent *ev );
 static unsigned long xgetimage_serial;
@@ -87,7 +86,6 @@ static int check_xgetimage_errors( void )
    /* return 0=no error, 1=error caught */
    return caught_xgetimage_error;
 }
-#endif
 
 
 /*
@@ -97,7 +95,6 @@ static unsigned long read_pixel( XMesaDisplay *dpy,
                                  XMesaDrawable d, int x, int y )
 {
    unsigned long p;
-#ifndef XFree86Server
    XMesaImage *pixel = NULL;
    int error;
 
@@ -113,9 +110,6 @@ static unsigned long read_pixel( XMesaDisplay *dpy,
    if (pixel) {
       XMesaDestroyImage( pixel );
    }
-#else
-   (*dpy->GetImage)(d, x, y, 1, 1, ZPixmap, ~0L, (pointer)&p);
-#endif
    return p;
 }
 
@@ -3763,7 +3757,6 @@ static void put_values_ci_ximage( PUT_VALUES_ARGS )
 /*****                      Pixel reading                         *****/
 /**********************************************************************/
 
-#ifndef XFree86Server
 /**
  * Do clip testing prior to calling XGetImage.  If any of the region lies
  * outside the screen's bounds, XGetImage will return NULL.
@@ -3806,7 +3799,6 @@ clip_for_xgetimage(struct gl_context *ctx, XMesaPixmap pixmap, GLuint *n, GLint
    }
    return 0;
 }
-#endif
 
 
 /*
@@ -3824,7 +3816,6 @@ get_row_ci(struct gl_context *ctx, struct gl_renderbuffer *rb,
    y = YFLIP(xrb, y);
 
    if (xrb->pixmap) {
-#ifndef XFree86Server
       XMesaImage *span = NULL;
       int error;
       int k = clip_for_xgetimage(ctx, xrb->pixmap, &n, &x, &y);
@@ -3850,11 +3841,6 @@ get_row_ci(struct gl_context *ctx, struct gl_renderbuffer *rb,
       if (span) {
 	 XMesaDestroyImage( span );
       }
-#else
-      (*xmesa->display->GetImage)(xrb->drawable,
-				  x, y, n, 1, ZPixmap,
-				  ~0L, (pointer)index);
-#endif
    }
    else if (xrb->ximage) {
       XMesaImage *img = xrb->ximage;
@@ -3882,14 +3868,6 @@ get_row_rgba(struct gl_context *ctx, struct gl_renderbuffer *rb,
       /* Read from Pixmap or Window */
       XMesaImage *span = NULL;
       int error;
-#ifdef XFree86Server
-      span = XMesaCreateImage(xmesa->xm_visual->BitsPerPixel, n, 1, NULL);
-      span->data = (char *)MALLOC(span->height * span->bytes_per_line);
-      error = (!span->data);
-      (*xmesa->display->GetImage)(xrb->drawable,
-				  x, YFLIP(xrb, y), n, 1, ZPixmap,
-				  ~0L, (pointer)span->data);
-#else
       int k;
       y = YFLIP(xrb, y);
       k = clip_for_xgetimage(ctx, xrb->pixmap, &n, &x, &y);
@@ -3900,7 +3878,6 @@ get_row_rgba(struct gl_context *ctx, struct gl_renderbuffer *rb,
       span = XGetImage( xmesa->display, xrb->pixmap,
 		        x, y, n, 1, AllPlanes, ZPixmap );
       error = check_xgetimage_errors();
-#endif
       if (span && !error) {
 	 switch (xmesa->pixelformat) {
 	    case PF_Truecolor:
diff --git a/src/mesa/drivers/x11/xmesa.h b/src/mesa/drivers/x11/xmesa.h
index f63626a9702..98737fab248 100644
--- a/src/mesa/drivers/x11/xmesa.h
+++ b/src/mesa/drivers/x11/xmesa.h
@@ -72,13 +72,9 @@ and create a window, you must do the following to use the X/Mesa interface:
 extern "C" {
 #endif
 
-#ifdef XFree86Server
-#include "xmesa_xf86.h"
-#else
 #include <X11/Xlib.h>
 #include <X11/Xutil.h>
 #include "xmesa_x.h"
-#endif
 #include "GL/gl.h"
 
 #ifdef AMIWIN
@@ -180,19 +176,6 @@ extern XMesaContext XMesaCreateContext( XMesaVisual v,
 extern void XMesaDestroyContext( XMesaContext c );
 
 
-#ifdef XFree86Server
-/*
- * These are the extra routines required for integration with XFree86.
- * None of these routines should be user visible. -KEM
- */
-extern GLboolean XMesaForceCurrent( XMesaContext c );
-
-extern GLboolean XMesaLoseCurrent( XMesaContext c );
-
-extern GLboolean XMesaCopyContext( XMesaContext src,
-				   XMesaContext dst,
-				   GLuint mask );
-#endif /* XFree86Server */
 
 
 /*
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index 5d34b430cb6..63e3e211bf6 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -33,9 +33,6 @@
 #include "fxmesa.h"
 #include "xm_glide.h"
 #endif
-#ifdef XFree86Server
-#include "xm_image.h"
-#endif
 
 
 extern _glthread_Mutex _xmesa_lock;
@@ -88,13 +85,8 @@ struct xmesa_visual {
    XMesaDisplay *display;	/* The X11 display */
    int screen, visualID;
    int visualType;
-#ifdef XFree86Server
-   GLint ColormapEntries;
-   GLint nplanes;
-#else
    XMesaVisualInfo visinfo;	/* X's visual info (pointer to private copy) */
    XVisualInfo *vishandle;	/* Only used in fakeglx.c */
-#endif
    GLint BitsPerPixel;		/* True bits per pixel for XImages */
 
    GLboolean ximage_flag;	/* Use XImage for back buffer (not pixmap)? */
@@ -233,7 +225,7 @@ struct xmesa_buffer {
 				/*    0 = not available			*/
 				/*    1 = XImage support available	*/
 				/*    2 = Pixmap support available too	*/
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
    XShmSegmentInfo shminfo;
 #endif
 
@@ -259,11 +251,7 @@ struct xmesa_buffer {
 
    /* Used to do XAllocColor/XFreeColors accounting: */
    int num_alloced;
-#if defined(XFree86Server)
-   Pixel alloced_colors[256];
-#else
    unsigned long alloced_colors[256];
-#endif
 
 #if defined( FX )
    /* For 3Dfx Glide only */
@@ -578,9 +566,7 @@ extern void xmesa_register_swrast_functions( struct gl_context *ctx );
 
 #define ENABLE_EXT_texure_compression_s3tc 0 /* SW texture compression */
 
-#ifdef XFree86Server
-#define ENABLE_EXT_timer_query 0
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#if   defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 #define ENABLE_EXT_timer_query 1 /* should have 64-bit GLuint64EXT */
 #else
 #define ENABLE_EXT_timer_query 0 /* may not have 64-bit GLuint64EXT */
diff --git a/src/mesa/main/APIspec.xml b/src/mesa/main/APIspec.xml
index 4dc0b0d4851..16d0c9413d0 100644
--- a/src/mesa/main/APIspec.xml
+++ b/src/mesa/main/APIspec.xml
@@ -3536,7 +3536,7 @@
 <api name="mesa" implementation="true">
 	<category name="MESA"/>
 
-	<function name="Color4f"  default_prefix="_vbo_" template="Color" gltype="GLfloat" vector_size="4" expand_vector="true"/>
+	<function name="Color4f"  default_prefix="_es_" template="Color" gltype="GLfloat" vector_size="4" expand_vector="true"/>
 	<function name="ClipPlane" template="ClipPlane" gltype="GLdouble"/>
 	<function name="CullFace" template="CullFace"/>
 
@@ -3554,8 +3554,8 @@
 
 	<function name="LineWidth" template="LineWidth" gltype="GLfloat"/>
 
-	<function name="Materialf" default_prefix="_vbo_" template="Material" gltype="GLfloat" expand_vector="true"/>
-	<function name="Materialfv" default_prefix="_vbo_" template="Material" gltype="GLfloat"/>
+	<function name="Materialf" default_prefix="_es_" template="Material" gltype="GLfloat" expand_vector="true"/>
+	<function name="Materialfv" default_prefix="_es_" template="Material" gltype="GLfloat"/>
 
 	<function name="PointSize" template="PointSize" gltype="GLfloat"/>
 	<function name="PointSizePointer" template="PointSizePointer"/>
@@ -3650,7 +3650,7 @@
 	<function name="EnableClientState" template="EnableClientState"/>
 
 	<function name="GetPointerv" template="GetPointer"/>
-	<function name="Normal3f" default_prefix="_vbo_" template="Normal" gltype="GLfloat" expand_vector="true"/>
+	<function name="Normal3f" default_prefix="_es_" template="Normal" gltype="GLfloat" expand_vector="true"/>
 	<function name="NormalPointer" template="NormalPointer"/>
 	<function name="TexCoordPointer" template="TexCoordPointer"/>
 	<function name="VertexPointer" template="VertexPointer"/>
@@ -3679,7 +3679,7 @@
 	<function name="ActiveTextureARB" template="ActiveTexture"/>
 	<function name="ClientActiveTextureARB" template="ClientActiveTexture"/>
 
-	<function name="MultiTexCoord4f" default_prefix="_vbo_" template="MultiTexCoord" gltype="GLfloat" vector_size="4" expand_vector="true"/>
+	<function name="MultiTexCoord4f" default_prefix="_es_" template="MultiTexCoord" gltype="GLfloat" vector_size="4" expand_vector="true"/>
 
 	<function name="SampleCoverageARB" template="SampleCoverage" gltype="GLclampf"/>
 
@@ -3691,14 +3691,14 @@
 	<function name="PointParameterf" template="PointParameter" gltype="GLfloat" expand_vector="true"/>
 	<function name="PointParameterfv" template="PointParameter" gltype="GLfloat"/>
 
-	<function name="VertexAttrib1f" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="1" expand_vector="true"/>
-	<function name="VertexAttrib2f" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="2" expand_vector="true"/>
-	<function name="VertexAttrib3f" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="3" expand_vector="true"/>
-	<function name="VertexAttrib4f" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="4" expand_vector="true"/>
-	<function name="VertexAttrib1fv" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="1"/>
-	<function name="VertexAttrib2fv" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="2"/>
-	<function name="VertexAttrib3fv" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="3"/>
-	<function name="VertexAttrib4fv" default_prefix="_vbo_" template="VertexAttrib" gltype="GLfloat" vector_size="4"/>
+	<function name="VertexAttrib1f" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="1" expand_vector="true"/>
+	<function name="VertexAttrib2f" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="2" expand_vector="true"/>
+	<function name="VertexAttrib3f" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="3" expand_vector="true"/>
+	<function name="VertexAttrib4f" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="4" expand_vector="true"/>
+	<function name="VertexAttrib1fv" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="1"/>
+	<function name="VertexAttrib2fv" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="2"/>
+	<function name="VertexAttrib3fv" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="3"/>
+	<function name="VertexAttrib4fv" default_prefix="_es_" template="VertexAttrib" gltype="GLfloat" vector_size="4"/>
 
 	<function name="VertexAttribPointerARB" template="VertexAttribPointer"/>
 	<function name="EnableVertexAttribArrayARB" template="EnableVertexAttribArray"/>
@@ -3850,7 +3850,7 @@
 
 	<category name="OES_matrix_palette"/>
 
-	<function name="Color4f" template="Color" gltype="GLfloat" vector_size="4" expand_vector="true"/>
+	<function name="Color4f" external="true" template="Color" gltype="GLfloat" vector_size="4" expand_vector="true"/>
 	<function name="Color4ub" template="Color" gltype="GLubyte" vector_size="4" expand_vector="true"/>
 	<function name="Color4x" template="Color" gltype="GLfixed" vector_size="4" expand_vector="true"/>
 
@@ -3880,8 +3880,8 @@
 	<function name="LineWidth" template="LineWidth" gltype="GLfloat"/>
 	<function name="LineWidthx" template="LineWidth" gltype="GLfixed"/>
 
-	<function name="Materialf" template="Material" gltype="GLfloat" expand_vector="true"/>
-	<function name="Materialfv" template="Material" gltype="GLfloat"/>
+	<function name="Materialf" external="true" template="Material" gltype="GLfloat" expand_vector="true"/>
+	<function name="Materialfv" external="true" template="Material" gltype="GLfloat"/>
 	<function name="Materialx" template="Material" gltype="GLfixed" expand_vector="true"/>
 	<function name="Materialxv" template="Material" gltype="GLfixed"/>
 
@@ -4012,7 +4012,7 @@
 
 	<function name="GetPointerv" template="GetPointer"/>
 
-	<function name="Normal3f" template="Normal" gltype="GLfloat" expand_vector="true"/>
+	<function name="Normal3f" external="true" template="Normal" gltype="GLfloat" expand_vector="true"/>
 	<function name="Normal3x" template="Normal" gltype="GLfixed" expand_vector="true"/>
 	<function name="NormalPointer" template="NormalPointer"/>
 	<function name="TexCoordPointer" template="TexCoordPointer"/>
@@ -4039,7 +4039,7 @@
 	<function name="ActiveTexture" template="ActiveTexture"/>
 	<function name="ClientActiveTexture" template="ClientActiveTexture"/>
 
-	<function name="MultiTexCoord4f" template="MultiTexCoord" gltype="GLfloat" vector_size="4" expand_vector="true"/>
+	<function name="MultiTexCoord4f" external="true" template="MultiTexCoord" gltype="GLfloat" vector_size="4" expand_vector="true"/>
 
 	<function name="SampleCoverage" template="SampleCoverage" gltype="GLclampf"/>
 	<function name="SampleCoveragex" template="SampleCoverage" gltype="GLclampx"/>
@@ -4227,14 +4227,14 @@
 
 	<function name="BlendFuncSeparate" template="BlendFuncSeparate"/>
 
-	<function name="VertexAttrib1f" template="VertexAttrib" gltype="GLfloat" vector_size="1" expand_vector="true"/>
-	<function name="VertexAttrib2f" template="VertexAttrib" gltype="GLfloat" vector_size="2" expand_vector="true"/>
-	<function name="VertexAttrib3f" template="VertexAttrib" gltype="GLfloat" vector_size="3" expand_vector="true"/>
-	<function name="VertexAttrib4f" template="VertexAttrib" gltype="GLfloat" vector_size="4" expand_vector="true"/>
-	<function name="VertexAttrib1fv" template="VertexAttrib" gltype="GLfloat" vector_size="1"/>
-	<function name="VertexAttrib2fv" template="VertexAttrib" gltype="GLfloat" vector_size="2"/>
-	<function name="VertexAttrib3fv" template="VertexAttrib" gltype="GLfloat" vector_size="3"/>
-	<function name="VertexAttrib4fv" template="VertexAttrib" gltype="GLfloat" vector_size="4"/>
+	<function name="VertexAttrib1f" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="1" expand_vector="true"/>
+	<function name="VertexAttrib2f" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="2" expand_vector="true"/>
+	<function name="VertexAttrib3f" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="3" expand_vector="true"/>
+	<function name="VertexAttrib4f" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="4" expand_vector="true"/>
+	<function name="VertexAttrib1fv" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="1"/>
+	<function name="VertexAttrib2fv" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="2"/>
+	<function name="VertexAttrib3fv" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="3"/>
+	<function name="VertexAttrib4fv" external="true" template="VertexAttrib" gltype="GLfloat" vector_size="4"/>
 
 	<function name="VertexAttribPointer" template="VertexAttribPointer"/>
 
diff --git a/src/mesa/main/compiler.h b/src/mesa/main/compiler.h
index 800eb839005..5557a3b5cb5 100644
--- a/src/mesa/main/compiler.h
+++ b/src/mesa/main/compiler.h
@@ -358,6 +358,10 @@ static INLINE GLuint CPU_TO_LE32(GLuint x)
 #define M_E (2.7182818284590452354)
 #endif
 
+#ifndef M_LOG2E
+#define M_LOG2E     (1.4426950408889634074)
+#endif
+
 #ifndef ONE_DIV_LN2
 #define ONE_DIV_LN2 (1.442695040888963456)
 #endif
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 0f2d1a8f8da..fffb1a7d2ec 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -97,17 +97,20 @@
 /** Max texture palette / color table size */
 #define MAX_COLOR_TABLE_SIZE 256
 
+/** Max memory to allow for a single texture image (in megabytes) */
+#define MAX_TEXTURE_MBYTES 1024
+
 /** Number of 1D/2D texture mipmap levels */
-#define MAX_TEXTURE_LEVELS 13
+#define MAX_TEXTURE_LEVELS 15
 
 /** Number of 3D texture mipmap levels */
-#define MAX_3D_TEXTURE_LEVELS 9
+#define MAX_3D_TEXTURE_LEVELS 15
 
 /** Number of cube texture mipmap levels - GL_ARB_texture_cube_map */
-#define MAX_CUBE_TEXTURE_LEVELS 13
+#define MAX_CUBE_TEXTURE_LEVELS 15
 
 /** Maximum rectangular texture size - GL_NV_texture_rectangle */
-#define MAX_TEXTURE_RECT_SIZE 4096
+#define MAX_TEXTURE_RECT_SIZE 16384
 
 /** Maximum number of layers in a 1D or 2D array texture - GL_MESA_texture_array */
 #define MAX_ARRAY_TEXTURE_LAYERS 64
@@ -140,11 +143,28 @@
  */
 
 #ifndef MAX_WIDTH
-#   define MAX_WIDTH 4096
+#   define MAX_WIDTH 16384
 #endif
 /** Maximum viewport/image height */
 #ifndef MAX_HEIGHT
-#   define MAX_HEIGHT 4096
+#   define MAX_HEIGHT 16384
+#endif
+
+/* XXX: hack to prevent stack overflow on windows until all temporary arrays
+ * [MAX_WIDTH] are allocated from the heap */
+#ifdef WIN32
+#undef MAX_TEXTURE_LEVELS
+#undef MAX_3D_TEXTURE_LEVELS
+#undef MAX_CUBE_TEXTURE_LEVELS
+#undef MAX_TEXTURE_RECT_SIZE
+#undef MAX_WIDTH
+#undef MAX_HEIGHT
+#define MAX_TEXTURE_LEVELS 13
+#define MAX_3D_TEXTURE_LEVELS 9
+#define MAX_CUBE_TEXTURE_LEVELS 13
+#define MAX_TEXTURE_RECT_SIZE 4096
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
 #endif
 
 /** Maxmimum size for CVA.  May be overridden by the drivers.  */
@@ -168,7 +188,7 @@
 #define MAX_TEXTURE_MAX_ANISOTROPY 16.0
 
 /** For GL_EXT_texture_lod_bias (typically MAX_TEXTURE_LEVELS - 1) */
-#define MAX_TEXTURE_LOD_BIAS 12.0
+#define MAX_TEXTURE_LOD_BIAS 14.0
 
 /** For any program target/extension */
 /*@{*/
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index b132030b9b1..f42a566c302 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -535,6 +535,7 @@ _mesa_init_constants(struct gl_context *ctx)
    assert(ctx);
 
    /* Constants, may be overriden (usually only reduced) by device drivers */
+   ctx->Const.MaxTextureMbytes = MAX_TEXTURE_MBYTES;
    ctx->Const.MaxTextureLevels = MAX_TEXTURE_LEVELS;
    ctx->Const.Max3DTextureLevels = MAX_3D_TEXTURE_LEVELS;
    ctx->Const.MaxCubeTextureLevels = MAX_CUBE_TEXTURE_LEVELS;
@@ -1399,6 +1400,8 @@ _mesa_make_current( struct gl_context *newCtx,
                     struct gl_framebuffer *drawBuffer,
                     struct gl_framebuffer *readBuffer )
 {
+   GET_CURRENT_CONTEXT(curCtx);
+
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(newCtx, "_mesa_make_current()\n");
 
@@ -1419,6 +1422,11 @@ _mesa_make_current( struct gl_context *newCtx,
       }
    }
 
+   if (curCtx && 
+      (curCtx->WinSysDrawBuffer || curCtx->WinSysReadBuffer) && /* make sure this context is valid for flushing */
+      curCtx != newCtx)
+      _mesa_flush(curCtx);
+
    /* We used to call _glapi_check_multithread() here.  Now do it in drivers */
    _glapi_set_context((void *) newCtx);
    ASSERT(_mesa_get_current_context() == newCtx);
@@ -1822,7 +1830,7 @@ _mesa_valid_to_render(struct gl_context *ctx, const char *where)
 #ifdef DEBUG
    if (ctx->Shader.Flags & GLSL_LOG) {
       struct gl_shader_program *shProg[MESA_SHADER_TYPES];
-      unsigned i;
+      gl_shader_type i;
 
       shProg[MESA_SHADER_VERTEX] = ctx->Shader.CurrentVertexProgram;
       shProg[MESA_SHADER_GEOMETRY] = ctx->Shader.CurrentGeometryProgram;
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 3d5830c01cc..b71afdd61f3 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -873,8 +873,12 @@ make_extension_string_es2(const struct gl_context *ctx, GLubyte *str)
    if (ctx->Extensions.ARB_vertex_buffer_object)
       len += append_extension(&str, "GL_OES_mapbuffer");
 
+#if 0
+   /* disabled because of missing GLSL support */
    if (ctx->Extensions.EXT_texture3D)
       len += append_extension(&str, "GL_OES_texture_3D");
+#endif
+
    if (ctx->Extensions.ARB_texture_non_power_of_two)
       len += append_extension(&str, "GL_OES_texture_npot");
    if (ctx->Extensions.EXT_texture_filter_anisotropic)
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 7c3357043fa..975063d0d78 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -2014,7 +2014,7 @@ _mesa_GetFramebufferAttachmentParameterivEXT(GLenum target, GLenum attachment,
 
    switch (pname) {
    case GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE_EXT:
-      *params = att->Type;
+      *params = buffer->Name == 0 ? GL_FRAMEBUFFER_DEFAULT : att->Type;
       return;
    case GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME_EXT:
       if (att->Type == GL_RENDERBUFFER_EXT) {
@@ -2024,8 +2024,8 @@ _mesa_GetFramebufferAttachmentParameterivEXT(GLenum target, GLenum attachment,
 	 *params = att->Texture->Name;
       }
       else {
-	 _mesa_error(ctx, GL_INVALID_ENUM,
-		     "glGetFramebufferAttachmentParameterivEXT(pname)");
+         assert(att->Type == GL_NONE);
+         *params = 0;
       }
       return;
    case GL_FRAMEBUFFER_ATTACHMENT_TEXTURE_LEVEL_EXT:
@@ -2146,6 +2146,7 @@ _mesa_GenerateMipmapEXT(GLenum target)
       /* OK, legal value */
       break;
    default:
+      /* XXX need to implement GL_TEXTURE_1D_ARRAY and GL_TEXTURE_2D_ARRAY */
       _mesa_error(ctx, GL_INVALID_ENUM, "glGenerateMipmapEXT(target)");
       return;
    }
@@ -2157,6 +2158,13 @@ _mesa_GenerateMipmapEXT(GLenum target)
       return;
    }
 
+   if (texObj->Target == GL_TEXTURE_CUBE_MAP &&
+       !_mesa_cube_complete(texObj)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGenerateMipmap(incomplete cube map)");
+      return;
+   }
+
    _mesa_lock_texture(ctx, texObj);
    if (target == GL_TEXTURE_CUBE_MAP) {
       GLuint face;
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 88a04e888e4..cd9eb81852f 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -1057,11 +1057,12 @@ _mesa_format_image_size(gl_format format, GLsizei width,
    const struct gl_format_info *info = _mesa_get_format_info(format);
    /* Strictly speaking, a conditional isn't needed here */
    if (info->BlockWidth > 1 || info->BlockHeight > 1) {
-      /* compressed format */
+      /* compressed format (2D only for now) */
       const GLuint bw = info->BlockWidth, bh = info->BlockHeight;
       const GLuint wblocks = (width + bw - 1) / bw;
       const GLuint hblocks = (height + bh - 1) / bh;
       const GLuint sz = wblocks * hblocks * info->BytesPerBlock;
+      assert(depth == 1);
       return sz;
    }
    else {
@@ -1072,6 +1073,36 @@ _mesa_format_image_size(gl_format format, GLsizei width,
 }
 
 
+/**
+ * Same as _mesa_format_image_size() but returns a 64-bit value to
+ * accomodate very large textures.
+ */
+uint64_t
+_mesa_format_image_size64(gl_format format, GLsizei width,
+                          GLsizei height, GLsizei depth)
+{
+   const struct gl_format_info *info = _mesa_get_format_info(format);
+   /* Strictly speaking, a conditional isn't needed here */
+   if (info->BlockWidth > 1 || info->BlockHeight > 1) {
+      /* compressed format (2D only for now) */
+      const uint64_t bw = info->BlockWidth, bh = info->BlockHeight;
+      const uint64_t wblocks = (width + bw - 1) / bw;
+      const uint64_t hblocks = (height + bh - 1) / bh;
+      const uint64_t sz = wblocks * hblocks * info->BytesPerBlock;
+      assert(depth == 1);
+      return sz;
+   }
+   else {
+      /* non-compressed */
+      const uint64_t sz = ((uint64_t) width *
+                           (uint64_t) height *
+                           (uint64_t) depth *
+                           info->BytesPerBlock);
+      return sz;
+   }
+}
+
+
 
 GLint
 _mesa_format_row_stride(gl_format format, GLsizei width)
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index eeb460dabe7..997229bf9f4 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -209,6 +209,10 @@ extern GLuint
 _mesa_format_image_size(gl_format format, GLsizei width,
                         GLsizei height, GLsizei depth);
 
+extern uint64_t
+_mesa_format_image_size64(gl_format format, GLsizei width,
+                          GLsizei height, GLsizei depth);
+
 extern GLint
 _mesa_format_row_stride(gl_format format, GLsizei width);
 
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index b54af6ee86b..5ae35b868e3 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -314,6 +314,7 @@ EXTRA_EXT2(ARB_vertex_program, NV_vertex_program);
 EXTRA_EXT2(ARB_vertex_program, ARB_fragment_program);
 EXTRA_EXT(ARB_vertex_buffer_object);
 EXTRA_EXT(ARB_geometry_shader4);
+EXTRA_EXT(ARB_copy_buffer);
 
 static const int
 extra_ARB_vertex_program_ARB_fragment_program_NV_vertex_program[] = {
@@ -469,6 +470,10 @@ static const struct value_desc values[] = {
    { GL_ELEMENT_ARRAY_BUFFER_BINDING_ARB, LOC_CUSTOM, TYPE_INT, 0,
      extra_ARB_vertex_buffer_object },
 
+   /* GL_ARB_copy_buffer */
+   { GL_COPY_READ_BUFFER, LOC_CUSTOM, TYPE_INT, 0, extra_ARB_copy_buffer },
+   { GL_COPY_WRITE_BUFFER, LOC_CUSTOM, TYPE_INT, 0, extra_ARB_copy_buffer },
+
    /* GL_OES_read_format */
    { GL_IMPLEMENTATION_COLOR_READ_TYPE_OES, LOC_CUSTOM, TYPE_INT, 0,
      extra_new_buffers_OES_read_format },
@@ -700,12 +705,9 @@ static const struct value_desc values[] = {
 #if FEATURE_ES2
    /* Enums unique to OpenGL ES 2.0 */
    { 0, 0, TYPE_API_MASK, API_OPENGLES2_BIT, NO_EXTRA },
-   { GL_MAX_FRAGMENT_UNIFORM_VECTORS, LOC_CUSTOM, TYPE_INT,
-     offsetof(struct gl_context, Const.FragmentProgram.MaxUniformComponents), NO_EXTRA },
-   { GL_MAX_VARYING_VECTORS, LOC_CUSTOM, TYPE_INT,
-     offsetof(struct gl_context, Const.MaxVarying), NO_EXTRA },
-   { GL_MAX_VERTEX_UNIFORM_VECTORS, LOC_CUSTOM, TYPE_INT,
-     offsetof(struct gl_context, Const.VertexProgram.MaxUniformComponents), NO_EXTRA },
+   { GL_MAX_FRAGMENT_UNIFORM_VECTORS, LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA },
+   { GL_MAX_VARYING_VECTORS, CONTEXT_INT(Const.MaxVarying), NO_EXTRA },
+   { GL_MAX_VERTEX_UNIFORM_VECTORS, LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA },
    { GL_SHADER_COMPILER, CONST(1), NO_EXTRA },
    /* OES_get_program_binary */
    { GL_NUM_SHADER_BINARY_FORMATS, CONST(0), NO_EXTRA },
@@ -1112,6 +1114,14 @@ static const struct value_desc values[] = {
      extra_valid_draw_buffer },
    { GL_DRAW_BUFFER3_ARB, BUFFER_ENUM(ColorDrawBuffer[3]),
      extra_valid_draw_buffer },
+   { GL_DRAW_BUFFER4_ARB, BUFFER_ENUM(ColorDrawBuffer[4]),
+     extra_valid_draw_buffer },
+   { GL_DRAW_BUFFER5_ARB, BUFFER_ENUM(ColorDrawBuffer[5]),
+     extra_valid_draw_buffer },
+   { GL_DRAW_BUFFER6_ARB, BUFFER_ENUM(ColorDrawBuffer[6]),
+     extra_valid_draw_buffer },
+   { GL_DRAW_BUFFER7_ARB, BUFFER_ENUM(ColorDrawBuffer[7]),
+     extra_valid_draw_buffer },
 
    /* GL_ATI_fragment_shader */
    { GL_NUM_FRAGMENT_REGISTERS_ATI, CONST(6), extra_ATI_fragment_shader },
@@ -1564,6 +1574,14 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
       v->value_int = ctx->Array.ElementArrayBufferObj->Name;
       break;
 
+   /* ARB_copy_buffer */
+   case GL_COPY_READ_BUFFER:
+      v->value_int = ctx->CopyReadBuffer->Name;
+      break;
+   case GL_COPY_WRITE_BUFFER:
+      v->value_int = ctx->CopyWriteBuffer->Name;
+      break;
+
    case GL_FRAGMENT_PROGRAM_BINDING_NV:
       v->value_int = 
 	 ctx->FragmentProgram.Current ? ctx->FragmentProgram.Current->Base.Id : 0;
@@ -1604,6 +1622,14 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
    case GL_POINT_SIZE_ARRAY_BUFFER_BINDING_OES:
       v->value_int = ctx->Array.ArrayObj->PointSize.BufferObj->Name;
       break;
+
+   case GL_MAX_VERTEX_UNIFORM_VECTORS:
+      v->value_int = ctx->Const.VertexProgram.MaxUniformComponents / 4;
+      break;
+
+   case GL_MAX_FRAGMENT_UNIFORM_VECTORS:
+      v->value_int = ctx->Const.FragmentProgram.MaxUniformComponents / 4;
+      break;
    }   
 }
 
@@ -1733,16 +1759,18 @@ find_value(const char *func, GLenum pname, void **p, union value *v)
    hash = (pname * prime_factor);
    while (1) {
       d = &values[table[hash & mask]];
-      if (likely(d->pname == pname))
-	 break;
 
       /* If the enum isn't valid, the hash walk ends with index 0,
        * which is the API mask entry at the beginning of values[]. */
-      if (d->type == TYPE_API_MASK) {
+      if (unlikely(d->type == TYPE_API_MASK)) {
 	 _mesa_error(ctx, GL_INVALID_ENUM, "%s(pname=%s)", func,
                      _mesa_lookup_enum_by_nr(pname));
 	 return &error_value;
       }
+
+      if (likely(d->pname == pname))
+	 break;
+
       hash += prime_step;
    }
 
diff --git a/src/mesa/main/image.c b/src/mesa/main/image.c
index df1527b47f1..f9f2ed73077 100644
--- a/src/mesa/main/image.c
+++ b/src/mesa/main/image.c
@@ -154,6 +154,8 @@ _mesa_sizeof_type( GLenum type )
 	 return sizeof(GLdouble);
       case GL_HALF_FLOAT_ARB:
 	 return sizeof(GLhalfARB);
+      case GL_FIXED:
+	 return sizeof(GLfixed);
       default:
          return -1;
    }
diff --git a/src/mesa/main/imports.c b/src/mesa/main/imports.c
index bcca4edc1aa..cefbf4d8c98 100644
--- a/src/mesa/main/imports.c
+++ b/src/mesa/main/imports.c
@@ -88,7 +88,8 @@ _mesa_align_malloc(size_t bytes, unsigned long alignment)
 #if defined(HAVE_POSIX_MEMALIGN)
    void *mem;
    int err = posix_memalign(& mem, alignment, bytes);
-   (void) err;
+   if (err)
+      return NULL;
    return mem;
 #elif defined(_WIN32) && defined(_MSC_VER)
    return _aligned_malloc(bytes, alignment);
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 87b96489dbf..1c549a8e247 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -41,14 +41,6 @@
 #include "math/m_matrix.h"	/* GLmatrix */
 #include "main/simple_list.h"	/* struct simple_node */
 
-/* Shader stages. Note that these will become 5 with tessellation.
- * These MUST have the same values as PIPE_SHADER_*
- */
-#define MESA_SHADER_VERTEX   0
-#define MESA_SHADER_FRAGMENT 1
-#define MESA_SHADER_GEOMETRY 2
-#define MESA_SHADER_TYPES    3
-
 
 /**
  * Color channel data type.
@@ -130,6 +122,20 @@ struct st_context;
 
 
 /**
+ * Shader stages. Note that these will become 5 with tessellation.
+ * These MUST have the same values as gallium's PIPE_SHADER_*
+ */
+typedef enum
+{
+   MESA_SHADER_VERTEX = 0,
+   MESA_SHADER_FRAGMENT = 1,
+   MESA_SHADER_GEOMETRY = 2,
+   MESA_SHADER_TYPES = 3
+} gl_shader_type;
+
+
+
+/**
  * Indexes for vertex program attributes.
  * GL_NV_vertex_program aliases generic attributes over the conventional
  * attributes.  In GL_ARB_vertex_program shader the aliasing is optional.
@@ -2191,6 +2197,7 @@ struct gl_shader_compiler_options
    GLboolean EmitNoCont;                  /**< Emit CONT opcode? */
    GLboolean EmitNoMainReturn;            /**< Emit CONT/RET opcodes? */
    GLboolean EmitNoNoise;                 /**< Emit NOISE opcodes? */
+   GLboolean EmitNoPow;                   /**< Emit POW opcodes? */
 
    /**
     * \name Forms of indirect addressing the driver cannot do.
@@ -2560,6 +2567,7 @@ struct gl_program_constants
  */
 struct gl_constants
 {
+   GLint MaxTextureMbytes;      /**< Max memory per image, in MB */
    GLint MaxTextureLevels;      /**< Max mipmap levels. */ 
    GLint Max3DTextureLevels;    /**< Max mipmap levels for 3D textures */
    GLint MaxCubeTextureLevels;  /**< Max mipmap levels for cube textures */
diff --git a/src/mesa/main/pack.c b/src/mesa/main/pack.c
index fdb647c7ea8..6d524e64908 100644
--- a/src/mesa/main/pack.c
+++ b/src/mesa/main/pack.c
@@ -484,11 +484,25 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4],
                            const struct gl_pixelstore_attrib *dstPacking,
                            GLbitfield transferOps)
 {
-   GLfloat luminance[MAX_WIDTH];
+   GLfloat *luminance;
    const GLint comps = _mesa_components_in_format(dstFormat);
    const GLboolean intDstFormat = _mesa_is_integer_format(dstFormat);
    GLuint i;
 
+   if (dstFormat == GL_LUMINANCE ||
+       dstFormat == GL_LUMINANCE_ALPHA ||
+       dstFormat == GL_LUMINANCE_INTEGER_EXT ||
+       dstFormat == GL_LUMINANCE_ALPHA_INTEGER_EXT) {
+      luminance = (GLfloat *) malloc(n * sizeof(GLfloat));
+      if (!luminance) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel packing");
+         return;
+      }
+   }
+   else {
+      luminance = NULL;
+   }
+
    /* XXX
     * This test should probably go away.  Have the caller set/clear the
     * IMAGE_CLAMP_BIT as needed.
@@ -1907,6 +1921,8 @@ _mesa_pack_rgba_span_float(struct gl_context *ctx, GLuint n, GLfloat rgba[][4],
          }
       }
    }
+
+   free(luminance);
 }
 
 
@@ -3462,7 +3478,12 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
    {
       GLint dstComponents;
       GLint rDst, gDst, bDst, aDst, lDst, iDst;
-      GLfloat rgba[MAX_WIDTH][4];
+      GLfloat (*rgba)[4] = (GLfloat (*)[4]) malloc(4 * n * sizeof(GLfloat));
+
+      if (!rgba) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+         return;
+      }
 
       dstComponents = _mesa_components_in_format( dstFormat );
       /* source & dest image formats should have been error checked by now */
@@ -3471,9 +3492,14 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
       /*
        * Extract image data and convert to RGBA floats
        */
-      assert(n <= MAX_WIDTH);
       if (srcFormat == GL_COLOR_INDEX) {
-         GLuint indexes[MAX_WIDTH];
+         GLuint *indexes = (GLuint *) malloc(n * sizeof(GLuint));
+
+         if (!indexes) {
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+            return;
+         }
+
          extract_uint_indexes(n, indexes, srcFormat, srcType, source,
                               srcPacking);
 
@@ -3484,6 +3510,8 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
             for (i = 0; i < n; i++) {
                dest[i] = (GLchan) (indexes[i] & 0xff);
             }
+            free(indexes);
+            free(rgba);
             return;
          }
          else {
@@ -3498,6 +3526,8 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
           * with color indexes.
           */
          transferOps &= ~(IMAGE_SCALE_BIAS_BIT | IMAGE_MAP_COLOR_BIT);
+
+         free(indexes);
       }
       else {
          /* non-color index data */
@@ -3575,6 +3605,8 @@ _mesa_unpack_color_span_chan( struct gl_context *ctx,
             dst += dstComponents;
          }
       }
+
+      free(rgba);
    }
 }
 
@@ -3652,7 +3684,12 @@ _mesa_unpack_color_span_float( struct gl_context *ctx,
    {
       GLint dstComponents;
       GLint rDst, gDst, bDst, aDst, lDst, iDst;
-      GLfloat rgba[MAX_WIDTH][4];
+      GLfloat (*rgba)[4] = (GLfloat (*)[4]) malloc(4 * n * sizeof(GLfloat));
+
+      if (!rgba) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+         return;
+      }
 
       dstComponents = _mesa_components_in_format( dstFormat );
       /* source & dest image formats should have been error checked by now */
@@ -3661,9 +3698,15 @@ _mesa_unpack_color_span_float( struct gl_context *ctx,
       /*
        * Extract image data and convert to RGBA floats
        */
-      assert(n <= MAX_WIDTH);
       if (srcFormat == GL_COLOR_INDEX) {
-         GLuint indexes[MAX_WIDTH];
+         GLuint *indexes = (GLuint *) malloc(n * sizeof(GLuint));
+
+         if (!indexes) {
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+            free(rgba);
+            return;
+         }
+
          extract_uint_indexes(n, indexes, srcFormat, srcType, source,
                               srcPacking);
 
@@ -3674,6 +3717,8 @@ _mesa_unpack_color_span_float( struct gl_context *ctx,
             for (i = 0; i < n; i++) {
                dest[i] = (GLchan) (indexes[i] & 0xff);
             }
+            free(indexes);
+            free(rgba);
             return;
          }
          else {
@@ -3688,6 +3733,8 @@ _mesa_unpack_color_span_float( struct gl_context *ctx,
           * with color indexes.
           */
          transferOps &= ~(IMAGE_SCALE_BIAS_BIT | IMAGE_MAP_COLOR_BIT);
+
+         free(indexes);
       }
       else {
          /* non-color index data */
@@ -3760,6 +3807,8 @@ _mesa_unpack_color_span_float( struct gl_context *ctx,
             dst += dstComponents;
          }
       }
+
+      free(rgba);
    }
 }
 
@@ -3776,9 +3825,12 @@ _mesa_unpack_color_span_uint(struct gl_context *ctx,
                              const GLvoid *source,
                              const struct gl_pixelstore_attrib *srcPacking)
 {
-   GLuint rgba[MAX_WIDTH][4];
+   GLuint (*rgba)[4] = (GLuint (*)[4]) malloc(n * 4 * sizeof(GLfloat));
 
-   ASSERT(n <= MAX_WIDTH);
+   if (!rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+      return;
+   }
 
    ASSERT(dstFormat == GL_ALPHA ||
           dstFormat == GL_LUMINANCE ||
@@ -3912,6 +3964,8 @@ _mesa_unpack_color_span_uint(struct gl_context *ctx,
          }
       }
    }
+
+   free(rgba);
 }
 
 
@@ -3943,9 +3997,14 @@ _mesa_unpack_dudv_span_byte( struct gl_context *ctx,
    /* general solution */
    {
       GLint dstComponents;
-      GLfloat rgba[MAX_WIDTH][4];
       GLbyte *dst = dest;
       GLuint i;
+      GLfloat (*rgba)[4] = (GLfloat (*)[4]) malloc(4 * n * sizeof(GLfloat));
+
+      if (!rgba) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+         return;
+      }
 
       dstComponents = _mesa_components_in_format( dstFormat );
       /* source & dest image formats should have been error checked by now */
@@ -3954,7 +4013,6 @@ _mesa_unpack_dudv_span_byte( struct gl_context *ctx,
       /*
        * Extract image data and convert to RGBA floats
        */
-      assert(n <= MAX_WIDTH);
       extract_float_rgba(n, rgba, srcFormat, srcType, source,
                          srcPacking->SwapBytes);
 
@@ -3970,6 +4028,8 @@ _mesa_unpack_dudv_span_byte( struct gl_context *ctx,
          dst[1] = FLOAT_TO_BYTE(rgba[i][GCOMP]);
          dst += dstComponents;
       }
+
+      free(rgba);
    }
 }
 
@@ -3988,7 +4048,7 @@ _mesa_unpack_dudv_span_byte( struct gl_context *ctx,
  *        transferOps - the pixel transfer operations to apply
  */
 void
-_mesa_unpack_index_span( const struct gl_context *ctx, GLuint n,
+_mesa_unpack_index_span( struct gl_context *ctx, GLuint n,
                          GLenum dstType, GLvoid *dest,
                          GLenum srcType, const GLvoid *source,
                          const struct gl_pixelstore_attrib *srcPacking,
@@ -4026,8 +4086,12 @@ _mesa_unpack_index_span( const struct gl_context *ctx, GLuint n,
       /*
        * general solution
        */
-      GLuint indexes[MAX_WIDTH];
-      assert(n <= MAX_WIDTH);
+      GLuint *indexes = (GLuint *) malloc(n * sizeof(GLuint));
+
+      if (!indexes) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+         return;
+      }
 
       extract_uint_indexes(n, indexes, GL_COLOR_INDEX, srcType, source,
                            srcPacking);
@@ -4061,19 +4125,24 @@ _mesa_unpack_index_span( const struct gl_context *ctx, GLuint n,
          default:
             _mesa_problem(ctx, "bad dstType in _mesa_unpack_index_span");
       }
+
+      free(indexes);
    }
 }
 
 
 void
-_mesa_pack_index_span( const struct gl_context *ctx, GLuint n,
+_mesa_pack_index_span( struct gl_context *ctx, GLuint n,
                        GLenum dstType, GLvoid *dest, const GLuint *source,
                        const struct gl_pixelstore_attrib *dstPacking,
                        GLbitfield transferOps )
 {
-   GLuint indexes[MAX_WIDTH];
+   GLuint *indexes = (GLuint *) malloc(n * sizeof(GLuint));
 
-   ASSERT(n <= MAX_WIDTH);
+   if (!indexes) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel packing");
+      return;
+   }
 
    transferOps &= (IMAGE_MAP_COLOR_BIT | IMAGE_SHIFT_OFFSET_BIT);
 
@@ -4178,6 +4247,8 @@ _mesa_pack_index_span( const struct gl_context *ctx, GLuint n,
    default:
       _mesa_problem(ctx, "bad type in _mesa_pack_index_span");
    }
+
+   free(indexes);
 }
 
 
@@ -4196,7 +4267,7 @@ _mesa_pack_index_span( const struct gl_context *ctx, GLuint n,
  *        transferOps - apply offset/bias/lookup ops?
  */
 void
-_mesa_unpack_stencil_span( const struct gl_context *ctx, GLuint n,
+_mesa_unpack_stencil_span( struct gl_context *ctx, GLuint n,
                            GLenum dstType, GLvoid *dest,
                            GLenum srcType, const GLvoid *source,
                            const struct gl_pixelstore_attrib *srcPacking,
@@ -4240,8 +4311,12 @@ _mesa_unpack_stencil_span( const struct gl_context *ctx, GLuint n,
       /*
        * general solution
        */
-      GLuint indexes[MAX_WIDTH];
-      assert(n <= MAX_WIDTH);
+      GLuint *indexes = (GLuint *) malloc(n * sizeof(GLuint));
+
+      if (!indexes) {
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "stencil unpacking");
+         return;
+      }
 
       extract_uint_indexes(n, indexes, GL_STENCIL_INDEX, srcType, source,
                            srcPacking);
@@ -4286,18 +4361,23 @@ _mesa_unpack_stencil_span( const struct gl_context *ctx, GLuint n,
          default:
             _mesa_problem(ctx, "bad dstType in _mesa_unpack_stencil_span");
       }
+
+      free(indexes);
    }
 }
 
 
 void
-_mesa_pack_stencil_span( const struct gl_context *ctx, GLuint n,
+_mesa_pack_stencil_span( struct gl_context *ctx, GLuint n,
                          GLenum dstType, GLvoid *dest, const GLstencil *source,
                          const struct gl_pixelstore_attrib *dstPacking )
 {
-   GLstencil stencil[MAX_WIDTH];
+   GLstencil *stencil = (GLstencil *) malloc(n * sizeof(GLstencil));
 
-   ASSERT(n <= MAX_WIDTH);
+   if (!stencil) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "stencil packing");
+      return;
+   }
 
    if (ctx->Pixel.IndexShift || ctx->Pixel.IndexOffset ||
        ctx->Pixel.MapStencilFlag) {
@@ -4436,6 +4516,8 @@ _mesa_pack_stencil_span( const struct gl_context *ctx, GLuint n,
    default:
       _mesa_problem(ctx, "bad type in _mesa_pack_index_span");
    }
+
+   free(stencil);
 }
 
 #define DEPTH_VALUES(GLTYPE, GLTYPE2FLOAT)                              \
@@ -4466,12 +4548,12 @@ _mesa_pack_stencil_span( const struct gl_context *ctx, GLuint n,
  *                  (ignored for GLfloat).
  */
 void
-_mesa_unpack_depth_span( const struct gl_context *ctx, GLuint n,
+_mesa_unpack_depth_span( struct gl_context *ctx, GLuint n,
                          GLenum dstType, GLvoid *dest, GLuint depthMax,
                          GLenum srcType, const GLvoid *source,
                          const struct gl_pixelstore_attrib *srcPacking )
 {
-   GLfloat depthTemp[MAX_WIDTH], *depthValues;
+   GLfloat *depthTemp, *depthValues;
    GLboolean needClamp = GL_FALSE;
 
    /* Look for special cases first.
@@ -4517,6 +4599,12 @@ _mesa_unpack_depth_span( const struct gl_context *ctx, GLuint n,
 
    /* general case path follows */
 
+   depthTemp = (GLfloat *) malloc(n * sizeof(GLfloat));
+   if (!depthTemp) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel unpacking");
+      return;
+   }
+
    if (dstType == GL_FLOAT) {
       depthValues = (GLfloat *) dest;
    }
@@ -4599,6 +4687,7 @@ _mesa_unpack_depth_span( const struct gl_context *ctx, GLuint n,
          break;
       default:
          _mesa_problem(NULL, "bad type in _mesa_unpack_depth_span()");
+         free(depthTemp);
          return;
    }
 
@@ -4658,6 +4747,8 @@ _mesa_unpack_depth_span( const struct gl_context *ctx, GLuint n,
       ASSERT(dstType == GL_FLOAT);
       /*ASSERT(depthMax == 1.0F);*/
    }
+
+   free(depthTemp);
 }
 
 
@@ -4665,13 +4756,15 @@ _mesa_unpack_depth_span( const struct gl_context *ctx, GLuint n,
  * Pack an array of depth values.  The values are floats in [0,1].
  */
 void
-_mesa_pack_depth_span( const struct gl_context *ctx, GLuint n, GLvoid *dest,
+_mesa_pack_depth_span( struct gl_context *ctx, GLuint n, GLvoid *dest,
                        GLenum dstType, const GLfloat *depthSpan,
                        const struct gl_pixelstore_attrib *dstPacking )
 {
-   GLfloat depthCopy[MAX_WIDTH];
-
-   ASSERT(n <= MAX_WIDTH);
+   GLfloat *depthCopy = (GLfloat *) malloc(n * sizeof(GLfloat));
+   if (!depthCopy) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel packing");
+      return;
+   }
 
    if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) {
       memcpy(depthCopy, depthSpan, n * sizeof(GLfloat));
@@ -4773,6 +4866,8 @@ _mesa_pack_depth_span( const struct gl_context *ctx, GLuint n, GLvoid *dest,
    default:
       _mesa_problem(ctx, "bad type in _mesa_pack_depth_span");
    }
+
+   free(depthCopy);
 }
 
 
@@ -4781,16 +4876,21 @@ _mesa_pack_depth_span( const struct gl_context *ctx, GLuint n, GLvoid *dest,
  * Pack depth and stencil values as GL_DEPTH_STENCIL/GL_UNSIGNED_INT_24_8.
  */
 void
-_mesa_pack_depth_stencil_span(const struct gl_context *ctx, GLuint n, GLuint *dest,
+_mesa_pack_depth_stencil_span(struct gl_context *ctx, GLuint n, GLuint *dest,
                               const GLfloat *depthVals,
                               const GLstencil *stencilVals,
                               const struct gl_pixelstore_attrib *dstPacking)
 {
-   GLfloat depthCopy[MAX_WIDTH];
-   GLstencil stencilCopy[MAX_WIDTH];
+   GLfloat *depthCopy = (GLfloat *) malloc(n * sizeof(GLfloat));
+   GLstencil *stencilCopy = (GLstencil *) malloc(n * sizeof(GLstencil));
    GLuint i;
 
-   ASSERT(n <= MAX_WIDTH);
+   if (!depthCopy || !stencilCopy) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "pixel packing");
+      free(depthCopy);
+      free(stencilCopy);
+      return;
+   }
 
    if (ctx->Pixel.DepthScale != 1.0 || ctx->Pixel.DepthBias != 0.0) {
       memcpy(depthCopy, depthVals, n * sizeof(GLfloat));
@@ -4814,6 +4914,9 @@ _mesa_pack_depth_stencil_span(const struct gl_context *ctx, GLuint n, GLuint *de
    if (dstPacking->SwapBytes) {
       _mesa_swap4(dest, n);
    }
+
+   free(depthCopy);
+   free(stencilCopy);
 }
 
 
diff --git a/src/mesa/main/pack.h b/src/mesa/main/pack.h
index 65d3f7a0fb2..78238ea5839 100644
--- a/src/mesa/main/pack.h
+++ b/src/mesa/main/pack.h
@@ -90,7 +90,7 @@ _mesa_unpack_dudv_span_byte(struct gl_context *ctx,
                             GLbitfield transferOps);
 
 extern void
-_mesa_unpack_index_span(const struct gl_context *ctx, GLuint n,
+_mesa_unpack_index_span(struct gl_context *ctx, GLuint n,
                         GLenum dstType, GLvoid *dest,
                         GLenum srcType, const GLvoid *source,
                         const struct gl_pixelstore_attrib *srcPacking,
@@ -98,39 +98,39 @@ _mesa_unpack_index_span(const struct gl_context *ctx, GLuint n,
 
 
 extern void
-_mesa_pack_index_span(const struct gl_context *ctx, GLuint n,
+_mesa_pack_index_span(struct gl_context *ctx, GLuint n,
                       GLenum dstType, GLvoid *dest, const GLuint *source,
                       const struct gl_pixelstore_attrib *dstPacking,
                       GLbitfield transferOps);
 
 
 extern void
-_mesa_unpack_stencil_span(const struct gl_context *ctx, GLuint n,
+_mesa_unpack_stencil_span(struct gl_context *ctx, GLuint n,
                           GLenum dstType, GLvoid *dest,
                           GLenum srcType, const GLvoid *source,
                           const struct gl_pixelstore_attrib *srcPacking,
                           GLbitfield transferOps);
 
 extern void
-_mesa_pack_stencil_span(const struct gl_context *ctx, GLuint n,
+_mesa_pack_stencil_span(struct gl_context *ctx, GLuint n,
                         GLenum dstType, GLvoid *dest, const GLstencil *source,
                         const struct gl_pixelstore_attrib *dstPacking);
 
 
 extern void
-_mesa_unpack_depth_span(const struct gl_context *ctx, GLuint n,
+_mesa_unpack_depth_span(struct gl_context *ctx, GLuint n,
                         GLenum dstType, GLvoid *dest, GLuint depthMax,
                         GLenum srcType, const GLvoid *source,
                         const struct gl_pixelstore_attrib *srcPacking);
 
 extern void
-_mesa_pack_depth_span(const struct gl_context *ctx, GLuint n, GLvoid *dest,
+_mesa_pack_depth_span(struct gl_context *ctx, GLuint n, GLvoid *dest,
                       GLenum dstType, const GLfloat *depthSpan,
                       const struct gl_pixelstore_attrib *dstPacking);
 
 
 extern void
-_mesa_pack_depth_stencil_span(const struct gl_context *ctx,
+_mesa_pack_depth_stencil_span(struct gl_context *ctx,
                               GLuint n, GLuint *dest,
                               const GLfloat *depthVals,
                               const GLstencil *stencilVals,
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 030236e7350..96df58d35c2 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -95,7 +95,7 @@ _mesa_init_shader_state(struct gl_context *ctx)
     * are generated by the GLSL compiler.
     */
    struct gl_shader_compiler_options options;
-   GLuint i;
+   gl_shader_type sh;
 
    memset(&options, 0, sizeof(options));
    options.MaxUnrollIterations = 32;
@@ -103,8 +103,8 @@ _mesa_init_shader_state(struct gl_context *ctx)
    /* Default pragma settings */
    options.DefaultPragmas.Optimize = GL_TRUE;
 
-   for(i = 0; i < MESA_SHADER_TYPES; ++i)
-      memcpy(&ctx->ShaderCompilerOptions[i], &options, sizeof(options));
+   for (sh = 0; sh < MESA_SHADER_TYPES; ++sh)
+      memcpy(&ctx->ShaderCompilerOptions[sh], &options, sizeof(options));
 
    ctx->Shader.Flags = get_shader_flags();
 }
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index b6594cbe6f0..216bbce0032 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -291,6 +291,7 @@ _mesa_free_shader_program_data(struct gl_context *ctx,
                                struct gl_shader_program *shProg)
 {
    GLuint i;
+   gl_shader_type sh;
 
    assert(shProg->Type == GL_SHADER_PROGRAM_MESA);
 
@@ -326,10 +327,10 @@ _mesa_free_shader_program_data(struct gl_context *ctx,
    shProg->TransformFeedback.NumVarying = 0;
 
 
-   for (i = 0; i < MESA_SHADER_TYPES; i++) {
-      if (shProg->_LinkedShaders[i] != NULL) {
-	 ctx->Driver.DeleteShader(ctx, shProg->_LinkedShaders[i]);
-	 shProg->_LinkedShaders[i] = NULL;
+   for (sh = 0; sh < MESA_SHADER_TYPES; sh++) {
+      if (shProg->_LinkedShaders[sh] != NULL) {
+	 ctx->Driver.DeleteShader(ctx, shProg->_LinkedShaders[sh]);
+	 shProg->_LinkedShaders[sh] = NULL;
       }
    }
 }
diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h
index 346a5b75175..de7c998cf0e 100644
--- a/src/mesa/main/shaderobj.h
+++ b/src/mesa/main/shaderobj.h
@@ -98,7 +98,7 @@ extern void
 _mesa_free_shader_state(struct gl_context *ctx);
 
 
-static INLINE GLuint
+static INLINE gl_shader_type
 _mesa_shader_type_to_index(GLenum v)
 {
    switch (v) {
@@ -110,7 +110,7 @@ _mesa_shader_type_to_index(GLenum v)
       return MESA_SHADER_GEOMETRY;
    default:
       ASSERT(0 && "bad value in _mesa_shader_type_to_index()");
-      return ~0;
+      return MESA_SHADER_TYPES;
    }
 }
 
diff --git a/src/mesa/main/syncobj.h b/src/mesa/main/syncobj.h
index f3c0046cf3d..51de9bf4809 100644
--- a/src/mesa/main/syncobj.h
+++ b/src/mesa/main/syncobj.h
@@ -31,9 +31,13 @@
 #ifndef SYNCOBJ_H
 #define SYNCOBJ_H
 
-#include "main/mtypes.h"
+#include "glheader.h"
+#include "mfeatures.h"
 
+struct _glapi_table;
 struct dd_function_table;
+struct gl_context;
+struct gl_sync_object;
 
 #if FEATURE_ARB_sync
 
diff --git a/src/mesa/main/texcompress.h b/src/mesa/main/texcompress.h
index 83856429c54..19b08bbadf6 100644
--- a/src/mesa/main/texcompress.h
+++ b/src/mesa/main/texcompress.h
@@ -25,8 +25,11 @@
 #ifndef TEXCOMPRESS_H
 #define TEXCOMPRESS_H
 
-#include "mtypes.h"
 #include "formats.h"
+#include "glheader.h"
+#include "mfeatures.h"
+
+struct gl_context;
 
 #if _HAVE_FULL_GL
 
diff --git a/src/mesa/main/texcompress_s3tc.h b/src/mesa/main/texcompress_s3tc.h
index d0a5b186b71..74a0343b9b9 100644
--- a/src/mesa/main/texcompress_s3tc.h
+++ b/src/mesa/main/texcompress_s3tc.h
@@ -25,9 +25,13 @@
 #ifndef TEXCOMPRESS_S3TC_H
 #define TEXCOMPRESS_S3TC_H
 
-#include "main/mtypes.h"
+#include "compiler.h"
+#include "glheader.h"
+#include "mfeatures.h"
 #include "texstore.h"
 
+struct gl_context;
+struct gl_texture_image;
 
 #if FEATURE_texture_s3tc
 
diff --git a/src/mesa/main/texenvprogram.h b/src/mesa/main/texenvprogram.h
index abfb916d21b..22e30a51943 100644
--- a/src/mesa/main/texenvprogram.h
+++ b/src/mesa/main/texenvprogram.h
@@ -27,7 +27,7 @@
 #define TEXENVPROGRAM_H
 
 
-#include "mtypes.h"
+struct gl_context;
 
 extern struct gl_fragment_program *
 _mesa_get_fixed_func_fragment_program(struct gl_context *ctx);
diff --git a/src/mesa/main/texformat.h b/src/mesa/main/texformat.h
index 8bd15076753..3cf09213ac4 100644
--- a/src/mesa/main/texformat.h
+++ b/src/mesa/main/texformat.h
@@ -27,9 +27,9 @@
 #define TEXFORMAT_H
 
 
-#include "mtypes.h"
 #include "formats.h"
 
+struct gl_context;
 
 extern gl_format
 _mesa_choose_tex_format( struct gl_context *ctx, GLint internalFormat,
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index 879ac529a01..c94f88e16e6 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -120,10 +120,15 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
    const GLint height = texImage->Height;
    const GLint depth = texImage->Depth;
    GLint img, row, col;
+   GLfloat *depthRow = (GLfloat *) malloc(width * sizeof(GLfloat));
+
+   if (!depthRow) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
+      return;
+   }
 
    for (img = 0; img < depth; img++) {
       for (row = 0; row < height; row++) {
-         GLfloat depthRow[MAX_WIDTH];
          void *dest = _mesa_image_address(dimensions, &ctx->Pack, pixels,
                                           width, height, format, type,
                                           img, row, 0);
@@ -135,6 +140,8 @@ get_tex_depth(struct gl_context *ctx, GLuint dimensions,
          _mesa_pack_depth_span(ctx, width, dest, type, depthRow, &ctx->Pack);
       }
    }
+
+   free(depthRow);
 }
 
 
@@ -244,6 +251,12 @@ get_tex_srgb(struct gl_context *ctx, GLuint dimensions,
    const GLint depth = texImage->Depth;
    const GLbitfield transferOps = 0x0;
    GLint img, row;
+   GLfloat (*rgba)[4] = (GLfloat (*)[4]) malloc(4 * width * sizeof(GLfloat));
+
+   if (!rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
+      return;
+   }
 
    for (img = 0; img < depth; img++) {
       for (row = 0; row < height; row++) {
@@ -251,7 +264,6 @@ get_tex_srgb(struct gl_context *ctx, GLuint dimensions,
                                           width, height, format, type,
                                           img, row, 0);
 
-         GLfloat rgba[MAX_WIDTH][4];
          GLint col;
 
          /* convert row to RGBA format */
@@ -279,6 +291,8 @@ get_tex_srgb(struct gl_context *ctx, GLuint dimensions,
                                     &ctx->Pack, transferOps);
       }
    }
+
+   free(rgba);
 }
 
 
@@ -314,13 +328,18 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
     */
    GLbitfield transferOps = 0x0;
    GLint img, row;
+   GLfloat (*rgba)[4] = (GLfloat (*)[4]) malloc(4 * width * sizeof(GLfloat));
+
+   if (!rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage");
+      return;
+   }
 
    for (img = 0; img < depth; img++) {
       for (row = 0; row < height; row++) {
          void *dest = _mesa_image_address(dimensions, &ctx->Pack, pixels,
                                           width, height, format, type,
                                           img, row, 0);
-         GLfloat rgba[MAX_WIDTH][4];
          GLint col;
          GLenum dataType = _mesa_get_format_datatype(texImage->TexFormat);
 
@@ -364,6 +383,8 @@ get_tex_rgba(struct gl_context *ctx, GLuint dimensions,
                                     &ctx->Pack, transferOps);
       }
    }
+
+   free(rgba);
 }
 
 
diff --git a/src/mesa/main/texgetimage.h b/src/mesa/main/texgetimage.h
index 81a3bbbd9a7..ef420ddabf5 100644
--- a/src/mesa/main/texgetimage.h
+++ b/src/mesa/main/texgetimage.h
@@ -27,7 +27,11 @@
 #ifndef TEXGETIMAGE_H
 #define TEXGETIMAGE_H
 
-#include "mtypes.h"
+#include "glheader.h"
+
+struct gl_context;
+struct gl_texture_image;
+struct gl_texture_object;
 
 extern void
 _mesa_get_teximage(struct gl_context *ctx, GLenum target, GLint level,
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 060f34b7f97..c5ae63052a7 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -636,6 +636,47 @@ _mesa_is_proxy_texture(GLenum target)
 
 
 /**
+ * Return the proxy target which corresponds to the given texture target
+ */
+static GLenum
+get_proxy_target(GLenum target)
+{
+   switch (target) {
+   case GL_TEXTURE_1D:
+   case GL_PROXY_TEXTURE_1D:
+      return GL_PROXY_TEXTURE_1D;
+   case GL_TEXTURE_2D:
+   case GL_PROXY_TEXTURE_2D:
+      return GL_PROXY_TEXTURE_2D;
+   case GL_TEXTURE_3D:
+   case GL_PROXY_TEXTURE_3D:
+      return GL_PROXY_TEXTURE_3D;
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X_ARB:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y_ARB:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y_ARB:
+   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z_ARB:
+   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
+   case GL_TEXTURE_CUBE_MAP_ARB:
+   case GL_PROXY_TEXTURE_CUBE_MAP_ARB:
+      return GL_PROXY_TEXTURE_CUBE_MAP_ARB;
+   case GL_TEXTURE_RECTANGLE_NV:
+   case GL_PROXY_TEXTURE_RECTANGLE_NV:
+      return GL_PROXY_TEXTURE_RECTANGLE_NV;
+   case GL_TEXTURE_1D_ARRAY_EXT:
+   case GL_PROXY_TEXTURE_1D_ARRAY_EXT:
+      return GL_PROXY_TEXTURE_1D_ARRAY_EXT;
+   case GL_TEXTURE_2D_ARRAY_EXT:
+   case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
+      return GL_PROXY_TEXTURE_2D_ARRAY_EXT;
+   default:
+      _mesa_problem(NULL, "unexpected target in get_proxy_target()");
+      return 0;
+   }
+}
+
+
+/**
  * Get the texture object that corresponds to the target of the given
  * texture unit.
  *
@@ -1178,94 +1219,110 @@ _mesa_test_proxy_teximage(struct gl_context *ctx, GLenum target, GLint level,
    switch (target) {
    case GL_PROXY_TEXTURE_1D:
       maxSize = 1 << (ctx->Const.MaxTextureLevels - 1);
-      if (width < 2 * border || width > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           width >0 && !_mesa_is_pow_two(width - 2 * border)) ||
-          level >= ctx->Const.MaxTextureLevels) {
-         /* bad width or level */
+      if (width < 2 * border || width > 2 + maxSize)
          return GL_FALSE;
+      if (level >= ctx->Const.MaxTextureLevels)
+         return GL_FALSE;
+      if (!ctx->Extensions.ARB_texture_non_power_of_two) {
+         if (width > 0 && !_mesa_is_pow_two(width - 2 * border))
+            return GL_FALSE;
       }
       return GL_TRUE;
+
    case GL_PROXY_TEXTURE_2D:
       maxSize = 1 << (ctx->Const.MaxTextureLevels - 1);
-      if (width < 2 * border || width > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           width > 0 && !_mesa_is_pow_two(width - 2 * border)) ||
-          height < 2 * border || height > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           height > 0 && !_mesa_is_pow_two(height - 2 * border)) ||
-          level >= ctx->Const.MaxTextureLevels) {
-         /* bad width or height or level */
+      if (width < 2 * border || width > 2 + maxSize)
+         return GL_FALSE;
+      if (height < 2 * border || height > 2 + maxSize)
+         return GL_FALSE;
+      if (level >= ctx->Const.MaxTextureLevels)
          return GL_FALSE;
+      if (!ctx->Extensions.ARB_texture_non_power_of_two) {
+         if (width > 0 && !_mesa_is_pow_two(width - 2 * border))
+            return GL_FALSE;
+         if (height > 0 && !_mesa_is_pow_two(height - 2 * border))
+            return GL_FALSE;
       }
       return GL_TRUE;
+
    case GL_PROXY_TEXTURE_3D:
       maxSize = 1 << (ctx->Const.Max3DTextureLevels - 1);
-      if (width < 2 * border || width > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           width > 0 && !_mesa_is_pow_two(width - 2 * border)) ||
-          height < 2 * border || height > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           height > 0 && !_mesa_is_pow_two(height - 2 * border)) ||
-          depth < 2 * border || depth > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           depth > 0 && !_mesa_is_pow_two(depth - 2 * border)) ||
-          level >= ctx->Const.Max3DTextureLevels) {
-         /* bad width or height or depth or level */
+      if (width < 2 * border || width > 2 + maxSize)
+         return GL_FALSE;
+      if (height < 2 * border || height > 2 + maxSize)
+         return GL_FALSE;
+      if (depth < 2 * border || depth > 2 + maxSize)
          return GL_FALSE;
+      if (level >= ctx->Const.Max3DTextureLevels)
+         return GL_FALSE;
+      if (!ctx->Extensions.ARB_texture_non_power_of_two) {
+         if (width > 0 && !_mesa_is_pow_two(width - 2 * border))
+            return GL_FALSE;
+         if (height > 0 && !_mesa_is_pow_two(height - 2 * border))
+            return GL_FALSE;
+         if (depth > 0 && !_mesa_is_pow_two(depth - 2 * border))
+            return GL_FALSE;
       }
       return GL_TRUE;
+
    case GL_PROXY_TEXTURE_RECTANGLE_NV:
-      if (width < 0 || width > ctx->Const.MaxTextureRectSize ||
-          height < 0 || height > ctx->Const.MaxTextureRectSize ||
-          level != 0) {
-         /* bad width or height or level */
+      maxSize = ctx->Const.MaxTextureRectSize;
+      if (width < 0 || width > maxSize)
+         return GL_FALSE;
+      if (height < 0 || height > maxSize)
+         return GL_FALSE;
+      if (level != 0)
          return GL_FALSE;
-      }
       return GL_TRUE;
+
    case GL_PROXY_TEXTURE_CUBE_MAP_ARB:
       maxSize = 1 << (ctx->Const.MaxCubeTextureLevels - 1);
-      if (width < 2 * border || width > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           width > 0 && !_mesa_is_pow_two(width - 2 * border)) ||
-          height < 2 * border || height > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           height > 0 && !_mesa_is_pow_two(height - 2 * border)) ||
-          level >= ctx->Const.MaxCubeTextureLevels) {
-         /* bad width or height */
+      if (width < 2 * border || width > 2 + maxSize)
+         return GL_FALSE;
+      if (height < 2 * border || height > 2 + maxSize)
          return GL_FALSE;
+      if (level >= ctx->Const.MaxCubeTextureLevels)
+         return GL_FALSE;
+      if (!ctx->Extensions.ARB_texture_non_power_of_two) {
+         if (width > 0 && !_mesa_is_pow_two(width - 2 * border))
+            return GL_FALSE;
+         if (height > 0 && !_mesa_is_pow_two(height - 2 * border))
+            return GL_FALSE;
       }
       return GL_TRUE;
+
    case GL_PROXY_TEXTURE_1D_ARRAY_EXT:
       maxSize = 1 << (ctx->Const.MaxTextureLevels - 1);
-      if (width < 2 * border || width > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           width > 0 && !_mesa_is_pow_two(width - 2 * border)) ||
-          level >= ctx->Const.MaxTextureLevels) {
-         /* bad width or level */
+      if (width < 2 * border || width > 2 + maxSize)
          return GL_FALSE;
-      }
-
-      if (height < 1 || height > ctx->Const.MaxArrayTextureLayers) {
+      if (height < 1 || height > ctx->Const.MaxArrayTextureLayers)
+         return GL_FALSE;
+      if (level >= ctx->Const.MaxTextureLevels)
          return GL_FALSE;
+      if (!ctx->Extensions.ARB_texture_non_power_of_two) {
+         if (width > 0 && !_mesa_is_pow_two(width - 2 * border))
+            return GL_FALSE;
       }
       return GL_TRUE;
+
    case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
       maxSize = 1 << (ctx->Const.MaxTextureLevels - 1);
-      if (width < 2 * border || width > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           width > 0 && !_mesa_is_pow_two(width - 2 * border)) ||
-          height < 2 * border || height > 2 + maxSize ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           height > 0 && !_mesa_is_pow_two(height - 2 * border)) ||
-          level >= ctx->Const.MaxTextureLevels) {
-         /* bad width or height or level */
+      if (width < 2 * border || width > 2 + maxSize)
          return GL_FALSE;
-      }
-      if (depth < 1 || depth > ctx->Const.MaxArrayTextureLayers) {
+      if (height < 2 * border || height > 2 + maxSize)
+         return GL_FALSE;
+      if (depth < 1 || depth > ctx->Const.MaxArrayTextureLayers)
+         return GL_FALSE;
+      if (level >= ctx->Const.MaxTextureLevels)
          return GL_FALSE;
+      if (!ctx->Extensions.ARB_texture_non_power_of_two) {
+         if (width > 0 && !_mesa_is_pow_two(width - 2 * border))
+            return GL_FALSE;
+         if (height > 0 && !_mesa_is_pow_two(height - 2 * border))
+            return GL_FALSE;
       }
       return GL_TRUE;
+
    default:
       _mesa_problem(ctx, "Invalid target in _mesa_test_proxy_teximage");
       return GL_FALSE;
@@ -1274,15 +1331,37 @@ _mesa_test_proxy_teximage(struct gl_context *ctx, GLenum target, GLint level,
 
 
 /**
- * Helper function to determine whether a target supports compressed textures
+ * Check if the memory used by the texture would exceed the driver's limit.
+ * This lets us support a max 3D texture size of 8K (for example) but
+ * prevents allocating a full 8K x 8K x 8K texture.
+ * XXX this could be rolled into the proxy texture size test (above) but
+ * we don't have the actual texture internal format at that point.
+ */
+static GLboolean
+legal_texture_size(struct gl_context *ctx, gl_format format,
+                   GLint width, GLint height, GLint depth)
+{
+   uint64_t bytes = _mesa_format_image_size64(format, width, height, depth);
+   uint64_t mbytes = bytes / (1024 * 1024); /* convert to MB */
+   return mbytes <= (uint64_t) ctx->Const.MaxTextureMbytes;
+}
+
+
+
+/**
+ * Helper function to determine whether a target and specific compression
+ * format are supported.
  */
 static GLboolean
-target_can_be_compressed(struct gl_context *ctx, GLenum target)
+target_can_be_compressed(const struct gl_context *ctx, GLenum target,
+                         GLenum intFormat)
 {
+   (void) intFormat;  /* not used yet */
+
    switch (target) {
    case GL_TEXTURE_2D:
    case GL_PROXY_TEXTURE_2D:
-      return GL_TRUE;
+      return GL_TRUE; /* true for any compressed format so far */
    case GL_PROXY_TEXTURE_CUBE_MAP:
    case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
    case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
@@ -1301,6 +1380,109 @@ target_can_be_compressed(struct gl_context *ctx, GLenum target)
 
 
 /**
+ * Check if the given texture target value is legal for a
+ * glTexImage1/2/3D call.
+ */
+static GLboolean
+legal_teximage_target(struct gl_context *ctx, GLuint dims, GLenum target)
+{
+   switch (dims) {
+   case 1:
+      switch (target) {
+      case GL_TEXTURE_1D:
+      case GL_PROXY_TEXTURE_1D:
+         return GL_TRUE;
+      default:
+         return GL_FALSE;
+      }
+   case 2:
+      switch (target) {
+      case GL_TEXTURE_2D:
+      case GL_PROXY_TEXTURE_2D:
+         return GL_TRUE;
+      case GL_PROXY_TEXTURE_CUBE_MAP:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+         return ctx->Extensions.ARB_texture_cube_map;
+      case GL_TEXTURE_RECTANGLE_NV:
+      case GL_PROXY_TEXTURE_RECTANGLE_NV:
+         return ctx->Extensions.NV_texture_rectangle;
+      case GL_TEXTURE_1D_ARRAY_EXT:
+      case GL_PROXY_TEXTURE_1D_ARRAY_EXT:
+         return ctx->Extensions.MESA_texture_array;
+      default:
+         return GL_FALSE;
+      }
+   case 3:
+      switch (target) {
+      case GL_TEXTURE_3D:
+      case GL_PROXY_TEXTURE_3D:
+         return GL_TRUE;
+      case GL_TEXTURE_2D_ARRAY_EXT:
+      case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
+         return ctx->Extensions.MESA_texture_array;
+      default:
+         return GL_FALSE;
+      }
+   default:
+      _mesa_problem(ctx, "invalid dims=%u in legal_teximage_target()", dims);
+      return GL_FALSE;
+   }
+}
+
+
+/**
+ * Check if the given texture target value is legal for a
+ * glTexSubImage, glCopyTexSubImage or glCopyTexImage call.
+ * The difference compared to legal_teximage_target() above is that
+ * proxy targets are not supported.
+ */
+static GLboolean
+legal_texsubimage_target(struct gl_context *ctx, GLuint dims, GLenum target)
+{
+   switch (dims) {
+   case 1:
+      return target == GL_TEXTURE_1D;
+   case 2:
+      switch (target) {
+      case GL_TEXTURE_2D:
+         return GL_TRUE;
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+         return ctx->Extensions.ARB_texture_cube_map;
+      case GL_TEXTURE_RECTANGLE_NV:
+         return ctx->Extensions.NV_texture_rectangle;
+      case GL_TEXTURE_1D_ARRAY_EXT:
+         return ctx->Extensions.MESA_texture_array;
+      default:
+         return GL_FALSE;
+      }
+   case 3:
+      switch (target) {
+      case GL_TEXTURE_3D:
+         return GL_TRUE;
+      case GL_TEXTURE_2D_ARRAY_EXT:
+         return ctx->Extensions.MESA_texture_array;
+      default:
+         return GL_FALSE;
+      }
+   default:
+      _mesa_problem(ctx, "invalid dims=%u in legal_texsubimage_target()",
+                    dims);
+      return GL_FALSE;
+   }
+}
+
+
+/**
  * Test the glTexImage[123]D() parameters for errors.
  * 
  * \param ctx GL context.
@@ -1329,10 +1511,10 @@ texture_error_check( struct gl_context *ctx,
                      GLint width, GLint height,
                      GLint depth, GLint border )
 {
-   const GLboolean isProxy = _mesa_is_proxy_texture(target);
+   const GLenum proxyTarget = get_proxy_target(target);
+   const GLboolean isProxy = target == proxyTarget;
    GLboolean sizeOK = GL_TRUE;
    GLboolean colorFormat, indexFormat;
-   GLenum proxy_target;
 
    /* Basic level check (more checking in ctx->Driver.TestProxyTexImage) */
    if (level < 0 || level >= MAX_TEXTURE_LEVELS) {
@@ -1362,71 +1544,16 @@ texture_error_check( struct gl_context *ctx,
       return GL_TRUE;
    }
 
-   /* Check target and call ctx->Driver.TestProxyTexImage() to check the
-    * level, width, height and depth.
-    */
-   if (dimensions == 1) {
-      if (target == GL_PROXY_TEXTURE_1D || target == GL_TEXTURE_1D) {
-         proxy_target = GL_PROXY_TEXTURE_1D;
-         height = 1;
-         depth = 1;
-      }
-      else {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glTexImage1D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 2) {
-      depth = 1;
-      if (target == GL_PROXY_TEXTURE_2D || target == GL_TEXTURE_2D) {
-         proxy_target = GL_PROXY_TEXTURE_2D;
-      }
-      else if (target == GL_PROXY_TEXTURE_CUBE_MAP_ARB ||
-               (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-                target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB)) {
-         if (!ctx->Extensions.ARB_texture_cube_map) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glTexImage2D(target)");
-            return GL_TRUE;
-         }
-         proxy_target = GL_PROXY_TEXTURE_CUBE_MAP_ARB;
-         sizeOK = (width == height);
-      }
-      else if (target == GL_PROXY_TEXTURE_RECTANGLE_NV ||
-               target == GL_TEXTURE_RECTANGLE_NV) {
-         if (!ctx->Extensions.NV_texture_rectangle) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glTexImage2D(target)");
-            return GL_TRUE;
-         }
-         proxy_target = GL_PROXY_TEXTURE_RECTANGLE_NV;
-      }
-      else if (target == GL_PROXY_TEXTURE_1D_ARRAY_EXT ||
-               target == GL_TEXTURE_1D_ARRAY_EXT) {
-         proxy_target = GL_PROXY_TEXTURE_1D_ARRAY_EXT;
-      }
-      else {
-         _mesa_error(ctx, GL_INVALID_ENUM, "glTexImage2D(target)");
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 3) {
-      if (target == GL_PROXY_TEXTURE_3D || target == GL_TEXTURE_3D) {
-         proxy_target = GL_PROXY_TEXTURE_3D;
-      }
-      else if (target == GL_PROXY_TEXTURE_2D_ARRAY_EXT ||
-               target == GL_TEXTURE_2D_ARRAY_EXT) {
-         proxy_target = GL_PROXY_TEXTURE_2D_ARRAY_EXT;
-      }
-      else {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glTexImage3D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else {
-      _mesa_problem( ctx, "bad dims in texture_error_check" );
-      return GL_TRUE;
+   /* Do this simple check before calling the TestProxyTexImage() function */
+   if (proxyTarget == GL_PROXY_TEXTURE_CUBE_MAP_ARB) {
+      sizeOK = (width == height);
    }
 
-   sizeOK = sizeOK && ctx->Driver.TestProxyTexImage(ctx, proxy_target, level,
+   /*
+    * Use the proxy texture driver hook to see if the size/level/etc are
+    * legal.
+    */
+   sizeOK = sizeOK && ctx->Driver.TestProxyTexImage(ctx, proxyTarget, level,
                                                     internalFormat, format,
                                                     type, width, height,
                                                     depth, border);
@@ -1531,9 +1658,10 @@ texture_error_check( struct gl_context *ctx,
 
    /* additional checks for compressed textures */
    if (_mesa_is_compressed_format(ctx, internalFormat)) {
-      if (!target_can_be_compressed(ctx, target) && !isProxy) {
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glTexImage%dD(target)", dimensions);
+      if (!target_can_be_compressed(ctx, target, internalFormat)) {
+         if (!isProxy)
+            _mesa_error(ctx, GL_INVALID_ENUM,
+                        "glTexImage%dD(target)", dimensions);
          return GL_TRUE;
       }
       if (border != 0) {
@@ -1591,61 +1719,13 @@ subtexture_error_check( struct gl_context *ctx, GLuint dimensions,
                         GLint width, GLint height, GLint depth,
                         GLenum format, GLenum type )
 {
-   /* Check target */
-   if (dimensions == 1) {
-      if (target != GL_TEXTURE_1D) {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage1D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 2) {
-      if (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-          target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB) {
-         if (!ctx->Extensions.ARB_texture_cube_map) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage2D(target)" );
-            return GL_TRUE;
-         }
-      }
-      else if (target == GL_TEXTURE_RECTANGLE_NV) {
-         if (!ctx->Extensions.NV_texture_rectangle) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage2D(target)" );
-            return GL_TRUE;
-         }
-      }
-      else if (target == GL_TEXTURE_1D_ARRAY_EXT) {
-        if (!ctx->Extensions.MESA_texture_array) {
-           _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage2D(target)" );
-           return GL_TRUE;
-        }
-      }
-      else if (target != GL_TEXTURE_2D) {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage2D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 3) {
-      if (target == GL_TEXTURE_2D_ARRAY_EXT) {
-         if (!ctx->Extensions.MESA_texture_array) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage3D(target)" );
-            return GL_TRUE;
-         }
-      }
-      else if (target != GL_TEXTURE_3D) {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glTexSubImage3D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else {
-      _mesa_problem( ctx, "invalid dims in texture_error_check" );
-      return GL_TRUE;
-   }
-
    /* Basic level check */
    if (level < 0 || level >= MAX_TEXTURE_LEVELS) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glTexSubImage2D(level=%d)", level);
       return GL_TRUE;
    }
 
+   /* Check for negative sizes */
    if (width < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glTexSubImage%dD(width=%d)", dimensions, width);
@@ -1732,13 +1812,6 @@ subtexture_error_check2( struct gl_context *ctx, GLuint dimensions,
    if (_mesa_is_format_compressed(destTex->TexFormat)) {
       GLuint bw, bh;
 
-      if (!target_can_be_compressed(ctx, target)) {
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glTexSubImage%dD(target=%s)", dimensions,
-                     _mesa_lookup_enum_by_nr(target));
-         return GL_TRUE;
-      }
-
       /* do tests which depend on compression block size */
       _mesa_get_format_block_size(destTex->TexFormat, &bw, &bh);
 
@@ -1789,10 +1862,18 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
                          GLenum target, GLint level, GLint internalFormat,
                          GLint width, GLint height, GLint border )
 {
-   GLenum type;
+   const GLenum proxyTarget = get_proxy_target(target);
+   const GLenum type = GL_FLOAT;
    GLboolean sizeOK;
    GLint format;
 
+   /* check target */
+   if (!legal_texsubimage_target(ctx, dimensions, target)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage%uD(target=%s)",
+                  dimensions, _mesa_lookup_enum_by_nr(target));
+      return GL_TRUE;
+   }       
+
    /* Basic level check (more checking in ctx->Driver.TestProxyTexImage) */
    if (level < 0 || level >= MAX_TEXTURE_LEVELS) {
       _mesa_error(ctx, GL_INVALID_VALUE,
@@ -1830,75 +1911,14 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
       return GL_TRUE;
    }
 
-   /* NOTE: the format and type aren't really significant for
-    * TestProxyTexImage().  Only the internalformat really matters.
-    */
-   type = GL_FLOAT;
+   /* Do size, level checking */
+   sizeOK = (proxyTarget == GL_PROXY_TEXTURE_CUBE_MAP_ARB)
+      ? (width == height) : 1;
 
-   /* Check target and call ctx->Driver.TestProxyTexImage() to check the
-    * level, width, height and depth.
-    */
-   if (dimensions == 1) {
-      if (target == GL_TEXTURE_1D) {
-         sizeOK = ctx->Driver.TestProxyTexImage(ctx, GL_PROXY_TEXTURE_1D,
-                                                level, internalFormat,
-                                                format, type,
-                                                width, 1, 1, border);
-      }
-      else {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexImage1D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 2) {
-      if (target == GL_TEXTURE_2D) {
-         sizeOK = ctx->Driver.TestProxyTexImage(ctx, GL_PROXY_TEXTURE_2D,
-                                                level, internalFormat,
-                                                format, type,
-                                                width, height, 1, border);
-      }
-      else if (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-               target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB) {
-         if (!ctx->Extensions.ARB_texture_cube_map) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexImage2D(target)" );
-            return GL_TRUE;
-         }
-         sizeOK = (width == height) &&
-            ctx->Driver.TestProxyTexImage(ctx, GL_PROXY_TEXTURE_CUBE_MAP_ARB,
-                                          level, internalFormat, format, type,
-                                          width, height, 1, border);
-      }
-      else if (target == GL_TEXTURE_RECTANGLE_NV) {
-         if (!ctx->Extensions.NV_texture_rectangle) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexImage2D(target)" );
-            return GL_TRUE;
-         }
-         sizeOK = ctx->Driver.TestProxyTexImage(ctx,
-                                                GL_PROXY_TEXTURE_RECTANGLE_NV,
-                                                level, internalFormat,
-                                                format, type,
-                                                width, height, 1, border);
-      }
-      else if (target == GL_TEXTURE_1D_ARRAY_EXT) {
-         if (!ctx->Extensions.MESA_texture_array) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexImage2D(target)");
-            return GL_TRUE;
-         }
-         sizeOK = ctx->Driver.TestProxyTexImage(ctx,
-                                                GL_PROXY_TEXTURE_1D_ARRAY_EXT,
-                                                level, internalFormat,
-                                                format, type,
-                                                width, height, 1, border);
-      }
-      else {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexImage2D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else {
-      _mesa_problem(ctx, "invalid dimensions in copytexture_error_check");
-      return GL_TRUE;
-   }
+   sizeOK = sizeOK && ctx->Driver.TestProxyTexImage(ctx, proxyTarget, level,
+                                                    internalFormat, format,
+                                                    type, width, height,
+                                                    1, border);
 
    if (!sizeOK) {
       if (dimensions == 1) {
@@ -1914,7 +1934,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_compressed_format(ctx, internalFormat)) {
-      if (!target_can_be_compressed(ctx, target)) {
+      if (!target_can_be_compressed(ctx, target, internalFormat)) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "glCopyTexImage%dD(target)", dimensions);
          return GL_TRUE;
@@ -1973,45 +1993,11 @@ copytexsubimage_error_check1( struct gl_context *ctx, GLuint dimensions,
       }
    }
 
-   /* Check target */
-   if (dimensions == 1) {
-      if (target != GL_TEXTURE_1D) {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexSubImage1D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 2) {
-      if (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-          target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB) {
-         if (!ctx->Extensions.ARB_texture_cube_map) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexSubImage2D(target)" );
-            return GL_TRUE;
-         }
-      }
-      else if (target == GL_TEXTURE_RECTANGLE_NV) {
-         if (!ctx->Extensions.NV_texture_rectangle) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexSubImage2D(target)" );
-            return GL_TRUE;
-         }
-      }
-      else if (target == GL_TEXTURE_1D_ARRAY_EXT) {
-         if (!ctx->Extensions.MESA_texture_array) {
-            _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexSubImage2D(target)" );
-            return GL_TRUE;
-         }
-      }
-      else if (target != GL_TEXTURE_2D) {
-         _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexSubImage2D(target)" );
-         return GL_TRUE;
-      }
-   }
-   else if (dimensions == 3) {
-      if (((target != GL_TEXTURE_2D_ARRAY_EXT) ||
-	   (!ctx->Extensions.MESA_texture_array))
-	  && (target != GL_TEXTURE_3D)) {
-	 _mesa_error( ctx, GL_INVALID_ENUM, "glCopyTexSubImage3D(target)" );
-	 return GL_TRUE;
-      }
+   /* check target (proxies not allowed) */
+   if (!legal_texsubimage_target(ctx, dimensions, target)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glCopyTexSubImage%uD(target=%s)",
+                  dimensions, _mesa_lookup_enum_by_nr(target));
+      return GL_TRUE;
    }
 
    /* Check level */
@@ -2100,11 +2086,6 @@ copytexsubimage_error_check2( struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_format_compressed(teximage->TexFormat)) {
-      if (!target_can_be_compressed(ctx, target)) {
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glCopyTexSubImage%dD(target)", dimensions);
-         return GL_TRUE;
-      }
       /* offset must be multiple of 4 */
       if ((xoffset & 3) || (yoffset & 3)) {
          _mesa_error(ctx, GL_INVALID_VALUE,
@@ -2337,89 +2318,48 @@ _mesa_choose_texture_format(struct gl_context *ctx,
 }
 
 
-
-/*
- * Called from the API.  Note that width includes the border.
+/**
+ * Common code to implement all the glTexImage1D/2D/3D functions.
  */
-void GLAPIENTRY
-_mesa_TexImage1D( GLenum target, GLint level, GLint internalFormat,
-                  GLsizei width, GLint border, GLenum format,
-                  GLenum type, const GLvoid *pixels )
+static void
+teximage(struct gl_context *ctx, GLuint dims,
+         GLenum target, GLint level, GLint internalFormat,
+         GLsizei width, GLsizei height, GLsizei depth,
+         GLint border, GLenum format, GLenum type,
+         const GLvoid *pixels)
 {
-   GET_CURRENT_CONTEXT(ctx);
+   GLboolean error;
+
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexImage1D %s %d %s %d %d %s %s %p\n",
+      _mesa_debug(ctx, "glTexImage%uD %s %d %s %d %d %d %d %s %s %p\n",
+                  dims,
                   _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat), width, border,
+                  _mesa_lookup_enum_by_nr(internalFormat),
+                  width, height, depth, border,
                   _mesa_lookup_enum_by_nr(format),
                   _mesa_lookup_enum_by_nr(type), pixels);
 
-   internalFormat = override_internal_format(internalFormat, width, 1);
-
-   if (target == GL_TEXTURE_1D) {
-      /* non-proxy target */
-      struct gl_texture_object *texObj;
-      struct gl_texture_image *texImage;
-      const GLuint face = _mesa_tex_target_to_face(target);
-
-      if (texture_error_check(ctx, 1, target, level, internalFormat,
-                              format, type, width, 1, 1, border)) {
-         return;   /* error was recorded */
-      }
-
-      if (ctx->NewState & _MESA_NEW_TRANSFER_STATE)
-	 _mesa_update_state(ctx);
-
-      texObj = _mesa_get_current_tex_object(ctx, target);
-      _mesa_lock_texture(ctx, texObj);
-      {
-	 texImage = _mesa_get_tex_image(ctx, texObj, target, level);
-	 if (!texImage) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-	 }
-         else {
-            gl_format texFormat;
-
-            if (texImage->Data) {
-               ctx->Driver.FreeTexImageData( ctx, texImage );
-            }
-
-            ASSERT(texImage->Data == NULL);
-
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, format,
-                                                    type);
-
-            _mesa_init_teximage_fields(ctx, target, texImage,
-                                       width, 1, 1,
-                                       border, internalFormat,
-                                       texFormat);
+   internalFormat = override_internal_format(internalFormat, width, height);
 
-            /* Give the texture to the driver.  <pixels> may be null. */
-            ASSERT(ctx->Driver.TexImage1D);
-            ctx->Driver.TexImage1D(ctx, target, level, internalFormat,
-                                   width, border, format, type, pixels,
-                                   &ctx->Unpack, texObj, texImage);
+   /* target error checking */
+   if (!legal_teximage_target(ctx, dims, target)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glTexImage%uD(target=%s)",
+                  dims, _mesa_lookup_enum_by_nr(target));
+      return;
+   }
 
-            check_gen_mipmap(ctx, target, texObj, level);
+   /* general error checking */
+   error = texture_error_check(ctx, dims, target, level, internalFormat,
+                               format, type, width, height, depth, border);
 
-            update_fbo_texture(ctx, texObj, face, level);
+   if (_mesa_is_proxy_texture(target)) {
+      /* Proxy texture: just clear or set state depending on error checking */
+      struct gl_texture_image *texImage =
+         _mesa_get_proxy_tex_image(ctx, target, level);
 
-            /* state update */
-            texObj->_Complete = GL_FALSE;
-            ctx->NewState |= _NEW_TEXTURE;
-         }
-      }
-      _mesa_unlock_texture(ctx, texObj);
-   }
-   else if (target == GL_PROXY_TEXTURE_1D) {
-      /* Proxy texture: check for errors and update proxy state */
-      struct gl_texture_image *texImage;
-      texImage = _mesa_get_proxy_tex_image(ctx, target, level);
-      if (texture_error_check(ctx, 1, target, level, internalFormat,
-                              format, type, width, 1, 1, border)) {
+      if (error) {
          /* when error, clear all proxy texture image parameters */
          if (texImage)
             clear_teximage_fields(texImage);
@@ -2428,54 +2368,28 @@ _mesa_TexImage1D( GLenum target, GLint level, GLint internalFormat,
          /* no error, set the tex image parameters */
          struct gl_texture_object *texObj =
             _mesa_get_current_tex_object(ctx, target);
-         gl_format texFormat = _mesa_choose_texture_format(ctx, texObj, target,
-                                                           level,
+         gl_format texFormat = _mesa_choose_texture_format(ctx, texObj,
+                                                           target, level,
                                                            internalFormat,
                                                            format, type);
-         _mesa_init_teximage_fields(ctx, target, texImage, width, 1, 1,
-                                    border, internalFormat, texFormat);
+
+         if (legal_texture_size(ctx, texFormat, width, height, depth)) {
+            _mesa_init_teximage_fields(ctx, target, texImage, width, height,
+                                       depth, border, internalFormat,
+                                       texFormat);
+         }
+         else if (texImage) {
+            clear_teximage_fields(texImage);
+         }
       }
    }
    else {
-      _mesa_error( ctx, GL_INVALID_ENUM, "glTexImage1D(target)" );
-      return;
-   }
-}
-
-
-void GLAPIENTRY
-_mesa_TexImage2D( GLenum target, GLint level, GLint internalFormat,
-                  GLsizei width, GLsizei height, GLint border,
-                  GLenum format, GLenum type,
-                  const GLvoid *pixels )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexImage2D %s %d %s %d %d %d %s %s %p\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat), width, height,
-                  border, _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
-
-   internalFormat = override_internal_format(internalFormat, width, height);
-
-   if (target == GL_TEXTURE_2D ||
-       (ctx->Extensions.ARB_texture_cube_map &&
-        target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB) ||
-       (ctx->Extensions.NV_texture_rectangle &&
-        target == GL_TEXTURE_RECTANGLE_NV) ||
-       (ctx->Extensions.MESA_texture_array &&
-        target == GL_TEXTURE_1D_ARRAY_EXT)) {
       /* non-proxy target */
+      const GLuint face = _mesa_tex_target_to_face(target);
       struct gl_texture_object *texObj;
       struct gl_texture_image *texImage;
-      const GLuint face = _mesa_tex_target_to_face(target);
 
-      if (texture_error_check(ctx, 2, target, level, internalFormat,
-                              format, type, width, height, 1, border)) {
+      if (error) {
          return;   /* error was recorded */
       }
 
@@ -2483,11 +2397,13 @@ _mesa_TexImage2D( GLenum target, GLint level, GLint internalFormat,
 	 _mesa_update_state(ctx);
 
       texObj = _mesa_get_current_tex_object(ctx, target);
+
       _mesa_lock_texture(ctx, texObj);
       {
 	 texImage = _mesa_get_tex_image(ctx, texObj, target, level);
+
 	 if (!texImage) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
+	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage%uD", dims);
 	 }
          else {
             gl_format texFormat;
@@ -2497,63 +2413,81 @@ _mesa_TexImage2D( GLenum target, GLint level, GLint internalFormat,
             }
 
             ASSERT(texImage->Data == NULL);
-
             texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
                                                     internalFormat, format,
                                                     type);
 
-            _mesa_init_teximage_fields(ctx, target, texImage, width, height, 1,
-                                       border, internalFormat, texFormat);
-
-            /* Give the texture to the driver.  <pixels> may be null. */
-            ASSERT(ctx->Driver.TexImage2D);
-            ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
-                                   width, height, border, format, type,
-                                   pixels, &ctx->Unpack, texObj, texImage);
+            if (legal_texture_size(ctx, texFormat, width, height, depth)) {
+               _mesa_init_teximage_fields(ctx, target, texImage,
+                                          width, height, depth,
+                                          border, internalFormat, texFormat);
+
+               /* Give the texture to the driver.  <pixels> may be null. */
+               ASSERT(ctx->Driver.TexImage3D);
+               switch (dims) {
+               case 1:
+                  ctx->Driver.TexImage1D(ctx, target, level, internalFormat,
+                                         width, border, format,
+                                         type, pixels, &ctx->Unpack, texObj,
+                                         texImage);
+                  break;
+               case 2:
+                  ctx->Driver.TexImage2D(ctx, target, level, internalFormat,
+                                         width, height, border, format,
+                                         type, pixels, &ctx->Unpack, texObj,
+                                         texImage);
+                  break;
+               case 3:
+                  ctx->Driver.TexImage3D(ctx, target, level, internalFormat,
+                                         width, height, depth, border, format,
+                                         type, pixels, &ctx->Unpack, texObj,
+                                         texImage);
+                  break;
+               default:
+                  _mesa_problem(ctx, "invalid dims=%u in teximage()", dims);
+               }
 
-            check_gen_mipmap(ctx, target, texObj, level);
+               check_gen_mipmap(ctx, target, texObj, level);
 
-            update_fbo_texture(ctx, texObj, face, level);
+               update_fbo_texture(ctx, texObj, face, level);
 
-            /* state update */
-            texObj->_Complete = GL_FALSE;
-            ctx->NewState |= _NEW_TEXTURE;
+               /* state update */
+               texObj->_Complete = GL_FALSE;
+               ctx->NewState |= _NEW_TEXTURE;
+            }
+            else {
+               _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage%uD", dims);
+            }
          }
       }
       _mesa_unlock_texture(ctx, texObj);
    }
-   else if (target == GL_PROXY_TEXTURE_2D ||
-            (target == GL_PROXY_TEXTURE_CUBE_MAP_ARB &&
-             ctx->Extensions.ARB_texture_cube_map) ||
-            (target == GL_PROXY_TEXTURE_RECTANGLE_NV &&
-             ctx->Extensions.NV_texture_rectangle) ||
-            (ctx->Extensions.MESA_texture_array &&
-             target == GL_PROXY_TEXTURE_1D_ARRAY_EXT)) {
-      /* Proxy texture: check for errors and update proxy state */
-      struct gl_texture_image *texImage;
-      texImage = _mesa_get_proxy_tex_image(ctx, target, level);
-      if (texture_error_check(ctx, 2, target, level, internalFormat,
-                              format, type, width, height, 1, border)) {
-         /* when error, clear all proxy texture image parameters */
-         if (texImage)
-            clear_teximage_fields(texImage);
-      }
-      else {
-         /* no error, set the tex image parameters */
-         struct gl_texture_object *texObj =
-            _mesa_get_current_tex_object(ctx, target);
-         gl_format texFormat = _mesa_choose_texture_format(ctx, texObj,
-                                                           target, level,
-                                                           internalFormat,
-                                                           format, type);
-         _mesa_init_teximage_fields(ctx, target, texImage, width, height, 1,
-                                    border, internalFormat, texFormat);
-      }
-   }
-   else {
-      _mesa_error( ctx, GL_INVALID_ENUM, "glTexImage2D(target)" );
-      return;
-   }
+}
+
+
+/*
+ * Called from the API.  Note that width includes the border.
+ */
+void GLAPIENTRY
+_mesa_TexImage1D( GLenum target, GLint level, GLint internalFormat,
+                  GLsizei width, GLint border, GLenum format,
+                  GLenum type, const GLvoid *pixels )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   teximage(ctx, 1, target, level, internalFormat, width, 1, 1,
+            border, format, type, pixels);
+}
+
+
+void GLAPIENTRY
+_mesa_TexImage2D( GLenum target, GLint level, GLint internalFormat,
+                  GLsizei width, GLsizei height, GLint border,
+                  GLenum format, GLenum type,
+                  const GLvoid *pixels )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   teximage(ctx, 2, target, level, internalFormat, width, height, 1,
+            border, format, type, pixels);
 }
 
 
@@ -2568,100 +2502,8 @@ _mesa_TexImage3D( GLenum target, GLint level, GLint internalFormat,
                   const GLvoid *pixels )
 {
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexImage3D %s %d %s %d %d %d %d %s %s %p\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat), width, height,
-                  depth, border, _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
-
-   internalFormat = override_internal_format(internalFormat, width, height);
-
-   if (target == GL_TEXTURE_3D ||
-       (ctx->Extensions.MESA_texture_array &&
-        target == GL_TEXTURE_2D_ARRAY_EXT)) {
-      /* non-proxy target */
-      struct gl_texture_object *texObj;
-      struct gl_texture_image *texImage;
-      const GLuint face = _mesa_tex_target_to_face(target);
-
-      if (texture_error_check(ctx, 3, target, level, (GLint) internalFormat,
-                              format, type, width, height, depth, border)) {
-         return;   /* error was recorded */
-      }
-
-      if (ctx->NewState & _MESA_NEW_TRANSFER_STATE)
-	 _mesa_update_state(ctx);
-
-      texObj = _mesa_get_current_tex_object(ctx, target);
-      _mesa_lock_texture(ctx, texObj);
-      {
-	 texImage = _mesa_get_tex_image(ctx, texObj, target, level);
-	 if (!texImage) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-	 }
-         else {
-            gl_format texFormat;
-
-            if (texImage->Data) {
-               ctx->Driver.FreeTexImageData( ctx, texImage );
-            }
-
-            ASSERT(texImage->Data == NULL);
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, format,
-                                                    type);
-            _mesa_init_teximage_fields(ctx, target, texImage,
-                                       width, height, depth,
-                                       border, internalFormat, texFormat);
-
-            /* Give the texture to the driver.  <pixels> may be null. */
-            ASSERT(ctx->Driver.TexImage3D);
-            ctx->Driver.TexImage3D(ctx, target, level, internalFormat,
-                                   width, height, depth, border, format, type,
-                                   pixels, &ctx->Unpack, texObj, texImage);
-
-            check_gen_mipmap(ctx, target, texObj, level);
-
-            update_fbo_texture(ctx, texObj, face, level);
-
-            /* state update */
-            texObj->_Complete = GL_FALSE;
-            ctx->NewState |= _NEW_TEXTURE;
-         }
-      }
-      _mesa_unlock_texture(ctx, texObj);
-   }
-   else if (target == GL_PROXY_TEXTURE_3D ||
-       (ctx->Extensions.MESA_texture_array &&
-        target == GL_PROXY_TEXTURE_2D_ARRAY_EXT)) {
-      /* Proxy texture: check for errors and update proxy state */
-      struct gl_texture_image *texImage;
-      texImage = _mesa_get_proxy_tex_image(ctx, target, level);
-      if (texture_error_check(ctx, 3, target, level, internalFormat,
-                              format, type, width, height, depth, border)) {
-         /* when error, clear all proxy texture image parameters */
-         if (texImage)
-            clear_teximage_fields(texImage);
-      }
-      else {
-         /* no error, set the tex image parameters */
-         struct gl_texture_object *texObj =
-            _mesa_get_current_tex_object(ctx, target);
-         gl_format texFormat = _mesa_choose_texture_format(ctx, texObj,
-                                                           target, level,
-                                                           internalFormat,
-                                                           format, type);
-         _mesa_init_teximage_fields(ctx, target, texImage, width, height,
-                                    depth, border, internalFormat, texFormat);
-      }
-   }
-   else {
-      _mesa_error( ctx, GL_INVALID_ENUM, "glTexImage3D(target)" );
-      return;
-   }
+   teximage(ctx, 3, target, level, internalFormat, width, height, depth,
+            border, format, type, pixels);
 }
 
 
@@ -2724,51 +2566,92 @@ _mesa_EGLImageTargetTexture2DOES (GLenum target, GLeglImageOES image)
 #endif
 
 
-void GLAPIENTRY
-_mesa_TexSubImage1D( GLenum target, GLint level,
-                     GLint xoffset, GLsizei width,
-                     GLenum format, GLenum type,
-                     const GLvoid *pixels )
+
+/**
+ * Implement all the glTexSubImage1/2/3D() functions.
+ */
+static void
+texsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
+            GLint xoffset, GLint yoffset, GLint zoffset,
+            GLsizei width, GLsizei height, GLsizei depth,
+            GLenum format, GLenum type, const GLvoid *pixels )
 {
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
-   GET_CURRENT_CONTEXT(ctx);
+
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexSubImage1D %s %d %d %d %s %s %p\n",
+      _mesa_debug(ctx, "glTexSubImage%uD %s %d %d %d %d %d %d %d %s %s %p\n",
+                  dims,
                   _mesa_lookup_enum_by_nr(target), level,
-                  xoffset, width, _mesa_lookup_enum_by_nr(format),
+                  xoffset, yoffset, zoffset, width, height, depth,
+                  _mesa_lookup_enum_by_nr(format),
                   _mesa_lookup_enum_by_nr(type), pixels);
 
+   /* check target (proxies not allowed) */
+   if (!legal_texsubimage_target(ctx, dims, target)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glTexSubImage%uD(target=%s)",
+                  dims, _mesa_lookup_enum_by_nr(target));
+      return;
+   }       
+
    if (ctx->NewState & _MESA_NEW_TRANSFER_STATE)
       _mesa_update_state(ctx);
 
-   if (subtexture_error_check(ctx, 1, target, level, xoffset, 0, 0,
-			       width, 1, 1, format, type)) {
+   if (subtexture_error_check(ctx, dims, target, level, xoffset, yoffset, zoffset,
+                              width, height, depth, format, type)) {
       return;   /* error was detected */
    }
 
-
    texObj = _mesa_get_current_tex_object(ctx, target);
-   assert(texObj);
 
    _mesa_lock_texture(ctx, texObj);
    {
       texImage = _mesa_select_tex_image(ctx, texObj, target, level);
 
-      if (subtexture_error_check2(ctx, 1, target, level, xoffset, 0, 0,
-				  width, 1, 1, format, type, texImage)) {
+      if (subtexture_error_check2(ctx, dims, target, level,
+                                  xoffset, yoffset, zoffset,
+				  width, height, depth,
+                                  format, type, texImage)) {
          /* error was recorded */
       }
-      else if (width > 0) {
-         /* If we have a border, xoffset=-1 is legal.  Bias by border width */
-         xoffset += texImage->Border;
+      else if (width > 0 && height > 0 && height > 0) {
+         /* If we have a border, offset=-1 is legal.  Bias by border width. */
+         switch (dims) {
+         case 3:
+            zoffset += texImage->Border;
+            /* fall-through */
+         case 2:
+            yoffset += texImage->Border;
+            /* fall-through */
+         case 1:
+            xoffset += texImage->Border;
+         }
 
-         ASSERT(ctx->Driver.TexSubImage1D);
-         ctx->Driver.TexSubImage1D(ctx, target, level, xoffset, width,
-                                   format, type, pixels, &ctx->Unpack,
-                                   texObj, texImage);
+         switch (dims) {
+         case 1:
+            ctx->Driver.TexSubImage1D(ctx, target, level,
+                                      xoffset, width,
+                                      format, type, pixels,
+                                      &ctx->Unpack, texObj, texImage );
+            break;
+         case 2:
+            ctx->Driver.TexSubImage2D(ctx, target, level,
+                                      xoffset, yoffset, width, height,
+                                      format, type, pixels,
+                                      &ctx->Unpack, texObj, texImage );
+            break;
+         case 3:
+            ctx->Driver.TexSubImage3D(ctx, target, level,
+                                      xoffset, yoffset, zoffset,
+                                      width, height, depth,
+                                      format, type, pixels,
+                                      &ctx->Unpack, texObj, texImage );
+            break;
+         default:
+            _mesa_problem(ctx, "unexpected dims in subteximage()");
+         }
 
          check_gen_mipmap(ctx, target, texObj, level);
 
@@ -2780,58 +2663,31 @@ _mesa_TexSubImage1D( GLenum target, GLint level,
 
 
 void GLAPIENTRY
+_mesa_TexSubImage1D( GLenum target, GLint level,
+                     GLint xoffset, GLsizei width,
+                     GLenum format, GLenum type,
+                     const GLvoid *pixels )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   texsubimage(ctx, 1, target, level,
+               xoffset, 0, 0,
+               width, 1, 1,
+               format, type, pixels);
+}
+
+
+void GLAPIENTRY
 _mesa_TexSubImage2D( GLenum target, GLint level,
                      GLint xoffset, GLint yoffset,
                      GLsizei width, GLsizei height,
                      GLenum format, GLenum type,
                      const GLvoid *pixels )
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexSubImage2D %s %d %d %d %d %d %s %s %p\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  xoffset, yoffset, width, height,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
-
-   if (ctx->NewState & _MESA_NEW_TRANSFER_STATE)
-      _mesa_update_state(ctx);
-
-   if (subtexture_error_check(ctx, 2, target, level, xoffset, yoffset, 0,
-			      width, height, 1, format, type)) {
-      return;   /* error was detected */
-   }
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-
-      if (subtexture_error_check2(ctx, 2, target, level, xoffset, yoffset, 0,
-				  width, height, 1, format, type, texImage)) {
-	 /* error was recorded */
-      }
-      else if (width > 0 && height >= 0) {
-         /* If we have a border, xoffset=-1 is legal.  Bias by border width */
-         xoffset += texImage->Border;
-         yoffset += texImage->Border;
-
-         ASSERT(ctx->Driver.TexSubImage2D);
-         ctx->Driver.TexSubImage2D(ctx, target, level, xoffset, yoffset,
-                                   width, height, format, type, pixels,
-                                   &ctx->Unpack, texObj, texImage);
-
-         check_gen_mipmap(ctx, target, texObj, level);
-
-         ctx->NewState |= _NEW_TEXTURE;
-      }
-   }
-   _mesa_unlock_texture(ctx, texObj);
+   texsubimage(ctx, 2, target, level,
+               xoffset, yoffset, 0,
+               width, height, 1,
+               format, type, pixels);
 }
 
 
@@ -2843,84 +2699,41 @@ _mesa_TexSubImage3D( GLenum target, GLint level,
                      GLenum format, GLenum type,
                      const GLvoid *pixels )
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glTexSubImage3D %s %d %d %d %d %d %d %d %s %s %p\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  xoffset, yoffset, zoffset, width, height, depth,
-                  _mesa_lookup_enum_by_nr(format),
-                  _mesa_lookup_enum_by_nr(type), pixels);
-
-   if (ctx->NewState & _MESA_NEW_TRANSFER_STATE)
-      _mesa_update_state(ctx);
-
-   if (subtexture_error_check(ctx, 3, target, level, xoffset, yoffset, zoffset,
-                              width, height, depth, format, type)) {
-      return;   /* error was detected */
-   }
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-
-      if (subtexture_error_check2(ctx, 3, target, level,
-                                  xoffset, yoffset, zoffset,
-				  width, height, depth,
-                                  format, type, texImage)) {
-         /* error was recorded */
-      }
-      else if (width > 0 && height > 0 && height > 0) {
-         /* If we have a border, xoffset=-1 is legal.  Bias by border width */
-         xoffset += texImage->Border;
-         yoffset += texImage->Border;
-         zoffset += texImage->Border;
-
-         ASSERT(ctx->Driver.TexSubImage3D);
-         ctx->Driver.TexSubImage3D(ctx, target, level,
-                                   xoffset, yoffset, zoffset,
-                                   width, height, depth,
-                                   format, type, pixels,
-                                   &ctx->Unpack, texObj, texImage );
-
-         check_gen_mipmap(ctx, target, texObj, level);
-
-         ctx->NewState |= _NEW_TEXTURE;
-      }
-   }
-   _mesa_unlock_texture(ctx, texObj);
+   texsubimage(ctx, 3, target, level,
+               xoffset, yoffset, zoffset,
+               width, height, depth,
+               format, type, pixels);
 }
 
 
 
-void GLAPIENTRY
-_mesa_CopyTexImage1D( GLenum target, GLint level,
-                      GLenum internalFormat,
-                      GLint x, GLint y,
-                      GLsizei width, GLint border )
+/**
+ * Implement the glCopyTexImage1/2D() functions.
+ */
+static void
+copyteximage(struct gl_context *ctx, GLuint dims,
+             GLenum target, GLint level, GLenum internalFormat,
+             GLint x, GLint y, GLsizei width, GLsizei height, GLint border )
 {
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
    const GLuint face = _mesa_tex_target_to_face(target);
-   GET_CURRENT_CONTEXT(ctx);
+
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCopyTexImage1D %s %d %s %d %d %d %d\n",
+      _mesa_debug(ctx, "glCopyTexImage%uD %s %d %s %d %d %d %d %d\n",
+                  dims,
                   _mesa_lookup_enum_by_nr(target), level,
                   _mesa_lookup_enum_by_nr(internalFormat),
-                  x, y, width, border);
+                  x, y, width, height, border);
 
    if (ctx->NewState & NEW_COPY_TEX_STATE)
       _mesa_update_state(ctx);
 
-   if (copytexture_error_check(ctx, 1, target, level, internalFormat,
-                               width, 1, border))
+   if (copytexture_error_check(ctx, dims, target, level, internalFormat,
+                               width, height, border))
       return;
 
    texObj = _mesa_get_current_tex_object(ctx, target);
@@ -2928,8 +2741,9 @@ _mesa_CopyTexImage1D( GLenum target, GLint level,
    _mesa_lock_texture(ctx, texObj);
    {
       texImage = _mesa_get_tex_image(ctx, texObj, target, level);
+
       if (!texImage) {
-	 _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyTexImage1D");
+	 _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyTexImage%uD", dims);
       }
       else {
          gl_format texFormat;
@@ -2944,20 +2758,29 @@ _mesa_CopyTexImage1D( GLenum target, GLint level,
                                                  internalFormat, GL_NONE,
                                                  GL_NONE);
 
-         _mesa_init_teximage_fields(ctx, target, texImage, width, 1, 1,
-                                    border, internalFormat, texFormat);
+         if (legal_texture_size(ctx, texFormat, width, height, 1)) {
+            _mesa_init_teximage_fields(ctx, target, texImage, width, height, 1,
+                                       border, internalFormat, texFormat);
 
-         ASSERT(ctx->Driver.CopyTexImage1D);
-         ctx->Driver.CopyTexImage1D(ctx, target, level, internalFormat,
-                                    x, y, width, border);
+            ASSERT(ctx->Driver.CopyTexImage2D);
+            if (dims == 1)
+               ctx->Driver.CopyTexImage1D(ctx, target, level, internalFormat,
+                                          x, y, width, border);
+            else
+               ctx->Driver.CopyTexImage2D(ctx, target, level, internalFormat,
+                                          x, y, width, height, border);
 
-         check_gen_mipmap(ctx, target, texObj, level);
+            check_gen_mipmap(ctx, target, texObj, level);
 
-         update_fbo_texture(ctx, texObj, face, level);
+            update_fbo_texture(ctx, texObj, face, level);
 
-         /* state update */
-         texObj->_Complete = GL_FALSE;
-         ctx->NewState |= _NEW_TEXTURE;
+            /* state update */
+            texObj->_Complete = GL_FALSE;
+            ctx->NewState |= _NEW_TEXTURE;
+         }
+         else {
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyTexImage%uD", dims);
+         }
       }
    }
    _mesa_unlock_texture(ctx, texObj);
@@ -2966,92 +2789,52 @@ _mesa_CopyTexImage1D( GLenum target, GLint level,
 
 
 void GLAPIENTRY
-_mesa_CopyTexImage2D( GLenum target, GLint level, GLenum internalFormat,
-                      GLint x, GLint y, GLsizei width, GLsizei height,
-                      GLint border )
+_mesa_CopyTexImage1D( GLenum target, GLint level,
+                      GLenum internalFormat,
+                      GLint x, GLint y,
+                      GLsizei width, GLint border )
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
-   const GLuint face = _mesa_tex_target_to_face(target);
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCopyTexImage2D %s %d %s %d %d %d %d %d\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat),
-                  x, y, width, height, border);
-
-   if (ctx->NewState & NEW_COPY_TEX_STATE)
-      _mesa_update_state(ctx);
-
-   if (copytexture_error_check(ctx, 2, target, level, internalFormat,
-                               width, height, border))
-      return;
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      texImage = _mesa_get_tex_image(ctx, texObj, target, level);
-
-      if (!texImage) {
-	 _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCopyTexImage2D");
-      }
-      else {
-         gl_format texFormat;
-
-         if (texImage->Data) {
-            ctx->Driver.FreeTexImageData( ctx, texImage );
-         }
-
-         ASSERT(texImage->Data == NULL);
-
-         texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                 internalFormat, GL_NONE,
-                                                 GL_NONE);
-
-         _mesa_init_teximage_fields(ctx, target, texImage, width, height, 1,
-                                    border, internalFormat, texFormat);
+   copyteximage(ctx, 1, target, level, internalFormat, x, y, width, 1, border);
+}
 
-         ASSERT(ctx->Driver.CopyTexImage2D);
-         ctx->Driver.CopyTexImage2D(ctx, target, level, internalFormat,
-                                    x, y, width, height, border);
 
-         check_gen_mipmap(ctx, target, texObj, level);
 
-         update_fbo_texture(ctx, texObj, face, level);
-
-         /* state update */
-         texObj->_Complete = GL_FALSE;
-         ctx->NewState |= _NEW_TEXTURE;
-      }
-   }
-   _mesa_unlock_texture(ctx, texObj);
+void GLAPIENTRY
+_mesa_CopyTexImage2D( GLenum target, GLint level, GLenum internalFormat,
+                      GLint x, GLint y, GLsizei width, GLsizei height,
+                      GLint border )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   copyteximage(ctx, 2, target, level, internalFormat,
+                x, y, width, height, border);
 }
 
 
-void GLAPIENTRY
-_mesa_CopyTexSubImage1D( GLenum target, GLint level,
-                         GLint xoffset, GLint x, GLint y, GLsizei width )
+
+/**
+ * Implementation for glCopyTexSubImage1/2/3D() functions.
+ */
+static void
+copytexsubimage(struct gl_context *ctx, GLuint dims, GLenum target, GLint level,
+                GLint xoffset, GLint yoffset, GLint zoffset,
+                GLint x, GLint y, GLsizei width, GLsizei height)
 {
    struct gl_texture_object *texObj;
    struct gl_texture_image *texImage;
-   GLint yoffset = 0;
-   GLsizei height = 1;
 
-   GET_CURRENT_CONTEXT(ctx);
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCopyTexSubImage1D %s %d %d %d %d %d\n",
+      _mesa_debug(ctx, "glCopyTexSubImage%uD %s %d %d %d %d %d %d %d %d\n",
+                  dims,
                   _mesa_lookup_enum_by_nr(target),
-                  level, xoffset, x, y, width);
+                  level, xoffset, yoffset, zoffset, x, y, width, height);
 
    if (ctx->NewState & NEW_COPY_TEX_STATE)
       _mesa_update_state(ctx);
 
-   if (copytexsubimage_error_check1(ctx, 1, target, level))
+   if (copytexsubimage_error_check1(ctx, dims, target, level))
       return;
 
    texObj = _mesa_get_current_tex_object(ctx, target);
@@ -3060,19 +2843,43 @@ _mesa_CopyTexSubImage1D( GLenum target, GLint level,
    {
       texImage = _mesa_select_tex_image(ctx, texObj, target, level);
 
-      if (copytexsubimage_error_check2(ctx, 1, target, level,
-				       xoffset, 0, 0, width, 1, texImage)) {
-         /* error was recorded */
+      if (copytexsubimage_error_check2(ctx, dims, target, level, xoffset, yoffset,
+				       zoffset, width, height, texImage)) {
+         /* error was recored */
       }
       else {
-         /* If we have a border, xoffset=-1 is legal.  Bias by border width */
-         xoffset += texImage->Border;
+         /* If we have a border, offset=-1 is legal.  Bias by border width. */
+         switch (dims) {
+         case 3:
+            zoffset += texImage->Border;
+            /* fall-through */
+         case 2:
+            yoffset += texImage->Border;
+            /* fall-through */
+         case 1:
+            xoffset += texImage->Border;
+         }
 
          if (_mesa_clip_copytexsubimage(ctx, &xoffset, &yoffset, &x, &y,
                                         &width, &height)) {
-            ASSERT(ctx->Driver.CopyTexSubImage1D);
-            ctx->Driver.CopyTexSubImage1D(ctx, target, level,
-                                          xoffset, x, y, width);
+            switch (dims) {
+            case 1:
+               ctx->Driver.CopyTexSubImage1D(ctx, target, level,
+                                             xoffset, x, y, width);
+               break;
+            case 2:
+               ctx->Driver.CopyTexSubImage2D(ctx, target, level,
+                                             xoffset, yoffset,
+                                             x, y, width, height);
+               break;
+            case 3:
+               ctx->Driver.CopyTexSubImage3D(ctx, target, level,
+                                             xoffset, yoffset, zoffset,
+                                             x, y, width, height);
+               break;
+            default:
+               _mesa_problem(ctx, "bad dims in copytexsubimage()");
+            }
 
             check_gen_mipmap(ctx, target, texObj, level);
 
@@ -3084,57 +2891,24 @@ _mesa_CopyTexSubImage1D( GLenum target, GLint level,
 }
 
 
+void GLAPIENTRY
+_mesa_CopyTexSubImage1D( GLenum target, GLint level,
+                         GLint xoffset, GLint x, GLint y, GLsizei width )
+{
+   GET_CURRENT_CONTEXT(ctx);
+   copytexsubimage(ctx, 1, target, level, xoffset, 0, 0, x, y, width, 1);
+}
+
+
 
 void GLAPIENTRY
 _mesa_CopyTexSubImage2D( GLenum target, GLint level,
                          GLint xoffset, GLint yoffset,
                          GLint x, GLint y, GLsizei width, GLsizei height )
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCopyTexSubImage2D %s %d %d %d %d %d %d %d\n",
-                  _mesa_lookup_enum_by_nr(target),
-                  level, xoffset, yoffset, x, y, width, height);
-
-   if (ctx->NewState & NEW_COPY_TEX_STATE)
-      _mesa_update_state(ctx);
-
-   if (copytexsubimage_error_check1(ctx, 2, target, level))
-      return;
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-
-      if (copytexsubimage_error_check2(ctx, 2, target, level,
-                                       xoffset, yoffset, 0,
-				       width, height, texImage)) {
-         /* error was recorded */
-      }
-      else {
-         /* If we have a border, xoffset=-1 is legal.  Bias by border width */
-         xoffset += texImage->Border;
-         yoffset += texImage->Border;
-
-         if (_mesa_clip_copytexsubimage(ctx, &xoffset, &yoffset, &x, &y,
-                                        &width, &height)) {
-            ASSERT(ctx->Driver.CopyTexSubImage2D);
-            ctx->Driver.CopyTexSubImage2D(ctx, target, level, xoffset, yoffset,
-                                          x, y, width, height);
-
-            check_gen_mipmap(ctx, target, texObj, level);
-
-            ctx->NewState |= _NEW_TEXTURE;
-         }
-      }
-   }
-   _mesa_unlock_texture(ctx, texObj);
+   copytexsubimage(ctx, 2, target, level, xoffset, yoffset, 0, x, y,
+                   width, height);
 }
 
 
@@ -3144,52 +2918,9 @@ _mesa_CopyTexSubImage3D( GLenum target, GLint level,
                          GLint xoffset, GLint yoffset, GLint zoffset,
                          GLint x, GLint y, GLsizei width, GLsizei height )
 {
-   struct gl_texture_object *texObj;
-   struct gl_texture_image *texImage;
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCopyTexSubImage3D %s %d %d %d %d %d %d %d %d\n",
-                  _mesa_lookup_enum_by_nr(target),
-                  level, xoffset, yoffset, zoffset, x, y, width, height);
-
-   if (ctx->NewState & NEW_COPY_TEX_STATE)
-      _mesa_update_state(ctx);
-
-   if (copytexsubimage_error_check1(ctx, 3, target, level))
-      return;
-
-   texObj = _mesa_get_current_tex_object(ctx, target);
-
-   _mesa_lock_texture(ctx, texObj);
-   {
-      texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-
-      if (copytexsubimage_error_check2(ctx, 3, target, level, xoffset, yoffset,
-				       zoffset, width, height, texImage)) {
-         /* error was recored */
-      }
-      else {
-         /* If we have a border, xoffset=-1 is legal.  Bias by border width */
-         xoffset += texImage->Border;
-         yoffset += texImage->Border;
-         zoffset += texImage->Border;
-
-         if (_mesa_clip_copytexsubimage(ctx, &xoffset, &yoffset, &x, &y,
-                                        &width, &height)) {
-            ASSERT(ctx->Driver.CopyTexSubImage3D);
-            ctx->Driver.CopyTexSubImage3D(ctx, target, level,
-                                          xoffset, yoffset, zoffset,
-                                          x, y, width, height);
-
-            check_gen_mipmap(ctx, target, texObj, level);
-
-            ctx->NewState |= _NEW_TEXTURE;
-         }
-      }
-   }
-   _mesa_unlock_texture(ctx, texObj);
+   copytexsubimage(ctx, 3, target, level, xoffset, yoffset, zoffset,
+                   x, y, width, height);
 }
 
 
@@ -3234,45 +2965,18 @@ compressed_texture_error_check(struct gl_context *ctx, GLint dimensions,
                                GLsizei height, GLsizei depth, GLint border,
                                GLsizei imageSize)
 {
-   GLint expectedSize, maxLevels = 0, maxTextureSize;
+   const GLenum proxyTarget = get_proxy_target(target);
+   const GLint maxLevels = _mesa_max_texture_levels(ctx, target);
+   GLint expectedSize;
 
-   if (dimensions == 1) {
-      /* 1D compressed textures not allowed */
-      return GL_INVALID_ENUM;
-   }
-   else if (dimensions == 2) {
-      if (target == GL_PROXY_TEXTURE_2D) {
-         maxLevels = ctx->Const.MaxTextureLevels;
-      }
-      else if (target == GL_TEXTURE_2D) {
-         maxLevels = ctx->Const.MaxTextureLevels;
-      }
-      else if (target == GL_PROXY_TEXTURE_CUBE_MAP_ARB) {
-         if (!ctx->Extensions.ARB_texture_cube_map)
-            return GL_INVALID_ENUM; /*target*/
-         maxLevels = ctx->Const.MaxCubeTextureLevels;
-      }
-      else if (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-               target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB) {
-         if (!ctx->Extensions.ARB_texture_cube_map)
-            return GL_INVALID_ENUM; /*target*/
-         maxLevels = ctx->Const.MaxCubeTextureLevels;
-      }
-      else {
-         return GL_INVALID_ENUM; /*target*/
-      }
-   }
-   else if (dimensions == 3) {
-      /* 3D compressed textures not allowed */
-      return GL_INVALID_ENUM;
-   }
-   else {
-      assert(0);
+   /* check level */
+   if (level < 0 || level >= maxLevels)
+      return GL_INVALID_VALUE;
+
+   if (!target_can_be_compressed(ctx, target, internalFormat)) {
       return GL_INVALID_ENUM;
    }
 
-   maxTextureSize = 1 << (maxLevels - 1);
-
    /* This will detect any invalid internalFormat value */
    if (!_mesa_is_compressed_format(ctx, internalFormat))
       return GL_INVALID_ENUM;
@@ -3281,47 +2985,51 @@ compressed_texture_error_check(struct gl_context *ctx, GLint dimensions,
    if (_mesa_base_tex_format(ctx, internalFormat) < 0)
       return GL_INVALID_ENUM;
 
+   /* No compressed formats support borders at this time */
    if (border != 0)
       return GL_INVALID_VALUE;
 
-   /*
-    * XXX We should probably use the proxy texture error check function here.
-    */
-   if (width < 1 || width > maxTextureSize ||
-       (!ctx->Extensions.ARB_texture_non_power_of_two && !_mesa_is_pow_two(width)))
-      return GL_INVALID_VALUE;
-
-   if ((height < 1 || height > maxTextureSize ||
-       (!ctx->Extensions.ARB_texture_non_power_of_two && !_mesa_is_pow_two(height)))
-       && dimensions > 1)
-      return GL_INVALID_VALUE;
-
-   if ((depth < 1 || depth > maxTextureSize ||
-       (!ctx->Extensions.ARB_texture_non_power_of_two && !_mesa_is_pow_two(depth)))
-       && dimensions > 2)
-      return GL_INVALID_VALUE;
-
    /* For cube map, width must equal height */
    if (target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB && width != height)
       return GL_INVALID_VALUE;
 
-   if (level < 0 || level >= maxLevels)
-      return GL_INVALID_VALUE;
+   /* check image size against compression block size */
+   {
+      gl_format texFormat =
+         ctx->Driver.ChooseTextureFormat(ctx, internalFormat,
+                                         GL_NONE, GL_NONE);
+      GLuint bw, bh;
 
-   expectedSize = compressed_tex_size(width, height, depth, internalFormat);
-   if (expectedSize != imageSize)
-      return GL_INVALID_VALUE;
+      _mesa_get_format_block_size(texFormat, &bw, &bh);
+      if ((width > bw && width % bw > 0) ||
+          (height > bh && height % bh > 0)) {
+         /*
+          * Per GL_ARB_texture_compression:  GL_INVALID_OPERATION is
+          * generated [...] if any parameter combinations are not
+          * supported by the specific compressed internal format. 
+          */
+         return GL_INVALID_OPERATION;
+      }
+   }
 
-#if FEATURE_EXT_texture_sRGB
-   if ((internalFormat == GL_COMPRESSED_SRGB_S3TC_DXT1_EXT ||
-        internalFormat == GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT ||
-        internalFormat == GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT ||
-        internalFormat == GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT)
-       && border != 0) {
+   /* check image sizes */
+   if (!ctx->Driver.TestProxyTexImage(ctx, proxyTarget, level,
+                                      internalFormat, GL_NONE, GL_NONE,
+                                      width, height, depth, border)) {
+      /* See error comment above */
       return GL_INVALID_OPERATION;
    }
-#endif
+
+   /* check image size in bytes */
+   expectedSize = compressed_tex_size(width, height, depth, internalFormat);
+   if (expectedSize != imageSize) {
+      /* Per GL_ARB_texture_compression:  GL_INVALID_VALUE is generated [...]
+       * if <imageSize> is not consistent with the format, dimensions, and
+       * contents of the specified image.
+       */
+      return GL_INVALID_VALUE;
+   }
 
    return GL_NO_ERROR;
 }
@@ -3463,159 +3171,101 @@ compressed_subtexture_error_check2(struct gl_context *ctx, GLuint dims,
 }
 
 
-
-void GLAPIENTRY
-_mesa_CompressedTexImage1DARB(GLenum target, GLint level,
-                              GLenum internalFormat, GLsizei width,
-                              GLint border, GLsizei imageSize,
-                              const GLvoid *data)
+/**
+ * Implementation of the glCompressedTexImage1/2/3D() functions.
+ */
+static void
+compressedteximage(struct gl_context *ctx, GLuint dims,
+                   GLenum target, GLint level,
+                   GLenum internalFormat, GLsizei width,
+                   GLsizei height, GLsizei depth, GLint border,
+                   GLsizei imageSize, const GLvoid *data)
 {
-   GET_CURRENT_CONTEXT(ctx);
+   GLenum error;
+
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
 
    if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCompressedTexImage1DARB %s %d %s %d %d %d %p\n",
+      _mesa_debug(ctx,
+                  "glCompressedTexImage%uDARB %s %d %s %d %d %d %d %d %p\n",
+                  dims,
                   _mesa_lookup_enum_by_nr(target), level,
                   _mesa_lookup_enum_by_nr(internalFormat),
-                  width, border, imageSize, data);
-
-   if (target == GL_TEXTURE_1D) {
-      /* non-proxy target */
-      struct gl_texture_object *texObj;
-      struct gl_texture_image *texImage;
-      GLenum error = compressed_texture_error_check(ctx, 1, target, level,
-                               internalFormat, width, 1, 1, border, imageSize);
-      if (error) {
-         _mesa_error(ctx, error, "glCompressedTexImage1D");
-         return;
-      }
-
-      texObj = _mesa_get_current_tex_object(ctx, target);
-
-      _mesa_lock_texture(ctx, texObj);
-      {
-	 texImage = _mesa_get_tex_image(ctx, texObj, target, level);
-	 if (!texImage) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage1D");
-	 }
-         else {
-            gl_format texFormat;
-
-            if (texImage->Data) {
-               ctx->Driver.FreeTexImageData( ctx, texImage );
-            }
-            ASSERT(texImage->Data == NULL);
-
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, GL_NONE,
-                                                    GL_NONE);
+                  width, height, depth, border, imageSize, data);
 
-            _mesa_init_teximage_fields(ctx, target, texImage, width, 1, 1,
-                                       border, internalFormat, texFormat);
+   /* check target */
+   if (!legal_teximage_target(ctx, dims, target)) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "glCompressedTexImage%uD(target=%s)",
+                  dims, _mesa_lookup_enum_by_nr(target));
+      return;
+   }
 
-            ASSERT(ctx->Driver.CompressedTexImage1D);
-            ctx->Driver.CompressedTexImage1D(ctx, target, level,
-                                             internalFormat, width, border,
-                                             imageSize, data,
-                                             texObj, texImage);
+   error = compressed_texture_error_check(ctx, dims, target, level,
+                                          internalFormat, width, height, depth,
+                                          border, imageSize);
 
-            check_gen_mipmap(ctx, target, texObj, level);
+#if FEATURE_ES
+   /* XXX this is kind of a hack */
+   if (error) {
+      _mesa_error(ctx, error, "glTexImage2D");
+      return;
+   }
 
-            /* state update */
-            texObj->_Complete = GL_FALSE;
-            ctx->NewState |= _NEW_TEXTURE;
-         }
+   if (dims == 2) {
+      switch (internalFormat) {
+      case GL_PALETTE4_RGB8_OES:
+      case GL_PALETTE4_RGBA8_OES:
+      case GL_PALETTE4_R5_G6_B5_OES:
+      case GL_PALETTE4_RGBA4_OES:
+      case GL_PALETTE4_RGB5_A1_OES:
+      case GL_PALETTE8_RGB8_OES:
+      case GL_PALETTE8_RGBA8_OES:
+      case GL_PALETTE8_R5_G6_B5_OES:
+      case GL_PALETTE8_RGBA4_OES:
+      case GL_PALETTE8_RGB5_A1_OES:
+         _mesa_cpal_compressed_teximage2d(target, level, internalFormat,
+                                          width, height, imageSize, data);
+         return;
       }
-      _mesa_unlock_texture(ctx, texObj);
    }
-   else if (target == GL_PROXY_TEXTURE_1D) {
-      /* Proxy texture: check for errors and update proxy state */
-      GLenum error = compressed_texture_error_check(ctx, 1, target, level,
-                               internalFormat, width, 1, 1, border, imageSize);
+#endif
+
+   if (_mesa_is_proxy_texture(target)) {
+      /* Proxy texture: just check for errors and update proxy state */
+      struct gl_texture_image *texImage;
+
       if (!error) {
-         ASSERT(ctx->Driver.TestProxyTexImage);
-         error = !(*ctx->Driver.TestProxyTexImage)(ctx, target, level,
-                                             internalFormat, GL_NONE, GL_NONE,
-                                             width, 1, 1, border);
-      }
-      if (error) {
-         /* if error, clear all proxy texture image parameters */
-         struct gl_texture_image *texImage;
-         texImage = _mesa_get_proxy_tex_image(ctx, target, level);
-         if (texImage)
-            clear_teximage_fields(texImage);
+         struct gl_texture_object *texObj =
+            _mesa_get_current_tex_object(ctx, target);
+         gl_format texFormat =
+            _mesa_choose_texture_format(ctx, texObj, target, level,
+                                        internalFormat, GL_NONE, GL_NONE);
+         if (!legal_texture_size(ctx, texFormat, width, height, depth)) {
+            error = GL_OUT_OF_MEMORY;
+         }
       }
-      else {
-         /* store the teximage parameters */
-         struct gl_texture_object *texObj;
-         struct gl_texture_image *texImage;
-         gl_format texFormat;
 
-         texObj = _mesa_get_current_tex_object(ctx, target);
-
-	 _mesa_lock_texture(ctx, texObj);
-	 {
-	    texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, GL_NONE,
-                                                    GL_NONE);
-	    _mesa_init_teximage_fields(ctx, target, texImage, width, 1, 1,
-				       border, internalFormat, texFormat);
-	 }
-	 _mesa_unlock_texture(ctx, texObj);
+      texImage = _mesa_get_proxy_tex_image(ctx, target, level);
+      if (texImage) {
+         if (error) {
+            /* if error, clear all proxy texture image parameters */
+            clear_teximage_fields(texImage);
+         }
+         else {
+            /* no error: store the teximage parameters */
+            _mesa_init_teximage_fields(ctx, target, texImage, width, height,
+                                       depth, border, internalFormat,
+                                       MESA_FORMAT_NONE);
+         }
       }
    }
    else {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glCompressedTexImage1D(target)");
-      return;
-   }
-}
-
-void GLAPIENTRY
-_mesa_CompressedTexImage2DARB(GLenum target, GLint level,
-                              GLenum internalFormat, GLsizei width,
-                              GLsizei height, GLint border, GLsizei imageSize,
-                              const GLvoid *data)
-{
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCompressedTexImage2DARB %s %d %s %d %d %d %d %p\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat),
-                  width, height, border, imageSize, data);
-
-#if FEATURE_ES
-   switch (internalFormat) {
-   case GL_PALETTE4_RGB8_OES:
-   case GL_PALETTE4_RGBA8_OES:
-   case GL_PALETTE4_R5_G6_B5_OES:
-   case GL_PALETTE4_RGBA4_OES:
-   case GL_PALETTE4_RGB5_A1_OES:
-   case GL_PALETTE8_RGB8_OES:
-   case GL_PALETTE8_RGBA8_OES:
-   case GL_PALETTE8_R5_G6_B5_OES:
-   case GL_PALETTE8_RGBA4_OES:
-   case GL_PALETTE8_RGB5_A1_OES:
-      _mesa_cpal_compressed_teximage2d(target, level, internalFormat,
-				       width, height, imageSize, data);
-      return;
-   }
-#endif
-
-   if (target == GL_TEXTURE_2D ||
-       (ctx->Extensions.ARB_texture_cube_map &&
-        target >= GL_TEXTURE_CUBE_MAP_POSITIVE_X_ARB &&
-        target <= GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB)) {
       /* non-proxy target */
       struct gl_texture_object *texObj;
       struct gl_texture_image *texImage;
 
-      GLenum error = compressed_texture_error_check(ctx, 2, target, level,
-                          internalFormat, width, height, 1, border, imageSize);
       if (error) {
-         _mesa_error(ctx, error, "glCompressedTexImage2D");
+         _mesa_error(ctx, error, "glCompressedTexImage%uD", dims);
          return;
       }
 
@@ -3625,7 +3275,8 @@ _mesa_CompressedTexImage2DARB(GLenum target, GLint level,
       {
 	 texImage = _mesa_get_tex_image(ctx, texObj, target, level);
 	 if (!texImage) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
+	    _mesa_error(ctx, GL_OUT_OF_MEMORY,
+                        "glCompressedTexImage%uD", dims);
 	 }
          else {
             gl_format texFormat;
@@ -3639,67 +3290,78 @@ _mesa_CompressedTexImage2DARB(GLenum target, GLint level,
                                                     internalFormat, GL_NONE,
                                                     GL_NONE);
 
-            _mesa_init_teximage_fields(ctx, target, texImage, width, height, 1,
-                                       border, internalFormat, texFormat);
-
-            ASSERT(ctx->Driver.CompressedTexImage2D);
-            ctx->Driver.CompressedTexImage2D(ctx, target, level,
-                                             internalFormat, width, height,
-                                             border, imageSize, data,
-                                             texObj, texImage);
+            if (legal_texture_size(ctx, texFormat, width, height, depth)) {
+               _mesa_init_teximage_fields(ctx, target, texImage,
+                                          width, height, depth,
+                                          border, internalFormat, texFormat);
+
+               switch (dims) {
+               case 1:
+                  ASSERT(ctx->Driver.CompressedTexImage1D);
+                  ctx->Driver.CompressedTexImage1D(ctx, target, level,
+                                                   internalFormat,
+                                                   width,
+                                                   border, imageSize, data,
+                                                   texObj, texImage);
+                  break;
+               case 2:
+                  ASSERT(ctx->Driver.CompressedTexImage2D);
+                  ctx->Driver.CompressedTexImage2D(ctx, target, level,
+                                                   internalFormat,
+                                                   width, height,
+                                                   border, imageSize, data,
+                                                   texObj, texImage);
+                  break;
+               case 3:
+                  ASSERT(ctx->Driver.CompressedTexImage3D);
+                  ctx->Driver.CompressedTexImage3D(ctx, target, level,
+                                                   internalFormat,
+                                                   width, height, depth,
+                                                   border, imageSize, data,
+                                                   texObj, texImage);
+                  break;
+               default:
+                  _mesa_problem(ctx, "bad dims in compressedteximage");
+               }
 
-            check_gen_mipmap(ctx, target, texObj, level);
+               check_gen_mipmap(ctx, target, texObj, level);
 
-            /* state update */
-            texObj->_Complete = GL_FALSE;
-            ctx->NewState |= _NEW_TEXTURE;
+               /* state update */
+               texObj->_Complete = GL_FALSE;
+               ctx->NewState |= _NEW_TEXTURE;
+            }
+            else {
+               _mesa_error(ctx, GL_OUT_OF_MEMORY,
+                           "glCompressedTexImage%uD", dims);
+            }
          }
       }
       _mesa_unlock_texture(ctx, texObj);
    }
-   else if (target == GL_PROXY_TEXTURE_2D ||
-            (target == GL_PROXY_TEXTURE_CUBE_MAP_ARB &&
-             ctx->Extensions.ARB_texture_cube_map)) {
-      /* Proxy texture: check for errors and update proxy state */
-      GLenum error = compressed_texture_error_check(ctx, 2, target, level,
-                          internalFormat, width, height, 1, border, imageSize);
-      if (!error) {
-         ASSERT(ctx->Driver.TestProxyTexImage);
-         error = !(*ctx->Driver.TestProxyTexImage)(ctx, target, level,
-                                              internalFormat, GL_NONE, GL_NONE,
-                                              width, height, 1, border);
-      }
-      if (error) {
-         /* if error, clear all proxy texture image parameters */
-         struct gl_texture_image *texImage;
-         texImage = _mesa_get_proxy_tex_image(ctx, target, level);
-         if (texImage)
-            clear_teximage_fields(texImage);
-      }
-      else {
-         /* store the teximage parameters */
-         struct gl_texture_object *texObj;
-         struct gl_texture_image *texImage;
-         gl_format texFormat;
+}
 
-         texObj = _mesa_get_current_tex_object(ctx, target);
 
-	 _mesa_lock_texture(ctx, texObj);
-	 {
-	    texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, GL_NONE,
-                                                    GL_NONE);
-	    _mesa_init_teximage_fields(ctx, target, texImage, width, height, 1,
-				       border, internalFormat, texFormat);
-	 }
-	 _mesa_unlock_texture(ctx, texObj);
-      }
-   }
-   else {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glCompressedTexImage2D(target)");
-      return;
-   }
+void GLAPIENTRY
+_mesa_CompressedTexImage1DARB(GLenum target, GLint level,
+                              GLenum internalFormat, GLsizei width,
+                              GLint border, GLsizei imageSize,
+                              const GLvoid *data)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   compressedteximage(ctx, 1, target, level, internalFormat,
+                      width, 1, 1, border, imageSize, data);
+}
+
+
+void GLAPIENTRY
+_mesa_CompressedTexImage2DARB(GLenum target, GLint level,
+                              GLenum internalFormat, GLsizei width,
+                              GLsizei height, GLint border, GLsizei imageSize,
+                              const GLvoid *data)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   compressedteximage(ctx, 2, target, level, internalFormat,
+                      width, height, 1, border, imageSize, data);
 }
 
 
@@ -3710,107 +3372,8 @@ _mesa_CompressedTexImage3DARB(GLenum target, GLint level,
                               GLsizei imageSize, const GLvoid *data)
 {
    GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_TEXTURE))
-      _mesa_debug(ctx, "glCompressedTexImage3DARB %s %d %s %d %d %d %d %d %p\n",
-                  _mesa_lookup_enum_by_nr(target), level,
-                  _mesa_lookup_enum_by_nr(internalFormat),
-                  width, height, depth, border, imageSize, data);
-
-   if (target == GL_TEXTURE_3D) {
-      /* non-proxy target */
-      struct gl_texture_object *texObj;
-      struct gl_texture_image *texImage;
-      GLenum error = compressed_texture_error_check(ctx, 3, target, level,
-                      internalFormat, width, height, depth, border, imageSize);
-      if (error) {
-         _mesa_error(ctx, error, "glCompressedTexImage3D");
-         return;
-      }
-
-      texObj = _mesa_get_current_tex_object(ctx, target);
-
-      _mesa_lock_texture(ctx, texObj);
-      {
-	 texImage = _mesa_get_tex_image(ctx, texObj, target, level);
-	 if (!texImage) {
-	    _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage3D");
-	 }
-         else {
-            gl_format texFormat;
-
-            if (texImage->Data) {
-               ctx->Driver.FreeTexImageData( ctx, texImage );
-            }
-            ASSERT(texImage->Data == NULL);
-
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, GL_NONE,
-                                                    GL_NONE);
-
-            _mesa_init_teximage_fields(ctx, target, texImage,
-                                       width, height, depth,
-                                       border, internalFormat, texFormat);
-
-            ASSERT(ctx->Driver.CompressedTexImage3D);
-            ctx->Driver.CompressedTexImage3D(ctx, target, level,
-                                             internalFormat,
-                                             width, height, depth,
-                                             border, imageSize, data,
-                                             texObj, texImage);
-
-            check_gen_mipmap(ctx, target, texObj, level);
-
-            /* state update */
-            texObj->_Complete = GL_FALSE;
-            ctx->NewState |= _NEW_TEXTURE;
-         }
-      }
-      _mesa_unlock_texture(ctx, texObj);
-   }
-   else if (target == GL_PROXY_TEXTURE_3D) {
-      /* Proxy texture: check for errors and update proxy state */
-      GLenum error = compressed_texture_error_check(ctx, 3, target, level,
-                      internalFormat, width, height, depth, border, imageSize);
-      if (!error) {
-         ASSERT(ctx->Driver.TestProxyTexImage);
-         error = !(*ctx->Driver.TestProxyTexImage)(ctx, target, level,
-                                             internalFormat, GL_NONE, GL_NONE,
-                                             width, height, depth, border);
-      }
-      if (error) {
-         /* if error, clear all proxy texture image parameters */
-         struct gl_texture_image *texImage;
-         texImage = _mesa_get_proxy_tex_image(ctx, target, level);
-         if (texImage)
-            clear_teximage_fields(texImage);
-      }
-      else {
-         /* store the teximage parameters */
-         struct gl_texture_object *texObj;
-         struct gl_texture_image *texImage;
-         gl_format texFormat;
-
-         texObj = _mesa_get_current_tex_object(ctx, target);
-
-	 _mesa_lock_texture(ctx, texObj);
-	 {
-	    texImage = _mesa_select_tex_image(ctx, texObj, target, level);
-            texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
-                                                    internalFormat, GL_NONE,
-                                                    GL_NONE);
-	    _mesa_init_teximage_fields(ctx, target, texImage, width, height,
-				       depth, border, internalFormat,
-                                       texFormat);
-	 }
-	 _mesa_unlock_texture(ctx, texObj);
-      }
-   }
-   else {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glCompressedTexImage3D(target)");
-      return;
-   }
+   compressedteximage(ctx, 3, target, level, internalFormat,
+                      width, height, depth, border, imageSize, data);
 }
 
 
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index 72dbf10cc4b..5bc5639dbf7 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -372,15 +372,12 @@ _mesa_reference_texobj(struct gl_texture_object **ptr,
 
 
 /**
- * Report why a texture object is incomplete.  
- *
- * \param t texture object.
- * \param why string describing why it's incomplete.
- *
- * \note For debug purposes only.
+ * Mark a texture object as incomplete.
+ * \param t  texture object
+ * \param fmt...  string describing why it's incomplete (for debugging).
  */
 static void
-incomplete(const struct gl_texture_object *t, const char *fmt, ...)
+incomplete(struct gl_texture_object *t, const char *fmt, ...)
 {
 #if 0
    va_list args;
@@ -392,6 +389,7 @@ incomplete(const struct gl_texture_object *t, const char *fmt, ...)
 
    printf("Texture Obj %d incomplete because: %s\n", t->Name, s);
 #endif
+   t->_Complete = GL_FALSE;
 }
 
 
@@ -421,14 +419,12 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
     */
    if ((baseLevel < 0) || (baseLevel >= MAX_TEXTURE_LEVELS)) {
       incomplete(t, "base level = %d is invalid", baseLevel);
-      t->_Complete = GL_FALSE;
       return;
    }
 
    /* Always need the base level image */
    if (!t->Image[0][baseLevel]) {
       incomplete(t, "Image[baseLevel=%d] == NULL", baseLevel);
-      t->_Complete = GL_FALSE;
       return;
    }
 
@@ -437,7 +433,6 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
        t->Image[0][baseLevel]->Height == 0 ||
        t->Image[0][baseLevel]->Depth == 0) {
       incomplete(t, "texture width = 0");
-      t->_Complete = GL_FALSE;
       return;
    }
 
@@ -491,7 +486,6 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
          if (t->Image[face][baseLevel] == NULL ||
              t->Image[face][baseLevel]->Width2 != w ||
              t->Image[face][baseLevel]->Height2 != h) {
-            t->_Complete = GL_FALSE;
             incomplete(t, "Cube face missing or mismatched size");
             return;
          }
@@ -508,7 +502,6 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
       GLint maxLevel = t->_MaxLevel;
 
       if (minLevel > maxLevel) {
-         t->_Complete = GL_FALSE;
          incomplete(t, "minLevel > maxLevel");
          return;
       }
@@ -517,12 +510,10 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
       for (i = minLevel; i <= maxLevel; i++) {
          if (t->Image[0][i]) {
             if (t->Image[0][i]->TexFormat != t->Image[0][baseLevel]->TexFormat) {
-               t->_Complete = GL_FALSE;
                incomplete(t, "Format[i] != Format[baseLevel]");
                return;
             }
             if (t->Image[0][i]->Border != t->Image[0][baseLevel]->Border) {
-               t->_Complete = GL_FALSE;
                incomplete(t, "Border[i] != Border[baseLevel]");
                return;
             }
@@ -540,12 +531,10 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
             }
             if (i >= minLevel && i <= maxLevel) {
                if (!t->Image[0][i]) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "1D Image[0][i] == NULL");
                   return;
                }
                if (t->Image[0][i]->Width2 != width ) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "1D Image[0][i] bad width");
                   return;
                }
@@ -569,17 +558,14 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
             }
             if (i >= minLevel && i <= maxLevel) {
                if (!t->Image[0][i]) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "2D Image[0][i] == NULL");
                   return;
                }
                if (t->Image[0][i]->Width2 != width) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "2D Image[0][i] bad width");
                   return;
                }
                if (t->Image[0][i]->Height2 != height) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "2D Image[0][i] bad height");
                   return;
                }
@@ -607,26 +593,21 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
             if (i >= minLevel && i <= maxLevel) {
                if (!t->Image[0][i]) {
                   incomplete(t, "3D Image[0][i] == NULL");
-                  t->_Complete = GL_FALSE;
                   return;
                }
                if (t->Image[0][i]->_BaseFormat == GL_DEPTH_COMPONENT) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "GL_DEPTH_COMPONENT only works with 1/2D tex");
                   return;
                }
                if (t->Image[0][i]->Width2 != width) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "3D Image[0][i] bad width");
                   return;
                }
                if (t->Image[0][i]->Height2 != height) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "3D Image[0][i] bad height");
                   return;
                }
                if (t->Image[0][i]->Depth2 != depth) {
-                  t->_Complete = GL_FALSE;
                   incomplete(t, "3D Image[0][i] bad depth");
                   return;
                }
@@ -652,20 +633,17 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
 	       for (face = 0; face < 6; face++) {
 		  /* check that we have images defined */
 		  if (!t->Image[face][i]) {
-		     t->_Complete = GL_FALSE;
 		     incomplete(t, "CubeMap Image[n][i] == NULL");
 		     return;
 		  }
 		  /* Don't support GL_DEPTH_COMPONENT for cube maps */
 		  if (t->Image[face][i]->_BaseFormat == GL_DEPTH_COMPONENT) {
-		     t->_Complete = GL_FALSE;
 		     incomplete(t, "GL_DEPTH_COMPONENT only works with 1/2D tex");
 		     return;
 		  }
 		  /* check that all six images have same size */
                   if (t->Image[face][i]->Width2 != width || 
                       t->Image[face][i]->Height2 != height) {
-		     t->_Complete = GL_FALSE;
 		     incomplete(t, "CubeMap Image[n][i] bad size");
 		     return;
 		  }
@@ -688,6 +666,44 @@ _mesa_test_texobj_completeness( const struct gl_context *ctx,
 
 
 /**
+ * Check if the given cube map texture is "cube complete" as defined in
+ * the OpenGL specification.
+ */
+GLboolean
+_mesa_cube_complete(const struct gl_texture_object *texObj)
+{
+   const GLint baseLevel = texObj->BaseLevel;
+   const struct gl_texture_image *img0, *img;
+   GLuint face;
+
+   if (texObj->Target != GL_TEXTURE_CUBE_MAP)
+      return GL_FALSE;
+
+   if ((baseLevel < 0) || (baseLevel >= MAX_TEXTURE_LEVELS))
+      return GL_FALSE;
+
+   /* check first face */
+   img0 = texObj->Image[0][baseLevel];
+   if (!img0 ||
+       img0->Width < 1 ||
+       img0->Width != img0->Height)
+      return GL_FALSE;
+
+   /* check remaining faces vs. first face */
+   for (face = 1; face < 6; face++) {
+      img = texObj->Image[face][baseLevel];
+      if (!img ||
+          img->Width != img0->Width ||
+          img->Height != img0->Height ||
+          img->TexFormat != img0->TexFormat)
+         return GL_FALSE;
+   }
+
+   return GL_TRUE;
+}
+
+
+/**
  * Mark a texture object dirty.  It forces the object to be incomplete
  * and optionally forces the context to re-validate its state.
  *
diff --git a/src/mesa/main/texobj.h b/src/mesa/main/texobj.h
index 821b35caa36..2461b063efd 100644
--- a/src/mesa/main/texobj.h
+++ b/src/mesa/main/texobj.h
@@ -32,8 +32,9 @@
 #define TEXTOBJ_H
 
 
-#include "mtypes.h"
+#include "glheader.h"
 
+struct gl_context;
 
 /**
  * \name Internal functions
@@ -68,6 +69,9 @@ extern void
 _mesa_test_texobj_completeness( const struct gl_context *ctx,
                                 struct gl_texture_object *obj );
 
+extern GLboolean
+_mesa_cube_complete(const struct gl_texture_object *texObj);
+
 extern void
 _mesa_dirty_texobj(struct gl_context *ctx, struct gl_texture_object *texObj,
                    GLboolean invalidate_state);
diff --git a/src/mesa/main/texrender.h b/src/mesa/main/texrender.h
index 5e68fb03b57..cacd091160e 100644
--- a/src/mesa/main/texrender.h
+++ b/src/mesa/main/texrender.h
@@ -1,7 +1,9 @@
 #ifndef TEXRENDER_H
 #define TEXRENDER_H
 
-#include "mtypes.h"
+struct gl_context;
+struct gl_framebuffer;
+struct gl_renderbuffer_attachment;
 
 extern void
 _mesa_render_texture(struct gl_context *ctx,
diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h
index 752cd4e201f..b0d5b70f2b7 100644
--- a/src/mesa/main/transformfeedback.h
+++ b/src/mesa/main/transformfeedback.h
@@ -25,8 +25,13 @@
 #ifndef TRANSFORM_FEEDBACK_H
 #define TRANSFORM_FEEDBACK_H
 
-#include "main/mtypes.h"
+#include "compiler.h"
+#include "glheader.h"
+#include "mfeatures.h"
 
+struct _glapi_table;
+struct dd_function_table;
+struct gl_context;
 
 extern void
 _mesa_init_transform_feedback(struct gl_context *ctx);
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index 340c3fe1d39..32bf95e3ed1 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -127,8 +127,8 @@ update_array(struct gl_context *ctx,
    GLsizei elementSize;
    GLenum format = GL_RGBA;
 
-   if (ctx->API != API_OPENGLES) {
-      /* fixed point arrays / data is only allowed with OpenGL ES 1.x */
+   if (ctx->API != API_OPENGLES && ctx->API != API_OPENGLES2) {
+      /* fixed point arrays / data is only allowed with OpenGL ES 1.x/2.0 */
       legalTypesMask &= ~FIXED_BIT;
    }
 
@@ -297,7 +297,8 @@ _mesa_TexCoordPointer(GLint size, GLenum type, GLsizei stride,
                       const GLvoid *ptr)
 {
    GLbitfield legalTypes = (SHORT_BIT | INT_BIT |
-                            HALF_BIT | FLOAT_BIT | DOUBLE_BIT);
+                            HALF_BIT | FLOAT_BIT | DOUBLE_BIT |
+                            FIXED_BIT);
    GET_CURRENT_CONTEXT(ctx);
    const GLuint unit = ctx->Array.ActiveTexture;
    ASSERT_OUTSIDE_BEGIN_END_AND_FLUSH(ctx);
diff --git a/src/mesa/main/varray.h b/src/mesa/main/varray.h
index fb96478cfc3..af9324134ec 100644
--- a/src/mesa/main/varray.h
+++ b/src/mesa/main/varray.h
@@ -28,7 +28,11 @@
 #define VARRAY_H
 
 
-#include "mtypes.h"
+#include "glheader.h"
+#include "mfeatures.h"
+
+struct gl_client_array;
+struct gl_context;
 
 #if _HAVE_FULL_GL
 
diff --git a/src/mesa/main/viewport.h b/src/mesa/main/viewport.h
index ccfa37588b8..909ff92eee5 100644
--- a/src/mesa/main/viewport.h
+++ b/src/mesa/main/viewport.h
@@ -28,7 +28,8 @@
 #define VIEWPORT_H
 
 #include "glheader.h"
-#include "mtypes.h"
+
+struct gl_context;
 
 extern void GLAPIENTRY
 _mesa_Viewport(GLint x, GLint y, GLsizei width, GLsizei height);
diff --git a/src/mesa/math/m_debug_clip.c b/src/mesa/math/m_debug_clip.c
index e97afafac3c..bbad6ef024b 100644
--- a/src/mesa/math/m_debug_clip.c
+++ b/src/mesa/math/m_debug_clip.c
@@ -208,6 +208,24 @@ ALIGN16(static GLfloat, d[TEST_COUNT][4]);
 ALIGN16(static GLfloat, r[TEST_COUNT][4]);
 
 
+/**
+ * Check if X, Y or Z component of the coordinate is close to W, in terms
+ * of the clip test.
+ */
+static GLboolean
+xyz_close_to_w(const GLfloat c[4])
+{
+   float k = 0.0001;
+   return (fabs(c[0] - c[3]) < k ||
+           fabs(c[1] - c[3]) < k ||
+           fabs(c[2] - c[3]) < k ||
+           fabs(-c[0] - c[3]) < k ||
+           fabs(-c[1] - c[3]) < k ||
+           fabs(-c[2] - c[3]) < k);
+}
+
+
+
 static int test_cliptest_function( clip_func func, int np,
 				   int psize, long *cycles )
 {
@@ -281,9 +299,18 @@ static int test_cliptest_function( clip_func func, int np,
    }
    for ( i = 0 ; i < TEST_COUNT ; i++ ) {
       if ( dm[i] != rm[i] ) {
+         GLfloat *c = source->start;
+         STRIDE_F(c, source->stride * i);
+         if (psize == 4 && xyz_close_to_w(c)) {
+            /* The coordinate is very close to the clip plane.  The clipmask
+             * may vary depending on code path, but that's OK.
+             */
+            continue;
+         }
 	 printf( "\n-----------------------------\n" );
-	 printf( "(i = %i)\n", i );
-	 printf( "dm = 0x%02x   rm = 0x%02x\n", dm[i], rm[i] );
+	 printf( "mask[%d] = 0x%02x   ref mask[%d] = 0x%02x\n", i, dm[i], i,rm[i] );
+         printf(" coord = %f, %f, %f, %f\n",
+                c[0], c[1], c[2], c[3]);
 	 return 0;
       }
    }
diff --git a/src/mesa/program/arbprogparse.h b/src/mesa/program/arbprogparse.h
index 08e25a1c168..4c0c3007205 100644
--- a/src/mesa/program/arbprogparse.h
+++ b/src/mesa/program/arbprogparse.h
@@ -26,7 +26,11 @@
 #ifndef ARBPROGPARSE_H
 #define ARBPROGPARSE_H
 
-#include "main/mtypes.h"
+#include "main/glheader.h"
+
+struct gl_context;
+struct gl_fragment_program;
+struct gl_vertex_program;
 
 extern void
 _mesa_parse_arb_vertex_program(struct gl_context *ctx, GLenum target,
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 98da90d359a..b274a961b28 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -65,7 +65,7 @@ static int swizzle_for_size(int size);
 typedef struct ir_to_mesa_src_reg {
    ir_to_mesa_src_reg(int file, int index, const glsl_type *type)
    {
-      this->file = file;
+      this->file = (gl_register_file) file;
       this->index = index;
       if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
 	 this->swizzle = swizzle_for_size(type->vector_elements);
@@ -84,7 +84,7 @@ typedef struct ir_to_mesa_src_reg {
       this->reladdr = NULL;
    }
 
-   int file; /**< PROGRAM_* from Mesa */
+   gl_register_file file; /**< PROGRAM_* from Mesa */
    int index; /**< temporary index, VERT_ATTRIB_*, FRAG_ATTRIB_*, etc. */
    GLuint swizzle; /**< SWIZZLE_XYZWONEZERO swizzles from Mesa. */
    int negate; /**< NEGATE_XYZW mask from mesa */
@@ -133,13 +133,13 @@ public:
 
 class variable_storage : public exec_node {
 public:
-   variable_storage(ir_variable *var, int file, int index)
+   variable_storage(ir_variable *var, gl_register_file file, int index)
       : file(file), index(index), var(var)
    {
       /* empty */
    }
 
-   int file;
+   gl_register_file file;
    int index;
    ir_variable *var; /* variable that maps to this, if any */
 };
@@ -2166,9 +2166,14 @@ ir_to_mesa_visitor::visit(ir_discard *ir)
 {
    struct gl_fragment_program *fp = (struct gl_fragment_program *)this->prog;
 
-   assert(ir->condition == NULL); /* FINISHME */
+   if (ir->condition) {
+      ir->condition->accept(this);
+      this->result.negate = ~this->result.negate;
+      ir_to_mesa_emit_op1(ir, OPCODE_KIL, ir_to_mesa_undef_dst, this->result);
+   } else {
+      ir_to_mesa_emit_op0(ir, OPCODE_KIL_NV);
+   }
 
-   ir_to_mesa_emit_op0(ir, OPCODE_KIL_NV);
    fp->UsesKill = GL_TRUE;
 }
 
@@ -2239,7 +2244,7 @@ mesa_src_reg_from_ir_src_reg(ir_to_mesa_src_reg reg)
    struct prog_src_register mesa_reg;
 
    mesa_reg.File = reg.file;
-   assert(reg.index < (1 << INST_INDEX_BITS) - 1);
+   assert(reg.index < (1 << INST_INDEX_BITS));
    mesa_reg.Index = reg.index;
    mesa_reg.Swizzle = reg.swizzle;
    mesa_reg.RelAddr = reg.reladdr != NULL;
@@ -2609,8 +2614,9 @@ set_uniform_initializers(struct gl_context *ctx,
 /**
  * Convert a shader's GLSL IR into a Mesa gl_program.
  */
-struct gl_program *
-get_mesa_program(struct gl_context *ctx, struct gl_shader_program *shader_program,
+static struct gl_program *
+get_mesa_program(struct gl_context *ctx,
+                 struct gl_shader_program *shader_program,
 		 struct gl_shader *shader)
 {
    ir_to_mesa_visitor v;
@@ -2633,6 +2639,10 @@ get_mesa_program(struct gl_context *ctx, struct gl_shader_program *shader_progra
       target = GL_FRAGMENT_PROGRAM_ARB;
       target_string = "fragment";
       break;
+   case GL_GEOMETRY_SHADER:
+      target = GL_GEOMETRY_PROGRAM_NV;
+      target_string = "geometry";
+      break;
    default:
       assert(!"should not be reached");
       return NULL;
@@ -2756,6 +2766,15 @@ get_mesa_program(struct gl_context *ctx, struct gl_shader_program *shader_progra
 
       mesa_inst++;
       i++;
+
+      if (!shader_program->LinkStatus)
+         break;
+   }
+
+   if (!shader_program->LinkStatus) {
+      free(mesa_instructions);
+      _mesa_reference_program(ctx, &shader->Program, NULL);
+      return NULL;
    }
 
    set_branchtargets(&v, mesa_instructions, num_instructions);
@@ -2830,8 +2849,9 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 
 	 /* Lowering */
 	 do_mat_op_to_vec(ir);
-	 lower_instructions(ir, MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2
-			      | LOG_TO_LOG2);
+	 lower_instructions(ir, (MOD_TO_FRACT | DIV_TO_MUL_RCP | EXP_TO_EXP2
+				 | LOG_TO_LOG2
+				 | ((options->EmitNoPow) ? POW_TO_EXP2 : 0)));
 
 	 progress = do_lower_jumps(ir, true, true, options->EmitNoMainReturn, options->EmitNoCont, options->EmitNoLoops) || progress;
 
@@ -2839,8 +2859,10 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 
 	 progress = lower_quadop_vector(ir, true) || progress;
 
-	 if (options->EmitNoIfs)
+	 if (options->EmitNoIfs) {
+	    progress = lower_discard(ir) || progress;
 	    progress = do_if_to_cond_assign(ir) || progress;
+	 }
 
 	 if (options->EmitNoNoise)
 	    progress = lower_noise(ir) || progress;
@@ -2866,30 +2888,40 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
 
    for (unsigned i = 0; i < MESA_SHADER_TYPES; i++) {
       struct gl_program *linked_prog;
-      bool ok = true;
 
       if (prog->_LinkedShaders[i] == NULL)
 	 continue;
 
       linked_prog = get_mesa_program(ctx, prog, prog->_LinkedShaders[i]);
 
-      switch (prog->_LinkedShaders[i]->Type) {
-      case GL_VERTEX_SHADER:
-	 _mesa_reference_vertprog(ctx, &prog->VertexProgram,
-				  (struct gl_vertex_program *)linked_prog);
-	 ok = ctx->Driver.ProgramStringNotify(ctx, GL_VERTEX_PROGRAM_ARB,
-					      linked_prog);
-	 break;
-      case GL_FRAGMENT_SHADER:
-	 _mesa_reference_fragprog(ctx, &prog->FragmentProgram,
-				  (struct gl_fragment_program *)linked_prog);
-	 ok = ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_PROGRAM_ARB,
-					      linked_prog);
-	 break;
-      }
-      if (!ok) {
-	 return GL_FALSE;
+      if (linked_prog) {
+         bool ok = true;
+
+         switch (prog->_LinkedShaders[i]->Type) {
+         case GL_VERTEX_SHADER:
+            _mesa_reference_vertprog(ctx, &prog->VertexProgram,
+                                     (struct gl_vertex_program *)linked_prog);
+            ok = ctx->Driver.ProgramStringNotify(ctx, GL_VERTEX_PROGRAM_ARB,
+                                                 linked_prog);
+            break;
+         case GL_FRAGMENT_SHADER:
+            _mesa_reference_fragprog(ctx, &prog->FragmentProgram,
+                                     (struct gl_fragment_program *)linked_prog);
+            ok = ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_PROGRAM_ARB,
+                                                 linked_prog);
+            break;
+         case GL_GEOMETRY_SHADER:
+            _mesa_reference_geomprog(ctx, &prog->GeometryProgram,
+                                     (struct gl_geometry_program *)linked_prog);
+            ok = ctx->Driver.ProgramStringNotify(ctx, GL_GEOMETRY_PROGRAM_NV,
+                                                 linked_prog);
+            break;
+         }
+         if (!ok) {
+            return GL_FALSE;
+         }
       }
+
       _mesa_reference_program(ctx, &linked_prog, NULL);
    }
 
@@ -3007,6 +3039,7 @@ _mesa_glsl_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
    prog->Varying = _mesa_new_parameter_list();
    _mesa_reference_vertprog(ctx, &prog->VertexProgram, NULL);
    _mesa_reference_fragprog(ctx, &prog->FragmentProgram, NULL);
+   _mesa_reference_geomprog(ctx, &prog->GeometryProgram, NULL);
 
    if (prog->LinkStatus) {
       link_shaders(ctx, prog);
diff --git a/src/mesa/program/prog_instruction.h b/src/mesa/program/prog_instruction.h
index ca90de7ce1c..a383828e344 100644
--- a/src/mesa/program/prog_instruction.h
+++ b/src/mesa/program/prog_instruction.h
@@ -247,7 +247,7 @@ typedef enum prog_opcode {
  * Number of bits for the src/dst register Index field.
  * This limits the size of temp/uniform register files.
  */
-#define INST_INDEX_BITS 10
+#define INST_INDEX_BITS 11
 
 
 /**
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index 79c01020eb2..abebf392c0a 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -42,8 +42,8 @@
 /**
  * Return string name for given program/register file.
  */
-static const char *
-file_string(gl_register_file f, gl_prog_print_mode mode)
+const char *
+_mesa_register_file_name(gl_register_file f)
 {
    switch (f) {
    case PROGRAM_TEMPORARY:
@@ -275,7 +275,8 @@ reg_string(gl_register_file f, GLint index, gl_prog_print_mode mode,
 
    switch (mode) {
    case PROG_PRINT_DEBUG:
-      sprintf(str, "%s[%s%d]", file_string(f, mode), addr, index);
+      sprintf(str, "%s[%s%d]",
+              _mesa_register_file_name(f), addr, index);
       if (hasIndex2) {
          int offset = strlen(str);
          const char *addr2 = relAddr2 ? "ADDR+" : "";
@@ -497,7 +498,7 @@ fprint_dst_reg(FILE * f,
 
 #if 0
    fprintf(f, "%s[%d]%s",
-	   file_string((gl_register_file) dstReg->File, mode),
+	   _mesa_register_file_name((gl_register_file) dstReg->File),
 	   dstReg->Index,
 	   _mesa_writemask_string(dstReg->WriteMask));
 #endif
@@ -522,7 +523,7 @@ fprint_src_reg(FILE *f,
 	   abs);
 #if 0
    fprintf(f, "%s[%d]%s",
-	   file_string((gl_register_file) srcReg->File, mode),
+	   _mesa_register_file_name((gl_register_file) srcReg->File),
 	   srcReg->Index,
 	   _mesa_swizzle_string(srcReg->Swizzle,
 				srcReg->Negate, GL_FALSE));
@@ -615,8 +616,7 @@ _mesa_fprint_instruction_opt(FILE *f,
       if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) {
          fprintf(f, ", ");
          fprintf(f, "%s[%d]%s",
-		 file_string((gl_register_file) inst->SrcReg[0].File,
-			     mode),
+                 _mesa_register_file_name((gl_register_file) inst->SrcReg[0].File),
 		 inst->SrcReg[0].Index,
 		 _mesa_swizzle_string(inst->SrcReg[0].Swizzle,
 				      inst->SrcReg[0].Negate, GL_FALSE));
@@ -632,8 +632,7 @@ _mesa_fprint_instruction_opt(FILE *f,
       fprintf(f, " ");
       fprint_dst_reg(f, &inst->DstReg, mode, prog);
       fprintf(f, ", %s[%d], %s",
-	      file_string((gl_register_file) inst->SrcReg[0].File,
-			  mode),
+	      _mesa_register_file_name((gl_register_file) inst->SrcReg[0].File),
 	      inst->SrcReg[0].Index,
 	      _mesa_swizzle_string(inst->SrcReg[0].Swizzle,
 				   inst->SrcReg[0].Negate, GL_TRUE));
@@ -964,7 +963,6 @@ static void
 _mesa_fprint_parameter_list(FILE *f,
                             const struct gl_program_parameter_list *list)
 {
-   const gl_prog_print_mode mode = PROG_PRINT_DEBUG;
    GLuint i;
 
    if (!list)
@@ -978,7 +976,7 @@ _mesa_fprint_parameter_list(FILE *f,
       const GLfloat *v = list->ParameterValues[i];
       fprintf(f, "param[%d] sz=%d %s %s = {%.3g, %.3g, %.3g, %.3g}",
 	      i, param->Size,
-	      file_string(list->Parameters[i].Type, mode),
+	      _mesa_register_file_name(list->Parameters[i].Type),
 	      param->Name, v[0], v[1], v[2], v[3]);
       if (param->Flags & PROG_PARAM_BIT_CENTROID)
          fprintf(f, " Centroid");
diff --git a/src/mesa/program/prog_print.h b/src/mesa/program/prog_print.h
index f080b3fd2e6..d962087db38 100644
--- a/src/mesa/program/prog_print.h
+++ b/src/mesa/program/prog_print.h
@@ -47,6 +47,9 @@ typedef enum {
 } gl_prog_print_mode;
 
 
+extern const char *
+_mesa_register_file_name(gl_register_file f);
+
 extern void
 _mesa_print_vp_inputs(GLbitfield inputs);
 
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index baac29ff0dd..c310acb01d4 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -572,6 +572,24 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
          value[3] = 0.0F;
          return;
 
+      case STATE_FB_WPOS_Y_TRANSFORM:
+         /* A driver may negate this conditional by using ZW swizzle
+          * instead of XY (based on e.g. some other state). */
+         if (ctx->DrawBuffer->Name != 0) {
+            /* Identity (XY) followed by flipping Y upside down (ZW). */
+            value[0] = 1.0F;
+            value[1] = 0.0F;
+            value[2] = -1.0F;
+            value[3] = (GLfloat) (ctx->DrawBuffer->Height - 1);
+         } else {
+            /* Flipping Y upside down (XY) followed by identity (ZW). */
+            value[0] = -1.0F;
+            value[1] = (GLfloat) (ctx->DrawBuffer->Height - 1);
+            value[2] = 1.0F;
+            value[3] = 0.0F;
+         }
+         return;
+
       case STATE_ROT_MATRIX_0:
          {
             const int unit = (int) state[2];
@@ -695,6 +713,7 @@ _mesa_program_state_flags(const gl_state_index state[STATE_LENGTH])
          return _NEW_PIXEL;
 
       case STATE_FB_SIZE:
+      case STATE_FB_WPOS_Y_TRANSFORM:
          return _NEW_BUFFERS;
 
       default:
@@ -900,6 +919,9 @@ append_token(char *dst, gl_state_index k)
    case STATE_FB_SIZE:
       append(dst, "FbSize");
       break;
+   case STATE_FB_WPOS_Y_TRANSFORM:
+      append(dst, "FbWposYTransform");
+      break;
    case STATE_ROT_MATRIX_0:
       append(dst, "rotMatrixRow0");
       break;
@@ -1046,7 +1068,9 @@ _mesa_program_state_string(const gl_state_index state[STATE_LENGTH])
  * Loop over all the parameters in a parameter list.  If the parameter
  * is a GL state reference, look up the current value of that state
  * variable and put it into the parameter's Value[4] array.
- * This would be called at glBegin time when using a fragment program.
+ * Other parameter types never change or are explicitly set by the user
+ * with glUniform() or glProgramParameter(), etc.
+ * This would be called at glBegin time.
  */
 void
 _mesa_load_state_parameters(struct gl_context *ctx,
@@ -1057,12 +1081,10 @@ _mesa_load_state_parameters(struct gl_context *ctx,
    if (!paramList)
       return;
 
-   /*assert(ctx->Driver.NeedFlush == 0);*/
-
    for (i = 0; i < paramList->NumParameters; i++) {
       if (paramList->Parameters[i].Type == PROGRAM_STATE_VAR) {
          _mesa_fetch_state(ctx,
-			   (gl_state_index *) paramList->Parameters[i].StateIndexes,
+			   paramList->Parameters[i].StateIndexes,
                            paramList->ParameterValues[i]);
       }
    }
diff --git a/src/mesa/program/prog_statevars.h b/src/mesa/program/prog_statevars.h
index 6e5be53630c..009ebde0012 100644
--- a/src/mesa/program/prog_statevars.h
+++ b/src/mesa/program/prog_statevars.h
@@ -117,6 +117,7 @@ typedef enum gl_state_index_ {
    STATE_PT_BIAS,               /**< Pixel transfer RGBA bias */
    STATE_SHADOW_AMBIENT,        /**< ARB_shadow_ambient fail value; token[2] is texture unit index */
    STATE_FB_SIZE,               /**< (width-1, height-1, 0, 0) */
+   STATE_FB_WPOS_Y_TRANSFORM,   /**< (1, 0, -1, height-1) if a FBO is bound, (-1, height-1, 1, 0) otherwise */
    STATE_ROT_MATRIX_0,          /**< ATI_envmap_bumpmap, rot matrix row 0 */
    STATE_ROT_MATRIX_1,          /**< ATI_envmap_bumpmap, rot matrix row 1 */
    STATE_INTERNAL_DRIVER	/* first available state index for drivers (must be last) */
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index 4cacde9aed1..9ffa49bb013 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -917,6 +917,103 @@ _mesa_find_free_register(const GLboolean used[],
 }
 
 
+
+/**
+ * Check if the given register index is valid (doesn't exceed implementation-
+ * dependent limits).
+ * \return GL_TRUE if OK, GL_FALSE if bad index
+ */
+GLboolean
+_mesa_valid_register_index(const struct gl_context *ctx,
+                           gl_shader_type shaderType,
+                           gl_register_file file, GLint index)
+{
+   const struct gl_program_constants *c;
+
+   switch (shaderType) {
+   case MESA_SHADER_VERTEX:
+      c = &ctx->Const.VertexProgram;
+      break;
+   case MESA_SHADER_FRAGMENT:
+      c = &ctx->Const.FragmentProgram;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      c = &ctx->Const.GeometryProgram;
+      break;
+   default:
+      _mesa_problem(ctx,
+                    "unexpected shader type in _mesa_valid_register_index()");
+      return GL_FALSE;
+   }
+
+   switch (file) {
+   case PROGRAM_UNDEFINED:
+      return GL_TRUE;  /* XXX or maybe false? */
+
+   case PROGRAM_TEMPORARY:
+      return index >= 0 && index < c->MaxTemps;
+
+   case PROGRAM_ENV_PARAM:
+      return index >= 0 && index < c->MaxEnvParams;
+
+   case PROGRAM_LOCAL_PARAM:
+      return index >= 0 && index < c->MaxLocalParams;
+
+   case PROGRAM_NAMED_PARAM:
+      return index >= 0 && index < c->MaxParameters;
+
+   case PROGRAM_UNIFORM:
+   case PROGRAM_STATE_VAR:
+      /* aka constant buffer */
+      return index >= 0 && index < c->MaxUniformComponents / 4;
+
+   case PROGRAM_CONSTANT:
+      /* constant buffer w/ possible relative negative addressing */
+      return (index > (int) c->MaxUniformComponents / -4 &&
+              index < c->MaxUniformComponents / 4);
+
+   case PROGRAM_INPUT:
+      if (index < 0)
+         return GL_FALSE;
+
+      switch (shaderType) {
+      case MESA_SHADER_VERTEX:
+         return index < VERT_ATTRIB_GENERIC0 + c->MaxAttribs;
+      case MESA_SHADER_FRAGMENT:
+         return index < FRAG_ATTRIB_VAR0 + ctx->Const.MaxVarying;
+      case MESA_SHADER_GEOMETRY:
+         return index < GEOM_ATTRIB_VAR0 + ctx->Const.MaxVarying;
+      default:
+         return GL_FALSE;
+      }
+
+   case PROGRAM_OUTPUT:
+      if (index < 0)
+         return GL_FALSE;
+
+      switch (shaderType) {
+      case MESA_SHADER_VERTEX:
+         return index < VERT_RESULT_VAR0 + ctx->Const.MaxVarying;
+      case MESA_SHADER_FRAGMENT:
+         return index < FRAG_RESULT_DATA0 + ctx->Const.MaxDrawBuffers;
+      case MESA_SHADER_GEOMETRY:
+         return index < GEOM_RESULT_VAR0 + ctx->Const.MaxVarying;
+      default:
+         return GL_FALSE;
+      }
+
+   case PROGRAM_ADDRESS:
+      return index >= 0 && index < c->MaxAddressRegs;
+
+   default:
+      _mesa_problem(ctx,
+                    "unexpected register file in _mesa_valid_register_index()");
+      return GL_FALSE;
+   }
+}
+
+
+
 /**
  * "Post-process" a GPU program.  This is intended to be used for debugging.
  * Example actions include no-op'ing instructions or changing instruction
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 70cc2c3aaed..ce37b95bf82 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -165,6 +165,12 @@ extern GLint
 _mesa_find_free_register(const GLboolean used[],
                          GLuint maxRegs, GLuint firstReg);
 
+
+extern GLboolean
+_mesa_valid_register_index(const struct gl_context *ctx,
+                           gl_shader_type shaderType,
+                           gl_register_file file, GLint index);
+
 extern void
 _mesa_postprocess_program(struct gl_context *ctx, struct gl_program *prog);
 
diff --git a/src/mesa/program/symbol_table.c b/src/mesa/program/symbol_table.c
index 09e7cb44ef3..004f1f8fa7b 100644
--- a/src/mesa/program/symbol_table.c
+++ b/src/mesa/program/symbol_table.c
@@ -336,12 +336,12 @@ _mesa_symbol_table_add_symbol(struct _mesa_symbol_table *table,
     check_symbol_table(table);
 
     if (hdr == NULL) {
-        hdr = calloc(1, sizeof(*hdr));
-        hdr->name = strdup(name);
+       hdr = calloc(1, sizeof(*hdr));
+       hdr->name = strdup(name);
 
-        hash_table_insert(table->ht, hdr, hdr->name);
-	hdr->next = table->hdr;
-	table->hdr = hdr;
+       hash_table_insert(table->ht, hdr, hdr->name);
+       hdr->next = table->hdr;
+       table->hdr = hdr;
     }
 
     check_symbol_table(table);
@@ -376,6 +376,81 @@ _mesa_symbol_table_add_symbol(struct _mesa_symbol_table *table,
 }
 
 
+int
+_mesa_symbol_table_add_global_symbol(struct _mesa_symbol_table *table,
+				     int name_space, const char *name,
+				     void *declaration)
+{
+    struct symbol_header *hdr;
+    struct symbol *sym;
+    struct symbol *curr;
+    struct scope_level *top_scope;
+
+    check_symbol_table(table);
+
+    hdr = find_symbol(table, name);
+
+    check_symbol_table(table);
+
+    if (hdr == NULL) {
+        hdr = calloc(1, sizeof(*hdr));
+        hdr->name = strdup(name);
+
+        hash_table_insert(table->ht, hdr, hdr->name);
+        hdr->next = table->hdr;
+        table->hdr = hdr;
+    }
+
+    check_symbol_table(table);
+
+    /* If the symbol already exists in this namespace at this scope, it cannot
+     * be added to the table.
+     */
+    for (sym = hdr->symbols
+	 ; (sym != NULL) && (sym->name_space != name_space)
+	 ; sym = sym->next_with_same_name) {
+       /* empty */
+    }
+
+    if (sym && sym->depth == 0)
+       return -1;
+
+    /* Find the top-level scope */
+    for (top_scope = table->current_scope
+	 ; top_scope->next != NULL
+	 ; top_scope = top_scope->next) {
+       /* empty */
+    }
+
+    sym = calloc(1, sizeof(*sym));
+    sym->next_with_same_scope = top_scope->symbols;
+    sym->hdr = hdr;
+    sym->name_space = name_space;
+    sym->data = declaration;
+
+    assert(sym->hdr == hdr);
+
+    /* Since next_with_same_name is ordered by scope, we need to append the
+     * new symbol to the _end_ of the list.
+     */
+    if (hdr->symbols == NULL) {
+       hdr->symbols = sym;
+    } else {
+       for (curr = hdr->symbols
+	    ; curr->next_with_same_name != NULL
+	    ; curr = curr->next_with_same_name) {
+	  /* empty */
+       }
+       curr->next_with_same_name = sym;
+    }
+    top_scope->symbols = sym;
+
+    check_symbol_table(table);
+    return 0;
+}
+
+
+
 struct _mesa_symbol_table *
 _mesa_symbol_table_ctor(void)
 {
diff --git a/src/mesa/program/symbol_table.h b/src/mesa/program/symbol_table.h
index 1d570fc1a09..f9d91649bbc 100644
--- a/src/mesa/program/symbol_table.h
+++ b/src/mesa/program/symbol_table.h
@@ -33,6 +33,10 @@ extern void _mesa_symbol_table_pop_scope(struct _mesa_symbol_table *table);
 extern int _mesa_symbol_table_add_symbol(struct _mesa_symbol_table *symtab,
     int name_space, const char *name, void *declaration);
 
+extern int _mesa_symbol_table_add_global_symbol(
+    struct _mesa_symbol_table *symtab, int name_space, const char *name,
+    void *declaration);
+
 extern int _mesa_symbol_table_symbol_scope(struct _mesa_symbol_table *table,
     int name_space, const char *name);
 
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 8d1dc792bc8..f1d08a3e166 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -66,6 +66,11 @@ void st_upload_constants( struct st_context *st,
    if (params && params->NumParameters) {
       const uint paramBytes = params->NumParameters * sizeof(GLfloat) * 4;
 
+      /* Update the constants which come from fixed-function state, such as
+       * transformation matrices, fog factors, etc.  The rest of the values in
+       * the parameters list are explicitly set by the user with glUniform,
+       * glProgramParameter(), etc.
+       */
       _mesa_load_state_parameters(st->ctx, params);
 
       /* We always need to get a new buffer, to keep the drivers simple and
diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index 036bc60049a..2843b7b1764 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -51,7 +51,7 @@ static void
 update_renderbuffer_surface(struct st_context *st,
                             struct st_renderbuffer *strb)
 {
-   struct pipe_screen *screen = st->pipe->screen;
+   struct pipe_context *pipe = st->pipe;
    struct pipe_resource *resource = strb->rtt->pt;
    int rtt_width = strb->Base.Width;
    int rtt_height = strb->Base.Height;
@@ -65,15 +65,19 @@ update_renderbuffer_surface(struct st_context *st,
       for (level = 0; level <= resource->last_level; level++) {
          if (u_minify(resource->width0, level) == rtt_width &&
              u_minify(resource->height0, level) == rtt_height) {
+            struct pipe_surface surf_tmpl;
+            memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+            surf_tmpl.format = resource->format;
+            surf_tmpl.usage = PIPE_BIND_RENDER_TARGET;
+            surf_tmpl.u.tex.level = level;
+            surf_tmpl.u.tex.first_layer = strb->rtt_face + strb->rtt_slice;
+            surf_tmpl.u.tex.last_layer = strb->rtt_face + strb->rtt_slice;
 
             pipe_surface_reference(&strb->surface, NULL);
 
-            strb->surface = screen->get_tex_surface(screen,
-						    resource,
-						    strb->rtt_face,
-						    level,
-						    strb->rtt_slice,
-						    PIPE_BIND_RENDER_TARGET);
+            strb->surface = pipe->create_surface(pipe,
+                                                 resource,
+                                                 &surf_tmpl);
 #if 0
             printf("-- alloc new surface %d x %d into tex %p\n",
                    strb->surface->width, strb->surface->height,
diff --git a/src/mesa/state_tracker/st_atom_pixeltransfer.c b/src/mesa/state_tracker/st_atom_pixeltransfer.c
index 6be03376d01..378b30e57cc 100644
--- a/src/mesa/state_tracker/st_atom_pixeltransfer.c
+++ b/src/mesa/state_tracker/st_atom_pixeltransfer.c
@@ -122,8 +122,8 @@ load_color_map_texture(struct gl_context *ctx, struct pipe_resource *pt)
    uint i, j;
 
    transfer = pipe_get_transfer(st_context(ctx)->pipe,
-					     pt, 0, 0, 0, PIPE_TRANSFER_WRITE,
-					     0, 0, texSize, texSize);
+                                pt, 0, 0, PIPE_TRANSFER_WRITE,
+                                0, 0, texSize, texSize);
    dest = (uint *) pipe_transfer_map(pipe, transfer);
 
    /* Pack four 1D maps into a 2D texture:
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index f147d768084..b67068df373 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -121,6 +121,18 @@ static void
 xlate_border_color(const GLfloat *colorIn, GLenum baseFormat, GLfloat *colorOut)
 {
    switch (baseFormat) {
+   case GL_RED:
+      colorOut[0] = colorIn[0];
+      colorOut[1] = 0.0F;
+      colorOut[2] = 0.0F;
+      colorOut[3] = 1.0F;
+      break;
+   case GL_RG:
+      colorOut[0] = colorIn[0];
+      colorOut[1] = colorIn[1];
+      colorOut[2] = 0.0F;
+      colorOut[3] = 1.0F;
+      break;
    case GL_RGB:
       colorOut[0] = colorIn[0];
       colorOut[1] = colorIn[1];
diff --git a/src/mesa/state_tracker/st_cb_accum.c b/src/mesa/state_tracker/st_cb_accum.c
index 6c5caf42e35..a76ae92dc3d 100644
--- a/src/mesa/state_tracker/st_cb_accum.c
+++ b/src/mesa/state_tracker/st_cb_accum.c
@@ -138,10 +138,10 @@ accum_accum(struct st_context *st, GLfloat value,
       debug_printf("%s: fallback processing\n", __FUNCTION__);
 
    color_trans = pipe_get_transfer(st->pipe,
-						color_strb->texture,
-						0, 0, 0,
-						PIPE_TRANSFER_READ, xpos, ypos,
-						width, height);
+                                   color_strb->texture,
+                                   0, 0,
+                                   PIPE_TRANSFER_READ, xpos, ypos,
+                                   width, height);
 
    buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
 
@@ -187,9 +187,9 @@ accum_load(struct st_context *st, GLfloat value,
       debug_printf("%s: fallback processing\n", __FUNCTION__);
 
    color_trans = pipe_get_transfer(st->pipe, color_strb->texture,
-						0, 0, 0,
-						PIPE_TRANSFER_READ, xpos, ypos,
-						width, height);
+                                   0, 0,
+                                   PIPE_TRANSFER_READ, xpos, ypos,
+                                   width, height);
 
    buf = (GLfloat *) malloc(width * height * 4 * sizeof(GLfloat));
 
@@ -241,12 +241,12 @@ accum_return(struct gl_context *ctx, GLfloat value,
       usage = PIPE_TRANSFER_READ_WRITE;
    else
       usage = PIPE_TRANSFER_WRITE;
-   
+
    color_trans = pipe_get_transfer(st_context(ctx)->pipe,
-						color_strb->texture, 0, 0, 0,
-						usage,
-						xpos, ypos,
-						width, height);
+                                   color_strb->texture, 0, 0,
+                                   usage,
+                                   xpos, ypos,
+                                   width, height);
 
    if (usage & PIPE_TRANSFER_READ)
       pipe_get_tile_rgba(pipe, color_trans, 0, 0, width, height, buf);
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 3c0ee6c2883..f08697fe23b 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -283,9 +283,9 @@ make_bitmap_texture(struct gl_context *ctx, GLsizei width, GLsizei height,
       return NULL;
    }
 
-   transfer = pipe_get_transfer(st->pipe, pt, 0, 0, 0,
-					   PIPE_TRANSFER_WRITE,
-					   0, 0, width, height);
+   transfer = pipe_get_transfer(st->pipe, pt, 0, 0,
+                                PIPE_TRANSFER_WRITE,
+                                0, 0, width, height);
 
    dest = pipe_transfer_map(pipe, transfer);
 
@@ -585,10 +585,10 @@ create_cache_trans(struct st_context *st)
    /* Map the texture transfer.
     * Subsequent glBitmap calls will write into the texture image.
     */
-   cache->trans = pipe_get_transfer(st->pipe, cache->texture, 0, 0, 0,
-					       PIPE_TRANSFER_WRITE, 0, 0,
-					       BITMAP_CACHE_WIDTH,
-					       BITMAP_CACHE_HEIGHT);
+   cache->trans = pipe_get_transfer(st->pipe, cache->texture, 0, 0,
+                                    PIPE_TRANSFER_WRITE, 0, 0,
+                                    BITMAP_CACHE_WIDTH,
+                                    BITMAP_CACHE_HEIGHT);
    cache->buffer = pipe_transfer_map(pipe, cache->trans);
 
    /* init image to all 0xff */
diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c
index af41835326a..06cee520b37 100644
--- a/src/mesa/state_tracker/st_cb_blit.c
+++ b/src/mesa/state_tracker/st_cb_blit.c
@@ -115,17 +115,14 @@ st_BlitFramebuffer(struct gl_context *ctx,
             st_texture_object(srcAtt->Texture);
          struct st_renderbuffer *dstRb =
             st_renderbuffer(drawFB->_ColorDrawBuffers[0]);
-         struct pipe_subresource srcSub;
          struct pipe_surface *dstSurf = dstRb->surface;
 
          if (!srcObj->pt)
             return;
 
-         srcSub.face = srcAtt->CubeMapFace;
-         srcSub.level = srcAtt->TextureLevel;
-
-         util_blit_pixels(st->blit, srcObj->pt, srcSub,
-                          srcX0, srcY0, srcX1, srcY1, srcAtt->Zoffset,
+         util_blit_pixels(st->blit, srcObj->pt, srcAtt->TextureLevel,
+                          srcX0, srcY0, srcX1, srcY1,
+                          srcAtt->Zoffset + srcAtt->CubeMapFace,
                           dstSurf, dstX0, dstY0, dstX1, dstY1,
                           0.0, pFilter);
       }
@@ -136,14 +133,11 @@ st_BlitFramebuffer(struct gl_context *ctx,
             st_renderbuffer(drawFB->_ColorDrawBuffers[0]);
          struct pipe_surface *srcSurf = srcRb->surface;
          struct pipe_surface *dstSurf = dstRb->surface;
-         struct pipe_subresource srcSub;
-
-         srcSub.face = srcSurf->face;
-         srcSub.level = srcSurf->level;
 
          util_blit_pixels(st->blit,
-                          srcRb->texture, srcSub, srcX0, srcY0, srcX1, srcY1,
-                          srcSurf->zslice,
+                          srcRb->texture, srcSurf->u.tex.level,
+                          srcX0, srcY0, srcX1, srcY1,
+                          srcSurf->u.tex.first_layer,
                           dstSurf, dstX0, dstY0, dstX1, dstY1,
                           0.0, pFilter);
       }
@@ -176,11 +170,11 @@ st_BlitFramebuffer(struct gl_context *ctx,
          /* Blitting depth and stencil values between combined
           * depth/stencil buffers.  This is the ideal case for such buffers.
           */
-         util_blit_pixels(st->blit, srcDepthRb->texture,
-                          u_subresource(srcDepthRb->surface->face,
-                                        srcDepthRb->surface->level),
+         util_blit_pixels(st->blit,
+                          srcDepthRb->texture,
+                          srcDepthRb->surface->u.tex.level,
                           srcX0, srcY0, srcX1, srcY1,
-                          srcDepthRb->surface->zslice,
+                          srcDepthRb->surface->u.tex.first_layer,
                           dstDepthSurf, dstX0, dstY0, dstX1, dstY1,
                           0.0, pFilter);
       }
@@ -189,10 +183,9 @@ st_BlitFramebuffer(struct gl_context *ctx,
 
          if (mask & GL_DEPTH_BUFFER_BIT) {
             util_blit_pixels(st->blit, srcDepthRb->texture,
-                             u_subresource(srcDepthRb->surface->face,
-                                           srcDepthRb->surface->level),
+                             srcDepthRb->surface->u.tex.level,
                              srcX0, srcY0, srcX1, srcY1,
-                             srcDepthRb->surface->zslice,
+                             srcDepthRb->surface->u.tex.first_layer,
                              dstDepthSurf, dstX0, dstY0, dstX1, dstY1,
                              0.0, pFilter);
          }
diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c
index 27540c36ce7..8b60f9040d0 100644
--- a/src/mesa/state_tracker/st_cb_bufferobjects.c
+++ b/src/mesa/state_tracker/st_cb_bufferobjects.c
@@ -211,6 +211,13 @@ st_bufferobj_data(struct gl_context *ctx,
 
 
 /**
+ * Dummy data whose's pointer is used for zero size buffers or ranges.
+ */
+static long st_bufferobj_zero_length = 0;
+
+
+
+/**
  * Called via glMapBufferARB().
  */
 static void *
@@ -233,10 +240,16 @@ st_bufferobj_map(struct gl_context *ctx, GLenum target, GLenum access,
       break;      
    }
 
-   obj->Pointer = pipe_buffer_map(st_context(ctx)->pipe,
-                                  st_obj->buffer,
-                                  flags,
-                                  &st_obj->transfer);
+   /* Handle zero-size buffers here rather than in drivers */
+   if (obj->Size == 0) {
+      obj->Pointer = &st_bufferobj_zero_length;
+   }
+   else {
+      obj->Pointer = pipe_buffer_map(st_context(ctx)->pipe,
+                                     st_obj->buffer,
+                                     flags,
+                                     &st_obj->transfer);
+   }
 
    if (obj->Pointer) {
       obj->Offset = 0;
@@ -247,13 +260,6 @@ st_bufferobj_map(struct gl_context *ctx, GLenum target, GLenum access,
 
 
 /**
- * Dummy data whose's pointer is used for zero length ranges.
- */
-static long
-st_bufferobj_zero_length_range = 0;
-
-
-/**
  * Called via glMapBufferRange().
  */
 static void *
@@ -273,6 +279,12 @@ st_bufferobj_map_range(struct gl_context *ctx, GLenum target,
 
    if (access & GL_MAP_FLUSH_EXPLICIT_BIT)
       flags |= PIPE_TRANSFER_FLUSH_EXPLICIT;
+
+   if (access & GL_MAP_INVALIDATE_RANGE_BIT)
+      flags |= PIPE_TRANSFER_DISCARD;
+
+   if (access & GL_MAP_INVALIDATE_BUFFER_BIT)
+      flags |= PIPE_TRANSFER_DISCARD;
    
    if (access & GL_MAP_UNSYNCHRONIZED_BIT)
       flags |= PIPE_TRANSFER_UNSYNCHRONIZED;
@@ -293,7 +305,7 @@ st_bufferobj_map_range(struct gl_context *ctx, GLenum target,
     * length range from the pipe driver.
     */
    if (!length) {
-      obj->Pointer = &st_bufferobj_zero_length_range;
+      obj->Pointer = &st_bufferobj_zero_length;
    }
    else {
       obj->Pointer = pipe_buffer_map_range(pipe, 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index d80c068ea81..c9786024575 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -427,9 +427,9 @@ make_texture(struct st_context *st,
       /* we'll do pixel transfer in a fragment shader */
       ctx->_ImageTransferState = 0x0;
 
-      transfer = pipe_get_transfer(st->pipe, pt, 0, 0, 0,
-					      PIPE_TRANSFER_WRITE, 0, 0,
-					      width, height);
+      transfer = pipe_get_transfer(st->pipe, pt, 0, 0,
+                                   PIPE_TRANSFER_WRITE, 0, 0,
+                                   width, height);
 
       /* map texture transfer */
       dest = pipe_transfer_map(pipe, transfer);
@@ -763,9 +763,9 @@ draw_stencil_pixels(struct gl_context *ctx, GLint x, GLint y,
    else
       usage = PIPE_TRANSFER_WRITE;
 
-   pt = pipe_get_transfer(st_context(ctx)->pipe, strb->texture, 0, 0, 0,
-				       usage, x, y,
-				       width, height);
+   pt = pipe_get_transfer(st_context(ctx)->pipe, strb->texture, 0, 0,
+                                     usage, x, y,
+                                     width, height);
 
    stmap = pipe_transfer_map(pipe, pt);
 
@@ -1025,15 +1025,15 @@ copy_stencil_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
       usage = PIPE_TRANSFER_READ_WRITE;
    else
       usage = PIPE_TRANSFER_WRITE;
-   
+
    if (st_fb_orientation(ctx->DrawBuffer) == Y_0_TOP) {
       dsty = rbDraw->Base.Height - dsty - height;
    }
 
    ptDraw = pipe_get_transfer(st_context(ctx)->pipe,
-					   rbDraw->texture, 0, 0, 0,
-					   usage, dstx, dsty,
-					   width, height);
+                              rbDraw->texture, 0, 0,
+                              usage, dstx, dsty,
+                              width, height);
 
    assert(util_format_get_blockwidth(ptDraw->resource->format) == 1);
    assert(util_format_get_blockheight(ptDraw->resource->format) == 1);
@@ -1209,27 +1209,24 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
    /* Make temporary texture which is a copy of the src region.
     */
    if (srcFormat == texFormat) {
-      struct pipe_subresource srcsub, dstsub;
-      srcsub.face = 0;
-      srcsub.level = 0;
-      dstsub.face = 0;
-      dstsub.level = 0;
-      /* copy source framebuffer surface into mipmap/texture */
+      struct pipe_box src_box;
+      u_box_2d(readX, readY, readW, readH, &src_box);
+    /* copy source framebuffer surface into mipmap/texture */
       pipe->resource_copy_region(pipe,
                                  pt,                                /* dest tex */
-                                 dstsub,
+                                 0,
                                  pack.SkipPixels, pack.SkipRows, 0, /* dest pos */
                                  rbRead->texture,                   /* src tex */
-                                 srcsub,
-                                 readX, readY, 0, readW, readH);    /* src region */
+                                 0,
+                                 &src_box);
 
    }
    else {
       /* CPU-based fallback/conversion */
       struct pipe_transfer *ptRead =
-         pipe_get_transfer(st->pipe, rbRead->texture, 0, 0, 0,
-                                        PIPE_TRANSFER_READ,
-                                        readX, readY, readW, readH);
+         pipe_get_transfer(st->pipe, rbRead->texture, 0, 0,
+                           PIPE_TRANSFER_READ,
+                           readX, readY, readW, readH);
       struct pipe_transfer *ptTex;
       enum pipe_transfer_usage transfer_usage;
 
@@ -1241,8 +1238,8 @@ st_CopyPixels(struct gl_context *ctx, GLint srcx, GLint srcy,
       else
          transfer_usage = PIPE_TRANSFER_WRITE;
 
-      ptTex = pipe_get_transfer(st->pipe, pt, 0, 0, 0, transfer_usage,
-                                             0, 0, width, height);
+      ptTex = pipe_get_transfer(st->pipe, pt, 0, 0, transfer_usage,
+                                0, 0, width, height);
 
       /* copy image from ptRead surface to ptTex surface */
       if (type == GL_COLOR) {
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 9425f07aee6..cd718a31a14 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -52,6 +52,7 @@
 
 #include "util/u_format.h"
 #include "util/u_inlines.h"
+#include "util/u_surface.h"
 
 
 /**
@@ -65,9 +66,11 @@ st_renderbuffer_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *r
                               GLuint width, GLuint height)
 {
    struct st_context *st = st_context(ctx);
+   struct pipe_context *pipe = st->pipe;
    struct pipe_screen *screen = st->pipe->screen;
    struct st_renderbuffer *strb = st_renderbuffer(rb);
    enum pipe_format format;
+   struct pipe_surface surf_tmpl;
 
    if (strb->format != PIPE_FORMAT_NONE)
       format = strb->format;
@@ -113,6 +116,7 @@ st_renderbuffer_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *r
       template.width0 = width;
       template.height0 = height;
       template.depth0 = 1;
+      template.array_size = 1;
       template.last_level = 0;
       template.nr_samples = rb->NumSamples;
       if (util_format_is_depth_or_stencil(format)) {
@@ -120,7 +124,7 @@ st_renderbuffer_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *r
       }
       else {
          template.bind = (PIPE_BIND_DISPLAY_TARGET |
-			  PIPE_BIND_RENDER_TARGET);
+                          PIPE_BIND_RENDER_TARGET);
       }
 
       strb->texture = screen->resource_create(screen, &template);
@@ -128,10 +132,11 @@ st_renderbuffer_alloc_storage(struct gl_context * ctx, struct gl_renderbuffer *r
       if (!strb->texture) 
          return FALSE;
 
-      strb->surface = screen->get_tex_surface(screen,
-                                              strb->texture,
-                                              0, 0, 0,
-                                              template.bind);
+      memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+      u_surface_default_template(&surf_tmpl, strb->texture, template.bind);
+      strb->surface = pipe->create_surface(pipe,
+                                           strb->texture,
+                                           &surf_tmpl);
       if (strb->surface) {
          assert(strb->surface->texture);
          assert(strb->surface->format);
@@ -327,12 +332,12 @@ st_render_texture(struct gl_context *ctx,
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
-   struct pipe_screen *screen = pipe->screen;
    struct st_renderbuffer *strb;
    struct gl_renderbuffer *rb;
    struct pipe_resource *pt = st_get_texobj_resource(att->Texture);
    struct st_texture_object *stObj;
    const struct gl_texture_image *texImage;
+   struct pipe_surface surf_tmpl;
 
    /* When would this fail?  Perhaps assert? */
    if (!pt) 
@@ -381,12 +386,15 @@ st_render_texture(struct gl_context *ctx,
    assert(strb->rtt_level <= strb->texture->last_level);
 
    /* new surface for rendering into the texture */
-   strb->surface = screen->get_tex_surface(screen,
-                                           strb->texture,
-                                           strb->rtt_face,
-                                           strb->rtt_level,
-                                           strb->rtt_slice,
-                                           PIPE_BIND_RENDER_TARGET);
+   memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+   surf_tmpl.format = strb->texture->format;
+   surf_tmpl.usage = PIPE_BIND_RENDER_TARGET;
+   surf_tmpl.u.tex.level = strb->rtt_level;
+   surf_tmpl.u.tex.first_layer = strb->rtt_face + strb->rtt_slice;
+   surf_tmpl.u.tex.last_layer = strb->rtt_face + strb->rtt_slice;
+   strb->surface = pipe->create_surface(pipe,
+                                        strb->texture,
+                                        &surf_tmpl);
 
    strb->format = pt->format;
 
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index bcd46ac9d54..0507be74578 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -80,7 +80,7 @@ st_read_stencil_pixels(struct gl_context *ctx, GLint x, GLint y,
    /* Create a read transfer from the renderbuffer's texture */
 
    pt = pipe_get_transfer(pipe, strb->texture,
-                          0, 0, 0,  /* face, level, zslice */
+                          0, 0,
                           PIPE_TRANSFER_READ,
                           x, y, width, height);
 
@@ -236,7 +236,7 @@ st_fast_readpixels(struct gl_context *ctx, struct st_renderbuffer *strb,
       }
 
       trans = pipe_get_transfer(pipe, strb->texture,
-                                0, 0, 0,  /* face, level, zslice */
+                                0, 0,
                                 PIPE_TRANSFER_READ,
                                 x, y, width, height);
       if (!trans) {
@@ -328,7 +328,7 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei h
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
-   GLfloat temp[MAX_WIDTH][4];
+   GLfloat (*temp)[4];
    const GLbitfield transferOps = ctx->_ImageTransferState;
    GLsizei i, j;
    GLint yStep, dfStride;
@@ -381,6 +381,13 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei h
       return;
    }
 
+   /* allocate temp pixel row buffer */
+   temp = (GLfloat (*)[4]) malloc(4 * width * sizeof(GLfloat));
+   if (!temp) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glReadPixels");
+      return;
+   }
+
    if (format == GL_RGBA && type == GL_FLOAT) {
       /* write tile(row) directly into user's buffer */
       df = (GLfloat *) _mesa_image_address2d(&clippedPacking, dest, width,
@@ -400,7 +407,7 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei h
 
    /* Create a read transfer from the renderbuffer's texture */
    trans = pipe_get_transfer(pipe, strb->texture,
-                             0, 0, 0,  /* face, level, zslice */
+                             0, 0,
                              PIPE_TRANSFER_READ,
                              x, y, width, height);
 
@@ -533,6 +540,8 @@ st_readpixels(struct gl_context *ctx, GLint x, GLint y, GLsizei width, GLsizei h
       }
    }
 
+   free(temp);
+
    pipe->transfer_destroy(pipe, trans);
 
    _mesa_unmap_pbo_dest(ctx, &clippedPacking);
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 15e69e1fa07..866426a7549 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -63,7 +63,7 @@
 #include "util/u_surface.h"
 #include "util/u_sampler.h"
 #include "util/u_math.h"
-
+#include "util/u_box.h"
 
 #define DBG if (0) printf
 
@@ -431,7 +431,7 @@ compress_with_blit(struct gl_context * ctx,
    struct pipe_resource *src_tex;
    struct pipe_sampler_view view_templ;
    struct pipe_sampler_view *src_view;
-   struct pipe_surface *dst_surface;
+   struct pipe_surface *dst_surface, surf_tmpl;
    struct pipe_transfer *tex_xfer;
    void *map;
 
@@ -441,9 +441,13 @@ compress_with_blit(struct gl_context * ctx,
    }
 
    /* get destination surface (in the compressed texture) */
-   dst_surface = screen->get_tex_surface(screen, stImage->pt,
-                                         stImage->face, stImage->level, 0,
-                                         0 /* flags */);
+   memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+   surf_tmpl.format = stImage->pt->format;
+   surf_tmpl.usage = PIPE_BIND_RENDER_TARGET;
+   surf_tmpl.u.tex.level = stImage->level;
+   surf_tmpl.u.tex.first_layer = stImage->face;
+   surf_tmpl.u.tex.last_layer = stImage->face;
+   dst_surface = pipe->create_surface(pipe, stImage->pt, &surf_tmpl);
    if (!dst_surface) {
       /* can't render into this format (or other problem) */
       return GL_FALSE;
@@ -464,6 +468,7 @@ compress_with_blit(struct gl_context * ctx,
    templ.width0 = width;
    templ.height0 = height;
    templ.depth0 = 1;
+   templ.array_size = 1;
    templ.last_level = 0;
    templ.usage = PIPE_USAGE_DEFAULT;
    templ.bind = PIPE_BIND_SAMPLER_VIEW;
@@ -475,9 +480,9 @@ compress_with_blit(struct gl_context * ctx,
    /* Put user's tex data into the temporary texture
     */
    tex_xfer = pipe_get_transfer(st_context(ctx)->pipe, src_tex,
-					     0, 0, 0, /* face, level are zero */
-					     PIPE_TRANSFER_WRITE,
-					     0, 0, width, height); /* x, y, w, h */
+                                0, 0, /* layer, level are zero */
+                                PIPE_TRANSFER_WRITE,
+                                0, 0, width, height); /* x, y, w, h */
    map = pipe_transfer_map(pipe, tex_xfer);
 
    _mesa_texstore(ctx, 2, GL_RGBA, mesa_format,
@@ -857,7 +862,6 @@ decompress_with_blit(struct gl_context * ctx, GLenum target, GLint level,
 {
    struct st_context *st = st_context(ctx);
    struct pipe_context *pipe = st->pipe;
-   struct pipe_screen *screen = pipe->screen;
    struct st_texture_image *stImage = st_texture_image(texImage);
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct pipe_sampler_view *src_view =
@@ -871,7 +875,7 @@ decompress_with_blit(struct gl_context * ctx, GLenum target, GLint level,
 		    PIPE_BIND_TRANSFER_READ);
 
    /* create temp / dest surface */
-   if (!util_create_rgba_surface(screen, width, height, bind,
+   if (!util_create_rgba_surface(pipe, width, height, bind,
                                  &dst_texture, &dst_surface)) {
       _mesa_problem(ctx, "util_create_rgba_surface() failed "
                     "in decompress_with_blit()");
@@ -891,9 +895,9 @@ decompress_with_blit(struct gl_context * ctx, GLenum target, GLint level,
 
    /* map the dst_surface so we can read from it */
    tex_xfer = pipe_get_transfer(st_context(ctx)->pipe,
-					     dst_texture, 0, 0, 0,
-					     PIPE_TRANSFER_READ,
-					     0, 0, width, height);
+                                dst_texture, 0, 0,
+                                PIPE_TRANSFER_READ,
+                                0, 0, width, height);
 
    pixels = _mesa_map_pbo_dest(ctx, &ctx->Pack, pixels);
 
@@ -1310,7 +1314,7 @@ fallback_copy_texsubimage(struct gl_context *ctx, GLenum target, GLint level,
    struct pipe_transfer *src_trans;
    GLvoid *texDest;
    enum pipe_transfer_usage transfer_usage;
-   
+
    if (ST_DEBUG & DEBUG_FALLBACK)
       debug_printf("%s: fallback processing\n", __FUNCTION__);
 
@@ -1321,11 +1325,11 @@ fallback_copy_texsubimage(struct gl_context *ctx, GLenum target, GLint level,
    }
 
    src_trans = pipe_get_transfer(st_context(ctx)->pipe,
-					       strb->texture,
-					       0, 0, 0,
-					       PIPE_TRANSFER_READ,
-					       srcX, srcY,
-					       width, height);
+                                 strb->texture,
+                                 0, 0,
+                                 PIPE_TRANSFER_READ,
+                                 srcX, srcY,
+                                 width, height);
 
    if ((baseFormat == GL_DEPTH_COMPONENT ||
         baseFormat == GL_DEPTH_STENCIL) &&
@@ -1334,7 +1338,8 @@ fallback_copy_texsubimage(struct gl_context *ctx, GLenum target, GLint level,
    else
       transfer_usage = PIPE_TRANSFER_WRITE;
 
-   texDest = st_texture_image_map(st, stImage, 0, transfer_usage,
+   /* XXX this used to ignore destZ param */
+   texDest = st_texture_image_map(st, stImage, destZ, transfer_usage,
                                   destX, destY, width, height);
 
    if (baseFormat == GL_DEPTH_COMPONENT ||
@@ -1592,27 +1597,23 @@ st_copy_texsubimage(struct gl_context *ctx,
 
       if (matching_base_formats &&
           src_format == dest_format &&
-          !do_flip) 
+          !do_flip)
       {
          /* use surface_copy() / blit */
-         struct pipe_subresource subdst, subsrc;
-         subdst.face = stImage->face;
-         subdst.level = stImage->level;
-         subsrc.face = strb->surface->face;
-         subsrc.level = strb->surface->level;
+         struct pipe_box src_box;
+         u_box_2d_zslice(srcX, srcY, strb->surface->u.tex.first_layer,
+                         width, height, &src_box);
 
          /* for resource_copy_region(), y=0=top, always */
          pipe->resource_copy_region(pipe,
                                     /* dest */
                                     stImage->pt,
-                                    subdst,
-                                    destX, destY, destZ,
+                                    stImage->level,
+                                    destX, destY, destZ + stImage->face,
                                     /* src */
                                     strb->texture,
-                                    subsrc,
-                                    srcX, srcY, strb->surface->zslice,
-                                    /* size */
-                                    width, height);
+                                    strb->surface->u.tex.level,
+                                    &src_box);
          use_fallback = GL_FALSE;
       }
       else if (format_writemask &&
@@ -1628,12 +1629,16 @@ st_copy_texsubimage(struct gl_context *ctx,
                                            0)) {
          /* draw textured quad to do the copy */
          GLint srcY0, srcY1;
-         struct pipe_subresource subsrc;
+         struct pipe_surface surf_tmpl;
+         memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+         surf_tmpl.format = stImage->pt->format;
+         surf_tmpl.usage = PIPE_BIND_RENDER_TARGET;
+         surf_tmpl.u.tex.level = stImage->level;
+         surf_tmpl.u.tex.first_layer = stImage->face + destZ;
+         surf_tmpl.u.tex.last_layer = stImage->face + destZ;
 
-         dest_surface = screen->get_tex_surface(screen, stImage->pt,
-                                                stImage->face, stImage->level,
-                                                destZ,
-                                                PIPE_BIND_RENDER_TARGET);
+         dest_surface = pipe->create_surface(pipe, stImage->pt,
+                                             &surf_tmpl);
 
          if (do_flip) {
             srcY1 = strb->Base.Height - srcY - height;
@@ -1643,15 +1648,13 @@ st_copy_texsubimage(struct gl_context *ctx,
             srcY0 = srcY;
             srcY1 = srcY0 + height;
          }
-         subsrc.face = strb->surface->face;
-         subsrc.level = strb->surface->level;
 
          util_blit_pixels_writemask(st->blit,
                                     strb->texture,
-                                    subsrc,
+                                    strb->surface->u.tex.level,
                                     srcX, srcY0,
                                     srcX + width, srcY1,
-                                    strb->surface->zslice,
+                                    strb->surface->u.tex.first_layer,
                                     dest_surface,
                                     destX, destY,
                                     destX + width, destY + height,
@@ -1852,8 +1855,9 @@ st_finalize_texture(struct gl_context *ctx,
     * will match.
     */
    if (firstImage->pt &&
+       stObj->pt &&
        firstImage->pt != stObj->pt &&
-       firstImage->pt->last_level >= stObj->lastLevel) {
+       firstImage->pt->last_level >= stObj->pt->last_level) {
       pipe_resource_reference(&stObj->pt, firstImage->pt);
       pipe_sampler_view_reference(&stObj->sampler_view, NULL);
    }
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index d0dcdd4e29b..6ec9c699a26 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -170,6 +170,11 @@ struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
    struct gl_context *shareCtx = share ? share->ctx : NULL;
    struct dd_function_table funcs;
 
+   /* Sanity checks */
+   assert(MESA_SHADER_VERTEX == PIPE_SHADER_VERTEX);
+   assert(MESA_SHADER_FRAGMENT == PIPE_SHADER_FRAGMENT);
+   assert(MESA_SHADER_GEOMETRY == PIPE_SHADER_GEOMETRY);
+
    memset(&funcs, 0, sizeof(funcs));
    st_init_driver_functions(&funcs);
 
@@ -239,8 +244,8 @@ void st_destroy_context( struct st_context *st )
    pipe->set_index_buffer(pipe, NULL);
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, NULL);
-      pipe_resource_reference(&st->state.constants[PIPE_SHADER_VERTEX], NULL);
+      pipe->set_constant_buffer(pipe, i, 0, NULL);
+      pipe_resource_reference(&st->state.constants[i], NULL);
    }
 
    _mesa_delete_program_cache(st->ctx, st->pixel_xfer.cache);
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 132749130af..930b60ade2d 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -67,7 +67,7 @@ void st_init_limits(struct st_context *st)
 {
    struct pipe_screen *screen = st->pipe->screen;
    struct gl_constants *c = &st->ctx->Const;
-   unsigned i;
+   gl_shader_type sh;
 
    c->MaxTextureLevels
       = _min(screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS),
@@ -137,11 +137,12 @@ void st_init_limits(struct st_context *st)
    /* Quads always follow GL provoking rules. */
    c->QuadsFollowProvokingVertexConvention = GL_FALSE;
 
-   for(i = 0; i < MESA_SHADER_TYPES; ++i) {
-      struct gl_shader_compiler_options *options = &st->ctx->ShaderCompilerOptions[i];
+   for (sh = 0; sh < MESA_SHADER_TYPES; ++sh) {
+      struct gl_shader_compiler_options *options =
+         &st->ctx->ShaderCompilerOptions[sh];
       struct gl_program_constants *pc;
-      switch(i)
-      {
+
+      switch (sh) {
       case PIPE_SHADER_FRAGMENT:
          pc = &c->FragmentProgram;
          break;
@@ -156,36 +157,37 @@ void st_init_limits(struct st_context *st)
          continue;
       }
 
-      pc->MaxNativeInstructions    = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_INSTRUCTIONS);
-      pc->MaxNativeAluInstructions = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS);
-      pc->MaxNativeTexInstructions = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS);
-      pc->MaxNativeTexIndirections = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS);
-      pc->MaxNativeAttribs         = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_INPUTS);
-      pc->MaxNativeTemps           = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_TEMPS);
-      pc->MaxNativeAddressRegs     = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_ADDRS);
-      pc->MaxNativeParameters      = screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_CONSTS);
+      pc->MaxNativeInstructions    = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS);
+      pc->MaxNativeAluInstructions = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS);
+      pc->MaxNativeTexInstructions = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS);
+      pc->MaxNativeTexIndirections = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS);
+      pc->MaxNativeAttribs         = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS);
+      pc->MaxNativeTemps           = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEMPS);
+      pc->MaxNativeAddressRegs     = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_ADDRS);
+      pc->MaxNativeParameters      = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONSTS);
+      pc->MaxUniformComponents     = 4 * MIN2(pc->MaxNativeParameters, MAX_UNIFORMS);
 
       options->EmitNoNoise = TRUE;
 
       /* TODO: make these more fine-grained if anyone needs it */
-      options->EmitNoIfs = !screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
-      options->EmitNoFunctions = !screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
-      options->EmitNoLoops = !screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
-      options->EmitNoMainReturn = !screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
+      options->EmitNoIfs = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
+      options->EmitNoLoops = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
+      options->EmitNoFunctions = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES);
+      options->EmitNoMainReturn = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES);
 
-      options->EmitNoCont = !screen->get_shader_param(screen, i, PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED);
+      options->EmitNoCont = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED);
 
-      options->EmitNoIndirectInput = !screen->get_shader_param(screen, i,
+      options->EmitNoIndirectInput = !screen->get_shader_param(screen, sh,
                                         PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR);
-      options->EmitNoIndirectOutput = !screen->get_shader_param(screen, i,
+      options->EmitNoIndirectOutput = !screen->get_shader_param(screen, sh,
                                         PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR);
-      options->EmitNoIndirectTemp = !screen->get_shader_param(screen, i,
+      options->EmitNoIndirectTemp = !screen->get_shader_param(screen, sh,
                                         PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR);
-      options->EmitNoIndirectUniform = !screen->get_shader_param(screen, i,
+      options->EmitNoIndirectUniform = !screen->get_shader_param(screen, sh,
                                         PIPE_SHADER_CAP_INDIRECT_CONST_ADDR);
 
       if(options->EmitNoLoops)
-         options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, i, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536);
+         options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536);
    }
 
    /* PIPE_CAP_MAX_FS_INPUTS specifies the number of COLORn + GENERICn inputs
@@ -298,6 +300,8 @@ void st_init_extensions(struct st_context *st)
       ctx->Extensions.ARB_vertex_shader = GL_TRUE;
       ctx->Extensions.ARB_shader_objects = GL_TRUE;
       ctx->Extensions.ARB_shading_language_100 = GL_TRUE;
+      ctx->Extensions.ARB_explicit_attrib_location = GL_TRUE;
+      ctx->Extensions.EXT_separate_shader_objects = GL_TRUE;
    }
 
    if (screen->get_param(screen, PIPE_CAP_TEXTURE_MIRROR_REPEAT) > 0) {
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index 1fc4f40488f..c5f6008a222 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -79,11 +79,15 @@ st_render_mipmap(struct st_context *st,
    const uint face = _mesa_tex_target_to_face(target);
 
    assert(psv->texture == stObj->pt);
-   assert(target != GL_TEXTURE_3D); /* not done yet */
+#if 0
+   assert(target != GL_TEXTURE_3D); /* implemented but untested */
+#endif
 
    /* check if we can render in the texture's format */
-   if (!screen->is_format_supported(screen, psv->format, psv->texture->target, 0,
-                                    PIPE_BIND_RENDER_TARGET, 0)) {
+   /* XXX should probably kill this and always use util_gen_mipmap
+      since this implements a sw fallback as well */
+   if (!screen->is_format_supported(screen, psv->format, psv->texture->target,
+                                    0, PIPE_BIND_RENDER_TARGET, 0)) {
       return FALSE;
    }
 
@@ -161,12 +165,12 @@ fallback_generate_mipmap(struct gl_context *ctx, GLenum target,
    struct pipe_resource *pt = st_get_texobj_resource(texObj);
    const uint baseLevel = texObj->BaseLevel;
    const uint lastLevel = pt->last_level;
-   const uint face = _mesa_tex_target_to_face(target), zslice = 0;
+   const uint face = _mesa_tex_target_to_face(target);
    uint dstLevel;
    GLenum datatype;
    GLuint comps;
    GLboolean compressed;
-   
+
    if (ST_DEBUG & DEBUG_FALLBACK)
       debug_printf("%s: fallback processing\n", __FUNCTION__);
 
@@ -198,16 +202,15 @@ fallback_generate_mipmap(struct gl_context *ctx, GLenum target,
       ubyte *dstData;
       int srcStride, dstStride;
 
-      srcTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, face,
-						srcLevel, zslice,
-						PIPE_TRANSFER_READ, 0, 0,
-                                                srcWidth, srcHeight);
-						
+      srcTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, srcLevel,
+                                   face,
+                                   PIPE_TRANSFER_READ, 0, 0,
+                                   srcWidth, srcHeight);
 
-      dstTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, face,
-						dstLevel, zslice,
-						PIPE_TRANSFER_WRITE, 0, 0,
-						dstWidth, dstHeight);
+      dstTrans = pipe_get_transfer(st_context(ctx)->pipe, pt, dstLevel,
+                                   face,
+                                   PIPE_TRANSFER_WRITE, 0, 0,
+                                   dstWidth, dstHeight);
 
       srcData = (ubyte *) pipe_transfer_map(pipe, srcTrans);
       dstData = (ubyte *) pipe_transfer_map(pipe, dstTrans);
@@ -215,6 +218,8 @@ fallback_generate_mipmap(struct gl_context *ctx, GLenum target,
       srcStride = srcTrans->stride / util_format_get_blocksize(srcTrans->resource->format);
       dstStride = dstTrans->stride / util_format_get_blocksize(dstTrans->resource->format);
 
+     /* this cannot work correctly for 3d since it does
+        not respect layerStride. */
       if (compressed) {
          const enum pipe_format format = pt->format;
          const uint bw = util_format_get_blockwidth(format);
@@ -365,6 +370,12 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
 
       pt = stObj->pt;
    }
+   else {
+      /* Make sure that the base texture image data is present in the
+       * texture buffer.
+       */
+      st_finalize_texture(ctx, st->pipe, texObj);
+   }
 
    assert(pt->last_level >= lastLevel);
 
@@ -372,6 +383,8 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
     * use the software fallback.
     */
    if (!st_render_mipmap(st, target, stObj, baseLevel, lastLevel)) {
+      /* since the util code actually also has a fallback, should
+         probably make it never fail and kill this */
       fallback_generate_mipmap(ctx, target, texObj);
    }
 
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 05733e818ea..0307b48978b 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -34,6 +34,7 @@
 #include "util/u_pointer.h"
 #include "util/u_inlines.h"
 #include "util/u_atomic.h"
+#include "util/u_surface.h"
 
 #include "main/mtypes.h"
 #include "main/context.h"
@@ -142,7 +143,7 @@ buffer_index_to_attachment(gl_buffer_index index)
 static void
 st_framebuffer_validate(struct st_framebuffer *stfb, struct st_context *st)
 {
-   struct pipe_screen *screen = st->pipe->screen;
+   struct pipe_context *pipe = st->pipe;
    struct pipe_resource *textures[ST_ATTACHMENT_COUNT];
    uint width, height;
    unsigned i;
@@ -160,7 +161,7 @@ st_framebuffer_validate(struct st_framebuffer *stfb, struct st_context *st)
 
    for (i = 0; i < stfb->num_statts; i++) {
       struct st_renderbuffer *strb;
-      struct pipe_surface *ps;
+      struct pipe_surface *ps, surf_tmpl;
       gl_buffer_index idx;
 
       if (!textures[i])
@@ -179,8 +180,10 @@ st_framebuffer_validate(struct st_framebuffer *stfb, struct st_context *st)
          continue;
       }
 
-      ps = screen->get_tex_surface(screen, textures[i], 0, 0, 0,
-				   PIPE_BIND_RENDER_TARGET);
+      memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+      u_surface_default_template(&surf_tmpl, textures[i],
+                                 PIPE_BIND_RENDER_TARGET);
+      ps = pipe->create_surface(pipe, textures[i], &surf_tmpl);
       if (ps) {
          pipe_surface_reference(&strb->surface, ps);
          pipe_resource_reference(&strb->texture, ps->texture);
@@ -813,6 +816,7 @@ st_manager_flush_frontbuffer(struct st_context *st)
 
 /**
  * Return the surface of an EGLImage.
+ * FIXME: I think this should operate on resources, not surfaces
  */
 struct pipe_surface *
 st_manager_get_egl_image_surface(struct st_context *st,
@@ -821,7 +825,7 @@ st_manager_get_egl_image_surface(struct st_context *st,
    struct st_manager *smapi =
       (struct st_manager *) st->iface.st_context_private;
    struct st_egl_image stimg;
-   struct pipe_surface *ps;
+   struct pipe_surface *ps, surf_tmpl;
 
    if (!smapi || !smapi->get_egl_image)
       return NULL;
@@ -830,8 +834,13 @@ st_manager_get_egl_image_surface(struct st_context *st,
    if (!smapi->get_egl_image(smapi, eglimg, &stimg))
       return NULL;
 
-   ps = smapi->screen->get_tex_surface(smapi->screen,
-         stimg.texture, stimg.face, stimg.level, stimg.zslice, usage);
+   memset(&surf_tmpl, 0, sizeof(surf_tmpl));
+   surf_tmpl.format = stimg.texture->format;
+   surf_tmpl.usage = usage;
+   surf_tmpl.u.tex.level = stimg.level;
+   surf_tmpl.u.tex.first_layer = stimg.layer;
+   surf_tmpl.u.tex.last_layer = stimg.layer;
+   ps = st->pipe->create_surface(st->pipe, stimg.texture, &surf_tmpl);
    pipe_resource_reference(&stimg.texture, NULL);
 
    return ps;
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index c5c239b2c95..f848462310e 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -760,10 +760,13 @@ emit_adjusted_wpos( struct st_translate *t,
 
 /**
  * Emit the TGSI instructions for inverting the WPOS y coordinate.
+ * This code is unavoidable because it also depends on whether
+ * a FBO is bound (STATE_FB_WPOS_Y_TRANSFORM).
  */
 static void
-emit_inverted_wpos( struct st_translate *t,
-                    const struct gl_program *program )
+emit_wpos_inversion( struct st_translate *t,
+                     const struct gl_program *program,
+                     boolean invert)
 {
    struct ureg_program *ureg = t->ureg;
 
@@ -771,17 +774,17 @@ emit_inverted_wpos( struct st_translate *t,
     * Need to replace instances of INPUT[WPOS] with temp T
     * where T = INPUT[WPOS] by y is inverted.
     */
-   static const gl_state_index winSizeState[STATE_LENGTH]
-      = { STATE_INTERNAL, STATE_FB_SIZE, 0, 0, 0 };
+   static const gl_state_index wposTransformState[STATE_LENGTH]
+      = { STATE_INTERNAL, STATE_FB_WPOS_Y_TRANSFORM, 0, 0, 0 };
    
    /* XXX: note we are modifying the incoming shader here!  Need to
     * do this before emitting the constant decls below, or this
     * will be missed:
     */
-   unsigned winHeightConst = _mesa_add_state_reference(program->Parameters,
-                                                       winSizeState);
+   unsigned wposTransConst = _mesa_add_state_reference(program->Parameters,
+                                                       wposTransformState);
 
-   struct ureg_src winsize = ureg_DECL_constant( ureg, winHeightConst );
+   struct ureg_src wpostrans = ureg_DECL_constant( ureg, wposTransConst );
    struct ureg_dst wpos_temp;
    struct ureg_src wpos_input = t->inputs[t->inputMapping[FRAG_ATTRIB_WPOS]];
 
@@ -794,12 +797,23 @@ emit_inverted_wpos( struct st_translate *t,
       ureg_MOV( ureg, wpos_temp, wpos_input );
    }
 
-   /* SUB wpos_temp.y, winsize_const, wpos_input
-    */
-   ureg_SUB( ureg,
-             ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
-             winsize,
-             wpos_input);
+   if (invert) {
+      /* MAD wpos_temp.y, wpos_input, wpostrans.xxxx, wpostrans.yyyy
+       */
+      ureg_MAD( ureg,
+                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
+                wpos_input,
+                ureg_scalar(wpostrans, 0),
+                ureg_scalar(wpostrans, 1));
+   } else {
+      /* MAD wpos_temp.y, wpos_input, wpostrans.zzzz, wpostrans.wwww
+       */
+      ureg_MAD( ureg,
+                ureg_writemask(wpos_temp, TGSI_WRITEMASK_Y ),
+                wpos_input,
+                ureg_scalar(wpostrans, 2),
+                ureg_scalar(wpostrans, 3));
+   }
 
    /* Use wpos_temp as position input from here on:
     */
@@ -861,8 +875,7 @@ emit_wpos(struct st_context *st,
 
    /* we invert after adjustment so that we avoid the MOV to temporary,
     * and reuse the adjustment ADD instead */
-   if (invert)
-      emit_inverted_wpos(t, program);
+   emit_wpos_inversion(t, program, invert);
 }
 
 
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 76799287fe1..aae2913c202 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -216,6 +216,8 @@ st_translate_vertex_program(struct st_context *st,
       return NULL;
    }
 
+   vpv->key = *key;
+
    vpv->num_inputs = stvp->num_inputs;
    num_outputs = stvp->num_outputs;
    if (key->passthrough_edgeflags) {
diff --git a/src/mesa/state_tracker/st_texture.c b/src/mesa/state_tracker/st_texture.c
index c6cf2ba061b..155ea39f18c 100644
--- a/src/mesa/state_tracker/st_texture.c
+++ b/src/mesa/state_tracker/st_texture.c
@@ -84,6 +84,7 @@ st_texture_create(struct st_context *st,
    pt.width0 = width0;
    pt.height0 = height0;
    pt.depth0 = depth0;
+   pt.array_size = (target == PIPE_TEXTURE_CUBE ? 6 : 1);
    pt.usage = PIPE_USAGE_DEFAULT;
    pt.bind = bind;
    pt.flags = 0;
@@ -136,7 +137,7 @@ st_texture_match_image(const struct pipe_resource *pt,
  */
 GLubyte *
 st_texture_image_map(struct st_context *st, struct st_texture_image *stImage,
-		     GLuint zoffset, enum pipe_transfer_usage usage,
+                     GLuint zoffset, enum pipe_transfer_usage usage,
                      GLuint x, GLuint y, GLuint w, GLuint h)
 {
    struct pipe_context *pipe = st->pipe;
@@ -144,9 +145,9 @@ st_texture_image_map(struct st_context *st, struct st_texture_image *stImage,
 
    DBG("%s \n", __FUNCTION__);
 
-   stImage->transfer = pipe_get_transfer(st->pipe, pt, stImage->face,
-						    stImage->level, zoffset,
-						    usage, x, y, w, h);
+   stImage->transfer = pipe_get_transfer(st->pipe, pt, stImage->level,
+                                         stImage->face + zoffset,
+                                         usage, x, y, w, h);
 
    if (stImage->transfer)
       return pipe_transfer_map(pipe, stImage->transfer);
@@ -219,10 +220,10 @@ st_texture_image_data(struct st_context *st,
    DBG("%s\n", __FUNCTION__);
 
    for (i = 0; i < depth; i++) {
-      dst_transfer = pipe_get_transfer(st->pipe, dst, face, level, i,
-						  PIPE_TRANSFER_WRITE, 0, 0,
-						  u_minify(dst->width0, level),
-                                                  u_minify(dst->height0, level));
+      dst_transfer = pipe_get_transfer(st->pipe, dst, level, face + i,
+                                       PIPE_TRANSFER_WRITE, 0, 0,
+                                       u_minify(dst->width0, level),
+                                       u_minify(dst->height0, level));
 
       st_surface_data(pipe, dst_transfer,
 		      0, 0,                             /* dstx, dsty */
@@ -230,7 +231,7 @@ st_texture_image_data(struct st_context *st,
 		      src_row_stride,
 		      0, 0,                             /* source x, y */
 		      u_minify(dst->width0, level),
-                      u_minify(dst->height0, level));      /* width, height */
+                      u_minify(dst->height0, level));    /* width, height */
 
       pipe->transfer_destroy(pipe, dst_transfer);
 
@@ -245,14 +246,10 @@ st_texture_image_data(struct st_context *st,
 static void
 print_center_pixel(struct pipe_context *pipe, struct pipe_resource *src)
 {
-   struct pipe_subresource rect;
    struct pipe_transfer *xfer;
    struct pipe_box region;
    ubyte *map;
 
-   rect.face = 0;
-   rect.level = 0;
-
    region.x = src->width0 / 2;
    region.y = src->height0 / 2;
    region.z = 0;
@@ -260,7 +257,7 @@ print_center_pixel(struct pipe_context *pipe, struct pipe_resource *src)
    region.height = 1;
    region.depth = 1;
 
-   xfer = pipe->get_transfer(pipe, src, rect, PIPE_TRANSFER_READ, &region);
+   xfer = pipe->get_transfer(pipe, src, 0, PIPE_TRANSFER_READ, &region);
    map = pipe->transfer_map(pipe, xfer);
 
    printf("center pixel: %d %d %d %d\n", map[0], map[1], map[2], map[3]);
@@ -282,22 +279,26 @@ st_texture_image_copy(struct pipe_context *pipe,
                       struct pipe_resource *src, GLuint srcLevel,
                       GLuint face)
 {
-   GLuint width = u_minify(dst->width0, dstLevel); 
-   GLuint height = u_minify(dst->height0, dstLevel); 
-   GLuint depth = u_minify(dst->depth0, dstLevel); 
-   struct pipe_subresource dstsub, srcsub;
+   GLuint width = u_minify(dst->width0, dstLevel);
+   GLuint height = u_minify(dst->height0, dstLevel);
+   GLuint depth = u_minify(dst->depth0, dstLevel);
+   struct pipe_box src_box;
    GLuint i;
 
    assert(u_minify(src->width0, srcLevel) == width);
    assert(u_minify(src->height0, srcLevel) == height);
    assert(u_minify(src->depth0, srcLevel) == depth);
 
-   dstsub.face = face;
-   dstsub.level = dstLevel;
-   srcsub.face = face;
-   srcsub.level = srcLevel;
+   src_box.x = 0;
+   src_box.y = 0;
+   src_box.width = width;
+   src_box.height = height;
+   src_box.depth = 1;
    /* Loop over 3D image slices */
-   for (i = 0; i < depth; i++) {
+   /* could (and probably should) use "true" 3d box here -
+      but drivers can't quite handle it yet */
+   for (i = face; i < face + depth; i++) {
+      src_box.z = i;
 
       if (0)  {
          print_center_pixel(pipe, src);
@@ -305,12 +306,11 @@ st_texture_image_copy(struct pipe_context *pipe,
 
       pipe->resource_copy_region(pipe,
                                  dst,
-                                 dstsub,
+                                 dstLevel,
                                  0, 0, i,/* destX, Y, Z */
                                  src,
-                                 srcsub,
-                                 0, 0, i,/* srcX, Y, Z */
-                                 width, height);
+                                 srcLevel,
+                                 &src_box);
    }
 }
 
diff --git a/src/mesa/swrast/s_blend.c b/src/mesa/swrast/s_blend.c
index 1a550c445d3..d61baba0f33 100644
--- a/src/mesa/swrast/s_blend.c
+++ b/src/mesa/swrast/s_blend.c
@@ -819,7 +819,16 @@ static void
 blend_general(struct gl_context *ctx, GLuint n, const GLubyte mask[],
               void *src, const void *dst, GLenum chanType)
 {
-   GLfloat rgbaF[MAX_WIDTH][4], destF[MAX_WIDTH][4];
+   GLfloat (*rgbaF)[4], (*destF)[4];
+
+   rgbaF = (GLfloat (*)[4]) malloc(4 * n * sizeof(GLfloat));
+   destF = (GLfloat (*)[4]) malloc(4 * n * sizeof(GLfloat));
+   if (!rgbaF || !destF) {
+      free(rgbaF);
+      free(destF);
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "blending");
+      return;
+   }
 
    if (chanType == GL_UNSIGNED_BYTE) {
       GLubyte (*rgba)[4] = (GLubyte (*)[4]) src;
@@ -883,6 +892,9 @@ blend_general(struct gl_context *ctx, GLuint n, const GLubyte mask[],
       blend_general_float(ctx, n, mask, (GLfloat (*)[4]) src,
                           (GLfloat (*)[4]) dst, chanType);
    }
+
+   free(rgbaF);
+   free(destF);
 }
 
 
diff --git a/src/mesa/swrast/s_drawpix.c b/src/mesa/swrast/s_drawpix.c
index 4e9b5307cc7..4d0666898b4 100644
--- a/src/mesa/swrast/s_drawpix.c
+++ b/src/mesa/swrast/s_drawpix.c
@@ -720,13 +720,16 @@ _swrast_DrawPixels( struct gl_context *ctx,
    if (swrast->NewState)
       _swrast_validate_derived( ctx );
 
-    pixels = _mesa_map_pbo_source(ctx, unpack, pixels);
-    if (!pixels) {
-       swrast_render_finish(ctx);
-       _mesa_set_vp_override(ctx, save_vp_override);
-       return;
-    }
+   pixels = _mesa_map_pbo_source(ctx, unpack, pixels);
+   if (!pixels) {
+      swrast_render_finish(ctx);
+      _mesa_set_vp_override(ctx, save_vp_override);
+      return;
+   }
 
+   /*
+    * By time we get here, all error checking should have been done.
+    */
    switch (format) {
    case GL_STENCIL_INDEX:
       draw_stencil_pixels( ctx, x, y, width, height, type, unpack, pixels );
@@ -734,27 +737,12 @@ _swrast_DrawPixels( struct gl_context *ctx,
    case GL_DEPTH_COMPONENT:
       draw_depth_pixels( ctx, x, y, width, height, type, unpack, pixels );
       break;
-   case GL_COLOR_INDEX:
-   case GL_RED:
-   case GL_GREEN:
-   case GL_BLUE:
-   case GL_ALPHA:
-   case GL_LUMINANCE:
-   case GL_LUMINANCE_ALPHA:
-   case GL_RGB:
-   case GL_BGR:
-   case GL_RGBA:
-   case GL_BGRA:
-   case GL_ABGR_EXT:
-      draw_rgba_pixels(ctx, x, y, width, height, format, type, unpack, pixels);
-      break;
    case GL_DEPTH_STENCIL_EXT:
-      draw_depth_stencil_pixels(ctx, x, y, width, height,
-                                type, unpack, pixels);
+      draw_depth_stencil_pixels(ctx, x, y, width, height, type, unpack, pixels);
       break;
    default:
-      _mesa_problem(ctx, "unexpected format 0x%x in _swrast_DrawPixels", format);
-      /* don't return yet, clean-up */
+      /* all other formats should be color formats */
+      draw_rgba_pixels(ctx, x, y, width, height, format, type, unpack, pixels);
    }
 
    swrast_render_finish(ctx);
diff --git a/src/mesa/swrast/s_readpix.c b/src/mesa/swrast/s_readpix.c
index 5e6356c0d54..9fe0752a37f 100644
--- a/src/mesa/swrast/s_readpix.c
+++ b/src/mesa/swrast/s_readpix.c
@@ -476,49 +476,33 @@ _swrast_ReadPixels( struct gl_context *ctx,
       _swrast_validate_derived( ctx );
 
    /* Do all needed clipping here, so that we can forget about it later */
-   if (!_mesa_clip_readpixels(ctx, &x, &y, &width, &height, &clippedPacking)) {
-      /* The ReadPixels region is totally outside the window bounds */
-      swrast_render_finish(ctx);
-      return;
-   }
-
-   pixels = _mesa_map_pbo_dest(ctx, &clippedPacking, pixels);
-   if (!pixels)
-      return;
-  
-   switch (format) {
-      case GL_STENCIL_INDEX:
-	 read_stencil_pixels(ctx, x, y, width, height, type, pixels,
+   if (_mesa_clip_readpixels(ctx, &x, &y, &width, &height, &clippedPacking)) {
+
+      pixels = _mesa_map_pbo_dest(ctx, &clippedPacking, pixels);
+
+      if (pixels) {
+         switch (format) {
+         case GL_STENCIL_INDEX:
+            read_stencil_pixels(ctx, x, y, width, height, type, pixels,
+                                &clippedPacking);
+            break;
+         case GL_DEPTH_COMPONENT:
+            read_depth_pixels(ctx, x, y, width, height, type, pixels,
+                              &clippedPacking);
+            break;
+         case GL_DEPTH_STENCIL_EXT:
+            read_depth_stencil_pixels(ctx, x, y, width, height, type, pixels,
+                                      &clippedPacking);
+            break;
+         default:
+            /* all other formats should be color formats */
+            read_rgba_pixels(ctx, x, y, width, height, format, type, pixels,
                              &clippedPacking);
-         break;
-      case GL_DEPTH_COMPONENT:
-	 read_depth_pixels(ctx, x, y, width, height, type, pixels,
-                           &clippedPacking);
-	 break;
-      case GL_RED:
-      case GL_GREEN:
-      case GL_BLUE:
-      case GL_ALPHA:
-      case GL_RGB:
-      case GL_LUMINANCE:
-      case GL_LUMINANCE_ALPHA:
-      case GL_RGBA:
-      case GL_BGR:
-      case GL_BGRA:
-      case GL_ABGR_EXT:
-         read_rgba_pixels(ctx, x, y, width, height,
-                          format, type, pixels, &clippedPacking);
-	 break;
-      case GL_DEPTH_STENCIL_EXT:
-         read_depth_stencil_pixels(ctx, x, y, width, height,
-                                   type, pixels, &clippedPacking);
-         break;
-      default:
-	 _mesa_problem(ctx, "unexpected format 0x%x in _swrast_ReadPixels", format);
-         /* don't return yet, clean-up */
+         }
+
+         _mesa_unmap_pbo_dest(ctx, &clippedPacking);
+      }
    }
 
    swrast_render_finish(ctx);
-
-   _mesa_unmap_pbo_dest(ctx, &clippedPacking);
 }
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 1836d074aee..99c44413fbe 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -86,10 +86,28 @@ texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
    const GLfloat scaleA = (GLfloat) (1 << combine->ScaleShiftA);
    const GLuint numArgsRGB = combine->_NumArgsRGB;
    const GLuint numArgsA = combine->_NumArgsA;
-   GLfloat ccolor[MAX_COMBINER_TERMS][MAX_WIDTH][4]; /* temp color buffers */
-   GLfloat rgba[MAX_WIDTH][4];
+   float4_array ccolor[4], rgba;
    GLuint i, term;
 
+   /* alloc temp pixel buffers */
+   rgba = (float4_array) malloc(4 * n * sizeof(GLfloat));
+   if (!rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_combine");
+      return;
+   }
+
+   for (i = 0; i < numArgsRGB || i < numArgsA; i++) {
+      ccolor[i] = (float4_array) malloc(4 * n * sizeof(GLfloat));
+      if (!ccolor[i]) {
+         while (i) {
+            free(ccolor[i]);
+            i--;
+         }
+         _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_combine");
+         return;
+      }
+   }
+
    for (i = 0; i < n; i++) {
       rgba[i][RCOMP] = CHAN_TO_FLOAT(rgbaChan[i][RCOMP]);
       rgba[i][GCOMP] = CHAN_TO_FLOAT(rgbaChan[i][GCOMP]);
@@ -163,7 +181,7 @@ texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
                const GLuint srcUnit = srcRGB - GL_TEXTURE0;
                ASSERT(srcUnit < ctx->Const.MaxTextureUnits);
                if (!ctx->Texture.Unit[srcUnit]._ReallyEnabled)
-                  return;
+                  goto end;
                argRGB[term] = get_texel_array(swrast, srcUnit);
             }
       }
@@ -253,7 +271,7 @@ texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
                const GLuint srcUnit = srcA - GL_TEXTURE0;
                ASSERT(srcUnit < ctx->Const.MaxTextureUnits);
                if (!ctx->Texture.Unit[srcUnit]._ReallyEnabled)
-                  return;
+                  goto end;
                argA[term] = get_texel_array(swrast, srcUnit);
             }
       }
@@ -411,7 +429,7 @@ texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
             rgba[i][BCOMP] = 0.0;
             rgba[i][ACOMP] = 1.0;
 	 }
-         return; /* no alpha processing */
+         goto end; /* no alpha processing */
       default:
          _mesa_problem(ctx, "invalid combine mode");
       }
@@ -519,6 +537,12 @@ texture_combine( struct gl_context *ctx, GLuint unit, GLuint n,
       UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][BCOMP], rgba[i][BCOMP]);
       UNCLAMPED_FLOAT_TO_CHAN(rgbaChan[i][ACOMP], rgba[i][ACOMP]);
    }
+
+end:
+   for (i = 0; i < numArgsRGB || i < numArgsA; i++) {
+      free(ccolor[i]);
+   }
+   free(rgba);
 }
 
 
@@ -559,9 +583,16 @@ void
 _swrast_texture_span( struct gl_context *ctx, SWspan *span )
 {
    SWcontext *swrast = SWRAST_CONTEXT(ctx);
-   GLfloat primary_rgba[MAX_WIDTH][4];
+   float4_array primary_rgba;
    GLuint unit;
 
+   primary_rgba = (float4_array) malloc(span->end * 4 * sizeof(GLfloat));
+
+   if (!primary_rgba) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "texture_span");
+      return;
+   }
+
    ASSERT(span->end <= MAX_WIDTH);
 
    /*
@@ -706,4 +737,6 @@ _swrast_texture_span( struct gl_context *ctx, SWspan *span )
                           span->array->rgba );
       }
    }
+
+   free(primary_rgba);
 }
diff --git a/src/mesa/swrast/s_texfilter.c b/src/mesa/swrast/s_texfilter.c
index ec281776d0d..539d878ddb4 100644
--- a/src/mesa/swrast/s_texfilter.c
+++ b/src/mesa/swrast/s_texfilter.c
@@ -1371,6 +1371,7 @@ opt_sample_rgb_2d(struct gl_context *ctx,
       rgba[k][RCOMP] = UBYTE_TO_FLOAT(texel[2]);
       rgba[k][GCOMP] = UBYTE_TO_FLOAT(texel[1]);
       rgba[k][BCOMP] = UBYTE_TO_FLOAT(texel[0]);
+      rgba[k][ACOMP] = 1.0F;
    }
 }
 
diff --git a/src/mesa/vbo/vbo.h b/src/mesa/vbo/vbo.h
index 7b8da8eb843..79f76655354 100644
--- a/src/mesa/vbo/vbo.h
+++ b/src/mesa/vbo/vbo.h
@@ -40,7 +40,8 @@ struct _mesa_prim {
    GLuint begin:1;
    GLuint end:1;
    GLuint weak:1;
-   GLuint pad:20;
+   GLuint no_current_update:1;
+   GLuint pad:19;
 
    GLuint start;
    GLuint count;
@@ -128,42 +129,42 @@ void vbo_set_draw_func(struct gl_context *ctx, vbo_draw_func func);
 
 
 void GLAPIENTRY
-_vbo_Color4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a);
+_es_Color4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a);
 
 void GLAPIENTRY
-_vbo_Normal3f(GLfloat x, GLfloat y, GLfloat z);
+_es_Normal3f(GLfloat x, GLfloat y, GLfloat z);
 
 void GLAPIENTRY
-_vbo_MultiTexCoord4f(GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q);
+_es_MultiTexCoord4f(GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q);
 
 void GLAPIENTRY
-_vbo_Materialfv(GLenum face, GLenum pname, const GLfloat *params);
+_es_Materialfv(GLenum face, GLenum pname, const GLfloat *params);
 
 void GLAPIENTRY
-_vbo_Materialf(GLenum face, GLenum pname, GLfloat param);
+_es_Materialf(GLenum face, GLenum pname, GLfloat param);
 
 void GLAPIENTRY
-_vbo_VertexAttrib4f(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+_es_VertexAttrib4f(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
 
 void GLAPIENTRY
-_vbo_VertexAttrib1f(GLuint indx, GLfloat x);
+_es_VertexAttrib1f(GLuint indx, GLfloat x);
 
 void GLAPIENTRY
-_vbo_VertexAttrib1fv(GLuint indx, const GLfloat* values);
+_es_VertexAttrib1fv(GLuint indx, const GLfloat* values);
 
 void GLAPIENTRY
-_vbo_VertexAttrib2f(GLuint indx, GLfloat x, GLfloat y);
+_es_VertexAttrib2f(GLuint indx, GLfloat x, GLfloat y);
 
 void GLAPIENTRY
-_vbo_VertexAttrib2fv(GLuint indx, const GLfloat* values);
+_es_VertexAttrib2fv(GLuint indx, const GLfloat* values);
 
 void GLAPIENTRY
-_vbo_VertexAttrib3f(GLuint indx, GLfloat x, GLfloat y, GLfloat z);
+_es_VertexAttrib3f(GLuint indx, GLfloat x, GLfloat y, GLfloat z);
 
 void GLAPIENTRY
-_vbo_VertexAttrib3fv(GLuint indx, const GLfloat* values);
+_es_VertexAttrib3fv(GLuint indx, const GLfloat* values);
 
 void GLAPIENTRY
-_vbo_VertexAttrib4fv(GLuint indx, const GLfloat* values);
+_es_VertexAttrib4fv(GLuint indx, const GLfloat* values);
 
 #endif
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index 9b2d59f9e4c..fb981ccc3bc 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -369,8 +369,6 @@ static void vbo_exec_fixup_vertex( struct gl_context *ctx,
 }
 
 
-#if FEATURE_beginend
-
 /* 
  */
 #define ATTR( A, N, V0, V1, V2, V3 )				\
@@ -411,7 +409,7 @@ do {								\
 #include "vbo_attrib_tmp.h"
 
 
-
+#if FEATURE_beginend
 
 
 #if FEATURE_evaluators
@@ -706,30 +704,6 @@ static void vbo_exec_vtxfmt_init( struct vbo_exec_context *exec )
 #else /* FEATURE_beginend */
 
 
-#define ATTR( A, N, V0, V1, V2, V3 )				\
-do {								\
-   struct vbo_exec_context *exec = &vbo_context(ctx)->exec;	\
-								\
-   /* FLUSH_UPDATE_CURRENT needs to be set manually */		\
-   exec->ctx->Driver.NeedFlush |= FLUSH_UPDATE_CURRENT;		\
-								\
-   if (exec->vtx.active_sz[A] != N)				\
-      vbo_exec_fixup_vertex(ctx, A, N);				\
-								\
-   {								\
-      GLfloat *dest = exec->vtx.attrptr[A];			\
-      if (N>0) dest[0] = V0;					\
-      if (N>1) dest[1] = V1;					\
-      if (N>2) dest[2] = V2;					\
-      if (N>3) dest[3] = V3;					\
-   }								\
-} while (0)
-
-#define ERROR() _mesa_error( ctx, GL_INVALID_ENUM, __FUNCTION__ )
-#define TAG(x) vbo_##x
-
-#include "vbo_attrib_tmp.h"
-
 static void vbo_exec_vtxfmt_init( struct vbo_exec_context *exec )
 {
    /* silence warnings */
@@ -998,35 +972,35 @@ static void reset_attrfv( struct vbo_exec_context *exec )
       
 
 void GLAPIENTRY
-_vbo_Color4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a)
+_es_Color4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a)
 {
    vbo_Color4f(r, g, b, a);
 }
 
 
 void GLAPIENTRY
-_vbo_Normal3f(GLfloat x, GLfloat y, GLfloat z)
+_es_Normal3f(GLfloat x, GLfloat y, GLfloat z)
 {
    vbo_Normal3f(x, y, z);
 }
 
 
 void GLAPIENTRY
-_vbo_MultiTexCoord4f(GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q)
+_es_MultiTexCoord4f(GLenum target, GLfloat s, GLfloat t, GLfloat r, GLfloat q)
 {
    vbo_MultiTexCoord4f(target, s, t, r, q);
 }
 
 
 void GLAPIENTRY
-_vbo_Materialfv(GLenum face, GLenum pname, const GLfloat *params)
+_es_Materialfv(GLenum face, GLenum pname, const GLfloat *params)
 {
    vbo_Materialfv(face, pname, params);
 }
 
 
 void GLAPIENTRY
-_vbo_Materialf(GLenum face, GLenum pname, GLfloat param)
+_es_Materialf(GLenum face, GLenum pname, GLfloat param)
 {
    GLfloat p[4];
    p[0] = param;
@@ -1035,57 +1009,71 @@ _vbo_Materialf(GLenum face, GLenum pname, GLfloat param)
 }
 
 
+/**
+ * A special version of glVertexAttrib4f that does not treat index 0 as
+ * VBO_ATTRIB_POS.
+ */
+static void
+VertexAttrib4f_nopos(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   if (index < MAX_VERTEX_GENERIC_ATTRIBS)
+      ATTR(VBO_ATTRIB_GENERIC0 + index, 4, x, y, z, w);
+   else
+      ERROR();
+}
+
 void GLAPIENTRY
-_vbo_VertexAttrib4f(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w)
+_es_VertexAttrib4f(GLuint index, GLfloat x, GLfloat y, GLfloat z, GLfloat w)
 {
-   vbo_VertexAttrib4fARB(index, x, y, z, w);
+   VertexAttrib4f_nopos(index, x, y, z, w);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib1f(GLuint indx, GLfloat x)
+_es_VertexAttrib1f(GLuint indx, GLfloat x)
 {
-   vbo_VertexAttrib1fARB(indx, x);
+   VertexAttrib4f_nopos(indx, x, 0.0f, 0.0f, 1.0f);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib1fv(GLuint indx, const GLfloat* values)
+_es_VertexAttrib1fv(GLuint indx, const GLfloat* values)
 {
-   vbo_VertexAttrib1fvARB(indx, values);
+   VertexAttrib4f_nopos(indx, values[0], 0.0f, 0.0f, 1.0f);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib2f(GLuint indx, GLfloat x, GLfloat y)
+_es_VertexAttrib2f(GLuint indx, GLfloat x, GLfloat y)
 {
-   vbo_VertexAttrib2fARB(indx, x, y);
+   VertexAttrib4f_nopos(indx, x, y, 0.0f, 1.0f);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib2fv(GLuint indx, const GLfloat* values)
+_es_VertexAttrib2fv(GLuint indx, const GLfloat* values)
 {
-   vbo_VertexAttrib2fvARB(indx, values);
+   VertexAttrib4f_nopos(indx, values[0], values[1], 0.0f, 1.0f);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib3f(GLuint indx, GLfloat x, GLfloat y, GLfloat z)
+_es_VertexAttrib3f(GLuint indx, GLfloat x, GLfloat y, GLfloat z)
 {
-   vbo_VertexAttrib3fARB(indx, x, y, z);
+   VertexAttrib4f_nopos(indx, x, y, z, 1.0f);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib3fv(GLuint indx, const GLfloat* values)
+_es_VertexAttrib3fv(GLuint indx, const GLfloat* values)
 {
-   vbo_VertexAttrib3fvARB(indx, values);
+   VertexAttrib4f_nopos(indx, values[0], values[1], values[2], 1.0f);
 }
 
 
 void GLAPIENTRY
-_vbo_VertexAttrib4fv(GLuint indx, const GLfloat* values)
+_es_VertexAttrib4fv(GLuint indx, const GLfloat* values)
 {
-   vbo_VertexAttrib4fvARB(indx, values);
+   VertexAttrib4f_nopos(indx, values[0], values[1], values[2], values[3]);
 }
diff --git a/src/mesa/vbo/vbo_save.h b/src/mesa/vbo/vbo_save.h
index f5a407ced14..23cbea2afc1 100644
--- a/src/mesa/vbo/vbo_save.h
+++ b/src/mesa/vbo/vbo_save.h
@@ -96,7 +96,9 @@ struct vbo_save_vertex_list {
  */
 #define VBO_SAVE_BUFFER_SIZE (8*1024) /* dwords */
 #define VBO_SAVE_PRIM_SIZE   128
-#define VBO_SAVE_PRIM_WEAK 0x40
+#define VBO_SAVE_PRIM_MODE_MASK         0x3f
+#define VBO_SAVE_PRIM_WEAK              0x40
+#define VBO_SAVE_PRIM_NO_CURRENT_UPDATE 0x80
 
 #define VBO_SAVE_FALLBACK    0x10000000
 
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index 817d478e2ac..bf5ceda78f8 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -294,26 +294,30 @@ static void _save_compile_vertex_list( struct gl_context *ctx )
    node->vertex_store->refcount++;
    node->prim_store->refcount++;
 
-
-   node->current_size = node->vertex_size - node->attrsz[0];
-   node->current_data = NULL;
-
-   if (node->current_size) {
-      /* If the malloc fails, we just pull the data out of the VBO
-       * later instead.
-       */
-      node->current_data = MALLOC( node->current_size * sizeof(GLfloat) );
-      if (node->current_data) {
-         const char *buffer = (const char *)save->vertex_store->buffer;
-         unsigned attr_offset = node->attrsz[0] * sizeof(GLfloat);
-         unsigned vertex_offset = 0;
-
-         if (node->count)
-            vertex_offset = (node->count-1) * node->vertex_size * sizeof(GLfloat);
-
-         memcpy( node->current_data,
-                 buffer + node->buffer_offset + vertex_offset + attr_offset,
-                 node->current_size * sizeof(GLfloat) );
+   if (node->prim[0].no_current_update) {
+      node->current_size = 0;
+      node->current_data = NULL;
+   } else {
+      node->current_size = node->vertex_size - node->attrsz[0];
+      node->current_data = NULL;
+     
+      if (node->current_size) {
+         /* If the malloc fails, we just pull the data out of the VBO
+          * later instead.
+          */
+         node->current_data = MALLOC( node->current_size * sizeof(GLfloat) );
+         if (node->current_data) {
+            const char *buffer = (const char *)save->vertex_store->buffer;
+            unsigned attr_offset = node->attrsz[0] * sizeof(GLfloat);
+            unsigned vertex_offset = 0;
+            
+            if (node->count)
+               vertex_offset = (node->count-1) * node->vertex_size * sizeof(GLfloat);
+         
+            memcpy( node->current_data,
+                    buffer + node->buffer_offset + vertex_offset + attr_offset,
+                    node->current_size * sizeof(GLfloat) );
+         }
       }
    }
 
@@ -397,6 +401,7 @@ static void _save_wrap_buffers( struct gl_context *ctx )
    GLint i = save->prim_count - 1;
    GLenum mode;
    GLboolean weak;
+   GLboolean no_current_update;
 
    assert(i < (GLint) save->prim_max);
    assert(i >= 0);
@@ -407,6 +412,7 @@ static void _save_wrap_buffers( struct gl_context *ctx )
 			  save->prim[i].start);
    mode = save->prim[i].mode;
    weak = save->prim[i].weak;
+   no_current_update = save->prim[i].no_current_update;
    
    /* store the copied vertices, and allocate a new list.
     */
@@ -416,6 +422,7 @@ static void _save_wrap_buffers( struct gl_context *ctx )
     */
    save->prim[0].mode = mode;
    save->prim[0].weak = weak;
+   save->prim[0].no_current_update = no_current_update;
    save->prim[0].begin = 0;
    save->prim[0].end = 0;
    save->prim[0].pad = 0;
@@ -770,10 +777,11 @@ GLboolean vbo_save_NotifyBegin( struct gl_context *ctx, GLenum mode )
    GLuint i = save->prim_count++;
 
    assert(i < save->prim_max);
-   save->prim[i].mode = mode & ~VBO_SAVE_PRIM_WEAK;
+   save->prim[i].mode = mode & VBO_SAVE_PRIM_MODE_MASK;
    save->prim[i].begin = 1;
    save->prim[i].end = 0;
    save->prim[i].weak = (mode & VBO_SAVE_PRIM_WEAK) ? 1 : 0;
+   save->prim[i].no_current_update = (mode & VBO_SAVE_PRIM_NO_CURRENT_UPDATE) ? 1 : 0;
    save->prim[i].pad = 0;
    save->prim[i].start = save->vert_count;
    save->prim[i].count = 0;   
@@ -934,7 +942,7 @@ static void GLAPIENTRY _save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei co
 
    _ae_map_vbos( ctx );
 
-   vbo_save_NotifyBegin( ctx, mode | VBO_SAVE_PRIM_WEAK );
+   vbo_save_NotifyBegin( ctx, mode | VBO_SAVE_PRIM_WEAK | VBO_SAVE_PRIM_NO_CURRENT_UPDATE);
 
    for (i = 0; i < count; i++)
        CALL_ArrayElement(GET_DISPATCH(), (start + i));
@@ -960,7 +968,7 @@ static void GLAPIENTRY _save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum
    if (_mesa_is_bufferobj(ctx->Array.ElementArrayBufferObj))
       indices = ADD_POINTERS(ctx->Array.ElementArrayBufferObj->Pointer, indices);
 
-   vbo_save_NotifyBegin( ctx, mode | VBO_SAVE_PRIM_WEAK );
+   vbo_save_NotifyBegin( ctx, mode | VBO_SAVE_PRIM_WEAK | VBO_SAVE_PRIM_NO_CURRENT_UPDATE );
 
    switch (type) {
    case GL_UNSIGNED_BYTE: