100 files changed, 2601 insertions, 2588 deletions
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index ba8be125718..cdb2500f7c2 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -266,13 +266,16 @@ struct gen_mipmap_state
    GLuint FBO;
 };
 
-
+#define MAX_META_OPS_DEPTH      2
 /**
  * All per-context meta state.
  */
 struct gl_meta_state
 {
-   struct save_state Save;    /**< state saved during meta-ops */
+   /** Stack of state saved during meta-ops */
+   struct save_state Save[MAX_META_OPS_DEPTH];
+   /** Save stack depth */
+   GLuint SaveStackDepth;
 
    struct temp_texture TempTex;
 
@@ -324,8 +327,13 @@ _mesa_meta_free(struct gl_context *ctx)
 static void
 _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 {
-   struct save_state *save = &ctx->Meta->Save;
+   struct save_state *save;
+
+   /* Fixed-size save stack: nesting must not exceed MAX_META_OPS_DEPTH. */
+   assert(ctx->Meta->SaveStackDepth < MAX_META_OPS_DEPTH);
 
+   save = &ctx->Meta->Save[ctx->Meta->SaveStackDepth++];
+   memset(save, 0, sizeof(*save));
    save->SavedState = state;
 
    if (state & META_ALPHA_TEST) {
@@ -575,7 +583,7 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state)
 static void
 _mesa_meta_end(struct gl_context *ctx)
 {
-   struct save_state *save = &ctx->Meta->Save;
+   struct save_state *save = &ctx->Meta->Save[--ctx->Meta->SaveStackDepth];
    const GLbitfield state = save->SavedState;
 
    if (state & META_ALPHA_TEST) {
@@ -1398,6 +1406,7 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
    struct vertex verts[4];
    /* save all state but scissor, pixel pack/unpack */
    GLbitfield metaSave = META_ALL - META_SCISSOR - META_PIXEL_STORE;
+   const GLuint stencilMax = (1 << ctx->DrawBuffer->Visual.stencilBits) - 1;
 
    if (buffers & BUFFER_BITS_COLOR) {
       /* if clearing color buffers, don't save/restore colormask */
@@ -1453,7 +1462,7 @@ _mesa_meta_Clear(struct gl_context *ctx, GLbitfield buffers)
       _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
                               GL_REPLACE, GL_REPLACE, GL_REPLACE);
       _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
-                                ctx->Stencil.Clear & 0x7fffffff,
+                                ctx->Stencil.Clear & stencilMax,
                                 ctx->Stencil.WriteMask[0]);
    }
    else {
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index f943f81dd05..f32f3cf6020 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -176,6 +176,7 @@ i915CreateContext(int api,
    ctx->ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitCondCodes = GL_TRUE;
    ctx->ShaderCompilerOptions[MESA_SHADER_FRAGMENT].EmitNoIfs = GL_TRUE;
    ctx->ShaderCompilerOptions[MESA_SHADER_FRAGMENT].EmitNoNoise = GL_TRUE;
+   ctx->ShaderCompilerOptions[MESA_SHADER_FRAGMENT].EmitNoPow = GL_TRUE;
 
    ctx->Const.MaxDrawBuffers = 1;
 
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index c00ee415b6b..7a9fb7f088b 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -569,10 +569,14 @@ upload_program(struct i915_fragment_program *p)
 	 if (inst->DstReg.CondMask == COND_TR) {
 	    tmp = i915_get_utemp(p);
 
+	    /* The KIL instruction discards the fragment if any component of
+	     * the source is < 0.  Emit an immediate operand of {-1}.xyzw.
+	     */
 	    i915_emit_texld(p, get_live_regs(p, inst),
 			    tmp, A0_DEST_CHANNEL_ALL,
 			    0, /* use a dummy dest reg */
-			    swizzle(tmp, ONE, ONE, ONE, ONE), /* always */
+			    negate(swizzle(tmp, ONE, ONE, ONE, ONE),
+				   1, 1, 1, 1),
 			    T0_TEXKILL);
 	 } else {
 	    p->error = 1;
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index e3ca863fe51..7c3ac0c14ef 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -81,7 +81,6 @@ DRIVER_SOURCES = \
 	brw_wm_emit.c \
 	brw_wm_fp.c \
 	brw_wm_iz.c \
-	brw_wm_glsl.c \
 	brw_wm_pass0.c \
 	brw_wm_pass1.c \
 	brw_wm_pass2.c \
diff --git a/src/mesa/drivers/dri/i965/brw_cc.c b/src/mesa/drivers/dri/i965/brw_cc.c
index a8369b07c35..d3a1233aac0 100644
--- a/src/mesa/drivers/dri/i965/brw_cc.c
+++ b/src/mesa/drivers/dri/i965/brw_cc.c
@@ -232,3 +232,28 @@ const struct brw_tracked_state brw_cc_unit = {
    .prepare = prepare_cc_unit,
    .emit = upload_cc_unit,
 };
+
+static void upload_blend_constant_color(struct brw_context *brw)
+{
+   struct gl_context *ctx = &brw->intel.ctx;
+   struct brw_blend_constant_color bcc;
+
+   memset(&bcc, 0, sizeof(bcc));
+   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
+   bcc.header.length = sizeof(bcc)/4-2;
+   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
+   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
+   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
+   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
+
+   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
+}
+
+const struct brw_tracked_state brw_blend_constant_color = {
+   .dirty = {
+      .mesa = _NEW_COLOR,
+      .brw = BRW_NEW_CONTEXT,
+      .cache = 0
+   },
+   .emit = upload_blend_constant_color
+};
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index cb0a8b96c9c..28549f2574a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -122,9 +122,6 @@ GLboolean brwCreateContext( int api,
 	 (i == MESA_SHADER_FRAGMENT);
       ctx->ShaderCompilerOptions[i].EmitNoIndirectTemp =
 	 (i == MESA_SHADER_FRAGMENT);
-
-      if (intel->gen == 6)
-	 ctx->ShaderCompilerOptions[i].EmitNoIfs = (i == MESA_SHADER_VERTEX);
    }
 
    ctx->Const.VertexProgram.MaxNativeInstructions = (16 * 1024);
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 335339515a2..7069724466a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -171,7 +171,6 @@ struct brw_vertex_program {
 struct brw_fragment_program {
    struct gl_fragment_program program;
    GLuint id;  /**< serial no. to identify frag progs, never re-used */
-   GLboolean isGLSL;  /**< really, any IF/LOOP/CONT/BREAK instructions */
 
    /** for debugging, which texture units are referenced */
    GLbitfield tex_units_used;
@@ -211,6 +210,7 @@ struct brw_wm_prog_data {
    GLuint nr_params;       /**< number of float params/constants */
    GLuint nr_pull_params;
    GLboolean error;
+   int dispatch_width;
 
    /* Pointer to tracked values (only valid once
     * _mesa_load_state_parameters has been called at runtime).
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 7b823eb201b..877b22fec19 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -242,21 +242,13 @@ static void prepare_constant_buffer(struct brw_context *brw)
       GLuint offset = brw->curbe.vs_start * 16;
       GLuint nr = brw->vs.prog_data->nr_params / 4;
 
-      if (vp->use_const_buffer) {
-	 /* Load the subset of push constants that will get used when
-	  * we also have a pull constant buffer.
-	  */
-	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	    if (brw->vs.constant_map[i] != -1) {
-	       assert(brw->vs.constant_map[i] <= nr);
-	       memcpy(buf + offset + brw->vs.constant_map[i] * 4,
-		      vp->program.Base.Parameters->ParameterValues[i],
-		      4 * sizeof(float));
-	    }
-	 }
-      } else {
-	 for (i = 0; i < nr; i++) {
-	    memcpy(buf + offset + i * 4,
+      /* Load each parameter that constant_map[] gives a push slot (-1 means
+       * none); this is a subset when we also have a pull constant buffer.
+       */
+      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	 if (brw->vs.constant_map[i] != -1) {
+	    assert(brw->vs.constant_map[i] <= nr);
+	    memcpy(buf + offset + brw->vs.constant_map[i] * 4,
 		   vp->program.Base.Parameters->ParameterValues[i],
 		   4 * sizeof(float));
 	 }
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 239586a0366..7f3e4986808 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -462,6 +462,13 @@
 #define BRW_COMPRESSION_2NDHALF       1
 #define BRW_COMPRESSION_COMPRESSED    2
 
+#define GEN6_COMPRESSION_1Q		0
+#define GEN6_COMPRESSION_2Q		1
+#define GEN6_COMPRESSION_3Q		2
+#define GEN6_COMPRESSION_4Q		3
+#define GEN6_COMPRESSION_1H		0
+#define GEN6_COMPRESSION_2H		2
+
 #define BRW_CONDITIONAL_NONE  0
 #define BRW_CONDITIONAL_Z     1
 #define BRW_CONDITIONAL_NZ    2
@@ -1022,6 +1029,13 @@
 # define ATTRIBUTE_0_CONST_SOURCE_SHIFT			9
 # define ATTRIBUTE_0_SWIZZLE_SHIFT			6
 # define ATTRIBUTE_0_SOURCE_SHIFT			0
+
+# define ATTRIBUTE_SWIZZLE_INPUTATTR                    0
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING             1
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_W                  2
+# define ATTRIBUTE_SWIZZLE_INPUTATTR_FACING_W           3
+# define ATTRIBUTE_SWIZZLE_SHIFT                        6
+
 /* DW16: Point sprite texture coordinate enables */
 /* DW17: Constant interpolation enables */
 /* DW18: attr 0-7 wrap shortest enables */
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 962c04128b8..6b61f7af15d 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -899,7 +899,8 @@ int brw_disasm (FILE *file, struct brw_instruction *inst, int gen)
 	err |= dest (file, inst);
     } else if (gen >= 6 && (inst->header.opcode == BRW_OPCODE_IF ||
 			    inst->header.opcode == BRW_OPCODE_ELSE ||
-			    inst->header.opcode == BRW_OPCODE_ENDIF)) {
+			    inst->header.opcode == BRW_OPCODE_ENDIF ||
+			    inst->header.opcode == BRW_OPCODE_WHILE)) {
        format (file, " %d", inst->bits1.branch_gen6.jump_count);
     }
 
diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c
index 2ff39e8e64a..3b5c4c071e3 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.c
+++ b/src/mesa/drivers/dri/i965/brw_eu.c
@@ -72,7 +72,37 @@ void brw_set_access_mode( struct brw_compile *p, GLuint access_mode )
 
 void brw_set_compression_control( struct brw_compile *p, GLboolean compression_control )
 {
-   p->current->header.compression_control = compression_control;
+   p->compressed = (compression_control == BRW_COMPRESSION_COMPRESSED);
+
+   if (p->brw->intel.gen >= 6) {
+      /* Since we don't use the 32-wide support in gen6, we translate
+       * the pre-gen6 compression control here.
+       */
+      switch (compression_control) {
+      case BRW_COMPRESSION_NONE:
+	 /* This is the "use the first set of bits of dmask/vmask/arf
+	  * according to execsize" option.
+	  */
+	 p->current->header.compression_control = GEN6_COMPRESSION_1Q;
+	 break;
+      case BRW_COMPRESSION_2NDHALF:
+	 /* For 8-wide, this is "use the second set of 8 bits." */
+	 p->current->header.compression_control = GEN6_COMPRESSION_2Q;
+	 break;
+      case BRW_COMPRESSION_COMPRESSED:
+	 /* For 16-wide instruction compression, use the first set of 16 bits
+	  * since we don't do 32-wide dispatch.
+	  */
+	 p->current->header.compression_control = GEN6_COMPRESSION_1H;
+	 break;
+      default:
+	 assert(!"not reached");
+	 p->current->header.compression_control = GEN6_COMPRESSION_1H;
+	 break;
+      }
+   } else {
+      p->current->header.compression_control = compression_control;
+   }
 }
 
 void brw_set_mask_control( struct brw_compile *p, GLuint value )
@@ -95,6 +125,7 @@ void brw_push_insn_state( struct brw_compile *p )
 {
    assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
    memcpy(p->current+1, p->current, sizeof(struct brw_instruction));
+   p->compressed_stack[p->current - p->stack] = p->compressed;
    p->current++;   
 }
 
@@ -102,6 +133,7 @@ void brw_pop_insn_state( struct brw_compile *p )
 {
    assert(p->current != p->stack);
    p->current--;
+   p->compressed = p->compressed_stack[p->current - p->stack];
 }
 
 
@@ -112,6 +144,7 @@ void brw_init_compile( struct brw_context *brw, struct brw_compile *p )
    p->brw = brw;
    p->nr_insn = 0;
    p->current = p->stack;
+   p->compressed = false;
    memset(p->current, 0, sizeof(p->current[0]));
 
    /* Some defaults?
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index b4538e6e8a7..4dbdc522100 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -33,6 +33,7 @@
 #ifndef BRW_EU_H
 #define BRW_EU_H
 
+#include <stdbool.h>
 #include "brw_structs.h"
 #include "brw_defines.h"
 #include "program/prog_instruction.h"
@@ -106,10 +107,12 @@ struct brw_compile {
    /* Allow clients to push/pop instruction state:
     */
    struct brw_instruction stack[BRW_EU_MAX_INSN_STACK];
+   bool compressed_stack[BRW_EU_MAX_INSN_STACK];
    struct brw_instruction *current;
 
    GLuint flag_value;
    GLboolean single_program_flow;
+   bool compressed;
    struct brw_context *brw;
 
    struct brw_glsl_label *first_label;  /**< linked list of labels */
@@ -954,6 +957,8 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
 	       struct brw_instruction *patch_insn);
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count);
+struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
+				      struct brw_instruction *do_insn);
 struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count);
 /* Forward jumps:
  */
@@ -1009,6 +1014,7 @@ void brw_math_invert( struct brw_compile *p,
 void brw_set_src1( struct brw_instruction *insn,
                           struct brw_reg reg );
 
+void brw_set_uip_jip(struct brw_compile *p);
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 9cb941dacfd..9c764fe779d 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -41,19 +41,20 @@
  * Internal helper for constructing instructions
  */
 
-static void guess_execution_size( struct brw_instruction *insn,
-				  struct brw_reg reg )
+static void guess_execution_size(struct brw_compile *p,
+				 struct brw_instruction *insn,
+				 struct brw_reg reg)
 {
-   if (reg.width == BRW_WIDTH_8 && 
-       insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) 
+   if (reg.width == BRW_WIDTH_8 && p->compressed)
       insn->header.execution_size = BRW_EXECUTE_16;
    else
       insn->header.execution_size = reg.width;	/* note - definitions are compatible */
 }
 
 
-static void brw_set_dest( struct brw_instruction *insn,
-			  struct brw_reg dest )
+static void brw_set_dest(struct brw_compile *p,
+			 struct brw_instruction *insn,
+			 struct brw_reg dest)
 {
    if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
        dest.file != BRW_MESSAGE_REGISTER_FILE)
@@ -100,7 +101,7 @@ static void brw_set_dest( struct brw_instruction *insn,
    /* NEW: Set the execution size based on dest.width and
     * insn->compression_control:
     */
-   guess_execution_size(insn, dest);
+   guess_execution_size(p, insn, dest);
 }
 
 extern int reg_type_size[];
@@ -629,7 +630,7 @@ static struct brw_instruction *brw_alu1( struct brw_compile *p,
 					 struct brw_reg src )
 {
    struct brw_instruction *insn = next_insn(p, opcode);
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src);   
    return insn;
 }
@@ -641,7 +642,7 @@ static struct brw_instruction *brw_alu2(struct brw_compile *p,
 					struct brw_reg src1 )
 {
    struct brw_instruction *insn = next_insn(p, opcode);   
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
    return insn;
@@ -680,7 +681,7 @@ void brw_##OP(struct brw_compile *p,					      \
 {									      \
    struct brw_instruction *rnd, *add;					      \
    rnd = next_insn(p, BRW_OPCODE_##OP);					      \
-   brw_set_dest(rnd, dest);						      \
+   brw_set_dest(p, rnd, dest);						      \
    brw_set_src0(rnd, src);						      \
    rnd->header.destreg__conditionalmod = 0x7; /* turn on round-increments */  \
 									      \
@@ -779,7 +780,7 @@ struct brw_instruction *brw_MUL(struct brw_compile *p,
 void brw_NOP(struct brw_compile *p)
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);   
-   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
    brw_set_src1(insn, brw_imm_ud(0x0));
 }
@@ -840,11 +841,11 @@ struct brw_instruction *brw_IF(struct brw_compile *p, GLuint execute_size)
    /* Override the defaults for this instruction:
     */
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
+      brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(insn, brw_ip_reg());
       brw_set_src1(insn, brw_imm_d(0x0));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -870,7 +871,7 @@ brw_IF_gen6(struct brw_compile *p, uint32_t conditional,
 
    insn = next_insn(p, BRW_OPCODE_IF);
 
-   brw_set_dest(insn, brw_imm_w(0));
+   brw_set_dest(p, insn, brw_imm_w(0));
    insn->header.execution_size = BRW_EXECUTE_8;
    insn->bits1.branch_gen6.jump_count = 0;
    brw_set_src0(insn, src0);
@@ -905,11 +906,11 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    }
 
    if (intel->gen < 6) {
-      brw_set_dest(insn, brw_ip_reg());
+      brw_set_dest(p, insn, brw_ip_reg());
       brw_set_src0(insn, brw_ip_reg());
       brw_set_src1(insn, brw_imm_d(0x0));
    } else {
-      brw_set_dest(insn, brw_imm_w(0));
+      brw_set_dest(p, insn, brw_imm_w(0));
       insn->bits1.branch_gen6.jump_count = 0;
       brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
@@ -965,11 +966,11 @@ void brw_ENDIF(struct brw_compile *p,
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF);
 
       if (intel->gen < 6) {
-	 brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+	 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
 	 brw_set_src1(insn, brw_imm_d(0x0));
       } else {
-	 brw_set_dest(insn, brw_imm_w(0));
+	 brw_set_dest(p, insn, brw_imm_w(0));
 	 brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 	 brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
       }
@@ -1029,16 +1030,44 @@ void brw_ENDIF(struct brw_compile *p,
 
 struct brw_instruction *brw_BREAK(struct brw_compile *p, int pop_count)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
+
    insn = next_insn(p, BRW_OPCODE_BREAK);
-   brw_set_dest(insn, brw_ip_reg());
+   if (intel->gen >= 6) {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(insn, brw_imm_d(0x0));
+   } else {
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(insn, brw_ip_reg());
+      brw_set_src1(insn, brw_imm_d(0x0));
+      insn->bits3.if_else.pad0 = 0;
+      insn->bits3.if_else.pop_count = pop_count;
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.execution_size = BRW_EXECUTE_8;
+
+   return insn;
+}
+
+struct brw_instruction *brw_CONT_gen6(struct brw_compile *p,
+				      struct brw_instruction *do_insn)
+{
+   struct brw_instruction *insn;
+   int br = 2;
+
+   insn = next_insn(p, BRW_OPCODE_CONTINUE);
+   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   brw_set_dest(p, insn, brw_ip_reg());
    brw_set_src0(insn, brw_ip_reg());
    brw_set_src1(insn, brw_imm_d(0x0));
+
+   insn->bits3.break_cont.uip = br * (do_insn - insn);
+
    insn->header.compression_control = BRW_COMPRESSION_NONE;
    insn->header.execution_size = BRW_EXECUTE_8;
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   insn->bits3.if_else.pad0 = 0;
-   insn->bits3.if_else.pop_count = pop_count;
    return insn;
 }
 
@@ -1046,7 +1075,7 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 {
    struct brw_instruction *insn;
    insn = next_insn(p, BRW_OPCODE_CONTINUE);
-   brw_set_dest(insn, brw_ip_reg());
+   brw_set_dest(p, insn, brw_ip_reg());
    brw_set_src0(insn, brw_ip_reg());
    brw_set_src1(insn, brw_imm_d(0x0));
    insn->header.compression_control = BRW_COMPRESSION_NONE;
@@ -1058,17 +1087,33 @@ struct brw_instruction *brw_CONT(struct brw_compile *p, int pop_count)
 }
 
 /* DO/WHILE loop:
+ *
+ * The DO/WHILE is just an unterminated loop -- break or continue are
+ * used for control within the loop.  We have a few ways they can be
+ * done.
+ *
+ * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
+ * jip and no DO instruction.
+ *
+ * For non-uniform control flow pre-gen6, there's a DO instruction to
+ * push the mask, and a WHILE to jump back, and BREAK to get out and
+ * pop the mask.
+ *
+ * For gen6, there's no more mask stack, so no need for DO.  WHILE
+ * just points back to the first instruction of the loop.
  */
 struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
 {
-   if (p->single_program_flow) {
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6 || p->single_program_flow) {
       return &p->store[p->nr_insn];
    } else {
       struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);
 
       /* Override the defaults for this instruction:
        */
-      brw_set_dest(insn, brw_null_reg());
+      brw_set_dest(p, insn, brw_null_reg());
       brw_set_src0(insn, brw_null_reg());
       brw_set_src1(insn, brw_null_reg());
 
@@ -1094,34 +1139,42 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
    if (intel->gen >= 5)
       br = 2;
 
-   if (p->single_program_flow)
-      insn = next_insn(p, BRW_OPCODE_ADD);
-   else
+   if (intel->gen >= 6) {
       insn = next_insn(p, BRW_OPCODE_WHILE);
 
-   brw_set_dest(insn, brw_ip_reg());
-   brw_set_src0(insn, brw_ip_reg());
-   brw_set_src1(insn, brw_imm_d(0x0));
+      brw_set_dest(p, insn, brw_imm_w(0));
+      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
+      brw_set_src0(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
 
-   insn->header.compression_control = BRW_COMPRESSION_NONE;
+      insn->header.execution_size = do_insn->header.execution_size;
+      assert(insn->header.execution_size == BRW_EXECUTE_8);
+   } else {
+      if (p->single_program_flow) {
+	 insn = next_insn(p, BRW_OPCODE_ADD);
 
-   if (p->single_program_flow) {
-      insn->header.execution_size = BRW_EXECUTE_1;
+	 brw_set_dest(p, insn, brw_ip_reg());
+	 brw_set_src0(insn, brw_ip_reg());
+	 brw_set_src1(insn, brw_imm_d((do_insn - insn) * 16));
+	 insn->header.execution_size = BRW_EXECUTE_1;
+      } else {
+	 insn = next_insn(p, BRW_OPCODE_WHILE);
 
-      insn->bits3.d = (do_insn - insn) * 16;
-   } else {
-      insn->header.execution_size = do_insn->header.execution_size;
+	 assert(do_insn->header.opcode == BRW_OPCODE_DO);
 
-      assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
-      insn->bits3.if_else.pop_count = 0;
-      insn->bits3.if_else.pad0 = 0;
-   }
+	 brw_set_dest(p, insn, brw_ip_reg());
+	 brw_set_src0(insn, brw_ip_reg());
+	 brw_set_src1(insn, brw_imm_d(0));
 
-/*    insn->header.mask_control = BRW_MASK_ENABLE; */
+	 insn->header.execution_size = do_insn->header.execution_size;
+	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
+	 insn->bits3.if_else.pop_count = 0;
+	 insn->bits3.if_else.pad0 = 0;
+      }
+   }
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
-   /* insn->header.mask_control = BRW_MASK_DISABLE; */
-   p->current->header.predicate_control = BRW_PREDICATE_NONE;   
    return insn;
 }
 
@@ -1159,7 +1212,7 @@ void brw_CMP(struct brw_compile *p,
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
 
    insn->header.destreg__conditionalmod = conditional;
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
 
@@ -1184,7 +1237,7 @@ void brw_WAIT (struct brw_compile *p)
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
    struct brw_reg src = brw_notification_1_reg();
 
-   brw_set_dest(insn, src);
+   brw_set_dest(p, insn, src);
    brw_set_src0(insn, src);
    brw_set_src1(insn, brw_null_reg());
    insn->header.execution_size = 0; /* must */
@@ -1219,6 +1272,10 @@ void brw_math( struct brw_compile *p,
       assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
       assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
 
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
+
       if (function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT &&
 	  function != BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
 	 assert(src.type == BRW_REGISTER_TYPE_F);
@@ -1228,8 +1285,9 @@ void brw_math( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_src1(insn, brw_null_reg());
    } else {
@@ -1242,7 +1300,7 @@ void brw_math( struct brw_compile *p,
       insn->header.predicate_control = 0;
       insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_math_message(p->brw,
 			   insn,
@@ -1284,12 +1342,18 @@ void brw_math2(struct brw_compile *p,
       assert(src1.type == BRW_REGISTER_TYPE_F);
    }
 
+   /* Source modifiers are ignored for extended math instructions. */
+   assert(!src0.negate);
+   assert(!src0.abs);
+   assert(!src1.negate);
+   assert(!src1.abs);
+
    /* Math is the same ISA format as other opcodes, except that CondModifier
     * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
     */
    insn->header.destreg__conditionalmod = function;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, src1);
 }
@@ -1318,8 +1382,13 @@ void brw_math_16( struct brw_compile *p,
        * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
        */
       insn->header.destreg__conditionalmod = function;
+      insn->header.saturate = saturate;
+
+      /* Source modifiers are ignored for extended math instructions. */
+      assert(!src.negate);
+      assert(!src.abs);
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src);
       brw_set_src1(insn, brw_null_reg());
       return;
@@ -1334,7 +1403,7 @@ void brw_math_16( struct brw_compile *p,
    insn = next_insn(p, BRW_OPCODE_SEND);
    insn->header.destreg__conditionalmod = msg_reg_nr;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src);
    brw_set_math_message(p->brw,
 			insn, 
@@ -1351,7 +1420,7 @@ void brw_math_16( struct brw_compile *p,
    insn->header.compression_control = BRW_COMPRESSION_2NDHALF;
    insn->header.destreg__conditionalmod = msg_reg_nr+1;
 
-   brw_set_dest(insn, offset(dest,1));
+   brw_set_dest(p, insn, offset(dest,1));
    brw_set_src0(insn, src);
    brw_set_math_message(p->brw, 
 			insn, 
@@ -1446,7 +1515,7 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
 	 send_commit_msg = 1;
       }
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
       brw_set_dp_write_message(p->brw,
@@ -1516,7 +1585,7 @@ brw_oword_block_read_scratch(struct brw_compile *p,
       insn->header.compression_control = BRW_COMPRESSION_NONE;
       insn->header.destreg__conditionalmod = mrf.nr;
 
-      brw_set_dest(insn, dest);	/* UW? */
+      brw_set_dest(p, insn, dest);	/* UW? */
       brw_set_src0(insn, brw_null_reg());
 
       brw_set_dp_read_message(p->brw,
@@ -1569,7 +1638,7 @@ void brw_oword_block_read(struct brw_compile *p,
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    if (intel->gen >= 6) {
       brw_set_src0(insn, mrf);
    } else {
@@ -1614,7 +1683,7 @@ void brw_dword_scattered_read(struct brw_compile *p,
    /* cast dest to a uword[8] vector */
    dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, brw_null_reg());
 
    brw_set_dp_read_message(p->brw,
@@ -1639,29 +1708,21 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
                       GLuint location,
                       GLuint bind_table_index)
 {
+   struct intel_context *intel = &p->brw->intel;
    struct brw_instruction *insn;
    GLuint msg_reg_nr = 1;
-   struct brw_reg b;
 
-   /*
-   printf("vs const read msg, location %u, msg_reg_nr %d\n",
-          location, msg_reg_nr);
-   */
+   if (intel->gen >= 6)
+      location /= 16;
 
    /* Setup MRF[1] with location/offset into const buffer */
    brw_push_insn_state(p);
    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-
-   /* XXX I think we're setting all the dwords of MRF[1] to 'location'.
-    * when the docs say only dword[2] should be set.  Hmmm.  But it works.
-    */
-   b = brw_message_reg(msg_reg_nr);
-   b = retype(b, BRW_REGISTER_TYPE_UD);
-   /*b = get_element_ud(b, 2);*/
-   brw_MOV(p, b, brw_imm_ud(location));
-
+   brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 2),
+		     BRW_REGISTER_TYPE_UD),
+	   brw_imm_ud(location));
    brw_pop_insn_state(p);
 
    insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1671,8 +1732,12 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
    insn->header.destreg__conditionalmod = msg_reg_nr;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
-   brw_set_src0(insn, brw_null_reg());
+   brw_set_dest(p, insn, dest);
+   if (intel->gen >= 6) {
+      brw_set_src0(insn, brw_message_reg(msg_reg_nr));
+   } else {
+      brw_set_src0(insn, brw_null_reg());
+   }
 
    brw_set_dp_read_message(p->brw,
 			   insn,
@@ -1706,7 +1771,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    /* M1.0 is block offset 0, M1.4 is block offset 1, all other
     * fields ignored.
     */
-   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD),
+   brw_ADD(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_D),
 	   addr_reg, brw_imm_d(offset));
    brw_pop_insn_state(p);
 
@@ -1717,7 +1782,7 @@ void brw_dp_READ_4_vs_relative(struct brw_compile *p,
    insn->header.destreg__conditionalmod = 0;
    insn->header.mask_control = BRW_MASK_DISABLE;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, brw_vec8_grf(0, 0));
 
    if (intel->gen == 6)
@@ -1782,7 +1847,7 @@ void brw_fb_WRITE(struct brw_compile *p,
    else
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_dp_write_message(p->brw,
 			    insn,
@@ -1860,7 +1925,7 @@ void brw_SAMPLE(struct brw_compile *p,
 
 	 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
 
-	 guess_execution_size(p->current, dest);
+	 guess_execution_size(p, p->current, dest);
 	 if (p->current->header.execution_size == BRW_EXECUTE_16)
 	    dispatch_16 = GL_TRUE;
 
@@ -1895,12 +1960,15 @@ void brw_SAMPLE(struct brw_compile *p,
        * and the first message register index comes from src0.
        */
       if (intel->gen >= 6) {
-	  brw_push_insn_state(p);
-	  brw_set_mask_control( p, BRW_MASK_DISABLE );
-	  /* m1 contains header? */
-	  brw_MOV(p, brw_message_reg(msg_reg_nr), src0);
-	  brw_pop_insn_state(p);
-	  src0 = brw_message_reg(msg_reg_nr);
+	 if (src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+	     src0.nr != BRW_ARF_NULL) {
+	    brw_push_insn_state(p);
+	    brw_set_mask_control( p, BRW_MASK_DISABLE );
+	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
+	    brw_MOV(p, retype(brw_message_reg(msg_reg_nr), src0.type), src0);
+	    brw_pop_insn_state(p);
+	 }
+	 src0 = brw_message_reg(msg_reg_nr);
       }
 
       insn = next_insn(p, BRW_OPCODE_SEND);
@@ -1909,7 +1977,7 @@ void brw_SAMPLE(struct brw_compile *p,
       if (intel->gen < 6)
 	  insn->header.destreg__conditionalmod = msg_reg_nr;
 
-      brw_set_dest(insn, dest);
+      brw_set_dest(p, insn, dest);
       brw_set_src0(insn, src0);
       brw_set_sampler_message(p->brw, insn,
 			      binding_table_index,
@@ -1970,7 +2038,7 @@ void brw_urb_WRITE(struct brw_compile *p,
 
    assert(msg_length < BRW_MAX_MRF);
 
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
@@ -1989,6 +2057,80 @@ void brw_urb_WRITE(struct brw_compile *p,
 		       swizzle);
 }
 
+static int
+brw_find_next_block_end(struct brw_compile *p, int start)
+{
+   int ip;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_ELSE:
+      case BRW_OPCODE_WHILE:
+	 return ip;
+      }
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* There is no DO instruction on gen6, so to find the end of the loop
+ * we have to see if the loop is jumping back before our start
+ * instruction.
+ */
+static int
+brw_find_loop_end(struct brw_compile *p, int start)
+{
+   int ip;
+   int br = 2;
+
+   for (ip = start + 1; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      if (insn->header.opcode == BRW_OPCODE_WHILE) {
+	 if (ip + insn->bits1.branch_gen6.jump_count / br < start)
+	    return ip;
+      }
+   }
+   assert(!"not reached");
+   return start + 1;
+}
+
+/* After program generation, go back and update the UIP and JIP of
+ * BREAK and CONT instructions to their correct locations.
+ */
+void
+brw_set_uip_jip(struct brw_compile *p)
+{
+   struct intel_context *intel = &p->brw->intel;
+   int ip;
+   int br = 2;
+
+   if (intel->gen < 6)
+      return;
+
+   for (ip = 0; ip < p->nr_insn; ip++) {
+      struct brw_instruction *insn = &p->store[ip];
+
+      switch (insn->header.opcode) {
+      case BRW_OPCODE_BREAK:
+	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 insn->bits3.break_cont.uip = br * (brw_find_loop_end(p, ip) - ip + 1);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+	 /* UIP is set at CONTINUE emit time, since that's when we
+	  * know where the start of the loop is.
+	  */
+	 insn->bits3.break_cont.jip = br * (brw_find_next_block_end(p, ip) - ip);
+	 assert(insn->bits3.break_cont.uip != 0);
+	 assert(insn->bits3.break_cont.jip != 0);
+	 break;
+      }
+   }
+}
+
 void brw_ff_sync(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
@@ -2013,7 +2155,7 @@ void brw_ff_sync(struct brw_compile *p,
    }
 
    insn = next_insn(p, BRW_OPCODE_SEND);
-   brw_set_dest(insn, dest);
+   brw_set_dest(p, insn, dest);
    brw_set_src0(insn, src0);
    brw_set_src1(insn, brw_imm_d(0));
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index edb02fabb23..c3cbe0df618 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -600,8 +600,13 @@ fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
     * might be able to do better by doing execsize = 1 math and then
     * expanding that result out, but we would need to be careful with
     * masking.
+    *
+    * The hardware ignores source modifiers (negate and abs) on math
+    * instructions, so we also move to a temp to set those up.
     */
-   if (intel->gen >= 6 && src.file == UNIFORM) {
+   if (intel->gen >= 6 && (src.file == UNIFORM ||
+			   src.abs ||
+			   src.negate)) {
       fs_reg expanded = fs_reg(this, glsl_type::float_type);
       emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
       src = expanded;
@@ -933,6 +938,10 @@ fs_visitor::visit(ir_expression *ir)
       assert(!"not reached: should be handled by lower_noise");
       break;
 
+   case ir_quadop_vector:
+      assert(!"not reached: should be handled by lower_quadop_vector");
+      break;
+
    case ir_unop_sqrt:
       emit_math(FS_OPCODE_SQRT, this->result, op[0]);
       break;
@@ -1423,28 +1432,70 @@ fs_visitor::visit(ir_discard *ir)
 void
 fs_visitor::visit(ir_constant *ir)
 {
-   fs_reg reg(this, ir->type);
-   this->result = reg;
+   /* Set this->result to reg at the bottom of the function because some code
+    * paths will cause this visitor to be applied to other fields.  This will
+    * cause the value stored in this->result to be modified.
+    *
+    * Make reg constant so that it doesn't get accidentally modified along the
+    * way.  Yes, I actually had this problem. :(
+    */
+   const fs_reg reg(this, ir->type);
+   fs_reg dst_reg = reg;
 
-   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
-      switch (ir->type->base_type) {
-      case GLSL_TYPE_FLOAT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.f[i])));
-	 break;
-      case GLSL_TYPE_UINT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.u[i])));
-	 break;
-      case GLSL_TYPE_INT:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg(ir->value.i[i])));
-	 break;
-      case GLSL_TYPE_BOOL:
-	 emit(fs_inst(BRW_OPCODE_MOV, reg, fs_reg((int)ir->value.b[i])));
-	 break;
-      default:
-	 assert(!"Non-float/uint/int/bool constant");
+   if (ir->type->is_array()) {
+      const unsigned size = type_size(ir->type->fields.array);
+
+      for (unsigned i = 0; i < ir->type->length; i++) {
+	 ir->array_elements[i]->accept(this);
+	 fs_reg src_reg = this->result;
+
+	 dst_reg.type = src_reg.type;
+	 for (unsigned j = 0; j < size; j++) {
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
+	    src_reg.reg_offset++;
+	    dst_reg.reg_offset++;
+	 }
+      }
+   } else if (ir->type->is_record()) {
+      foreach_list(node, &ir->components) {
+	 ir_instruction *const field = (ir_instruction *) node;
+	 const unsigned size = type_size(field->type);
+
+	 field->accept(this);
+	 fs_reg src_reg = this->result;
+
+	 dst_reg.type = src_reg.type;
+	 for (unsigned j = 0; j < size; j++) {
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
+	    src_reg.reg_offset++;
+	    dst_reg.reg_offset++;
+	 }
+      }
+   } else {
+      const unsigned size = type_size(ir->type);
+
+      for (unsigned i = 0; i < size; i++) {
+	 switch (ir->type->base_type) {
+	 case GLSL_TYPE_FLOAT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
+	    break;
+	 case GLSL_TYPE_UINT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
+	    break;
+	 case GLSL_TYPE_INT:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
+	    break;
+	 case GLSL_TYPE_BOOL:
+	    emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
+	    break;
+	 default:
+	    assert(!"Non-float/uint/int/bool constant");
+	 }
+	 dst_reg.reg_offset++;
       }
-      reg.reg_offset++;
    }
+
+   this->result = reg;
 }
 
 void
@@ -1574,7 +1625,7 @@ fs_visitor::emit_if_gen6(ir_if *ir)
 
       switch (expr->operation) {
       case ir_unop_logic_not:
-	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(1)));
+	 inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0)));
 	 inst->conditional_mod = BRW_CONDITIONAL_Z;
 	 return;
 
@@ -1951,7 +2002,7 @@ fs_visitor::emit_interpolation_setup_gen6()
    emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
 
    this->current_annotation = "compute 1/pos.w";
-   this->wpos_w = fs_reg(brw_vec8_grf(c->key.source_w_reg, 0));
+   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
    this->pixel_w = fs_reg(this, glsl_type::float_type);
    emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
 
@@ -1979,17 +2030,17 @@ fs_visitor::emit_fb_writes()
       nr += 2;
    }
 
-   if (c->key.aa_dest_stencil_reg) {
+   if (c->aa_dest_stencil_reg) {
       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		   fs_reg(brw_vec8_grf(c->key.aa_dest_stencil_reg, 0))));
+		   fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
    }
 
    /* Reserve space for color. It'll be filled in per MRT below. */
    int color_mrf = nr;
    nr += 4;
 
-   if (c->key.source_depth_to_render_target) {
-      if (c->key.computes_depth) {
+   if (c->source_depth_to_render_target) {
+      if (c->computes_depth) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth);
 	 fs_reg depth = *(variable_storage(this->frag_depth));
@@ -1998,20 +2049,22 @@ fs_visitor::emit_fb_writes()
       } else {
 	 /* Pass through the payload depth. */
 	 emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		      fs_reg(brw_vec8_grf(c->key.source_depth_reg, 0))));
+		      fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
       }
    }
 
-   if (c->key.dest_depth_reg) {
+   if (c->dest_depth_reg) {
       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
-		   fs_reg(brw_vec8_grf(c->key.dest_depth_reg, 0))));
+		   fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
    }
 
    fs_reg color = reg_undef;
    if (this->frag_color)
       color = *(variable_storage(this->frag_color));
-   else if (this->frag_data)
+   else if (this->frag_data) {
       color = *(variable_storage(this->frag_data));
+      color.type = BRW_REGISTER_TYPE_F;
+   }
 
    for (int target = 0; target < c->key.nr_color_regions; target++) {
       this->current_annotation = talloc_asprintf(this->mem_ctx,
@@ -2452,7 +2505,7 @@ fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
 void
 fs_visitor::assign_curb_setup()
 {
-   c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
+   c->prog_data.first_curbe_grf = c->nr_payload_regs;
    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
 
    /* Map the offsets in the UNIFORM file to fixed HW regs. */
@@ -3227,6 +3280,7 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
 	 break;
       default:
 	 assert(!"not reached");
+	 brw_reg = brw_null_reg();
 	 break;
       }
       break;
@@ -3241,6 +3295,10 @@ static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
       assert(!"not reached");
       brw_reg = brw_null_reg();
       break;
+   default:
+      assert(!"not reached");
+      brw_reg = brw_null_reg();
+      break;
    }
    if (reg->abs)
       brw_reg = brw_abs(brw_reg);
@@ -3373,10 +3431,6 @@ fs_visitor::generate_code()
 	 break;
 
       case BRW_OPCODE_DO:
-	 /* FINISHME: We need to write the loop instruction support still. */
-	 if (intel->gen >= 6)
-	    this->fail = true;
-
 	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
 	 if_depth_in_loop[loop_stack_depth] = 0;
 	 break;
@@ -3386,7 +3440,11 @@ fs_visitor::generate_code()
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 break;
       case BRW_OPCODE_CONTINUE:
-	 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
+	 /* Gen6+ CONTINUE takes the enclosing loop's brw_DO() result, not an IF depth. */
+	 if (intel->gen >= 6)
+	    brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]);
+	 else
+	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
 	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 	 break;
 
@@ -3400,16 +3458,18 @@ fs_visitor::generate_code()
 	 assert(loop_stack_depth > 0);
 	 loop_stack_depth--;
 	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
-	 /* patch all the BREAK/CONT instructions from last BGNLOOP */
-	 while (inst0 > loop_stack[loop_stack_depth]) {
-	    inst0--;
-	    if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-		inst0->bits3.if_else.jump_count == 0) {
-	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
+	    while (inst0 > loop_stack[loop_stack_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+		   inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
 	    }
-	    else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-		     inst0->bits3.if_else.jump_count == 0) {
-	       inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
 	    }
 	 }
       }
@@ -3486,6 +3546,26 @@ fs_visitor::generate_code()
 
       last_native_inst = p->nr_insn;
    }
+
+   brw_set_uip_jip(p);
+
+   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
+    * emit issues, it doesn't get the jump distances into the output,
+    * which is often something we want to debug.  So this is here in
+    * case you're doing that.
+    */
+   if (0) {
+      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+	 for (unsigned int i = 0; i < p->nr_insn; i++) {
+	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
+		   ((uint32_t *)&p->store[i])[3],
+		   ((uint32_t *)&p->store[i])[2],
+		   ((uint32_t *)&p->store[i])[1],
+		   ((uint32_t *)&p->store[i])[0]);
+	    brw_disasm(stdout, &p->store[i], intel->gen);
+	 }
+      }
+   }
 }
 
 GLboolean
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 3b7b03a05b8..20bfa4c3ea3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -205,6 +205,8 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_round_even:
    case ir_unop_sin:
    case ir_unop_cos:
+   case ir_unop_sin_reduced:
+   case ir_unop_cos_reduced:
    case ir_unop_dFdx:
    case ir_unop_dFdy:
       for (i = 0; i < vector_elements; i++) {
@@ -328,6 +330,9 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_noise:
       assert(!"noise should have been broken down to function call");
       break;
+   case ir_quadop_vector:
+      assert(!"should have been lowered");
+      break;
    }
 
    ir->remove();
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index b0c76f4094d..73b41fdbcef 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -166,6 +166,9 @@ static void populate_key( struct brw_context *brw,
 			  struct brw_gs_prog_key *key )
 {
    struct gl_context *ctx = &brw->intel.ctx;
+   struct intel_context *intel = &brw->intel;
+   int prim_gs_always;
+
    memset(key, 0, sizeof(*key));
 
    /* CACHE_NEW_VS_PROG */
@@ -185,10 +188,14 @@ static void populate_key( struct brw_context *brw,
       key->pv_first = GL_TRUE;
    }
 
-   key->need_gs_prog = (key->hint_gs_always ||
-			brw->primitive == GL_QUADS ||
+   if (intel->gen == 6)
+       prim_gs_always = brw->primitive == GL_LINE_LOOP;
+   else
+       prim_gs_always = brw->primitive == GL_QUADS ||
 			brw->primitive == GL_QUAD_STRIP ||
-			brw->primitive == GL_LINE_LOOP);
+			brw->primitive == GL_LINE_LOOP;
+
+   key->need_gs_prog = (key->hint_gs_always || prim_gs_always);
 }
 
 /* Calculate interpolants for triangle and line rasterization.
@@ -205,8 +212,10 @@ static void prepare_gs_prog(struct brw_context *brw)
       brw->gs.prog_active = key.need_gs_prog;
    }
 
+   drm_intel_bo_unreference(brw->gs.prog_bo);
+   brw->gs.prog_bo = NULL;
+
    if (brw->gs.prog_active) {
-      drm_intel_bo_unreference(brw->gs.prog_bo);
       brw->gs.prog_bo = brw_search_cache(&brw->cache, BRW_GS_PROG,
 					 &key, sizeof(key),
 					 NULL, 0,
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 1d350bc0413..a91b0528fac 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -38,40 +38,6 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
-
-
-
-
-/***********************************************************************
- * Blend color
- */
-
-static void upload_blend_constant_color(struct brw_context *brw)
-{
-   struct gl_context *ctx = &brw->intel.ctx;
-   struct brw_blend_constant_color bcc;
-
-   memset(&bcc, 0, sizeof(bcc));      
-   bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR;
-   bcc.header.length = sizeof(bcc)/4-2;
-   bcc.blend_constant_color[0] = ctx->Color.BlendColor[0];
-   bcc.blend_constant_color[1] = ctx->Color.BlendColor[1];
-   bcc.blend_constant_color[2] = ctx->Color.BlendColor[2];
-   bcc.blend_constant_color[3] = ctx->Color.BlendColor[3];
-
-   BRW_CACHED_BATCH_STRUCT(brw, &bcc);
-}
-
-
-const struct brw_tracked_state brw_blend_constant_color = {
-   .dirty = {
-      .mesa = _NEW_COLOR,
-      .brw = BRW_NEW_CONTEXT,
-      .cache = 0
-   },
-   .emit = upload_blend_constant_color
-};
-
 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 static void upload_drawing_rect(struct brw_context *brw)
 {
@@ -339,6 +305,9 @@ static void upload_polygon_stipple(struct brw_context *brw)
    struct brw_polygon_stipple bps;
    GLuint i;
 
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
    memset(&bps, 0, sizeof(bps));
    bps.header.opcode = CMD_POLY_STIPPLE_PATTERN;
    bps.header.length = sizeof(bps)/4-2;
@@ -381,6 +350,9 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
    struct gl_context *ctx = &brw->intel.ctx;
    struct brw_polygon_stipple_offset bpso;
 
+   if (!ctx->Polygon.StippleFlag)
+      return;
+
    memset(&bpso, 0, sizeof(bpso));
    bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET;
    bpso.header.length = sizeof(bpso)/4-2;
@@ -409,7 +381,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
 
 const struct brw_tracked_state brw_polygon_stipple_offset = {
    .dirty = {
-      .mesa = _NEW_WINDOW_POS,
+      .mesa = _NEW_WINDOW_POS | _NEW_POLYGONSTIPPLE,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
@@ -421,9 +393,10 @@ const struct brw_tracked_state brw_polygon_stipple_offset = {
  */
 static void upload_aa_line_parameters(struct brw_context *brw)
 {
+   struct gl_context *ctx = &brw->intel.ctx;
    struct brw_aa_line_parameters balp;
 
-   if (!brw->has_aa_line_parameters)
+   if (!ctx->Line.SmoothFlag || !brw->has_aa_line_parameters)
       return;
 
    /* use legacy aa line coverage computation */
@@ -436,7 +409,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 
 const struct brw_tracked_state brw_aa_line_parameters = {
    .dirty = {
-      .mesa = 0,
+      .mesa = _NEW_LINE,
       .brw = BRW_NEW_CONTEXT,
       .cache = 0
    },
@@ -454,6 +427,9 @@ static void upload_line_stipple(struct brw_context *brw)
    GLfloat tmp;
    GLint tmpi;
 
+   if (!ctx->Line.StippleFlag)
+      return;
+
    memset(&bls, 0, sizeof(bls));
    bls.header.opcode = CMD_LINE_STIPPLE_PATTERN;
    bls.header.length = sizeof(bls)/4 - 2;
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 1367d814696..94efa791091 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -142,7 +142,6 @@ static GLboolean brwProgramStringNotify( struct gl_context *ctx,
       if (newFP == curFP)
 	 brw->state.dirty.brw |= BRW_NEW_FRAGMENT_PROGRAM;
       newFP->id = brw->program_id++;      
-      newFP->isGLSL = brw_wm_is_glsl(fprog);
 
       /* Don't reject fragment shaders for their Mesa IR state when we're
        * using the new FS backend.
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 338f3876b31..eba4411ca70 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -129,7 +129,7 @@ const struct brw_tracked_state *gen6_atoms[] =
 
    &brw_vs_constants, /* Before vs_surfaces and constant_buffer */
    &brw_wm_constants, /* Before wm_surfaces and constant_buffer */
-   &gen6_wm_constants, /* Before wm_surfaces and constant_buffer */
+   &gen6_wm_constants, /* Before wm_state */
 
    &brw_vs_surfaces,		/* must do before unit */
    &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 8ce9af9c4fe..461f27048cc 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -1064,6 +1064,15 @@ struct brw_sampler_default_color {
    GLfloat color[4];
 };
 
+struct gen5_sampler_default_color {
+   uint8_t ub[4];
+   float f[4];
+   uint16_t hf[4];
+   uint16_t us[4];
+   int16_t s[4];
+   uint8_t b[4];
+};
+
 struct brw_sampler_state
 {
    
@@ -1169,7 +1178,12 @@ struct brw_surface_state
       GLuint cube_neg_y:1; 
       GLuint cube_pos_x:1; 
       GLuint cube_neg_x:1; 
-      GLuint pad:4;
+      GLuint pad:2;
+      /* Required on gen6 for surfaces accessed through render cache messages.
+       */
+      GLuint render_cache_read_write:1;
+      /* Ironlake and newer: instead of replicating one of the texels */
+      GLuint cube_corner_average:1;
       GLuint mipmap_layout_mode:1; 
       GLuint vert_line_stride_ofs:1; 
       GLuint vert_line_stride:1; 
@@ -1539,6 +1553,21 @@ struct brw_instruction
 	 GLuint  pad0:12;
       } if_else;
 
+      struct
+      {
+	 /* Signed jump distance to the ip to jump to if all channels
+	  * are disabled after the break or continue.  It should point
+	  * to the end of the innermost control flow block, as that's
+	  * where some channel could get re-enabled.
+	  */
+	 int jip:16;
+
+	 /* Signed jump distance to the location to resume execution
+	  * of this channel if it's enabled for the break or continue.
+	  */
+	 int uip:16;
+      } break_cont;
+
       struct {
 	 GLuint function:4;
 	 GLuint int_type:1;
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index 4a41c7a5176..6ae75d22c14 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -99,8 +99,8 @@ static void do_vs_prog( struct brw_context *brw,
    (void) ctx;
 
    aux_size = sizeof(c.prog_data);
-   if (c.vp->use_const_buffer)
-      aux_size += c.vp->program.Base.Parameters->NumParameters;
+   /* constant_map */
+   aux_size += c.vp->program.Base.Parameters->NumParameters;
 
    drm_intel_bo_unreference(brw->vs.prog_bo);
    brw->vs.prog_bo = brw_upload_cache_with_auxdata(&brw->cache, BRW_VS_PROG,
@@ -130,6 +130,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
    key.copy_edgeflag = (ctx->Polygon.FrontMode != GL_FILL ||
 			ctx->Polygon.BackMode != GL_FILL);
+   key.two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_POINT */
    if (ctx->Point.PointSprite) {
@@ -157,7 +158,7 @@ static void brw_upload_vs_prog(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_vs_prog = {
    .dirty = {
-      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT,
+      .mesa  = _NEW_TRANSFORM | _NEW_POLYGON | _NEW_POINT | _NEW_LIGHT,
       .brw   = BRW_NEW_VERTEX_PROGRAM,
       .cache = 0
    },
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 9338a6b7dbf..0b88cc1ec76 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -44,6 +44,7 @@ struct brw_vs_prog_key {
    GLuint nr_userclip:4;
    GLuint copy_edgeflag:1;
    GLuint point_coord_replace:8;
+   GLuint two_side_color: 1;
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index 7e43324a1f9..09887dae95d 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -140,9 +140,13 @@ clear_current_const(struct brw_vs_compile *c)
 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    struct intel_context *intel = &c->func.brw->intel;
-   GLuint i, reg = 0, mrf;
+   GLuint i, reg = 0, mrf, j;
    int attributes_in_vue;
    int first_reladdr_output;
+   int max_constant;
+   int constant = 0;
+   int vert_result_reoder[VERT_RESULT_MAX];
+   int bfc = 0;
 
    /* Determine whether to use a real constant buffer or use a block
     * of GRF registers for constants.  The later is faster but only
@@ -181,62 +185,81 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 
    }
 
-   /* Vertex program parameters from curbe:
+   /* Assign some (probably all) of the vertex program constants to
+    * the push constant buffer/CURBE.
+    *
+    * There's an obvious limit to the number of push constants equal to
+    * the number of registers available, and that number is smaller
+    * than the minimum maximum number of vertex program parameters, so
+    * support for pull constants is required if we overflow.
+    * Additionally, on gen6 the number of push constants is even
+    * lower.
+    *
+    * When there's relative addressing, we don't know what range of
+    * Mesa IR registers can be accessed.  And generally, when relative
+    * addressing is used we also have too many constants to load them
+    * all as push constants.  So, we'll just support relative
+    * addressing out of the pull constant buffers, and try to load as
+    * many statically-accessed constants into the push constant buffer
+    * as we can.
     */
-   if (c->vp->use_const_buffer) {
-      int max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
-      int constant = 0;
-
-      /* We've got more constants than we can load with the push
-       * mechanism.  This is often correlated with reladdr loads where
-       * we should probably be using a pull mechanism anyway to avoid
-       * excessive reading.  However, the pull mechanism is slow in
-       * general.  So, we try to allocate as many non-reladdr-loaded
-       * constants through the push buffer as we can before giving up.
-       */
-      memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
-      for (i = 0;
-	   i < c->vp->program.Base.NumInstructions && constant < max_constant;
-	   i++) {
-	 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
-	 int arg;
-
-	 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
-	    if ((inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
-		 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
-		 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
-		 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
-		 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) ||
-		inst->SrcReg[arg].RelAddr)
-	       continue;
-
-	    if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
-	       c->constant_map[inst->SrcReg[arg].Index] = constant++;
-	    }
+   if (intel->gen >= 6) {
+      /* We can only load 32 regs of push constants. */
+      max_constant = 32 * 2 - c->key.nr_userclip;
+   } else {
+      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
+   }
+
+   /* constant_map maps from ParameterValues[] index to index in the
+    * push constant buffer, or -1 if it's only in the pull constant
+    * buffer.
+    */
+   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
+   for (i = 0;
+	i < c->vp->program.Base.NumInstructions && constant < max_constant;
+	i++) {
+      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
+      int arg;
+
+      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
+	 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
+	     inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
+	     inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
+	     inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
+	     inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
+	    continue;
 	 }
-      }
 
-      for (i = 0; i < constant; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2,
-							      (i%2) * 4),
-						 0, 4, 1);
+	 if (inst->SrcReg[arg].RelAddr) {
+	    c->vp->use_const_buffer = GL_TRUE;
+	    continue;
+	 }
+
+	 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
+	    c->constant_map[inst->SrcReg[arg].Index] = constant++;
+	 }
       }
-      reg += (constant + 1) / 2;
-      c->prog_data.curb_read_length = reg - 1;
-      /* XXX 0 causes a bug elsewhere... */
-      c->prog_data.nr_params = MAX2(constant * 4, 4);
    }
-   else {
-      /* use a section of the GRF for constants */
-      GLuint nr_params = c->vp->program.Base.Parameters->NumParameters;
-      for (i = 0; i < nr_params; i++) {
-         c->regs[PROGRAM_STATE_VAR][i] = stride( brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1);
-      }
-      reg += (nr_params + 1) / 2;
-      c->prog_data.curb_read_length = reg - 1;
 
-      c->prog_data.nr_params = nr_params * 4;
+   /* If we ran out of push constant space, then we'll also upload all
+    * constants through the pull constant buffer so that they can be
+    * accessed no matter what.  For relative addressing (the common
+    * case) we need them all in place anyway.
+    */
+   if (constant == max_constant)
+      c->vp->use_const_buffer = GL_TRUE;
+
+   for (i = 0; i < constant; i++) {
+      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
+							  (i % 2) * 4),
+					     0, 4, 1);
    }
+   reg += (constant + 1) / 2;
+   c->prog_data.curb_read_length = reg - 1;
+   c->prog_data.nr_params = constant * 4;
+   /* XXX 0 causes a bug elsewhere... */
+   if (intel->gen < 6 && c->prog_data.nr_params == 0)
+      c->prog_data.nr_params = 4;
 
    /* Allocate input regs:  
     */
@@ -270,7 +293,36 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       mrf = 4;
 
    first_reladdr_output = get_first_reladdr_output(&c->vp->program);
-   for (i = 0; i < VERT_RESULT_MAX; i++) {
+
+   for (i = 0; i < VERT_RESULT_MAX; i++)
+       vert_result_reoder[i] = i;
+
+   /* adjust attribute order in VUE for BFC0/BFC1 on Gen6+ */
+   if (intel->gen >= 6 && c->key.two_side_color) {
+       if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
+           (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
+           assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
+           assert(c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
+           bfc = 2;
+       } else if ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
+           (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
+           bfc = 1;
+
+       if (bfc) {
+           for (i = 0; i < bfc; i++) {
+               vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 0] = VERT_RESULT_COL0 + i;
+               vert_result_reoder[VERT_RESULT_COL0 + i * 2 + 1] = VERT_RESULT_BFC0 + i;
+           }
+
+           for (i = VERT_RESULT_COL0 + bfc * 2; i < VERT_RESULT_BFC0 + bfc; i++) {
+               vert_result_reoder[i] = i - bfc;
+           }
+       }
+   }
+
+   for (j = 0; j < VERT_RESULT_MAX; j++) {
+      i = vert_result_reoder[j];
+
       if (c->prog_data.outputs_written & BITFIELD64_BIT(i)) {
 	 c->nr_outputs++;
          assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
@@ -281,7 +333,6 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	 else if (i == VERT_RESULT_PSIZ) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
-	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
 	    /* Two restrictions on our compute-to-MRF here.  The
@@ -574,9 +625,18 @@ static void emit_max( struct brw_compile *p,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
-   brw_SEL(p, dst, arg0, arg1);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   } else {
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
 }
 
 static void emit_min( struct brw_compile *p, 
@@ -584,9 +644,18 @@ static void emit_min( struct brw_compile *p,
 		      struct brw_reg arg0,
 		      struct brw_reg arg1 )
 {
-   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
-   brw_SEL(p, dst, arg0, arg1);
-   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   struct intel_context *intel = &p->brw->intel;
+
+   if (intel->gen >= 6) {
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   } else {
+      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
+      brw_SEL(p, dst, arg0, arg1);
+      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+   }
 }
 
 static void emit_math1_gen4(struct brw_vs_compile *c,
@@ -680,7 +749,7 @@ emit_math1(struct brw_vs_compile *c,
       emit_math1_gen4(c, function, dst, arg0, precision);
 }
 
-static void emit_math2( struct brw_vs_compile *c, 
+static void emit_math2_gen4( struct brw_vs_compile *c, 
 			GLuint function,
 			struct brw_reg dst,
 			struct brw_reg arg0,
@@ -688,14 +757,11 @@ static void emit_math2( struct brw_vs_compile *c,
 			GLuint precision)
 {
    struct brw_compile *p = &c->func;
-   struct intel_context *intel = &p->brw->intel;
    struct brw_reg tmp = dst;
    GLboolean need_tmp = GL_FALSE;
 
-   if (dst.file != BRW_GENERAL_REGISTER_FILE)
-      need_tmp = GL_TRUE;
-
-   if (intel->gen < 6 && dst.dw1.bits.writemask != 0xf)
+   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
+       dst.dw1.bits.writemask != 0xf)
       need_tmp = GL_TRUE;
 
    if (need_tmp) 
@@ -718,6 +784,53 @@ static void emit_math2( struct brw_vs_compile *c,
    }
 }
 
+static void emit_math2_gen6( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct brw_reg tmp_src0, tmp_src1, tmp_dst;
+   /* Gen6+ path of emit_math2(); 'precision' is accepted but not used here. */
+   tmp_src0 = get_tmp(c);
+   tmp_src1 = get_tmp(c);
+   tmp_dst = get_tmp(c);
+   /* Copy both source operands into their temporaries. */
+   brw_MOV(p, tmp_src0, arg0);
+   brw_MOV(p, tmp_src1, arg1);
+   /* Run brw_math2() in align1 mode, then switch back to align16. */
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_math2(p,
+	    tmp_dst,
+	    function,
+	    tmp_src0,
+	    tmp_src1);
+   brw_set_access_mode(p, BRW_ALIGN_16);
+   /* Copy the math result from its temporary into the caller's dst. */
+   brw_MOV(p, dst, tmp_dst);
+   /* Release all three temporaries obtained above. */
+   release_tmp(c, tmp_src0);
+   release_tmp(c, tmp_src1);
+   release_tmp(c, tmp_dst);
+}
+
+static void emit_math2( struct brw_vs_compile *c, 
+			GLuint function,
+			struct brw_reg dst,
+			struct brw_reg arg0,
+			struct brw_reg arg1,
+			GLuint precision)
+{
+   struct brw_compile *p = &c->func;
+   struct intel_context *intel = &p->brw->intel;
+   /* Dispatch on hardware generation; both helpers get identical arguments. */
+   if (intel->gen >= 6)
+      emit_math2_gen6(c, function, dst, arg0, arg1, precision);
+   else
+      emit_math2_gen4(c, function, dst, arg0, arg1, precision);
+}
 
 static void emit_exp_noalias( struct brw_vs_compile *c,
 			      struct brw_reg dst,
@@ -990,8 +1103,6 @@ get_constant(struct brw_vs_compile *c,
 
    assert(argIndex < 3);
 
-   assert(c->func.brw->intel.gen < 6); /* FINISHME */
-
    if (c->current_const[argIndex].index != src->Index) {
       /* Keep track of the last constant loaded in this slot, for reuse. */
       c->current_const[argIndex].index = src->Index;
@@ -1022,14 +1133,14 @@ get_reladdr_constant(struct brw_vs_compile *c,
 {
    const struct prog_src_register *src = &inst->SrcReg[argIndex];
    struct brw_compile *p = &c->func;
+   struct brw_context *brw = p->brw;
+   struct intel_context *intel = &brw->intel;
    struct brw_reg const_reg = c->current_const[argIndex].reg;
-   struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
-   struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
+   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
+   uint32_t offset;
 
    assert(argIndex < 3);
 
-   assert(c->func.brw->intel.gen < 6); /* FINISHME */
-
    /* Can't reuse a reladdr constant load. */
    c->current_const[argIndex].index = -1;
 
@@ -1038,15 +1149,21 @@ get_reladdr_constant(struct brw_vs_compile *c,
 	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
 #endif
 
-   brw_MUL(p, byte_addr_reg, addrReg, brw_imm_ud(16));
+   if (intel->gen >= 6) {
+      offset = src->Index;
+   } else {
+      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
+      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
+      addr_reg = byte_addr_reg;
+      offset = 16 * src->Index;
+   }
 
    /* fetch the first vec4 */
    brw_dp_READ_4_vs_relative(p,
-			     const_reg,                     /* writeback dest */
-			     byte_addr_reg,                 /* address register */
-			     16 * src->Index,               /* byte offset */
-			     SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
-			     );
+			     const_reg,
+			     addr_reg,
+			     offset,
+			     SURF_INDEX_VERT_CONST_BUFFER);
 
    return const_reg;
 }
@@ -1241,22 +1358,18 @@ get_src_reg( struct brw_vs_compile *c,
    case PROGRAM_UNIFORM:
    case PROGRAM_ENV_PARAM:
    case PROGRAM_LOCAL_PARAM:
-      if (c->vp->use_const_buffer) {
-	 if (!relAddr && c->constant_map[index] != -1) {
-	    assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
-	    return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
-	 } else if (relAddr)
+      if (!relAddr && c->constant_map[index] != -1) {
+	 /* Take from the push constant buffer if possible. */
+	 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
+	 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
+      } else {
+	 /* Must be in the pull constant buffer then. */
+	 assert(c->vp->use_const_buffer);
+	 if (relAddr)
 	    return get_reladdr_constant(c, inst, argIndex);
 	 else
 	    return get_constant(c, inst, argIndex);
       }
-      else if (relAddr) {
-         return deref(c, c->regs[PROGRAM_STATE_VAR][0], index, 16);
-      }
-      else {
-         assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
-         return c->regs[PROGRAM_STATE_VAR][index];
-      }
    case PROGRAM_ADDRESS:
       assert(index == 0);
       return c->regs[file][index];
@@ -1585,6 +1698,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 	 break;
       if (!(c->prog_data.outputs_written & BITFIELD64_BIT(i)))
 	 continue;
+      if (i == VERT_RESULT_PSIZ)
+	 continue;
 
       if (i >= VERT_RESULT_TEX0 &&
 	  c->regs[PROGRAM_OUTPUT][i].file == BRW_GENERAL_REGISTER_FILE) {
@@ -1895,7 +2010,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
 	 break;
       case OPCODE_RSQ:
-	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL);
+	 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
 	 break;
 
       case OPCODE_SEQ:
@@ -1969,35 +2084,42 @@ void brw_vs_emit(struct brw_vs_compile *c )
          break;
       case OPCODE_CONT:
 	 brw_set_predicate_control(p, get_predicate(inst));
-	 brw_CONT(p, if_depth_in_loop[loop_depth]);
+	 if (intel->gen >= 6) {
+	    brw_CONT_gen6(p, loop_inst[loop_depth - 1]);
+	 } else {
+	    brw_CONT(p, if_depth_in_loop[loop_depth]);
+	 }
          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
          break;
-      case OPCODE_ENDLOOP: 
-         {
-	    clear_current_const(c);
-            struct brw_instruction *inst0, *inst1;
-	    GLuint br = 1;
-
-            loop_depth--;
-
-	    if (intel->gen == 5)
-	       br = 2;
-
-            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
-            while (inst0 > loop_inst[loop_depth]) {
-               inst0--;
-               if (inst0->header.opcode == BRW_OPCODE_BREAK &&
+
+      case OPCODE_ENDLOOP: {
+	 clear_current_const(c);
+	 struct brw_instruction *inst0, *inst1;
+	 GLuint br = 1;
+
+	 loop_depth--;
+
+	 if (intel->gen == 5)
+	    br = 2;
+
+	 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+
+	 if (intel->gen < 6) {
+	    /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+	    while (inst0 > loop_inst[loop_depth]) {
+	       inst0--;
+	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
 		   inst0->bits3.if_else.jump_count == 0) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-               }
-               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			inst0->bits3.if_else.jump_count == 0) {
-                  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-               }
-            }
-         }
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
+	       } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
+			  inst0->bits3.if_else.jump_count == 0) {
+		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
+	       }
+	    }
+	 }
+      }
          break;
+
       case OPCODE_BRA:
 	 brw_set_predicate_control(p, get_predicate(inst));
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
@@ -2088,6 +2210,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
    }
 
    brw_resolve_cals(p);
+   brw_set_uip_jip(p);
 
    brw_optimize(p);
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index ccdc18e0b8d..656501b4f79 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -119,6 +119,62 @@ brw_wm_non_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
    brw_wm_emit(c);
 }
 
+static void
+brw_wm_payload_setup(struct brw_context *brw,
+		     struct brw_wm_compile *c)
+{
+   struct intel_context *intel = &brw->intel;
+   bool uses_depth = (c->fp->program.Base.InputsRead &
+		      (1 << FRAG_ATTRIB_WPOS)) != 0;
+   /* Gen6+: count payload regs below; older gens call brw_wm_lookup_iz(). */
+   if (intel->gen >= 6) {
+      /* R0-1: masks, pixel X/Y coordinates. */
+      c->nr_payload_regs = 2;
+      /* R2: only for 32-pixel dispatch. */
+      /* R3-4: perspective pixel location barycentric */
+      c->nr_payload_regs += 2;
+      /* R5-6: perspective pixel location bary for dispatch width != 8 */
+      if (c->dispatch_width == 16) {
+	 c->nr_payload_regs += 2;
+      }
+      /* R7-10: perspective centroid barycentric */
+      /* R11-14: perspective sample barycentric */
+      /* R15-18: linear pixel location barycentric */
+      /* R19-22: linear centroid barycentric */
+      /* R23-26: linear sample barycentric */
+
+      /* R27: interpolated depth if uses source depth */
+      if (uses_depth) {
+	 c->source_depth_reg = c->nr_payload_regs;
+	 c->nr_payload_regs++;
+	 if (c->dispatch_width == 16) {
+	    /* R28: interpolated depth if not 8-wide. */
+	    c->nr_payload_regs++;
+	 }
+      }
+      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
+       */
+      if (uses_depth) {
+	 c->source_w_reg = c->nr_payload_regs;
+	 c->nr_payload_regs++;
+	 if (c->dispatch_width == 16) {
+	    /* R30: interpolated W if not 8-wide. */
+	    c->nr_payload_regs++;
+	 }
+      }
+      /* R31: MSAA position offsets. */
+      /* R32-: bary for 32-pixel. */
+      /* R58-59: interp W for 32-pixel. */
+      /* A program that writes FRAG_RESULT_DEPTH sets both depth flags. */
+      if (c->fp->program.Base.OutputsWritten &
+	  BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+	 c->source_depth_to_render_target = GL_TRUE;
+	 c->computes_depth = GL_TRUE;
+      }
+   } else {
+      brw_wm_lookup_iz(intel, c);
+   }
+}
 
 /**
  * All Mesa program -> GPU code generation goes through this function.
@@ -167,23 +223,18 @@ static void do_wm_prog( struct brw_context *brw,
 
    brw_init_compile(brw, &c->func);
 
-   /* temporary sanity check assertion */
-   ASSERT(fp->isGLSL == brw_wm_is_glsl(&c->fp->program));
+   brw_wm_payload_setup(brw, c);
 
    if (!brw_wm_fs_emit(brw, c)) {
       /*
        * Shader which use GLSL features such as flow control are handled
        * differently from "simple" shaders.
        */
-      if (fp->isGLSL) {
-	 c->dispatch_width = 8;
-	 brw_wm_glsl_emit(brw, c);
-      }
-      else {
-	 c->dispatch_width = 16;
-	 brw_wm_non_glsl_emit(brw, c);
-      }
+      c->dispatch_width = 16;
+      brw_wm_payload_setup(brw, c);
+      brw_wm_non_glsl_emit(brw, c);
    }
+   c->prog_data.dispatch_width = c->dispatch_width;
 
    /* Scratch space is used for register spilling */
    if (c->last_scratch) {
@@ -220,12 +271,10 @@ static void do_wm_prog( struct brw_context *brw,
 static void brw_wm_populate_key( struct brw_context *brw,
 				 struct brw_wm_prog_key *key )
 {
-   struct intel_context *intel = &brw->intel;
    struct gl_context *ctx = &brw->intel.ctx;
    /* BRW_NEW_FRAGMENT_PROGRAM */
    const struct brw_fragment_program *fp = 
       (struct brw_fragment_program *)brw->fragment_program;
-   GLboolean uses_depth = (fp->program.Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
    GLuint lookup = 0;
    GLuint line_aa;
    GLuint i;
@@ -285,57 +334,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
       }
    }
 
-   if (intel->gen >= 6) {
-      /* R0-1: masks, pixel X/Y coordinates. */
-      key->nr_payload_regs = 2;
-      /* R2: only for 32-pixel dispatch.*/
-      /* R3-4: perspective pixel location barycentric */
-      key->nr_payload_regs += 2;
-      /* R5-6: perspective pixel location bary for dispatch width != 8 */
-      if (!fp->isGLSL) { /* dispatch_width != 8 */
-	 key->nr_payload_regs += 2;
-      }
-      /* R7-10: perspective centroid barycentric */
-      /* R11-14: perspective sample barycentric */
-      /* R15-18: linear pixel location barycentric */
-      /* R19-22: linear centroid barycentric */
-      /* R23-26: linear sample barycentric */
-
-      /* R27: interpolated depth if uses source depth */
-      if (uses_depth) {
-	 key->source_depth_reg = key->nr_payload_regs;
-	 key->nr_payload_regs++;
-	 if (!fp->isGLSL) { /* dispatch_width != 8 */
-	    /* R28: interpolated depth if not 8-wide. */
-	    key->nr_payload_regs++;
-	 }
-      }
-      /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
-       */
-      if (uses_depth) {
-	 key->source_w_reg = key->nr_payload_regs;
-	 key->nr_payload_regs++;
-	 if (!fp->isGLSL) { /* dispatch_width != 8 */
-	    /* R30: interpolated W if not 8-wide. */
-	    key->nr_payload_regs++;
-	 }
-      }
-      /* R31: MSAA position offsets. */
-      /* R32-: bary for 32-pixel. */
-      /* R58-59: interp W for 32-pixel. */
-
-      if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
-	 key->source_depth_to_render_target = GL_TRUE;
-	 key->computes_depth = GL_TRUE;
-      }
-
-   } else {
-      brw_wm_lookup_iz(intel,
-	      	       line_aa,
-		       lookup,
-		       uses_depth,
-		       key);
-   }
+   key->iz_lookup = lookup;
+   key->line_aa = line_aa;
+   key->stats_wm = brw->intel.stats_wm;
 
    /* BRW_NEW_WM_INPUT_DIMENSIONS */
    key->proj_attrib_mask = brw->wm.input_size_masks[4-1];
@@ -377,6 +378,10 @@ static void brw_wm_populate_key( struct brw_context *brw,
 	       swizzles[2] = SWIZZLE_ZERO;
 	    } else if (t->DepthMode == GL_LUMINANCE) {
 	       swizzles[3] = SWIZZLE_ONE;
+	    } else if (t->DepthMode == GL_RED) {
+	       swizzles[1] = SWIZZLE_ZERO;
+	       swizzles[2] = SWIZZLE_ZERO;
+	       swizzles[3] = SWIZZLE_ZERO;
 	    }
 	 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 2ca685784fc..e7f3cfbb75f 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -59,16 +59,9 @@
 #define AA_ALWAYS    2
 
 struct brw_wm_prog_key {
-   GLuint source_depth_reg:3;
-   GLuint source_w_reg:3;
-   GLuint aa_dest_stencil_reg:3;
-   GLuint dest_depth_reg:3;
-   GLuint nr_payload_regs:4;
-   GLuint computes_depth:1;	/* could be derived from program string */
-   GLuint source_depth_to_render_target:1;
+   GLuint stats_wm:1;
    GLuint flat_shade:1;
    GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
-   GLuint runtime_check_aads_emit:1;
    GLuint nr_color_regions:5;
    GLuint render_to_fbo:1;
 
@@ -81,6 +74,8 @@ struct brw_wm_prog_key {
 
    GLushort drawable_height;
    GLbitfield64 vp_outputs_written;
+   GLuint iz_lookup;
+   GLuint line_aa;
    GLuint program_string_id:32;
 };
 
@@ -204,6 +199,15 @@ struct brw_wm_compile {
       PASS2_DONE
    } state;
 
+   GLuint source_depth_reg:3;
+   GLuint source_w_reg:3;
+   GLuint aa_dest_stencil_reg:3;
+   GLuint dest_depth_reg:3;
+   GLuint nr_payload_regs:4;
+   GLuint computes_depth:1;	/* could be derived from program string */
+   GLuint source_depth_to_render_target:1;
+   GLuint runtime_check_aads_emit:1;
+
    /* Initial pass - translate fp instructions to fp instructions,
     * simplifying and adding instructions for interpolation and
     * framebuffer writes.
@@ -306,14 +310,9 @@ void brw_wm_print_insn( struct brw_wm_compile *c,
 void brw_wm_print_program( struct brw_wm_compile *c,
 			   const char *stage );
 
-void brw_wm_lookup_iz( struct intel_context *intel,
-		       GLuint line_aa,
-		       GLuint lookup,
-		       GLboolean ps_uses_depth,
-		       struct brw_wm_prog_key *key );
+void brw_wm_lookup_iz(struct intel_context *intel,
+		      struct brw_wm_compile *c);
 
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp);
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c);
 GLboolean brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c);
 
 /* brw_wm_emit.c */
@@ -381,7 +380,6 @@ void emit_fb_write(struct brw_wm_compile *c,
 void emit_frontfacing(struct brw_compile *p,
 		      const struct brw_reg *dst,
 		      GLuint mask);
-void emit_kil_nv(struct brw_wm_compile *c);
 void emit_linterp(struct brw_compile *p,
 		  const struct brw_reg *dst,
 		  GLuint mask,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index 96fecc97ee2..a0e86034e1e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -896,10 +896,14 @@ void emit_math1(struct brw_wm_compile *c,
 		      BRW_MATH_SATURATE_NONE);
    struct brw_reg src;
 
-   if (intel->gen >= 6 && (arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
-			   arg0[0].file != BRW_GENERAL_REGISTER_FILE)) {
+   if (intel->gen >= 6 && ((arg0[0].hstride == BRW_HORIZONTAL_STRIDE_0 ||
+			    arg0[0].file != BRW_GENERAL_REGISTER_FILE) ||
+			   arg0[0].negate || arg0[0].abs)) {
       /* Gen6 math requires that source and dst horizontal stride be 1,
        * and that the argument be in the GRF.
+       *
+       * The hardware ignores source modifiers (negate and abs) on math
+       * instructions, so we also move to a temp to set those up.
        */
       src = dst[dst_chan];
       brw_MOV(p, src, arg0[0]);
@@ -1301,9 +1305,15 @@ static void emit_kil( struct brw_wm_compile *c,
 		      struct brw_reg *arg0)
 {
    struct brw_compile *p = &c->func;
-   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+   struct intel_context *intel = &p->brw->intel;
+   struct brw_reg pixelmask;
    GLuint i, j;
 
+   if (intel->gen >= 6)
+      pixelmask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+   else
+      pixelmask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
    for (i = 0; i < 4; i++) {
       /* Check if we've already done the comparison for this reg
        * -- common when someone does KIL TEMP.wwww.
@@ -1319,26 +1329,11 @@ static void emit_kil( struct brw_wm_compile *c,
       brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0[i], brw_imm_f(0));   
       brw_set_predicate_control_flag_value(p, 0xff);
       brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-      brw_AND(p, r0uw, brw_flag_reg(), r0uw);
+      brw_AND(p, pixelmask, brw_flag_reg(), pixelmask);
       brw_pop_insn_state(p);
    }
 }
 
-/* KIL_NV kills the pixels that are currently executing, not based on a test
- * of the arguments.
- */
-void emit_kil_nv( struct brw_wm_compile *c )
-{
-   struct brw_compile *p = &c->func;
-   struct brw_reg r0uw = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_mask_control(p, BRW_MASK_DISABLE);
-   brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); /* IMASK */
-   brw_AND(p, r0uw, c->emit_mask_reg, r0uw);
-   brw_pop_insn_state(p);
-}
-
 static void fire_fb_write( struct brw_wm_compile *c,
 			   GLuint base_reg,
 			   GLuint nr,
@@ -1387,8 +1382,8 @@ static void emit_aa( struct brw_wm_compile *c,
 		     GLuint reg )
 {
    struct brw_compile *p = &c->func;
-   GLuint comp = c->key.aa_dest_stencil_reg / 2;
-   GLuint off = c->key.aa_dest_stencil_reg % 2;
+   GLuint comp = c->aa_dest_stencil_reg / 2;
+   GLuint off = c->aa_dest_stencil_reg % 2;
    struct brw_reg aa = offset(arg1[comp], off);
 
    brw_push_insn_state(p);
@@ -1416,11 +1411,10 @@ void emit_fb_write(struct brw_wm_compile *c,
    struct intel_context *intel = &brw->intel;
    GLuint nr = 2;
    GLuint channel;
-   int base_reg; /* For gen6 fb write with no header, starting from color payload directly!. */
 
    /* Reserve a space for AA - may not be needed:
     */
-   if (c->key.aa_dest_stencil_reg)
+   if (c->aa_dest_stencil_reg)
       nr += 1;
 
    /* I don't really understand how this achieves the color interleave
@@ -1428,11 +1422,6 @@ void emit_fb_write(struct brw_wm_compile *c,
     */
    brw_push_insn_state(p);
 
-   if (intel->gen >= 6)
-	base_reg = nr;
-   else
-	base_reg = 0;
-
    for (channel = 0; channel < 4; channel++) {
       if (intel->gen >= 6) {
 	 /* gen6 SIMD16 single source DP write looks like:
@@ -1493,9 +1482,9 @@ void emit_fb_write(struct brw_wm_compile *c,
 
    brw_pop_insn_state(p);
 
-   if (c->key.source_depth_to_render_target)
+   if (c->source_depth_to_render_target)
    {
-      if (c->key.computes_depth) 
+      if (c->computes_depth)
 	 brw_MOV(p, brw_message_reg(nr), arg2[2]);
       else 
 	 brw_MOV(p, brw_message_reg(nr), arg1[1]); /* ? */
@@ -1503,10 +1492,10 @@ void emit_fb_write(struct brw_wm_compile *c,
       nr += 2;
    }
 
-   if (c->key.dest_depth_reg)
+   if (c->dest_depth_reg)
    {
-      GLuint comp = c->key.dest_depth_reg / 2;
-      GLuint off = c->key.dest_depth_reg % 2;
+      GLuint comp = c->dest_depth_reg / 2;
+      GLuint off = c->dest_depth_reg % 2;
 
       if (off != 0) {
          brw_push_insn_state(p);
@@ -1524,15 +1513,27 @@ void emit_fb_write(struct brw_wm_compile *c,
    }
 
    if (intel->gen >= 6) {
-      /* Subtract off the message header, since we send headerless. */
-      nr -= 2;
+      /* Load the message header.  There's no implied move from src0
+       * to the base mrf on gen6.
+       */
+      brw_push_insn_state(p);
+      brw_set_mask_control(p, BRW_MASK_DISABLE);
+      brw_MOV(p, brw_message_reg(0), brw_vec8_grf(0, 0));
+      brw_pop_insn_state(p);
+
+      if (target != 0) {
+	 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+					0,
+					2), BRW_REGISTER_TYPE_UD),
+		 brw_imm_ud(target));
+      }
    }
 
-   if (!c->key.runtime_check_aads_emit) {
-      if (c->key.aa_dest_stencil_reg)
+   if (!c->runtime_check_aads_emit) {
+      if (c->aa_dest_stencil_reg)
 	 emit_aa(c, arg1, 2);
 
-      fire_fb_write(c, base_reg, nr, target, eot);
+      fire_fb_write(c, 0, nr, target, eot);
    }
    else {
       struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
@@ -1897,10 +1898,6 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_kil(c, args[0]);
 	 break;
 
-      case OPCODE_KIL_NV:
-	 emit_kil_nv(c);
-	 break;
-
       default:
 	 printf("Unsupported opcode %i (%s) in fragment shader\n",
 		inst->opcode, inst->opcode < MAX_OPCODE ?
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 2cae6988804..4759b289a0c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -338,11 +338,13 @@ static struct prog_src_register get_delta_xy( struct brw_wm_compile *c )
 
 static struct prog_src_register get_pixel_w( struct brw_wm_compile *c )
 {
-   /* This is only called for producing 1/w in pre-gen6 interp.  for
-    * gen6, the interp opcodes don't use this argument.
+   /* This is called for producing 1/w in pre-gen6 interp.  For gen6,
+    * the interp opcodes don't use this argument.  But to keep the
+    * nr_args = 3 expectations of pinterp happy, just stuff delta_xy
+    * into the slot.
     */
    if (c->func.brw->intel.gen >= 6)
-      return src_undef();
+      return c->delta_xy;
 
    if (src_is_undef(c->pixel_w)) {
       struct prog_dst_register pixel_w = get_temp(c);
@@ -373,11 +375,7 @@ static void emit_interp( struct brw_wm_compile *c,
    struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
    struct prog_src_register deltas;
 
-   if (c->func.brw->intel.gen < 6) {
-      deltas = get_delta_xy(c);
-   } else {
-      deltas = src_undef();
-   }
+   deltas = get_delta_xy(c);
 
    /* Need to use PINTERP on attributes which have been
     * multiplied by 1/W in the SF program, and LINTERP on those
@@ -1133,6 +1131,11 @@ void brw_wm_pass_fp( struct brw_wm_compile *c )
 	 precalc_lit(c, inst);
 	 break;
 
+      case OPCODE_RSQ:
+	 out = emit_scalar_insn(c, inst);
+	 out->SrcReg[0].Abs = GL_TRUE;
+	 break;
+
       case OPCODE_TEX:
 	 precalc_tex(c, inst);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
deleted file mode 100644
index 7fe8ab1f334..00000000000
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ /dev/null
@@ -1,1035 +0,0 @@
-#include "main/macros.h"
-#include "program/prog_parameter.h"
-#include "program/prog_print.h"
-#include "program/prog_optimize.h"
-#include "brw_context.h"
-#include "brw_eu.h"
-#include "brw_wm.h"
-
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c,
-                                  const struct prog_instruction *inst,
-                                  GLuint component);
-
-/**
- * Determine if the given fragment program uses GLSL features such
- * as flow conditionals, loops, subroutines.
- * Some GLSL shaders may use these features, others might not.
- */
-GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
-{
-    int i;
-
-    if (unlikely(INTEL_DEBUG & DEBUG_GLSL_FORCE))
-       return GL_TRUE;
-
-    for (i = 0; i < fp->Base.NumInstructions; i++) {
-	const struct prog_instruction *inst = &fp->Base.Instructions[i];
-	switch (inst->Opcode) {
-	    case OPCODE_ARL:
-	    case OPCODE_IF:
-	    case OPCODE_ENDIF:
-	    case OPCODE_CAL:
-	    case OPCODE_BRK:
-	    case OPCODE_RET:
-	    case OPCODE_BGNLOOP:
-		return GL_TRUE; 
-	    default:
-		break;
-	}
-    }
-    return GL_FALSE; 
-}
-
-
-
-static void
-reclaim_temps(struct brw_wm_compile *c);
-
-
-/** Mark GRF register as used. */
-static void
-prealloc_grf(struct brw_wm_compile *c, int r)
-{
-   c->used_grf[r] = GL_TRUE;
-}
-
-
-/** Mark given GRF register as not in use. */
-static void
-release_grf(struct brw_wm_compile *c, int r)
-{
-   /*assert(c->used_grf[r]);*/
-   c->used_grf[r] = GL_FALSE;
-   c->first_free_grf = MIN2(c->first_free_grf, r);
-}
-
-
-/** Return index of a free GRF, mark it as used. */
-static int
-alloc_grf(struct brw_wm_compile *c)
-{
-   GLuint r;
-   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
-      if (!c->used_grf[r]) {
-         c->used_grf[r] = GL_TRUE;
-         c->first_free_grf = r + 1;  /* a guess */
-         return r;
-      }
-   }
-
-   /* no free temps, try to reclaim some */
-   reclaim_temps(c);
-   c->first_free_grf = 0;
-
-   /* try alloc again */
-   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
-      if (!c->used_grf[r]) {
-         c->used_grf[r] = GL_TRUE;
-         c->first_free_grf = r + 1;  /* a guess */
-         return r;
-      }
-   }
-
-   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
-      assert(c->used_grf[r]);
-   }
-
-   /* really, no free GRF regs found */
-   if (!c->out_of_regs) {
-      /* print warning once per compilation */
-      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
-      c->out_of_regs = GL_TRUE;
-   }
-
-   return -1;
-}
-
-
-/** Return number of GRF registers used */
-static int
-num_grf_used(const struct brw_wm_compile *c)
-{
-   int r;
-   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
-      if (c->used_grf[r])
-         return r + 1;
-   return 0;
-}
-
-
-
-/**
- * Record the mapping of a Mesa register to a hardware register.
- */
-static void set_reg(struct brw_wm_compile *c, int file, int index, 
-	int component, struct brw_reg reg)
-{
-    c->wm_regs[file][index][component].reg = reg;
-    c->wm_regs[file][index][component].inited = GL_TRUE;
-}
-
-static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
-{
-    struct brw_reg reg;
-
-    /* if we need to allocate another temp, grow the tmp_regs[] array */
-    if (c->tmp_index == c->tmp_max) {
-       int r = alloc_grf(c);
-       if (r < 0) {
-          /*printf("Out of temps in %s\n", __FUNCTION__);*/
-          r = 50; /* XXX random register! */
-       }
-       c->tmp_regs[ c->tmp_max++ ] = r;
-    }
-
-    /* form the GRF register */
-    reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
-    /*printf("alloc_temp %d\n", reg.nr);*/
-    assert(reg.nr < BRW_WM_MAX_GRF);
-    return reg;
-
-}
-
-/**
- * Save current temp register info.
- * There must be a matching call to release_tmps().
- */
-static int mark_tmps(struct brw_wm_compile *c)
-{
-    return c->tmp_index;
-}
-
-static void release_tmps(struct brw_wm_compile *c, int mark)
-{
-    c->tmp_index = mark;
-}
-
-/**
- * Convert Mesa src register to brw register.
- *
- * Since we're running in SOA mode each Mesa register corresponds to four
- * hardware registers.  We allocate the hardware registers as needed here.
- *
- * \param file  register file, one of PROGRAM_x
- * \param index  register number
- * \param component  src component (X=0, Y=1, Z=2, W=3)
- * \param nr  not used?!?
- * \param neg  negate value?
- * \param abs  take absolute value?
- */
-static struct brw_reg 
-get_reg(struct brw_wm_compile *c, int file, int index, int component,
-        int nr, GLuint neg, GLuint abs)
-{
-    struct brw_reg reg;
-    switch (file) {
-	case PROGRAM_STATE_VAR:
-	case PROGRAM_CONSTANT:
-	case PROGRAM_UNIFORM:
-	    file = PROGRAM_STATE_VAR;
-	    break;
-	case PROGRAM_UNDEFINED:
-	    return brw_null_reg();	
-	case PROGRAM_TEMPORARY:
-	case PROGRAM_INPUT:
-	case PROGRAM_OUTPUT:
-	case PROGRAM_PAYLOAD:
-	    break;
-	default:
-	    _mesa_problem(NULL, "Unexpected file in get_reg()");
-	    return brw_null_reg();
-    }
-
-    assert(index < 256);
-    assert(component < 4);
-
-    /* see if we've already allocated a HW register for this Mesa register */
-    if (c->wm_regs[file][index][component].inited) {
-       /* yes, re-use */
-       reg = c->wm_regs[file][index][component].reg;
-    }
-    else {
-	/* no, allocate new register */
-       int grf = alloc_grf(c);
-       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
-       if (grf < 0) {
-          /* totally out of temps */
-          grf = 51; /* XXX random register! */
-       }
-
-       reg = brw_vec8_grf(grf, 0);
-       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
-
-       set_reg(c, file, index, component, reg);
-    }
-
-    if (neg & (1 << component)) {
-	reg = negate(reg);
-    }
-    if (abs)
-	reg = brw_abs(reg);
-    return reg;
-}
-
-
-
-/**
- * This is called if we run out of GRF registers.  Examine the live intervals
- * of temp regs in the program and free those which won't be used again.
- */
-static void
-reclaim_temps(struct brw_wm_compile *c)
-{
-   GLint intBegin[MAX_PROGRAM_TEMPS];
-   GLint intEnd[MAX_PROGRAM_TEMPS];
-   int index;
-
-   /*printf("Reclaim temps:\n");*/
-
-   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
-                             intBegin, intEnd);
-
-   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
-      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
-         /* program temp[i] can be freed */
-         int component;
-         /*printf("  temp[%d] is dead\n", index);*/
-         for (component = 0; component < 4; component++) {
-            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
-               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
-               release_grf(c, r);
-               /*
-               printf("  Reclaim temp %d, reg %d at inst %d\n",
-                      index, r, c->cur_inst);
-               */
-               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
-            }
-         }
-      }
-   }
-}
-
-
-
-
-/**
- * Preallocate registers.  This sets up the Mesa to hardware register
- * mapping for certain registers, such as constants (uniforms/state vars)
- * and shader inputs.
- */
-static void prealloc_reg(struct brw_wm_compile *c)
-{
-    struct intel_context *intel = &c->func.brw->intel;
-    int i, j;
-    struct brw_reg reg;
-    int urb_read_length = 0;
-    GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted;
-    GLuint reg_index = 0;
-
-    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
-    c->first_free_grf = 0;
-
-    for (i = 0; i < 4; i++) {
-	if (i < (c->key.nr_payload_regs + 1) / 2)
-            reg = brw_vec8_grf(i * 2, 0);
-        else
-            reg = brw_vec8_grf(0, 0);
-	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
-    }
-    set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_W, 0,
-	    brw_vec8_grf(c->key.source_w_reg, 0));
-    reg_index += c->key.nr_payload_regs;
-
-    /* constants */
-    {
-        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
-        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
-
-        /* use a real constant buffer, or just use a section of the GRF? */
-        /* XXX this heuristic may need adjustment... */
-        if ((nr_params + nr_temps) * 4 + reg_index > 80) {
-	   for (i = 0; i < nr_params; i++) {
-	      float *pv = c->fp->program.Base.Parameters->ParameterValues[i];
-	      for (j = 0; j < 4; j++) {
-		 c->prog_data.pull_param[c->prog_data.nr_pull_params] = &pv[j];
-		 c->prog_data.nr_pull_params++;
-	      }
-	   }
-
-	   c->prog_data.nr_params = 0;
-	}
-        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
-
-        if (!c->prog_data.nr_pull_params) {
-           const struct gl_program_parameter_list *plist = 
-              c->fp->program.Base.Parameters;
-           int index = 0;
-
-           /* number of float constants in CURBE */
-           c->prog_data.nr_params = 4 * nr_params;
-
-           /* loop over program constants (float[4]) */
-           for (i = 0; i < nr_params; i++) {
-              /* loop over XYZW channels */
-              for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
-                 /* Save pointer to parameter/constant value.
-                  * Constants will be copied in prepare_constant_buffer()
-                  */
-                 c->prog_data.param[index] = &plist->ParameterValues[i][j];
-                 set_reg(c, PROGRAM_STATE_VAR, i, j, reg);
-              }
-           }
-           /* number of constant regs used (each reg is float[8]) */
-	   c->nr_creg = ALIGN(nr_params, 2) / 2;
-	   reg_index += c->nr_creg;
-        }
-    }
-
-    /* fragment shader inputs: One 2-reg pair of interpolation
-     * coefficients for each vec4 to be set up.
-     */
-    if (intel->gen >= 6) {
-       for (i = 0; i < FRAG_ATTRIB_MAX; i++) {
-	  if (!(c->fp->program.Base.InputsRead & BITFIELD64_BIT(i)))
-	     continue;
-
-	  reg = brw_vec8_grf(reg_index, 0);
-	  for (j = 0; j < 4; j++) {
-	     set_reg(c, PROGRAM_PAYLOAD, i, j, reg);
-	  }
-	  reg_index += 2;
-       }
-       urb_read_length = reg_index;
-    } else {
-       for (i = 0; i < VERT_RESULT_MAX; i++) {
-	  int fp_input;
-
-	  if (i >= VERT_RESULT_VAR0)
-	     fp_input = i - VERT_RESULT_VAR0 + FRAG_ATTRIB_VAR0;
-	  else if (i <= VERT_RESULT_TEX7)
-	     fp_input = i;
-	  else
-	     fp_input = -1;
-
-	  if (fp_input >= 0 && inputs & (1 << fp_input)) {
-	     urb_read_length = reg_index;
-	     reg = brw_vec8_grf(reg_index, 0);
-	     for (j = 0; j < 4; j++)
-		set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
-	  }
-	  if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
-	     reg_index += 2;
-	  }
-       }
-    }
-
-    c->prog_data.first_curbe_grf = c->key.nr_payload_regs;
-    c->prog_data.urb_read_length = urb_read_length;
-    c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
-    reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
-    reg_index += 2;
-
-    /* mark GRF regs [0..reg_index-1] as in-use */
-    for (i = 0; i < reg_index; i++)
-       prealloc_grf(c, i);
-
-    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
-    prealloc_grf(c, 126);
-    prealloc_grf(c, 127);
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
-	struct brw_reg dst[4];
-
-	switch (inst->Opcode) {
-	case OPCODE_TEX:
-	case OPCODE_TXB:
-	    /* Allocate the channels of texture results contiguously,
-	     * since they are written out that way by the sampler unit.
-	     */
-	    for (j = 0; j < 4; j++) {
-		dst[j] = get_dst_reg(c, inst, j);
-		if (j != 0)
-		    assert(dst[j].nr == dst[j - 1].nr + 1);
-	    }
-	    break;
-	default:
-	    break;
-	}
-    }
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-	const struct prog_instruction *inst = &c->prog_instructions[i];
-
-	switch (inst->Opcode) {
-	case WM_DELTAXY:
-	    /* Allocate WM_DELTAXY destination on G45/GM45 to an
-	     * even-numbered GRF if possible so that we can use the PLN
-	     * instruction.
-	     */
-	    if (inst->DstReg.WriteMask == WRITEMASK_XY &&
-		!c->wm_regs[inst->DstReg.File][inst->DstReg.Index][0].inited &&
-		!c->wm_regs[inst->DstReg.File][inst->DstReg.Index][1].inited &&
-		(IS_G4X(intel->intelScreen->deviceID) || intel->gen == 5)) {
-		int grf;
-
-		for (grf = c->first_free_grf & ~1;
-		     grf < BRW_WM_MAX_GRF;
-		     grf += 2)
-		{
-		    if (!c->used_grf[grf] && !c->used_grf[grf + 1]) {
-			c->used_grf[grf] = GL_TRUE;
-			c->used_grf[grf + 1] = GL_TRUE;
-			c->first_free_grf = grf + 2;  /* a guess */
-
-			set_reg(c, inst->DstReg.File, inst->DstReg.Index, 0,
-				brw_vec8_grf(grf, 0));
-			set_reg(c, inst->DstReg.File, inst->DstReg.Index, 1,
-				brw_vec8_grf(grf + 1, 0));
-			break;
-		    }
-		}
-	    }
-	default:
-	    break;
-	}
-    }
-
-    /* An instruction may reference up to three constants.
-     * They'll be found in these registers.
-     * XXX alloc these on demand!
-     */
-    if (c->prog_data.nr_pull_params) {
-       for (i = 0; i < 3; i++) {
-          c->current_const[i].index = -1;
-          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
-       }
-    }
-#if 0
-    printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
-    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
-#endif
-}
-
-
-/**
- * Check if any of the instruction's src registers are constants, uniforms,
- * or statevars.  If so, fetch any constants that we don't already have in
- * the three GRF slots.
- */
-static void fetch_constants(struct brw_wm_compile *c,
-                            const struct prog_instruction *inst)
-{
-   struct brw_compile *p = &c->func;
-   GLuint i;
-
-   /* loop over instruction src regs */
-   for (i = 0; i < 3; i++) {
-      const struct prog_src_register *src = &inst->SrcReg[i];
-      if (src->File == PROGRAM_STATE_VAR ||
-          src->File == PROGRAM_CONSTANT ||
-          src->File == PROGRAM_UNIFORM) {
-	 c->current_const[i].index = src->Index;
-
-#if 0
-	 printf("  fetch const[%d] for arg %d into reg %d\n",
-		src->Index, i, c->current_const[i].reg.nr);
-#endif
-
-	 /* need to fetch the constant now */
-	 brw_oword_block_read(p,
-			      c->current_const[i].reg,
-			      brw_message_reg(1),
-			      16 * src->Index,
-			      SURF_INDEX_FRAG_CONST_BUFFER);
-      }
-   }
-}
-
-
-/**
- * Convert Mesa dst register to brw register.
- */
-static struct brw_reg get_dst_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
-                                  GLuint component)
-{
-    const int nr = 1;
-    return get_reg(c, inst->DstReg.File, inst->DstReg.Index, component, nr,
-	    0, 0);
-}
-
-
-static struct brw_reg
-get_src_reg_const(struct brw_wm_compile *c,
-                  const struct prog_instruction *inst,
-                  GLuint srcRegIndex, GLuint component)
-{
-   /* We should have already fetched the constant from the constant
-    * buffer in fetch_constants().  Now we just have to return a
-    * register description that extracts the needed component and
-    * smears it across all eight vector components.
-    */
-   const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-   struct brw_reg const_reg;
-
-   assert(component < 4);
-   assert(srcRegIndex < 3);
-   assert(c->current_const[srcRegIndex].index != -1);
-   const_reg = c->current_const[srcRegIndex].reg;
-
-   /* extract desired float from the const_reg, and smear */
-   const_reg = stride(const_reg, 0, 1, 0);
-   const_reg.subnr = component * 4;
-
-   if (src->Negate & (1 << component))
-      const_reg = negate(const_reg);
-   if (src->Abs)
-      const_reg = brw_abs(const_reg);
-
-#if 0
-   printf("  form const[%d].%d for arg %d, reg %d\n",
-          c->current_const[srcRegIndex].index,
-          component,
-          srcRegIndex,
-          const_reg.nr);
-#endif
-
-   return const_reg;
-}
-
-
-/**
- * Convert Mesa src register to brw register.
- */
-static struct brw_reg get_src_reg(struct brw_wm_compile *c, 
-                                  const struct prog_instruction *inst,
-                                  GLuint srcRegIndex, GLuint channel)
-{
-    const struct prog_src_register *src = &inst->SrcReg[srcRegIndex];
-    const GLuint nr = 1;
-    const GLuint component = GET_SWZ(src->Swizzle, channel);
-
-    /* Only one immediate value can be used per native opcode, and it
-     * has be in the src1 slot, so not all Mesa instructions will get
-     * to take advantage of immediate constants.
-     */
-    if (brw_wm_arg_can_be_immediate(inst->Opcode, srcRegIndex)) {
-       const struct gl_program_parameter_list *params;
-
-       params = c->fp->program.Base.Parameters;
-
-       /* Extended swizzle terms */
-       if (component == SWIZZLE_ZERO) {
-	  return brw_imm_f(0.0F);
-       } else if (component == SWIZZLE_ONE) {
-	  if (src->Negate)
-	     return brw_imm_f(-1.0F);
-	  else
-	     return brw_imm_f(1.0F);
-       }
-
-       if (src->File == PROGRAM_CONSTANT) {
-	  float f = params->ParameterValues[src->Index][component];
-
-	  if (src->Abs)
-	     f = fabs(f);
-	  if (src->Negate)
-	     f = -f;
-
-	  return brw_imm_f(f);
-       }
-    }
-
-    if (c->prog_data.nr_pull_params &&
-        (src->File == PROGRAM_STATE_VAR ||
-         src->File == PROGRAM_CONSTANT ||
-         src->File == PROGRAM_UNIFORM)) {
-       return get_src_reg_const(c, inst, srcRegIndex, component);
-    }
-    else {
-       /* other type of source register */
-       return get_reg(c, src->File, src->Index, component, nr, 
-                      src->Negate, src->Abs);
-    }
-}
-
-static void emit_arl(struct brw_wm_compile *c,
-                     const struct prog_instruction *inst)
-{
-    struct brw_compile *p = &c->func;
-    struct brw_reg src0, addr_reg;
-    brw_set_saturate(p, (inst->SaturateMode != SATURATE_OFF) ? 1 : 0);
-    addr_reg = brw_uw8_reg(BRW_ARCHITECTURE_REGISTER_FILE, 
-                           BRW_ARF_ADDRESS, 0);
-    src0 = get_src_reg(c, inst, 0, 0); /* channel 0 */
-    brw_MOV(p, addr_reg, src0);
-    brw_set_saturate(p, 0);
-}
-
-static INLINE struct brw_reg high_words( struct brw_reg reg )
-{
-    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_W ), 1 ),
-		   0, 8, 2 );
-}
-
-static INLINE struct brw_reg low_words( struct brw_reg reg )
-{
-    return stride( retype( reg, BRW_REGISTER_TYPE_W ), 0, 8, 2 );
-}
-
-static INLINE struct brw_reg even_bytes( struct brw_reg reg )
-{
-    return stride( retype( reg, BRW_REGISTER_TYPE_B ), 0, 16, 2 );
-}
-
-static INLINE struct brw_reg odd_bytes( struct brw_reg reg )
-{
-    return stride( suboffset( retype( reg, BRW_REGISTER_TYPE_B ), 1 ),
-		   0, 16, 2 );
-}
-
-/**
- * Resolve subroutine calls after code emit is done.
- */
-static void post_wm_emit( struct brw_wm_compile *c )
-{
-    brw_resolve_cals(&c->func);
-}
-
-static void
-get_argument_regs(struct brw_wm_compile *c,
-		  const struct prog_instruction *inst,
-		  int index,
-		  struct brw_reg *dst,
-		  struct brw_reg *regs,
-		  int mask)
-{
-    struct brw_compile *p = &c->func;
-    int i, j;
-
-    for (i = 0; i < 4; i++) {
-	if (mask & (1 << i)) {
-	    regs[i] = get_src_reg(c, inst, index, i);
-
-	    /* Unalias destination registers from our sources. */
-	    if (regs[i].file == BRW_GENERAL_REGISTER_FILE) {
-	       for (j = 0; j < 4; j++) {
-		   if (memcmp(&regs[i], &dst[j], sizeof(regs[0])) == 0) {
-		       struct brw_reg tmp = alloc_tmp(c);
-		       brw_MOV(p, tmp, regs[i]);
-		       regs[i] = tmp;
-		       break;
-		   }
-	       }
-	    }
-	}
-    }
-}
-
-static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
-{
-   struct intel_context *intel = &brw->intel;
-#define MAX_IF_DEPTH 32
-#define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
-    int if_depth_in_loop[MAX_LOOP_DEPTH];
-    GLuint i, if_depth = 0, loop_depth = 0;
-    struct brw_compile *p = &c->func;
-    struct brw_indirect stack_index = brw_indirect(0, 0);
-
-    c->out_of_regs = GL_FALSE;
-
-    if_depth_in_loop[loop_depth] = 0;
-
-    prealloc_reg(c);
-    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
-    brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
-
-    if (intel->gen >= 6)
-	brw_set_acc_write_control(p, 1);
-
-    for (i = 0; i < c->nr_fp_insns; i++) {
-        const struct prog_instruction *inst = &c->prog_instructions[i];
-	int dst_flags;
-	struct brw_reg args[3][4], dst[4];
-	int j;
-	int mark = mark_tmps( c );
-
-        c->cur_inst = i;
-
-#if 0
-        printf("Inst %d: ", i);
-        _mesa_print_instruction(inst);
-#endif
-
-        /* fetch any constants that this instruction needs */
-        if (c->prog_data.nr_pull_params)
-           fetch_constants(c, inst);
-
-	if (inst->Opcode != OPCODE_ARL) {
-	   for (j = 0; j < 4; j++) {
-	      if (inst->DstReg.WriteMask & (1 << j))
-		 dst[j] = get_dst_reg(c, inst, j);
-	      else
-		 dst[j] = brw_null_reg();
-	   }
-	}
-	for (j = 0; j < brw_wm_nr_args(inst->Opcode); j++)
-	    get_argument_regs(c, inst, j, dst, args[j], WRITEMASK_XYZW);
-
-	dst_flags = inst->DstReg.WriteMask;
-	if (inst->SaturateMode == SATURATE_ZERO_ONE)
-	    dst_flags |= SATURATE;
-
-	if (inst->CondUpdate)
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
-	else
-	    brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
-
-	switch (inst->Opcode) {
-	    case WM_PIXELXY:
-		emit_pixel_xy(c, dst, dst_flags);
-		break;
-	    case WM_DELTAXY: 
-		emit_delta_xy(p, dst, dst_flags, args[0]);
-		break;
-	    case WM_PIXELW:
-		emit_pixel_w(c, dst, dst_flags, args[0], args[1]);
-		break;	
-	    case WM_LINTERP:
-		emit_linterp(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case WM_PINTERP:
-		emit_pinterp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case WM_CINTERP:
-		emit_cinterp(p, dst, dst_flags, args[0]);
-		break;
-	    case WM_WPOSXY:
-		emit_wpos_xy(c, dst, dst_flags, args[0]);
-		break;
-	    case WM_FB_WRITE:
-		emit_fb_write(c, args[0], args[1], args[2],
-			      INST_AUX_GET_TARGET(inst->Aux),
-			      inst->Aux & INST_AUX_EOT);
-		break;
-	    case WM_FRONTFACING:
-		emit_frontfacing(p, dst, dst_flags);
-		break;
-	    case OPCODE_ADD:
-		emit_alu2(p, brw_ADD, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_ARL:
-		emit_arl(c, inst);
-		break;
-	    case OPCODE_FRC:
-		emit_alu1(p, brw_FRC, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_FLR:
-		emit_alu1(p, brw_RNDD, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_LRP:
-		emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_TRUNC:
-		emit_alu1(p, brw_RNDZ, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_MOV:
-	    case OPCODE_SWZ:
-		emit_alu1(p, brw_MOV, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_DP2:
-		emit_dp2(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DP3:
-		emit_dp3(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DP4:
-		emit_dp4(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_XPD:
-		emit_xpd(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DPH:
-		emit_dph(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_RCP:
-		emit_math1(c, BRW_MATH_FUNCTION_INV, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_RSQ:
-		emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_SIN:
-		emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_COS:
-		emit_math1(c, BRW_MATH_FUNCTION_COS, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_EX2:
-		emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_LG2:
-		emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_CMP:
-		emit_cmp(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_MIN:	
-		emit_min(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_MAX:	
-		emit_max(p, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_DDX:
-	    case OPCODE_DDY:
-		emit_ddxy(p, dst, dst_flags, (inst->Opcode == OPCODE_DDX),
-			  args[0]);
-                break;
-	    case OPCODE_SLT:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_L, args[0], args[1]);
-		break;
-	    case OPCODE_SLE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_LE, args[0], args[1]);
-		break;
-	    case OPCODE_SGT:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_G, args[0], args[1]);
-		break;
-	    case OPCODE_SGE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_GE, args[0], args[1]);
-		break;
-	    case OPCODE_SEQ:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_EQ, args[0], args[1]);
-		break;
-	    case OPCODE_SNE:
-		emit_sop(p, dst, dst_flags,
-			 BRW_CONDITIONAL_NEQ, args[0], args[1]);
-		break;
-	    case OPCODE_SSG:
-		emit_sign(p, dst, dst_flags, args[0]);
-		break;
-	    case OPCODE_MUL:
-		emit_alu2(p, brw_MUL, dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_POW:
-		emit_math2(c, BRW_MATH_FUNCTION_POW,
-			   dst, dst_flags, args[0], args[1]);
-		break;
-	    case OPCODE_MAD:
-		emit_mad(p, dst, dst_flags, args[0], args[1], args[2]);
-		break;
-	    case OPCODE_TEX:
-		emit_tex(c, dst, dst_flags, args[0],
-			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
-				 0, 1, 0, 0),
-			 inst->TexSrcTarget,
-			 inst->TexSrcUnit,
-			 (c->key.shadowtex_mask & (1 << inst->TexSrcUnit)) != 0);
-		break;
-	    case OPCODE_TXB:
-		emit_txb(c, dst, dst_flags, args[0],
-			 get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH,
-				 0, 1, 0, 0),
-			 inst->TexSrcTarget,
-			 c->fp->program.Base.SamplerUnits[inst->TexSrcUnit]);
-		break;
-	    case OPCODE_KIL_NV:
-		emit_kil_nv(c);
-		break;
-	    case OPCODE_IF:
-		assert(if_depth < MAX_IF_DEPTH);
-		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
-		if_depth_in_loop[loop_depth]++;
-		break;
-	    case OPCODE_ELSE:
-		assert(if_depth > 0);
-		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
-		break;
-	    case OPCODE_ENDIF:
-		assert(if_depth > 0);
-		brw_ENDIF(p, if_inst[--if_depth]);
-		if_depth_in_loop[loop_depth]--;
-		break;
-	    case OPCODE_BGNSUB:
-		brw_save_label(p, inst->Comment, p->nr_insn);
-		break;
-	    case OPCODE_ENDSUB:
-		/* no-op */
-		break;
-	    case OPCODE_CAL: 
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-                brw_ADD(p, get_addr_reg(stack_index),
-                         get_addr_reg(stack_index), brw_imm_d(4));
-		brw_save_call(&c->func, inst->Comment, p->nr_insn);
-                brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
-                brw_pop_insn_state(p);
-		break;
-
-	    case OPCODE_RET:
-		brw_push_insn_state(p);
-		brw_set_mask_control(p, BRW_MASK_DISABLE);
-                brw_ADD(p, get_addr_reg(stack_index),
-                        get_addr_reg(stack_index), brw_imm_d(-4));
-                brw_set_access_mode(p, BRW_ALIGN_1);
-                brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
-                brw_set_access_mode(p, BRW_ALIGN_16);
-		brw_pop_insn_state(p);
-
-		break;
-	    case OPCODE_BGNLOOP:
-                /* XXX may need to invalidate the current_constant regs */
-		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
-		if_depth_in_loop[loop_depth] = 0;
-		break;
-	    case OPCODE_BRK:
-		brw_BREAK(p, if_depth_in_loop[loop_depth]);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_CONT:
-		brw_CONT(p, if_depth_in_loop[loop_depth]);
-		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-		break;
-	    case OPCODE_ENDLOOP: 
-               {
-                  struct brw_instruction *inst0, *inst1;
-                  GLuint br = 1;
-
-                  if (intel->gen == 5)
-                     br = 2;
-
-		  assert(loop_depth > 0);
-                  loop_depth--;
-                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
-                  /* patch all the BREAK/CONT instructions from last BGNLOOP */
-                  while (inst0 > loop_inst[loop_depth]) {
-                     inst0--;
-                     if (inst0->header.opcode == BRW_OPCODE_BREAK &&
-			 inst0->bits3.if_else.jump_count == 0) {
-			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
-                     }
-                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
-			      inst0->bits3.if_else.jump_count == 0) {
-                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
-                     }
-                  }
-               }
-               break;
-	    default:
-		printf("unsupported opcode %d (%s) in fragment shader\n",
-		       inst->Opcode, inst->Opcode < MAX_OPCODE ?
-		       _mesa_opcode_string(inst->Opcode) : "unknown");
-	}
-
-	/* Release temporaries containing any unaliased source regs. */
-	release_tmps( c, mark );
-
-	if (inst->CondUpdate)
-	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
-	else
-	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
-    }
-    post_wm_emit(c);
-
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
-	 brw_disasm(stdout, &p->store[i], intel->gen);
-      printf("\n");
-    }
-}
-
-/**
- * Do GPU code generation for shaders that use GLSL features such as
- * flow control.  Other shaders will be compiled with the 
- */
-void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
-{
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        printf("brw_wm_glsl_emit:\n");
-    }
-
-    /* initial instruction translation/simplification */
-    brw_wm_pass_fp(c);
-
-    /* actual code generation */
-    brw_wm_emit_glsl(brw, c);
-
-    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
-        brw_wm_print_program(c, "brw_wm_glsl_emit done");
-    }
-
-    c->prog_data.total_grf = num_grf_used(c);
-    c->prog_data.total_scratch = 0;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index 62e556698ba..471ea1c18d6 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -120,14 +120,14 @@ const struct {
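- * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
- * \param lookup  bitmask of IZ_* flags
+ * \param intel  intel context
+ * \param c  WM compile; uses key.line_aa (AA_*) and key.iz_lookup (IZ_* bits)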
  */
-void brw_wm_lookup_iz( struct intel_context *intel,
-		       GLuint line_aa,
-		       GLuint lookup,
-		       GLboolean ps_uses_depth,
-		       struct brw_wm_prog_key *key )
+void brw_wm_lookup_iz(struct intel_context *intel,
+		      struct brw_wm_compile *c)
 {
    GLuint reg = 2;
    GLboolean kill_stats_promoted_workaround = GL_FALSE;
+   int lookup = c->key.iz_lookup;
+   bool uses_depth = (c->fp->program.Base.InputsRead &
+		      (1 << FRAG_ATTRIB_WPOS)) != 0;
 
    assert (lookup < IZ_BIT_MAX);
 
@@ -136,36 +136,36 @@ void brw_wm_lookup_iz( struct intel_context *intel,
     * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
     * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
     */
-   if (intel->stats_wm &&
+   if (c->key.stats_wm &&
        (lookup & IZ_PS_KILL_ALPHATEST_BIT) &&
        wm_iz_table[lookup].mode == P) {
       kill_stats_promoted_workaround = GL_TRUE;
    }
 
    if (lookup & IZ_PS_COMPUTES_DEPTH_BIT)
-      key->computes_depth = 1;
+      c->computes_depth = 1;
 
-   if (wm_iz_table[lookup].sd_present || ps_uses_depth ||
+   if (wm_iz_table[lookup].sd_present || uses_depth ||
        kill_stats_promoted_workaround) {
-      key->source_depth_reg = reg;
+      c->source_depth_reg = reg;
       reg += 2;
    }
 
    if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
-      key->source_depth_to_render_target = 1;
+      c->source_depth_to_render_target = 1;
 
-   if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) {
-      key->aa_dest_stencil_reg = reg;
-      key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
-				      line_aa == AA_SOMETIMES);
+   if (wm_iz_table[lookup].ds_present || c->key.line_aa != AA_NEVER) {
+      c->aa_dest_stencil_reg = reg;
+      c->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present &&
+				    c->key.line_aa == AA_SOMETIMES);
       reg++;
    }
 
    if (wm_iz_table[lookup].dd_present) {
-      key->dest_depth_reg = reg;
+      c->dest_depth_reg = reg;
       reg+=2;
    }
 
-   key->nr_payload_regs = reg;
+   c->nr_payload_regs = reg;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass0.c b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
index 83152526b3a..f78bdc31866 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass0.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass0.c
@@ -380,7 +380,7 @@ static void pass0_init_payload( struct brw_wm_compile *c )
    GLuint i;
 
    for (i = 0; i < 4; i++) {
-      GLuint j = i >= (c->key.nr_payload_regs + 1) / 2 ? 0 : i;
+      GLuint j = i >= (c->nr_payload_regs + 1) / 2 ? 0 : i;
       pass0_set_fpreg_value( c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, 
 			     &c->payload.depth[j] );
    }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index 3a2874b6ddf..7d6a3fa9f12 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -128,8 +128,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       if (inst->opcode == WM_FB_WRITE) {
 	 track_arg(c, inst, 0, WRITEMASK_XYZW); 
 	 track_arg(c, inst, 1, WRITEMASK_XYZW); 
-	 if (c->key.source_depth_to_render_target &&
-	     c->key.computes_depth)
+	 if (c->source_depth_to_render_target && c->computes_depth)
 	    track_arg(c, inst, 2, WRITEMASK_Z); 
 	 else
 	    track_arg(c, inst, 2, 0); 
@@ -281,7 +280,6 @@ void brw_wm_pass1( struct brw_wm_compile *c )
 
       case OPCODE_DST:
       case WM_FRONTFACING:
-      case OPCODE_KIL_NV:
       default:
 	 break;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass2.c b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
index 44e39538145..8c2b9e7020b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass2.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass2.c
@@ -69,6 +69,8 @@ static void prealloc_reg(struct brw_wm_compile *c,
  */
 static void init_registers( struct brw_wm_compile *c )
 {
+   struct brw_context *brw = c->func.brw;
+   struct intel_context *intel = &brw->intel;
    GLuint nr_interp_regs = 0;
    GLuint i = 0;
    GLuint j;
@@ -76,32 +78,44 @@ static void init_registers( struct brw_wm_compile *c )
    for (j = 0; j < c->grf_limit; j++) 
       c->pass2_grf[j].nextuse = BRW_WM_MAX_INSN;
 
-   for (j = 0; j < (c->key.nr_payload_regs + 1) / 2; j++)
+   for (j = 0; j < (c->nr_payload_regs + 1) / 2; j++)
       prealloc_reg(c, &c->payload.depth[j], i++);
 
    for (j = 0; j < c->nr_creg; j++) 
       prealloc_reg(c, &c->creg[j], i++);
 
-   for (j = 0; j < VERT_RESULT_MAX; j++) {
-      if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
-	 int fp_index;
-
-	 if (j >= VERT_RESULT_VAR0)
-	    fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
-	 else if (j <= VERT_RESULT_TEX7)
-	    fp_index = j;
-	 else
-	    fp_index = -1;
-
-	 nr_interp_regs++;
-	 if (fp_index >= 0)
-	    prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
+   /* Gen6+ preallocates one input_interp entry per fragment input read by
+    * the program being compiled; earlier gens walk the vertex outputs
+    * written (c->key.vp_outputs_written) and map them to FS input indices.
+    */
+   if (intel->gen >= 6) {
+      for (j = 0; j < FRAG_ATTRIB_MAX; j++) {
+	 if (c->fp->program.Base.InputsRead & BITFIELD64_BIT(j)) {
+	    nr_interp_regs++;
+	    prealloc_reg(c, &c->payload.input_interp[j], i++);
+	 }
+      }
+   } else {
+      for (j = 0; j < VERT_RESULT_MAX; j++) {
+	 if (c->key.vp_outputs_written & BITFIELD64_BIT(j)) {
+	    int fp_index;
+
+	    if (j >= VERT_RESULT_VAR0)
+	       fp_index = j - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
+	    else if (j <= VERT_RESULT_TEX7)
+	       fp_index = j;
+	    else
+	       fp_index = -1;
+
+	    nr_interp_regs++;
+	    if (fp_index >= 0)
+	       prealloc_reg(c, &c->payload.input_interp[fp_index], i++);
+	 }
       }
+      assert(nr_interp_regs >= 1);
    }
 
-   assert(nr_interp_regs >= 1);
 
-   c->prog_data.first_curbe_grf = ALIGN(c->key.nr_payload_regs, 2);
+   c->prog_data.first_curbe_grf = ALIGN(c->nr_payload_regs, 2);
    c->prog_data.urb_read_length = nr_interp_regs * 2;
    c->prog_data.curb_read_length = c->nr_creg * 2;
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index fea96d35381..e7c97a1cb05 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -69,12 +69,43 @@ static GLuint translate_wrap_mode( GLenum wrap )
 static drm_intel_bo *upload_default_color( struct brw_context *brw,
 				     const GLfloat *color )
 {
-   struct brw_sampler_default_color sdc;
+   struct intel_context *intel = &brw->intel;
 
-   COPY_4V(sdc.color, color); 
-   
-   return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
-			 &sdc, sizeof(sdc));
+   if (intel->gen >= 5) {
+      struct gen5_sampler_default_color sdc;
+
+      memset(&sdc, 0, sizeof(sdc));
+
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[0], color[0]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[1], color[1]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[2], color[2]);
+      UNCLAMPED_FLOAT_TO_UBYTE(sdc.ub[3], color[3]);
+
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[0], color[0]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[1], color[1]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[2], color[2]);
+      UNCLAMPED_FLOAT_TO_USHORT(sdc.us[3], color[3]);
+
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[0], color[0]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[1], color[1]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[2], color[2]);
+      UNCLAMPED_FLOAT_TO_SHORT(sdc.s[3], color[3]);
+
+      /* XXX: Fill in half floats */
+      /* XXX: Fill in signed bytes */
+
+      COPY_4V(sdc.f, color);
+
+      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			    &sdc, sizeof(sdc));
+   } else {
+      struct brw_sampler_default_color sdc;
+
+      COPY_4V(sdc.color, color);
+
+      return brw_cache_data(&brw->cache, BRW_SAMPLER_DEFAULT_COLOR,
+			    &sdc, sizeof(sdc));
+   }
 }
 
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 76de7b7b6f6..e9ef635bca2 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -87,7 +87,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 {
    struct gl_context *ctx = &brw->intel.ctx;
    const struct gl_fragment_program *fp = brw->fragment_program;
-   const struct brw_fragment_program *bfp = (struct brw_fragment_program *) fp;
    struct intel_context *intel = &brw->intel;
 
    memset(key, 0, sizeof(*key));
@@ -132,7 +131,6 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
 
    /* _NEW_COLOR */
    key->uses_kill = fp->UsesKill || ctx->Color.AlphaEnabled;
-   key->is_glsl = bfp->isGLSL;
 
    /* If using the fragment shader backend, the program is always
     * 8-wide.
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 76fc94df1f6..ad744044c70 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -139,6 +139,8 @@ static GLuint translate_tex_format( gl_format mesa_format,
 	  return BRW_SURFACEFORMAT_I16_UNORM;
       else if (depth_mode == GL_ALPHA)
 	  return BRW_SURFACEFORMAT_A16_UNORM;
+      else if (depth_mode == GL_RED)
+	  return BRW_SURFACEFORMAT_R16_UNORM;
       else
 	  return BRW_SURFACEFORMAT_L16_UNORM;
 
@@ -174,6 +176,8 @@ static GLuint translate_tex_format( gl_format mesa_format,
          return BRW_SURFACEFORMAT_I24X8_UNORM;
       else if (depth_mode == GL_ALPHA)
          return BRW_SURFACEFORMAT_A24X8_UNORM;
+      else if (depth_mode == GL_RED)
+         return BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS;
       else
          return BRW_SURFACEFORMAT_L24X8_UNORM;
 
@@ -274,6 +278,7 @@ brw_create_constant_surface(struct brw_context *brw,
 			    drm_intel_bo **out_bo,
 			    uint32_t *out_offset)
 {
+   struct intel_context *intel = &brw->intel;
    const GLint w = width - 1;
    struct brw_surface_state surf;
    void *map;
@@ -284,6 +289,9 @@ brw_create_constant_surface(struct brw_context *brw,
    surf.ss0.surface_type = BRW_SURFACE_BUFFER;
    surf.ss0.surface_format = BRW_SURFACEFORMAT_R32G32B32A32_FLOAT;
 
+   if (intel->gen >= 6)
+      surf.ss0.render_cache_read_write = 1;
+
    assert(bo);
    surf.ss1.base_addr = bo->offset; /* reloc */
 
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index 800a2555214..c2631a7b4df 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -35,6 +35,7 @@
 struct gen6_blend_state_key {
    GLboolean color_blend, alpha_enabled;
    GLboolean dither;
+   GLboolean color_mask[BRW_MAX_DRAW_BUFFERS][4];
 
    GLenum logic_op;
 
@@ -54,6 +55,9 @@ blend_state_populate_key(struct brw_context *brw,
    memset(key, 0, sizeof(*key));
 
    /* _NEW_COLOR */
+   memcpy(key->color_mask, ctx->Color.ColorMask, sizeof(key->color_mask));
+
+   /* _NEW_COLOR */
    if (ctx->Color._LogicOpEnabled)
       key->logic_op = ctx->Color.LogicOp;
    else
@@ -87,54 +91,62 @@ static drm_intel_bo *
 blend_state_create_from_key(struct brw_context *brw,
 			    struct gen6_blend_state_key *key)
 {
-   struct gen6_blend_state blend;
+   struct gen6_blend_state blend[BRW_MAX_DRAW_BUFFERS];
    drm_intel_bo *bo;
+   int b;
 
    memset(&blend, 0, sizeof(blend));
 
-   if (key->logic_op != GL_COPY) {
-      blend.blend1.logic_op_enable = 1;
-      blend.blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
-   } else if (key->color_blend) {
-      GLenum eqRGB = key->blend_eq_rgb;
-      GLenum eqA = key->blend_eq_a;
-      GLenum srcRGB = key->blend_src_rgb;
-      GLenum dstRGB = key->blend_dst_rgb;
-      GLenum srcA = key->blend_src_a;
-      GLenum dstA = key->blend_dst_a;
-
-      if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
-	 srcRGB = dstRGB = GL_ONE;
-      }
-
-      if (eqA == GL_MIN || eqA == GL_MAX) {
-	 srcA = dstA = GL_ONE;
+   for (b = 0; b < BRW_MAX_DRAW_BUFFERS; b++) {
+      if (key->logic_op != GL_COPY) {
+	 blend[b].blend1.logic_op_enable = 1;
+	 blend[b].blend1.logic_op_func = intel_translate_logic_op(key->logic_op);
+      } else if (key->color_blend & (1 << b)) {
+	 GLenum eqRGB = key->blend_eq_rgb;
+	 GLenum eqA = key->blend_eq_a;
+	 GLenum srcRGB = key->blend_src_rgb;
+	 GLenum dstRGB = key->blend_dst_rgb;
+	 GLenum srcA = key->blend_src_a;
+	 GLenum dstA = key->blend_dst_a;
+
+	 if (eqRGB == GL_MIN || eqRGB == GL_MAX) {
+	    srcRGB = dstRGB = GL_ONE;
+	 }
+
+	 if (eqA == GL_MIN || eqA == GL_MAX) {
+	    srcA = dstA = GL_ONE;
+	 }
+
+	 blend[b].blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
+	 blend[b].blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
+	 blend[b].blend0.blend_func = brw_translate_blend_equation(eqRGB);
+
+	 blend[b].blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
+	 blend[b].blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
+	 blend[b].blend0.ia_blend_func = brw_translate_blend_equation(eqA);
+
+	 blend[b].blend0.blend_enable = 1;
+	 blend[b].blend0.ia_blend_enable = (srcA != srcRGB ||
+					 dstA != dstRGB ||
+					 eqA != eqRGB);
       }
 
-      blend.blend0.dest_blend_factor = brw_translate_blend_factor(dstRGB);
-      blend.blend0.source_blend_factor = brw_translate_blend_factor(srcRGB);
-      blend.blend0.blend_func = brw_translate_blend_equation(eqRGB);
-
-      blend.blend0.ia_dest_blend_factor = brw_translate_blend_factor(dstA);
-      blend.blend0.ia_source_blend_factor = brw_translate_blend_factor(srcA);
-      blend.blend0.ia_blend_func = brw_translate_blend_equation(eqA);
+      if (key->alpha_enabled) {
+	 blend[b].blend1.alpha_test_enable = 1;
+	 blend[b].blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
 
-      blend.blend0.blend_enable = 1;
-      blend.blend0.ia_blend_enable = (srcA != srcRGB ||
-				      dstA != dstRGB ||
-				      eqA != eqRGB);
-   }
-
-   if (key->alpha_enabled) {
-      blend.blend1.alpha_test_enable = 1;
-      blend.blend1.alpha_test_func = intel_translate_compare_func(key->alpha_func);
+      }
 
-   }
+      if (key->dither) {
+	 blend[b].blend1.dither_enable = 1;
+	 blend[b].blend1.y_dither_offset = 0;
+	 blend[b].blend1.x_dither_offset = 0;
+      }
 
-   if (key->dither) {
-      blend.blend1.dither_enable = 1;
-      blend.blend1.y_dither_offset = 0;
-      blend.blend1.x_dither_offset = 0;
+      blend[b].blend1.write_disable_r = !key->color_mask[b][0];
+      blend[b].blend1.write_disable_g = !key->color_mask[b][1];
+      blend[b].blend1.write_disable_b = !key->color_mask[b][2];
+      blend[b].blend1.write_disable_a = !key->color_mask[b][3];
    }
 
    bo = brw_upload_cache(&brw->cache, BRW_BLEND_STATE,
@@ -172,7 +184,7 @@ const struct brw_tracked_state gen6_blend_state = {
 };
 
 struct gen6_color_calc_state_key {
-   GLubyte blend_constant_color[4];
+   float blend_constant_color[4];
    GLclampf alpha_ref;
    GLubyte stencil_ref[2];
 };
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index c65b41e2b6b..c7c4eb1f27d 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -64,7 +64,9 @@ upload_clip_state(struct brw_context *brw)
 	     userclip << GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT |
 	     depth_clamp |
 	     provoking);
-   OUT_BATCH(GEN6_CLIP_FORCE_ZERO_RTAINDEX);
+   OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
+             U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
+             GEN6_CLIP_FORCE_ZERO_RTAINDEX);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 471067e8f02..45c148baedd 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -33,9 +33,10 @@
 #include "intel_batchbuffer.h"
 
 static uint32_t
-get_attr_override(struct brw_context *brw, int fs_attr)
+get_attr_override(struct brw_context *brw, int fs_attr, int two_side_color)
 {
    int attr_index = 0, i, vs_attr;
+   int bfc = 0;
 
    if (fs_attr <= FRAG_ATTRIB_TEX7)
       vs_attr = fs_attr;
@@ -57,6 +58,30 @@ get_attr_override(struct brw_context *brw, int fs_attr)
 	 attr_index++;
    }
 
+   assert(attr_index < 32);
+
+   if (two_side_color) {
+       if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL1)) &&
+           (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC1))) {
+           assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0));
+           assert(brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0));
+           bfc = 2;
+       } else if ((brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_COL0)) &&
+                (brw->vs.prog_data->outputs_written & BITFIELD64_BIT(VERT_RESULT_BFC0)))
+           bfc = 1;
+   }
+
+   if (bfc && (fs_attr <= FRAG_ATTRIB_TEX7 && fs_attr > FRAG_ATTRIB_WPOS)) {
+       if (fs_attr == FRAG_ATTRIB_COL0)
+           attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
+       else if (fs_attr == FRAG_ATTRIB_COL1 && bfc == 2) {
+           attr_index++;
+           attr_index |= (ATTRIBUTE_SWIZZLE_INPUTATTR_FACING << ATTRIBUTE_SWIZZLE_SHIFT);
+       } else {
+           attr_index += bfc;
+       }
+   }
+
    return attr_index;
 }
 
@@ -67,13 +92,15 @@ upload_sf_state(struct brw_context *brw)
    struct gl_context *ctx = &intel->ctx;
    /* CACHE_NEW_VS_PROG */
    uint32_t num_inputs = brw_count_bits(brw->vs.prog_data->outputs_written);
+   /* BRW_NEW_FRAGMENT_PROGRAM */
    uint32_t num_outputs = brw_count_bits(brw->fragment_program->Base.InputsRead);
-   uint32_t dw1, dw2, dw3, dw4, dw16;
+   uint32_t dw1, dw2, dw3, dw4, dw16, dw17;
    int i;
    /* _NEW_BUFFER */
    GLboolean render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
    int attr = 0;
    int urb_start;
+   int two_side_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
    /* _NEW_TRANSFORM */
    if (ctx->Transform.ClipPlanesEnabled)
@@ -91,6 +118,7 @@ upload_sf_state(struct brw_context *brw)
    dw3 = 0;
    dw4 = 0;
    dw16 = 0;
+   dw17 = 0;
 
    /* _NEW_POLYGON */
    if ((ctx->Polygon.FrontFace == GL_CCW) ^ render_to_fbo)
@@ -99,6 +127,48 @@ upload_sf_state(struct brw_context *brw)
    if (ctx->Polygon.OffsetFill)
        dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID;
 
+   if (ctx->Polygon.OffsetLine)
+       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME;
+
+   if (ctx->Polygon.OffsetPoint)
+       dw2 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT;
+
+   switch (ctx->Polygon.FrontMode) {
+   case GL_FILL:
+       dw2 |= GEN6_SF_FRONT_SOLID;
+       break;
+
+   case GL_LINE:
+       dw2 |= GEN6_SF_FRONT_WIREFRAME;
+       break;
+
+   case GL_POINT:
+       dw2 |= GEN6_SF_FRONT_POINT;
+       break;
+
+   default:
+       assert(0);
+       break;
+   }
+
+   switch (ctx->Polygon.BackMode) {
+   case GL_FILL:
+       dw2 |= GEN6_SF_BACK_SOLID;
+       break;
+
+   case GL_LINE:
+       dw2 |= GEN6_SF_BACK_WIREFRAME;
+       break;
+
+   case GL_POINT:
+       dw2 |= GEN6_SF_BACK_POINT;
+       break;
+
+   default:
+       assert(0);
+       break;
+   }
+
    /* _NEW_SCISSOR */
    if (ctx->Scissor.Enabled)
       dw3 |= GEN6_SF_SCISSOR_ENABLE;
@@ -160,6 +230,12 @@ upload_sf_state(struct brw_context *brw)
        }
    }
 
+   /* flat shading */
+   if (ctx->Light.ShadeModel == GL_FLAT) {
+       dw17 |= ((brw->fragment_program->Base.InputsRead & (FRAG_BIT_COL0 | FRAG_BIT_COL1)) >>
+                ((brw->fragment_program->Base.InputsRead & FRAG_BIT_WPOS) ? 0 : 1));
+   }
+
    BEGIN_BATCH(20);
    OUT_BATCH(CMD_3D_SF_STATE << 16 | (20 - 2));
    OUT_BATCH(dw1);
@@ -174,7 +250,7 @@ upload_sf_state(struct brw_context *brw)
 
       for (; attr < 64; attr++) {
 	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) {
-	    attr_overrides |= get_attr_override(brw, attr);
+	    attr_overrides |= get_attr_override(brw, attr, two_side_color);
 	    attr++;
 	    break;
 	 }
@@ -182,7 +258,7 @@ upload_sf_state(struct brw_context *brw)
 
       for (; attr < 64; attr++) {
 	 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr)) {
-	    attr_overrides |= get_attr_override(brw, attr) << 16;
+	    attr_overrides |= get_attr_override(brw, attr, two_side_color) << 16;
 	    attr++;
 	    break;
 	 }
@@ -190,7 +266,7 @@ upload_sf_state(struct brw_context *brw)
       OUT_BATCH(attr_overrides);
    }
    OUT_BATCH(dw16); /* point sprite texcoord bitmask */
-   OUT_BATCH(0); /* constant interp bitmask */
+   OUT_BATCH(dw17); /* constant interp bitmask */
    OUT_BATCH(0); /* wrapshortest enables 0-7 */
    OUT_BATCH(0); /* wrapshortest enables 8-15 */
    ADVANCE_BATCH();
@@ -205,7 +281,8 @@ const struct brw_tracked_state gen6_sf_state = {
 		_NEW_BUFFERS |
 		_NEW_POINT |
 		_NEW_TRANSFORM),
-      .brw   = BRW_NEW_CONTEXT,
+      .brw   = (BRW_NEW_CONTEXT |
+		BRW_NEW_FRAGMENT_PROGRAM),
       .cache = CACHE_NEW_VS_PROG
    },
    .emit = upload_sf_state,
diff --git a/src/mesa/drivers/dri/i965/gen6_urb.c b/src/mesa/drivers/dri/i965/gen6_urb.c
index a34123478fb..de97fd3783d 100644
--- a/src/mesa/drivers/dri/i965/gen6_urb.c
+++ b/src/mesa/drivers/dri/i965/gen6_urb.c
@@ -72,7 +72,7 @@ const struct brw_tracked_state gen6_urb = {
    .dirty = {
       .mesa = 0,
       .brw = BRW_NEW_CONTEXT,
-      .cache = CACHE_NEW_VS_PROG,
+      .cache = (CACHE_NEW_VS_PROG | CACHE_NEW_GS_PROG),
    },
    .prepare = prepare_urb,
    .emit = upload_urb,
diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index e94d0c0ddbb..4ef9e2e6072 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -54,7 +54,7 @@ upload_vs_state(struct brw_context *brw)
       OUT_BATCH(0);
       ADVANCE_BATCH();
    } else {
-      int params_uploaded = 0;
+      int params_uploaded = 0, param_regs;
       float *param;
 
       if (brw->vertex_program->IsNVProgram)
@@ -88,20 +88,11 @@ upload_vs_state(struct brw_context *brw)
 	 params_uploaded++;
       }
 
-      if (vp->use_const_buffer) {
-	 for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
-	    if (brw->vs.constant_map[i] != -1) {
-	       memcpy(param + brw->vs.constant_map[i] * 4,
-		      vp->program.Base.Parameters->ParameterValues[i],
-		      4 * sizeof(float));
-	       params_uploaded++;
-	    }
-	 }
-      } else {
-	 for (i = 0; i < nr_params; i++) {
-	    memcpy(param, vp->program.Base.Parameters->ParameterValues[i],
+      for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) {
+	 if (brw->vs.constant_map[i] != -1) {
+	    memcpy(param + brw->vs.constant_map[i] * 4,
+		   vp->program.Base.Parameters->ParameterValues[i],
 		   4 * sizeof(float));
-	    param += 4;
 	    params_uploaded++;
 	 }
       }
@@ -117,13 +108,16 @@ upload_vs_state(struct brw_context *brw)
 
       drm_intel_gem_bo_unmap_gtt(constant_bo);
 
+      param_regs = (params_uploaded + 1) / 2;
+      assert(param_regs <= 32);
+
       BEGIN_BATCH(5);
       OUT_BATCH(CMD_3D_CONSTANT_VS_STATE << 16 |
 		GEN6_CONSTANT_BUFFER_0_ENABLE |
 		(5 - 2));
       OUT_RELOC(constant_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(params_uploaded, 2) / 2 - 1);
+		param_regs - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index ea5418bacf1..d80df4e254b 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -66,6 +66,21 @@ prepare_wm_constants(struct brw_context *brw)
 	 constants[i] = convert_param(brw->wm.prog_data->param_convert[i],
 				      *brw->wm.prog_data->param[i]);
       }
+
+      if (0) {
+	 printf("WM constants:\n");
+	 for (i = 0; i < brw->wm.prog_data->nr_params; i++) {
+	    if ((i & 7) == 0)
+	       printf("g%d: ", brw->wm.prog_data->first_curbe_grf + i / 8);
+	    printf("%8f ", constants[i]);
+	    if ((i & 7) == 7)
+	       printf("\n");
+	 }
+	 if ((i & 7) != 0)
+	    printf("\n");
+	 printf("\n");
+      }
+
       drm_intel_gem_bo_unmap_gtt(brw->wm.push_const_bo);
    }
 }
@@ -88,6 +103,7 @@ upload_wm_state(struct brw_context *brw)
       brw_fragment_program_const(brw->fragment_program);
    uint32_t dw2, dw4, dw5, dw6;
 
+   /* CACHE_NEW_WM_PROG */
    if (brw->wm.prog_data->nr_params == 0) {
       /* Disable the push constant buffers. */
       BEGIN_BATCH(5);
@@ -104,7 +120,8 @@ upload_wm_state(struct brw_context *brw)
 		(5 - 2));
       OUT_RELOC(brw->wm.push_const_bo,
 		I915_GEM_DOMAIN_RENDER, 0, /* XXX: bad domain */
-		ALIGN(brw->wm.prog_data->nr_params, 8) / 8 - 1);
+		ALIGN(brw->wm.prog_data->nr_params,
+		      brw->wm.prog_data->dispatch_width) / 8 - 1);
       OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
@@ -126,8 +143,8 @@ upload_wm_state(struct brw_context *brw)
 
    dw5 |= (40 - 1) << GEN6_WM_MAX_THREADS_SHIFT;
 
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   if (fp->isGLSL)
+   /* CACHE_NEW_WM_PROG */
+   if (brw->wm.prog_data->dispatch_width == 8)
       dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
    else
       dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
@@ -176,13 +193,14 @@ upload_wm_state(struct brw_context *brw)
 const struct brw_tracked_state gen6_wm_state = {
    .dirty = {
       .mesa  = (_NEW_LINE | _NEW_POLYGONSTIPPLE | _NEW_COLOR | _NEW_BUFFERS |
-		_NEW_PROGRAM_CONSTANTS),
+		_NEW_PROGRAM_CONSTANTS | _NEW_POLYGON),
       .brw   = (BRW_NEW_CURBE_OFFSETS |
 		BRW_NEW_FRAGMENT_PROGRAM |
                 BRW_NEW_NR_WM_SURFACES |
 		BRW_NEW_URB_FENCE |
 		BRW_NEW_BATCH),
-      .cache = CACHE_NEW_SAMPLER
+      .cache = (CACHE_NEW_SAMPLER |
+		CACHE_NEW_WM_PROG)
    },
    .emit = upload_wm_state,
 };
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 4b498f8c5b2..21fc9ece886 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -92,7 +92,7 @@ do_flush_locked(struct intel_batchbuffer *batch, GLuint used)
 
    batch->ptr = NULL;
 
-   if (!intel->no_hw) {
+   if (!intel->intelScreen->no_hw) {
       drm_intel_bo_exec(batch->buf, used, NULL, 0,
 			(x_off & 0xffff) | (y_off << 16));
    }
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 152cdcaf37d..9c222c7b485 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -519,7 +519,6 @@ static const struct dri_debug_control debug_control[] = {
    { "sing",  DEBUG_SINGLE_THREAD },
    { "thre",  DEBUG_SINGLE_THREAD },
    { "wm",    DEBUG_WM },
-   { "glsl_force", DEBUG_GLSL_FORCE },
    { "urb",   DEBUG_URB },
    { "vs",    DEBUG_VS },
    { "clip",  DEBUG_CLIP },
@@ -800,11 +799,6 @@ intelInitContext(struct intel_context *intel,
    if (INTEL_DEBUG & DEBUG_BUFMGR)
       dri_bufmgr_set_debug(intel->bufmgr, GL_TRUE);
 
-   /* XXX force SIMD8 kernel for Sandybridge before we fixed
-      SIMD16 interpolation. */
-   if (intel->gen == 6)
-       INTEL_DEBUG |= DEBUG_GLSL_FORCE;
-
    intel->batch = intel_batchbuffer_alloc(intel);
 
    intel_fbo_init(intel);
@@ -838,11 +832,6 @@ intelInitContext(struct intel_context *intel,
       intel->always_flush_cache = 1;
    }
 
-   /* Disable all hardware rendering (skip emitting batches and fences/waits
-    * to the kernel)
-    */
-   intel->no_hw = getenv("INTEL_NO_HW") != NULL;
-
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 9d5139c0000..96493c0f2bb 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -207,7 +207,6 @@ struct intel_context
    GLboolean hw_stipple;
    GLboolean depth_buffer_is_float;
    GLboolean no_rast;
-   GLboolean no_hw;
    GLboolean always_flush_batch;
    GLboolean always_flush_cache;
 
@@ -362,7 +361,6 @@ extern int INTEL_DEBUG;
 #define DEBUG_WM        0x800000
 #define DEBUG_URB       0x1000000
 #define DEBUG_VS        0x2000000
-#define DEBUG_GLSL_FORCE 0x4000000
 #define DEBUG_CLIP      0x8000000
 
 #define DBG(...) do {						\
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 862a13d2ea5..18e796a1186 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -465,10 +465,12 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to XGBA8 texture OK\n");
    }
+#ifndef I915
    else if (texImage->TexFormat == MESA_FORMAT_SARGB8) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to SARGB8 texture OK\n");
    }
+#endif
    else if (texImage->TexFormat == MESA_FORMAT_RGB565) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to RGB5 texture OK\n");
@@ -481,6 +483,7 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to ARGB4444 texture OK\n");
    }
+#ifndef I915
    else if (texImage->TexFormat == MESA_FORMAT_A8) {
       irb->Base.DataType = GL_UNSIGNED_BYTE;
       DBG("Render to A8 texture OK\n");
@@ -501,6 +504,7 @@ intel_update_wrapper(struct gl_context *ctx, struct intel_renderbuffer *irb,
       irb->Base.DataType = GL_UNSIGNED_SHORT;
       DBG("Render to RG88 texture OK\n");
    }
+#endif
    else if (texImage->TexFormat == MESA_FORMAT_Z16) {
       irb->Base.DataType = GL_UNSIGNED_SHORT;
       DBG("Render to DEPTH16 texture OK\n");
@@ -710,15 +714,17 @@ intel_validate_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
       switch (irb->Base.Format) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_XRGB8888:
-      case MESA_FORMAT_SARGB8:
       case MESA_FORMAT_RGB565:
       case MESA_FORMAT_ARGB1555:
       case MESA_FORMAT_ARGB4444:
+#ifndef I915
+      case MESA_FORMAT_SARGB8:
       case MESA_FORMAT_A8:
       case MESA_FORMAT_R8:
       case MESA_FORMAT_R16:
       case MESA_FORMAT_RG88:
       case MESA_FORMAT_RG1616:
+#endif
 	 break;
       default:
 	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 061f0d278d6..3f13589a214 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -452,7 +452,7 @@ intelCreateContext(gl_api api,
       return brwCreateContext(api, mesaVis,
 			      driContextPriv, sharedContextPrivate);
 #endif
-   fprintf(stderr, "Unrecognized deviceID %x\n", intelScreen->deviceID);
+   fprintf(stderr, "Unrecognized deviceID 0x%x\n", intelScreen->deviceID);
    return GL_FALSE;
 }
 
@@ -462,7 +462,8 @@ intel_init_bufmgr(struct intel_screen *intelScreen)
    __DRIscreen *spriv = intelScreen->driScrnPriv;
    int num_fences = 0;
 
-   intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
+   intelScreen->no_hw = (getenv("INTEL_NO_HW") != NULL ||
+			 getenv("INTEL_DEVID_OVERRIDE") != NULL);
 
    intelScreen->bufmgr = intel_bufmgr_gem_init(spriv->fd, BATCH_SZ);
    if (intelScreen->bufmgr == NULL) {
@@ -497,6 +498,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    GLenum fb_format[3];
    GLenum fb_type[3];
    unsigned int api_mask;
+   char *devid_override;
 
    static const GLenum back_buffer_modes[] = {
        GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML
@@ -523,6 +525,16 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
 			&intelScreen->deviceID))
       return GL_FALSE;
 
+   /* Allow an override of the device ID for the purpose of making the
+    * driver produce dumps for debugging of new chipset enablement.
+    * This implies INTEL_NO_HW, to avoid programming your actual GPU
+    * incorrectly.  Parsed as an integer; base 0 accepts 0x-prefixed hex.
+    */
+   devid_override = getenv("INTEL_DEVID_OVERRIDE");
+   if (devid_override) {
+      intelScreen->deviceID = strtol(devid_override, NULL, 0);
+   }
+
    api_mask = (1 << __DRI_API_OPENGL);
 #if FEATURE_ES1
    api_mask |= (1 << __DRI_API_GLES);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c
index 9d73a2fb375..f8316ae2f8d 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_format.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_format.c
@@ -204,11 +204,13 @@ intelChooseTextureFormat(struct gl_context * ctx, GLint internalFormat,
     * { R, G, 1.0, 1.0 } from a red-green texture would be useful.
     */
    case GL_RED:
+   case GL_COMPRESSED_RED:
    case GL_R8:
       return MESA_FORMAT_R8;
    case GL_R16:
       return MESA_FORMAT_R16;
    case GL_RG:
+   case GL_COMPRESSED_RG:
    case GL_RG8:
       return MESA_FORMAT_RG88;
    case GL_RG16:
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 8a047e6419b..b62290231b9 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -200,6 +200,7 @@ void r200EmitArrays( struct gl_context *ctx, GLubyte *vimap_rev )
 	    }
 	 default:
 	    assert(0);
+	    emitsize = 0;
 	 }
 	 if (!rmesa->radeon.tcl.aos[nr].bo) {
 	   rcommon_emit_vector( ctx,
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
index 8be32ea91fe..1db8678e890 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_emit.c
@@ -76,6 +76,9 @@ static void use_temporary(struct r300_fragment_program_code *code, unsigned int
 
 static unsigned int use_source(struct r300_fragment_program_code* code, struct rc_pair_instruction_source src)
 {
+	if (!src.Used)
+		return 0;
+
 	if (src.File == RC_FILE_CONSTANT) {
 		return src.Index | (1 << 5);
 	} else if (src.File == RC_FILE_TEMPORARY) {
diff --git a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
index 2d28b065390..05d3da8a10d 100644
--- a/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
+++ b/src/mesa/drivers/dri/r300/compiler/r300_fragprog_swizzle.c
@@ -94,6 +94,7 @@ static const struct swizzle_data* lookup_native_swizzle(unsigned int swizzle)
  */
 static int r300_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
+	const struct swizzle_data* sd;
 	unsigned int relevant;
 	int j;
 
@@ -127,7 +128,8 @@ static int r300_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 	if ((reg.Negate & relevant) && ((reg.Negate & relevant) != relevant))
 		return 0;
 
-	if (!lookup_native_swizzle(reg.Swizzle))
+	sd = lookup_native_swizzle(reg.Swizzle);
+	if (!sd || (reg.File == RC_FILE_PRESUB && sd->srcp_stride == 0))
 		return 0;
 
 	return 1;
@@ -201,7 +203,7 @@ unsigned int r300FPTranslateRGBSwizzle(unsigned int src, unsigned int swizzle)
 {
 	const struct swizzle_data* sd = lookup_native_swizzle(swizzle);
 
-	if (!sd) {
+	if (!sd || (src == RC_PAIR_PRESUB_SRC && sd->srcp_stride == 0)) {
 		fprintf(stderr, "Not a native swizzle: %08x\n", swizzle);
 		return 0;
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index 2f130198d35..e0d349b98ce 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -24,6 +24,7 @@
 
 #include <stdio.h>
 
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
@@ -54,6 +55,8 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
 
 	for (rci = c->Base.Program.Instructions.Next; rci != &c->Base.Program.Instructions; rci = rci->Next) {
 		struct rc_sub_instruction * inst = &rci->U.I;
+		unsigned i;
+		const struct rc_opcode_info *info = rc_get_opcode_info(inst->Opcode);
 
 		if (inst->DstReg.File != RC_FILE_OUTPUT || inst->DstReg.Index != c->OutputDepth)
 			continue;
@@ -65,27 +68,12 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
 			continue;
 		}
 
-		switch (inst->Opcode) {
-			case RC_OPCODE_FRC:
-			case RC_OPCODE_MOV:
-				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				break;
-			case RC_OPCODE_ADD:
-			case RC_OPCODE_MAX:
-			case RC_OPCODE_MIN:
-			case RC_OPCODE_MUL:
-				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				inst->SrcReg[1] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[1]);
-				break;
-			case RC_OPCODE_CMP:
-			case RC_OPCODE_MAD:
-				inst->SrcReg[0] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[0]);
-				inst->SrcReg[1] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[1]);
-				inst->SrcReg[2] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[2]);
-				break;
-			default:
-				// Scalar instructions needn't be reswizzled
-				break;
+		if (!info->IsComponentwise) {
+			continue;
+		}
+
+		for (i = 0; i < info->NumSrcRegs; i++) {
+			inst->SrcReg[i] = lmul_swizzle(RC_SWIZZLE_ZZZZ, inst->SrcReg[i]);
 		}
 	}
 }
@@ -93,7 +81,6 @@ static void rc_rewrite_depth_out(struct radeon_compiler *cc, void *user)
 void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 {
 	int is_r500 = c->Base.is_r500;
-	int kill_consts = c->Base.remove_unused_constants;
 	int opt = !c->Base.disable_optimizations;
 
 	/* Lists of instruction transformations. */
@@ -133,11 +120,11 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		{"emulate loops",		1, !is_r500,	rc_emulate_loops,		NULL},
 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
 		{"dataflow swizzles",		1, 1,		rc_dataflow_swizzles,		NULL},
-		{"dead constants",		1, kill_consts, rc_remove_unused_constants,	&c->code->constants_remap_table},
+		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
 		/* This pass makes it easier for the scheduler to group TEX
 		 * instructions and reduces the chances of creating too
 		 * many texture indirections.*/
-		{"register rename",		1, !is_r500,	rc_rename_regs,			NULL},
+		{"register rename",		1, !is_r500 || opt, rc_rename_regs,		NULL},
 		{"pair translate",		1, 1,		rc_pair_translate,		NULL},
 		{"pair scheduling",		1, 1,		rc_pair_schedule,		NULL},
 		{"register allocation",		1, opt,		rc_pair_regalloc,		NULL},
@@ -150,9 +137,10 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 		{NULL, 0, 0, NULL, NULL}
 	};
 
+	c->Base.type = RC_FRAGMENT_PROGRAM;
 	c->Base.SwizzleCaps = c->Base.is_r500 ? &r500_swizzle_caps : &r300_swizzle_caps;
 
-	rc_run_compiler(&c->Base, fs_list, "Fragment Program");
+	rc_run_compiler(&c->Base, fs_list);
 
 	rc_constants_copy(&c->code->constants, &c->Base.Program.Constants);
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index bf8341f0173..472029f63d0 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -26,6 +26,7 @@
 
 #include "../r300_reg.h"
 
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
 #include "radeon_program_alu.h"
 #include "radeon_swizzle.h"
@@ -790,19 +791,14 @@ static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
 						if (!hwtemps[j])
 							break;
 					}
-					if (j >= c->max_temp_regs) {
-						rc_error(c, "Too many temporaries\n");
-						return;
+					ta[orig].Allocated = 1;
+					if (last_inst_src_reladdr &&
+					    last_inst_src_reladdr->IP > inst->IP) {
+						ta[orig].HwTemp = orig;
 					} else {
-						ta[orig].Allocated = 1;
-						if (last_inst_src_reladdr &&
-						    last_inst_src_reladdr->IP > inst->IP) {
-							ta[orig].HwTemp = orig;
-						} else {
-							ta[orig].HwTemp = j;
-						}
-						hwtemps[ta[orig].HwTemp] = 1;
+						ta[orig].HwTemp = j;
 					}
+					hwtemps[ta[orig].HwTemp] = 1;
 				}
 
 				inst->U.I.DstReg.Index = ta[orig].HwTemp;
@@ -1018,7 +1014,6 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
 {
 	int is_r500 = c->Base.is_r500;
-	int kill_consts = c->Base.remove_unused_constants;
 	int opt = !c->Base.disable_optimizations;
 
 	/* Lists of instruction transformations. */
@@ -1062,18 +1057,18 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
 		{"dataflow optimize",		1, opt,		rc_optimize,			NULL},
 		/* This pass must be done after optimizations. */
 		{"source conflict resolve",	1, 1,		rc_local_transform,		resolve_src_conflicts},
-		{"dataflow swizzles",		1, 1,		rc_dataflow_swizzles,		NULL},
 		{"register allocation",		1, opt,		allocate_temporary_registers,	NULL},
-		{"dead constants",		1, kill_consts, rc_remove_unused_constants,	&c->code->constants_remap_table},
+		{"dead constants",		1, 1,		rc_remove_unused_constants,	&c->code->constants_remap_table},
 		{"final code validation",	0, 1,		rc_validate_final_shader,	NULL},
 		{"machine code generation",	0, 1,		translate_vertex_program,	NULL},
 		{"dump machine code",		0, c->Base.Debug & RC_DBG_LOG, r300_vertex_program_dump,	NULL},
 		{NULL, 0, 0, NULL, NULL}
 	};
 
+	c->Base.type = RC_VERTEX_PROGRAM;
 	c->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
-	rc_run_compiler(&c->Base, vs_list, "Vertex Program");
+	rc_run_compiler(&c->Base, vs_list);
 
 	c->code->InputsRead = c->Base.Program.InputsRead;
 	c->code->OutputsWritten = c->Base.Program.OutputsWritten;
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index 289bb87ae59..ef81be48f77 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -29,6 +29,7 @@
 
 #include <stdio.h>
 
+#include "radeon_compiler_util.h"
 #include "../r300_reg.h"
 
 /**
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index 6f101c68eb6..5da82d90f67 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -45,9 +45,6 @@
 
 #include "radeon_program_pair.h"
 
-#define MAX_BRANCH_DEPTH_FULL 32
-#define MAX_BRANCH_DEPTH_PARTIAL 4
-
 #define PROG_CODE \
 	struct r500_fragment_program_code *code = &c->code->code.r500
 
@@ -200,6 +197,9 @@ static void use_temporary(struct r500_fragment_program_code* code, unsigned int
 
 static unsigned int use_source(struct r500_fragment_program_code* code, struct rc_pair_instruction_source src)
 {
+	if (!src.Used)
+		return 0;
+
 	if (src.File == RC_FILE_CONSTANT) {
 		return src.Index | 0x100;
 	} else if (src.File == RC_FILE_TEMPORARY) {
@@ -506,7 +506,7 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 		break;
 	}
 	case RC_OPCODE_IF:
-		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
+		if ( s->CurrentBranchDepth >= R500_PFS_MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
 			return;
 		}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index cfb6df2cd79..b69e81698ae 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -34,6 +34,8 @@
 #define R500_PFS_MAX_INST         512
 #define R500_PFS_NUM_TEMP_REGS    128
 #define R500_PFS_NUM_CONST_REGS   256
+#define R500_PFS_MAX_BRANCH_DEPTH_FULL 32
+#define R500_PFS_MAX_BRANCH_DEPTH_PARTIAL 4
 
 
 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
index 4286baed0c6..65548604bcc 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
@@ -29,6 +29,7 @@
 #include "radeon_dataflow.h"
 #include "radeon_program.h"
 #include "radeon_program_pair.h"
+#include "radeon_compiler_util.h"
 
 
 void rc_init(struct radeon_compiler * c)
@@ -356,66 +357,92 @@ void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face)
 static void reg_count_callback(void * userdata, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int mask)
 {
-	unsigned int * max_reg = userdata;
+	int *max_reg = userdata;
 	if (file == RC_FILE_TEMPORARY)
-		index > *max_reg ? *max_reg = index : 0;
+		(int)index > *max_reg ? *max_reg = index : 0;
 }
 
-static void print_stats(struct radeon_compiler * c)
+void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s)
 {
+	int max_reg = -1;
 	struct rc_instruction * tmp;
-	unsigned max_reg, insts, fc, tex, alpha, rgb, presub;
-	max_reg = insts = fc = tex = alpha = rgb = presub = 0;
+	memset(s, 0, sizeof(*s));
+
 	for(tmp = c->Program.Instructions.Next; tmp != &c->Program.Instructions;
 							tmp = tmp->Next){
 		const struct rc_opcode_info * info;
 		rc_for_all_reads_mask(tmp, reg_count_callback, &max_reg);
 		if (tmp->Type == RC_INSTRUCTION_NORMAL) {
 			if (tmp->U.I.PreSub.Opcode != RC_PRESUB_NONE)
-				presub++;
+				s->num_presub_ops++;
 			info = rc_get_opcode_info(tmp->U.I.Opcode);
 		} else {
 			if (tmp->U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Used)
-				presub++;
+				s->num_presub_ops++;
 			if (tmp->U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Used)
-				presub++;
+				s->num_presub_ops++;
 			/* Assuming alpha will never be a flow control or
 			 * a tex instruction. */
 			if (tmp->U.P.Alpha.Opcode != RC_OPCODE_NOP)
-				alpha++;
+				s->num_alpha_insts++;
 			if (tmp->U.P.RGB.Opcode != RC_OPCODE_NOP)
-				rgb++;
+				s->num_rgb_insts++;
 			info = rc_get_opcode_info(tmp->U.P.RGB.Opcode);
 		}
 		if (info->IsFlowControl)
-			fc++;
+			s->num_fc_insts++;
 		if (info->HasTexture)
-			tex++;
-		insts++;
+			s->num_tex_insts++;
+		s->num_insts++;
 	}
-	if (insts < 4)
-		return;
-	fprintf(stderr,"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
-		       "~%4u Instructions\n"
-		       "~%4u Vector Instructions (RGB)\n"
-		       "~%4u Scalar Instructions (Alpha)\n"
-		       "~%4u Flow Control Instructions\n"
-		       "~%4u Texture Instructions\n"
-		       "~%4u Presub Operations\n"
-		       "~%4u Temporary Registers\n"
-		       "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
-		       insts, rgb, alpha, fc, tex, presub, max_reg + 1);
+	s->num_temp_regs = max_reg + 1;
 }
 
-/* Executes a list of compiler passes given in the parameter 'list'. */
-void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list,
-		     const char *shader_name)
+static void print_stats(struct radeon_compiler * c)
 {
-	if (c->Debug & RC_DBG_LOG) {
-		fprintf(stderr, "%s: before compilation\n", shader_name);
-		rc_print_program(&c->Program);
+	struct rc_program_stats s;
+
+	rc_get_stats(c, &s);
+
+	if (s.num_insts < 4)
+		return;
+
+	switch (c->type) {
+	case RC_VERTEX_PROGRAM:
+		fprintf(stderr,"~~~~~~~~~ VERTEX PROGRAM ~~~~~~~~\n"
+			       "~%4u Instructions\n"
+			       "~%4u Flow Control Instructions\n"
+			       "~%4u Temporary Registers\n"
+			       "~~~~~~~~~~~~~~ END ~~~~~~~~~~~~~~\n",
+			       s.num_insts, s.num_fc_insts, s.num_temp_regs);
+		break;
+
+	case RC_FRAGMENT_PROGRAM:
+		fprintf(stderr,"~~~~~~~~ FRAGMENT PROGRAM ~~~~~~~\n"
+			       "~%4u Instructions\n"
+			       "~%4u Vector Instructions (RGB)\n"
+			       "~%4u Scalar Instructions (Alpha)\n"
+			       "~%4u Flow Control Instructions\n"
+			       "~%4u Texture Instructions\n"
+			       "~%4u Presub Operations\n"
+			       "~%4u Temporary Registers\n"
+			       "~~~~~~~~~~~~~~ END ~~~~~~~~~~~~~~\n",
+			       s.num_insts, s.num_rgb_insts, s.num_alpha_insts,
+			       s.num_fc_insts, s.num_tex_insts, s.num_presub_ops,
+			       s.num_temp_regs);
+		break;
+	default:
+		assert(0);
 	}
+}
 
+static const char *shader_name[RC_NUM_PROGRAM_TYPES] = {
+	"Vertex Program",
+	"Fragment Program"
+};
+
+void rc_run_compiler_passes(struct radeon_compiler *c, struct radeon_compiler_pass *list)
+{
 	for (unsigned i = 0; list[i].name; i++) {
 		if (list[i].predicate) {
 			list[i].run(c, list[i].user);
@@ -424,11 +451,23 @@ void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *lis
 				return;
 
 			if ((c->Debug & RC_DBG_LOG) && list[i].dump) {
-				fprintf(stderr, "%s: after '%s'\n", shader_name, list[i].name);
+				fprintf(stderr, "%s: after '%s'\n", shader_name[c->type], list[i].name);
 				rc_print_program(&c->Program);
 			}
 		}
 	}
+}
+
+/* Executes a list of compiler passes given in the parameter 'list'. */
+void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list)
+{
+	if (c->Debug & RC_DBG_LOG) {
+		fprintf(stderr, "%s: before compilation\n", shader_name[c->type]);
+		rc_print_program(&c->Program);
+	}
+
+	rc_run_compiler_passes(c, list);
+
 	if (c->Debug & RC_DBG_STATS)
 		print_stats(c);
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index 31fd469a04f..e6633395895 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -35,9 +35,16 @@
 
 struct rc_swizzle_caps;
 
+enum rc_program_type {
+	RC_VERTEX_PROGRAM,
+	RC_FRAGMENT_PROGRAM,
+	RC_NUM_PROGRAM_TYPES
+};
+
 struct radeon_compiler {
 	struct memory_pool Pool;
 	struct rc_program Program;
+	enum rc_program_type type;
 	unsigned Debug:2;
 	unsigned Error:1;
 	char * ErrorMsg;
@@ -140,9 +147,21 @@ struct radeon_compiler_pass {
 	void *user;		/* Optional parameter which is passed to the run function. */
 };
 
+struct rc_program_stats {
+	unsigned num_insts;
+	unsigned num_fc_insts;
+	unsigned num_tex_insts;
+	unsigned num_rgb_insts;
+	unsigned num_alpha_insts;
+	unsigned num_presub_ops;
+	unsigned num_temp_regs;
+};
+
+void rc_get_stats(struct radeon_compiler *c, struct rc_program_stats *s);
+
 /* Executes a list of compiler passes given in the parameter 'list'. */
-void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list,
-		     const char *shader_name);
+void rc_run_compiler_passes(struct radeon_compiler *c, struct radeon_compiler_pass *list);
+void rc_run_compiler(struct radeon_compiler *c, struct radeon_compiler_pass *list);
 void rc_validate_final_shader(struct radeon_compiler *c, void *user);
 
 #endif /* RADEON_COMPILER_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
index 97f4c758492..bf393a9fb16 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.c
@@ -31,6 +31,8 @@
 
 #include "radeon_compiler_util.h"
 
+#include "radeon_compiler.h"
+#include "radeon_dataflow.h"
 /**
  */
 unsigned int rc_swizzle_to_writemask(unsigned int swz)
@@ -46,6 +48,91 @@ unsigned int rc_swizzle_to_writemask(unsigned int swz)
 	return mask;
 }
 
+rc_swizzle get_swz(unsigned int swz, rc_swizzle idx)
+{
+	if (idx & 0x4)
+		return idx;
+	return GET_SWZ(swz, idx);
+}
+
+unsigned int combine_swizzles4(unsigned int src,
+		rc_swizzle swz_x, rc_swizzle swz_y, rc_swizzle swz_z, rc_swizzle swz_w)
+{
+	unsigned int ret = 0;
+
+	ret |= get_swz(src, swz_x);
+	ret |= get_swz(src, swz_y) << 3;
+	ret |= get_swz(src, swz_z) << 6;
+	ret |= get_swz(src, swz_w) << 9;
+
+	return ret;
+}
+
+unsigned int combine_swizzles(unsigned int src, unsigned int swz)
+{
+	unsigned int ret = 0;
+
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_X));
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Y)) << 3;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Z)) << 6;
+	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_W)) << 9;
+
+	return ret;
+}
+
+/**
+ * @param mask Must be either RC_MASK_X, RC_MASK_Y, RC_MASK_Z, or RC_MASK_W
+ */
+rc_swizzle rc_mask_to_swizzle(unsigned int mask)
+{
+	switch (mask) {
+	case RC_MASK_X: return RC_SWIZZLE_X;
+	case RC_MASK_Y: return RC_SWIZZLE_Y;
+	case RC_MASK_Z: return RC_SWIZZLE_Z;
+	case RC_MASK_W: return RC_SWIZZLE_W;
+	}
+	return RC_SWIZZLE_UNUSED;
+}
+
+/* Reorder mask bits according to swizzle. */
+unsigned swizzle_mask(unsigned swizzle, unsigned mask)
+{
+	unsigned ret = 0;
+	for (unsigned chan = 0; chan < 4; ++chan) {
+		unsigned swz = GET_SWZ(swizzle, chan);
+		if (swz < 4)
+			ret |= GET_BIT(mask, swz) << chan;
+	}
+	return ret;
+}
+
+/**
+ * Left multiplication of a register with a swizzle
+ */
+struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg)
+{
+	struct rc_src_register tmp = srcreg;
+	int i;
+	tmp.Swizzle = 0;
+	tmp.Negate = 0;
+	for(i = 0; i < 4; ++i) {
+		rc_swizzle swz = GET_SWZ(swizzle, i);
+		if (swz < 4) {
+			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
+			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
+		} else {
+			tmp.Swizzle |= swz << (i*3);
+		}
+	}
+	return tmp;
+}
+
+void reset_srcreg(struct rc_src_register* reg)
+{
+	memset(reg, 0, sizeof(struct rc_src_register));
+	reg->Swizzle = RC_SWIZZLE_XYZW;
+}
+
 unsigned int rc_src_reads_dst_mask(
 		rc_register_file src_file,
 		unsigned int src_idx,
@@ -59,3 +146,123 @@ unsigned int rc_src_reads_dst_mask(
 	}
 	return dst_mask & rc_swizzle_to_writemask(src_swz);
 }
+
+unsigned int rc_source_type_swz(unsigned int swizzle, unsigned int channels)
+{
+	unsigned int chan;
+	unsigned int swz = RC_SWIZZLE_UNUSED;
+	unsigned int ret = RC_SOURCE_NONE;
+
+	for(chan = 0; chan < channels; chan++) {
+		swz = GET_SWZ(swizzle, chan);
+		if (swz == RC_SWIZZLE_W) {
+			ret |= RC_SOURCE_ALPHA;
+		} else if (swz == RC_SWIZZLE_X || swz == RC_SWIZZLE_Y
+						|| swz == RC_SWIZZLE_Z) {
+			ret |= RC_SOURCE_RGB;
+		}
+	}
+	return ret;
+}
+
+unsigned int rc_source_type_mask(unsigned int mask)
+{
+	unsigned int ret = RC_SOURCE_NONE;
+
+	if (mask & RC_MASK_XYZ)
+		ret |= RC_SOURCE_RGB;
+
+	if (mask & RC_MASK_W)
+		ret |= RC_SOURCE_ALPHA;
+
+	return ret;
+}
+
+struct can_use_presub_data {
+	struct rc_src_register RemoveSrcs[3];
+	unsigned int RGBCount;
+	unsigned int AlphaCount;
+};
+
+static void can_use_presub_read_cb(
+	void * userdata,
+	struct rc_instruction * inst,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int mask)
+{
+	struct can_use_presub_data * d = userdata;
+	unsigned int src_type = rc_source_type_mask(mask);
+	unsigned int i;
+
+	if (file == RC_FILE_NONE)
+		return;
+
+	for(i = 0; i < 3; i++) {
+		if (d->RemoveSrcs[i].File == file
+		    && d->RemoveSrcs[i].Index == index) {
+			src_type &=
+				~rc_source_type_swz(d->RemoveSrcs[i].Swizzle, 4);
+		}
+	}
+
+	if (src_type & RC_SOURCE_RGB)
+		d->RGBCount++;
+
+	if (src_type & RC_SOURCE_ALPHA)
+		d->AlphaCount++;
+}
+
+unsigned int rc_inst_can_use_presub(
+	struct rc_instruction * inst,
+	rc_presubtract_op presub_op,
+	unsigned int presub_writemask,
+	struct rc_src_register replace_reg,
+	struct rc_src_register presub_src0,
+	struct rc_src_register presub_src1)
+{
+	struct can_use_presub_data d;
+	unsigned int num_presub_srcs;
+	unsigned int presub_src_type = rc_source_type_mask(presub_writemask);
+	const struct rc_opcode_info * info =
+					rc_get_opcode_info(inst->U.I.Opcode);
+
+	if (presub_op == RC_PRESUB_NONE) {
+		return 1;
+	}
+
+	if (info->HasTexture) {
+		return 0;
+	}
+
+	/* We can't use more than one presubtract value in an
+	 * instruction, unless the two presubtract operations
+	 * are the same and read from the same registers.
+	 * XXX For now we will limit instructions to only one presubtract
+	 * value.*/
+	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE) {
+		return 0;
+	}
+
+	memset(&d, 0, sizeof(d));
+	d.RemoveSrcs[0] = replace_reg;
+	d.RemoveSrcs[1] = presub_src0;
+	d.RemoveSrcs[2] = presub_src1;
+
+	rc_for_all_reads_mask(inst, can_use_presub_read_cb, &d);
+
+	num_presub_srcs = rc_presubtract_src_reg_count(presub_op);
+
+	if ((presub_src_type & RC_SOURCE_RGB)
+					&& d.RGBCount + num_presub_srcs > 3) {
+		return 0;
+	}
+
+	if ((presub_src_type & RC_SOURCE_ALPHA)
+					&& d.AlphaCount + num_presub_srcs > 3) {
+		return 0;
+	}
+
+	return 1;
+}
+
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
index 1a14e7cb0ef..461ab9ffb10 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler_util.h
@@ -3,8 +3,27 @@
 #ifndef RADEON_PROGRAM_UTIL_H
 #define RADEON_PROGRAM_UTIL_H
 
+struct rc_instruction;
+struct rc_src_register;
+
 unsigned int rc_swizzle_to_writemask(unsigned int swz);
 
+rc_swizzle get_swz(unsigned int swz, rc_swizzle idx);
+
+unsigned int combine_swizzles4(unsigned int src,
+			       rc_swizzle swz_x, rc_swizzle swz_y,
+			       rc_swizzle swz_z, rc_swizzle swz_w);
+
+unsigned int combine_swizzles(unsigned int src, unsigned int swz);
+
+rc_swizzle rc_mask_to_swizzle(unsigned int mask);
+
+unsigned swizzle_mask(unsigned swizzle, unsigned mask);
+
+struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg);
+
+void reset_srcreg(struct rc_src_register* reg);
+
 unsigned int rc_src_reads_dst_mask(
 		rc_register_file src_file,
 		unsigned int src_idx,
@@ -13,4 +32,16 @@ unsigned int rc_src_reads_dst_mask(
 		unsigned int dst_idx,
 		unsigned int dst_mask);
 
+unsigned int rc_source_type_swz(unsigned int swizzle, unsigned int channels);
+
+unsigned int rc_source_type_mask(unsigned int mask);
+
+unsigned int rc_inst_can_use_presub(
+	struct rc_instruction * inst,
+	rc_presubtract_op presub_op,
+	unsigned int presub_writemask,
+	struct rc_src_register replace_reg,
+	struct rc_src_register presub_src0,
+	struct rc_src_register presub_src1);
+
 #endif /* RADEON_PROGRAM_UTIL_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
index fd94194dc34..d0a64d936e0 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.c
@@ -139,7 +139,46 @@ static void pair_sub_for_all_args(
 	const struct rc_opcode_info * info = rc_get_opcode_info(sub->Opcode);
 
 	for(i = 0; i < info->NumSrcRegs; i++) {
-		cb(userdata, fullinst, &sub->Arg[i]);
+		unsigned int src_type;
+		unsigned int channels = 0;
+		if (&fullinst->U.P.RGB == sub)
+			channels = 3;
+		else if (&fullinst->U.P.Alpha == sub)
+			channels = 1;
+
+		assert(channels > 0);
+		src_type = rc_source_type_swz(sub->Arg[i].Swizzle, channels);
+
+		if (src_type == RC_SOURCE_NONE)
+			continue;
+
+		if (sub->Arg[i].Source == RC_PAIR_PRESUB_SRC) {
+			unsigned int presub_type;
+			unsigned int presub_src_count;
+			struct rc_pair_instruction_source * src_array;
+			unsigned int j;
+			if (src_type & RC_SOURCE_RGB) {
+				presub_type = fullinst->
+					U.P.RGB.Src[RC_PAIR_PRESUB_SRC].Index;
+				src_array = fullinst->U.P.RGB.Src;
+			} else {
+				presub_type = fullinst->
+					U.P.Alpha.Src[RC_PAIR_PRESUB_SRC].Index;
+				src_array = fullinst->U.P.Alpha.Src;
+			}
+			presub_src_count
+				= rc_presubtract_src_reg_count(presub_type);
+			for(j = 0; j < presub_src_count; j++) {
+				cb(userdata, fullinst, &sub->Arg[i],
+								&src_array[j]);
+			}
+		} else {
+			struct rc_pair_instruction_source * src =
+				rc_pair_get_src(&fullinst->U.P, &sub->Arg[i]);
+			if (src) {
+				cb(userdata, fullinst, &sub->Arg[i], src);
+			}
+		}
 	}
 }
 
@@ -308,6 +347,7 @@ static void remap_normal_instruction(struct rc_instruction * fullinst,
 {
 	struct rc_sub_instruction * inst = &fullinst->U.I;
 	const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->Opcode);
+	unsigned int remapped_presub = 0;
 
 	if (opcode->HasDstReg) {
 		rc_register_file file = inst->DstReg.File;
@@ -327,6 +367,12 @@ static void remap_normal_instruction(struct rc_instruction * fullinst,
 			unsigned int i;
 			unsigned int srcp_srcs = rc_presubtract_src_reg_count(
 						inst->PreSub.Opcode);
+			/* Make sure we only remap presubtract sources once in
+			 * case more than one source register reads the
+			 * presubtract result. */
+			if (remapped_presub)
+				continue;
+
 			for(i = 0; i < srcp_srcs; i++) {
 				file = inst->PreSub.SrcReg[i].File;
 				index = inst->PreSub.SrcReg[i].Index;
@@ -334,7 +380,7 @@ static void remap_normal_instruction(struct rc_instruction * fullinst,
 				inst->PreSub.SrcReg[i].File = file;
 				inst->PreSub.SrcReg[i].Index = index;
 			}
-
+			remapped_presub = 1;
 		}
 		else {
 			cb(userdata, fullinst, &file, &index);
@@ -430,12 +476,29 @@ static rc_opcode get_flow_control_inst(struct rc_instruction * inst)
 
 }
 
+struct branch_write_mask {
+	unsigned int IfWriteMask:4;
+	unsigned int ElseWriteMask:4;
+	unsigned int HasElse:1;
+};
+
+union get_readers_read_cb {
+	rc_read_src_fn I;
+	rc_pair_read_arg_fn P;
+};
+
 struct get_readers_callback_data {
 	struct radeon_compiler * C;
 	struct rc_reader_data * ReaderData;
-	rc_read_src_fn ReadCB;
+	rc_read_src_fn ReadNormalCB;
+	rc_pair_read_arg_fn ReadPairCB;
 	rc_read_write_mask_fn WriteCB;
+	rc_register_file DstFile;
+	unsigned int DstIndex;
+	unsigned int DstMask;
 	unsigned int AliveWriteMask;
+	/* Indexed by branch depth, which starts at 1; element 0 is unused. */
+	struct branch_write_mask BranchMasks[R500_PFS_MAX_BRANCH_DEPTH_FULL + 1];
 };
 
 static void add_reader(
@@ -443,7 +506,7 @@ static void add_reader(
 	struct rc_reader_data * data,
 	struct rc_instruction * inst,
 	unsigned int mask,
-	struct rc_src_register * src)
+	void * arg_or_src)
 {
 	struct rc_reader * new;
 	memory_pool_array_reserve(pool, struct rc_reader, data->Readers,
@@ -451,7 +514,74 @@ static void add_reader(
 	new = &data->Readers[data->ReaderCount++];
 	new->Inst = inst;
 	new->WriteMask = mask;
-	new->Src = src;
+	if (inst->Type == RC_INSTRUCTION_NORMAL) {
+		new->U.Src = arg_or_src;
+	} else {
+		new->U.Arg = arg_or_src;
+	}
+}
+
+static unsigned int get_readers_read_callback(
+	struct get_readers_callback_data * cb_data,
+	unsigned int has_rel_addr,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int swizzle)
+{
+	unsigned int shared_mask, read_mask;
+
+	if (has_rel_addr) {
+		cb_data->ReaderData->Abort = 1;
+		return RC_MASK_NONE;
+	}
+
+	shared_mask = rc_src_reads_dst_mask(file, index, swizzle,
+		cb_data->DstFile, cb_data->DstIndex, cb_data->AliveWriteMask);
+
+	if (shared_mask == RC_MASK_NONE)
+		return shared_mask;
+
+	/* If we make it this far, it means that this source reads from the
+	 * same register written to by cb_data->ReaderData->Writer. */
+
+	read_mask = rc_swizzle_to_writemask(swizzle);
+	if (cb_data->ReaderData->AbortOnRead & read_mask) {
+		cb_data->ReaderData->Abort = 1;
+		return shared_mask;
+	}
+
+	/* XXX The behavior in this case should be configurable. */
+	if ((read_mask & cb_data->AliveWriteMask) != read_mask) {
+		cb_data->ReaderData->Abort = 1;
+		return shared_mask;
+	}
+
+	return shared_mask;
+}
+
+static void get_readers_pair_read_callback(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src)
+{
+	unsigned int shared_mask;
+	struct get_readers_callback_data * d = userdata;
+
+	shared_mask = get_readers_read_callback(d,
+				0 /*Pair Instructions don't use RelAddr*/,
+				src->File, src->Index, arg->Swizzle);
+
+	if (shared_mask == RC_MASK_NONE)
+		return;
+
+	if (d->ReadPairCB)
+		d->ReadPairCB(d->ReaderData, inst, arg, src);
+
+	if (d->ReaderData->Abort)
+		return;
+
+	add_reader(&d->C->Pool, d->ReaderData, inst, shared_mask, arg);
 }
 
 /**
@@ -464,37 +594,18 @@ static void get_readers_normal_read_callback(
 	struct rc_src_register * src)
 {
 	struct get_readers_callback_data * d = userdata;
-	unsigned int read_mask;
 	unsigned int shared_mask;
 
-	if (src->RelAddr)
-		d->ReaderData->Abort = 1;
-
-	shared_mask = rc_src_reads_dst_mask(src->File, src->Index,
-		src->Swizzle,
-		d->ReaderData->Writer->U.I.DstReg.File,
-		d->ReaderData->Writer->U.I.DstReg.Index,
-		d->AliveWriteMask);
+	shared_mask = get_readers_read_callback(d,
+			src->RelAddr, src->File, src->Index, src->Swizzle);
 
 	if (shared_mask == RC_MASK_NONE)
 		return;
+	/* The callback function could potentially clear d->ReaderData->Abort,
+	 * so we need to call it before we return. */
+	if (d->ReadNormalCB)
+		d->ReadNormalCB(d->ReaderData, inst, src);
 
-	/* If we make it this far, it means that this source reads from the
-	 * same register written to by d->ReaderData->Writer. */
-
-	if (d->ReaderData->AbortOnRead) {
-		d->ReaderData->Abort = 1;
-		return;
-	}
-
-	read_mask = rc_swizzle_to_writemask(src->Swizzle);
-	/* XXX The behavior in this case should be configurable. */
-	if ((read_mask & d->AliveWriteMask) != read_mask) {
-		d->ReaderData->Abort = 1;
-		return;
-	}
-
-	d->ReadCB(d->ReaderData, inst, src);
 	if (d->ReaderData->Abort)
 		return;
 
@@ -515,29 +626,132 @@ static void get_readers_write_callback(
 {
 	struct get_readers_callback_data * d = userdata;
 
-	if (index == d->ReaderData->Writer->U.I.DstReg.Index
-		&& file == d->ReaderData->Writer->U.I.DstReg.File) {
-			unsigned int shared_mask = mask
-				& d->ReaderData->Writer->U.I.DstReg.WriteMask;
-		if (d->ReaderData->InElse) {
-			if (shared_mask & d->AliveWriteMask) {
-				/* We set AbortOnRead here because the
-				 * destination register of d->ReaderData->Writer
-				 * is written to in both the IF and the
-				 * ELSE block of this IF/ELSE statement.
-				 * This means that readers of this
-				 * destination register that follow this IF/ELSE
-				 * statement use the value of different
-				 * instructions depending on the control flow
-				 * decisions made by the program. */
-				d->ReaderData->AbortOnRead = 1;
+	if (index == d->DstIndex && file == d->DstFile) {
+		unsigned int shared_mask = mask & d->DstMask;
+		d->ReaderData->AbortOnRead &= ~shared_mask;
+		d->AliveWriteMask &= ~shared_mask;
+	}
+
+	if(d->WriteCB)
+		d->WriteCB(d->ReaderData, inst, file, index, mask);
+}
+
+static void get_readers_for_single_write(
+	void * userdata,
+	struct rc_instruction * writer,
+	rc_register_file dst_file,
+	unsigned int dst_index,
+	unsigned int dst_mask)
+{
+	struct rc_instruction * tmp;
+	unsigned int branch_depth = 0;
+	struct get_readers_callback_data * d = userdata;
+
+	d->ReaderData->Writer = writer;
+	d->ReaderData->AbortOnRead = 0;
+	d->ReaderData->InElse = 0;
+	d->DstFile = dst_file;
+	d->DstIndex = dst_index;
+	d->DstMask = dst_mask;
+	d->AliveWriteMask = dst_mask;
+	memset(d->BranchMasks, 0, sizeof(d->BranchMasks));
+
+	if (!dst_mask)
+		return;
+
+	for(tmp = writer->Next; tmp != &d->C->Program.Instructions;
+							tmp = tmp->Next){
+		rc_opcode opcode = get_flow_control_inst(tmp);
+		switch(opcode) {
+		case RC_OPCODE_BGNLOOP:
+			/* XXX We can do better when we see a BGNLOOP if we
+			 * add a flag called AbortOnWrite to struct
+			 * rc_reader_data and leave it set until the next
+			 * ENDLOOP. */
+		case RC_OPCODE_ENDLOOP:
+			/* XXX We can do better when we see an ENDLOOP by
+			 * searching backwards from writer and looking for
+			 * readers of writer's destination index.  If we find a
+			 * reader before we get to the BGNLOOP, we must abort
+			 * unless there is another writer between that reader
+			 * and the BGNLOOP. */
+		case RC_OPCODE_BRK:
+		case RC_OPCODE_CONT:
+			d->ReaderData->Abort = 1;
+			return;
+		case RC_OPCODE_IF:
+			branch_depth++;
+			if (branch_depth > R500_PFS_MAX_BRANCH_DEPTH_FULL) {
+				d->ReaderData->Abort = 1;
+				return;
+			}
+			d->BranchMasks[branch_depth].IfWriteMask =
+							d->AliveWriteMask;
+			break;
+		case RC_OPCODE_ELSE:
+			if (branch_depth == 0) {
+				d->ReaderData->InElse = 1;
+			} else {
+				unsigned int temp_mask = d->AliveWriteMask;
+				d->AliveWriteMask =
+					d->BranchMasks[branch_depth].IfWriteMask;
+				d->BranchMasks[branch_depth].ElseWriteMask =
+								temp_mask;
+				d->BranchMasks[branch_depth].HasElse = 1;
 			}
+			break;
+		case RC_OPCODE_ENDIF:
+			if (branch_depth == 0) {
+				d->ReaderData->AbortOnRead = d->AliveWriteMask;
+				d->ReaderData->InElse = 0;
+			}
+			else {
+				struct branch_write_mask * masks =
+					&d->BranchMasks[branch_depth];
+
+				if (masks->HasElse) {
+					d->ReaderData->AbortOnRead |=
+						masks->IfWriteMask
+							& ~masks->ElseWriteMask;
+					d->AliveWriteMask = masks->IfWriteMask
+						^ ((masks->IfWriteMask ^
+							masks->ElseWriteMask)
+						& (masks->IfWriteMask
+							^ d->AliveWriteMask));
+				} else {
+					d->ReaderData->AbortOnRead |=
+						masks->IfWriteMask
+							& ~d->AliveWriteMask;
+					d->AliveWriteMask = masks->IfWriteMask;
+
+				}
+				memset(masks, 0,
+					sizeof(struct branch_write_mask));
+				branch_depth--;
+			}
+			break;
+		default:
+			break;
+		}
+
+		if (d->ReaderData->InElse)
+			continue;
+
+		if (tmp->Type == RC_INSTRUCTION_NORMAL) {
+			rc_for_all_reads_src(tmp,
+				get_readers_normal_read_callback, d);
 		} else {
-			d->AliveWriteMask &= ~shared_mask;
+			rc_pair_for_all_reads_arg(tmp,
+				get_readers_pair_read_callback, d);
 		}
-	}
+		rc_for_all_writes_mask(tmp, get_readers_write_callback, d);
 
-	d->WriteCB(d->ReaderData, inst, file, index, mask);
+		if (d->ReaderData->Abort)
+			return;
+
+		if (branch_depth == 0 && !d->AliveWriteMask)
+			return;
+	}
 }
 
 /**
@@ -578,83 +792,26 @@ static void get_readers_write_callback(
  * @param write_cb This function will be called for every instruction after
  * writer.
  */
-void  rc_get_readers_normal(
+void rc_get_readers(
 	struct radeon_compiler * c,
 	struct rc_instruction * writer,
 	struct rc_reader_data * data,
-	rc_read_src_fn read_cb,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
 	rc_read_write_mask_fn write_cb)
 {
-	struct rc_instruction * tmp;
 	struct get_readers_callback_data d;
-	unsigned int branch_depth = 0;
 
-	data->Writer = writer;
 	data->Abort = 0;
-	data->AbortOnRead = 0;
-	data->InElse = 0;
 	data->ReaderCount = 0;
 	data->ReadersReserved = 0;
 	data->Readers = NULL;
 
 	d.C = c;
-	d.AliveWriteMask = writer->U.I.DstReg.WriteMask;
 	d.ReaderData = data;
-	d.ReadCB = read_cb;
+	d.ReadNormalCB = read_normal_cb;
+	d.ReadPairCB = read_pair_cb;
 	d.WriteCB = write_cb;
 
-	if (!writer->U.I.DstReg.WriteMask)
-		return;
-
-	for(tmp = writer->Next; tmp != &c->Program.Instructions;
-							tmp = tmp->Next){
-		rc_opcode opcode = get_flow_control_inst(tmp);
-		switch(opcode) {
-		case RC_OPCODE_BGNLOOP:
-			/* XXX We can do better when we see a BGNLOOP if we
-			 * add a flag called AbortOnWrite to struct
-			 * rc_reader_data and leave it set until the next
-			 * ENDLOOP. */
-		case RC_OPCODE_ENDLOOP:
-			/* XXX We can do better when we see an ENDLOOP by
-			 * searching backwards from writer and looking for
-			 * readers of writer's destination index.  If we find a
-			 * reader before we get to the BGNLOOP, we must abort
-			 * unless there is another writer between that reader
-			 * and the BGNLOOP. */
-			data->Abort = 1;
-			return;
-		case RC_OPCODE_IF:
-			/* XXX We can do better here, but this will have to
-			 * do until this dataflow analysis is more mature. */
-			data->Abort = 1;
-			branch_depth++;
-			break;
-		case RC_OPCODE_ELSE:
-			if (branch_depth == 0)
-				data->InElse = 1;
-			break;
-		case RC_OPCODE_ENDIF:
-			if (branch_depth == 0) {
-				data->AbortOnRead = 1;
-				data->InElse = 0;
-			}
-			else {
-				branch_depth--;
-			}
-			break;
-		default:
-			break;
-		}
-
-		if (!data->InElse)
-			rc_for_all_reads_src(tmp, get_readers_normal_read_callback, &d);
-		rc_for_all_writes_mask(tmp, get_readers_write_callback, &d);
-
-		if (data->Abort)
-			return;
-
-		if (!d.AliveWriteMask)
-			return;
-	}
+	rc_for_all_writes_mask(writer, get_readers_for_single_write, &d);
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
index 7de6b98f763..ef971c5b234 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow.h
@@ -36,6 +36,7 @@ struct rc_instruction;
 struct rc_swizzle_caps;
 struct rc_src_register;
 struct rc_pair_instruction_arg;
+struct rc_pair_instruction_source;
 struct rc_compiler;
 
 
@@ -59,7 +60,8 @@ void rc_for_all_reads_src(struct rc_instruction * inst, rc_read_src_fn cb,
 			void * userdata);
 
 typedef void (*rc_pair_read_arg_fn)(void * userdata,
-	struct rc_instruction * inst, struct rc_pair_instruction_arg * arg);
+	struct rc_instruction * inst, struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src);
 void rc_pair_for_all_reads_arg(struct rc_instruction * inst,
 					rc_pair_read_arg_fn cb, void * userdata);
 
@@ -71,7 +73,10 @@ void rc_remap_registers(struct rc_instruction * inst, rc_remap_register_fn cb, v
 struct rc_reader {
 	struct rc_instruction * Inst;
 	unsigned int WriteMask;
-	struct rc_src_register * Src;
+	union {
+		struct rc_src_register * Src;
+		struct rc_pair_instruction_arg * Arg;
+	} U;
 };
 
 struct rc_reader_data {
@@ -87,14 +92,13 @@ struct rc_reader_data {
 	void * CbData;
 };
 
-void rc_get_readers_normal(
+void rc_get_readers(
 	struct radeon_compiler * c,
-	struct rc_instruction * inst,
+	struct rc_instruction * writer,
 	struct rc_reader_data * data,
-	/*XXX: These should be their own function types. */
-	rc_read_src_fn read_cb,
+	rc_read_src_fn read_normal_cb,
+	rc_pair_read_arg_fn read_pair_cb,
 	rc_read_write_mask_fn write_cb);
-
 /**
  * Compiler passes based on dataflow analysis.
  */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index da495a3afaa..25afd272bee 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -67,6 +67,13 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.IsComponentwise = 1
 	},
 	{
+		.Opcode = RC_OPCODE_CLAMP,
+		.Name = "CLAMP",
+		.NumSrcRegs = 3,
+		.HasDstReg = 1,
+		.IsComponentwise = 1
+	},
+	{
 		.Opcode = RC_OPCODE_CMP,
 		.Name = "CMP",
 		.NumSrcRegs = 3,
@@ -453,6 +460,7 @@ void rc_compute_sources_for_writemask(
 			srcmasks[1] |= RC_MASK_XY;
 			break;
 		case RC_OPCODE_DP3:
+		case RC_OPCODE_XPD:
 			srcmasks[0] |= RC_MASK_XYZ;
 			srcmasks[1] |= RC_MASK_XYZ;
 			break;
@@ -460,6 +468,10 @@ void rc_compute_sources_for_writemask(
 			srcmasks[0] |= RC_MASK_XYZW;
 			srcmasks[1] |= RC_MASK_XYZW;
 			break;
+		case RC_OPCODE_DPH:
+			srcmasks[0] |= RC_MASK_XYZ;
+			srcmasks[1] |= RC_MASK_XYZW;
+			break;
 		case RC_OPCODE_TXB:
 		case RC_OPCODE_TXP:
 			srcmasks[0] |= RC_MASK_W;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index d3f639c8701..7e666101276 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -50,6 +50,9 @@ typedef enum {
 	/** vec4 instruction: dst.c = ceil(src0.c) */
 	RC_OPCODE_CEIL,
 
+	/** vec4 instruction: dst.c = clamp(src0.c, src1.c, src2.c) */
+	RC_OPCODE_CLAMP,
+
 	/** vec4 instruction: dst.c = src0.c < 0.0 ? src1.c : src2.c */
 	RC_OPCODE_CMP,
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index 15b9c5e7dc3..44f4c0fbdc7 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -54,12 +54,7 @@ static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct
 		combine.Negate = outer.Negate;
 	} else {
 		combine.Abs = inner.Abs;
-		combine.Negate = 0;
-		for(unsigned int chan = 0; chan < 4; ++chan) {
-			unsigned int swz = GET_SWZ(outer.Swizzle, chan);
-			if (swz < 4)
-				combine.Negate |= GET_BIT(inner.Negate, swz) << chan;
-		}
+		combine.Negate = swizzle_mask(outer.Swizzle, inner.Negate);
 		combine.Negate ^= outer.Negate;
 	}
 	combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
@@ -71,12 +66,13 @@ static void copy_propagate_scan_read(void * data, struct rc_instruction * inst,
 {
 	rc_register_file file = src->File;
 	struct rc_reader_data * reader_data = data;
-	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
 
-	/* It is possible to do copy propigation in this situation,
-	 * just not right now, see peephole_add_presub_inv() */
-	if (reader_data->Writer->U.I.PreSub.Opcode != RC_PRESUB_NONE &&
-			(info->NumSrcRegs > 2 || info->HasTexture)) {
+	if(!rc_inst_can_use_presub(inst,
+				reader_data->Writer->U.I.PreSub.Opcode,
+				rc_swizzle_to_writemask(src->Swizzle),
+				*src,
+				reader_data->Writer->U.I.PreSub.SrcReg[0],
+				reader_data->Writer->U.I.PreSub.SrcReg[1])) {
 		reader_data->Abort = 1;
 		return;
 	}
@@ -112,11 +108,11 @@ static void src_clobbered_reads_cb(
 	    && src->Index == sc_data->Index
 	    && (rc_swizzle_to_writemask(src->Swizzle) & sc_data->Mask)) {
 
-		sc_data->ReaderData->AbortOnRead = 1;
+		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
 	}
 
 	if (src->RelAddr && sc_data->File == RC_FILE_ADDRESS) {
-		sc_data->ReaderData->AbortOnRead = 1;
+		sc_data->ReaderData->AbortOnRead = RC_MASK_XYZW;
 	}
 }
 
@@ -149,8 +145,9 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 		return;
 
 	/* Get a list of all the readers of this MOV instruction. */
-	rc_get_readers_normal(c, inst_mov, &reader_data,
-			copy_propagate_scan_read, is_src_clobbered_scan_write);
+	rc_get_readers(c, inst_mov, &reader_data,
+		       copy_propagate_scan_read, NULL,
+		       is_src_clobbered_scan_write);
 
 	if (reader_data.Abort || reader_data.ReaderCount == 0)
 		return;
@@ -158,7 +155,7 @@ static void copy_propagate(struct radeon_compiler * c, struct rc_instruction * i
 	/* Propagate the MOV instruction. */
 	for (i = 0; i < reader_data.ReaderCount; i++) {
 		struct rc_instruction * inst = reader_data.Readers[i].Inst;
-		*reader_data.Readers[i].Src = chain_srcregs(*reader_data.Readers[i].Src, inst_mov->U.I.SrcReg[0]);
+		*reader_data.Readers[i].U.Src = chain_srcregs(*reader_data.Readers[i].U.Src, inst_mov->U.I.SrcReg[0]);
 
 		if (inst_mov->U.I.SrcReg[0].File == RC_FILE_PRESUB)
 			inst->U.I.PreSub = inst_mov->U.I.PreSub;
@@ -423,24 +420,13 @@ static void presub_scan_read(
 	struct rc_src_register * src)
 {
 	struct rc_reader_data * reader_data = data;
-	const struct rc_opcode_info * info =
-					rc_get_opcode_info(inst->U.I.Opcode);
-	/* XXX: There are some situations where instructions
-	 * with more than 2 src registers can use the
-	 * presubtract select, but to keep things simple we
-	 * will disable presubtract on these instructions for
-	 * now. */
-	if (info->NumSrcRegs > 2 || info->HasTexture) {
-		reader_data->Abort = 1;
-		return;
-	}
+	rc_presubtract_op * presub_opcode = reader_data->CbData;
 
-	/* We can't use more than one presubtract value in an
-	 * instruction, unless the two prsubtract operations
-	 * are the same and read from the same registers.
-	 * XXX For now we will limit instructions to only one presubtract
-	 * value.*/
-	if (inst->U.I.PreSub.Opcode != RC_PRESUB_NONE) {
+	if (!rc_inst_can_use_presub(inst, *presub_opcode,
+			reader_data->Writer->U.I.DstReg.WriteMask,
+			*src,
+			reader_data->Writer->U.I.SrcReg[0],
+			reader_data->Writer->U.I.SrcReg[1])) {
 		reader_data->Abort = 1;
 		return;
 	}
@@ -454,8 +440,10 @@ static int presub_helper(
 {
 	struct rc_reader_data reader_data;
 	unsigned int i;
+	rc_presubtract_op cb_op = presub_opcode;
 
-	rc_get_readers_normal(c, inst_add, &reader_data, presub_scan_read,
+	reader_data.CbData = &cb_op;
+	rc_get_readers(c, inst_add, &reader_data, presub_scan_read, NULL,
 						is_src_clobbered_scan_write);
 
 	if (reader_data.Abort || reader_data.ReaderCount == 0)
@@ -468,7 +456,7 @@ static int presub_helper(
 				rc_get_opcode_info(reader.Inst->U.I.Opcode);
 
 		for (src_index = 0; src_index < info->NumSrcRegs; src_index++) {
-			if (&reader.Inst->U.I.SrcReg[src_index] == reader.Src)
+			if (&reader.Inst->U.I.SrcReg[src_index] == reader.U.Src)
 				presub_replace(inst_add, reader.Inst, src_index);
 		}
 	}
@@ -505,7 +493,9 @@ static void presub_replace_add(
 	inst_reader->U.I.SrcReg[src_index].Index = presub_opcode;
 }
 
-static int is_presub_candidate(struct rc_instruction * inst)
+static int is_presub_candidate(
+	struct radeon_compiler * c,
+	struct rc_instruction * inst)
 {
 	const struct rc_opcode_info * info = rc_get_opcode_info(inst->U.I.Opcode);
 	unsigned int i;
@@ -514,7 +504,12 @@ static int is_presub_candidate(struct rc_instruction * inst)
 		return 0;
 
 	for(i = 0; i < info->NumSrcRegs; i++) {
-		if (src_reads_dst_mask(inst->U.I.SrcReg[i], inst->U.I.DstReg))
+		struct rc_src_register src = inst->U.I.SrcReg[i];
+		if (src_reads_dst_mask(src, inst->U.I.DstReg))
+			return 0;
+
+		src.File = RC_FILE_PRESUB;
+		if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, src))
 			return 0;
 	}
 	return 1;
@@ -528,7 +523,7 @@ static int peephole_add_presub_add(
 	struct rc_src_register * src1 = NULL;
 	unsigned int i;
 
-	if (!is_presub_candidate(inst_add))
+	if (!is_presub_candidate(c, inst_add))
 		return 0;
 
 	if (inst_add->U.I.SrcReg[0].Swizzle != inst_add->U.I.SrcReg[1].Swizzle)
@@ -592,7 +587,7 @@ static int peephole_add_presub_inv(
 {
 	unsigned int i, swz, mask;
 
-	if (!is_presub_candidate(inst_add))
+	if (!is_presub_candidate(c, inst_add))
 		return 0;
 
 	mask = inst_add->U.I.DstReg.WriteMask;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
index 91524f5ec68..d53181e1f75 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
@@ -66,10 +66,13 @@ struct regalloc_state {
 	struct hardware_register * HwTemporary;
 	unsigned int NumHwTemporaries;
 	/**
-	 * If an instruction is inside of a loop, end_loop will be the
-	 * IP of the ENDLOOP instruction, otherwise end_loop will be 0
+	 * If an instruction is inside of a loop, EndLoop will be the
+	 * IP of the ENDLOOP instruction, and BeginLoop will be the IP
+	 * of the BGNLOOP instruction.  Otherwise, EndLoop and BeginLoop
+	 * will be -1.
 	 */
-	int end_loop;
+	int EndLoop;
+	int BeginLoop;
 };
 
 static void print_live_intervals(struct live_intervals * src)
@@ -180,11 +183,13 @@ static void scan_callback(void * data, struct rc_instruction * inst,
 		reg->Used = 1;
 		if (file == RC_FILE_INPUT)
 			reg->Live.Start = -1;
+		else if (s->BeginLoop >= 0)
+			reg->Live.Start = s->BeginLoop;
 		else
 			reg->Live.Start = inst->IP;
 		reg->Live.End = inst->IP;
-	} else if (s->end_loop)
-		reg->Live.End = s->end_loop;
+	} else if (s->EndLoop >= 0)
+		reg->Live.End = s->EndLoop;
 	else if (inst->IP > reg->Live.End)
 		reg->Live.End = inst->IP;
 }
@@ -195,6 +200,8 @@ static void compute_live_intervals(struct radeon_compiler *c,
 	memset(s, 0, sizeof(*s));
 	s->C = c;
 	s->NumHwTemporaries = c->max_temp_regs;
+	s->BeginLoop = -1;
+	s->EndLoop = -1;
 	s->HwTemporary =
 		memory_pool_malloc(&c->Pool,
 				   s->NumHwTemporaries * sizeof(struct hardware_register));
@@ -207,10 +214,12 @@ static void compute_live_intervals(struct radeon_compiler *c,
 	    inst = inst->Next) {
 
 		/* For all instructions inside of a loop, the ENDLOOP
-		 * instruction is used as the end of the live interval. */
-		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && !s->end_loop) {
+		 * instruction is used as the end of the live interval and
+		 * the BGNLOOP instruction is used as the beginning. */
+		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && s->EndLoop < 0) {
 			int loops = 1;
 			struct rc_instruction * tmp;
+			s->BeginLoop = inst->IP;
 			for(tmp = inst->Next;
 					tmp != &s->C->Program.Instructions;
 					tmp = tmp->Next) {
@@ -219,15 +228,17 @@ static void compute_live_intervals(struct radeon_compiler *c,
 				} else if (tmp->U.I.Opcode
 							== RC_OPCODE_ENDLOOP) {
 					if(!--loops) {
-						s->end_loop = tmp->IP;
+						s->EndLoop = tmp->IP;
 						break;
 					}
 				}
 			}
 		}
 
-		if (inst->IP == s->end_loop)
-			s->end_loop = 0;
+		if (inst->IP == s->EndLoop) {
+			s->EndLoop = -1;
+			s->BeginLoop = -1;
+		}
 
 		rc_for_all_reads_mask(inst, scan_callback, s);
 		rc_for_all_writes_mask(inst, scan_callback, s);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
index 553e9dcf7c1..9beb5d63579 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_schedule.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
 #include "radeon_dataflow.h"
 
 
@@ -54,6 +55,11 @@ struct schedule_instruction {
 	 * this instruction can be scheduled.
 	 */
 	unsigned int NumDependencies:5;
+
+	/** List of all readers (see rc_get_readers() for the definition of
+	 * "all readers"), even those outside the basic block this instruction
+	 * lives in. */
+	struct rc_reader_data GlobalReaders;
 };
 
 
@@ -94,6 +100,16 @@ struct register_state {
 	struct reg_value * Values[4];
 };
 
+struct remap_reg { /* Old/New index+swizzle record; Next points at another remap_reg */
+	struct rc_instruction * Inst;
+	unsigned int OldIndex:(RC_REGISTER_INDEX_BITS+1);
+	unsigned int OldSwizzle:3;
+	unsigned int NewIndex:(RC_REGISTER_INDEX_BITS+1);
+	unsigned int NewSwizzle:3;
+	unsigned int OnlyTexReads:1;
+	struct remap_reg * Next;
+};
+
 struct schedule_state {
 	struct radeon_compiler * C;
 	struct schedule_instruction * Current;
@@ -126,15 +142,6 @@ static struct reg_value ** get_reg_valuep(struct schedule_state * s,
 	return &s->Temporary[index].Values[chan];
 }
 
-static struct reg_value * get_reg_value(struct schedule_state * s,
-		rc_register_file file, unsigned int index, unsigned int chan)
-{
-	struct reg_value ** pv = get_reg_valuep(s, file, index, chan);
-	if (!pv)
-		return 0;
-	return *pv;
-}
-
 static void add_inst_to_list(struct schedule_instruction ** list, struct schedule_instruction * inst)
 {
 	inst->NextReady = *list;
@@ -295,12 +302,12 @@ static int merge_presub_sources(
 	assert(dst_full->Alpha.Opcode == RC_OPCODE_NOP);
 
 	switch(type) {
-	case RC_PAIR_SOURCE_RGB:
+	case RC_SOURCE_RGB:
 		is_rgb = 1;
 		is_alpha = 0;
 		dst_sub = &dst_full->RGB;
 		break;
-	case RC_PAIR_SOURCE_ALPHA:
+	case RC_SOURCE_ALPHA:
 		is_rgb = 0;
 		is_alpha = 1;
 		dst_sub = &dst_full->Alpha;
@@ -341,6 +348,8 @@ static int merge_presub_sources(
 				continue;
 			free_source = rc_pair_alloc_source(dst_full, is_rgb,
 					is_alpha, temp.File, temp.Index);
+			if (free_source < 0)
+				return 0;
 			one_way = 1;
 		} else {
 			dst_sub->Src[free_source] = temp;
@@ -356,11 +365,11 @@ static int merge_presub_sources(
 		for(arg = 0; arg < info->NumSrcRegs; arg++) {
 			/*If this arg does not read from an rgb source,
 			 * do nothing. */
-			if (!(rc_source_type_that_arg_reads(
-				dst_full->RGB.Arg[arg].Source,
-				dst_full->RGB.Arg[arg].Swizzle) & type)) {
+			if (!(rc_source_type_swz(dst_full->RGB.Arg[arg].Swizzle,
+								3) & type)) {
 				continue;
 			}
+
 			if (dst_full->RGB.Arg[arg].Source == srcp_src)
 				dst_full->RGB.Arg[arg].Source = free_source;
 			/* We need to do this just in case register
@@ -392,13 +401,13 @@ static int destructive_merge_instructions(
 
 	/* Merge the rgb presubtract registers. */
 	if (alpha->RGB.Src[RC_PAIR_PRESUB_SRC].Used) {
-		if (!merge_presub_sources(rgb, alpha->RGB, RC_PAIR_SOURCE_RGB)) {
+		if (!merge_presub_sources(rgb, alpha->RGB, RC_SOURCE_RGB)) {
 			return 0;
 		}
 	}
 	/* Merge the alpha presubtract registers */
 	if (alpha->Alpha.Src[RC_PAIR_PRESUB_SRC].Used) {
-		if(!merge_presub_sources(rgb,  alpha->Alpha, RC_PAIR_SOURCE_ALPHA)){
+		if(!merge_presub_sources(rgb,  alpha->Alpha, RC_SOURCE_ALPHA)){
 			return 0;
 		}
 	}
@@ -525,6 +534,222 @@ static void presub_nop(struct rc_instruction * emitted) {
 		}
 	}
 }
+
+static void rgb_to_alpha_remap (
+	struct rc_instruction * inst,
+	struct rc_pair_instruction_arg * arg,
+	rc_register_file old_file,
+	rc_swizzle old_swz,
+	unsigned int new_index)
+{
+	int new_src_index;
+	unsigned int i;
+	struct rc_pair_instruction_source * old_src =
+					rc_pair_get_src(&inst->U.P, arg);
+	if (!old_src) {
+		return;
+	}
+
+	for (i = 0; i < 3; i++) {
+		if (get_swz(arg->Swizzle, i) == old_swz) {
+			SET_SWZ(arg->Swizzle, i, RC_SWIZZLE_W);
+		}
+	}
+	memset(old_src, 0, sizeof(struct rc_pair_instruction_source));
+	new_src_index = rc_pair_alloc_source(&inst->U.P, 0, 1,
+							old_file, new_index);
+	/* This conversion is not possible, we must have made a mistake in
+	 * is_rgb_to_alpha_possible. */
+	if (new_src_index < 0) {
+		assert(0);
+		return;
+	}
+
+	arg->Source = new_src_index;
+}
+
+static int can_remap(unsigned int opcode)
+{
+	switch(opcode) {
+	case RC_OPCODE_DDX:
+	case RC_OPCODE_DDY:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static int can_convert_opcode_to_alpha(unsigned int opcode)
+{
+	switch(opcode) {
+	case RC_OPCODE_DDX:
+	case RC_OPCODE_DDY:
+	case RC_OPCODE_DP2:
+	case RC_OPCODE_DP3:
+	case RC_OPCODE_DP4:
+	case RC_OPCODE_DPH:
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void is_rgb_to_alpha_possible(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_pair_instruction_arg * arg,
+	struct rc_pair_instruction_source * src)
+{
+	unsigned int chan_count = 0;
+	unsigned int alpha_sources = 0;
+	unsigned int i;
+	struct rc_reader_data * reader_data = userdata;
+
+	if (!can_remap(inst->U.P.RGB.Opcode)
+	    || !can_remap(inst->U.P.Alpha.Opcode)) {
+		reader_data->Abort = 1;
+		return;
+	}
+
+	if (!src)
+		return;
+
+	/* XXX There are some cases where we can still do the conversion if
+	 * a reader reads from a presubtract source, but for now we'll prevent
+	 * it. */
+	if (arg->Source == RC_PAIR_PRESUB_SRC) {
+		reader_data->Abort = 1;
+		return;
+	}
+
+	/* Make sure the source only reads from one component.
+	 * XXX We should allow the source to read from the same component twice.
+	 * XXX If the index we will be converting to is the same as the
+	 * current index, then it is OK to read from more than one component.
+	 */
+	for (i = 0; i < 3; i++) {
+		rc_swizzle swz = get_swz(arg->Swizzle, i);
+		switch(swz) {
+		case RC_SWIZZLE_X:
+		case RC_SWIZZLE_Y:
+		case RC_SWIZZLE_Z:
+		case RC_SWIZZLE_W:
+			chan_count++;
+			break;
+		default:
+			break;
+		}
+	}
+	if (chan_count > 1) {
+		reader_data->Abort = 1;
+		return;
+	}
+
+	/* Make sure there are enough alpha sources.
+	 * XXX If we know what register all the readers are going
+	 * to be remapped to, then in some situations we can still do
+	 * the substitution, even if all 3 alpha sources are being used.*/
+	for (i = 0; i < 3; i++) {
+		if (inst->U.P.Alpha.Src[i].Used) {
+			alpha_sources++;
+		}
+	}
+	if (alpha_sources > 2) {
+		reader_data->Abort = 1;
+		return;
+	}
+}
+
+static int convert_rgb_to_alpha(
+	struct schedule_state * s,
+	struct schedule_instruction * sched_inst)
+{
+	struct rc_pair_instruction * pair_inst = &sched_inst->Instruction->U.P;
+	unsigned int old_mask = pair_inst->RGB.WriteMask;
+	unsigned int old_swz = rc_mask_to_swizzle(old_mask);
+	const struct rc_opcode_info * info =
+				rc_get_opcode_info(pair_inst->RGB.Opcode);
+	int new_index = -1;
+	unsigned int i;
+
+	if (sched_inst->GlobalReaders.Abort)
+		return 0;
+
+	if (!pair_inst->RGB.WriteMask)
+		return 0;
+
+	if (!can_convert_opcode_to_alpha(pair_inst->RGB.Opcode)
+	    || !can_convert_opcode_to_alpha(pair_inst->Alpha.Opcode)) {
+		return 0;
+	}
+
+	assert(sched_inst->NumWriteValues == 1);
+
+	if (!sched_inst->WriteValues[0]) {
+		assert(0);
+		return 0;
+	}
+
+	/* We start at the old index, because if we can reuse the same
+	 * register and just change the swizzle then it is more likely we
+	 * will be able to convert all the readers. */
+	for (i = pair_inst->RGB.DestIndex; i < RC_REGISTER_MAX_INDEX; i++) {
+		struct reg_value ** new_regvalp = get_reg_valuep(
+						s, RC_FILE_TEMPORARY, i, 3);
+		if (!*new_regvalp) {
+			struct reg_value ** old_regvalp =
+				get_reg_valuep(s,
+					RC_FILE_TEMPORARY,
+					pair_inst->RGB.DestIndex,
+					rc_mask_to_swizzle(old_mask));
+			new_index = i;
+			*new_regvalp = *old_regvalp;
+			*old_regvalp = NULL;
+			new_regvalp = get_reg_valuep(s, RC_FILE_TEMPORARY, i, 3);
+			break;
+		}
+	}
+	if (new_index < 0) {
+		return 0;
+	}
+
+	pair_inst->Alpha.Opcode = pair_inst->RGB.Opcode;
+	pair_inst->Alpha.DestIndex = new_index;
+	pair_inst->Alpha.WriteMask = 1;
+	pair_inst->Alpha.Target = pair_inst->RGB.Target;
+	pair_inst->Alpha.OutputWriteMask = pair_inst->RGB.OutputWriteMask;
+	pair_inst->Alpha.DepthWriteMask = pair_inst->RGB.DepthWriteMask;
+	pair_inst->Alpha.Saturate = pair_inst->RGB.Saturate;
+	memcpy(pair_inst->Alpha.Arg, pair_inst->RGB.Arg,
+						sizeof(pair_inst->Alpha.Arg));
+	/* Move the swizzles into the first chan */
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		unsigned int j;
+		for (j = 0; j < 3; j++) {
+			unsigned int swz = get_swz(pair_inst->Alpha.Arg[i].Swizzle, j);
+			if (swz != RC_SWIZZLE_UNUSED) {
+				pair_inst->Alpha.Arg[i].Swizzle = swz;
+				break;
+			}
+		}
+	}
+	pair_inst->RGB.Opcode = RC_OPCODE_NOP;
+	pair_inst->RGB.DestIndex = 0;
+	pair_inst->RGB.WriteMask = 0;
+	pair_inst->RGB.Target = 0;
+	pair_inst->RGB.OutputWriteMask = 0;
+	pair_inst->RGB.DepthWriteMask = 0;
+	pair_inst->RGB.Saturate = 0;
+	memset(pair_inst->RGB.Arg, 0, sizeof(pair_inst->RGB.Arg));
+
+	for(i = 0; i < sched_inst->GlobalReaders.ReaderCount; i++) {
+		struct rc_reader reader = sched_inst->GlobalReaders.Readers[i];
+		rgb_to_alpha_remap(reader.Inst, reader.U.Arg,
+					RC_FILE_TEMPORARY, old_swz, new_index);
+	}
+	return 1;
+}
+
 /**
  * Find a good ALU instruction or pair of ALU instruction and emit it.
  *
@@ -536,24 +761,16 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 {
 	struct schedule_instruction * sinst;
 
-	if (s->ReadyFullALU || !(s->ReadyRGB && s->ReadyAlpha)) {
-		if (s->ReadyFullALU) {
-			sinst = s->ReadyFullALU;
-			s->ReadyFullALU = s->ReadyFullALU->NextReady;
-		} else if (s->ReadyRGB) {
-			sinst = s->ReadyRGB;
-			s->ReadyRGB = s->ReadyRGB->NextReady;
-		} else {
-			sinst = s->ReadyAlpha;
-			s->ReadyAlpha = s->ReadyAlpha->NextReady;
-		}
-
+	if (s->ReadyFullALU) {
+		sinst = s->ReadyFullALU;
+		s->ReadyFullALU = s->ReadyFullALU->NextReady;
 		rc_insert_instruction(before->Prev, sinst->Instruction);
 		commit_alu_instruction(s, sinst);
 	} else {
 		struct schedule_instruction **prgb;
 		struct schedule_instruction **palpha;
-
+		struct schedule_instruction *prev;
+pair:
 		/* Some pairings might fail because they require too
 		 * many source slots; try all possible pairings if necessary */
 		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
@@ -572,10 +789,43 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor
 				goto success;
 			}
 		}
-
-		/* No success in pairing; just take the first RGB instruction */
-		sinst = s->ReadyRGB;
-		s->ReadyRGB = s->ReadyRGB->NextReady;
+		prev = NULL;
+		/* No success in pairing, now try to convert one of the RGB
+		 * instructions to an Alpha so we can pair it with another RGB.
+		 */
+		if (s->ReadyRGB && s->ReadyRGB->NextReady) {
+		for(prgb = &s->ReadyRGB; *prgb; prgb = &(*prgb)->NextReady) {
+			if ((*prgb)->NumWriteValues == 1) {
+				struct schedule_instruction * prgb_next;
+				if (!convert_rgb_to_alpha(s, *prgb))
+					goto cont_loop;
+				prgb_next = (*prgb)->NextReady;
+				/* Add instruction to the Alpha ready list. */
+				(*prgb)->NextReady = s->ReadyAlpha;
+				s->ReadyAlpha = *prgb;
+				/* Remove instruction from the RGB ready list.*/
+				if (prev)
+					prev->NextReady = prgb_next;
+				else
+					s->ReadyRGB = prgb_next;
+				goto pair;
+			}
+cont_loop:
+			prev = *prgb;
+		}
+		}
+		/* Still no success in pairing, just take the first RGB
+		 * or alpha instruction. */
+		if (s->ReadyRGB) {
+			sinst = s->ReadyRGB;
+			s->ReadyRGB = s->ReadyRGB->NextReady;
+		} else if (s->ReadyAlpha) {
+			sinst = s->ReadyAlpha;
+			s->ReadyAlpha = s->ReadyAlpha->NextReady;
+		} else {
+			/*XXX Something real bad has happened. */
+			assert(0);
+		}
 
 		rc_insert_instruction(before->Prev, sinst->Instruction);
 		commit_alu_instruction(s, sinst);
@@ -591,13 +841,13 @@ static void scan_read(void * data, struct rc_instruction * inst,
 		rc_register_file file, unsigned int index, unsigned int chan)
 {
 	struct schedule_state * s = data;
-	struct reg_value * v = get_reg_value(s, file, index, chan);
+	struct reg_value ** v = get_reg_valuep(s, file, index, chan);
 	struct reg_value_reader * reader;
 
 	if (!v)
 		return;
 
-	if (v->Writer == s->Current) {
+	if (*v && (*v)->Writer == s->Current) {
 		/* The instruction reads and writes to a register component.
 		 * In this case, we only want to increment dependencies by one.
 		 */
@@ -608,16 +858,28 @@ static void scan_read(void * data, struct rc_instruction * inst,
 
 	reader = memory_pool_malloc(&s->C->Pool, sizeof(*reader));
 	reader->Reader = s->Current;
-	reader->Next = v->Readers;
-	v->Readers = reader;
-	v->NumReaders++;
-
-	s->Current->NumDependencies++;
+	if (!*v) {
+		/* In this situation, the instruction reads from a register
+		 * that hasn't been written to or read from in the current
+		 * block. */
+		*v = memory_pool_malloc(&s->C->Pool, sizeof(struct reg_value));
+		memset(*v, 0, sizeof(struct reg_value));
+		(*v)->Readers = reader;
+	} else {
+		reader->Next = (*v)->Readers;
+		(*v)->Readers = reader;
+		/* Only update the current instruction's dependencies if the
+		 * register it reads from has been written to in this block. */
+		if ((*v)->Writer) {
+			s->Current->NumDependencies++;
+		}
+	}
+	(*v)->NumReaders++;
 
 	if (s->Current->NumReadValues >= 12) {
 		rc_error(s->C, "%s: NumReadValues overflow\n", __FUNCTION__);
 	} else {
-		s->Current->ReadValues[s->Current->NumReadValues++] = v;
+		s->Current->ReadValues[s->Current->NumReadValues++] = *v;
 	}
 }
 
@@ -652,6 +914,16 @@ static void scan_write(void * data, struct rc_instruction * inst,
 	}
 }
 
+static void is_rgb_to_alpha_possible_normal(
+	void * userdata,
+	struct rc_instruction * inst,
+	struct rc_src_register * src)
+{
+	struct rc_reader_data * reader_data = userdata;
+	reader_data->Abort = 1;
+
+}
+
 static void schedule_block(struct r300_fragment_program_compiler * c,
 		struct rc_instruction * begin, struct rc_instruction * end)
 {
@@ -683,6 +955,11 @@ static void schedule_block(struct r300_fragment_program_compiler * c,
 
 		if (!s.Current->NumDependencies)
 			instruction_ready(&s, s.Current);
+
+		/* Get global readers for possible RGB->Alpha conversion. */
+		rc_get_readers(s.C, inst, &s.Current->GlobalReaders,
+				is_rgb_to_alpha_possible_normal,
+				is_rgb_to_alpha_possible, NULL);
 	}
 
 	/* Temporarily unlink all instructions */
@@ -711,8 +988,13 @@ static int is_controlflow(struct rc_instruction * inst)
 
 void rc_pair_schedule(struct radeon_compiler *cc, void *user)
 {
+	struct schedule_state s;
+
 	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc;
 	struct rc_instruction * inst = c->Base.Program.Instructions.Next;
+
+	memset(&s, 0, sizeof(s));
+	s.C = &c->Base;
 	while(inst != &c->Base.Program.Instructions) {
 		struct rc_instruction * first;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
index c549be52183..fc05366f50e 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_translate.c
@@ -280,9 +280,12 @@ static void set_pair_instruction(struct r300_fragment_program_compiler *c,
 			pair->RGB.DestIndex = inst->DstReg.Index;
 			pair->RGB.WriteMask |= inst->DstReg.WriteMask & RC_MASK_XYZ;
 		}
+
 		if (needalpha) {
-			pair->Alpha.DestIndex = inst->DstReg.Index;
 			pair->Alpha.WriteMask |= GET_BIT(inst->DstReg.WriteMask, 3);
+			if (pair->Alpha.WriteMask) {
+				pair->Alpha.DestIndex = inst->DstReg.Index;
+			}
 		}
 	}
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.c b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
index 24b685fbeb4..fe5756ebc45 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.c
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 #include "radeon_compiler.h"
+#include "radeon_dataflow.h"
 
 
 /**
@@ -70,58 +71,98 @@ void rc_local_transform(
 	}
 }
 
+struct get_used_temporaries_data {
+	unsigned char * Used;
+	unsigned int UsedLength;
+};
+
+static void get_used_temporaries_cb(
+	void * userdata,
+	struct rc_instruction * inst,
+	rc_register_file file,
+	unsigned int index,
+	unsigned int mask)
+{
+	struct get_used_temporaries_data * d = userdata;
+
+	if (file != RC_FILE_TEMPORARY)
+		return;
+
+	if (index >= d->UsedLength)
+		return;
+
+	d->Used[index] |= mask;
+}
+
 /**
- * Left multiplication of a register with a swizzle
+ * This function fills in the parameter 'used' with a writemask that
+ * represents which components of each temporary register are used by the
+ * program.  This is meant to be combined with rc_find_free_temporary_list as a
+ * more efficient version of rc_find_free_temporary.
+ * @param used Not cleared by this function; masks are OR-ed into it.
  */
-struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg)
+void rc_get_used_temporaries(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length)
+{
+	struct rc_instruction * inst;
+	struct get_used_temporaries_data d;
+	d.Used = used;
+	d.UsedLength = used_length;
+
+	for(inst = c->Program.Instructions.Next;
+			inst != &c->Program.Instructions; inst = inst->Next) {
+
+		rc_for_all_reads_mask(inst, get_used_temporaries_cb, &d);
+		rc_for_all_writes_mask(inst, get_used_temporaries_cb, &d);
+	}
+}
+
+/* Search a list of used temporaries for a free one
+ * \sa rc_get_used_temporaries
+ * @note If this function finds a free temporary, it will mark it as used
+ * in the used temporary list (param 'used')
+ * @param used list of used temporaries
+ * @param used_length number of items in param 'used'
+ * @param mask which components must be free in the temporary index that is
+ * returned.
+ * @return -1 If there are no more free temporaries, otherwise the index of
+ * a temporary register where the components specified in param 'mask' are
+ * not being used.
+ */
+int rc_find_free_temporary_list(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length,
+	unsigned int mask)
 {
-	struct rc_src_register tmp = srcreg;
 	int i;
-	tmp.Swizzle = 0;
-	tmp.Negate = 0;
-	for(i = 0; i < 4; ++i) {
-		rc_swizzle swz = GET_SWZ(swizzle, i);
-		if (swz < 4) {
-			tmp.Swizzle |= GET_SWZ(srcreg.Swizzle, swz) << (i*3);
-			tmp.Negate |= GET_BIT(srcreg.Negate, swz) << i;
-		} else {
-			tmp.Swizzle |= swz << (i*3);
+	for(i = 0; i < used_length; i++) {
+		if ((~used[i] & mask) == mask) {
+			used[i] |= mask;
+			return i;
 		}
 	}
-	return tmp;
+	return -1;
 }
 
 unsigned int rc_find_free_temporary(struct radeon_compiler * c)
 {
-	char used[RC_REGISTER_MAX_INDEX];
-	unsigned int i;
-	struct rc_instruction * rcinst;
+	unsigned char used[RC_REGISTER_MAX_INDEX];
+	int free;
 
 	memset(used, 0, sizeof(used));
 
-	for (rcinst = c->Program.Instructions.Next; rcinst != &c->Program.Instructions; rcinst = rcinst->Next) {
-		const struct rc_sub_instruction *inst = &rcinst->U.I;
-		const struct rc_opcode_info *opcode = rc_get_opcode_info(inst->Opcode);
-		unsigned int k;
-
-		for (k = 0; k < opcode->NumSrcRegs; k++) {
-			if (inst->SrcReg[k].File == RC_FILE_TEMPORARY)
-				used[inst->SrcReg[k].Index] = 1;
-		}
-
-		if (opcode->HasDstReg) {
-			if (inst->DstReg.File == RC_FILE_TEMPORARY)
-				used[inst->DstReg.Index] = 1;
-		}
-	}
+	rc_get_used_temporaries(c, used, RC_REGISTER_MAX_INDEX);
 
-	for (i = 0; i < RC_REGISTER_MAX_INDEX; i++) {
-		if (!used[i])
-			return i;
+	free = rc_find_free_temporary_list(c, used, RC_REGISTER_MAX_INDEX,
+								RC_MASK_XYZW);
+	if (free < 0) {
+		rc_error(c, "Ran out of temporary registers\n");
+		return 0;
 	}
-
-	rc_error(c, "Ran out of temporary registers\n");
-	return 0;
+	return free;
 }
 
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program.h b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
index f0a77d7b539..df6c94b35f9 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program.h
@@ -159,47 +159,6 @@ struct rc_program {
 	struct rc_constant_list Constants;
 };
 
-static inline rc_swizzle get_swz(unsigned int swz, rc_swizzle idx)
-{
-	if (idx & 0x4)
-		return idx;
-	return GET_SWZ(swz, idx);
-}
-
-static inline unsigned int combine_swizzles4(unsigned int src,
-		rc_swizzle swz_x, rc_swizzle swz_y, rc_swizzle swz_z, rc_swizzle swz_w)
-{
-	unsigned int ret = 0;
-
-	ret |= get_swz(src, swz_x);
-	ret |= get_swz(src, swz_y) << 3;
-	ret |= get_swz(src, swz_z) << 6;
-	ret |= get_swz(src, swz_w) << 9;
-
-	return ret;
-}
-
-static inline unsigned int combine_swizzles(unsigned int src, unsigned int swz)
-{
-	unsigned int ret = 0;
-
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_X));
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Y)) << 3;
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_Z)) << 6;
-	ret |= get_swz(src, GET_SWZ(swz, RC_SWIZZLE_W)) << 9;
-
-	return ret;
-}
-
-struct rc_src_register lmul_swizzle(unsigned int swizzle, struct rc_src_register srcreg);
-
-static inline void reset_srcreg(struct rc_src_register* reg)
-{
-	memset(reg, 0, sizeof(struct rc_src_register));
-	reg->Swizzle = RC_SWIZZLE_XYZW;
-}
-
-
 /**
  * A transformation that can be passed to \ref rc_local_transform.
  *
@@ -222,6 +181,17 @@ void rc_local_transform(
 	struct radeon_compiler *c,
 	void *user);
 
+void rc_get_used_temporaries(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length);
+
+int rc_find_free_temporary_list(
+	struct radeon_compiler * c,
+	unsigned char * used,
+	unsigned int used_length,
+	unsigned int mask);
+
 unsigned int rc_find_free_temporary(struct radeon_compiler * c);
 
 struct rc_instruction *rc_alloc_instruction(struct radeon_compiler * c);
@@ -233,4 +203,5 @@ unsigned int rc_recompute_ips(struct radeon_compiler * c);
 
 void rc_print_program(const struct rc_program *prog);
 
+rc_swizzle rc_mask_to_swizzle(unsigned int mask);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index 39408845d5a..58977a40c7c 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -36,6 +36,7 @@
 #include "radeon_program_alu.h"
 
 #include "radeon_compiler.h"
+#include "radeon_compiler_util.h"
 
 
 static struct rc_instruction *emit1(
@@ -84,16 +85,6 @@ static struct rc_instruction *emit3(
 	return fpi;
 }
 
-static struct rc_dst_register dstreg(int file, int index)
-{
-	struct rc_dst_register dst;
-	dst.File = file;
-	dst.Index = index;
-	dst.WriteMask = RC_MASK_XYZW;
-	dst.RelAddr = 0;
-	return dst;
-}
-
 static struct rc_dst_register dstregtmpmask(int index, int mask)
 {
 	struct rc_dst_register dst = {0};
@@ -186,6 +177,38 @@ static struct rc_src_register swizzle_wwww(struct rc_src_register reg)
 	return swizzle_smear(reg, RC_SWIZZLE_W);
 }
 
+static int is_dst_safe_to_reuse(struct rc_instruction *inst)
+{
+	const struct rc_opcode_info *info = rc_get_opcode_info(inst->U.I.Opcode);
+	unsigned i;
+
+	assert(info->HasDstReg);
+
+	if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
+		return 0;
+
+	for (i = 0; i < info->NumSrcRegs; i++) {
+		if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY &&
+		    inst->U.I.SrcReg[i].Index == inst->U.I.DstReg.Index)
+			return 0;
+	}
+
+	return 1;
+}
+
+static struct rc_dst_register try_to_reuse_dst(struct radeon_compiler *c,
+					       struct rc_instruction *inst)
+{
+	unsigned tmp;
+
+	if (is_dst_safe_to_reuse(inst))
+		tmp = inst->U.I.DstReg.Index;
+	else
+		tmp = rc_find_free_temporary(c);
+
+	return dstregtmpmask(tmp, inst->U.I.DstReg.WriteMask);
+}
+
 static void transform_ABS(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
@@ -209,10 +232,26 @@ static void transform_CEIL(struct radeon_compiler* c,
 	 *     ceil(x) = x+frac(-x)
 	 */
 
-	int tempreg = rc_find_free_temporary(c);
-	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]));
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, negate(inst->U.I.SrcReg[0]));
 	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg));
+		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index));
+	rc_remove_instruction(inst);
+}
+
+static void transform_CLAMP(struct radeon_compiler *c,
+	struct rc_instruction *inst)
+{
+	/* CLAMP dst, src, min, max
+	 *    into:
+	 * MIN tmp, src, max
+	 * MAX dst, tmp, min
+	 */
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+	emit2(c, inst->Prev, RC_OPCODE_MIN, 0, dst,
+		inst->U.I.SrcReg[0], inst->U.I.SrcReg[2]);
+	emit2(c, inst->Prev, RC_OPCODE_MAX, inst->U.I.SaturateMode, inst->U.I.DstReg,
+		srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1]);
 	rc_remove_instruction(inst);
 }
 
@@ -258,10 +297,10 @@ static void transform_DST(struct radeon_compiler* c,
 static void transform_FLR(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
-	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0]);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
+	emit1(c, inst->Prev, RC_OPCODE_FRC, 0, dst, inst->U.I.SrcReg[0]);
 	emit2(c, inst->Prev, RC_OPCODE_ADD, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
+		inst->U.I.SrcReg[0], negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 	rc_remove_instruction(inst);
 }
 
@@ -351,14 +390,14 @@ static void transform_LIT(struct radeon_compiler* c,
 static void transform_LRP(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
-		dstreg(RC_FILE_TEMPORARY, tempreg),
+		dst,
 		inst->U.I.SrcReg[1], negate(inst->U.I.SrcReg[2]));
 	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode,
 		inst->U.I.DstReg,
-		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[2]);
+		inst->U.I.SrcReg[0], srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[2]);
 
 	rc_remove_instruction(inst);
 }
@@ -366,9 +405,8 @@ static void transform_LRP(struct radeon_compiler* c,
 static void transform_POW(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
-	struct rc_dst_register tempdst = dstreg(RC_FILE_TEMPORARY, tempreg);
-	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempreg);
+	struct rc_dst_register tempdst = dstregtmpmask(rc_find_free_temporary(c), RC_MASK_W); /* scratch .w must not alias the real dst */
+	struct rc_src_register tempsrc = srcreg(RC_FILE_TEMPORARY, tempdst.Index);
 	tempdst.WriteMask = RC_MASK_W;
 	tempsrc.Swizzle = RC_SWIZZLE_WWWW;
 
@@ -388,11 +426,11 @@ static void transform_RSQ(struct radeon_compiler* c,
 static void transform_SEQ(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_zero, builtin_one);
+		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -407,11 +445,11 @@ static void transform_SFL(struct radeon_compiler* c,
 static void transform_SGE(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -419,11 +457,11 @@ static void transform_SGE(struct radeon_compiler* c,
 static void transform_SGT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -431,11 +469,11 @@ static void transform_SGT(struct radeon_compiler* c,
 static void transform_SLE(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, negate(inst->U.I.SrcReg[0]), inst->U.I.SrcReg[1]);
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_zero, builtin_one);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_zero, builtin_one);
 
 	rc_remove_instruction(inst);
 }
@@ -443,11 +481,11 @@ static void transform_SLE(struct radeon_compiler* c,
 static void transform_SLT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		srcreg(RC_FILE_TEMPORARY, tempreg), builtin_one, builtin_zero);
+		srcreg(RC_FILE_TEMPORARY, dst.Index), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -455,11 +493,11 @@ static void transform_SLT(struct radeon_compiler* c,
 static void transform_SNE(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dstreg(RC_FILE_TEMPORARY, tempreg), inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
+	emit2(c, inst->Prev, RC_OPCODE_ADD, 0, dst, inst->U.I.SrcReg[0], negate(inst->U.I.SrcReg[1]));
 	emit3(c, inst->Prev, RC_OPCODE_CMP, inst->U.I.SaturateMode, inst->U.I.DstReg,
-		negate(absolute(srcreg(RC_FILE_TEMPORARY, tempreg))), builtin_one, builtin_zero);
+		negate(absolute(srcreg(RC_FILE_TEMPORARY, dst.Index))), builtin_one, builtin_zero);
 
 	rc_remove_instruction(inst);
 }
@@ -473,12 +511,13 @@ static void transform_SSG(struct radeon_compiler* c,
 	 *   CMP tmp1, x, 1, 0
 	 *   ADD result, tmp0, -tmp1;
 	 */
-	unsigned tmp0, tmp1;
+	struct rc_dst_register dst0;
+	unsigned tmp1;
 
 	/* 0 < x */
-	tmp0 = rc_find_free_temporary(c);
+	dst0 = try_to_reuse_dst(c, inst);
 	emit3(c, inst->Prev, RC_OPCODE_CMP, 0,
-	      dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+	      dst0,
 	      negate(inst->U.I.SrcReg[0]),
 	      builtin_one,
 	      builtin_zero);
@@ -495,7 +534,7 @@ static void transform_SSG(struct radeon_compiler* c,
 	/* result = tmp0 - tmp1 */
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
 	      inst->U.I.DstReg,
-	      srcreg(RC_FILE_TEMPORARY, tmp0),
+	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
 
 	rc_remove_instruction(inst);
@@ -517,15 +556,15 @@ static void transform_SWZ(struct radeon_compiler* c,
 static void transform_XPD(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
-	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dstreg(RC_FILE_TEMPORARY, tempreg),
+	emit2(c, inst->Prev, RC_OPCODE_MUL, 0, dst,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W));
 	emit3(c, inst->Prev, RC_OPCODE_MAD, inst->U.I.SaturateMode, inst->U.I.DstReg,
 		swizzle(inst->U.I.SrcReg[0], RC_SWIZZLE_Y, RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_W),
 		swizzle(inst->U.I.SrcReg[1], RC_SWIZZLE_Z, RC_SWIZZLE_X, RC_SWIZZLE_Y, RC_SWIZZLE_W),
-		negate(srcreg(RC_FILE_TEMPORARY, tempreg)));
+		negate(srcreg(RC_FILE_TEMPORARY, dst.Index)));
 
 	rc_remove_instruction(inst);
 }
@@ -553,6 +592,7 @@ int radeonTransformALU(
 	switch(inst->U.I.Opcode) {
 	case RC_OPCODE_ABS: transform_ABS(c, inst); return 1;
 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
+	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
 	case RC_OPCODE_DP2: transform_DP2(c, inst); return 1;
 	case RC_OPCODE_DPH: transform_DPH(c, inst); return 1;
 	case RC_OPCODE_DST: transform_DST(c, inst); return 1;
@@ -592,7 +632,7 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 {
 	/* There is no decent CMP available, so let's rig one up.
 	 * CMP is defined as dst = src0 < 0.0 ? src1 : src2
-	 * The following sequence consumes two temps and two extra slots
+	 * The following sequence consumes zero to two temps and two extra slots
 	 * (the second temp and the second slot is consumed by transform_LRP),
 	 * but should be equivalent:
 	 *
@@ -600,18 +640,18 @@ static void transform_r300_vertex_CMP(struct radeon_compiler* c,
 	 * LRP dst, tmp0, src1, src2
 	 *
 	 * Yes, I know, I'm a mad scientist. ~ C. & M. */
-	int tempreg0 = rc_find_free_temporary(c);
+	struct rc_dst_register dst = try_to_reuse_dst(c, inst);
 
 	/* SLT tmp0, src0, 0.0 */
 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
-		dstreg(RC_FILE_TEMPORARY, tempreg0),
+		dst,
 		inst->U.I.SrcReg[0], builtin_zero);
 
 	/* LRP dst, tmp0, src1, src2 */
 	transform_LRP(c,
 		emit3(c, inst->Prev, RC_OPCODE_LRP, 0,
 		      inst->U.I.DstReg,
-		      srcreg(RC_FILE_TEMPORARY, tempreg0), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
+		      srcreg(RC_FILE_TEMPORARY, dst.Index), inst->U.I.SrcReg[1],  inst->U.I.SrcReg[2]));
 
 	rc_remove_instruction(inst);
 }
@@ -642,7 +682,7 @@ static void transform_r300_vertex_DP3(struct radeon_compiler* c,
 static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
 	struct rc_instruction* inst)
 {
-	int tempreg = rc_find_free_temporary(c);
+	struct rc_dst_register dst = dstregtmpmask(rc_find_free_temporary(c), RC_MASK_XYZW); /* LIT reads the whole copy, so copy all components */
 	unsigned constant_swizzle;
 	int constant = rc_constants_add_immediate_scalar(&c->Program.Constants,
 							 0.0000000000000000001,
@@ -650,16 +690,16 @@ static void transform_r300_vertex_fix_LIT(struct radeon_compiler* c,
 
 	/* MOV dst, src */
 	emit1(c, inst->Prev, RC_OPCODE_MOV, 0,
-		dstreg(RC_FILE_TEMPORARY, tempreg),
+		dst,
 		inst->U.I.SrcReg[0]);
 
 	/* MAX dst.z, src, 0.00...001 */
 	emit2(c, inst->Prev, RC_OPCODE_MAX, 0,
-		dstregtmpmask(tempreg, RC_MASK_Y),
-		srcreg(RC_FILE_TEMPORARY, tempreg),
+		dstregtmpmask(dst.Index, RC_MASK_Y),
+		srcreg(RC_FILE_TEMPORARY, dst.Index),
 		srcregswz(RC_FILE_CONSTANT, constant, constant_swizzle));
 
-	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, tempreg);
+	inst->U.I.SrcReg[0] = srcreg(RC_FILE_TEMPORARY, dst.Index);
 }
 
 static void transform_r300_vertex_SEQ(struct radeon_compiler *c,
@@ -743,12 +783,13 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
 	 *   SLT tmp1, x, 0;
 	 *   ADD result, tmp0, -tmp1;
 	 */
-	unsigned tmp0, tmp1;
+	struct rc_dst_register dst0; /* assigned once below, as in transform_SSG */
+	unsigned tmp1;
 
 	/* 0 < x */
-	tmp0 = rc_find_free_temporary(c);
+	dst0 = try_to_reuse_dst(c, inst);
 	emit2(c, inst->Prev, RC_OPCODE_SLT, 0,
-	      dstregtmpmask(tmp0, inst->U.I.DstReg.WriteMask),
+	      dst0,
 	      builtin_zero,
 	      inst->U.I.SrcReg[0]);
 
@@ -763,7 +804,7 @@ static void transform_r300_vertex_SSG(struct radeon_compiler* c,
 	/* result = tmp0 - tmp1 */
 	emit2(c, inst->Prev, RC_OPCODE_ADD, 0,
 	      inst->U.I.DstReg,
-	      srcreg(RC_FILE_TEMPORARY, tmp0),
+	      srcreg(RC_FILE_TEMPORARY, dst0.Index),
 	      negate(srcreg(RC_FILE_TEMPORARY, tmp1)));
 
 	rc_remove_instruction(inst);
@@ -781,6 +822,7 @@ int r300_transform_vertex_alu(
 	switch(inst->U.I.Opcode) {
 	case RC_OPCODE_ABS: transform_r300_vertex_ABS(c, inst); return 1;
 	case RC_OPCODE_CEIL: transform_CEIL(c, inst); return 1;
+	case RC_OPCODE_CLAMP: transform_CLAMP(c, inst); return 1;
 	case RC_OPCODE_CMP: transform_r300_vertex_CMP(c, inst); return 1;
 	case RC_OPCODE_DP2: transform_r300_vertex_DP2(c, inst); return 1;
 	case RC_OPCODE_DP3: transform_r300_vertex_DP3(c, inst); return 1;
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
index 9dcd44c522d..45f79ece5ba 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_constants.h
@@ -181,4 +181,9 @@ static inline int rc_presubtract_src_reg_count(rc_presubtract_op op){
 		return 0;
 	}
 }
+
+#define RC_SOURCE_NONE  0x0
+#define RC_SOURCE_RGB   0x1
+#define RC_SOURCE_ALPHA 0x2
+
 #endif /* RADEON_PROGRAM_CONSTANTS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
index a21fe8d3df8..5905d26e521 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.c
@@ -27,6 +27,9 @@
 
 #include "radeon_program_pair.h"
 
+#include "radeon_compiler_util.h"
+
+#include <stdlib.h>
 
 /**
  * Return the source slot where we installed the given register access,
@@ -204,24 +207,37 @@ void rc_pair_foreach_source_that_rgb_reads(
 	}
 }
 
-/*return 0 for rgb, 1 for alpha -1 for error. */
-
-unsigned int rc_source_type_that_arg_reads(
-	unsigned int source,
-	unsigned int swizzle)
+struct rc_pair_instruction_source * rc_pair_get_src(
+	struct rc_pair_instruction * pair_inst,
+	struct rc_pair_instruction_arg * arg)
 {
-	unsigned int chan;
-	unsigned int swz = RC_SWIZZLE_UNUSED;
-	unsigned int ret = RC_PAIR_SOURCE_NONE;
-
-	for(chan = 0; chan < 3; chan++) {
-		swz = GET_SWZ(swizzle, chan);
-		if (swz == RC_SWIZZLE_W) {
-			ret |= RC_PAIR_SOURCE_ALPHA;
-		} else if (swz == RC_SWIZZLE_X || swz == RC_SWIZZLE_Y
-						|| swz == RC_SWIZZLE_Z) {
-			ret |= RC_PAIR_SOURCE_RGB;
+	unsigned int i, type;
+	unsigned int channels = 0;
+
+	for(i = 0; i < 3; i++) {
+		if (arg == pair_inst->RGB.Arg + i) {
+			channels = 3;
+			break;
 		}
 	}
-	return ret;
+
+	if (channels == 0) {
+		for (i = 0; i < 3; i++) {
+			if (arg == pair_inst->Alpha.Arg + i) {
+				channels = 1;
+				break;
+			}
+		}
+	}
+
+	assert(channels > 0);
+	type = rc_source_type_swz(arg->Swizzle, channels);
+
+	if (type & RC_SOURCE_RGB) {
+		return &pair_inst->RGB.Src[arg->Source];
+	} else if (type & RC_SOURCE_ALPHA) {
+		return &pair_inst->Alpha.Src[arg->Source];
+	} else {
+		return NULL;
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
index 54d44a2098b..ccf7a0070cd 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_pair.h
@@ -55,10 +55,6 @@ struct radeon_compiler;
  */
 #define RC_PAIR_PRESUB_SRC 3
 
-#define RC_PAIR_SOURCE_NONE  0x0
-#define RC_PAIR_SOURCE_RGB   0x1
-#define RC_PAIR_SOURCE_ALPHA 0x2
-
 struct rc_pair_instruction_source {
 	unsigned int Used:1;
 	unsigned int File:3;
@@ -115,9 +111,9 @@ void rc_pair_foreach_source_that_rgb_reads(
 	void * data,
 	rc_pair_foreach_src_fn cb);
 
-unsigned int rc_source_type_that_arg_reads(
-	unsigned int source,
-	unsigned int swizzle);
+struct rc_pair_instruction_source * rc_pair_get_src(
+	struct rc_pair_instruction * pair_inst,
+	struct rc_pair_instruction_arg * arg);
 /*@}*/
 
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
index 618ab5a099b..ae13f6742f8 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_print.c
@@ -129,6 +129,7 @@ static char rc_swizzle_char(unsigned int swz)
 	case RC_SWIZZLE_HALF: return 'H';
 	case RC_SWIZZLE_UNUSED: return '_';
 	}
+	fprintf(stderr, "bad swz: %u\n", swz);
 	return '?';
 }
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
index 530afa5e08e..f9d9f34b6ad 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_tex.c
@@ -28,6 +28,8 @@
 
 #include "radeon_program_tex.h"
 
+#include "radeon_compiler_util.h"
+
 /* Series of transformations to be done on textures. */
 
 static struct rc_src_register shadow_ambient(struct r300_fragment_program_compiler *compiler,
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c b/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c
index 5f67f536f61..7d76585a593 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_remove_constants.c
@@ -87,8 +87,9 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user)
 		rc_for_all_reads_src(inst, mark_used, &d);
 	}
 
-	/* Pass 2: If there is relative addressing, mark all externals as used. */
-	if (has_rel_addr) {
+	/* Pass 2: If there is relative addressing or dead constant elimination
+	 * is disabled, mark all externals as used. */
+	if (has_rel_addr || !c->remove_unused_constants) {
 		for (unsigned i = 0; i < c->Program.Constants.Count; i++)
 			if (constants[i].Type == RC_CONSTANT_EXTERNAL)
 				const_used[i] = 1;
@@ -119,7 +120,7 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user)
 	/*  is_identity ==> new_count == old_count
 	 * !is_identity ==> new_count <  old_count */
 	assert( is_identity || new_count <  c->Program.Constants.Count);
-	assert(!(has_rel_addr && are_externals_remapped));
+	assert(!((has_rel_addr || !c->remove_unused_constants) && are_externals_remapped));
 
 	/* Pass 4: Redirect reads of all constants to their new locations. */
 	if (!is_identity) {
@@ -127,7 +128,6 @@ void rc_remove_unused_constants(struct radeon_compiler *c, void *user)
 		     inst != &c->Program.Instructions; inst = inst->Next) {
 			rc_remap_registers(inst, remap_regs, inv_remap_table);
 		}
-
 	}
 
 	/* Set the new constant count. Note that new_count may be less than
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
index 60e228be5bd..88165f78953 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_rename_regs.c
@@ -33,100 +33,51 @@
 
 #include "radeon_compiler.h"
 #include "radeon_dataflow.h"
-
-struct reg_rename {
-	int old_index;
-	int new_index;
-	int temp_index;
-};
-
-static void rename_reg(void * data, struct rc_instruction * inst,
-			rc_register_file * file, unsigned int * index)
-{
-	struct reg_rename *r = data;
-
-	if(r->old_index == *index && *file == RC_FILE_TEMPORARY) {
-		*index = r->new_index;
-	}
-	else if(r->new_index == *index && *file == RC_FILE_TEMPORARY) {
-		*index = r->temp_index;
-	}
-}
-
-static void rename_all(
-	struct radeon_compiler *c,
-	struct rc_instruction * start,
-	unsigned int old,
-	unsigned int new,
-	unsigned int temp)
-{
-	struct rc_instruction * inst;
-	struct reg_rename r;
-	r.old_index = old;
-	r.new_index = new;
-	r.temp_index = temp;
-	for(inst = start; inst != &c->Program.Instructions;
-						inst = inst->Next) {
-		rc_remap_registers(inst, rename_reg, &r);
-	}
-}
+#include "radeon_program.h"
 
 /**
  * This function renames registers in an attempt to get the code close to
  * SSA form.  After this function has completed, most of the register are only
- * written to one time, with a few exceptions.  For example, this block of code
- * will not be modified by this function:
- * Mov Temp[0].x Const[0].x
- * Mov Temp[0].y Const[0].y
- * Basically, destination registers will be renamed if:
- * 1. There have been no previous writes to that register
- * or
- * 2. If the instruction is writting to the exact components (no more, no less)
- * of a register that has been written to by previous instructions.
+ * written to one time, with a few exceptions.
  *
  * This function assumes all the instructions are still of type
  * RC_INSTRUCTION_NORMAL.
  */
 void rc_rename_regs(struct radeon_compiler *c, void *user)
 {
-	unsigned int cur_index = 0;
-	unsigned int icount;
+	unsigned int i, used_length;
+	int new_index;
 	struct rc_instruction * inst;
-	unsigned int * masks;
+	struct rc_reader_data reader_data;
+	unsigned char * used;
 
-	/* The number of instructions in the program is also the maximum
-	 * number of temp registers that could potentially be used. */
-	icount = rc_recompute_ips(c);
-	masks = memory_pool_malloc(&c->Pool, icount * sizeof(unsigned int));
-	memset(masks, 0, icount * sizeof(unsigned int));
+	used_length = 2 * rc_recompute_ips(c);
+	used = memory_pool_malloc(&c->Pool, sizeof(unsigned char) * used_length);
+	memset(used, 0, sizeof(unsigned char) * used_length);
 
+	rc_get_used_temporaries(c, used, used_length);
 	for(inst = c->Program.Instructions.Next;
 					inst != &c->Program.Instructions;
 					inst = inst->Next) {
-		const struct rc_opcode_info * info;
-		unsigned int old_index, temp_index;
-		struct rc_dst_register * dst;
-		if(inst->Type != RC_INSTRUCTION_NORMAL) {
-			rc_error(c, "%s only works with normal instructions.",
-								__FUNCTION__);
-			return;
-		}
-		dst = &inst->U.I.DstReg;
-		info = rc_get_opcode_info(inst->U.I.Opcode);
-		if(!info->HasDstReg || dst->File != RC_FILE_TEMPORARY) {
+
+		if (inst->U.I.DstReg.File != RC_FILE_TEMPORARY)
 			continue;
+
+		rc_get_readers(c, inst, &reader_data, NULL, NULL, NULL);
+
+		if (reader_data.Abort || reader_data.ReaderCount == 0)
+			continue;
+
+		new_index = rc_find_free_temporary_list(c, used, used_length,
+						RC_MASK_XYZW);
+		if (new_index < 0) {
+			rc_error(c, "Ran out of temporary registers\n");
+			return;
 		}
-		if(dst->Index >= icount || !masks[dst->Index] ||
-					masks[dst->Index] == dst->WriteMask) {
-			old_index = dst->Index;
-			/* We need to set dst->Index here so get free temporary
-			 * will work. */
-			dst->Index = cur_index++;
-			temp_index = rc_find_free_temporary(c);
-			rename_all(c, inst->Next, old_index,
-						dst->Index, temp_index);
+
+		reader_data.Writer->U.I.DstReg.Index = new_index;
+		for(i = 0; i < reader_data.ReaderCount; i++) {
+			reader_data.Readers[i].U.Src->Index = new_index;
 		}
-		assert(dst->Index < icount);
-		masks[dst->Index] |= dst->WriteMask;
 	}
 }
diff --git a/src/mesa/drivers/dri/r600/evergreen_chip.c b/src/mesa/drivers/dri/r600/evergreen_chip.c
index 2c9e4e2b844..53dacbfdf39 100644
--- a/src/mesa/drivers/dri/r600/evergreen_chip.c
+++ b/src/mesa/drivers/dri/r600/evergreen_chip.c
@@ -286,7 +286,11 @@ static void evergreenSetupVTXConstants(struct gl_context  * ctx,
     if (!paos->bo)
 	    return;
 
-	r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_CEDAR) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_PALM))
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, TC_ACTION_ENA_bit);
+    else
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
 
     //uSQ_VTX_CONSTANT_WORD0_0
     uSQ_VTX_CONSTANT_WORD0_0 = paos->offset;
diff --git a/src/mesa/drivers/dri/r600/evergreen_state.c b/src/mesa/drivers/dri/r600/evergreen_state.c
index a77be183a12..076a608573c 100644
--- a/src/mesa/drivers/dri/r600/evergreen_state.c
+++ b/src/mesa/drivers/dri/r600/evergreen_state.c
@@ -1461,6 +1461,14 @@ static void evergreenInitSQConfig(struct gl_context * ctx)
         uMaxThreads = 248;
         uMaxStackEntries = 512;
 	    break;
+    case CHIP_FAMILY_PALM:
+	    uSqNumCfInsts       = 1;
+        bVC_ENABLE = GL_FALSE;
+        uMaxGPRs = 256;
+        uPSThreadCount = 96;
+        uMaxThreads = 192;
+        uMaxStackEntries = 256;
+	    break;
     default:
         uSqNumCfInsts       = 2;
         bVC_ENABLE = GL_TRUE;
diff --git a/src/mesa/drivers/dri/r600/evergreen_tex.c b/src/mesa/drivers/dri/r600/evergreen_tex.c
index 2f4c92d6767..3b5448a0e4e 100644
--- a/src/mesa/drivers/dri/r600/evergreen_tex.c
+++ b/src/mesa/drivers/dri/r600/evergreen_tex.c
@@ -31,7 +31,6 @@
 #include "main/enums.h"
 #include "main/image.h"
 #include "main/teximage.h"
-#include "main/mipmap.h"
 #include "main/simple_list.h"
 #include "main/texobj.h"
 
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index b6443bf0c53..aa1891eac32 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -259,7 +259,7 @@ static void r600InitConstValues(struct gl_context *ctx, radeonScreenPtr screen)
     R700_CHIP_CONTEXT *r700    = (R700_CHIP_CONTEXT*)(&context->hw);
 
     if(  (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_CEDAR)
-       &&(context->radeon.radeonScreen->chip_family <= CHIP_FAMILY_HEMLOCK) )
+       &&(context->radeon.radeonScreen->chip_family <= CHIP_FAMILY_PALM) )
     {
         r700->bShaderUseMemConstant = GL_TRUE;
     }
@@ -285,8 +285,13 @@ static void r600InitConstValues(struct gl_context *ctx, radeonScreenPtr screen)
 	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 	ctx->Const.MaxTextureLodBias = 16.0;
 
-	ctx->Const.MaxTextureLevels = 13; /* hw support 14 */
-	ctx->Const.MaxTextureRectSize = 4096; /* hw support 8192 */
+	if (screen->chip_family >= CHIP_FAMILY_CEDAR) {
+		ctx->Const.MaxTextureLevels = 15;
+		ctx->Const.MaxTextureRectSize = 16384;
+	} else {
+		ctx->Const.MaxTextureLevels = 14;
+		ctx->Const.MaxTextureRectSize = 8192;
+	}
 
 	ctx->Const.MinPointSize   = 0x0001 / 8.0;
 	ctx->Const.MinPointSizeAA = 0x0001 / 8.0;
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 2bf24096a0d..1fa559cec1a 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -3334,7 +3334,14 @@ GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
         return GL_FALSE;
     }
 
-    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
+    if(8 == pAsm->unAsic)
+    {
+	pAsm->D.dst.opcode = EG_OP3_INST_CNDGE;
+    }
+    else
+    {
+	pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
+    }
     pAsm->D.dst.op3     = 1;  
 
     tmp = (-1);
@@ -3416,8 +3423,14 @@ GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
     checkop1(pAsm);
 
     tmp = gethelpr(pAsm);
-
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -3457,7 +3470,14 @@ GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
     {
         return GL_FALSE;
     }
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -4742,7 +4762,14 @@ GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
 
     tmp = gethelpr(pAsm);
 
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -4782,7 +4809,14 @@ GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
     {
         return GL_FALSE;
     }
-    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_MULADD;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    }
     pAsm->D.dst.op3    = 1;
 
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
@@ -5010,7 +5044,14 @@ GLboolean assemble_SSG(r700_AssemblerBase *pAsm)
     
     GLuint tmp = gethelpr(pAsm);
     /* tmp = (src > 0 ? 1 : src) */
-    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_CNDGT;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    }
     pAsm->D.dst.op3    = 1;
     pAsm->D.dst.rtype = DST_REG_TEMPORARY;
     pAsm->D.dst.reg   = tmp;
@@ -5033,7 +5074,14 @@ GLboolean assemble_SSG(r700_AssemblerBase *pAsm)
     }
 
     /* dst = (-tmp > 0 ? -1 : tmp) */
-    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    if(8 == pAsm->unAsic)
+    {
+        pAsm->D.dst.opcode = EG_OP3_INST_CNDGT;
+    }
+    else
+    {
+        pAsm->D.dst.opcode = SQ_OP3_INST_CNDGT;
+    }
     pAsm->D.dst.op3    = 1;
 
     if( GL_FALSE == assemble_dst(pAsm) )
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index 61106fbc43f..82789cec5ed 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -440,6 +440,11 @@
 #define PCI_CHIP_HEMLOCK_689C           0x689C
 #define PCI_CHIP_HEMLOCK_689D           0x689D
 
+#define PCI_CHIP_PALM_9802              0x9802
+#define PCI_CHIP_PALM_9803              0x9803
+#define PCI_CHIP_PALM_9804              0x9804
+#define PCI_CHIP_PALM_9805              0x9805
+
 enum {
    CHIP_FAMILY_R100,
    CHIP_FAMILY_RV100,
@@ -483,6 +488,7 @@ enum {
    CHIP_FAMILY_JUNIPER,
    CHIP_FAMILY_CYPRESS,
    CHIP_FAMILY_HEMLOCK,
+   CHIP_FAMILY_PALM,
    CHIP_FAMILY_LAST
 };
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index fecdd119059..ca6ab46ca43 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -99,6 +99,7 @@ static const char* get_chip_family_name(int chip_family)
 	case CHIP_FAMILY_JUNIPER: return "JUNIPER";
 	case CHIP_FAMILY_CYPRESS: return "CYPRESS";
 	case CHIP_FAMILY_HEMLOCK: return "HEMLOCK";
+	case CHIP_FAMILY_PALM: return "PALM";
 	default: return "unknown";
 	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
index 088f9701722..a68a9768779 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
@@ -49,7 +49,7 @@ struct _radeon_mipmap_level {
 };
 
 /* store the max possible in the miptree */
-#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 13
+#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 15
 
 /**
  * A mipmap tree contains texture images in the layout that the hardware
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index b379240579d..94e56c2ade6 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -1155,6 +1155,14 @@ static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
        screen->chip_flags = RADEON_CHIPSET_TCL;
        break;
 
+    case PCI_CHIP_PALM_9802:
+    case PCI_CHIP_PALM_9803:
+    case PCI_CHIP_PALM_9804:
+    case PCI_CHIP_PALM_9805:
+       screen->chip_family = CHIP_FAMILY_PALM;
+       screen->chip_flags = RADEON_CHIPSET_TCL;
+       break;
+
    default:
       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
 	      device_id);
diff --git a/src/mesa/drivers/dri/sis/server/sis_dri.h b/src/mesa/drivers/dri/sis/server/sis_dri.h
index f0171f3c0f8..7d8f507115d 100644
--- a/src/mesa/drivers/dri/sis/server/sis_dri.h
+++ b/src/mesa/drivers/dri/sis/server/sis_dri.h
@@ -72,13 +72,4 @@ typedef struct {
   int dummy;
 } SISDRIContextRec, *SISDRIContextPtr;
 
-#ifdef XFree86Server
-
-#include "screenint.h"
-
-Bool SISDRIScreenInit(ScreenPtr pScreen);
-void SISDRICloseScreen(ScreenPtr pScreen);
-Bool SISDRIFinishScreenInit(ScreenPtr pScreen);
-
-#endif
 #endif
diff --git a/src/mesa/drivers/dri/tdfx/tdfx_context.h b/src/mesa/drivers/dri/tdfx/tdfx_context.h
index fb38419dcdd..7e2f0e00a8e 100644
--- a/src/mesa/drivers/dri/tdfx/tdfx_context.h
+++ b/src/mesa/drivers/dri/tdfx/tdfx_context.h
@@ -41,11 +41,7 @@
 
 #include <sys/time.h>
 #include "dri_util.h"
-#ifdef XFree86Server
-#include "GL/xf86glx.h"
-#else
 #include "main/glheader.h"
-#endif
 #if defined(__linux__)
 #include <signal.h>
 #endif
diff --git a/src/mesa/drivers/dri/unichrome/server/via_dri.h b/src/mesa/drivers/dri/unichrome/server/via_dri.h
index b47397d5728..c6eed03c1c9 100644
--- a/src/mesa/drivers/dri/unichrome/server/via_dri.h
+++ b/src/mesa/drivers/dri/unichrome/server/via_dri.h
@@ -35,9 +35,7 @@
 #define VIA_DRIDDX_VERSION_MINOR  0
 #define VIA_DRIDDX_VERSION_PATCH  0
 
-#ifndef XFree86Server
 typedef int Bool;
-#endif
 
 typedef struct {
     drm_handle_t handle;
diff --git a/src/mesa/drivers/windows/gdi/InitCritSections.cpp b/src/mesa/drivers/windows/gdi/InitCritSections.cpp
index 7145bffa510..69f03b8e47c 100644
--- a/src/mesa/drivers/windows/gdi/InitCritSections.cpp
+++ b/src/mesa/drivers/windows/gdi/InitCritSections.cpp
@@ -1,7 +1,8 @@
 #include "glapi.h"
 #include "glThread.h"
 
-#ifdef WIN32_THREADS
+#ifdef WIN32
+
 extern "C" _glthread_Mutex OneTimeLock;
 extern "C" _glthread_Mutex GenTexturesLock;
 
@@ -29,4 +30,4 @@ public:
 _CriticalSectionInit _CriticalSectionInit::m_inst;
 
 
-#endif
+#endif /* WIN32 */
diff --git a/src/mesa/drivers/x11/glxheader.h b/src/mesa/drivers/x11/glxheader.h
index d88afba20e7..ee002191bc0 100644
--- a/src/mesa/drivers/x11/glxheader.h
+++ b/src/mesa/drivers/x11/glxheader.h
@@ -32,13 +32,6 @@
 
 #include "main/glheader.h"
 
-#ifdef XFree86Server
-
-# include "xorg-server.h"
-# include "resource.h"
-# include "windowstr.h"
-
-#else
 
 # include <X11/Xlib.h>
 # include <X11/Xlibint.h>
@@ -51,7 +44,6 @@
 # include <GL/glx.h>
 # include <sys/time.h>
 
-#endif
 
 
 
diff --git a/src/mesa/drivers/x11/xm_api.c b/src/mesa/drivers/x11/xm_api.c
index 00ceb960c62..b5eabadf486 100644
--- a/src/mesa/drivers/x11/xm_api.c
+++ b/src/mesa/drivers/x11/xm_api.c
@@ -158,14 +158,12 @@ static short hpcr_rgbTbl[3][256] = {
 /**
  * Return the host's byte order as LSBFirst or MSBFirst ala X.
  */
-#ifndef XFree86Server
 static int host_byte_order( void )
 {
    int i = 1;
    char *cptr = (char *) &i;
    return (*cptr==1) ? LSBFirst : MSBFirst;
 }
-#endif
 
 
 /**
@@ -176,7 +174,7 @@ static int host_byte_order( void )
  */
 static int check_for_xshm( XMesaDisplay *display )
 {
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
    int major, minor, ignore;
    Bool pixmaps;
 
@@ -227,16 +225,6 @@ gamma_adjust( GLfloat gamma, GLint value, GLint max )
 static int
 bits_per_pixel( XMesaVisual xmv )
 {
-#ifdef XFree86Server
-   const int depth = xmv->nplanes;
-   int i;
-   assert(depth > 0);
-   for (i = 0; i < screenInfo.numPixmapFormats; i++) {
-      if (screenInfo.formats[i].depth == depth)
-         return screenInfo.formats[i].bitsPerPixel;
-   }
-   return depth;  /* should never get here, but this should be safe */
-#else
    XMesaDisplay *dpy = xmv->display;
    XMesaVisualInfo visinfo = xmv->visinfo;
    XMesaImage *img;
@@ -257,7 +245,6 @@ bits_per_pixel( XMesaVisual xmv )
    img->data = NULL;
    XMesaDestroyImage( img );
    return bitsPerPixel;
-#endif
 }
 
 
@@ -271,7 +258,6 @@ bits_per_pixel( XMesaVisual xmv )
  * Return:  GL_TRUE - window exists
  *          GL_FALSE - window doesn't exist
  */
-#ifndef XFree86Server
 static GLboolean WindowExistsFlag;
 
 static int window_exists_err_handler( XMesaDisplay* dpy, XErrorEvent* xerr )
@@ -306,7 +292,6 @@ get_drawable_size( XMesaDisplay *dpy, Drawable d, GLuint *width, GLuint *height
    *height = h;
    return stat;
 }
-#endif
 
 
 /**
@@ -319,10 +304,6 @@ void
 xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b,
                       GLuint *width, GLuint *height)
 {
-#ifdef XFree86Server
-   *width = MIN2(b->frontxrb->drawable->width, MAX_WIDTH);
-   *height = MIN2(b->frontxrb->drawable->height, MAX_HEIGHT);
-#else
    Status stat;
 
    _glthread_LOCK_MUTEX(_xmesa_lock);
@@ -335,7 +316,6 @@ xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b,
       _mesa_warning(NULL, "XGetGeometry failed!\n");
       *width = *height = 1;
    }
-#endif
 }
 
 
@@ -549,16 +529,11 @@ noFaultXAllocColor( int client,
                     XMesaColor *color,
                     int *exact, int *alloced )
 {
-#ifdef XFree86Server
-   Pixel *ppixIn;
-   xrgb *ctable;
-#else
    /* we'll try to cache ctable for better remote display performance */
    static Display *prevDisplay = NULL;
    static XMesaColormap prevCmap = 0;
    static int prevCmapSize = 0;
    static XMesaColor *ctable = NULL;
-#endif
    XMesaColor subColor;
    int i, bestmatch;
    double mindist;       /* 3*2^16^2 exceeds long int precision. */
@@ -566,14 +541,7 @@ noFaultXAllocColor( int client,
    (void) client;
 
    /* First try just using XAllocColor. */
-#ifdef XFree86Server
-   if (AllocColor(cmap,
-		  &color->red, &color->green, &color->blue,
-		  &color->pixel,
-		  client) == Success)
-#else
    if (XAllocColor(dpy, cmap, color))
-#endif
    {
       *exact = 1;
       *alloced = 1;
@@ -584,14 +552,6 @@ noFaultXAllocColor( int client,
 
    /* Retrieve color table entries. */
    /* XXX alloca candidate. */
-#ifdef XFree86Server
-   ppixIn = (Pixel *) MALLOC(cmapSize * sizeof(Pixel));
-   ctable = (xrgb *) MALLOC(cmapSize * sizeof(xrgb));
-   for (i = 0; i < cmapSize; i++) {
-      ppixIn[i] = i;
-   }
-   QueryColors(cmap, cmapSize, ppixIn, ctable);
-#else
    if (prevDisplay != dpy || prevCmap != cmap
        || prevCmapSize != cmapSize || !ctable) {
       /* free previously cached color table */
@@ -608,7 +568,6 @@ noFaultXAllocColor( int client,
       prevCmap = cmap;
       prevCmapSize = cmapSize;
    }
-#endif
 
    /* Find best match. */
    bestmatch = -1;
@@ -632,14 +591,7 @@ noFaultXAllocColor( int client,
     * fail if the cell is read/write.  Otherwise, we're incrementing
     * the cell's reference count.
     */
-#ifdef XFree86Server
-   if (AllocColor(cmap,
-		  &subColor.red, &subColor.green, &subColor.blue,
-		  &subColor.pixel,
-		  client) == Success) {
-#else
    if (XAllocColor(dpy, cmap, &subColor)) {
-#endif
       *alloced = 1;
    }
    else {
@@ -651,12 +603,7 @@ noFaultXAllocColor( int client,
       subColor.flags = DoRed | DoGreen | DoBlue;
       *alloced = 0;
    }
-#ifdef XFree86Server
-   free(ppixIn);
-   free(ctable);
-#else
    /* don't free table, save it for next time */
-#endif
 
    *color = subColor;
    *exact = 0;
@@ -873,10 +820,8 @@ setup_8bit_hpcr(XMesaVisual v)
       v->hpcr_clear_pixmap = XMesaCreatePixmap(v->display,
                                                DefaultRootWindow(v->display),
                                                16, 2, 8);
-#ifndef XFree86Server
       v->hpcr_clear_ximage = XGetImage(v->display, v->hpcr_clear_pixmap,
                                        0, 0, 16, 2, AllPlanes, ZPixmap);
-#endif
    }
 }
 
@@ -1049,9 +994,6 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
    int client = 0;
    const int xclass = v->visualType;
 
-#ifdef XFree86Server
-   client = (window) ? CLIENT_ID(window->id) : 0;
-#endif
 
    ASSERT(!b || b->xm_visual == v);
 
@@ -1120,40 +1062,23 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
       }
 
       /* X11 graphics contexts */
-#ifdef XFree86Server
-      b->gc = CreateScratchGC(v->display, window->depth);
-#else
       b->gc = XCreateGC( v->display, window, 0, NULL );
-#endif
       XMesaSetFunction( v->display, b->gc, GXcopy );
 
       /* cleargc - for glClear() */
-#ifdef XFree86Server
-      b->cleargc = CreateScratchGC(v->display, window->depth);
-#else
       b->cleargc = XCreateGC( v->display, window, 0, NULL );
-#endif
       XMesaSetFunction( v->display, b->cleargc, GXcopy );
 
       /*
        * Don't generate Graphics Expose/NoExpose events in swapbuffers().
        * Patch contributed by Michael Pichler May 15, 1995.
        */
-#ifdef XFree86Server
-      b->swapgc = CreateScratchGC(v->display, window->depth);
-      {
-         CARD32 v[1];
-         v[0] = FALSE;
-         dixChangeGC(NullClient, b->swapgc, GCGraphicsExposures, v, NULL);
-      }
-#else
       {
          XGCValues gcvalues;
          gcvalues.graphics_exposures = False;
          b->swapgc = XCreateGC(v->display, window,
                                GCGraphicsExposures, &gcvalues);
       }
-#endif
       XMesaSetFunction( v->display, b->swapgc, GXcopy );
       /*
        * Set fill style and tile pixmap once for all for HPCR stuff
@@ -1175,9 +1100,6 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
 
       /* Initialize the row buffer XImage for use in write_color_span() */
       data = (char*) MALLOC(MAX_WIDTH*4);
-#ifdef XFree86Server
-      b->rowimage = XMesaCreateImage(GET_VISUAL_DEPTH(v), MAX_WIDTH, 1, data);
-#else
       b->rowimage = XCreateImage( v->display,
                                   v->visinfo->visual,
                                   v->visinfo->depth,
@@ -1186,7 +1108,6 @@ initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b,
                                   MAX_WIDTH, 1,         /*width, height*/
                                   32,                   /*bitmap_pad*/
                                   0                     /*bytes_per_line*/ );
-#endif
       if (!b->rowimage)
          return GL_FALSE;
    }
@@ -1334,7 +1255,6 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
    XMesaVisual v;
    GLint red_bits, green_bits, blue_bits, alpha_bits;
 
-#ifndef XFree86Server
    /* For debugging only */
    if (_mesa_getenv("MESA_XSYNC")) {
       /* This makes debugging X easier.
@@ -1343,7 +1263,6 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
        */
       XSynchronize( display, 1 );
    }
-#endif
 
    /* Color-index rendering not supported. */
    if (!rgb_flag)
@@ -1360,14 +1279,12 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
     * the struct but we may need some of the information contained in it
     * at a later time.
     */
-#ifndef XFree86Server
    v->visinfo = (XVisualInfo *) MALLOC(sizeof(*visinfo));
    if(!v->visinfo) {
       free(v);
       return NULL;
    }
    memcpy(v->visinfo, visinfo, sizeof(*visinfo));
-#endif
 
    /* check for MESA_GAMMA environment variable */
    gamma = _mesa_getenv("MESA_GAMMA");
@@ -1384,30 +1301,13 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
 
    v->ximage_flag = ximage_flag;
 
-#ifdef XFree86Server
-   /* We could calculate these values by ourselves.  nplanes is either the sum
-    * of the red, green, and blue bits or the number index bits.
-    * ColormapEntries is either (1U << index_bits) or
-    * (1U << max(redBits, greenBits, blueBits)).
-    */
-   assert(visinfo->nplanes > 0);
-   v->nplanes = visinfo->nplanes;
-   v->ColormapEntries = visinfo->ColormapEntries;
-
-   v->mesa_visual.redMask = visinfo->redMask;
-   v->mesa_visual.greenMask = visinfo->greenMask;
-   v->mesa_visual.blueMask = visinfo->blueMask;
-   v->visualID = visinfo->vid;
-   v->screen = 0; /* FIXME: What should be done here? */
-#else
    v->mesa_visual.redMask = visinfo->red_mask;
    v->mesa_visual.greenMask = visinfo->green_mask;
    v->mesa_visual.blueMask = visinfo->blue_mask;
    v->visualID = visinfo->visualid;
    v->screen = visinfo->screen;
-#endif
 
-#if defined(XFree86Server) || !(defined(__cplusplus) || defined(c_plusplus))
+#if !(defined(__cplusplus) || defined(c_plusplus))
    v->visualType = xmesa_convert_from_x_visual_type(visinfo->class);
 #else
    v->visualType = xmesa_convert_from_x_visual_type(visinfo->c_class);
@@ -1461,9 +1361,7 @@ XMesaVisual XMesaCreateVisual( XMesaDisplay *display,
 PUBLIC
 void XMesaDestroyVisual( XMesaVisual v )
 {
-#ifndef XFree86Server
    free(v->visinfo);
-#endif
    free(v);
 }
 
@@ -1532,12 +1430,6 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
     _mesa_enable_extension(mesaCtx, "GL_EXT_timer_query");
 #endif
 
-#ifdef XFree86Server
-   /* If we're running in the X server, do bounds checking to prevent
-    * segfaults and server crashes!
-    */
-   mesaCtx->Const.CheckArrayBounds = GL_TRUE;
-#endif
 
    /* finish up xmesa context initializations */
    c->swapbytes = CHECK_BYTE_ORDER(v) ? GL_FALSE : GL_TRUE;
@@ -1602,9 +1494,7 @@ void XMesaDestroyContext( XMesaContext c )
 PUBLIC XMesaBuffer
 XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
 {
-#ifndef XFree86Server
    XWindowAttributes attr;
-#endif
    XMesaBuffer b;
    XMesaColormap cmap;
    int depth;
@@ -1613,12 +1503,8 @@ XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
    assert(w);
 
    /* Check that window depth matches visual depth */
-#ifdef XFree86Server
-   depth = ((XMesaDrawable)w)->depth;
-#else
    XGetWindowAttributes( v->display, w, &attr );
    depth = attr.depth;
-#endif
    if (GET_VISUAL_DEPTH(v) != depth) {
       _mesa_warning(NULL, "XMesaCreateWindowBuffer: depth mismatch between visual (%d) and window (%d)!\n",
                     GET_VISUAL_DEPTH(v), depth);
@@ -1626,9 +1512,6 @@ XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
    }
 
    /* Find colormap */
-#ifdef XFree86Server
-   cmap = (ColormapPtr)LookupIDByType(wColormap(w), RT_COLORMAP);
-#else
    if (attr.colormap) {
       cmap = attr.colormap;
    }
@@ -1638,7 +1521,6 @@ XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w)
       /* OK, let's just allocate a new one and hope for the best */
       cmap = XCreateColormap(v->display, w, attr.visual, AllocNone);
    }
-#endif
 
    b = create_xmesa_buffer((XMesaDrawable) w, WINDOW, v, cmap);
    if (!b)
@@ -1748,7 +1630,6 @@ XMesaBuffer
 XMesaCreatePBuffer(XMesaVisual v, XMesaColormap cmap,
                    unsigned int width, unsigned int height)
 {
-#ifndef XFree86Server
    XMesaWindow root;
    XMesaDrawable drawable;  /* X Pixmap Drawable */
    XMesaBuffer b;
@@ -1770,9 +1651,6 @@ XMesaCreatePBuffer(XMesaVisual v, XMesaColormap cmap,
    }
 
    return b;
-#else
-   return 0;
-#endif
 }
 
 
@@ -1931,40 +1809,6 @@ XMesaBuffer XMesaGetCurrentReadBuffer( void )
 }
 
 
-#ifdef XFree86Server
-PUBLIC
-GLboolean XMesaForceCurrent(XMesaContext c)
-{
-   if (c) {
-      _glapi_set_dispatch(c->mesa.CurrentDispatch);
-
-      if (&(c->mesa) != _mesa_get_current_context()) {
-	 _mesa_make_current(&c->mesa, c->mesa.DrawBuffer, c->mesa.ReadBuffer);
-      }
-   }
-   else {
-      _mesa_make_current(NULL, NULL, NULL);
-   }
-   return GL_TRUE;
-}
-
-
-PUBLIC
-GLboolean XMesaLoseCurrent(XMesaContext c)
-{
-   (void) c;
-   _mesa_make_current(NULL, NULL, NULL);
-   return GL_TRUE;
-}
-
-
-PUBLIC
-GLboolean XMesaCopyContext( XMesaContext xm_src, XMesaContext xm_dst, GLuint mask )
-{
-   _mesa_copy_context(&xm_src->mesa, &xm_dst->mesa, mask);
-   return GL_TRUE;
-}
-#endif /* XFree86Server */
 
 
 #ifndef FX
@@ -2004,7 +1848,7 @@ void XMesaSwapBuffers( XMesaBuffer b )
 #endif
       if (b->backxrb->ximage) {
 	 /* Copy Ximage (back buf) from client memory to server window */
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
 	 if (b->shm) {
             /*_glthread_LOCK_MUTEX(_xmesa_lock);*/
 	    XShmPutImage( b->xm_visual->display, b->frontxrb->drawable,
@@ -2041,9 +1885,7 @@ void XMesaSwapBuffers( XMesaBuffer b )
       if (b->swAlpha)
          _mesa_copy_soft_alpha_renderbuffers(ctx, &b->mesa_buffer);
    }
-#if !defined(XFree86Server)
    XSync( b->xm_visual->display, False );
-#endif
 }
 
 
@@ -2074,7 +1916,7 @@ void XMesaCopySubBuffer( XMesaBuffer b, int x, int y, int width, int height )
 #endif
       if (b->backxrb->ximage) {
          /* Copy Ximage from host's memory to server's window */
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
          if (b->shm) {
             /* XXX assuming width and height aren't too large! */
             XShmPutImage( b->xm_visual->display, b->frontxrb->drawable,
@@ -2116,7 +1958,6 @@ void XMesaCopySubBuffer( XMesaBuffer b, int x, int y, int width, int height )
  * Return:  GL_TRUE = context is double buffered
  *          GL_FALSE = context is single buffered
  */
-#ifndef XFree86Server
 GLboolean XMesaGetBackBuffer( XMesaBuffer b,
                               XMesaPixmap *pixmap,
                               XMesaImage **ximage )
@@ -2134,7 +1975,6 @@ GLboolean XMesaGetBackBuffer( XMesaBuffer b,
       return GL_FALSE;
    }
 }
-#endif /* XFree86Server */
 
 
 /*
@@ -2171,11 +2011,7 @@ GLboolean XMesaGetDepthBuffer( XMesaBuffer b, GLint *width, GLint *height,
 void XMesaFlush( XMesaContext c )
 {
    if (c && c->xm_visual) {
-#ifdef XFree86Server
-      /* NOT_NEEDED */
-#else
       XSync( c->xm_visual->display, False );
-#endif
    }
 }
 
@@ -2234,15 +2070,11 @@ void XMesaGarbageCollect( void )
    for (b=XMesaBufferList; b; b=next) {
       next = b->Next;
       if (b->display && b->frontxrb->drawable && b->type == WINDOW) {
-#ifdef XFree86Server
-	 /* NOT_NEEDED */
-#else
          XSync(b->display, False);
          if (!window_exists( b->display, b->frontxrb->drawable )) {
             /* found a dead window, free the ancillary info */
             XMesaDestroyBuffer( b );
          }
-#endif
       }
    }
 }
diff --git a/src/mesa/drivers/x11/xm_buffer.c b/src/mesa/drivers/x11/xm_buffer.c
index 2683bd44d19..10829b4284f 100644
--- a/src/mesa/drivers/x11/xm_buffer.c
+++ b/src/mesa/drivers/x11/xm_buffer.c
@@ -37,7 +37,7 @@
 #include "main/renderbuffer.h"
 
 
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
 static volatile int mesaXErrorFlag = 0;
 
 /**
@@ -170,7 +170,7 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
    if (b->db_mode == BACK_XIMAGE) {
       /* Deallocate the old backxrb->ximage, if any */
       if (b->backxrb->ximage) {
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
 	 if (b->shm) {
 	    XShmDetach(b->xm_visual->display, &b->shminfo);
 	    XDestroyImage(b->backxrb->ximage);
@@ -188,10 +188,6 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
       /* Allocate new back buffer */
       if (b->shm == 0 || !alloc_back_shm_ximage(b, width, height)) {
 	 /* Allocate a regular XImage for the back buffer. */
-#ifdef XFree86Server
-	 b->backxrb->ximage = XMesaCreateImage(b->xm_visual->BitsPerPixel,
-                                               width, height, NULL);
-#else
 	 b->backxrb->ximage = XCreateImage(b->xm_visual->display,
                                       b->xm_visual->visinfo->visual,
                                       GET_VISUAL_DEPTH(b->xm_visual),
@@ -199,7 +195,6 @@ alloc_back_buffer(XMesaBuffer b, GLuint width, GLuint height)
 				      NULL,
                                       width, height,
 				      8, 0);  /* pad, bytes_per_line */
-#endif
 	 if (!b->backxrb->ximage) {
 	    _mesa_warning(NULL, "alloc_back_buffer: XCreateImage failed.\n");
             return;
@@ -359,16 +354,8 @@ xmesa_delete_framebuffer(struct gl_framebuffer *fb)
    if (b->num_alloced > 0) {
       /* If no other buffer uses this X colormap then free the colors. */
       if (!xmesa_find_buffer(b->display, b->cmap, b)) {
-#ifdef XFree86Server
-         int client = 0;
-         if (b->frontxrb->drawable)
-            client = CLIENT_ID(b->frontxrb->drawable->id);
-         (void)FreeColors(b->cmap, client,
-                          b->num_alloced, b->alloced_colors, 0);
-#else
          XFreeColors(b->display, b->cmap,
                      b->alloced_colors, b->num_alloced, 0);
-#endif
       }
    }
 
@@ -382,7 +369,7 @@ xmesa_delete_framebuffer(struct gl_framebuffer *fb)
    if (fb->Visual.doubleBufferMode) {
       /* free back ximage/pixmap/shmregion */
       if (b->backxrb->ximage) {
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
          if (b->shm) {
             XShmDetach( b->display, &b->shminfo );
             XDestroyImage( b->backxrb->ximage );
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index acece2025cf..b8d9e20c426 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -93,16 +93,12 @@ const int xmesa_kernel1[16] = {
 static void
 finish_or_flush( struct gl_context *ctx )
 {
-#ifdef XFree86Server
-      /* NOT_NEEDED */
-#else
    const XMesaContext xmesa = XMESA_CONTEXT(ctx);
    if (xmesa) {
       _glthread_LOCK_MUTEX(_xmesa_lock);
       XSync( xmesa->display, False );
       _glthread_UNLOCK_MUTEX(_xmesa_lock);
    }
-#endif
 }
 
 
@@ -388,7 +384,6 @@ clear_buffers(struct gl_context *ctx, GLbitfield buffers)
 }
 
 
-#ifndef XFree86Server
 /* XXX these functions haven't been tested in the Xserver environment */
 
 
@@ -731,7 +726,6 @@ xmesa_CopyPixels( struct gl_context *ctx,
    }
 }
 
-#endif /* XFree86Server */
 
 
 
@@ -745,17 +739,9 @@ get_string( struct gl_context *ctx, GLenum name )
    (void) ctx;
    switch (name) {
       case GL_RENDERER:
-#ifdef XFree86Server
-         return (const GLubyte *) "Mesa GLX Indirect";
-#else
          return (const GLubyte *) "Mesa X11";
-#endif
       case GL_VENDOR:
-#ifdef XFree86Server
-         return (const GLubyte *) "Mesa project: www.mesa3d.org";
-#else
          return NULL;
-#endif
       default:
          return NULL;
    }
@@ -948,43 +934,6 @@ xmesa_update_state( struct gl_context *ctx, GLbitfield new_state )
 
 
 /**
- * Called via ctx->Driver.TestProxyTeximage().  Normally, we'd just use
- * the _mesa_test_proxy_teximage() fallback function, but we're going to
- * special-case the 3D texture case to allow textures up to 512x512x32
- * texels.
- */
-static GLboolean
-test_proxy_teximage(struct gl_context *ctx, GLenum target, GLint level,
-                    GLint internalFormat, GLenum format, GLenum type,
-                    GLint width, GLint height, GLint depth, GLint border)
-{
-   if (target == GL_PROXY_TEXTURE_3D) {
-      /* special case for 3D textures */
-      if (width * height * depth > 512 * 512 * 64 ||
-          width  < 2 * border ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           _mesa_bitcount(width  - 2 * border) != 1) ||
-          height < 2 * border ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           _mesa_bitcount(height - 2 * border) != 1) ||
-          depth  < 2 * border ||
-          (!ctx->Extensions.ARB_texture_non_power_of_two &&
-           _mesa_bitcount(depth  - 2 * border) != 1)) {
-         /* Bad size, or too many texels */
-         return GL_FALSE;
-      }
-      return GL_TRUE;
-   }
-   else {
-      /* use the fallback routine for 1D, 2D, cube and rect targets */
-      return _mesa_test_proxy_teximage(ctx, target, level, internalFormat,
-                                       format, type, width, height, depth,
-                                       border);
-   }
-}
-
-
-/**
  * In SW, we don't really compress GL_COMPRESSED_RGB[A] textures!
  */
 static gl_format
@@ -1124,7 +1073,6 @@ xmesa_init_driver_functions( XMesaVisual xmvisual,
    }
    else {
       driver->Clear = clear_buffers;
-#ifndef XFree86Server
       driver->CopyPixels = xmesa_CopyPixels;
       if (xmvisual->undithered_pf == PF_8R8G8B &&
           xmvisual->dithered_pf == PF_8R8G8B &&
@@ -1134,9 +1082,8 @@ xmesa_init_driver_functions( XMesaVisual xmvisual,
       else if (xmvisual->undithered_pf == PF_5R6G5B) {
          driver->DrawPixels = xmesa_DrawPixels_5R6G5B;
       }
-#endif
    }
-   driver->TestProxyTexImage = test_proxy_teximage;
+
 #if ENABLE_EXT_texure_compression_s3tc
    driver->ChooseTextureFormat = choose_tex_format;
 #else
diff --git a/src/mesa/drivers/x11/xm_glide.c b/src/mesa/drivers/x11/xm_glide.c
index cbd69b011a1..d8a0e6de6d0 100644
--- a/src/mesa/drivers/x11/xm_glide.c
+++ b/src/mesa/drivers/x11/xm_glide.c
@@ -140,16 +140,8 @@ static void FXgetImage( XMesaBuffer b )
    GLuint x, y;
    GLuint width, height;
 
-#ifdef XFree86Server
-   x = b->frontxrb->pixmap->x;
-   y = b->frontxrb->pixmap->y;
-   width = b->frontxrb->pixmap->width;
-   height = b->frontxrb->pixmap->height;
-   depth = b->frontxrb->pixmap->depth;
-#else
    xmesa_get_window_size(b->display, b, &width, &height);
    x = y = 0;
-#endif
    if (b->mesa_buffer.Width != width || b->mesa_buffer.Height != height) {
       b->mesa_buffer.Width = MIN2((int)width, b->FXctx->width);
       b->mesa_buffer.Height = MIN2((int)height, b->FXctx->height);
diff --git a/src/mesa/drivers/x11/xm_image.c b/src/mesa/drivers/x11/xm_image.c
index 087b4e4c3a7..12fef7dad34 100644
--- a/src/mesa/drivers/x11/xm_image.c
+++ b/src/mesa/drivers/x11/xm_image.c
@@ -37,97 +37,3 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "glxheader.h"
 #include "xmesaP.h"
 
-#ifdef XFree86Server
-
-#ifdef ROUNDUP
-#undef ROUNDUP
-#endif
-
-#define ROUNDUP(nbytes, pad) ((((nbytes) + ((pad)-1)) / (pad)) * ((pad)>>3))
-
-XMesaImage *XMesaCreateImage(int bitsPerPixel, int width, int height, char *data)
-{
-    XMesaImage *image;
-
-    image = (XMesaImage *)xalloc(sizeof(XMesaImage));
-
-    if (image) {
-	image->width = width;
-	image->height = height;
-	image->data = data;
-	/* Always pad to 32 bits */
-	image->bytes_per_line = ROUNDUP((bitsPerPixel * width), 32);
-	image->bits_per_pixel = bitsPerPixel;
-    }
-
-    return image;
-}
-
-void XMesaDestroyImage(XMesaImage *image)
-{
-    if (image->data)
-	free(image->data);
-    xfree(image);
-}
-
-unsigned long XMesaGetPixel(XMesaImage *image, int x, int y)
-{
-    CARD8  *row = (CARD8 *)(image->data + y*image->bytes_per_line);
-    CARD8  *i8;
-    CARD16 *i16;
-    CARD32 *i32;
-    switch (image->bits_per_pixel) {
-    case 8:
-	i8 = (CARD8 *)row;
-	return i8[x];
-	break;
-    case 15:
-    case 16:
-	i16 = (CARD16 *)row;
-	return i16[x];
-	break;
-    case 24: /* WARNING: architecture specific code */
-	i8 = (CARD8 *)row;
-	return (((CARD32)i8[x*3]) |
-		(((CARD32)i8[x*3+1])<<8) |
-		(((CARD32)i8[x*3+2])<<16));
-	break;
-    case 32:
-	i32 = (CARD32 *)row;
-	return i32[x];
-	break;
-    }
-    return 0;
-}
-
-#ifndef XMESA_USE_PUTPIXEL_MACRO
-void XMesaPutPixel(XMesaImage *image, int x, int y, unsigned long pixel)
-{
-    CARD8  *row = (CARD8 *)(image->data + y*image->bytes_per_line);
-    CARD8  *i8;
-    CARD16 *i16;
-    CARD32 *i32;
-    switch (image->bits_per_pixel) {
-    case 8:
-	i8 = (CARD8 *)row;
-	i8[x] = (CARD8)pixel;
-	break;
-    case 15:
-    case 16:
-	i16 = (CARD16 *)row;
-	i16[x] = (CARD16)pixel;
-	break;
-    case 24: /* WARNING: architecture specific code */
-	i8 = (CARD8 *)__row;
-	i8[x*3]   = (CARD8)(p);
-	i8[x*3+1] = (CARD8)(p>>8);
-	i8[x*3+2] = (CARD8)(p>>16);
-    case 32:
-	i32 = (CARD32 *)row;
-	i32[x] = (CARD32)pixel;
-	break;
-    }
-}
-#endif
-
-#endif /* XFree86Server */
diff --git a/src/mesa/drivers/x11/xm_line.c b/src/mesa/drivers/x11/xm_line.c
index f03f99f918f..04cedcd4ec0 100644
--- a/src/mesa/drivers/x11/xm_line.c
+++ b/src/mesa/drivers/x11/xm_line.c
@@ -537,7 +537,6 @@ void xmesa_choose_point( struct gl_context *ctx )
 
 
 
-#ifndef XFree86Server
 /**
  * Draw fast, XOR line with XDrawLine in front color buffer.
  * WARNING: this isn't fully OpenGL conformant because different pixels
@@ -567,7 +566,6 @@ xor_line(struct gl_context *ctx, const SWvertex *vert0, const SWvertex *vert1)
    XDrawLine(dpy, xrb->pixmap, gc, x0, y0, x1, y1);
    XMesaSetFunction(dpy, gc, GXcopy);  /* this gc is used elsewhere */
 }
-#endif /* XFree86Server */
 
 
 #endif /* CHAN_BITS == 8 */
@@ -660,7 +658,6 @@ get_line_func(struct gl_context *ctx)
       }
    }
 
-#ifndef XFree86Server
    if (ctx->DrawBuffer->_NumColorDrawBuffers == 1
        && ctx->DrawBuffer->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT
        && swrast->_RasterMask == LOGIC_OP_BIT
@@ -669,7 +666,6 @@ get_line_func(struct gl_context *ctx)
        && !ctx->Line.SmoothFlag) {
       return xor_line;
    }
-#endif /* XFree86Server */
 
 #endif /* CHAN_BITS == 8 */
    return (swrast_line_func) NULL;
diff --git a/src/mesa/drivers/x11/xm_span.c b/src/mesa/drivers/x11/xm_span.c
index ab66c5e1f12..294b93a57cc 100644
--- a/src/mesa/drivers/x11/xm_span.c
+++ b/src/mesa/drivers/x11/xm_span.c
@@ -42,7 +42,6 @@
  * generate BadMatch errors if the drawable isn't mapped.
  */
 
-#ifndef XFree86Server
 static int caught_xgetimage_error = 0;
 static int (*old_xerror_handler)( XMesaDisplay *dpy, XErrorEvent *ev );
 static unsigned long xgetimage_serial;
@@ -87,7 +86,6 @@ static int check_xgetimage_errors( void )
    /* return 0=no error, 1=error caught */
    return caught_xgetimage_error;
 }
-#endif
 
 
 /*
@@ -97,7 +95,6 @@ static unsigned long read_pixel( XMesaDisplay *dpy,
                                  XMesaDrawable d, int x, int y )
 {
    unsigned long p;
-#ifndef XFree86Server
    XMesaImage *pixel = NULL;
    int error;
 
@@ -113,9 +110,6 @@ static unsigned long read_pixel( XMesaDisplay *dpy,
    if (pixel) {
       XMesaDestroyImage( pixel );
    }
-#else
-   (*dpy->GetImage)(d, x, y, 1, 1, ZPixmap, ~0L, (pointer)&p);
-#endif
    return p;
 }
 
@@ -3763,7 +3757,6 @@ static void put_values_ci_ximage( PUT_VALUES_ARGS )
 /*****                      Pixel reading                         *****/
 /**********************************************************************/
 
-#ifndef XFree86Server
 /**
  * Do clip testing prior to calling XGetImage.  If any of the region lies
  * outside the screen's bounds, XGetImage will return NULL.
@@ -3806,7 +3799,6 @@ clip_for_xgetimage(struct gl_context *ctx, XMesaPixmap pixmap, GLuint *n, GLint
    }
    return 0;
 }
-#endif
 
 
 /*
@@ -3824,7 +3816,6 @@ get_row_ci(struct gl_context *ctx, struct gl_renderbuffer *rb,
    y = YFLIP(xrb, y);
 
    if (xrb->pixmap) {
-#ifndef XFree86Server
       XMesaImage *span = NULL;
       int error;
       int k = clip_for_xgetimage(ctx, xrb->pixmap, &n, &x, &y);
@@ -3850,11 +3841,6 @@ get_row_ci(struct gl_context *ctx, struct gl_renderbuffer *rb,
       if (span) {
 	 XMesaDestroyImage( span );
       }
-#else
-      (*xmesa->display->GetImage)(xrb->drawable,
-				  x, y, n, 1, ZPixmap,
-				  ~0L, (pointer)index);
-#endif
    }
    else if (xrb->ximage) {
       XMesaImage *img = xrb->ximage;
@@ -3882,14 +3868,6 @@ get_row_rgba(struct gl_context *ctx, struct gl_renderbuffer *rb,
       /* Read from Pixmap or Window */
       XMesaImage *span = NULL;
       int error;
-#ifdef XFree86Server
-      span = XMesaCreateImage(xmesa->xm_visual->BitsPerPixel, n, 1, NULL);
-      span->data = (char *)MALLOC(span->height * span->bytes_per_line);
-      error = (!span->data);
-      (*xmesa->display->GetImage)(xrb->drawable,
-				  x, YFLIP(xrb, y), n, 1, ZPixmap,
-				  ~0L, (pointer)span->data);
-#else
       int k;
       y = YFLIP(xrb, y);
       k = clip_for_xgetimage(ctx, xrb->pixmap, &n, &x, &y);
@@ -3900,7 +3878,6 @@ get_row_rgba(struct gl_context *ctx, struct gl_renderbuffer *rb,
       span = XGetImage( xmesa->display, xrb->pixmap,
 		        x, y, n, 1, AllPlanes, ZPixmap );
       error = check_xgetimage_errors();
-#endif
       if (span && !error) {
 	 switch (xmesa->pixelformat) {
 	    case PF_Truecolor:
diff --git a/src/mesa/drivers/x11/xmesa.h b/src/mesa/drivers/x11/xmesa.h
index f63626a9702..98737fab248 100644
--- a/src/mesa/drivers/x11/xmesa.h
+++ b/src/mesa/drivers/x11/xmesa.h
@@ -72,13 +72,9 @@ and create a window, you must do the following to use the X/Mesa interface:
 extern "C" {
 #endif
 
-#ifdef XFree86Server
-#include "xmesa_xf86.h"
-#else
 #include <X11/Xlib.h>
 #include <X11/Xutil.h>
 #include "xmesa_x.h"
-#endif
 #include "GL/gl.h"
 
 #ifdef AMIWIN
@@ -180,19 +176,6 @@ extern XMesaContext XMesaCreateContext( XMesaVisual v,
 extern void XMesaDestroyContext( XMesaContext c );
 
 
-#ifdef XFree86Server
-/*
- * These are the extra routines required for integration with XFree86.
- * None of these routines should be user visible. -KEM
- */
-extern GLboolean XMesaForceCurrent( XMesaContext c );
-
-extern GLboolean XMesaLoseCurrent( XMesaContext c );
-
-extern GLboolean XMesaCopyContext( XMesaContext src,
-				   XMesaContext dst,
-				   GLuint mask );
-#endif /* XFree86Server */
 
 
 /*
diff --git a/src/mesa/drivers/x11/xmesaP.h b/src/mesa/drivers/x11/xmesaP.h
index 5d34b430cb6..63e3e211bf6 100644
--- a/src/mesa/drivers/x11/xmesaP.h
+++ b/src/mesa/drivers/x11/xmesaP.h
@@ -33,9 +33,6 @@
 #include "fxmesa.h"
 #include "xm_glide.h"
 #endif
-#ifdef XFree86Server
-#include "xm_image.h"
-#endif
 
 
 extern _glthread_Mutex _xmesa_lock;
@@ -88,13 +85,8 @@ struct xmesa_visual {
    XMesaDisplay *display;	/* The X11 display */
    int screen, visualID;
    int visualType;
-#ifdef XFree86Server
-   GLint ColormapEntries;
-   GLint nplanes;
-#else
    XMesaVisualInfo visinfo;	/* X's visual info (pointer to private copy) */
    XVisualInfo *vishandle;	/* Only used in fakeglx.c */
-#endif
    GLint BitsPerPixel;		/* True bits per pixel for XImages */
 
    GLboolean ximage_flag;	/* Use XImage for back buffer (not pixmap)? */
@@ -233,7 +225,7 @@ struct xmesa_buffer {
 				/*    0 = not available			*/
 				/*    1 = XImage support available	*/
 				/*    2 = Pixmap support available too	*/
-#if defined(USE_XSHM) && !defined(XFree86Server)
+#if defined(USE_XSHM) 
    XShmSegmentInfo shminfo;
 #endif
 
@@ -259,11 +251,7 @@ struct xmesa_buffer {
 
    /* Used to do XAllocColor/XFreeColors accounting: */
    int num_alloced;
-#if defined(XFree86Server)
-   Pixel alloced_colors[256];
-#else
    unsigned long alloced_colors[256];
-#endif
 
 #if defined( FX )
    /* For 3Dfx Glide only */
@@ -578,9 +566,7 @@ extern void xmesa_register_swrast_functions( struct gl_context *ctx );
 
 #define ENABLE_EXT_texure_compression_s3tc 0 /* SW texture compression */
 
-#ifdef XFree86Server
-#define ENABLE_EXT_timer_query 0
-#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#if   defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 #define ENABLE_EXT_timer_query 1 /* should have 64-bit GLuint64EXT */
 #else
 #define ENABLE_EXT_timer_query 0 /* may not have 64-bit GLuint64EXT */