Merge remote branch 'origin/master' into nv50-compiler

author: Christoph Bumiller <[email protected]> 2010-08-18 14:37:47 +0200
committer: Christoph Bumiller <[email protected]> 2010-08-18 14:37:47 +0200
commit: 3e54d63429fe7ca5db3c75c181abbaf7a7f55724 (patch)
tree: e129c36aaef712525f0a04fc5b06c445e3cf84df /src/mesa/drivers/dri
parent: eaab76457818fad0926b84c663440e8987e1f19f (diff)
parent: 85d9bc236d6a8ff8f12cbc2150f8c3740354f573 (diff)
74 files changed, 2114 insertions, 828 deletions
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c
index 86e59a8e51c..a2f404b616f 100644
--- a/src/mesa/drivers/dri/common/dri_metaops.c
+++ b/src/mesa/drivers/dri/common/dri_metaops.c
@@ -29,6 +29,7 @@
 #include "main/arbprogram.h"
 #include "main/arrayobj.h"
 #include "main/bufferobj.h"
+#include "main/context.h"
 #include "main/enable.h"
 #include "main/matrix.h"
 #include "main/texstate.h"
diff --git a/src/mesa/drivers/dri/i810/i810render.c b/src/mesa/drivers/dri/i810/i810render.c
index b543d4f012c..205f0cebc1c 100644
--- a/src/mesa/drivers/dri/i810/i810render.c
+++ b/src/mesa/drivers/dri/i810/i810render.c
@@ -37,6 +37,8 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "i810screen.h"
diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile
index 71ee753748c..65fd658c047 100644
--- a/src/mesa/drivers/dri/i915/Makefile
+++ b/src/mesa/drivers/dri/i915/Makefile
@@ -56,7 +56,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-DRIVER_DEFINES = -I../intel -I../intel/server -DI915 \
+DRIVER_DEFINES = -I../intel -DI915 \
 	$(shell pkg-config libdrm --atleast-version=2.3.1 \
 				&& echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP")
 
diff --git a/src/mesa/drivers/dri/i915/intel_render.c b/src/mesa/drivers/dri/i915/intel_render.c
index ec209391ab4..add0adacb56 100644
--- a/src/mesa/drivers/dri/i915/intel_render.c
+++ b/src/mesa/drivers/dri/i915/intel_render.c
@@ -37,6 +37,8 @@
 #include "main/mtypes.h"
 #include "main/enums.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 #include "tnl/t_vertex.h"
 #include "tnl/t_pipeline.h"
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 831981558d8..e381a5c714b 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -106,7 +106,7 @@ C_SOURCES = \
 
 ASM_SOURCES = 
 
-DRIVER_DEFINES = -I../intel -I../intel/server
+DRIVER_DEFINES = -I../intel
 
 INCLUDES += $(INTEL_CFLAGS)
 DRI_LIB_DEPS += $(INTEL_LIBS)
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index a74bbc25643..d2ac1235e46 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -192,11 +192,6 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    brw_clip_project_vertex(c, dest_ptr );
 }
 
-
-
-
-#define MAX_MRF 16
-
 void brw_clip_emit_vue(struct brw_clip_compile *c, 
 		       struct brw_indirect vert,
 		       GLboolean allocate,
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 6b20a2979f8..f7a68cead7c 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -604,6 +604,8 @@
 #define BRW_ARF_NOTIFICATION_COUNT    0x90
 #define BRW_ARF_IP                    0xA0
 
+#define BRW_MRF_COMPR4			(1 << 7)
+
 #define BRW_AMASK   0
 #define BRW_IMASK   1
 #define BRW_LMASK   2
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 31ff86cf731..ffdddd0a388 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -984,5 +984,7 @@ void brw_set_src1( struct brw_instruction *insn,
 
 /* brw_optimize.c */
 void brw_optimize(struct brw_compile *p);
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p);
+void brw_remove_grf_to_mrf_moves(struct brw_compile *p);
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_optimize.c b/src/mesa/drivers/dri/i965/brw_optimize.c
index a364b158209..8aa6fb6cc6f 100644
--- a/src/mesa/drivers/dri/i965/brw_optimize.c
+++ b/src/mesa/drivers/dri/i965/brw_optimize.c
@@ -32,6 +32,594 @@
 #include "brw_defines.h"
 #include "brw_eu.h"
 
+static const struct {
+    const char *name;   /* mnemonic; string literals, hence const */
+    int nsrc;           /* number of source operands */
+    int ndst;           /* number of destination operands */
+    GLboolean is_arith; /* value returned by brw_is_arithmetic_inst() */
+} inst_opcode[128] = {
+    [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
+
+    [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
+    [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
+
+    [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
+    [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
+    [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
+    [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
+    [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
+    [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
+};
+
+/* Tells whether the opcode's inst_opcode[] entry is flagged is_arith. */
+static INLINE GLboolean
+brw_is_arithmetic_inst(const struct brw_instruction *inst) {
+   return inst_opcode[inst->header.opcode].is_arith;
+}
+
+static const GLuint inst_stride[7] = { /* indexed by an encoded stride field */
+    [0] = 0,
+    [1] = 1,
+    [2] = 2,
+    [3] = 4,
+    [4] = 8,
+    [5] = 16,
+    [6] = 32
+};
+
+static const GLuint inst_type_size[8] = { /* element size per register type */
+    [BRW_REGISTER_TYPE_UD] = 4,
+    [BRW_REGISTER_TYPE_D] = 4,
+    [BRW_REGISTER_TYPE_UW] = 2,
+    [BRW_REGISTER_TYPE_W] = 2,
+    [BRW_REGISTER_TYPE_UB] = 1,
+    [BRW_REGISTER_TYPE_B] = 1,
+    [BRW_REGISTER_TYPE_F] = 4
+};
+
+/* Returns GL_TRUE if inst may write any byte of the `size`-byte GRF window
+ * starting at register reg_index.  Indirect GRF destinations are assumed
+ * to hit it.  gen selects the SEND message descriptor layout.
+ */
+static INLINE GLboolean
+brw_is_grf_written(const struct brw_instruction *inst,
+                   int reg_index, int size,
+                   int gen)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+
+   if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
+      return GL_FALSE;
+
+   const int reg_start = reg_index * REG_SIZE;
+   const int reg_end = reg_start + size;
+   const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+   const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
+                         + inst->bits1.da1.dest_subreg_nr;
+   int length;
+
+   if (inst->header.opcode == BRW_OPCODE_SEND) {
+      /* SEND is specific: it writes response_length whole registers */
+      if (gen >= 5)
+         length = inst->bits3.generic_gen5.response_length*REG_SIZE;
+      else
+         length = inst->bits3.generic.response_length*REG_SIZE;
+   }
+   else {
+      /* dest_horiz_stride is an encoded field; decode it before scaling */
+      length = (1 << inst->header.execution_size) * type_size
+             * inst_stride[inst->bits1.da1.dest_horiz_stride];
+   }
+
+   /* If the two intervals intersect, we overwrite the register */
+   const int write_end = write_start + length;
+   return MAX2(write_start, reg_start) < MIN2(write_end, reg_end);
+}
+
+/* MRF counterpart of brw_is_grf_written(), with dedicated handling of compr4
+ * destinations.  Returns GL_TRUE if inst may write any byte of the
+ * `size`-byte window starting at message register reg_index, either through
+ * its destination operand or through the implicit mov performed by SEND.
+ *
+ * inst:      instruction to inspect
+ * reg_index: first message register of the window
+ * size:      extent of the window, in the same unit as REG_SIZE
+ */
+static INLINE GLboolean
+brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
+{
+   if (inst_opcode[inst->header.opcode].ndst == 0)
+      return GL_FALSE;
+
+   if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
+      if (inst->bits1.ia1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE)
+         return GL_TRUE;
+
+   const int reg_start = reg_index * REG_SIZE;
+   const int reg_end = reg_start + size;
+   GLboolean is_written = GL_FALSE;
+
+   if (inst->bits1.da1.dest_reg_file == BRW_MESSAGE_REGISTER_FILE) {
+      const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
+      const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
+      const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
+      /* dest_horiz_stride is an encoded field; decode it before scaling */
+      const int stride = inst_stride[inst->bits1.da1.dest_horiz_stride];
+      const int write_start0 = mrf_index*REG_SIZE
+                             + inst->bits1.da1.dest_subreg_nr;
+
+      /* We use compr4 with a size != 16 elements. Strange, we conservatively
+       * consider that we are writing the register.
+       */
+      if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
+         return GL_TRUE;
+
+      if (is_compr4) {
+         /* Here we write mrf_{i} and mrf_{i+4}: two times 8 elements */
+         const int length = 8 * type_size * stride;
+         const int write_end0 = write_start0 + length;
+         const int write_start1 = write_start0 + 4*REG_SIZE;
+         const int write_end1 = write_start1 + length;
+
+         /* If either interval intersects the window, it is overwritten */
+         const int left0 = MAX2(write_start0, reg_start);
+         const int right0 = MIN2(write_end0, reg_end);
+         const int left1 = MAX2(write_start1, reg_start);
+         const int right1 = MIN2(write_end1, reg_end);
+
+         is_written = left0 < right0 || left1 < right1;
+      }
+      else {
+         const int length = (1 << inst->header.execution_size)
+                          * type_size * stride;
+         const int write_end0 = write_start0 + length;
+         const int left = MAX2(write_start0, reg_start);
+         const int right = MIN2(write_end0, reg_end);
+
+         is_written = left < right;
+      }
+   }
+
+   /* SEND may perform an implicit mov to a mrf register.  This is checked
+    * whatever the destination file is: the register filled by the implicit
+    * mov is named by the header field below, not by the destination operand.
+    */
+   if (is_written == GL_FALSE &&
+       inst->header.opcode == BRW_OPCODE_SEND &&
+       inst->bits1.da1.src0_reg_file != 0) {
+      const int mrf_start = inst->header.destreg__conditionalmod;
+      const int write_start = mrf_start * REG_SIZE;
+      const int write_end = write_start + REG_SIZE;
+      const int left = MAX2(write_start, reg_start);
+      const int right = MIN2(write_end, reg_end);
+
+      is_written = left < right;
+   }
+
+   return is_written;
+}
+
+/* Returns GL_TRUE if inst may read any byte of the `size`-byte window
+ * starting at message register reg_index.  Only SEND is considered to read
+ * message registers, and an indirectly addressed SEND is assumed to read
+ * the window.  gen selects the SEND message descriptor layout.
+ */
+static INLINE GLboolean
+brw_is_mrf_read(const struct brw_instruction *inst,
+                int reg_index, int size, int gen)
+{
+   if (inst->header.opcode != BRW_OPCODE_SEND)
+      return GL_FALSE;
+   if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
+      return GL_TRUE;
+
+   const int reg_start = reg_index*REG_SIZE;
+   const int reg_end = reg_start + size;
+
+   int length;
+   if (gen >= 5)
+      length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
+   else
+      length = inst->bits3.generic.msg_length*REG_SIZE;
+
+   /* The whole payload, msg_length registers from the base message register,
+    * is reported as read.  When SEND fills the base register itself through
+    * its implicit mov, that register is strictly speaking written rather
+    * than read, but counting it only makes callers more conservative.
+    * Skipping it would be unsafe when there is no implicit mov, since the
+    * base register then holds payload written by an earlier instruction.
+    */
+   const int read_start = inst->header.destreg__conditionalmod*REG_SIZE;
+   const int read_end = read_start + length;
+
+   return MAX2(read_start, reg_start) < MIN2(read_end, reg_end);
+}
+
+/* GL_TRUE if any element of a source region touches the byte interval
+ * [reg_start, reg_end).  The region starts at byte offset region_start and
+ * holds elem_num elements of type_size bytes in rows of (1 << width_log2)
+ * elements; hs and vs are the byte steps inside a row and between rows.
+ */
+static INLINE GLboolean
+brw_grf_region_overlaps(int region_start, int elem_num, int width_log2,
+                        int type_size, int hs, int vs,
+                        int reg_start, int reg_end)
+{
+   const int width = 1 << width_log2;
+   const int row_num = elem_num >> width_log2;
+   int i, j;
+
+   for (j = 0; j < row_num; ++j) {
+      int elem_start = region_start + j*vs;
+      for (i = 0; i < width; ++i) {
+         if (MAX2(elem_start, reg_start) <
+             MIN2(elem_start + type_size, reg_end))
+            return GL_TRUE;
+         elem_start += hs;
+      }
+   }
+   return GL_FALSE;
+}
+
+/* Returns GL_TRUE if inst may read any byte of the `size`-byte window
+ * starting at GRF register reg_index through either of its sources.
+ * Indirect GRF sources are assumed to hit it; direct ones are checked
+ * element by element over their register region.
+ */
+static INLINE GLboolean
+brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
+{
+   const int nsrc = inst_opcode[inst->header.opcode].nsrc;
+   const int reg_start = reg_index*REG_SIZE;
+   const int reg_end = reg_start + size;
+   const int elem_num = 1 << inst->header.execution_size;
+
+   /* First source.  A non-GRF src0 must not end the analysis: src1 may
+    * still read the register.
+    */
+   if (nsrc >= 1) {
+      if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT &&
+          inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+
+      if (inst->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE) {
+         const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
+         const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
+         const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
+         const int start = inst->bits2.da1.src0_reg_nr*REG_SIZE
+                         + inst->bits2.da1.src0_subreg_nr;
+
+         if (brw_grf_region_overlaps(start, elem_num,
+                                     inst->bits2.da1.src0_width,
+                                     type_size, hs, vs, reg_start, reg_end))
+            return GL_TRUE;
+      }
+   }
+
+   /* Second src register */
+   if (nsrc >= 2) {
+      if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT &&
+          inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
+         return GL_TRUE;
+
+      if (inst->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE) {
+         const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
+         const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
+         const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
+         const int start = inst->bits3.da1.src1_reg_nr*REG_SIZE
+                         + inst->bits3.da1.src1_subreg_nr;
+
+         if (brw_grf_region_overlaps(start, elem_num,
+                                     inst->bits3.da1.src1_width,
+                                     type_size, hs, vs, reg_start, reg_end))
+            return GL_TRUE;
+      }
+   }
+
+   return GL_FALSE;
+}
+
+static INLINE GLboolean
+brw_is_control_done(const struct brw_instruction *mov) {
+   return /* non-zero as soon as one of these header fields is set */
+       mov->header.dependency_control != 0 ||
+       mov->header.thread_control != 0 ||
+       mov->header.mask_control != 0 ||
+       mov->header.saturate != 0 ||
+       mov->header.debug_control != 0;
+}
+
+static INLINE GLboolean
+brw_is_predicated(const struct brw_instruction *mov) {
+   return mov->header.predicate_control != 0; /* any predicate control set */
+}
+
+/* GL_TRUE for a plain (unpredicated, control-free) MOV of a full float GRF
+ * register to an MRF; on success outputs the register indices and compr4. */
+static INLINE GLboolean
+brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
+                      int *mrf_index, int *grf_index, GLboolean *is_compr4)
+{
+   if (mov->header.opcode != BRW_OPCODE_MOV || /* operands alone are no mov */
+       brw_is_predicated(mov) ||
+       brw_is_control_done(mov))
+      return GL_FALSE;
+
+   if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
+       mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
+       mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
+       mov->bits1.da1.dest_subreg_nr != 0)
+      return GL_FALSE;
+
+   if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
+       mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
+       mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
+       mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
+       mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
+       mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
+       mov->bits2.da1.src0_subreg_nr != 0 ||
+       mov->bits2.da1.src0_abs != 0 ||
+       mov->bits2.da1.src0_negate != 0)
+      return GL_FALSE;
+
+   *grf_index = mov->bits2.da1.src0_reg_nr;
+   *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
+   *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
+   return GL_TRUE;
+}
+
+/* GL_TRUE if inst is an arithmetic, Align1, 16-channel float write starting
+ * at sub-register 0 of grf_index with unit stride and no control bits set.
+ * Predication is only tolerated on SEL (no problem to predicate a SEL). */
+static INLINE GLboolean
+brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
+{
+   if (brw_is_predicated(inst) && inst->header.opcode != BRW_OPCODE_SEL)
+      return GL_FALSE;
+   return !brw_is_control_done(inst) &&
+          inst->header.execution_size == BRW_EXECUTE_16 &&
+          inst->header.access_mode == BRW_ALIGN_1 &&
+          inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
+          inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
+          inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
+          inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
+          inst->bits1.da1.dest_reg_nr == grf_index &&
+          inst->bits1.da1.dest_subreg_nr == 0 &&
+          brw_is_arithmetic_inst(inst);
+}
+
+/* Compares two instructions word by word, each instruction being handled
+ * as four raw GLuint values. */
+static INLINE GLboolean
+brw_inst_are_equal(const struct brw_instruction *src0,
+                   const struct brw_instruction *src1)
+{
+   const GLuint *a = (const GLuint *) src0;
+   const GLuint *b = (const GLuint *) src1;
+
+   return !(a[0] != b[0] || a[1] != b[1] || a[2] != b[2] || a[3] != b[3]);
+}
+
+/* Copies one instruction, handled as four raw GLuint values, from src to dst */
+static INLINE void
+brw_inst_copy(struct brw_instruction *dst,
+              const struct brw_instruction *src)
+{
+   const GLuint *from = (const GLuint *) src;
+   GLuint *to = (GLuint *) dst;
+   int w;
+   for (w = 0; w < 4; ++w)
+      to[w] = from[w];
+}
+
+/* Compacts p->store in place, dropping every instruction whose removeInst[]
+ * flag is set, and updates p->nr_insn to the number of instructions kept.
+ */
+static void brw_remove_inst(struct brw_compile *p, const GLboolean *removeInst)
+{
+   int to = 0, from;
+   for (from = 0; from < p->nr_insn; ++from) {
+      if (removeInst[from])
+         continue;
+      if (to != from)
+         brw_inst_copy(p->store + to, p->store + from);
+      to++;
+   }
+
+   /* `to` already counts the survivors; no second pass is needed */
+   p->nr_insn = to;
+}
+
+/* The gen code emitter generates a lot of duplications in the
+ * grf-to-mrf moves, for example when texture sampling with the same
+ * coordinates from multiple textures.  Here, we look for identical
+ * grf-to-mrf mov instructions and remove the repeated ones where the
+ * operands and dst haven't changed in between.
+ */
+void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
+{
+   const int gen = p->brw->intel.gen;
+   const int simd16_size = 2 * REG_SIZE;
+   int i, j;
+   GLboolean *removeInst = calloc(p->nr_insn, sizeof(GLboolean));
+   if (removeInst == NULL)
+      return;   /* nothing allocated: leave the program unoptimized */
+
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
+
+      for (j = i + 1; j < p->nr_insn; j++) {
+         const struct brw_instruction *inst = p->store + j;
+         if (brw_inst_are_equal(mov, inst)) {
+            removeInst[j] = GL_TRUE;
+            continue;
+         }
+
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
+             brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
+             brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
+            break;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
+/* Replace moves to MRFs where the value moved is the result of a
+ * normal arithmetic operation with computation right into the MRF.
+ * Does nothing if the scratch removal array cannot be allocated.
+ */
+void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
+{
+   int i, j, prev;
+   struct brw_context *brw = p->brw;
+   const int gen = brw->intel.gen;
+   const int simd16_size = 2*REG_SIZE;
+
+   GLboolean *removeInst = calloc(p->nr_insn, sizeof(GLboolean));
+   if (removeInst == NULL)
+      return;   /* checked explicitly: an assert vanishes under NDEBUG */
+
+   for (i = 0; i < p->nr_insn; i++) {
+      if (removeInst[i])
+         continue;
+
+      struct brw_instruction *grf_inst = NULL;
+      const struct brw_instruction *mov = p->store + i;
+      int mrf_index, grf_index;
+      GLboolean is_compr4;
+
+      /* Only consider _straight_ grf-to-mrf moves */
+      if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
+         continue;
+
+      /* Using comp4 enables a stride of 4 for this instruction */
+      const int mrf_index0 = mrf_index;
+      const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
+
+      /* Look where the register has been set: walk backwards to the closest
+       * instruction that writes the grf in a straight way
+       */
+      prev = i;
+      while (prev--) {
+         struct brw_instruction *inst = p->store + prev;
+         if (brw_is_grf_straight_write(inst, grf_index)) {
+            grf_inst = inst;
+            break;
+         }
+      }
+
+      /* No candidate writer: the mov has to stay */
+      if (grf_inst == NULL)
+         continue;
+      removeInst[i] = GL_TRUE;
+
+      /* Monitor first the section of code between the grf computation and the
+       * mov. Here we cannot read or write both mrf and grf register
+       */
+      for (j = prev + 1; j < i; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+         if (brw_is_grf_written(inst, grf_index, simd16_size, gen)   ||
+             brw_is_grf_read(inst, grf_index, simd16_size)           ||
+             brw_is_mrf_written(inst, mrf_index0, REG_SIZE)   ||
+             brw_is_mrf_written(inst, mrf_index1, REG_SIZE)   ||
+             brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
+             brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+      }
+
+      /* After the mov, we can read or write the mrf. If the grf is overwritten,
+       * we are done
+       */
+      for (j = i + 1; j < p->nr_insn; ++j) {
+         struct brw_instruction *inst = p->store + j;
+         if (removeInst[j])
+            continue;
+
+         if (brw_is_grf_read(inst, grf_index, simd16_size)) {
+            removeInst[i] = GL_FALSE;
+            break;
+         }
+
+         if (brw_is_grf_straight_write(inst, grf_index))
+            break;
+      }
+
+      /* Note that with the top down traversal, we can safely patch the mov
+       * instruction
+       */
+      if (removeInst[i]) {
+         grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
+         grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
+      }
+   }
+
+   brw_remove_inst(p, removeInst);
+   free(removeInst);
+}
+
 static GLboolean
 is_single_channel_dp4(struct brw_instruction *insn)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 40eece276b7..af08446f2d8 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -46,68 +46,68 @@ brw_add_validated_bo(struct brw_context *brw, drm_intel_bo *bo)
    }
 };
 
-const struct brw_tracked_state brw_blend_constant_color;
-const struct brw_tracked_state brw_cc_unit;
-const struct brw_tracked_state brw_check_fallback;
-const struct brw_tracked_state brw_clip_prog;
-const struct brw_tracked_state brw_clip_unit;
-const struct brw_tracked_state brw_vs_constants;
-const struct brw_tracked_state brw_wm_constants;
-const struct brw_tracked_state brw_constant_buffer;
-const struct brw_tracked_state brw_curbe_offsets;
-const struct brw_tracked_state brw_invarient_state;
-const struct brw_tracked_state brw_gs_prog;
-const struct brw_tracked_state brw_gs_unit;
-const struct brw_tracked_state brw_line_stipple;
-const struct brw_tracked_state brw_aa_line_parameters;
-const struct brw_tracked_state brw_pipelined_state_pointers;
-const struct brw_tracked_state brw_binding_table_pointers;
-const struct brw_tracked_state brw_depthbuffer;
-const struct brw_tracked_state brw_polygon_stipple_offset;
-const struct brw_tracked_state brw_polygon_stipple;
-const struct brw_tracked_state brw_program_parameters;
-const struct brw_tracked_state brw_recalculate_urb_fence;
-const struct brw_tracked_state brw_sf_prog;
-const struct brw_tracked_state brw_sf_unit;
-const struct brw_tracked_state brw_sf_vp;
-const struct brw_tracked_state brw_state_base_address;
-const struct brw_tracked_state brw_urb_fence;
-const struct brw_tracked_state brw_vertex_state;
-const struct brw_tracked_state brw_vs_surfaces;
-const struct brw_tracked_state brw_vs_prog;
-const struct brw_tracked_state brw_vs_unit;
-const struct brw_tracked_state brw_wm_input_sizes;
-const struct brw_tracked_state brw_wm_prog;
-const struct brw_tracked_state brw_wm_samplers;
-const struct brw_tracked_state brw_wm_constant_surface;
-const struct brw_tracked_state brw_wm_surfaces;
-const struct brw_tracked_state brw_wm_binding_table;
-const struct brw_tracked_state brw_wm_unit;
-
-const struct brw_tracked_state brw_psp_urb_cbs;
-
-const struct brw_tracked_state brw_pipe_control;
-
-const struct brw_tracked_state brw_drawing_rect;
-const struct brw_tracked_state brw_indices;
-const struct brw_tracked_state brw_vertices;
-const struct brw_tracked_state brw_index_buffer;
-const struct brw_tracked_state gen6_binding_table_pointers;
-const struct brw_tracked_state gen6_blend_state;
-const struct brw_tracked_state gen6_cc_state_pointers;
-const struct brw_tracked_state gen6_clip_state;
-const struct brw_tracked_state gen6_clip_vp;
-const struct brw_tracked_state gen6_color_calc_state;
-const struct brw_tracked_state gen6_depth_stencil_state;
-const struct brw_tracked_state gen6_gs_state;
-const struct brw_tracked_state gen6_sampler_state;
-const struct brw_tracked_state gen6_scissor_state;
-const struct brw_tracked_state gen6_sf_state;
-const struct brw_tracked_state gen6_sf_vp;
-const struct brw_tracked_state gen6_urb;
-const struct brw_tracked_state gen6_viewport_state;
-const struct brw_tracked_state gen6_vs_state;
-const struct brw_tracked_state gen6_wm_state;
+extern const struct brw_tracked_state brw_blend_constant_color;
+extern const struct brw_tracked_state brw_cc_unit;
+extern const struct brw_tracked_state brw_check_fallback;
+extern const struct brw_tracked_state brw_clip_prog;
+extern const struct brw_tracked_state brw_clip_unit;
+extern const struct brw_tracked_state brw_vs_constants;
+extern const struct brw_tracked_state brw_wm_constants;
+extern const struct brw_tracked_state brw_constant_buffer;
+extern const struct brw_tracked_state brw_curbe_offsets;
+extern const struct brw_tracked_state brw_invarient_state;
+extern const struct brw_tracked_state brw_gs_prog;
+extern const struct brw_tracked_state brw_gs_unit;
+extern const struct brw_tracked_state brw_line_stipple;
+extern const struct brw_tracked_state brw_aa_line_parameters;
+extern const struct brw_tracked_state brw_pipelined_state_pointers;
+extern const struct brw_tracked_state brw_binding_table_pointers;
+extern const struct brw_tracked_state brw_depthbuffer;
+extern const struct brw_tracked_state brw_polygon_stipple_offset;
+extern const struct brw_tracked_state brw_polygon_stipple;
+extern const struct brw_tracked_state brw_program_parameters;
+extern const struct brw_tracked_state brw_recalculate_urb_fence;
+extern const struct brw_tracked_state brw_sf_prog;
+extern const struct brw_tracked_state brw_sf_unit;
+extern const struct brw_tracked_state brw_sf_vp;
+extern const struct brw_tracked_state brw_state_base_address;
+extern const struct brw_tracked_state brw_urb_fence;
+extern const struct brw_tracked_state brw_vertex_state;
+extern const struct brw_tracked_state brw_vs_surfaces;
+extern const struct brw_tracked_state brw_vs_prog;
+extern const struct brw_tracked_state brw_vs_unit;
+extern const struct brw_tracked_state brw_wm_input_sizes;
+extern const struct brw_tracked_state brw_wm_prog;
+extern const struct brw_tracked_state brw_wm_samplers;
+extern const struct brw_tracked_state brw_wm_constant_surface;
+extern const struct brw_tracked_state brw_wm_surfaces;
+extern const struct brw_tracked_state brw_wm_binding_table;
+extern const struct brw_tracked_state brw_wm_unit;
+
+extern const struct brw_tracked_state brw_psp_urb_cbs;
+
+extern const struct brw_tracked_state brw_pipe_control;
+
+extern const struct brw_tracked_state brw_drawing_rect;
+extern const struct brw_tracked_state brw_indices;
+extern const struct brw_tracked_state brw_vertices;
+extern const struct brw_tracked_state brw_index_buffer;
+extern const struct brw_tracked_state gen6_binding_table_pointers;
+extern const struct brw_tracked_state gen6_blend_state;
+extern const struct brw_tracked_state gen6_cc_state_pointers;
+extern const struct brw_tracked_state gen6_clip_state;
+extern const struct brw_tracked_state gen6_clip_vp;
+extern const struct brw_tracked_state gen6_color_calc_state;
+extern const struct brw_tracked_state gen6_depth_stencil_state;
+extern const struct brw_tracked_state gen6_gs_state;
+extern const struct brw_tracked_state gen6_sampler_state;
+extern const struct brw_tracked_state gen6_scissor_state;
+extern const struct brw_tracked_state gen6_sf_state;
+extern const struct brw_tracked_state gen6_sf_vp;
+extern const struct brw_tracked_state gen6_urb;
+extern const struct brw_tracked_state gen6_viewport_state;
+extern const struct brw_tracked_state gen6_vs_state;
+extern const struct brw_tracked_state gen6_wm_state;
 
 /***********************************************************************
  * brw_state.c
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index 1db2a210d45..e878da3850d 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -30,6 +30,8 @@
   */
          
 
+#include <assert.h>
+
 #include "main/mtypes.h"
 #include "program/prog_parameter.h"
 #include "brw_util.h"
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index a1bee2e44ab..b6b558e9a69 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -44,6 +44,7 @@ static GLboolean
 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
 {
    int opcode_array[] = {
+      [OPCODE_MOV] = 1,
       [OPCODE_ADD] = 2,
       [OPCODE_CMP] = 3,
       [OPCODE_DP3] = 2,
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index 323cfac8fa7..d9fa2e63354 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -1283,7 +1283,7 @@ void emit_fb_write(struct brw_wm_compile *c,
 	  * + 1 for the second half we get destination + 4.
 	  */
 	 brw_MOV(p,
-		 brw_message_reg(nr + channel + (1 << 7)),
+		 brw_message_reg(nr + channel + BRW_MRF_COMPR4),
 		 arg0[channel]);
       } else {
 	 /*  mov (8) m2.0<1>:ud   r28.0<8;8,1>:ud  { Align1 } */
@@ -1712,12 +1712,20 @@ void brw_wm_emit( struct brw_wm_compile *c )
 		      inst->dst[i]->spill_slot);
    }
 
+   /* Only properly tested on ILK */
+   if (p->brw->intel.gen == 5) {
+     brw_remove_duplicate_mrf_moves(p);
+     if (c->dispatch_width == 16)
+	brw_remove_grf_to_mrf_moves(p);
+   }
+
    if (INTEL_DEBUG & DEBUG_WM) {
       int i;
 
-      printf("wm-native:\n");
-      for (i = 0; i < p->nr_insn; i++)
+     printf("wm-native:\n");
+     for (i = 0; i < p->nr_insn; i++)
 	 brw_disasm(stderr, &p->store[i], p->brw->intel.gen);
       printf("\n");
    }
 }
+
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 5f2035d79c9..e19f44035fd 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -29,6 +29,7 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/extensions.h"
+#include "main/fbobject.h"
 #include "main/framebuffer.h"
 #include "main/imports.h"
 #include "main/points.h"
@@ -39,8 +40,6 @@
 #include "drivers/common/driverfuncs.h"
 #include "drivers/common/meta.h"
 
-#include "i830_dri.h"
-
 #include "intel_chipset.h"
 #include "intel_buffers.h"
 #include "intel_tex.h"
@@ -420,7 +419,7 @@ intel_prepare_render(struct intel_context *intel)
    __DRIdrawable *drawable;
 
    drawable = driContext->driDrawablePriv;
-   if (drawable->dri2.stamp != driContext->dri2.draw_stamp) {
+   if (drawable && drawable->dri2.stamp != driContext->dri2.draw_stamp) {
       if (drawable->lastStamp != drawable->dri2.stamp)
 	 intel_update_renderbuffers(driContext, drawable);
       intel_draw_buffer(&intel->ctx, intel->ctx.DrawBuffer);
@@ -428,7 +427,7 @@ intel_prepare_render(struct intel_context *intel)
    }
 
    drawable = driContext->driReadablePriv;
-   if (drawable->dri2.stamp != driContext->dri2.read_stamp) {
+   if (drawable && drawable->dri2.stamp != driContext->dri2.read_stamp) {
       if (drawable->lastStamp != drawable->dri2.stamp)
 	 intel_update_renderbuffers(driContext, drawable);
       driContext->dri2.read_stamp = drawable->dri2.stamp;
@@ -613,6 +612,7 @@ intelInitContext(struct intel_context *intel,
    __DRIscreen *sPriv = driContextPriv->driScreenPriv;
    struct intel_screen *intelScreen = sPriv->private;
    int bo_reuse_mode;
+   __GLcontextModes visual;
 
    /* we can't do anything without a connection to the device */
    if (intelScreen->bufmgr == NULL)
@@ -624,6 +624,11 @@ intelInitContext(struct intel_context *intel,
       functions->Viewport = intel_viewport;
    }
 
+   if (mesaVis == NULL) {
+      memset(&visual, 0, sizeof visual);
+      mesaVis = &visual;
+   }
+
    if (!_mesa_initialize_context_for_api(&intel->ctx, api, mesaVis, shareCtx,
 					 functions, (void *) intel)) {
       printf("%s: failed to init mesa context\n", __FUNCTION__);
@@ -890,14 +895,21 @@ intelMakeCurrent(__DRIcontext * driContextPriv,
    }
 
    if (driContextPriv) {
-      struct gl_framebuffer *fb = driDrawPriv->driverPrivate;
-      struct gl_framebuffer *readFb = driReadPriv->driverPrivate;
+      struct gl_framebuffer *fb, *readFb;
+      
+      if (driDrawPriv == NULL && driReadPriv == NULL) {
+	 fb = _mesa_get_incomplete_framebuffer();
+	 readFb = _mesa_get_incomplete_framebuffer();
+      } else {
+	 fb = driDrawPriv->driverPrivate;
+	 readFb = driReadPriv->driverPrivate;
+	 driContextPriv->dri2.draw_stamp = driDrawPriv->dri2.stamp - 1;
+	 driContextPriv->dri2.read_stamp = driReadPriv->dri2.stamp - 1;
+      }
 
-      driContextPriv->dri2.draw_stamp = driDrawPriv->dri2.stamp - 1;
-      driContextPriv->dri2.read_stamp = driReadPriv->dri2.stamp - 1;
       intel_prepare_render(intel);
       _mesa_make_current(&intel->ctx, fb, readFb);
-
+      
       /* We do this in intel_prepare_render() too, but intel->ctx.DrawBuffer
        * is NULL at that point.  We can't call _mesa_makecurrent()
        * first, since we need the buffer size for the initial
diff --git a/src/mesa/drivers/dri/intel/intel_extensions_es2.c b/src/mesa/drivers/dri/intel/intel_extensions_es2.c
index baf8e130010..de34bbb2aec 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions_es2.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions_es2.c
@@ -28,7 +28,6 @@
 #include "main/extensions.h"
 
 #include "intel_extensions.h"
-#include "utils.h"
 
 static const char *es2_extensions[] = {
    /* Used by mesa internally (cf all_mesa_extensions in ../common/utils.c) */
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index 0e2fe893fed..02c0ffce31d 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -45,6 +45,7 @@
 #include "main/attrib.h"
 #include "main/enable.h"
 #include "main/viewport.h"
+#include "main/context.h"
 #include "swrast/swrast.h"
 
 #include "intel_screen.h"
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index fe4de189600..680d18ba299 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -155,6 +155,9 @@ intel_region_alloc_internal(struct intel_context *intel,
    }
 
    region = calloc(sizeof(*region), 1);
+   if (region == NULL)
+      return region;
+
    region->cpp = cpp;
    region->width = width;
    region->height = height;
@@ -189,6 +192,9 @@ intel_region_alloc(struct intel_context *intel,
 
    region = intel_region_alloc_internal(intel, cpp, width, height,
 					aligned_pitch / cpp, buffer);
+   if (region == NULL)
+      return region;
+
    region->tiling = tiling;
 
    return region;
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index 224b506c05b..6efb2ddc553 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -102,7 +102,7 @@ do_copy_texsubimage(struct intel_context *intel,
    GLcontext *ctx = &intel->ctx;
    const struct intel_region *src = get_teximage_source(intel, internalFormat);
 
-   if (!intelImage->mt || !src) {
+   if (!intelImage->mt || !src || !src->buffer) {
       if (INTEL_DEBUG & DEBUG_FALLBACKS)
 	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
 		 __FUNCTION__, intelImage->mt, src, internalFormat);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_format.c b/src/mesa/drivers/dri/intel/intel_tex_format.c
index 5f813c0efa2..e03b203fb40 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_format.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_format.c
@@ -19,7 +19,6 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
                          GLenum format, GLenum type)
 {
    struct intel_context *intel = intel_context(ctx);
-   const GLboolean do32bpt = (intel->ctx.Visual.rgbBits >= 24);
 
 #if 0
    printf("%s intFmt=0x%x format=0x%x type=0x%x\n",
@@ -30,39 +29,28 @@ intelChooseTextureFormat(GLcontext * ctx, GLint internalFormat,
    case 4:
    case GL_RGBA:
    case GL_COMPRESSED_RGBA:
-      if (format == GL_BGRA) {
-         if (type == GL_UNSIGNED_BYTE || type == GL_UNSIGNED_INT_8_8_8_8_REV) {
-            return MESA_FORMAT_ARGB8888;
-         }
-         else if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV) {
-            return MESA_FORMAT_ARGB4444;
-         }
-         else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV) {
-            return MESA_FORMAT_ARGB1555;
-         }
-      }
-      return do32bpt ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_ARGB4444;
+      if (type == GL_UNSIGNED_SHORT_4_4_4_4_REV)
+	 return MESA_FORMAT_ARGB4444;
+      else if (type == GL_UNSIGNED_SHORT_1_5_5_5_REV)
+	 return MESA_FORMAT_ARGB1555;
+      else
+	 return MESA_FORMAT_ARGB8888;
 
    case 3:
    case GL_RGB:
    case GL_COMPRESSED_RGB:
-      if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
-         return MESA_FORMAT_RGB565;
-      }
-      if (do32bpt) {
-	 if (intel->has_xrgb_textures)
-	    return MESA_FORMAT_XRGB8888;
-	 else
-	    return MESA_FORMAT_ARGB8888;
-      } else {
+      if (type == GL_UNSIGNED_SHORT_5_6_5)
 	 return MESA_FORMAT_RGB565;
-      }
+      else if (intel->has_xrgb_textures)
+	 return MESA_FORMAT_XRGB8888;
+      else
+	 return MESA_FORMAT_ARGB8888;
 
    case GL_RGBA8:
    case GL_RGB10_A2:
    case GL_RGBA12:
    case GL_RGBA16:
-      return do32bpt ? MESA_FORMAT_ARGB8888 : MESA_FORMAT_ARGB4444;
+      return MESA_FORMAT_ARGB8888;
 
    case GL_RGBA4:
    case GL_RGBA2:
diff --git a/src/mesa/drivers/dri/intel/server/i830_dri.h b/src/mesa/drivers/dri/intel/server/i830_dri.h
deleted file mode 100644
index def049e7a6b..00000000000
--- a/src/mesa/drivers/dri/intel/server/i830_dri.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* $XFree86: xc/programs/Xserver/hw/xfree86/drivers/i810/i830_dri.h,v 1.6 2003/09/28 20:15:59 alanh Exp $ */
-
-#ifndef _I830_DRI_H
-#define _I830_DRI_H
-
-#include "xf86drm.h"
-
-#define I830_MAX_DRAWABLES 256
-
-#define I830_MAJOR_VERSION 1
-#define I830_MINOR_VERSION 9
-#define I830_PATCHLEVEL 0
-
-#define I830_REG_SIZE 0x80000
-
-typedef struct _I830DRIRec {
-   drm_handle_t regs;
-   drmSize regsSize;
-
-   drmSize unused1; /* backbufferSize */
-   drm_handle_t unused2; /* backbuffer */
-
-   drmSize unused3; /* depthbufferSize */
-   drm_handle_t unused4; /* depthbuffer */
-
-   drmSize unused5; /* rotatedSize */
-   drm_handle_t unused6; /* rotatedbuffer */
-
-   drm_handle_t unused7; /* textures */
-   int unused8; /* textureSize */
-
-   drm_handle_t unused9; /* agp_buffers */
-   drmSize unused10; /* agp_buf_size */
-
-   int deviceID;
-   int width;
-   int height;
-   int mem;
-   int cpp;
-   int bitsPerPixel;
-
-   int unused11[8]; /* was front/back/depth/rotated offset/pitch */
-
-   int unused12; /* logTextureGranularity */
-   int unused13; /* textureOffset */
-
-   int irq;
-   int sarea_priv_offset;
-} I830DRIRec, *I830DRIPtr;
-
-typedef struct {
-   /* Nothing here yet */
-   int dummy;
-} I830ConfigPrivRec, *I830ConfigPrivPtr;
-
-typedef struct {
-   /* Nothing here yet */
-   int dummy;
-} I830DRIContextRec, *I830DRIContextPtr;
-
-
-#endif
diff --git a/src/mesa/drivers/dri/intel/server/intel.h b/src/mesa/drivers/dri/intel/server/intel.h
deleted file mode 100644
index 6ea72499c1c..00000000000
--- a/src/mesa/drivers/dri/intel/server/intel.h
+++ /dev/null
@@ -1,331 +0,0 @@
-#ifndef _INTEL_H_
-#define _INTEL_H_
-
-#include "xf86drm.h"		/* drm_handle_t, etc */
-
-/* Intel */
-#ifndef PCI_CHIP_I810
-#define PCI_CHIP_I810              0x7121
-#define PCI_CHIP_I810_DC100        0x7123
-#define PCI_CHIP_I810_E            0x7125
-#define PCI_CHIP_I815              0x1132
-#define PCI_CHIP_I810_BRIDGE       0x7120
-#define PCI_CHIP_I810_DC100_BRIDGE 0x7122
-#define PCI_CHIP_I810_E_BRIDGE     0x7124
-#define PCI_CHIP_I815_BRIDGE       0x1130
-#endif
-
-#define PCI_CHIP_845_G			0x2562
-#define PCI_CHIP_I830_M			0x3577
-
-#ifndef PCI_CHIP_I855_GM
-#define PCI_CHIP_I855_GM	   0x3582
-#define PCI_CHIP_I855_GM_BRIDGE	   0x3580
-#endif
-
-#ifndef PCI_CHIP_I865_G
-#define PCI_CHIP_I865_G		   0x2572
-#define PCI_CHIP_I865_G_BRIDGE	   0x2570
-#endif
-
-#ifndef PCI_CHIP_I915_G
-#define PCI_CHIP_I915_G		   0x2582
-#define PCI_CHIP_I915_G_BRIDGE	   0x2580
-#endif
-
-#ifndef PCI_CHIP_I915_GM
-#define PCI_CHIP_I915_GM	   0x2592
-#define PCI_CHIP_I915_GM_BRIDGE	   0x2590
-#endif
-
-#ifndef PCI_CHIP_E7221_G
-#define PCI_CHIP_E7221_G	   0x258A
-/* Same as I915_G_BRIDGE */
-#define PCI_CHIP_E7221_G_BRIDGE	   0x2580
-#endif
-
-#ifndef PCI_CHIP_I945_G
-#define PCI_CHIP_I945_G        0x2772
-#define PCI_CHIP_I945_G_BRIDGE 0x2770
-#endif
-
-#ifndef PCI_CHIP_I945_GM
-#define PCI_CHIP_I945_GM        0x27A2
-#define PCI_CHIP_I945_GM_BRIDGE 0x27A0
-#endif
-
-#define IS_I810(pI810) (pI810->Chipset == PCI_CHIP_I810 ||	\
-			pI810->Chipset == PCI_CHIP_I810_DC100 || \
-			pI810->Chipset == PCI_CHIP_I810_E)
-#define IS_I815(pI810) (pI810->Chipset == PCI_CHIP_I815)
-#define IS_I830(pI810) (pI810->Chipset == PCI_CHIP_I830_M)
-#define IS_845G(pI810) (pI810->Chipset == PCI_CHIP_845_G)
-#define IS_I85X(pI810)  (pI810->Chipset == PCI_CHIP_I855_GM)
-#define IS_I852(pI810)  (pI810->Chipset == PCI_CHIP_I855_GM && (pI810->variant == I852_GM || pI810->variant == I852_GME))
-#define IS_I855(pI810)  (pI810->Chipset == PCI_CHIP_I855_GM && (pI810->variant == I855_GM || pI810->variant == I855_GME))
-#define IS_I865G(pI810) (pI810->Chipset == PCI_CHIP_I865_G)
-
-#define IS_I915G(pI810) (pI810->Chipset == PCI_CHIP_I915_G || pI810->Chipset == PCI_CHIP_E7221_G)
-#define IS_I915GM(pI810) (pI810->Chipset == PCI_CHIP_I915_GM)
-#define IS_I945G(pI810) (pI810->Chipset == PCI_CHIP_I945_G)
-#define IS_I945GM(pI810) (pI810->Chipset == PCI_CHIP_I945_GM)
-#define IS_I9XX(pI810) (IS_I915G(pI810) || IS_I915GM(pI810) || IS_I945G(pI810) || IS_I945GM(pI810))
-
-#define IS_MOBILE(pI810) (IS_I830(pI810) || IS_I85X(pI810) || IS_I915GM(pI810) || IS_I945GM(pI810))
-
-#define I830_GMCH_CTRL		0x52
-
-#define I830_GMCH_MEM_MASK      0x1
-#define I830_GMCH_MEM_64M       0x1
-#define I830_GMCH_MEM_128M      0
-
-#define I830_GMCH_GMS_MASK			0x70
-#define I830_GMCH_GMS_DISABLED		0x00
-#define I830_GMCH_GMS_LOCAL			0x10
-#define I830_GMCH_GMS_STOLEN_512	0x20
-#define I830_GMCH_GMS_STOLEN_1024	0x30
-#define I830_GMCH_GMS_STOLEN_8192	0x40
-
-#define I855_GMCH_GMS_MASK			(0x7 << 4)
-#define I855_GMCH_GMS_DISABLED			0x00
-#define I855_GMCH_GMS_STOLEN_1M			(0x1 << 4)
-#define I855_GMCH_GMS_STOLEN_4M			(0x2 << 4)
-#define I855_GMCH_GMS_STOLEN_8M			(0x3 << 4)
-#define I855_GMCH_GMS_STOLEN_16M		(0x4 << 4)
-#define I855_GMCH_GMS_STOLEN_32M		(0x5 << 4)
-#define I915G_GMCH_GMS_STOLEN_48M		(0x6 << 4)
-#define I915G_GMCH_GMS_STOLEN_64M		(0x7 << 4)
-
-typedef unsigned char Bool;
-#define TRUE 1
-#define FALSE 0
-
-#define PIPE_NONE	0<<0
-#define PIPE_CRT	1<<0
-#define PIPE_TV		1<<1
-#define PIPE_DFP	1<<2
-#define PIPE_LFP	1<<3
-#define PIPE_CRT2	1<<4
-#define PIPE_TV2	1<<5
-#define PIPE_DFP2	1<<6
-#define PIPE_LFP2	1<<7
-
-typedef struct _I830MemPool *I830MemPoolPtr;
-typedef struct _I830MemRange *I830MemRangePtr;
-typedef struct _I830MemRange {
-   long Start;
-   long End;
-   long Size;
-   unsigned long Physical;
-   unsigned long Offset;		/* Offset of AGP-allocated portion */
-   unsigned long Alignment;
-   drm_handle_t Key;
-   unsigned long Pitch; // add pitch
-   I830MemPoolPtr Pool;
-} I830MemRange;
-
-typedef struct _I830MemPool {
-   I830MemRange Total;
-   I830MemRange Free;
-   I830MemRange Fixed;
-   I830MemRange Allocated;
-} I830MemPool;
-
-typedef struct {
-   int tail_mask;
-   I830MemRange mem;
-   unsigned char *virtual_start;
-   int head;
-   int tail;
-   int space;
-} I830RingBuffer;
-
-typedef struct _I830Rec {
-   unsigned char *MMIOBase;
-   unsigned char *FbBase;
-   int cpp;
-   uint32_t aper_size;
-   unsigned int bios_version;
-
-   /* These are set in PreInit and never changed. */
-   long FbMapSize;
-   long TotalVideoRam;
-   I830MemRange StolenMemory;		/* pre-allocated memory */
-   long BIOSMemorySize;			/* min stolen pool size */
-   int BIOSMemSizeLoc;
-
-   /* These change according to what has been allocated. */
-   long FreeMemory;
-   I830MemRange MemoryAperture;
-   I830MemPool StolenPool;
-   long allocatedMemory;
-
-   /* Regions allocated either from the above pools, or from agpgart. */
-   /* for single and dual head configurations */
-   I830MemRange FrontBuffer;
-   I830MemRange FrontBuffer2;
-   I830MemRange Scratch;
-   I830MemRange Scratch2;
-
-   I830RingBuffer *LpRing;
-
-   I830MemRange BackBuffer;
-   I830MemRange DepthBuffer;
-   I830MemRange TexMem;
-   int TexGranularity;
-   I830MemRange ContextMem;
-   int drmMinor;
-   Bool have3DWindows;
-
-   Bool NeedRingBufferLow;
-   Bool allowPageFlip;
-   Bool disableTiling;
-
-   int Chipset;
-   unsigned long LinearAddr;
-   unsigned long MMIOAddr;
-
-   drmSize           registerSize;     /**< \brief MMIO register map size */
-   drm_handle_t         registerHandle;   /**< \brief MMIO register map handle */
-  //   IOADDRESS ioBase;
-   int               irq;              /**< \brief IRQ number */
-   int GttBound;
-
-   drm_handle_t ring_map;
-   unsigned int Fence[8];
-
-} I830Rec;
-
-/*
- * 12288 is set as the maximum, chosen because it is enough for
- * 1920x1440@32bpp with a 2048 pixel line pitch with some to spare.
- */
-#define I830_MAXIMUM_VBIOS_MEM		12288
-#define I830_DEFAULT_VIDEOMEM_2D	(MB(32) / 1024)
-#define I830_DEFAULT_VIDEOMEM_3D	(MB(64) / 1024)
-
-/* Flags for memory allocation function */
-#define FROM_ANYWHERE			0x00000000
-#define FROM_POOL_ONLY			0x00000001
-#define FROM_NEW_ONLY			0x00000002
-#define FROM_MASK			0x0000000f
-
-#define ALLOCATE_AT_TOP			0x00000010
-#define ALLOCATE_AT_BOTTOM		0x00000020
-#define FORCE_GAPS			0x00000040
-
-#define NEED_PHYSICAL_ADDR		0x00000100
-#define ALIGN_BOTH_ENDS			0x00000200
-#define FORCE_LOW			0x00000400
-
-#define ALLOC_NO_TILING			0x00001000
-#define ALLOC_INITIAL			0x00002000
-
-#define ALLOCATE_DRY_RUN		0x80000000
-
-/* Chipset registers for VIDEO BIOS memory RW access */
-#define _855_DRAM_RW_CONTROL 0x58
-#define _845_DRAM_RW_CONTROL 0x90
-#define DRAM_WRITE    0x33330000
-
-#define KB(x) ((x) * 1024)
-#define MB(x) ((x) * KB(1024))
-
-#define GTT_PAGE_SIZE			KB(4)
-#define ROUND_TO(x, y)			(((x) + (y) - 1) / (y) * (y))
-#define ROUND_DOWN_TO(x, y)		((x) / (y) * (y))
-#define ROUND_TO_PAGE(x)		ROUND_TO((x), GTT_PAGE_SIZE)
-#define ROUND_TO_MB(x)			ROUND_TO((x), MB(1))
-#define PRIMARY_RINGBUFFER_SIZE		KB(128)
-
-
-/* Ring buffer registers, p277, overview p19
- */
-#define LP_RING     0x2030
-#define HP_RING     0x2040
-
-#define RING_TAIL      0x00
-#define TAIL_ADDR           0x000FFFF8
-#define I830_TAIL_MASK	    0x001FFFF8
-
-#define RING_HEAD      0x04
-#define HEAD_WRAP_COUNT     0xFFE00000
-#define HEAD_WRAP_ONE       0x00200000
-#define HEAD_ADDR           0x001FFFFC
-#define I830_HEAD_MASK      0x001FFFFC
-
-#define RING_START     0x08
-#define START_ADDR          0x03FFFFF8
-#define I830_RING_START_MASK	0xFFFFF000
-
-#define RING_LEN       0x0C
-#define RING_NR_PAGES       0x001FF000 
-#define I830_RING_NR_PAGES	0x001FF000
-#define RING_REPORT_MASK    0x00000006
-#define RING_REPORT_64K     0x00000002
-#define RING_REPORT_128K    0x00000004
-#define RING_NO_REPORT      0x00000000
-#define RING_VALID_MASK     0x00000001
-#define RING_VALID          0x00000001
-#define RING_INVALID        0x00000000
-
-
-/* Fence/Tiling ranges [0..7]
- */
-#define FENCE            0x2000
-#define FENCE_NR         8
-
-#define I915G_FENCE_START_MASK	0x0ff00000
-
-#define I830_FENCE_START_MASK	0x07f80000
-
-#define FENCE_START_MASK    0x03F80000
-#define FENCE_X_MAJOR       0x00000000
-#define FENCE_Y_MAJOR       0x00001000
-#define FENCE_SIZE_MASK     0x00000700
-#define FENCE_SIZE_512K     0x00000000
-#define FENCE_SIZE_1M       0x00000100
-#define FENCE_SIZE_2M       0x00000200
-#define FENCE_SIZE_4M       0x00000300
-#define FENCE_SIZE_8M       0x00000400
-#define FENCE_SIZE_16M      0x00000500
-#define FENCE_SIZE_32M      0x00000600
-#define FENCE_SIZE_64M	    0x00000700
-#define I915G_FENCE_SIZE_1M       0x00000000
-#define I915G_FENCE_SIZE_2M       0x00000100
-#define I915G_FENCE_SIZE_4M       0x00000200
-#define I915G_FENCE_SIZE_8M       0x00000300
-#define I915G_FENCE_SIZE_16M      0x00000400
-#define I915G_FENCE_SIZE_32M      0x00000500
-#define I915G_FENCE_SIZE_64M	0x00000600
-#define I915G_FENCE_SIZE_128M	0x00000700
-#define FENCE_PITCH_1       0x00000000
-#define FENCE_PITCH_2       0x00000010
-#define FENCE_PITCH_4       0x00000020
-#define FENCE_PITCH_8       0x00000030
-#define FENCE_PITCH_16      0x00000040
-#define FENCE_PITCH_32      0x00000050
-#define FENCE_PITCH_64	    0x00000060
-#define FENCE_VALID         0x00000001
-
-#include <mmio.h>
-
-#  define MMIO_IN8(base, offset) \
-	*(volatile unsigned char *)(((unsigned char*)(base)) + (offset))
-#  define MMIO_IN32(base, offset) \
-	read_MMIO_LE32(base, offset)
-#  define MMIO_OUT8(base, offset, val) \
-	*(volatile unsigned char *)(((unsigned char*)(base)) + (offset)) = (val)
-#  define MMIO_OUT32(base, offset, val) \
-	*(volatile unsigned int *)(void *)(((unsigned char*)(base)) + (offset)) = CPU_TO_LE32(val)
-
-
-				/* Memory mapped register access macros */
-#define INREG8(addr)        MMIO_IN8(MMIO, addr)
-#define INREG(addr)         MMIO_IN32(MMIO, addr)
-#define OUTREG8(addr, val)  MMIO_OUT8(MMIO, addr, val)
-#define OUTREG(addr, val)   MMIO_OUT32(MMIO, addr, val)
-
-#define DSPABASE		0x70184
-
-#endif
diff --git a/src/mesa/drivers/dri/mach64/mach64_ioctl.h b/src/mesa/drivers/dri/mach64/mach64_ioctl.h
index 1ffda1932f1..9145ee6e6cf 100644
--- a/src/mesa/drivers/dri/mach64/mach64_ioctl.h
+++ b/src/mesa/drivers/dri/mach64/mach64_ioctl.h
@@ -32,6 +32,9 @@
 #ifndef __MACH64_IOCTL_H__
 #define __MACH64_IOCTL_H__
 
+#include <stdio.h>
+#include <stdlib.h>
+
 #include "mach64_dri.h"
 #include "mach64_reg.h"
 #include "mach64_lock.h"
diff --git a/src/mesa/drivers/dri/mga/mgarender.c b/src/mesa/drivers/dri/mga/mgarender.c
index 8b8fc485d31..cc0cea618d1 100644
--- a/src/mesa/drivers/dri/mga/mgarender.c
+++ b/src/mesa/drivers/dri/mga/mgarender.c
@@ -44,6 +44,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/imports.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "mgacontext.h"
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
index 8be7edb150b..bd1273beea7 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
@@ -220,7 +220,7 @@ get_tex_format(struct gl_texture_image *ti)
 	case MESA_FORMAT_RGB565:
 		return GL_RGB5;
 	default:
-		assert(0);
+		return GL_NONE;
 	}
 }
 
@@ -231,7 +231,6 @@ nouveau_render_texture(GLcontext *ctx, struct gl_framebuffer *fb,
 	struct gl_renderbuffer *rb = att->Renderbuffer;
 	struct gl_texture_image *ti =
 		att->Texture->Image[att->CubeMapFace][att->TextureLevel];
-	int ret;
 
 	/* Allocate a renderbuffer object for the texture if we
 	 * haven't already done so. */
@@ -244,9 +243,7 @@ nouveau_render_texture(GLcontext *ctx, struct gl_framebuffer *fb,
 	}
 
 	/* Update the renderbuffer fields from the texture. */
-	ret = set_renderbuffer_format(rb, get_tex_format(ti));
-	assert(ret);
-
+	set_renderbuffer_format(rb, get_tex_format(ti));
 	rb->Width = ti->Width;
 	rb->Height = ti->Height;
 	nouveau_surface_ref(&to_nouveau_teximage(ti)->surface,
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_texture.c b/src/mesa/drivers/dri/nouveau/nouveau_texture.c
index dbf9a5cc613..442f4e899ee 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_texture.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_texture.c
@@ -38,6 +38,7 @@
 #include "main/mipmap.h"
 #include "main/texfetch.h"
 #include "main/teximage.h"
+#include "drivers/common/meta.h"
 
 static struct gl_texture_object *
 nouveau_texture_new(GLcontext *ctx, GLuint name, GLenum target)
@@ -182,10 +183,10 @@ teximage_fits(struct gl_texture_object *t, int level)
 	struct nouveau_surface *s = &to_nouveau_texture(t)->surfaces[level];
 	struct gl_texture_image *ti = t->Image[0][level];
 
-	return ti && (t->Target == GL_TEXTURE_RECTANGLE ||
-		      (s->bo && s->width == ti->Width &&
-		       s->height == ti->Height &&
-		       s->format == ti->TexFormat));
+	return ti && to_nouveau_teximage(ti)->surface.bo &&
+		(t->Target == GL_TEXTURE_RECTANGLE ||
+		 (s->bo && s->format == ti->TexFormat &&
+		  s->width == ti->Width && s->height == ti->Height));
 }
 
 static GLboolean
@@ -589,6 +590,53 @@ nouveau_texture_unmap(GLcontext *ctx, struct gl_texture_object *t)
 	}
 }
 
+static void /* Passes each level first..last of t (face 0) to nouveau_teximage(), then frees the level's old Data buffer. */
+store_mipmap(GLcontext *ctx, GLenum target, int first, int last,
+	     struct gl_texture_object *t)
+{
+	struct gl_pixelstore_attrib packing = {
+		.BufferObj = ctx->Shared->NullBufferObj,
+		.Alignment = 1
+	};
+	GLenum format = t->Image[0][first]->TexFormat; /* level `first` decides base_format/type for every level */
+	unsigned base_format, type, comps;
+	int i;
+
+	base_format = _mesa_get_format_base_format(format);
+	_mesa_format_to_type_and_comps(format, &type, &comps); /* comps is filled in but not read below */
+
+	for (i = first; i <= last; i++) {
+		struct gl_texture_image *ti = t->Image[0][i];
+		void *data = ti->Data; /* kept in a local so it can be freed after the call below */
+
+		nouveau_teximage(ctx, 3, target, i, ti->InternalFormat, /* NOTE(review): 3 is presumably the dimension count -- confirm */
+				 ti->Width, ti->Height, ti->Depth,
+				 ti->Border, base_format, type, data,
+				 &packing, t, ti);
+
+		_mesa_free_texmemory(data); /* NOTE(review): assumes nouveau_teximage() no longer references data -- confirm */
+	}
+}
+
+static void /* Uses _mesa_generate_mipmap() when the meta fallback check returns true, _mesa_meta_GenerateMipmap() otherwise. */
+nouveau_generate_mipmap(GLcontext *ctx, GLenum target,
+			struct gl_texture_object *t)
+{
+	if (_mesa_meta_check_generate_mipmap_fallback(ctx, target, t)) {
+		struct gl_texture_image *base = t->Image[0][t->BaseLevel];
+
+		nouveau_teximage_map(ctx, base); /* base level is mapped only around _mesa_generate_mipmap() */
+		_mesa_generate_mipmap(ctx, target, t);
+		nouveau_teximage_unmap(ctx, base);
+
+		store_mipmap(ctx, target, t->BaseLevel + 1, /* hand the levels above the base to store_mipmap() */
+			     get_last_level(t), t);
+
+	} else {
+		_mesa_meta_GenerateMipmap(ctx, target, t);
+	}
+}
+
 void
 nouveau_texture_functions_init(struct dd_function_table *functions)
 {
@@ -607,4 +655,5 @@ nouveau_texture_functions_init(struct dd_function_table *functions)
 	functions->BindTexture = nouveau_bind_texture;
 	functions->MapTexture = nouveau_texture_map;
 	functions->UnmapTexture = nouveau_texture_unmap;
+	functions->GenerateMipmap = nouveau_generate_mipmap;
 }
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_fb.c b/src/mesa/drivers/dri/nouveau/nv20_state_fb.c
index 21da4f7af16..95691cad047 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_fb.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_fb.c
@@ -72,7 +72,7 @@ nv20_emit_framebuffer(GLcontext *ctx, int emit)
 			fb->_ColorDrawBuffers[0])->surface;
 
 		rt_format |= get_rt_format(s->format);
-		zeta_pitch = rt_pitch = s->pitch;
+		rt_pitch = s->pitch;
 
 		nouveau_bo_markl(bctx, kelvin, NV20TCL_COLOR_OFFSET,
 				 s->bo, 0, bo_flags);
@@ -88,6 +88,9 @@ nv20_emit_framebuffer(GLcontext *ctx, int emit)
 
 		nouveau_bo_markl(bctx, kelvin, NV20TCL_ZETA_OFFSET,
 				 s->bo, 0, bo_flags);
+	} else {
+		rt_format |= get_rt_format(MESA_FORMAT_Z24_S8);
+		zeta_pitch = rt_pitch;
 	}
 
 	BEGIN_RING(chan, kelvin, NV20TCL_RT_FORMAT, 2);
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
index e46118e4fce..2d45513bb4c 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_tex.c
@@ -194,7 +194,8 @@ nv20_emit_tex_obj(GLcontext *ctx, int emit)
 		| nvgl_wrap_mode(t->WrapS) << 0;
 
 	tx_filter = nvgl_filter_mode(t->MagFilter) << 24
-		| nvgl_filter_mode(t->MinFilter) << 16;
+		| nvgl_filter_mode(t->MinFilter) << 16
+		| 2 << 12;
 
 	tx_enable = NV20TCL_TX_ENABLE_ENABLE
 		| log2i(t->MaxAnisotropy) << 4;
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index 262fe3cddee..dbf4ad477db 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -612,6 +612,8 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
+   radeon_prepare_render(&rmesa->radeon);
+
    if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
       /* need to disable perspective-correct texturing for point sprites */
       if ((hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite) {
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index d43e14581e9..4ae0f304918 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -264,6 +264,8 @@ void r200TclPrimitive( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLuint newprim = hw_prim | R200_VF_TCL_OUTPUT_VTX_ENABLE;
 
+   radeon_prepare_render(&rmesa->radeon);
+
    if (newprim != rmesa->tcl.hw_primitive ||
        !discrete_prim[hw_prim&0xf]) {
       /* need to disable perspective-correct texturing for point sprites */
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
index a326ee4c4fa..d2fa816894c 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_fragprog.c
@@ -109,13 +109,13 @@ void r3xx_compile_fragment_program(struct r300_fragment_program_compiler* c)
 	debug_program_log(c, "before compilation");
 
 	if (c->Base.is_r500){
-		r500_transform_unroll_loops(&c->Base, &loop_state);	
-		debug_program_log(c, "after r500 transform loops");
+		rc_unroll_loops(&c->Base, R500_PFS_MAX_INST);
+		debug_program_log(c, "after unroll loops");
 	}
 	else{
-		rc_transform_unroll_loops(&c->Base, &loop_state);
+		rc_transform_loops(&c->Base, &loop_state, -1);
 		debug_program_log(c, "after transform loops");
-		
+
 		rc_emulate_branches(&c->Base);
 		debug_program_log(c, "after emulate branches");
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
index d347b4df9cd..666c9c2a7a9 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog.c
@@ -32,6 +32,11 @@
 #include "radeon_emulate_branches.h"
 #include "radeon_emulate_loops.h"
 
+struct loop { /* NOTE(review): no user of this struct is visible in this hunk -- confirm it is needed. */
+	int BgnLoop;
+
+};
+
 /*
  * Take an already-setup and valid source then swizzle it appropriately to
  * obtain a constant ZERO or ONE source.
@@ -332,11 +337,140 @@ static void ei_pow(struct r300_vertex_program_code *vp,
 	inst[3] = t_src_scalar(vp, &vpi->SrcReg[1]);
 }
 
+static void mark_write(void * userdata,	struct rc_instruction * inst,
+		rc_register_file file,	unsigned int index, unsigned int mask)
+{
+	unsigned int * writemasks = userdata;
+
+	if (file != RC_FILE_TEMPORARY)
+		return;
+
+	if (index >= R300_VS_MAX_TEMPS)
+		return;
+
+	writemasks[index] |= mask;
+}
+
+static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
+{
+	return PVS_SRC_OPERAND(compiler->PredicateIndex,
+		t_swizzle(RC_SWIZZLE_ZERO),
+		t_swizzle(RC_SWIZZLE_ZERO),
+		t_swizzle(RC_SWIZZLE_ZERO),
+		t_swizzle(RC_SWIZZLE_W),
+		t_src_class(RC_FILE_TEMPORARY),
+		0);
+}
+
+static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
+					unsigned int hw_opcode, int is_math)
+{
+	return PVS_OP_DST_OPERAND(hw_opcode,
+	     is_math,
+	     0,
+	     compiler->PredicateIndex,
+	     RC_MASK_W,
+	     t_dst_class(RC_FILE_TEMPORARY));
+
+}
+
+static void ei_if(struct r300_vertex_program_compiler * compiler,
+					struct rc_instruction *rci,
+					unsigned int * inst,
+					unsigned int branch_depth)
+{
+	unsigned int predicate_opcode;
+	int is_math = 0;
+
+	if (!compiler->Base.is_r500) {
+		rc_error(&compiler->Base,"Opcode IF not supported\n");
+		return;
+	}
+
+	/* Reserve a temporary to use as our predicate stack counter, if we
+	 * don't already have one. */
+	if (!compiler->PredicateMask) {
+		unsigned int writemasks[R300_VS_MAX_TEMPS];
+		memset(writemasks, 0, sizeof(writemasks));
+		struct rc_instruction * inst;
+		unsigned int i;
+		for(inst = compiler->Base.Program.Instructions.Next;
+				inst != &compiler->Base.Program.Instructions;
+							inst = inst->Next) {
+			rc_for_all_writes_mask(inst, mark_write, writemasks);
+		}
+		for(i = 0; i < R300_VS_MAX_TEMPS; i++) {
+			unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
+			/* Only the W component can be used for the predicate
+			 * stack counter. */
+			if (mask & RC_MASK_W) {
+				compiler->PredicateMask = RC_MASK_W;
+				compiler->PredicateIndex = i;
+				break;
+			}
+		}
+		if (i == R300_VS_MAX_TEMPS) {
+			rc_error(&compiler->Base, "No free temporary to use for"
+					" predicate stack counter.\n");
+			return;
+		}
+	}
+	predicate_opcode =
+			branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;
+
+	rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
+	if (branch_depth == 0) {
+		is_math = 1;
+		predicate_opcode = ME_PRED_SET_NEQ;
+		inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
+		inst[2] = 0;
+	} else {
+		predicate_opcode = VE_PRED_SET_NEQ_PUSH;
+		inst[1] = t_pred_src(compiler);
+		inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
+	}
+
+	inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
+	inst[3] = 0;
+
+}
+
+static void ei_else(struct r300_vertex_program_compiler * compiler,
+							unsigned int * inst)
+{
+	if (!compiler->Base.is_r500) {
+		rc_error(&compiler->Base,"Opcode ELSE not supported\n");
+		return;
+	}
+	inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
+	inst[1] = t_pred_src(compiler);
+	inst[2] = 0;
+	inst[3] = 0;
+}
+
+static void ei_endif(struct r300_vertex_program_compiler *compiler,
+							unsigned int * inst)
+{
+	if (!compiler->Base.is_r500) {
+		rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
+		return;
+	}
+	inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
+	inst[1] = t_pred_src(compiler);
+	inst[2] = 0;
+	inst[3] = 0;
+}
 
 static void translate_vertex_program(struct r300_vertex_program_compiler * compiler)
 {
 	struct rc_instruction *rci;
 
+	struct loop * loops;
+	int current_loop_depth = 0;
+	int loops_reserved = 0;
+
+	unsigned int branch_depth = 0;
+
 	compiler->code->pos_end = 0;	/* Not supported yet */
 	compiler->code->length = 0;
 
@@ -366,9 +500,12 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
 		case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
 		case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
+		case RC_OPCODE_ELSE: ei_else(compiler, inst); break;
+		case RC_OPCODE_ENDIF: ei_endif(compiler, inst); branch_depth--; break;
 		case RC_OPCODE_EX2: ei_math1(compiler->code, ME_EXP_BASE2_FULL_DX, vpi, inst); break;
 		case RC_OPCODE_EXP: ei_math1(compiler->code, ME_EXP_BASE2_DX, vpi, inst); break;
 		case RC_OPCODE_FRC: ei_vector1(compiler->code, VE_FRACTION, vpi, inst); break;
+		case RC_OPCODE_IF: ei_if(compiler, rci, inst, branch_depth); branch_depth++; break;
 		case RC_OPCODE_LG2: ei_math1(compiler->code, ME_LOG_BASE2_FULL_DX, vpi, inst); break;
 		case RC_OPCODE_LIT: ei_lit(compiler->code, vpi, inst); break;
 		case RC_OPCODE_LOG: ei_math1(compiler->code, ME_LOG_BASE2_DX, vpi, inst); break;
@@ -385,11 +522,86 @@ static void translate_vertex_program(struct r300_vertex_program_compiler * compi
 		case RC_OPCODE_SIN: ei_math1(compiler->code, ME_SIN, vpi, inst); break;
 		case RC_OPCODE_SLT: ei_vector2(compiler->code, VE_SET_LESS_THAN, vpi, inst); break;
 		case RC_OPCODE_SNE: ei_vector2(compiler->code, VE_SET_NOT_EQUAL, vpi, inst); break;
+		case RC_OPCODE_BGNLOOP:
+		{
+			struct loop * l;
+
+			if ((!compiler->Base.is_r500
+				&& loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
+				|| loops_reserved >= R500_VS_MAX_FC_DEPTH) {
+				rc_error(&compiler->Base,
+						"Loops are nested too deep.");
+				return;
+			}
+			memory_pool_array_reserve(&compiler->Base.Pool,
+					struct loop, loops, current_loop_depth,
+					loops_reserved, 1);
+			l = &loops[current_loop_depth++];
+			memset(l , 0, sizeof(struct loop));
+			l->BgnLoop = (compiler->code->length / 4);
+			continue;
+		}
+		case RC_OPCODE_ENDLOOP:
+		{
+			struct loop * l = &loops[current_loop_depth - 1];
+			unsigned int act_addr = l->BgnLoop - 1;
+			unsigned int last_addr = (compiler->code->length / 4) - 1;
+			unsigned int ret_addr = l->BgnLoop;
+			/* num_fc_ops indexes the fixed-size fc_op_addrs and
+			 * fc_loop_index arrays, so bound it before writing. */
+			if (compiler->code->num_fc_ops >= R300_VS_MAX_FC_OPS) {
+				rc_error(&compiler->Base, "Too many flow control instructions.");
+				return;
+			}
+			if (compiler->Base.is_r500) {
+				compiler->code->fc_op_addrs.r500
+					[compiler->code->num_fc_ops].lw =
+					R500_PVS_FC_ACT_ADRS(act_addr)
+					| R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+					;
+				compiler->code->fc_op_addrs.r500
+					[compiler->code->num_fc_ops].uw =
+					R500_PVS_FC_LAST_INST(last_addr)
+					| R500_PVS_FC_RTN_INST(ret_addr)
+					;
+			} else {
+				compiler->code->fc_op_addrs.r300
+					[compiler->code->num_fc_ops] =
+					R300_PVS_FC_ACT_ADRS(act_addr)
+					| R300_PVS_FC_LOOP_CNT_JMP_INST(0xff)
+					| R300_PVS_FC_LAST_INST(last_addr)
+					| R300_PVS_FC_RTN_INST(ret_addr)
+					;
+			}
+			compiler->code->fc_loop_index[compiler->code->num_fc_ops] =
+				R300_PVS_FC_LOOP_INIT_VAL(0x0)
+				| R300_PVS_FC_LOOP_STEP_VAL(0x1)
+				;
+			compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
+						compiler->code->num_fc_ops);
+			compiler->code->num_fc_ops++;
+			current_loop_depth--;
+			continue;
+		}
+
 		default:
 			rc_error(&compiler->Base, "Unknown opcode %s\n", rc_get_opcode_info(vpi->Opcode)->Name);
 			return;
 		}
 
+		/* Non-flow control instructions that are inside an if statement
+		 * need to pay attention to the predicate bit. */
+		if (branch_depth
+			&& vpi->Opcode != RC_OPCODE_IF
+			&& vpi->Opcode != RC_OPCODE_ELSE
+			&& vpi->Opcode != RC_OPCODE_ENDIF) {
+
+			inst[0] |= (PVS_DST_PRED_ENABLE_MASK
+						<< PVS_DST_PRED_ENABLE_SHIFT);
+			inst[0] |= (PVS_DST_PRED_SENSE_MASK
+						<< PVS_DST_PRED_SENSE_SHIFT);
+		}
+
 		compiler->code->length += 4;
 
 		if (compiler->Base.Error)
@@ -406,6 +618,7 @@ struct temporary_allocation {
 static void allocate_temporary_registers(struct r300_vertex_program_compiler * compiler)
 {
 	struct rc_instruction *inst;
+	struct rc_instruction *end_loop = NULL;
 	unsigned int num_orig_temps = 0;
 	char hwtemps[R300_VS_MAX_TEMPS];
 	struct temporary_allocation * ta;
@@ -440,10 +653,35 @@ static void allocate_temporary_registers(struct r300_vertex_program_compiler * c
 	/* Pass 2: Determine original temporary lifetimes */
 	for(inst = compiler->Base.Program.Instructions.Next; inst != &compiler->Base.Program.Instructions; inst = inst->Next) {
 		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		/* Instructions inside of loops need to use the ENDLOOP
+		 * instruction as their LastRead. */
+		if (!end_loop && inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+			int endloops = 1;
+			struct rc_instruction * ptr;
+			for(ptr = inst->Next;
+				ptr != &compiler->Base.Program.Instructions;
+							ptr = ptr->Next){
+				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					endloops++;
+				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+					endloops--;
+					if (endloops <= 0) {
+						end_loop = ptr;
+						break;
+					}
+				}
+			}
+		}
+
+		if (inst == end_loop) {
+			end_loop = NULL;
+			continue;
+		}
 
 		for (i = 0; i < opcode->NumSrcRegs; ++i) {
 			if (inst->U.I.SrcReg[i].File == RC_FILE_TEMPORARY)
-				ta[inst->U.I.SrcReg[i].Index].LastRead = inst;
+				ta[inst->U.I.SrcReg[i].Index].LastRead =
+						end_loop ? end_loop : inst;
 		}
 	}
 
@@ -633,30 +871,24 @@ static struct rc_swizzle_caps r300_vertprog_swizzle_caps = {
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 {
 	struct emulate_loop_state loop_state;
-	
+
 	compiler->Base.SwizzleCaps = &r300_vertprog_swizzle_caps;
 
 	addArtificialOutputs(compiler);
 
 	debug_program_log(compiler, "before compilation");
 
-	/* XXX Ideally this should be done only for r3xx, but since
-	 * we don't have branching support for r5xx, we use the emulation
-	 * on all chipsets. */
-	rc_transform_unroll_loops(&compiler->Base, &loop_state);
-	
-	debug_program_log(compiler, "after transform loops");
-	
-	if (compiler->Base.is_r500){
-		rc_emulate_loops(&loop_state, R500_VS_MAX_ALU);
-	} else {
-		rc_emulate_loops(&loop_state, R300_VS_MAX_ALU);
-	}
-	debug_program_log(compiler, "after emulate loops");
+	if (compiler->Base.is_r500)
+		rc_transform_loops(&compiler->Base, &loop_state, R500_VS_MAX_ALU);
+	else
+		rc_transform_loops(&compiler->Base, &loop_state, R300_VS_MAX_ALU);
 
-	rc_emulate_branches(&compiler->Base);
+	debug_program_log(compiler, "after emulate loops");
 
-	debug_program_log(compiler, "after emulate branches");
+	if (!compiler->Base.is_r500) {
+		rc_emulate_branches(&compiler->Base);
+		debug_program_log(compiler, "after emulate branches");
+	}
 
 	if (compiler->Base.is_r500) {
 		struct radeon_program_transformation transformations[] = {
@@ -718,6 +950,6 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* compiler)
 
 	if (compiler->Base.Debug) {
 		fprintf(stderr, "Final vertex program code:\n");
-		r300_vertex_program_dump(compiler->code);
+		r300_vertex_program_dump(compiler);
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
index 5800f1a78e1..e6009338e2e 100644
--- a/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
+++ b/src/mesa/drivers/dri/r300/compiler/r3xx_vertprog_dump.c
@@ -20,7 +20,9 @@
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE. */
 
+#include "radeon_compiler.h"
 #include "radeon_code.h"
+#include "../r300_reg.h"
 
 #include <stdio.h>
 
@@ -133,6 +135,10 @@ static void r300_vs_op_dump(uint32_t op)
 {
 	fprintf(stderr, " dst: %d%s op: ",
 			(op >> 13) & 0x7f, r300_vs_dst_debug[(op >> 8) & 0x7]);
+	if ((op >> PVS_DST_PRED_ENABLE_SHIFT) & 0x1) {
+		fprintf(stderr, "PRED %u",
+				(op >> PVS_DST_PRED_SENSE_SHIFT) & 0x1);
+	}
 	if (op & 0x80) {
 		if (op & 0x1) {
 			fprintf(stderr, "PVS_MACRO_OP_2CLK_M2X_ADD\n");
@@ -160,8 +166,9 @@ static void r300_vs_src_dump(uint32_t src)
 			r300_vs_swiz_debug[(src >> 22) & 0x7]);
 }
 
-void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
+void r300_vertex_program_dump(struct r300_vertex_program_compiler * c)
 {
+	struct r300_vertex_program_code * vs = c->code;
 	unsigned instrcount = vs->length / 4;
 	unsigned i;
 
@@ -177,4 +184,21 @@ void r300_vertex_program_dump(struct r300_vertex_program_code * vs)
 			r300_vs_src_dump(vs->body.d[offset+1+src]);
 		}
 	}
+
+	fprintf(stderr, "Flow Control Ops: 0x%08x\n",vs->fc_ops);
+	for(i = 0; i < vs->num_fc_ops; i++) {
+		switch((vs->fc_ops >> (i * 2)) & 0x3 ) {
+		case 0: fprintf(stderr, "NOP"); break;
+		case 1: fprintf(stderr, "JUMP"); break;
+		case 2: fprintf(stderr, "LOOP"); break;
+		case 3: fprintf(stderr, "JSR"); break;
+		}
+		if (c->Base.is_r500) {
+			fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n",
+				vs->fc_op_addrs.r500[i].uw,
+				vs->fc_op_addrs.r500[i].lw);
+		} else {
+			fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]);
+		}
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
index e6b5522c5b9..80a120497e3 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.c
@@ -30,7 +30,6 @@
 #include <stdio.h>
 
 #include "../r300_reg.h"
-#include "radeon_emulate_loops.h"
 
 /**
  * Rewrite IF instructions to use the ALU result special register.
@@ -60,31 +59,6 @@ int r500_transform_IF(
 	return 1;
 }
 
-/**
- * Rewrite loops to make them easier to emit.  This is not a local
- * transformation, because it modifies and reorders an entire block of code.
- */
-void r500_transform_unroll_loops(struct radeon_compiler * c,
-						struct emulate_loop_state *s)
-{
-	int i;
-	
-	rc_transform_unroll_loops(c, s);
-	
-	for( i = s->LoopCount - 1; i >= 0; i-- ){
-		struct rc_instruction * inst_continue;
-		if(!s->Loops[i].EndLoop){
-			continue;
-		}
-		/* Insert a continue instruction at the end of the loop.  This
-		 * is required in order to emit loops correctly. */
-		inst_continue = rc_insert_new_instruction(c,
-						s->Loops[i].EndIf->Prev);
-		inst_continue->U.I.Opcode = RC_OPCODE_CONTINUE;
-	}
-
-}
-
 static int r500_swizzle_is_native(rc_opcode opcode, struct rc_src_register reg)
 {
 	unsigned int relevant;
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
index 0d005a794ff..34173351f83 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog.h
@@ -49,6 +49,4 @@ extern int r500_transform_IF(
 	struct rc_instruction * inst,
 	void* data);
 
-void r500_transform_unroll_loops(struct radeon_compiler * c,
-						struct emulate_loop_state * s);
 #endif
diff --git a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
index 0bd8f0a239f..9b60e30f586 100644
--- a/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/compiler/r500_fragprog_emit.c
@@ -64,7 +64,16 @@ struct branch_info {
 };
 
 struct loop_info {
-	int LoopStart;
+	int BgnLoop;
+
+	int BranchDepth;
+	int * Brks;
+	int BrkCount;
+	int BrkReserved;
+
+	int * Conts;
+	int ContCount;
+	int ContReserved;
 };
 
 struct emit_state {
@@ -368,6 +377,12 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 
 	unsigned int newip = ++s->Code->inst_end;
 
+	/* Currently all loops use the same integer constant to initialize
+	 * the loop variables. */
+	if(!s->Code->int_constants[0]) {
+		s->Code->int_constants[0] = R500_FC_INT_CONST_KR(0xff);
+		s->Code->int_constant_count = 1;
+	}
 	s->Code->inst[newip].inst0 = R500_INST_TYPE_FC | R500_INST_ALU_WAIT;
 
 	switch(inst->U.I.Opcode){
@@ -378,32 +393,77 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 			s->Loops, s->CurrentLoopDepth, s->LoopsReserved, 1);
 
 		loop = &s->Loops[s->CurrentLoopDepth++];
-		
-		/* We don't emit an instruction for BGNLOOP, so we need to
-		 * decrement the instruction counter, but first we need to
-		 * set LoopStart to the current value of inst_end, which
-		 * will end up being the first real instruction in the loop.*/
-		loop->LoopStart = s->Code->inst_end--;
+		memset(loop, 0, sizeof(struct loop_info));
+		loop->BranchDepth = s->CurrentBranchDepth;
+		loop->BgnLoop = newip;
+
+		s->Code->inst[newip].inst2 = R500_FC_OP_LOOP
+			| R500_FC_JUMP_FUNC(0x00)
+			| R500_FC_IGNORE_UNCOVERED
+			;
 		break;
-	
 	case RC_OPCODE_BRK:
-		/* Don't emit an instruction for BRK */
-		s->Code->inst_end--;
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		memory_pool_array_reserve(&s->C->Pool, int, loop->Brks,
+					loop->BrkCount, loop->BrkReserved, 1);
+
+		loop->Brks[loop->BrkCount++] = newip;
+		s->Code->inst[newip].inst2 = R500_FC_OP_BREAKLOOP
+			| R500_FC_JUMP_FUNC(0xff)
+			| R500_FC_B_OP1_DECR
+			| R500_FC_B_POP_CNT(
+				s->CurrentBranchDepth - loop->BranchDepth)
+			| R500_FC_IGNORE_UNCOVERED
+			;
 		break;
 
-	case RC_OPCODE_CONTINUE:
+	case RC_OPCODE_CONT:
 		loop = &s->Loops[s->CurrentLoopDepth - 1];
-		s->Code->inst[newip].inst2 = R500_FC_OP_JUMP |
-			R500_FC_JUMP_FUNC(0xff);
-		s->Code->inst[newip].inst3 = R500_FC_JUMP_ADDR(loop->LoopStart);
+		memory_pool_array_reserve(&s->C->Pool, int, loop->Conts,
+					loop->ContCount, loop->ContReserved, 1);
+		loop->Conts[loop->ContCount++] = newip;
+		s->Code->inst[newip].inst2 = R500_FC_OP_CONTINUE
+			| R500_FC_JUMP_FUNC(0xff)
+			| R500_FC_B_OP1_DECR
+			| R500_FC_B_POP_CNT(
+				s->CurrentBranchDepth -	loop->BranchDepth)
+			| R500_FC_IGNORE_UNCOVERED
+			;
 		break;
 
 	case RC_OPCODE_ENDLOOP:
-		/* Don't emit an instruction for ENDLOOP */
-		s->Code->inst_end--;
+	{
+		loop = &s->Loops[s->CurrentLoopDepth - 1];
+		/* Emit ENDLOOP */
+		s->Code->inst[newip].inst2 = R500_FC_OP_ENDLOOP
+			| R500_FC_JUMP_FUNC(0xff)
+			| R500_FC_JUMP_ANY
+			| R500_FC_IGNORE_UNCOVERED
+			;
+		/* The constant integer at index 0 is used by all loops. */
+		s->Code->inst[newip].inst3 = R500_FC_INT_ADDR(0)
+			| R500_FC_JUMP_ADDR(loop->BgnLoop + 1)
+			;
+
+		/* Set jump address and int constant for BGNLOOP */
+		s->Code->inst[loop->BgnLoop].inst3 = R500_FC_INT_ADDR(0)
+			| R500_FC_JUMP_ADDR(newip)
+			;
+
+		/* Set jump address for the BRK instructions. */
+		while(loop->BrkCount--) {
+			s->Code->inst[loop->Brks[loop->BrkCount]].inst3 =
+						R500_FC_JUMP_ADDR(newip + 1);
+		}
+
+		/* Set jump address for CONT instructions. */
+		while(loop->ContCount--) {
+			s->Code->inst[loop->Conts[loop->ContCount]].inst3 =
+						R500_FC_JUMP_ADDR(newip);
+		}
 		s->CurrentLoopDepth--;
 		break;
-
+	}
 	case RC_OPCODE_IF:
 		if ( s->CurrentBranchDepth >= MAX_BRANCH_DEPTH_FULL) {
 			rc_error(s->C, "Branch depth exceeds hardware limit");
@@ -442,24 +502,16 @@ static void emit_flowcontrol(struct emit_state * s, struct rc_instruction * inst
 		}
 
 		branch = &s->Branches[s->CurrentBranchDepth - 1];
-		
-		if(inst->Prev->U.I.Opcode == RC_OPCODE_BRK){
-			branch->Endif = --s->Code->inst_end;
-			s->Code->inst[branch->Endif].inst2 |=
-				R500_FC_B_OP0_DECR;
-		}
-		else{
-			branch->Endif = newip;
-		
-			s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
-				| R500_FC_A_OP_NONE /* no address stack */
-				| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
-				| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
-				| R500_FC_B_OP1_NONE /* no branch counter if stay */
-				| R500_FC_B_POP_CNT(1)
+		branch->Endif = newip;
+
+		s->Code->inst[branch->Endif].inst2 = R500_FC_OP_JUMP
+			| R500_FC_A_OP_NONE /* no address stack */
+			| R500_FC_JUMP_ANY /* docs says set this, but I don't understand why */
+			| R500_FC_B_OP0_DECR /* decrement branch counter if stay */
+			| R500_FC_B_OP1_NONE /* no branch counter if stay */
+			| R500_FC_B_POP_CNT(1)
 			;
-			s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
-		}
+		s->Code->inst[branch->Endif].inst3 = R500_FC_JUMP_ADDR(branch->Endif + 1);
 		s->Code->inst[branch->If].inst2 = R500_FC_OP_JUMP
 			| R500_FC_A_OP_NONE /* no address stack */
 			| R500_FC_JUMP_FUNC(0x0f) /* jump if ALU result is false */
@@ -544,11 +596,9 @@ void r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compi
 		code->inst[ip].inst0 = R500_INST_TYPE_OUT | R500_INST_TEX_SEM_WAIT;
 	}
 
-	/* Use FULL flow control mode if branches are nested deep enough.
-	 * We don not need to enable FULL flow control mode for loops, becasue
-	 * we aren't using the hardware loop instructions.
-	 */
-	if (s.MaxBranchDepth >= 4) {
+	/* Enable full flow control mode if we are using loops or have if
+	 * statements nested at least four deep. */
+	if (s.MaxBranchDepth >= 4 || s.LoopsReserved > 0) {
 		if (code->max_temp_idx < 1)
 			code->max_temp_idx = 1;
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_code.h b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
index d03689763bc..896246d2035 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_code.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_code.h
@@ -221,6 +221,9 @@ struct r500_fragment_program_code {
 	int max_temp_idx;
 
 	uint32_t us_fc_ctrl;
+
+	uint32_t int_constants[32];
+	uint32_t int_constant_count;
 };
 
 struct rX00_fragment_program_code {
@@ -240,6 +243,12 @@ struct rX00_fragment_program_code {
 #define R500_VS_MAX_ALU	        1024
 #define R500_VS_MAX_ALU_DWORDS  (R500_VS_MAX_ALU * 4)
 #define R300_VS_MAX_TEMPS	32
+/* This is the max for all chipsets (r300-r500) */
+#define R300_VS_MAX_FC_OPS 16
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_VS_MAX_FC_DEPTH 8
+#define R300_VS_MAX_LOOP_DEPTH 1
 
 #define VSF_MAX_INPUTS 32
 #define VSF_MAX_OUTPUTS 32
@@ -260,9 +269,18 @@ struct r300_vertex_program_code {
 
 	uint32_t InputsRead;
 	uint32_t OutputsWritten;
-};
 
-void r300_vertex_program_dump(struct r300_vertex_program_code * vs);
+	unsigned int num_fc_ops;
+	uint32_t fc_ops;
+	union {
+	        uint32_t r300[R300_VS_MAX_FC_OPS];
+		struct {
+			uint32_t lw;
+			uint32_t uw;
+		} r500[R300_VS_MAX_FC_OPS];
+	} fc_op_addrs;
+	int32_t fc_loop_index[R300_VS_MAX_FC_OPS];
+};
 
 #endif /* RADEON_CODE_H */
 
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
index 1c8ba864a41..935dc9b0a80 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.c
@@ -307,3 +307,46 @@ void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsig
 	}
 }
 
+
+/**
+ * The FACE input in hardware contains 1 if it's a back face, 0 otherwise.
+ * Gallium and OpenGL define it the other way around.
+ *
+ * So let's just negate FACE at the beginning of the shader and rewrite the rest
+ * of the shader to read from the newly allocated temporary.
+ */
+void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face)
+{
+	unsigned tempregi = rc_find_free_temporary(c);
+	struct rc_instruction *inst_add;
+	struct rc_instruction *inst;
+
+	/* tmp.x = 1 - FACE.x: flip the hardware's face flag */
+	inst_add = rc_insert_new_instruction(c, &c->Program.Instructions);
+	inst_add->U.I.Opcode = RC_OPCODE_ADD;
+
+	inst_add->U.I.DstReg.File = RC_FILE_TEMPORARY;
+	inst_add->U.I.DstReg.Index = tempregi;
+	inst_add->U.I.DstReg.WriteMask = RC_MASK_X;
+
+	inst_add->U.I.SrcReg[0].File = RC_FILE_NONE;
+	inst_add->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_1111;
+
+	inst_add->U.I.SrcReg[1].File = RC_FILE_INPUT;
+	inst_add->U.I.SrcReg[1].Index = face;
+	inst_add->U.I.SrcReg[1].Swizzle = RC_SWIZZLE_XXXX;
+	inst_add->U.I.SrcReg[1].Negate = RC_MASK_XYZW;
+
+	for (inst = inst_add->Next; inst != &c->Program.Instructions; inst = inst->Next) {
+		const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
+		unsigned i;
+
+		for(i = 0; i < opcode->NumSrcRegs; i++) {
+			if (inst->U.I.SrcReg[i].File == RC_FILE_INPUT &&
+			    inst->U.I.SrcReg[i].Index == face) {
+				inst->U.I.SrcReg[i].File = RC_FILE_TEMPORARY;
+				inst->U.I.SrcReg[i].Index = tempregi;
+			}
+		}
+	}
+}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
index f15905d79d4..7c42eb3ae57 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_compiler.h
@@ -81,6 +81,7 @@ void rc_move_output(struct radeon_compiler * c, unsigned output, unsigned new_ou
 void rc_copy_output(struct radeon_compiler * c, unsigned output, unsigned dup_output);
 void rc_transform_fragment_wpos(struct radeon_compiler * c, unsigned wpos, unsigned new_input,
                                 int full_vtransform);
+void rc_transform_fragment_face(struct radeon_compiler *c, unsigned face);
 
 struct r300_fragment_program_compiler {
 	struct radeon_compiler Base;
@@ -110,8 +111,12 @@ struct r300_vertex_program_compiler {
 
 	void * UserData;
 	void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c);
+
+	int PredicateIndex;
+	unsigned int PredicateMask;
 };
 
 void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c);
+void r300_vertex_program_dump(struct r300_vertex_program_compiler * c);
 
 #endif /* RADEON_COMPILER_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
index fbb4235c223..faf531b412e 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_dataflow_deadcode.c
@@ -43,6 +43,12 @@ struct instruction_state {
 	unsigned char SrcReg[3];
 };
 
+struct loopinfo {
+	struct updatemask_state * Breaks;
+	unsigned int BreakCount;
+	unsigned int BreaksReserved;
+};
+
 struct branchinfo {
 	unsigned int HaveElse:1;
 
@@ -59,6 +65,10 @@ struct deadcode_state {
 	struct branchinfo * BranchStack;
 	unsigned int BranchStackSize;
 	unsigned int BranchStackReserved;
+
+	struct loopinfo * LoopStack;
+	unsigned int LoopStackSize;
+	unsigned int LoopStackReserved;
 };
 
 
@@ -78,6 +88,22 @@ static void or_updatemasks(
 	dst->Address = a->Address | b->Address;
 }
 
+static void push_break(struct deadcode_state *s)
+{
+	struct loopinfo * loop = &s->LoopStack[s->LoopStackSize - 1];
+	memory_pool_array_reserve(&s->C->Pool, struct updatemask_state,
+		loop->Breaks, loop->BreakCount, loop->BreaksReserved, 1);
+
+	memcpy(&loop->Breaks[loop->BreakCount++], &s->R, sizeof(s->R));
+}
+
+static void push_loop(struct deadcode_state * s)
+{
+	memory_pool_array_reserve(&s->C->Pool, struct loopinfo, s->LoopStack,
+			s->LoopStackSize, s->LoopStackReserved, 1);
+	memset(&s->LoopStack[s->LoopStackSize++], 0, sizeof(struct loopinfo));
+}
+
 static void push_branch(struct deadcode_state * s)
 {
 	memory_pool_array_reserve(&s->C->Pool, struct branchinfo, s->BranchStack,
@@ -233,11 +259,22 @@ void rc_dataflow_deadcode(struct radeon_compiler * c, rc_dataflow_mark_outputs_f
 					}
 				}
 			}
+			push_loop(&s);
 			break;
 		}
-		case RC_OPCODE_CONTINUE:
 		case RC_OPCODE_BRK:
+			push_break(&s);
+			break;
 		case RC_OPCODE_BGNLOOP:
+		{
+			unsigned int i;
+			struct loopinfo * loop = &s.LoopStack[s.LoopStackSize-1];
+			for(i = 0; i < loop->BreakCount; i++) {
+				or_updatemasks(&s.R, &s.R, &loop->Breaks[i]);
+			}
+			break;
+		}
+		case RC_OPCODE_CONT:
 			break;
 		case RC_OPCODE_ENDIF:
 			push_branch(&s);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
index 131e9e7436d..32d4b45dd6d 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.c
@@ -39,7 +39,6 @@
 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
 
 struct const_value {
-	
 	struct radeon_compiler * C;
 	struct rc_src_register * Src;
 	float Value;
@@ -78,17 +77,17 @@ static int src_reg_is_immediate(struct rc_src_register * src,
 	c->Program.Constants.Constants[src->Index].Type==RC_CONSTANT_IMMEDIATE;
 }
 
-static unsigned int loop_calc_iterations(struct emulate_loop_state *s, 
-			struct loop_info * loop, unsigned int max_instructions)
+static unsigned int loop_max_possible_iterations(struct radeon_compiler *c,
+			struct loop_info * loop, unsigned int prog_inst_limit)
 {
-	unsigned int total_i = rc_recompute_ips(s->C);
+	unsigned int total_i = rc_recompute_ips(c);
 	unsigned int loop_i = (loop->EndLoop->IP - loop->BeginLoop->IP) - 1;
 	/* +1 because the program already has one iteration of the loop. */
-	return 1 + ((max_instructions - total_i) / (s->LoopCount * loop_i));
+	return 1 + ((prog_inst_limit - total_i) / loop_i);
 }
 
-static void loop_unroll(struct emulate_loop_state * s,
-			struct loop_info *loop, unsigned int iterations)
+static void unroll_loop(struct radeon_compiler * c, struct loop_info * loop,
+						unsigned int iterations)
 {
 	unsigned int i;
 	struct rc_instruction * ptr;
@@ -99,7 +98,7 @@ static void loop_unroll(struct emulate_loop_state * s,
 	rc_remove_instruction(loop->EndLoop);
 	for( i = 1; i < iterations; i++){
 		for(ptr = first; ptr != last->Next; ptr = ptr->Next){
-			struct rc_instruction *new = rc_alloc_instruction(s->C);
+			struct rc_instruction *new = rc_alloc_instruction(c);
 			memcpy(new, ptr, sizeof(struct rc_instruction));
 			rc_insert_instruction(append_to, new);
 			append_to = new;
@@ -115,7 +114,7 @@ static void update_const_value(void * data, struct rc_instruction * inst,
 	if(value->Src->File != file ||
 	   value->Src->Index != index ||
 	   !(1 << GET_SWZ(value->Src->Swizzle, 0) & mask)){
-	   	return;
+		return;
 	}
 	switch(inst->U.I.Opcode){
 	case RC_OPCODE_MOV:
@@ -140,7 +139,7 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 	if(file != RC_FILE_TEMPORARY ||
 	   count_inst->Index != index ||
 	   (1 << GET_SWZ(count_inst->Swz,0) != mask)){
-	   	return;
+		return;
 	}
 	/* Find the index of the counter register. */
 	opcode = rc_get_opcode_info(inst->U.I.Opcode);
@@ -185,13 +184,16 @@ static void get_incr_amount(void * data, struct rc_instruction * inst,
 		count_inst->Unknown = 1;
 		return;
 	}
-	
 }
 
-static int transform_const_loop(struct emulate_loop_state * s,
-						struct loop_info * loop)
+/**
+ * If prog_inst_limit is -1, then all eligible loops will be unrolled regardless
+ * of how many iterations they have.
+ */
+static int try_unroll_loop(struct radeon_compiler * c, struct loop_info * loop,
+						unsigned int prog_inst_limit)
 {
-	int end_loops = 1;
+	int end_loops;
 	int iterations;
 	struct count_inst count_inst;
 	float limit_value;
@@ -201,12 +203,12 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	struct rc_instruction * inst;
 
 	/* Find the counter and the upper limit */
-	
-	if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], s->C)){
+
+	if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[0], c)){
 		limit = &loop->Cond->U.I.SrcReg[0];
 		counter = &loop->Cond->U.I.SrcReg[1];
 	}
-	else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], s->C)){
+	else if(src_reg_is_immediate(&loop->Cond->U.I.SrcReg[1], c)){
 		limit = &loop->Cond->U.I.SrcReg[1];
 		counter = &loop->Cond->U.I.SrcReg[0];
 	}
@@ -214,13 +216,13 @@ static int transform_const_loop(struct emulate_loop_state * s,
 		DBG("No constant limit.\n");
 		return 0;
 	}
-	
+
 	/* Find the initial value of the counter */
 	counter_value.Src = counter;
 	counter_value.Value = 0.0f;
 	counter_value.HasValue = 0;
-	counter_value.C = s->C;
-	for(inst = s->C->Program.Instructions.Next; inst != loop->BeginLoop;
+	counter_value.C = c;
+	for(inst = c->Program.Instructions.Next; inst != loop->BeginLoop;
 							inst = inst->Next){
 		rc_for_all_writes_mask(inst, update_const_value, &counter_value);
 	}
@@ -230,11 +232,12 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	}
 	DBG("Initial counter value is %f\n", counter_value.Value);
 	/* Determine how the counter is modified each loop */
-	count_inst.C = s->C;
+	count_inst.C = c;
 	count_inst.Index = counter->Index;
 	count_inst.Swz = counter->Swizzle;
 	count_inst.Amount = 0.0f;
 	count_inst.Unknown = 0;
+	end_loops = 1;
 	for(inst = loop->BeginLoop->Next; end_loops > 0; inst = inst->Next){
 		switch(inst->U.I.Opcode){
 		/* XXX In the future we might want to try to unroll nested
@@ -246,6 +249,16 @@ static int transform_const_loop(struct emulate_loop_state * s,
 			loop->EndLoop = inst;
 			end_loops--;
 			break;
+		case RC_OPCODE_BRK:
+			/* Don't unroll the loop if it has a BRK instruction
+			 * other than the one used when testing the main
+			 * conditional of the loop. */
+
+			/* Make sure we haven't entered a nested loop. */
+			if(inst != loop->Brk && end_loops == 1) {
+				return 0;
+			}
+			break;
 		/* XXX Check if the counter is modified within an if statement.
 		 */
 		case RC_OPCODE_IF:
@@ -266,17 +279,20 @@ static int transform_const_loop(struct emulate_loop_state * s,
 	/* Calculate the number of iterations of this loop.  Keeping this
 	 * simple, since we only support increment and decrement loops.
 	 */
-	limit_value = get_constant_value(s->C, limit, 0);
+	limit_value = get_constant_value(c, limit, 0);
 	DBG("Limit is %f.\n", limit_value);
+	/* The iteration calculations are opposite of what you would expect.
+	 * In a normal loop, if the condition is met, then the loop continues,
+	 * but with our loops, if the condition is met, the loop is exited. */
 	switch(loop->Cond->U.I.Opcode){
-	case RC_OPCODE_SGT:
-	case RC_OPCODE_SLT:
+	case RC_OPCODE_SGE:
+	case RC_OPCODE_SLE:
 		iterations = (int) ceilf((limit_value - counter_value.Value) /
 							count_inst.Amount);
 		break;
 
-	case RC_OPCODE_SLE:
-	case RC_OPCODE_SGE:
+	case RC_OPCODE_SGT:
+	case RC_OPCODE_SLT:
 		iterations = (int) floorf((limit_value - counter_value.Value) /
 							count_inst.Amount) + 1;
 		break;
@@ -284,77 +300,85 @@ static int transform_const_loop(struct emulate_loop_state * s,
 		return 0;
 	}
 
+	if (prog_inst_limit > 0
+		&& iterations > loop_max_possible_iterations(c, loop,
+							prog_inst_limit)) {
+		return 0;
+	}
+
 	DBG("Loop will have %d iterations.\n", iterations);
-	
+
 	/* Prepare loop for unrolling */
 	rc_remove_instruction(loop->Cond);
 	rc_remove_instruction(loop->If);
 	rc_remove_instruction(loop->Brk);
 	rc_remove_instruction(loop->EndIf);
-	
-	loop_unroll(s, loop, iterations);
+
+	unroll_loop(c, loop, iterations);
 	loop->EndLoop = NULL;
 	return 1;
 }
 
-/** 
- * This function prepares a loop to be unrolled by converting it into an if
- * statement.  Here is an outline of the conversion process:
- * BGNLOOP;                         	-> BGNLOOP;
- * <Additional conditional code>	-> <Additional conditional code>
- * SGE/SLT temp[0], temp[1], temp[2];	-> SLT/SGE temp[0], temp[1], temp[2];
- * IF temp[0];                      	-> IF temp[0];
- * BRK;                             	->
- * ENDIF;                           	-> <Loop Body>
- * <Loop Body>                      	-> ENDIF;
- * ENDLOOP;                         	-> ENDLOOP
- *
+/**
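+ * @param c Compiler whose instruction list is scanned and errors reported to.
+ * @param loop Output; filled in with the instructions that make up the loop.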
  * @param inst A pointer to a BGNLOOP instruction.
- * @return If the loop can be unrolled, a pointer to the first instruction of
- * 		the unrolled loop.
- * 	   Otherwise, A pointer to the ENDLOOP instruction.
- * 	   Null if there is an error.
+ * @return 1 if all of the members of loop were set.
+ * @return 0 if there was an error and some members of loop are still NULL.
  */
-static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
+static int build_loop_info(struct radeon_compiler * c, struct loop_info * loop,
 						struct rc_instruction * inst)
 {
-	struct loop_info *loop;
 	struct rc_instruction * ptr;
 
-	memory_pool_array_reserve(&s->C->Pool, struct loop_info,
-			s->Loops, s->LoopCount, s->LoopReserved, 1);
-
-	loop = &s->Loops[s->LoopCount++];
-	memset(loop, 0, sizeof(struct loop_info));
 	if(inst->U.I.Opcode != RC_OPCODE_BGNLOOP){
-		rc_error(s->C, "expected BGNLOOP\n", __FUNCTION__);
-		return NULL;
+		rc_error(c, "%s: expected BGNLOOP", __FUNCTION__);
+		return 0;
 	}
+
+	memset(loop, 0, sizeof(struct loop_info));
+
 	loop->BeginLoop = inst;
 
-	for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next){
+	for(ptr = loop->BeginLoop->Next; !loop->EndLoop; ptr = ptr->Next) {
+
+		if (ptr == &c->Program.Instructions) {
+			rc_error(c, "%s: BGNLOOP without an ENDLOOOP.\n",
+								__FUNCTION__);
+			return 0;
+		}
+
 		switch(ptr->U.I.Opcode){
 		case RC_OPCODE_BGNLOOP:
-			/* Nested loop */
-			ptr = transform_loop(s, ptr);
-			if(!ptr){
-				return NULL;
+		{
+			/* Nested loop, skip ahead to the end. */
+			unsigned int loop_depth = 1;
+			for(ptr = ptr->Next; ptr != &c->Program.Instructions;
+							ptr = ptr->Next){
+				if (ptr->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					loop_depth++;
+				} else if (ptr->U.I.Opcode == RC_OPCODE_ENDLOOP) {
+					if (!--loop_depth) {
+						break;
+					}
+				}
+			}
+			if (ptr == &c->Program.Instructions) {
+				rc_error(c, "%s: BGNLOOP without an ENDLOOOP\n",
+								__FUNCTION__);
+					return 0;
 			}
 			break;
+		}
 		case RC_OPCODE_BRK:
-			loop->Brk = ptr;
-			if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF){
-				rc_error(s->C,
-					"%s: expected ENDIF\n",__FUNCTION__);
-				return NULL;
-			}
-			loop->EndIf = ptr->Next;
-			if(ptr->Prev->U.I.Opcode != RC_OPCODE_IF){
-				rc_error(s->C,
-					"%s: expected IF\n", __FUNCTION__);
-				return NULL;
+			if(ptr->Next->U.I.Opcode != RC_OPCODE_ENDIF
+					|| ptr->Prev->U.I.Opcode != RC_OPCODE_IF
+					|| loop->Brk){
+				continue;
 			}
+			loop->Brk = ptr;
 			loop->If = ptr->Prev;
+			loop->EndIf = ptr->Next;
 			switch(loop->If->Prev->U.I.Opcode){
 			case RC_OPCODE_SLT:
 			case RC_OPCODE_SGE:
@@ -364,18 +388,58 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 			case RC_OPCODE_SNE:
 				break;
 			default:
-				rc_error(s->C, "%s expected conditional\n",
+				rc_error(c, "%s: expected conditional",
 								__FUNCTION__);
-				return NULL;
+				return 0;
 			}
 			loop->Cond = loop->If->Prev;
-			ptr = loop->EndIf;
 			break;
+
 		case RC_OPCODE_ENDLOOP:
 			loop->EndLoop = ptr;
 			break;
 		}
 	}
+
+	if (loop->BeginLoop && loop->Brk && loop->If && loop->EndIf
+					&& loop->Cond && loop->EndLoop) {
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * This function prepares a loop to be unrolled by converting it into an if
+ * statement.  Here is an outline of the conversion process:
+ * BGNLOOP;                         	-> BGNLOOP;
+ * <Additional conditional code>	-> <Additional conditional code>
+ * SGE/SLT temp[0], temp[1], temp[2];	-> SLT/SGE temp[0], temp[1], temp[2];
+ * IF temp[0];                      	-> IF temp[0];
+ * BRK;                             	->
+ * ENDIF;                           	-> <Loop Body>
+ * <Loop Body>                      	-> ENDIF;
+ * ENDLOOP;                         	-> ENDLOOP
+ *
+ * @param inst A pointer to a BGNLOOP instruction.
+ * @return 1 for success, 0 for failure
+ */
+static int transform_loop(struct emulate_loop_state * s,
+						struct rc_instruction * inst)
+{
+	struct loop_info * loop;
+
+	memory_pool_array_reserve(&s->C->Pool, struct loop_info,
+			s->Loops, s->LoopCount, s->LoopReserved, 1);
+
+	loop = &s->Loops[s->LoopCount++];
+
+	if (!build_loop_info(s->C, loop, inst))
+		return 0;
+
+	if(try_unroll_loop(s->C, loop, s->prog_inst_limit)){
+		return 1;
+	}
+
 	/* Reverse the conditional instruction */
 	switch(loop->Cond->U.I.Opcode){
 	case RC_OPCODE_SGE:
@@ -398,43 +462,51 @@ static struct rc_instruction * transform_loop(struct emulate_loop_state * s,
 		break;
 	default:
 		rc_error(s->C, "loop->Cond is not a conditional.\n");
-		return NULL;
-	}
-	
-	/* Check if the number of loops is known at compile time. */
-	if(transform_const_loop(s, loop)){
-		return loop->BeginLoop->Next;
+		return 0;
 	}
 
-	/* Prepare the loop to be unrolled */
+	/* Prepare the loop to be emulated */
 	rc_remove_instruction(loop->Brk);
 	rc_remove_instruction(loop->EndIf);
 	rc_insert_instruction(loop->EndLoop->Prev, loop->EndIf);
-	return loop->EndLoop;
+	return 1;
 }
 
-void rc_transform_unroll_loops(struct radeon_compiler *c,
-					struct emulate_loop_state * s)
+void rc_transform_loops(struct radeon_compiler *c,
+			struct emulate_loop_state * s, int prog_inst_limit)
 {
 	struct rc_instruction * ptr;
-	
+
 	memset(s, 0, sizeof(struct emulate_loop_state));
 	s->C = c;
-	ptr = s->C->Program.Instructions.Next;
-	while(ptr != &s->C->Program.Instructions) {
+	s->prog_inst_limit = prog_inst_limit;
+	for(ptr = s->C->Program.Instructions.Next;
+			ptr != &s->C->Program.Instructions; ptr = ptr->Next) {
 		if(ptr->Type == RC_INSTRUCTION_NORMAL &&
 					ptr->U.I.Opcode == RC_OPCODE_BGNLOOP){
-			ptr = transform_loop(s, ptr);
-			if(!ptr){
+			if (!transform_loop(s, ptr))
 				return;
+		}
+	}
+}
+
+void rc_unroll_loops(struct radeon_compiler *c, int prog_inst_limit)
+{
+	struct rc_instruction * inst;
+	struct loop_info loop;
+
+	for(inst = c->Program.Instructions.Next;
+			inst != &c->Program.Instructions; inst = inst->Next) {
+
+		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+			if (build_loop_info(c, &loop, inst)) {
+				try_unroll_loop(c, &loop, prog_inst_limit);
 			}
 		}
-		ptr = ptr->Next;
 	}
 }
 
-void rc_emulate_loops(struct emulate_loop_state *s,
-						unsigned int max_instructions)
+void rc_emulate_loops(struct emulate_loop_state *s, int prog_inst_limit)
 {
 	int i;
 	/* Iterate backwards of the list of loops so that loops that nested
@@ -444,8 +516,8 @@ void rc_emulate_loops(struct emulate_loop_state *s,
 		if(!s->Loops[i].EndLoop){
 			continue;
 		}
-		unsigned int iterations = loop_calc_iterations(s, &s->Loops[i],
-							max_instructions);
-		loop_unroll(s, &s->Loops[i], iterations);
+		unsigned int iterations = loop_max_possible_iterations(
+					s->C, &s->Loops[i], prog_inst_limit);
+		unroll_loop(s->C, &s->Loops[i], iterations);
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
index 7748813c4eb..bba1f68e308 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_emulate_loops.h
@@ -21,12 +21,14 @@ struct emulate_loop_state {
 	struct loop_info * Loops;
 	unsigned int LoopCount;
 	unsigned int LoopReserved;
+	int prog_inst_limit;
 };
 
-void rc_transform_unroll_loops(struct radeon_compiler *c,
-					struct emulate_loop_state * s);
+void rc_transform_loops(struct radeon_compiler *c,
+			struct emulate_loop_state * s, int prog_inst_limit);
 
-void rc_emulate_loops(struct emulate_loop_state *s,
-					unsigned int max_instructions);
+void rc_unroll_loops(struct radeon_compiler * c, int prog_inst_limit);
+
+void rc_emulate_loops(struct emulate_loop_state * s, int prog_inst_limit);
 
 #endif /* RADEON_EMULATE_LOOPS_H */
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
index 04f234f11d8..2ea830be7f9 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.c
@@ -386,8 +386,8 @@ struct rc_opcode_info rc_opcodes[MAX_RC_OPCODE] = {
 		.NumSrcRegs = 0,
 	},
 	{
-		.Opcode = RC_OPCODE_CONTINUE,
-		.Name = "CONTINUE",
+		.Opcode = RC_OPCODE_CONT,
+		.Name = "CONT",
 		.IsFlowControl = 1,
 		.NumSrcRegs = 0
 	},
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
index 8b9fa07dde2..6e18d6eb3f1 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_opcodes.h
@@ -187,7 +187,7 @@ typedef enum {
 
 	RC_OPCODE_ENDLOOP,
 
-	RC_OPCODE_CONTINUE,
+	RC_OPCODE_CONT,
 
 	/** special instruction, used in R300-R500 fragment program pair instructions
 	 * indicates that the result of the alpha operation shall be replicated
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
index eca06515367..7a3f35950a6 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_optimize.c
@@ -164,7 +164,8 @@ static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mo
 	    inst = inst->Next) {
 		/* XXX In the future we might be able to make the optimizer
 		 * smart enough to handle loops. */
-		if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP){
+		if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP
+				|| inst->U.I.Opcode == RC_OPCODE_ENDLOOP){
 			return;
 		}
 		rc_for_all_reads_mask(inst, peephole_scan_read, &s);
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
index 8a912da4613..ce72cd97ab2 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_pair_regalloc.c
@@ -65,6 +65,11 @@ struct regalloc_state {
 
 	struct hardware_register * HwTemporary;
 	unsigned int NumHwTemporaries;
+	/**
+	 * If an instruction is inside of a loop, end_loop will be the
+	 * IP of the ENDLOOP instruction, otherwise end_loop will be 0
+	 */
+	int end_loop;
 };
 
 static void print_live_intervals(struct live_intervals * src)
@@ -178,10 +183,10 @@ static void scan_callback(void * data, struct rc_instruction * inst,
 		else
 			reg->Live.Start = inst->IP;
 		reg->Live.End = inst->IP;
-	} else {
-		if (inst->IP > reg->Live.End)
-			reg->Live.End = inst->IP;
-	}
+	} else if (s->end_loop)
+		reg->Live.End = s->end_loop;
+	else if (inst->IP > reg->Live.End)
+		reg->Live.End = inst->IP;
 }
 
 static void compute_live_intervals(struct regalloc_state * s)
@@ -191,6 +196,31 @@ static void compute_live_intervals(struct regalloc_state * s)
 	for(struct rc_instruction * inst = s->C->Program.Instructions.Next;
 	    inst != &s->C->Program.Instructions;
 	    inst = inst->Next) {
+
+		/* For all instructions inside of a loop, the ENDLOOP
+		 * instruction is used as the end of the live interval. */
+		if (inst->U.I.Opcode == RC_OPCODE_BGNLOOP && !s->end_loop) {
+			int loops = 1;
+			struct rc_instruction * tmp;
+			for(tmp = inst->Next;
+					tmp != &s->C->Program.Instructions;
+					tmp = tmp->Next) {
+				if (tmp->U.I.Opcode == RC_OPCODE_BGNLOOP) {
+					loops++;
+					break;
+				} else if (tmp->U.I.Opcode
+							== RC_OPCODE_ENDLOOP) {
+					if(!--loops) {
+						s->end_loop = tmp->IP;
+						break;
+					}
+				}
+			}
+		}
+
+		if (inst->IP == s->end_loop)
+			s->end_loop = 0;
+
 		rc_for_all_reads_mask(inst, scan_callback, s);
 		rc_for_all_writes_mask(inst, scan_callback, s);
 	}
diff --git a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
index 3cc28972934..857aae55145 100644
--- a/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
+++ b/src/mesa/drivers/dri/r300/compiler/radeon_program_alu.c
@@ -988,17 +988,22 @@ void radeonTransformKILP(struct radeon_compiler * c)
 	for (inst = c->Program.Instructions.Next;
 			inst != &c->Program.Instructions; inst = inst->Next) {
 
-		if (inst->U.I.Opcode != RC_OPCODE_KILP
-			|| inst->Prev->U.I.Opcode != RC_OPCODE_IF
-			|| inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
+		if (inst->U.I.Opcode != RC_OPCODE_KILP)
 			continue;
-		}
+
 		inst->U.I.Opcode = RC_OPCODE_KIL;
-		inst->U.I.SrcReg[0] = negate(absolute(inst->Prev->U.I.SrcReg[0]));
 
-		/* Remove IF */
-		rc_remove_instruction(inst->Prev);
-		/* Remove ENDIF */
-		rc_remove_instruction(inst->Next);
+		if (inst->Prev->U.I.Opcode != RC_OPCODE_IF
+				|| inst->Next->U.I.Opcode != RC_OPCODE_ENDIF) {
+			inst->U.I.SrcReg[0] = negate(builtin_one);
+		} else {
+
+			inst->U.I.SrcReg[0] =
+				negate(absolute(inst->Prev->U.I.SrcReg[0]));
+			/* Remove IF */
+			rc_remove_instruction(inst->Prev);
+			/* Remove ENDIF */
+			rc_remove_instruction(inst->Next);
+		}
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index e4b302bbad9..3d2f8928fa6 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -461,7 +461,7 @@ static void r300InitGLExtensions(GLcontext *ctx)
 	if (!r300->radeon.radeonScreen->drmSupportsOcclusionQueries) {
 		_mesa_disable_extension(ctx, "GL_ARB_occlusion_query");
 	}
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV350)
+        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_R420)
   		_mesa_enable_extension(ctx, "GL_ARB_half_float_vertex");
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index f25264b6f2d..f7705b0f6fe 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -441,6 +441,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_GB_HORZ_CLIP_ADJ                   0x2228
 #define R300_VAP_GB_HORZ_DISC_ADJ                   0x222c
 
+#define R300_VAP_PVS_FLOW_CNTL_ADDRS_0      0x2230
+#define R300_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R300_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 8)
+#define R300_PVS_FC_LAST_INST(x)            ((x) << 16)
+#define R300_PVS_FC_RTN_INST(x)             ((x) << 24)
+
 /* gap */
 
 /* Sometimes, END_OF_PKT and 0x2284=0 are the only commands sent between
@@ -459,6 +465,10 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_2288_R300                    0x00750000 /* -- nh */
 #       define R300_2288_RV350                   0x0000FFFF /* -- Vladimir */
 
+#define R300_VAP_PVS_FLOW_CNTL_LOOP_INDEX_0 0x2290
+#define R300_PVS_FC_LOOP_INIT_VAL(x)        ((x) << 0)
+#define R300_PVS_FC_LOOP_STEP_VAL(x)        ((x) << 8)
+
 /* gap */
 
 /* Addresses are relative to the vertex program instruction area of the
@@ -489,6 +499,9 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_VAP_PVS_CODE_CNTL_1	    0x22D8
 #       define R300_PVS_LAST_VTX_SRC_INST_SHIFT  0
 #define R300_VAP_PVS_FLOW_CNTL_OPC          0x22DC
+#define R300_VAP_PVS_FC_OPC_JUMP(x)         (1 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_LOOP(x)         (2 << (2 * (x)))
+#define R300_VAP_PVS_FC_OPC_JSR(x)          (3 << (2 * (x)))
 
 /* The entire range from 0x2300 to 0x2AC inclusive seems to be used for
  * immediate vertices
@@ -505,6 +518,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* write 0 to indicate end of packet? */
 #define R300_VAP_VTX_END_OF_PKT             0x24AC
 
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_LW_0   0x2500
+#define R500_PVS_FC_ACT_ADRS(x)             ((x) << 0)
+#define R500_PVS_FC_LOOP_CNT_JMP_INST(x)    ((x) << 16)
+
+#define R500_VAP_PVS_FLOW_CNTL_ADDRS_UW_0   0x2504
+#define R500_PVS_FC_LAST_INST(x)            ((x) << 0)
+#define R500_PVS_FC_RTN_INST(x)             ((x) << 16)
+
 /* gap */
 
 /* These are values from r300_reg/r300_reg.h - they are known to be correct
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index bb8f91491f5..cf89ab7ec3d 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -327,6 +327,8 @@ void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
 
+	radeon_prepare_render(&rmesa->radeon);
+
 	type = r300PrimitiveType(rmesa, prim);
 	num_verts = r300NumVerts(rmesa, end - start, prim);
 
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index 4ba6740e3d9..94588698265 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -152,8 +152,8 @@ int32_t r300TranslateTexFormat(gl_format mesaFormat)
 		case MESA_FORMAT_Z32:
 			return R300_EASY_TX_FORMAT(X, X, X, X, X32);
 		/* EXT_texture_sRGB */
-		case MESA_FORMAT_SRGBA8:
-			return R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA;
+		case MESA_FORMAT_SARGB8:
+			return R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA;
 		case MESA_FORMAT_SLA8:
 			return R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA;
 		case MESA_FORMAT_SL8:
diff --git a/src/mesa/drivers/dri/r600/r600_blit.c b/src/mesa/drivers/dri/r600/r600_blit.c
index 172f85eb264..27acff9c166 100644
--- a/src/mesa/drivers/dri/r600/r600_blit.c
+++ b/src/mesa/drivers/dri/r600/r600_blit.c
@@ -72,7 +72,7 @@ unsigned r600_check_blit(gl_format mesa_format)
     case MESA_FORMAT_Z24_S8:
     case MESA_FORMAT_Z16:
     case MESA_FORMAT_Z32:
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
     case MESA_FORMAT_SLA8:
     case MESA_FORMAT_SL8:
 	    break;
@@ -320,9 +320,9 @@ set_render_target(context_t *context, struct radeon_bo *bo, gl_format mesa_forma
 	    CLEARbit(cb_color0_info, SOURCE_FORMAT_bit);
 	    SETfield(cb_color0_info, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
             break;
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
             format = COLOR_8_8_8_8;
-            comp_swap = SWAP_STD_REV;
+            comp_swap = SWAP_ALT;
 	    SETbit(cb_color0_info, SOURCE_FORMAT_bit);
 	    SETfield(cb_color0_info, NUMBER_SRGB, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
             break;
@@ -390,13 +390,20 @@ set_render_target(context_t *context, struct radeon_bo *bo, gl_format mesa_forma
 			 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0);
     END_BATCH();
 
-    BEGIN_BATCH_NO_AUTOSTATE(12);
+    BEGIN_BATCH_NO_AUTOSTATE(9);
     R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), cb_color0_size);
     R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), cb_color0_view);
-    R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), cb_color0_info);
     R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), 0);
     END_BATCH();
 
+    BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+    R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), cb_color0_info);
+    R600_OUT_BATCH_RELOC(0,
+			 bo,
+			 0,
+			 0, RADEON_GEM_DOMAIN_VRAM | RADEON_GEM_DOMAIN_GTT, 0);
+    END_BATCH();
+
     COMMIT_BATCH();
 
 }
@@ -1043,17 +1050,17 @@ set_tex_resource(context_t * context,
 	    SETfield(sq_tex_resource4, SQ_SEL_X,
 		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 	    break;
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
 	    SETfield(sq_tex_resource1, FMT_8_8_8_8,
 		     SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-	    SETfield(sq_tex_resource4, SQ_SEL_W,
-		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 	    SETfield(sq_tex_resource4, SQ_SEL_Z,
-		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 	    SETfield(sq_tex_resource4, SQ_SEL_Y,
-		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
 	    SETfield(sq_tex_resource4, SQ_SEL_X,
+		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+	    SETfield(sq_tex_resource4, SQ_SEL_W,
 		     SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 	    SETbit(sq_tex_resource4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
 	    break;
@@ -1477,7 +1484,6 @@ set_default_state(context_t *context)
                          (CLRCMP_SEL_SRC << CLRCMP_FCN_SEL_shift));
     R600_OUT_BATCH_REGVAL(SQ_VTX_BASE_VTX_LOC, 0);
     R600_OUT_BATCH_REGVAL(SQ_VTX_START_INST_LOC, 0);
-    R600_OUT_BATCH_REGVAL(DB_DEPTH_INFO, 0);
     R600_OUT_BATCH_REGVAL(DB_DEPTH_CONTROL, 0);
     R600_OUT_BATCH_REGVAL(CB_SHADER_MASK, (OUTPUT0_ENABLE_mask));
     R600_OUT_BATCH_REGVAL(CB_TARGET_MASK, (TARGET0_ENABLE_mask));
@@ -1526,6 +1532,7 @@ set_default_state(context_t *context)
     R600_OUT_BATCH(0);
 
     R600_OUT_BATCH_REGVAL(VGT_STRMOUT_BUFFER_EN, 0);
+    R600_OUT_BATCH_REGVAL(SX_ALPHA_TEST_CONTROL, 0);
 
     END_BATCH();
     COMMIT_BATCH();
@@ -1607,7 +1614,7 @@ unsigned r600_blit(GLcontext *ctx,
     /* Flush is needed to make sure that source buffer has correct data */
     radeonFlush(ctx);
 
-    rcommonEnsureCmdBufSpace(&context->radeon, 304, __FUNCTION__);
+    rcommonEnsureCmdBufSpace(&context->radeon, 308, __FUNCTION__);
 
     /* load shaders */
     load_shaders(context->radeon.glCtx);
@@ -1632,7 +1639,7 @@ unsigned r600_blit(GLcontext *ctx,
     set_tex_sampler(context);
 
     /* dst */
-    /* 27 */
+    /* 31 */
     set_render_target(context, dst_bo, dst_mesaformat,
 		      dst_pitch, dst_width, dst_height, dst_offset);
     /* scissors */
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
index 84d9d423124..389b0412baa 100644
--- a/src/mesa/drivers/dri/r600/r600_context.c
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -72,6 +72,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R600_ENABLE_GLSL_TEST 1
 
 #define need_GL_VERSION_2_0
+#define need_GL_VERSION_2_1
+#define need_GL_ARB_draw_elements_base_vertex
 #define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_vertex_program
@@ -140,6 +142,7 @@ static const struct dri_extension card_extensions[] = {
   {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
   {"GL_SGIS_generate_mipmap",		NULL},
   {"GL_ARB_pixel_buffer_object",        NULL},
+  {"GL_ARB_draw_elements_base_vertex",	GL_ARB_draw_elements_base_vertex_functions },
   {NULL,				NULL}
   /* *INDENT-ON* */
 };
@@ -157,6 +160,7 @@ static const struct dri_extension mm_extensions[] = {
 static const struct dri_extension gl_20_extension[] = {
 #ifdef R600_ENABLE_GLSL_TEST
     {"GL_ARB_shading_language_100",			GL_VERSION_2_0_functions },
+    {"GL_ARB_shading_language_120",			GL_VERSION_2_1_functions },
 #else
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 #endif /* R600_ENABLE_GLSL_TEST */
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
index 41419f84601..512a52ede3e 100644
--- a/src/mesa/drivers/dri/r600/r600_tex.c
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -431,7 +431,7 @@ unsigned r600IsFormatRenderable(gl_format mesa_format)
 	case MESA_FORMAT_Z24_S8:
 	case MESA_FORMAT_Z16:
 	case MESA_FORMAT_Z32:
-	case MESA_FORMAT_SRGBA8:
+	case MESA_FORMAT_SARGB8:
 	case MESA_FORMAT_SLA8:
 	case MESA_FORMAT_SL8:
 		return 1;
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
index 1600033b9bd..ba3690b70ed 100644
--- a/src/mesa/drivers/dri/r600/r600_texstate.c
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -605,17 +605,17 @@ static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, gl_format mesa
 		}
 		break;
 	/* EXT_texture_sRGB */
-	case MESA_FORMAT_SRGBA8:
+	case MESA_FORMAT_SARGB8:
 		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
 			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
 
-		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Z,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_Y,
-			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask);
 		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_X,
+			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask);
+		SETfield(t->SQ_TEX_RESOURCE4, SQ_SEL_W,
 			 SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift, SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask);
 		SETbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
 		break;
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
index 99a33df4fcb..9c954cbf70c 100644
--- a/src/mesa/drivers/dri/r600/r700_assembler.c
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -275,7 +275,10 @@ GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
                 case 2:
                     format = FMT_8_8; break;
                 case 3:
-                    format = FMT_8_8_8; break;
+                    /* for some (small/unaligned) strides using 4 comps works
+                     * better, probably same as GL_SHORT below
+                     * test piglit/draw-vertices */
+                    format = FMT_8_8_8_8; break;
                 case 4:
                     format = FMT_8_8_8_8; break;
                 default:
@@ -2872,25 +2875,92 @@ GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
 
 GLboolean assemble_TRIG(r700_AssemblerBase *pAsm, BITS opcode)
 {
+    /* 
+     * r600 - trunc to -PI..PI range
+     * r700 - normalize by dividing by 2PI
+     * see fdo bug 27901
+     */
+  
     int tmp;
     checkop1(pAsm);
 
     tmp = gethelpr(pAsm);
 
-    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
     pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
     pAsm->D.dst.reg    = tmp;
-    pAsm->D.dst.writex = 1;
 
     assemble_src(pAsm, 0, -1);
 
     pAsm->S[1].src.rtype = SRC_REC_LITERAL;
     setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+    
+    pAsm->S[2].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y);
+
     pAsm->D2.dst2.literal_slots = 1;
     pAsm->C[0].f = 1/(3.1415926535 * 2);
-    pAsm->C[1].f = 0.0F;
-    next_ins(pAsm);
+    pAsm->C[1].f = 0.5f;
+    
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FRACT;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+
+    if(( GL_FALSE == next_ins(pAsm) ))
+    {
+        return GL_FALSE;
+    }
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+
+    pAsm->S[2].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y);
+
+    pAsm->D2.dst2.literal_slots = 1;
+
+    if (pAsm->bR6xx)
+    {
+       pAsm->C[0].f = 3.1415926535897f * 2.0f;
+       pAsm->C[1].f = -3.1415926535897f;
+    }
+    else 
+    {
+       pAsm->C[0].f = 1.0f;
+       pAsm->C[1].f = -0.5f;
+    }
+
+    if(( GL_FALSE == next_ins(pAsm) ))
+    {
+        return GL_FALSE;
+    }
 
     pAsm->D.dst.opcode = opcode;
     pAsm->D.dst.math = 1;
@@ -4030,22 +4100,79 @@ GLboolean assemble_SCS(r700_AssemblerBase *pAsm)
     checkop1(pAsm);
 
     tmp = gethelpr(pAsm);
-    /* tmp.x = src /2*PI */
-    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
     setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
     pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
     pAsm->D.dst.reg    = tmp;
-    pAsm->D.dst.writex = 1;
 
     assemble_src(pAsm, 0, -1);
 
     pAsm->S[1].src.rtype = SRC_REC_LITERAL;
     setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+
+    pAsm->S[2].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y);
+
     pAsm->D2.dst2.literal_slots = 1;
     pAsm->C[0].f = 1/(3.1415926535 * 2);
-    pAsm->C[1].f = 0.0F;
+    pAsm->C[1].f = 0.5F;
 
-    next_ins(pAsm);
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FRACT;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+
+    if(( GL_FALSE == next_ins(pAsm) ))
+    {
+        return GL_FALSE;
+    }
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+
+    pAsm->S[1].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[1].src), SQ_SEL_X);
+
+    pAsm->S[2].src.rtype = SRC_REC_LITERAL;
+    setswizzle_PVSSRC(&(pAsm->S[2].src), SQ_SEL_Y);
+
+    pAsm->D2.dst2.literal_slots = 1;
+
+    if(pAsm->bR6xx) {
+       pAsm->C[0].f = 3.1415926535897f * 2.0f;
+       pAsm->C[1].f = -3.1415926535897f;
+    } else {
+       pAsm->C[0].f = 1.0f;
+       pAsm->C[1].f = -0.5f;
+    }
+
+    if(( GL_FALSE == next_ins(pAsm) ))
+    {
+        return GL_FALSE;
+    }
 
     // COS dst.x,    a.x
     pAsm->D.dst.opcode = SQ_OP2_INST_COS;
@@ -6473,7 +6600,7 @@ GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
      * results are undefined anyway */
     if(export_count == 0)
     {
-        Process_Export(pR700AsmCode, SQ_EXPORT_PIXEL, 0, 1, 0, GL_FALSE);
+        Process_Export(pR700AsmCode, SQ_EXPORT_PIXEL, 0, 1, pR700AsmCode->starting_export_register_number, GL_FALSE);
     }
     
     if(pR700AsmCode->cf_last_export_ptr != NULL) 
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
index cefda3ac4ba..bf8063391a2 100644
--- a/src/mesa/drivers/dri/r600/r700_chip.c
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -265,17 +265,6 @@ static void r700SendVTXState(GLcontext *ctx, struct radeon_state_atom *atom)
     if (context->radeon.tcl.aos_count == 0)
 	    return;
 
-    BEGIN_BATCH_NO_AUTOSTATE(6);
-    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
-    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
-    R600_OUT_BATCH(0);
-
-    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
-    R600_OUT_BATCH(mmSQ_VTX_START_INST_LOC - ASIC_CTL_CONST_BASE_INDEX);
-    R600_OUT_BATCH(0);
-    END_BATCH();
-    COMMIT_BATCH();
-
     for(i=0; i<VERT_ATTRIB_MAX; i++) {
 	    if(vp->mesa_program->Base.InputsRead & (1 << i))
 	    {
@@ -523,9 +512,9 @@ static void r700SetRenderTarget(context_t *context, int id)
 		     CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
 	    CLEARbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit);
             break;
-    case MESA_FORMAT_SRGBA8:
+    case MESA_FORMAT_SARGB8:
             format = COLOR_8_8_8_8;
-            comp_swap = SWAP_STD_REV;
+            comp_swap = SWAP_ALT;
 	    number_type = NUMBER_SRGB;
 	    SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit);
             break;
@@ -617,18 +606,25 @@ static void r700SendDepthTargetState(GLcontext *ctx, struct radeon_state_atom *a
 
 	r700SetDepthTarget(context);
 
-        BEGIN_BATCH_NO_AUTOSTATE(8 + 2);
+        BEGIN_BATCH_NO_AUTOSTATE(7 + 2);
 	R600_OUT_BATCH_REGSEQ(DB_DEPTH_SIZE, 2);
 	R600_OUT_BATCH(r700->DB_DEPTH_SIZE.u32All);
 	R600_OUT_BATCH(r700->DB_DEPTH_VIEW.u32All);
-	R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 2);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 1);
 	R600_OUT_BATCH(r700->DB_DEPTH_BASE.u32All);
-	R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All);
 	R600_OUT_BATCH_RELOC(r700->DB_DEPTH_BASE.u32All,
 			     rrb->bo,
 			     r700->DB_DEPTH_BASE.u32All,
 			     0, RADEON_GEM_DOMAIN_VRAM, 0);
         END_BATCH();
+        BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_INFO, 1);
+	R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All);
+	R600_OUT_BATCH_RELOC(r700->DB_DEPTH_INFO.u32All,
+			     rrb->bo,
+			     r700->DB_DEPTH_INFO.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0);
+        END_BATCH();
 
 	if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) &&
 	    (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) {
@@ -687,27 +683,35 @@ static void r700SendRenderTargetState(GLcontext *ctx, struct radeon_state_atom *
 	BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
 	R600_OUT_BATCH_REGSEQ(CB_COLOR0_TILE + (4 * id), 1);
 	R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_TILE.u32All);
-	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All,
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_TILE.u32All,
 			     rrb->bo,
-			     r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     r700->render_target[id].CB_COLOR0_TILE.u32All,
 			     0, RADEON_GEM_DOMAIN_VRAM, 0);
 	END_BATCH();
 	BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
 	R600_OUT_BATCH_REGSEQ(CB_COLOR0_FRAG + (4 * id), 1);
 	R600_OUT_BATCH(r700->render_target[id].CB_COLOR0_FRAG.u32All);
-	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All,
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_FRAG.u32All,
 			     rrb->bo,
-			     r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     r700->render_target[id].CB_COLOR0_FRAG.u32All,
 			     0, RADEON_GEM_DOMAIN_VRAM, 0);
         END_BATCH();
 
-        BEGIN_BATCH_NO_AUTOSTATE(12);
+        BEGIN_BATCH_NO_AUTOSTATE(9);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), r700->render_target[id].CB_COLOR0_SIZE.u32All);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), r700->render_target[id].CB_COLOR0_VIEW.u32All);
-	R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All);
 	R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), r700->render_target[id].CB_COLOR0_MASK.u32All);
         END_BATCH();
 
+	BEGIN_BATCH_NO_AUTOSTATE(3 + 2);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All);
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_INFO.u32All,
+			     rrb->bo,
+			     r700->render_target[id].CB_COLOR0_INFO.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+        END_BATCH();
+
 	COMMIT_BATCH();
 
 }
@@ -1465,9 +1469,6 @@ static int check_vtx(GLcontext *ctx, struct radeon_state_atom *atom)
 	context_t *context = R700_CONTEXT(ctx);
 	int count = context->radeon.tcl.aos_count * 18;
 
-	if (count)
-		count += 6;
-
 	radeon_print(RADEON_STATE, RADEON_TRACE, "%s %d\n", __func__, count);
 	return count;
 }
@@ -1567,7 +1568,7 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(sq, always, 34, r700SendSQConfig);
 	ALLOC_STATE(db, always, 17, r700SendDBState);
 	ALLOC_STATE(stencil, always, 4, r700SendStencilState);
-	ALLOC_STATE(db_target, always, 12, r700SendDepthTargetState);
+	ALLOC_STATE(db_target, always, 16, r700SendDepthTargetState);
 	ALLOC_STATE(sc, always, 15, r700SendSCState);
 	ALLOC_STATE(scissor, always, 22, r700SendScissorState);
 	ALLOC_STATE(aa, always, 12, r700SendAAState);
@@ -1578,7 +1579,7 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(poly, always, 10, r700SendPolyState);
 	ALLOC_STATE(cb, cb, 18, r700SendCBState);
 	ALLOC_STATE(clrcmp, always, 6, r700SendCBCLRCMPState);
-	ALLOC_STATE(cb_target, always, 29, r700SendRenderTargetState);
+	ALLOC_STATE(cb_target, always, 31, r700SendRenderTargetState);
 	ALLOC_STATE(blnd, blnd, (6 + (R700_MAX_RENDER_TARGETS * 3)), r700SendCBBlendState);
 	ALLOC_STATE(blnd_clr, always, 6, r700SendCBBlendColorState);
 	ALLOC_STATE(sx, always, 9, r700SendSXState);
@@ -1590,7 +1591,7 @@ void r600InitAtoms(context_t *context)
 	ALLOC_STATE(ps, always, 24, r700SendPSState);
 	ALLOC_STATE(vs_consts, vs_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendVSConsts);
 	ALLOC_STATE(ps_consts, ps_consts, (2 + (R700_MAX_DX9_CONSTS * 4)), r700SendPSConsts);
-	ALLOC_STATE(vtx, vtx, (6 + (VERT_ATTRIB_MAX * 18)), r700SendVTXState);
+	ALLOC_STATE(vtx, vtx, (VERT_ATTRIB_MAX * 18), r700SendVTXState);
 	ALLOC_STATE(tx, tx, (R700_TEXTURE_NUMBERUNITS * 20), r700SendTexState);
 	ALLOC_STATE(tx_smplr, tx, (R700_TEXTURE_NUMBERUNITS * 5), r700SendTexSamplerState);
 	ALLOC_STATE(tx_brdr_clr, tx, (R700_TEXTURE_NUMBERUNITS * 6), r700SendTexBorderColorState);
diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c
index 09c48565b68..d1008f28b9b 100644
--- a/src/mesa/drivers/dri/r600/r700_clear.c
+++ b/src/mesa/drivers/dri/r600/r700_clear.c
@@ -48,6 +48,7 @@ static GLboolean r700ClearFast(context_t *context, GLbitfield mask)
 void r700Clear(GLcontext * ctx, GLbitfield mask)
 {
     context_t *context = R700_CONTEXT(ctx);
+    radeonContextPtr radeon = &context->radeon;
     __DRIdrawable *dPriv = radeon_get_drawable(&context->radeon);
     const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask[0]);
     GLbitfield swrast_mask = 0, tri_mask = 0;
@@ -60,6 +61,8 @@ void r700Clear(GLcontext * ctx, GLbitfield mask)
         context->radeon.front_buffer_dirty = GL_TRUE;
     }
 
+    radeon_prepare_render(radeon);
+
     if( GL_TRUE == r700ClearFast(context, mask) )
     {
         return;
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
index 1929b7cc129..c5771f9fd0b 100644
--- a/src/mesa/drivers/dri/r600/r700_render.c
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -244,7 +244,8 @@ static int r700NumVerts(int num_verts, int prim)
 	return num_verts - verts_off;
 }
 
-static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
+static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end,
+				   int prim, GLint basevertex)
 {
     context_t *context = R700_CONTEXT(ctx);
     BATCH_LOCALS(&context->radeon);
@@ -282,6 +283,7 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim
     total_emit =   3  /* VGT_PRIMITIVE_TYPE */
 	         + 2  /* VGT_INDEX_TYPE */
 	         + 2  /* NUM_INSTANCES */
+		 + 4  /* VTX_BASE_VTX_LOC + VTX_START_INST_LOC */
 	         + 5 + 2; /* DRAW_INDEX */
 
     BEGIN_BATCH_NO_AUTOSTATE(total_emit);
@@ -294,6 +296,11 @@ static void r700RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim
     // num instances
     R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
     R600_OUT_BATCH(1);
+    /* offset */
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 2));
+    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(basevertex); //VTX_BASE_VTX_LOC
+    R600_OUT_BATCH(0);          //VTX_START_INST_LOC
     // draw packet
     R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX, 3));
     R600_OUT_BATCH(context->ind_buf.bo_offset);
@@ -364,6 +371,7 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end,
     total_emit +=   3 /* VGT_PRIMITIVE_TYPE */
 	          + 2 /* VGT_INDEX_TYPE */
 	          + 2 /* NUM_INSTANCES */
+		  + 4 /* VTX_BASE_VTX_LOC + VTX_START_INST_LOC */
 	          + 3; /* DRAW */
 
     BEGIN_BATCH_NO_AUTOSTATE(total_emit);
@@ -376,6 +384,11 @@ static void r700RunRenderPrimitiveImmediate(GLcontext * ctx, int start, int end,
     // num instances
     R600_OUT_BATCH(CP_PACKET3(R600_IT_NUM_INSTANCES, 0));
     R600_OUT_BATCH(1);
+    /* offset */
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 2));
+    R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0); //VTX_BASE_VTX_LOC
+    R600_OUT_BATCH(0); //VTX_START_INST_LOC
     // draw packet
     if(start == 0)
     {
@@ -433,16 +446,16 @@ static GLuint r700PredictRenderSize(GLcontext* ctx,
 
     dwords = PRE_EMIT_STATE_BUFSZ;
     if (ib)
-	    dwords += nr_prims * 14;
+	    dwords += nr_prims * 18;
     else {
 	    for (i = 0; i < nr_prims; ++i)
 	    {
 		    if (prim[i].start == 0)
-			    dwords += 10;
+			    dwords += 14;
 		    else if (prim[i].count > 0xffff)
-			    dwords += prim[i].count + 10;
+			    dwords += prim[i].count + 14;
 		    else
-			    dwords += ((prim[i].count + 1) / 2) + 10;
+			    dwords += ((prim[i].count + 1) / 2) + 14;
 	    }
     }
 
@@ -625,11 +638,11 @@ static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input
 
         stride = (input[i]->StrideB == 0) ? getTypeSize(input[i]->Type) * input[i]->Size : input[i]->StrideB;
 
-        if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT ||
+        if (input[i]->Type == GL_DOUBLE || input[i]->Type == GL_UNSIGNED_INT || input[i]->Type == GL_INT
 #if MESA_BIG_ENDIAN
-            getTypeSize(input[i]->Type) != 4 || 
+            || getTypeSize(input[i]->Type) != 4
 #endif
-            stride < 4) 
+            ) 
         {
             r700ConvertAttrib(ctx, count, input[i], &context->stream_desc[index]);
         } 
@@ -637,19 +650,10 @@ static void r700SetupStreams(GLcontext *ctx, const struct gl_client_array *input
         {
             if (input[i]->BufferObj->Name) 
             {
-                if (stride % 4 != 0) 
-                {
-                    assert(((intptr_t) input[i]->Ptr) % input[i]->StrideB == 0);
-                    r700AlignDataToDword(ctx, input[i], count, &context->stream_desc[index]);
-                    context->stream_desc[index].is_named_bo = GL_FALSE;
-                } 
-                else 
-                {
-                    context->stream_desc[index].stride = input[i]->StrideB;
-                    context->stream_desc[index].bo_offset = (intptr_t) input[i]->Ptr;
-                    context->stream_desc[index].bo = get_radeon_buffer_object(input[i]->BufferObj)->bo;
-                    context->stream_desc[index].is_named_bo = GL_TRUE;
-                }
+                context->stream_desc[index].stride = input[i]->StrideB;
+                context->stream_desc[index].bo_offset = (intptr_t) input[i]->Ptr;
+                context->stream_desc[index].bo = get_radeon_buffer_object(input[i]->BufferObj)->bo;
+                context->stream_desc[index].is_named_bo = GL_TRUE;
             } 
             else 
             {
@@ -932,7 +936,8 @@ static GLboolean r700TryDrawPrims(GLcontext *ctx,
 		    r700RunRenderPrimitive(ctx,
 					   prim[i].start,
 					   prim[i].start + prim[i].count,
-					   prim[i].mode);
+					   prim[i].mode,
+					   prim[i].basevertex);
 	    else
 		    r700RunRenderPrimitiveImmediate(ctx,
 						    prim[i].start,
@@ -977,18 +982,24 @@ static void r700DrawPrims(GLcontext *ctx,
 {
 	GLboolean retval = GL_FALSE;
 
+	context_t *context = R700_CONTEXT(ctx);
+	radeonContextPtr radeon = &context->radeon;
+	radeon_prepare_render(radeon);
+
 	/* This check should get folded into just the places that
 	 * min/max index are really needed.
 	 */
-	if (!index_bounds_valid) {
-		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
-	}
 
-	if (min_index) {
+	if (!vbo_all_varyings_in_vbos(arrays)) {
+	    if (!index_bounds_valid)
+		vbo_get_minmax_index(ctx, prim, ib, &min_index, &max_index);
+	    /* Do we want to rebase? It minimizes the
+	     * amount of data to upload. */
+	    if (min_index) {
 		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r700DrawPrims );
 		return;
+	    }
 	}
-
 	/* Make an attempt at drawing */
 	retval = r700TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
 
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
index 137f3007ced..6a2a09eaf1a 100644
--- a/src/mesa/drivers/dri/r600/r700_vertprog.c
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -461,11 +461,11 @@ static void r700TranslateAttrib(GLcontext *ctx, GLuint unLoc, int count, const s
 	stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size 
                                    : input->StrideB;
 
-    if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT ||
+    if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT
 #if MESA_BIG_ENDIAN
-        getTypeSize(input->Type) != 4 ||
+        || getTypeSize(input->Type) != 4
 #endif
-        stride < 4) 
+       ) 
     {
         pStreamDesc->type = GL_FLOAT;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index b7ee9a134bf..7d54fabebbc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -414,9 +414,9 @@ enum {
    CHIP_FAMILY_R350,
    CHIP_FAMILY_RV350,
    CHIP_FAMILY_RV380,
+   CHIP_FAMILY_RS400,
    CHIP_FAMILY_R420,
    CHIP_FAMILY_RV410,
-   CHIP_FAMILY_RS400,
    CHIP_FAMILY_RS600,
    CHIP_FAMILY_RS690,
    CHIP_FAMILY_RS740,
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 13f1f0611b8..c1a660af3d0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -708,7 +708,6 @@ void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
 		if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
 			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer);
 			radeon->front_cliprects = GL_TRUE;
-			radeon->front_buffer_dirty = GL_TRUE;
 		} else {
 			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer);
 			radeon->front_cliprects = GL_FALSE;
@@ -1132,17 +1131,13 @@ flush_front:
 		if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2)
 			&& (screen->dri2.loader->flushFrontBuffer != NULL)) {
 			__DRIdrawable * drawable = radeon_get_drawable(radeon);
-			(*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate);
 
-			/* Only clear the dirty bit if front-buffer rendering is no longer
-			 * enabled.  This is done so that the dirty bit can only be set in
-			 * glDrawBuffer.  Otherwise the dirty bit would have to be set at
-			 * each of N places that do rendering.  This has worse performances,
-			 * but it is much easier to get correct.
+			/* We set the dirty bit in radeon_prepare_render() if we're
+			 * front buffer rendering once we get there.
 			 */
-			if (!radeon->is_front_buffer_rendering) {
-				radeon->front_buffer_dirty = GL_FALSE;
-			}
+			radeon->front_buffer_dirty = GL_FALSE;
+
+			(*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate);
 		}
 	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
index 5a7d52c4d2f..92663bf66d7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -493,6 +493,50 @@ radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
    return _mesa_get_format_bytes(rb->base.Format) * 8; 
 }
 
+/*
+ * Check if the draw/read drawables have been invalidated by
+ * dri2InvalidateDrawable() (dri2.stamp differs from the context's copy)
+ * and update renderbuffers if so. This prevents a client from accessing
+ * a backbuffer that has a swap pending but not yet completed. Also marks
+ * the front buffer dirty when front buffer rendering is active.
+ * No-op without a DRI2 loader. See intel_prepare_render in intel driver.
+ */
+void radeon_prepare_render(radeonContextPtr radeon)
+{
+    __DRIcontext *driContext = radeon->dri.context;
+    __DRIdrawable *drawable;
+    __DRIscreen *screen;
+    /* Nothing to revalidate when the screen has no DRI2 loader. */
+    screen = driContext->driScreenPriv;
+    if (!screen->dri2.loader)
+        return;
+    /* Draw drawable: resync when its stamp moved past the context's copy. */
+    drawable = driContext->driDrawablePriv;
+    if (drawable->dri2.stamp != driContext->dri2.draw_stamp) {
+	if (drawable->lastStamp != drawable->dri2.stamp)
+	    radeon_update_renderbuffers(driContext, drawable, GL_FALSE);
+
+	/* Intel driver does the equivalent of this, no clue if it is needed:
+	 * radeon_draw_buffer(radeon->glCtx, &(drawable->driverPrivate)->base);
+	 */
+	driContext->dri2.draw_stamp = drawable->dri2.stamp;
+    }
+    /* Read drawable: same check against the context's read_stamp. */
+    drawable = driContext->driReadablePriv;
+    if (drawable->dri2.stamp != driContext->dri2.read_stamp) {
+	if (drawable->lastStamp != drawable->dri2.stamp)
+	    radeon_update_renderbuffers(driContext, drawable, GL_FALSE);
+	driContext->dri2.read_stamp = drawable->dri2.stamp;
+    }
+
+    /* If we're currently rendering to the front buffer, the rendering
+     * that will happen next will probably dirty the front buffer.  So
+     * mark it as dirty here.
+     */
+    if (radeon->is_front_buffer_rendering)
+	radeon->front_buffer_dirty = GL_TRUE;
+}
+
 void
 radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable,
 			    GLboolean front_only)
@@ -514,6 +558,11 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable,
 	screen = context->driScreenPriv;
 	radeon = (radeonContextPtr) context->driverPrivate;
 
+	/* Set this up front, so that in case our buffers get invalidated
+	 * while we're getting new buffers, we don't clobber the stamp and
+	 * thus ignore the invalidate. */
+	drawable->lastStamp = drawable->dri2.stamp;
+
 	if (screen->dri2.loader
 	   && (screen->dri2.loader->base.version > 2)
 	   && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
@@ -650,6 +699,13 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable,
 		rb->base.Height = drawable->h;
 		rb->has_surface = 0;
 
+		/* r6xx+ tiling */
+		rb->tile_config = radeon->radeonScreen->tile_config;
+		rb->group_bytes = radeon->radeonScreen->group_bytes;
+		rb->num_channels = radeon->radeonScreen->num_channels;
+		rb->num_banks = radeon->radeonScreen->num_banks;
+		rb->r7xx_bank_op = radeon->radeonScreen->r7xx_bank_op;
+
 		if (buffers[i].attachment == __DRI_BUFFER_STENCIL && depth_bo) {
 			if (RADEON_DEBUG & RADEON_DRI)
 				fprintf(stderr, "(reusing depth buffer as stencil)\n");
@@ -678,7 +734,7 @@ radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable,
 				bo->flags |= RADEON_BO_FLAGS_MACRO_TILE;
 			if (tiling_flags & RADEON_TILING_MICRO)
 				bo->flags |= RADEON_BO_FLAGS_MICRO_TILE;
-			
+
 		}
 
 		if (buffers[i].attachment == __DRI_BUFFER_DEPTH) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
index 5156c5d0d0a..f06e5fdf244 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -93,6 +93,13 @@ struct radeon_renderbuffer
 	GLuint pf_pending;  /**< sequence number of pending flip */
 	GLuint vbl_pending;   /**< vblank sequence number of pending flip */
 	__DRIdrawable *dPriv;
+
+	/* r6xx+ tiling */
+	GLuint tile_config;
+	GLint group_bytes;
+	GLint num_channels;
+	GLint num_banks;
+	GLint r7xx_bank_op;
 };
 
 struct radeon_framebuffer
@@ -614,5 +621,6 @@ GLboolean radeonMakeCurrent(__DRIcontext * driContextPriv,
 			    __DRIdrawable * driDrawPriv,
 			    __DRIdrawable * driReadPriv);
 extern void radeonDestroyContext(__DRIcontext * driContextPriv);
+void radeon_prepare_render(radeonContextPtr radeon);
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
index c877e6c1765..c6e5f110ea3 100644
--- a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -133,7 +133,7 @@ static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree
 	height = _mesa_next_pow_two_32(lvl->height);
 
 	lvl->rowstride = get_texture_image_row_stride(rmesa, mt->mesaFormat, lvl->width, mt->tilebits);
-	lvl->size = get_texture_image_size(mt->mesaFormat, lvl->rowstride, lvl->height, lvl->depth, mt->tilebits);
+	lvl->size = get_texture_image_size(mt->mesaFormat, lvl->rowstride, height, lvl->depth, mt->tilebits);
 
 	assert(lvl->size > 0);
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
index dadb8002c7d..fb741173ca8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
+++ b/src/mesa/drivers/dri/radeon/radeon_pixel_read.c
@@ -179,6 +179,9 @@ radeonReadPixels(GLcontext * ctx,
                  GLenum format, GLenum type,
                  const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
 {
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+    radeon_prepare_render(radeon);
+
     if (do_blit_readpixels(ctx, x, y, width, height, format, type, pack, pixels))
         return;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 82107cc6aeb..fa97a19302c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -213,6 +213,10 @@ static const GLuint __driNConfigOptions = 17;
 
 static int getSwapInfo( __DRIdrawable *dPriv, __DRIswapInfo * sInfo );
 
+#ifndef RADEON_INFO_TILE_CONFIG
+#define RADEON_INFO_TILE_CONFIG 0x6
+#endif
+
 static int
 radeonGetParam(__DRIscreen *sPriv, int param, void *value)
 {
@@ -232,6 +236,9 @@ radeonGetParam(__DRIscreen *sPriv, int param, void *value)
       case RADEON_PARAM_NUM_Z_PIPES:
           info.request = RADEON_INFO_NUM_Z_PIPES;
           break;
+      case RADEON_INFO_TILE_CONFIG:
+	  info.request = RADEON_INFO_TILE_CONFIG;
+          break;
       default:
           return -EINVAL;
       }
@@ -376,6 +383,21 @@ static const __DRItexBufferExtension r600TexBufferExtension = {
 };
 #endif
 
+static void
+radeonDRI2Flush(__DRIdrawable *drawable)
+{
+    /* Flush the GL context reached through the drawable's driContextPriv. */
+    radeonContextPtr rmesa =
+        (radeonContextPtr) drawable->driContextPriv->driverPrivate;
+    radeonFlush(rmesa->glCtx);
+}
+
+static const struct __DRI2flushExtensionRec radeonFlushExtension = {
+    { __DRI2_FLUSH, __DRI2_FLUSH_VERSION }, /* base: extension id and version */
+    radeonDRI2Flush,          /* first callback slot: flushes via radeonFlush() */
+    dri2InvalidateDrawable,   /* second slot; presumably the invalidate hook -- TODO confirm layout */
+};
+
 static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
 {
    screen->device_id = device_id;
@@ -1305,6 +1327,56 @@ radeonCreateScreen2(__DRIscreen *sPriv)
    else
 	   screen->chip_flags |= RADEON_CLASS_R600;
 
+   /* r6xx+ tiling */
+   if (IS_R600_CLASS(screen) && (sPriv->drm_version.minor >= 6)) {
+	   ret = radeonGetParam(sPriv, RADEON_INFO_TILE_CONFIG, &temp);
+	   if (ret)
+		   fprintf(stderr, "failed to get tiling info\n");
+	   else {
+		   screen->tile_config = temp;
+		   screen->r7xx_bank_op = 0;
+		   switch((screen->tile_config & 0xe) >> 1) {
+		   case 0:
+			   screen->num_channels = 1;
+			   break;
+		   case 1:
+			   screen->num_channels = 2;
+			   break;
+		   case 2:
+			   screen->num_channels = 4;
+			   break;
+		   case 3:
+			   screen->num_channels = 8;
+			   break;
+		   default:
+			   fprintf(stderr, "bad channels\n");
+			   break;
+		   }
+		   switch((screen->tile_config & 0x30) >> 4) {
+		   case 0:
+			   screen->num_banks = 4;
+			   break;
+		   case 1:
+			   screen->num_banks = 8;
+			   break;
+		   default:
+			   fprintf(stderr, "bad banks\n");
+			   break;
+		   }
+		   switch((screen->tile_config & 0xc0) >> 6) {
+		   case 0:
+			   screen->group_bytes = 256;
+			   break;
+		   case 1:
+			   screen->group_bytes = 512;
+			   break;
+		   default:
+			   fprintf(stderr, "bad group_bytes\n");
+			   break;
+		   }
+	   }
+   }
+
    if (IS_R300_CLASS(screen)) {
        ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
        if (ret) {
@@ -1379,6 +1451,8 @@ radeonCreateScreen2(__DRIscreen *sPriv)
    screen->extensions[i++] = &r600TexBufferExtension.base;
 #endif
 
+   screen->extensions[i++] = &radeonFlushExtension.base;
+
    screen->extensions[i++] = NULL;
    sPriv->extensions = screen->extensions;
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index 0d7e335fa3a..2b33201a538 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -112,6 +112,13 @@ typedef struct radeon_screen {
    int kernel_mm;
    drm_radeon_sarea_t *sarea;	/* Private SAREA data */
    struct radeon_bo_manager *bom;
+
+   /* r6xx+ tiling */
+   GLuint tile_config;
+   GLint group_bytes;
+   GLint num_channels;
+   GLint num_banks;
+   GLint r7xx_bank_op;
 } radeonScreenRec, *radeonScreenPtr;
 
 #define IS_R100_CLASS(screen) \
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index 1adb6096033..9dfe2dd2433 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -111,7 +111,6 @@ static GLubyte *r200_depth_4byte(const struct radeon_renderbuffer * rrb,
  * two main types:
  * - 1D (akin to macro-linear/micro-tiled on older asics)
  * - 2D (akin to macro-tiled/micro-tiled on older asics)
- * only 1D tiling is implemented below
  */
 #if defined(RADEON_R600)
 static inline GLint r600_1d_tile_helper(const struct radeon_renderbuffer * rrb,
@@ -208,12 +207,190 @@ static inline GLint r600_1d_tile_helper(const struct radeon_renderbuffer * rrb,
     return offset;
 }
 
+static inline GLint r600_log2(GLint n) /* floor(log2(n)) for n >= 1; 0 when n is 0 */
+{
+	GLint log2 = 0;
+
+	while (n >>= 1) /* one iteration per halving until n reaches 0 */
+		++log2;
+	return log2;
+}
+
+static inline GLint r600_2d_tile_helper(const struct radeon_renderbuffer * rrb,
+					GLint x, GLint y, GLint is_depth, GLint is_stencil)
+{
+	GLint group_bytes = rrb->group_bytes;
+	GLint num_channels = rrb->num_channels;
+	GLint num_banks = rrb->num_banks;
+	GLint r7xx_bank_op = rrb->r7xx_bank_op;
+	/* values derived from the parameters above; 8x8x1 micro tiles, one sample, slice 0 */
+	GLint group_bits = r600_log2(group_bytes);
+	GLint channel_bits = r600_log2(num_channels);
+	GLint bank_bits = r600_log2(num_banks);
+	GLint element_bytes = rrb->cpp;
+	GLint num_samples = 1;
+	GLint tile_width = 8;
+	GLint tile_height = 8;
+	GLint tile_thickness = 1;
+	GLint macro_tile_width = num_banks;
+	GLint macro_tile_height = num_channels;
+	GLint pitch_elements = (rrb->pitch / element_bytes) / tile_width;
+	GLint height = rrb->base.Height / tile_height;
+	GLint z = 0;
+	GLint sample_number = 0;
+	/* intermediate results of the address computation */
+	GLint tile_bytes;
+	GLint macro_tile_bytes;
+	GLint macro_tiles_per_row;
+	GLint macro_tiles_per_slice;
+	GLint slice_offset;
+	GLint macro_tile_row_index;
+	GLint macro_tile_column_index;
+	GLint macro_tile_offset;
+	GLint pixel_number = 0;
+	GLint element_offset;
+	GLint bank = 0;
+	GLint channel = 0;
+	GLint total_offset;
+	GLint group_mask = (1 << group_bits) - 1;
+	GLint offset_low;
+	GLint offset_high;
+	GLint offset = 0;
+
+	switch (num_channels) {
+	case 2:
+	default: /* any other channel count is treated like 2 channels */
+		// channel[0] = x[3] ^ y[3]
+		channel |= (((x >> 3) ^ (y >> 3)) & 1) << 0;
+		break;
+	case 4:
+		// channel[0] = x[4] ^ y[3]
+		channel |= (((x >> 4) ^ (y >> 3)) & 1) << 0;
+		// channel[1] = x[3] ^ y[4]
+		channel |= (((x >> 3) ^ (y >> 4)) & 1) << 1;
+		break;
+	case 8:
+		// channel[0] = x[5] ^ y[3]
+		channel |= (((x >> 5) ^ (y >> 3)) & 1) << 0;
+		// channel[1] = x[4] ^ x[5] ^ y[4]
+		channel |= (((x >> 4) ^ (x >> 5) ^ (y >> 4)) & 1) << 1;
+		// channel[2] = x[3] ^ y[5]
+		channel |= (((x >> 3) ^ (y >> 5)) & 1) << 2;
+		break;
+	}
+
+	switch (num_banks) { /* no default: bank stays 0 for any other bank count */
+	case 4:
+		// bank[0] = x[3] ^ y[4 + log2(num_channels)]
+		bank |= (((x >> 3) ^ (y >> (4 + channel_bits))) & 1) << 0;
+		if (r7xx_bank_op)
+			// bank[1] = x[4] ^ y[3 + log2(num_channels)] ^ x[5]
+			bank |= (((x >> 4) ^ (y >> (3 + channel_bits)) ^ (x >> 5)) & 1) << 1;
+		else
+			// bank[1] = x[4] ^ y[3 + log2(num_channels)]
+			bank |= (((x >> 4) ^ (y >> (3 + channel_bits))) & 1) << 1;
+		break;
+	case 8:
+		// bank[0] = x[3] ^ y[5 + log2(num_channels)]
+		bank |= (((x >> 3) ^ (y >> (5 + channel_bits))) & 1) << 0;
+		// bank[1] = x[4] ^ y[4 + log2(num_channels)] ^ y[5 + log2(num_channels)]
+		bank |= (((x >> 4) ^ (y >> (4 + channel_bits)) ^ (y >> (5 + channel_bits))) & 1) << 1;
+		if (r7xx_bank_op)
+			// bank[2] = x[5] ^ y[3 + log2(num_channels)] ^ x[6]
+			bank |= (((x >> 5) ^ (y >> (3 + channel_bits)) ^ (x >> 6)) & 1) << 2;
+		else
+			// bank[2] = x[5] ^ y[3 + log2(num_channels)]
+			bank |= (((x >> 5) ^ (y >> (3 + channel_bits))) & 1) << 2;
+		break;
+	}
+
+	tile_bytes = tile_width * tile_height * tile_thickness * element_bytes * num_samples;
+	macro_tile_bytes = macro_tile_width * macro_tile_height * tile_bytes;
+	macro_tiles_per_row = pitch_elements / macro_tile_width;
+	macro_tiles_per_slice = macro_tiles_per_row * (height / macro_tile_height);
+	slice_offset = (z / tile_thickness) * macro_tiles_per_slice * macro_tile_bytes;
+	macro_tile_row_index = (y / tile_height) / macro_tile_height;
+	macro_tile_column_index = (x / tile_width) / macro_tile_width;
+	macro_tile_offset = ((macro_tile_row_index * macro_tiles_per_row) + macro_tile_column_index) * macro_tile_bytes;
+
+	if (is_depth) {
+		GLint pixel_offset = 0;
+
+		pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+		pixel_number |= ((y >> 0) & 1) << 1; // pn[1] = y[0]
+		pixel_number |= ((x >> 1) & 1) << 2; // pn[2] = x[1]
+		pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+		pixel_number |= ((x >> 2) & 1) << 4; // pn[4] = x[2]
+		pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
+		switch (element_bytes) { /* other element sizes leave pixel_offset at 0 */
+		case 2:
+			pixel_offset = pixel_number * element_bytes * num_samples;
+			break;
+		case 4:
+			/* stencil and depth data are stored separately within a tile.
+			 * stencil is stored in a contiguous tile before the depth tile.
+			 * stencil element is 1 byte, depth element is 3 bytes.
+			 * stencil tile is 64 bytes.
+			 */
+			if (is_stencil)
+				pixel_offset = pixel_number * 1 * num_samples;
+			else
+				pixel_offset = (pixel_number * 3 * num_samples) + 64;
+			break;
+		}
+		element_offset = pixel_offset + (sample_number * element_bytes);
+	} else {
+		GLint sample_offset;
+
+		switch (element_bytes) { /* other element sizes leave pixel_number at 0 */
+		case 1:
+			pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+			pixel_number |= ((x >> 1) & 1) << 1; // pn[1] = x[1]
+			pixel_number |= ((x >> 2) & 1) << 2; // pn[2] = x[2]
+			pixel_number |= ((y >> 1) & 1) << 3; // pn[3] = y[1]
+			pixel_number |= ((y >> 0) & 1) << 4; // pn[4] = y[0]
+			pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
+			break;
+		case 2:
+			pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+			pixel_number |= ((x >> 1) & 1) << 1; // pn[1] = x[1]
+			pixel_number |= ((x >> 2) & 1) << 2; // pn[2] = x[2]
+			pixel_number |= ((y >> 0) & 1) << 3; // pn[3] = y[0]
+			pixel_number |= ((y >> 1) & 1) << 4; // pn[4] = y[1]
+			pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
+			break;
+		case 4:
+			pixel_number |= ((x >> 0) & 1) << 0; // pn[0] = x[0]
+			pixel_number |= ((x >> 1) & 1) << 1; // pn[1] = x[1]
+			pixel_number |= ((y >> 0) & 1) << 2; // pn[2] = y[0]
+			pixel_number |= ((x >> 2) & 1) << 3; // pn[3] = x[2]
+			pixel_number |= ((y >> 1) & 1) << 4; // pn[4] = y[1]
+			pixel_number |= ((y >> 2) & 1) << 5; // pn[5] = y[2]
+			break;
+		}
+		sample_offset = sample_number * (tile_bytes / num_samples);
+		element_offset = sample_offset + (pixel_number * element_bytes);
+	}
+	total_offset = (slice_offset + macro_tile_offset) >> (channel_bits + bank_bits);
+	total_offset += element_offset;
+
+	offset_low = total_offset & group_mask; /* bits below the group size keep their position */
+	offset_high = (total_offset & ~group_mask) << (channel_bits + bank_bits); /* upper bits move up to make room for channel and bank */
+	offset = (bank << (group_bits + channel_bits)) + (channel << group_bits) + offset_low + offset_high;
+
+	return offset;
+}
+
 /* depth buffers */
 static GLubyte *r600_ptr_depth(const struct radeon_renderbuffer * rrb,
 			       GLint x, GLint y)
 {
     GLubyte *ptr = rrb->bo->ptr;
-    GLint offset = r600_1d_tile_helper(rrb, x, y, 1, 0);
+    GLint offset;
+    if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+	    offset = r600_2d_tile_helper(rrb, x, y, 1, 0);
+    else
+	    offset = r600_1d_tile_helper(rrb, x, y, 1, 0);
     return &ptr[offset];
 }
 
@@ -221,7 +398,11 @@ static GLubyte *r600_ptr_stencil(const struct radeon_renderbuffer * rrb,
 				 GLint x, GLint y)
 {
     GLubyte *ptr = rrb->bo->ptr;
-    GLint offset = r600_1d_tile_helper(rrb, x, y, 1, 1);
+    GLint offset;
+    if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+	    offset = r600_2d_tile_helper(rrb, x, y, 1, 1);
+    else
+	    offset = r600_1d_tile_helper(rrb, x, y, 1, 1);
     return &ptr[offset];
 }
 
@@ -235,7 +416,10 @@ static GLubyte *r600_ptr_color(const struct radeon_renderbuffer * rrb,
     if (rrb->has_surface || !(rrb->bo->flags & mask)) {
         offset = x * rrb->cpp + y * rrb->pitch;
     } else {
-	    offset = r600_1d_tile_helper(rrb, x, y, 0, 0);
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+		    offset = r600_2d_tile_helper(rrb, x, y, 0, 0);
+	    else
+		    offset = r600_1d_tile_helper(rrb, x, y, 0, 0);
     }
     return &ptr[offset];
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index f2fcb46688a..29defe73a70 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -40,7 +40,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/simple_list.h"
 
+#include "math/m_xform.h"
+
 #include "swrast_setup/swrast_setup.h"
+
 #include "tnl/tnl.h"
 #include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
@@ -408,6 +411,8 @@ static GLboolean radeon_run_render( GLcontext *ctx,
        !radeon_dma_validate_render( ctx, VB ))
       return GL_TRUE;		
 
+   radeon_prepare_render(&rmesa->radeon);
+
    tnl->Driver.Render.Start( ctx );
 
    for (i = 0 ; i < VB->PrimitiveCount ; i++)
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index ea796e1a45f..5e1718f9dfc 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -252,6 +252,8 @@ void radeonTclPrimitive( GLcontext *ctx,
    GLuint se_cntl;
    GLuint newprim = hw_prim | RADEON_CP_VC_CNTL_TCL_ENABLE;
 
+   radeon_prepare_render(&rmesa->radeon);
+
    if (newprim != rmesa->tcl.hw_primitive ||
        !discrete_prim[hw_prim&0xf]) {
       RADEON_NEWPRIM( rmesa );
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex_copy.c b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
index 29fd31ac23f..4cb0bb60c85 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex_copy.c
@@ -153,6 +153,9 @@ radeonCopyTexImage2D(GLcontext *ctx, GLenum target, GLint level,
         _mesa_select_tex_image(ctx, texObj, target, level);
     int srcx, srcy, dstx, dsty;
 
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+    radeon_prepare_render(radeon);
+
     if (border)
         goto fail;
 
@@ -202,6 +205,9 @@ radeonCopyTexSubImage2D(GLcontext *ctx, GLenum target, GLint level,
     struct gl_texture_object *texObj = _mesa_select_tex_object(ctx, texUnit, target);
     struct gl_texture_image *texImage = _mesa_select_tex_image(ctx, texObj, target, level);
 
+    radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+    radeon_prepare_render(radeon);
+
     if (!do_copy_texsubimage(ctx, target, level,
                              radeon_tex_obj(texObj), (radeon_texture_image *)texImage,
                              xoffset, yoffset, x, y, width, height)) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
index d2b190e42e0..8c6a50d2f0d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texture.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -551,7 +551,7 @@ gl_format radeonChooseTextureFormat(GLcontext * ctx,
 	case GL_SRGB8_ALPHA8:
 	case GL_COMPRESSED_SRGB:
 	case GL_COMPRESSED_SRGB_ALPHA:
-		return MESA_FORMAT_SRGBA8;
+		return MESA_FORMAT_SARGB8;
 
 	case GL_SLUMINANCE:
 	case GL_SLUMINANCE8:
diff --git a/src/mesa/drivers/dri/savage/savagerender.c b/src/mesa/drivers/dri/savage/savagerender.c
index c369bb124c2..2d9e80e29c4 100644
--- a/src/mesa/drivers/dri/savage/savagerender.c
+++ b/src/mesa/drivers/dri/savage/savagerender.c
@@ -33,6 +33,8 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "savagecontext.h"
diff --git a/src/mesa/drivers/dri/unichrome/via_render.c b/src/mesa/drivers/dri/unichrome/via_render.c
index 896c43db1b0..4351f119555 100644
--- a/src/mesa/drivers/dri/unichrome/via_render.c
+++ b/src/mesa/drivers/dri/unichrome/via_render.c
@@ -33,6 +33,8 @@
 #include "main/macros.h"
 #include "main/mtypes.h"
 
+#include "math/m_xform.h"
+
 #include "tnl/t_context.h"
 
 #include "via_context.h"
author	Christoph Bumiller <[email protected]>	2010-08-18 14:37:47 +0200
committer	Christoph Bumiller <[email protected]>	2010-08-18 14:37:47 +0200
commit	3e54d63429fe7ca5db3c75c181abbaf7a7f55724 (patch)
tree	e129c36aaef712525f0a04fc5b06c445e3cf84df /src/mesa/drivers/dri
parent	eaab76457818fad0926b84c663440e8987e1f19f (diff)
parent	85d9bc236d6a8ff8f12cbc2150f8c3740354f573 (diff)