149 files changed, 2767 insertions, 2802 deletions
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index 94c0c69fcae..def9a03d377 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -512,6 +512,22 @@ i915_translate_instruction(struct i915_fp_compile *p,
       emit_simple_arith(p, inst, A0_ADD, 2, fs);
       break;
 
+   case TGSI_OPCODE_CEIL:
+      src0 = src_vector(p, &inst->Src[0], fs);
+      tmp = i915_get_utemp(p);
+      flags = get_result_flags(inst);
+      i915_emit_arith(p,
+                      A0_FLR,
+                      tmp,
+                      flags & A0_DEST_CHANNEL_ALL, 0,
+                      negate(src0, 1, 1, 1, 1), 0, 0);
+      i915_emit_arith(p,
+                      A0_MOV,
+                      get_result_vector(p, &inst->Dst[0]),
+                      flags, 0,
+                      negate(tmp, 1, 1, 1, 1), 0, 0);
+      break;
+
    case TGSI_OPCODE_CMP:
       src0 = src_vector(p, &inst->Src[0], fs);
       src1 = src_vector(p, &inst->Src[1], fs);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend.h b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
index f82ae30bb7d..c0c95a27129 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend.h
@@ -64,12 +64,14 @@ lp_build_blend_func(struct lp_build_context *bld,
 LLVMValueRef
 lp_build_blend_aos(struct gallivm_state *gallivm,
                    const struct pipe_blend_state *blend,
+                   const enum pipe_format *cbuf_format,
                    struct lp_type type,
                    unsigned rt,
                    LLVMValueRef src,
                    LLVMValueRef dst,
+                   LLVMValueRef mask,
                    LLVMValueRef const_,
-                   unsigned alpha_swizzle);
+                   const unsigned char swizzle[4]);
 
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index c342346a36e..59d5f545966 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -45,12 +45,14 @@
 
 #include "pipe/p_state.h"
 #include "util/u_debug.h"
+#include "util/u_format.h"
 
 #include "gallivm/lp_bld_type.h"
 #include "gallivm/lp_bld_const.h"
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_swizzle.h"
+#include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_debug.h"
 
 #include "lp_bld_blend.h"
@@ -300,25 +302,39 @@ lp_build_blend_func(struct lp_build_context *bld,
 }
 
 
+/**
+ * Performs blending of src and dst pixels
+ *
+ * @param blend         the blend state of the shader variant
+ * @param cbuf_format   format of the colour buffer
+ * @param type          data type of the pixel vector
+ * @param rt            rt number
+ * @param src           blend src
+ * @param dst           blend dst
+ * @param mask          optional mask to apply to the blending result
+ * @param const_        const blend color
+ * @param swizzle       swizzle values for RGBA
+ *
+ * @return the result of blending src and dst
+ */
 LLVMValueRef
 lp_build_blend_aos(struct gallivm_state *gallivm,
                    const struct pipe_blend_state *blend,
+                   const enum pipe_format *cbuf_format,
                    struct lp_type type,
                    unsigned rt,
                    LLVMValueRef src,
                    LLVMValueRef dst,
+                   LLVMValueRef mask,
                    LLVMValueRef const_,
-                   unsigned alpha_swizzle)
+                   const unsigned char swizzle[4])
 {
    struct lp_build_blend_aos_context bld;
    LLVMValueRef src_term;
    LLVMValueRef dst_term;
-
-   /* FIXME: color masking not implemented yet */
-   assert(blend->rt[rt].colormask == 0xf);
-
-   if(!blend->rt[rt].blend_enable)
-      return src;
+   LLVMValueRef result;
+   unsigned alpha_swizzle = swizzle[3];
+   boolean fullcolormask;
 
    /* Setup build context */
    memset(&bld, 0, sizeof bld);
@@ -327,30 +343,59 @@ lp_build_blend_aos(struct gallivm_state *gallivm,
    bld.dst = dst;
    bld.const_ = const_;
 
-   /* TODO: There are still a few optimization opportunities here. For certain
-    * combinations it is possible to reorder the operations and therefore saving
-    * some instructions. */
+   if (!blend->rt[rt].blend_enable) {
+      result = src;
+   } else {
+
+      /* TODO: There are still a few optimization opportunities here. For certain
+       * combinations it is possible to reorder the operations and thereby save
+       * some instructions. */
+
+      src_term = lp_build_blend_factor(&bld, src, blend->rt[rt].rgb_src_factor,
+                                       blend->rt[rt].alpha_src_factor, alpha_swizzle);
+      dst_term = lp_build_blend_factor(&bld, dst, blend->rt[rt].rgb_dst_factor,
+                                       blend->rt[rt].alpha_dst_factor, alpha_swizzle);
+
+      lp_build_name(src_term, "src_term");
+      lp_build_name(dst_term, "dst_term");
 
-   src_term = lp_build_blend_factor(&bld, src, blend->rt[rt].rgb_src_factor,
-                                    blend->rt[rt].alpha_src_factor, alpha_swizzle);
-   dst_term = lp_build_blend_factor(&bld, dst, blend->rt[rt].rgb_dst_factor,
-                                    blend->rt[rt].alpha_dst_factor, alpha_swizzle);
+      if(blend->rt[rt].rgb_func == blend->rt[rt].alpha_func) {
+         result = lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func, src_term, dst_term);
+      }
+      else {
+         /* Separate RGB / A functions */
+
+         LLVMValueRef rgb;
+         LLVMValueRef alpha;
 
-   lp_build_name(src_term, "src_term");
-   lp_build_name(dst_term, "dst_term");
+         rgb   = lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func,   src_term, dst_term);
+         alpha = lp_build_blend_func(&bld.base, blend->rt[rt].alpha_func, src_term, dst_term);
 
-   if(blend->rt[rt].rgb_func == blend->rt[rt].alpha_func) {
-      return lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func, src_term, dst_term);
+         result = lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+      }
    }
-   else {
-      /* Seperate RGB / A functions */
 
-      LLVMValueRef rgb;
-      LLVMValueRef alpha;
+   /* Check if color mask is necessary */
+   fullcolormask = util_format_colormask_full(util_format_description(cbuf_format[rt]), blend->rt[rt].colormask);
+
+   if (!fullcolormask) {
+      LLVMValueRef color_mask;
 
-      rgb   = lp_build_blend_func(&bld.base, blend->rt[rt].rgb_func,   src_term, dst_term);
-      alpha = lp_build_blend_func(&bld.base, blend->rt[rt].alpha_func, src_term, dst_term);
+      color_mask = lp_build_const_mask_aos_swizzled(gallivm, bld.base.type, blend->rt[rt].colormask, swizzle);
+      lp_build_name(color_mask, "color_mask");
 
-      return lp_build_blend_swizzle(&bld, rgb, alpha, LP_BUILD_BLEND_SWIZZLE_RGBA, alpha_swizzle);
+      /* Combine with input mask if necessary */
+      if (mask) {
+         mask = lp_build_and(&bld.base, color_mask, mask);
+      } else {
+         mask = color_mask;
+      }
+   }
+
+   /* Apply mask, if one exists */
+   if (mask) {
+      result = lp_build_select(&bld.base, mask, result, dst);
    }
+
+   return result;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 71d0ddf5e75..230b80a945f 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -290,6 +290,10 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
    c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
    rej4 = _mm_slli_epi32(rej4, 2);
 
+   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
+   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
+   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));
+
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
@@ -383,7 +387,7 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
-   
+
    transpose4_epi32(&p0, &p1, &p2, &zero,
                     &c, &dcdx, &dcdy, &unused);
 
@@ -394,6 +398,9 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
    c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
    c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
 
+   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
+   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
+
    dcdx2 = _mm_add_epi32(dcdx, dcdx);
    dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b50c354fa9b..26d35debdaf 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -271,15 +271,13 @@ do_triangle_ccw(struct lp_setup_context *setup,
        */
       int adj = (setup->pixel_offset != 0) ? 1 : 0;
 
-      bbox.x0 = (MIN3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      bbox.x1 = (MAX3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER;
-      bbox.y0 = (MIN3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
-      bbox.y1 = (MAX3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
+      /* Inclusive x0, exclusive x1 */
+      bbox.x0 = MIN3(x[0], x[1], x[2]) >> FIXED_ORDER;
+      bbox.x1 = (MAX3(x[0], x[1], x[2]) - 1) >> FIXED_ORDER;
 
-      /* Inclusive coordinates:
-       */
-      bbox.x1--;
-      bbox.y1--;
+      /* Inclusive / exclusive depending upon adj (bottom-left or top-right) */
+      bbox.y0 = (MIN3(y[0], y[1], y[2]) + adj) >> FIXED_ORDER;
+      bbox.y1 = (MAX3(y[0], y[1], y[2]) - 1 + adj) >> FIXED_ORDER;
    }
 
    if (bbox.x1 < bbox.x0 ||
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index ec94190649c..2d2391e908c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -924,6 +924,7 @@ generate_variant(struct llvmpipe_context *lp,
                  const struct lp_fragment_shader_variant_key *key)
 {
    struct lp_fragment_shader_variant *variant;
+   const struct util_format_description *cbuf0_format_desc;
    boolean fullcolormask;
 
    variant = CALLOC_STRUCT(lp_fragment_shader_variant);
@@ -942,12 +943,8 @@ generate_variant(struct llvmpipe_context *lp,
     */
    fullcolormask = FALSE;
    if (key->nr_cbufs == 1) {
-      const struct util_format_description *format_desc;
-      format_desc = util_format_description(key->cbuf_format[0]);
-      if ((~key->blend.rt[0].colormask &
-           util_format_colormask(format_desc)) == 0) {
-         fullcolormask = TRUE;
-      }
+      cbuf0_format_desc = util_format_description(key->cbuf_format[0]);
+      fullcolormask = util_format_colormask_full(cbuf0_format_desc, key->blend.rt[0].colormask);
    }
 
    variant->opaque =
diff --git a/src/gallium/drivers/llvmpipe/lp_test_blend.c b/src/gallium/drivers/llvmpipe/lp_test_blend.c
index b3ca134131d..51324cbb6a3 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_blend.c
@@ -179,7 +179,9 @@ add_blend_test(struct gallivm_state *gallivm,
    LLVMValueRef res_ptr;
    LLVMBasicBlockRef block;
    LLVMBuilderRef builder;
+   const enum pipe_format format = PIPE_FORMAT_R8G8B8A8_UNORM;
    const unsigned rt = 0;
+   const unsigned char swizzle[4] = { 0, 1, 2, 3 };
 
    vec_type = lp_build_vec_type(gallivm, type);
 
@@ -205,7 +207,7 @@ add_blend_test(struct gallivm_state *gallivm,
       dst = LLVMBuildLoad(builder, dst_ptr, "dst");
       con = LLVMBuildLoad(builder, const_ptr, "const");
 
-      res = lp_build_blend_aos(gallivm, blend, type, rt, src, dst, con, 3);
+      res = lp_build_blend_aos(gallivm, blend, &format, type, rt, src, dst, NULL, con, swizzle);
 
       lp_build_name(res, "res");
 
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index ff199debd74..936e2bf246a 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -523,8 +523,10 @@ nouveau_scratch_runout_release(struct nouveau_context *nv)
 {
    if (!nv->scratch.nr_runout)
       return;
-   while (nv->scratch.nr_runout--)
+   do {
+      --nv->scratch.nr_runout;
       nouveau_bo_ref(NULL, &nv->scratch.runout[nv->scratch.nr_runout]);
+   } while (nv->scratch.nr_runout);
 
    FREE(nv->scratch.runout);
    nv->scratch.end = 0;
diff --git a/src/gallium/drivers/nv30/nvfx_fragprog.c b/src/gallium/drivers/nv30/nvfx_fragprog.c
index e562b454f92..592ad21c6c8 100644
--- a/src/gallium/drivers/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nv30/nvfx_fragprog.c
@@ -535,6 +535,11 @@ nvfx_fragprog_parse_instruction(struct nv30_context* nvfx, struct nvfx_fpc *fpc,
    case TGSI_OPCODE_ADD:
       nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none));
       break;
+   case TGSI_OPCODE_CEIL:
+      tmp = nvfx_src(temp(fpc));
+      nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, neg(src[0]), none, none));
+      nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, neg(tmp), none, none));
+      break;
    case TGSI_OPCODE_CMP:
       insn = arith(0, MOV, none.reg, mask, src[0], none, none);
       insn.cc_update = 1;
diff --git a/src/gallium/drivers/nv30/nvfx_vertprog.c b/src/gallium/drivers/nv30/nvfx_vertprog.c
index d7eb9fb0a63..82972b3943c 100644
--- a/src/gallium/drivers/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nv30/nvfx_vertprog.c
@@ -550,6 +550,11 @@ nvfx_vertprog_parse_instruction(struct nv30_context *nv30, struct nvfx_vpc *vpc,
    case TGSI_OPCODE_ARL:
       nvfx_vp_emit(vpc, arith(0, VEC, ARL, dst, mask, src[0], none, none));
       break;
+   case TGSI_OPCODE_CEIL:
+      tmp = nvfx_src(temp(vpc));
+      nvfx_vp_emit(vpc, arith(0, VEC, FLR, tmp.reg, mask, neg(src[0]), none, none));
+      nvfx_vp_emit(vpc, arith(sat, VEC, MOV, dst, mask, neg(tmp), none, none));
+      break;
    case TGSI_OPCODE_CMP:
       insn = arith(0, VEC, MOV, none.reg, mask, src[0], none, none);
       insn.cc_update = 1;
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp
index f7dac25c116..f713e6391c6 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.cpp
@@ -278,6 +278,31 @@ BuildUtil::mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc)
    return mkOp2(OP_UNION, typeOfSize(dst->reg.size), dst, def0, def1);
 }
 
+Instruction *
+BuildUtil::mkSplit(Value *h[2], uint8_t halfSize, Value *val)
+{
+   Instruction *insn = NULL;
+
+   const DataType fTy = typeOfSize(halfSize * 2);
+
+   if (val->reg.file == FILE_IMMEDIATE)
+      val = mkMov(getSSA(halfSize * 2), val, fTy)->getDef(0);
+
+   if (isMemoryFile(val->reg.file)) {
+      h[0] = cloneShallow(getFunction(), val);
+      h[1] = cloneShallow(getFunction(), val);
+      h[0]->reg.size = halfSize;
+      h[1]->reg.size = halfSize;
+      h[1]->reg.data.offset += halfSize;
+   } else {
+      h[0] = getSSA(halfSize, val->reg.file);
+      h[1] = getSSA(halfSize, val->reg.file);
+      insn = mkOp1(OP_SPLIT, fTy, h[0], val);
+      insn->setDef(1, h[1]);
+   }
+   return insn;
+}
+
 FlowInstruction *
 BuildUtil::mkFlow(operation op, void *targ, CondCode cc, Value *pred)
 {
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
index 9ee04dbcd12..dd7e491cb5c 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_build_util.h
@@ -81,6 +81,8 @@ public:
 
    Instruction *mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc);
 
+   Instruction *mkSplit(Value *half[2], uint8_t halfSize, Value *);
+
    void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
 
    ImmediateValue *mkImm(float);
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
index 82e23602ca0..16f191da159 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_from_tgsi.cpp
@@ -347,6 +347,7 @@ static nv50_ir::TexTarget translateTexture(uint tex)
    NV50_IR_TEX_TARG_CASE(SHADOW2D_ARRAY, 2D_ARRAY_SHADOW);
    NV50_IR_TEX_TARG_CASE(SHADOWCUBE, CUBE_SHADOW);
    NV50_IR_TEX_TARG_CASE(SHADOWRECT, RECT_SHADOW);
+   NV50_IR_TEX_TARG_CASE(BUFFER, BUFFER);
 
    case TGSI_TEXTURE_UNKNOWN:
    default:
@@ -548,7 +549,7 @@ static nv50_ir::operation translateOpcode(uint opcode)
    NV50_IR_OPCODE_CASE(SAMPLE_D, TXD);
    NV50_IR_OPCODE_CASE(SAMPLE_L, TXL);
    NV50_IR_OPCODE_CASE(GATHER4, TXG);
-   NV50_IR_OPCODE_CASE(RESINFO, TXQ);
+   NV50_IR_OPCODE_CASE(SVIEWINFO, TXQ);
 
    NV50_IR_OPCODE_CASE(END, EXIT);
 
@@ -597,8 +598,8 @@ public:
 
    int clipVertexOutput;
 
-   uint8_t *resourceTargets; // TGSI_TEXTURE_*
-   unsigned resourceCount;
+   uint8_t *samplerViewTargets; // TGSI_TEXTURE_*
+   unsigned samplerViewCount;
 
 private:
    int inferSysValDirection(unsigned sn) const;
@@ -617,7 +618,7 @@ Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
    if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
       tgsi_dump(tokens, 0);
 
-   resourceTargets = NULL;
+   samplerViewTargets = NULL;
 
    mainTempsInLMem = FALSE;
 }
@@ -632,8 +633,8 @@ Source::~Source()
    if (info->immd.type)
       FREE(info->immd.type);
 
-   if (resourceTargets)
-      delete[] resourceTargets;
+   if (samplerViewTargets)
+      delete[] samplerViewTargets;
 }
 
 bool Source::scanSource()
@@ -650,8 +651,8 @@ bool Source::scanSource()
 
    clipVertexOutput = -1;
 
-   resourceCount = scan.file_max[TGSI_FILE_RESOURCE] + 1;
-   resourceTargets = new uint8_t[resourceCount];
+   samplerViewCount = scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
+   samplerViewTargets = new uint8_t[samplerViewCount];
 
    info->immd.bufSize = 0;
    tempArrayCount = 0;
@@ -805,7 +806,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
             info->in[i].si = si;
             if (info->type == PIPE_SHADER_FRAGMENT) {
                // translate interpolation mode
-               switch (decl->Declaration.Interpolate) {
+               switch (decl->Interp.Interpolate) {
                case TGSI_INTERPOLATE_CONSTANT:
                   info->in[i].flat = 1;
                   break;
@@ -818,7 +819,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
                default:
                   break;
                }
-               if (decl->Declaration.Centroid)
+               if (decl->Interp.Centroid)
                   info->in[i].centroid = 1;
             }
          }
@@ -874,9 +875,9 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
          info->sv[i].input = inferSysValDirection(sn);
       }
       break;
-   case TGSI_FILE_RESOURCE:
+   case TGSI_FILE_SAMPLER_VIEW:
       for (i = first; i <= last; ++i)
-         resourceTargets[i] = decl->Resource.Resource;
+         samplerViewTargets[i] = decl->SamplerView.Resource;
       break;
    case TGSI_FILE_IMMEDIATE_ARRAY:
    {
@@ -1000,13 +1001,15 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
 nv50_ir::TexInstruction::Target
 Instruction::getTexture(const tgsi::Source *code, int s) const
 {
-   if (insn->Instruction.Texture) {
-      return translateTexture(insn->Texture.Texture);
-   } else {
+   switch (getSrc(s).getFile()) {
+   case TGSI_FILE_SAMPLER_VIEW: {
       // XXX: indirect access
       unsigned int r = getSrc(s).getIndex(0);
-      assert(r < code->resourceCount);
-      return translateTexture(code->resourceTargets[r]);
+      assert(r < code->samplerViewCount);
+      return translateTexture(code->samplerViewTargets[r]);
+   }
+   default:
+      return translateTexture(insn->Texture.Texture);
    }
 }
 
@@ -2042,7 +2045,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       handleTXF(dst0, 1);
       break;
    case TGSI_OPCODE_TXQ:
-   case TGSI_OPCODE_RESINFO:
+   case TGSI_OPCODE_SVIEWINFO:
       handleTXQ(dst0, TXQ_DIMS);
       break;
    case TGSI_OPCODE_F2I:
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
index 27373b4cc47..16bba0e1723 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp
@@ -57,15 +57,17 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
 
    Instruction *i[9];
 
-   Value *a[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
-   Value *b[2] = { bld->getSSA(halfSize), bld->getSSA(halfSize) };
+   bld->setPosition(mul, true);
+
+   Value *a[2], *b[2];
    Value *c[2];
    Value *t[4];
    for (int j = 0; j < 4; ++j)
       t[j] = bld->getSSA(fullSize);
 
-   (i[0] = bld->mkOp1(OP_SPLIT, fTy, a[0], mul->getSrc(0)))->setDef(1, a[1]);
-   (i[1] = bld->mkOp1(OP_SPLIT, fTy, b[0], mul->getSrc(1)))->setDef(1, b[1]);
+   // split sources into halves
+   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
+   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
 
    i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
    i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
@@ -96,7 +98,8 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    delete_Instruction(bld->getProgram(), mul);
 
    for (int j = 2; j <= (highResult ? 5 : 4); ++j)
-      i[j]->sType = hTy;
+      if (i[j])
+         i[j]->sType = hTy;
 
    return true;
 }
@@ -518,7 +521,6 @@ private:
 
    bool handleEXPORT(Instruction *);
 
-   bool handleMUL(Instruction *);
    bool handleDIV(Instruction *);
    bool handleSQRT(Instruction *);
    bool handlePOW(Instruction *);
@@ -587,7 +589,8 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    if (i->tex.target.isArray()) {
       Value *layer = i->getSrc(arg - 1);
       LValue *src = new_LValue(func, FILE_GPR);
-      bld.mkCvt(OP_CVT, TYPE_U16, src, TYPE_F32, layer);
+      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
+      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
       i->setSrc(arg - 1, src);
 
       if (i->tex.target.isCube()) {
@@ -941,14 +944,6 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
 }
 
 bool
-NV50LoweringPreSSA::handleMUL(Instruction *i)
-{
-   if (!isFloatType(i->dType) && typeSizeof(i->sType) > 2)
-      return expandIntegerMUL(&bld, i);
-   return true;
-}
-
-bool
 NV50LoweringPreSSA::handleDIV(Instruction *i)
 {
    if (!isFloatType(i->dType))
@@ -1068,8 +1063,6 @@ NV50LoweringPreSSA::visit(Instruction *i)
       return handleSELP(i);
    case OP_POW:
       return handlePOW(i);
-   case OP_MUL:
-      return handleMUL(i);
    case OP_DIV:
       return handleDIV(i);
    case OP_SQRT:
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
index db5195cd582..10382d9cac6 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_peephole.cpp
@@ -564,7 +564,7 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
       insn = mul2->getSrc(t)->getInsn();
       if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
          mul1 = insn;
-      if (mul1) {
+      if (mul1 && !mul1->saturate) {
          int s1;
 
          if (mul1->src(s1 = 0).getImmediate(imm1) ||
@@ -584,10 +584,11 @@ ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
             if (f < 0)
                mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
          }
+         mul1->saturate = mul2->saturate;
          return;
       }
    }
-   if (mul2->getDef(0)->refCount() == 1) {
+   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
       // b = mul a, imm
       // d = mul b, c   -> d = mul_x_imm a, c
       int s2, t2;
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
index 77edaa6067a..726331e91e7 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_ra.cpp
@@ -1819,8 +1819,8 @@ RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
    int n = tex->srcCount(0xff, true);
    if (n > 4) {
       condenseSrcs(tex, 0, 3);
-      if (n > 5)
-         condenseSrcs(tex, 4, n - 1);
+      if (n > 5) // NOTE: first call modified positions already
+         condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
    } else
    if (n > 1) {
       condenseSrcs(tex, 0, n - 1);
@@ -1850,8 +1850,8 @@ RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
 
    if (s > 1)
       condenseSrcs(tex, 0, s - 1);
-   if (n > 1)
-      condenseSrcs(tex, s, s + (n - 1));
+   if (n > 1) // NOTE: first call modified positions already
+      condenseSrcs(tex, 1, n);
 
    condenseDefs(tex);
 }
diff --git a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
index 5e541e514cb..8b11c6a2fdd 100644
--- a/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nv50/codegen/nv50_ir_target_nv50.cpp
@@ -310,7 +310,22 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
       return false;
    }
 
-   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * typeSizeof(ld->dType)))
+   uint8_t ldSize;
+
+   if ((i->op == OP_MUL || i->op == OP_MAD) && !isFloatType(i->dType)) {
+      // 32-bit MUL will be split into 16-bit MULs
+      if (ld->src(0).isIndirect(0))
+         return false;
+      if (sf == FILE_IMMEDIATE)
+         return false;
+      ldSize = 2;
+   } else {
+      ldSize = typeSizeof(ld->dType);
+   }
+
+   if (ldSize < 4 && sf == FILE_SHADER_INPUT) // no < 4-byte aligned a[] access
+      return false;
+   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * ldSize))
       return false;
 
    if (ld->src(0).isIndirect(0)) {
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 1cee0e06c02..44a0ba0f561 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -48,6 +48,7 @@
 #define NV50_NEW_CONSTBUF     (1 << 18)
 #define NV50_NEW_TEXTURES     (1 << 19)
 #define NV50_NEW_SAMPLERS     (1 << 20)
+#define NV50_NEW_STRMOUT      (1 << 21)
 #define NV50_NEW_CONTEXT      (1 << 31)
 
 #define NV50_BIND_FB          0
@@ -56,9 +57,10 @@
 #define NV50_BIND_INDEX       3
 #define NV50_BIND_TEXTURES    4
 #define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
-#define NV50_BIND_SCREEN     53
-#define NV50_BIND_TLS        54
-#define NV50_BIND_COUNT      55
+#define NV50_BIND_SO         53
+#define NV50_BIND_SCREEN     54
+#define NV50_BIND_TLS        55
+#define NV50_BIND_COUNT      56
 #define NV50_BIND_2D          0
 #define NV50_BIND_M2MF        0
 #define NV50_BIND_FENCE       1
@@ -92,11 +94,13 @@ struct nv50_context {
       boolean point_sprite;
       boolean rt_serialize;
       boolean flushed;
+      boolean rasterizer_discard;
       uint8_t tls_required;
       uint8_t num_vtxbufs;
       uint8_t num_vtxelts;
       uint8_t num_textures[3];
       uint8_t num_samplers[3];
+      uint8_t prim_size;
       uint16_t scissor;
    } state;
 
@@ -126,6 +130,10 @@ struct nv50_context {
    struct nv50_tsc_entry *samplers[3][PIPE_MAX_SAMPLERS];
    unsigned num_samplers[3];
 
+   uint8_t num_so_targets;
+   uint8_t so_targets_dirty;
+   struct pipe_stream_output_target *so_target[4];
+
    struct pipe_framebuffer_state framebuffer;
    struct pipe_blend_color blend_colour;
    struct pipe_stencil_ref stencil_ref;
@@ -168,6 +176,14 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 
 /* nv50_query.c */
 void nv50_init_query_functions(struct nv50_context *);
+void nv50_query_pushbuf_submit(struct nouveau_pushbuf *,
+                               struct pipe_query *, unsigned result_offset);
+void nv84_query_fifo_wait(struct nouveau_pushbuf *, struct pipe_query *);
+void nva0_so_target_save_offset(struct pipe_context *,
+                                struct pipe_stream_output_target *,
+                                unsigned index, boolean seralize);
+
+#define NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
 /* nv50_shader_state.c */
 void nv50_vertprog_validate(struct nv50_context *);
@@ -177,6 +193,7 @@ void nv50_fp_linkage_validate(struct nv50_context *);
 void nv50_gp_linkage_validate(struct nv50_context *);
 void nv50_constbufs_validate(struct nv50_context *);
 void nv50_validate_derived_rs(struct nv50_context *);
+void nv50_stream_output_validate(struct nv50_context *);
 
 /* nv50_state.c */
 extern void nv50_init_state_functions(struct nv50_context *);
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 1b2e2934b79..ca40ac2dd43 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -68,6 +68,17 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
          break;
       }
    }
+
+   /*
+    * Corner case: VP has no inputs, but we will still need to submit data to
+    * draw it. HW will shout at us and won't draw anything if we don't enable
+    * any input, so let's just pretend it's the first one.
+    */
+   if (prog->vp.attrs[0] == 0 &&
+       prog->vp.attrs[1] == 0 &&
+       prog->vp.attrs[2] == 0)
+      prog->vp.attrs[0] |= 0xf;
+
    /* VertexID before InstanceID */
    if (info->io.vertexId < info->numSysVals)
       info->sv[info->io.vertexId].slot[0] = n++;
@@ -235,6 +246,59 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
    }
 }
 
+static struct nv50_stream_output_state *
+nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
+                                  const struct pipe_stream_output_info *pso)
+{
+   struct nv50_stream_output_state *so;
+   unsigned b, i, c;
+   unsigned base[4];
+
+   so = MALLOC_STRUCT(nv50_stream_output_state);
+   if (!so)
+      return NULL;
+   memset(so->map, 0xff, sizeof(so->map));
+
+   for (b = 0; b < 4; ++b)
+      so->num_attribs[b] = 0;
+   for (i = 0; i < pso->num_outputs; ++i) {
+      unsigned end =  pso->output[i].dst_offset + pso->output[i].num_components;
+      b = pso->output[i].output_buffer;
+      assert(b < 4);
+      so->num_attribs[b] = MAX2(so->num_attribs[b], end);
+   }
+
+   so->ctrl = NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED;
+
+   so->stride[0] = pso->stride[0] * 4;
+   base[0] = 0;
+   for (b = 1; b < 4; ++b) {
+      assert(!so->num_attribs[b] || so->num_attribs[b] == pso->stride[b]);
+      so->stride[b] = so->num_attribs[b] * 4;
+      if (so->num_attribs[b])
+         so->ctrl = (b + 1) << NV50_3D_STRMOUT_BUFFERS_CTRL_SEPARATE__SHIFT;
+      base[b] = align(base[b - 1] + so->num_attribs[b - 1], 4);
+   }
+   if (so->ctrl & NV50_3D_STRMOUT_BUFFERS_CTRL_INTERLEAVED) {
+      assert(so->stride[0] < NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__MAX);
+      so->ctrl |= so->stride[0] << NV50_3D_STRMOUT_BUFFERS_CTRL_STRIDE__SHIFT;
+   }
+
+   so->map_size = base[3] + so->num_attribs[3];
+
+   for (i = 0; i < pso->num_outputs; ++i) {
+      const unsigned s = pso->output[i].start_component;
+      const unsigned p = pso->output[i].dst_offset;
+      const unsigned r = pso->output[i].register_index;
+      b = pso->output[i].output_buffer;
+
+      for (c = 0; c < pso->output[i].num_components; ++c)
+         so->map[base[b] + p + c] = info->out[r].slot[s + c];
+   }
+
+   return so;
+}
+
 boolean
 nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
 {
@@ -293,6 +357,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_USES_KIL;
    }
 
+   if (prog->pipe.stream_output.num_outputs)
+      prog->so = nv50_program_create_strmout_state(info,
+                                                   &prog->pipe.stream_output);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 92361ad9946..f56268b5439 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -42,6 +42,15 @@ struct nv50_varying {
    ubyte si; /* semantic index */
 };
 
+struct nv50_stream_output_state
+{
+   uint32_t ctrl; /* value written to STRMOUT_BUFFERS_CTRL at validation */
+   uint16_t stride[4]; /* per buffer: 4 * pso->stride[0] for buffer 0, 4 * num_attribs[b] otherwise */
+   uint8_t num_attribs[4]; /* per buffer: largest dst_offset + num_components of its outputs */
+   uint8_t map_size; /* number of leading map[] entries that are in use */
+   uint8_t map[128]; /* output slot for each stream output component; 0xff = unused */
+};
+
 struct nv50_program {
    struct pipe_shader_state pipe;
 
@@ -88,6 +97,8 @@ struct nv50_program {
    void *fixups; /* relocation records */
 
    struct nouveau_heap *mem;
+
+   struct nv50_stream_output_state *so;
 };
 
 boolean nv50_program_translate(struct nv50_program *, uint16_t chipset);
diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c
index 04e32b7e8b9..3abe189e7b5 100644
--- a/src/gallium/drivers/nv50/nv50_push.c
+++ b/src/gallium/drivers/nv50/nv50_push.c
@@ -210,7 +210,8 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
 {
    struct push_context ctx;
    unsigned i, index_size;
-   unsigned inst = info->instance_count;
+   unsigned inst_count = info->instance_count;
+   unsigned vert_count = info->count;
    boolean apply_bias = info->indexed && info->index_bias;
 
    ctx.push = nv50->base.pushbuf;
@@ -242,6 +243,17 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
       ctx.primitive_restart = info->primitive_restart;
       ctx.restart_index = info->restart_index;
    } else {
+      if (unlikely(info->count_from_stream_output)) {
+         struct pipe_context *pipe = &nv50->base.pipe;
+         struct nv50_so_target *targ;
+         targ = nv50_so_target(info->count_from_stream_output);
+         if (!targ->pq) {
+            NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+            return;
+         }
+         pipe->get_query_result(pipe, targ->pq, TRUE, (void *)&vert_count);
+         vert_count /= targ->stride;
+      }
       ctx.idxbuf = NULL;
       index_size = 0;
       ctx.primitive_restart = FALSE;
@@ -262,21 +274,21 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
    }
    nv50->state.prim_restart = info->primitive_restart;
 
-   while (inst--) {
+   while (inst_count--) {
       BEGIN_NV04(ctx.push, NV50_3D(VERTEX_BEGIN_GL), 1);
       PUSH_DATA (ctx.push, ctx.prim);
       switch (index_size) {
       case 0:
-         emit_vertices_seq(&ctx, info->start, info->count);
+         emit_vertices_seq(&ctx, info->start, vert_count);
          break;
       case 1:
-         emit_vertices_i08(&ctx, info->start, info->count);
+         emit_vertices_i08(&ctx, info->start, vert_count);
          break;
       case 2:
-         emit_vertices_i16(&ctx, info->start, info->count);
+         emit_vertices_i16(&ctx, info->start, vert_count);
          break;
       case 4:
-         emit_vertices_i32(&ctx, info->start, info->count);
+         emit_vertices_i32(&ctx, info->start, vert_count);
          break;
       default:
          assert(0);
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 5275e74964a..8e62c5f11bc 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -36,7 +36,8 @@
 
 struct nv50_query {
    uint32_t *data;
-   uint32_t type;
+   uint16_t type;
+   uint16_t index;
    uint32_t sequence;
    struct nouveau_bo *bo;
    uint32_t base;
@@ -170,21 +171,15 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
       BEGIN_NV04(push, NV50_3D(SAMPLECNT_ENABLE), 1);
       PUSH_DATA (push, 1);
       break;
-   case PIPE_QUERY_PRIMITIVES_GENERATED: /* store before & after instead ? */
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES);
+   case PIPE_QUERY_PRIMITIVES_GENERATED:
+      nv50_query_get(push, q, 0x10, 0x06805002);
       break;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      PUSH_SPACE(push, 2);
-      BEGIN_NV04(push, NV50_3D(COUNTER_RESET), 1);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK);
+      nv50_query_get(push, q, 0x10, 0x05805002);
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      PUSH_SPACE(push, 3);
-      BEGIN_NI04(push, NV50_3D(COUNTER_RESET), 2);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_TRANSFORM_FEEDBACK);
-      PUSH_DATA (push, NV50_3D_COUNTER_RESET_GENERATED_PRIMITIVES);
+      nv50_query_get(push, q, 0x20, 0x05805002);
+      nv50_query_get(push, q, 0x30, 0x06805002);
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
    case PIPE_QUERY_TIME_ELAPSED:
@@ -227,6 +222,9 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    case PIPE_QUERY_GPU_FINISHED:
       nv50_query_get(push, q, 0, 0x1000f010);
       break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      nv50_query_get(push, q, 0, 0x0d005002 | (q->index << 5));
+      break;
    default:
       assert(0);
       break;
@@ -247,6 +245,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    struct nv50_context *nv50 = nv50_context(pipe);
    struct nv50_query *q = nv50_query(pq);
    uint64_t *res64 = (uint64_t *)result;
+   uint32_t *res32 = (uint32_t *)result;
    boolean *res8 = (boolean *)result;
    uint64_t *data64 = (uint64_t *)q->data;
 
@@ -275,11 +274,11 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       break;
    case PIPE_QUERY_PRIMITIVES_GENERATED: /* u64 count, u64 time */
    case PIPE_QUERY_PRIMITIVES_EMITTED: /* u64 count, u64 time */
-      res64[0] = data64[0];
+      res64[0] = data64[0] - data64[2];
       break;
    case PIPE_QUERY_SO_STATISTICS:
-      res64[0] = data64[0];
-      res64[1] = data64[1];
+      res64[0] = data64[0] - data64[4];
+      res64[1] = data64[2] - data64[6];
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT: /* u32 sequence, u32 0, u64 time */
       res64[0] = 1000000000;
@@ -288,6 +287,9 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    case PIPE_QUERY_TIME_ELAPSED:
       res64[0] = data64[1] - data64[3];
       break;
+   case NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET:
+      res32[0] = q->data[1];
+      break;
    default:
       return FALSE;
    }
@@ -295,6 +297,21 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    return TRUE;
 }
 
+void
+nv84_query_fifo_wait(struct nouveau_pushbuf *push, struct pipe_query *pq)
+{ /* Emit a semaphore ACQUIRE_EQUAL on the query's memory with value q->sequence. */
+   struct nv50_query *q = nv50_query(pq);
+   unsigned offset = q->offset;
+
+   PUSH_SPACE(push, 5); /* presumably 1 method header + the 4 data words below */
+   PUSH_REFN (push, q->bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   BEGIN_NV04(push, SUBC_3D(NV84_SUBCHAN_SEMAPHORE_ADDRESS_HIGH), 4);
+   PUSH_DATAh(push, q->bo->offset + offset); /* semaphore address, high part */
+   PUSH_DATA (push, q->bo->offset + offset); /* semaphore address, low part */
+   PUSH_DATA (push, q->sequence); /* value compared against the semaphore */
+   PUSH_DATA (push, NV84_SUBCHAN_SEMAPHORE_TRIGGER_ACQUIRE_EQUAL);
+}
+
 static void
 nv50_render_condition(struct pipe_context *pipe,
                       struct pipe_query *pq, uint mode)
@@ -325,6 +342,38 @@ nv50_render_condition(struct pipe_context *pipe,
 }
 
 void
+nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
+                          struct pipe_query *pq, unsigned result_offset)
+{ /* Submit 4 bytes of the query buffer, at q->offset + result_offset, as pushbuf data. */
+   struct nv50_query *q = nv50_query(pq);
+
+   /* XXX: does this exist ? (as defined here it expands to 0, so OR-ing it in changes nothing) */
+#define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8))
+
+   nouveau_pushbuf_space(push, 0, 0, 1);
+   nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 | /* callers emit the method header first */
+                        NV50_IB_ENTRY_1_NO_PREFETCH);
+}
+
+void
+nva0_so_target_save_offset(struct pipe_context *pipe,
+                           struct pipe_stream_output_target *ptarg,
+                           unsigned index, boolean serialize)
+{ /* End ptarg's offset query for stream output buffer slot `index`. */
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+
+   if (serialize) { /* caller asked for NV50_GRAPH_SERIALIZE ahead of the query */
+      struct nouveau_pushbuf *push = nv50_context(pipe)->base.pushbuf;
+      PUSH_SPACE(push, 2);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   nv50_query(targ->pq)->index = index; /* nv50_query_end folds q->index into the query method */
+   nv50_query_end(pipe, targ->pq); /* NOTE(review): targ->pq is NULL on pre-NVA0; callers must gate on that */
+}
+
+void
 nv50_init_query_functions(struct nv50_context *nv50)
 {
    struct pipe_context *pipe = &nv50->base.pipe;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index a6dfbedf299..c96e028b2a2 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -73,6 +73,8 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 static int
 nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 {
+   const uint16_t class_3d = nouveau_screen(pscreen)->class_3d;
+
    switch (param) {
    case PIPE_CAP_MAX_COMBINED_SAMPLERS:
       return 64;
@@ -82,8 +84,8 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
       return 12;
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 14;
-   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: /* shader support missing */
-      return 0;
+   case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+      return 512;
    case PIPE_CAP_MIN_TEXEL_OFFSET:
       return -8;
    case PIPE_CAP_MAX_TEXEL_OFFSET:
@@ -95,7 +97,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_ANISOTROPIC_FILTER:
    case PIPE_CAP_SCALED_RESOLVE:
       return 1;
-   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return nv50_screen(pscreen)->tesla->oclass >= NVA0_3D_CLASS;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
@@ -121,11 +122,12 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_OCCLUSION_QUERY:
       return 1;
    case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS:
-      return 0;
+      return 4;
    case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS:
-      return 128;
    case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS:
-      return 32;
+      return 64;
+   case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME:
+      return (class_3d >= NVA0_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_BLEND_EQUATION_SEPARATE:
    case PIPE_CAP_INDEP_BLEND_ENABLE:
       return 1;
diff --git a/src/gallium/drivers/nv50/nv50_shader_state.c b/src/gallium/drivers/nv50/nv50_shader_state.c
index aef3f129c81..d070f07bbbc 100644
--- a/src/gallium/drivers/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nv50/nv50_shader_state.c
@@ -207,6 +207,8 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
       PUSH_DATA (push, gp->gp.vert_count);
       BEGIN_NV04(push, NV50_3D(GP_START_ID), 1);
       PUSH_DATA (push, gp->code_base);
+
+      nv50->state.prim_size = gp->gp.prim_type; /* enum matches vertex count */
    }
    nv50_program_update_context_state(nv50, gp, 2);
 
@@ -278,6 +280,12 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
 
    nv50_sprite_coords_validate(nv50);
 
+   if (nv50->state.rasterizer_discard != nv50->rast->pipe.rasterizer_discard) {
+      nv50->state.rasterizer_discard = nv50->rast->pipe.rasterizer_discard;
+      BEGIN_NV04(push, NV50_3D(RASTERIZE_ENABLE), 1);
+      PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
+   }
+
    if (nv50->dirty & NV50_NEW_FRAGPROG)
       return;
    psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
@@ -343,6 +351,7 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    uint32_t colors = fp->fp.colors;
    uint32_t lin[4];
    uint8_t map[64];
+   uint8_t so_map[64];
 
    if (!(nv50->dirty & (NV50_NEW_VERTPROG |
                         NV50_NEW_FRAGPROG |
@@ -411,6 +420,30 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    if (nv50->rast->pipe.clamp_vertex_color)
       colors |= NV50_3D_SEMANTIC_COLOR_CLMP_EN;
 
+   if (unlikely(vp->so)) {
+      /* Slot i in STRMOUT_MAP specifies the offset where slot i in RESULT_MAP
+       * gets written.
+       *
+       * TODO:
+       * Inverting vp->so->map (output -> offset) would probably speed this up.
+       */
+      memset(so_map, 0, sizeof(so_map));
+      for (i = 0; i < vp->so->map_size; ++i) {
+         if (vp->so->map[i] == 0xff)
+            continue;
+         for (c = 0; c < m; ++c)
+            if (map[c] == vp->so->map[i] && !so_map[c])
+               break;
+         if (c == m) {
+            c = m;
+            map[m++] = vp->so->map[i];
+         }
+         so_map[c] = 0x80 | i;
+      }
+      for (c = m; c & 3; ++c)
+         so_map[c] = 0;
+   }
+
    n = (m + 3) / 4;
    assert(m <= 64);
 
@@ -451,6 +484,11 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
 
    BEGIN_NV04(push, NV50_3D(GP_ENABLE), 1);
    PUSH_DATA (push, nv50->gmtyprog ? 1 : 0);
+
+   if (vp->so) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_MAP(0)), n);
+      PUSH_DATAp(push, so_map, n);
+   }
 }
 
 static int
@@ -509,3 +547,90 @@ nv50_gp_linkage_validate(struct nv50_context *nv50)
    BEGIN_NV04(push, NV50_3D(VP_RESULT_MAP(0)), n);
    PUSH_DATAp(push, map, n);
 }
+
+/**
+ * Emit the stream output (transform feedback) state.
+ *
+ * The state of the geometry program is used if one is bound, otherwise that
+ * of the vertex program.  Stream output is disabled when the program has no
+ * stream output state or no targets are bound; otherwise the buffer control
+ * word and the address, attribute count and limit of every bound target are
+ * written and stream output is enabled.
+ */
+void
+nv50_stream_output_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_stream_output_state *so;
+   uint32_t ctrl;
+   unsigned i;
+   unsigned prims = ~0; /* ~0 = no primitive limit computed (NVA0+ path) */
+
+   so = nv50->gmtyprog ? nv50->gmtyprog->so : nv50->vertprog->so;
+
+   if (!so || !nv50->num_so_targets) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+      PUSH_DATA (push, 0);
+      if (nv50->screen->base.class_3d < NVA0_3D_CLASS) {
+         BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+         PUSH_DATA (push, 0);
+      }
+      BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+      PUSH_DATA (push, 1);
+      return;
+   }
+
+   ctrl = so->ctrl;
+   if (nv50->screen->base.class_3d >= NVA0_3D_CLASS)
+      ctrl |= NVA0_3D_STRMOUT_BUFFERS_CTRL_LIMIT_MODE_OFFSET;
+
+   BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
+   PUSH_DATA (push, ctrl);
+
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
+
+   for (i = 0; i < nv50->num_so_targets; ++i) {
+      struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
+      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
+
+      const unsigned n = nv50->screen->base.class_3d >= NVA0_3D_CLASS ? 4 : 3;
+
+      /* nv50_push_vbo divides the queried buffer offset by targ->stride to
+       * get the vertex count for count_from_stream_output draws; the field
+       * is not set at target creation, so record the per-buffer stride here.
+       */
+      targ->stride = so->stride[i];
+
+      if (n == 4 && !targ->clean)
+         nv84_query_fifo_wait(push, targ->pq);
+      BEGIN_NV04(push, NV50_3D(STRMOUT_ADDRESS_HIGH(i)), n);
+      PUSH_DATAh(push, buf->address + targ->pipe.buffer_offset);
+      PUSH_DATA (push, buf->address + targ->pipe.buffer_offset);
+      PUSH_DATA (push, so->num_attribs[i]);
+      if (n == 4) {
+         PUSH_DATA(push, targ->pipe.buffer_size);
+
+         BEGIN_NV04(push, NVA0_3D(STRMOUT_OFFSET(i)), 1);
+         if (!targ->clean) {
+            assert(targ->pq);
+            nv50_query_pushbuf_submit(push, targ->pq, 0x4); /* resume at saved offset */
+         } else {
+            PUSH_DATA(push, 0);
+            targ->clean = FALSE;
+         }
+      } else {
+         const unsigned limit = targ->pipe.buffer_size /
+            (so->stride[i] * nv50->state.prim_size);
+         prims = MIN2(prims, limit);
+      }
+      BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+   }
+   if (prims != ~0) {
+      BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+      PUSH_DATA (push, prims);
+   }
+   BEGIN_NV04(push, NV50_3D(STRMOUT_PARAMS_LATCH), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_3D(STRMOUT_ENABLE), 1);
+   PUSH_DATA (push, 1);
+}
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index a17540a1492..7f840e2b42e 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -680,6 +680,9 @@ nv50_sp_state_create(struct pipe_context *pipe,
    prog->type = type;
    prog->pipe.tokens = tgsi_dup_tokens(cso->tokens);
 
+   if (cso->stream_output.num_outputs)
+      prog->pipe.stream_output = cso->stream_output;
+
    return (void *)prog;
 }
 
@@ -920,6 +923,110 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
    nv50->dirty |= NV50_NEW_VERTEX;
 }
 
+/**
+ * Create a stream output target for the range (offset, size) of res.
+ *
+ * On NVA0+ a NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET query is created along
+ * with the target; on older chips pq is NULL.  Returns NULL if allocating
+ * the target or creating the query fails.
+ */
+static struct pipe_stream_output_target *
+nv50_so_target_create(struct pipe_context *pipe,
+                      struct pipe_resource *res,
+                      unsigned offset, unsigned size)
+{
+   struct nv50_so_target *targ = MALLOC_STRUCT(nv50_so_target);
+   if (!targ)
+      return NULL;
+
+   if (nouveau_context(pipe)->screen->class_3d >= NVA0_3D_CLASS) {
+      targ->pq = pipe->create_query(pipe,
+                                    NVA0_QUERY_STREAM_OUTPUT_BUFFER_OFFSET);
+      if (!targ->pq) {
+         FREE(targ);
+         return NULL;
+      }
+   } else {
+      targ->pq = NULL;
+   }
+   targ->clean = TRUE;
+
+   targ->pipe.buffer_size = size;
+   targ->pipe.buffer_offset = offset;
+   targ->pipe.context = pipe;
+   targ->pipe.buffer = NULL;
+   pipe_resource_reference(&targ->pipe.buffer, res);
+   pipe_reference_init(&targ->pipe.reference, 1);
+
+   return &targ->pipe;
+}
+
+/**
+ * Destroy a stream output target: destroy its offset query (if any), drop
+ * the buffer reference taken at creation and free the target itself.
+ */
+static void
+nv50_so_target_destroy(struct pipe_context *pipe,
+                       struct pipe_stream_output_target *ptarg)
+{
+   struct nv50_so_target *targ = nv50_so_target(ptarg);
+   if (targ->pq)
+      pipe->destroy_query(pipe, targ->pq);
+   pipe_resource_reference(&targ->pipe.buffer, NULL);
+   FREE(targ);
+}
+
+/**
+ * Bind num_targets stream output targets and unbind any further old ones.
+ *
+ * A slot whose target is unchanged and whose append_mask bit is set is left
+ * alone.  On NVA0+, the buffer offset of each target that is replaced or
+ * unbound is saved first, serializing only before the first save.  A target
+ * bound without its append_mask bit is marked clean.
+ */
+static void
+nv50_set_stream_output_targets(struct pipe_context *pipe,
+                               unsigned num_targets,
+                               struct pipe_stream_output_target **targets,
+                               unsigned append_mask)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   unsigned i;
+   boolean serialize = TRUE;
+   const boolean can_resume = nv50->screen->base.class_3d >= NVA0_3D_CLASS;
+
+   assert(num_targets <= 4);
+
+   for (i = 0; i < num_targets; ++i) {
+      const boolean changed = nv50->so_target[i] != targets[i];
+      if (!changed && (append_mask & (1 << i)))
+         continue;
+      nv50->so_targets_dirty |= 1 << i;
+
+      if (can_resume && changed && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+
+      if (targets[i] && !(append_mask & (1 << i)))
+         nv50_so_target(targets[i])->clean = TRUE;
+
+      pipe_so_target_reference(&nv50->so_target[i], targets[i]);
+   }
+   for (; i < nv50->num_so_targets; ++i) {
+      if (can_resume && nv50->so_target[i]) {
+         nva0_so_target_save_offset(pipe, nv50->so_target[i], i, serialize);
+         serialize = FALSE;
+      }
+      pipe_so_target_reference(&nv50->so_target[i], NULL);
+      nv50->so_targets_dirty |= 1 << i;
+   }
+   nv50->num_so_targets = num_targets;
+
+   if (nv50->so_targets_dirty)
+      nv50->dirty |= NV50_NEW_STRMOUT;
+}
+
 void
 nv50_init_state_functions(struct nv50_context *nv50)
 {
@@ -975,5 +1062,8 @@ nv50_init_state_functions(struct nv50_context *nv50)
 
    pipe->set_vertex_buffers = nv50_set_vertex_buffers;
    pipe->set_index_buffer = nv50_set_index_buffer;
-}
 
+   pipe->create_stream_output_target = nv50_so_target_create;
+   pipe->stream_output_target_destroy = nv50_so_target_destroy;
+   pipe->set_stream_output_targets = nv50_set_stream_output_targets;
+}
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index c19acf6c426..a95e96d3c51 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -360,6 +360,8 @@ static struct state_validate {
     { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
     { nv50_validate_textures,      NV50_NEW_TEXTURES },
     { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
+    { nv50_stream_output_validate, NV50_NEW_STRMOUT |
+                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
     { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS }
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
diff --git a/src/gallium/drivers/nv50/nv50_stateobj.h b/src/gallium/drivers/nv50/nv50_stateobj.h
index 188406da600..8a9260c937e 100644
--- a/src/gallium/drivers/nv50/nv50_stateobj.h
+++ b/src/gallium/drivers/nv50/nv50_stateobj.h
@@ -51,4 +51,17 @@ struct nv50_vertex_stateobj {
    struct nv50_vertex_element element[0];
 };
 
+struct nv50_so_target {
+   struct pipe_stream_output_target pipe; /* base object; nv50_so_target() casts a pointer to it back to this struct */
+   struct pipe_query *pq; /* buffer offset query; NULL on pre-NVA0 chips */
+   unsigned stride; /* divisor nv50_push_vbo applies to the queried offset to get a vertex count */
+   boolean clean; /* TRUE: no saved offset to resume from (NVA0+ validation then pushes offset 0) */
+};
+
+static INLINE struct nv50_so_target *
+nv50_so_target(struct pipe_stream_output_target *ptarg)
+{
+   return (struct nv50_so_target *)ptarg; /* plain downcast; pipe is the first member of nv50_so_target */
+}
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index b38e49ffcc1..15c88d5316d 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -711,7 +711,7 @@ nv50_blit_set_src(struct nv50_context *nv50,
    templ.swizzle_a = PIPE_SWIZZLE_ALPHA;
 
    nv50->textures[2][0] = nv50_create_sampler_view(pipe, res, &templ);
-   nv50->textures[2][0] = NULL;
+   nv50->textures[2][1] = NULL;
 
    nv50_blit_fixup_tic_entry(nv50->textures[2][0]);
 
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index bc01e69decf..323677eaf80 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -405,6 +405,25 @@ nv50_prim_gl(unsigned prim)
    }
 }
 
+/* For pre-nva0 transform feedback: per-primitive vertex count, indexed by PIPE_PRIM_*, used for the primitive limit. */
+static const uint8_t nv50_pipe_prim_to_prim_size[PIPE_PRIM_MAX + 1] =
+{
+   [PIPE_PRIM_POINTS] = 1,
+   [PIPE_PRIM_LINES] = 2,
+   [PIPE_PRIM_LINE_LOOP] = 2,
+   [PIPE_PRIM_LINE_STRIP] = 2,
+   [PIPE_PRIM_TRIANGLES] = 3,
+   [PIPE_PRIM_TRIANGLE_STRIP] = 3,
+   [PIPE_PRIM_TRIANGLE_FAN] = 3,
+   [PIPE_PRIM_QUADS] = 3, /* NOTE(review): presumably captured as triangles - confirm */
+   [PIPE_PRIM_QUAD_STRIP] = 3,
+   [PIPE_PRIM_POLYGON] = 3,
+   [PIPE_PRIM_LINES_ADJACENCY] = 2, /* NOTE(review): presumably adjacency vertices are not captured - confirm */
+   [PIPE_PRIM_LINE_STRIP_ADJACENCY] = 2,
+   [PIPE_PRIM_TRIANGLES_ADJACENCY] = 3,
+   [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = 3
+};
+
 static void
 nv50_draw_arrays(struct nv50_context *nv50,
                  unsigned mode, unsigned start, unsigned count,
@@ -624,6 +643,51 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
 }
 
 static void
+nva0_draw_stream_output(struct nv50_context *nv50,
+                        const struct pipe_draw_info *info)
+{ /* Draw with the amount of data taken from a stream output target's offset query (NVA0+ only). */
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_so_target *so = nv50_so_target(info->count_from_stream_output);
+   struct nv04_resource *res = nv04_resource(so->pipe.buffer);
+   unsigned num_instances = info->instance_count;
+   unsigned mode = nv50_prim_gl(info->mode);
+
+   if (unlikely(nv50->screen->base.class_3d < NVA0_3D_CLASS)) {
+      /* A proper implementation without waiting doesn't seem possible,
+       * so don't bother.
+       */
+      NOUVEAU_ERR("draw_stream_output not supported on pre-NVA0 cards\n");
+      return;
+   }
+
+   if (res->status & NOUVEAU_BUFFER_STATUS_GPU_WRITING) { /* buffer is flagged as being written by the GPU */
+      res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
+      PUSH_SPACE(push, 4);
+      BEGIN_NV04(push, SUBC_3D(NV50_GRAPH_SERIALIZE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_FLUSH), 1);
+      PUSH_DATA (push, 0);
+   }
+
+   assert(num_instances); /* the do/while below always runs at least once */
+   do {
+      PUSH_SPACE(push, 8);
+      BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
+      PUSH_DATA (push, mode);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BASE), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_STRIDE), 1);
+      PUSH_DATA (push, 0); /* NOTE(review): stride is pushed as 0 although the target carries a stride field - confirm */
+      BEGIN_NV04(push, NVA0_3D(DRAW_TFB_BYTES), 1);
+      nv50_query_pushbuf_submit(push, so->pq, 0x4); /* data word is read from the query buffer at byte 4 */
+      BEGIN_NV04(push, NV50_3D(VERTEX_END_GL), 1);
+      PUSH_DATA (push, 0);
+
+      mode |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT; /* applies to every instance after the first */
+   } while (--num_instances);
+}
+
+static void
 nv50_draw_vbo_kick_notify(struct nouveau_pushbuf *chan)
 {
    struct nv50_screen *screen = chan->user_priv;
@@ -655,6 +719,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_VERTEX | NV50_NEW_ARRAYS)))
       nv50_update_user_vbufs(nv50);
 
+   if (unlikely(nv50->num_so_targets && !nv50->gmtyprog))
+      nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode];
+
    nv50_state_validate(nv50, ~0, 8); /* 8 as minimum, we use flush_notify */
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
@@ -679,11 +746,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nv50->base.vbo_dirty = FALSE;
    }
 
-   if (!info->indexed) {
-      nv50_draw_arrays(nv50,
-                       info->mode, info->start, info->count,
-                       info->instance_count);
-   } else {
+   if (info->indexed) {
       boolean shorten = info->max_index <= 65535;
 
       assert(nv50->idxbuf.buffer);
@@ -713,6 +776,13 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       nv50_draw_elements(nv50, shorten,
                          info->mode, info->start, info->count,
                          info->instance_count, info->index_bias);
+   } else
+   if (unlikely(info->count_from_stream_output)) {
+      nva0_draw_stream_output(nv50, info);
+   } else {
+      nv50_draw_arrays(nv50,
+                       info->mode, info->start, info->count,
+                       info->instance_count);
    }
    push->kick_notify = nv50_default_kick_notify;
 
diff --git a/src/gallium/drivers/nv50/nv50_winsys.h b/src/gallium/drivers/nv50/nv50_winsys.h
index b36898dabe6..145ee70cb9f 100644
--- a/src/gallium/drivers/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nv50/nv50_winsys.h
@@ -49,6 +49,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 
 #define SUBC_3D(m) 3, (m)
 #define NV50_3D(n) SUBC_3D(NV50_3D_##n)
+#define NVA0_3D(n) SUBC_3D(NVA0_3D_##n)
 
 #define SUBC_2D(m) 4, (m)
 #define NV50_2D(n) SUBC_2D(NV50_2D_##n)
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
index 2ca4979dc74..fbd1aa5dfc9 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_emit_nvc0.cpp
@@ -1020,7 +1020,7 @@ CodeEmitterNVC0::emitTEX(const TexInstruction *i)
       code[1] |= 0x02000000;
    }
 
-   if (i->tex.derivAll)
+   if (i->op != OP_TXD && i->tex.derivAll)
       code[1] |= 1 << 13;
 
    defId(i->def(0), 14);
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
index 02ae9fd5d0e..900e998df8d 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_lowering_nvc0.cpp
@@ -749,21 +749,22 @@ bool
 NVC0LoweringPass::handleTXD(TexInstruction *txd)
 {
    int dim = txd->tex.target.getDim();
-   int arg = txd->tex.target.getDim() + txd->tex.target.isArray();
+   int arg = txd->tex.target.getArgCount();
 
    handleTEX(txd);
-   while (txd->src(arg).exists())
+   while (txd->srcExists(arg))
       ++arg;
 
    txd->tex.derivAll = true;
-   if (dim > 2 || txd->tex.target.isShadow())
+   if (dim > 2 ||
+       txd->tex.target.isCube() ||
+       arg > 4 ||
+       txd->tex.target.isShadow())
       return handleManualTXD(txd);
 
-   assert(arg <= 4); // at most s/t/array, x, y, offset
-
    for (int c = 0; c < dim; ++c) {
-      txd->src(arg + c * 2 + 0).set(txd->dPdx[c]);
-      txd->src(arg + c * 2 + 1).set(txd->dPdy[c]);
+      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
+      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
       txd->dPdx[c].set(NULL);
       txd->dPdy[c].set(NULL);
    }
diff --git a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
index 10c2d09d657..e4b9dc18311 100644
--- a/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nvc0/codegen/nv50_ir_target_nvc0.cpp
@@ -223,6 +223,9 @@ static const struct opProperties _initProps[] =
    { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
    { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
    { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_CEIL,   0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_FLOOR,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_TRUNC,  0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
    { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
    { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
    { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
diff --git a/src/gallium/drivers/r300/r300_vs_draw.c b/src/gallium/drivers/r300/r300_vs_draw.c
index 69d67585d8b..b9e73dd514b 100644
--- a/src/gallium/drivers/r300/r300_vs_draw.c
+++ b/src/gallium/drivers/r300/r300_vs_draw.c
@@ -94,11 +94,12 @@ static void emit_output(struct tgsi_transform_context *ctx,
 
     decl = tgsi_default_full_declaration();
     decl.Declaration.File = TGSI_FILE_OUTPUT;
-    decl.Declaration.Interpolate = interp;
+    decl.Declaration.Interpolate = 1;
     decl.Declaration.Semantic = TRUE;
     decl.Semantic.Name = name;
     decl.Semantic.Index = index;
     decl.Range.First = decl.Range.Last = reg;
+    decl.Interp.Interpolate = interp;
     ctx->emit_declaration(ctx, &decl);
     ++vsctx->num_outputs;
 }
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index 3089a829e53..77d2674d262 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -29,7 +29,7 @@ libr600_a_SOURCES += \
 	$(LLVM_C_SOURCES)
 
 libr600_a_LIBADD = \
-	$(top_srcdir)/src/gallium/drivers/radeon/libradeon.a
+	$(top_builddir)/src/gallium/drivers/radeon/libradeon.a
 
 AM_CFLAGS += \
 	$(LLVM_CFLAGS) \
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index b6d03ef37de..d2c1679796a 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -133,6 +133,10 @@ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf)
 					S_SQ_CF_WORD1_COND(cf->cond) |
 					S_SQ_CF_WORD1_POP_COUNT(cf->pop_count);
 		break;
+	case CF_NATIVE:
+		bc->bytecode[id++] = cf->isa[0];
+		bc->bytecode[id++] = cf->isa[1];
+		break;
 	default:
 		R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 		return -EINVAL;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 517121dc288..81aedb5c0ac 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -796,11 +796,11 @@ static void *evergreen_create_dsa_state(struct pipe_context *ctx,
 		alpha_test_control |= S_028410_ALPHA_TEST_ENABLE(1);
 		alpha_ref = fui(state->alpha.ref_value);
 	}
+	dsa->sx_alpha_test_control = alpha_test_control & 0xff;
 	dsa->alpha_ref = alpha_ref;
 
 	/* misc */
 	db_render_control = 0;
-	r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control);
 	r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control);
 	r600_pipe_state_add_reg(rstate, R_028000_DB_RENDER_CONTROL, db_render_control);
 	return rstate;
@@ -1428,6 +1428,11 @@ static void evergreen_cb(struct r600_context *rctx, struct r600_pipe_state *rsta
 		blend_bypass = 1;
 	}
 
+	if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT)
+		rctx->sx_alpha_test_control |= S_028410_ALPHA_TEST_BYPASS(1);
+	else
+		rctx->sx_alpha_test_control &= C_028410_ALPHA_TEST_BYPASS;
+
 	color_info |= S_028C70_FORMAT(format) |
 		S_028C70_COMP_SWAP(swap) |
 		S_028C70_BLEND_CLAMP(blend_clamp) |
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 4009e91d4fc..105d80f061d 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -32,20 +32,16 @@
 #define EVERGREEN_CONTEXT_REG_OFFSET                0X00028000
 #define EVERGREEN_CONTEXT_REG_END                   0X00029000
 #define EVERGREEN_RESOURCE_OFFSET                   0x00030000
-#define EVERGREEN_RESOURCE_END                      0x00034000
-#define CAYMAN_RESOURCE_END                         0x00038000
+#define EVERGREEN_RESOURCE_END                      0x00038000
 #define EVERGREEN_LOOP_CONST_OFFSET                 0x0003A200
-#define EVERGREEN_LOOP_CONST_END                    0x0003A26C
+#define EVERGREEN_LOOP_CONST_END                    0x0003A500
 #define EVERGREEN_BOOL_CONST_OFFSET                 0x0003A500
-#define EVERGREEN_BOOL_CONST_END                    0x0003A506
-#define CAYMAN_BOOL_CONST_END                       0x0003A518
+#define EVERGREEN_BOOL_CONST_END                    0x0003A518
 #define EVERGREEN_SAMPLER_OFFSET                    0X0003C000
-#define EVERGREEN_SAMPLER_END                       0X0003CFF0
-#define CAYMAN_SAMPLER_END                          0X0003C600
+#define EVERGREEN_SAMPLER_END                       0X0003C600
 
 #define EVERGREEN_CTL_CONST_OFFSET                  0x0003CFF0
-#define EVERGREEN_CTL_CONST_END                     0x0003E200
-#define CAYMAN_CTL_CONST_END                        0x0003FF0C
+#define EVERGREEN_CTL_CONST_END                     0x0003FF0C
 
 #define EVENT_TYPE_PS_PARTIAL_FLUSH            0x10
 #define EVENT_TYPE_ZPASS_DONE                  0x15
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 651933bf37c..5a10bd90776 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -94,6 +94,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
@@ -153,6 +154,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE_UINT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETGE:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE:
@@ -171,6 +173,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode *
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE:
@@ -1927,6 +1930,7 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
+			case CF_NATIVE:
 				break;
 			default:
 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
@@ -2025,13 +2029,12 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 				}
 				break;
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX:
-				if (bc->chip_class == CAYMAN) {
-					LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
-						r = r600_bytecode_vtx_build(bc, vtx, addr);
-						if (r)
-							return r;
-						addr += 4;
-					}
+				LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
+					assert(bc->chip_class >= EVERGREEN);
+					r = r600_bytecode_vtx_build(bc, vtx, addr);
+					if (r)
+						return r;
+					addr += 4;
 				}
 				LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
 					r = r600_bytecode_tex_build(bc, tex, addr);
@@ -2069,6 +2072,8 @@ int r600_bytecode_build(struct r600_bytecode *bc)
 			case EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
 			case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
 				break;
+			case CF_NATIVE:
+				break;
 			default:
 				R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
 				return -EINVAL;
@@ -2341,6 +2346,10 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 				fprintf(stderr, "COND:%X ", cf->cond);
 				fprintf(stderr, "POP_COUNT:%X\n", cf->pop_count);
 				break;
+			case CF_NATIVE:
+				fprintf(stderr, "%04d %08X CF NATIVE\n", id, bc->bytecode[id]);
+				fprintf(stderr, "%04d %08X CF NATIVE\n", id + 1, bc->bytecode[id + 1]);
+				break;
 			default:
 				R600_ERR("Unknown instruction %0x\n", cf->inst);
 			}
@@ -2477,7 +2486,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc)
 			if (alu->last) {
 				for (i = 0; i < nliteral; i++, id++) {
 					float *f = (float*)(bc->bytecode + id);
-					fprintf(stderr, "%04d %08X\t%f\n", id, bc->bytecode[id], *f);
+					fprintf(stderr, "%04d %08X\t%f (%d)\n", id, bc->bytecode[id], *f,
+							*(bc->bytecode + id));
 				}
 				id += nliteral & 1;
 				nliteral = 0;
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 5790ead991f..a8a157b79e4 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -135,6 +135,14 @@ struct r600_bytecode_kcache {
 	unsigned			addr;
 };
 
+/* A value of CF_NATIVE in r600_bytecode_cf::inst means that this instruction
+ * has already been encoded, and the encoding has been stored in
+ * r600_bytecode_cf::isa.  This is used by the LLVM backend to emit CF instructions
+ * e.g. RAT_WRITE_* that can't be properly represented by struct
+ * r600_bytecode_cf.
+ */
+#define CF_NATIVE ~0
+
 struct r600_bytecode_cf {
 	struct list_head		list;
 
@@ -157,6 +165,7 @@ struct r600_bytecode_cf {
 	struct r600_bytecode_alu		*curr_bs_head;
 	struct r600_bytecode_alu		*prev_bs_head;
 	struct r600_bytecode_alu		*prev2_bs_head;
+	unsigned isa[2];
 };
 
 #define FC_NONE				0
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
index d467baf60fb..f916604db7b 100644
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ b/src/gallium/drivers/r600/r600_llvm.c
@@ -21,10 +21,44 @@ static LLVMValueRef llvm_fetch_const(
 	enum tgsi_opcode_type type,
 	unsigned swizzle)
 {
-	return lp_build_intrinsic_unary(bld_base->base.gallivm->builder,
+	LLVMValueRef cval = lp_build_intrinsic_unary(bld_base->base.gallivm->builder,
 		"llvm.AMDGPU.load.const", bld_base->base.elem_type,
 		lp_build_const_int32(bld_base->base.gallivm,
 		radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)));
+
+	return bitcast(bld_base, type, cval);
+}
+
+static void llvm_load_system_value(
+		struct radeon_llvm_context * ctx,
+		unsigned index,
+		const struct tgsi_full_declaration *decl)
+{
+	unsigned chan;
+
+	switch (decl->Semantic.Name) {
+	case TGSI_SEMANTIC_INSTANCEID: chan = 3; break;
+	case TGSI_SEMANTIC_VERTEXID: chan = 0; break;
+	default: assert(!"unknown system value");
+	}
+
+	LLVMValueRef reg = lp_build_const_int32(
+			ctx->soa.bld_base.base.gallivm, chan);
+	ctx->system_values[index] = lp_build_intrinsic_unary(
+			ctx->soa.bld_base.base.gallivm->builder,
+			"llvm.R600.load.input",
+			ctx->soa.bld_base.base.elem_type, reg);
+}
+
+static LLVMValueRef llvm_fetch_system_value(
+		struct lp_build_tgsi_context * bld_base,
+		const struct tgsi_full_src_register *reg,
+		enum tgsi_opcode_type type,
+		unsigned swizzle)
+{
+	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+	LLVMValueRef cval = ctx->system_values[reg->Register.Index];
+	return bitcast(bld_base, type, cval);
 }
 
 static void llvm_load_input(
@@ -59,17 +93,13 @@ static void llvm_emit_prologue(struct lp_build_tgsi_context * bld_base)
 	for (i = 0; i < ctx->reserved_reg_count; i++) {
 		unsigned chan;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			LLVMValueRef reg;
 			LLVMValueRef reg_index = lp_build_const_int32(
 					base->gallivm,
 					radeon_llvm_reg_index_soa(i, chan));
-			reg = lp_build_intrinsic_unary(base->gallivm->builder,
-						"llvm.AMDGPU.reserve.reg",
-						base->elem_type, reg_index);
 			lp_build_intrinsic_unary(base->gallivm->builder,
-				"llvm.AMDGPU.export.reg",
+				"llvm.AMDGPU.reserve.reg",
 				LLVMVoidTypeInContext(base->gallivm->context),
-				reg);
+				reg_index);
 		}
 	}
 }
@@ -85,7 +115,6 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 		unsigned chan;
 		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
 			LLVMValueRef output;
-			LLVMValueRef store_output;
 			unsigned adjusted_reg_idx = i +
 					ctx->reserved_reg_count;
 			LLVMValueRef reg_index = lp_build_const_int32(
@@ -95,16 +124,11 @@ static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 			output = LLVMBuildLoad(base->gallivm->builder,
 				ctx->soa.outputs[i][chan], "");
 
-			store_output = lp_build_intrinsic_binary(
+			lp_build_intrinsic_binary(
 				base->gallivm->builder,
 				"llvm.AMDGPU.store.output",
-				base->elem_type,
-				output, reg_index);
-
-			lp_build_intrinsic_unary(base->gallivm->builder,
-				"llvm.AMDGPU.export.reg",
 				LLVMVoidTypeInContext(base->gallivm->context),
-				store_output);
+				output, reg_index);
 		}
 	}
 }
@@ -169,28 +193,7 @@ static struct lp_build_tgsi_action dot_action = {
 	.intr_name = "llvm.AMDGPU.dp4"
 };
 
-static void txp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	LLVMValueRef src_w;
-	unsigned chan;
-	LLVMValueRef coords[4];
-
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
 
-	for (chan = 0; chan < 3; chan++ ) {
-		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		coords[chan] = lp_build_emit_llvm_binary(bld_base,
-					TGSI_OPCODE_DIV, arg, src_w);
-	}
-	coords[3] = bld_base->base.one;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;
-}
 
 LLVMModuleRef r600_tgsi_llvm(
 	struct radeon_llvm_context * ctx,
@@ -204,20 +207,25 @@ LLVMModuleRef r600_tgsi_llvm(
 	bld_base->info = &shader_info;
 	bld_base->userdata = ctx;
 	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
+	bld_base->emit_fetch_funcs[TGSI_FILE_SYSTEM_VALUE] = llvm_fetch_system_value;
 	bld_base->emit_prologue = llvm_emit_prologue;
 	bld_base->emit_epilogue = llvm_emit_epilogue;
 	ctx->userdata = ctx;
 	ctx->load_input = llvm_load_input;
+	ctx->load_system_value = llvm_load_system_value;
 
 	bld_base->op_actions[TGSI_OPCODE_DP2] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action;
 	bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action;
+	bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
 	bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex;
 
 	lp_build_tgsi_llvm(bld_base, tokens);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 63fc27564d7..db455f021ad 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -161,6 +161,7 @@ struct r600_pipe_dsa {
 	ubyte				valuemask[2];
 	ubyte				writemask[2];
 	bool				is_flush;
+	unsigned                        sx_alpha_test_control;
 };
 
 struct r600_vertex_element
@@ -250,6 +251,7 @@ struct r600_context {
 	struct pipe_framebuffer_state	framebuffer;
 	unsigned			cb_target_mask;
 	unsigned			fb_cb_shader_mask;
+	unsigned			sx_alpha_test_control;
 	unsigned			cb_shader_mask;
 	unsigned			cb_color_control;
 	unsigned			pa_sc_line_stipple;
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 5e22b35ba48..cd78104a010 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -287,6 +287,7 @@ static void llvm_if(struct r600_shader_ctx *ctx, struct r600_bytecode_alu * alu,
 {
 	alu->inst = pred_inst; 
 	alu->predicate = 1;
+	alu->dst.write = 0;
 	alu->src[1].sel = V_SQ_ALU_SRC_0;
 	alu->src[1].chan = 0;
 	alu->last = 1;
@@ -362,6 +363,10 @@ static unsigned r600_fc_from_byte_stream(struct r600_shader_ctx *ctx,
 			tgsi_loop_brk_cont(ctx);
 		}
 		break;
+	case 8:
+		r600_break_from_byte_stream(ctx, &alu,
+			CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETE_INT));
+		break;
 	}
 
 	return bytes_read;
@@ -401,10 +406,43 @@ static unsigned r600_tex_from_byte_stream(struct r600_shader_ctx *ctx,
 	return bytes_read;
 }
 
+static int r600_vtx_from_byte_stream(struct r600_shader_ctx *ctx,
+	unsigned char * bytes, unsigned bytes_read)
+{
+	struct r600_bytecode_vtx vtx;
+	memset(&vtx, 0, sizeof(vtx));
+	vtx.inst = bytes[bytes_read++];
+	vtx.fetch_type = bytes[bytes_read++];
+	vtx.buffer_id = bytes[bytes_read++];
+	vtx.src_gpr = bytes[bytes_read++];
+	vtx.src_sel_x = bytes[bytes_read++];
+	vtx.mega_fetch_count = bytes[bytes_read++];
+	vtx.dst_gpr = bytes[bytes_read++];
+	vtx.dst_sel_x = bytes[bytes_read++];
+	vtx.dst_sel_y = bytes[bytes_read++];
+	vtx.dst_sel_z = bytes[bytes_read++];
+	vtx.dst_sel_w = bytes[bytes_read++];
+	vtx.use_const_fields = bytes[bytes_read++];
+	vtx.data_format = bytes[bytes_read++];
+	vtx.num_format_all = bytes[bytes_read++];
+	vtx.format_comp_all = bytes[bytes_read++];
+	vtx.srf_mode_all = bytes[bytes_read++];
+	vtx.offset = bytes[bytes_read++];
+	vtx.endian = bytes[bytes_read++];
+
+	if (r600_bytecode_add_vtx(ctx->bc, &vtx)) {
+		fprintf(stderr, "Error adding vtx\n");
+	}
+	/* Use the Texture Cache */
+	ctx->bc->cf_last->inst = EG_V_SQ_CF_WORD1_SQ_CF_INST_TEX;
+	return bytes_read;
+}
+
 static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
 				unsigned char * bytes,	unsigned num_bytes)
 {
 	unsigned bytes_read = 0;
+	unsigned i, byte;
 	while (bytes_read < num_bytes) {
 		char inst_type = bytes[bytes_read++];
 		switch (inst_type) {
@@ -420,6 +458,20 @@ static void r600_bytecode_from_byte_stream(struct r600_shader_ctx *ctx,
 			bytes_read = r600_fc_from_byte_stream(ctx, bytes,
 								bytes_read);
 			break;
+		case 3:
+			r600_bytecode_add_cfinst(ctx->bc, CF_NATIVE);
+			for (i = 0; i < 2; i++) {
+				for (byte = 0 ; byte < 4; byte++) {
+					ctx->bc->cf_last->isa[i] |=
+					(bytes[bytes_read++] << (byte * 8));
+				}
+			}
+			break;
+
+		case 4:
+			bytes_read = r600_vtx_from_byte_stream(ctx, bytes,
+								bytes_read);
+			break;
 		default:
 			/* XXX: Error here */
 			break;
@@ -670,8 +722,8 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->input[i].name = d->Semantic.Name;
 		ctx->shader->input[i].sid = d->Semantic.Index;
 		ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
-		ctx->shader->input[i].interpolate = d->Declaration.Interpolate;
-		ctx->shader->input[i].centroid = d->Declaration.Centroid;
+		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
+		ctx->shader->input[i].centroid = d->Interp.Centroid;
 		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
 		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
 			switch (ctx->shader->input[i].name) {
@@ -697,7 +749,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 		ctx->shader->output[i].sid = d->Semantic.Index;
 		ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
 		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
-		ctx->shader->output[i].interpolate = d->Declaration.Interpolate;
+		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
 		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
 		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
 			switch (d->Semantic.Name) {
@@ -5102,7 +5154,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{80,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PUSHA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_POPA,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
 	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
 	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
 	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
@@ -5168,16 +5220,16 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CASE,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DEFAULT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDSWITCH,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD,      0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I,  0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_RESINFO,	0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_UARL,      0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_r600_arl},
@@ -5276,7 +5328,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
 	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2_trans},
 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
 	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
@@ -5342,16 +5394,16 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD,      0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_RESINFO,	0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
@@ -5450,7 +5502,7 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
 	{80,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_PUSHA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_POPA,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
+	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CEIL, tgsi_op2},
 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT, tgsi_op2},
 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOT_INT, tgsi_op2},
 	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
@@ -5516,16 +5568,16 @@ static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CASE,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_DEFAULT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_ENDSWITCH,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD,      0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_LOAD_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE,    0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I,      0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SAMPLE_I_MS,   0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_B,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_D,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_L,  0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_GATHER4,   0, 0, tgsi_unsupported},
-	{TGSI_OPCODE_RESINFO,	0, 0, tgsi_unsupported},
+	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
 	{TGSI_OPCODE_UARL,      0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT, tgsi_eg_arl},
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 3a83b613e58..acf59f80bf4 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -805,9 +805,9 @@ static void *r600_create_dsa_state(struct pipe_context *ctx,
 		alpha_test_control |= S_028410_ALPHA_TEST_ENABLE(1);
 		alpha_ref = fui(state->alpha.ref_value);
 	}
+	dsa->sx_alpha_test_control = alpha_test_control & 0xff;
 	dsa->alpha_ref = alpha_ref;
 
-	r600_pipe_state_add_reg(rstate, R_028410_SX_ALPHA_TEST_CONTROL, alpha_test_control);
 	r600_pipe_state_add_reg(rstate, R_028800_DB_DEPTH_CONTROL, db_depth_control);
 	return rstate;
 }
@@ -1466,6 +1466,11 @@ static void r600_cb(struct r600_context *rctx, struct r600_pipe_state *rstate,
 		blend_bypass = 1;
 	}
 
+	if (ntype == V_0280A0_NUMBER_UINT || ntype == V_0280A0_NUMBER_SINT)
+		rctx->sx_alpha_test_control |= S_028410_ALPHA_TEST_BYPASS(1);
+	else
+		rctx->sx_alpha_test_control &= C_028410_ALPHA_TEST_BYPASS;
+
 	color_info |= S_0280A0_FORMAT(format) |
 		S_0280A0_COMP_SWAP(swap) |
 		S_0280A0_BLEND_BYPASS(blend_bypass) |
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index ccae7d91d43..d47383558d9 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -244,6 +244,8 @@ void r600_bind_dsa_state(struct pipe_context *ctx, void *state)
 		return;
 	rstate = &dsa->rstate;
 	rctx->states[rstate->id] = rstate;
+	rctx->sx_alpha_test_control &= ~0xff;
+	rctx->sx_alpha_test_control |= dsa->sx_alpha_test_control;
 	rctx->alpha_ref = dsa->alpha_ref;
 	rctx->alpha_ref_dirty = true;
 	r600_context_pipe_state_set(rctx, rstate);
@@ -796,6 +798,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 		r600_pipe_state_add_reg(&rctx->vgt, R_02823C_CB_SHADER_MASK, 0);
 		r600_pipe_state_add_reg(&rctx->vgt, R_028408_VGT_INDX_OFFSET, info.index_bias);
 		r600_pipe_state_add_reg(&rctx->vgt, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info.restart_index);
+		r600_pipe_state_add_reg(&rctx->vgt, R_028410_SX_ALPHA_TEST_CONTROL, 0);
 		r600_pipe_state_add_reg(&rctx->vgt, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info.primitive_restart);
 		r600_pipe_state_add_reg(&rctx->vgt, R_03CFF4_SQ_VTX_START_INST_LOC, info.start_instance);
 		r600_pipe_state_add_reg(&rctx->vgt, R_028A0C_PA_SC_LINE_STIPPLE, 0);
@@ -817,6 +820,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 	r600_pipe_state_mod_reg(&rctx->vgt, rctx->cb_shader_mask);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.index_bias);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.restart_index);
+	r600_pipe_state_mod_reg(&rctx->vgt, rctx->sx_alpha_test_control);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.primitive_restart);
 	r600_pipe_state_mod_reg(&rctx->vgt, info.start_instance);
 
diff --git a/src/gallium/drivers/radeon/AMDGPU.h b/src/gallium/drivers/radeon/AMDGPU.h
index eff002a5eae..0f42cb744d3 100644
--- a/src/gallium/drivers/radeon/AMDGPU.h
+++ b/src/gallium/drivers/radeon/AMDGPU.h
@@ -1,4 +1,4 @@
-//===-- AMDGPU.h - TODO: Add brief description -------===//
+//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 #ifndef AMDGPU_H
 #define AMDGPU_H
@@ -19,29 +15,24 @@
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
-    class FunctionPass;
-    class AMDGPUTargetMachine;
-
-    FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS);
-    FunctionPass *createR600LowerShaderInstructionsPass(TargetMachine &tm);
-    FunctionPass *createR600LowerInstructionsPass(TargetMachine &tm);
-
-    FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
-    FunctionPass *createSIInitMachineFunctionInfoPass(TargetMachine &tm);
-    FunctionPass *createSILowerShaderInstructionsPass(TargetMachine &tm);
-    FunctionPass *createSIPropagateImmReadsPass(TargetMachine &tm);
-    FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 
-    FunctionPass *createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm);
+class FunctionPass;
+class AMDGPUTargetMachine;
 
-    FunctionPass *createAMDGPULowerInstructionsPass(TargetMachine &tm);
-    FunctionPass *createAMDGPULowerShaderInstructionsPass(TargetMachine &tm);
+// R600 Passes
+FunctionPass* createR600KernelParametersPass(const TargetData* TD);
+FunctionPass *createR600CodeEmitterPass(formatted_raw_ostream &OS);
+FunctionPass *createR600LowerInstructionsPass(TargetMachine &tm);
 
-    FunctionPass *createAMDGPUDelimitInstGroupsPass(TargetMachine &tm);
+// SI Passes
+FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
+FunctionPass *createSIPropagateImmReadsPass(TargetMachine &tm);
+FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 
-    FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
+// Passes common to R600 and SI
+FunctionPass *createAMDGPULowerInstructionsPass(TargetMachine &tm);
+FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
 
-    FunctionPass *createAMDGPUFixRegClassesPass(TargetMachine &tm);
+} // End namespace llvm
 
-} /* End namespace llvm */
-#endif /* AMDGPU_H */
+#endif // AMDGPU_H
diff --git a/src/gallium/drivers/radeon/AMDGPUConstants.pm b/src/gallium/drivers/radeon/AMDGPUConstants.pm
deleted file mode 100644
index b64ff49c187..00000000000
--- a/src/gallium/drivers/radeon/AMDGPUConstants.pm
+++ /dev/null
@@ -1,44 +0,0 @@
-#===-- AMDGPUConstants.pm - TODO: Add brief description -------===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===----------------------------------------------------------------------===#
-#
-# TODO: Add full description
-#
-#===----------------------------------------------------------------------===#
-
-package AMDGPUConstants;
-
-use base 'Exporter';
-
-use constant CONST_REG_COUNT => 256;
-use constant TEMP_REG_COUNT => 128;
-
-our @EXPORT = ('TEMP_REG_COUNT', 'CONST_REG_COUNT', 'get_hw_index', 'get_chan_str');
-
-sub get_hw_index {
-  my ($index) = @_;
-  return int($index / 4);
-}
-
-sub get_chan_str {
-  my ($index) = @_;
-  my $chan = $index % 4;
-  if ($chan == 0 )  {
-    return 'X';
-  } elsif ($chan == 1) {
-    return 'Y';
-  } elsif ($chan == 2) {
-    return 'Z';
-  } elsif ($chan == 3) {
-    return 'W';
-  } else {
-    die("Unknown chan value: $chan");
-  }
-}
-
-1;
diff --git a/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
index ce947f8ff78..8e82b8438bb 100644
--- a/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUConvertToISA.cpp
@@ -34,7 +34,7 @@ namespace {
     virtual bool runOnMachineFunction(MachineFunction &MF);
 
   };
-} /* End anonymous namespace */
+} // End anonymous namespace
 
 char AMDGPUConvertToISAPass::ID = 0;
 
diff --git a/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
index 1fd4fb04b3e..130eaac72bc 100644
--- a/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
+++ b/src/gallium/drivers/radeon/AMDGPUGenInstrEnums.pl
@@ -1,15 +1,32 @@
-#===-- AMDGPUGenInstrEnums.pl - TODO: Add brief description -------===#
+#===-- AMDGPUGenInstrEnums.pl - Script for generating instruction enums ----===#
 #
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 #
-#===----------------------------------------------------------------------===#
+#===-----------------------------------------------------------------------===#
 #
-# TODO: Add full description
+# This perl script is used to generate the following files:
 #
-#===----------------------------------------------------------------------===#
+# 1. perl AMDGPUGenInstrEnums.pl td  > AMDGPUInstrEnums.td
+#
+#    This file contains Tablegen constants used for matching hw instructions
+#    from R600 and SI with functionally similar AMDIL instructions.  It also
+#    contains definitions of floating point constants like pi (in hex notation)
+#    that are used in some of the shader patterns.
+#
+# 2. perl AMDGPUGenInstrEnums.pl h   > AMDGPUInstrEnums.h
+#
+#    This file contains cpp enums that match the constant values in
+#    AMDGPUInstrEnums.td
+#
+# 3. perl AMDGPUGenInstrEnums.pl inc > AMDGPUInstrEnums.include
+#
+#    This file contains a function called GetRealAMDILOpcode which maps the
+#    constant values defined in AMDGPUInstrEnums.h to the corresponding AMDIL
+#    instructions.
+#===-----------------------------------------------------------------------===#
 
 use warnings;
 use strict;
@@ -41,7 +58,7 @@ my $FILE_TYPE = $ARGV[0];
 
 open AMDIL, '<', 'AMDILInstructions.td';
 
-my @INST_ENUMS = ('NONE', 'FEQ', 'FGE', 'FLT', 'FNE', 'MOVE_f32', 'MOVE_i32', 'FTOI', 'ITOF', 'CMOVLOG_f32', 'UGT', 'IGE', 'INE', 'UGE', 'IEQ');
+my @INST_ENUMS = ('NONE', 'FEQ', 'FGE', 'FLT', 'FNE', 'MOVE_f32', 'MOVE_i32', 'FTOI', 'ITOF', 'CMOVLOG_f32', 'UGT', 'IGE', 'INE', 'UGE', 'IEQ', 'BINARY_OR_i32', 'BINARY_NOT_i32');
 
 while (<AMDIL>) {
   if ($_ =~ /defm\s+([A-Z_]+)\s+:\s+([A-Za-z0-9]+)</) {
diff --git a/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl b/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl
deleted file mode 100644
index 60523a7b48f..00000000000
--- a/src/gallium/drivers/radeon/AMDGPUGenShaderPatterns.pl
+++ /dev/null
@@ -1,30 +0,0 @@
-#===-- AMDGPUGenShaderPatterns.pl - TODO: Add brief description -------===#
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-#===----------------------------------------------------------------------===#
-#
-# TODO: Add full description
-#
-#===----------------------------------------------------------------------===#
-
-use strict;
-use warnings;
-
-use AMDGPUConstants;
-
-my $reg_prefix = $ARGV[0];
-
-for (my $i = 0; $i < CONST_REG_COUNT * 4; $i++) {
-  my $index = get_hw_index($i);
-  my $chan = get_chan_str($i);
-print <<STRING;
-def : Pat <
-  (int_AMDGPU_load_const $i),
-  (f32 (MOV (f32 $reg_prefix$index\_$chan)))
->;
-STRING
-}
diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
index 2c1052fd8ea..2bdc8a759f2 100644
--- a/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUISelLowering.cpp - TODO: Add brief description -------===//
+//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This is the parent TargetLowering class for hardware code gen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPUISelLowering.h b/src/gallium/drivers/radeon/AMDGPUISelLowering.h
index 3c5beb1cdae..1b3f71006e2 100644
--- a/src/gallium/drivers/radeon/AMDGPUISelLowering.h
+++ b/src/gallium/drivers/radeon/AMDGPUISelLowering.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUISelLowering.h - TODO: Add brief description -------===//
+//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the interface definition of the TargetLowering class
+// that is common to all AMD GPUs.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
index 4742283f688..ecd8ac90526 100644
--- a/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.cpp
@@ -108,9 +108,4 @@ unsigned AMDGPUInstrInfo::getISAOpcode(unsigned opcode) const
   }
 }
 
-bool AMDGPUInstrInfo::isRegPreload(const MachineInstr &MI) const
-{
-  return (get(MI.getOpcode()).TSFlags >> AMDGPU_TFLAG_SHIFTS::PRELOAD_REG) & 0x1;
-}
-
 #include "AMDGPUInstrEnums.include"
diff --git a/src/gallium/drivers/radeon/AMDGPUInstrInfo.h b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
index fa009bc6302..930b41e7191 100644
--- a/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
+++ b/src/gallium/drivers/radeon/AMDGPUInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUInstrInfo.h - TODO: Add brief description -------===//
+//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the definition of a TargetInstrInfo class that is common
+// to all AMD GPUs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,17 +22,17 @@
 
 namespace llvm {
 
-  class AMDGPUTargetMachine;
-  class MachineFunction;
-  class MachineInstr;
-  class MachineInstrBuilder;
+class AMDGPUTargetMachine;
+class MachineFunction;
+class MachineInstr;
+class MachineInstrBuilder;
 
-  class AMDGPUInstrInfo : public AMDILInstrInfo {
-  private:
+class AMDGPUInstrInfo : public AMDILInstrInfo {
+private:
   AMDGPUTargetMachine & TM;
   std::map<unsigned, unsigned> amdilToISA;
 
-  public:
+public:
   explicit AMDGPUInstrInfo(AMDGPUTargetMachine &tm);
 
   virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
@@ -41,19 +42,9 @@ namespace llvm {
   virtual MachineInstr * convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const;
 
-  bool isRegPreload(const MachineInstr &MI) const;
-
   #include "AMDGPUInstrEnums.h.include"
-  };
+};
 
 } // End llvm namespace
 
-/* AMDGPU target flags are stored in bits 32-39 */
-namespace AMDGPU_TFLAG_SHIFTS {
-  enum TFLAGS {
-    PRELOAD_REG = 32
-  };
-}
-
-
 #endif // AMDGPUINSTRINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDGPUInstructions.td b/src/gallium/drivers/radeon/AMDGPUInstructions.td
index 0433c8dcd95..f689356e488 100644
--- a/src/gallium/drivers/radeon/AMDGPUInstructions.td
+++ b/src/gallium/drivers/radeon/AMDGPUInstructions.td
@@ -1,4 +1,4 @@
-//===-- AMDGPUInstructions.td - TODO: Add brief description -------===//
+//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains instruction defs that are common to all hw codegen
+// targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,14 +17,12 @@ include "AMDGPUInstrEnums.td"
 class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction {
   field bits<16> AMDILOp = 0;
   field bits<3> Gen = 0;
-  field bit PreloadReg = 0;
 
   let Namespace = "AMDIL";
   let OutOperandList = outs;
   let InOperandList = ins;
   let AsmString = asm;
   let Pattern = pattern;
-  let TSFlags{32} = PreloadReg;
   let TSFlags{42-40} = Gen;
   let TSFlags{63-48} = AMDILOp;
 }
@@ -37,42 +36,12 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
 
 let isCodeGenOnly = 1 in {
 
-  def EXPORT_REG : AMDGPUShaderInst <
-    (outs),
-    (ins GPRF32:$src),
-    "EXPORT_REG $src",
-    [(int_AMDGPU_export_reg GPRF32:$src)]
-  >;
-
-  def LOAD_INPUT : AMDGPUShaderInst <
-    (outs GPRF32:$dst),
-    (ins i32imm:$src),
-    "LOAD_INPUT $dst, $src",
-    [] >{
-    let PreloadReg = 1;
-  }
-
   def MASK_WRITE : AMDGPUShaderInst <
     (outs),
     (ins GPRF32:$src),
     "MASK_WRITE $src",
     []
   >;
-
-  def RESERVE_REG : AMDGPUShaderInst <
-    (outs GPRF32:$dst),
-    (ins i32imm:$src),
-    "RESERVE_REG $dst, $src",
-    [(set GPRF32:$dst, (int_AMDGPU_reserve_reg imm:$src))]> {
-    let PreloadReg = 1;
-  }
-
-  def STORE_OUTPUT: AMDGPUShaderInst <
-    (outs GPRF32:$dst),
-    (ins GPRF32:$src0, i32imm:$src1),
-    "STORE_OUTPUT $dst, $src0, $src1",
-    [(set GPRF32:$dst, (int_AMDGPU_store_output GPRF32:$src0, imm:$src1))]
-  >;
 }
 
 /* Generic helper patterns for intrinsics */
diff --git a/src/gallium/drivers/radeon/AMDGPUIntrinsics.td b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
index d2cda0db936..398fd11431f 100644
--- a/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
+++ b/src/gallium/drivers/radeon/AMDGPUIntrinsics.td
@@ -1,4 +1,4 @@
-//===-- AMDGPUIntrinsics.td - TODO: Add brief description -------===//
+//===-- AMDGPUIntrinsics.td - Common intrinsics  -*- tablegen -*-----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file defines intrinsics that are used by all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
 let TargetPrefix = "AMDGPU", isTarget = 1 in {
 
-  def int_AMDGPU_export_reg : Intrinsic<[], [llvm_float_ty], []>;
   def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>;
   def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], []>;
-  def int_AMDGPU_reserve_reg : Intrinsic<[llvm_float_ty], [llvm_i32_ty], []>;
-  def int_AMDGPU_store_output : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], []>;
+  def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>;
   def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], []>;
 
   def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], []>;
@@ -26,7 +25,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], []>;
   def int_AMDGPU_floor : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
-  def int_AMDGPU_kill : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
   def int_AMDGPU_kilp : Intrinsic<[], [], []>;
   def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
@@ -35,7 +34,7 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
   def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
-  def int_AMDGPU_sge : BinaryIntFloat;
+  def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_sin : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
   def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], []>;
@@ -43,9 +42,18 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
   def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], []>;
+  def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], []>;
+  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], []>;
 }
 
 let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp b/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp
index b49d0dddf65..2e455fea8ab 100644
--- a/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp
+++ b/src/gallium/drivers/radeon/AMDGPULowerInstructions.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPULowerInstructions.cpp - TODO: Add brief description -------===//
+//===-- AMDGPULowerInstructions.cpp - AMDGPU lowering pass ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass lowers unsupported AMDIL MachineInstrs to LLVM pseudo
+// MachineInstrs for hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -27,7 +28,7 @@ namespace {
   private:
     static char ID;
     TargetMachine &TM;
-    void lowerVCREATE_v4f32(MachineInstr &MI, MachineBasicBlock::iterator I,
+    void lowerVCREATE_v4(MachineInstr &MI, MachineBasicBlock::iterator I,
                               MachineBasicBlock &MBB, MachineFunction &MF);
 
   public:
@@ -56,8 +57,9 @@ bool AMDGPULowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
 
       switch (MI.getOpcode()) {
       default: continue;
-      case AMDIL::VCREATE_v4f32: lowerVCREATE_v4f32(MI, I, MBB, MF); break;
-
+      case AMDIL::VCREATE_v4f32:
+      case AMDIL::VCREATE_v4i32:
+        lowerVCREATE_v4(MI, I, MBB, MF); break;
       }
       MI.eraseFromParent();
     }
@@ -65,7 +67,7 @@ bool AMDGPULowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
   return false;
 }
 
-void AMDGPULowerInstructionsPass::lowerVCREATE_v4f32(MachineInstr &MI,
+void AMDGPULowerInstructionsPass::lowerVCREATE_v4(MachineInstr &MI,
     MachineBasicBlock::iterator I, MachineBasicBlock &MBB, MachineFunction &MF)
 {
   MachineRegisterInfo & MRI = MF.getRegInfo();
diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp
deleted file mode 100644
index d33055ccb87..00000000000
--- a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- AMDGPULowerShaderInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "AMDGPULowerShaderInstructions.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-using namespace llvm;
-
-void AMDGPULowerShaderInstructionsPass::preloadRegister(MachineFunction * MF,
-    const TargetInstrInfo * TII, unsigned physReg, unsigned virtReg) const
-{
-  if (!MRI->isLiveIn(physReg)) {
-    MRI->addLiveIn(physReg, virtReg);
-    MachineBasicBlock &EntryMBB = MF->front();
-    BuildMI(MF->front(), EntryMBB.begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
-            virtReg)
-            .addReg(physReg);
-  } else {
-    /* We can't mark the same register as preloaded twice, but we still must
-     * associate virtReg with the correct preloaded register. */
-    unsigned newReg = MRI->getLiveInVirtReg(physReg);
-    MRI->replaceRegWith(virtReg, newReg);
-  }
-}
diff --git a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h b/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h
deleted file mode 100644
index 5ee77fafe2b..00000000000
--- a/src/gallium/drivers/radeon/AMDGPULowerShaderInstructions.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- AMDGPULowerShaderInstructions.h - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-#ifndef AMDGPU_LOWER_SHADER_INSTRUCTIONS
-#define AMDGPU_LOWER_SHADER_INSTRUCTIONS
-
-namespace llvm {
-
-class MachineFunction;
-class MachineRegisterInfo;
-class TargetInstrInfo;
-
-class AMDGPULowerShaderInstructionsPass {
-
-  protected:
-    MachineRegisterInfo * MRI;
-    /**
-     * @param physReg The physical register that will be preloaded.
-     * @param virtReg The virtual register that currently holds the
-     *                preloaded value.
-     */
-    void preloadRegister(MachineFunction * MF, const TargetInstrInfo * TII,
-                         unsigned physReg, unsigned virtReg) const;
-};
-
-} // end namespace llvm
-
-
-#endif // AMDGPU_LOWER_SHADER_INSTRUCTIONS
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp b/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
index 162a49116a0..ad48335fd33 100644
--- a/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDGPURegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPURegisterInfo.cpp - TODO: Add brief description -------===//
+//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Parent TargetRegisterInfo class common to all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.h b/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
index f4492e9795d..d545c06f69e 100644
--- a/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
+++ b/src/gallium/drivers/radeon/AMDGPURegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPURegisterInfo.h - TODO: Add brief description -------===//
+//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the TargetRegisterInfo interface that is implemented
+// by all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPURegisterInfo.td b/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
index 173d6622569..1707903ae7e 100644
--- a/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
+++ b/src/gallium/drivers/radeon/AMDGPURegisterInfo.td
@@ -1,4 +1,4 @@
-//===-- AMDGPURegisterInfo.td - TODO: Add brief description -------===//
+//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Tablegen register definitions common to all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp b/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp
deleted file mode 100644
index c923f19c39f..00000000000
--- a/src/gallium/drivers/radeon/AMDGPUReorderPreloadInstructions.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-//===-- AMDGPUReorderPreloadInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDIL.h"
-#include "AMDILInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Function.h"
-
-using namespace llvm;
-
-namespace {
-  class AMDGPUReorderPreloadInstructionsPass : public MachineFunctionPass {
-
-  private:
-    static char ID;
-    TargetMachine &TM;
-
-  public:
-    AMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm) { }
-
-      bool runOnMachineFunction(MachineFunction &MF);
-
-      const char *getPassName() const { return "AMDGPU Reorder Preload Instructions"; }
-    };
-} /* End anonymous namespace */
-
-char AMDGPUReorderPreloadInstructionsPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUReorderPreloadInstructionsPass(TargetMachine &tm) {
-    return new AMDGPUReorderPreloadInstructionsPass(tm);
-}
-
-/* This pass moves instructions that represent preloaded registers to the
- * start of the program. */
-bool AMDGPUReorderPreloadInstructionsPass::runOnMachineFunction(MachineFunction &MF)
-{
-  const AMDGPUInstrInfo * TII =
-                        static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo());
-
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
-      MachineInstr &MI = *I;
-      if (TII->isRegPreload(MI)) {
-         MF.front().insert(MF.front().begin(), MI.removeFromParent());
-      }
-    }
-  }
-  return false;
-}
diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
index 313349ce01b..c1c21abc9c1 100644
--- a/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUTargetMachine.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUTargetMachine.cpp - TODO: Add brief description -------===//
+//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// The AMDGPU target machine contains all of the hardware specific information
+// needed to emit code for R600 and SI GPUs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -16,7 +17,6 @@
 #include "AMDILTargetMachine.h"
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
-#include "R600KernelParameters.h"
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "llvm/Analysis/Passes.h"
@@ -112,31 +112,28 @@ AMDGPUPassConfig::addPreISel()
 {
   const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
   if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
-    PM.add(createR600KernelParametersPass(
+    PM->add(createR600KernelParametersPass(
                      getAMDGPUTargetMachine().getTargetData()));
   }
   return false;
 }
 
 bool AMDGPUPassConfig::addInstSelector() {
-  PM.add(createAMDILPeepholeOpt(*TM));
-  PM.add(createAMDILISelDag(getAMDGPUTargetMachine()));
+  PM->add(createAMDILPeepholeOpt(*TM));
+  PM->add(createAMDILISelDag(getAMDGPUTargetMachine()));
   return false;
 }
 
 bool AMDGPUPassConfig::addPreRegAlloc() {
   const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
 
-  PM.add(createAMDGPUReorderPreloadInstructionsPass(*TM));
   if (ST.device()->getGeneration() <= AMDILDeviceInfo::HD6XXX) {
-    PM.add(createR600LowerShaderInstructionsPass(*TM));
-    PM.add(createR600LowerInstructionsPass(*TM));
+    PM->add(createR600LowerInstructionsPass(*TM));
   } else {
-    PM.add(createSILowerShaderInstructionsPass(*TM));
-    PM.add(createSIAssignInterpRegsPass(*TM));
+    PM->add(createSIAssignInterpRegsPass(*TM));
   }
-  PM.add(createAMDGPULowerInstructionsPass(*TM));
-  PM.add(createAMDGPUConvertToISAPass(*TM));
+  PM->add(createAMDGPULowerInstructionsPass(*TM));
+  PM->add(createAMDGPUConvertToISAPass(*TM));
   return false;
 }
 
@@ -150,10 +147,10 @@ bool AMDGPUPassConfig::addPreSched2() {
 
 bool AMDGPUPassConfig::addPreEmitPass() {
   const AMDILSubtarget &ST = TM->getSubtarget<AMDILSubtarget>();
-  PM.add(createAMDILCFGPreparationPass(*TM));
-  PM.add(createAMDILCFGStructurizerPass(*TM));
+  PM->add(createAMDILCFGPreparationPass(*TM));
+  PM->add(createAMDILCFGStructurizerPass(*TM));
   if (ST.device()->getGeneration() == AMDILDeviceInfo::HD7XXX) {
-    PM.add(createSIPropagateImmReadsPass(*TM));
+    PM->add(createSIPropagateImmReadsPass(*TM));
   }
 
   return false;
diff --git a/src/gallium/drivers/radeon/AMDGPUTargetMachine.h b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
index d4165b09e84..2428fe638a7 100644
--- a/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
+++ b/src/gallium/drivers/radeon/AMDGPUTargetMachine.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUTargetMachine.h - TODO: Add brief description -------===//
+//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+//  The AMDGPU TargetMachine interface definition for hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
@@ -52,9 +52,6 @@ public:
                                               formatted_raw_ostream &Out,
                                               CodeGenFileType FileType,
                                               bool DisableVerify);
-public:
-   void dumpCode() { mDump = true; }
-   bool shouldDumpCode() const { return mDump; }
 };
 
 } /* End namespace llvm */
diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.cpp b/src/gallium/drivers/radeon/AMDGPUUtil.cpp
index a5045436ab4..bd8f5eef697 100644
--- a/src/gallium/drivers/radeon/AMDGPUUtil.cpp
+++ b/src/gallium/drivers/radeon/AMDGPUUtil.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUUtil.cpp - TODO: Add brief description -------===//
+//===-- AMDGPUUtil.cpp - AMDGPU Utility functions -------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,39 +7,39 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Common utility functions used by hw codegen targets
 //
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUUtil.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 
-/* Some instructions act as place holders to emulate operations that the GPU
- * hardware does automatically. This function can be used to check if
- * an opcode falls into this category. */
-bool llvm::isPlaceHolderOpcode(unsigned opcode)
+// Some instructions act as place holders to emulate operations that the GPU
+// hardware does automatically. This function can be used to check if
+// an opcode falls into this category.
+bool AMDGPU::isPlaceHolderOpcode(unsigned opcode)
 {
   switch (opcode) {
   default: return false;
-  case AMDIL::EXPORT_REG:
   case AMDIL::RETURN:
   case AMDIL::LOAD_INPUT:
   case AMDIL::LAST:
+  case AMDIL::MASK_WRITE:
   case AMDIL::RESERVE_REG:
     return true;
   }
 }
 
-bool llvm::isTransOp(unsigned opcode)
+bool AMDGPU::isTransOp(unsigned opcode)
 {
   switch(opcode) {
     default: return false;
@@ -67,10 +67,12 @@ bool llvm::isTransOp(unsigned opcode)
   }
 }
 
-bool llvm::isTexOp(unsigned opcode)
+bool AMDGPU::isTexOp(unsigned opcode)
 {
   switch(opcode) {
   default: return false;
+  case AMDIL::TEX_LD:
+  case AMDIL::TEX_GET_TEXTURE_RESINFO:
   case AMDIL::TEX_SAMPLE:
   case AMDIL::TEX_SAMPLE_C:
   case AMDIL::TEX_SAMPLE_L:
@@ -79,11 +81,13 @@ bool llvm::isTexOp(unsigned opcode)
   case AMDIL::TEX_SAMPLE_C_LB:
   case AMDIL::TEX_SAMPLE_G:
   case AMDIL::TEX_SAMPLE_C_G:
+  case AMDIL::TEX_GET_GRADIENTS_H:
+  case AMDIL::TEX_GET_GRADIENTS_V:
     return true;
   }
 }
 
-bool llvm::isReductionOp(unsigned opcode)
+bool AMDGPU::isReductionOp(unsigned opcode)
 {
   switch(opcode) {
     default: return false;
@@ -93,13 +97,25 @@ bool llvm::isReductionOp(unsigned opcode)
   }
 }
 
-bool llvm::isFCOp(unsigned opcode)
+bool AMDGPU::isCubeOp(unsigned opcode)
+{
+  switch(opcode) {
+    default: return false;
+    case AMDIL::CUBE_r600:
+    case AMDIL::CUBE_eg:
+      return true;
+  }
+}
+
+
+bool AMDGPU::isFCOp(unsigned opcode)
 {
   switch(opcode) {
   default: return false;
   case AMDIL::BREAK_LOGICALZ_f32:
   case AMDIL::BREAK_LOGICALNZ_i32:
   case AMDIL::BREAK_LOGICALZ_i32:
+  case AMDIL::BREAK_LOGICALNZ_f32:
   case AMDIL::CONTINUE_LOGICALNZ_f32:
   case AMDIL::IF_LOGICALNZ_i32:
   case AMDIL::IF_LOGICALZ_f32:
@@ -112,11 +128,14 @@ bool llvm::isFCOp(unsigned opcode)
   }
 }
 
-void AMDGPU::utilAddLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
-    const struct TargetInstrInfo * TII, unsigned physReg, unsigned virtReg)
+void AMDGPU::utilAddLiveIn(llvm::MachineFunction * MF,
+													 llvm::MachineRegisterInfo & MRI,
+													 const struct llvm::TargetInstrInfo * TII,
+													 unsigned physReg, unsigned virtReg)
 {
     if (!MRI.isLiveIn(physReg)) {
       MRI.addLiveIn(physReg, virtReg);
+      MF->front().addLiveIn(physReg);
       BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
                            TII->get(TargetOpcode::COPY), virtReg)
             .addReg(physReg);
diff --git a/src/gallium/drivers/radeon/AMDGPUUtil.h b/src/gallium/drivers/radeon/AMDGPUUtil.h
index 299146e1ba7..15f2ce57af9 100644
--- a/src/gallium/drivers/radeon/AMDGPUUtil.h
+++ b/src/gallium/drivers/radeon/AMDGPUUtil.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUUtil.h - TODO: Add brief description -------===//
+//===-- AMDGPUUtil.h - AMDGPU Utility function declarations -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,43 +7,40 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Declarations for utility functions common to all hw codegen targets.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef AMDGPU_UTIL_H
 #define AMDGPU_UTIL_H
 
-#include "AMDGPURegisterInfo.h"
-#include "llvm/Support/DataTypes.h"
-
 namespace llvm {
 
-class AMDILMachineFunctionInfo;
+class MachineFunction;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+
+}
 
-class TargetMachine;
-class TargetRegisterInfo;
+namespace AMDGPU {
 
 bool isPlaceHolderOpcode(unsigned opcode);
 
 bool isTransOp(unsigned opcode);
 bool isTexOp(unsigned opcode);
 bool isReductionOp(unsigned opcode);
+bool isCubeOp(unsigned opcode);
 bool isFCOp(unsigned opcode);
 
-/* XXX: Move these to AMDGPUInstrInfo.h */
+// XXX: Move these to AMDGPUInstrInfo.h
 #define MO_FLAG_CLAMP (1 << 0)
 #define MO_FLAG_NEG   (1 << 1)
 #define MO_FLAG_ABS   (1 << 2)
 #define MO_FLAG_MASK  (1 << 3)
 
-} /* End namespace llvm */
-
-namespace AMDGPU {
-
 void utilAddLiveIn(llvm::MachineFunction * MF, llvm::MachineRegisterInfo & MRI,
     const struct llvm::TargetInstrInfo * TII, unsigned physReg, unsigned virtReg);
 
 } // End namespace AMDGPU
 
-#endif /* AMDGPU_UTIL_H */
+#endif // AMDGPU_UTIL_H
diff --git a/src/gallium/drivers/radeon/AMDIL.h b/src/gallium/drivers/radeon/AMDIL.h
index 317ea124f66..6759ccd9527 100644
--- a/src/gallium/drivers/radeon/AMDIL.h
+++ b/src/gallium/drivers/radeon/AMDIL.h
@@ -137,11 +137,6 @@ enum AddressSpaces {
   LAST_ADDRESS     = 8
 };
 
-// We are piggybacking on the CommentFlag enum in MachineInstr.h to
-// set bits in AsmPrinterFlags of the MachineInstruction. We will
-// start at bit 16 and allocate down while LLVM will start at bit
-// 1 and allocate up.
-
 // This union/struct combination is an easy way to read out the
 // exact bits that are needed.
 typedef union ResourceRec {
@@ -181,26 +176,6 @@ typedef union ResourceRec {
 
 } // namespace AMDILAS
 
-// The OpSwizzle encodes a subset of all possible
-// swizzle combinations into a number of bits using
-// only the combinations utilized by the backend.
-// The lower 128 are for source swizzles and the
-// upper 128 or for destination swizzles.
-// The valid mappings can be found in the
-// getSrcSwizzle and getDstSwizzle functions of
-// AMDILUtilityFunctions.cpp.
-typedef union SwizzleRec {
-  struct {
-#ifdef __BIG_ENDIAN__
-    unsigned char dst : 1;
-    unsigned char swizzle : 7;
-#else
-    unsigned char swizzle : 7;
-    unsigned char dst : 1;
-#endif
-  } bits;
-  unsigned char u8all;
-} OpSwizzle;
 // Enums corresponding to AMDIL condition codes for IL.  These
 // values must be kept in sync with the ones in the .td file.
 namespace AMDILCC {
diff --git a/src/gallium/drivers/radeon/AMDIL.td b/src/gallium/drivers/radeon/AMDIL.td
index 9bcccac2411..deee290fad5 100644
--- a/src/gallium/drivers/radeon/AMDIL.td
+++ b/src/gallium/drivers/radeon/AMDIL.td
@@ -1,4 +1,4 @@
-//===-- AMDIL.td - TODO: Add brief description -------===//
+//===-- AMDIL.td - AMDIL Tablegen files --*- tablegen -*-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
index 6625dd77d5f..d7c96573a15 100644
--- a/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDIL7XXDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDIL7XXDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILBase.td b/src/gallium/drivers/radeon/AMDILBase.td
index 2706b211f2d..31ebed31d72 100644
--- a/src/gallium/drivers/radeon/AMDILBase.td
+++ b/src/gallium/drivers/radeon/AMDILBase.td
@@ -60,6 +60,11 @@ def FeatureDebug : SubtargetFeature<"debug",
         "CapsOverride[AMDILDeviceInfo::Debug]",
         "true",
         "Debug mode is enabled, so disable hardware accelerated address spaces.">;
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+        "mDumpCode",
+        "true",
+        "Dump MachineInstrs in the CodeEmitter">;
+
 
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
diff --git a/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
index 289af6f210e..cdcd5e89880 100644
--- a/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
+++ b/src/gallium/drivers/radeon/AMDILCFGStructurizer.cpp
@@ -7,22 +7,22 @@
 //
 //==-----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "structcfg"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
 #define DEBUGME 0
-#endif
+#define DEBUG_TYPE "structcfg"
 
 #include "AMDILTargetMachine.h"
 #include "AMDILUtilityFunctions.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -30,8 +30,6 @@
 #define FirstNonDebugInstr(A) A->begin()
 using namespace llvm;
 
-// bixia TODO: move this out to analysis lib. Make this work for both target
-// AMDIL and CBackend.
 // TODO: move-begin.
 
 //===----------------------------------------------------------------------===//
@@ -109,23 +107,6 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDILTargetMachine.h"
-#include "AMDILUtilityFunctions.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DominatorInternals.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineJumpTableInfo.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
 namespace llvm {
 
 /// PostDominatorTree Class - Concrete subclass of DominatorTree that is used
@@ -3156,10 +3137,6 @@ struct CFGStructTraits<AMDILCFGStructurizer>
          iterEnd = srcBlk->end();
          iter != iterEnd; ++iter) {
       MachineInstr *instr = func->CloneMachineInstr(iter);
-      // This is a workaround for LLVM bugzilla 8420 because CloneMachineInstr
-      // does not clone the AsmPrinterFlags.
-      instr->setAsmPrinterFlag(
-         (llvm::MachineInstr::CommentFlag)iter->getAsmPrinterFlags());
       newBlk->push_back(instr);
     }
     return newBlk;
diff --git a/src/gallium/drivers/radeon/AMDILCodeEmitter.h b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
index b0ea1455cf9..fa46cbd203d 100644
--- a/src/gallium/drivers/radeon/AMDILCodeEmitter.h
+++ b/src/gallium/drivers/radeon/AMDILCodeEmitter.h
@@ -1,23 +1,21 @@
-//                     The LLVM Compiler Infrastructure
+//===-- AMDILCodeEmitter.h - AMDIL Code Emitter interface -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
-//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
-//===-- AMDILCodeEmitter.h - TODO: Add brief description -------===//
+//===----------------------------------------------------------------------===//
 //
-//                     The LLVM Compiler Infrastructure
+// CodeEmitter interface for R600 and SI codegen.
 //
+//===----------------------------------------------------------------------===//
 
 #ifndef AMDILCODEEMITTER_H
 #define AMDILCODEEMITTER_H
 
 namespace llvm {
 
-  /* XXX: Temp HACK to work around tablegen name generation */
   class AMDILCodeEmitter {
   public:
     uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
diff --git a/src/gallium/drivers/radeon/AMDILConversions.td b/src/gallium/drivers/radeon/AMDILConversions.td
index 0db66ae8475..1bc5e4ddf37 100644
--- a/src/gallium/drivers/radeon/AMDILConversions.td
+++ b/src/gallium/drivers/radeon/AMDILConversions.td
@@ -1,4 +1,4 @@
-//===-- AMDILConversions.td - TODO: Add brief description -------===//
+//==- AMDILConversions.td - Type conversion tablegen patterns -*-tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILDevice.cpp b/src/gallium/drivers/radeon/AMDILDevice.cpp
index aa6d8af7012..4294a8bef0c 100644
--- a/src/gallium/drivers/radeon/AMDILDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
index 89b8312c294..cbf5b512471 100644
--- a/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDILDeviceInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILDeviceInfo.cpp - TODO: Add brief description -------===//
+//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,11 +6,16 @@
 // License. See LICENSE.TXT for details.
 //
 //==-----------------------------------------------------------------------===//
+//
+// Function that creates DeviceInfo from a device name and other information.
+//
+//==-----------------------------------------------------------------------===//
 #include "AMDILDevices.h"
 #include "AMDILSubtarget.h"
 
 using namespace llvm;
 namespace llvm {
+namespace AMDILDeviceInfo {
     AMDILDevice*
 getDeviceFromName(const std::string &deviceName, AMDILSubtarget *ptr, bool is64bit, bool is64on32bit)
 {
@@ -84,4 +89,5 @@ getDeviceFromName(const std::string &deviceName, AMDILSubtarget *ptr, bool is64b
         return new AMDIL7XXDevice(ptr);
     }
 }
-}
+} // End namespace AMDILDeviceInfo
+} // End namespace llvm
diff --git a/src/gallium/drivers/radeon/AMDILDeviceInfo.h b/src/gallium/drivers/radeon/AMDILDeviceInfo.h
index c4acf9145ae..06ac4322d0f 100644
--- a/src/gallium/drivers/radeon/AMDILDeviceInfo.h
+++ b/src/gallium/drivers/radeon/AMDILDeviceInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDILDeviceInfo.h - TODO: Add brief description -------===//
+//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -82,8 +82,8 @@ namespace llvm
     };
 
 
+  AMDILDevice*
+    getDeviceFromName(const std::string &name, AMDILSubtarget *ptr, bool is64bit = false, bool is64on32bit = false);
   } // namespace AMDILDeviceInfo
-  llvm::AMDILDevice*
-    getDeviceFromName(const std::string &name, llvm::AMDILSubtarget *ptr, bool is64bit = false, bool is64on32bit = false);
 } // namespace llvm
 #endif // _AMDILDEVICEINFO_H_
diff --git a/src/gallium/drivers/radeon/AMDILDevices.h b/src/gallium/drivers/radeon/AMDILDevices.h
index 3fc5fa05669..cfcc3304b4b 100644
--- a/src/gallium/drivers/radeon/AMDILDevices.h
+++ b/src/gallium/drivers/radeon/AMDILDevices.h
@@ -1,4 +1,4 @@
-//===-- AMDILDevices.h - TODO: Add brief description -------===//
+//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
index 445fd608bbb..f10936b8c6c 100644
--- a/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
+++ b/src/gallium/drivers/radeon/AMDILEnumeratedTypes.td
@@ -1,4 +1,4 @@
-//===-- AMDILEnumeratedTypes.td - TODO: Add brief description -------===//
+//===-- AMDILEnumeratedTypes.td - IL Type definitions --*- tablegen -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
index 7b5c52345d2..779b2d3df2f 100644
--- a/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILEvergreenDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILEvergreenDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
index ff04d9d55bf..b8898828dd6 100644
--- a/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
+++ b/src/gallium/drivers/radeon/AMDILISelDAGToDAG.cpp
@@ -13,9 +13,12 @@
 #include "AMDILDevices.h"
 #include "AMDILTargetMachine.h"
 #include "AMDILUtilityFunctions.h"
+#include "llvm/ADT/ValueMap.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Support/Compiler.h"
+#include <list>
+#include <queue>
 
 using namespace llvm;
 
@@ -35,13 +38,21 @@ class AMDILDAGToDAGISel : public SelectionDAGISel {
 public:
   AMDILDAGToDAGISel(AMDILTargetMachine &TM AMDIL_OPT_LEVEL_DECL);
   virtual ~AMDILDAGToDAGISel();
-  inline SDValue getSmallIPtrImm(unsigned Imm);
 
   SDNode *Select(SDNode *N);
+  virtual const char *getPassName() const;
+
+private:
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+
   // Complex pattern selectors
   bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
   bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
   bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+
+  static bool checkType(const Value *ptr, unsigned int addrspace);
+  static const Value *getBasePointerValue(const Value *V);
+
   static bool isGlobalStore(const StoreSDNode *N);
   static bool isPrivateStore(const StoreSDNode *N);
   static bool isLocalStore(const StoreSDNode *N);
@@ -54,8 +65,6 @@ public:
   static bool isLocalLoad(const LoadSDNode *N);
   static bool isRegionLoad(const LoadSDNode *N);
 
-  virtual const char *getPassName() const;
-private:
   SDNode *xformAtomicInst(SDNode *N);
 
   // Include the pieces autogenerated from the target description.
@@ -165,26 +174,75 @@ SDNode *AMDILDAGToDAGISel::Select(SDNode *N) {
   return SelectCode(N);
 }
 
+// True iff ptr is a non-null Value of pointer type in address space addrspace.
+// A non-pointer Value yields false rather than dereferencing a null dyn_cast.
+bool AMDILDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
+  const PointerType *ptrType =
+      ptr ? dyn_cast<PointerType>(ptr->getType()) : NULL;
+  return ptrType && ptrType->getAddressSpace() == addrspace;
+}
+
+/// Breadth-first search from \p V through its operands for the Value the
+/// pointer is ultimately based on.  The search stops at the first Argument
+/// of pointer type, GlobalVariable or AllocaInst it dequeues and returns
+/// it; NULL is returned when \p V is NULL or no such Value is reached.
+const Value * AMDILDAGToDAGISel::getBasePointerValue(const Value *V)
+{
+  if (!V) {
+    return NULL;
+  }
+  const Value *ret = NULL;
+  // Values already expanded, so each one is processed at most once.
+  ValueMap<const Value *, bool> ValueBitMap;
+  std::queue<const Value *, std::list<const Value *> > ValueQueue;
+  ValueQueue.push(V);
+  while (!ValueQueue.empty()) {
+    V = ValueQueue.front();
+    ValueQueue.pop();
+    if (ValueBitMap.find(V) != ValueBitMap.end()) {
+      continue;
+    }
+    ValueBitMap[V] = true;
+    if ((isa<Argument>(V) && isa<PointerType>(V->getType()))
+        || isa<GlobalVariable>(V) || isa<AllocaInst>(V)) {
+      ret = V;
+      break;
+    }
+    if (isa<Constant>(V)) {
+      // Of the constants, only a ConstantExpr is followed (first operand).
+      if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+        ValueQueue.push(CE->getOperand(0));
+      }
+    } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
+      for (uint32_t x = 0, numOps = I->getNumOperands(); x < numOps; ++x) {
+        ValueQueue.push(I->getOperand(x));
+      }
+    }
+    // Any other kind of Value is ignored.
+  }
+  return ret;
+}
+
 bool AMDILDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
-  return (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
-          && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
-          && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS));
+  return (!checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS));
 }
 
 bool AMDILDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
-  if (check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) {
+  if (checkType(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)) {
     return true;
   }
   MachineMemOperand *MMO = N->getMemOperand();
@@ -195,27 +253,27 @@ bool AMDILDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
       && ((V && dyn_cast<GlobalValue>(V))
           || (BV && dyn_cast<GlobalValue>(
                         getBasePointerValue(MMO->getValue()))))) {
-    return check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS);
+    return checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS);
   } else {
     return false;
   }
 }
 
 bool AMDILDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) {
-  return check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
+  return checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS);
 }
 
 bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
   MachineMemOperand *MMO = N->getMemOperand();
-  if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+  if (checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
     if (MMO) {
       const Value *V = MMO->getValue();
       const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
@@ -228,19 +286,19 @@ bool AMDILDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
 }
 
 bool AMDILDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
-  if (check_type(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
+  if (checkType(N->getSrcValue(), AMDILAS::PRIVATE_ADDRESS)) {
     // Check to make sure we are not a constant pool load or a constant load
     // that is marked as a private load
     if (isCPLoad(N) || isConstantLoad(N, -1)) {
       return false;
     }
   }
-  if (!check_type(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::REGION_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS)
-      && !check_type(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS))
+  if (!checkType(N->getSrcValue(), AMDILAS::LOCAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::GLOBAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::REGION_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::CONSTANT_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::PARAM_D_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDILAS::PARAM_I_ADDRESS))
   {
     return true;
   }
diff --git a/src/gallium/drivers/radeon/AMDILISelLowering.cpp b/src/gallium/drivers/radeon/AMDILISelLowering.cpp
index 54c6ea65065..19b12fcf72b 100644
--- a/src/gallium/drivers/radeon/AMDILISelLowering.cpp
+++ b/src/gallium/drivers/radeon/AMDILISelLowering.cpp
@@ -623,6 +623,48 @@ translateToOpcode(uint64_t CCCode, unsigned int regClass)
   assert(0 && "Unknown opcode retrieved");
   return 0;
 }
+
+/// Helper for LowerFormalArguments: MVT simple type -> AMDIL register class.
+static const TargetRegisterClass*
+getRegClassFromType(unsigned int type) {
+  switch (type) {
+  default: // Asserts; with asserts compiled out, falls through to the i8 case.
+    assert(0 && "Passed in type does not match any register classes.");
+  case MVT::i8:
+    return &AMDIL::GPRI8RegClass;
+  case MVT::i16:
+    return &AMDIL::GPRI16RegClass;
+  case MVT::i32:
+    return &AMDIL::GPRI32RegClass;
+  case MVT::f32:
+    return &AMDIL::GPRF32RegClass;
+  case MVT::i64:
+    return &AMDIL::GPRI64RegClass;
+  case MVT::f64:
+    return &AMDIL::GPRF64RegClass;
+  case MVT::v4f32:
+    return &AMDIL::GPRV4F32RegClass;
+  case MVT::v4i8:
+    return &AMDIL::GPRV4I8RegClass;
+  case MVT::v4i16:
+    return &AMDIL::GPRV4I16RegClass;
+  case MVT::v4i32:
+    return &AMDIL::GPRV4I32RegClass;
+  case MVT::v2f32:
+    return &AMDIL::GPRV2F32RegClass;
+  case MVT::v2i8:
+    return &AMDIL::GPRV2I8RegClass;
+  case MVT::v2i16:
+    return &AMDIL::GPRV2I16RegClass;
+  case MVT::v2i32:
+    return &AMDIL::GPRV2I32RegClass;
+  case MVT::v2f64:
+    return &AMDIL::GPRV2F64RegClass;
+  case MVT::v2i64:
+    return &AMDIL::GPRV2I64RegClass;
+  }
+}
+
 SDValue
 AMDILTargetLowering::LowerMemArgument(
     SDValue Chain,
@@ -2189,6 +2231,7 @@ AMDILTargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const
   SDValue Result = DAG.getTargetExternalSymbol(Sym, MVT::i32);
   return Result;
 }
+
 /// LowerFORMAL_ARGUMENTS - transform physical registers into
 /// virtual registers and generate load operations for
 /// arguments places on the stack.
@@ -3191,7 +3234,7 @@ AMDILTargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (RST == MVT::f64 && RHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3248,7 +3291,7 @@ AMDILTargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (RST == MVT::f64 && RHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3314,7 +3357,7 @@ AMDILTargetLowering::genu32tof64(SDValue RHS, EVT LHSVT,
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (stm->calVersion() >= CAL_VERSION_SC_135) {
     // unsigned x = RHS;
@@ -3489,7 +3532,7 @@ AMDILTargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (LST == MVT::f64 && LHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3543,7 +3586,7 @@ AMDILTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
     amdtm = reinterpret_cast<const AMDILTargetMachine*>
     (&this->getTargetMachine());
   const AMDILSubtarget*
-    stm = dynamic_cast<const AMDILSubtarget*>(
+    stm = static_cast<const AMDILSubtarget*>(
         amdtm->getSubtargetImpl());
   if (LST == MVT::f64 && LHSVT.isVector()
       && stm->device()->getGeneration() > AMDILDeviceInfo::HD6XXX)  {
@@ -3843,7 +3886,6 @@ SDValue
 AMDILTargetLowering::LowerBUILD_VECTOR( SDValue Op, SelectionDAG &DAG ) const
 {
   EVT VT = Op.getValueType();
-  //printSDValue(Op, 1);
   SDValue Nodes1;
   SDValue second;
   SDValue third;
@@ -3965,7 +4007,6 @@ AMDILTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     SelectionDAG &DAG) const
 {
   EVT VT = Op.getValueType();
-  //printSDValue(Op, 1);
   const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   uint64_t swizzleNum = 0;
   DebugLoc DL = Op.getDebugLoc();
@@ -4782,7 +4823,7 @@ uint32_t
 AMDILTargetLowering::genVReg(uint32_t regType) const
 {
   return mBB->getParent()->getRegInfo().createVirtualRegister(
-      getRegClassFromID(regType));
+      getTargetMachine().getRegisterInfo()->getRegClass(regType));
 }
 
 MachineInstrBuilder
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.cpp b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
index fbc3e45b357..cd2fb48209c 100644
--- a/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDILInstrInfo.cpp
@@ -10,13 +10,10 @@
 // This file contains the AMDIL implementation of the TargetInstrInfo class.
 //
 //===----------------------------------------------------------------------===//
-#include "AMDILInstrInfo.h"
-#include "AMDILUtilityFunctions.h"
-
-#define GET_INSTRINFO_CTOR
-#include "AMDILGenInstrInfo.inc"
 
 #include "AMDILInstrInfo.h"
+#include "AMDIL.h"
+#include "AMDILISelLowering.h"
 #include "AMDILUtilityFunctions.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -24,6 +21,9 @@
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/Instructions.h"
 
+#define GET_INSTRINFO_CTOR
+#include "AMDILGenInstrInfo.inc"
+
 using namespace llvm;
 
 AMDILInstrInfo::AMDILInstrInfo(AMDILTargetMachine &tm)
@@ -36,28 +36,6 @@ const AMDILRegisterInfo &AMDILInstrInfo::getRegisterInfo() const {
   return RI;
 }
 
-/// Return true if the instruction is a register to register move and leave the
-/// source and dest operands in the passed parameters.
-bool AMDILInstrInfo::isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg,
-                                 unsigned int &DstReg, unsigned int &SrcSubIdx,
-                                 unsigned int &DstSubIdx) const {
-  // FIXME: we should look for:
-  //    add with 0
-  //assert(0 && "is Move Instruction has not been implemented yet!");
-  //return true;
-  if (!isMove(MI.getOpcode())) {
-    return false;
-  }
-  if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg()) {
-    return false;
-  }
-  SrcReg = MI.getOperand(1).getReg();
-  DstReg = MI.getOperand(0).getReg();
-  DstSubIdx = 0;
-  SrcSubIdx = 0;
-  return true;
-}
-
 bool AMDILInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                            unsigned &SrcReg, unsigned &DstReg,
                                            unsigned &SubIdx) const {
@@ -99,22 +77,7 @@ bool AMDILInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI,
 // TODO: Implement this function
   return false;
 }
-#if 0
-void
-AMDILInstrInfo::reMaterialize(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MI,
-                              unsigned DestReg, unsigned SubIdx,
-                             const MachineInstr *Orig,
-                             const TargetRegisterInfo *TRI) const {
-// TODO: Implement this function
-}
 
-MachineInst AMDILInstrInfo::duplicate(MachineInstr *Orig,
-                                      MachineFunction &MF) const {
-// TODO: Implement this function
-  return NULL;
-}
-#endif
 MachineInstr *
 AMDILInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                       MachineBasicBlock::iterator &MBBI,
@@ -122,25 +85,6 @@ AMDILInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
 // TODO: Implement this function
   return NULL;
 }
-#if 0
-MachineInst AMDILInstrInfo::commuteInstruction(MachineInstr *MI,
-                                               bool NewMI = false) const {
-// TODO: Implement this function
-  return NULL;
-}
-bool
-AMDILInstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
-                                     unsigned &SrcOpIdx2) const
-{
-// TODO: Implement this function
-}
-bool
-AMDILInstrInfo::produceSameValue(const MachineInstr *MI0,
-                                const MachineInstr *MI1) const
-{
-// TODO: Implement this function
-}
-#endif
 bool AMDILInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
                                         MachineBasicBlock &MBB) const {
   while (iter != MBB.end()) {
@@ -299,43 +243,6 @@ MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
   return MBB->end();
 }
 
-bool
-AMDILInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
-                             MachineBasicBlock::iterator I,
-                             unsigned DestReg, unsigned SrcReg,
-                             const TargetRegisterClass *DestRC,
-                             const TargetRegisterClass *SrcRC,
-                             DebugLoc DL) const {
-  // If we are adding to the end of a basic block we can safely assume that the
-  // move is caused by a PHI node since all move instructions that are non-PHI
-  // have already been inserted into the basic blocks Therefor we call the skip
-  // flow control instruction to move the iterator before the flow control
-  // instructions and put the move instruction there.
-  bool phi = (DestReg < 1025) || (SrcReg < 1025);
-  int movInst = phi ? getMoveInstFromID(DestRC->getID())
-                    : getPHIMoveInstFromID(DestRC->getID());
-  
-  MachineBasicBlock::iterator iTemp = (I == MBB.end()) ? skipFlowControl(&MBB)
-                                                       : I;
-  if (DestRC != SrcRC) {
-    //int convInst;
-    size_t dSize = DestRC->getSize();
-    size_t sSize = SrcRC->getSize();
-    if (dSize > sSize) {
-      // Elements are going to get duplicated.
-      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-    } else if (dSize == sSize) {
-      // Direct copy, conversions are not handled.
-      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-    } else if (dSize < sSize) {
-      // Elements are going to get dropped.
-      BuildMI(MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-    }
-  } else {
-    BuildMI( MBB, iTemp, DL, get(movInst), DestReg).addReg(SrcReg);
-  }
-  return true;
-}
 void
 AMDILInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -427,15 +334,11 @@ AMDILInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   if (MI != MBB.end()) {
     DL = MI->getDebugLoc();
   }
-  MachineInstr *nMI = BuildMI(MBB, MI, DL, get(Opc))
+  BuildMI(MBB, MI, DL, get(Opc))
     .addReg(SrcReg, getKillRegState(isKill))
     .addFrameIndex(FrameIndex)
     .addMemOperand(MMO)
     .addImm(0);
-  AMDILAS::InstrResEnc curRes;
-  curRes.bits.ResourceID 
-    = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID);
-  setAsmPrinterFlags(nMI, curRes);
 }
 
 void
@@ -511,16 +414,11 @@ AMDILInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   if (MI != MBB.end()) {
     DL = MI->getDebugLoc();
   }
-  MachineInstr* nMI = BuildMI(MBB, MI, DL, get(Opc))
+  BuildMI(MBB, MI, DL, get(Opc))
     .addReg(DestReg, RegState::Define)
     .addFrameIndex(FrameIndex)
     .addMemOperand(MMO)
     .addImm(0);
-  AMDILAS::InstrResEnc curRes;
-  curRes.bits.ResourceID 
-    = TM.getSubtargetImpl()->device()->getResourceID(AMDILDevice::SCRATCH_ID);
-  setAsmPrinterFlags(nMI, curRes);
-
 }
 MachineInstr *
 AMDILInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -569,65 +467,6 @@ AMDILInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
   return 0;
 }
 
-bool
-AMDILInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                                        int64_t &Offset1,
-                                        int64_t &Offset2) const {
-  return false;
-  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) {
-    return false;
-  }
-  const MachineSDNode *mload1 = dyn_cast<MachineSDNode>(Load1);
-  const MachineSDNode *mload2 = dyn_cast<MachineSDNode>(Load2);
-  if (!mload1 || !mload2) {
-    return false;
-  }
-  if (mload1->memoperands_empty() ||
-      mload2->memoperands_empty()) {
-    return false;
-  }
-  MachineMemOperand *memOp1 = (*mload1->memoperands_begin());
-  MachineMemOperand *memOp2 = (*mload2->memoperands_begin());
-  const Value *mv1 = memOp1->getValue();
-  const Value *mv2 = memOp2->getValue();
-  if (!memOp1->isLoad() || !memOp2->isLoad()) {
-    return false;
-  }
-  if (getBasePointerValue(mv1) == getBasePointerValue(mv2)) {
-    if (isa<GetElementPtrInst>(mv1) && isa<GetElementPtrInst>(mv2)) {
-      const GetElementPtrInst *gep1 = dyn_cast<GetElementPtrInst>(mv1);
-      const GetElementPtrInst *gep2 = dyn_cast<GetElementPtrInst>(mv2);
-      if (!gep1 || !gep2) {
-        return false;
-      }
-      if (gep1->getNumOperands() != gep2->getNumOperands()) {
-        return false;
-      }
-      for (unsigned i = 0, e = gep1->getNumOperands() - 1; i < e; ++i) {
-        const Value *op1 = gep1->getOperand(i);
-        const Value *op2 = gep2->getOperand(i);
-        if (op1 != op2) {
-          // If any value except the last one is different, return false.
-          return false;
-        }
-      }
-      unsigned size = gep1->getNumOperands()-1;
-      if (!isa<ConstantInt>(gep1->getOperand(size))
-          || !isa<ConstantInt>(gep2->getOperand(size))) {
-        return false;
-      }
-      Offset1 = dyn_cast<ConstantInt>(gep1->getOperand(size))->getSExtValue();
-      Offset2 = dyn_cast<ConstantInt>(gep2->getOperand(size))->getSExtValue();
-      return true;
-    } else if (isa<Argument>(mv1) && isa<Argument>(mv2)) {
-      return false;
-    } else if (isa<GlobalValue>(mv1) && isa<GlobalValue>(mv2)) {
-      return false;
-    }
-  }
-  return false;
-}
-
 bool AMDILInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                              int64_t Offset1, int64_t Offset2,
                                              unsigned NumLoads) const {
@@ -654,16 +493,6 @@ bool AMDILInstrInfo::isPredicated(const MachineInstr *MI) const {
   // TODO: Implement this function
   return false;
 }
-#if 0
-bool AMDILInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
-  // TODO: Implement this function
-}
-
-bool AMDILInstrInfo::PredicateInstruction(MachineInstr *MI,
-        const SmallVectorImpl<MachineOperand> &Pred) const {
-    // TODO: Implement this function
-}
-#endif
 bool
 AMDILInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
                                   const SmallVectorImpl<MachineOperand> &Pred2)
@@ -689,21 +518,112 @@ AMDILInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   return true;
 }
 
-unsigned AMDILInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
-  // TODO: Implement this function
-  return 0;
+bool AMDILInstrInfo::isLoadInst(MachineInstr *MI) const {
+  if (strstr(getName(MI->getOpcode()), "LOADCONST")) {
+    return false;
+  }
+  return strstr(getName(MI->getOpcode()), "LOAD");
 }
 
-#if 0
-unsigned
-AMDILInstrInfo::GetFunctionSizeInBytes(const MachineFunction &MF) const {
-  // TODO: Implement this function
-  return 0;
+bool AMDILInstrInfo::isSWSExtLoadInst(MachineInstr *MI) const
+{
+switch (MI->getOpcode()) {
+    default:
+      break;
+      ExpandCaseToByteShortTypes(AMDIL::LOCALLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::GLOBALLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::REGIONLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::PRIVATELOAD);
+      ExpandCaseToByteShortTypes(AMDIL::CPOOLLOAD);
+      ExpandCaseToByteShortTypes(AMDIL::CONSTANTLOAD);
+      return true;
+  };
+  return false;
 }
 
-unsigned AMDILInstrInfo::getInlineAsmLength(const char *Str,
-                                            const MCAsmInfo &MAI) const {
-  // TODO: Implement this function
-  return 0;
+bool AMDILInstrInfo::isExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "EXTLOAD");
+}
+
+bool AMDILInstrInfo::isSExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "SEXTLOAD");
+}
+
+bool AMDILInstrInfo::isAExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "AEXTLOAD");
+}
+
+bool AMDILInstrInfo::isZExtLoadInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "ZEXTLOAD");
+}
+
+bool AMDILInstrInfo::isStoreInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "STORE");
+}
+
+bool AMDILInstrInfo::isTruncStoreInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "TRUNCSTORE");
+}
+
+bool AMDILInstrInfo::isAtomicInst(MachineInstr *MI) const {
+  return strstr(getName(MI->getOpcode()), "ATOM");
+}
+
+bool AMDILInstrInfo::isVolatileInst(MachineInstr *MI) const {
+  if (!MI->memoperands_empty()) {
+    for (MachineInstr::mmo_iterator mob = MI->memoperands_begin(),
+        moe = MI->memoperands_end(); mob != moe; ++mob) {
+      // If there is a volatile mem operand, this is a volatile instruction.
+      if ((*mob)->isVolatile()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+bool AMDILInstrInfo::isGlobalInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "GLOBAL");
+}
+bool AMDILInstrInfo::isPrivateInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "PRIVATE");
+}
+bool AMDILInstrInfo::isConstantInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "CONSTANT")
+    || strstr(getName(MI->getOpcode()), "CPOOL");
+}
+bool AMDILInstrInfo::isRegionInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "REGION");
+}
+bool AMDILInstrInfo::isLocalInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "LOCAL");
+}
+bool AMDILInstrInfo::isImageInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "IMAGE");
+}
+bool AMDILInstrInfo::isAppendInst(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "APPEND");
+}
+bool AMDILInstrInfo::isRegionAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_R");
+}
+bool AMDILInstrInfo::isLocalAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_L");
+}
+bool AMDILInstrInfo::isGlobalAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_G")
+    || isArenaAtomic(MI);
+}
+bool AMDILInstrInfo::isArenaAtomic(llvm::MachineInstr *MI) const
+{
+  return strstr(getName(MI->getOpcode()), "ATOM_A");
 }
-#endif
diff --git a/src/gallium/drivers/radeon/AMDILInstrInfo.h b/src/gallium/drivers/radeon/AMDILInstrInfo.h
index 88dd4e9441a..4121246e6f9 100644
--- a/src/gallium/drivers/radeon/AMDILInstrInfo.h
+++ b/src/gallium/drivers/radeon/AMDILInstrInfo.h
@@ -40,12 +40,6 @@ public:
   // always be able to get register info as well (through this method).
   const AMDILRegisterInfo &getRegisterInfo() const;
 
-  // Return true if the instruction is a register to register move and leave the
-  // source and dest operands in the passed parameters.
-  bool isMoveInstr(const MachineInstr &MI, unsigned int &SrcReg,
-                   unsigned int &DstReg, unsigned int &SrcSubIdx,
-                   unsigned int &DstSubIdx) const;
-
   bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
                              unsigned &DstReg, unsigned &SubIdx) const;
 
@@ -62,29 +56,10 @@ public:
                              const MachineMemOperand *&MMO,
                              int &FrameIndex) const;
 
-
-#if 0
-  void reMaterialize(MachineBasicBlock &MBB,
-                     MachineBasicBlock::iterator MI,
-                     unsigned DestReg, unsigned SubIdx,
-                     const MachineInstr *Orig,
-                     const TargetRegisterInfo *TRI) const;
-  MachineInstr *duplicate(MachineInstr *Orig,
-                          MachineFunction &MF) const;
-#endif
   MachineInstr *
   convertToThreeAddress(MachineFunction::iterator &MFI,
                         MachineBasicBlock::iterator &MBBI,
                         LiveVariables *LV) const;
-#if 0
-  MachineInstr *commuteInstruction(MachineInstr *MI,
-                                   bool NewMI = false) const;
-  bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
-                             unsigned &SrcOpIdx2) const;
-  bool produceSameValue(const MachineInstr *MI0,
-                        const MachineInstr *MI1) const;
-
-#endif
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
@@ -99,12 +74,6 @@ public:
                const SmallVectorImpl<MachineOperand> &Cond,
                DebugLoc DL) const;
 
-  bool copyRegToReg(MachineBasicBlock &MBB,
-                    MachineBasicBlock::iterator I,
-                    unsigned DestReg, unsigned SrcReg,
-                    const TargetRegisterClass *DestRC,
-                    const TargetRegisterClass *SrcRC,
-                    DebugLoc DL) const;
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
@@ -141,8 +110,6 @@ public:
   unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
                                       bool UnfoldLoad, bool UnfoldStore,
                                       unsigned *LoadRegIndex = 0) const;
-  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
-                               int64_t &Offset1, int64_t &Offset2) const;
   bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const;
@@ -151,24 +118,36 @@ public:
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const;
   bool isPredicated(const MachineInstr *MI) const;
-#if 0
-  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
-  bool PredicateInstruction(MachineInstr *MI,
-                            const SmallVectorImpl<MachineOperand> &Pred) const;
-#endif
   bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
                          const SmallVectorImpl<MachineOperand> &Pred2) const;
   bool DefinesPredicate(MachineInstr *MI,
                         std::vector<MachineOperand> &Pred) const;
   bool isPredicable(MachineInstr *MI) const;
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
-  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
-#if 0
-  unsigned GetFunctionSizeInBytes(const MachineFunction &MF) const;
-  unsigned getInlineAsmLength(const char *Str,
-                              const MCAsmInfo &MAI) const;
-#endif
-  };
+
+  // Helper functions that check the opcode for status information
+  bool isLoadInst(llvm::MachineInstr *MI) const;
+  bool isExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isSWSExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isSExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isZExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isAExtLoadInst(llvm::MachineInstr *MI) const;
+  bool isStoreInst(llvm::MachineInstr *MI) const;
+  bool isTruncStoreInst(llvm::MachineInstr *MI) const;
+  bool isAtomicInst(llvm::MachineInstr *MI) const;
+  bool isVolatileInst(llvm::MachineInstr *MI) const;
+  bool isGlobalInst(llvm::MachineInstr *MI) const;
+  bool isPrivateInst(llvm::MachineInstr *MI) const;
+  bool isConstantInst(llvm::MachineInstr *MI) const;
+  bool isRegionInst(llvm::MachineInstr *MI) const;
+  bool isLocalInst(llvm::MachineInstr *MI) const;
+  bool isImageInst(llvm::MachineInstr *MI) const;
+  bool isAppendInst(llvm::MachineInstr *MI) const;
+  bool isRegionAtomic(llvm::MachineInstr *MI) const;
+  bool isLocalAtomic(llvm::MachineInstr *MI) const;
+  bool isGlobalAtomic(llvm::MachineInstr *MI) const;
+  bool isArenaAtomic(llvm::MachineInstr *MI) const;
+};
 
 }
 
diff --git a/src/gallium/drivers/radeon/AMDILInstructions.td b/src/gallium/drivers/radeon/AMDILInstructions.td
index f824a67d7ad..db56e2121b3 100644
--- a/src/gallium/drivers/radeon/AMDILInstructions.td
+++ b/src/gallium/drivers/radeon/AMDILInstructions.td
@@ -1,4 +1,4 @@
-//===-- AMDILInstructions.td - TODO: Add brief description -------===//
+//===-- AMDILInstructions.td - AMDIL Instruction definitions --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp b/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp
deleted file mode 100644
index 9366f2e7bcb..00000000000
--- a/src/gallium/drivers/radeon/AMDILMCCodeEmitter.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//===---- AMDILMCCodeEmitter.cpp - Convert AMDIL text to AMDIL binary ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-//===---------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "amdil-emitter"
-#include "AMDIL.h"
-#include "AMDILInstrInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-#if 0
-namespace {
-  class AMDILMCCodeEmitter : public MCCodeEmitter {
-    AMDILMCCodeEmitter(const AMDILMCCodeEmitter &);// DO NOT IMPLEMENT
-    void operator=(const AMDILMCCodeEmitter &); // DO NOT IMPLEMENT
-    const TargetMachine &TM;
-    const TargetInstrInfo &TII;
-    MCContext &Ctx;
-    bool Is64BitMode;
-    public:
-    AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx, bool is64Bit);
-    ~AMDILMCCodeEmitter();
-    unsigned getNumFixupKinds() const;
-    const MCFixupKindInfo& getFixupKindInfo(MCFixupKind Kind) const;
-    static unsigned GetAMDILRegNum(const MCOperand &MO);
-    void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const;
-    void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-        raw_ostream &OS) const;
-    void EmitImmediate(const MCOperand &Disp, unsigned ImmSize,
-        MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &os,
-        SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
-
-    void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-        SmallVectorImpl<MCFixup> &Fixups) const;
-
-  }; // class AMDILMCCodeEmitter
-}; // anonymous namespace
-
-namespace llvm {
-  MCCodeEmitter *createAMDILMCCodeEmitter(const Target &,
-      TargetMachine &TM, MCContext &Ctx)
-  {
-    return new AMDILMCCodeEmitter(TM, Ctx, false);
-  }
-}
-
-AMDILMCCodeEmitter::AMDILMCCodeEmitter(TargetMachine &tm, MCContext &ctx
-    , bool is64Bit)
-: TM(tm), TII(*TM.getInstrInfo()), Ctx(ctx)
-{
-  Is64BitMode = is64Bit;
-}
-
-AMDILMCCodeEmitter::~AMDILMCCodeEmitter()
-{
-}
-
-unsigned
-AMDILMCCodeEmitter::getNumFixupKinds() const
-{
-  return 0;
-}
-
-const MCFixupKindInfo &
-AMDILMCCodeEmitter::getFixupKindInfo(MCFixupKind Kind) const
-{
-//  const static MCFixupKindInfo Infos[] = {};
-  if (Kind < FirstTargetFixupKind) {
-    return MCCodeEmitter::getFixupKindInfo(Kind);
-  }
-  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-      "Invalid kind!");
-  return MCCodeEmitter::getFixupKindInfo(Kind);
- // return Infos[Kind - FirstTargetFixupKind];
-
-}
-
-void
-AMDILMCCodeEmitter::EmitByte(unsigned char C, unsigned &CurByte,
-    raw_ostream &OS) const
-{
-  OS << (char) C;
-  ++CurByte;
-}
-void
-AMDILMCCodeEmitter::EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-    raw_ostream &OS) const
-{
-  // Output the constant in little endian byte order
-  for (unsigned i = 0; i != Size; ++i) {
-    EmitByte(Val & 255, CurByte, OS);
-    Val >>= 8;
-  }
-}
-void
-AMDILMCCodeEmitter::EmitImmediate(const MCOperand &DispOp, unsigned ImmSize,
-    MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
-    SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const
-{
-  // If this is a simple integer displacement that doesn't require a relocation
-  // emit it now.
-  if (DispOp.isImm()) {
-    EmitConstant(DispOp.getImm() + ImmOffset, ImmSize, CurByte, OS);
-  }
-
-  // If we have an immoffset, add it to the expression
-  const MCExpr *Expr = DispOp.getExpr();
-
-  if (ImmOffset) {
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-        MCConstantExpr::Create(ImmOffset, Ctx), Ctx);
-  }
-  // Emit a symbolic constant as a fixup and 4 zeros.
-  Fixups.push_back(MCFixup::Create(CurByte, Expr, FixupKind));
-  // TODO: Why the 4 zeros?
-  EmitConstant(0, ImmSize, CurByte, OS);
-}
-
-void
-AMDILMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-    SmallVectorImpl<MCFixup> &Fixups) const
-{
-#if 0
-  unsigned Opcode = MI.getOpcode();
-  const TargetInstrDesc &Desc = TII.get(Opcode);
-  unsigned TSFlags = Desc.TSFlags;
-
-  // Keep track of the current byte being emitted.
-  unsigned CurByte = 0;
-
-  unsigned NumOps = Desc.getNumOperands();
-  unsigned CurOp = 0;
-
-  unsigned char BaseOpcode = 0;
-#ifndef NDEBUG
-  // FIXME: Verify.
-  if (// !Desc.isVariadic() &&
-      CurOp != NumOps) {
-    errs() << "Cannot encode all operands of: ";
-    MI.dump();
-    errs() << '\n';
-    abort();
-  }
-#endif
-#endif
-}
-#endif
diff --git a/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
index b8e536361f0..5cb988785e2 100644
--- a/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
+++ b/src/gallium/drivers/radeon/AMDILMachinePeephole.cpp
@@ -8,17 +8,11 @@
 //==-----------------------------------------------------------------------===//
 
 
-#define DEBUG_TYPE "machine_peephole"
-#if !defined(NDEBUG)
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME (false)
-#endif
-
 #include "AMDIL.h"
+#include "AMDILInstrInfo.h"
 #include "AMDILSubtarget.h"
-#include "AMDILUtilityFunctions.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
@@ -56,7 +50,7 @@ namespace llvm
 AMDILMachinePeephole::AMDILMachinePeephole(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
   : MachineFunctionPass(ID), TM(tm)
 {
-  mDebug = DEBUGME;
+  mDebug = false;
 }
 
 bool
@@ -64,6 +58,8 @@ AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
 {
   bool Changed = false;
   const AMDILSubtarget *STM = &TM.getSubtarget<AMDILSubtarget>();
+  const AMDILInstrInfo * AMDILII =
+                         static_cast<const AMDILInstrInfo *>(TM.getInstrInfo());
   for (MachineFunction::iterator MBB = MF.begin(), MBE = MF.end();
       MBB != MBE; ++MBB) {
     MachineBasicBlock *mb = MBB;
@@ -74,7 +70,7 @@ AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
       name = TM.getInstrInfo()->getName(mi->getOpcode());
       switch (mi->getOpcode()) {
         default:
-          if (isAtomicInst(TM.getInstrInfo(), mi)) {
+          if (AMDILII->isAtomicInst(mi)) {
             // If we don't support the hardware accellerated address spaces,
             // then the atomic needs to be transformed to the global atomic.
             if (strstr(name, "_L_")
@@ -94,7 +90,8 @@ AMDILMachinePeephole::runOnMachineFunction(MachineFunction &MF)
                   TM.getInstrInfo()->get(
                     (mi->getOpcode() - AMDIL::ATOM_R_ADD) + AMDIL::ATOM_G_ADD));
             }
-          } else if ((isLoadInst(TM.getInstrInfo(), mi) || isStoreInst(TM.getInstrInfo(), mi)) && isVolatileInst(TM.getInstrInfo(), mi)) {
+          } else if ((AMDILII->isLoadInst(mi) || AMDILII->isStoreInst(mi))
+                     && AMDILII->isVolatileInst(mi)) {
             insertFence(MIB);
           }
           continue;
diff --git a/src/gallium/drivers/radeon/AMDILMultiClass.td b/src/gallium/drivers/radeon/AMDILMultiClass.td
index 92691db52fd..d6828178ba7 100644
--- a/src/gallium/drivers/radeon/AMDILMultiClass.td
+++ b/src/gallium/drivers/radeon/AMDILMultiClass.td
@@ -1,4 +1,4 @@
-//===-- AMDILMultiClass.td - TODO: Add brief description -------===//
+//===-- AMDILMultiClass.td - AMDIL Multiclass defs ---*- tablegen -*-------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILNIDevice.cpp b/src/gallium/drivers/radeon/AMDILNIDevice.cpp
index 8fda1c18ae5..d4112cda0b5 100644
--- a/src/gallium/drivers/radeon/AMDILNIDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILNIDevice.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILNIDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
index 5fe9f53c8c8..b62c7ab048b 100644
--- a/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
+++ b/src/gallium/drivers/radeon/AMDILPeepholeOptimizer.cpp
@@ -1,4 +1,4 @@
-//===-- AMDILPeepholeOptimizer.cpp - TODO: Add brief description -------===//
+//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,20 +7,14 @@
 //
 //==-----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
 #include "AMDILAlgorithms.tpp"
 #include "AMDILDevices.h"
-#include "AMDILUtilityFunctions.h"
+#include "AMDILInstrInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Constants.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/Function.h"
@@ -41,6 +35,9 @@ using namespace llvm;
 // The Peephole optimization pass is used to do simple last minute optimizations
 // that are required for correct code or to remove redundant functions
 namespace {
+
+class OpaqueType;
+
 class LLVM_LIBRARY_VISIBILITY AMDILPeepholeOpt : public FunctionPass {
 public:
   TargetMachine &TM;
@@ -114,6 +111,19 @@ private:
   // samplers at compile time.
   bool propagateSamplerInst(CallInst *CI);
 
+  // Helper functions
+
+  // Group of functions that recursively calculate the size of a structure based
+  // on its sub-types.
+  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
+  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
+  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
+  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
+  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
+  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
+  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
+  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
+
   LLVMContext *mCTX;
   Function *mF;
   const AMDILSubtarget *mSTM;
@@ -134,7 +144,7 @@ namespace llvm {
 AMDILPeepholeOpt::AMDILPeepholeOpt(TargetMachine &tm AMDIL_OPT_LEVEL_DECL)
   : FunctionPass(ID), TM(tm) 
 {
-  mDebug = DEBUGME;
+  mDebug = false;
   optLevel = TM.getOptLevel();
 
 }
@@ -1136,3 +1146,106 @@ AMDILPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const
   FunctionPass::getAnalysisUsage(AU);
   AU.setPreservesAll();
 }
+
+size_t AMDILPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
+  size_t size = 0;
+  if (!T) {
+    return size;
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = T->getPrimitiveSizeInBits() >> 3;
+    break;
+  case Type::PointerTyID:
+    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
+    break;
+  case Type::IntegerTyID:
+    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
+    break;
+  case Type::StructTyID:
+    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
+    break;
+  case Type::ArrayTyID:
+    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
+    break;
+  case Type::FunctionTyID:
+    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
+    break;
+  case Type::VectorTyID:
+    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
+    break;
+  };
+  return size;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(StructType * const ST,
+    bool dereferencePtr) {
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    curType = *eib;
+    size += getTypeSize(curType, dereferencePtr);
+  }
+  return size;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(IntegerType * const IT,
+    bool dereferencePtr) {
+  return IT ? (IT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(FunctionType * const FT,
+    bool dereferencePtr) {
+    assert(0 && "Should not be able to calculate the size of an function type");
+    return 0;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(ArrayType * const AT,
+    bool dereferencePtr) {
+  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
+                                    dereferencePtr) * AT->getNumElements())
+                     : 0);
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(VectorType * const VT,
+    bool dereferencePtr) {
+  return VT ? (VT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(PointerType * const PT,
+    bool dereferencePtr) {
+  if (!PT) {
+    return 0;
+  }
+  Type *CT = PT->getElementType();
+  if (CT->getTypeID() == Type::StructTyID &&
+      PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
+    return getTypeSize(dyn_cast<StructType>(CT));
+  } else if (dereferencePtr) {
+    size_t size = 0;
+    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
+    }
+    return size;
+  } else {
+    return 4;
+  }
+}
+
+size_t AMDILPeepholeOpt::getTypeSize(OpaqueType * const OT,
+    bool dereferencePtr) {
+  //assert(0 && "Should not be able to calculate the size of an opaque type");
+  return 4;
+}
diff --git a/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
index 5588233378c..d7c1dc74b8b 100644
--- a/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/AMDILRegisterInfo.cpp
@@ -20,7 +20,8 @@
 
 #include "AMDILRegisterInfo.h"
 #include "AMDIL.h"
-#include "AMDILUtilityFunctions.h"
+#include "AMDILInstrInfo.h"
+#include "AMDILTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -109,7 +110,9 @@ AMDILRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     if (!MI.getOperand(x).isFI()) {
       continue;
     }
-    bool def = isStoreInst(TM.getInstrInfo(), &MI);
+    const AMDILInstrInfo * AMDILII =
+                         static_cast<const AMDILInstrInfo *>(TM.getInstrInfo());
+    bool def = AMDILII->isStoreInst(&MI);
     int FrameIndex = MI.getOperand(x).getIndex();
     int64_t Offset = MFI->getObjectOffset(FrameIndex);
     //int64_t Size = MF.getFrameInfo()->getObjectSize(FrameIndex);
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.cpp b/src/gallium/drivers/radeon/AMDILSIDevice.cpp
index ce560984ef9..ae402a5d1f7 100644
--- a/src/gallium/drivers/radeon/AMDILSIDevice.cpp
+++ b/src/gallium/drivers/radeon/AMDILSIDevice.cpp
@@ -1,49 +1,49 @@
-//===-- AMDILSIDevice.cpp - TODO: Add brief description -------===//
+//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//==-----------------------------------------------------------------------===//
-#include "AMDILSIDevice.h"
-#include "AMDILEvergreenDevice.h"
-#include "AMDILNIDevice.h"
-#include "AMDILSubtarget.h"
+//==-----------------------------------------------------------------------===//
+#include "AMDILSIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSubtarget.h"
 
-using namespace llvm;
-
-AMDILSIDevice::AMDILSIDevice(AMDILSubtarget *ST)
-  : AMDILEvergreenDevice(ST)
-{
-}
-AMDILSIDevice::~AMDILSIDevice()
-{
-}
-
-size_t
-AMDILSIDevice::getMaxLDSSize() const
-{
-  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
-    return MAX_LDS_SIZE_900;
-  } else {
-    return 0;
-  }
-}
-
-uint32_t
-AMDILSIDevice::getGeneration() const
-{
-  return AMDILDeviceInfo::HD7XXX;
-}
-
-std::string
-AMDILSIDevice::getDataLayout() const
-{
-    return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
-      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
-      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
-      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
-      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
-      "-n8:16:32:64");
-}
+using namespace llvm;
+
+AMDILSIDevice::AMDILSIDevice(AMDILSubtarget *ST)
+  : AMDILEvergreenDevice(ST)
+{
+}
+AMDILSIDevice::~AMDILSIDevice()
+{
+}
+
+size_t
+AMDILSIDevice::getMaxLDSSize() const
+{
+  if (usesHardware(AMDILDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDILSIDevice::getGeneration() const
+{
+  return AMDILDeviceInfo::HD7XXX;
+}
+
+std::string
+AMDILSIDevice::getDataLayout() const
+{
+    return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
+      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+      "-n8:16:32:64");
+}
diff --git a/src/gallium/drivers/radeon/AMDILSIDevice.h b/src/gallium/drivers/radeon/AMDILSIDevice.h
index 69f35a0588d..b272af7cfcf 100644
--- a/src/gallium/drivers/radeon/AMDILSIDevice.h
+++ b/src/gallium/drivers/radeon/AMDILSIDevice.h
@@ -1,45 +1,45 @@
-//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
+//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-//==-----------------------------------------------------------------------===//
-//
-// Interface for the subtarget data classes.
-//
-//===---------------------------------------------------------------------===//
-// This file will define the interface that each generation needs to
-// implement in order to correctly answer queries on the capabilities of the
+//==-----------------------------------------------------------------------===//
+//
+// Interface for the subtarget data classes.
+//
+//===---------------------------------------------------------------------===//
+// This file will define the interface that each generation needs to
+// implement in order to correctly answer queries on the capabilities of the
 // specific hardware.
-//===---------------------------------------------------------------------===//
-#ifndef _AMDILSIDEVICE_H_
-#define _AMDILSIDEVICE_H_
-#include "AMDILEvergreenDevice.h"
-#include "AMDILSubtarget.h"
+//===---------------------------------------------------------------------===//
+#ifndef _AMDILSIDEVICE_H_
+#define _AMDILSIDEVICE_H_
+#include "AMDILEvergreenDevice.h"
+#include "AMDILSubtarget.h"
+
+namespace llvm {
+  class AMDILSubtarget;
+//===---------------------------------------------------------------------===//
+// SI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+// The AMDILSIDevice is the base class for all Southern Islands (SI) series of
+// cards. It is very similar to the AMDILEvergreenDevice, with the major
+// exception being differences in wavefront size and hardware capabilities.  The
+// SI devices are all 64 wide wavefronts and also add support for signed 24 bit
+// integer operations.
+
+  class AMDILSIDevice : public AMDILEvergreenDevice { // SI-specific LDS size, generation and data-layout queries.
+    public:
+      AMDILSIDevice(AMDILSubtarget*); // Passes the subtarget on to AMDILEvergreenDevice.
+      virtual ~AMDILSIDevice();
+      virtual size_t getMaxLDSSize() const; // MAX_LDS_SIZE_900 when usesHardware(LocalMem), otherwise 0.
+      virtual uint32_t getGeneration() const; // Always AMDILDeviceInfo::HD7XXX.
+      virtual std::string getDataLayout() const; // Little-endian layout string with 64-bit pointers.
+    protected:
+  }; // AMDILSIDevice
 
-namespace llvm {
-  class AMDILSubtarget;
-//===---------------------------------------------------------------------===//
-// SI generation of devices and their respective sub classes
-//===---------------------------------------------------------------------===//
-
-// The AMDILSIDevice is the base class for all Northern Island series of
-// cards. It is very similiar to the AMDILEvergreenDevice, with the major
-// exception being differences in wavefront size and hardware capabilities.  The
-// SI devices are all 64 wide wavefronts and also add support for signed 24 bit
-// integer operations
-
-  class AMDILSIDevice : public AMDILEvergreenDevice {
-    public:
-      AMDILSIDevice(AMDILSubtarget*);
-      virtual ~AMDILSIDevice();
-      virtual size_t getMaxLDSSize() const;
-      virtual uint32_t getGeneration() const;
-      virtual std::string getDataLayout() const;
-    protected:
-  }; // AMDILSIDevice
-
-} // namespace llvm
-#endif // _AMDILSIDEVICE_H_
+} // namespace llvm
+#endif // _AMDILSIDEVICE_H_
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.cpp b/src/gallium/drivers/radeon/AMDILSubtarget.cpp
index 11b6bbe0c01..249cb03f4a3 100644
--- a/src/gallium/drivers/radeon/AMDILSubtarget.cpp
+++ b/src/gallium/drivers/radeon/AMDILSubtarget.cpp
@@ -27,7 +27,8 @@ using namespace llvm;
 #define GET_SUBTARGETINFO_TARGET_DESC
 #include "AMDILGenSubtargetInfo.inc"
 
-AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS) : AMDILGenSubtargetInfo( TT, CPU, FS )
+AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS) : AMDILGenSubtargetInfo( TT, CPU, FS ),
+  mDumpCode(false)
 {
   memset(CapsOverride, 0, sizeof(*CapsOverride)
       * AMDILDeviceInfo::MaxNumberCapabilities);
@@ -93,7 +94,7 @@ AMDILSubtarget::AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::St
   }
 #endif
   mDevName = GPU;
-  mDevice = getDeviceFromName(mDevName, this, mIs64bit);
+  mDevice = AMDILDeviceInfo::getDeviceFromName(mDevName, this, mIs64bit);
 }
 AMDILSubtarget::~AMDILSubtarget()
 {
diff --git a/src/gallium/drivers/radeon/AMDILSubtarget.h b/src/gallium/drivers/radeon/AMDILSubtarget.h
index a4b0e34ada7..38fcb859ac6 100644
--- a/src/gallium/drivers/radeon/AMDILSubtarget.h
+++ b/src/gallium/drivers/radeon/AMDILSubtarget.h
@@ -42,6 +42,7 @@ namespace llvm {
       uint32_t mVersion;
       bool mIs64bit;
       bool mIs32on64bit;
+      bool mDumpCode;
     public:
       AMDILSubtarget(llvm::StringRef TT, llvm::StringRef CPU, llvm::StringRef FS);
       virtual ~AMDILSubtarget();
@@ -67,6 +68,7 @@ namespace llvm {
         ParseSubtargetFeatures(
             llvm::StringRef CPU,
             llvm::StringRef FS);
+      bool dumpCode() const { return mDumpCode; } // Read-only accessor for the mDumpCode flag.
 
   };
 
diff --git a/src/gallium/drivers/radeon/AMDILTargetMachine.cpp b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
index 77fac1d97bd..0879d43ad72 100644
--- a/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
+++ b/src/gallium/drivers/radeon/AMDILTargetMachine.cpp
@@ -150,8 +150,8 @@ bool AMDILPassConfig::addPreISel()
 
 bool AMDILPassConfig::addInstSelector()
 {
-  PM.add(createAMDILPeepholeOpt(*TM));
-  PM.add(createAMDILISelDag(getAMDILTargetMachine()));
+  PM->add(createAMDILPeepholeOpt(*TM));
+  PM->add(createAMDILISelDag(getAMDILTargetMachine()));
   return false;
 }
 
@@ -162,7 +162,7 @@ bool AMDILPassConfig::addPreRegAlloc()
     llvm::RegisterScheduler::setDefault(&llvm::createSourceListDAGScheduler);
   }
 
-  PM.add(createAMDILMachinePeephole(*TM));
+  PM->add(createAMDILMachinePeephole(*TM));
   return false;
 }
 
@@ -175,8 +175,8 @@ bool AMDILPassConfig::addPostRegAlloc() {
 /// true if -print-machineinstrs should print out the code after the passes.
 bool AMDILPassConfig::addPreEmitPass()
 {
-  PM.add(createAMDILCFGPreparationPass(*TM));
-  PM.add(createAMDILCFGStructurizerPass(*TM));
+  PM->add(createAMDILCFGPreparationPass(*TM));
+  PM->add(createAMDILCFGStructurizerPass(*TM));
   return true;
 }
 
diff --git a/src/gallium/drivers/radeon/AMDILTokenDesc.td b/src/gallium/drivers/radeon/AMDILTokenDesc.td
index b81f593506f..2dafb2cd559 100644
--- a/src/gallium/drivers/radeon/AMDILTokenDesc.td
+++ b/src/gallium/drivers/radeon/AMDILTokenDesc.td
@@ -1,4 +1,4 @@
-//===-- AMDILTokenDesc.td - TODO: Add brief description -------===//
+//===-- AMDILTokenDesc.td - AMDIL Token Definitions --*- tablegen -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp b/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp
deleted file mode 100644
index f2ef4eb7771..00000000000
--- a/src/gallium/drivers/radeon/AMDILUtilityFunctions.cpp
+++ /dev/null
@@ -1,683 +0,0 @@
-//===-- AMDILUtilityFunctions.cpp - AMDIL Utility Functions       ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-// This file provides the implementations of functions that are declared in the
-// AMDILUtilityFUnctions.h file.
-//
-//===----------------------------------------------------------------------===//
-#include "AMDILUtilityFunctions.h"
-#include "AMDILISelLowering.h"
-#include "llvm/ADT/ValueMap.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instruction.h"
-#include "llvm/Instructions.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Type.h"
-
-#include <cstdio>
-#include <list>
-#include <queue>
-
-#define GET_OPCODE_NAME(TII, MI) \
-  TII->getName(MI->getOpcode())
-
-
-using namespace llvm;
-int64_t GET_SCALAR_SIZE(llvm::Type *A) {
-  return A->getScalarSizeInBits();
-}
-
-const TargetRegisterClass * getRegClassFromID(unsigned int ID) {
-  switch (ID) {
-  default:
-    assert(0 && "Passed in ID does not match any register classes.");
-    return NULL;
-  case AMDIL::GPRI8RegClassID:
-    return &AMDIL::GPRI8RegClass;
-  case AMDIL::GPRI16RegClassID:
-    return &AMDIL::GPRI16RegClass;
-  case AMDIL::GPRI32RegClassID:
-    return &AMDIL::GPRI32RegClass;
-  case AMDIL::GPRF32RegClassID:
-    return &AMDIL::GPRF32RegClass;
-  case AMDIL::GPRI64RegClassID:
-    return &AMDIL::GPRI64RegClass;
-  case AMDIL::GPRF64RegClassID:
-    return &AMDIL::GPRF64RegClass;
-  case AMDIL::GPRV4F32RegClassID:
-    return &AMDIL::GPRV4F32RegClass;
-  case AMDIL::GPRV4I8RegClassID:
-    return &AMDIL::GPRV4I8RegClass;
-  case AMDIL::GPRV4I16RegClassID:
-    return &AMDIL::GPRV4I16RegClass;
-  case AMDIL::GPRV4I32RegClassID:
-    return &AMDIL::GPRV4I32RegClass;
-  case AMDIL::GPRV2F32RegClassID:
-    return &AMDIL::GPRV2F32RegClass;
-  case AMDIL::GPRV2I8RegClassID:
-    return &AMDIL::GPRV2I8RegClass;
-  case AMDIL::GPRV2I16RegClassID:
-    return &AMDIL::GPRV2I16RegClass;
-  case AMDIL::GPRV2I32RegClassID:
-    return &AMDIL::GPRV2I32RegClass;
-  case AMDIL::GPRV2F64RegClassID:
-    return &AMDIL::GPRV2F64RegClass;
-  case AMDIL::GPRV2I64RegClassID:
-    return &AMDIL::GPRV2I64RegClass;
-  };
-}
-
-unsigned int getMoveInstFromID(unsigned int ID) {
-  switch (ID) {
-  default:
-    assert(0 && "Passed in ID does not match any move instructions.");
-  case AMDIL::GPRI8RegClassID:
-    return AMDIL::MOVE_i8;
-  case AMDIL::GPRI16RegClassID:
-    return AMDIL::MOVE_i16;
-  case AMDIL::GPRI32RegClassID:
-    return AMDIL::MOVE_i32;
-  case AMDIL::GPRF32RegClassID:
-    return AMDIL::MOVE_f32;
-  case AMDIL::GPRI64RegClassID:
-    return AMDIL::MOVE_i64;
-  case AMDIL::GPRF64RegClassID:
-    return AMDIL::MOVE_f64;
-  case AMDIL::GPRV4F32RegClassID:
-    return AMDIL::MOVE_v4f32;
-  case AMDIL::GPRV4I8RegClassID:
-    return AMDIL::MOVE_v4i8;
-  case AMDIL::GPRV4I16RegClassID:
-    return AMDIL::MOVE_v4i16;
-  case AMDIL::GPRV4I32RegClassID:
-    return AMDIL::MOVE_v4i32;
-  case AMDIL::GPRV2F32RegClassID:
-    return AMDIL::MOVE_v2f32;
-  case AMDIL::GPRV2I8RegClassID:
-    return AMDIL::MOVE_v2i8;
-  case AMDIL::GPRV2I16RegClassID:
-    return AMDIL::MOVE_v2i16;
-  case AMDIL::GPRV2I32RegClassID:
-    return AMDIL::MOVE_v2i32;
-  case AMDIL::GPRV2F64RegClassID:
-    return AMDIL::MOVE_v2f64;
-  case AMDIL::GPRV2I64RegClassID:
-    return AMDIL::MOVE_v2i64;
-  };
-  return -1;
-}
-
-unsigned int getPHIMoveInstFromID(unsigned int ID) {
-  switch (ID) {
-  default:
-    assert(0 && "Passed in ID does not match any move instructions.");
-  case AMDIL::GPRI8RegClassID:
-    return AMDIL::PHIMOVE_i8;
-  case AMDIL::GPRI16RegClassID:
-    return AMDIL::PHIMOVE_i16;
-  case AMDIL::GPRI32RegClassID:
-    return AMDIL::PHIMOVE_i32;
-  case AMDIL::GPRF32RegClassID:
-    return AMDIL::PHIMOVE_f32;
-  case AMDIL::GPRI64RegClassID:
-    return AMDIL::PHIMOVE_i64;
-  case AMDIL::GPRF64RegClassID:
-    return AMDIL::PHIMOVE_f64;
-  case AMDIL::GPRV4F32RegClassID:
-    return AMDIL::PHIMOVE_v4f32;
-  case AMDIL::GPRV4I8RegClassID:
-    return AMDIL::PHIMOVE_v4i8;
-  case AMDIL::GPRV4I16RegClassID:
-    return AMDIL::PHIMOVE_v4i16;
-  case AMDIL::GPRV4I32RegClassID:
-    return AMDIL::PHIMOVE_v4i32;
-  case AMDIL::GPRV2F32RegClassID:
-    return AMDIL::PHIMOVE_v2f32;
-  case AMDIL::GPRV2I8RegClassID:
-    return AMDIL::PHIMOVE_v2i8;
-  case AMDIL::GPRV2I16RegClassID:
-    return AMDIL::PHIMOVE_v2i16;
-  case AMDIL::GPRV2I32RegClassID:
-    return AMDIL::PHIMOVE_v2i32;
-  case AMDIL::GPRV2F64RegClassID:
-    return AMDIL::PHIMOVE_v2f64;
-  case AMDIL::GPRV2I64RegClassID:
-    return AMDIL::PHIMOVE_v2i64;
-  };
-  return -1;
-}
-
-const TargetRegisterClass* getRegClassFromType(unsigned int type) {
-  switch (type) {
-  default:
-    assert(0 && "Passed in type does not match any register classes.");
-  case MVT::i8:
-    return &AMDIL::GPRI8RegClass;
-  case MVT::i16:
-    return &AMDIL::GPRI16RegClass;
-  case MVT::i32:
-    return &AMDIL::GPRI32RegClass;
-  case MVT::f32:
-    return &AMDIL::GPRF32RegClass;
-  case MVT::i64:
-    return &AMDIL::GPRI64RegClass;
-  case MVT::f64:
-    return &AMDIL::GPRF64RegClass;
-  case MVT::v4f32:
-    return &AMDIL::GPRV4F32RegClass;
-  case MVT::v4i8:
-    return &AMDIL::GPRV4I8RegClass;
-  case MVT::v4i16:
-    return &AMDIL::GPRV4I16RegClass;
-  case MVT::v4i32:
-    return &AMDIL::GPRV4I32RegClass;
-  case MVT::v2f32:
-    return &AMDIL::GPRV2F32RegClass;
-  case MVT::v2i8:
-    return &AMDIL::GPRV2I8RegClass;
-  case MVT::v2i16:
-    return &AMDIL::GPRV2I16RegClass;
-  case MVT::v2i32:
-    return &AMDIL::GPRV2I32RegClass;
-  case MVT::v2f64:
-    return &AMDIL::GPRV2F64RegClass;
-  case MVT::v2i64:
-    return &AMDIL::GPRV2I64RegClass;
-  }
-}
-
-void printSDNode(const SDNode *N) {
-  printf("Opcode: %d isTargetOpcode: %d isMachineOpcode: %d\n",
-         N->getOpcode(), N->isTargetOpcode(), N->isMachineOpcode());
-  printf("Empty: %d OneUse: %d Size: %d NodeID: %d\n",
-         N->use_empty(), N->hasOneUse(), (int)N->use_size(), N->getNodeId());
-  for (unsigned int i = 0; i < N->getNumOperands(); ++i) {
-    printf("OperandNum: %d ValueCount: %d ValueType: %d\n",
-           i, N->getNumValues(), N->getValueType(0) .getSimpleVT().SimpleTy);
-    printSDValue(N->getOperand(i), 0);
-  }
-}
-
-void printSDValue(const SDValue &Op, int level) {
-  printf("\nOp: %p OpCode: %d NumOperands: %d ", (void*)&Op, Op.getOpcode(),
-         Op.getNumOperands());
-  printf("IsTarget: %d IsMachine: %d ", Op.isTargetOpcode(),
-         Op.isMachineOpcode());
-  if (Op.isMachineOpcode()) {
-    printf("MachineOpcode: %d\n", Op.getMachineOpcode());
-  } else {
-    printf("\n");
-  }
-  EVT vt = Op.getValueType();
-  printf("ValueType: %d \n", vt.getSimpleVT().SimpleTy);
-  printf("UseEmpty: %d OneUse: %d\n", Op.use_empty(), Op.hasOneUse());
-  if (level) {
-    printf("Children for %d:\n", level);
-    for (unsigned int i = 0; i < Op.getNumOperands(); ++i) {
-      printf("Child %d->%d:", level, i);
-      printSDValue(Op.getOperand(i), level - 1);
-    }
-  }
-}
-
-bool isPHIMove(unsigned int opcode) {
-  switch (opcode) {
-  default:
-    return false;
-    ExpandCaseToAllTypes(AMDIL::PHIMOVE);
-    return true;
-  }
-  return false;
-}
-
-bool isMove(unsigned int opcode) {
-  switch (opcode) {
-  default:
-    return false;
-    ExpandCaseToAllTypes(AMDIL::MOVE);
-    return true;
-  }
-  return false;
-}
-
-bool isMoveOrEquivalent(unsigned int opcode) {
-  switch (opcode) {
-  default:
-    return isMove(opcode) || isPHIMove(opcode);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASCHAR);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASSHORT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASINT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASLONG);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASDOUBLE);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASFLOAT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2CHAR);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2SHORT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2INT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2FLOAT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2LONG);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV2DOUBLE);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4CHAR);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4SHORT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4INT);
-    ExpandCaseToAllScalarTypes(AMDIL::IL_ASV4FLOAT);
-    case AMDIL::INTTOANY_i8:
-    case AMDIL::INTTOANY_i16:
-    case AMDIL::INTTOANY_i32:
-    case AMDIL::INTTOANY_f32:
-    case AMDIL::DLO:
-    case AMDIL::LLO:
-    case AMDIL::LLO_v2i64:
-      return true;
-  };
-  return false;
-}
-
-bool check_type(const Value *ptr, unsigned int addrspace) {
-  if (!ptr) {
-    return false;
-  }
-  Type *ptrType = ptr->getType();
-  return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
-}
-
-size_t getTypeSize(Type * const T, bool dereferencePtr) {
-  size_t size = 0;
-  if (!T) {
-    return size;
-  }
-  switch (T->getTypeID()) {
-  case Type::X86_FP80TyID:
-  case Type::FP128TyID:
-  case Type::PPC_FP128TyID:
-  case Type::LabelTyID:
-    assert(0 && "These types are not supported by this backend");
-  default:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-    size = T->getPrimitiveSizeInBits() >> 3;
-    break;
-  case Type::PointerTyID:
-    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
-    break;
-  case Type::IntegerTyID:
-    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
-    break;
-  case Type::StructTyID:
-    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
-    break;
-  case Type::ArrayTyID:
-    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
-    break;
-  case Type::FunctionTyID:
-    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
-    break;
-  case Type::VectorTyID:
-    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
-    break;
-  };
-  return size;
-}
-
-size_t getTypeSize(StructType * const ST, bool dereferencePtr) {
-  size_t size = 0;
-  if (!ST) {
-    return size;
-  }
-  Type *curType;
-  StructType::element_iterator eib;
-  StructType::element_iterator eie;
-  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
-    curType = *eib;
-    size += getTypeSize(curType, dereferencePtr);
-  }
-  return size;
-}
-
-size_t getTypeSize(IntegerType * const IT, bool dereferencePtr) {
-  return IT ? (IT->getBitWidth() >> 3) : 0;
-}
-
-size_t getTypeSize(FunctionType * const FT, bool dereferencePtr) {
-    assert(0 && "Should not be able to calculate the size of an function type");
-    return 0;
-}
-
-size_t getTypeSize(ArrayType * const AT, bool dereferencePtr) {
-  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
-                                    dereferencePtr) * AT->getNumElements())
-                     : 0);
-}
-
-size_t getTypeSize(VectorType * const VT, bool dereferencePtr) {
-  return VT ? (VT->getBitWidth() >> 3) : 0;
-}
-
-size_t getTypeSize(PointerType * const PT, bool dereferencePtr) {
-  if (!PT) {
-    return 0;
-  }
-  Type *CT = PT->getElementType();
-  if (CT->getTypeID() == Type::StructTyID &&
-      PT->getAddressSpace() == AMDILAS::PRIVATE_ADDRESS) {
-    return getTypeSize(dyn_cast<StructType>(CT));
-  } else if (dereferencePtr) {
-    size_t size = 0;
-    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
-      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
-    }
-    return size;
-  } else {
-    return 4;
-  }
-}
-
-size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr) {
-  //assert(0 && "Should not be able to calculate the size of an opaque type");
-  return 4;
-}
-
-size_t getNumElements(Type * const T) {
-  size_t size = 0;
-  if (!T) {
-    return size;
-  }
-  switch (T->getTypeID()) {
-  case Type::X86_FP80TyID:
-  case Type::FP128TyID:
-  case Type::PPC_FP128TyID:
-  case Type::LabelTyID:
-    assert(0 && "These types are not supported by this backend");
-  default:
-  case Type::FloatTyID:
-  case Type::DoubleTyID:
-    size = 1;
-    break;
-  case Type::PointerTyID:
-    size = getNumElements(dyn_cast<PointerType>(T));
-    break;
-  case Type::IntegerTyID:
-    size = getNumElements(dyn_cast<IntegerType>(T));
-    break;
-  case Type::StructTyID:
-    size = getNumElements(dyn_cast<StructType>(T));
-    break;
-  case Type::ArrayTyID:
-    size = getNumElements(dyn_cast<ArrayType>(T));
-    break;
-  case Type::FunctionTyID:
-    size = getNumElements(dyn_cast<FunctionType>(T));
-    break;
-  case Type::VectorTyID:
-    size = getNumElements(dyn_cast<VectorType>(T));
-    break;
-  };
-  return size;
-}
-
-size_t getNumElements(StructType * const ST) {
-  size_t size = 0;
-  if (!ST) {
-    return size;
-  }
-  Type *curType;
-  StructType::element_iterator eib;
-  StructType::element_iterator eie;
-  for (eib = ST->element_begin(), eie = ST->element_end();
-       eib != eie; ++eib) {
-    curType = *eib;
-    size += getNumElements(curType);
-  }
-  return size;
-}
-
-size_t getNumElements(IntegerType * const IT) {
-  return (!IT) ? 0 : 1;
-}
-
-size_t getNumElements(FunctionType * const FT) {
-  assert(0 && "Should not be able to calculate the number of "
-         "elements of a function type");
-  return 0;
-}
-
-size_t getNumElements(ArrayType * const AT) {
-  return (!AT) ? 0
-               :  (size_t)(getNumElements(AT->getElementType()) *
-                           AT->getNumElements());
-}
-
-size_t getNumElements(VectorType * const VT) {
-  return (!VT) ? 0
-               : VT->getNumElements() * getNumElements(VT->getElementType());
-}
-
-size_t getNumElements(PointerType * const PT) {
-  size_t size = 0;
-  if (!PT) {
-    return size;
-  }
-  for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
-    size += getNumElements(PT->getContainedType(x));
-  }
-  return size;
-}
-
-const llvm::Value *getBasePointerValue(const llvm::Value *V)
-{
-  if (!V) {
-    return NULL;
-  }
-  const Value *ret = NULL;
-  ValueMap<const Value *, bool> ValueBitMap;
-  std::queue<const Value *, std::list<const Value *> > ValueQueue;
-  ValueQueue.push(V);
-  while (!ValueQueue.empty()) {
-    V = ValueQueue.front();
-    if (ValueBitMap.find(V) == ValueBitMap.end()) {
-      ValueBitMap[V] = true;
-      if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
-        ret = V;
-        break;
-      } else if (dyn_cast<GlobalVariable>(V)) {
-        ret = V;
-        break;
-      } else if (dyn_cast<Constant>(V)) {
-        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
-        if (CE) {
-          ValueQueue.push(CE->getOperand(0));
-        }
-      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
-        ret = AI;
-        break;
-      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
-        uint32_t numOps = I->getNumOperands();
-        for (uint32_t x = 0; x < numOps; ++x) {
-          ValueQueue.push(I->getOperand(x));
-        }
-      } else {
-        // assert(0 && "Found a Value that we didn't know how to handle!");
-      }
-    }
-    ValueQueue.pop();
-  }
-  return ret;
-}
-
-const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI) {
-  const Value *moVal = NULL;
-  if (!MI->memoperands_empty()) {
-    const MachineMemOperand *memOp = (*MI->memoperands_begin());
-    moVal = memOp ? memOp->getValue() : NULL;
-    moVal = getBasePointerValue(moVal);
-  }
-  return moVal;
-}
-
-bool commaPrint(int i, llvm::raw_ostream &O) {
-  O << ":" << i;
-  return false;
-}
-
-bool isLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  if (strstr(GET_OPCODE_NAME(TII, MI), "LOADCONST")) {
-    return false;
-  }
-  return strstr(GET_OPCODE_NAME(TII, MI), "LOAD");
-}
-
-bool isSWSExtLoadInst(MachineInstr *MI)
-{
-switch (MI->getOpcode()) {
-    default:
-      break;
-      ExpandCaseToByteShortTypes(AMDIL::LOCALLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::GLOBALLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::REGIONLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::PRIVATELOAD);
-      ExpandCaseToByteShortTypes(AMDIL::CPOOLLOAD);
-      ExpandCaseToByteShortTypes(AMDIL::CONSTANTLOAD);
-      return true;
-  };
-  return false;
-}
-
-bool isExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "EXTLOAD");
-}
-
-bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "SEXTLOAD");
-}
-
-bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "AEXTLOAD");
-}
-
-bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "ZEXTLOAD");
-}
-
-bool isStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "STORE");
-}
-
-bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "TRUNCSTORE");
-}
-
-bool isAtomicInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM");
-}
-
-bool isVolatileInst(const llvm::TargetInstrInfo * TII, MachineInstr *MI) {
-  if (!MI->memoperands_empty()) {
-    for (MachineInstr::mmo_iterator mob = MI->memoperands_begin(),
-        moe = MI->memoperands_end(); mob != moe; ++mob) {
-      // If there is a volatile mem operand, this is a volatile instruction.
-      if ((*mob)->isVolatile()) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "GLOBAL");
-}
-bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "PRIVATE");
-}
-bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "CONSTANT")
-    || strstr(GET_OPCODE_NAME(TII, MI), "CPOOL");
-}
-bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "REGION");
-}
-bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "LOCAL");
-}
-bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "IMAGE");
-}
-bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "APPEND");
-}
-bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_R");
-}
-bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_L");
-}
-bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_G")
-    || isArenaAtomic(TII, MI);
-}
-bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI)
-{
-  return strstr(GET_OPCODE_NAME(TII, MI), "ATOM_A");
-}
-
-const char* getSrcSwizzle(unsigned idx) {
-  const char *srcSwizzles[]  = {
-    "", ".x000", ".0x00", ".00x0", ".000x", ".y000", ".0y00", ".00y0", ".000y", 
-    ".z000", ".0z00", ".00z0", ".000z", ".w000", ".0w00", ".00w0", ".000w",
-    ".xy00", ".00xy", ".zw00", ".00zw", ".xyz0", ".0xyz", ".xyzw", ".0000",
-    ".xxxx", ".yyyy", ".zzzz", ".wwww", ".xyxy", ".zwzw", ".xzxz", ".ywyw",
-    ".x0y0", ".0x0y", ".xy_neg(y)", "_neg(yw)", "_neg(x)", ".xy_neg(xy)",
-    "_neg(xyzw)", ".0yzw", ".x0zw", ".xy0w", ".x", ".y", ".z", ".w", ".xy",
-    ".zw"
-  };
-  assert(idx < sizeof(srcSwizzles)/sizeof(srcSwizzles[0])
-      && "Idx passed in is invalid!");
-  return srcSwizzles[idx];
-}
-const char* getDstSwizzle(unsigned idx) {
-  const char *dstSwizzles[] = {
-    "", ".x___", ".xy__", ".xyz_", ".xyzw", "._y__", "._yz_", "._yzw", ".__z_",
-    ".__zw", ".___w", ".x_zw", ".xy_w", ".x_z_", ".x__w", "._y_w", 
-  };
-  assert(idx < sizeof(dstSwizzles)/sizeof(dstSwizzles[0])
-      && "Idx passed in is invalid!");
-  return dstSwizzles[idx];
-}
-/// Helper function to get the currently set flags
-void getAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes)
-{
-  // We need 16 bits of information, but LLVMr127097 cut the field in half.
-  // So we have to use two different fields to store all of our information.
-  uint16_t upper = MI->getFlags() << 8;
-  uint16_t lower = MI->getAsmPrinterFlags();
-  curRes.u16all = upper | lower;
-}
-/// Helper function to clear the currently set flags and add the new flags.
-void setAsmPrinterFlags(MachineInstr *MI, AMDILAS::InstrResEnc &curRes)
-{
-  // We need 16 bits of information, but LLVMr127097 cut the field in half.
-  // So we have to use two different fields to store all of our information.
-  MI->clearAsmPrinterFlags();
-  MI->setFlags(0);
-  uint8_t lower = curRes.u16all & 0xFF;
-  uint8_t upper = (curRes.u16all >> 8) & 0xFF;
-  MI->setFlags(upper);
-  MI->setAsmPrinterFlag((llvm::MachineInstr::CommentFlag)lower);
-}
diff --git a/src/gallium/drivers/radeon/AMDILUtilityFunctions.h b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
index 637c868b55c..66af706bbb3 100644
--- a/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
+++ b/src/gallium/drivers/radeon/AMDILUtilityFunctions.h
@@ -7,191 +7,12 @@
 //
 //==-----------------------------------------------------------------------===//
 //
-// This file provides declarations for functions that are used across different
-// classes and provide various conversions or utility to shorten the code
+// This file provides helper macros for expanding case statements.
 //
 //===----------------------------------------------------------------------===//
 #ifndef AMDILUTILITYFUNCTIONS_H_
 #define AMDILUTILITYFUNCTIONS_H_
 
-#include "AMDIL.h"
-#include "AMDILTargetMachine.h"
-#include "llvm/ADT/SmallVector.h"
-
-// Utility functions from ID
-//
-namespace llvm {
-class TargetRegisterClass;
-class SDValue;
-class SDNode;
-class Value;
-class Type;
-class StructType;
-class IntegerType;
-class FunctionType;
-class VectorType;
-class ArrayType;
-class PointerType;
-class OpaqueType;
-class MachineInstr;
-
-}
-enum SrcSwizzles {
-  AMDIL_SRC_SWIZZLE_DEFAULT = 0,
-  AMDIL_SRC_SWIZZLE_X000,
-  AMDIL_SRC_SWIZZLE_0X00,
-  AMDIL_SRC_SWIZZLE_00X0,
-  AMDIL_SRC_SWIZZLE_000X,
-  AMDIL_SRC_SWIZZLE_Y000,
-  AMDIL_SRC_SWIZZLE_0Y00,
-  AMDIL_SRC_SWIZZLE_00Y0,
-  AMDIL_SRC_SWIZZLE_000Y,
-  AMDIL_SRC_SWIZZLE_Z000,
-  AMDIL_SRC_SWIZZLE_0Z00,
-  AMDIL_SRC_SWIZZLE_00Z0,
-  AMDIL_SRC_SWIZZLE_000Z,
-  AMDIL_SRC_SWIZZLE_W000,
-  AMDIL_SRC_SWIZZLE_0W00,
-  AMDIL_SRC_SWIZZLE_00W0,
-  AMDIL_SRC_SWIZZLE_000W,
-  AMDIL_SRC_SWIZZLE_XY00,
-  AMDIL_SRC_SWIZZLE_00XY,
-  AMDIL_SRC_SWIZZLE_ZW00,
-  AMDIL_SRC_SWIZZLE_00ZW,
-  AMDIL_SRC_SWIZZLE_XYZ0,
-  AMDIL_SRC_SWIZZLE_0XYZ,
-  AMDIL_SRC_SWIZZLE_XYZW,
-  AMDIL_SRC_SWIZZLE_0000,
-  AMDIL_SRC_SWIZZLE_XXXX,
-  AMDIL_SRC_SWIZZLE_YYYY,
-  AMDIL_SRC_SWIZZLE_ZZZZ,
-  AMDIL_SRC_SWIZZLE_WWWW,
-  AMDIL_SRC_SWIZZLE_XYXY,
-  AMDIL_SRC_SWIZZLE_ZWZW,
-  AMDIL_SRC_SWIZZLE_XZXZ,
-  AMDIL_SRC_SWIZZLE_YWYW,
-  AMDIL_SRC_SWIZZLE_X0Y0,
-  AMDIL_SRC_SWIZZLE_0X0Y,
-  AMDIL_SRC_SWIZZLE_XY_NEGY,
-  AMDIL_SRC_SWIZZLE_NEGYW,
-  AMDIL_SRC_SWIZZLE_NEGX,
-  AMDIL_SRC_SWIZZLE_XY_NEGXY,
-  AMDIL_SRC_SWIZZLE_NEG_XYZW,
-  AMDIL_SRC_SWIZZLE_0YZW,
-  AMDIL_SRC_SWIZZLE_X0ZW,
-  AMDIL_SRC_SWIZZLE_XY0W,
-  AMDIL_SRC_SWIZZLE_X,
-  AMDIL_SRC_SWIZZLE_Y,
-  AMDIL_SRC_SWIZZLE_Z,
-  AMDIL_SRC_SWIZZLE_W,
-  AMDIL_SRC_SWIZZLE_XY,
-  AMDIL_SRC_SWIZZLE_ZW,
-  AMDIL_SRC_SWIZZLE_LAST
-};
-enum DstSwizzles {
-  AMDIL_DST_SWIZZLE_DEFAULT = 0,
-  AMDIL_DST_SWIZZLE_X___,
-  AMDIL_DST_SWIZZLE_XY__,
-  AMDIL_DST_SWIZZLE_XYZ_,
-  AMDIL_DST_SWIZZLE_XYZW,
-  AMDIL_DST_SWIZZLE__Y__,
-  AMDIL_DST_SWIZZLE__YZ_,
-  AMDIL_DST_SWIZZLE__YZW,
-  AMDIL_DST_SWIZZLE___Z_,
-  AMDIL_DST_SWIZZLE___ZW,
-  AMDIL_DST_SWIZZLE____W,
-  AMDIL_DST_SWIZZLE_X_ZW,
-  AMDIL_DST_SWIZZLE_XY_W,
-  AMDIL_DST_SWIZZLE_X_Z_,
-  AMDIL_DST_SWIZZLE_X__W,
-  AMDIL_DST_SWIZZLE__Y_W,
-  AMDIL_DST_SWIZZLE_LAST
-};
-// Function to get the correct src swizzle string from ID
-const char *getSrcSwizzle(unsigned);
-
-// Function to get the correct dst swizzle string from ID
-const char *getDstSwizzle(unsigned);
-
-const llvm::TargetRegisterClass *getRegClassFromID(unsigned int ID);
-
-unsigned int getMoveInstFromID(unsigned int ID);
-unsigned int getPHIMoveInstFromID(unsigned int ID);
-
-// Utility functions from Type.
-const llvm::TargetRegisterClass *getRegClassFromType(unsigned int type);
-unsigned int getTargetIndependentMoveFromType(unsigned int type);
-
-// Debug functions for SDNode and SDValue.
-void printSDValue(const llvm::SDValue &Op, int level);
-void printSDNode(const llvm::SDNode *N);
-
-// Functions to check if an opcode is a specific type.
-bool isMove(unsigned int opcode);
-bool isPHIMove(unsigned int opcode);
-bool isMoveOrEquivalent(unsigned int opcode);
-
-// Function to check address space
-bool check_type(const llvm::Value *ptr, unsigned int addrspace);
-
-// Group of functions that recursively calculate the size of a structure based
-// on it's sub-types.
-size_t getTypeSize(llvm::Type * const T, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::StructType * const ST, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::IntegerType * const IT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::FunctionType * const FT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::ArrayType * const AT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::VectorType * const VT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::PointerType * const PT, bool dereferencePtr = false);
-size_t
-getTypeSize(llvm::OpaqueType * const OT, bool dereferencePtr = false);
-
-// Group of functions that recursively calculate the number of elements of a
-// structure based on it's sub-types.
-size_t getNumElements(llvm::Type * const T);
-size_t getNumElements(llvm::StructType * const ST);
-size_t getNumElements(llvm::IntegerType * const IT);
-size_t getNumElements(llvm::FunctionType * const FT);
-size_t getNumElements(llvm::ArrayType * const AT);
-size_t getNumElements(llvm::VectorType * const VT);
-size_t getNumElements(llvm::PointerType * const PT);
-size_t getNumElements(llvm::OpaqueType * const OT);
-const llvm::Value *getBasePointerValue(const llvm::Value *V);
-const llvm::Value *getBasePointerValue(const llvm::MachineInstr *MI);
-
-
-int64_t GET_SCALAR_SIZE(llvm::Type* A);
-
-// Helper functions that check the opcode for status information
-bool isLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isSWSExtLoadInst(llvm::MachineInstr *MI);
-bool isSExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isZExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isAExtLoadInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isTruncStoreInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isAtomicInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isVolatileInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isGlobalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isPrivateInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isConstantInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isRegionInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isLocalInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isImageInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isAppendInst(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isRegionAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isLocalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isGlobalAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-bool isArenaAtomic(const llvm::TargetInstrInfo * TII, llvm::MachineInstr *MI);
-
-
 // Macros that are used to help with switch statements for various data types
 // However, these macro's do not return anything unlike the second set below.
 #define ExpandCaseTo32bitIntTypes(Instr)  \
@@ -354,9 +175,4 @@ case Instr##_v4f32: \
 case Instr##_v2i64: \
 case Instr##_v2f64:
 
-bool commaPrint(int i, llvm::raw_ostream &O);
-/// Helper function to get the currently get/set flags.
-void getAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes);
-void setAsmPrinterFlags(llvm::MachineInstr *MI, llvm::AMDILAS::InstrResEnc &curRes);
-
 #endif // AMDILUTILITYFUNCTIONS_H_
diff --git a/src/gallium/drivers/radeon/AMDILVersion.td b/src/gallium/drivers/radeon/AMDILVersion.td
index b8b02608d3b..d863b068131 100644
--- a/src/gallium/drivers/radeon/AMDILVersion.td
+++ b/src/gallium/drivers/radeon/AMDILVersion.td
@@ -1,4 +1,4 @@
-//===-- AMDILVersion.td - TODO: Add brief description -------===//
+//===-- AMDILVersion.td - Barrier Instruction/Intrinsic definitions------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/src/gallium/drivers/radeon/Makefile b/src/gallium/drivers/radeon/Makefile
index 807dc781c7c..cc409645a6e 100644
--- a/src/gallium/drivers/radeon/Makefile
+++ b/src/gallium/drivers/radeon/Makefile
@@ -18,6 +18,8 @@ CXXFLAGS := $(filter-out -DDEBUG, $(CXXFLAGS))
 
 tablegen = $(TBLGEN) -I $(LLVM_INCLUDEDIR) $1 $2 -o $3
 
+HAVE_LLVM_INTRINSICS = $(shell grep IntrinsicsR600.td $(LLVM_INCLUDEDIR)/llvm/Intrinsics.td)
+
 gen: $(GENERATED_SOURCES)
 
 SIRegisterInfo.td: SIGenRegisterInfo.pl
@@ -26,9 +28,13 @@ SIRegisterInfo.td: SIGenRegisterInfo.pl
 SIRegisterGetHWRegNum.inc: SIGenRegisterInfo.pl
 	$(PERL) $^ $@ > /dev/null
 
-R600ShaderPatterns.td: AMDGPUGenShaderPatterns.pl
-	$(PERL) $^ C > $@
-	
+R600Intrinsics.td: R600IntrinsicsNoOpenCL.td R600IntrinsicsOpenCL.td
+ifeq ($(HAVE_LLVM_INTRINSICS),)
+	cp R600IntrinsicsNoOpenCL.td R600Intrinsics.td
+else
+	cp R600IntrinsicsOpenCL.td R600Intrinsics.td
+endif
+
 R600RegisterInfo.td: R600GenRegisterInfo.pl
 	$(PERL) $^ > $@
 
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index 7d2932b4dbd..6dc62320f40 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -1,6 +1,6 @@
 
 GENERATED_SOURCES := \
-	R600ShaderPatterns.td		\
+	R600Intrinsics.td		\
 	R600RegisterInfo.td		\
 	AMDGPUInstrEnums.td		\
 	SIRegisterInfo.td		\
@@ -29,20 +29,16 @@ CPP_SOURCES := \
 	AMDILISelDAGToDAG.cpp		\
 	AMDILISelLowering.cpp		\
 	AMDILMachinePeephole.cpp	\
-	AMDILMCCodeEmitter.cpp		\
 	AMDILNIDevice.cpp		\
 	AMDILPeepholeOptimizer.cpp	\
 	AMDILRegisterInfo.cpp		\
 	AMDILSIDevice.cpp		\
 	AMDILSubtarget.cpp		\
 	AMDILTargetMachine.cpp		\
-	AMDILUtilityFunctions.cpp	\
 	AMDGPUTargetMachine.cpp		\
 	AMDGPUISelLowering.cpp		\
 	AMDGPUConvertToISA.cpp		\
 	AMDGPULowerInstructions.cpp		\
-	AMDGPULowerShaderInstructions.cpp	\
-	AMDGPUReorderPreloadInstructions.cpp	\
 	AMDGPUInstrInfo.cpp		\
 	AMDGPURegisterInfo.cpp		\
 	AMDGPUUtil.cpp			\
@@ -51,13 +47,12 @@ CPP_SOURCES := \
 	R600InstrInfo.cpp		\
 	R600KernelParameters.cpp	\
 	R600LowerInstructions.cpp	\
-	R600LowerShaderInstructions.cpp	\
+	R600MachineFunctionInfo.cpp	\
 	R600RegisterInfo.cpp		\
 	SIAssignInterpRegs.cpp		\
 	SICodeEmitter.cpp		\
 	SIInstrInfo.cpp			\
 	SIISelLowering.cpp		\
-	SILowerShaderInstructions.cpp	\
 	SIMachineFunctionInfo.cpp	\
 	SIPropagateImmReads.cpp		\
 	SIRegisterInfo.cpp		\
diff --git a/src/gallium/drivers/radeon/R600CodeEmitter.cpp b/src/gallium/drivers/radeon/R600CodeEmitter.cpp
index 8faf0deb8c5..421562255f6 100644
--- a/src/gallium/drivers/radeon/R600CodeEmitter.cpp
+++ b/src/gallium/drivers/radeon/R600CodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- R600CodeEmitter.cpp - TODO: Add brief description -------===//
+//===-- R600CodeEmitter.cpp - Code Emitter for R600->Cayman GPU families --===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This code emitter outputs bytecode that is understood by the r600g driver
+// in the Mesa [1] project.  The bytecode is very similar to the hardware's ISA,
+// except that the sizes of the instruction fields are rounded up to the
+// nearest byte.
+//
+// [1] http://www.mesa3d.org/
 //
 //===----------------------------------------------------------------------===//
 
@@ -44,8 +49,9 @@ namespace {
   const R600RegisterInfo * TRI;
   bool evergreenEncoding;
 
+  bool isCube;
   bool isReduction;
-  unsigned reductionElement;
+  unsigned currentElement;
   bool isLast;
 
   unsigned section_start;
@@ -53,7 +59,7 @@ namespace {
   public:
 
   R600CodeEmitter(formatted_raw_ostream &OS) : MachineFunctionPass(ID),
-      _OS(OS), TM(NULL), evergreenEncoding(false), isReduction(false),
+      _OS(OS), TM(NULL), evergreenEncoding(false), isCube(false), isReduction(false),
       isLast(true) { }
 
   const char *getPassName() const { return "AMDGPU Machine Code Emitter"; }
@@ -65,7 +71,7 @@ namespace {
   private:
 
   void emitALUInstr(MachineInstr  &MI);
-  void emitSrc(const MachineOperand & MO);
+  void emitSrc(const MachineOperand & MO, int chan_override  = -1);
   void emitDst(const MachineOperand & MO);
   void emitALU(MachineInstr &MI, unsigned numSrc);
   void emitTexInstr(MachineInstr &MI);
@@ -155,10 +161,8 @@ bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
   } else {
     evergreenEncoding = true;
   }
-  const AMDGPUTargetMachine *amdtm =
-    static_cast<const AMDGPUTargetMachine *>(&MF.getTarget());
 
-  if (amdtm->shouldDumpCode()) {
+  if (STM.dumpCode()) {
     MF.dump();
   }
 
@@ -171,18 +175,26 @@ bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
           if (MI.getNumOperands() > 1 && MI.getOperand(0).isReg() && MI.getOperand(0).isDead()) {
             continue;
           }
-          if (isTexOp(MI.getOpcode())) {
+          if (AMDGPU::isTexOp(MI.getOpcode())) {
             emitTexInstr(MI);
-          } else if (isFCOp(MI.getOpcode())){
+          } else if (AMDGPU::isFCOp(MI.getOpcode())){
             emitFCInstr(MI);
-          } else if (isReductionOp(MI.getOpcode())) {
+          } else if (AMDGPU::isReductionOp(MI.getOpcode())) {
             isReduction = true;
             isLast = false;
-            for (reductionElement = 0; reductionElement < 4; reductionElement++) {
-              isLast = (reductionElement == 3);
+            for (currentElement = 0; currentElement < 4; currentElement++) {
+              isLast = (currentElement == 3);
               emitALUInstr(MI);
             }
             isReduction = false;
+          } else if (AMDGPU::isCubeOp(MI.getOpcode())) {
+              isCube = true;
+              isLast = false;
+              for (currentElement = 0; currentElement < 4; currentElement++) {
+                isLast = (currentElement == 3);
+                emitALUInstr(MI);
+              }
+              isCube = false;
           } else if (MI.getOpcode() == AMDIL::RETURN ||
                      MI.getOpcode() == AMDIL::BUNDLE ||
                      MI.getOpcode() == AMDIL::KILL) {
@@ -191,12 +203,7 @@ bool R600CodeEmitter::runOnMachineFunction(MachineFunction &MF) {
             switch(MI.getOpcode()) {
             case AMDIL::RAT_WRITE_CACHELESS_eg:
               {
-                /* XXX: Support for autoencoding 64-bit instructions was added
-                 * in LLVM 3.1.  Until we drop support for 3.0, we will use Magic
-                 * numbers for the high bits. */
-                  uint64_t high = 0x95c0100000000000;
                   uint64_t inst = getBinaryCodeForInstr(MI);
-                  inst |= high;
                 /* Set End Of Program bit */
                 /* XXX: Need better check of end of program.  EOP should be
                  * encoded in one of the operands of the MI, and it should be
@@ -286,7 +293,7 @@ void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
 
    /* Some instructions are just place holder instructions that represent
     * operations that the GPU does automatically.  They should be ignored. */
-  if (isPlaceHolderOpcode(MI.getOpcode())) {
+  if (AMDGPU::isPlaceHolderOpcode(MI.getOpcode())) {
     return;
   }
 
@@ -309,18 +316,25 @@ void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
   /* Emit instruction type */
   emitByte(0);
 
-  unsigned int opIndex;
-  for (opIndex = 1; opIndex < numOperands; opIndex++) {
-    /* Literal constants are always stored as the last operand. */
-    if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) {
-      break;
+  if (isCube) {
+    static const int cube_src_swz[] = {2, 2, 0, 1};
+    emitSrc(MI.getOperand(1), cube_src_swz[currentElement]);
+    emitSrc(MI.getOperand(1), cube_src_swz[3-currentElement]);
+    emitNullBytes(SRC_BYTE_COUNT);
+  } else {
+    unsigned int opIndex;
+    for (opIndex = 1; opIndex < numOperands; opIndex++) {
+      /* Literal constants are always stored as the last operand. */
+      if (MI.getOperand(opIndex).isImm() || MI.getOperand(opIndex).isFPImm()) {
+        break;
+      }
+      emitSrc(MI.getOperand(opIndex));
     }
-    emitSrc(MI.getOperand(opIndex));
-  }
 
     /* Emit zeros for unused sources */
-  for ( ; opIndex < 4; opIndex++) {
-    emitNullBytes(SRC_BYTE_COUNT);
+    for ( ; opIndex < 4; opIndex++) {
+      emitNullBytes(SRC_BYTE_COUNT);
+    }
   }
 
   emitDst(dstOp);
@@ -328,7 +342,7 @@ void R600CodeEmitter::emitALUInstr(MachineInstr &MI)
   emitALU(MI, numOperands - 1);
 }
 
-void R600CodeEmitter::emitSrc(const MachineOperand & MO)
+void R600CodeEmitter::emitSrc(const MachineOperand & MO, int chan_override /* = -1 */)
 {
   uint32_t value = 0;
   /* Emit the source select (2 bytes).  For GPRs, this is the register index.
@@ -354,8 +368,10 @@ void R600CodeEmitter::emitSrc(const MachineOperand & MO)
   }
 
   /* Emit the source channel (1 byte) */
-  if (isReduction) {
-    emitByte(reductionElement);
+  if (chan_override != -1) {
+    emitByte(chan_override);
+  } else if (isReduction) {
+    emitByte(currentElement);
   } else if (MO.isReg()) {
     emitByte(TRI->getHWRegChan(MO.getReg()));
   } else {
@@ -397,8 +413,8 @@ void R600CodeEmitter::emitDst(const MachineOperand & MO)
     emitByte(getHWReg(MO.getReg()));
 
     /* Emit the element of the destination register (1 byte)*/
-    if (isReduction) {
-      emitByte(reductionElement);
+    if (isReduction || isCube) {
+      emitByte(currentElement);
     } else {
       emitByte(TRI->getHWRegChan(MO.getReg()));
     }
@@ -411,7 +427,7 @@ void R600CodeEmitter::emitDst(const MachineOperand & MO)
     }
 
     /* Emit writemask (1 byte).  */
-    if ((isReduction && reductionElement != TRI->getHWRegChan(MO.getReg()))
+    if ((isReduction && currentElement != TRI->getHWRegChan(MO.getReg()))
          || MO.getTargetFlags() & MO_FLAG_MASK) {
       emitByte(0);
     } else {
@@ -570,6 +586,7 @@ void R600CodeEmitter::emitFCInstr(MachineInstr &MI)
   case AMDIL::BREAK_LOGICALZ_f32:
     instr = FC_BREAK;
     break;
+  case AMDIL::BREAK_LOGICALNZ_f32:
   case AMDIL::BREAK_LOGICALNZ_i32:
     instr = FC_BREAK_NZ_INT;
     break;
@@ -577,6 +594,7 @@ void R600CodeEmitter::emitFCInstr(MachineInstr &MI)
     instr = FC_BREAK_Z_INT;
     break;
   case AMDIL::CONTINUE_LOGICALNZ_f32:
+  case AMDIL::CONTINUE_LOGICALNZ_i32:
     instr = FC_CONTINUE;
     break;
   /* XXX: This assumes that all IFs will be if (x != 0).  If we add
@@ -706,44 +724,5 @@ RegElement maskBitToElement(unsigned int maskBit)
   }
 }
 
-unsigned int dstSwizzleToWriteMask(unsigned swizzle)
-{
-  switch(swizzle) {
-  default:
-  case AMDIL_DST_SWIZZLE_DEFAULT:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_X___:
-    return WRITE_MASK_X;
-  case AMDIL_DST_SWIZZLE_XY__:
-    return WRITE_MASK_X | WRITE_MASK_Y;
-  case AMDIL_DST_SWIZZLE_XYZ_:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE_XYZW:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE__Y__:
-    return WRITE_MASK_Y;
-  case AMDIL_DST_SWIZZLE__YZ_:
-    return WRITE_MASK_Y | WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE__YZW:
-    return WRITE_MASK_Y | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE___Z_:
-    return WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE___ZW:
-    return WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE____W:
-    return WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_X_ZW:
-    return WRITE_MASK_X | WRITE_MASK_Z | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_XY_W:
-    return WRITE_MASK_X | WRITE_MASK_Y | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE_X_Z_:
-    return WRITE_MASK_X | WRITE_MASK_Z;
-  case AMDIL_DST_SWIZZLE_X__W:
-    return WRITE_MASK_X | WRITE_MASK_W;
-  case AMDIL_DST_SWIZZLE__Y_W:
-    return WRITE_MASK_Y | WRITE_MASK_W;
-  }
-}
-
 #include "AMDILGenCodeEmitter.inc"
 
diff --git a/src/gallium/drivers/radeon/R600GenRegisterInfo.pl b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
index cbded115766..406f3dfdd39 100644
--- a/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
+++ b/src/gallium/drivers/radeon/R600GenRegisterInfo.pl
@@ -1,20 +1,23 @@
-#===-- R600GenRegisterInfo.pl - TODO: Add brief description -------===#
+#===-- R600GenRegisterInfo.pl - Script for generating register info files --===#
 #
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 #
-#===----------------------------------------------------------------------===#
+#===------------------------------------------------------------------------===#
 #
-# TODO: Add full description
+# This Perl script prints to stdout .td code to be used as R600RegisterInfo.td.
+# It also generates a file called R600HwRegInfo.include, which contains helper
+# functions for determining the hw encoding of registers.
 #
-#===----------------------------------------------------------------------===#
+#===------------------------------------------------------------------------===#
 
 use strict;
 use warnings;
 
-use AMDGPUConstants;
+use constant CONST_REG_COUNT => 256;
+use constant TEMP_REG_COUNT => 128;
 
 my $CREG_MAX = CONST_REG_COUNT - 1;
 my $TREG_MAX = TEMP_REG_COUNT - 1;
@@ -81,7 +84,7 @@ def R600_Reg32 : RegisterClass <"AMDIL", [f32, i32], 32, (add
     R600_CReg32,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>;
 
-def R600_Reg128 : RegisterClass<"AMDIL", [v4f32], 128, (add
+def R600_Reg128 : RegisterClass<"AMDIL", [v4f32, v4i32], 128, (add
     $t128_string)>
 {
   let SubRegClasses = [(R600_TReg32 sel_x, sel_y, sel_z, sel_w)];
@@ -170,3 +173,24 @@ sub print_reg_defs {
   return @reg_list;
 }
 
+# Helper functions
+# get_hw_index: integer quotient of the register index by 4.
+sub get_hw_index {
+  my ($index) = @_;
+  return int($index / 4);
+}
+
+# get_chan_str: channel letter for a register index; index modulo 4 maps
+# 0, 1, 2, 3 to 'X', 'Y', 'Z', 'W' respectively.
+sub get_chan_str {
+  my ($index) = @_;
+  my %chan_name = (0 => 'X', 1 => 'Y', 2 => 'Z', 3 => 'W');
+  my $chan = $index % 4;
+
+  # Any value outside the table is fatal.
+  if (!exists $chan_name{$chan}) {
+    die("Unknown chan value: $chan");
+  }
+
+  return $chan_name{$chan};
+}
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.cpp b/src/gallium/drivers/radeon/R600ISelLowering.cpp
index f92fe2641a5..e85ac31b34c 100644
--- a/src/gallium/drivers/radeon/R600ISelLowering.cpp
+++ b/src/gallium/drivers/radeon/R600ISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- R600ISelLowering.cpp - TODO: Add brief description -------===//
+//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Most of the DAG lowering is handled in AMDILISelLowering.cpp.  This file
+// is mostly EmitInstrWithCustomInserter().
 //
 //===----------------------------------------------------------------------===//
 
 #include "R600ISelLowering.h"
 #include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 using namespace llvm;
@@ -25,9 +27,13 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
 //  setSchedulingPreference(Sched::VLIW);
   addRegisterClass(MVT::v4f32, &AMDIL::R600_Reg128RegClass);
   addRegisterClass(MVT::f32, &AMDIL::R600_Reg32RegClass);
+  addRegisterClass(MVT::v4i32, &AMDIL::R600_Reg128RegClass);
+  addRegisterClass(MVT::i32, &AMDIL::R600_Reg32RegClass);
 
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Legal);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Legal);
 }
 
 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
@@ -35,10 +41,10 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 {
   MachineFunction * MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
+  MachineBasicBlock::iterator I = *MI;
 
   switch (MI->getOpcode()) {
   default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
-  /* XXX: Use helper function from AMDGPULowerShaderInstructions here */
   case AMDIL::TGID_X:
     addLiveIn(MI, MF, MRI, TII, AMDIL::T1_X);
     break;
@@ -84,7 +90,49 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   case AMDIL::LOCAL_SIZE_Z:
     lowerImplicitParameter(MI, *BB, MRI, 8);
     break;
+
+  case AMDIL::R600_LOAD_CONST:
+    {
+      int64_t RegIndex = MI->getOperand(1).getImm();
+      unsigned ConstantReg = AMDIL::R600_CReg32RegClass.getRegister(RegIndex);
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDIL::COPY))
+                  .addOperand(MI->getOperand(0))
+                  .addReg(ConstantReg);
+      break;
+    }
+
+  case AMDIL::LOAD_INPUT:
+    {
+      int64_t RegIndex = MI->getOperand(1).getImm();
+      addLiveIn(MI, MF, MRI, TII,
+                AMDIL::R600_TReg32RegClass.getRegister(RegIndex));
+      break;
+    }
+  case AMDIL::STORE_OUTPUT:
+    {
+      int64_t OutputIndex = MI->getOperand(1).getImm();
+      unsigned OutputReg = AMDIL::R600_TReg32RegClass.getRegister(OutputIndex);
+
+      BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDIL::COPY), OutputReg)
+                  .addOperand(MI->getOperand(0));
+
+      if (!MRI.isLiveOut(OutputReg)) {
+        MRI.addLiveOut(OutputReg);
+      }
+      break;
+    }
+
+  case AMDIL::RESERVE_REG:
+    {
+      R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>();
+      int64_t ReservedIndex = MI->getOperand(0).getImm();
+      unsigned ReservedReg =
+                          AMDIL::R600_TReg32RegClass.getRegister(ReservedIndex);
+      MFI->ReservedRegs.push_back(ReservedReg);
+      break;
+    }
   }
+
   MI->eraseFromParent();
   return BB;
 }
diff --git a/src/gallium/drivers/radeon/R600ISelLowering.h b/src/gallium/drivers/radeon/R600ISelLowering.h
index fd26bf538c4..fdd552a172d 100644
--- a/src/gallium/drivers/radeon/R600ISelLowering.h
+++ b/src/gallium/drivers/radeon/R600ISelLowering.h
@@ -1,4 +1,4 @@
-//===-- R600ISelLowering.h - TODO: Add brief description -------===//
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 DAG Lowering interface definition
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.cpp b/src/gallium/drivers/radeon/R600InstrInfo.cpp
index 0c7ffc4334d..2bd59fd5e1b 100644
--- a/src/gallium/drivers/radeon/R600InstrInfo.cpp
+++ b/src/gallium/drivers/radeon/R600InstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- R600InstrInfo.cpp - TODO: Add brief description -------===//
+//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 Implementation of TargetInstrInfo.
 //
 //===----------------------------------------------------------------------===//
 
@@ -73,10 +73,22 @@ unsigned R600InstrInfo::getISAOpcode(unsigned opcode) const
     case AMDIL::MOVE_i32:
       return AMDIL::MOV;
     case AMDIL::SHR_i32:
+      return getASHRop();
+    case AMDIL::USHR_i32:
       return getLSHRop();
   }
 }
 
+unsigned R600InstrInfo::getASHRop() const
+{
+  // Pick the ASHR opcode for the current device: generations older than
+  // HD5XXX take the _r600 variant, HD5XXX and newer take the _eg variant.
+  // Indentation follows the two-space style used by getLSHRop() below.
+  unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
+  const bool preHD5XXX = gen < AMDILDeviceInfo::HD5XXX;
+  return preHD5XXX ? AMDIL::ASHR_r600 : AMDIL::ASHR_eg;
+}
+
 unsigned R600InstrInfo::getLSHRop() const
 {
   unsigned gen = TM.getSubtarget<AMDILSubtarget>().device()->getGeneration();
diff --git a/src/gallium/drivers/radeon/R600InstrInfo.h b/src/gallium/drivers/radeon/R600InstrInfo.h
index aedaa9f47f3..014eeb0b9f7 100644
--- a/src/gallium/drivers/radeon/R600InstrInfo.h
+++ b/src/gallium/drivers/radeon/R600InstrInfo.h
@@ -1,4 +1,4 @@
-//===-- R600InstrInfo.h - TODO: Add brief description -------===//
+//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for R600InstrInfo
 //
 //===----------------------------------------------------------------------===//
 
@@ -52,6 +52,7 @@ namespace llvm {
   bool isTrig(const MachineInstr &MI) const;
 
   unsigned getLSHRop() const;
+  unsigned getASHRop() const;
   unsigned getMULHI_UINT() const;
   unsigned getMULLO_UINT() const;
   unsigned getRECIP_UINT() const;
diff --git a/src/gallium/drivers/radeon/R600Instructions.td b/src/gallium/drivers/radeon/R600Instructions.td
index 02043fdeea5..a18240f09bd 100644
--- a/src/gallium/drivers/radeon/R600Instructions.td
+++ b/src/gallium/drivers/radeon/R600Instructions.td
@@ -1,4 +1,4 @@
-//===-- R600Instructions.td - TODO: Add brief description -------===//
+//===-- R600Instructions.td - R600 Instruction defs  -------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 Tablegen instruction definitions
 //
 //===----------------------------------------------------------------------===//
 
@@ -84,7 +84,7 @@ class R600_3OP <bits<32> inst, string opName, list<dag> pattern,
   InstR600 <inst,
           (outs R600_Reg32:$dst),
           (ins R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2, variable_ops),
-          !strconcat(opName, "$dst $src0, $src1, $src2"),
+          !strconcat(opName, " $dst, $src0, $src1, $src2"),
           pattern,
           itin>{
 
@@ -92,7 +92,7 @@ class R600_3OP <bits<32> inst, string opName, list<dag> pattern,
   }
 
 class R600_REDUCTION <bits<32> inst, dag ins, string asm, list<dag> pattern,
-                      InstrItinClass itin = AnyALU> :
+                      InstrItinClass itin = VecALU> :
   InstR600 <inst,
           (outs R600_Reg32:$dst),
           ins,
@@ -152,8 +152,6 @@ class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, dag outs, dag ins,
   let Inst{31-30} = ELEM_SIZE;
 
   /* CF_ALLOC_EXPORT_WORD1_BUF */
-/* XXX: We can't have auto encoding of 64-bit instructions until LLVM 3.1 :( */
-/*
   let Inst{43-32} = ARRAY_SIZE;
   let Inst{47-44} = COMP_MASK;
   let Inst{51-48} = BURST_COUNT;
@@ -162,7 +160,6 @@ class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, dag outs, dag ins,
   let Inst{61-54} = cf_inst;
   let Inst{62}    = MARK;
   let Inst{63}    = BARRIER;
-*/
 }
 
 /*
@@ -311,6 +308,18 @@ def TRUNC : R600_1OP <
   [(set R600_Reg32:$dst, (int_AMDGPU_trunc R600_Reg32:$src))]
 >;
 
+def CEIL : R600_1OP <  // NOTE(review): matched to round_neginf, not round_posinf -- confirm this is intended
+  0x12, "CEIL",
+  [(set R600_Reg32:$dst, (int_AMDIL_round_neginf R600_Reg32:$src))]> {
+  let AMDILOp = AMDILInst.ROUND_NEGINF_f32;
+}
+
+def RNDNE : R600_1OP <
+  0x13, "RNDNE",
+  [(set R600_Reg32:$dst, (int_AMDIL_round_nearest R600_Reg32:$src))]> {
+  let AMDILOp = AMDILInst.ROUND_NEAREST_f32;
+}
+
 def FLOOR : R600_1OP <
   0x14, "FLOOR",
   [(set R600_Reg32:$dst, (int_AMDGPU_floor R600_Reg32:$src))]
@@ -329,64 +338,114 @@ def AND_INT : R600_2OP <
   let AMDILOp = AMDILInst.AND_i32;
 }
 
+def OR_INT : R600_2OP <
+  0x31, "OR_INT",
+  []>{
+  let AMDILOp = AMDILInst.BINARY_OR_i32;
+}
+
 def XOR_INT : R600_2OP <
   0x32, "XOR_INT",
   []
 >;
 
+def NOT_INT : R600_1OP <
+  0x33, "NOT_INT",
+  []>{
+  let AMDILOp = AMDILInst.BINARY_NOT_i32;
+}
+
 def ADD_INT : R600_2OP <
-  0x34, "ADD_INT $dst, $src0, $src1",
+  0x34, "ADD_INT",
   []>{
   let AMDILOp = AMDILInst.ADD_i32;
 }
 
 def SUB_INT : R600_2OP <
-	0x35, "SUB_INT $dst, $src0, $src1",
+	0x35, "SUB_INT",
 	[]
 >;
 
+def MAX_INT : R600_2OP <
+  0x36, "MAX_INT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_imax R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+def MIN_INT : R600_2OP <
+  0x37, "MIN_INT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_imin R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+def MAX_UINT : R600_2OP <
+  0x38, "MAX_UINT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_umax R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+def MIN_UINT : R600_2OP <
+  0x39, "MIN_UINT",
+  [(set R600_Reg32:$dst, (int_AMDGPU_umin R600_Reg32:$src0, R600_Reg32:$src1))]>;
+
+
 def SETE_INT : R600_2OP <
-  0x3A, "SETE_INT $dst, $src0, $src1",
+  0x3A, "SETE_INT",
   []>{
   let AMDILOp = AMDILInst.IEQ;
 }
 
 def SETGT_INT : R600_2OP <
-  0x3B, "SGT_INT $dst, $src0, $src1",
+  0x3B, "SGT_INT",
   []
 >;
 
 def SETGE_INT : R600_2OP <
-	0x3C, "SETGE_INT $dst, $src0, $src1",
+	0x3C, "SETGE_INT",
 	[]>{
   let AMDILOp = AMDILInst.IGE;
 }
 
 def SETNE_INT : R600_2OP <
-  0x3D, "SETNE_INT $dst, $src0, $src1",
+  0x3D, "SETNE_INT",
   []>{
   let AMDILOp = AMDILInst.INE;
 }
 
 def SETGT_UINT : R600_2OP <
-  0x3E, "SETGT_UINT $dst, $src0, $src1",
+  0x3E, "SETGT_UINT",
   []>{
   let AMDILOp = AMDILInst.UGT;
 }
 
 def SETGE_UINT : R600_2OP <
-  0x3F, "SETGE_UINT $dst, $src0, $src1",
+  0x3F, "SETGE_UINT",
   []>{
   let AMDILOp = AMDILInst.UGE;
 }
 
 def CNDE_INT : R600_3OP <
-	0x1C, "CNDE_INT $dst, $src0, $src1, $src2",
+	0x1C, "CNDE_INT",
 	[]
 >;
 
 /* Texture instructions */
 
+
+def TEX_LD : R600_TEX <
+  0x03, "TEX_LD",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <
+  0x04, "TEX_GET_TEXTURE_RESINFO",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_GET_GRADIENTS_H : R600_TEX <
+  0x07, "TEX_GET_GRADIENTS_H",
+  [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
+def TEX_GET_GRADIENTS_V : R600_TEX <
+  0x08, "TEX_GET_GRADIENTS_V",
+  [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$src1, imm:$src2))]
+>;
+
 def TEX_SAMPLE : R600_TEX <
   0x10, "TEX_SAMPLE",
   [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$src1, imm:$src2))]
@@ -434,6 +493,11 @@ def KILP : Pat <
   (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
 >;
 
+def KIL : Pat <
+  (int_AMDGPU_kill R600_Reg32:$src0),
+  (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
+>;
+
 /* Helper classes for common instructions */
 
 class MUL_LIT_Common <bits<32> inst> : R600_3OP <
@@ -470,6 +534,15 @@ class DOT4_Common <bits<32> inst> : R600_REDUCTION <
   [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))]
 >;
 
+// CUBE: VecALU instruction with one R600_Reg128 source and an R600_Reg128
+// result, selected for the int_AMDGPU_cube intrinsic.
+class CUBE_Common <bits<32> inst> : InstR600 <
+  inst, (outs R600_Reg128:$dst), (ins R600_Reg128:$src),
+  "CUBE $dst, $src",
+  [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))],
+  VecALU
+>;
+
 class EXP_IEEE_Common <bits<32> inst> : R600_1OP <
   inst, "EXP_IEEE",
   []> {
@@ -509,6 +582,12 @@ class LSHR_Common <bits<32> inst> : R600_2OP <
   let AMDILOp = AMDILInst.USHR_i32;
 }
 
+class ASHR_Common <bits<32> inst> : R600_2OP <
+  inst, "ASHR $dst, $src0, $src1",
+  [] >{
+  let AMDILOp = AMDILInst.SHR_i32;
+}
+
 class MULHI_INT_Common <bits<32> inst> : R600_2OP <
   inst, "MULHI_INT $dst, $src0, $src1",
   [] >{
@@ -608,6 +687,7 @@ let Gen = AMDGPUGen.R600 in {
   def CNDGT_r600 : CNDGT_Common<0x19>;
   def CNDGE_r600 : CNDGE_Common<0x1A>;
   def DOT4_r600 : DOT4_Common<0x50>;
+  def CUBE_r600 : CUBE_Common<0x52>;
   def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
   def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
   def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
@@ -619,6 +699,7 @@ let Gen = AMDGPUGen.R600 in {
   def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
   def SIN_r600 : SIN_Common<0x6E>;
   def COS_r600 : COS_Common<0x6F>;
+  def ASHR_r600 : ASHR_Common<0x70>;
   def LSHR_r600 : LSHR_Common<0x71>;
   def LSHL_r600 : LSHL_Common<0x72>;
   def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
@@ -661,20 +742,12 @@ def RAT_WRITE_CACHELESS_eg :
     EG_CF_RAT <0x57, 0x2, (outs), (ins R600_TReg32_X:$rw_gpr,
                                    R600_TReg32_X:$index_gpr, i32imm:$rat_id), "">
 {
-/*
-  let Inst{3-0}   = RAT_ID;
-  let Inst{21-15} = RW_GPR;
-  let Inst{29-23} = INDEX_GPR;
-  /* Propery of the UAV */
-  let Inst{31-30} = ELEM_SIZE;
-*/
   let RIM         = 0;
   /* XXX: Have a separate instruction for non-indexed writes. */
   let TYPE        = 1;
   let RW_REL      = 0;
   let ELEM_SIZE   = 0;
 
-/*
   let ARRAY_SIZE  = 0;
   let COMP_MASK   = 1;
   let BURST_COUNT = 0;
@@ -682,7 +755,6 @@ def RAT_WRITE_CACHELESS_eg :
   let EOP         = 0;
   let MARK        = 0;
   let BARRIER     = 1;
-*/
 }
 
 def VTX_READ_eg : InstR600ISA < (outs R600_TReg32_X:$dst),
@@ -789,6 +861,7 @@ class TRIG_eg <InstR600 trig, Intrinsic intr> : Pat<
 let Gen = AMDGPUGen.EG_CAYMAN in {
 
   def MULADD_eg : MULADD_Common<0x14>;
+  def ASHR_eg : ASHR_Common<0x15>;
   def LSHR_eg : LSHR_Common<0x16>;
   def LSHL_eg : LSHL_Common<0x17>;
   def CNDE_eg : CNDE_Common<0x19>;
@@ -812,6 +885,7 @@ let Gen = AMDGPUGen.EG_CAYMAN in {
   def RECIP_UINT_eg : RECIP_UINT_Common<0x94>;
   def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
   def DOT4_eg : DOT4_Common<0xBE>;
+  def CUBE_eg : CUBE_Common<0xC0>;
 
 } // End AMDGPUGen.EG_CAYMAN
 
@@ -905,6 +979,34 @@ def LOCAL_SIZE_Y : R600PreloadInst <"LOCAL_SIZE_Y",
 def LOCAL_SIZE_Z : R600PreloadInst <"LOCAL_SIZE_Z",
                                     int_r600_read_local_size_z>;
 
+def R600_LOAD_CONST : AMDGPUShaderInst <
+  (outs R600_Reg32:$dst),
+  (ins i32imm:$src0),
+  "R600_LOAD_CONST $dst, $src0",
+  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))]
+>;
+
+def LOAD_INPUT : AMDGPUShaderInst <
+  (outs R600_Reg32:$dst),
+  (ins i32imm:$src),
+  "LOAD_INPUT $dst, $src",
+  [(set R600_Reg32:$dst, (int_R600_load_input imm:$src))]
+>;
+
+def RESERVE_REG : AMDGPUShaderInst <
+  (outs),
+  (ins i32imm:$src),
+  "RESERVE_REG $src",
+  [(int_AMDGPU_reserve_reg imm:$src)]
+>;
+
+def STORE_OUTPUT: AMDGPUShaderInst <
+  (outs),
+  (ins R600_Reg32:$src0, i32imm:$src1),
+  "STORE_OUTPUT $src0, $src1",
+  [(int_AMDGPU_store_output R600_Reg32:$src0, imm:$src1)]
+>;
+
 } // End usesCustomInserter = 1, isPseudo = 1
 
 } // End isCodeGenOnly = 1
@@ -933,15 +1035,14 @@ def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 5, sel_y>;
 def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 6, sel_z>;
 def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 7, sel_w>;
 
+def : Extract_Element <i32, v4i32, R600_Reg128, 0, sel_x>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 1, sel_y>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 2, sel_z>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 3, sel_w>;
 
-include "R600ShaderPatterns.td"
-
-// We need this pattern to avoid having real registers in PHI nodes.
-// For some reason this pattern only works when it comes after the other
-// instruction defs.
-def : Pat <
-  (int_R600_load_input imm:$src),
-  (LOAD_INPUT imm:$src)
->;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 4, sel_x>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 5, sel_y>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 6, sel_z>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 7, sel_w>;
 
 } // End isR600toCayman Predicate
diff --git a/src/gallium/drivers/radeon/R600Intrinsics.td b/src/gallium/drivers/radeon/R600IntrinsicsNoOpenCL.td
index 8038fee1a3c..73ef4aae234 100644
--- a/src/gallium/drivers/radeon/R600Intrinsics.td
+++ b/src/gallium/drivers/radeon/R600IntrinsicsNoOpenCL.td
@@ -1,4 +1,4 @@
-//===-- R600Intrinsics.td - TODO: Add brief description -------===//
+//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 Intrinsic Definitions
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/R600InstrFormats.td b/src/gallium/drivers/radeon/R600IntrinsicsOpenCL.td
index 0890eb64509..cd761358475 100644
--- a/src/gallium/drivers/radeon/R600InstrFormats.td
+++ b/src/gallium/drivers/radeon/R600IntrinsicsOpenCL.td
@@ -1,4 +1,4 @@
-//===-- R600InstrFormats.td - TODO: Add brief description -------===//
+//===-- R600Intrinsics.td - TODO: Add brief description -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,6 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-
-class ALUInst <bits<10> op, dag outs, dag ins, string asm, list<dag> pattern>
-  : InstR600 <, outs, ins , asm, pattern>
+let TargetPrefix = "R600", isTarget = 1 in {
+  def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadWriteArgMem]>;
+}
diff --git a/src/gallium/drivers/radeon/R600KernelParameters.cpp b/src/gallium/drivers/radeon/R600KernelParameters.cpp
index 3fdf48a2bf2..53bfebc7364 100644
--- a/src/gallium/drivers/radeon/R600KernelParameters.cpp
+++ b/src/gallium/drivers/radeon/R600KernelParameters.cpp
@@ -1,4 +1,4 @@
-//===-- R600KernelParameters.cpp - TODO: Add brief description -------===//
+//===-- R600KernelParameters.cpp - Lower kernel function arguments --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,89 +7,83 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass lowers kernel function arguments to loads from the vertex buffer.
+//
+// Kernel arguments are stored in the vertex buffer at an offset of 9 dwords,
+// so arg0 needs to be loaded from VTX_BUFFER[9] and arg1 is loaded from
+// VTX_BUFFER[10], etc.
 //
 //===----------------------------------------------------------------------===//
 
-#include <llvm-c/Core.h>
-#include "R600KernelParameters.h"
-#include "R600OpenCLUtils.h"
+#include "AMDGPU.h"
+#include "AMDIL.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Constants.h"
+#include "llvm/Function.h"
 #include "llvm/Intrinsics.h"
+#include "llvm/Metadata.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetData.h"
 #include "llvm/Support/IRBuilder.h"
 #include "llvm/Support/TypeBuilder.h"
-// #include "llvm/CodeGen/Function.h"
-
-namespace AMDILAS {
-enum AddressSpaces {
-  PRIVATE_ADDRESS  = 0, // Address space for private memory.
-  GLOBAL_ADDRESS   = 1, // Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2, // Address space for constant memory.
-  LOCAL_ADDRESS    = 3, // Address space for local memory.
-  REGION_ADDRESS   = 4, // Address space for region memory.
-  ADDRESS_NONE     = 5, // Address space for unknown memory.
-  PARAM_D_ADDRESS  = 6, // Address space for direct addressible parameter memory (CONST0)
-  PARAM_I_ADDRESS  = 7, // Address space for indirect addressible parameter memory (VTX1)
-  LAST_ADDRESS     = 8
-};
-}
-
 
 #include <map>
 #include <set>
 
 using namespace llvm;
-using namespace std;
+
+namespace {
 
 #define CONSTANT_CACHE_SIZE_DW 127
 
-class R600KernelParameters : public llvm::FunctionPass
+class R600KernelParameters : public FunctionPass
 {
-  const llvm::TargetData * TD;
+  const TargetData * TD;
   LLVMContext* Context;
   Module *mod;
-  
+
   struct param
   {
-    param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0), indirect(false), specialID(0) {}
-    
-    llvm::Value* val;
-    llvm::Value* ptr_val;
+    param() : val(NULL), ptr_val(NULL), offset_in_dw(0), size_in_dw(0),
+              indirect(false), specialID(0) {}
+
+    Value* val;
+    Value* ptr_val;
     int offset_in_dw;
     int size_in_dw;
 
     bool indirect;
-    
-    string specialType;
+
+    std::string specialType;
     int specialID;
-    
+
     int end() { return offset_in_dw + size_in_dw; }
-    /* The first 9 dwords are reserved for the grid sizes. */
+    // The first 9 dwords are reserved for the grid sizes.
     int get_rat_offset() { return 9 + offset_in_dw; }
   };
 
   std::vector<param> params;
 
-  int getLastSpecialID(const string& TypeName);
-  
+  bool isOpenCLKernel(const Function* fun);
+  int getLastSpecialID(const std::string& TypeName);
+
   int getListSize();
-  void AddParam(llvm::Argument* arg);
-  int calculateArgumentSize(llvm::Argument* arg);
-  void RunAna(llvm::Function* fun);
-  void Replace(llvm::Function* fun);
-  bool isIndirect(Value* val, set<Value*>& visited);
-  void Propagate(llvm::Function* fun);
-  void Propagate(llvm::Value* v, const llvm::Twine& name, bool indirect = false);
+  void AddParam(Argument* arg);
+  int calculateArgumentSize(Argument* arg);
+  void RunAna(Function* fun);
+  void Replace(Function* fun);
+  bool isIndirect(Value* val, std::set<Value*>& visited);
+  void Propagate(Function* fun);
+  void Propagate(Value* v, const Twine& name, bool indirect = false);
   Value* ConstantRead(Function* fun, param& p);
   Value* handleSpecial(Function* fun, param& p);
   bool isSpecialType(Type*);
-  string getSpecialTypeName(Type*);
+  std::string getSpecialTypeName(Type*);
 public:
   static char ID;
   R600KernelParameters() : FunctionPass(ID) {};
-  R600KernelParameters(const llvm::TargetData* TD) : FunctionPass(ID), TD(TD) {}
-//   bool runOnFunction (llvm::Function &F);
-  bool runOnFunction (llvm::Function &F);
+  R600KernelParameters(const TargetData* TD) : FunctionPass(ID), TD(TD) {}
+  bool runOnFunction (Function &F);
   void getAnalysisUsage(AnalysisUsage &AU) const;
   const char *getPassName() const;
   bool doInitialization(Module &M);
@@ -98,13 +92,42 @@ public:
 
 char R600KernelParameters::ID = 0;
 
-static RegisterPass<R600KernelParameters> X("kerparam", "OpenCL Kernel Parameter conversion", false, false);
+static RegisterPass<R600KernelParameters> X("kerparam",
+                            "OpenCL Kernel Parameter conversion", false, false);
 
-int R600KernelParameters::getLastSpecialID(const string& TypeName)
+// Returns true if fun is listed in the module's "opencl.kernels" metadata.
+// Uses getNamedMetadata() so that a pure query never modifies the module.
+bool R600KernelParameters::isOpenCLKernel(const Function* fun)
+{
+  const Module *parent = fun->getParent();
+  NamedMDNode * md = parent->getNamedMetadata("opencl.kernels");
+  if (!md)
+  {
+    return false;
+  }
+
+  for (unsigned i = 0; i < md->getNumOperands(); i++)
+  {
+    MDNode *kernel = md->getOperand(i);
+    // Check the operand count before touching operand 0.
+    if (!kernel or !kernel->getNumOperands() or !kernel->getOperand(0))
+    {
+      continue;
+    }
+    assert(kernel->getNumOperands() == 1);
+    if (kernel->getOperand(0)->getName() == fun->getName())
+    {
+      return true;
+    }
+  }
+  return false;
+}
+
+int R600KernelParameters::getLastSpecialID(const std::string& TypeName)
 {
   int lastID = -1;
-  
-  for (vector<param>::iterator i = params.begin(); i != params.end(); i++)
+
+  for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
   {
     if (i->specialType == TypeName)
     {
@@ -125,7 +148,7 @@ int R600KernelParameters::getListSize()
   return params.back().end();
 }
 
-bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
+bool R600KernelParameters::isIndirect(Value* val, std::set<Value*>& visited)
 {
   if (isa<LoadInst>(val))
   {
@@ -144,7 +167,7 @@ bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
   }
 
   visited.insert(val);
-  
+
   if (isa<GetElementPtrInst>(val))
   {
     GetElementPtrInst* GEP = dyn_cast<GetElementPtrInst>(val);
@@ -158,7 +181,7 @@ bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
       }
     }
   }
-  
+
   for (Value::use_iterator i = val->use_begin(); i != val->use_end(); i++)
   {
     Value* v2 = dyn_cast<Value>(*i);
@@ -175,24 +198,24 @@ bool R600KernelParameters::isIndirect(Value* val, set<Value*>& visited)
   return false;
 }
 
-void R600KernelParameters::AddParam(llvm::Argument* arg)
+void R600KernelParameters::AddParam(Argument* arg)
 {
   param p;
-  
+
   p.val = dyn_cast<Value>(arg);
   p.offset_in_dw = getListSize();
   p.size_in_dw = calculateArgumentSize(arg);
 
   if (isa<PointerType>(arg->getType()) and arg->hasByValAttr())
   {
-    set<Value*> visited;
+    std::set<Value*> visited;
     p.indirect = isIndirect(p.val, visited);
   }
-  
+
   params.push_back(p);
 }
 
-int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg)
+int R600KernelParameters::calculateArgumentSize(Argument* arg)
 {
   Type* t = arg->getType();
 
@@ -200,16 +223,16 @@ int R600KernelParameters::calculateArgumentSize(llvm::Argument* arg)
   {
     t = dyn_cast<PointerType>(t)->getElementType();
   }
-  
+
   int store_size_in_dw = (TD->getTypeStoreSize(t) + 3)/4;
 
   assert(store_size_in_dw);
-  
+
   return store_size_in_dw;
 }
 
 
-void R600KernelParameters::RunAna(llvm::Function* fun)
+void R600KernelParameters::RunAna(Function* fun)
 {
   assert(isOpenCLKernel(fun));
 
@@ -220,7 +243,7 @@ void R600KernelParameters::RunAna(llvm::Function* fun)
 
 }
 
-void R600KernelParameters::Replace(llvm::Function* fun)
+void R600KernelParameters::Replace(Function* fun)
 {
   for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
   {
@@ -237,11 +260,11 @@ void R600KernelParameters::Replace(llvm::Function* fun)
     if (new_val)
     {
       i->val->replaceAllUsesWith(new_val);
-    }   
+    }
   }
 }
 
-void R600KernelParameters::Propagate(llvm::Function* fun)
+void R600KernelParameters::Propagate(Function* fun)
 {
   for (std::vector<param>::iterator i = params.begin(); i != params.end(); i++)
   {
@@ -256,8 +279,8 @@ void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
 {
   LoadInst* load = dyn_cast<LoadInst>(v);
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(v);
-  
-  unsigned addrspace; 
+
+  unsigned addrspace;
 
   if (indirect)
   {
@@ -274,49 +297,54 @@ void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
 
     if (dyn_cast<PointerType>(op->getType())->getAddressSpace() != addrspace)
     {
-      op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(op->getType())->getElementType(), addrspace), name, dyn_cast<Instruction>(v));
+      op = new BitCastInst(op, PointerType::get(dyn_cast<PointerType>(
+                           op->getType())->getElementType(), addrspace),
+                           name, dyn_cast<Instruction>(v));
     }
 
-    vector<Value*> params(GEP->idx_begin(), GEP->idx_end());
-    
-    GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name, dyn_cast<Instruction>(v));
+    std::vector<Value*> params(GEP->idx_begin(), GEP->idx_end());
+
+    GetElementPtrInst* GEP2 = GetElementPtrInst::Create(op, params, name,
+                                                      dyn_cast<Instruction>(v));
     GEP2->setIsInBounds(GEP->isInBounds());
     v = dyn_cast<Value>(GEP2);
     GEP->replaceAllUsesWith(GEP2);
     GEP->eraseFromParent();
     load = NULL;
   }
-  
+
   if (load)
   {
-    if (load->getPointerAddressSpace() != addrspace) ///normally at this point we have the right address space
+    ///normally at this point we have the right address space
+    if (load->getPointerAddressSpace() != addrspace)
     {
       Value *orig_ptr = load->getPointerOperand();
       PointerType *orig_ptr_type = dyn_cast<PointerType>(orig_ptr->getType());
-      
-      Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(), addrspace);
+
+      Type* new_ptr_type = PointerType::get(orig_ptr_type->getElementType(),
+                                            addrspace);
 
       Value* new_ptr = orig_ptr;
-      
+
       if (orig_ptr->getType() != new_ptr_type)
       {
         new_ptr = new BitCastInst(orig_ptr, new_ptr_type, "prop_cast", load);
       }
-      
+
       Value* new_load = new LoadInst(new_ptr, name, load);
       load->replaceAllUsesWith(new_load);
       load->eraseFromParent();
     }
-    
+
     return;
   }
 
-  vector<User*> users(v->use_begin(), v->use_end());
-  
+  std::vector<User*> users(v->use_begin(), v->use_end());
+
   for (int i = 0; i < int(users.size()); i++)
   {
     Value* v2 = dyn_cast<Value>(users[i]);
-    
+
     if (v2)
     {
       Propagate(v2, name, indirect);
@@ -327,7 +355,7 @@ void R600KernelParameters::Propagate(Value* v, const Twine& name, bool indirect)
 Value* R600KernelParameters::ConstantRead(Function* fun, param& p)
 {
   assert(fun->front().begin() != fun->front().end());
-  
+
   Instruction *first_inst = fun->front().begin();
   IRBuilder <> builder (first_inst);
 /* First 3 dwords are reserved for the dimmension info */
@@ -346,43 +374,54 @@ Value* R600KernelParameters::ConstantRead(Function* fun, param& p)
   {
     addrspace = AMDILAS::PARAM_D_ADDRESS;
   }
-  
+
   Argument *arg = dyn_cast<Argument>(p.val);
   Type * argType = p.val->getType();
   PointerType * argPtrType = dyn_cast<PointerType>(p.val->getType());
-  
+
   if (argPtrType and arg->hasByValAttr())
   {
-    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(Type::getInt32Ty(*Context), addrspace));
-    Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr, ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName(), first_inst);
-    param_ptr = new BitCastInst(param_ptr, PointerType::get(argPtrType->getElementType(), addrspace), arg->getName(), first_inst);
+    Value* param_addr_space_ptr = ConstantPointerNull::get(
+                                    PointerType::get(Type::getInt32Ty(*Context),
+                                    addrspace));
+    Value* param_ptr = GetElementPtrInst::Create(param_addr_space_ptr,
+                                    ConstantInt::get(Type::getInt32Ty(*Context),
+                                    p.get_rat_offset()), arg->getName(),
+                                    first_inst);
+    param_ptr = new BitCastInst(param_ptr,
+                                PointerType::get(argPtrType->getElementType(),
+                                                 addrspace),
+                                arg->getName(), first_inst);
     p.ptr_val = param_ptr;
     return param_ptr;
   }
   else
   {
-    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(argType, addrspace));
-    
+    Value* param_addr_space_ptr = ConstantPointerNull::get(PointerType::get(
+                                                        argType, addrspace));
+
     Value* param_ptr = builder.CreateGEP(param_addr_space_ptr,
-             ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()), arg->getName());
-    
+             ConstantInt::get(Type::getInt32Ty(*Context), p.get_rat_offset()),
+                              arg->getName());
+
     Value* param_value = builder.CreateLoad(param_ptr, arg->getName());
-    
+
     return param_value;
   }
 }
 
 Value* R600KernelParameters::handleSpecial(Function* fun, param& p)
 {
-  string name = getSpecialTypeName(p.val->getType());
+  std::string name = getSpecialTypeName(p.val->getType());
   int ID;
 
   assert(!name.empty());
-  
+
   if (name == "image2d_t" or name == "image3d_t")
   {
-    int lastID = max(getLastSpecialID("image2d_t"), getLastSpecialID("image3d_t"));
-    
+    int lastID = std::max(getLastSpecialID("image2d_t"),
+                     getLastSpecialID("image3d_t"));
+
     if (lastID == -1)
     {
       ID = 2; ///ID0 and ID1 are used internally by the driver
@@ -403,20 +442,22 @@ Value* R600KernelParameters::handleSpecial(Function* fun, param& p)
     else
     {
       ID = lastID + 1;
-    }    
+    }
   }
   else
   {
     ///TODO: give some error message
     return NULL;
   }
-    
+
   p.specialType = name;
   p.specialID = ID;
 
   Instruction *first_inst = fun->front().begin();
 
-  return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context), p.specialID), p.val->getType(), "resourceID", first_inst);
+  return new IntToPtrInst(ConstantInt::get(Type::getInt32Ty(*Context),
+                                           p.specialID), p.val->getType(),
+                                           "resourceID", first_inst);
 }
 
 
@@ -425,7 +466,7 @@ bool R600KernelParameters::isSpecialType(Type* t)
   return !getSpecialTypeName(t).empty();
 }
 
-string R600KernelParameters::getSpecialTypeName(Type* t)
+std::string R600KernelParameters::getSpecialTypeName(Type* t)
 {
   PointerType *pt = dyn_cast<PointerType>(t);
   StructType *st = NULL;
@@ -437,9 +478,9 @@ string R600KernelParameters::getSpecialTypeName(Type* t)
 
   if (st)
   {
-    string prefix = "struct.opencl_builtin_type_";
-    
-    string name = st->getName().str();
+    std::string prefix = "struct.opencl_builtin_type_";
+
+    std::string name = st->getName().str();
 
     if (name.substr(0, prefix.length()) == prefix)
     {
@@ -458,19 +499,15 @@ bool R600KernelParameters::runOnFunction (Function &F)
     return false;
   }
 
-//  F.dump();
-  
   RunAna(&F);
   Replace(&F);
   Propagate(&F);
-  
-   mod->dump();
+
   return false;
 }
 
 void R600KernelParameters::getAnalysisUsage(AnalysisUsage &AU) const
 {
-//   AU.addRequired<FunctionAnalysis>();
   FunctionPass::getAnalysisUsage(AU);
   AU.setPreservesAll();
 }
@@ -484,7 +521,7 @@ bool R600KernelParameters::doInitialization(Module &M)
 {
   Context = &M.getContext();
   mod = &M;
-  
+
   return false;
 }
 
@@ -493,10 +530,12 @@ bool R600KernelParameters::doFinalization(Module &M)
   return false;
 }
 
-llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD)
+} // End anonymous namespace
+
+FunctionPass* llvm::createR600KernelParametersPass(const TargetData* TD)
 {
   FunctionPass *p = new R600KernelParameters(TD);
-  
+
   return p;
 }
 
diff --git a/src/gallium/drivers/radeon/R600KernelParameters.h b/src/gallium/drivers/radeon/R600KernelParameters.h
deleted file mode 100644
index 904a469a5f0..00000000000
--- a/src/gallium/drivers/radeon/R600KernelParameters.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===-- R600KernelParameters.h - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef KERNELPARAMETERS_H
-#define KERNELPARAMETERS_H
-
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Value.h"
-
-#include <vector>
-
-llvm::FunctionPass* createR600KernelParametersPass(const llvm::TargetData* TD);
-
-
-#endif
diff --git a/src/gallium/drivers/radeon/R600LowerInstructions.cpp b/src/gallium/drivers/radeon/R600LowerInstructions.cpp
index fb5431d0eef..dca1fe195cc 100644
--- a/src/gallium/drivers/radeon/R600LowerInstructions.cpp
+++ b/src/gallium/drivers/radeon/R600LowerInstructions.cpp
@@ -1,4 +1,4 @@
-//===-- R600LowerInstructions.cpp - TODO: Add brief description -------===//
+//===-- R600LowerInstructions.cpp - Lower unsupported AMDIL instructions --===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass lowers AMDIL MachineInstrs that aren't supported by the R600
+// target to either supported AMDIL MachineInstrs or R600 MachineInstrs.
 //
 //===----------------------------------------------------------------------===//
 
@@ -93,8 +94,8 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
                            &AMDIL::R600_TReg32RegClass);
           BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGE_INT),
                   setgt)
-                  .addOperand(MI.getOperand(1))
-                  .addReg(AMDIL::ZERO);
+                  .addReg(AMDIL::ZERO)
+                  .addOperand(MI.getOperand(1));
 
           unsigned add_int = MRI->createVirtualRegister(
                              &AMDIL::R600_TReg32RegClass);
@@ -311,7 +312,8 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
         MachineInstr * defInstr = MRI->getVRegDef(maskedRegister);
         MachineOperand * def = defInstr->findRegisterDefOperand(maskedRegister);
         def->addTargetFlag(MO_FLAG_MASK);
-        break;
+        /* Continue so the instruction is not erased */
+        continue;
       }
 
       case AMDIL::NEGATE_i32:
@@ -342,6 +344,13 @@ bool R600LowerInstructionsPass::runOnMachineFunction(MachineFunction &MF)
           break;
         }
 
+      case AMDIL::ULT:
+        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::SETGT_UINT))
+                .addOperand(MI.getOperand(0))
+                .addOperand(MI.getOperand(2))
+                .addOperand(MI.getOperand(1));
+        break;
+
       default:
         continue;
       }
diff --git a/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp b/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
deleted file mode 100644
index 394ee7006ce..00000000000
--- a/src/gallium/drivers/radeon/R600LowerShaderInstructions.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-//===-- R600LowerShaderInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPULowerShaderInstructions.h"
-#include "AMDIL.h"
-#include "AMDILInstrInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-  class R600LowerShaderInstructionsPass : public MachineFunctionPass,
-        public AMDGPULowerShaderInstructionsPass {
-
-  private:
-    static char ID;
-    TargetMachine &TM;
-
-    void lowerEXPORT_REG_FAKE(MachineInstr &MI, MachineBasicBlock &MBB,
-        MachineBasicBlock::iterator I);
-    void lowerLOAD_INPUT(MachineInstr & MI);
-    bool lowerSTORE_OUTPUT(MachineInstr & MI, MachineBasicBlock &MBB,
-        MachineBasicBlock::iterator I);
-
-  public:
-    R600LowerShaderInstructionsPass(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm) { }
-
-      bool runOnMachineFunction(MachineFunction &MF);
-
-      const char *getPassName() const { return "R600 Lower Shader Instructions"; }
-    };
-} /* End anonymous namespace */
-
-char R600LowerShaderInstructionsPass::ID = 0;
-
-FunctionPass *llvm::createR600LowerShaderInstructionsPass(TargetMachine &tm) {
-    return new R600LowerShaderInstructionsPass(tm);
-}
-
-#define INSTR_CASE_FLOAT_V(inst) \
-  case AMDIL:: inst##_v4f32: \
-
-#define INSTR_CASE_FLOAT_S(inst) \
-  case AMDIL:: inst##_f32:
-
-#define INSTR_CASE_FLOAT(inst) \
-  INSTR_CASE_FLOAT_V(inst) \
-  INSTR_CASE_FLOAT_S(inst)
-bool R600LowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
-{
-  MRI = &MF.getRegInfo();
-
-
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end();) {
-      MachineInstr &MI = *I;
-      bool deleteInstr = false;
-      switch (MI.getOpcode()) {
-
-      default: break;
-
-      case AMDIL::RESERVE_REG:
-      case AMDIL::EXPORT_REG:
-        deleteInstr = true;
-        break;
-
-      case AMDIL::LOAD_INPUT:
-        lowerLOAD_INPUT(MI);
-        deleteInstr = true;
-        break;
-
-      case AMDIL::STORE_OUTPUT:
-        deleteInstr = lowerSTORE_OUTPUT(MI, MBB, I);
-        break;
-
-      }
-
-      ++I;
-
-      if (deleteInstr) {
-        MI.eraseFromParent();
-      }
-    }
-  }
-
-  return false;
-}
-
-/* The goal of this function is to replace the virutal destination register of
- * a LOAD_INPUT instruction with the correct physical register that will.
- *
- * XXX: I don't think this is the right way things assign physical registers,
- * but I'm not sure of another way to do this.
- */
-void R600LowerShaderInstructionsPass::lowerLOAD_INPUT(MachineInstr &MI)
-{
-  MachineOperand &dst = MI.getOperand(0);
-  MachineOperand &arg = MI.getOperand(1);
-  int64_t inputIndex = arg.getImm();
-  const TargetRegisterClass * inputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
-  unsigned newRegister = inputClass->getRegister(inputIndex);
-  unsigned dstReg = dst.getReg();
-
-  preloadRegister(MI.getParent()->getParent(), TM.getInstrInfo(), newRegister,
-                  dstReg);
-}
-
-bool R600LowerShaderInstructionsPass::lowerSTORE_OUTPUT(MachineInstr &MI,
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
-{
-  MachineOperand &valueOp = MI.getOperand(1);
-  MachineOperand &indexOp = MI.getOperand(2);
-  unsigned valueReg = valueOp.getReg();
-  int64_t outputIndex = indexOp.getImm();
-  const TargetRegisterClass * outputClass = TM.getRegisterInfo()->getRegClass(AMDIL::R600_TReg32RegClassID);
-  unsigned newRegister = outputClass->getRegister(outputIndex);
-
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TM.getInstrInfo()->get(AMDIL::COPY),
-                  newRegister)
-                  .addReg(valueReg);
-
-  if (!MRI->isLiveOut(newRegister))
-    MRI->addLiveOut(newRegister);
-
-  return true;
-
-}
diff --git a/src/gallium/drivers/radeon/R600MachineFunctionInfo.cpp b/src/gallium/drivers/radeon/R600MachineFunctionInfo.cpp
new file mode 100644
index 00000000000..48443fb57d8
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600MachineFunctionInfo.cpp
@@ -0,0 +1,16 @@
+//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "R600MachineFunctionInfo.h"
+
+using namespace llvm;
+
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : MachineFunctionInfo()
+  { }
diff --git a/src/gallium/drivers/radeon/R600MachineFunctionInfo.h b/src/gallium/drivers/radeon/R600MachineFunctionInfo.h
new file mode 100644
index 00000000000..948e1924272
--- /dev/null
+++ b/src/gallium/drivers/radeon/R600MachineFunctionInfo.h
@@ -0,0 +1,33 @@
+//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600MachineFunctionInfo is used for keeping track of which registers have
+// been reserved by the llvm.AMDGPU.reserve.reg intrinsic.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600MACHINEFUNCTIONINFO_H
+#define R600MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include <vector>
+
+namespace llvm {
+
+class R600MachineFunctionInfo : public MachineFunctionInfo {
+  // Per-function record of the registers that must be treated as reserved.
+public:
+  R600MachineFunctionInfo(const MachineFunction &MF);
+  std::vector<unsigned> ReservedRegs; // each entry is set in R600RegisterInfo::getReservedRegs()
+
+};
+
+} // End llvm namespace
+
+#endif //R600MACHINEFUNCTIONINFO_H
diff --git a/src/gallium/drivers/radeon/R600OpenCLUtils.h b/src/gallium/drivers/radeon/R600OpenCLUtils.h
deleted file mode 100644
index 91e41d63d0d..00000000000
--- a/src/gallium/drivers/radeon/R600OpenCLUtils.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- OpenCLUtils.h - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-#ifndef OPENCLUTILS_H
-#define OPENCLUTILS_H
-
-#include "llvm/Function.h"
-
-#include <llvm/Module.h>
-
-static bool isOpenCLKernel(const llvm::Function* fun)
-{
-  llvm::Module *mod = const_cast<llvm::Function*>(fun)->getParent();
-  llvm::NamedMDNode * md = mod->getOrInsertNamedMetadata("opencl.kernels");
-
-  if (!md or !md->getNumOperands())
-  {
-    return false;
-  }
-
-  for (int i = 0; i < int(md->getNumOperands()); i++)
-  {
-    if (!md->getOperand(i) or !md->getOperand(i)->getOperand(0))
-    {
-      continue;
-    }
-    
-    assert(md->getOperand(i)->getNumOperands() == 1);
-
-    if (md->getOperand(i)->getOperand(0)->getName() == fun->getName())
-    {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-
-#endif
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.cpp b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
index 96507b104cf..de559bd2dfa 100644
--- a/src/gallium/drivers/radeon/R600RegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/R600RegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- R600RegisterInfo.cpp - TODO: Add brief description -------===//
+//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the R600 implementation of the TargetRegisterInfo class.
 //
 //===----------------------------------------------------------------------===//
 
 #include "R600RegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "R600MachineFunctionInfo.h"
 
 using namespace llvm;
 
@@ -26,6 +27,8 @@ R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
 BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const
 {
   BitVector Reserved(getNumRegs());
+  const R600MachineFunctionInfo * MFI = MF.getInfo<R600MachineFunctionInfo>();
+
   Reserved.set(AMDIL::ZERO);
   Reserved.set(AMDIL::HALF);
   Reserved.set(AMDIL::ONE);
@@ -40,19 +43,11 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const
     Reserved.set(*I);
   }
 
-  for (MachineFunction::const_iterator BB = MF.begin(),
-                                 BB_E = MF.end(); BB != BB_E; ++BB) {
-    const MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
-                                                                  I != E; ++I) {
-      const MachineInstr &MI = *I;
-      if (MI.getOpcode() == AMDIL::RESERVE_REG) {
-        if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) {
-          Reserved.set(MI.getOperand(0).getReg());
-        }
-      }
-    }
+  for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(),
+                                    E = MFI->ReservedRegs.end(); I != E; ++I) {
+    Reserved.set(*I);
   }
+
   return Reserved;
 }
 
diff --git a/src/gallium/drivers/radeon/R600RegisterInfo.h b/src/gallium/drivers/radeon/R600RegisterInfo.h
index 95a44f971a0..89a11f9333b 100644
--- a/src/gallium/drivers/radeon/R600RegisterInfo.h
+++ b/src/gallium/drivers/radeon/R600RegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- R600RegisterInfo.h - TODO: Add brief description -------===//
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for R600RegisterInfo
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/R600Schedule.td b/src/gallium/drivers/radeon/R600Schedule.td
index c6b1ca61bb5..d1957903d87 100644
--- a/src/gallium/drivers/radeon/R600Schedule.td
+++ b/src/gallium/drivers/radeon/R600Schedule.td
@@ -1,4 +1,4 @@
-//===-- R600Schedule.td - TODO: Add brief description -------===//
+//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// R600 has a VLIW architecture.  On pre-cayman cards there are 5 instruction
+// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS.  For cayman cards, the TRANS
+// slot has been removed. 
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
index b0bdf701a74..1ef097f7b1e 100644
--- a/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
+++ b/src/gallium/drivers/radeon/SIAssignInterpRegs.cpp
@@ -1,4 +1,4 @@
-//===-- SIAssignInterpRegs.cpp - TODO: Add brief description -------===//
+//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This pass maps the pseudo interpolation registers to the correct physical
+// registers.  Prior to executing a fragment shader, the GPU loads interpolation
+// parameters into physical registers.  The specific physical register that each
+// interpolation parameter ends up in depends on the type of the interpolation
+// parameter as well as how many interpolation parameters are used by the
+// shader.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SICodeEmitter.cpp b/src/gallium/drivers/radeon/SICodeEmitter.cpp
index ad494fae7c6..6970d9f0875 100644
--- a/src/gallium/drivers/radeon/SICodeEmitter.cpp
+++ b/src/gallium/drivers/radeon/SICodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- SICodeEmitter.cpp - TODO: Add brief description -------===//
+//===-- SICodeEmitter.cpp - SI Code Emitter -------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// The SI code emitter produces machine code that can be executed directly on
+// the GPU device.
 //
 //===----------------------------------------------------------------------===//
 
@@ -144,8 +145,6 @@ bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
 {
   MF.dump();
   TM = &MF.getTarget();
-  const AMDGPUInstrInfo * TII =
-                        static_cast<const AMDGPUInstrInfo*>(TM->getInstrInfo());
 
   emitState(MF);
 
@@ -155,8 +154,7 @@ bool SICodeEmitter::runOnMachineFunction(MachineFunction &MF)
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
                                                       I != E; ++I) {
       MachineInstr &MI = *I;
-      if (!TII->isRegPreload(MI) && MI.getOpcode() != AMDIL::KILL
-          && MI.getOpcode() != AMDIL::RETURN) {
+      if (MI.getOpcode() != AMDIL::KILL && MI.getOpcode() != AMDIL::RETURN) {
         emitInstr(MI);
       }
     }
diff --git a/src/gallium/drivers/radeon/SIGenRegisterInfo.pl b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
index 644daa1bc22..bb5ebbd67e6 100644
--- a/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
+++ b/src/gallium/drivers/radeon/SIGenRegisterInfo.pl
@@ -1,16 +1,17 @@
-#===-- SIGenRegisterInfo.pl - TODO: Add brief description -------===#
+#===-- SIGenRegisterInfo.pl - Script for generating register info files ----===#
 #
 #                     The LLVM Compiler Infrastructure
 #
 # This file is distributed under the University of Illinois Open Source
 # License. See LICENSE.TXT for details.
 #
-#===----------------------------------------------------------------------===#
+#===------------------------------------------------------------------------===#
 #
-# TODO: Add full description
+# This perl script prints to stdout .td code to be used as SIRegisterInfo.td.
+# It also generates a file called SIHwRegInfo.include, which contains helper
+# functions for determining the hw encoding of registers.
 #
-#===----------------------------------------------------------------------===#
-
+#===------------------------------------------------------------------------===#
 
 use strict;
 use warnings;
diff --git a/src/gallium/drivers/radeon/SIISelLowering.cpp b/src/gallium/drivers/radeon/SIISelLowering.cpp
index 1a4b47ecbf5..441a4a07290 100644
--- a/src/gallium/drivers/radeon/SIISelLowering.cpp
+++ b/src/gallium/drivers/radeon/SIISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- SIISelLowering.cpp - TODO: Add brief description -------===//
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Most of the DAG lowering is handled in AMDILISelLowering.cpp.  This file is
+// mostly EmitInstrWithCustomInserter().
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIISelLowering.h b/src/gallium/drivers/radeon/SIISelLowering.h
index e7a79f8e215..229e682ef51 100644
--- a/src/gallium/drivers/radeon/SIISelLowering.h
+++ b/src/gallium/drivers/radeon/SIISelLowering.h
@@ -1,4 +1,4 @@
-//===-- SIISelLowering.h - TODO: Add brief description -------===//
+//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI DAG Lowering interface definition
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIInstrFormats.td b/src/gallium/drivers/radeon/SIInstrFormats.td
index caf9b0ef120..de0d4fa39d2 100644
--- a/src/gallium/drivers/radeon/SIInstrFormats.td
+++ b/src/gallium/drivers/radeon/SIInstrFormats.td
@@ -1,4 +1,4 @@
-//===-- SIInstrFormats.td - TODO: Add brief description -------===//
+//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI Instruction format definitions.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding.  The 32-bit
+// encoding is the standard encoding, but instructions that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.cpp b/src/gallium/drivers/radeon/SIInstrInfo.cpp
index 6f92e96c6e7..0cb97643a7f 100644
--- a/src/gallium/drivers/radeon/SIInstrInfo.cpp
+++ b/src/gallium/drivers/radeon/SIInstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.cpp - TODO: Add brief description -------===//
+//===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI Implementation of TargetInstrInfo.
 //
 //===----------------------------------------------------------------------===//
 
@@ -107,6 +107,8 @@ unsigned SIInstrInfo::getISAOpcode(unsigned AMDILopcode) const
 {
   switch (AMDILopcode) {
   case AMDIL::MAD_f32: return AMDIL::V_MAD_LEGACY_F32;
+  //XXX We need a better way of detecting end of program
+  case AMDIL::RETURN: return AMDIL::S_ENDPGM;
   default: return AMDGPUInstrInfo::getISAOpcode(AMDILopcode);
   }
 }
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.h b/src/gallium/drivers/radeon/SIInstrInfo.h
index bd76c3f94aa..68940ea3ca4 100644
--- a/src/gallium/drivers/radeon/SIInstrInfo.h
+++ b/src/gallium/drivers/radeon/SIInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.h - TODO: Add brief description -------===//
+//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for SIInstrInfo.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIInstrInfo.td b/src/gallium/drivers/radeon/SIInstrInfo.td
index 65b28ec84ad..435948ff1de 100644
--- a/src/gallium/drivers/radeon/SIInstrInfo.td
+++ b/src/gallium/drivers/radeon/SIInstrInfo.td
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.td - TODO: Add brief description -------===//
+//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 
 
@@ -464,11 +460,4 @@ def IMM12bit : ImmLeaf <
 
 include "SIInstrFormats.td"
 
-def LOAD_CONST : AMDGPUShaderInst <
-  (outs GPRF32:$dst),
-  (ins i32imm:$src),
-  "LOAD_CONST $dst, $src",
-  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
->;
-
 include "SIInstructions.td"
diff --git a/src/gallium/drivers/radeon/SIInstructions.td b/src/gallium/drivers/radeon/SIInstructions.td
index 27a8b31757a..57bbc7a5d5a 100644
--- a/src/gallium/drivers/radeon/SIInstructions.td
+++ b/src/gallium/drivers/radeon/SIInstructions.td
@@ -1,4 +1,4 @@
-//===-- SIInstructions.td - TODO: Add brief description -------===//
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 
 def isSI : Predicate<"Subtarget.device()"
@@ -800,6 +796,13 @@ def CONFIG_WRITE : InstSI <
   field bits<32> Inst = 0;
 }
 
+def LOAD_CONST : AMDGPUShaderInst <
+  (outs GPRF32:$dst),
+  (ins i32imm:$src),
+  "LOAD_CONST $dst, $src",
+  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+>;
+
 let usesCustomInserter = 1 in {
 
 def SI_V_CNDLT : InstSI <
@@ -833,7 +836,6 @@ def USE_SGPR_32 : InstSI <
 
 > {
   field bits<32> Inst = 0;
-  let PreloadReg = 1;
 }
 
 def USE_SGPR_64 : InstSI <
@@ -844,7 +846,6 @@ def USE_SGPR_64 : InstSI <
 
 > {
   field bits<32> Inst = 0;
-  let PreloadReg = 1;
 }
 
 def VS_LOAD_BUFFER_INDEX : InstSI <
@@ -854,7 +855,6 @@ def VS_LOAD_BUFFER_INDEX : InstSI <
   [(set VReg_32:$dst, (int_SI_vs_load_buffer_index))]> {
 
   field bits<32> Inst = 0;
-  let PreloadReg = 1;
 }
 
 } // end usesCustomInserter 
diff --git a/src/gallium/drivers/radeon/SIIntrinsics.td b/src/gallium/drivers/radeon/SIIntrinsics.td
index e3014e13916..4d23072d4f1 100644
--- a/src/gallium/drivers/radeon/SIIntrinsics.td
+++ b/src/gallium/drivers/radeon/SIIntrinsics.td
@@ -1,4 +1,4 @@
-//===-- SIIntrinsics.td - TODO: Add brief description -------===//
+//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SI Intrinsic Definitions
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp b/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp
deleted file mode 100644
index 5d49d88dc7c..00000000000
--- a/src/gallium/drivers/radeon/SILowerShaderInstructions.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-//===-- SILowerShaderInstructions.cpp - TODO: Add brief description -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
-
-
-#include "AMDGPU.h"
-#include "AMDGPULowerShaderInstructions.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-
-using namespace llvm;
-
-namespace {
-  class SILowerShaderInstructionsPass : public MachineFunctionPass,
-      public AMDGPULowerShaderInstructionsPass {
-
-  private:
-    static char ID;
-    TargetMachine &TM;
-
-  public:
-    SILowerShaderInstructionsPass(TargetMachine &tm) :
-      MachineFunctionPass(ID), TM(tm) { }
-
-    bool runOnMachineFunction(MachineFunction &MF);
-
-    const char *getPassName() const { return "SI Lower Shader Instructions"; }
-
-    void lowerRETURN(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);
-    void lowerSET_M0(MachineInstr &MI, MachineBasicBlock &MBB,
-                     MachineBasicBlock::iterator I);
-  };
-} /* End anonymous namespace */
-
-char SILowerShaderInstructionsPass::ID = 0;
-
-FunctionPass *llvm::createSILowerShaderInstructionsPass(TargetMachine &tm) {
-    return new SILowerShaderInstructionsPass(tm);
-}
-
-bool SILowerShaderInstructionsPass::runOnMachineFunction(MachineFunction &MF)
-{
-  MRI = &MF.getRegInfo();
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                  BB != BB_E; ++BB) {
-    MachineBasicBlock &MBB = *BB;
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-         I != MBB.end(); I = Next, Next = llvm::next(I) ) {
-      MachineInstr &MI = *I;
-      switch (MI.getOpcode()) {
-      case AMDIL::RETURN:
-        lowerRETURN(MBB, I);
-        break;
-      case AMDIL::SET_M0:
-        lowerSET_M0(MI, MBB, I);
-        break;
-      default: continue;
-      }
-      MI.removeFromParent();
-    }
-  }
-
-  return false;
-}
-
-void SILowerShaderInstructionsPass::lowerRETURN(MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator I)
-{
-  const struct TargetInstrInfo * TII = TM.getInstrInfo();
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_ENDPGM));
-}
-
-void SILowerShaderInstructionsPass::lowerSET_M0(MachineInstr &MI,
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator I)
-{
-  const struct TargetInstrInfo * TII = TM.getInstrInfo();
-  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDIL::S_MOV_IMM_I32))
-          .addReg(AMDIL::M0)
-          .addOperand(MI.getOperand(1));
-}
diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
index eace40c226c..40ba76f1f86 100644
--- a/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
+++ b/src/gallium/drivers/radeon/SIMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIMachineFunctionInfo.cpp - TODO: Add brief description -------===//
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,10 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// TODO: Add full description
-//
-//===----------------------------------------------------------------------===//
 
 
 #include "SIMachineFunctionInfo.h"
diff --git a/src/gallium/drivers/radeon/SIMachineFunctionInfo.h b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
index 5647de9d81f..46a021f3613 100644
--- a/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
+++ b/src/gallium/drivers/radeon/SIMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//===-- SIMachineFunctionInfo.h - TODO: Add brief description -------===//
+//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// SIMachineFunctionInfo is used to keep track of the spi_sp_input_addr config
+// register, which tells the hardware which interpolation parameters to
+// load.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIPropagateImmReads.cpp b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
index 4f925d5de1c..6a165488831 100644
--- a/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
+++ b/src/gallium/drivers/radeon/SIPropagateImmReads.cpp
@@ -1,4 +1,4 @@
-//===-- SIPropagateImmReads.cpp - TODO: Add brief description -------===//
+//===-- SIPropagateImmReads.cpp - Lower Immediate Reads Pass --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// We can't do this in the ConvertToISA pass, because later passes might
+// create LOADCONST_* instructions that we would miss.  This is why we need 
+// a separate pass for this.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.cpp b/src/gallium/drivers/radeon/SIRegisterInfo.cpp
index da2ec36a773..2d530a4f022 100644
--- a/src/gallium/drivers/radeon/SIRegisterInfo.cpp
+++ b/src/gallium/drivers/radeon/SIRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SIRegisterInfo.cpp - TODO: Add brief description -------===//
+//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// This file contains the SI implementation of the TargetRegisterInfo class.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SIRegisterInfo.h b/src/gallium/drivers/radeon/SIRegisterInfo.h
index c797e3c8ace..77f3261efc5 100644
--- a/src/gallium/drivers/radeon/SIRegisterInfo.h
+++ b/src/gallium/drivers/radeon/SIRegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- SIRegisterInfo.h - TODO: Add brief description -------===//
+//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// Interface definition for SIRegisterInfo
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/SISchedule.td b/src/gallium/drivers/radeon/SISchedule.td
index 9e99268e9ca..28b65b82585 100644
--- a/src/gallium/drivers/radeon/SISchedule.td
+++ b/src/gallium/drivers/radeon/SISchedule.td
@@ -1,4 +1,4 @@
-//===-- SISchedule.td - TODO: Add brief description -------===//
+//===-- SISchedule.td - SI Scheduling definitions ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO: Add full description
+// TODO: This is just a placeholder for now.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 9be7f90c3e6..4a706397fdd 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -36,6 +36,8 @@
 #define RADEON_LLVM_MAX_BRANCH_DEPTH 16
 #define RADEON_LLVM_MAX_LOOP_DEPTH 16
 
+#define RADEON_LLVM_MAX_SYSTEM_VALUES 4
+
 struct radeon_llvm_branch {
 	LLVMBasicBlockRef endif_block;
 	LLVMBasicBlockRef if_block;
@@ -78,6 +80,9 @@ struct radeon_llvm_context {
 			unsigned input_index,
 			const struct tgsi_full_declaration *decl);
 
+	void (*load_system_value)(struct radeon_llvm_context *,
+			unsigned index,
+			const struct tgsi_full_declaration *decl);
 
 	/** User data to use with the callbacks */
 	void * userdata;
@@ -90,6 +95,8 @@ struct radeon_llvm_context {
 	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
 	unsigned output_reg_count;
 
+	LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES];
+
 	unsigned reserved_reg_count;
 	/*=== Private Members ===*/
 
@@ -105,6 +112,37 @@ struct radeon_llvm_context {
 	struct gallivm_state gallivm;
 };
 
+static inline LLVMValueRef bitcast( /* cast value to the LLVM scalar type chosen for a TGSI opcode type */
+		struct lp_build_tgsi_context * bld_base,
+		enum tgsi_opcode_type type,
+		LLVMValueRef value
+)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMContextRef ctx = bld_base->base.gallivm->context;
+	LLVMTypeRef dst_type;
+	/* Pick the destination type: i32 for UNSIGNED/SIGNED, float for UNTYPED/FLOAT. */
+	switch (type) {
+	case TGSI_TYPE_UNSIGNED:
+	case TGSI_TYPE_SIGNED:
+		dst_type = LLVMInt32TypeInContext(ctx);
+		break;
+	case TGSI_TYPE_UNTYPED:
+	case TGSI_TYPE_FLOAT:
+		dst_type = LLVMFloatTypeInContext(ctx);
+		break;
+	default:
+		dst_type = 0; /* no destination type for any other TGSI type */
+		break;
+	}
+	/* With no destination type the value is returned unchanged; otherwise a bitcast is built. */
+	if (dst_type)
+		return LLVMBuildBitCast(builder, value, dst_type, "");
+	else
+		return value;
+}
+
+
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx);
 
 void radeon_llvm_dispose(struct radeon_llvm_context * ctx);
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.cpp b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
index b409cb2175e..ebc32106b52 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.cpp
@@ -93,19 +93,20 @@ radeon_llvm_compile(LLVMModuleRef M, unsigned char ** bytes,
    AMDGPUTriple.setArch(Arch);
 
    Module * mod = unwrap(M);
-   std::string FS = gpu_family;
+   std::string FS;
    TargetOptions TO;
 
+   if (dump) {
+      mod->dump();
+      FS += "+DumpCode";
+   }
+
    std::auto_ptr<TargetMachine> tm(AMDGPUTarget->createTargetMachine(
-                     AMDGPUTriple.getTriple(), gpu_family, "" /* Features */,
+                     AMDGPUTriple.getTriple(), gpu_family, FS,
                      TO, Reloc::Default, CodeModel::Default,
                      CodeGenOpt::Default
                      ));
    TargetMachine &AMDGPUTargetMachine = *tm.get();
-   /* XXX: Use TargetMachine.Options in 3.0 */
-   if (dump) {
-      mod->dump();
-   }
    PassManager PM;
    PM.add(new TargetData(*AMDGPUTargetMachine.getTargetData()));
    PM.add(createPromoteMemoryToRegisterPass());
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 62de9da28de..6e6fc3d12cd 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -29,6 +29,7 @@
 #include "gallivm/lp_bld_gather.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_intr.h"
 #include "gallivm/lp_bld_swizzle.h"
 #include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
@@ -112,8 +113,25 @@ emit_fetch_immediate(
 	enum tgsi_opcode_type type,
 	unsigned swizzle)
 {
+	LLVMTypeRef ctype;
+	LLVMContextRef ctx = bld_base->base.gallivm->context;
+
+	switch (type) {
+	case TGSI_TYPE_UNSIGNED:
+	case TGSI_TYPE_SIGNED:
+		ctype = LLVMInt32TypeInContext(ctx);
+		break;
+	case TGSI_TYPE_UNTYPED:
+	case TGSI_TYPE_FLOAT:
+		ctype = LLVMFloatTypeInContext(ctx);
+		break;
+	default:
+		ctype = 0;
+		break;
+	}
+
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	return bld->immediates[reg->Register.Index][swizzle];
+	return LLVMConstBitCast(bld->immediates[reg->Register.Index][swizzle], ctype);
 }
 
 static LLVMValueRef
@@ -134,7 +152,7 @@ emit_fetch_input(
 		return lp_build_gather_values(bld_base->base.gallivm, values,
 						TGSI_NUM_CHANNELS);
 	} else {
-		return ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
+		return bitcast(bld_base, type, ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)]);
 	}
 }
 
@@ -155,7 +173,7 @@ emit_fetch_temporary(
 	} else {
 		LLVMValueRef temp_ptr;
 		temp_ptr = lp_get_temp_ptr_soa(bld, reg->Register.Index, swizzle);
-		return LLVMBuildLoad(builder, temp_ptr, "");
+		return bitcast(bld_base,type,LLVMBuildLoad(builder, temp_ptr, ""));
 	}
 }
 
@@ -213,6 +231,15 @@ static void emit_declaration(
 	}
 	break;
 
+	case TGSI_FILE_SYSTEM_VALUE:
+	{
+		unsigned idx;
+		for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
+			ctx->load_system_value(ctx, idx, decl);
+		}
+	}
+	break;
+
 	case TGSI_FILE_OUTPUT:
 	{
 		unsigned idx;
@@ -304,6 +331,9 @@ emit_store(
 		default:
 			return;
 		}
+
+		value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+
 		LLVMBuildStore(builder, value, temp_ptr);
 	}
 }
@@ -444,8 +474,10 @@ static void if_emit(
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	LLVMValueRef cond;
 	LLVMBasicBlockRef if_block, else_block, endif_block;
-	cond = LLVMBuildFCmp(gallivm->builder, LLVMRealOEQ, emit_data->args[0],
-							bld_base->base.one, "");
+
+	cond = LLVMBuildICmp(gallivm->builder, LLVMIntNE,
+	        bitcast(bld_base, TGSI_TYPE_UNSIGNED, emit_data->args[0]),
+			bld_base->int_bld.zero, "");
 
 	endif_block = LLVMAppendBasicBlockInContext(gallivm->context,
 						ctx->main_fn, "ENDIF");
@@ -463,6 +495,101 @@ static void if_emit(
 	ctx->branch[ctx->branch_depth - 1].has_else = 0;
 }
 
+/* KIL: call the action's intrinsic once per fetched argument, storing each result in output[]. */
+static void kil_emit(const struct lp_build_tgsi_action * action,
+	struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	unsigned arg;
+
+	for (arg = 0; arg < emit_data->arg_count; ++arg) {
+		emit_data->output[arg] = lp_build_intrinsic_unary(builder,
+			action->intr_name, emit_data->dst_type, emit_data->args[arg]);
+	}
+}
+
+
+/* Cube prep: run llvm.AMDGPU.cube on args[0], mad x/y with rcp(|z|) and 1.5, store yxwy swizzle. */
+static void emit_prepare_cube_coords(struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	boolean shadowcube = (emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE);
+	struct gallivm_state * gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMTypeRef type = bld_base->base.elem_type;
+	LLVMValueRef coords[4];
+	LLVMValueRef mad_args[3];
+	unsigned i;
+
+	LLVMValueRef v = lp_build_intrinsic(builder, "llvm.AMDGPU.cube",
+			LLVMVectorType(type, 4),
+			&emit_data->args[0],1);
+
+	/* Pull out all four lanes: the swizzle below reads coords[3] for every target. */
+	for (i = 0; i < 4; ++i) {
+		LLVMValueRef idx = lp_build_const_int32(gallivm, i);
+		coords[i] = LLVMBuildExtractElement(builder, v, idx, "");
+	}
+
+	coords[2] = lp_build_intrinsic(builder, "llvm.AMDIL.fabs.",
+			type, &coords[2], 1);
+	coords[2] = lp_build_intrinsic(builder, "llvm.AMDGPU.rcp",
+			type, &coords[2], 1);
+
+	mad_args[1] = coords[2];
+	mad_args[2] = LLVMConstReal(type, 1.5);
+	mad_args[0] = coords[0];
+	coords[0] = lp_build_intrinsic(builder, "llvm.AMDIL.mad.",
+			type, mad_args, 3);
+
+	mad_args[0] = coords[1];
+	coords[1] = lp_build_intrinsic(builder, "llvm.AMDIL.mad.",
+			type, mad_args, 3);
+
+	/* apply yxwy swizzle to coords */
+	coords[2] = coords[3];
+	coords[3] = coords[1];
+	coords[1] = coords[0];
+	coords[0] = coords[3];
+	if (shadowcube) { /* save src.w for shadow cube: it replaces the w lane */
+		coords[3] = LLVMBuildExtractElement(builder, emit_data->args[0],
+				lp_build_const_int32(gallivm, 3), "");
+	}
+
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						coords, 4);
+}
+
+/* TXP: fetch src0, divide x/y/z by w, set w to one, gather into args[0], then cube prep. */
+static void txp_fetch_args(struct lp_build_tgsi_context * bld_base,
+	struct lp_build_emit_data * emit_data)
+{
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	const unsigned target = inst->Texture.Texture;
+	LLVMValueRef projected[4];
+	LLVMValueRef divisor;
+	unsigned c;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+	divisor = lp_build_emit_fetch(bld_base, inst, 0, TGSI_CHAN_W);
+
+	for (c = 0; c < 3; c++) {
+		LLVMValueRef numer = lp_build_emit_fetch(bld_base, inst, 0, c);
+		projected[c] = lp_build_emit_llvm_binary(bld_base,
+					TGSI_OPCODE_DIV, numer, divisor);
+	}
+	projected[3] = bld_base->base.one;
+
+	emit_data->arg_count = 1;
+	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
+						projected, 4);
+
+	if (inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
+	    (target == TGSI_TEXTURE_CUBE || target == TGSI_TEXTURE_SHADOWCUBE)) {
+		emit_prepare_cube_coords(bld_base, emit_data);
+	}
+}
+
 static void tex_fetch_args(
 	struct lp_build_tgsi_context * bld_base,
 	struct lp_build_emit_data * emit_data)
@@ -475,16 +602,261 @@ static void tex_fetch_args(
 
 	*/
 
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+
 	LLVMValueRef coords[4];
 	unsigned chan;
 	for (chan = 0; chan < 4; chan++) {
-		coords[chan] = lp_build_emit_fetch(bld_base, emit_data->inst, 0, chan);
+		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
 	}
 
 	emit_data->arg_count = 1;
 	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
 						coords, 4);
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
+	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE) &&
+	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
+		emit_prepare_cube_coords(bld_base, emit_data);
+	}
+}
+
+/* Integer compares: icmp on args[0]/args[1], result widened to i32 via SExtOrBitCast. */
+static void emit_icmp(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMIntPredicate pred;
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	LLVMContextRef context = bld_base->base.gallivm->context;
+
+	switch (emit_data->inst->Instruction.Opcode) {
+	case TGSI_OPCODE_USEQ: pred = LLVMIntEQ; break;
+	case TGSI_OPCODE_USNE: pred = LLVMIntNE; break;
+	case TGSI_OPCODE_USGE: pred = LLVMIntUGE; break;
+	case TGSI_OPCODE_USLT: pred = LLVMIntULT; break;
+	case TGSI_OPCODE_ISGE: pred = LLVMIntSGE; break;
+	case TGSI_OPCODE_ISLT: pred = LLVMIntSLT; break;
+	default:
+		/* With asserts compiled out, bail rather than read an uninitialized pred. */
+		assert(!"unknown instruction");
+		return;
+	}
+
+	LLVMValueRef v = LLVMBuildICmp(builder, pred,
+			emit_data->args[0], emit_data->args[1],"");
+	v = LLVMBuildSExtOrBitCast(builder, v,
+			LLVMInt32TypeInContext(context), "");
+	emit_data->output[emit_data->chan] = v;
+}
+
+/* NOT: bitwise complement of args[0] after bitcast() with TGSI_TYPE_UNSIGNED. */
+static void emit_not(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef operand;
+	operand = bitcast(bld_base, TGSI_TYPE_UNSIGNED, emit_data->args[0]);
+	emit_data->output[emit_data->chan] =
+		LLVMBuildNot(bld_base->base.gallivm->builder, operand, "");
+}
+
+/* AND: bitwise AND of args[0] and args[1], written to the current channel. */
+static void emit_and(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef lhs = emit_data->args[0];
+	LLVMValueRef rhs = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildAnd(bld_base->base.gallivm->builder, lhs, rhs, "");
+}
+
+/* OR: bitwise OR of args[0] and args[1], written to the current channel. */
+static void emit_or(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef lhs = emit_data->args[0];
+	LLVMValueRef rhs = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildOr(bld_base->base.gallivm->builder, lhs, rhs, "");
+}
+
+/* UADD: integer addition of args[0] and args[1], written to the current channel. */
+static void emit_uadd(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef lhs = emit_data->args[0];
+	LLVMValueRef rhs = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildAdd(bld_base->base.gallivm->builder, lhs, rhs, "");
+}
+
+/* UDIV: unsigned integer division args[0] / args[1], written to the current channel. */
+static void emit_udiv(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef dividend = emit_data->args[0];
+	LLVMValueRef divisor = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildUDiv(bld_base->base.gallivm->builder, dividend, divisor, "");
+}
+
+/* IDIV: signed integer division args[0] / args[1], written to the current channel. */
+static void emit_idiv(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef dividend = emit_data->args[0];
+	LLVMValueRef divisor = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildSDiv(bld_base->base.gallivm->builder, dividend, divisor, "");
+}
+
+/* MOD: signed integer remainder of args[0] / args[1], written to the current channel. */
+static void emit_mod(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef dividend = emit_data->args[0];
+	LLVMValueRef divisor = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildSRem(bld_base->base.gallivm->builder, dividend, divisor, "");
+}
+
+/* UMOD: unsigned integer remainder of args[0] / args[1], written to the current channel. */
+static void emit_umod(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef dividend = emit_data->args[0];
+	LLVMValueRef divisor = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildURem(bld_base->base.gallivm->builder, dividend, divisor, "");
+}
+
+/* SHL: args[0] shifted left by args[1], written to the current channel. */
+static void emit_shl(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef value = emit_data->args[0];
+	LLVMValueRef amount = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildShl(bld_base->base.gallivm->builder, value, amount, "");
+}
+
+/* USHR: logical (zero-filling) right shift of args[0] by args[1]. */
+static void emit_ushr(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef value = emit_data->args[0];
+	LLVMValueRef amount = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildLShr(bld_base->base.gallivm->builder, value, amount, "");
+}
+/* ISHR: arithmetic (sign-preserving) right shift of args[0] by args[1]. */
+static void emit_ishr(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef value = emit_data->args[0];
+	LLVMValueRef amount = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildAShr(bld_base->base.gallivm->builder, value, amount, "");
+}
+
+/* XOR: bitwise exclusive OR of args[0] and args[1], written to the current channel. */
+static void emit_xor(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef lhs = emit_data->args[0];
+	LLVMValueRef rhs = emit_data->args[1];
+	emit_data->output[emit_data->chan] = LLVMBuildXor(bld_base->base.gallivm->builder, lhs, rhs, "");
+}
+
+/* SSG/ISSG: 1 when args[0] > 0, else args[0] itself unless that is below 0, which gives -1. */
+static void emit_ssg(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+	/* The float path compares against base.zero: both fcmp operands must be floats. */
+	LLVMValueRef cmp, val;
+
+	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_ISSG) {
+		cmp = LLVMBuildICmp(builder, LLVMIntSGT, emit_data->args[0], bld_base->int_bld.zero, "");
+		val = LLVMBuildSelect(builder, cmp, bld_base->int_bld.one, emit_data->args[0], "");
+		cmp = LLVMBuildICmp(builder, LLVMIntSGE, val, bld_base->int_bld.zero, "");
+		val = LLVMBuildSelect(builder, cmp, val, LLVMConstInt(bld_base->int_bld.elem_type, -1, true), "");
+	} else { // float SSG
+		cmp = LLVMBuildFCmp(builder, LLVMRealUGT, emit_data->args[0], bld_base->base.zero, "");
+		val = LLVMBuildSelect(builder, cmp, bld_base->base.one, emit_data->args[0], "");
+		cmp = LLVMBuildFCmp(builder, LLVMRealUGE, val, bld_base->base.zero, "");
+		val = LLVMBuildSelect(builder, cmp, val, LLVMConstReal(bld_base->base.elem_type, -1), "");
+	}
+
+	emit_data->output[emit_data->chan] = val;
+}
+
+/* INEG: integer negation of args[0], written to the current channel. */
+static void emit_ineg(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMValueRef src = emit_data->args[0];
+	LLVMValueRef negated = LLVMBuildNeg(bld_base->base.gallivm->builder, src, "");
+	emit_data->output[emit_data->chan] = negated;
+}
+
+/* F2I: convert args[0] from floating point to a signed integer (fptosi). */
+static void emit_f2i(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMTypeRef dst_type = bld_base->int_bld.elem_type;
+	LLVMValueRef src = emit_data->args[0];
+	emit_data->output[emit_data->chan] = LLVMBuildFPToSI(bld_base->base.gallivm->builder, src, dst_type, "");
+}
+
+/* F2U: convert args[0] from floating point to an unsigned integer (fptoui). */
+static void emit_f2u(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMTypeRef dst_type = bld_base->uint_bld.elem_type;
+	LLVMValueRef src = emit_data->args[0];
+	emit_data->output[emit_data->chan] = LLVMBuildFPToUI(bld_base->base.gallivm->builder, src, dst_type, "");
+}
+
+/* I2F: convert args[0] from a signed integer to floating point (sitofp). */
+static void emit_i2f(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMTypeRef dst_type = bld_base->base.elem_type;
+	LLVMValueRef src = emit_data->args[0];
+	emit_data->output[emit_data->chan] = LLVMBuildSIToFP(bld_base->base.gallivm->builder, src, dst_type, "");
+}
+
+/* U2F: convert args[0] from an unsigned integer to floating point (uitofp). */
+static void emit_u2f(const struct lp_build_tgsi_action * action,
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	LLVMTypeRef dst_type = bld_base->base.elem_type;
+	LLVMValueRef src = emit_data->args[0];
+	emit_data->output[emit_data->chan] = LLVMBuildUIToFP(bld_base->base.gallivm->builder, src, dst_type, "");
+}
+
+/* Store one TGSI immediate as four uint-typed constants in the next free immediates slot. */
+static void emit_immediate(struct lp_build_tgsi_context * bld_base,
+		const struct tgsi_full_immediate *imm)
+{
+	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+	unsigned chan;
+
+	for (chan = 0; chan < 4; ++chan) {
+		ctx->soa.immediates[ctx->soa.num_immediates][chan] = LLVMConstInt(
+			bld_base->uint_bld.elem_type, imm->u[chan].Uint, false);
+	}
+	ctx->soa.num_immediates++;
 }
 
 void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
@@ -526,12 +898,13 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 
 	lp_build_context_init(&bld_base->base, &ctx->gallivm, type);
 	lp_build_context_init(&ctx->soa.bld_base.uint_bld, &ctx->gallivm, lp_uint_type(type));
+	lp_build_context_init(&ctx->soa.bld_base.int_bld, &ctx->gallivm, lp_int_type(type));
 
 	bld_base->soa = 1;
 	bld_base->emit_store = emit_store;
 	bld_base->emit_swizzle = emit_swizzle;
 	bld_base->emit_declaration = emit_declaration;
-	bld_base->emit_immediate = lp_emit_immediate_soa;
+	bld_base->emit_immediate = emit_immediate;
 
 	bld_base->emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
 	bld_base->emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
@@ -545,6 +918,60 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 
 	lp_set_default_actions(bld_base);
 
+	bld_base->op_actions[TGSI_OPCODE_IABS].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_IABS].intr_name = "llvm.AMDIL.abs.";
+	bld_base->op_actions[TGSI_OPCODE_NOT].emit = emit_not;
+	bld_base->op_actions[TGSI_OPCODE_AND].emit = emit_and;
+	bld_base->op_actions[TGSI_OPCODE_XOR].emit = emit_xor;
+	bld_base->op_actions[TGSI_OPCODE_OR].emit = emit_or;
+	bld_base->op_actions[TGSI_OPCODE_UADD].emit = emit_uadd;
+	bld_base->op_actions[TGSI_OPCODE_UDIV].emit = emit_udiv;
+	bld_base->op_actions[TGSI_OPCODE_IDIV].emit = emit_idiv;
+	bld_base->op_actions[TGSI_OPCODE_MOD].emit = emit_mod;
+	bld_base->op_actions[TGSI_OPCODE_UMOD].emit = emit_umod;
+	bld_base->op_actions[TGSI_OPCODE_INEG].emit = emit_ineg;
+	bld_base->op_actions[TGSI_OPCODE_SHL].emit = emit_shl;
+	bld_base->op_actions[TGSI_OPCODE_ISHR].emit = emit_ishr;
+	bld_base->op_actions[TGSI_OPCODE_USHR].emit = emit_ushr;
+	bld_base->op_actions[TGSI_OPCODE_SSG].emit = emit_ssg;
+	bld_base->op_actions[TGSI_OPCODE_ISSG].emit = emit_ssg;
+	bld_base->op_actions[TGSI_OPCODE_I2F].emit = emit_i2f;
+	bld_base->op_actions[TGSI_OPCODE_U2F].emit = emit_u2f;
+	bld_base->op_actions[TGSI_OPCODE_F2I].emit = emit_f2i;
+	bld_base->op_actions[TGSI_OPCODE_F2U].emit = emit_f2u;
+	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
+	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
+	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_USEQ].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_USGE].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_USLT].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_USNE].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_ISGE].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_ISLT].emit = emit_icmp;
+	bld_base->op_actions[TGSI_OPCODE_ROUND].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_ROUND].intr_name = "llvm.AMDIL.round.nearest.";
+	bld_base->op_actions[TGSI_OPCODE_MIN].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_MIN].intr_name = "llvm.AMDIL.min.";
+	bld_base->op_actions[TGSI_OPCODE_MAX].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_MAX].intr_name = "llvm.AMDIL.max.";
+	bld_base->op_actions[TGSI_OPCODE_IMIN].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_IMIN].intr_name = "llvm.AMDGPU.imin";
+	bld_base->op_actions[TGSI_OPCODE_IMAX].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_IMAX].intr_name = "llvm.AMDGPU.imax";
+	bld_base->op_actions[TGSI_OPCODE_UMIN].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_UMIN].intr_name = "llvm.AMDGPU.umin";
+	bld_base->op_actions[TGSI_OPCODE_UMAX].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_UMAX].intr_name = "llvm.AMDGPU.umax";
+	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
+	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
+	/* CEIL rounds toward +infinity; round.neginf would round toward -infinity (floor). */
+	bld_base->op_actions[TGSI_OPCODE_CEIL].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_CEIL].intr_name = "llvm.AMDIL.round.posinf.";
+
+
 	bld_base->op_actions[TGSI_OPCODE_ABS].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_ABS].intr_name = "llvm.AMDIL.fabs.";
 	bld_base->op_actions[TGSI_OPCODE_ARL].emit = lp_build_tgsi_intrinsic;
@@ -558,10 +985,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_CMP].intr_name = "llvm.AMDGPU.cndlt";
 	bld_base->op_actions[TGSI_OPCODE_COS].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_COS].intr_name = "llvm.AMDGPU.cos";
-	bld_base->op_actions[TGSI_OPCODE_DDX].emit = lp_build_tgsi_intrinsic;
-	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
-	bld_base->op_actions[TGSI_OPCODE_DDY].emit = lp_build_tgsi_intrinsic;
-	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
 	bld_base->op_actions[TGSI_OPCODE_DIV].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_DIV].intr_name = "llvm.AMDGPU.div";
 	bld_base->op_actions[TGSI_OPCODE_ELSE].emit = else_emit;
@@ -574,7 +997,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_FRC].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_FRC].intr_name = "llvm.AMDIL.fraction.";
 	bld_base->op_actions[TGSI_OPCODE_IF].emit = if_emit;
-	bld_base->op_actions[TGSI_OPCODE_KIL].emit = lp_build_tgsi_intrinsic;
+	bld_base->op_actions[TGSI_OPCODE_KIL].emit = kil_emit;
 	bld_base->op_actions[TGSI_OPCODE_KIL].intr_name = "llvm.AMDGPU.kill";
 	bld_base->op_actions[TGSI_OPCODE_KILP].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_KILP].intr_name = "llvm.AMDGPU.kilp";
@@ -597,7 +1020,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_SSG].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_SSG].intr_name = "llvm.AMDGPU.ssg";
 	bld_base->op_actions[TGSI_OPCODE_SGE].emit = lp_build_tgsi_intrinsic;
-	bld_base->op_actions[TGSI_OPCODE_SGE].intr_name = "llvm.AMDGPU.sge.";
+	bld_base->op_actions[TGSI_OPCODE_SGE].intr_name = "llvm.AMDGPU.sge";
 	bld_base->op_actions[TGSI_OPCODE_SEQ].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_SEQ].intr_name = "llvm.AMDGPU.seq";
 	bld_base->op_actions[TGSI_OPCODE_SLE].fetch_args = radeon_llvm_fetch_args_2_reverse_soa;
@@ -620,6 +1043,7 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
 	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
+	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].emit = lp_build_tgsi_intrinsic;
 	bld_base->op_actions[TGSI_OPCODE_TRUNC].intr_name = "llvm.AMDGPU.trunc";
diff --git a/src/gallium/drivers/radeonsi/evergreen_state.c b/src/gallium/drivers/radeonsi/evergreen_state.c
index 75d6cadc6cc..b094248fee1 100644
--- a/src/gallium/drivers/radeonsi/evergreen_state.c
+++ b/src/gallium/drivers/radeonsi/evergreen_state.c
@@ -1166,24 +1166,6 @@ static void si_delete_sampler_state(struct pipe_context *ctx,
 	free(state);
 }
 
-static unsigned si_map_swizzle(unsigned swizzle)
-{
-	switch (swizzle) {
-	case UTIL_FORMAT_SWIZZLE_Y:
-		return V_008F1C_SQ_SEL_Y;
-	case UTIL_FORMAT_SWIZZLE_Z:
-		return V_008F1C_SQ_SEL_Z;
-	case UTIL_FORMAT_SWIZZLE_W:
-		return V_008F1C_SQ_SEL_W;
-	case UTIL_FORMAT_SWIZZLE_0:
-		return V_008F1C_SQ_SEL_0;
-	case UTIL_FORMAT_SWIZZLE_1:
-		return V_008F1C_SQ_SEL_1;
-	default: /* UTIL_FORMAT_SWIZZLE_X */
-		return V_008F1C_SQ_SEL_X;
-	}
-}
-
 static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_context *ctx,
 							struct pipe_resource *texture,
 							const struct pipe_sampler_view *state)
@@ -1259,9 +1241,9 @@ static struct pipe_sampler_view *evergreen_create_sampler_view(struct pipe_conte
 
 	va = r600_resource_va(ctx->screen, texture);
 	view->state[0] = (va + tmp->offset[0]) >> 8;
-	view->state[1] = ((va + tmp->offset[0]) >> 40) & 0xff;
-	view->state[1] |= (S_008F14_DATA_FORMAT(format) |
-			   S_008F14_NUM_FORMAT(num_format));
+	view->state[1] = (S_008F14_BASE_ADDRESS_HI((va + tmp->offset[0]) >> 40) |
+			  S_008F14_DATA_FORMAT(format) |
+			  S_008F14_NUM_FORMAT(num_format));
 	view->state[2] = (S_008F18_WIDTH(texture->width0 - 1) |
 			  S_008F18_HEIGHT(height - 1));
 	view->state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
@@ -2087,9 +2069,9 @@ void si_pipe_shader_vs(struct pipe_context *ctx, struct si_pipe_shader *shader)
 	r600_pipe_state_add_reg(rstate,
 				R_02870C_SPI_SHADER_POS_FORMAT,
 				S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-				S_02870C_POS1_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-				S_02870C_POS2_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
-				S_02870C_POS3_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP),
+				S_02870C_POS1_EXPORT_FORMAT(V_02870C_SPI_SHADER_NONE) |
+				S_02870C_POS2_EXPORT_FORMAT(V_02870C_SPI_SHADER_NONE) |
+				S_02870C_POS3_EXPORT_FORMAT(V_02870C_SPI_SHADER_NONE),
 				NULL, 0);
 
 	va = r600_resource_va(ctx->screen, (void *)shader->bo);
diff --git a/src/gallium/drivers/radeonsi/r600_state_common.c b/src/gallium/drivers/radeonsi/r600_state_common.c
index 53a34ef519c..06eb96b9ee8 100644
--- a/src/gallium/drivers/radeonsi/r600_state_common.c
+++ b/src/gallium/drivers/radeonsi/r600_state_common.c
@@ -628,12 +628,15 @@ static void r600_vertex_buffer_update(struct r600_context *rctx)
 		ptr[0] = va & 0xFFFFFFFF;
 		ptr[1] = (S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(vertex_buffer->stride));
-		ptr[2] = (vertex_buffer->buffer->width0 - offset) / vertex_buffer->stride;
-		/* XXX: Hardcoding RGBA */
-		ptr[3] = (S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-			  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-			  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-			  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+		if (vertex_buffer->stride > 0)
+			ptr[2] = ((vertex_buffer->buffer->width0 - offset) /
+				  vertex_buffer->stride);
+		else
+			ptr[2] = vertex_buffer->buffer->width0 - offset;
+		ptr[3] = (S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
+			  S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
+			  S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
+			  S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
 			  S_008F0C_NUM_FORMAT(num_format) |
 			  S_008F0C_DATA_FORMAT(data_format));
 
diff --git a/src/gallium/drivers/radeonsi/radeonsi_pipe.h b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
index ab30892d51a..bba4cf23691 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_pipe.h
+++ b/src/gallium/drivers/radeonsi/radeonsi_pipe.h
@@ -37,6 +37,7 @@
 #include "r600.h"
 #include "radeonsi_public.h"
 #include "r600_resource.h"
+#include "sid.h"
 
 #define R600_MAX_CONST_BUFFERS 1
 #define R600_MAX_CONST_BUFFER_SIZE 4096
@@ -467,6 +468,24 @@ static INLINE uint32_t S_FIXED(float value, uint32_t frac_bits)
 }
 #define ALIGN_DIVUP(x, y) (((x) + (y) - 1) / (y))
 
+/* Map a UTIL_FORMAT_SWIZZLE_* selector to the corresponding V_008F0C_SQ_SEL_* value. */
+static INLINE unsigned si_map_swizzle(unsigned swizzle)
+{
+	unsigned sel;
+
+	switch (swizzle) {
+	case UTIL_FORMAT_SWIZZLE_Y: sel = V_008F0C_SQ_SEL_Y; break;
+	case UTIL_FORMAT_SWIZZLE_Z: sel = V_008F0C_SQ_SEL_Z; break;
+	case UTIL_FORMAT_SWIZZLE_W: sel = V_008F0C_SQ_SEL_W; break;
+	case UTIL_FORMAT_SWIZZLE_0: sel = V_008F0C_SQ_SEL_0; break;
+	case UTIL_FORMAT_SWIZZLE_1: sel = V_008F0C_SQ_SEL_1; break;
+	default: /* UTIL_FORMAT_SWIZZLE_X */
+		sel = V_008F0C_SQ_SEL_X;
+		break;
+	}
+	return sel;
+}
+
 static inline unsigned r600_tex_aniso_filter(unsigned filter)
 {
 	if (filter <= 1)   return 0;
diff --git a/src/gallium/drivers/radeonsi/radeonsi_shader.c b/src/gallium/drivers/radeonsi/radeonsi_shader.c
index 6425c352d28..0e1a97bba3e 100644
--- a/src/gallium/drivers/radeonsi/radeonsi_shader.c
+++ b/src/gallium/drivers/radeonsi/radeonsi_shader.c
@@ -199,7 +199,7 @@ static void declare_input_fs(
 	LLVMValueRef attr_number = lp_build_const_int32(gallivm, input_index);
 
 	/* XXX: Handle all possible interpolation modes */
-	switch (decl->Declaration.Interpolate) {
+	switch (decl->Interp.Interpolate) {
 	case TGSI_INTERPOLATE_COLOR:
 		if (si_shader_ctx->rctx->rasterizer->flatshade)
 			intr_name = "llvm.SI.fs.interp.constant";
@@ -331,14 +331,14 @@ static void si_llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
 			i = shader->ninput++;
 			shader->input[i].name = d->Semantic.Name;
 			shader->input[i].sid = d->Semantic.Index;
-			shader->input[i].interpolate = d->Declaration.Interpolate;
-			shader->input[i].centroid = d->Declaration.Centroid;
+			shader->input[i].interpolate = d->Interp.Interpolate;
+			shader->input[i].centroid = d->Interp.Centroid;
 			break;
 		case TGSI_FILE_OUTPUT:
 			i = shader->noutput++;
 			shader->output[i].name = d->Semantic.Name;
 			shader->output[i].sid = d->Semantic.Index;
-			shader->output[i].interpolate = d->Declaration.Interpolate;
+			shader->output[i].interpolate = d->Interp.Interpolate;
 			break;
 		}
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index d54e02e40cd..d4c01759dbe 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -799,7 +799,8 @@ get_texel_2d_array(const struct sp_sampler_variant *samp,
    const struct pipe_resource *texture = samp->view->texture;
    unsigned level = addr.bits.level;
 
-   assert(layer < texture->array_size);
+   assert(layer < (int) texture->array_size);
+   assert(layer >= 0);
 
    if (x < 0 || x >= (int) u_minify(texture->width0, level) ||
        y < 0 || y >= (int) u_minify(texture->height0, level)) {
@@ -1787,9 +1788,9 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
    float weight_buffer[TGSI_QUAD_SIZE];
    unsigned buffer_next;
    int j;
-   float den;// = 0.0F;
+   float den; /* = 0.0F; */
    float ddq;
-   float U;// = u0 - tex_u;
+   float U; /* = u0 - tex_u; */
    int v;
 
    /* Scale ellipse formula to directly index the Filter Lookup Table.
@@ -1805,8 +1806,8 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
     * also the same. Note that texel/image access can only be performed using
     * a quad, i.e. it is not possible to get the pixel value for a single
     * tex coord. In order to have a better performance, the access is buffered
-    * using the s_buffer/t_buffer and weight_buffer. Only when the buffer is full,
-    * then the pixel values are read from the image.
+    * using the s_buffer/t_buffer and weight_buffer. Only when the buffer is
+    * full, then the pixel values are read from the image.
     */
    ddq = 2 * A;
    
@@ -1834,7 +1835,9 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
 
          int u;
          for (u = u0; u <= u1; ++u) {
-            /* Note that the ellipse has been pre-scaled so F = WEIGHT_LUT_SIZE - 1 */
+            /* Note that the ellipse has been pre-scaled so F =
+             * WEIGHT_LUT_SIZE - 1
+             */
             if (q < WEIGHT_LUT_SIZE) {
                /* as a LUT is used, q must never be negative;
                 * should not happen, though
@@ -1873,10 +1876,11 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
          }
       }
 
-      /* if the tex coord buffer contains unread values, we will read them now.
-       * Note that in most cases we have to read more pixel values than required,
-       * however, as the img_filter_2d_nearest function(s) does not have a count
-       * parameter, we need to read the whole quad and ignore the unused values
+      /* if the tex coord buffer contains unread values, we will read
+       * them now.  Note that in most cases we have to read more pixel
+       * values than required, however, as the img_filter_2d_nearest
+       * function(s) does not have a count parameter, we need to read
+       * the whole quad and ignore the unused values
        */
       if (buffer_next > 0) {
          unsigned jj;
@@ -1895,11 +1899,9 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
       }
 
       if (den <= 0.0F) {
-         /* Reaching this place would mean
-          * that no pixels intersected the ellipse.
-          * This should never happen because
-          * the filter we use always
-          * intersects at least one pixel.
+         /* Reaching this place would mean that no pixels intersected
+          * the ellipse.  This should never happen because the filter
+          * we use always intersects at least one pixel.
           */
 
          /*rgba[0]=0;
@@ -1907,7 +1909,8 @@ img_filter_2d_ewa(struct tgsi_sampler *tgsi_sampler,
          rgba[2]=0;
          rgba[3]=0;*/
          /* not enough pixels in resampling, resort to direct interpolation */
-         samp->min_img_filter(tgsi_sampler, s, t, p, NULL, tgsi_sampler_lod_bias, rgba_temp);
+         samp->min_img_filter(tgsi_sampler, s, t, p, NULL,
+                              tgsi_sampler_lod_bias, rgba_temp);
          den = 1;
          num[0] = rgba_temp[0][j];
          num[1] = rgba_temp[1][j];
@@ -2020,7 +2023,6 @@ mip_filter_linear_aniso(struct tgsi_sampler *tgsi_sampler,
 }
 
 
-
 /**
  * Specialized version of mip_filter_linear with hard-wired calls to
  * 2d lambda calculation and 2d_linear_repeat_POT img filters.
@@ -2090,7 +2092,6 @@ mip_filter_linear_2d_linear_repeat_POT(
 }
 
 
-
 /**
  * Do shadow/depth comparisons.
  */
@@ -2287,9 +2288,11 @@ sample_cube(struct tgsi_sampler *tgsi_sampler,
    samp->compare(tgsi_sampler, ssss, tttt, NULL, c0, control, rgba);
 }
 
-static void do_swizzling(const struct sp_sampler_variant *samp,
-                         float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
-                         float out[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+
+static void
+do_swizzling(const struct sp_sampler_variant *samp,
+             float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+             float out[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    int j;
    const unsigned swizzle_r = samp->key.bits.swizzle_r;
@@ -2358,6 +2361,7 @@ static void do_swizzling(const struct sp_sampler_variant *samp,
    }
 }
 
+
 static void
 sample_swizzle(struct tgsi_sampler *tgsi_sampler,
                const float s[TGSI_QUAD_SIZE],
@@ -2464,6 +2468,19 @@ get_linear_wrap(unsigned mode)
 }
 
 
+/** Is swizzling needed for the given state key?  Yes, unless every
+ *  channel selector is its identity PIPE_SWIZZLE_* value. */
+static INLINE bool
+any_swizzle(union sp_sampler_key key)
+{
+   const bool identity = (key.bits.swizzle_r == PIPE_SWIZZLE_RED &&
+                          key.bits.swizzle_g == PIPE_SWIZZLE_GREEN &&
+                          key.bits.swizzle_b == PIPE_SWIZZLE_BLUE &&
+                          key.bits.swizzle_a == PIPE_SWIZZLE_ALPHA);
+   return !identity;
+}
+
+
 static compute_lambda_func
 get_lambda_func(const union sp_sampler_key key)
 {
@@ -2590,6 +2607,7 @@ sp_sampler_variant_destroy( struct sp_sampler_variant *samp )
    FREE(samp);
 }
 
+
 static void
 sample_get_dims(struct tgsi_sampler *tgsi_sampler, int level,
 		int dims[4])
@@ -2630,35 +2648,43 @@ sample_get_dims(struct tgsi_sampler *tgsi_sampler, int level,
     }
 }
 
-/* this function is only used for unfiltered texel gets
-   via the TGSI TXF opcode. */
+/**
+ * This function is only used for getting unfiltered texels via the
+ * TXF opcode.  The GL spec says that out-of-bounds texel fetches
+ * produce undefined results.  Instead of crashing, let's just clamp
+ * coords to the texture image size.
+ */
 static void
 sample_get_texels(struct tgsi_sampler *tgsi_sampler,
-	   const int v_i[TGSI_QUAD_SIZE],
-	   const int v_j[TGSI_QUAD_SIZE],
-	   const int v_k[TGSI_QUAD_SIZE],
-	   const int lod[TGSI_QUAD_SIZE],
-	   const int8_t offset[3],
-	   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+                  const int v_i[TGSI_QUAD_SIZE],
+                  const int v_j[TGSI_QUAD_SIZE],
+                  const int v_k[TGSI_QUAD_SIZE],
+                  const int lod[TGSI_QUAD_SIZE],
+                  const int8_t offset[3],
+                  float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct sp_sampler_variant *samp = sp_sampler_variant(tgsi_sampler);
    union tex_tile_address addr;
    const struct pipe_resource *texture = samp->view->texture;
    int j, c;
    const float *tx;
-   bool need_swizzle = (samp->key.bits.swizzle_r != PIPE_SWIZZLE_RED ||
-                        samp->key.bits.swizzle_g != PIPE_SWIZZLE_GREEN ||
-                        samp->key.bits.swizzle_b != PIPE_SWIZZLE_BLUE ||
-                        samp->key.bits.swizzle_a != PIPE_SWIZZLE_ALPHA);
+   const bool need_swizzle = any_swizzle(samp->key);
+   int width, height, depth, layers;
 
    addr.value = 0;
    /* TODO write a better test for LOD */
    addr.bits.level = lod[0];
 
+   width = u_minify(texture->width0, addr.bits.level);
+   height = u_minify(texture->height0, addr.bits.level);
+   depth = u_minify(texture->depth0, addr.bits.level);
+   layers = texture->array_size;
+
    switch(texture->target) {
    case PIPE_TEXTURE_1D:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_2d(samp, addr, v_i[j] + offset[0], 0);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+	 tx = get_texel_2d(samp, addr, x, 0);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2666,8 +2692,9 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       break;
    case PIPE_TEXTURE_1D_ARRAY:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_1d_array(samp, addr, v_i[j] + offset[0],
-				 v_j[j] + offset[1]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, layers - 1);
+	 tx = get_texel_1d_array(samp, addr, x, y);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2676,8 +2703,9 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
    case PIPE_TEXTURE_2D:
    case PIPE_TEXTURE_RECT:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_2d(samp, addr, v_i[j] + offset[0],
-			   v_j[j] + offset[1]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
+	 tx = get_texel_2d(samp, addr, x, y);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2685,9 +2713,10 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       break;
    case PIPE_TEXTURE_2D_ARRAY:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_2d_array(samp, addr, v_i[j] + offset[0],
-				 v_j[j] + offset[1],
-				 v_k[j] + offset[2]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
+         int layer = CLAMP(v_k[j] + offset[2], 0, layers - 1);
+	 tx = get_texel_2d_array(samp, addr, x, y, layer);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2695,9 +2724,11 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       break;
    case PIPE_TEXTURE_3D:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-	 tx = get_texel_3d(samp, addr, v_i[j] + offset[0], 
-			   v_j[j] + offset[1],
-			   v_k[j] + offset[2]);
+         int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
+         int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
+         int z = CLAMP(v_k[j] + offset[2], 0, depth - 1);
+
+	 tx = get_texel_3d(samp, addr, x, y, z);
 	 for (c = 0; c < 4; c++) {
 	    rgba[c][j] = tx[c];
 	 }
@@ -2715,6 +2746,8 @@ sample_get_texels(struct tgsi_sampler *tgsi_sampler,
       do_swizzling(samp, rgba_temp, rgba);
    }
 }
+
+
 /**
  * Create a sampler variant for a given set of non-orthogonal state.
  */
@@ -2830,10 +2863,7 @@ sp_create_sampler_variant( const struct pipe_sampler_state *sampler,
       samp->sample_target = samp->compare;
    }
 
-   if (key.bits.swizzle_r != PIPE_SWIZZLE_RED ||
-       key.bits.swizzle_g != PIPE_SWIZZLE_GREEN ||
-       key.bits.swizzle_b != PIPE_SWIZZLE_BLUE ||
-       key.bits.swizzle_a != PIPE_SWIZZLE_ALPHA) {
+   if (any_swizzle(key)) {
       samp->base.get_samples = sample_swizzle;
    }
    else {
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index ac2d35e5ea4..64ec658b80e 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -241,7 +241,11 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_CAN_COMPACT_VARYINGS:
    case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS:
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
+   case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:
+   case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY:
       return 0;
+   case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY:
+      return 1;
 
    default:
       debug_printf("Unexpected PIPE_CAP_ query %u\n", param);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 5e6d1fbc904..a68912608bc 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -871,6 +871,31 @@ static boolean emit_floor(struct svga_shader_emitter *emit,
 }
 
 
+/* Translate the TGSI CEIL instruction
+ *    CEIL  DST, SRC
+ * into two SVGA3D instructions, using ceil(x) == x + frac(-x):
+ *    FRC  TMP, -SRC      (assumes FRC yields x - floor(x) -- confirm)
+ *    ADD  DST, SRC, TMP
+ * Returns FALSE as soon as either instruction fails to submit. */
+static boolean emit_ceil(struct svga_shader_emitter *emit,
+                         const struct tgsi_full_instruction *insn)
+{
+   SVGA3dShaderDestToken dst = translate_dst_register(emit, insn, 0);
+   const struct src_register src0 = translate_src_register(emit, &insn->Src[0]);
+   SVGA3dShaderDestToken temp = get_temp(emit);
+
+   /* FRC  TMP, -SRC */
+   if (!submit_op1(emit, inst_token(SVGA3DOP_FRC), temp, negate(src0)))
+      return FALSE;
+
+   /* ADD DST, SRC, TMP */
+   if (!submit_op2(emit, inst_token(SVGA3DOP_ADD), dst, src0, src(temp)))
+      return FALSE;
+
+   return TRUE;
+}
+
+
 /* Translate the following TGSI CMP instruction.
  *    CMP  DST, SRC0, SRC1, SRC2
  * To the following SVGA3D instruction sequence.
@@ -2435,6 +2460,9 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
    case TGSI_OPCODE_TRUNC:        /* should be TRUNC, not FLR */
       return emit_floor( emit, insn );
 
+   case TGSI_OPCODE_CEIL:
+      return emit_ceil( emit, insn );
+
    case TGSI_OPCODE_CMP:
       return emit_cmp( emit, insn );