Merge commit 'origin/master' into gallium-sampler-view

Conflicts: src/gallium/drivers/nv30/nv30_context.h src/gallium/drivers/nv30/nv30_state.c src/gallium/drivers/nv40/nv40_context.h src/gallium/drivers/nv40/nv40_state.c src/gallium/drivers/r300/r300_emit.c
author: Keith Whitwell <[email protected]> 2010-03-15 09:44:52 +0000
committer: Keith Whitwell <[email protected]> 2010-03-15 09:44:52 +0000
commit: 42910ebe7b9748c0ecb6a597bae3e7d43c7e170f (patch)
tree: b2b9c72dc47a0473304dc2db1b078d13b658df34 /src/gallium/drivers/nvfx
parent: 47bfbd452c93e6a8db013fb90d9f42210cf24889 (diff)
parent: 68e58a96e80865878e6881dc4d34fcc3ec24eb19 (diff)
33 files changed, 7644 insertions, 0 deletions
diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile
new file mode 100644
index 00000000000..dfe97e6ed5f
--- /dev/null
+++ b/src/gallium/drivers/nvfx/Makefile
@@ -0,0 +1,32 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = nvfx
+
+# C source files that make up the nvfx driver library.
+C_SOURCES = \
+	nv04_surface_2d.c \
+	nvfx_context.c \
+	nvfx_clear.c \
+	nvfx_draw.c \
+	nvfx_fragprog.c \
+	nvfx_fragtex.c \
+	nv30_fragtex.c \
+	nv40_fragtex.c \
+	nvfx_miptree.c \
+	nvfx_query.c \
+	nvfx_screen.c \
+	nvfx_state.c \
+	nvfx_state_blend.c \
+	nvfx_state_emit.c \
+	nvfx_state_fb.c \
+	nvfx_state_rasterizer.c \
+	nvfx_state_scissor.c \
+	nvfx_state_stipple.c \
+	nvfx_state_viewport.c \
+	nvfx_state_zsa.c \
+	nvfx_surface.c \
+	nvfx_transfer.c \
+	nvfx_vbo.c \
+	nvfx_vertprog.c
+
+include ../../Makefile.template
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.c b/src/gallium/drivers/nvfx/nv04_surface_2d.c
new file mode 100644
index 00000000000..ed18c9f24dc
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_surface_2d.c
@@ -0,0 +1,545 @@
+#include "pipe/p_context.h"
+#include "pipe/p_format.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_util.h"
+#include "nouveau/nouveau_screen.h"
+#include "nv04_surface_2d.h"
+
+/*
+ * Translate a gallium pipe format into the NV04_CONTEXT_SURFACES_2D_FORMAT_*
+ * value used for it.  Several pipe formats share one hardware value.
+ *
+ * Returns -1 when the format has no entry.
+ */
+static INLINE int
+nv04_surface_format(enum pipe_format format)
+{
+	int hw_format = -1;
+
+	switch (format) {
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_A8_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8;
+		break;
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_R16_SNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case PIPE_FORMAT_S8Z24_UNORM:
+		hw_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32;
+		break;
+	default:
+		break;
+	}
+
+	return hw_format;
+}
+
+/*
+ * Translate a gallium pipe format into the
+ * NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_* value used when filling it.
+ *
+ * Returns -1 when the format has no entry.
+ */
+static INLINE int
+nv04_rect_format(enum pipe_format format)
+{
+	int color_format = -1;
+
+	switch (format) {
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+		color_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5;
+		break;
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case PIPE_FORMAT_S8Z24_UNORM:
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		color_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8;
+		break;
+	default:
+		break;
+	}
+
+	return color_format;
+}
+
+/*
+ * Translate a gallium pipe format into the
+ * NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_* value used to read it.
+ *
+ * Returns -1 when the format has no entry.
+ */
+static INLINE int
+nv04_scaled_image_format(enum pipe_format format)
+{
+	int sifm_format = -1;
+
+	switch (format) {
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_A8_UNORM:
+		sifm_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8;
+		break;
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+		sifm_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+		sifm_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		sifm_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8;
+		break;
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_R16_SNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+		sifm_format = NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5;
+		break;
+	default:
+		break;
+	}
+
+	return sifm_format;
+}
+
+/*
+ * Interleave the low 12 bits of x and y: bit i of x lands on bit 2*i of the
+ * result and bit i of y lands on bit 2*i + 1.  Higher input bits are ignored.
+ */
+static INLINE unsigned
+nv04_swizzle_bits_square(unsigned x, unsigned y)
+{
+	unsigned swizzled = 0;
+	unsigned bit;
+
+	for (bit = 0; bit < 12; bit++) {
+		const unsigned mask = 1u << bit;
+
+		swizzled |= (x & mask) << bit;
+		swizzled |= (y & mask) << (bit + 1);
+	}
+
+	return swizzled;
+}
+
+/*
+ * Rectangular swizzled textures are laid out as a linear sequence of swizzled
+ * square tiles whose edge is MIN2(w, h).  The coordinate bits above the tile
+ * edge select the tile, the bits below it are interleaved inside the tile.
+ */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	const unsigned tile = MIN2(w, h);
+	const unsigned in_tile = tile - 1;
+	const unsigned tile_base = ((x | y) & ~in_tile) * tile;
+
+	return tile_base | nv04_swizzle_bits_square(x & in_tile, y & in_tile);
+}
+
+/*
+ * Copy a w x h rectangle from src (at sx,sy) to dst (at dx,dy) through the
+ * scaled-image-from-memory object, writing into the swizzled-surface object.
+ * nv04_surface_copy() uses this path for linear -> swizzled copies.
+ *
+ * The rectangle is split into chunks of at most max_w x max_h.  The chunk
+ * grid is fixed up front (sub_w x sub_h) and only the chunk on the right or
+ * bottom edge is trimmed, so the number of chunks never exceeds what
+ * MARK_RING() reserved.  (Shrinking sub_w itself, as was done before, made
+ * every following row use the smaller width and emit extra chunks.)
+ *
+ * Always returns 0.
+ */
+static int
+nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
+			  struct pipe_surface *dst, int dx, int dy,
+			  struct pipe_surface *src, int sx, int sy,
+			  int w, int h)
+{
+	struct nouveau_channel *chan = ctx->swzsurf->channel;
+	struct nouveau_grobj *swzsurf = ctx->swzsurf;
+	struct nouveau_grobj *sifm = ctx->sifm;
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
+	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	/* Max width & height may not be the same on all HW, but must be POT */
+	const unsigned max_w = 1024;
+	const unsigned max_h = 1024;
+	const unsigned sub_w = w > max_w ? max_w : w;
+	const unsigned sub_h = h > max_h ? max_h : h;
+	unsigned x;
+	unsigned y;
+
+	/* Swizzled surfaces must be POT  */
+	assert(util_is_pot(dst->width) && util_is_pot(dst->height));
+
+	/* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */
+	assert(sub_w == w || util_is_pot(sub_w));
+	assert(sub_h == h || util_is_pot(sub_h));
+
+	MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 +
+			 ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, dst_bo,
+	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1);
+	OUT_RING  (chan, nv04_surface_format(dst->format) |
+	                 log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT |
+	                 log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT);
+
+	BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1);
+	OUT_RELOCo(chan, src_bo,
+	                 NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1);
+	OUT_RING  (chan, swzsurf->handle);
+
+	for (y = 0; y < h; y += sub_h) {
+		/* Trim only the chunk on the bottom edge. */
+		const unsigned cur_h = MIN2(sub_h, h - y);
+
+		for (x = 0; x < w; x += sub_w) {
+			/* Trim only the chunk on the right edge. */
+			const unsigned cur_w = MIN2(sub_w, w - x);
+
+			assert(!(dst->offset & 63));
+
+			BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
+			OUT_RELOCl(chan, dst_bo, dst->offset,
+				   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+			BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
+			OUT_RING  (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
+			OUT_RING  (chan, nv04_scaled_image_format(src->format));
+			OUT_RING  (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
+			OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
+			OUT_RING  (chan, cur_h << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | cur_w);
+			OUT_RING  (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
+			OUT_RING  (chan, cur_h << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | cur_w);
+			OUT_RING  (chan, 1 << 20);
+			OUT_RING  (chan, 1 << 20);
+
+			BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4);
+			OUT_RING  (chan, cur_h << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | cur_w);
+			OUT_RING  (chan, src_pitch |
+					 NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER |
+					 NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE);
+			OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format),
+				   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+			OUT_RING  (chan, 0);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Copy h lines of w pixels from src (at sx,sy) to dst (at dx,dy) with the
+ * memory-to-memory-format object.  Lines are copied as raw bytes, at most
+ * 2047 lines per emitted transfer.
+ *
+ * Always returns 0.
+ */
+static int
+nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx,
+		       struct pipe_surface *dst, int dx, int dy,
+		       struct pipe_surface *src, int sx, int sy, int w, int h)
+{
+	struct nouveau_grobj *m2mf = ctx->m2mf;
+	struct nouveau_channel *chan = m2mf->channel;
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
+	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	const unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	const unsigned src_bpp = util_format_get_blocksize(src->texture->format);
+	const unsigned dst_bpp = util_format_get_blocksize(dst->texture->format);
+	const unsigned line_bytes = w * src_bpp;
+	const int max_transfers = (h / 2047) + 1;
+	unsigned src_offset = src->offset + sy * src_pitch + sx * src_bpp;
+	unsigned dst_offset = dst->offset + dy * dst_pitch + dx * dst_bpp;
+
+	MARK_RING (chan, 3 + max_transfers * 9, 2 + max_transfers * 2);
+	BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2);
+	OUT_RELOCo(chan, src_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD);
+	OUT_RELOCo(chan, dst_bo,
+		   NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+
+	while (h != 0) {
+		/* Lines handled by this transfer. */
+		const int lines = MIN2(h, 2047);
+
+		BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+		OUT_RELOCl(chan, src_bo, src_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+		OUT_RELOCl(chan, dst_bo, dst_offset,
+			   NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR);
+		OUT_RING  (chan, src_pitch);
+		OUT_RING  (chan, dst_pitch);
+		OUT_RING  (chan, line_bytes);
+		OUT_RING  (chan, lines);
+		OUT_RING  (chan, 0x0101);
+		OUT_RING  (chan, 0);
+
+		src_offset += src_pitch * lines;
+		dst_offset += dst_pitch * lines;
+		h -= lines;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy a w x h rectangle from src (at sx,sy) to dst (at dx,dy) with the
+ * image-blit object, both surfaces being bound through the 2D context
+ * surfaces object in VRAM.
+ *
+ * Returns 1, without emitting anything, when nv04_surface_format() has no
+ * value for the destination format; returns 0 otherwise.
+ */
+static int
+nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		       int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		       int w, int h)
+{
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *blit = ctx->blit;
+	struct nouveau_channel *chan = surf2d->channel;
+	struct nouveau_bo *src_bo = nouveau_bo(ctx->buf(src));
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
+	const unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	const unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	const unsigned rd_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+	const unsigned wr_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+	const int format = nv04_surface_format(dst->format);
+
+	if (format < 0)
+		return 1;
+
+	MARK_RING (chan, 12, 4);
+
+	/* Bind source and destination buffers. */
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, src_bo, rd_flags);
+	OUT_RELOCo(chan, dst_bo, wr_flags);
+
+	/* Format, packed pitches (dst in the high half) and base offsets. */
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, format);
+	OUT_RING  (chan, (dst_pitch << 16) | src_pitch);
+	OUT_RELOCl(chan, src_bo, src->offset, rd_flags);
+	OUT_RELOCl(chan, dst_bo, dst->offset, wr_flags);
+
+	/* Source point, destination point and size, y/h in the high half. */
+	BEGIN_RING(chan, blit, 0x0300, 3);
+	OUT_RING  (chan, (sy << 16) | sx);
+	OUT_RING  (chan, (dy << 16) | dx);
+	OUT_RING  (chan, ( h << 16) |  w);
+
+	return 0;
+}
+
+/*
+ * Copy a w x h rectangle from src (at sx,sy) to dst (at dx,dy), picking the
+ * engine path:
+ *   - linear source to non-linear destination, larger than one pixel in both
+ *     directions: swizzling copy;
+ *   - any offset or pitch not 64-byte aligned: memory-to-memory copy;
+ *   - otherwise: blit, falling back to the memory-to-memory copy when the
+ *     blit path reports that it cannot handle the format.
+ *
+ * Source and destination must have the same format.
+ */
+static void
+nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		  int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		  int w, int h)
+{
+	unsigned src_pitch = ((struct nv04_surface *)src)->pitch;
+	unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	int src_linear = src->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR;
+	int dst_linear = dst->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR;
+
+	assert(src->format == dst->format);
+
+	/* Setup transfer to swizzle the texture to vram if needed */
+	if (src_linear && !dst_linear && w > 1 && h > 1) {
+		nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h);
+		return;
+	}
+
+	/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback
+	 * to NV_MEMORY_TO_MEMORY_FORMAT in this case.
+	 */
+	if ((src->offset & 63) || (dst->offset & 63) ||
+	    (src_pitch & 63) || (dst_pitch & 63)) {
+		nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h);
+		return;
+	}
+
+	/* The blit path bails out before emitting anything when it has no
+	 * surface format for dst; do not silently drop the copy in that case,
+	 * use the format-agnostic byte copy instead.
+	 */
+	if (nv04_surface_copy_blit(ctx, dst, dx, dy, src, sx, sy, w, h))
+		nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h);
+}
+
+/*
+ * Fill the w x h rectangle at (dx,dy) of dst with 'value' using the GDI
+ * rectangle object.  The destination is bound as both source and destination
+ * of the 2D context surfaces object.
+ *
+ * Asserts that the destination format is known to both
+ * nv04_surface_format() and nv04_rect_format().
+ */
+static void
+nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst,
+		  int dx, int dy, int w, int h, unsigned value)
+{
+	struct nouveau_grobj *surf2d = ctx->surf2d;
+	struct nouveau_grobj *rect = ctx->rect;
+	struct nouveau_channel *chan = surf2d->channel;
+	struct nouveau_bo *dst_bo = nouveau_bo(ctx->buf(dst));
+	const unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch;
+	const unsigned wr_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR;
+	const int surf_format = nv04_surface_format(dst->format);
+	const int rect_format = nv04_rect_format(dst->format);
+
+	assert(surf_format >= 0);
+	assert(rect_format >= 0);
+
+	MARK_RING (chan, 16, 4);
+
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RELOCo(chan, dst_bo, wr_flags);
+	OUT_RELOCo(chan, dst_bo, wr_flags);
+
+	BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4);
+	OUT_RING  (chan, surf_format);
+	OUT_RING  (chan, (dst_pitch << 16) | dst_pitch);
+	OUT_RELOCl(chan, dst_bo, dst->offset, wr_flags);
+	OUT_RELOCl(chan, dst_bo, dst->offset, wr_flags);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1);
+	OUT_RING  (chan, rect_format);
+
+	BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1);
+	OUT_RING  (chan, value);
+
+	/* Point and size are packed with x / w in the high half. */
+	BEGIN_RING(chan, rect,
+		   NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2);
+	OUT_RING  (chan, (dx << 16) | dy);
+	OUT_RING  (chan, ( w << 16) |  h);
+}
+
+/*
+ * Release the notifier and every graphics object held by *pctx, free the
+ * context itself and reset *pctx to NULL.  Does nothing when pctx or *pctx
+ * is NULL.
+ */
+void
+nv04_surface_2d_takedown(struct nv04_surface_2d **pctx)
+{
+	struct nv04_surface_2d *ctx = pctx ? *pctx : NULL;
+
+	if (!ctx)
+		return;
+
+	/* Detach first so the caller never sees a pointer to freed memory. */
+	*pctx = NULL;
+
+	nouveau_notifier_free(&ctx->ntfy);
+	nouveau_grobj_free(&ctx->m2mf);
+	nouveau_grobj_free(&ctx->surf2d);
+	nouveau_grobj_free(&ctx->swzsurf);
+	nouveau_grobj_free(&ctx->rect);
+	nouveau_grobj_free(&ctx->blit);
+	nouveau_grobj_free(&ctx->sifm);
+
+	FREE(ctx);
+}
+
+/*
+ * Allocate a 2D engine context on screen->channel: a notifier plus the
+ * memory-to-memory, 2D context surfaces, blit, GDI rectangle, swizzled
+ * surface and scaled-image objects, choosing object classes from the
+ * chipset.  Installs the copy and fill entry points; the 'buf' callback is
+ * not set here.
+ *
+ * Returns the new context, or NULL when the allocation of the context or of
+ * any object fails, or when the chipset family has no known swizzled surface
+ * class.  Everything allocated so far is released on failure.
+ */
+struct nv04_surface_2d *
+nv04_surface_2d_init(struct nouveau_screen *screen)
+{
+	struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d);
+	struct nouveau_channel *chan = screen->channel;
+	unsigned handle = 0x88000000, class;
+	int ret;
+
+	if (!ctx)
+		return NULL;
+
+	ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy);
+	if (ret)
+		goto fail;
+
+	ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+
+	if (chan->device->chipset < 0x10)
+		class = NV04_CONTEXT_SURFACES_2D;
+	else
+		class = NV10_CONTEXT_SURFACES_2D;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->surf2d,
+			 NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2);
+	OUT_RING  (chan, chan->vram->handle);
+	OUT_RING  (chan, chan->vram->handle);
+
+	if (chan->device->chipset < 0x10)
+		class = NV04_IMAGE_BLIT;
+	else
+		class = NV12_IMAGE_BLIT;
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1);
+	OUT_RING  (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY);
+
+	ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT,
+				  &ctx->rect);
+	if (ret)
+		goto fail;
+
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1);
+	OUT_RING  (chan, ctx->ntfy->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1);
+	OUT_RING  (chan, ctx->surf2d->handle);
+	BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY);
+	BEGIN_RING(chan, ctx->rect,
+			 NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1);
+	OUT_RING  (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE);
+
+	switch (chan->device->chipset & 0xf0) {
+	case 0x00:
+	case 0x10:
+		class = NV04_SWIZZLED_SURFACE;
+		break;
+	case 0x20:
+		class = NV20_SWIZZLED_SURFACE;
+		break;
+	case 0x30:
+		class = NV30_SWIZZLED_SURFACE;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SWIZZLED_SURFACE;
+		break;
+	default:
+		/* Famous last words: this really can't happen..
+		 * Without asserts 'class' would still hold the blit class
+		 * here, so fail instead of allocating the wrong object.
+		 */
+		assert(0);
+		goto fail;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf);
+	if (ret)
+		goto fail;
+
+	switch (chan->device->chipset & 0xf0) {
+	case 0x10:
+	case 0x20:
+		class = NV10_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x30:
+		class = NV30_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	case 0x40:
+	case 0x60:
+		class = NV40_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	default:
+		class = NV04_SCALED_IMAGE_FROM_MEMORY;
+		break;
+	}
+
+	ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm);
+	if (ret)
+		goto fail;
+
+	ctx->copy = nv04_surface_copy;
+	ctx->fill = nv04_surface_fill;
+	return ctx;
+
+fail:
+	/* Frees whatever was allocated so far and the context itself. */
+	nv04_surface_2d_takedown(&ctx);
+	return NULL;
+}
+/*
+ * Create a temporary 2D render-target texture with the same format and size
+ * as 'ns', fetch its first surface and link it to 'ns' through ->backing.
+ * Overwrites ns->base.usage as a side effect.
+ *
+ * Returns the temporary surface.
+ *
+ * NOTE(review): the results of texture_create() and get_tex_surface() are
+ * used without a NULL check -- confirm they cannot fail here.
+ */
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+	int temp_flags;
+
+	// printf("creating temp, flags is %i!\n", flags);
+
+	if(ns->base.usage & PIPE_BUFFER_USAGE_DISCARD)
+	{
+		temp_flags = ns->base.usage | PIPE_BUFFER_USAGE_GPU_READ;
+		ns->base.usage = PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_DISCARD;
+	}
+	else
+	{
+		temp_flags = ns->base.usage | PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE;
+		ns->base.usage = PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER | PIPE_BUFFER_USAGE_GPU_READ;
+	}
+	/* NOTE(review): this unconditional store overwrites the usage chosen
+	 * in either branch above, which also makes the GPU_READ test before
+	 * the copy below always true -- confirm which value is intended.
+	 */
+	ns->base.usage = PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE;
+	/* Template for the temporary texture: single level, depth 1. */
+	struct pipe_texture templ;
+	memset(&templ, 0, sizeof(templ));
+	templ.format = ns->base.texture->format;
+	templ.target = PIPE_TEXTURE_2D;
+	templ.width0 = ns->base.width;
+	templ.height0 = ns->base.height;
+	templ.depth0 = 1;
+	templ.last_level = 0;
+
+	// TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented
+	templ.nr_samples = ns->base.texture->nr_samples;
+
+	templ.tex_usage = ns->base.texture->tex_usage | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+	struct pipe_texture* temp_tex = pscreen->texture_create(pscreen, &templ);
+	struct nv04_surface* temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
+	temp_ns->backing = ns;
+	/* NOTE(review): temp_ns->backing is 'ns', so the destination and the
+	 * source of this copy are the same surface -- confirm the intent
+	 * (copying into temp_ns itself looks more plausible).
+	 */
+	if(ns->base.usage & PIPE_BUFFER_USAGE_GPU_READ)
+		eng2d->copy(eng2d, &temp_ns->backing->base, 0, 0, &ns->base, 0, 0, ns->base.width, ns->base.height);
+
+	return temp_ns;
+}
diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.h b/src/gallium/drivers/nvfx/nv04_surface_2d.h
new file mode 100644
index 00000000000..ce696a11a39
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv04_surface_2d.h
@@ -0,0 +1,37 @@
+#ifndef __NV04_SURFACE_2D_H__
+#define __NV04_SURFACE_2D_H__
+/* Surface wrapper used by the nv04 2D engine helpers. */
+struct nv04_surface {
+	struct pipe_surface base;	/* first member: helpers cast pipe_surface* to nv04_surface* */
+	unsigned pitch;	/* row pitch, used as a byte stride by the copy helpers */
+	struct nv04_surface* backing;	/* set by nv04_surface_wrap_for_render() to the wrapped surface */
+};
+/* 2D engine context: hardware objects plus the copy/fill entry points. */
+struct nv04_surface_2d {
+	struct nouveau_notifier *ntfy;	/* notifier handed to m2mf, blit and rect */
+	struct nouveau_grobj *surf2d;	/* 2D context surfaces object */
+	struct nouveau_grobj *swzsurf;	/* swizzled surface object */
+	struct nouveau_grobj *m2mf;	/* memory-to-memory-format object */
+	struct nouveau_grobj *rect;	/* GDI rectangle object used for fills */
+	struct nouveau_grobj *blit;	/* image blit object */
+	struct nouveau_grobj *sifm;	/* scaled-image-from-memory object */
+	/* Returns the buffer behind a surface.  Not set by
+	 * nv04_surface_2d_init(); presumably installed by the caller.
+	 */
+	struct pipe_buffer *(*buf)(struct pipe_surface *);
+	/* Rectangle copy and solid fill, installed by nv04_surface_2d_init(). */
+	void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst,
+		     int dx, int dy, struct pipe_surface *src, int sx, int sy,
+		     int w, int h);
+	void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst,
+		     int dx, int dy, int w, int h, unsigned value);
+};
+/* Allocate the 2D engine context on the screen's channel; NULL on failure. */
+struct nv04_surface_2d *
+nv04_surface_2d_init(struct nouveau_screen *screen);
+/* Free the context and its objects and set the pointer to NULL; NULL-safe. */
+void
+nv04_surface_2d_takedown(struct nv04_surface_2d **);
+/* Create a temporary render-target surface whose ->backing is 'ns'. */
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c
new file mode 100644
index 00000000000..2b56f454921
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv30_fragtex.c
@@ -0,0 +1,147 @@
+#include "util/u_format.h"
+
+#include "nvfx_context.h"
+#include "nouveau/nouveau_util.h"
+#include "nvfx_tex.h"
+
+/*
+ * Fill the nv30-specific parts of a sampler state from the gallium CSO:
+ * the anisotropy level in ps->en, the LOD bias in ps->filt and the
+ * max/min LOD in ps->en.  Bits are OR-ed into the existing values.
+ *
+ * 'pipe' is unused.
+ */
+void
+nv30_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso)
+{
+	float limit;
+
+	/* Pick the highest supported level not above the requested one. */
+	if (cso->max_anisotropy >= 8)
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_8X;
+	else if (cso->max_anisotropy >= 4)
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_4X;
+	else if (cso->max_anisotropy >= 2)
+		ps->en |= NV34TCL_TX_ENABLE_ANISO_2X;
+
+	/* The bias is encoded as value * 256 in a 13-bit field.  Encode the
+	 * clamped value: the unclamped one would simply wrap when masked.
+	 */
+	limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+	ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+	limit = CLAMP(cso->max_lod, 0.0, 15.0);
+	ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/;
+
+	limit = CLAMP(cso->min_lod, 0.0, 15.0);
+	ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/;
+}
+/*
+ * Build one nv30_texture_format initializer: the 'defined' flag, the pipe
+ * format, the hardware texture format and the OR of the eight swizzle
+ * selectors (S0 x/y/z/w followed by S1 x/y/z/w).
+ */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w)                        \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV34TCL_TX_FORMAT_FORMAT_##tf,                                               \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |           \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |           \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |           \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w)            \
+}
+/* One row of the pipe format -> nv30 texture format table. */
+struct nv30_texture_format {
+	boolean defined;	/* TRUE for real rows; the terminating row leaves it zero */
+	uint	pipe;	/* PIPE_FORMAT_* value this row matches */
+	int     format;	/* NV34TCL_TX_FORMAT_FORMAT_* value */
+	int     swizzle;	/* OR-ed NV34TCL_TX_SWIZZLE_* selectors */
+};
+/*
+ * Texture formats known to the nv30 fragment texture code.  The list ends
+ * with an empty row whose 'defined' field is zero; nv30_fragtex_format()
+ * stops there.  NOTE(review): '{}' is an empty initializer, which is only
+ * standard C since C23 (a compiler extension before that).
+ */
+static struct nv30_texture_format
+nv30_texture_formats[] = {
+	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X),
+	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y),
+	_(Z16_UNORM     , R5G6B5  ,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(S8Z24_UNORM   , A8R8G8B8,   S1,   S1,   S1,  ONE, X, X, X, X),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W),
+	{},
+};
+
+/*
+ * Look up the table row for a pipe format.
+ *
+ * Returns the matching row, or NULL (after logging an error) when the format
+ * is not in nv30_texture_formats.
+ */
+static struct nv30_texture_format *
+nv30_fragtex_format(uint pipe_format)
+{
+	struct nv30_texture_format *entry;
+
+	/* The table is terminated by a row with 'defined' cleared. */
+	for (entry = nv30_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
+	return NULL;
+}
+
+/*
+ * Build the state object that programs texture unit 'unit' from the sampler
+ * state and miptree currently bound to it in 'nvfx'.
+ *
+ * Returns the new state object, or NULL when the texture format or the
+ * texture target is not supported (an error is logged in both cases).
+ */
+struct nouveau_stateobj *
+nv30_fragtex_build(struct nvfx_context *nvfx, int unit)
+{
+	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
+	struct nvfx_miptree *nv30mt = nvfx->tex_miptree[unit];
+	struct pipe_texture *pt = &nv30mt->base;
+	struct nouveau_bo *bo = nouveau_bo(nv30mt->buffer);
+	struct nv30_texture_format *tf;
+	struct nouveau_stateobj *so;
+	uint32_t txf, txs;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	/* Unknown formats have already been reported by the lookup. */
+	tf = nv30_fragtex_format(pt->format);
+	if (!tf)
+		return NULL;
+	/* Format word: hw format, mipmap flag, log2 of each base dimension. */
+	txf  = tf->format;
+	txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0);
+	txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT;
+	txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT;
+	txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT;
+	txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000;
+	/* Dimensionality; cube maps also get the 2D setting. */
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return NULL;
+	}
+
+	txs = tf->swizzle;
+	/* One method carrying 8 words: two relocs followed by six data words. */
+	so = so_new(1, 8, 2);
+	so_method(so, nvfx->screen->eng3d, NV34TCL_TX_OFFSET(unit), 8);
+	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
+		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	so_data  (so, ps->wrap);
+	so_data  (so, NV34TCL_TX_ENABLE_ENABLE | ps->en);
+	so_data  (so, txs);
+	so_data  (so, ps->filt | 0x2000 /*voodoo*/);
+	so_data  (so, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
+		       pt->height0);
+	so_data  (so, ps->bcol);
+
+	return so;
+}
diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h
new file mode 100644
index 00000000000..ec0444c07f8
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv30_vertprog.h
@@ -0,0 +1,169 @@
+#ifndef __NV30_SHADER_H__
+#define __NV30_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * 128bit opcodes, split into 4 32-bit ones for ease of use.
+ *
+ * Non-native instructions
+ *   ABS - MOV + NV40_VP_INST0_DEST_ABS
+ *   POW - EX2 + MUL + LG2
+ *   SUB - ADD, second source negated
+ *   SWZ - MOV
+ *   XPD -
+ *
+ * Register access
+ *   - Only one INPUT can be accessed per-instruction (move extras into TEMPs)
+ *   - Only one CONST can be accessed per-instruction (move extras into TEMPs)
+ *
+ * Relative Addressing
+ *   According to the value returned for
+ *   MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB
+ *
+ *   there are only two address registers available.  The destination in the
+ *   ARL instruction is set to TEMP <n> (The temp isn't actually written).
+ *
+ *   When using vanilla ARB_v_p, the proprietary driver will squish both the
+ *   available ADDRESS regs into the first hardware reg in the X and Y
+ *   components.
+ *
+ *   To use an address reg as an index into consts, the CONST_SRC is set to
+ *   (const_base + offset) and INDEX_CONST is set.
+ *
+ *   To access the second address reg use ADDR_REG_SELECT_1. A particular
+ *   component of the address regs is selected with ADDR_SWZ.
+ *
+ *   Only one address register can be accessed per instruction.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details) Conditional
+ * execution of an instruction is enabled by setting COND_TEST_ENABLE, and
+ * selecting the condition which will allow the test to pass with
+ * COND_{FL,LT,...}.  It is possible to swizzle the values in the condition
+ * register, which allows for testing against an individual component.
+ *
+ * Branching:
+ *
+ *   The BRA/CAL instructions seem to follow a slightly different opcode
+ *   layout.  The destination instruction ID (IADDR) overlaps a source field.
+ *   Instruction IDs seem to be numbered based on the UPLOAD_FROM_ID FIFO
+ *   command, and are incremented automatically on each UPLOAD_INST FIFO
+ *   command.
+ *
+ *   Conditional branching is achieved by using the condition tests described
+ *   above.  There don't appear to be any dedicated looping instructions, but
+ *   looping can be done using a temp reg + conditional branching.
+ *
+ *   Subroutines may be uploaded before the main program itself, but the first
+ *   executed instruction is determined by the PROGRAM_START_ID FIFO command.
+ *
+ */
+
+/* DWORD 0 */
+
+#define NV30_VP_INST_ADDR_REG_SELECT_1        (1 << 24)
+#define NV30_VP_INST_SRC2_ABS           (1 << 23) /* guess */
+#define NV30_VP_INST_SRC1_ABS           (1 << 22) /* guess */
+#define NV30_VP_INST_SRC0_ABS           (1 << 21) /* guess */
+#define NV30_VP_INST_VEC_RESULT         (1 << 20)
+#define NV30_VP_INST_DEST_TEMP_ID_SHIFT        16
+#define NV30_VP_INST_DEST_TEMP_ID_MASK        (0x0F << 16)
+#define NV30_VP_INST_COND_UPDATE_ENABLE        (1<<15)
+#define NV30_VP_INST_VEC_DEST_TEMP_MASK      (0xF << 16)
+#define NV30_VP_INST_COND_TEST_ENABLE        (1<<14)
+#define NV30_VP_INST_COND_SHIFT          11
+#define NV30_VP_INST_COND_MASK          (0x07 << 11)
+#define NV30_VP_INST_COND_SWZ_X_SHIFT        9
+#define NV30_VP_INST_COND_SWZ_X_MASK        (0x03 <<  9)
+#define NV30_VP_INST_COND_SWZ_Y_SHIFT        7
+#define NV30_VP_INST_COND_SWZ_Y_MASK        (0x03 <<  7)
+#define NV30_VP_INST_COND_SWZ_Z_SHIFT        5
+#define NV30_VP_INST_COND_SWZ_Z_MASK        (0x03 <<  5)
+#define NV30_VP_INST_COND_SWZ_W_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_W_MASK        (0x03 <<  3)
+#define NV30_VP_INST_COND_SWZ_ALL_SHIFT        3
+#define NV30_VP_INST_COND_SWZ_ALL_MASK        (0xFF <<  3)
+#define NV30_VP_INST_ADDR_SWZ_SHIFT        1
+#define NV30_VP_INST_ADDR_SWZ_MASK        (0x03 <<  1)
+#define NV30_VP_INST_SCA_OPCODEH_SHIFT        0
+#define NV30_VP_INST_SCA_OPCODEH_MASK        (0x01 <<  0)
+
+/* DWORD 1 */
+#define NV30_VP_INST_SCA_OPCODEL_SHIFT        28
+#define NV30_VP_INST_SCA_OPCODEL_MASK        (0x0F << 28)
+#define NV30_VP_INST_VEC_OPCODE_SHIFT        23
+#define NV30_VP_INST_VEC_OPCODE_MASK        (0x1F << 23)
+#define NV30_VP_INST_CONST_SRC_SHIFT        14
+#define NV30_VP_INST_CONST_SRC_MASK        (0xFF << 14)
+#define NV30_VP_INST_INPUT_SRC_SHIFT        9    /*NV20*/
+#define NV30_VP_INST_INPUT_SRC_MASK        (0x0F <<  9)  /*NV20*/
+#define NV30_VP_INST_SRC0H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC0H_MASK          (0x1FF << 0)  /*NV20*/
+
+/* Please note: the IADDR fields overlap other fields because they are used
+ * only for branch instructions.  See Branching: label above
+ *
+ * DWORD 2
+ */
+#define NV30_VP_INST_SRC0L_SHIFT        26    /*NV20*/
+#define NV30_VP_INST_SRC0L_MASK         (0x3F  <<26)  /* NV30_VP_SRC0_LOW_MASK << 26 */
+#define NV30_VP_INST_SRC1_SHIFT         11    /*NV20*/
+#define NV30_VP_INST_SRC1_MASK          (0x7FFF<<11)  /*NV20*/
+#define NV30_VP_INST_SRC2H_SHIFT        0    /*NV20*/
+#define NV30_VP_INST_SRC2H_MASK          (0x7FF << 0)  /* NV30_VP_SRC2_HIGH_MASK >> 4*/
+#define NV30_VP_INST_IADDR_SHIFT        2
+#define NV30_VP_INST_IADDR_MASK          (0xF <<  28)   /* NV30_VP_SRC2_LOW_MASK << 28 */
+
+/* DWORD 3 */
+#define NV30_VP_INST_SRC2L_SHIFT        28    /*NV20*/
+#define NV30_VP_INST_SRC2L_MASK          (0x0F  <<28)  /*NV20*/
+#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT      24
+#define NV30_VP_INST_STEMP_WRITEMASK_MASK      (0x0F << 24)
+#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT      20
+#define NV30_VP_INST_VTEMP_WRITEMASK_MASK      (0x0F << 20)
+#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT      16
+#define NV30_VP_INST_SDEST_WRITEMASK_MASK      (0x0F << 16)
+#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT      12    /*NV20*/
+#define NV30_VP_INST_VDEST_WRITEMASK_MASK      (0x0F << 12)  /*NV20*/
+/* Destination (output register) selector field and its values. */
+#define NV30_VP_INST_DEST_SHIFT        2
+#define NV30_VP_INST_DEST_MASK        (0x0F <<  2)
+#  define NV30_VP_INST_DEST_POS  0
+#  define NV30_VP_INST_DEST_BFC0  1
+#  define NV30_VP_INST_DEST_BFC1  2
+#  define NV30_VP_INST_DEST_COL0  3
+#  define NV30_VP_INST_DEST_COL1  4
+#  define NV30_VP_INST_DEST_FOGC  5
+#  define NV30_VP_INST_DEST_PSZ   6
+/* Texture coordinate n; the argument is parenthesized so that expressions
+ * with lower-precedence operators (e.g. a | b) expand correctly. */
+#  define NV30_VP_INST_DEST_TC(n)  (8+(n))
+
+/* Useful to split the source selection regs into their pieces */
+#define NV30_VP_SRC0_HIGH_SHIFT                                                6
+#define NV30_VP_SRC0_HIGH_MASK                                        0x00007FC0
+#define NV30_VP_SRC0_LOW_MASK                                         0x0000003F
+#define NV30_VP_SRC2_HIGH_SHIFT                                                4
+#define NV30_VP_SRC2_HIGH_MASK                                        0x00007FF0
+#define NV30_VP_SRC2_LOW_MASK                                         0x0000000F
+
+
+/* Source-register definition - matches NV20 exactly */
+#define NV30_VP_SRC_NEGATE          (1<<14)
+#define NV30_VP_SRC_SWZ_X_SHIFT        12
+#define NV30_VP_SRC_REG_SWZ_X_MASK        (0x03  <<12)
+#define NV30_VP_SRC_SWZ_Y_SHIFT        10
+#define NV30_VP_SRC_REG_SWZ_Y_MASK        (0x03  <<10)
+#define NV30_VP_SRC_SWZ_Z_SHIFT        8
+#define NV30_VP_SRC_REG_SWZ_Z_MASK        (0x03  << 8)
+#define NV30_VP_SRC_SWZ_W_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_W_MASK        (0x03  << 6)
+#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT        6
+#define NV30_VP_SRC_REG_SWZ_ALL_MASK        (0xFF  << 6)
+#define NV30_VP_SRC_TEMP_SRC_SHIFT        2
+#define NV30_VP_SRC_REG_TEMP_ID_MASK        (0x0F  << 0)
+#define NV30_VP_SRC_REG_TYPE_SHIFT        0
+#define NV30_VP_SRC_REG_TYPE_MASK        (0x03  << 0)
+#define NV30_VP_SRC_REG_TYPE_TEMP  1
+#define NV30_VP_SRC_REG_TYPE_INPUT  2
+#define NV30_VP_SRC_REG_TYPE_CONST  3 /* guess */
+
+#include "nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c
new file mode 100644
index 00000000000..5889b5e40d5
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv40_fragtex.c
@@ -0,0 +1,174 @@
+#include "util/u_format.h"
+#include "nvfx_context.h"
+#include "nvfx_tex.h"
+
+/*
+ * Add the NV40-specific bits of a sampler state to *ps.
+ *
+ * pipe: not used by this function.
+ * ps:   hardware sampler state; bits are OR'ed into ps->wrap, ps->en and
+ *       ps->filt, so whatever the caller already stored there is kept.
+ * cso:  gallium sampler state supplying max_anisotropy, lod_bias, max_lod
+ *       and min_lod.
+ */
+void
+nv40_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso)
+{
+	if (cso->max_anisotropy >= 2) {
+		/* no idea, binary driver sets it, works without it.. meh.. */
+		ps->wrap |= (1 << 5);
+
+		/* Pick the largest supported anisotropy level that does not
+		 * exceed the requested one. */
+		if (cso->max_anisotropy >= 16) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X;
+		} else
+		if (cso->max_anisotropy >= 12) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X;
+		} else
+		if (cso->max_anisotropy >= 10) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X;
+		} else
+		if (cso->max_anisotropy >= 8) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X;
+		} else
+		if (cso->max_anisotropy >= 6) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X;
+		} else
+		if (cso->max_anisotropy >= 4) {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X;
+		} else {
+			ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X;
+		}
+	}
+
+	{
+		float limit;
+
+		/* The bias is encoded in a 13-bit field with 8 fractional
+		 * bits.  Encode the clamped value: using the raw lod_bias
+		 * here would let out-of-range biases wrap around in the
+		 * field (a large positive bias would read back negative). */
+		limit = CLAMP(cso->lod_bias, -16.0, 15.0);
+		ps->filt |= (int)(limit * 256.0) & 0x1fff;
+
+		/* max/min LOD: same 8 fractional bits, placed at bit 7 and
+		 * bit 19 of the enable word respectively. */
+		limit = CLAMP(cso->max_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 7;
+
+		limit = CLAMP(cso->min_lod, 0.0, 15.0);
+		ps->en |= (int)(limit * 256.0) << 19;
+	}
+}
+
+/*
+ * Expands to one struct nv40_texture_format initializer, in field order:
+ *   defined = TRUE
+ *   pipe    = PIPE_FORMAT_<m>
+ *   format  = NV40TCL_TEX_FORMAT_FORMAT_<tf>
+ *   swizzle = OR of the NV34TCL_TX_SWIZZLE_S0_* selectors (ts0x..ts0w) and
+ *             the NV34TCL_TX_SWIZZLE_S1_* selectors (ts1x..ts1w)
+ *   sign    = OR of the NV34TCL_TX_FILTER_SIGNED_* flags whose sx..sw
+ *             argument is 1 (each flag is multiplied by its argument)
+ */
+#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw)            \
+{                                                                              \
+  TRUE,                                                                        \
+  PIPE_FORMAT_##m,                                                             \
+  NV40TCL_TEX_FORMAT_FORMAT_##tf,                                              \
+  (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y |         \
+   NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w |         \
+   NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y |         \
+   NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w),         \
+  ((NV34TCL_TX_FILTER_SIGNED_RED*sx) | (NV34TCL_TX_FILTER_SIGNED_GREEN*sy) |       \
+   (NV34TCL_TX_FILTER_SIGNED_BLUE*sz) | (NV34TCL_TX_FILTER_SIGNED_ALPHA*sw))       \
+}
+
+/* One row of the gallium-format -> NV40 texture-format translation table. */
+struct nv40_texture_format {
+	boolean defined;	/* TRUE for real rows; FALSE marks the end of the table */
+	uint	pipe;		/* PIPE_FORMAT_* value this row matches */
+	int     format;		/* NV40TCL_TEX_FORMAT_FORMAT_* bits, OR'ed into the format word */
+	int     swizzle;	/* NV34TCL_TX_SWIZZLE_* bits, emitted as the swizzle word */
+	int     sign;		/* NV34TCL_TX_FILTER_SIGNED_* bits, OR'ed into the filter word */
+};
+
+/*
+ * Texture formats searched by nv40_fragtex_format().  Each row is built with
+ * the _() macro; the trailing empty initializer leaves "defined" false and
+ * terminates the search.
+ */
+static struct nv40_texture_format
+nv40_texture_formats[] = {
+	_(B8G8R8X8_UNORM, A8R8G8B8,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(B8G8R8A8_UNORM, A8R8G8B8,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B5G5R5A1_UNORM, A1R5G5B5,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B4G4R4A4_UNORM, A4R4G4B4,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(B5G6R5_UNORM  , R5G6B5  ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(L8_UNORM      , L8      ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(A8_UNORM      , L8      , ZERO, ZERO, ZERO,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(R16_SNORM     , A16     , ZERO, ZERO,   S1,  ONE, X, X, X, Y, 1, 1, 1, 1),
+	_(I8_UNORM      , L8      ,   S1,   S1,   S1,   S1, X, X, X, X, 0, 0, 0, 0),
+	_(L8A8_UNORM    , A8L8    ,   S1,   S1,   S1,   S1, X, X, X, Y, 0, 0, 0, 0),
+	_(Z16_UNORM     , Z16     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(S8Z24_UNORM   , Z24     ,   S1,   S1,   S1,  ONE, X, X, X, X, 0, 0, 0, 0),
+	_(DXT1_RGB      , DXT1    ,   S1,   S1,   S1,  ONE, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT1_RGBA     , DXT1    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT3_RGBA     , DXT3    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	_(DXT5_RGBA     , DXT5    ,   S1,   S1,   S1,   S1, X, Y, Z, W, 0, 0, 0, 0),
+	{},
+};
+
+/*
+ * Find the translation-table row for a gallium format.
+ *
+ * Walks nv40_texture_formats until the terminating (undefined) row.  Returns
+ * the matching row, or NULL after logging the format name when none matches.
+ */
+static struct nv40_texture_format *
+nv40_fragtex_format(uint pipe_format)
+{
+	struct nv40_texture_format *entry;
+
+	for (entry = nv40_texture_formats; entry->defined; entry++) {
+		if (entry->pipe == pipe_format)
+			return entry;
+	}
+
+	NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format));
+	return NULL;
+}
+
+
+/*
+ * Build the state object that programs fragment texture unit "unit" from the
+ * sampler state and miptree currently bound to that unit in nvfx.
+ *
+ * Returns a new state object, or NULL when the texture's format has no entry
+ * in the translation table or its target is not one of 1D/2D/3D/CUBE.
+ */
+struct nouveau_stateobj *
+nv40_fragtex_build(struct nvfx_context *nvfx, int unit)
+{
+	struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit];
+	struct nvfx_miptree *nv40mt = nvfx->tex_miptree[unit];
+	struct nouveau_bo *bo = nouveau_bo(nv40mt->buffer);
+	struct pipe_texture *pt = &nv40mt->base;
+	struct nv40_texture_format *tf;
+	struct nouveau_stateobj *so;
+	uint32_t txf, txs, txp;
+	unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+
+	tf = nv40_fragtex_format(pt->format);
+	if (!tf) {
+		/* The lookup already logged the format.  Still trap in debug
+		 * builds, but do not fall through and dereference NULL when
+		 * assertions are compiled out; fail like the unknown-target
+		 * case below instead. */
+		assert(0);
+		return NULL;
+	}
+
+	txf  = ps->fmt;
+	txf |= tf->format | 0x8000;
+	txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT);
+
+	if (1) /* XXX */
+		txf |= NV34TCL_TX_FORMAT_NO_BORDER;
+
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		txf |= NV34TCL_TX_FORMAT_CUBIC;
+		/* fall-through */
+	case PIPE_TEXTURE_2D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_2D;
+		break;
+	case PIPE_TEXTURE_3D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_3D;
+		break;
+	case PIPE_TEXTURE_1D:
+		txf |= NV34TCL_TX_FORMAT_DIMS_1D;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown target %d\n", pt->target);
+		return NULL;
+	}
+
+	/* Only textures flagged as linear carry a pitch and the LINEAR bit. */
+	if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+		txp = 0;
+	} else {
+		txp  = nv40mt->level[0].pitch;
+		txf |= NV40TCL_TEX_FORMAT_LINEAR;
+	}
+
+	txs = tf->swizzle;
+
+	/* First method: 8 words starting at TX_OFFSET(unit), two of which are
+	 * relocations against the texture's buffer object. */
+	so = so_new(2, 9, 2);
+	so_method(so, nvfx->screen->eng3d, NV34TCL_TX_OFFSET(unit), 8);
+	so_reloc (so, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0);
+	so_reloc (so, bo, txf, tex_flags | NOUVEAU_BO_OR,
+		      NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1);
+	so_data  (so, ps->wrap);
+	so_data  (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en);
+	so_data  (so, txs);
+	so_data  (so, ps->filt | tf->sign | 0x2000 /*voodoo*/);
+	so_data  (so, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) |
+		       pt->height0);
+	so_data  (so, ps->bcol);
+	/* Second method: depth and (for linear textures) pitch. */
+	so_method(so, nvfx->screen->eng3d, NV40TCL_TEX_SIZE1(unit), 1);
+	so_data  (so, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp);
+
+	return so;
+}
diff --git a/src/gallium/drivers/nvfx/nv40_vertprog.h b/src/gallium/drivers/nvfx/nv40_vertprog.h
new file mode 100644
index 00000000000..7337293babc
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nv40_vertprog.h
@@ -0,0 +1,177 @@
+#ifndef __NV40_SHADER_H__
+#define __NV40_SHADER_H__
+
+/* Vertex programs instruction set
+ *
+ * The NV40 instruction set is very similar to NV30.  Most fields are in
+ * a slightly different position in the instruction however.
+ *
+ * Merged instructions
+ *     In some cases it is possible to put two instructions into one opcode
+ *     slot.  The rules for when this is OK are not entirely clear to me yet.
+ *
+ *     There are separate writemasks and dest temp register fields for each
+ *     grouping of instructions.  There is however only one field with the
+ *     ID of a result register.  Writing to temp/result regs is selected by
+ *     setting VEC_RESULT/SCA_RESULT.
+ *
+ * Temporary registers
+ *     The source/dest temp register fields have been extended by 1 bit, to
+ *     give a total of 32 temporary registers.
+ *
+ * Relative Addressing
+ *     NV40 can use an address register to index into vertex attribute regs.
+ *     This is done by putting the offset value into INPUT_SRC and setting
+ *     the INDEX_INPUT flag.
+ *
+ * Conditional execution (see NV_vertex_program{2,3} for details)
+ *     There is a second condition code register on NV40; its use is enabled
+ *     by setting the COND_REG_SELECT_1 flag.
+ *
+ * Texture lookup
+ *     TODO
+ */
+
+/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */
+#define NV40_VP_INST_VEC_RESULT                                        (1 << 30)
+/* uncertain.. */
+#define NV40_VP_INST_COND_UPDATE_ENABLE                        ((1 << 14)|1<<29)
+/* use address reg as index into attribs */
+#define NV40_VP_INST_INDEX_INPUT                                       (1 << 27)
+#define NV40_VP_INST_COND_REG_SELECT_1                                 (1 << 25)
+#define NV40_VP_INST_ADDR_REG_SELECT_1                                 (1 << 24)
+#define NV40_VP_INST_SRC2_ABS                                          (1 << 23)
+#define NV40_VP_INST_SRC1_ABS                                          (1 << 22)
+#define NV40_VP_INST_SRC0_ABS                                          (1 << 21)
+#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT                                      15
+#define NV40_VP_INST_VEC_DEST_TEMP_MASK                             (0x1F << 15)
+#define NV40_VP_INST_COND_TEST_ENABLE                                  (1 << 13)
+#define NV40_VP_INST_COND_SHIFT                                               10
+#define NV40_VP_INST_COND_MASK                                       (0x7 << 10)
+#define NV40_VP_INST_COND_SWZ_X_SHIFT                                          8
+#define NV40_VP_INST_COND_SWZ_X_MASK                                    (3 << 8)
+#define NV40_VP_INST_COND_SWZ_Y_SHIFT                                          6
+#define NV40_VP_INST_COND_SWZ_Y_MASK                                    (3 << 6)
+#define NV40_VP_INST_COND_SWZ_Z_SHIFT                                          4
+#define NV40_VP_INST_COND_SWZ_Z_MASK                                    (3 << 4)
+#define NV40_VP_INST_COND_SWZ_W_SHIFT                                          2
+#define NV40_VP_INST_COND_SWZ_W_MASK                                    (3 << 2)
+#define NV40_VP_INST_COND_SWZ_ALL_SHIFT                                        2
+#define NV40_VP_INST_COND_SWZ_ALL_MASK                               (0xFF << 2)
+#define NV40_VP_INST_ADDR_SWZ_SHIFT                                            0
+#define NV40_VP_INST_ADDR_SWZ_MASK                                   (0x03 << 0)
+#define NV40_VP_INST0_KNOWN ( \
+                NV40_VP_INST_INDEX_INPUT | \
+                NV40_VP_INST_COND_REG_SELECT_1 | \
+                NV40_VP_INST_ADDR_REG_SELECT_1 | \
+                NV40_VP_INST_SRC2_ABS | \
+                NV40_VP_INST_SRC1_ABS | \
+                NV40_VP_INST_SRC0_ABS | \
+                NV40_VP_INST_VEC_DEST_TEMP_MASK | \
+                NV40_VP_INST_COND_TEST_ENABLE | \
+                NV40_VP_INST_COND_MASK | \
+                NV40_VP_INST_COND_SWZ_ALL_MASK | \
+                NV40_VP_INST_ADDR_SWZ_MASK)
+
+/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */
+#define NV40_VP_INST_VEC_OPCODE_SHIFT                                         22
+#define NV40_VP_INST_VEC_OPCODE_MASK                                (0x1F << 22)
+#define NV40_VP_INST_SCA_OPCODE_SHIFT                                         27
+#define NV40_VP_INST_SCA_OPCODE_MASK                                (0x1F << 27)
+#define NV40_VP_INST_CONST_SRC_SHIFT                                          12
+#define NV40_VP_INST_CONST_SRC_MASK                                 (0xFF << 12)
+#define NV40_VP_INST_INPUT_SRC_SHIFT                                           8
+#define NV40_VP_INST_INPUT_SRC_MASK                                  (0x0F << 8)
+#define NV40_VP_INST_SRC0H_SHIFT                                               0
+#define NV40_VP_INST_SRC0H_MASK                                      (0xFF << 0)
+#define NV40_VP_INST1_KNOWN ( \
+                NV40_VP_INST_VEC_OPCODE_MASK | \
+                NV40_VP_INST_SCA_OPCODE_MASK | \
+                NV40_VP_INST_CONST_SRC_MASK  | \
+                NV40_VP_INST_INPUT_SRC_MASK  | \
+                NV40_VP_INST_SRC0H_MASK \
+                )
+
+/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */
+#define NV40_VP_INST_SRC0L_SHIFT                                              23
+#define NV40_VP_INST_SRC0L_MASK                                    (0x1FF << 23)
+#define NV40_VP_INST_SRC1_SHIFT                                                6
+#define NV40_VP_INST_SRC1_MASK                                    (0x1FFFF << 6)
+#define NV40_VP_INST_SRC2H_SHIFT                                               0
+#define NV40_VP_INST_SRC2H_MASK                                      (0x3F << 0)
+#define NV40_VP_INST_IADDRH_SHIFT                                              0
+#define NV40_VP_INST_IADDRH_MASK                                     (0x1F << 0)
+
+/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */
+#define NV40_VP_INST_IADDRL_SHIFT                                             29
+#define NV40_VP_INST_IADDRL_MASK                                       (7 << 29)
+#define NV40_VP_INST_SRC2L_SHIFT                                              21
+#define NV40_VP_INST_SRC2L_MASK                                    (0x7FF << 21)
+#define NV40_VP_INST_SCA_WRITEMASK_SHIFT                                      17
+#define NV40_VP_INST_SCA_WRITEMASK_MASK                              (0xF << 17)
+#    define NV40_VP_INST_SCA_WRITEMASK_X                               (1 << 20)
+#    define NV40_VP_INST_SCA_WRITEMASK_Y                               (1 << 19)
+#    define NV40_VP_INST_SCA_WRITEMASK_Z                               (1 << 18)
+#    define NV40_VP_INST_SCA_WRITEMASK_W                               (1 << 17)
+#define NV40_VP_INST_VEC_WRITEMASK_SHIFT                                      13
+#define NV40_VP_INST_VEC_WRITEMASK_MASK                              (0xF << 13)
+#    define NV40_VP_INST_VEC_WRITEMASK_X                               (1 << 16)
+#    define NV40_VP_INST_VEC_WRITEMASK_Y                               (1 << 15)
+#    define NV40_VP_INST_VEC_WRITEMASK_Z                               (1 << 14)
+#    define NV40_VP_INST_VEC_WRITEMASK_W                               (1 << 13)
+#define NV40_VP_INST_SCA_RESULT                                        (1 << 12)
+#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT                                       7
+#define NV40_VP_INST_SCA_DEST_TEMP_MASK                              (0x1F << 7)
+#define NV40_VP_INST_DEST_SHIFT                                                2
+#define NV40_VP_INST_DEST_MASK                                         (31 << 2)
+#    define NV40_VP_INST_DEST_POS                                              0
+#    define NV40_VP_INST_DEST_COL0                                             1
+#    define NV40_VP_INST_DEST_COL1                                             2
+#    define NV40_VP_INST_DEST_BFC0                                             3
+#    define NV40_VP_INST_DEST_BFC1                                             4
+#    define NV40_VP_INST_DEST_FOGC                                             5
+#    define NV40_VP_INST_DEST_PSZ                                              6
+#    define NV40_VP_INST_DEST_TC0                                              7
+/* DEST value for TC n: 7 + n (so TC(0) equals NV40_VP_INST_DEST_TC0).  The
+ * argument is parenthesized so expression arguments expand correctly. */
+#    define NV40_VP_INST_DEST_TC(n)                                      (7+(n))
+#    define NV40_VP_INST_DEST_TEMP                                          0x1F
+#define NV40_VP_INST_INDEX_CONST                                        (1 << 1)
+#define NV40_VP_INST3_KNOWN ( \
+                NV40_VP_INST_SRC2L_MASK |\
+                NV40_VP_INST_SCA_WRITEMASK_MASK |\
+                NV40_VP_INST_VEC_WRITEMASK_MASK |\
+                NV40_VP_INST_SCA_DEST_TEMP_MASK |\
+                NV40_VP_INST_DEST_MASK |\
+                NV40_VP_INST_INDEX_CONST)
+
+/* Useful to split the source selection regs into their pieces */
+#define NV40_VP_SRC0_HIGH_SHIFT                                                9
+#define NV40_VP_SRC0_HIGH_MASK                                        0x0001FE00
+#define NV40_VP_SRC0_LOW_MASK                                         0x000001FF
+#define NV40_VP_SRC2_HIGH_SHIFT                                               11
+#define NV40_VP_SRC2_HIGH_MASK                                        0x0001F800
+#define NV40_VP_SRC2_LOW_MASK                                         0x000007FF
+
+/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */
+#define NV40_VP_SRC_NEGATE                                             (1 << 16)
+#define NV40_VP_SRC_SWZ_X_SHIFT                                               14
+#define NV40_VP_SRC_SWZ_X_MASK                                         (3 << 14)
+#define NV40_VP_SRC_SWZ_Y_SHIFT                                               12
+#define NV40_VP_SRC_SWZ_Y_MASK                                         (3 << 12)
+#define NV40_VP_SRC_SWZ_Z_SHIFT                                               10
+#define NV40_VP_SRC_SWZ_Z_MASK                                         (3 << 10)
+#define NV40_VP_SRC_SWZ_W_SHIFT                                                8
+#define NV40_VP_SRC_SWZ_W_MASK                                          (3 << 8)
+#define NV40_VP_SRC_SWZ_ALL_SHIFT                                              8
+#define NV40_VP_SRC_SWZ_ALL_MASK                                     (0xFF << 8)
+#define NV40_VP_SRC_TEMP_SRC_SHIFT                                             2
+#define NV40_VP_SRC_TEMP_SRC_MASK                                    (0x1F << 2)
+#define NV40_VP_SRC_REG_TYPE_SHIFT                                             0
+#define NV40_VP_SRC_REG_TYPE_MASK                                       (3 << 0)
+#    define NV40_VP_SRC_REG_TYPE_UNK0                                          0
+#    define NV40_VP_SRC_REG_TYPE_TEMP                                          1
+#    define NV40_VP_SRC_REG_TYPE_INPUT                                         2
+#    define NV40_VP_SRC_REG_TYPE_CONST                                         3
+
+#include "nvfx_shader.h"
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_clear.c b/src/gallium/drivers/nvfx/nvfx_clear.c
new file mode 100644
index 00000000000..2be70fcee40
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_clear.c
@@ -0,0 +1,14 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_clear.h"
+
+#include "nvfx_context.h"
+
+/*
+ * Context clear entry point: hands the request to util_clear() together with
+ * the framebuffer state stored in the nvfx context.
+ */
+void
+nvfx_clear(struct pipe_context *pipe, unsigned buffers,
+           const float *rgba, double depth, unsigned stencil)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+
+	util_clear(pipe, &nvfx->framebuffer, buffers, rgba, depth, stencil);
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c
new file mode 100644
index 00000000000..fc3cbdb558f
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_context.c
@@ -0,0 +1,90 @@
+#include "draw/draw_context.h"
+#include "pipe/p_defines.h"
+
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+
+/*
+ * Context flush entry point.
+ *
+ * When PIPE_FLUSH_TEXTURE_CACHE is requested, method 0x1fd8 is written twice
+ * (value 2, then value 1) before the channel is fired.  No fence object is
+ * produced: if the caller passed a fence pointer it is set to NULL.
+ */
+static void
+nvfx_flush(struct pipe_context *pipe, unsigned flags,
+	   struct pipe_fence_handle **fence)
+{
+	struct nvfx_screen *screen = nvfx_context(pipe)->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+
+	if ((flags & PIPE_FLUSH_TEXTURE_CACHE) != 0) {
+		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, eng3d, 0x1fd8, 1);
+		OUT_RING  (chan, 1);
+	}
+
+	FIRE_RING(chan);
+
+	if (fence != NULL)
+		*fence = NULL;
+}
+
+/*
+ * Context destroy entry point: drops every hardware state object still
+ * referenced from state.hw[], destroys the draw module if one exists, and
+ * frees the context itself.
+ */
+static void
+nvfx_destroy(struct pipe_context *pipe)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nouveau_stateobj **hw = nvfx->state.hw;
+	unsigned idx = 0;
+
+	while (idx < NVFX_STATE_MAX) {
+		if (hw[idx] != NULL)
+			so_ref(NULL, &hw[idx]);
+		idx++;
+	}
+
+	if (nvfx->draw != NULL)
+		draw_destroy(nvfx->draw);
+
+	FREE(nvfx);
+}
+
+/*
+ * Create an nvfx context on the given screen.
+ *
+ * pscreen: screen the context belongs to.
+ * priv:    opaque pointer stored in pipe.priv.
+ *
+ * Installs the context entry points, registers the context as the channel's
+ * user_private / flush_notify target, and sets up the software TNL fallback
+ * draw module.  Returns the embedded pipe_context, or NULL if the context or
+ * its draw module cannot be allocated.
+ */
+struct pipe_context *
+nvfx_create(struct pipe_screen *pscreen, void *priv)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	struct pipe_winsys *ws = pscreen->winsys;
+	struct nvfx_context *nvfx;
+	struct nouveau_winsys *nvws = screen->nvws;
+
+	nvfx = CALLOC(1, sizeof(struct nvfx_context));
+	if (!nvfx)
+		return NULL;
+
+	/* Allocate the draw module before anything points at this context
+	 * (in particular the channel's user_private), so that a failure can
+	 * be unwound by simply freeing the context instead of passing a NULL
+	 * draw context to the draw_* configuration calls below. */
+	nvfx->draw = draw_create();
+	if (!nvfx->draw) {
+		FREE(nvfx);
+		return NULL;
+	}
+
+	nvfx->screen = screen;
+
+	nvfx->nvws = nvws;
+
+	nvfx->pipe.winsys = ws;
+	nvfx->pipe.screen = pscreen;
+	nvfx->pipe.priv = priv;
+	nvfx->pipe.destroy = nvfx_destroy;
+	nvfx->pipe.draw_arrays = nvfx_draw_arrays;
+	nvfx->pipe.draw_elements = nvfx_draw_elements;
+	nvfx->pipe.clear = nvfx_clear;
+	nvfx->pipe.flush = nvfx_flush;
+
+	nvfx->pipe.is_texture_referenced = nouveau_is_texture_referenced;
+	nvfx->pipe.is_buffer_referenced = nouveau_is_buffer_referenced;
+
+	screen->base.channel->user_private = nvfx;
+	screen->base.channel->flush_notify = nvfx_state_flush_notify;
+
+	nvfx->is_nv4x = screen->is_nv4x;
+
+	nvfx_init_query_functions(nvfx);
+	nvfx_init_surface_functions(nvfx);
+	nvfx_init_state_functions(nvfx);
+	nvfx_init_transfer_functions(nvfx);
+
+	/* Configure and install fallback swtnl path */
+	draw_wide_point_threshold(nvfx->draw, 9999999.0);
+	draw_wide_line_threshold(nvfx->draw, 9999999.0);
+	draw_enable_line_stipple(nvfx->draw, FALSE);
+	draw_enable_point_sprites(nvfx->draw, FALSE);
+	draw_set_rasterize_stage(nvfx->draw, nvfx_draw_render_stage(nvfx));
+
+	return &nvfx->pipe;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h
new file mode 100644
index 00000000000..001b19eedf0
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_context.h
@@ -0,0 +1,265 @@
+#ifndef __NVFX_CONTEXT_H__
+#define __NVFX_CONTEXT_H__
+
+#include <stdio.h>
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "pipe/p_compiler.h"
+
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_vertex.h"
+
+#include "nouveau/nouveau_winsys.h"
+#include "nouveau/nouveau_gldefs.h"
+#include "nouveau/nouveau_context.h"
+#include "nouveau/nouveau_stateobj.h"
+
+#include "nvfx_state.h"
+
+/*
+ * Diagnostics printed to stderr.  NOUVEAU_ERR prefixes the message with the
+ * calling function name and line number, NOUVEAU_MSG with "nouveau: ".
+ * fmt must be a string literal because it is concatenated onto the prefix.
+ * NOTE(review): both expansions already end in ';', so an unbraced
+ * "if (x) NOUVEAU_ERR(...); else ..." would not compile.
+ */
+#define NOUVEAU_ERR(fmt, args...) \
+	fprintf(stderr, "%s:%d -  "fmt, __func__, __LINE__, ##args);
+#define NOUVEAU_MSG(fmt, args...) \
+	fprintf(stderr, "nouveau: "fmt, ##args);
+
+/*
+ * Slot numbers for the hardware state objects kept in nvfx_state::hw[].
+ * NVFX_STATE_MAX is the number of slots and sizes that array.
+ */
+enum nvfx_state_index {
+	NVFX_STATE_FB = 0,
+	NVFX_STATE_VIEWPORT = 1,
+	NVFX_STATE_BLEND = 2,
+	NVFX_STATE_RAST = 3,
+	NVFX_STATE_ZSA = 4,
+	NVFX_STATE_BCOL = 5,
+	NVFX_STATE_CLIP = 6,
+	NVFX_STATE_SCISSOR = 7,
+	NVFX_STATE_STIPPLE = 8,
+	NVFX_STATE_FRAGPROG = 9,
+	NVFX_STATE_VERTPROG = 10,
+	NVFX_STATE_FRAGTEX0 = 11,
+	NVFX_STATE_FRAGTEX1 = 12,
+	NVFX_STATE_FRAGTEX2 = 13,
+	NVFX_STATE_FRAGTEX3 = 14,
+	NVFX_STATE_FRAGTEX4 = 15,
+	NVFX_STATE_FRAGTEX5 = 16,
+	NVFX_STATE_FRAGTEX6 = 17,
+	NVFX_STATE_FRAGTEX7 = 18,
+	NVFX_STATE_FRAGTEX8 = 19,
+	NVFX_STATE_FRAGTEX9 = 20,
+	NVFX_STATE_FRAGTEX10 = 21,
+	NVFX_STATE_FRAGTEX11 = 22,
+	NVFX_STATE_FRAGTEX12 = 23,
+	NVFX_STATE_FRAGTEX13 = 24,
+	NVFX_STATE_FRAGTEX14 = 25,
+	NVFX_STATE_FRAGTEX15 = 26,
+	NVFX_STATE_VERTTEX0 = 27,
+	NVFX_STATE_VERTTEX1 = 28,
+	NVFX_STATE_VERTTEX2 = 29,
+	NVFX_STATE_VERTTEX3 = 30,
+	NVFX_STATE_VTXBUF = 31,
+	NVFX_STATE_VTXFMT = 32,
+	NVFX_STATE_VTXATTR = 33,
+	NVFX_STATE_SR = 34,
+	NVFX_STATE_MAX = 35
+};
+
+#include "nvfx_screen.h"
+
+#define NVFX_NEW_BLEND		(1 <<  0)
+#define NVFX_NEW_RAST		(1 <<  1)
+#define NVFX_NEW_ZSA		(1 <<  2)
+#define NVFX_NEW_SAMPLER	(1 <<  3)
+#define NVFX_NEW_FB		(1 <<  4)
+#define NVFX_NEW_STIPPLE	(1 <<  5)
+#define NVFX_NEW_SCISSOR	(1 <<  6)
+#define NVFX_NEW_VIEWPORT	(1 <<  7)
+#define NVFX_NEW_BCOL		(1 <<  8)
+#define NVFX_NEW_VERTPROG	(1 <<  9)
+#define NVFX_NEW_FRAGPROG	(1 << 10)
+#define NVFX_NEW_ARRAYS		(1 << 11)
+#define NVFX_NEW_UCP		(1 << 12)
+#define NVFX_NEW_SR		(1 << 13)
+
+/* Rasterizer state: a copy of the gallium state plus a state object pointer. */
+struct nvfx_rasterizer_state {
+	struct pipe_rasterizer_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+/* Depth/stencil/alpha state: the gallium state plus a state object pointer. */
+struct nvfx_zsa_state {
+	struct pipe_depth_stencil_alpha_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+/* Blend state: the gallium state plus a state object pointer. */
+struct nvfx_blend_state {
+	struct pipe_blend_state pipe;
+	struct nouveau_stateobj *so;
+};
+
+
+/*
+ * Hardware-side state tracking.  hw[] holds one state object pointer per
+ * nvfx_state_index slot.
+ * NOTE(review): dirty is presumably a bitmask over those slots (64 bits for
+ * NVFX_STATE_MAX == 35 entries) -- confirm against nvfx_state_emit.c.
+ */
+struct nvfx_state {
+	unsigned scissor_enabled;
+	unsigned stipple_enabled;
+	unsigned fp_samplers;
+
+	uint64_t dirty;
+	struct nouveau_stateobj *hw[NVFX_STATE_MAX];
+};
+
+/* Vertex element state: up to 16 gallium vertex elements and their count. */
+struct nvfx_vtxelt_state {
+	struct pipe_vertex_element pipe[16];
+	unsigned num_elements;
+};
+
+/*
+ * Per-context driver state.
+ *
+ * "pipe" must remain the first member: nvfx_context() converts a
+ * struct pipe_context pointer to a struct nvfx_context pointer with a plain
+ * cast.
+ */
+struct nvfx_context {
+	struct pipe_context pipe;
+
+	struct nouveau_winsys *nvws;
+	struct nvfx_screen *screen;
+
+	unsigned is_nv4x; /* either 0 or ~0 */
+
+	/* Draw module used for the software TNL fallback path */
+	struct draw_context *draw;
+
+	/* HW state derived from pipe states */
+	struct nvfx_state state;
+	struct {
+		struct nvfx_vertex_program *vertprog;
+
+		unsigned nr_attribs;
+		unsigned hw[PIPE_MAX_SHADER_INPUTS];
+		unsigned draw[PIPE_MAX_SHADER_INPUTS];
+		unsigned emit[PIPE_MAX_SHADER_INPUTS];
+	} swtnl;
+
+	enum {
+		HW, SWTNL, SWRAST
+	} render_mode;
+	unsigned fallback_swtnl;
+	unsigned fallback_swrast;
+
+	/* Context state */
+	unsigned dirty, draw_dirty;
+	struct pipe_scissor_state scissor;
+	unsigned stipple[32];
+	struct pipe_clip_state clip;
+	struct nvfx_vertex_program *vertprog;
+	struct nvfx_fragment_program *fragprog;
+	struct pipe_buffer *constbuf[PIPE_SHADER_TYPES];
+	unsigned constbuf_nr[PIPE_SHADER_TYPES];
+	struct nvfx_rasterizer_state *rasterizer;
+	struct nvfx_zsa_state *zsa;
+	struct nvfx_blend_state *blend;
+	struct pipe_blend_color blend_colour;
+	struct pipe_stencil_ref stencil_ref;
+	struct pipe_viewport_state viewport;
+	struct pipe_framebuffer_state framebuffer;
+	struct pipe_buffer *idxbuf;
+	unsigned idxbuf_format;
+	/* Per-unit sampler state and miptree, indexed by texture unit */
+	struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS];
+	struct nvfx_miptree *tex_miptree[PIPE_MAX_SAMPLERS];
+	struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS];
+	unsigned nr_samplers;
+	unsigned nr_textures;
+	unsigned dirty_samplers;
+	struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS];
+	unsigned vtxbuf_nr;
+	struct nvfx_vtxelt_state *vtxelt;
+};
+
+/* Downcast a gallium context pointer to the nvfx context that embeds it. */
+static INLINE struct nvfx_context *
+nvfx_context(struct pipe_context *pipe)
+{
+	struct nvfx_context *nvfx = (struct nvfx_context *)pipe;
+
+	return nvfx;
+}
+
+/*
+ * Descriptor for one piece of derived state: a validate callback plus two
+ * "dirty" values.
+ * NOTE(review): dirty.pipe is presumably a mask of NVFX_NEW_* flags that
+ * trigger validate(), and dirty.hw the nvfx_state_index slot it refreshes --
+ * confirm against the users in nvfx_state_emit.c.
+ */
+struct nvfx_state_entry {
+	boolean (*validate)(struct nvfx_context *nvfx);
+	struct {
+		unsigned pipe;
+		unsigned hw;
+	} dirty;
+};
+
+extern struct nvfx_state_entry nvfx_state_blend;
+extern struct nvfx_state_entry nvfx_state_blend_colour;
+extern struct nvfx_state_entry nvfx_state_fragprog;
+extern struct nvfx_state_entry nvfx_state_fragtex;
+extern struct nvfx_state_entry nvfx_state_framebuffer;
+extern struct nvfx_state_entry nvfx_state_rasterizer;
+extern struct nvfx_state_entry nvfx_state_scissor;
+extern struct nvfx_state_entry nvfx_state_sr;
+extern struct nvfx_state_entry nvfx_state_stipple;
+extern struct nvfx_state_entry nvfx_state_vbo;
+extern struct nvfx_state_entry nvfx_state_vertprog;
+extern struct nvfx_state_entry nvfx_state_viewport;
+extern struct nvfx_state_entry nvfx_state_vtxfmt;
+extern struct nvfx_state_entry nvfx_state_zsa;
+
+extern void nvfx_init_query_functions(struct nvfx_context *nvfx);
+extern void nvfx_init_surface_functions(struct nvfx_context *nvfx);
+
+/* nvfx_context.c */
+struct pipe_context *
+nvfx_create(struct pipe_screen *pscreen, void *priv);
+
+/* nvfx_clear.c */
+extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers,
+		       const float *rgba, double depth, unsigned stencil);
+
+/* nvfx_draw.c */
+extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx);
+extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe,
+					struct pipe_buffer *idxbuf,
+					unsigned ib_size, unsigned mode,
+					unsigned start, unsigned count);
+
+/* nvfx_fragprog.c */
+extern void nvfx_fragprog_destroy(struct nvfx_context *,
+				    struct nvfx_fragment_program *);
+
+/* nv30_fragtex.c */
+extern void
+nv30_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso);
+extern void nv30_fragtex_bind(struct nvfx_context *);
+extern struct nouveau_stateobj *
+nv30_fragtex_build(struct nvfx_context *nvfx, int unit);
+
+/* nv40_fragtex.c */
+extern void
+nv40_sampler_state_init(struct pipe_context *pipe,
+			  struct nvfx_sampler_state *ps,
+			  const struct pipe_sampler_state *cso);
+extern void nv40_fragtex_bind(struct nvfx_context *);
+extern struct nouveau_stateobj *
+nv40_fragtex_build(struct nvfx_context *nvfx, int unit);
+
+/* nvfx_state.c */
+extern void nvfx_init_state_functions(struct nvfx_context *nvfx);
+
+/* nvfx_state_emit.c */
+extern void nvfx_state_flush_notify(struct nouveau_channel *chan);
+extern boolean nvfx_state_validate(struct nvfx_context *nvfx);
+extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx);
+extern void nvfx_state_emit(struct nvfx_context *nvfx);
+
+/* nvfx_transfer.c */
+extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx);
+
+/* nvfx_vbo.c */
+extern void nvfx_draw_arrays(struct pipe_context *, unsigned mode,
+				unsigned start, unsigned count);
+extern void nvfx_draw_elements(struct pipe_context *pipe,
+				  struct pipe_buffer *indexBuffer,
+				  unsigned indexSize,
+				  unsigned mode, unsigned start,
+				  unsigned count);
+
+/* nvfx_vertprog.c */
+extern void nvfx_vertprog_destroy(struct nvfx_context *,
+				  struct nvfx_vertex_program *);
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c
new file mode 100644
index 00000000000..5379b29efd1
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_draw.c
@@ -0,0 +1,350 @@
+#include "pipe/p_shader_tokens.h"
+#include "util/u_inlines.h"
+#include "tgsi/tgsi_ureg.h"
+
+#include "util/u_pack_color.h"
+
+#include "draw/draw_context.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pipe.h"
+
+#include "nvfx_context.h"
+#include "nv30_vertprog.h"
+#include "nv40_vertprog.h"
+
+/* Simple, but crappy, swtnl path; hopefully we won't need to hit this very
+ * often at all.  Uses "quadro style" vertex submission + a fixed vertex
+ * layout to avoid the need to generate a vertex program or vtxfmt.
+ */
+
+/* Final draw-module pipeline stage that submits primitives to the hardware.
+ * `stage` must stay the first member: nvfx_render_stage() casts a
+ * draw_stage pointer straight to this type.
+ */
+struct nvfx_render_stage {
+	struct draw_stage stage;
+	struct nvfx_context *nvfx;
+	/* Primitive mode currently open in the push buffer; compared against
+	 * NV34TCL_VERTEX_BEGIN_END_STOP to tell whether a BEGIN is pending. */
+	unsigned prim;
+};
+
+/* Downcast a generic draw_stage to the nvfx render stage that embeds it. */
+static INLINE struct nvfx_render_stage *
+nvfx_render_stage(struct draw_stage *stage)
+{
+	struct nvfx_render_stage *rs = (struct nvfx_render_stage *)stage;
+
+	return rs;
+}
+
+/* Push one vertex to the hardware as immediate-mode attribute methods,
+ * following the per-attribute layout recorded in nvfx->swtnl (draw-module
+ * output slot, hardware attribute index and emit format).
+ */
+static INLINE void
+nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v)
+{
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	unsigned a;
+
+	for (a = 0; a < nvfx->swtnl.nr_attribs; a++) {
+		const unsigned idx = nvfx->swtnl.draw[a];
+		const unsigned hw = nvfx->swtnl.hw[a];
+		const float *attr = v->data[idx];
+
+		switch (nvfx->swtnl.emit[a]) {
+		case EMIT_OMIT:
+			break;
+		case EMIT_1F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1);
+			OUT_RING  (chan, fui(attr[0]));
+			break;
+		case EMIT_2F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2);
+			OUT_RING  (chan, fui(attr[0]));
+			OUT_RING  (chan, fui(attr[1]));
+			break;
+		case EMIT_3F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3);
+			OUT_RING  (chan, fui(attr[0]));
+			OUT_RING  (chan, fui(attr[1]));
+			OUT_RING  (chan, fui(attr[2]));
+			break;
+		case EMIT_4F:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(attr[0]));
+			OUT_RING  (chan, fui(attr[1]));
+			OUT_RING  (chan, fui(attr[2]));
+			OUT_RING  (chan, fui(attr[3]));
+			break;
+		case 0xff:
+			/* Special format used for the position attribute:
+			 * divide x/y/z by w and send 1/w in the last slot. */
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4);
+			OUT_RING  (chan, fui(attr[0] / attr[3]));
+			OUT_RING  (chan, fui(attr[1] / attr[3]));
+			OUT_RING  (chan, fui(attr[2] / attr[3]));
+			OUT_RING  (chan, fui(1.0f / attr[3]));
+			break;
+		case EMIT_4UB:
+			BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4UB(hw), 1);
+			OUT_RING  (chan, pack_ub4(float_to_ubyte(attr[0]),
+					    float_to_ubyte(attr[1]),
+					    float_to_ubyte(attr[2]),
+					    float_to_ubyte(attr[3])));
+			break;
+		default:
+			assert(0);
+			break;
+		}
+	}
+}
+
+/* Emit `count` vertices of `prim` in primitive mode `mode`, opening or
+ * switching the hardware BEGIN/END state as required and making sure the
+ * push buffer has room before anything is written.
+ */
+static INLINE void
+nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
+	       unsigned mode, unsigned count)
+{
+	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
+	struct nvfx_context *nvfx = rs->nvfx;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	/* Room for 4xfloat32 per attribute + potentially 3 begin/end */
+	const unsigned reserve = (count * 20) + 6;
+	unsigned n;
+
+	if (AVAIL_RING(chan) < reserve) {
+		/* A primitive must not be left open across a flush. */
+		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+			NOUVEAU_ERR("AIII, missed flush\n");
+			assert(0);
+		}
+		FIRE_RING(chan);
+		nvfx_state_emit(nvfx);
+	}
+
+	/* Close the open primitive (if any) and begin the requested one. */
+	if (rs->prim != mode) {
+		if (rs->prim != NV34TCL_VERTEX_BEGIN_END_STOP) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+			OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, mode);
+		rs->prim = mode;
+	}
+
+	for (n = 0; n < count; n++)
+		nvfx_render_vertex(nvfx, prim->v[n]);
+
+	/* If another primitive of this size would not fit, end the current
+	 * one now so the next call is free to flush.
+	 */
+	if (AVAIL_RING(chan) < reserve) {
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+		rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+	}
+}
+
+/* draw_stage::point hook: submit a single-vertex point primitive. */
+static void
+nvfx_render_point(struct draw_stage *draw, struct prim_header *prim)
+{
+	const unsigned nr_verts = 1;
+
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_POINTS, nr_verts);
+}
+
+/* draw_stage::line hook: submit a two-vertex line primitive. */
+static void
+nvfx_render_line(struct draw_stage *draw, struct prim_header *prim)
+{
+	const unsigned nr_verts = 2;
+
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_LINES, nr_verts);
+}
+
+/* draw_stage::tri hook: submit a three-vertex triangle primitive. */
+static void
+nvfx_render_tri(struct draw_stage *draw, struct prim_header *prim)
+{
+	const unsigned nr_verts = 3;
+
+	nvfx_render_prim(draw, prim, NV34TCL_VERTEX_BEGIN_END_TRIANGLES, nr_verts);
+}
+
+/* draw_stage::flush hook: close any primitive still open in the push
+ * buffer.  `flags` is not used.
+ */
+static void
+nvfx_render_flush(struct draw_stage *draw, unsigned flags)
+{
+	struct nvfx_render_stage *rs = nvfx_render_stage(draw);
+	struct nouveau_channel *chan = rs->nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = rs->nvfx->screen->eng3d;
+
+	/* Nothing open, nothing to end. */
+	if (rs->prim == NV34TCL_VERTEX_BEGIN_END_STOP)
+		return;
+
+	BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+	OUT_RING  (chan, NV34TCL_VERTEX_BEGIN_END_STOP);
+	rs->prim = NV34TCL_VERTEX_BEGIN_END_STOP;
+}
+
+/* draw_stage::reset_stipple_counter hook: intentionally a no-op here. */
+static void
+nvfx_render_reset_stipple_counter(struct draw_stage *draw)
+{
+}
+
+/* draw_stage::destroy hook: release the stage allocated by
+ * nvfx_draw_render_stage().
+ */
+static void
+nvfx_render_destroy(struct draw_stage *draw)
+{
+	FREE(draw);
+}
+
+/* Build the pass-through vertex program used by the swtnl path: each
+ * output is a plain MOV from a fixed vertex-shader input slot (position
+ * from input 0, colours/back colours from 3 and 4, fog.x from 5, generics
+ * 0..7 from 8..15).  The input numbers match the hw attribute indices that
+ * nvfx_state_vtxfmt_validate() assigns.
+ *
+ * Returns NULL if the ureg program cannot be created; otherwise returns
+ * whatever ureg_create_shader_and_destroy() yields.
+ */
+static struct nvfx_vertex_program *
+nvfx_create_drawvp(struct nvfx_context *nvfx)
+{
+	struct ureg_program *ureg;
+	uint i;
+
+	ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
+	if (ureg == NULL)
+		return NULL;
+
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0), ureg_DECL_vs_input(ureg, 0));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0), ureg_DECL_vs_input(ureg, 3));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1), ureg_DECL_vs_input(ureg, 4));
+	/* Back-face colours are fed from the same inputs as the front ones. */
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 0), ureg_DECL_vs_input(ureg, 3));
+	ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_BCOLOR, 1), ureg_DECL_vs_input(ureg, 4));
+	/* NOTE(review): fog is declared with semantic index 1 here while
+	 * emit_attrib() looks fog up with index 0 -- confirm this is intended. */
+	ureg_MOV(ureg,
+		   ureg_writemask(ureg_DECL_output(ureg, TGSI_SEMANTIC_FOG, 1), TGSI_WRITEMASK_X),
+		   ureg_DECL_vs_input(ureg, 5));
+	for (i = 0; i < 8; ++i)
+		ureg_MOV(ureg, ureg_DECL_output(ureg, TGSI_SEMANTIC_GENERIC, i), ureg_DECL_vs_input(ureg, 8 + i));
+
+	ureg_END( ureg );
+
+	return ureg_create_shader_and_destroy( ureg, &nvfx->pipe );
+}
+
+/* Create the render stage that terminates the draw module's pipeline and
+ * lazily build the pass-through swtnl vertex program on first use.
+ *
+ * Returns the embedded draw_stage, or NULL if the stage cannot be
+ * allocated (previously an allocation failure was dereferenced).
+ */
+struct draw_stage *
+nvfx_draw_render_stage(struct nvfx_context *nvfx)
+{
+	struct nvfx_render_stage *render = CALLOC_STRUCT(nvfx_render_stage);
+
+	if (!render)
+		return NULL;
+
+	if (!nvfx->swtnl.vertprog)
+		nvfx->swtnl.vertprog = nvfx_create_drawvp(nvfx);
+
+	render->nvfx = nvfx;
+	render->stage.draw = nvfx->draw;
+	render->stage.point = nvfx_render_point;
+	render->stage.line = nvfx_render_line;
+	render->stage.tri = nvfx_render_tri;
+	render->stage.flush = nvfx_render_flush;
+	render->stage.reset_stipple_counter = nvfx_render_reset_stipple_counter;
+	render->stage.destroy = nvfx_render_destroy;
+
+	return &render->stage;
+}
+
+/* Draw through the software TNL (draw module) path.
+ *
+ * Validates and emits swtnl state, maps every bound vertex buffer, the
+ * optional index buffer and the vertex constant buffer for CPU reads,
+ * hands them to the draw module, runs the draw, then unmaps everything and
+ * flushes both the draw module and the pipe context.
+ *
+ * idxbuf may be NULL for non-indexed draws; idxbuf_size is passed through
+ * to draw_set_mapped_element_buffer().  Returns silently if swtnl state
+ * validation fails.
+ */
+void
+nvfx_draw_elements_swtnl(struct pipe_context *pipe,
+			 struct pipe_buffer *idxbuf, unsigned idxbuf_size,
+			 unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct pipe_screen *pscreen = pipe->screen;
+	unsigned i;
+	void *map;
+
+	if (!nvfx_state_validate_swtnl(nvfx))
+		return;
+	/* Vertex buffers are consumed by the draw module, so the hw
+	 * vertex-buffer state is masked out before emitting. */
+	nvfx->state.dirty &= ~(1ULL << NVFX_STATE_VTXBUF);
+	nvfx_state_emit(nvfx);
+
+	/* NOTE(review): pipe_buffer_map() results are not checked for NULL. */
+	for (i = 0; i < nvfx->vtxbuf_nr; i++) {
+		map = pipe_buffer_map(pscreen, nvfx->vtxbuf[i].buffer,
+                                      PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_vertex_buffer(nvfx->draw, i, map);
+	}
+
+	if (idxbuf) {
+		map = pipe_buffer_map(pscreen, idxbuf,
+				      PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, map);
+	} else {
+		draw_set_mapped_element_buffer(nvfx->draw, 0, NULL);
+	}
+
+	if (nvfx->constbuf[PIPE_SHADER_VERTEX]) {
+		const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX];
+
+		map = pipe_buffer_map(pscreen,
+				      nvfx->constbuf[PIPE_SHADER_VERTEX],
+				      PIPE_BUFFER_USAGE_CPU_READ);
+		draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0,
+                                                map, nr);
+	}
+
+	draw_arrays(nvfx->draw, mode, start, count);
+
+	/* NOTE(review): buffers are unmapped before draw_flush(); if the draw
+	 * module still holds queued vertices at this point they would be read
+	 * from unmapped memory -- confirm draw_arrays() has consumed them. */
+	for (i = 0; i < nvfx->vtxbuf_nr; i++)
+		pipe_buffer_unmap(pscreen, nvfx->vtxbuf[i].buffer);
+
+	if (idxbuf)
+		pipe_buffer_unmap(pscreen, idxbuf);
+
+	if (nvfx->constbuf[PIPE_SHADER_VERTEX])
+		pipe_buffer_unmap(pscreen, nvfx->constbuf[PIPE_SHADER_VERTEX]);
+
+	draw_flush(nvfx->draw);
+	pipe->flush(pipe, 0, NULL);
+}
+
+/* Append one entry to the swtnl attribute table: hardware attribute `hw`
+ * is fed, using emit format `emit`, from the draw-module shader output
+ * that carries (semantic, index).
+ */
+static INLINE void
+emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit,
+	    unsigned semantic, unsigned index)
+{
+	const unsigned src_slot =
+		draw_find_shader_output(nvfx->draw, semantic, index);
+	const unsigned entry = nvfx->swtnl.nr_attribs;
+
+	nvfx->swtnl.nr_attribs = entry + 1;
+	nvfx->swtnl.draw[entry] = src_slot;
+	nvfx->swtnl.emit[entry] = emit;
+	nvfx->swtnl.hw[entry] = hw;
+}
+
+/* Rebuild the swtnl attribute table from the inputs the bound fragment
+ * program reads: colours go to hw attributes 3/4, generics 0..7 to 8..15,
+ * fog to 5 and position (always emitted, last) to 0.  Always returns FALSE.
+ */
+static boolean
+nvfx_state_vtxfmt_validate(struct nvfx_context *nvfx)
+{
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	unsigned colour = 0, texcoords = 0, fog = 0;
+	unsigned i;
+
+	/* Collect which inputs the fragment program actually needs. */
+	for (i = 0; i < fp->info.num_inputs; i++) {
+		const unsigned sem_idx = fp->info.input_semantic_index[i];
+
+		switch (fp->info.input_semantic_name[i]) {
+		case TGSI_SEMANTIC_POSITION:
+			break;
+		case TGSI_SEMANTIC_COLOR:
+			colour |= (1 << sem_idx);
+			break;
+		case TGSI_SEMANTIC_GENERIC:
+			texcoords |= (1 << sem_idx);
+			break;
+		case TGSI_SEMANTIC_FOG:
+			fog = 1;
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	/* Start a fresh table, then map draw outputs to hw attribute IDs. */
+	nvfx->swtnl.nr_attribs = 0;
+
+	for (i = 0; i < 2; i++) {
+		if (colour & (1 << i))
+			emit_attrib(nvfx, 3 + i, EMIT_4F, TGSI_SEMANTIC_COLOR, i);
+	}
+
+	for (i = 0; i < 8; i++) {
+		if (texcoords & (1 << i))
+			emit_attrib(nvfx, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i);
+	}
+
+	if (fog)
+		emit_attrib(nvfx, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0);
+
+	/* 0xff selects the perspective-divided position format handled in
+	 * nvfx_render_vertex(). */
+	emit_attrib(nvfx, 0, 0xff, TGSI_SEMANTIC_POSITION, 0);
+
+	return FALSE;
+}
+
+/* State-table entry for the swtnl vertex format.  The dirty mask names the
+ * vertex-array and fragment-program flags; presumably the validate hook is
+ * re-run when either is set -- confirm against the state-emit code.
+ */
+struct nvfx_state_entry nvfx_state_vtxfmt = {
+	.validate = nvfx_state_vtxfmt_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG,
+		.hw = 0
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
new file mode 100644
index 00000000000..76351430f44
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -0,0 +1,950 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nvfx_context.h"
+#include "nvfx_shader.h"
+
+/* Capacity of the per-compile constant and immediate tables below. */
+#define MAX_CONSTS 128
+#define MAX_IMM 32
+/* Scratch state for one TGSI -> hardware fragment program translation. */
+struct nvfx_fpc {
+	/* Program being filled in (instruction words, constants, control). */
+	struct nvfx_fragment_program *fp;
+
+	/* TGSI input register index -> hardware input source selector. */
+	uint attrib_map[PIPE_MAX_SHADER_INPUTS];
+
+	/* Bitmask of hardware temps currently allocated. */
+	unsigned r_temps;
+	/* Subset of r_temps handed out by temp() since the last
+	 * release_temps(); freed again at the end of each instruction. */
+	unsigned r_temps_discard;
+	/* TGSI output register index -> hardware result register. */
+	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS];
+	/* TGSI temporary index -> hardware temp (heap array, may be NULL). */
+	struct nvfx_sreg *r_temp;
+
+	/* One more than the highest temp index written so far. */
+	int num_regs;
+
+	/* Word offset in fp->insn of the instruction being encoded. */
+	unsigned inst_offset;
+	/* Non-zero once the current instruction has had its 4-word inline
+	 * constant slot appended. */
+	unsigned have_const;
+
+	struct {
+		/* Constant-buffer index, or -1 when `vals` holds an
+		 * immediate value. */
+		int pipe;
+		float vals[4];
+	} consts[MAX_CONSTS];
+	int nr_consts;
+
+	/* TGSI immediate index -> constant register created for it. */
+	struct nvfx_sreg imm[MAX_IMM];
+	unsigned nr_imm;
+};
+
+/* Allocate the lowest free hardware temp and mark it for automatic release
+ * by the next release_temps().  When no temp is free an error is logged
+ * and temp 0 is returned as a fallback.
+ */
+static INLINE struct nvfx_sreg
+temp(struct nvfx_fpc *fpc)
+{
+	int idx = ffs(~fpc->r_temps) - 1;
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_TEMP, 0);
+	}
+
+	/* idx can be 31: shift an unsigned 1 so the top bit is set without
+	 * overflowing a signed int. */
+	fpc->r_temps |= (1u << idx);
+	fpc->r_temps_discard |= (1u << idx);
+	return nvfx_sr(NVFXSR_TEMP, idx);
+}
+
+/* Return every temp handed out by temp() since the previous call to the
+ * free pool.
+ */
+static INLINE void
+release_temps(struct nvfx_fpc *fpc)
+{
+	const unsigned scratch = fpc->r_temps_discard;
+
+	fpc->r_temps_discard = 0;
+	fpc->r_temps &= ~scratch;
+}
+
+/* Reserve a slot in the per-compile constant table and return a CONST
+ * register that refers to it.
+ *
+ * pipe >= 0 names a constant-buffer entry; pipe == -1 stores the four
+ * floats in `vals` as an immediate.  If the table is full an error is
+ * logged and slot 0 is returned instead of writing past the array, which
+ * is what happened when assert() was compiled out.
+ */
+static INLINE struct nvfx_sreg
+constant(struct nvfx_fpc *fpc, int pipe, float vals[4])
+{
+	int idx;
+
+	if (fpc->nr_consts >= MAX_CONSTS) {
+		NOUVEAU_ERR("out of constants!!\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_CONST, 0);
+	}
+	idx = fpc->nr_consts++;
+
+	fpc->consts[idx].pipe = pipe;
+	/* Only immediates carry values; never copy from a NULL pointer. */
+	if (pipe == -1 && vals)
+		memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float));
+	return nvfx_sr(NVFXSR_CONST, idx);
+}
+
+/* Shorthand emitters: `o` is pasted onto NVFX_FP_OP_OPCODE_ to form the
+ * opcode.  Note that tex() ignores its s1/s2 arguments and instead passes
+ * the identifier `none`, which must be in scope at the call site.
+ */
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nvfx_fp_arith((cc), (s), NVFX_FP_OP_OPCODE_##o, \
+			(d), (m), (s0), (s1), (s2))
+#define tex(cc,s,o,u,d,m,s0,s1,s2) \
+	nvfx_fp_tex((cc), (s), NVFX_FP_OP_OPCODE_##o, (u), \
+		    (d), (m), (s0), none, none)
+
+/* Extend the program's instruction buffer by `size` 32-bit words.
+ * NOTE(review): the realloc() result is stored unchecked; callers write to
+ * the new words immediately, so an allocation failure is not survivable
+ * here.
+ */
+static void
+grow_insns(struct nvfx_fpc *fpc, int size)
+{
+	struct nvfx_fragment_program *prog = fpc->fp;
+
+	prog->insn_len += size;
+	prog->insn = realloc(prog->insn, prog->insn_len * sizeof(uint32_t));
+}
+
+/* Encode source operand number `pos` (0..2) of the instruction at
+ * fpc->inst_offset.  The operand word is OR-ed into hw[pos + 1]; the input
+ * selector and the abs flag live in other words of the instruction.
+ *
+ * Constant operands occupy four extra words appended right after the
+ * instruction; at most one such slot is added per instruction.
+ */
+static void
+emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src)
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw = &fp->insn[fpc->inst_offset];
+	uint32_t sr = 0;
+
+	switch (src.type) {
+	case NVFXSR_INPUT:
+		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+		/* The input selector is stored once, in word 0. */
+		hw[0] |= (src.index << NVFX_FP_OP_INPUT_SRC_SHIFT);
+		break;
+	case NVFXSR_OUTPUT:
+		/* Result registers are read back as half-precision temps. */
+		sr |= NVFX_FP_REG_SRC_HALF;
+		/* fall-through */
+	case NVFXSR_TEMP:
+		sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT);
+		sr |= (src.index << NVFX_FP_REG_SRC_SHIFT);
+		break;
+	case NVFXSR_CONST:
+		if (!fpc->have_const) {
+			grow_insns(fpc, 4);
+			fpc->have_const = 1;
+		}
+
+		/* grow_insns() may have moved the buffer: refresh hw. */
+		hw = &fp->insn[fpc->inst_offset];
+		if (fpc->consts[src.index].pipe >= 0) {
+			struct nvfx_fragment_program_data *fpd;
+
+			/* Record where this constant-buffer value belongs in
+			 * the instruction stream and zero the slot for now.
+			 * NOTE(review): realloc() result is unchecked. */
+			fp->consts = realloc(fp->consts, ++fp->nr_consts *
+					     sizeof(*fpd));
+			fpd = &fp->consts[fp->nr_consts - 1];
+			fpd->offset = fpc->inst_offset + 4;
+			fpd->index = fpc->consts[src.index].pipe;
+			memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4);
+		} else {
+			/* Immediate: copy the literal values inline. */
+			memcpy(&fp->insn[fpc->inst_offset + 4],
+				fpc->consts[src.index].vals,
+				sizeof(uint32_t) * 4);
+		}
+
+		sr |= (NVFX_FP_REG_TYPE_CONST << NVFX_FP_REG_TYPE_SHIFT);
+		break;
+	case NVFXSR_NONE:
+		sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NVFX_FP_REG_NEGATE;
+
+	/* Per-operand abs flags sit in word 1, starting at bit 29. */
+	if (src.abs)
+		hw[1] |= (1 << (29 + pos));
+
+	sr |= ((src.swz[0] << NVFX_FP_REG_SWZ_X_SHIFT) |
+	       (src.swz[1] << NVFX_FP_REG_SWZ_Y_SHIFT) |
+	       (src.swz[2] << NVFX_FP_REG_SWZ_Z_SHIFT) |
+	       (src.swz[3] << NVFX_FP_REG_SWZ_W_SHIFT));
+
+	hw[pos + 1] |= sr;
+}
+
+/* Encode the destination register of the instruction at fpc->inst_offset
+ * and update the bookkeeping that depends on it (temp high-water mark,
+ * fp_control bits).
+ */
+static void
+emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst)
+{
+	struct nvfx_fragment_program *prog = fpc->fp;
+	uint32_t *insn = &prog->insn[fpc->inst_offset];
+
+	if (dst.type == NVFXSR_TEMP) {
+		/* Track how many temps the program uses. */
+		if (fpc->num_regs < (dst.index + 1))
+			fpc->num_regs = dst.index + 1;
+	} else if (dst.type == NVFXSR_OUTPUT) {
+		/* Result register 1 sets extra control bits; every other
+		 * result is written at half precision. */
+		if (dst.index == 1)
+			prog->fp_control |= 0xe;
+		else
+			insn[0] |= NVFX_FP_OP_OUT_REG_HALF;
+	} else if (dst.type == NVFXSR_NONE) {
+		insn[0] |= (1 << 30);
+	} else {
+		assert(0);
+	}
+
+	insn[0] |= (dst.index << NVFX_FP_OP_OUT_REG_SHIFT);
+}
+
+/* Append one four-word instruction to the program and encode opcode `op`,
+ * write mask, saturation, destination scale, condition-code handling, the
+ * destination and up to three sources.
+ *
+ * fpc->inst_offset is left pointing at the new instruction so that callers
+ * (e.g. nvfx_fp_tex) can patch further bits into it.
+ */
+static void
+nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op,
+	      struct nvfx_sreg dst, int mask,
+	      struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+{
+	struct nvfx_fragment_program *fp = fpc->fp;
+	uint32_t *hw;
+
+	fpc->inst_offset = fp->insn_len;
+	fpc->have_const = 0;
+	grow_insns(fpc, 4);
+	hw = &fp->insn[fpc->inst_offset];
+	memset(hw, 0, sizeof(uint32_t) * 4);
+
+	if (op == NVFX_FP_OP_OPCODE_KIL)
+		fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL;
+	hw[0] |= (op << NVFX_FP_OP_OPCODE_SHIFT);
+	hw[0] |= (mask << NVFX_FP_OP_OUTMASK_SHIFT);
+	hw[2] |= (dst.dst_scale << NVFX_FP_OP_DST_SCALE_SHIFT);
+
+	if (sat)
+		hw[0] |= NVFX_FP_OP_OUT_SAT;
+
+	if (dst.cc_update)
+		hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE;
+	hw[1] |= (dst.cc_test << NVFX_FP_OP_COND_SHIFT);
+	hw[1] |= ((dst.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) |
+		  (dst.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) |
+		  (dst.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) |
+		  (dst.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT));
+
+	/* `hw` must not be used past this point: emit_src() can grow (and
+	 * therefore move) the instruction buffer. */
+	emit_dst(fpc, dst);
+	emit_src(fpc, 0, s0);
+	emit_src(fpc, 1, s1);
+	emit_src(fpc, 2, s2);
+}
+
+/* Emit a texture instruction: encode it like an arithmetic instruction,
+ * then patch in the texture unit and record that the unit's sampler is
+ * used by the program.
+ */
+static void
+nvfx_fp_tex(struct nvfx_fpc *fpc, int sat, int op, int unit,
+	    struct nvfx_sreg dst, int mask,
+	    struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2)
+{
+	struct nvfx_fragment_program *prog = fpc->fp;
+
+	nvfx_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2);
+
+	/* inst_offset now names the instruction that was just emitted. */
+	prog->insn[fpc->inst_offset] |= (unit << NVFX_FP_OP_TEX_UNIT_SHIFT);
+	prog->samplers |= (1 << unit);
+}
+
+/* Translate a TGSI source operand into a hardware source register,
+ * carrying over its abs/negate modifiers and swizzle.
+ *
+ * An unsupported register file is logged and yields a NONE register; the
+ * register used to be left uninitialized in that case.
+ */
+static INLINE struct nvfx_sreg
+tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc)
+{
+	struct nvfx_sreg src;
+
+	switch (fsrc->Register.File) {
+	case TGSI_FILE_INPUT:
+		src = nvfx_sr(NVFXSR_INPUT,
+			      fpc->attrib_map[fsrc->Register.Index]);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(fpc, fsrc->Register.Index, NULL);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		assert(fsrc->Register.Index < fpc->nr_imm);
+		src = fpc->imm[fsrc->Register.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = fpc->r_temp[fsrc->Register.Index];
+		break;
+	/* NV40 fragprog result regs are just temps, so this is simple */
+	case TGSI_FILE_OUTPUT:
+		src = fpc->r_result[fsrc->Register.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		/* Give the caller a well-defined register to work with. */
+		src = nvfx_sr(NVFXSR_NONE, 0);
+		break;
+	}
+
+	src.abs = fsrc->Register.Absolute;
+	src.negate = fsrc->Register.Negate;
+	src.swz[0] = fsrc->Register.SwizzleX;
+	src.swz[1] = fsrc->Register.SwizzleY;
+	src.swz[2] = fsrc->Register.SwizzleZ;
+	src.swz[3] = fsrc->Register.SwizzleW;
+	return src;
+}
+
+/* Translate a TGSI destination operand into a hardware register.  Outputs
+ * and temporaries map through the per-compile tables; the NULL file and
+ * any unsupported file (which is also logged) yield a NONE register.
+ */
+static INLINE struct nvfx_sreg
+tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst)
+{
+	const unsigned reg = fdst->Register.Index;
+
+	if (fdst->Register.File == TGSI_FILE_OUTPUT)
+		return fpc->r_result[reg];
+
+	if (fdst->Register.File == TGSI_FILE_TEMPORARY)
+		return fpc->r_temp[reg];
+
+	if (fdst->Register.File != TGSI_FILE_NULL)
+		NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File);
+
+	return nvfx_sr(NVFXSR_NONE, 0);
+}
+
+/* Convert a TGSI write mask into the hardware's component mask bits. */
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	return ((tgsi & TGSI_WRITEMASK_X) ? NVFX_FP_MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? NVFX_FP_MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? NVFX_FP_MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? NVFX_FP_MASK_W : 0);
+}
+
+/* Translate one TGSI instruction into hardware fragment program
+ * instructions.
+ *
+ * Sources are resolved first.  An instruction may reference only one
+ * distinct input register and one distinct constant/immediate; further
+ * ones are copied into scratch temps with an extra MOV.  The opcode is
+ * then expanded, and all scratch temps are released at the end.
+ *
+ * Returns TRUE on success (END is accepted and emits nothing), FALSE for an
+ * unsupported opcode, an unsupported source file or too many sources.
+ *
+ * Fixes over the previous version: sources in TGSI_FILE_OUTPUT are now
+ * actually resolved (they were left uninitialized), every src[] slot and
+ * `unit` start out defined, and instructions with more than three sources
+ * are rejected instead of overrunning src[].
+ */
+static boolean
+nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_instruction *finst)
+{
+	const struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	struct nvfx_sreg src[3], dst, tmp;
+	int mask, sat, unit = 0;
+	int ai = -1, ci = -1, ii = -1;
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	/* No opcode handled below takes more than three sources. */
+	if (finst->Instruction.NumSrcRegs > 3) {
+		NOUVEAU_ERR("too many source registers\n");
+		return FALSE;
+	}
+
+	for (i = 0; i < 3; i++)
+		src[i] = none;
+
+	/* Temporaries first, so scratch temps allocated below cannot alias
+	 * how they are resolved. */
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+		if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(fpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) {
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+
+		switch (fsrc->Register.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->Register.Index) {
+				ai = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->Register.Index) {
+				ci = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->Register.Index) {
+				ii = fsrc->Register.Index;
+				src[i] = tgsi_src(fpc, fsrc);
+			} else {
+				src[i] = temp(fpc);
+				arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL,
+				      tgsi_src(fpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		case TGSI_FILE_SAMPLER:
+			unit = fsrc->Register.Index;
+			break;
+		case TGSI_FILE_OUTPUT:
+			/* Result registers are readable like temps. */
+			src[i] = tgsi_src(fpc, fsrc);
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(fpc, &finst->Dst[0]);
+	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+	sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+
+	switch (finst->Instruction.Opcode) {
+	case TGSI_OPCODE_ABS:
+		arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_CMP:
+		/* Load src0 into the condition code, then do two
+		 * conditional moves. */
+		tmp = nvfx_sr(NVFXSR_NONE, 0);
+		tmp.cc_update = 1;
+		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NVFX_COND_GE;
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		dst.cc_test = NVFX_COND_LT;
+		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
+		break;
+	case TGSI_OPCODE_COS:
+		arith(fpc, sat, COS, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DDX:
+		/* When z/w are requested, they are computed through the x/y
+		 * lanes and moved into place. */
+		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDX, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DDY:
+		if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) {
+			tmp = temp(fpc);
+			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y,
+			      swz(src[0], Z, W, Z, W), none, none);
+			arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W,
+			      swz(tmp, X, Y, X, Y), none, none);
+			arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0],
+			      none, none);
+			arith(fpc, 0, MOV, dst, mask, tmp, none, none);
+		} else {
+			arith(fpc, sat, DDY, dst, mask, src[0], none, none);
+		}
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(fpc, sat, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(fpc, sat, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		tmp = temp(fpc);
+		arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[1], none);
+		arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X),
+		      swz(src[1], W, W, W, W), none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(fpc, sat, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(fpc, sat, EX2, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(fpc, sat, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(fpc, sat, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_KILP:
+		arith(fpc, 0, KIL, none, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_KIL:
+		/* Set the condition code from src0, then kill on "less
+		 * than". */
+		dst = nvfx_sr(NVFXSR_NONE, 0);
+		dst.cc_update = 1;
+		arith(fpc, 0, MOV, dst, NVFX_FP_MASK_ALL, src[0], none, none);
+		dst.cc_update = 0; dst.cc_test = NVFX_COND_LT;
+		arith(fpc, 0, KIL, dst, 0, none, none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(fpc, sat, LG2, dst, mask, src[0], none, none);
+		break;
+//	case TGSI_OPCODE_LIT:
+	case TGSI_OPCODE_LRP:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, LRP_NV30, dst, mask, src[0], src[1], src[2]);
+		else {
+			tmp = temp(fpc);
+			arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]);
+			arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp);
+		}
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(fpc, sat, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(fpc, sat, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(fpc, sat, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(fpc, sat, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, POW_NV30, dst, mask, src[0], src[1], none);
+		else {
+			/* pow(a, b) = exp2(b * log2(a)) */
+			tmp = temp(fpc);
+			arith(fpc, 0, LG2, tmp, NVFX_FP_MASK_X,
+			      swz(src[0], X, X, X, X), none, none);
+			arith(fpc, 0, MUL, tmp, NVFX_FP_MASK_X, swz(tmp, X, X, X, X),
+			      swz(src[1], X, X, X, X), none);
+			arith(fpc, sat, EX2, dst, mask,
+			      swz(tmp, X, X, X, X), none, none);
+		}
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(fpc, sat, RCP, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_RET:
+		assert(0);
+		break;
+	case TGSI_OPCODE_RFL:
+		if(!nvfx->is_nv4x)
+			arith(fpc, 0, RFL_NV30, dst, mask, src[0], src[1], none);
+		else {
+			tmp = temp(fpc);
+			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[0], none);
+			arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_Y, src[0], src[1], none);
+			arith(fpc, 0, DIV, scale(tmp, 2X), NVFX_FP_MASK_Z,
+			      swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none);
+			arith(fpc, sat, MAD, dst, mask,
+			      swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]));
+		}
+		break;
+	case TGSI_OPCODE_RSQ:
+		if(!nvfx->is_nv4x)
+			arith(fpc, sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
+		else {
+			/* rsq(a) = exp2(-0.5 * log2(|a|)) */
+			tmp = temp(fpc);
+			arith(fpc, 0, LG2, scale(tmp, INV_2X), NVFX_FP_MASK_X,
+			      abs(swz(src[0], X, X, X, X)), none, none);
+			arith(fpc, sat, EX2, dst, mask,
+			      neg(swz(tmp, X, X, X, X)), none, none);
+		}
+		break;
+	case TGSI_OPCODE_SCS:
+		/* avoid overwriting the source */
+		if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X)
+		{
+			if (mask & NVFX_FP_MASK_X) {
+				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & NVFX_FP_MASK_Y) {
+				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+		}
+		else
+		{
+			if (mask & NVFX_FP_MASK_Y) {
+				arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & NVFX_FP_MASK_X) {
+				arith(fpc, sat, COS, dst, NVFX_FP_MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+		}
+		break;
+	case TGSI_OPCODE_SEQ:
+		arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SFL:
+		arith(fpc, sat, SFL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(fpc, sat, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(fpc, sat, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SIN:
+		arith(fpc, sat, SIN, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_SLE:
+		arith(fpc, sat, SLE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(fpc, sat, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SNE:
+		arith(fpc, sat, SNE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_STR:
+		arith(fpc, sat, STR, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB:
+		arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none);
+		break;
+	case TGSI_OPCODE_TEX:
+		tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXB:
+		tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_TXP:
+		tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_XPD:
+		tmp = temp(fpc);
+		arith(fpc, 0, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(fpc, sat, MAD, dst, (mask & ~NVFX_FP_MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(fpc);
+	return TRUE;
+}
+
+/* Record which hardware input source a declared TGSI fragment input maps
+ * to.  Only the first register of the declared range is entered in
+ * fpc->attrib_map.  Returns FALSE (after logging) for a semantic or
+ * semantic index the hardware has no input for.
+ */
+static boolean
+nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	int hw;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NVFX_FP_OP_INPUT_SRC_POSITION;
+		break;
+	case TGSI_SEMANTIC_COLOR:
+		switch (fdec->Semantic.Index) {
+		case 0:
+			hw = NVFX_FP_OP_INPUT_SRC_COL0;
+			break;
+		case 1:
+			hw = NVFX_FP_OP_INPUT_SRC_COL1;
+			break;
+		default:
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NVFX_FP_OP_INPUT_SRC_FOGC;
+		break;
+	case TGSI_SEMANTIC_GENERIC:
+		/* Generics 0..7 map to the texcoord inputs. */
+		if (fdec->Semantic.Index > 7) {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic.Index);
+		break;
+	default:
+		NOUVEAU_ERR("bad input semantic\n");
+		return FALSE;
+	}
+
+	fpc->attrib_map[fdec->Range.First] = hw;
+	return TRUE;
+}
+
+/* Bind a declared TGSI fragment output to its hardware result register and
+ * reserve the matching temp so temp() never hands it out.  Position maps
+ * to register 1; colours 0..3 map to 0, 2, 3, 4, of which only those up
+ * to 2 are accepted when is_nv4x is not set.  Returns FALSE (after
+ * logging) for anything else.
+ */
+static boolean
+nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc,
+				const struct tgsi_full_declaration *fdec)
+{
+	const unsigned idx = fdec->Range.First;
+	unsigned hw;
+
+	if (fdec->Semantic.Name == TGSI_SEMANTIC_POSITION) {
+		hw = 1;
+	} else if (fdec->Semantic.Name == TGSI_SEMANTIC_COLOR) {
+		const unsigned max_hw = (nvfx->is_nv4x) ? 4 : 2;
+
+		switch (fdec->Semantic.Index) {
+		case 0: hw = 0; break;
+		case 1: hw = 2; break;
+		case 2: hw = 3; break;
+		case 3: hw = 4; break;
+		default: hw = ~0; break;
+		}
+		if (hw > max_hw) {
+			NOUVEAU_ERR("bad rcol index\n");
+			return FALSE;
+		}
+	} else {
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	fpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
+	fpc->r_temps |= (1 << hw);
+	return TRUE;
+}
+
+/* First pass over the TGSI tokens: process input/output declarations, find
+ * the highest temporary index, register every immediate as an inline
+ * constant, then allocate one hardware temp per TGSI temporary.
+ *
+ * Returns TRUE on success.  Returns FALSE if a declaration is rejected,
+ * if the shader has more than MAX_IMM immediates, or if the temp table
+ * cannot be allocated; the last two were previously guarded only by
+ * assert() or not at all.
+ */
+static boolean
+nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, i;
+
+	tgsi_parse_init(&p, fpc->fp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_INPUT:
+				if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_OUTPUT:
+				if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec))
+					goto out_err;
+				break;
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->Range.Last > high_temp) {
+					high_temp =
+						fdec->Range.Last;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			struct tgsi_full_immediate *imm;
+			float vals[4];
+
+			imm = &p.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			/* Enforce the table bound in release builds too. */
+			if (fpc->nr_imm >= MAX_IMM) {
+				NOUVEAU_ERR("too many immediates\n");
+				goto out_err;
+			}
+
+			vals[0] = imm->u[0].Float;
+			vals[1] = imm->u[1].Float;
+			vals[2] = imm->u[2].Float;
+			vals[3] = imm->u[3].Float;
+			fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals);
+		}
+			break;
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	/* high_temp becomes the number of TGSI temporaries (0 if none). */
+	if (++high_temp) {
+		fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		if (!fpc->r_temp) {
+			NOUVEAU_ERR("out of memory\n");
+			return FALSE;
+		}
+		for (i = 0; i < high_temp; i++)
+			fpc->r_temp[i] = temp(fpc);
+		/* These temps live for the whole program: keep
+		 * release_temps() from freeing them. */
+		fpc->r_temps_discard = 0;
+	}
+
+	return TRUE;
+
+out_err:
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	tgsi_parse_free(&p);
+	return FALSE;
+}
+
+/* Translate fp's TGSI tokens into hardware instructions.  On success
+ * fp->translated is set to TRUE; on any failure the function returns with
+ * it untouched.  The compile scratch state is always freed.
+ */
+static void
+nvfx_fragprog_translate(struct nvfx_context *nvfx,
+			struct nvfx_fragment_program *fp)
+{
+	struct tgsi_parse_context parse;
+	struct nvfx_fpc *fpc = CALLOC(1, sizeof(struct nvfx_fpc));
+	uint32_t *tail;
+
+	if (!fpc)
+		return;
+	fpc->fp = fp;
+	fpc->num_regs = 2;
+
+	if (!nvfx_fragprog_prepare(nvfx, fpc)) {
+		FREE(fpc);
+		return;
+	}
+
+	tgsi_parse_init(&parse, fp->pipe.tokens);
+
+	/* Second pass: only instruction tokens matter here. */
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
+			continue;
+
+		if (!nvfx_fragprog_parse_instruction(nvfx, fpc,
+				&parse.FullToken.FullInstruction))
+			goto out_err;
+	}
+
+	/* The temp count is encoded differently on the two chip families. */
+	if (nvfx->is_nv4x)
+		fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT;
+	else
+		fp->fp_control |= (fpc->num_regs-1)/2;
+
+	/* Terminate final instruction */
+	fp->insn[fpc->inst_offset] |= 0x00000001;
+
+	/* Append NOP + END instruction, may or may not be necessary. */
+	fpc->inst_offset = fp->insn_len;
+	grow_insns(fpc, 4);
+	tail = &fp->insn[fpc->inst_offset];
+	tail[0] = 0x00000001;
+	tail[1] = 0x00000000;
+	tail[2] = 0x00000000;
+	tail[3] = 0x00000000;
+
+	fp->translated = TRUE;
+out_err:
+	tgsi_parse_free(&parse);
+	if (fpc->r_temp)
+		FREE(fpc->r_temp);
+	FREE(fpc);
+}
+
+/**
+ * Copy the translated instruction words of fp into its hardware buffer.
+ *
+ * On little-endian hosts the words are copied verbatim; otherwise the two
+ * 16-bit halves of every word are exchanged while copying.
+ */
+static void
+nvfx_fragprog_upload(struct nvfx_context *nvfx,
+		     struct nvfx_fragment_program *fp)
+{
+	struct pipe_screen *pscreen = nvfx->pipe.screen;
+	const uint32_t endian_probe = 1;
+	/* The first byte of the probe is non-zero only on little-endian. */
+	const boolean host_is_le = (*(const uint8_t *)&endian_probe) ? TRUE : FALSE;
+	uint32_t *dst;
+	int i;
+
+	dst = pipe_buffer_map(pscreen, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+#if 0
+	for (i = 0; i < fp->insn_len; i++) {
+		fflush(stdout); fflush(stderr);
+		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
+		fflush(stdout); fflush(stderr);
+	}
+#endif
+
+	for (i = 0; i < fp->insn_len; i++) {
+		if (host_is_le) {
+			dst[i] = fp->insn[i];
+		} else {
+			/* Weird swapping for big-endian chips */
+			dst[i] = ((fp->insn[i] & 0xffff) << 16) |
+				  ((fp->insn[i] >> 16) & 0xffff);
+		}
+	}
+
+	pipe_buffer_unmap(pscreen, fp->buffer);
+}
+
+/**
+ * State validation hook for the current fragment program.
+ *
+ * On first use the program is translated, uploaded into a newly created
+ * buffer and a state object referencing that buffer is built.  On every
+ * call the constants referenced by the program are refreshed from the
+ * fragment constant buffer and the program is re-uploaded if any changed.
+ *
+ * Returns TRUE when the hardware state slot was (re)bound to the program's
+ * state object, FALSE otherwise.  If translation fails, NVFX_NEW_FRAGPROG
+ * is set in nvfx->fallback_swrast and FALSE is returned.
+ */
+static boolean
+nvfx_fragprog_validate(struct nvfx_context *nvfx)
+{
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct pipe_buffer *constbuf =
+		nvfx->constbuf[PIPE_SHADER_FRAGMENT];
+	struct pipe_screen *pscreen = nvfx->pipe.screen;
+	struct nouveau_stateobj *so;
+	boolean new_consts = FALSE;
+	int i;
+
+	/* Already translated: only the constants may need refreshing. */
+	if (fp->translated)
+		goto update_constants;
+
+	nvfx->fallback_swrast &= ~NVFX_NEW_FRAGPROG;
+	nvfx_fragprog_translate(nvfx, fp);
+	if (!fp->translated) {
+		nvfx->fallback_swrast |= NVFX_NEW_FRAGPROG;
+		return FALSE;
+	}
+
+	/* insn_len counts 32-bit words, hence the * 4 for the byte size. */
+	fp->buffer = pscreen->buffer_create(pscreen, 0x100, 0, fp->insn_len * 4);
+	nvfx_fragprog_upload(nvfx, fp);
+
+	so = so_new(4, 4, 1);
+	so_method(so, nvfx->screen->eng3d, NV34TCL_FP_ACTIVE_PROGRAM, 1);
+	so_reloc (so, nouveau_bo(fp->buffer), 0, NOUVEAU_BO_VRAM |
+		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+	so_method(so, nvfx->screen->eng3d, NV34TCL_FP_CONTROL, 1);
+	so_data  (so, fp->fp_control);
+	/* These two methods are only emitted on non-nv4x hardware. */
+	if(!nvfx->is_nv4x) {
+		so_method(so, nvfx->screen->eng3d, NV34TCL_FP_REG_CONTROL, 1);
+		so_data  (so, (1<<16)|0x4);
+		so_method(so, nvfx->screen->eng3d, NV34TCL_TX_UNITS_ENABLE, 1);
+		so_data  (so, fp->samplers);
+	}
+
+	/* Hand the reference over to fp->so and drop the local one. */
+	so_ref(so, &fp->so);
+	so_ref(NULL, &so);
+
+update_constants:
+	if (fp->nr_consts) {
+		float *map;
+
+		map = pipe_buffer_map(pscreen, constbuf,
+				      PIPE_BUFFER_USAGE_CPU_READ);
+		for (i = 0; i < fp->nr_consts; i++) {
+			/* Constants live inside the instruction stream:
+			 * fpd->offset indexes fp->insn, fpd->index selects a
+			 * group of four floats in the constant buffer. */
+			struct nvfx_fragment_program_data *fpd = &fp->consts[i];
+			uint32_t *p = &fp->insn[fpd->offset];
+			uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
+
+			/* Skip values that are already up to date. */
+			if (!memcmp(p, cb, 4 * sizeof(float)))
+				continue;
+			memcpy(p, cb, 4 * sizeof(float));
+			new_consts = TRUE;
+		}
+		pipe_buffer_unmap(pscreen, constbuf);
+
+		if (new_consts)
+			nvfx_fragprog_upload(nvfx, fp);
+	}
+
+	if (new_consts || fp->so != nvfx->state.hw[NVFX_STATE_FRAGPROG]) {
+		so_ref(fp->so, &nvfx->state.hw[NVFX_STATE_FRAGPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/**
+ * Release everything a fragment program owns: the uploaded buffer, the
+ * state object and the instruction word array.  The fp structure itself
+ * is not freed here.
+ */
+void
+nvfx_fragprog_destroy(struct nvfx_context *nvfx,
+		      struct nvfx_fragment_program *fp)
+{
+	/* Drop the hardware buffer, if one was ever created. */
+	if (fp->buffer != NULL) {
+		pipe_buffer_reference(&fp->buffer, NULL);
+	}
+
+	/* Drop the state object that referenced the buffer. */
+	if (fp->so != NULL) {
+		so_ref(NULL, &fp->so);
+	}
+
+	/* The instruction array only exists once words were emitted. */
+	if (fp->insn_len != 0) {
+		FREE(fp->insn);
+	}
+}
+
+/* State entry for the fragment program: nvfx_fragprog_validate is the
+ * validate hook, NVFX_NEW_FRAGPROG is the pipe dirty mask and
+ * NVFX_STATE_FRAGPROG is the hardware state slot it fills in. */
+struct nvfx_state_entry nvfx_state_fragprog = {
+	.validate = nvfx_fragprog_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_FRAGPROG,
+		.hw = NVFX_STATE_FRAGPROG
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c
new file mode 100644
index 00000000000..84e4eb10042
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_fragtex.c
@@ -0,0 +1,49 @@
+#include "nvfx_context.h"
+
+/**
+ * State validation hook for fragment texture units.
+ *
+ * Units that were enabled before but are not used by the current fragment
+ * program get a state object that writes 0 to their TX_ENABLE method; units
+ * that are both dirty and used get a freshly built texture state object.
+ * Each touched unit is flagged in state->dirty.  Always returns FALSE.
+ */
+static boolean
+nvfx_fragtex_validate(struct nvfx_context *nvfx)
+{
+	struct nvfx_fragment_program *fp = nvfx->fragprog;
+	struct nvfx_state *state = &nvfx->state;
+	struct nouveau_stateobj *so;
+	unsigned pending, unit;
+
+	/* Pass 1: disable units the current program no longer samples. */
+	for (pending = state->fp_samplers & ~fp->samplers; pending;
+	     pending &= ~(1 << unit)) {
+		unit = ffs(pending) - 1;
+
+		so = so_new(1, 1, 0);
+		so_method(so, nvfx->screen->eng3d, NV34TCL_TX_ENABLE(unit), 1);
+		so_data  (so, 0);
+		so_ref(so, &state->hw[NVFX_STATE_FRAGTEX0 + unit]);
+		so_ref(NULL, &so);
+		state->dirty |= (1ULL << (NVFX_STATE_FRAGTEX0 + unit));
+	}
+
+	/* Pass 2: rebuild state for dirty units the program does sample. */
+	for (pending = nvfx->dirty_samplers & fp->samplers; pending;
+	     pending &= ~(1 << unit)) {
+		unit = ffs(pending) - 1;
+
+		if (nvfx->is_nv4x)
+			so = nv40_fragtex_build(nvfx, unit);
+		else
+			so = nv30_fragtex_build(nvfx, unit);
+
+		so_ref(so, &state->hw[NVFX_STATE_FRAGTEX0 + unit]);
+		so_ref(NULL, &so);
+		state->dirty |= (1ULL << (NVFX_STATE_FRAGTEX0 + unit));
+	}
+
+	/* Remember which units are live for the next validation. */
+	state->fp_samplers = fp->samplers;
+	return FALSE;
+}
+
+/* State entry for fragment textures: revalidated when either the samplers
+ * or the fragment program change.  The hw field is 0; the validate hook
+ * sets the per-unit bits in state->dirty itself. */
+struct nvfx_state_entry nvfx_state_fragtex = {
+	.validate = nvfx_fragtex_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_SAMPLER | NVFX_NEW_FRAGPROG,
+		.hw = 0
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_miptree.c b/src/gallium/drivers/nvfx/nvfx_miptree.c
new file mode 100644
index 00000000000..0f5ed61aab7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_miptree.c
@@ -0,0 +1,247 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_math.h"
+
+#include "nvfx_context.h"
+#include "nv04_surface_2d.h"
+
+
+
+/**
+ * Compute the memory layout of a miptree: the pitch of every mip level,
+ * the byte offset of every (face, level) image and the total size.
+ *
+ * Allocates one image_offset array per level (nr_faces entries each);
+ * the allocation result is not checked here.
+ */
+static void
+nvfx_miptree_layout(struct nvfx_miptree *mt)
+{
+	struct pipe_texture *pt = &mt->base;
+	uint width = pt->width0;
+	uint offset = 0;
+	int nr_faces, l, f;
+	/* Non-zero when any of the listed usage flags is set. */
+	uint wide_pitch = pt->tex_usage & (PIPE_TEXTURE_USAGE_SAMPLER |
+		                           PIPE_TEXTURE_USAGE_DEPTH_STENCIL |
+		                           PIPE_TEXTURE_USAGE_RENDER_TARGET |
+		                           PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
+		                           PIPE_TEXTURE_USAGE_SCANOUT);
+
+	/* Cube maps have six images per level, 3D textures one per slice of
+	 * the base depth, everything else a single image. */
+	if (pt->target == PIPE_TEXTURE_CUBE) {
+		nr_faces = 6;
+	} else
+	if (pt->target == PIPE_TEXTURE_3D) {
+		nr_faces = pt->depth0;
+	} else {
+		nr_faces = 1;
+	}
+
+	for (l = 0; l <= pt->last_level; l++) {
+		/* Linear textures with a wide_pitch usage share one pitch for
+		 * all levels (base width, aligned to 64); otherwise the pitch
+		 * follows the minified width of the level. */
+		if (wide_pitch && (pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR))
+			mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64);
+		else
+			mt->level[l].pitch = util_format_get_stride(pt->format, width);
+
+		mt->level[l].image_offset =
+			CALLOC(nr_faces, sizeof(unsigned));
+
+		width  = u_minify(width, 1);
+	}
+
+	/* Images are laid out face by face, each face holding all levels. */
+	for (f = 0; f < nr_faces; f++) {
+		for (l = 0; l < pt->last_level; l++) {
+			mt->level[l].image_offset[f] = offset;
+
+			/* Non-linear levels are padded to 64 bytes as long as
+			 * the next level is larger than 1 in both dimensions. */
+			if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR) &&
+			    u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1)
+				offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64);
+			else
+				offset += mt->level[l].pitch * u_minify(pt->height0, l);
+		}
+
+		/* l == pt->last_level here: the last level is never padded. */
+		mt->level[l].image_offset[f] = offset;
+		offset += mt->level[l].pitch * u_minify(pt->height0, l);
+	}
+
+	mt->total_size = offset;
+}
+
+/**
+ * Create a texture from the template pt.
+ *
+ * Decides whether the texture is stored linearly (by setting
+ * NOUVEAU_TEXTURE_USAGE_LINEAR in the copied usage flags), computes the
+ * layout and allocates the backing buffer.
+ *
+ * Returns the new texture, or NULL if an allocation fails.
+ */
+static struct pipe_texture *
+nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
+{
+	struct nvfx_miptree *mt;
+	unsigned buf_usage = PIPE_BUFFER_USAGE_PIXEL |
+	                     NOUVEAU_BUFFER_USAGE_TEXTURE;
+	unsigned l;
+
+	/* Zero-initialise, as nvfx_miptree_blanket does, so that unused
+	 * level[] entries, mt->buffer and mt->bo never hold garbage. */
+	mt = CALLOC_STRUCT(nvfx_miptree);
+	if (!mt)
+		return NULL;
+	mt->base = *pt;
+	pipe_reference_init(&mt->base.reference, 1);
+	mt->base.screen = pscreen;
+
+	/* Swizzled textures must be POT */
+	if (pt->width0 & (pt->width0 - 1) ||
+	    pt->height0 & (pt->height0 - 1))
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+	else
+	if (pt->tex_usage & (PIPE_TEXTURE_USAGE_SCANOUT |
+	                     PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
+	                     PIPE_TEXTURE_USAGE_DEPTH_STENCIL))
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+	else
+	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+	else {
+		switch (pt->format) {
+		case PIPE_FORMAT_B5G6R5_UNORM:
+		case PIPE_FORMAT_L8A8_UNORM:
+		case PIPE_FORMAT_A8_UNORM:
+		case PIPE_FORMAT_L8_UNORM:
+		case PIPE_FORMAT_I8_UNORM:
+			/* TODO: we can actually swizzle these formats on nv40, we
+				are just preserving the pre-unification behavior.
+				The whole 2D code is going to be rewritten anyway. */
+			if(nvfx_screen(pscreen)->is_nv4x) {
+				mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+				break;
+			}
+			/* fallthrough: on nv30 these formats are handled like
+			 * the swizzlable formats below */
+		/* TODO: Figure out which formats can be swizzled */
+		case PIPE_FORMAT_B8G8R8A8_UNORM:
+		case PIPE_FORMAT_B8G8R8X8_UNORM:
+		case PIPE_FORMAT_R16_SNORM:
+		{
+			if (debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE))
+				mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+			break;
+		}
+		default:
+			mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+		}
+	}
+
+	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
+		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
+
+	/* Apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+	 * This also happens for small mipmaps of large textures. */
+	if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
+	nvfx_miptree_layout(mt);
+
+	mt->buffer = pscreen->buffer_create(pscreen, 256, buf_usage, mt->total_size);
+	if (!mt->buffer) {
+		/* The layout pass allocated one offset table per level;
+		 * release them so a failed create does not leak. */
+		for (l = 0; l <= pt->last_level; l++) {
+			if (mt->level[l].image_offset)
+				FREE(mt->level[l].image_offset);
+		}
+		FREE(mt);
+		return NULL;
+	}
+	mt->bo = nouveau_bo(mt->buffer);
+	return &mt->base;
+}
+
+/**
+ * Wrap an existing buffer pb in a texture object described by pt.
+ *
+ * Only 2D, single-level, depth-1 textures are supported; anything else
+ * yields NULL.  The texture is marked linear and takes its own reference
+ * on pb.  stride[0] becomes the pitch of level 0.
+ *
+ * Returns the new texture, or NULL on unsupported input or allocation
+ * failure.
+ */
+static struct pipe_texture *
+nvfx_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt,
+		     const unsigned *stride, struct pipe_buffer *pb)
+{
+	struct nvfx_miptree *mt;
+
+	/* Only supports 2D, non-mipmapped textures for the moment */
+	if (pt->target != PIPE_TEXTURE_2D || pt->last_level != 0 ||
+	    pt->depth0 != 1)
+		return NULL;
+
+	mt = CALLOC_STRUCT(nvfx_miptree);
+	if (!mt)
+		return NULL;
+
+	mt->base = *pt;
+	pipe_reference_init(&mt->base.reference, 1);
+	mt->base.screen = pscreen;
+	mt->level[0].pitch = stride[0];
+	mt->level[0].image_offset = CALLOC(1, sizeof(unsigned));
+	/* The offset table is dereferenced when surfaces are created, so a
+	 * failed allocation must fail the whole call. */
+	if (!mt->level[0].image_offset) {
+		FREE(mt);
+		return NULL;
+	}
+
+	/* Assume whoever created this buffer expects it to be linear for now */
+	mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
+	pipe_buffer_reference(&mt->buffer, pb);
+	mt->bo = nouveau_bo(mt->buffer);
+	return &mt->base;
+}
+
+/**
+ * Destroy a texture: drop the reference on its buffer, free the offset
+ * table of every mip level and finally the miptree itself.
+ */
+static void
+nvfx_miptree_destroy(struct pipe_texture *pt)
+{
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	int level = 0;
+
+	pipe_buffer_reference(&mt->buffer, NULL);
+
+	while (level <= pt->last_level) {
+		if (mt->level[level].image_offset != NULL) {
+			FREE(mt->level[level].image_offset);
+		}
+		level++;
+	}
+
+	FREE(mt);
+}
+
+/**
+ * Create a surface for one image (face/level/zslice) of a texture.
+ *
+ * The surface holds a reference on the texture.  When the pitch is not a
+ * multiple of 64 and the surface is requested for GPU writes without
+ * NOUVEAU_BUFFER_USAGE_NO_RENDER, the surface is wrapped by
+ * nv04_surface_wrap_for_render and the wrapper is returned instead.
+ *
+ * Returns NULL if the surface cannot be allocated.
+ */
+static struct pipe_surface *
+nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
+			 unsigned face, unsigned level, unsigned zslice,
+			 unsigned flags)
+{
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	struct nv04_surface *ns = CALLOC_STRUCT(nv04_surface);
+	unsigned image;
+
+	if (!ns)
+		return NULL;
+
+	/* Pick the entry of the level's offset table for this target. */
+	switch (pt->target) {
+	case PIPE_TEXTURE_CUBE:
+		image = face;
+		break;
+	case PIPE_TEXTURE_3D:
+		image = zslice;
+		break;
+	default:
+		image = 0;
+		break;
+	}
+
+	pipe_texture_reference(&ns->base.texture, pt);
+	pipe_reference_init(&ns->base.reference, 1);
+	ns->base.format = pt->format;
+	ns->base.width = u_minify(pt->width0, level);
+	ns->base.height = u_minify(pt->height0, level);
+	ns->base.usage = flags;
+	ns->base.face = face;
+	ns->base.level = level;
+	ns->base.zslice = zslice;
+	ns->base.offset = mt->level[level].image_offset[image];
+	ns->pitch = mt->level[level].pitch;
+
+	/* create a linear temporary that we can render into if necessary.
+	 * Note that ns->pitch is always a multiple of 64 for linear surfaces and swizzled surfaces are POT, so
+	 * ns->pitch & 63 is equivalent to (ns->pitch < 64 && swizzled)*/
+	if (!(ns->pitch & 63))
+		return &ns->base;
+	if ((ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) != PIPE_BUFFER_USAGE_GPU_WRITE)
+		return &ns->base;
+
+	return &nv04_surface_wrap_for_render(pscreen, ((struct nvfx_screen*)pscreen)->eng2d, ns)->base;
+}
+
+/**
+ * Destroy a surface.
+ *
+ * If the surface has a backing surface, the contents are first copied
+ * from ps into the backing surface when the backing surface was created
+ * for GPU writes, and the backing surface is then destroyed as well.
+ */
+static void
+nvfx_miptree_surface_del(struct pipe_surface *ps)
+{
+	struct nv04_surface *ns = (struct nv04_surface *)ps;
+
+	if (ns->backing) {
+		struct nvfx_screen *screen = (struct nvfx_screen *)ps->texture->screen;
+		struct pipe_surface *backing = &ns->backing->base;
+
+		/* Write the rendered contents back before dropping them. */
+		if (backing->usage & PIPE_BUFFER_USAGE_GPU_WRITE) {
+			screen->eng2d->copy(screen->eng2d, backing, 0, 0, ps, 0, 0,
+					    ns->base.width, ns->base.height);
+		}
+		nvfx_miptree_surface_del(backing);
+	}
+
+	pipe_texture_reference(&ps->texture, NULL);
+	FREE(ps);
+}
+
+/**
+ * Install the texture and surface entry points of this file into the
+ * screen's function table.
+ */
+void
+nvfx_screen_init_miptree_functions(struct pipe_screen *pscreen)
+{
+	/* Texture lifetime */
+	pscreen->texture_create = nvfx_miptree_create;
+	nouveau_screen(pscreen)->texture_blanket = nvfx_miptree_blanket;
+	pscreen->texture_destroy = nvfx_miptree_destroy;
+
+	/* Surface lifetime */
+	pscreen->get_tex_surface = nvfx_miptree_surface_new;
+	pscreen->tex_surface_destroy = nvfx_miptree_surface_del;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_query.c b/src/gallium/drivers/nvfx/nvfx_query.c
new file mode 100644
index 00000000000..acbaf75a236
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_query.c
@@ -0,0 +1,127 @@
+#include "pipe/p_context.h"
+
+#include "nvfx_context.h"
+
+/* Driver-private query object; pipe_query pointers are cast to this. */
+struct nvfx_query {
+	struct nouveau_resource *object; /* slot from the screen's query heap; NULL while none is held */
+	unsigned type; /* query type passed to create_query */
+	boolean ready; /* TRUE once result has been read back */
+	uint64_t result; /* cached value returned by get_query_result */
+};
+
+/* Downcast an opaque pipe_query handle to the driver's query object. */
+static INLINE struct nvfx_query *
+nvfx_query(struct pipe_query *pipe)
+{
+	struct nvfx_query *q = (struct nvfx_query *)pipe;
+
+	return q;
+}
+
+/**
+ * Allocate a zero-initialised query object of the given type.
+ *
+ * Returns NULL if the allocation fails.
+ */
+static struct pipe_query *
+nvfx_query_create(struct pipe_context *pipe, unsigned query_type)
+{
+	struct nvfx_query *q;
+
+	q = CALLOC(1, sizeof(struct nvfx_query));
+	/* Do not write through a failed allocation. */
+	if (!q)
+		return NULL;
+	q->type = query_type;
+
+	return (struct pipe_query *)q;
+}
+
+/**
+ * Destroy a query, returning its heap slot first if it still holds one.
+ */
+static void
+nvfx_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_query *query = nvfx_query(pq);
+
+	if (query->object != NULL) {
+		nouveau_resource_free(&query->object);
+	}
+
+	FREE(query);
+}
+
+/**
+ * Begin an occlusion query: allocate a slot in the screen's query heap,
+ * reset the matching notifier and emit the query reset methods.
+ */
+static void
+nvfx_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_screen *screen = nvfx_context(pipe)->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nvfx_query *q = nvfx_query(pq);
+
+	assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	/* Happens when end_query() is called, then another begin_query()
+	 * without querying the result in-between.  For now we'll wait for
+	 * the existing query to notify completion, but it could be better.
+	 */
+	if (q->object) {
+		uint64_t discarded;
+
+		pipe->get_query_result(pipe, pq, 1, &discarded);
+	}
+
+	if (nouveau_resource_alloc(screen->query_heap, 1, NULL, &q->object))
+		assert(0);
+	nouveau_notifier_reset(screen->query, q->object->start);
+
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_RESET, 1);
+	OUT_RING  (chan, 1);
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_UNK17CC, 1);
+	OUT_RING  (chan, 1);
+
+	q->ready = FALSE;
+}
+
+/**
+ * End a query: emit the QUERY_GET method addressing the query's slot
+ * (slot index * 32) and fire the ring.
+ */
+static void
+nvfx_query_end(struct pipe_context *pipe, struct pipe_query *pq)
+{
+	struct nvfx_screen *screen = nvfx_context(pipe)->screen;
+	struct nvfx_query *q = nvfx_query(pq);
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nouveau_channel *chan = screen->base.channel;
+
+	BEGIN_RING(chan, eng3d, NV34TCL_QUERY_GET, 1);
+	OUT_RING  (chan,
+		   (0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) |
+		   ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT));
+	FIRE_RING(chan);
+}
+
+/**
+ * Fetch the result of a query.
+ *
+ * If the result has not been read back yet, the notifier is checked; when
+ * it has not completed, the call either returns FALSE (wait == FALSE) or
+ * blocks until completion.  Once read, the value is cached in the query
+ * and the heap slot is released.
+ *
+ * Returns TRUE with *result filled in, or FALSE if the result is not yet
+ * available and wait is FALSE.
+ */
+static boolean
+nvfx_query_result(struct pipe_context *pipe, struct pipe_query *pq,
+		  boolean wait, uint64_t *result)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_query *q = nvfx_query(pq);
+	unsigned status;
+
+	assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+
+	/* Fast path: the value was already read back earlier. */
+	if (q->ready) {
+		*result = q->result;
+		return TRUE;
+	}
+
+	status = nouveau_notifier_status(nvfx->screen->query,
+					 q->object->start);
+	if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) {
+		if (!wait)
+			return FALSE;
+
+		nouveau_notifier_wait_status(nvfx->screen->query,
+				q->object->start,
+				NV_NOTIFY_STATE_STATUS_COMPLETED, 0);
+	}
+
+	q->result = nouveau_notifier_return_val(nvfx->screen->query,
+						q->object->start);
+	q->ready = TRUE;
+	nouveau_resource_free(&q->object);
+
+	*result = q->result;
+	return TRUE;
+}
+
+/**
+ * Install the query entry points of this file into the context's
+ * function table.
+ */
+void
+nvfx_init_query_functions(struct nvfx_context *nvfx)
+{
+	struct pipe_context *pipe = &nvfx->pipe;
+
+	pipe->create_query = nvfx_query_create;
+	pipe->destroy_query = nvfx_query_destroy;
+	pipe->begin_query = nvfx_query_begin;
+	pipe->end_query = nvfx_query_end;
+	pipe->get_query_result = nvfx_query_result;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c
new file mode 100644
index 00000000000..8138715cc7d
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_screen.c
@@ -0,0 +1,433 @@
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+
+#include "nouveau/nouveau_screen.h"
+
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+
+/* Bitmasks indexed by the low nibble of the chipset id: bit N set means
+ * chipset 0x3N uses the 3D object class that nvfx_screen_create selects
+ * for that mask. */
+#define NV30TCL_CHIPSET_3X_MASK 0x00000003
+#define NV34TCL_CHIPSET_3X_MASK 0x00000010
+#define NV35TCL_CHIPSET_3X_MASK 0x000001e0
+
+/* FIXME: It seems we should not directly include
+ * ../../winsys/drm/nouveau/drm/nouveau_drm_api.h to get the pointer to the
+ * context's front buffer, so struct nouveau_winsys is duplicated here.
+ * nvfx_screen_surface_format_supported() can then use it to enforce that
+ * FBOs are created with the same number of bits everywhere.
+ */
+/* Local copy of the winsys structure; this file only reads ->front. */
+struct nouveau_winsys {
+	struct pipe_winsys base;
+
+	struct pipe_screen *pscreen;
+
+	struct pipe_surface *front;
+};
+/* Same scheme as the 3X masks: bit N selects chipset 0x4N (NV4X_*) or
+ * 0x6N (NV6X_*) for the given 3D object class. */
+#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf
+#define NV4X_GRCLASS4497_CHIPSETS 0x00005450
+#define NV6X_GRCLASS4497_CHIPSETS 0x00000088
+
+/**
+ * Report an integer capability of the screen.
+ *
+ * Unknown capabilities are logged through NOUVEAU_ERR and reported as 0.
+ */
+static int
+nvfx_screen_get_param(struct pipe_screen *pscreen, int param)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	const int nv4x = !!screen->is_nv4x;
+
+	switch (param) {
+	/* --- numeric limits --- */
+	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
+		/* TODO: check this */
+		return nv4x ? 16 : 8;
+	case PIPE_CAP_MAX_RENDER_TARGETS:
+		return nv4x ? 4 : 2;
+	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+	case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+		return 13;
+	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+		return 10;
+	case PIPE_CAP_MAX_COMBINED_SAMPLERS:
+		return 16;
+
+	/* --- available on nv4x only --- */
+	case PIPE_CAP_NPOT_TEXTURES:
+	case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+	case NOUVEAU_CAP_HW_VTXBUF: /* TODO: this is almost surely wrong */
+		return nv4x;
+	case NOUVEAU_CAP_HW_IDXBUF:
+		/* TODO: this is also almost surely wrong */
+		return nv4x && screen->eng3d->grclass == NV40TCL;
+
+	/* --- always supported --- */
+	case PIPE_CAP_TWO_SIDED_STENCIL:
+	case PIPE_CAP_ANISOTROPIC_FILTER:
+	case PIPE_CAP_POINT_SPRITE:
+	case PIPE_CAP_OCCLUSION_QUERY:
+	case PIPE_CAP_TEXTURE_SHADOW_MAP:
+	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+		return 1;
+
+	/* --- never supported --- */
+	case PIPE_CAP_GLSL:
+	case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: /* We have 4 on nv40 - but unsupported currently */
+	case PIPE_CAP_TGSI_CONT_SUPPORTED:
+	case PIPE_CAP_INDEP_BLEND_ENABLE:
+		/* TODO: on nv40 we have separate color masks */
+		/* TODO: nv40 mrt blending is probably broken */
+	case PIPE_CAP_INDEP_BLEND_FUNC:
+	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+		return 0;
+
+	default:
+		NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+		return 0;
+	}
+}
+
+/**
+ * Report a floating-point capability of the screen.
+ *
+ * Unknown capabilities are logged through NOUVEAU_ERR and reported as 0.0.
+ */
+static float
+nvfx_screen_get_paramf(struct pipe_screen *pscreen, int param)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+
+	if (param == PIPE_CAP_MAX_LINE_WIDTH ||
+	    param == PIPE_CAP_MAX_LINE_WIDTH_AA)
+		return 10.0;
+
+	if (param == PIPE_CAP_MAX_POINT_WIDTH ||
+	    param == PIPE_CAP_MAX_POINT_WIDTH_AA)
+		return 64.0;
+
+	if (param == PIPE_CAP_MAX_TEXTURE_ANISOTROPY)
+		return screen->is_nv4x ? 16.0 : 8.0;
+
+	if (param == PIPE_CAP_MAX_TEXTURE_LOD_BIAS)
+		return screen->is_nv4x ? 16.0 : 4.0;
+
+	NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param);
+	return 0.0;
+}
+
+/**
+ * Tell whether a format can be used for the requested texture usage.
+ *
+ * Render-target usage is checked first, then depth/stencil usage; any
+ * other usage is checked against the list of sampleable formats.
+ */
+static boolean
+nvfx_screen_surface_format_supported(struct pipe_screen *pscreen,
+				     enum pipe_format format,
+				     enum pipe_texture_target target,
+				     unsigned tex_usage, unsigned geom_flags)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front;
+
+	if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) {
+		if (format == PIPE_FORMAT_B8G8R8A8_UNORM ||
+		    format == PIPE_FORMAT_B5G6R5_UNORM)
+			return TRUE;
+		return FALSE;
+	}
+
+	if (tex_usage & PIPE_TEXTURE_USAGE_DEPTH_STENCIL) {
+		if (format == PIPE_FORMAT_S8Z24_UNORM ||
+		    format == PIPE_FORMAT_X8Z24_UNORM)
+			return TRUE;
+		if (format != PIPE_FORMAT_Z16_UNORM)
+			return FALSE;
+
+		/* TODO: this nv30 limitation probably does not exist */
+		if (!screen->is_nv4x && front)
+			return (front->format == PIPE_FORMAT_B5G6R5_UNORM);
+		return TRUE;
+	}
+
+	/* Plain texturing. */
+	switch (format) {
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case PIPE_FORMAT_B5G5R5A1_UNORM:
+	case PIPE_FORMAT_B4G4R4A4_UNORM:
+	case PIPE_FORMAT_B5G6R5_UNORM:
+	case PIPE_FORMAT_L8_UNORM:
+	case PIPE_FORMAT_A8_UNORM:
+	case PIPE_FORMAT_I8_UNORM:
+	case PIPE_FORMAT_L8A8_UNORM:
+	case PIPE_FORMAT_Z16_UNORM:
+	case PIPE_FORMAT_S8Z24_UNORM:
+	case PIPE_FORMAT_DXT1_RGB:
+	case PIPE_FORMAT_DXT1_RGBA:
+	case PIPE_FORMAT_DXT3_RGBA:
+	case PIPE_FORMAT_DXT5_RGBA:
+		return TRUE;
+	/* TODO: does nv30 support this? */
+	case PIPE_FORMAT_R16_SNORM:
+		return !!screen->is_nv4x;
+	default:
+		return FALSE;
+	}
+}
+
+/* Return the buffer backing the texture that a surface belongs to. */
+static struct pipe_buffer *
+nvfx_surface_buffer(struct pipe_surface *surf)
+{
+	return ((struct nvfx_miptree *)surf->texture)->buffer;
+}
+
+/**
+ * Tear down a screen: drop the cached state objects, release heaps,
+ * notifiers and engine objects, finalise the base screen and free it.
+ */
+static void
+nvfx_screen_destroy(struct pipe_screen *pscreen)
+{
+	struct nvfx_screen *screen = nvfx_screen(pscreen);
+	unsigned slot = 0;
+
+	/* Drop every state object still referenced by the screen. */
+	while (slot < NVFX_STATE_MAX) {
+		if (screen->state[slot] != NULL) {
+			so_ref(NULL, &screen->state[slot]);
+		}
+		slot++;
+	}
+
+	/* Heaps */
+	nouveau_resource_destroy(&screen->vp_exec_heap);
+	nouveau_resource_destroy(&screen->vp_data_heap);
+	nouveau_resource_destroy(&screen->query_heap);
+
+	/* Notifiers */
+	nouveau_notifier_free(&screen->query);
+	nouveau_notifier_free(&screen->sync);
+
+	/* Engine objects */
+	nouveau_grobj_free(&screen->eng3d);
+	nv04_surface_2d_takedown(&screen->eng2d);
+
+	nouveau_screen_fini(&screen->base);
+
+	FREE(pscreen);
+}
+
+/* Append the static 3D-engine initialisation used on non-nv4x hardware
+ * to the state object so.  Nothing is emitted here; the caller emits so.
+ * NOTE(review): most methods below are raw offsets without symbolic
+ * names; their meaning is not evident from this file. */
+static void nv30_screen_init(struct nvfx_screen *screen, struct nouveau_stateobj* so)
+{
+	int i;
+
+	/* TODO: perhaps we should do some of this on nv40 too? */
+	/* Zero the horizontal and vertical viewport clip entries 1..7. */
+	for (i=1; i<8; i++) {
+		so_method(so, screen->eng3d, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1);
+		so_data  (so, 0);
+		so_method(so, screen->eng3d, NV34TCL_VIEWPORT_CLIP_VERT(i), 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, screen->eng3d, 0x220, 1);
+	so_data  (so, 1);
+
+	so_method(so, screen->eng3d, 0x03b0, 1);
+	so_data  (so, 0x00100000);
+	so_method(so, screen->eng3d, 0x1454, 1);
+	so_data  (so, 0);
+	so_method(so, screen->eng3d, 0x1d80, 1);
+	so_data  (so, 3);
+	so_method(so, screen->eng3d, 0x1450, 1);
+	so_data  (so, 0x00030004);
+
+	/* NEW */
+	so_method(so, screen->eng3d, 0x1e98, 1);
+	so_data  (so, 0);
+	so_method(so, screen->eng3d, 0x17e0, 3);
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(1.0));
+	/* Sixteen data words: all zero except word 8, which is 0x0000ffff. */
+	so_method(so, screen->eng3d, 0x1f80, 16);
+	for (i=0; i<16; i++) {
+		so_data  (so, (i==8) ? 0x0000ffff : 0);
+	}
+
+	so_method(so, screen->eng3d, 0x120, 3);
+	so_data  (so, 0);
+	so_data  (so, 1);
+	so_data  (so, 2);
+
+	so_method(so, screen->eng3d, 0x1d88, 1);
+	so_data  (so, 0x00001200);
+
+	so_method(so, screen->eng3d, NV34TCL_RC_ENABLE, 1);
+	so_data  (so, 0);
+
+	/* Depth range near = 0.0, far = 1.0. */
+	so_method(so, screen->eng3d, NV34TCL_DEPTH_RANGE_NEAR, 2);
+	so_data  (so, fui(0.0));
+	so_data  (so, fui(1.0));
+
+	so_method(so, screen->eng3d, NV34TCL_MULTISAMPLE_CONTROL, 1);
+	so_data  (so, 0xffff0000);
+
+	/* enables use of vp rather than fixed-function somehow */
+	so_method(so, screen->eng3d, 0x1e94, 1);
+	so_data  (so, 0x13);
+}
+
+/* Append the static 3D-engine initialisation used on nv4x hardware to
+ * the state object so.  Nothing is emitted here; the caller emits so.
+ * NOTE(review): most methods below are raw offsets without symbolic
+ * names; their meaning is not evident from this file. */
+static void nv40_screen_init(struct nvfx_screen *screen, struct nouveau_stateobj* so)
+{
+	/* Two consecutive DMA objects starting at COLOR2, both set to VRAM. */
+	so_method(so, screen->eng3d, NV40TCL_DMA_COLOR2, 2);
+	so_data  (so, screen->base.channel->vram->handle);
+	so_data  (so, screen->base.channel->vram->handle);
+
+	so_method(so, screen->eng3d, 0x1ea4, 3);
+	so_data  (so, 0x00000010);
+	so_data  (so, 0x01000100);
+	so_data  (so, 0xff800006);
+
+	/* vtxprog output routing */
+	so_method(so, screen->eng3d, 0x1fc4, 1);
+	so_data  (so, 0x06144321);
+	so_method(so, screen->eng3d, 0x1fc8, 2);
+	so_data  (so, 0xedcba987);
+	so_data  (so, 0x00000021);
+	so_method(so, screen->eng3d, 0x1fd0, 1);
+	so_data  (so, 0x00171615);
+	so_method(so, screen->eng3d, 0x1fd4, 1);
+	so_data  (so, 0x001b1a19);
+
+	so_method(so, screen->eng3d, 0x1ef8, 1);
+	so_data  (so, 0x0020ffff);
+	so_method(so, screen->eng3d, 0x1d64, 1);
+	so_data  (so, 0x00d30000);
+	so_method(so, screen->eng3d, 0x1e94, 1);
+	so_data  (so, 0x00000001);
+}
+
+/**
+ * Create and initialise a screen for an nv3x/nv4x device.
+ *
+ * Selects the 3D object class from the chipset id, allocates the 3D and
+ * 2D engine objects, the sync and query notifiers and the query and
+ * vertex-program heaps, then emits the static 3D engine state.
+ *
+ * Returns the new screen, or NULL on any failure.  Every failure path
+ * after the screen has been allocated releases it through
+ * nvfx_screen_destroy(), the same way the pre-existing error paths do.
+ */
+struct pipe_screen *
+nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
+{
+	struct nvfx_screen *screen = CALLOC_STRUCT(nvfx_screen);
+	struct nouveau_channel *chan;
+	struct pipe_screen *pscreen;
+	struct nouveau_stateobj *so;
+	unsigned eng3d_class = 0;
+	int ret;
+
+	if (!screen)
+		return NULL;
+
+	pscreen = &screen->base.base;
+
+	ret = nouveau_screen_init(&screen->base, dev);
+	if (ret) {
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+	chan = screen->base.channel;
+
+	pscreen->winsys = ws;
+	pscreen->destroy = nvfx_screen_destroy;
+	pscreen->get_param = nvfx_screen_get_param;
+	pscreen->get_paramf = nvfx_screen_get_paramf;
+	pscreen->is_format_supported = nvfx_screen_surface_format_supported;
+	pscreen->context_create = nvfx_create;
+
+	/* The high nibble selects the family, the low nibble indexes the
+	 * per-class chipset bitmasks. */
+	switch (dev->chipset & 0xf0) {
+	case 0x30:
+		if (NV30TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0397;
+		else if (NV34TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0697;
+		else if (NV35TCL_CHIPSET_3X_MASK & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = 0x0497;
+		break;
+	case 0x40:
+		if (NV4X_GRCLASS4097_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV40TCL;
+		else if (NV4X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV44TCL;
+		screen->is_nv4x = ~0;
+		break;
+	case 0x60:
+		if (NV6X_GRCLASS4497_CHIPSETS & (1 << (dev->chipset & 0x0f)))
+			eng3d_class = NV44TCL;
+		screen->is_nv4x = ~0;
+		break;
+	}
+
+	if (!eng3d_class) {
+		NOUVEAU_ERR("Unknown nv3x/nv4x chipset: nv%02x\n", dev->chipset);
+		/* Do not leak the half-initialised screen. */
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	nvfx_screen_init_miptree_functions(pscreen);
+
+	ret = nouveau_grobj_alloc(chan, 0xbeef3097, eng3d_class, &screen->eng3d);
+	if (ret) {
+		NOUVEAU_ERR("Error creating 3D object: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* 2D engine setup */
+	screen->eng2d = nv04_surface_2d_init(&screen->base);
+	if (!screen->eng2d) {
+		NOUVEAU_ERR("Error initialising 2D engine\n");
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+	screen->eng2d->buf = nvfx_surface_buffer;
+
+	/* Notifier for sync purposes */
+	ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync);
+	if (ret) {
+		NOUVEAU_ERR("Error creating notifier object: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* Query objects */
+	ret = nouveau_notifier_alloc(chan, 0xbeef0302, 32, &screen->query);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query objects: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	ret = nouveau_resource_init(&screen->query_heap, 0, 32);
+	if (ret) {
+		NOUVEAU_ERR("Error initialising query object heap: %d\n", ret);
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* Vtxprog resources */
+	if (nouveau_resource_init(&screen->vp_exec_heap, 0, screen->is_nv4x ? 512 : 256) ||
+	    nouveau_resource_init(&screen->vp_data_heap, 0, 256)) {
+		nvfx_screen_destroy(pscreen);
+		return NULL;
+	}
+
+	/* Static eng3d initialisation */
+	/* make the so big and don't worry about exact values
+	   since it will be thrown away immediately after use */
+	so = so_new(256, 256, 0);
+	so_method(so, screen->eng3d, NV34TCL_DMA_NOTIFY, 1);
+	so_data  (so, screen->sync->handle);
+	so_method(so, screen->eng3d, NV34TCL_DMA_TEXTURE0, 2);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->gart->handle);
+	so_method(so, screen->eng3d, NV34TCL_DMA_COLOR1, 1);
+	so_data  (so, chan->vram->handle);
+	so_method(so, screen->eng3d, NV34TCL_DMA_COLOR0, 2);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
+	so_method(so, screen->eng3d, NV34TCL_DMA_VTXBUF0, 2);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->gart->handle);
+
+	so_method(so, screen->eng3d, NV34TCL_DMA_FENCE, 2);
+	so_data  (so, 0);
+	so_data  (so, screen->query->handle);
+
+	so_method(so, screen->eng3d, NV34TCL_DMA_IN_MEMORY7, 2);
+	so_data  (so, chan->vram->handle);
+	so_data  (so, chan->vram->handle);
+
+	/* Family-specific part of the static state. */
+	if(!screen->is_nv4x)
+		nv30_screen_init(screen, so);
+	else
+		nv40_screen_init(screen, so);
+
+	so_emit(chan, so);
+	so_ref(NULL, &so);
+	nouveau_pushbuf_flush(chan, 0);
+
+	return pscreen;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h
new file mode 100644
index 00000000000..c0b4b9899dd
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_screen.h
@@ -0,0 +1,39 @@
+#ifndef __NVFX_SCREEN_H__
+#define __NVFX_SCREEN_H__
+
+#include "nouveau/nouveau_screen.h"
+#include "nv04_surface_2d.h"
+
+/* Driver-private screen.  base must stay the first member: nvfx_screen()
+ * casts a pipe_screen pointer directly to this type. */
+struct nvfx_screen {
+	struct nouveau_screen base;
+
+	struct nouveau_winsys *nvws;
+
+	struct nvfx_context *cur_ctx;
+
+	unsigned is_nv4x; /* either 0 or ~0 */
+
+	/* HW graphics objects */
+	struct nv04_surface_2d *eng2d;
+	struct nouveau_grobj *eng3d;
+	struct nouveau_notifier *sync;
+
+	/* Query object resources */
+	struct nouveau_notifier *query;
+	struct nouveau_resource *query_heap;
+
+	/* Vtxprog resources */
+	struct nouveau_resource *vp_exec_heap;
+	struct nouveau_resource *vp_data_heap;
+
+	/* Current 3D state of channel */
+	struct nouveau_stateobj *state[NVFX_STATE_MAX];
+};
+
+/* Downcast a generic pipe_screen pointer to the driver's screen type. */
+static INLINE struct nvfx_screen *
+nvfx_screen(struct pipe_screen *screen)
+{
+	struct nvfx_screen *nvfx_scr = (struct nvfx_screen *)screen;
+
+	return nvfx_scr;
+}
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h
new file mode 100644
index 00000000000..0b2f044f7fe
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_shader.h
@@ -0,0 +1,429 @@
+#ifndef __NVFX_SHADER_H__
+#define __NVFX_SHADER_H__
+
+/* Resolve a vertex program constant to either its NV30_VP_ or its NV40_VP_
+ * variant, depending on the current hardware.
+ *
+ * Unusual, but very fast and compact: the difference between the two values
+ * is masked with nvfx->is_nv4x and added to the NV30 value.  This relies on
+ * is_nv4x being either 0 (result is the NV30 value) or ~0 (result is the
+ * NV40 value), and on a variable named `nvfx` with an is_nv4x member being
+ * in scope wherever the macro is used. */
+#define NVFX_VP(c) ((NV30_VP_##c) + (nvfx->is_nv4x & ((NV40_VP_##c) - (NV30_VP_##c))))
+
+#define NVFX_VP_INST_SLOT_VEC 0
+#define NVFX_VP_INST_SLOT_SCA 1
+
+#define NVFX_VP_INST_IN_POS  0    /* These seem to match the bindings specified in */
+#define NVFX_VP_INST_IN_WEIGHT  1    /* the ARB_v_p spec (2.14.3.1) */
+#define NVFX_VP_INST_IN_NORMAL  2
+#define NVFX_VP_INST_IN_COL0  3    /* Should probably confirm them all though */
+#define NVFX_VP_INST_IN_COL1  4
+#define NVFX_VP_INST_IN_FOGC  5
+#define NVFX_VP_INST_IN_TC0  8
+/* Input slot of texture coordinate set n; the argument is parenthesized so
+ * that expressions with low-precedence operators expand correctly. */
+#define NVFX_VP_INST_IN_TC(n)  (8 + (n))
+
+#define NVFX_VP_INST_SCA_OP_NOP 0x00
+#define NVFX_VP_INST_SCA_OP_MOV 0x01
+#define NVFX_VP_INST_SCA_OP_RCP 0x02
+#define NVFX_VP_INST_SCA_OP_RCC 0x03
+#define NVFX_VP_INST_SCA_OP_RSQ 0x04
+#define NVFX_VP_INST_SCA_OP_EXP 0x05
+#define NVFX_VP_INST_SCA_OP_LOG 0x06
+#define NVFX_VP_INST_SCA_OP_LIT 0x07
+#define NVFX_VP_INST_SCA_OP_BRA 0x09
+#define NVFX_VP_INST_SCA_OP_CAL 0x0B
+#define NVFX_VP_INST_SCA_OP_RET 0x0C
+#define NVFX_VP_INST_SCA_OP_LG2 0x0D
+#define NVFX_VP_INST_SCA_OP_EX2 0x0E
+#define NVFX_VP_INST_SCA_OP_SIN 0x0F
+#define NVFX_VP_INST_SCA_OP_COS 0x10
+
+#define NV40_VP_INST_SCA_OP_PUSHA 0x13
+#define NV40_VP_INST_SCA_OP_POPA 0x14
+
+#define NVFX_VP_INST_VEC_OP_NOP 0x00
+#define NVFX_VP_INST_VEC_OP_MOV 0x01
+#define NVFX_VP_INST_VEC_OP_MUL 0x02
+#define NVFX_VP_INST_VEC_OP_ADD 0x03
+#define NVFX_VP_INST_VEC_OP_MAD 0x04
+#define NVFX_VP_INST_VEC_OP_DP3 0x05
+#define NVFX_VP_INST_VEC_OP_DPH 0x06
+#define NVFX_VP_INST_VEC_OP_DP4 0x07
+#define NVFX_VP_INST_VEC_OP_DST 0x08
+#define NVFX_VP_INST_VEC_OP_MIN 0x09
+#define NVFX_VP_INST_VEC_OP_MAX 0x0A
+#define NVFX_VP_INST_VEC_OP_SLT 0x0B
+#define NVFX_VP_INST_VEC_OP_SGE 0x0C
+#define NVFX_VP_INST_VEC_OP_ARL 0x0D
+#define NVFX_VP_INST_VEC_OP_FRC 0x0E
+#define NVFX_VP_INST_VEC_OP_FLR 0x0F
+#define NVFX_VP_INST_VEC_OP_SEQ 0x10
+#define NVFX_VP_INST_VEC_OP_SFL 0x11
+#define NVFX_VP_INST_VEC_OP_SGT 0x12
+#define NVFX_VP_INST_VEC_OP_SLE 0x13
+#define NVFX_VP_INST_VEC_OP_SNE 0x14
+#define NVFX_VP_INST_VEC_OP_STR 0x15
+#define NVFX_VP_INST_VEC_OP_SSG 0x16
+#define NVFX_VP_INST_VEC_OP_ARR 0x17
+#define NVFX_VP_INST_VEC_OP_ARA 0x18
+
+#define NV40_VP_INST_VEC_OP_TXL 0x19
+
+/* DWORD 3 */
+#define NVFX_VP_INST_LAST                           (1 << 0)
+
+/*
+ * Each fragment program opcode appears to be comprised of 4 32-bit values.
+ *
+ *   0 - Opcode, output reg/mask, ATTRIB source
+ *   1 - Source 0
+ *   2 - Source 1
+ *   3 - Source 2
+ *
+ * There appears to be no special difference between result regs and temp regs.
+ *     result.color == R0.xyzw
+ *     result.depth == R1.z
+ * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0
+ * otherwise it is set to 1.
+ *
+ * Constants are inserted directly after the instruction that uses them.
+ *
+ * It appears that it's not possible to use two input registers in one
+ * instruction as the input sourcing is done in the instruction dword
+ * and not the source selection dwords.  As such instructions such as:
+ *
+ *     ADD result.color, fragment.color, fragment.texcoord[0];
+ *
+ * must be split into two MOV's and then an ADD (nvidia does this) but
+ * I'm not sure why it's not just one MOV and then source the second input
+ * in the ADD instruction..
+ *
+ * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary
+ * negation requires multiplication with a const.
+ *
+ * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE
+ * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO
+ * is implemented simply by not writing to the relevant components of the destination.
+ *
+ * Conditional execution
+ *   TODO
+ *
+ * Non-native instructions:
+ *   LIT
+ *   LRP - MAD+MAD
+ *   SUB - ADD, negate second source
+ *   RSQ - LG2 + EX2
+ *   POW - LG2 + MUL + EX2
+ *   SCS - COS + SIN
+ *   XPD
+ *
+ * NV40 Looping
+ *   Loops appear to be fairly expensive on NV40 at least, the proprietary
+ *   driver goes to a lot of effort to avoid using the native looping
+ *   instructions.  If the total number of *executed* instructions between
+ *   REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop.
+ *   The maximum loop count is 255.
+ *
+ */
+
+//== Opcode / Destination selection ==
+#define NVFX_FP_OP_PROGRAM_END          (1 << 0)
+#define NVFX_FP_OP_OUT_REG_SHIFT        1
+#define NV30_FP_OP_OUT_REG_MASK          (31 << 1)  /* uncertain */
+#define NV40_FP_OP_OUT_REG_MASK          (63 << 1)
+/* Needs to be set when writing outputs to get expected result.. */
+#define NVFX_FP_OP_OUT_REG_HALF          (1 << 7)
+#define NVFX_FP_OP_COND_WRITE_ENABLE        (1 << 8)
+#define NVFX_FP_OP_OUTMASK_SHIFT        9
+#define NVFX_FP_OP_OUTMASK_MASK          (0xF << 9)
+#  define NVFX_FP_OP_OUT_X  (1<<9)
+#  define NVFX_FP_OP_OUT_Y  (1<<10)
+#  define NVFX_FP_OP_OUT_Z  (1<<11)
+#  define NVFX_FP_OP_OUT_W  (1<<12)
+/* Uncertain about these, especially the input_src values.. it's possible that
+ * they can be dynamically changed.
+ */
+#define NVFX_FP_OP_INPUT_SRC_SHIFT        13
+#define NVFX_FP_OP_INPUT_SRC_MASK        (15 << 13)
+#  define NVFX_FP_OP_INPUT_SRC_POSITION  0x0
+#  define NVFX_FP_OP_INPUT_SRC_COL0  0x1
+#  define NVFX_FP_OP_INPUT_SRC_COL1  0x2
+#  define NVFX_FP_OP_INPUT_SRC_FOGC  0x3
+#  define NVFX_FP_OP_INPUT_SRC_TC0    0x4
+/* Input source of texture coordinate set n; the argument is parenthesized
+ * so that expressions with low-precedence operators expand correctly. */
+#  define NVFX_FP_OP_INPUT_SRC_TC(n)  (0x4 + (n))
+#  define NV40_FP_OP_INPUT_SRC_FACING  0xE
+#define NVFX_FP_OP_TEX_UNIT_SHIFT        17
+#define NVFX_FP_OP_TEX_UNIT_MASK        (0xF << 17) /* guess */
+#define NVFX_FP_OP_PRECISION_SHIFT        22
+#define NVFX_FP_OP_PRECISION_MASK        (3 << 22)
+#   define NVFX_FP_PRECISION_FP32  0
+#   define NVFX_FP_PRECISION_FP16  1
+#   define NVFX_FP_PRECISION_FX12  2
+#define NVFX_FP_OP_OPCODE_SHIFT          24
+#define NVFX_FP_OP_OPCODE_MASK          (0x3F << 24)
+/* NV30/NV40 fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_NOP 0x00
+#define NVFX_FP_OP_OPCODE_MOV 0x01
+#define NVFX_FP_OP_OPCODE_MUL 0x02
+#define NVFX_FP_OP_OPCODE_ADD 0x03
+#define NVFX_FP_OP_OPCODE_MAD 0x04
+#define NVFX_FP_OP_OPCODE_DP3 0x05
+#define NVFX_FP_OP_OPCODE_DP4 0x06
+#define NVFX_FP_OP_OPCODE_DST 0x07
+#define NVFX_FP_OP_OPCODE_MIN 0x08
+#define NVFX_FP_OP_OPCODE_MAX 0x09
+#define NVFX_FP_OP_OPCODE_SLT 0x0A
+#define NVFX_FP_OP_OPCODE_SGE 0x0B
+#define NVFX_FP_OP_OPCODE_SLE 0x0C
+#define NVFX_FP_OP_OPCODE_SGT 0x0D
+#define NVFX_FP_OP_OPCODE_SNE 0x0E
+#define NVFX_FP_OP_OPCODE_SEQ 0x0F
+#define NVFX_FP_OP_OPCODE_FRC 0x10
+#define NVFX_FP_OP_OPCODE_FLR 0x11
+#define NVFX_FP_OP_OPCODE_KIL 0x12
+#define NVFX_FP_OP_OPCODE_PK4B 0x13
+#define NVFX_FP_OP_OPCODE_UP4B 0x14
+#define NVFX_FP_OP_OPCODE_DDX 0x15 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_DDY 0x16 /* can only write XY */
+#define NVFX_FP_OP_OPCODE_TEX 0x17
+#define NVFX_FP_OP_OPCODE_TXP 0x18
+#define NVFX_FP_OP_OPCODE_TXD 0x19
+#define NVFX_FP_OP_OPCODE_RCP 0x1A
+#define NVFX_FP_OP_OPCODE_EX2 0x1C
+#define NVFX_FP_OP_OPCODE_LG2 0x1D
+#define NVFX_FP_OP_OPCODE_STR 0x20
+#define NVFX_FP_OP_OPCODE_SFL 0x21
+#define NVFX_FP_OP_OPCODE_COS 0x22
+#define NVFX_FP_OP_OPCODE_SIN 0x23
+#define NVFX_FP_OP_OPCODE_PK2H 0x24
+#define NVFX_FP_OP_OPCODE_UP2H 0x25
+#define NVFX_FP_OP_OPCODE_PK4UB 0x27
+#define NVFX_FP_OP_OPCODE_UP4UB 0x28
+#define NVFX_FP_OP_OPCODE_PK2US 0x29
+#define NVFX_FP_OP_OPCODE_UP2US 0x2A
+#define NVFX_FP_OP_OPCODE_DP2A 0x2E
+#define NVFX_FP_OP_OPCODE_TXB 0x31
+#define NVFX_FP_OP_OPCODE_DIV 0x3A
+
+/* NV30 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_RSQ_NV30 0x1B
+#define NVFX_FP_OP_OPCODE_LIT_NV30 0x1E
+#define NVFX_FP_OP_OPCODE_LRP_NV30 0x1F
+#define NVFX_FP_OP_OPCODE_POW_NV30 0x26
+#define NVFX_FP_OP_OPCODE_RFL_NV30 0x36
+
+/* NV40 only fragment program opcodes */
+#define NVFX_FP_OP_OPCODE_TXL_NV40 0x31
+/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/
+#define NV40_FP_OP_BRA_OPCODE_BRK                                    0x0
+#define NV40_FP_OP_BRA_OPCODE_CAL                                    0x1
+#define NV40_FP_OP_BRA_OPCODE_IF                                     0x2
+#define NV40_FP_OP_BRA_OPCODE_LOOP                                   0x3
+#define NV40_FP_OP_BRA_OPCODE_REP                                    0x4
+#define NV40_FP_OP_BRA_OPCODE_RET                                    0x5
+
+/* Bit 31: unsigned constant, because 1 << 31 overflows a 32-bit signed int */
+#define NVFX_FP_OP_OUT_SAT          (1u << 31)
+
+/* high order bits of SRC0 */
+#define NVFX_FP_OP_OUT_ABS          (1 << 29)
+#define NVFX_FP_OP_COND_SWZ_W_SHIFT        27
+#define NVFX_FP_OP_COND_SWZ_W_MASK        (3 << 27)
+#define NVFX_FP_OP_COND_SWZ_Z_SHIFT        25
+#define NVFX_FP_OP_COND_SWZ_Z_MASK        (3 << 25)
+#define NVFX_FP_OP_COND_SWZ_Y_SHIFT        23
+#define NVFX_FP_OP_COND_SWZ_Y_MASK        (3 << 23)
+#define NVFX_FP_OP_COND_SWZ_X_SHIFT        21
+#define NVFX_FP_OP_COND_SWZ_X_MASK        (3 << 21)
+#define NVFX_FP_OP_COND_SWZ_ALL_SHIFT        21
+#define NVFX_FP_OP_COND_SWZ_ALL_MASK        (0xFF << 21)
+#define NVFX_FP_OP_COND_SHIFT          18
+#define NVFX_FP_OP_COND_MASK          (0x07 << 18)
+#  define NVFX_FP_OP_COND_FL  0
+#  define NVFX_FP_OP_COND_LT  1
+#  define NVFX_FP_OP_COND_EQ  2
+#  define NVFX_FP_OP_COND_LE  3
+#  define NVFX_FP_OP_COND_GT  4
+#  define NVFX_FP_OP_COND_NE  5
+#  define NVFX_FP_OP_COND_GE  6
+#  define NVFX_FP_OP_COND_TR  7
+
+/* high order bits of SRC1 */
+/* Bit 31: unsigned constant, because 1 << 31 overflows a 32-bit signed int */
+#define NV40_FP_OP_OPCODE_IS_BRANCH                                      (1u<<31)
+#define NVFX_FP_OP_DST_SCALE_SHIFT        28
+#define NVFX_FP_OP_DST_SCALE_MASK        (3 << 28)
+#define NVFX_FP_OP_DST_SCALE_1X                                                0
+#define NVFX_FP_OP_DST_SCALE_2X                                                1
+#define NVFX_FP_OP_DST_SCALE_4X                                                2
+#define NVFX_FP_OP_DST_SCALE_8X                                                3
+#define NVFX_FP_OP_DST_SCALE_INV_2X                                            5
+#define NVFX_FP_OP_DST_SCALE_INV_4X                                            6
+#define NVFX_FP_OP_DST_SCALE_INV_8X                                            7
+
+/* SRC1 LOOP */
+#define NV40_FP_OP_LOOP_INCR_SHIFT                                            19
+#define NV40_FP_OP_LOOP_INCR_MASK                                   (0xFF << 19)
+#define NV40_FP_OP_LOOP_INDEX_SHIFT                                           10
+#define NV40_FP_OP_LOOP_INDEX_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_LOOP_COUNT_SHIFT                                            2
+#define NV40_FP_OP_LOOP_COUNT_MASK                                   (0xFF << 2)
+
+/* SRC1 IF */
+#define NV40_FP_OP_ELSE_ID_SHIFT                                               2
+#define NV40_FP_OP_ELSE_ID_MASK                                      (0xFF << 2)
+
+/* SRC1 CAL */
+#define NV40_FP_OP_IADDR_SHIFT                                                 2
+#define NV40_FP_OP_IADDR_MASK                                        (0xFF << 2)
+
+/* SRC1 REP
+ *   I have no idea why there are 3 count values here..  but they
+ *   have always been filled with the same value in my tests so
+ *   far..
+ */
+#define NV40_FP_OP_REP_COUNT1_SHIFT                                            2
+#define NV40_FP_OP_REP_COUNT1_MASK                                   (0xFF << 2)
+#define NV40_FP_OP_REP_COUNT2_SHIFT                                           10
+#define NV40_FP_OP_REP_COUNT2_MASK                                  (0xFF << 10)
+#define NV40_FP_OP_REP_COUNT3_SHIFT                                           19
+#define NV40_FP_OP_REP_COUNT3_MASK                                  (0xFF << 19)
+
+/* SRC2 REP/IF */
+#define NV40_FP_OP_END_ID_SHIFT                                                2
+#define NV40_FP_OP_END_ID_MASK                                       (0xFF << 2)
+
+/* high order bits of SRC2 */
+#define NVFX_FP_OP_INDEX_INPUT          (1 << 30)
+#define NV40_FP_OP_ADDR_INDEX_SHIFT        19
+#define NV40_FP_OP_ADDR_INDEX_MASK        (0xF << 19)
+
+//== Register selection ==
+#define NVFX_FP_REG_TYPE_SHIFT           0
+#define NVFX_FP_REG_TYPE_MASK           (3 << 0)
+#  define NVFX_FP_REG_TYPE_TEMP   0
+#  define NVFX_FP_REG_TYPE_INPUT  1
+#  define NVFX_FP_REG_TYPE_CONST  2
+#define NVFX_FP_REG_SRC_SHIFT            2
+#define NV30_FP_REG_SRC_MASK              (31 << 2)
+#define NV40_FP_REG_SRC_MASK              (63 << 2)
+#define NVFX_FP_REG_SRC_HALF            (1 << 8)
+#define NVFX_FP_REG_SWZ_ALL_SHIFT        9
+#define NVFX_FP_REG_SWZ_ALL_MASK        (255 << 9)
+#define NVFX_FP_REG_SWZ_X_SHIFT          9
+#define NVFX_FP_REG_SWZ_X_MASK          (3 << 9)
+#define NVFX_FP_REG_SWZ_Y_SHIFT          11
+#define NVFX_FP_REG_SWZ_Y_MASK          (3 << 11)
+#define NVFX_FP_REG_SWZ_Z_SHIFT          13
+#define NVFX_FP_REG_SWZ_Z_MASK          (3 << 13)
+#define NVFX_FP_REG_SWZ_W_SHIFT          15
+#define NVFX_FP_REG_SWZ_W_MASK          (3 << 15)
+#  define NVFX_FP_SWIZZLE_X  0
+#  define NVFX_FP_SWIZZLE_Y  1
+#  define NVFX_FP_SWIZZLE_Z  2
+#  define NVFX_FP_SWIZZLE_W  3
+#define NVFX_FP_REG_NEGATE          (1 << 17)
+
+#define NVFXSR_NONE	0
+#define NVFXSR_OUTPUT	1
+#define NVFXSR_INPUT	2
+#define NVFXSR_TEMP	3
+#define NVFXSR_CONST	4
+
+#define NVFX_COND_FL  0
+#define NVFX_COND_LT  1
+#define NVFX_COND_EQ  2
+#define NVFX_COND_LE  3
+#define NVFX_COND_GT  4
+#define NVFX_COND_NE  5
+#define NVFX_COND_GE  6
+#define NVFX_COND_TR  7
+
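+/* Yes, these are ordered differently: the vertex program write mask has X
+ * in the highest bit, the fragment program write mask has X in the lowest. */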
+
+#define NVFX_VP_MASK_X 8
+#define NVFX_VP_MASK_Y 4
+#define NVFX_VP_MASK_Z 2
+#define NVFX_VP_MASK_W 1
+#define NVFX_VP_MASK_ALL 0xf
+
+#define NVFX_FP_MASK_X 1
+#define NVFX_FP_MASK_Y 2
+#define NVFX_FP_MASK_Z 4
+#define NVFX_FP_MASK_W 8
+#define NVFX_FP_MASK_ALL 0xf
+
+#define NVFX_SWZ_X 0
+#define NVFX_SWZ_Y 1
+#define NVFX_SWZ_Z 2
+#define NVFX_SWZ_W 3
+
+/* Shorthand wrappers around the nvfx_sr_* source-register helpers.
+ *   swz(s,X,Y,Z,W) - component names are pasted onto NVFX_SWZ_
+ *   scale(s,2X)    - scale name is pasted onto NVFX_FP_OP_DST_SCALE_
+ * NOTE: the function-like macro `abs` takes over any later use of the C
+ * library's abs() in files that include this header. */
+#define swz(s,x,y,z,w) nvfx_sr_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w)
+#define neg(s) nvfx_sr_neg((s))
+#define abs(s) nvfx_sr_abs((s))
+#define scale(s,v) nvfx_sr_scale((s), NVFX_FP_OP_DST_SCALE_##v)
+
+/*
+ * Description of a shader register operand, passed around by value.
+ * Instances are created with nvfx_sr() and refined with the nvfx_sr_*
+ * helpers (or the swz/neg/abs/scale shorthands).
+ */
+struct nvfx_sreg {
+	int type;	/* presumably one of NVFXSR_* -- TODO confirm */
+	int index;
+
+	int dst_scale;	/* set by nvfx_sr_scale(); 0 by default */
+
+	int negate;	/* toggled by nvfx_sr_neg() */
+	int abs;	/* set to 1 by nvfx_sr_abs() */
+	int swz[4];	/* per-component swizzle, indexed by NVFX_SWZ_* */
+
+	int cc_update;
+	int cc_update_reg;
+	int cc_test;	/* nvfx_sr() initialises this to NVFX_COND_TR */
+	int cc_test_reg;
+	int cc_swz[4];	/* identity { 0, 1, 2, 3 } by default */
+};
+
+/*
+ * Build a register operand of the given type and index with default
+ * modifiers: no scale, no negate/abs, identity swizzles, no condition
+ * code update and a condition test of NVFX_COND_TR.
+ */
+static INLINE struct nvfx_sreg
+nvfx_sr(int type, int index)
+{
+	struct nvfx_sreg reg;
+	int c;
+
+	reg.type = type;
+	reg.index = index;
+
+	reg.dst_scale = 0;
+	reg.negate = 0;
+	reg.abs = 0;
+
+	reg.cc_update = 0;
+	reg.cc_update_reg = 0;
+	reg.cc_test = NVFX_COND_TR;
+	reg.cc_test_reg = 0;
+
+	/* both swizzles start out as the identity mapping */
+	for (c = 0; c < 4; c++) {
+		reg.swz[c] = c;
+		reg.cc_swz[c] = c;
+	}
+
+	return reg;
+}
+
+/*
+ * Return a copy of src with a further swizzle applied on top of the one it
+ * already carries: output component i takes src.swz[] of the i-th selector.
+ */
+static INLINE struct nvfx_sreg
+nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w)
+{
+	const int sel[4] = { x, y, z, w };
+	struct nvfx_sreg out = src;
+	int c;
+
+	/* always read from src so earlier writes cannot feed later ones */
+	for (c = 0; c < 4; c++)
+		out.swz[c] = src.swz[sel[c]];
+
+	return out;
+}
+
+/* Return a copy of src with its negate flag flipped. */
+static INLINE struct nvfx_sreg
+nvfx_sr_neg(struct nvfx_sreg src)
+{
+	struct nvfx_sreg out = src;
+
+	out.negate = src.negate ? 0 : 1;
+	return out;
+}
+
+/* Return a copy of src with the absolute-value flag set. */
+static INLINE struct nvfx_sreg
+nvfx_sr_abs(struct nvfx_sreg src)
+{
+	struct nvfx_sreg out = src;
+
+	out.abs = 1;
+	return out;
+}
+
+/* Return a copy of src with its destination scale replaced by `scale`. */
+static INLINE struct nvfx_sreg
+nvfx_sr_scale(struct nvfx_sreg src, int scale)
+{
+	struct nvfx_sreg out = src;
+
+	out.dst_scale = scale;
+	return out;
+}
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c
new file mode 100644
index 00000000000..32a81997528
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state.c
@@ -0,0 +1,652 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+
+#include "draw/draw_context.h"
+
+#include "tgsi/tgsi_parse.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "nvfx_tex.h"
+
+/*
+ * Create a blend CSO: translate the gallium blend state into a hardware
+ * state object and keep a copy of the gallium description next to it.
+ *
+ * Only render target 0 of `cso` is consulted.  Returns the new
+ * nvfx_blend_state as an opaque pointer.
+ *
+ * NOTE(review): neither the CALLOC() nor the so_new() result is checked
+ * before use.
+ */
+static void *
+nvfx_blend_state_create(struct pipe_context *pipe,
+			const struct pipe_blend_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	struct nvfx_blend_state *bso = CALLOC(1, sizeof(*bso));
+	/* worst case below is 5 methods carrying 8 data words; presumably
+	 * that is what the first two arguments size -- TODO confirm */
+	struct nouveau_stateobj *so = so_new(5, 8, 0);
+
+	if (cso->rt[0].blend_enable) {
+		/* enable + source factors + destination factors, alpha in
+		 * the upper 16 bits and rgb in the lower 16 bits */
+		so_method(so, eng3d, NV34TCL_BLEND_FUNC_ENABLE, 3);
+		so_data  (so, 1);
+		so_data  (so, (nvgl_blend_func(cso->rt[0].alpha_src_factor) << 16) |
+			       nvgl_blend_func(cso->rt[0].rgb_src_factor));
+		so_data  (so, nvgl_blend_func(cso->rt[0].alpha_dst_factor) << 16 |
+			      nvgl_blend_func(cso->rt[0].rgb_dst_factor));
+		if(nvfx->screen->base.device->chipset < 0x40) {
+			/* pre-0x40 chipsets: only the rgb equation is sent */
+			so_method(so, eng3d, NV34TCL_BLEND_EQUATION, 1);
+			so_data  (so, nvgl_blend_eqn(cso->rt[0].rgb_func));
+		} else {
+			/* separate alpha (high half) and rgb (low half) equations */
+			so_method(so, eng3d, NV40TCL_BLEND_EQUATION, 1);
+			so_data  (so, nvgl_blend_eqn(cso->rt[0].alpha_func) << 16 |
+			      nvgl_blend_eqn(cso->rt[0].rgb_func));
+		}
+	} else {
+		so_method(so, eng3d, NV34TCL_BLEND_FUNC_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	/* one enable bit per channel: A at bit 24, R at 16, G at 8, B at 0 */
+	so_method(so, eng3d, NV34TCL_COLOR_MASK, 1);
+	so_data  (so, (((cso->rt[0].colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) |
+	       ((cso->rt[0].colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) |
+	       ((cso->rt[0].colormask & PIPE_MASK_G) ? (0x01 <<  8) : 0) |
+	       ((cso->rt[0].colormask & PIPE_MASK_B) ? (0x01 <<  0) : 0)));
+
+	/* TODO: add NV40 MRT color mask */
+
+	if (cso->logicop_enable) {
+		so_method(so, eng3d, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2);
+		so_data  (so, 1);
+		so_data  (so, nvgl_logicop_func(cso->logicop_func));
+	} else {
+		so_method(so, eng3d, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	so_method(so, eng3d, NV34TCL_DITHER_ENABLE, 1);
+	so_data  (so, cso->dither ? 1 : 0);
+
+	/* hand the state object over to the CSO and drop the local reference */
+	so_ref(so, &bso->so);
+	so_ref(NULL, &so);
+	bso->pipe = *cso;
+	return (void *)bso;
+}
+
+/* Make hwcso the current blend state and flag blend state as dirty. */
+static void
+nvfx_blend_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->blend = hwcso;
+	ctx->dirty |= NVFX_NEW_BLEND;
+}
+
+/* Release the state object held by a blend CSO, then the CSO itself. */
+static void
+nvfx_blend_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_blend_state *blend = hwcso;
+
+	so_ref(NULL, &blend->so);
+	FREE(blend);
+}
+
+/*
+ * Create the driver-side sampler CSO from a gallium sampler description.
+ *
+ * The fields common to both chip families are filled in here; the nv30 or
+ * nv40 specific init routine then completes the object.
+ *
+ * Returns the new nvfx_sampler_state as an opaque pointer, or NULL if it
+ * cannot be allocated.
+ */
+static void *
+nvfx_sampler_state_create(struct pipe_context *pipe,
+			  const struct pipe_sampler_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_sampler_state *ps;
+
+	/* zero-filled so that any field not written below starts out defined */
+	ps = CALLOC(1, sizeof(struct nvfx_sampler_state));
+	if (!ps)
+		return NULL;
+
+	/* on nv30, we use this as an internal flag */
+	ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT;
+	ps->en = 0;
+	ps->filt = nvfx_tex_filter(cso);
+	ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) |
+		    (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT) |
+		    nvfx_tex_wrap_compare_mode(cso);
+	ps->bcol = nvfx_tex_border_color(cso->border_color);
+
+	if(nvfx->is_nv4x)
+		nv40_sampler_state_init(pipe, ps, cso);
+	else
+		nv30_sampler_state_init(pipe, ps, cso);
+
+	return (void *)ps;
+}
+
+/*
+ * Bind `nr` fragment sampler CSOs.  Units that were bound before but lie
+ * beyond the new count are cleared; every touched unit is marked dirty.
+ */
+static void
+nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+	unsigned prev_nr = ctx->nr_samplers;
+	unsigned end = (nr > prev_nr) ? nr : prev_nr;
+	unsigned i;
+
+	for (i = 0; i < end; i++) {
+		if (i < nr)
+			ctx->tex_sampler[i] = sampler[i];
+		else
+			ctx->tex_sampler[i] = NULL;
+		ctx->dirty_samplers |= (1 << i);
+	}
+
+	ctx->nr_samplers = nr;
+	ctx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+/* Destroy a sampler CSO; it owns nothing besides its own memory. */
+static void
+nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_sampler_state *ps = hwcso;
+
+	FREE(ps);
+}
+
+/*
+ * Bind `nr` fragment sampler views.
+ *
+ * For each unit the context takes a reference on the view and on the
+ * texture behind it (kept in tex_miptree[]).  Units that were bound before
+ * but lie beyond the new count have both references dropped.  Every touched
+ * unit is marked dirty.
+ *
+ * The merged code referred to an `nv30` context and a `miptree` array that
+ * do not exist in this function; they are replaced by `nvfx` and by the
+ * texture of the corresponding view.
+ */
+static void
+nvfx_set_fragment_sampler_views(struct pipe_context *pipe,
+				unsigned nr,
+				struct pipe_sampler_view **views)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	unsigned unit;
+
+	for (unit = 0; unit < nr; unit++) {
+		/* an unbound slot (NULL view) clears the texture as well */
+		struct pipe_texture *texture =
+			views[unit] ? views[unit]->texture : NULL;
+
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
+					    views[unit]);
+		pipe_texture_reference((struct pipe_texture **)
+				       &nvfx->tex_miptree[unit], texture);
+		nvfx->dirty_samplers |= (1 << unit);
+	}
+
+	for (unit = nr; unit < nvfx->nr_textures; unit++) {
+		pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit],
+					    NULL);
+		pipe_texture_reference((struct pipe_texture **)
+				       &nvfx->tex_miptree[unit], NULL);
+		nvfx->dirty_samplers |= (1 << unit);
+	}
+
+	nvfx->nr_textures = nr;
+	nvfx->dirty |= NVFX_NEW_SAMPLER;
+}
+
+
+/*
+ * Create a sampler view as a copy of `templ` that holds its own reference
+ * on `texture` and belongs to `pipe`.  Returns NULL when allocation fails.
+ */
+static struct pipe_sampler_view *
+nv30_create_sampler_view(struct pipe_context *pipe,
+			 struct pipe_texture *texture,
+			 const struct pipe_sampler_view *templ)
+{
+	struct pipe_sampler_view *sv = CALLOC_STRUCT(pipe_sampler_view);
+
+	if (!sv)
+		return NULL;
+
+	*sv = *templ;
+	sv->reference.count = 1;
+	/* clear the pointer copied from the template before referencing */
+	sv->texture = NULL;
+	pipe_texture_reference(&sv->texture, texture);
+	sv->context = pipe;
+
+	return sv;
+}
+
+
+/* Drop the view's texture reference and free the view. */
+static void
+nv30_sampler_view_destroy(struct pipe_context *pipe,
+			  struct pipe_sampler_view *view)
+{
+	struct pipe_texture **tex = &view->texture;
+
+	pipe_texture_reference(tex, NULL);
+	FREE(view);
+}
+
+/*
+ * Create a rasterizer CSO: translate the gallium rasterizer state into a
+ * hardware state object and keep a copy of the gallium description.
+ *
+ * Returns the new nvfx_rasterizer_state as an opaque pointer.
+ *
+ * NOTE(review): neither the CALLOC() nor the so_new() result is checked
+ * before use.
+ */
+static void *
+nvfx_rasterizer_state_create(struct pipe_context *pipe,
+			     const struct pipe_rasterizer_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso));
+	/* worst case below is 9 methods carrying 19 data words; presumably
+	 * that is what the first two arguments size -- TODO confirm */
+	struct nouveau_stateobj *so = so_new(9, 19, 0);
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+
+	/*XXX: ignored:
+	 * 	light_twoside
+	 * 	point_smooth -nohw
+	 * 	multisample
+	 */
+
+	so_method(so, eng3d, NV34TCL_SHADE_MODEL, 1);
+	so_data  (so, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT :
+				       NV34TCL_SHADE_MODEL_SMOOTH);
+
+	/* line width is sent multiplied by 8 and truncated to one byte.
+	 * NOTE(review): the float -> unsigned char conversion is only
+	 * defined while line_width * 8 fits in a byte. */
+	so_method(so, eng3d, NV34TCL_LINE_WIDTH, 2);
+	so_data  (so, (unsigned char)(cso->line_width * 8.0) & 0xff);
+	so_data  (so, cso->line_smooth ? 1 : 0);
+	so_method(so, eng3d, NV34TCL_LINE_STIPPLE_ENABLE, 2);
+	so_data  (so, cso->line_stipple_enable ? 1 : 0);
+	so_data  (so, (cso->line_stipple_pattern << 16) |
+		       cso->line_stipple_factor);
+
+	so_method(so, eng3d, NV34TCL_POINT_SIZE, 1);
+	so_data  (so, fui(cso->point_size));
+
+	/* six consecutive words: two polygon modes, cull face, front face
+	 * winding, polygon smooth enable, cull enable.  Which fill mode and
+	 * cull face are sent depends on the front winding. */
+	so_method(so, eng3d, NV34TCL_POLYGON_MODE_FRONT, 6);
+	if (cso->front_winding == PIPE_WINDING_CCW) {
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV34TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			/* value is still required; culling is disabled by
+			 * the cull enable word written further down */
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV34TCL_FRONT_FACE_CCW);
+	} else {
+		so_data(so, nvgl_polygon_mode(cso->fill_cw));
+		so_data(so, nvgl_polygon_mode(cso->fill_ccw));
+		switch (cso->cull_mode) {
+		case PIPE_WINDING_CCW:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		case PIPE_WINDING_CW:
+			so_data(so, NV34TCL_CULL_FACE_FRONT);
+			break;
+		case PIPE_WINDING_BOTH:
+			so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK);
+			break;
+		default:
+			so_data(so, NV34TCL_CULL_FACE_BACK);
+			break;
+		}
+		so_data(so, NV34TCL_FRONT_FACE_CW);
+	}
+	so_data(so, cso->poly_smooth ? 1 : 0);
+	so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0);
+
+	so_method(so, eng3d, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+	so_data  (so, cso->poly_stipple_enable ? 1 : 0);
+
+	/* three enables in order: point, line, fill.  Each is set when an
+	 * offset-enabled winding uses the matching fill mode. */
+	so_method(so, eng3d, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) ||
+	    (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL))
+		so_data(so, 1);
+	else
+		so_data(so, 0);
+	if (cso->offset_cw || cso->offset_ccw) {
+		/* factor and units are sent as raw float bits; units doubled */
+		so_method(so, eng3d, NV34TCL_POLYGON_OFFSET_FACTOR, 2);
+		so_data  (so, fui(cso->offset_scale));
+		so_data  (so, fui(cso->offset_units * 2));
+	}
+
+	so_method(so, eng3d, NV34TCL_POINT_SPRITE, 1);
+	if (cso->point_quad_rasterization) {
+		/* bit 0 is set, and bit (8 + i) mirrors bit i of
+		 * sprite_coord_enable for i in 0..7 */
+		unsigned psctl = (1 << 0), i;
+
+		for (i = 0; i < 8; i++) {
+			if ((cso->sprite_coord_enable >> i) & 1)
+				psctl |= (1 << (8 + i));
+		}
+
+		so_data(so, psctl);
+	} else {
+		so_data(so, 0);
+	}
+
+	/* hand the state object over to the CSO and drop the local reference */
+	so_ref(so, &rsso->so);
+	so_ref(NULL, &so);
+	rsso->pipe = *cso;
+	return (void *)rsso;
+}
+
+/*
+ * Make hwcso the current rasterizer state and flag it dirty in both the
+ * regular and the draw dirty masks.
+ */
+static void
+nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->rasterizer = hwcso;
+
+	ctx->dirty |= NVFX_NEW_RAST;
+	ctx->draw_dirty |= NVFX_NEW_RAST;
+}
+
+/* Release the state object held by a rasterizer CSO, then the CSO itself. */
+static void
+nvfx_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_rasterizer_state *rast = hwcso;
+
+	so_ref(NULL, &rast->so);
+	FREE(rast);
+}
+
+/*
+ * Create a depth/stencil/alpha CSO: translate the gallium state into a
+ * hardware state object and keep a copy of the gallium description.
+ *
+ * Returns the new nvfx_zsa_state as an opaque pointer.
+ *
+ * NOTE(review): neither the CALLOC() nor the so_new() result is checked
+ * before use.
+ */
+static void *
+nvfx_depth_stencil_alpha_state_create(struct pipe_context *pipe,
+			const struct pipe_depth_stencil_alpha_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso));
+	/* worst case below is 6 methods carrying 20 data words; presumably
+	 * that is what the first two arguments size -- TODO confirm */
+	struct nouveau_stateobj *so = so_new(6, 20, 0);
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+
+	/* depth: compare function, write enable, test enable */
+	so_method(so, eng3d, NV34TCL_DEPTH_FUNC, 3);
+	so_data  (so, nvgl_comparison_op(cso->depth.func));
+	so_data  (so, cso->depth.writemask ? 1 : 0);
+	so_data  (so, cso->depth.enabled ? 1 : 0);
+
+	/* alpha test: enable, compare function, reference value as a byte */
+	so_method(so, eng3d, NV34TCL_ALPHA_FUNC_ENABLE, 3);
+	so_data  (so, cso->alpha.enabled ? 1 : 0);
+	so_data  (so, nvgl_comparison_op(cso->alpha.func));
+	so_data  (so, float_to_ubyte(cso->alpha.ref_value));
+
+	/* stencil[0]: full state when enabled, otherwise only the disable */
+	if (cso->stencil[0].enabled) {
+		so_method(so, eng3d, NV34TCL_STENCIL_FRONT_ENABLE, 3);
+		so_data  (so, cso->stencil[0].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[0].writemask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[0].func));
+		so_method(so, eng3d, NV34TCL_STENCIL_FRONT_FUNC_MASK, 4);
+		so_data  (so, cso->stencil[0].valuemask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[0].zpass_op));
+	} else {
+		so_method(so, eng3d, NV34TCL_STENCIL_FRONT_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	/* stencil[1]: same layout, sent to the BACK methods */
+	if (cso->stencil[1].enabled) {
+		so_method(so, eng3d, NV34TCL_STENCIL_BACK_ENABLE, 3);
+		so_data  (so, cso->stencil[1].enabled ? 1 : 0);
+		so_data  (so, cso->stencil[1].writemask);
+		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
+		so_method(so, eng3d, NV34TCL_STENCIL_BACK_FUNC_MASK, 4);
+		so_data  (so, cso->stencil[1].valuemask);
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
+		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
+	} else {
+		so_method(so, eng3d, NV34TCL_STENCIL_BACK_ENABLE, 1);
+		so_data  (so, 0);
+	}
+
+	/* hand the state object over to the CSO and drop the local reference */
+	so_ref(so, &zsaso->so);
+	so_ref(NULL, &so);
+	zsaso->pipe = *cso;
+	return (void *)zsaso;
+}
+
+/* Make hwcso the current depth/stencil/alpha state and flag it dirty. */
+static void
+nvfx_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->zsa = hwcso;
+	ctx->dirty |= NVFX_NEW_ZSA;
+}
+
+/* Release the state object held by a zsa CSO, then the CSO itself. */
+static void
+nvfx_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_zsa_state *zsa = hwcso;
+
+	so_ref(NULL, &zsa->so);
+	FREE(zsa);
+}
+
+/*
+ * Create a vertex program CSO.
+ *
+ * The object gets its own copy of the TGSI tokens and a matching draw
+ * module vertex shader; no translation to hardware code happens here.
+ *
+ * Returns the new nvfx_vertex_program as an opaque pointer, or NULL if it
+ * cannot be allocated.
+ */
+static void *
+nvfx_vp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_vertex_program *vp;
+
+	vp = CALLOC(1, sizeof(struct nvfx_vertex_program));
+	if (!vp)
+		return NULL;
+
+	vp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+	vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe);
+
+	return (void *)vp;
+}
+
+/*
+ * Make hwcso the current vertex program and flag it dirty in both the
+ * regular and the draw dirty masks.
+ */
+static void
+nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->vertprog = hwcso;
+
+	ctx->dirty |= NVFX_NEW_VERTPROG;
+	ctx->draw_dirty |= NVFX_NEW_VERTPROG;
+}
+
+/*
+ * Destroy a vertex program CSO: the draw module shader, the driver's own
+ * program resources, the duplicated tokens and finally the object.
+ */
+static void
+nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_vertex_program *prog = hwcso;
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	draw_delete_vertex_shader(ctx->draw, prog->draw);
+	nvfx_vertprog_destroy(ctx, prog);
+
+	FREE((void*)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/*
+ * Create a fragment program CSO.
+ *
+ * The object gets its own copy of the TGSI tokens, which are scanned into
+ * fp->info; no translation to hardware code happens here.
+ *
+ * Returns the new nvfx_fragment_program as an opaque pointer, or NULL if
+ * it cannot be allocated.
+ */
+static void *
+nvfx_fp_state_create(struct pipe_context *pipe,
+		     const struct pipe_shader_state *cso)
+{
+	struct nvfx_fragment_program *fp;
+
+	fp = CALLOC(1, sizeof(struct nvfx_fragment_program));
+	if (!fp)
+		return NULL;
+
+	fp->pipe.tokens = tgsi_dup_tokens(cso->tokens);
+
+	tgsi_scan_shader(fp->pipe.tokens, &fp->info);
+
+	return (void *)fp;
+}
+
+/* Make hwcso the current fragment program and flag it dirty. */
+static void
+nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->fragprog = hwcso;
+	ctx->dirty |= NVFX_NEW_FRAGPROG;
+}
+
+/*
+ * Destroy a fragment program CSO: the driver's own program resources, the
+ * duplicated tokens and finally the object.
+ */
+static void
+nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_fragment_program *prog = hwcso;
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	nvfx_fragprog_destroy(ctx, prog);
+
+	FREE((void*)prog->pipe.tokens);
+	FREE(prog);
+}
+
+/* Copy the blend colour into the context and flag it dirty. */
+static void
+nvfx_set_blend_color(struct pipe_context *pipe,
+		     const struct pipe_blend_color *bcol)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->blend_colour = *bcol;
+	ctx->dirty |= NVFX_NEW_BCOL;
+}
+
+/* Copy the stencil reference values into the context and flag them dirty. */
+static void
+nvfx_set_stencil_ref(struct pipe_context *pipe,
+		     const struct pipe_stencil_ref *sr)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->stencil_ref = *sr;
+	ctx->dirty |= NVFX_NEW_SR;
+}
+
+/*
+ * Copy the clip state into the context and flag it dirty in both the
+ * regular and the draw dirty masks.
+ */
+static void
+nvfx_set_clip_state(struct pipe_context *pipe,
+		    const struct pipe_clip_state *clip)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->clip = *clip;
+
+	ctx->dirty |= NVFX_NEW_UCP;
+	ctx->draw_dirty |= NVFX_NEW_UCP;
+}
+
+/*
+ * Bind the constant buffer for a shader stage.
+ *
+ * The number of constants is the buffer size divided by the size of one
+ * four-float vector.  A NULL buffer unbinds the stage and leaves it with
+ * zero constants instead of being dereferenced.  The program of the
+ * affected stage is flagged dirty; `index` is not used.
+ *
+ * NOTE(review): the buffer pointer is stored without taking a reference.
+ */
+static void
+nvfx_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
+			 struct pipe_buffer *buf )
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+
+	nvfx->constbuf[shader] = buf;
+	nvfx->constbuf_nr[shader] = buf ? buf->size / (4 * sizeof(float)) : 0;
+
+	if (shader == PIPE_SHADER_VERTEX) {
+		nvfx->dirty |= NVFX_NEW_VERTPROG;
+	} else
+	if (shader == PIPE_SHADER_FRAGMENT) {
+		nvfx->dirty |= NVFX_NEW_FRAGPROG;
+	}
+}
+
+/* Copy the framebuffer description into the context and flag it dirty. */
+static void
+nvfx_set_framebuffer_state(struct pipe_context *pipe,
+			   const struct pipe_framebuffer_state *fb)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->framebuffer = *fb;
+	ctx->dirty |= NVFX_NEW_FB;
+}
+
+/* Copy the 128-byte polygon stipple pattern into the context and flag it dirty. */
+static void
+nvfx_set_polygon_stipple(struct pipe_context *pipe,
+			 const struct pipe_poly_stipple *stipple)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	/* 32 entries of 4 bytes each */
+	memcpy(ctx->stipple, stipple->stipple, 32 * 4);
+	ctx->dirty |= NVFX_NEW_STIPPLE;
+}
+
+/* Copy the scissor rectangle into the context and flag it dirty. */
+static void
+nvfx_set_scissor_state(struct pipe_context *pipe,
+		       const struct pipe_scissor_state *s)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->scissor = *s;
+	ctx->dirty |= NVFX_NEW_SCISSOR;
+}
+
+/*
+ * Copy the viewport into the context and flag it dirty in both the regular
+ * and the draw dirty masks.
+ */
+static void
+nvfx_set_viewport_state(struct pipe_context *pipe,
+			const struct pipe_viewport_state *vpt)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->viewport = *vpt;
+
+	ctx->dirty |= NVFX_NEW_VIEWPORT;
+	ctx->draw_dirty |= NVFX_NEW_VIEWPORT;
+}
+
+/*
+ * Copy `count` vertex buffer descriptions into the context and flag the
+ * vertex arrays dirty in both the regular and the draw dirty masks.
+ */
+static void
+nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count,
+			const struct pipe_vertex_buffer *vb)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	memcpy(ctx->vtxbuf, vb, count * sizeof(vb[0]));
+	ctx->vtxbuf_nr = count;
+
+	ctx->dirty |= NVFX_NEW_ARRAYS;
+	ctx->draw_dirty |= NVFX_NEW_ARRAYS;
+}
+
+/*
+ * Create a vertex elements CSO holding a copy of the `num_elements`
+ * element descriptions.
+ *
+ * Returns the new nvfx_vtxelt_state as an opaque pointer, or NULL if it
+ * cannot be allocated.
+ */
+static void *
+nvfx_vtxelts_state_create(struct pipe_context *pipe,
+			  unsigned num_elements,
+			  const struct pipe_vertex_element *elements)
+{
+	struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state);
+
+	if (!cso)
+		return NULL;
+
+	assert(num_elements < 16); /* not doing fallbacks yet */
+	cso->num_elements = num_elements;
+	memcpy(cso->pipe, elements, num_elements * sizeof(*elements));
+
+/*	nvfx_vtxelt_construct(cso);*/
+
+	return (void *)cso;
+}
+
+/* Destroy a vertex elements CSO; it owns nothing besides its own memory. */
+static void
+nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_vtxelt_state *velems = hwcso;
+
+	FREE(velems);
+}
+
+/*
+ * Make hwcso the current vertex elements state and flag the vertex arrays
+ * dirty.  The draw dirty mask is deliberately left alone for now.
+ */
+static void
+nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+	struct nvfx_context *ctx = nvfx_context(pipe);
+
+	ctx->vtxelt = hwcso;
+	ctx->dirty |= NVFX_NEW_ARRAYS;
+	/*ctx->draw_dirty |= NVFX_NEW_ARRAYS;*/
+}
+
+/*
+ * Install this file's state callbacks into the context's pipe_context.
+ *
+ * The sampler texture hook left over from the merge pointed at a function
+ * that does not exist here; the sampler view entry points defined in this
+ * file are installed instead (set, create and destroy).
+ */
+void
+nvfx_init_state_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.create_blend_state = nvfx_blend_state_create;
+	nvfx->pipe.bind_blend_state = nvfx_blend_state_bind;
+	nvfx->pipe.delete_blend_state = nvfx_blend_state_delete;
+
+	nvfx->pipe.create_sampler_state = nvfx_sampler_state_create;
+	nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind;
+	nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete;
+	nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views;
+	nvfx->pipe.create_sampler_view = nv30_create_sampler_view;
+	nvfx->pipe.sampler_view_destroy = nv30_sampler_view_destroy;
+
+	nvfx->pipe.create_rasterizer_state = nvfx_rasterizer_state_create;
+	nvfx->pipe.bind_rasterizer_state = nvfx_rasterizer_state_bind;
+	nvfx->pipe.delete_rasterizer_state = nvfx_rasterizer_state_delete;
+
+	nvfx->pipe.create_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_create;
+	nvfx->pipe.bind_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_bind;
+	nvfx->pipe.delete_depth_stencil_alpha_state =
+		nvfx_depth_stencil_alpha_state_delete;
+
+	nvfx->pipe.create_vs_state = nvfx_vp_state_create;
+	nvfx->pipe.bind_vs_state = nvfx_vp_state_bind;
+	nvfx->pipe.delete_vs_state = nvfx_vp_state_delete;
+
+	nvfx->pipe.create_fs_state = nvfx_fp_state_create;
+	nvfx->pipe.bind_fs_state = nvfx_fp_state_bind;
+	nvfx->pipe.delete_fs_state = nvfx_fp_state_delete;
+
+	nvfx->pipe.set_blend_color = nvfx_set_blend_color;
+	nvfx->pipe.set_stencil_ref = nvfx_set_stencil_ref;
+	nvfx->pipe.set_clip_state = nvfx_set_clip_state;
+	nvfx->pipe.set_constant_buffer = nvfx_set_constant_buffer;
+	nvfx->pipe.set_framebuffer_state = nvfx_set_framebuffer_state;
+	nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple;
+	nvfx->pipe.set_scissor_state = nvfx_set_scissor_state;
+	nvfx->pipe.set_viewport_state = nvfx_set_viewport_state;
+
+	nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create;
+	nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete;
+	nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind;
+
+	nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
new file mode 100644
index 00000000000..e585246879b
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -0,0 +1,83 @@
+#ifndef __NVFX_STATE_H__
+#define __NVFX_STATE_H__
+
+#include "pipe/p_state.h"
+#include "tgsi/tgsi_scan.h"
+
+struct nvfx_vertex_program_exec {
+	uint32_t data[4]; /* four 32-bit words per entry; presumably one encoded instruction -- TODO confirm */
+	boolean has_branch_offset;
+	int const_index; /* NOTE(review): presumably an index into the owning program's consts[] -- confirm */
+};
+
+struct nvfx_vertex_program_data {
+	int index; /* immediates == -1 */
+	float value[4]; /* four float components per entry */
+};
+
+struct nvfx_vertex_program {
+	struct pipe_shader_state pipe;
+
+	struct draw_vertex_shader *draw; /* handed to draw_bind_vertex_shader() by nvfx_state_validate_swtnl() */
+
+	boolean translated;
+
+	struct pipe_clip_state ucp;
+
+	struct nvfx_vertex_program_exec *insns; /* presumably nr_insns entries -- TODO confirm */
+	unsigned nr_insns;
+	struct nvfx_vertex_program_data *consts; /* presumably nr_consts entries -- TODO confirm */
+	unsigned nr_consts;
+
+	struct nouveau_resource *exec;
+	unsigned exec_start;
+	struct nouveau_resource *data;
+	unsigned data_start;
+	unsigned data_start_min;
+
+	uint32_t ir;
+	uint32_t or;
+	uint32_t clip_ctrl;
+	struct nouveau_stateobj *so;
+};
+
+struct nvfx_fragment_program_data {
+	unsigned offset; /* NOTE(review): presumably a position within the program's insn[] -- confirm in nvfx_fragprog.c */
+	unsigned index; /* NOTE(review): presumably a constant slot number -- confirm */
+};
+
+struct nvfx_fragment_program {
+	struct pipe_shader_state pipe;
+	struct tgsi_shader_info info;
+
+	boolean translated;
+	unsigned samplers; /* NOTE(review): looks like a per-unit bitmask, cf. state.fp_samplers in nvfx_state_flush_notify() -- confirm */
+
+	uint32_t *insn;
+	int       insn_len; /* signed, unlike nr_consts below */
+
+	struct nvfx_fragment_program_data *consts; /* presumably nr_consts entries -- TODO confirm */
+	unsigned nr_consts;
+
+	struct pipe_buffer *buffer;
+
+	uint32_t fp_control;
+	struct nouveau_stateobj *so;
+};
+
+#define NVFX_MAX_TEXTURE_LEVELS  16
+
+struct nvfx_miptree {
+	struct pipe_texture base; /* first member: pipe_texture pointers are cast to nvfx_miptree */
+	struct nouveau_bo *bo;
+
+	struct pipe_buffer *buffer; /* mapped by nvfx_transfer_map() and used for surface relocations */
+	uint total_size;
+
+	struct {
+		uint pitch; /* nvfx_transfer_new() uses this as the transfer stride for the level */
+		uint *image_offset;
+	} level[NVFX_MAX_TEXTURE_LEVELS];
+};
+
+#endif
diff --git a/src/gallium/drivers/nvfx/nvfx_state_blend.c b/src/gallium/drivers/nvfx/nvfx_state_blend.c
new file mode 100644
index 00000000000..03b6ef8117d
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_blend.c
@@ -0,0 +1,41 @@
+#include "nvfx_context.h"
+
+static boolean
+nvfx_state_blend_validate(struct nvfx_context *nvfx)
+{
+	so_ref(nvfx->blend->so, &nvfx->state.hw[NVFX_STATE_BLEND]); /* stage the bound blend object's stateobj */
+	return TRUE; /* always reports the slot as changed */
+}
+
+struct nvfx_state_entry nvfx_state_blend = {
+	.validate = nvfx_state_blend_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_BLEND, /* nvfx->dirty mask that triggers .validate */
+		.hw = NVFX_STATE_BLEND /* bit index set in nvfx->state.dirty when .validate returns TRUE */
+	}
+};
+
+static boolean
+nvfx_state_blend_colour_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_stateobj *so = so_new(1, 1, 0);
+	struct pipe_blend_color *bcol = &nvfx->blend_colour;
+
+	/* Pack color[3],[0],[1],[2] from the top byte down; widen first so << 24 never overflows a signed int. */
+	so_method(so, nvfx->screen->eng3d, NV34TCL_BLEND_COLOR, 1);
+	so_data  (so, (((uint32_t)float_to_ubyte(bcol->color[3]) << 24) |
+		       ((uint32_t)float_to_ubyte(bcol->color[0]) << 16) |
+		       ((uint32_t)float_to_ubyte(bcol->color[1]) <<  8) |
+		       ((uint32_t)float_to_ubyte(bcol->color[2]) <<  0)));
+	so_ref(so, &nvfx->state.hw[NVFX_STATE_BCOL]);
+	so_ref(NULL, &so); /* drop the local reference; the hw slot holds its own */
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_blend_colour = {
+	.validate = nvfx_state_blend_colour_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_BCOL, /* revalidate when this bit is set in nvfx->dirty */
+		.hw = NVFX_STATE_BCOL /* hw[] slot / state.dirty bit owned by this entry */
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c
new file mode 100644
index 00000000000..72537388ea4
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c
@@ -0,0 +1,179 @@
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+#include "draw/draw_context.h"
+
+#define RENDER_STATES(name, vbo) /* NULL-terminated validation list; only the last entry varies */ \
+static struct nvfx_state_entry *name##render_states[] = { \
+	&nvfx_state_framebuffer, \
+	&nvfx_state_rasterizer, \
+	&nvfx_state_scissor, \
+	&nvfx_state_stipple, \
+	&nvfx_state_fragprog, \
+	&nvfx_state_fragtex, \
+	&nvfx_state_vertprog, \
+	&nvfx_state_blend, \
+	&nvfx_state_blend_colour, \
+	&nvfx_state_zsa, \
+	&nvfx_state_sr, \
+	&nvfx_state_viewport, \
+	&nvfx_state_##vbo, \
+	NULL \
+}
+
+RENDER_STATES(, vbo); /* render_states[]: walked by nvfx_state_validate() */
+RENDER_STATES(swtnl_, vtxfmt); /* swtnl_render_states[]: walked by nvfx_state_validate_swtnl() */
+
+static void
+nvfx_state_do_validate(struct nvfx_context *nvfx,
+		       struct nvfx_state_entry **states)
+{
+	struct nvfx_state_entry **cur;
+
+	/* Walk the NULL-terminated list; nvfx->dirty is cleared only after the whole walk. */
+	for (cur = states; *cur != NULL; cur++) {
+		if (!(nvfx->dirty & (*cur)->dirty.pipe))
+			continue;
+		/* A TRUE return flags the entry's hw slot for emission. */
+		if ((*cur)->validate(nvfx))
+			nvfx->state.dirty |= (1ULL << (*cur)->dirty.hw);
+	}
+	nvfx->dirty = 0;
+}
+
+void
+nvfx_state_emit(struct nvfx_context *nvfx)
+{
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nvfx_state *state = &nvfx->state;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	const uint64_t tex_cache_mask = (1ULL << NVFX_STATE_FRAGPROG) |
+					(1ULL << NVFX_STATE_FRAGTEX0);
+	uint64_t pending;
+	unsigned i;
+
+	/* XXX: race conditions */
+	if (screen->cur_ctx != nvfx) {
+		/* Context switch: re-flag slots whose screen copy differs from ours. */
+		for (i = 0; i < NVFX_STATE_MAX; i++) {
+			if (state->hw[i] && state->hw[i] != screen->state[i])
+				state->dirty |= (1ULL << i);
+		}
+		screen->cur_ctx = nvfx;
+	}
+
+	/* Emit dirty slots in ascending order, stopping once none remain. */
+	for (i = 0, pending = state->dirty; pending; i++) {
+		const uint64_t bit = 1ULL << i;
+		if (pending & bit) {
+			pending &= ~bit;
+			so_ref(state->hw[i], &screen->state[i]);
+			if (state->hw[i])
+				so_emit(chan, screen->state[i]);
+		}
+	}
+
+	/* TODO: could nv30 need this or something similar too? */
+	if (nvfx->is_nv4x && (state->dirty & tex_cache_mask)) {
+		BEGIN_RING(chan, eng3d, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (chan, 2);
+		BEGIN_RING(chan, eng3d, NV40TCL_TEX_CACHE_CTL, 1);
+		OUT_RING  (chan, 1);
+	}
+	state->dirty = 0;
+}
+
+void
+nvfx_state_flush_notify(struct nouveau_channel *chan)
+{
+	struct nvfx_context *nvfx = chan->user_private;
+	struct nvfx_state *state = &nvfx->state;
+	unsigned unit, remaining = state->fp_samplers;
+
+	/* Reloc markers for: framebuffer, each flagged sampler, fragment program, hw-mode vertex buffers. */
+	so_emit_reloc_markers(chan, state->hw[NVFX_STATE_FB]);
+	for (unit = 0; unit < 16 && remaining; unit++) {
+		if (remaining & (1 << unit)) {
+			so_emit_reloc_markers(chan, state->hw[NVFX_STATE_FRAGTEX0 + unit]);
+			remaining &= ~(1ULL << unit);
+		}
+	}
+	so_emit_reloc_markers(chan, state->hw[NVFX_STATE_FRAGPROG]);
+	if (nvfx->render_mode == HW && state->hw[NVFX_STATE_VTXBUF])
+		so_emit_reloc_markers(chan, state->hw[NVFX_STATE_VTXBUF]);
+}
+
+boolean
+nvfx_state_validate(struct nvfx_context *nvfx)
+{
+	boolean was_sw = FALSE;
+
+	if (nvfx->fallback_swtnl)
+		was_sw = TRUE;
+
+	if (nvfx->render_mode != HW) {
+		/* Stay in the fallback unless every state bit that forced
+		 * swtnl has been flagged dirty again.
+		 */
+		if ((nvfx->dirty & nvfx->fallback_swtnl) != nvfx->fallback_swtnl)
+			return FALSE;
+
+		/* Attempt to go to hwtnl again */
+		nvfx->pipe.flush(&nvfx->pipe, 0, NULL);
+		nvfx->render_mode = HW;
+		nvfx->dirty |= NVFX_NEW_VIEWPORT | NVFX_NEW_VERTPROG | NVFX_NEW_ARRAYS;
+	}
+
+	nvfx_state_do_validate(nvfx, render_states);
+
+	if (nvfx->fallback_swrast || nvfx->fallback_swtnl)
+		return FALSE;
+
+	if (was_sw)
+		NOUVEAU_ERR("swtnl->hw\n");
+
+	return TRUE;
+}
+
+boolean
+nvfx_state_validate_swtnl(struct nvfx_context *nvfx)
+{
+	struct draw_context *draw = nvfx->draw;
+
+	/* Setup for swtnl */
+	if (nvfx->render_mode == HW) {
+		NOUVEAU_ERR("hw->swtnl 0x%08x\n", nvfx->fallback_swtnl);
+		nvfx->pipe.flush(&nvfx->pipe, 0, NULL);
+		nvfx->render_mode = SWTNL;
+		/* Have viewport, vertex program and arrays revalidated after the mode switch. */
+		nvfx->dirty |= NVFX_NEW_VIEWPORT | NVFX_NEW_VERTPROG | NVFX_NEW_ARRAYS;
+	}
+
+	/* Push every state flagged in draw_dirty to the draw module. */
+	if (nvfx->draw_dirty & NVFX_NEW_VERTPROG)
+		draw_bind_vertex_shader(draw, nvfx->vertprog->draw);
+	if (nvfx->draw_dirty & NVFX_NEW_RAST)
+		draw_set_rasterizer_state(draw, &nvfx->rasterizer->pipe);
+	if (nvfx->draw_dirty & NVFX_NEW_UCP)
+		draw_set_clip_state(draw, &nvfx->clip);
+	if (nvfx->draw_dirty & NVFX_NEW_VIEWPORT)
+		draw_set_viewport_state(draw, &nvfx->viewport);
+
+	if (nvfx->draw_dirty & NVFX_NEW_ARRAYS) {
+		draw_set_vertex_buffers(draw, nvfx->vtxbuf_nr, nvfx->vtxbuf);
+		draw_set_vertex_elements(draw, nvfx->vtxelt->num_elements,
+					 nvfx->vtxelt->pipe);
+	}
+
+	nvfx_state_do_validate(nvfx, swtnl_render_states);
+
+	/* On a swrast fallback draw_dirty is left set, so the same states are pushed again next time. */
+	if (nvfx->fallback_swrast) {
+		NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nvfx->fallback_swrast);
+		return FALSE;
+	}
+
+	nvfx->draw_dirty = 0;
+
+	return TRUE;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c
new file mode 100644
index 00000000000..dd64ba4193c
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c
@@ -0,0 +1,234 @@
+#include "nvfx_context.h"
+#include "nouveau/nouveau_util.h"
+
+static struct pipe_buffer *
+nvfx_do_surface_buffer(struct pipe_surface *surface)
+{
+	/* Unchecked cast: the surface's texture is treated as an nvfx_miptree. */
+	return ((struct nvfx_miptree *)surface->texture)->buffer;
+}
+
+#define nvfx_surface_buffer(ps) nouveau_bo(nvfx_do_surface_buffer(ps))
+
+/* Build the framebuffer state object (colour/zeta targets, RT format, surface-sized
+ * viewport/clip) and stage it in state.hw[NVFX_STATE_FB]; FALSE means nothing was staged. */
+static boolean
+nvfx_state_framebuffer_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_framebuffer_state *fb = &nvfx->framebuffer;
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	struct nv04_surface *rt[4], *zeta = NULL;
+	uint32_t rt_enable = 0, rt_format = 0;
+	int i, colour_format = 0, zeta_format = 0;
+	int depth_only = 0;
+	struct nouveau_stateobj *so;
+	unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM;
+	unsigned w = fb->width;
+	unsigned h = fb->height;
+	int colour_bits = 32, zeta_bits = 32;
+
+	assert(fb->nr_cbufs <= (nvfx->is_nv4x ? 4 : 2));
+
+	/* All colour buffers must share one format; every one is enabled and recorded in rt[]. */
+	for (i = 0; i < fb->nr_cbufs; i++) {
+		if (colour_format) {
+			assert(colour_format == fb->cbufs[i]->format);
+		} else {
+			colour_format = fb->cbufs[i]->format;
+		}
+		rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i);
+		rt[i] = (struct nv04_surface *)fb->cbufs[i];
+	}
+
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 |
+			 NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3))
+		rt_enable |= NV34TCL_RT_ENABLE_MRT;
+
+	if (fb->zsbuf) {
+		zeta_format = fb->zsbuf->format;
+		zeta = (struct nv04_surface *)fb->zsbuf;
+	}
+
+	if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0 | NV34TCL_RT_ENABLE_COLOR1 |
+		NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) {
+		/* Render to at least a colour buffer */
+		if (!(rt[0]->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+			for (i = 1; i < fb->nr_cbufs; i++)
+				assert(!(rt[i]->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(rt[0]->base.width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(rt[0]->base.height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else if (fb->zsbuf) {
+		depth_only = 1;
+
+		/* Render to depth buffer only */
+		if (!(zeta->base.texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) {
+			assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1)));
+
+			rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED |
+				(log2i(zeta->base.width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) |
+				(log2i(zeta->base.height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT);
+		}
+		else
+			rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR;
+	} else {
+		return FALSE;
+	}
+
+	switch (colour_format) {
+	case PIPE_FORMAT_B8G8R8X8_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8;
+		break;
+	case PIPE_FORMAT_B8G8R8A8_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8;
+		break;
+	case PIPE_FORMAT_B5G6R5_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5;
+		colour_bits = 16;
+		break;
+	default:
+		assert(0);
+	}
+
+	switch (zeta_format) {
+	case PIPE_FORMAT_Z16_UNORM:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16;
+		zeta_bits = 16;
+		break;
+	case PIPE_FORMAT_S8Z24_UNORM:
+	case PIPE_FORMAT_X8Z24_UNORM:
+	case 0:
+		rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8;
+		break;
+	default:
+		assert(0);
+	}
+
+	if ((!nvfx->is_nv4x) && colour_bits > zeta_bits) {
+		/* TODO: does this limitation really exist?
+		   TODO: can it be worked around somehow? */
+		return FALSE;
+	}
+
+	/* Allocate only now that every early return is behind us, so nothing leaks. */
+	so = so_new(18, 24, 10);
+	if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0)
+		|| ((!nvfx->is_nv4x) && depth_only)) {
+		struct nv04_surface *rt0 = (depth_only ? zeta : rt[0]);
+		uint32_t pitch = rt0->pitch;
+
+		if (!nvfx->is_nv4x) {
+			if (zeta)
+				pitch |= (zeta->pitch << 16);
+			else
+				pitch |= (pitch << 16);
+		}
+
+		so_method(so, eng3d, NV34TCL_DMA_COLOR0, 1);
+		so_reloc (so, nvfx_surface_buffer(&rt0->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, eng3d, NV34TCL_COLOR0_PITCH, 2);
+		so_data  (so, pitch);
+		/* rt0, not rt[0]: rt[0] is never assigned when rendering depth-only. */
+		so_reloc (so, nvfx_surface_buffer(&rt0->base),
+			      rt0->base.offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+	}
+
+	if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) {
+		so_method(so, eng3d, NV34TCL_DMA_COLOR1, 1);
+		so_reloc (so, nvfx_surface_buffer(&rt[1]->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, eng3d, NV34TCL_COLOR1_OFFSET, 2);
+		so_reloc (so, nvfx_surface_buffer(&rt[1]->base),
+			      rt[1]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+			      0, 0);
+		so_data  (so, rt[1]->pitch);
+	}
+
+	if (nvfx->is_nv4x) {
+		if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) {
+			so_method(so, eng3d, NV40TCL_DMA_COLOR2, 1);
+			so_reloc (so, nvfx_surface_buffer(&rt[2]->base), 0,
+				      rt_flags | NOUVEAU_BO_OR,
+				      chan->vram->handle, chan->gart->handle);
+			so_method(so, eng3d, NV40TCL_COLOR2_OFFSET, 1);
+			so_reloc (so, nvfx_surface_buffer(&rt[2]->base),
+				      rt[2]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+				      0, 0);
+			so_method(so, eng3d, NV40TCL_COLOR2_PITCH, 1);
+			so_data  (so, rt[2]->pitch);
+		}
+
+		if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) {
+			so_method(so, eng3d, NV40TCL_DMA_COLOR3, 1);
+			so_reloc (so, nvfx_surface_buffer(&rt[3]->base), 0,
+				      rt_flags | NOUVEAU_BO_OR,
+				      chan->vram->handle, chan->gart->handle);
+			so_method(so, eng3d, NV40TCL_COLOR3_OFFSET, 1);
+			so_reloc (so, nvfx_surface_buffer(&rt[3]->base),
+				      rt[3]->base.offset, rt_flags | NOUVEAU_BO_LOW,
+				      0, 0);
+			so_method(so, eng3d, NV40TCL_COLOR3_PITCH, 1);
+			so_data  (so, rt[3]->pitch);
+		}
+	}
+
+	if (zeta_format) {
+		so_method(so, eng3d, NV34TCL_DMA_ZETA, 1);
+		so_reloc (so, nvfx_surface_buffer(&zeta->base), 0,
+			      rt_flags | NOUVEAU_BO_OR,
+			      chan->vram->handle, chan->gart->handle);
+		so_method(so, eng3d, NV34TCL_ZETA_OFFSET, 1);
+		/* TODO: reverse engineer LMA */
+		so_reloc (so, nvfx_surface_buffer(&zeta->base),
+			      zeta->base.offset, rt_flags | NOUVEAU_BO_LOW, 0, 0);
+		if (nvfx->is_nv4x) {
+			so_method(so, eng3d, NV40TCL_ZETA_PITCH, 1);
+			so_data  (so, zeta->pitch);
+		}
+	}
+
+	so_method(so, eng3d, NV34TCL_RT_ENABLE, 1);
+	so_data  (so, rt_enable);
+	so_method(so, eng3d, NV34TCL_RT_HORIZ, 3);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_data  (so, rt_format);
+	so_method(so, eng3d, NV34TCL_VIEWPORT_HORIZ, 2);
+	so_data  (so, (w << 16) | 0);
+	so_data  (so, (h << 16) | 0);
+	so_method(so, eng3d, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2);
+	so_data  (so, ((w - 1) << 16) | 0);
+	so_data  (so, ((h - 1) << 16) | 0);
+	so_method(so, eng3d, 0x1d88, 1);
+	so_data  (so, (1 << 12) | h);
+
+	if(!nvfx->is_nv4x) {
+		/* Wonder why this is needed, context should all be set to zero on init */
+		/* TODO: we can most likely remove this, after putting it in context init */
+		so_method(so, eng3d, NV34TCL_VIEWPORT_TX_ORIGIN, 1);
+		so_data  (so, 0);
+	}
+
+	so_ref(so, &nvfx->state.hw[NVFX_STATE_FB]);
+	so_ref(NULL, &so);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_framebuffer = {
+	.validate = nvfx_state_framebuffer_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_FB, /* revalidate when this bit is set in nvfx->dirty */
+		.hw = NVFX_STATE_FB /* hw[] slot filled by the validator above */
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c b/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c
new file mode 100644
index 00000000000..0d35ecbf209
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_rasterizer.c
@@ -0,0 +1,17 @@
+#include "nvfx_context.h"
+
+static boolean
+nvfx_state_rasterizer_validate(struct nvfx_context *nvfx)
+{
+	/* Stage the bound rasterizer object's stateobj in the RAST hw slot. */
+	so_ref(nvfx->rasterizer->so, &nvfx->state.hw[NVFX_STATE_RAST]);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_rasterizer = {
+	.validate = nvfx_state_rasterizer_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_RAST, /* revalidate when this bit is set in nvfx->dirty */
+		.hw = NVFX_STATE_RAST /* hw[] slot / state.dirty bit owned by this entry */
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_state_scissor.c b/src/gallium/drivers/nvfx/nvfx_state_scissor.c
new file mode 100644
index 00000000000..940d8cb5c0c
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_scissor.c
@@ -0,0 +1,36 @@
+#include "nvfx_context.h"
+
+static boolean
+nvfx_state_scissor_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_scissor_state *s = &nvfx->scissor;
+	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
+	struct nouveau_stateobj *so;
+
+	/* Scissor was off, stays off, and a stateobj is already staged: nothing to rebuild. */
+	if (!rast->scissor && !nvfx->state.scissor_enabled &&
+	    nvfx->state.hw[NVFX_STATE_SCISSOR])
+		return FALSE;
+	nvfx->state.scissor_enabled = rast->scissor;
+
+	so = so_new(1, 2, 0);
+	so_method(so, nvfx->screen->eng3d, NV34TCL_SCISSOR_HORIZ, 2);
+	if (!nvfx->state.scissor_enabled) { /* disabled: size 4096 at origin 0 on both axes */
+		so_data  (so, 4096 << 16);
+		so_data  (so, 4096 << 16);
+	} else {
+		so_data  (so, ((s->maxx - s->minx) << 16) | s->minx);
+		so_data  (so, ((s->maxy - s->miny) << 16) | s->miny);
+	}
+	so_ref(so, &nvfx->state.hw[NVFX_STATE_SCISSOR]);
+	so_ref(NULL, &so);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_scissor = {
+	.validate = nvfx_state_scissor_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_SCISSOR | NVFX_NEW_RAST, /* RAST too: the enable flag is read from the rasterizer state */
+		.hw = NVFX_STATE_SCISSOR
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_state_stipple.c b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
new file mode 100644
index 00000000000..57cd3c936a7
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_stipple.c
@@ -0,0 +1,40 @@
+#include "nvfx_context.h"
+
+static boolean
+nvfx_state_stipple_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	struct nouveau_stateobj *so;
+
+	if (nvfx->state.hw[NVFX_STATE_STIPPLE] &&
+	   (rast->poly_stipple_enable == 0 && nvfx->state.stipple_enabled == 0))
+		return FALSE;
+	/* Record what is being programmed; otherwise an enable -> disable change is skipped by the test above. */
+	nvfx->state.stipple_enabled = rast->poly_stipple_enable;
+
+	if (rast->poly_stipple_enable) {
+		unsigned i;
+		so = so_new(2, 33, 0);
+		so_method(so, eng3d, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 1);
+		so_method(so, eng3d, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32);
+		for (i = 0; i < 32; i++)
+			so_data(so, nvfx->stipple[i]);
+	} else {
+		so = so_new(1, 1, 0);
+		so_method(so, eng3d, NV34TCL_POLYGON_STIPPLE_ENABLE, 1);
+		so_data  (so, 0);
+	}
+	so_ref(so, &nvfx->state.hw[NVFX_STATE_STIPPLE]);
+	so_ref(NULL, &so);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_stipple = {
+	.validate = nvfx_state_stipple_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_STIPPLE | NVFX_NEW_RAST, /* RAST too: poly_stipple_enable is read from the rasterizer state */
+		.hw = NVFX_STATE_STIPPLE,
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_state_viewport.c b/src/gallium/drivers/nvfx/nvfx_state_viewport.c
new file mode 100644
index 00000000000..82e0e9220b0
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_viewport.c
@@ -0,0 +1,51 @@
+#include "nvfx_context.h"
+
+static boolean
+nvfx_state_viewport_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_viewport_state *vpt = &nvfx->viewport;
+	struct nouveau_stateobj *so;
+	uint32_t mode_1d78;
+	unsigned i;
+
+	/* Keep the existing stateobj unless the viewport is flagged dirty. */
+	if (nvfx->state.hw[NVFX_STATE_VIEWPORT] &&
+	    !(nvfx->dirty & NVFX_NEW_VIEWPORT))
+		return FALSE;
+
+	so = so_new(2, 9, 0);
+	so_method(so, nvfx->screen->eng3d,
+		  NV34TCL_VIEWPORT_TRANSLATE_X, 8);
+	if (nvfx->render_mode != HW) {
+		/* Not in HW mode: program an identity transform
+		 * (translate 0, scale 1) instead of the pipe viewport.
+		 */
+		for (i = 0; i < 4; i++)
+			so_data  (so, fui(0.0f));
+		for (i = 0; i < 4; i++)
+			so_data  (so, fui(1.0f));
+		mode_1d78 = nvfx->is_nv4x ? 0x110 : 1;
+	} else {
+		/* HW mode: upload the pipe viewport's translate vector, then its scale vector. */
+		for (i = 0; i < 4; i++)
+			so_data  (so, fui(vpt->translate[i]));
+		for (i = 0; i < 4; i++)
+			so_data  (so, fui(vpt->scale[i]));
+		mode_1d78 = 1;
+	}
+	/* Method 0x1d78 has no symbolic name here; its value was chosen per mode above. */
+	so_method(so, nvfx->screen->eng3d, 0x1d78, 1);
+	so_data  (so, mode_1d78);
+
+	so_ref(so, &nvfx->state.hw[NVFX_STATE_VIEWPORT]);
+	so_ref(NULL, &so);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_viewport = {
+	.validate = nvfx_state_viewport_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_VIEWPORT, /* also forced dirty on every HW <-> SWTNL switch by the validate entry points */
+		.hw = NVFX_STATE_VIEWPORT
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_state_zsa.c b/src/gallium/drivers/nvfx/nvfx_state_zsa.c
new file mode 100644
index 00000000000..c84fd041c1e
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_state_zsa.c
@@ -0,0 +1,41 @@
+#include "nvfx_context.h"
+
+static boolean
+nvfx_state_zsa_validate(struct nvfx_context *nvfx)
+{
+	/* Stage the bound depth/stencil/alpha object's stateobj in the ZSA hw slot. */
+	so_ref(nvfx->zsa->so, &nvfx->state.hw[NVFX_STATE_ZSA]);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_zsa = {
+	.validate = nvfx_state_zsa_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_ZSA, /* revalidate when this bit is set in nvfx->dirty */
+		.hw = NVFX_STATE_ZSA /* hw[] slot / state.dirty bit owned by this entry */
+	}
+};
+
+static boolean
+nvfx_state_sr_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	struct pipe_stencil_ref *sr = &nvfx->stencil_ref;
+	struct nouveau_stateobj *so = so_new(2, 2, 0); /* two one-word methods: front and back reference */
+
+	so_method(so, eng3d, NV34TCL_STENCIL_FRONT_FUNC_REF, 1);
+	so_data  (so, sr->ref_value[0]);
+	so_method(so, eng3d, NV34TCL_STENCIL_BACK_FUNC_REF, 1);
+	so_data  (so, sr->ref_value[1]);
+	so_ref(so, &nvfx->state.hw[NVFX_STATE_SR]);
+	so_ref(NULL, &so);
+	return TRUE;
+}
+
+struct nvfx_state_entry nvfx_state_sr = {
+	.validate = nvfx_state_sr_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_SR, /* revalidate when this bit is set in nvfx->dirty */
+		.hw = NVFX_STATE_SR /* hw[] slot / state.dirty bit owned by this entry */
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c
new file mode 100644
index 00000000000..8a05ad0a571
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_surface.c
@@ -0,0 +1,62 @@
+
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "nvfx_context.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_tile.h"
+
+static void
+nvfx_surface_copy(struct pipe_context *pipe,
+		  struct pipe_surface *dest, unsigned destx, unsigned desty,
+		  struct pipe_surface *src, unsigned srcx, unsigned srcy,
+		  unsigned width, unsigned height)
+{
+	/* Forward the copy unchanged to the screen's 2D engine object. */
+	struct nv04_surface_2d *eng2d = nvfx_context(pipe)->screen->eng2d;
+
+	eng2d->copy(eng2d, dest, destx, desty, src, srcx, srcy, width, height);
+}
+
+static void
+nvfx_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
+		  unsigned destx, unsigned desty, unsigned width,
+		  unsigned height, unsigned value)
+{
+	/* Forward the fill unchanged to the screen's 2D engine object. */
+	struct nv04_surface_2d *eng2d = nvfx_context(pipe)->screen->eng2d;
+
+	eng2d->fill(eng2d, dest, destx, desty, width, height, value);
+}
+
+void
+nvfx_init_surface_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.surface_copy = nvfx_surface_copy; /* both hooks delegate to screen->eng2d */
+	nvfx->pipe.surface_fill = nvfx_surface_fill;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_tex.h b/src/gallium/drivers/nvfx/nvfx_tex.h
new file mode 100644
index 00000000000..69187a79e79
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_tex.h
@@ -0,0 +1,133 @@
+#ifndef NVFX_TEX_H_
+#define NVFX_TEX_H_
+
+/* Map PIPE_TEX_WRAP_* to the hardware wrap value with the S-field shift removed. */
+static inline unsigned
+nvfx_tex_wrap_mode(unsigned wrap) {
+	unsigned hw = NV34TCL_TX_WRAP_S_REPEAT; /* also the fallback for unknown modes */
+
+	switch (wrap) {
+	case PIPE_TEX_WRAP_REPEAT:
+		break;
+	case PIPE_TEX_WRAP_MIRROR_REPEAT:
+		hw = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+		hw = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+		hw = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_CLAMP:
+		hw = NV34TCL_TX_WRAP_S_CLAMP;
+		break;
+	/* The mirror-clamp modes are the only ones using NV40TCL_* encodings. */
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER;
+		break;
+	case PIPE_TEX_WRAP_MIRROR_CLAMP:
+		hw = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP;
+		break;
+	default:
+		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
+		break;
+	}
+
+	return hw >> NV34TCL_TX_WRAP_S_SHIFT;
+}
+
+static inline unsigned
+nvfx_tex_wrap_compare_mode(const struct pipe_sampler_state* cso)
+{
+	/* Only R_TO_TEXTURE compare mode yields RCOMP bits; everything else gives 0. */
+	if (cso->compare_mode != PIPE_TEX_COMPARE_R_TO_TEXTURE)
+		return 0;
+	switch (cso->compare_func) {
+	case PIPE_FUNC_NEVER:
+		return NV34TCL_TX_WRAP_RCOMP_NEVER;
+	case PIPE_FUNC_LESS:
+		return NV34TCL_TX_WRAP_RCOMP_LESS;
+	case PIPE_FUNC_EQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_EQUAL;
+	case PIPE_FUNC_LEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_LEQUAL;
+	case PIPE_FUNC_GREATER:
+		return NV34TCL_TX_WRAP_RCOMP_GREATER;
+	case PIPE_FUNC_NOTEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL;
+	case PIPE_FUNC_GEQUAL:
+		return NV34TCL_TX_WRAP_RCOMP_GEQUAL;
+	case PIPE_FUNC_ALWAYS:
+		return NV34TCL_TX_WRAP_RCOMP_ALWAYS;
+	default:
+		return 0;
+	}
+}
+
+/* Build the magnify and minify fields of the filter word from a sampler CSO.
+ *
+ * Unrecognised image filters are treated as NEAREST and unrecognised mip
+ * filters as NONE, matching the explicit cases they share a label with.
+ */
+static inline unsigned nvfx_tex_filter(const struct pipe_sampler_state* cso)
+{
+	/* Any minify image filter other than LINEAR selects the NEAREST
+	 * variants below. */
+	const int min_linear = (cso->min_img_filter == PIPE_TEX_FILTER_LINEAR);
+	unsigned filter;
+
+	/* Magnification has no mip component. */
+	if (cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
+		filter = NV34TCL_TX_FILTER_MAGNIFY_LINEAR;
+	else
+		filter = NV34TCL_TX_FILTER_MAGNIFY_NEAREST;
+
+	/* The mip filter picks the case, the image filter the variant within it. */
+	switch (cso->min_mip_filter) {
+	case PIPE_TEX_MIPFILTER_NEAREST:
+		if (min_linear)
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+		else
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+		break;
+
+	case PIPE_TEX_MIPFILTER_LINEAR:
+		if (min_linear)
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+		else
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+		break;
+
+	case PIPE_TEX_MIPFILTER_NONE:
+	default:
+		if (min_linear)
+			filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR;
+		else
+			filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST;
+		break;
+	}
+
+	return filter;
+}
+
+static inline unsigned nvfx_tex_border_color(const float* border_color)
+{
+	/* Packs elements [3],[0],[1],[2] from the top byte down; widened first so << 24 never overflows a signed int. */
+	return (((uint32_t)float_to_ubyte(border_color[3]) << 24) |
+		    ((uint32_t)float_to_ubyte(border_color[0]) << 16) |
+		    ((uint32_t)float_to_ubyte(border_color[1]) <<  8) | (uint32_t)float_to_ubyte(border_color[2]));
+}
+
+struct nvfx_sampler_state {
+	uint32_t fmt;
+	uint32_t wrap; /* NOTE(review): presumably built from nvfx_tex_wrap_mode()/nvfx_tex_wrap_compare_mode() -- confirm */
+	uint32_t en;
+	uint32_t filt; /* NOTE(review): presumably the nvfx_tex_filter() result, possibly with more bits -- confirm */
+	uint32_t bcol; /* NOTE(review): presumably the nvfx_tex_border_color() result -- confirm */
+};
+
+#endif /* NVFX_TEX_H_ */
diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c
new file mode 100644
index 00000000000..409b354d582
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_transfer.c
@@ -0,0 +1,182 @@
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+#include "nouveau/nouveau_winsys.h"
+#include "nvfx_context.h"
+#include "nvfx_screen.h"
+#include "nvfx_state.h"
+
+struct nvfx_transfer {
+	struct pipe_transfer base; /* first member: pipe_transfer pointers are cast back to nvfx_transfer */
+	struct pipe_surface *surface; /* what gets mapped: the texture's own surface if direct, else a temporary linear one */
+	boolean direct; /* set by nvfx_transfer_new(); when false, nvfx_transfer_del() copies writes back */
+};
+
+static void
+nvfx_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
+                             struct pipe_texture *template)
+{
+	/* Start from all-zero, then describe a single-level, depth-1, linear dynamic texture. */
+	memset(template, 0, sizeof(*template));
+	template->tex_usage = PIPE_TEXTURE_USAGE_DYNAMIC | NOUVEAU_TEXTURE_USAGE_LINEAR;
+	template->width0 = width;
+	template->height0 = height;
+	template->depth0 = 1;
+	template->last_level = 0;
+	/* Inherited from the source texture. */
+	template->target = pt->target;
+	template->format = pt->format;
+	template->nr_samples = pt->nr_samples;
+}
+
+/* Create a transfer for rectangle (x, y, w, h) of one face/level/zslice of pt: either directly
+ * on the texture's own surface or through a temporary linear texture. Returns NULL on failure. */
+static struct pipe_transfer *
+nvfx_transfer_new(struct pipe_context *pcontext, struct pipe_texture *pt,
+		  unsigned face, unsigned level, unsigned zslice,
+		  enum pipe_transfer_usage usage,
+		  unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	struct pipe_screen *pscreen = pcontext->screen;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)pt;
+	struct nvfx_transfer *tx;
+	struct pipe_texture tx_tex_template, *tx_tex;
+
+	tx = CALLOC_STRUCT(nvfx_transfer);
+	if (!tx)
+		return NULL;
+
+	pipe_texture_reference(&tx->base.texture, pt);
+	tx->base.x = x;
+	tx->base.y = y;
+	tx->base.width = w;
+	tx->base.height = h;
+	tx->base.stride = mt->level[level].pitch;
+	tx->base.usage = usage;
+	tx->base.face = face;
+	tx->base.level = level;
+	tx->base.zslice = zslice;
+
+	/* Direct access to texture */
+	if ((pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC ||
+	     debug_get_bool_option("NOUVEAU_NO_TRANSFER", TRUE/*XXX:FALSE*/)) &&
+	    pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)
+	{
+		tx->direct = TRUE;
+		tx->surface = pscreen->get_tex_surface(pscreen, pt,
+	                                               face, level, zslice,
+	                                               pipe_transfer_buffer_flags(&tx->base));
+		if (!tx->surface)
+			goto fail;
+		return &tx->base;
+	}
+
+	tx->direct = FALSE;
+	nvfx_compatible_transfer_tex(pt, w, h, &tx_tex_template);
+
+	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
+	if (!tx_tex)
+		goto fail;
+
+	tx->base.stride = ((struct nvfx_miptree*)tx_tex)->level[0].pitch;
+	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
+	                                       0, 0, 0,
+	                                       pipe_transfer_buffer_flags(&tx->base));
+	pipe_texture_reference(&tx_tex, NULL);
+
+	if (!tx->surface)
+		goto fail;
+
+	if (usage & PIPE_TRANSFER_READ) {
+		struct nvfx_screen *nvscreen = nvfx_screen(pscreen);
+		struct pipe_surface *src;
+
+		src = pscreen->get_tex_surface(pscreen, pt,
+	                                       face, level, zslice,
+	                                       PIPE_BUFFER_USAGE_GPU_READ);
+
+		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
+		/* TODO: Check if SIFM can un-swizzle */
+		nvscreen->eng2d->copy(nvscreen->eng2d,
+		                      tx->surface, 0, 0,
+		                      src, x, y,
+		                      w, h);
+
+		pipe_surface_reference(&src, NULL);
+	}
+
+	return &tx->base;
+
+fail:
+	/* Drop the reference taken on pt above before discarding the transfer. */
+	pipe_texture_reference(&tx->base.texture, NULL);
+	FREE(tx);
+	return NULL;
+}
+
+static void
+nvfx_transfer_del(struct pipe_context *pcontext,
+                  struct pipe_transfer *ptx)
+{
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	const boolean write_back = !tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE);
+
+	/* Staged writes are copied back into the transfer's rectangle of the real texture. */
+	if (write_back) {
+		struct pipe_screen *pscreen = pcontext->screen;
+		struct nv04_surface_2d *eng2d = nvfx_screen(pscreen)->eng2d;
+		struct pipe_surface *dst;
+
+		dst = pscreen->get_tex_surface(pscreen, ptx->texture,
+	                                       ptx->face, ptx->level, ptx->zslice,
+	                                       PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
+
+		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
+		eng2d->copy(eng2d, dst, ptx->x, ptx->y, tx->surface, 0, 0,
+		            ptx->width, ptx->height);
+
+		pipe_surface_reference(&dst, NULL);
+	}
+
+	pipe_surface_reference(&tx->surface, NULL);
+	pipe_texture_reference(&ptx->texture, NULL);
+	FREE(ptx);
+}
+
+static void *
+nvfx_transfer_map(struct pipe_context *pcontext, struct pipe_transfer *ptx)
+{
+	struct pipe_screen *pscreen = pcontext->screen;
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
+	uint8_t *map = pipe_buffer_map(pscreen, mt->buffer, pipe_transfer_buffer_flags(ptx));
+
+	if (!map) /* assumes pipe_buffer_map() reports failure as NULL; do not turn that into NULL + offset */
+		return NULL;
+	if (!tx->direct) /* the temporary surface already starts at the requested rectangle */
+		return map + ns->base.offset;
+	return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
+}
+
+static void
+nvfx_transfer_unmap(struct pipe_context *pcontext, struct pipe_transfer *ptx)
+{
+	struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx;
+	struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture;
+
+	/* Unmaps the same buffer that nvfx_transfer_map() maps. */
+	pipe_buffer_unmap(pcontext->screen, mt->buffer);
+}
+
+void
+nvfx_init_transfer_functions(struct nvfx_context *nvfx)
+{
+	nvfx->pipe.get_tex_transfer = nvfx_transfer_new; /* may stage through a temporary linear texture */
+	nvfx->pipe.tex_transfer_destroy = nvfx_transfer_del; /* copies staged writes back, then frees */
+	nvfx->pipe.transfer_map = nvfx_transfer_map;
+	nvfx->pipe.transfer_unmap = nvfx_transfer_unmap;
+}
diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c
new file mode 100644
index 00000000000..257087f8f63
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_vbo.c
@@ -0,0 +1,570 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+
+#include "nouveau/nouveau_channel.h"
+#include "nouveau/nouveau_pushbuf.h"
+#include "nouveau/nouveau_util.h"
+
+static boolean
+nvfx_force_swtnl(struct nvfx_context *nvfx)
+{
+	static int swtnl_option = -1; /* negative until the NOUVEAU_SWTNL option has been read */
+	if (swtnl_option < 0) /* first call only: query the debug option and cache it */
+		swtnl_option = debug_get_bool_option("NOUVEAU_SWTNL", 0);
+	return swtnl_option;
+}
+
+static INLINE int
+nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp)
+{
+	switch (pipe) { /* number of components; returns 1 without touching the outputs if unsupported */
+	case PIPE_FORMAT_R32_FLOAT:
+	case PIPE_FORMAT_R16_SSCALED:
+	case PIPE_FORMAT_R8_UNORM:
+		*ncomp = 1;
+		break;
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R8G8_UNORM:
+		*ncomp = 2;
+		break;
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+		*ncomp = 3;
+		break;
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+		*ncomp = 4;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
+		return 1;
+	}
+
+	switch (pipe) { /* hw component type; same set of formats as above */
+	case PIPE_FORMAT_R32G32B32A32_FLOAT:
+	case PIPE_FORMAT_R32G32B32_FLOAT:
+	case PIPE_FORMAT_R32G32_FLOAT:
+	case PIPE_FORMAT_R32_FLOAT:
+		*fmt = NV34TCL_VTXFMT_TYPE_FLOAT;
+		break;
+	case PIPE_FORMAT_R8G8B8A8_UNORM:
+	case PIPE_FORMAT_R8G8B8_UNORM:
+	case PIPE_FORMAT_R8G8_UNORM:
+	case PIPE_FORMAT_R8_UNORM:
+		*fmt = NV34TCL_VTXFMT_TYPE_UBYTE;
+		break;
+	case PIPE_FORMAT_R16G16B16A16_SSCALED:
+	case PIPE_FORMAT_R16G16B16_SSCALED:
+	case PIPE_FORMAT_R16G16_SSCALED:
+	case PIPE_FORMAT_R16_SSCALED:
+		*fmt = NV34TCL_VTXFMT_TYPE_USHORT;
+		break;
+	default:
+		NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe));
+		return 1;
+	}
+
+	return 0;
+}
+
+static boolean
+nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_buffer *ib,
+		    unsigned ib_size)
+{
+	struct pipe_screen *pscreen = &nvfx->screen->base.base;
+	unsigned hw_type;
+
+	/* No index buffer: forget the previous binding and report it unusable. */
+	if (ib == NULL) {
+		nvfx->idxbuf_format = 0xdeadbeef;
+		nvfx->idxbuf = NULL;
+		return FALSE;
+	}
+
+	/* The screen must report NOUVEAU_CAP_HW_IDXBUF; 1-byte indices
+	 * are rejected here as well. */
+	if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1)
+		return FALSE;
+
+	/* Only 2- and 4-byte indices have a hw index format. */
+	if (ib_size == 2)
+		hw_type = NV34TCL_IDXBUF_FORMAT_TYPE_U16;
+	else if (ib_size == 4)
+		hw_type = NV34TCL_IDXBUF_FORMAT_TYPE_U32;
+	else
+		return FALSE;
+
+	/* Same buffer and format as before: nothing to mark dirty. */
+	if (nvfx->idxbuf == ib && nvfx->idxbuf_format == hw_type)
+		return TRUE;
+
+	nvfx->dirty |= NVFX_NEW_ARRAYS;
+	nvfx->idxbuf = ib;
+	nvfx->idxbuf_format = hw_type;
+	return TRUE;
+}
+
+static boolean
+nvfx_vbo_static_attrib(struct nvfx_context *nvfx, struct nouveau_stateobj *so,
+		       int attrib, struct pipe_vertex_element *ve,
+		       struct pipe_vertex_buffer *vb)
+{
+	/* Read one float attribute value from vb and emit it into so as immediate VTX_ATTR data; FALSE if that is not possible. */
+	struct pipe_screen *pscreen = nvfx->pipe.screen;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	unsigned type, ncomp;
+	const float *v;
+	char *map;
+
+	if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp))
+		return FALSE;
+
+	map = pipe_buffer_map(pscreen, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!map) /* mapping failed: nothing to unmap, and the pointer must not be offset or dereferenced */
+		return FALSE;
+	v = (const float *)(map + vb->buffer_offset + ve->src_offset);
+
+	switch (type) {
+	case NV34TCL_VTXFMT_TYPE_FLOAT:
+		switch (ncomp) {
+		case 4:
+			so_method(so, eng3d, NV34TCL_VTX_ATTR_4F_X(attrib), 4);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			so_data  (so, fui(v[2]));
+			so_data  (so, fui(v[3]));
+			break;
+		case 3:
+			so_method(so, eng3d, NV34TCL_VTX_ATTR_3F_X(attrib), 3);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			so_data  (so, fui(v[2]));
+			break;
+		case 2:
+			so_method(so, eng3d, NV34TCL_VTX_ATTR_2F_X(attrib), 2);
+			so_data  (so, fui(v[0]));
+			so_data  (so, fui(v[1]));
+			break;
+		case 1:
+			so_method(so, eng3d, NV34TCL_VTX_ATTR_1F(attrib), 1);
+			so_data  (so, fui(v[0]));
+			break;
+		default:
+			pipe_buffer_unmap(pscreen, vb->buffer);
+			return FALSE;
+		}
+		break;
+	default: /* only float data is emitted as an immediate attribute */
+		pipe_buffer_unmap(pscreen, vb->buffer);
+		return FALSE;
+	}
+
+	pipe_buffer_unmap(pscreen, vb->buffer);
+	return TRUE;
+}
+
+void
+nvfx_draw_arrays(struct pipe_context *pipe,
+		 unsigned mode, unsigned start, unsigned count)
+{ /* Draw count non-indexed vertices of primitive type mode, beginning at vertex start. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	unsigned restart = 0;
+
+	nvfx_vbo_set_idxbuf(nvfx, NULL, 0); /* a NULL buffer clears any index buffer binding */
+	if (nvfx_force_swtnl(nvfx) || !nvfx_state_validate(nvfx)) { /* swtnl forced, or validation returned FALSE */
+		nvfx_draw_elements_swtnl(pipe, NULL, 0,
+                                           mode, start, count);
+                return;
+	}
+
+	while (count) { /* one VERTEX_BEGIN_END pair per pass */
+		unsigned vc, nr;
+
+		nvfx_state_emit(nvfx);
+
+		vc = nouveau_vbuf_split(AVAIL_RING(chan), 6, 256,
+					mode, start, count, &restart);
+		if (!vc) { /* no vertices could be emitted this pass: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		nr = (vc & 0xff); /* remainder that does not fill a 256-vertex batch */
+		if (nr) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VB_VERTEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start); /* (count - 1) in the top byte, first vertex below */
+			start += nr;
+		}
+
+		nr = vc >> 8; /* number of full 256-vertex batches */
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr; /* at most 2047 batch words per packet */
+
+			nr -= push;
+
+			BEGIN_RING_NI(chan, eng3d, NV34TCL_VB_VERTEX_BATCH, push);
+			while (push--) {
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0); /* 0 closes the primitive opened above */
+
+		count -= vc;
+		start = restart; /* next pass resumes where nouveau_vbuf_split said to restart */
+	}
+
+	pipe->flush(pipe, 0, NULL);
+}
+
+static INLINE void
+nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{ /* Push count 8-bit indices from the mapped index data ib inline through the ring. */
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+
+	while (count) {
+		uint8_t *elts = (uint8_t *)ib + start; /* first index of this pass */
+		unsigned vc, push, restart = 0;
+
+		nvfx_state_emit(nvfx);
+
+		vc = nouveau_vbuf_split(AVAIL_RING(chan), 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) { /* no indices could be emitted this pass: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		if (vc & 1) { /* odd count: send one index alone so the rest can go out in pairs */
+			BEGIN_RING(chan, eng3d, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2); /* up to 2047 words, two indices each */
+
+			BEGIN_RING_NI(chan, eng3d, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]); /* two indices packed per 32-bit word */
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{ /* Push count 16-bit indices from the mapped index data ib inline through the ring. */
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+
+	while (count) {
+		uint16_t *elts = (uint16_t *)ib + start; /* first index of this pass */
+		unsigned vc, push, restart = 0;
+
+		nvfx_state_emit(nvfx);
+
+		vc = nouveau_vbuf_split(AVAIL_RING(chan), 6, 2,
+					mode, start, count, &restart);
+		if (vc == 0) { /* no indices could be emitted this pass: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		if (vc & 1) { /* odd count: send one index alone so the rest can go out in pairs */
+			BEGIN_RING(chan, eng3d, NV34TCL_VB_ELEMENT_U32, 1);
+			OUT_RING  (chan, elts[0]);
+			elts++; vc--;
+		}
+
+		while (vc) {
+			unsigned i;
+
+			push = MIN2(vc, 2047 * 2); /* up to 2047 words, two indices each */
+
+			BEGIN_RING_NI(chan, eng3d, NV34TCL_VB_ELEMENT_U16, push >> 1);
+			for (i = 0; i < push; i+=2)
+				OUT_RING(chan, (elts[i+1] << 16) | elts[i]); /* two indices packed per 32-bit word */
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static INLINE void
+nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib,
+		       unsigned mode, unsigned start, unsigned count)
+{ /* Push count 32-bit indices from the mapped index data ib inline through the ring. */
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+
+	while (count) {
+		uint32_t *elts = (uint32_t *)ib + start; /* first index of this pass */
+		unsigned vc, push, restart = 0;
+
+		nvfx_state_emit(nvfx);
+
+		vc = nouveau_vbuf_split(AVAIL_RING(chan), 5, 1,
+					mode, start, count, &restart);
+		if (vc == 0) { /* no indices could be emitted this pass: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+		count -= vc;
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		while (vc) {
+			push = MIN2(vc, 2047); /* at most 2047 index words per packet */
+
+			BEGIN_RING_NI(chan, eng3d, NV34TCL_VB_ELEMENT_U32, push);
+			OUT_RINGp    (chan, elts, push); /* indices are sent unmodified, one per word */
+
+			vc -= push;
+			elts += push;
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0);
+
+		start = restart;
+	}
+}
+
+static void
+nvfx_draw_elements_inline(struct pipe_context *pipe,
+			  struct pipe_buffer *ib, unsigned ib_size,
+			  unsigned mode, unsigned start, unsigned count)
+{ /* Map the index buffer for CPU reads and push its indices inline, dispatching on the index size in bytes. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct pipe_screen *pscreen = pipe->screen;
+	void *map;
+
+	map = pipe_buffer_map(pscreen, ib, PIPE_BUFFER_USAGE_CPU_READ);
+	if (!map) { /* check the mapping itself; testing ib here never caught a failed map */
+		NOUVEAU_ERR("failed mapping ib\n");
+		return;
+	}
+
+	switch (ib_size) { /* bytes per index */
+	case 1:
+		nvfx_draw_elements_u08(nvfx, map, mode, start, count);
+		break;
+	case 2:
+		nvfx_draw_elements_u16(nvfx, map, mode, start, count);
+		break;
+	case 4:
+		nvfx_draw_elements_u32(nvfx, map, mode, start, count);
+		break;
+	default: /* nothing is drawn for any other index size */
+		NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size);
+		break;
+	}
+
+	pipe_buffer_unmap(pscreen, ib);
+}
+
+static void
+nvfx_draw_elements_vbo(struct pipe_context *pipe,
+		       unsigned mode, unsigned start, unsigned count)
+{ /* Indexed draw that emits VB_INDEX_BATCH ranges instead of pushing the indices inline. */
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	unsigned restart = 0;
+
+	while (count) { /* one VERTEX_BEGIN_END pair per pass */
+		unsigned nr, vc;
+
+		nvfx_state_emit(nvfx);
+
+		vc = nouveau_vbuf_split(AVAIL_RING(chan), 6, 256,
+					mode, start, count, &restart);
+		if (!vc) { /* no indices could be emitted this pass: fire the ring and retry */
+			FIRE_RING(chan);
+			continue;
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, nvgl_primitive(mode));
+
+		nr = (vc & 0xff); /* remainder that does not fill a 256-index batch */
+		if (nr) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VB_INDEX_BATCH, 1);
+			OUT_RING  (chan, ((nr - 1) << 24) | start); /* (count - 1) in the top byte, first index below */
+			start += nr;
+		}
+
+		nr = vc >> 8; /* number of full 256-index batches */
+		while (nr) {
+			unsigned push = nr > 2047 ? 2047 : nr; /* at most 2047 batch words per packet */
+
+			nr -= push;
+
+			BEGIN_RING_NI(chan, eng3d, NV34TCL_VB_INDEX_BATCH, push);
+			while (push--) {
+				OUT_RING(chan, ((0x100 - 1) << 24) | start);
+				start += 0x100;
+			}
+		}
+
+		BEGIN_RING(chan, eng3d, NV34TCL_VERTEX_BEGIN_END, 1);
+		OUT_RING  (chan, 0); /* 0 closes the primitive opened above */
+
+		count -= vc;
+		start = restart; /* next pass resumes where nouveau_vbuf_split said to restart */
+	}
+}
+
+void
+nvfx_draw_elements(struct pipe_context *pipe,
+		   struct pipe_buffer *indexBuffer, unsigned indexSize,
+		   unsigned mode, unsigned start, unsigned count)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	/* Bind (or reject) the index buffer before state validation runs. */
+	const boolean use_hw_idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize);
+
+	/* Software TNL path: forced by NOUVEAU_SWTNL, or state validation returned FALSE. */
+	if (nvfx_force_swtnl(nvfx) || !nvfx_state_validate(nvfx)) {
+		nvfx_draw_elements_swtnl(pipe, indexBuffer, indexSize, mode, start, count);
+		return;
+	}
+
+	/* Hardware path: use the bound index buffer when it was accepted,
+	 * otherwise push the indices inline through the command stream. */
+	if (!use_hw_idxbuf)
+		nvfx_draw_elements_inline(pipe, indexBuffer, indexSize, mode, start, count);
+	else
+		nvfx_draw_elements_vbo(pipe, mode, start, count);
+
+	pipe->flush(pipe, 0, NULL);
+}
+
+static boolean
+nvfx_vbo_validate(struct nvfx_context *nvfx)
+{
+	struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL;
+	struct nouveau_grobj *eng3d = nvfx->screen->eng3d;
+	struct pipe_buffer *ib = nvfx->idxbuf;
+	unsigned ib_format = nvfx->idxbuf_format;
+	unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD;
+	int hw;
+
+	/* Build the VTXBUF, VTXFMT and (for zero-stride arrays) VTXATTR state objects from the bound vertex elements. */
+	vtxbuf = so_new(3, 17, 18);
+	so_method(vtxbuf, eng3d, NV34TCL_VTXBUF_ADDRESS(0), nvfx->vtxelt->num_elements);
+	vtxfmt = so_new(1, 16, 0);
+	so_method(vtxfmt, eng3d, NV34TCL_VTXFMT(0), nvfx->vtxelt->num_elements);
+
+	for (hw = 0; hw < nvfx->vtxelt->num_elements; hw++) {
+		struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[hw];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index];
+		unsigned type, ncomp;
+
+		if (!vb->stride) { /* zero stride: try to emit the value as an immediate attribute */
+			if (!sattr)
+				sattr = so_new(16, 16 * 4, 0);
+
+			if (nvfx_vbo_static_attrib(nvfx, sattr, hw, ve, vb)) {
+				so_data(vtxbuf, 0);
+				so_data(vtxfmt, NV34TCL_VTXFMT_TYPE_FLOAT);
+				continue;
+			}
+		}
+
+		if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) {
+			nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS;
+			/* Drop every local reference; sattr used to be leaked on this path. */
+			so_ref(NULL, &vtxbuf);
+			so_ref(NULL, &vtxfmt);
+			so_ref(NULL, &sattr);
+			return FALSE;
+		}
+
+		so_reloc(vtxbuf, nouveau_bo(vb->buffer),
+				 vb->buffer_offset + ve->src_offset,
+				 vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR,
+				 0, NV34TCL_VTXBUF_ADDRESS_DMA1);
+		so_data (vtxfmt, ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) |
+				  (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type));
+	}
+
+	if (ib) { /* an index buffer is bound: emit its address and format */
+		struct nouveau_bo *bo = nouveau_bo(ib);
+
+		so_method(vtxbuf, eng3d, NV34TCL_IDXBUF_ADDRESS, 2);
+		so_reloc (vtxbuf, bo, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0);
+		so_reloc (vtxbuf, bo, ib_format, vb_flags | NOUVEAU_BO_OR,
+				  0, NV34TCL_IDXBUF_FORMAT_DMA1);
+	}
+
+	so_method(vtxbuf, eng3d, 0x1710, 1); /* NOTE(review): unnamed method 0x1710, purpose not visible here */
+	so_data  (vtxbuf, 0);
+
+	so_ref(vtxbuf, &nvfx->state.hw[NVFX_STATE_VTXBUF]);
+	so_ref(NULL, &vtxbuf);
+	nvfx->state.dirty |= (1ULL << NVFX_STATE_VTXBUF);
+	so_ref(vtxfmt, &nvfx->state.hw[NVFX_STATE_VTXFMT]);
+	so_ref(NULL, &vtxfmt);
+	nvfx->state.dirty |= (1ULL << NVFX_STATE_VTXFMT);
+	so_ref(sattr, &nvfx->state.hw[NVFX_STATE_VTXATTR]);
+	so_ref(NULL, &sattr);
+	nvfx->state.dirty |= (1ULL << NVFX_STATE_VTXATTR);
+	return FALSE; /* NOTE(review): presumably FALSE because the hw dirty bits were already set by hand above; confirm against the validation loop */
+}
+
+struct nvfx_state_entry nvfx_state_vbo = { /* state-table entry for the vertex array state */
+	.validate = nvfx_vbo_validate,
+	.dirty = {
+		.pipe = NVFX_NEW_ARRAYS, /* NOTE(review): presumably the pipe dirty flag that triggers validate; confirm in the state validation code */
+		.hw = 0, /* nvfx_vbo_validate sets its own state.dirty bits */
+	}
+};
diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c
new file mode 100644
index 00000000000..2d243be16a3
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c
@@ -0,0 +1,1049 @@
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_util.h"
+
+#include "nvfx_context.h"
+#include "nvfx_state.h"
+
+/* TODO (at least...):
+ *  1. Indexed consts  + ARL
+ *  3. NV_vp11, NV_vp2, NV_vp3 features
+ *       - extra arith opcodes
+ *       - branching
+ *       - texture sampling
+ *       - indexed attribs
+ *       - indexed results
+ *  4. bugs
+ */
+
+#include "nv30_vertprog.h"
+#include "nv40_vertprog.h"
+
+#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n))
+
+struct nvfx_vpc { /* per-compile state used while translating a TGSI vertex program */
+	struct nvfx_vertex_program *vp; /* program being built; receives consts and insns */
+
+	struct nvfx_vertex_program_exec *vpi; /* instruction most recently appended by nvfx_vp_arith() */
+
+	unsigned r_temps; /* bitmask of temp registers currently in use */
+	unsigned r_temps_discard; /* temps handed out by temp(), freed by release_temps() */
+	struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; /* hw output register per TGSI output index */
+	struct nvfx_sreg *r_address; /* indexed by TGSI address register index */
+	struct nvfx_sreg *r_temp; /* indexed by TGSI temporary register index */
+
+	struct nvfx_sreg *imm; /* indexed by TGSI immediate index */
+	unsigned nr_imm; /* presumably the number of entries in imm; TODO confirm */
+
+	unsigned hpos_idx; /* TGSI output index declared with the POSITION semantic */
+};
+
+static struct nvfx_sreg
+temp(struct nvfx_vpc *vpc)
+{ /* Allocate the lowest-numbered free temp register and mark it for release by release_temps(). */
+	int idx = ffs(~vpc->r_temps) - 1; /* ffs() returns 0 when no bit is set, i.e. no temp is free */
+
+	if (idx < 0) {
+		NOUVEAU_ERR("out of temps!!\n");
+		assert(0);
+		return nvfx_sr(NVFXSR_TEMP, 0); /* only reached when assert() is compiled out */
+	}
+
+	vpc->r_temps |= (1u << idx); /* unsigned shift: idx can be 31, where 1 << idx is undefined for int */
+	vpc->r_temps_discard |= (1u << idx);
+	return nvfx_sr(NVFXSR_TEMP, idx);
+}
+
+static INLINE void
+release_temps(struct nvfx_vpc *vpc)
+{
+	vpc->r_temps ^= (vpc->r_temps & vpc->r_temps_discard); /* clear every in-use bit that is marked discardable */
+	vpc->r_temps_discard = 0; /* nothing pending until temp() allocates again */
+}
+
+static struct nvfx_sreg
+constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w)
+{ /* Return a CONST register for pipe constant index pipe, appending a new slot with value (x,y,z,w) if none exists. */
+	struct nvfx_vertex_program *vp = vpc->vp;
+	struct nvfx_vertex_program_data *vpd;
+	int idx;
+
+	if (pipe >= 0) { /* non-negative index: reuse an existing slot; negative indices always get a new one */
+		for (idx = 0; idx < vp->nr_consts; idx++) {
+			if (vp->consts[idx].index == pipe)
+				return nvfx_sr(NVFXSR_CONST, idx);
+		}
+	}
+
+	idx = vp->nr_consts++;
+	vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); /* NOTE(review): result is not checked for NULL */
+	vpd = &vp->consts[idx];
+
+	vpd->index = pipe;
+	vpd->value[0] = x;
+	vpd->value[1] = y;
+	vpd->value[2] = z;
+	vpd->value[3] = w;
+	return nvfx_sr(NVFXSR_CONST, idx);
+}
+
+#define arith(cc,s,o,d,m,s0,s1,s2) \
+	nvfx_vp_arith(nvfx, (cc), NVFX_VP_INST_SLOT_##s, NVFX_VP_INST_##s##_OP_##o, (d), (m), (s0), (s1), (s2)) /* needs a variable named nvfx in the caller's scope; cc is passed as the struct nvfx_vpc * argument */
+
+static void
+emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_sreg src)
+{ /* Encode source operand src into operand position pos (0..2) of the instruction words hw. */
+	struct nvfx_vertex_program *vp = vpc->vp;
+	uint32_t sr = 0; /* operand descriptor, assembled here and merged into hw[] at the end */
+
+	switch (src.type) {
+	case NVFXSR_TEMP:
+		sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT));
+		sr |= (src.index << NVFX_VP(SRC_TEMP_SRC_SHIFT));
+		break;
+	case NVFXSR_INPUT:
+		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		vp->ir |= (1 << src.index); /* record this input index in the program's input mask */
+		hw[1] |= (src.index << NVFX_VP(INST_INPUT_SRC_SHIFT)); /* the input index is stored in the instruction word, not in sr */
+		break;
+	case NVFXSR_CONST:
+		sr |= (NVFX_VP(SRC_REG_TYPE_CONST) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		assert(vpc->vpi->const_index == -1 ||
+		       vpc->vpi->const_index == src.index); /* one instruction may reference only one constant index */
+		vpc->vpi->const_index = src.index;
+		break;
+	case NVFXSR_NONE: /* unused operand: encoded with the INPUT register type */
+		sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) <<
+		       NVFX_VP(SRC_REG_TYPE_SHIFT));
+		break;
+	default:
+		assert(0);
+	}
+
+	if (src.negate)
+		sr |= NVFX_VP(SRC_NEGATE);
+
+	if (src.abs)
+		hw[0] |= (1 << (21 + pos)); /* per-operand abs flags start at bit 21 of word 0 */
+
+	sr |= ((src.swz[0] << NVFX_VP(SRC_SWZ_X_SHIFT)) |
+	       (src.swz[1] << NVFX_VP(SRC_SWZ_Y_SHIFT)) |
+	       (src.swz[2] << NVFX_VP(SRC_SWZ_Z_SHIFT)) |
+	       (src.swz[3] << NVFX_VP(SRC_SWZ_W_SHIFT)));
+
+	switch (pos) {
+	case 0: /* operand 0 is split across words 1 (high part) and 2 (low part) */
+		hw[1] |= ((sr & NVFX_VP(SRC0_HIGH_MASK)) >>
+			  NVFX_VP(SRC0_HIGH_SHIFT)) << NVFX_VP(INST_SRC0H_SHIFT);
+		hw[2] |= (sr & NVFX_VP(SRC0_LOW_MASK)) <<
+			  NVFX_VP(INST_SRC0L_SHIFT);
+		break;
+	case 1: /* operand 1 fits entirely in word 2 */
+		hw[2] |= sr << NVFX_VP(INST_SRC1_SHIFT);
+		break;
+	case 2: /* operand 2 is split across words 2 (high part) and 3 (low part) */
+		hw[2] |= ((sr & NVFX_VP(SRC2_HIGH_MASK)) >>
+			  NVFX_VP(SRC2_HIGH_SHIFT)) << NVFX_VP(INST_SRC2H_SHIFT);
+		hw[3] |= (sr & NVFX_VP(SRC2_LOW_MASK)) <<
+			  NVFX_VP(INST_SRC2L_SHIFT);
+		break;
+	default:
+		assert(0);
+	}
+}
+
+static void
+emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_sreg dst)
+{ /* Encode destination dst into the instruction words hw; on nv4x, slot 0 uses the VEC fields and any other slot the SCA fields. */
+	struct nvfx_vertex_program *vp = vpc->vp;
+
+	switch (dst.type) {
+	case NVFXSR_TEMP:
+		if(!nvfx->is_nv4x)
+			hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+		else {
+			hw[3] |= NV40_VP_INST_DEST_MASK;
+			if (slot == 0) {
+				hw[0] |= (dst.index <<
+					  NV40_VP_INST_VEC_DEST_TEMP_SHIFT);
+			} else {
+				hw[3] |= (dst.index <<
+					  NV40_VP_INST_SCA_DEST_TEMP_SHIFT);
+			}
+		}
+		break;
+	case NVFXSR_OUTPUT:
+		/* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */
+		switch (dst.index) { /* clip plane n: set bit 6+n in vp->or, enable the plane, redirect to FOGC (0-2) or PSZ (3-5) */
+		case NVFX_VP_INST_DEST_CLIP(0):
+			vp->or |= (1 << 6);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(1):
+			vp->or |= (1 << 7);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(2):
+			vp->or |= (1 << 8);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2;
+			dst.index = NVFX_VP(INST_DEST_FOGC);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(3):
+			vp->or |= (1 << 9);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(4):
+			vp->or |= (1 << 10);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		case NVFX_VP_INST_DEST_CLIP(5):
+			vp->or |= (1 << 11);
+			vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5;
+			dst.index = NVFX_VP(INST_DEST_PSZ);
+			break;
+		default: /* regular output: record it in vp->or; indices not listed below set no bit */
+			if(!nvfx->is_nv4x) {
+				switch (dst.index) {
+				case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+				case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+				case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+				case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+				case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+				case NV30_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+				case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+				case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+				case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+				case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+				case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+				case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+				case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+				case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+				}
+			} else {
+				switch (dst.index) {
+				case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break;
+				case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break;
+				case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break;
+				case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break;
+				case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break;
+				case NV40_VP_INST_DEST_PSZ  : vp->or |= (1 << 5); break;
+				case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break;
+				case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break;
+				case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break;
+				case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break;
+				case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break;
+				case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break;
+				case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break;
+				case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break;
+				}
+			}
+			break;
+		}
+
+		if(!nvfx->is_nv4x) { /* dst.index may have been redirected by the clip-plane cases above */
+			hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT);
+			hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+
+			/*XXX: no way this is entirely correct, someone needs to
+			 *     figure out what exactly it is.
+			 */
+			hw[3] |= 0x800;
+		} else {
+			hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT);
+			if (slot == 0) {
+				hw[0] |= NV40_VP_INST_VEC_RESULT;
+				hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20);
+			} else {
+				hw[3] |= NV40_VP_INST_SCA_RESULT;
+				hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+			}
+		}
+		break;
+	default: /* only TEMP and OUTPUT destinations are supported */
+		assert(0);
+	}
+}
+
+static void
+nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op,
+	      struct nvfx_sreg dst, int mask,
+	      struct nvfx_sreg s0, struct nvfx_sreg s1,
+	      struct nvfx_sreg s2)
+{ /* Append one instruction to vp->insns and encode its opcode, write mask, destination and three sources. */
+	struct nvfx_vertex_program *vp = vpc->vp;
+	uint32_t *hw;
+
+	vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); /* NOTE(review): result is not checked for NULL */
+	vpc->vpi = &vp->insns[vp->nr_insns - 1];
+	memset(vpc->vpi, 0, sizeof(*vpc->vpi));
+	vpc->vpi->const_index = -1; /* no constant referenced yet; emit_src() fills this in */
+
+	hw = vpc->vpi->data;
+
+	hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT));
+	hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) |
+		  (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) |
+		  (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) |
+		  (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT))); /* condition swizzle values 0,1,2,3 for x,y,z,w */
+
+	if(!nvfx->is_nv4x) { /* nv30: one opcode field; the write-mask field depends on slot and destination type */
+		hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT);
+//		hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK);
+//		hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT));
+
+		if (dst.type == NVFXSR_OUTPUT) {
+			if (slot)
+				hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT);
+			else
+				hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT);
+		} else {
+			if (slot)
+				hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT);
+			else
+				hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT);
+		}
+	 } else { /* nv4x: separate VEC and SCA opcode and write-mask fields */
+		if (slot == 0) {
+			hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT);
+			hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK;
+			hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT);
+	    } else {
+			hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT);
+			hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20));
+			hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT);
+		}
+	}
+
+	emit_dst(nvfx, vpc, hw, slot, dst);
+	emit_src(nvfx, vpc, hw, 0, s0);
+	emit_src(nvfx, vpc, hw, 1, s1);
+	emit_src(nvfx, vpc, hw, 2, s2);
+}
+
+static INLINE struct nvfx_sreg
+tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) {
+	struct nvfx_sreg src = nvfx_sr(NVFXSR_NONE, 0); /* defined result for an unsupported file; it was returned uninitialized before */
+
+	switch (fsrc->Register.File) { /* map the TGSI register file to the matching nvfx source register */
+	case TGSI_FILE_INPUT:
+		src = nvfx_sr(NVFXSR_INPUT, fsrc->Register.Index);
+		break;
+	case TGSI_FILE_CONSTANT:
+		src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0);
+		break;
+	case TGSI_FILE_IMMEDIATE:
+		src = vpc->imm[fsrc->Register.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		src = vpc->r_temp[fsrc->Register.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad src file\n");
+		break;
+	}
+
+	src.abs = fsrc->Register.Absolute; /* modifiers and swizzle always come from the TGSI operand */
+	src.negate = fsrc->Register.Negate;
+	src.swz[0] = fsrc->Register.SwizzleX;
+	src.swz[1] = fsrc->Register.SwizzleY;
+	src.swz[2] = fsrc->Register.SwizzleZ;
+	src.swz[3] = fsrc->Register.SwizzleW;
+	return src;
+}
+
+static INLINE struct nvfx_sreg
+tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) {
+	struct nvfx_sreg dst = nvfx_sr(NVFXSR_NONE, 0); /* defined result for an unsupported file; it was returned uninitialized before */
+
+	switch (fdst->Register.File) { /* look up the register prepared for this TGSI destination */
+	case TGSI_FILE_OUTPUT:
+		dst = vpc->r_result[fdst->Register.Index];
+		break;
+	case TGSI_FILE_TEMPORARY:
+		dst = vpc->r_temp[fdst->Register.Index];
+		break;
+	case TGSI_FILE_ADDRESS:
+		dst = vpc->r_address[fdst->Register.Index];
+		break;
+	default:
+		NOUVEAU_ERR("bad dst file\n");
+		break;
+	}
+
+	return dst;
+}
+
+static INLINE int
+tgsi_mask(uint tgsi)
+{
+	/* Convert a TGSI write mask into NVFX_VP_MASK_* bits,
+	 * one channel at a time. */
+
+	return ((tgsi & TGSI_WRITEMASK_X) ? NVFX_VP_MASK_X : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Y) ? NVFX_VP_MASK_Y : 0) |
+	       ((tgsi & TGSI_WRITEMASK_Z) ? NVFX_VP_MASK_Z : 0) |
+	       ((tgsi & TGSI_WRITEMASK_W) ? NVFX_VP_MASK_W : 0);
+}
+
+static boolean
+nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
+				const struct tgsi_full_instruction *finst)
+{ /* Translate one TGSI instruction into hw instruction(s); FALSE on an unsupported source file or opcode. */
+	struct nvfx_sreg src[3], dst, tmp;
+	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0); /* placeholder for unused operands */
+	int mask;
+	int ai = -1, ci = -1, ii = -1; /* input / constant / immediate index already used directly, -1 if none */
+	int i;
+
+	if (finst->Instruction.Opcode == TGSI_OPCODE_END)
+		return TRUE;
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { /* first pass: resolve temporary sources */
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+		if (fsrc->Register.File == TGSI_FILE_TEMPORARY) {
+			src[i] = tgsi_src(vpc, fsrc);
+		}
+	}
+
+	for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { /* second pass: inputs, constants and immediates */
+		const struct tgsi_full_src_register *fsrc;
+
+		fsrc = &finst->Src[i];
+
+		switch (fsrc->Register.File) {
+		case TGSI_FILE_INPUT:
+			if (ai == -1 || ai == fsrc->Register.Index) { /* only one distinct input index is used directly */
+				ai = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else { /* a different input: copy it into a temp with a MOV first */
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_CONSTANT:
+			if ((ci == -1 && ii == -1) ||
+			    ci == fsrc->Register.Index) { /* constants and immediates share a single direct reference */
+				ci = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_IMMEDIATE:
+			if ((ci == -1 && ii == -1) ||
+			    ii == fsrc->Register.Index) {
+				ii = fsrc->Register.Index;
+				src[i] = tgsi_src(vpc, fsrc);
+			} else {
+				src[i] = temp(vpc);
+				arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL,
+				      tgsi_src(vpc, fsrc), none, none);
+			}
+			break;
+		case TGSI_FILE_TEMPORARY:
+			/* handled above */
+			break;
+		default:
+			NOUVEAU_ERR("bad src file\n");
+			return FALSE;
+		}
+	}
+
+	dst  = tgsi_dst(vpc, &finst->Dst[0]);
+	mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
+
+	switch (finst->Instruction.Opcode) { /* VEC ops take operands in s0/s1 (ADD uses s0/s2); SCA ops take theirs in s2 */
+	case TGSI_OPCODE_ABS:
+		arith(vpc, VEC, MOV, dst, mask, abs(src[0]), none, none);
+		break;
+	case TGSI_OPCODE_ADD:
+		arith(vpc, VEC, ADD, dst, mask, src[0], none, src[1]);
+		break;
+	case TGSI_OPCODE_ARL:
+		arith(vpc, VEC, ARL, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_DP3:
+		arith(vpc, VEC, DP3, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DP4:
+		arith(vpc, VEC, DP4, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DPH:
+		arith(vpc, VEC, DPH, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_DST:
+		arith(vpc, VEC, DST, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_EX2:
+		arith(vpc, SCA, EX2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_EXP:
+		arith(vpc, SCA, EXP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_FLR:
+		arith(vpc, VEC, FLR, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_FRC:
+		arith(vpc, VEC, FRC, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_LG2:
+		arith(vpc, SCA, LG2, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LIT:
+		arith(vpc, SCA, LIT, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_LOG:
+		arith(vpc, SCA, LOG, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_MAD:
+		arith(vpc, VEC, MAD, dst, mask, src[0], src[1], src[2]);
+		break;
+	case TGSI_OPCODE_MAX:
+		arith(vpc, VEC, MAX, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MIN:
+		arith(vpc, VEC, MIN, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_MOV:
+		arith(vpc, VEC, MOV, dst, mask, src[0], none, none);
+		break;
+	case TGSI_OPCODE_MUL:
+		arith(vpc, VEC, MUL, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_POW: /* built as EX2(src1.x * LG2(src0.x)) through a temp */
+		tmp = temp(vpc);
+		arith(vpc, SCA, LG2, tmp, NVFX_VP_MASK_X, none, none,
+		      swz(src[0], X, X, X, X));
+		arith(vpc, VEC, MUL, tmp, NVFX_VP_MASK_X, swz(tmp, X, X, X, X),
+		      swz(src[1], X, X, X, X), none);
+		arith(vpc, SCA, EX2, dst, mask, none, none,
+		      swz(tmp, X, X, X, X));
+		break;
+	case TGSI_OPCODE_RCP:
+		arith(vpc, SCA, RCP, dst, mask, none, none, src[0]);
+		break;
+	case TGSI_OPCODE_RET: /* no instruction is emitted */
+		break;
+	case TGSI_OPCODE_RSQ: /* applied to the absolute value of the source */
+		arith(vpc, SCA, RSQ, dst, mask, none, none, abs(src[0]));
+		break;
+	case TGSI_OPCODE_SGE:
+		arith(vpc, VEC, SGE, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SGT:
+		arith(vpc, VEC, SGT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SLT:
+		arith(vpc, VEC, SLT, dst, mask, src[0], src[1], none);
+		break;
+	case TGSI_OPCODE_SUB: /* emitted as ADD with the second operand negated */
+		arith(vpc, VEC, ADD, dst, mask, src[0], none, neg(src[1]));
+		break;
+	case TGSI_OPCODE_XPD: /* cross product: MUL into a temp, then MAD with the temp negated; W is never written */
+		tmp = temp(vpc);
+		arith(vpc, VEC, MUL, tmp, mask,
+		      swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none);
+		arith(vpc, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W),
+		      swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y),
+		      neg(tmp));
+		break;
+	default:
+		NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode);
+		return FALSE;
+	}
+
+	release_temps(vpc); /* temps allocated for this instruction become free again */
+	return TRUE;
+}
+
+static boolean
+nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc,
+				const struct tgsi_full_declaration *fdec)
+{ /* Map a TGSI output declaration to its hw output register and store it in vpc->r_result[]; FALSE if unsupported. */
+	unsigned idx = fdec->Range.First; /* only the first register of the declared range is recorded */
+	int hw;
+
+	switch (fdec->Semantic.Name) {
+	case TGSI_SEMANTIC_POSITION:
+		hw = NVFX_VP(INST_DEST_POS);
+		vpc->hpos_idx = idx; /* remember which TGSI output holds the position */
+		break;
+	case TGSI_SEMANTIC_COLOR: /* only colour indices 0 and 1 exist */
+		if (fdec->Semantic.Index == 0) {
+			hw = NVFX_VP(INST_DEST_COL0);
+		} else
+		if (fdec->Semantic.Index == 1) {
+			hw = NVFX_VP(INST_DEST_COL1);
+		} else {
+			NOUVEAU_ERR("bad colour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_BCOLOR: /* back-face colours, same two indices */
+		if (fdec->Semantic.Index == 0) {
+			hw = NVFX_VP(INST_DEST_BFC0);
+		} else
+		if (fdec->Semantic.Index == 1) {
+			hw = NVFX_VP(INST_DEST_BFC1);
+		} else {
+			NOUVEAU_ERR("bad bcolour semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_FOG:
+		hw = NVFX_VP(INST_DEST_FOGC);
+		break;
+	case TGSI_SEMANTIC_PSIZE:
+		hw = NVFX_VP(INST_DEST_PSZ);
+		break;
+	case TGSI_SEMANTIC_GENERIC: /* generic index n (0..7) goes to output TC(n) */
+		if (fdec->Semantic.Index <= 7) {
+			hw = NVFX_VP(INST_DEST_TC(fdec->Semantic.Index));
+		} else {
+			NOUVEAU_ERR("bad generic semantic index\n");
+			return FALSE;
+		}
+		break;
+	case TGSI_SEMANTIC_EDGEFLAG:
+		/* not really an error just a fallback */
+		NOUVEAU_ERR("cannot handle edgeflag output\n");
+		return FALSE;
+	default:
+		NOUVEAU_ERR("bad output semantic\n");
+		return FALSE;
+	}
+
+	vpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw);
+	return TRUE;
+}
+
+/*
+ * First pass over the TGSI token stream: size and allocate the
+ * per-translation register tables held in vpc.
+ *
+ * Counts immediates, finds the highest declared temporary index and the
+ * highest address register used as the first destination of any
+ * instruction, and maps every output declaration through
+ * nvfx_vertprog_parse_decl_output().  It then allocates vpc->imm,
+ * vpc->r_temp and vpc->r_address, fills the latter two with registers
+ * obtained from temp(vpc), and resets vpc->r_temps_discard.
+ *
+ * Returns TRUE on success.  Returns FALSE if an output declaration is
+ * rejected or if an allocation fails; in the latter case everything this
+ * function allocated is released again, so the caller only has to free
+ * vpc itself.
+ */
+static boolean
+nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc)
+{
+	struct tgsi_parse_context p;
+	int high_temp = -1, high_addr = -1, nr_imm = 0, i;
+
+	tgsi_parse_init(&p, vpc->vp->pipe.tokens);
+	while (!tgsi_parse_end_of_tokens(&p)) {
+		const union tgsi_full_token *tok = &p.FullToken;
+
+		tgsi_parse_token(&p);
+		switch(tok->Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+			nr_imm++;
+			break;
+		case TGSI_TOKEN_TYPE_DECLARATION:
+		{
+			const struct tgsi_full_declaration *fdec;
+
+			fdec = &p.FullToken.FullDeclaration;
+			switch (fdec->Declaration.File) {
+			case TGSI_FILE_TEMPORARY:
+				if (fdec->Range.Last > high_temp) {
+					high_temp =
+						fdec->Range.Last;
+				}
+				break;
+#if 0 /* this would be nice.. except gallium doesn't track it */
+			case TGSI_FILE_ADDRESS:
+				if (fdec->Range.Last > high_addr) {
+					high_addr =
+						fdec->Range.Last;
+				}
+				break;
+#endif
+			case TGSI_FILE_OUTPUT:
+				if (!nvfx_vertprog_parse_decl_output(nvfx, vpc, fdec)) {
+					/* pair tgsi_parse_init() on the error path too */
+					tgsi_parse_free(&p);
+					return FALSE;
+				}
+				break;
+			default:
+				break;
+			}
+		}
+			break;
+#if 1 /* yay, parse instructions looking for address regs instead */
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			const struct tgsi_full_dst_register *fdst;
+
+			finst = &p.FullToken.FullInstruction;
+			fdst = &finst->Dst[0];
+
+			if (fdst->Register.File == TGSI_FILE_ADDRESS) {
+				if (fdst->Register.Index > high_addr)
+					high_addr = fdst->Register.Index;
+			}
+
+		}
+			break;
+#endif
+		default:
+			break;
+		}
+	}
+	tgsi_parse_free(&p);
+
+	if (nr_imm) {
+		vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_sreg));
+		if (!vpc->imm)
+			goto out_nomem;
+	}
+
+	/* high_* hold the highest index seen (-1 if none); +1 gives a count */
+	if (++high_temp) {
+		vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg));
+		if (!vpc->r_temp)
+			goto out_nomem;
+		for (i = 0; i < high_temp; i++)
+			vpc->r_temp[i] = temp(vpc);
+	}
+
+	if (++high_addr) {
+		vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_sreg));
+		if (!vpc->r_address)
+			goto out_nomem;
+		for (i = 0; i < high_addr; i++)
+			vpc->r_address[i] = temp(vpc);
+	}
+
+	vpc->r_temps_discard = 0;
+	return TRUE;
+
+out_nomem:
+	/* r_address is the last allocation, so it can never be live here */
+	if (vpc->r_temp) {
+		FREE(vpc->r_temp);
+		vpc->r_temp = NULL;
+	}
+	if (vpc->imm) {
+		FREE(vpc->imm);
+		vpc->imm = NULL;
+	}
+	return FALSE;
+}
+
+/*
+ * Translate the TGSI token stream of vp into hardware vertex program
+ * instructions (vp->insns).
+ *
+ * After nvfx_vertprog_prepare() has set up the register tables, every
+ * immediate token is turned into a constant and every instruction token
+ * is handed to nvfx_vertprog_parse_instruction().  When vp->ucp.nr is
+ * non-zero the position result is redirected to a temp beforehand, so
+ * that a final MOV to the position output and one DP4 per user clip
+ * plane can be appended afterwards.
+ *
+ * On success the last instruction is tagged NVFX_VP_INST_LAST and
+ * vp->translated is set to TRUE.  On any failure vp->translated is left
+ * untouched; the caller inspects it to detect the failure.
+ */
+static void
+nvfx_vertprog_translate(struct nvfx_context *nvfx,
+			struct nvfx_vertex_program *vp)
+{
+	struct tgsi_parse_context parse;
+	struct nvfx_vpc *vpc = NULL;
+	struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0);
+	int i;
+
+	vpc = CALLOC(1, sizeof(struct nvfx_vpc));
+	if (!vpc)
+		return;
+	vpc->vp = vp;
+
+	if (!nvfx_vertprog_prepare(nvfx, vpc)) {
+		FREE(vpc);
+		return;
+	}
+
+	/* Redirect post-transform vertex position to a temp if user clip
+	 * planes are enabled.  We need to append code to the vtxprog
+	 * to handle clip planes later.
+	 */
+	if (vp->ucp.nr)  {
+		vpc->r_result[vpc->hpos_idx] = temp(vpc);
+		vpc->r_temps_discard = 0;
+	}
+
+	tgsi_parse_init(&parse, vp->pipe.tokens);
+
+	while (!tgsi_parse_end_of_tokens(&parse)) {
+		tgsi_parse_token(&parse);
+
+		switch (parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_IMMEDIATE:
+		{
+			const struct tgsi_full_immediate *imm;
+
+			imm = &parse.FullToken.FullImmediate;
+			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
+			assert(imm->Immediate.NrTokens == 4 + 1);
+			vpc->imm[vpc->nr_imm++] =
+				constant(vpc, -1,
+					 imm->u[0].Float,
+					 imm->u[1].Float,
+					 imm->u[2].Float,
+					 imm->u[3].Float);
+		}
+			break;
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+		{
+			const struct tgsi_full_instruction *finst;
+			finst = &parse.FullToken.FullInstruction;
+			if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst))
+				goto out_err;
+		}
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Write out HPOS if it was redirected to a temp earlier */
+	if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) {
+		struct nvfx_sreg hpos = nvfx_sr(NVFXSR_OUTPUT,
+						NVFX_VP(INST_DEST_POS));
+		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+
+		arith(vpc, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none);
+	}
+
+	/* Insert code to handle user clip planes */
+	for (i = 0; i < vp->ucp.nr; i++) {
+		struct nvfx_sreg cdst = nvfx_sr(NVFXSR_OUTPUT,
+						NVFX_VP_INST_DEST_CLIP(i));
+		struct nvfx_sreg ceqn = constant(vpc, -1,
+						 nvfx->clip.ucp[i][0],
+						 nvfx->clip.ucp[i][1],
+						 nvfx->clip.ucp[i][2],
+						 nvfx->clip.ucp[i][3]);
+		struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx];
+		unsigned mask;
+
+		/* each clip plane writes a fixed component of its output */
+		switch (i) {
+		case 0: case 3: mask = NVFX_VP_MASK_Y; break;
+		case 1: case 4: mask = NVFX_VP_MASK_Z; break;
+		case 2: case 5: mask = NVFX_VP_MASK_W; break;
+		default:
+			NOUVEAU_ERR("invalid clip dist #%d\n", i);
+			goto out_err;
+		}
+
+		arith(vpc, VEC, DP4, cdst, mask, htmp, ceqn, none);
+	}
+
+	/* Nothing was emitted, so there is no instruction to tag as the
+	 * last one; indexing insns[nr_insns - 1] would be out of bounds.
+	 */
+	if (!vp->nr_insns) {
+		NOUVEAU_ERR("vertex program contains no instructions\n");
+		goto out_err;
+	}
+
+	vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST;
+	vp->translated = TRUE;
+out_err:
+	/* shared exit: release the parser and the translation scratch state */
+	tgsi_parse_free(&parse);
+	if (vpc->r_temp)
+		FREE(vpc->r_temp);
+	if (vpc->r_address)
+		FREE(vpc->r_address);
+	if (vpc->imm)
+		FREE(vpc->imm);
+	FREE(vpc);
+}
+
+/*
+ * State validation hook for the vertex program.
+ *
+ * Selects the program to use (nvfx->vertprog in HW render mode, otherwise
+ * nvfx->swtnl.vertprog), translates it if it has not been translated yet,
+ * allocates exec and const slots from the screen heaps (evicting other
+ * programs' allocations when a heap is full), patches constant source
+ * indices when the data allocation moved, and uploads constants and
+ * instructions through the channel.
+ *
+ * In HW mode the program is destroyed, and therefore re-translated, when
+ * NVFX_NEW_UCP is dirty or the stored clip state differs from nvfx->clip.
+ *
+ * Returns TRUE if the program's state object was newly referenced into
+ * nvfx->state.hw[NVFX_STATE_VERTPROG].  Returns FALSE if it was already
+ * current, or if translation failed, in which case NVFX_NEW_VERTPROG is
+ * set in nvfx->fallback_swtnl.
+ */
+static boolean
+nvfx_vertprog_validate(struct nvfx_context *nvfx)
+{
+	struct pipe_screen *pscreen = nvfx->pipe.screen;
+	struct nvfx_screen *screen = nvfx->screen;
+	struct nouveau_channel *chan = screen->base.channel;
+	struct nouveau_grobj *eng3d = screen->eng3d;
+	struct nvfx_vertex_program *vp;
+	struct pipe_buffer *constbuf;
+	boolean upload_code = FALSE, upload_data = FALSE;
+	int i;
+
+	if (nvfx->render_mode == HW) {
+		vp = nvfx->vertprog;
+		constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX];
+
+		if ((nvfx->dirty & NVFX_NEW_UCP) ||
+		    memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) {
+			nvfx_vertprog_destroy(nvfx, vp);
+			memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp));
+		}
+	} else {
+		vp = nvfx->swtnl.vertprog;
+		constbuf = NULL;
+	}
+
+	/* Translate TGSI shader into hw bytecode */
+	if (vp->translated)
+		goto check_gpu_resources;
+
+	nvfx->fallback_swtnl &= ~NVFX_NEW_VERTPROG;
+	nvfx_vertprog_translate(nvfx, vp);
+	if (!vp->translated) {
+		nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG;
+		return FALSE;
+	}
+
+check_gpu_resources:
+	/* Allocate hw vtxprog exec slots */
+	if (!vp->exec) {
+		struct nouveau_resource *heap = nvfx->screen->vp_exec_heap;
+		struct nouveau_stateobj *so;
+		uint vplen = vp->nr_insns;
+
+		if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) {
+			/* heap full: evict other programs until we fit */
+			while (heap->next && heap->size < vplen) {
+				struct nvfx_vertex_program *evict;
+
+				evict = heap->next->priv;
+				nouveau_resource_free(&evict->exec);
+			}
+
+			if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec))
+				assert(0);
+		}
+
+		so = so_new(3, 4, 0);
+		so_method(so, eng3d, NV34TCL_VP_START_FROM_ID, 1);
+		so_data  (so, vp->exec->start);
+		if(nvfx->is_nv4x) {
+			so_method(so, eng3d, NV40TCL_VP_ATTRIB_EN, 2);
+			so_data  (so, vp->ir);
+			so_data  (so, vp->or);
+		}
+		so_method(so, eng3d,  NV34TCL_VP_CLIP_PLANES_ENABLE, 1);
+		so_data  (so, vp->clip_ctrl);
+		so_ref(so, &vp->so);
+		so_ref(NULL, &so);
+
+		upload_code = TRUE;
+	}
+
+	/* Allocate hw vtxprog const slots */
+	if (vp->nr_consts && !vp->data) {
+		struct nouveau_resource *heap = nvfx->screen->vp_data_heap;
+
+		if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) {
+			/* heap full: evict other programs until we fit */
+			while (heap->next && heap->size < vp->nr_consts) {
+				struct nvfx_vertex_program *evict;
+
+				evict = heap->next->priv;
+				nouveau_resource_free(&evict->data);
+			}
+
+			if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data))
+				assert(0);
+		}
+
+		/*XXX: handle this some day */
+		assert(vp->data->start >= vp->data_start_min);
+
+		upload_data = TRUE;
+		if (vp->data_start != vp->data->start)
+			upload_code = TRUE;
+	}
+
+	/* If exec or data segments moved we need to patch the program to
+	 * fixup offsets and register IDs.
+	 */
+	if (vp->exec_start != vp->exec->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+
+			/* relocating branch offsets is not implemented */
+			if (vpi->has_branch_offset) {
+				assert(0);
+			}
+		}
+
+		vp->exec_start = vp->exec->start;
+	}
+
+	if (vp->nr_consts && vp->data_start != vp->data->start) {
+		for (i = 0; i < vp->nr_insns; i++) {
+			struct nvfx_vertex_program_exec *vpi = &vp->insns[i];
+
+			if (vpi->const_index >= 0) {
+				vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK);
+				vpi->data[1] |=
+					(vpi->const_index + vp->data->start) <<
+					NVFX_VP(INST_CONST_SRC_SHIFT);
+
+			}
+		}
+
+		vp->data_start = vp->data->start;
+	}
+
+	/* Update + Upload constant values */
+	if (vp->nr_consts) {
+		float *map = NULL;
+
+		if (constbuf) {
+			map = pipe_buffer_map(pscreen, constbuf,
+					      PIPE_BUFFER_USAGE_CPU_READ);
+		}
+
+		for (i = 0; i < vp->nr_consts; i++) {
+			struct nvfx_vertex_program_data *vpd = &vp->consts[i];
+
+			if (vpd->index >= 0) {
+				if (map) {
+					const float *src = &map[vpd->index * 4];
+
+					/* unchanged and slots did not move */
+					if (!upload_data &&
+					    !memcmp(vpd->value, src,
+						    4 * sizeof(float)))
+						continue;
+					memcpy(vpd->value, src,
+					       4 * sizeof(float));
+				} else if (!upload_data) {
+					/* No mapped constant buffer to
+					 * refresh from; the cached value is
+					 * only re-sent when the slots moved.
+					 */
+					continue;
+				}
+			}
+
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_CONST_ID, 5);
+			OUT_RING  (chan, i + vp->data->start);
+			OUT_RINGp (chan, (uint32_t *)vpd->value, 4);
+		}
+
+		if (constbuf)
+			pipe_buffer_unmap(pscreen, constbuf);
+	}
+
+	/* Upload vtxprog */
+	if (upload_code) {
+#if 0
+		for (i = 0; i < vp->nr_insns; i++) {
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]);
+			NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]);
+		}
+#endif
+		BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1);
+		OUT_RING  (chan, vp->exec->start);
+		for (i = 0; i < vp->nr_insns; i++) {
+			BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4);
+			OUT_RINGp (chan, vp->insns[i].data, 4);
+		}
+	}
+
+	if (vp->so != nvfx->state.hw[NVFX_STATE_VERTPROG]) {
+		so_ref(vp->so, &nvfx->state.hw[NVFX_STATE_VERTPROG]);
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+/*
+ * Discard everything that was derived from translating vp: the
+ * instruction and constant arrays, the exec and data resource
+ * allocations, the cached start offsets, the ir/or/clip_ctrl words and
+ * the state object reference.
+ *
+ * vp itself is not freed.  Clearing vp->translated makes
+ * nvfx_vertprog_validate() translate the program again.
+ */
+void
+nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp)
+{
+	vp->translated = FALSE;
+
+	/* drop the translated instruction stream, if there is one */
+	if (vp->nr_insns != 0) {
+		FREE(vp->insns);
+		vp->nr_insns = 0;
+		vp->insns = NULL;
+	}
+
+	/* likewise for the constant table */
+	if (vp->nr_consts != 0) {
+		FREE(vp->consts);
+		vp->nr_consts = 0;
+		vp->consts = NULL;
+	}
+
+	/* give the exec slots back and forget where they started */
+	nouveau_resource_free(&vp->exec);
+	vp->exec_start = 0;
+
+	/* same for the const slots */
+	nouveau_resource_free(&vp->data);
+	vp->data_start_min = 0;
+	vp->data_start = 0;
+
+	vp->clip_ctrl = 0;
+	vp->or = 0;
+	vp->ir = 0;
+
+	so_ref(NULL, &vp->so);
+}
+
+struct nvfx_state_entry nvfx_state_vertprog = {	/* state entry for the vertex program */
+	.validate = nvfx_vertprog_validate,	/* translates, allocates and uploads the program */
+	.dirty = {
+		.pipe = NVFX_NEW_VERTPROG | NVFX_NEW_UCP,	/* presumably the dirty bits that trigger validate -- TODO confirm */
+		.hw = NVFX_STATE_VERTPROG,	/* state.hw[] slot that validate updates */
+	}
+};
author	Keith Whitwell <[email protected]>	2010-03-15 09:44:52 +0000
committer	Keith Whitwell <[email protected]>	2010-03-15 09:44:52 +0000
commit	42910ebe7b9748c0ecb6a597bae3e7d43c7e170f (patch)
tree	b2b9c72dc47a0473304dc2db1b078d13b658df34 /src/gallium/drivers/nvfx
parent	47bfbd452c93e6a8db013fb90d9f42210cf24889 (diff)
parent	68e58a96e80865878e6881dc4d34fcc3ec24eb19 (diff)