Diffstat (limited to 'src/gallium/drivers/nvfx')
-rw-r--r--  src/gallium/drivers/nvfx/Makefile            |    1
-rw-r--r--  src/gallium/drivers/nvfx/nv30_fragtex.c      |    7
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_buffer.c       |   98
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_context.c      |    3
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_context.h      |   99
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_draw.c         |   59
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_fragprog.c     |    7
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_push.c         |  402
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_resource.c     |    6
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_resource.h     |   91
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_screen.c       |   33
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_screen.h       |   13
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_state.c        |   84
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_state_emit.c   |   51
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_state_fb.c     |    5
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_surface.c      |   23
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_transfer.c     |  173
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_vbo.c          | 1016
-rw-r--r--  src/gallium/drivers/nvfx/nvfx_vertprog.c     |   12
19 files changed, 1357 insertions(+), 826 deletions(-)
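The core of this change is a per-draw cost model (nvfx_decide_upload_mode() in nvfx_vbo.c, further down in the diff) that compares the memcpy cost of pushing vertex data inline on the FIFO against the cost of uploading a buffer's dirty range to a GPU buffer object, with tunables such as buffer_allocation_cost, inline_cost_per_hardware_cost and static_reuse_threshold. Below is a minimal standalone sketch of that comparison for a single buffer; the struct and function names are invented for illustration, and the real driver additionally folds index-buffer costs and per-vertex-buffer accounting into the decision.

```c
/*
 * Illustrative sketch only (not part of the commit): the inline-vs-VBO
 * decision reduces to comparing the bytes that would be copied inline on
 * the FIFO against the bytes that would have to be (re)uploaded to a GPU
 * buffer object, scaled by a tunable ratio. Names and numbers are made up.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_buffer {
	unsigned dirty_begin, dirty_end;        /* bytes not yet uploaded to the bo */
	bool     has_bo;                        /* GPU buffer object already allocated? */
	long long bytes_to_draw_until_static;   /* goes negative once the buffer looks static */
};

/* mirrors the idea of nvfx_buffer_seems_static(): a buffer is considered
 * static once enough bytes have been drawn from it without modification */
static bool seems_static(const struct fake_buffer *b)
{
	return b->bytes_to_draw_until_static < 0;
}

/* returns true if uploading to a hardware buffer looks cheaper than
 * pushing the vertex data inline for this draw call */
static bool prefer_hardware_buffer(const struct fake_buffer *b,
				   unsigned per_vertex_size, unsigned vertex_count,
				   unsigned buffer_allocation_cost,
				   float inline_cost_per_hardware_cost)
{
	unsigned inline_cost = per_vertex_size * vertex_count;
	unsigned hardware_cost = 0;

	if (!seems_static(b)) {
		/* only the dirty range has to be copied into the bo ... */
		hardware_cost += b->dirty_end - b->dirty_begin;
		/* ... plus a fixed penalty if the bo does not exist yet */
		if (!b->has_bo)
			hardware_cost += buffer_allocation_cost;
	}
	return (float)inline_cost > (float)hardware_cost * inline_cost_per_hardware_cost;
}

int main(void)
{
	struct fake_buffer buf = { 0, 4096, false, 100000 };
	printf("%s\n", prefer_hardware_buffer(&buf, 32, 1000, 16384, 1.0f)
	       ? "upload to VBO" : "push inline");
	return 0;
}
```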
diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile index 2834f8984c7..6cbbad699eb 100644 --- a/src/gallium/drivers/nvfx/Makefile +++ b/src/gallium/drivers/nvfx/Makefile @@ -14,6 +14,7 @@ C_SOURCES = \ nv30_fragtex.c \ nv40_fragtex.c \ nvfx_miptree.c \ + nvfx_push.c \ nvfx_query.c \ nvfx_resource.c \ nvfx_screen.c \ diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c index 63c578a0ce1..db8a8fc4b08 100644 --- a/src/gallium/drivers/nvfx/nv30_fragtex.c +++ b/src/gallium/drivers/nvfx/nv30_fragtex.c @@ -1,7 +1,6 @@ #include "util/u_format.h" #include "nvfx_context.h" -#include "nouveau/nouveau_util.h" #include "nvfx_tex.h" #include "nvfx_resource.h" @@ -44,9 +43,9 @@ nv30_sampler_view_init(struct pipe_context *pipe, txf = sv->u.init_fmt; txf |= (level != sv->base.last_level ? NV34TCL_TX_FORMAT_MIPMAP : 0); - txf |= log2i(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; - txf |= log2i(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT; - txf |= log2i(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT; + txf |= util_logbase2(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; + txf |= util_logbase2(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT; + txf |= util_logbase2(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT; txf |= 0x10000; sv->u.nv30.fmt[0] = tf->fmt[0] | txf; diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c index 44680e51959..89bb8570efd 100644 --- a/src/gallium/drivers/nvfx/nvfx_buffer.c +++ b/src/gallium/drivers/nvfx/nvfx_buffer.c @@ -6,13 +6,16 @@ #include "nouveau/nouveau_screen.h" #include "nouveau/nouveau_winsys.h" #include "nvfx_resource.h" +#include "nvfx_screen.h" void nvfx_buffer_destroy(struct pipe_screen *pscreen, struct pipe_resource *presource) { - struct nvfx_resource *buffer = nvfx_resource(presource); + struct nvfx_buffer *buffer = nvfx_buffer(presource); - nouveau_screen_bo_release(pscreen, buffer->bo); + if(!(buffer->base.base.flags & NVFX_RESOURCE_FLAG_USER)) + align_free(buffer->data); + nouveau_screen_bo_release(pscreen, buffer->base.bo); FREE(buffer); } @@ -20,31 +23,22 @@ struct pipe_resource * nvfx_buffer_create(struct pipe_screen *pscreen, const struct pipe_resource *template) { - struct nvfx_resource *buffer; + struct nvfx_screen* screen = nvfx_screen(pscreen); + struct nvfx_buffer* buffer; - buffer = CALLOC_STRUCT(nvfx_resource); + buffer = CALLOC_STRUCT(nvfx_buffer); if (!buffer) return NULL; - buffer->base = *template; - buffer->base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - pipe_reference_init(&buffer->base.reference, 1); - buffer->base.screen = pscreen; + buffer->base.base = *template; + buffer->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + pipe_reference_init(&buffer->base.base.reference, 1); + buffer->base.base.screen = pscreen; + buffer->size = util_format_get_stride(template->format, template->width0); + buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; + buffer->data = align_malloc(buffer->size, 16); - buffer->bo = nouveau_screen_bo_new(pscreen, - 16, - buffer->base.usage, - buffer->base.bind, - buffer->base.width0); - - if (buffer->bo == NULL) - goto fail; - - return &buffer->base; - -fail: - FREE(buffer); - return NULL; + return &buffer->base.base; } @@ -54,29 +48,49 @@ nvfx_user_buffer_create(struct pipe_screen *pscreen, unsigned bytes, unsigned usage) { - struct nvfx_resource *buffer; + 
struct nvfx_screen* screen = nvfx_screen(pscreen); + struct nvfx_buffer* buffer; - buffer = CALLOC_STRUCT(nvfx_resource); + buffer = CALLOC_STRUCT(nvfx_buffer); if (!buffer) return NULL; - pipe_reference_init(&buffer->base.reference, 1); - buffer->base.flags = NVFX_RESOURCE_FLAG_LINEAR; - buffer->base.screen = pscreen; - buffer->base.format = PIPE_FORMAT_R8_UNORM; - buffer->base.usage = PIPE_USAGE_IMMUTABLE; - buffer->base.bind = usage; - buffer->base.width0 = bytes; - buffer->base.height0 = 1; - buffer->base.depth0 = 1; - - buffer->bo = nouveau_screen_bo_user(pscreen, ptr, bytes); - if (!buffer->bo) - goto fail; - - return &buffer->base; + pipe_reference_init(&buffer->base.base.reference, 1); + buffer->base.base.flags = NVFX_RESOURCE_FLAG_LINEAR | NVFX_RESOURCE_FLAG_USER; + buffer->base.base.screen = pscreen; + buffer->base.base.format = PIPE_FORMAT_R8_UNORM; + buffer->base.base.usage = PIPE_USAGE_IMMUTABLE; + buffer->base.base.bind = usage; + buffer->base.base.width0 = bytes; + buffer->base.base.height0 = 1; + buffer->base.base.depth0 = 1; + buffer->data = ptr; + buffer->size = bytes; + buffer->bytes_to_draw_until_static = bytes * screen->static_reuse_threshold; + buffer->dirty_end = bytes; + + return &buffer->base.base; +} -fail: - FREE(buffer); - return NULL; +void nvfx_buffer_upload(struct nvfx_buffer* buffer) +{ + unsigned dirty = buffer->dirty_end - buffer->dirty_begin; + if(!buffer->base.bo) + { + buffer->base.bo = nouveau_screen_bo_new(buffer->base.base.screen, + 16, + buffer->base.base.usage, + buffer->base.base.bind, + buffer->base.base.width0); + } + + if(dirty) + { + // TODO: may want to use a temporary in some cases + nouveau_bo_map(buffer->base.bo, NOUVEAU_BO_WR + | (buffer->dirty_unsynchronized ? NOUVEAU_BO_NOSYNC : 0)); + memcpy(buffer->base.bo->map + buffer->dirty_begin, buffer->data + buffer->dirty_begin, dirty); + nouveau_bo_unmap(buffer->base.bo); + buffer->dirty_begin = buffer->dirty_end = 0; + } } diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c index 1980176b23e..94c854b22b8 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.c +++ b/src/gallium/drivers/nvfx/nvfx_context.c @@ -76,7 +76,9 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) nvfx_init_surface_functions(nvfx); nvfx_init_state_functions(nvfx); nvfx_init_sampling_functions(nvfx); + nvfx_init_vbo_functions(nvfx); nvfx_init_resource_functions(&nvfx->pipe); + nvfx_init_transfer_functions(&nvfx->pipe); /* Create, configure, and install fallback swtnl path */ nvfx->draw = draw_create(&nvfx->pipe); @@ -89,6 +91,7 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) /* set these to that we init them on first validation */ nvfx->state.scissor_enabled = ~0; nvfx->state.stipple_enabled = ~0; + nvfx->use_vertex_buffers = -1; LIST_INITHEAD(&nvfx->render_cache); diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h index bce19df044d..8899bf991e1 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.h +++ b/src/gallium/drivers/nvfx/nvfx_context.h @@ -44,6 +44,7 @@ #define NVFX_NEW_SR (1 << 13) #define NVFX_NEW_VERTCONST (1 << 14) #define NVFX_NEW_FRAGCONST (1 << 15) +#define NVFX_NEW_INDEX (1 << 16) struct nvfx_rasterizer_state { struct pipe_rasterizer_state pipe; @@ -71,9 +72,53 @@ struct nvfx_state { unsigned render_temps; }; +struct nvfx_per_vertex_element { + unsigned idx; + unsigned vertex_buffer_index; + unsigned src_offset; +}; + +struct nvfx_low_frequency_element { + unsigned idx; + unsigned vertex_buffer_index; + unsigned 
src_offset; + void (*fetch_rgba_float)(float *dst, const uint8_t *src, unsigned i, unsigned j); + unsigned ncomp; +}; + +struct nvfx_per_instance_element { + struct nvfx_low_frequency_element base; + unsigned instance_divisor; +}; + +struct nvfx_per_vertex_buffer_info +{ + unsigned vertex_buffer_index; + unsigned per_vertex_size; +}; + struct nvfx_vtxelt_state { struct pipe_vertex_element pipe[16]; unsigned num_elements; + unsigned vtxfmt[16]; + + unsigned num_per_vertex_buffer_infos; + struct nvfx_per_vertex_buffer_info per_vertex_buffer_info[16]; + + unsigned num_per_vertex; + struct nvfx_per_vertex_element per_vertex[16]; + + unsigned num_per_instance; + struct nvfx_per_instance_element per_instance[16]; + + unsigned num_constant; + struct nvfx_low_frequency_element constant[16]; + + boolean needs_translate; + struct translate* translate; + + unsigned vertex_length; + unsigned max_vertices_per_packet; }; struct nvfx_render_target { @@ -127,8 +172,6 @@ struct nvfx_context { struct pipe_viewport_state viewport; struct pipe_framebuffer_state framebuffer; struct pipe_index_buffer idxbuf; - struct pipe_resource *idxbuf_buffer; - unsigned idxbuf_format; struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS]; unsigned nr_samplers; @@ -137,8 +180,14 @@ struct nvfx_context { struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned vtxbuf_nr; struct nvfx_vtxelt_state *vtxelt; + int base_vertex; + boolean use_index_buffer; + /* -1 = hardware input setup is outdated + * 0 = hardware input setup is for inline vertices + * 1 = hardware input setup is for hardware vertices + */ + int use_vertex_buffers; - unsigned vbo_bo; unsigned hw_vtxelt_nr; uint8_t hw_samplers; uint32_t hw_txf[8]; @@ -180,11 +229,7 @@ extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers, /* nvfx_draw.c */ extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx); -extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe, - struct pipe_resource *idxbuf, - unsigned ib_size, int ib_bias, - unsigned mode, - unsigned start, unsigned count); +extern void nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info); extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx); /* nvfx_fb.c */ @@ -245,17 +290,53 @@ extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx); extern void nvfx_state_emit(struct nvfx_context *nvfx); /* nvfx_transfer.c */ -extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx); +extern void nvfx_init_transfer_functions(struct pipe_context *pipe); /* nvfx_vbo.c */ extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx); extern void nvfx_vbo_relocate(struct nvfx_context *nvfx); +extern void nvfx_idxbuf_validate(struct nvfx_context* nvfx); +extern void nvfx_idxbuf_relocate(struct nvfx_context* nvfx); extern void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); +extern void nvfx_init_vbo_functions(struct nvfx_context *nvfx); +extern unsigned nvfx_vertex_formats[]; /* nvfx_vertprog.c */ extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx); extern void nvfx_vertprog_destroy(struct nvfx_context *, struct nvfx_vertex_program *); +/* nvfx_push.c */ +extern void nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); + +/* must WAIT_RING(chan, ncomp + 1) or equivalent beforehand! 
*/ +static inline void nvfx_emit_vtx_attr(struct nouveau_channel* chan, unsigned attrib, float* v, unsigned ncomp) +{ + switch (ncomp) { + case 4: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + OUT_RING(chan, fui(v[2])); + OUT_RING(chan, fui(v[3])); + break; + case 3: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + OUT_RING(chan, fui(v[2])); + break; + case 2: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + break; + case 1: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1)); + OUT_RING(chan, fui(v[0])); + break; + } +} + #endif diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c index 22cff370b77..331e28418ad 100644 --- a/src/gallium/drivers/nvfx/nvfx_draw.c +++ b/src/gallium/drivers/nvfx/nvfx_draw.c @@ -9,6 +9,7 @@ #include "draw/draw_pipe.h" #include "nvfx_context.h" +#include "nvfx_resource.h" /* Simple, but crappy, swtnl path, hopefully we wont need to hit this very * often at all. Uses "quadro style" vertex submission + a fixed vertex @@ -39,30 +40,21 @@ nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v) unsigned idx = nvfx->swtnl.draw[i]; unsigned hw = nvfx->swtnl.hw[i]; + WAIT_RING(chan, 5); switch (nvfx->swtnl.emit[i]) { case EMIT_OMIT: break; case EMIT_1F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1); - OUT_RING (chan, fui(v->data[idx][0])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 1); break; case EMIT_2F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 2); break; case EMIT_3F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); - OUT_RING (chan, fui(v->data[idx][2])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 3); break; case EMIT_4F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); - OUT_RING (chan, fui(v->data[idx][2])); - OUT_RING (chan, fui(v->data[idx][3])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 4); break; case 0xff: BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4); @@ -231,15 +223,9 @@ nvfx_draw_render_stage(struct nvfx_context *nvfx) } void -nvfx_draw_elements_swtnl(struct pipe_context *pipe, - struct pipe_resource *idxbuf, - unsigned idxbuf_size, int idxbuf_bias, - unsigned mode, unsigned start, unsigned count) +nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info) { struct nvfx_context *nvfx = nvfx_context(pipe); - struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS]; - struct pipe_transfer *ib_transfer = NULL; - struct pipe_transfer *cb_transfer = NULL; unsigned i; void *map; @@ -247,18 +233,15 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe, return; nvfx_state_emit(nvfx); + /* these must be passed without adding the offsets */ for (i = 0; i < nvfx->vtxbuf_nr; i++) { - map = pipe_buffer_map(pipe, nvfx->vtxbuf[i].buffer, - PIPE_TRANSFER_READ, - &vb_transfer[i]); + map = nvfx_buffer(nvfx->vtxbuf[i].buffer)->data; draw_set_mapped_vertex_buffer(nvfx->draw, i, map); } - if (idxbuf) { - map = pipe_buffer_map(pipe, idxbuf, - PIPE_TRANSFER_READ, - &ib_transfer); - draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, 
idxbuf_bias, map); + if (info->indexed) { + map = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset; + draw_set_mapped_element_buffer_range(nvfx->draw, nvfx->idxbuf.index_size, info->index_bias, info->min_index, info->max_index, map); } else { draw_set_mapped_element_buffer(nvfx->draw, 0, 0, NULL); } @@ -266,28 +249,14 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe, if (nvfx->constbuf[PIPE_SHADER_VERTEX]) { const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX]; - map = pipe_buffer_map(pipe, - nvfx->constbuf[PIPE_SHADER_VERTEX], - PIPE_TRANSFER_READ, - &cb_transfer); + map = nvfx_buffer(nvfx->constbuf[PIPE_SHADER_VERTEX])->data; draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0, map, nr); } - draw_arrays(nvfx->draw, mode, start, count); - - for (i = 0; i < nvfx->vtxbuf_nr; i++) - pipe_buffer_unmap(pipe, nvfx->vtxbuf[i].buffer, vb_transfer[i]); - - if (idxbuf) - pipe_buffer_unmap(pipe, idxbuf, ib_transfer); - - if (nvfx->constbuf[PIPE_SHADER_VERTEX]) - pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX], - cb_transfer); + draw_arrays_instanced(nvfx->draw, info->mode, info->start, info->count, info->start_instance, info->instance_count); draw_flush(nvfx->draw); - pipe->flush(pipe, 0, NULL); } static INLINE void diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index ee41f03b9b8..ae4fe3aa262 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -9,6 +9,7 @@ #include "nvfx_context.h" #include "nvfx_shader.h" +#include "nvfx_resource.h" #define MAX_CONSTS 128 #define MAX_IMM 32 @@ -925,10 +926,7 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) { struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT]; - // TODO: avoid using transfers, just directly the buffer - struct pipe_transfer* transfer; - // TODO: does this check make any sense, or should we do this unconditionally? - uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer); + uint32_t* map = (uint32_t*)nvfx_buffer(constbuf)->data; uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset); int i; @@ -942,7 +940,6 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t)); } } - pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer); } } diff --git a/src/gallium/drivers/nvfx/nvfx_push.c b/src/gallium/drivers/nvfx/nvfx_push.c new file mode 100644 index 00000000000..52e891c6678 --- /dev/null +++ b/src/gallium/drivers/nvfx/nvfx_push.c @@ -0,0 +1,402 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" +#include "util/u_format.h" +#include "util/u_split_prim.h" +#include "translate/translate.h" + +#include "nvfx_context.h" +#include "nvfx_resource.h" + +struct push_context { + struct nouveau_channel* chan; + + void *idxbuf; + int32_t idxbias; + + float edgeflag; + int edgeflag_attr; + + unsigned vertex_length; + unsigned max_vertices_per_packet; + + struct translate* translate; +}; + +static void +emit_edgeflag(void *priv, boolean enabled) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + + OUT_RING(chan, RING_3D(NV34TCL_EDGEFLAG_ENABLE, 1)); + OUT_RING(chan, enabled ? 
1 : 0); +} + +static void +emit_vertices_lookup8(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint8_t* elts = (uint8_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts8(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices_lookup16(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint16_t* elts = (uint16_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts16(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices_lookup32(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint32_t* elts = (uint32_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run(ctx->translate, start, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + start += push; + } +} + +static void +emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + unsigned nr = (vc & 0xff); + if (nr) { + OUT_RING(chan, RING_3D(reg, 1)); + OUT_RING (chan, ((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
2047 : nr; + + nr -= push; + + OUT_RING(chan, RING_3D_NI(reg, push)); + while (push--) { + OUT_RING(chan, ((0x100 - 1) << 24) | start); + start += 0x100; + } + } +} + +static void +emit_ib_ranges(void* priv, unsigned start, unsigned vc) +{ + emit_ranges(priv, start, vc, NV34TCL_VB_INDEX_BATCH); +} + +static void +emit_vb_ranges(void* priv, unsigned start, unsigned vc) +{ + emit_ranges(priv, start, vc, NV34TCL_VB_VERTEX_BATCH); +} + +static INLINE void +emit_elt8(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint8_t *elts = (uint8_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + if (vc & 1) { + OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); + OUT_RING (chan, elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + unsigned push = MIN2(vc, 2047 * 2); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); + for (i = 0; i < push; i+=2) + OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias)); + + vc -= push; + elts += push; + } +} + +static INLINE void +emit_elt16(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint16_t *elts = (uint16_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + if (vc & 1) { + OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); + OUT_RING (chan, elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + unsigned push = MIN2(vc, 2047 * 2); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); + for (i = 0; i < push; i+=2) + OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias)); + + vc -= push; + elts += push; + } +} + +static INLINE void +emit_elt32(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint32_t *elts = (uint32_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + while (vc) { + unsigned push = MIN2(vc, 2047); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push)); + assert(AVAIL_RING(chan) >= push); + if(idxbias) + { + for(unsigned i = 0; i < push; ++i) + OUT_RING(chan, elts[i] + idxbias); + } + else + OUT_RINGp(chan, elts, push); + + vc -= push; + elts += push; + } +} + +void +nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nouveau_channel *chan = nvfx->screen->base.channel; + struct push_context ctx; + struct util_split_prim s; + unsigned instances_left = info->instance_count; + int vtx_value; + unsigned hw_mode = nvgl_primitive(info->mode); + int i; + struct + { + uint8_t* map; + unsigned step; + } per_instance[16]; + unsigned p_overhead = 0 + + 4 /* begin/end */ + + 4; /* potential edgeflag enable/disable */ + + ctx.chan = nvfx->screen->base.channel; + ctx.translate = nvfx->vtxelt->translate; + ctx.idxbuf = NULL; + ctx.vertex_length = nvfx->vtxelt->vertex_length; + ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet; + ctx.edgeflag = 0.5f; + // TODO: figure out if we really want to handle this, and do so in that case + ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in; + + if(!nvfx->use_vertex_buffers) + { + for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset; + 
if(info->indexed) + data += info->index_bias * vb->stride; + ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0); + } + + if(ctx.edgeflag_attr < 16) + vtx_value = -(ctx.vertex_length + 3); /* vertex data and edgeflag header and value */ + else + { + p_overhead += 1; /* initial vertex_data header */ + vtx_value = -ctx.vertex_length; /* vertex data and edgeflag header and value */ + } + + if (info->indexed) { + // XXX: this case and is broken and probably need a new VTX_ATTR push path + if (nvfx->idxbuf.index_size == 1) + s.emit = emit_vertices_lookup8; + else if (nvfx->idxbuf.index_size == 2) + s.emit = emit_vertices_lookup16; + else + s.emit = emit_vertices_lookup32; + } else + s.emit = emit_vertices; + } + else + { + if(!info->indexed || nvfx->use_index_buffer) + { + s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges; + p_overhead += 3; + vtx_value = 0; + } + else if (nvfx->idxbuf.index_size == 4) + { + s.emit = emit_elt32; + p_overhead += 1; + vtx_value = 8; + } + else + { + s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8; + p_overhead += 3; + vtx_value = 7; + } + } + + ctx.idxbias = info->index_bias; + if(nvfx->use_vertex_buffers) + ctx.idxbias -= nvfx->base_vertex; + + /* map index buffer, if present */ + if (info->indexed && !nvfx->use_index_buffer) + ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset; + + s.priv = &ctx; + s.edge = emit_edgeflag; + + for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) + { + struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index]; + float v[4]; + per_instance[i].step = info->start_instance % ve->instance_divisor; + per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset; + + nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0); + + WAIT_RING(chan, 5); + nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp); + } + + /* per-instance loop */ + while (instances_left--) { + int max_verts; + boolean done; + + util_split_prim_init(&s, info->mode, info->start, info->count); + nvfx_state_emit(nvfx); + for(;;) { + max_verts = AVAIL_RING(chan); + max_verts -= p_overhead; + + /* if vtx_value < 0, each vertex is -vtx_value words long + * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation) + */ + if(vtx_value < 0) + { + max_verts /= -vtx_value; + max_verts -= (max_verts >> 10); /* vertex data headers */ + } + else + { + if(max_verts >= (1 << 23)) /* avoid overflow here */ + max_verts = (1 << 23); + max_verts = (max_verts * 255) >> vtx_value; + } + + //printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts); + + if(max_verts >= 16) + { + OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); + OUT_RING(chan, hw_mode); + done = util_split_prim_next(&s, max_verts); + OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); + OUT_RING(chan, 0); + + if(done) + break; + } + + FIRE_RING(chan); + nvfx_state_emit(nvfx); + } + + /* set data for the next instance, if any changed */ + for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) + { + struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index]; + + if(++per_instance[i].step == ve->instance_divisor) + { + float v[4]; + per_instance[i].map += vb->stride; + per_instance[i].step = 0; + + 
nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0); + WAIT_RING(chan, 5); + nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp); + } + } + } +} diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c index 1c921b47100..3a46e0a7a57 100644 --- a/src/gallium/drivers/nvfx/nvfx_resource.c +++ b/src/gallium/drivers/nvfx/nvfx_resource.c @@ -59,12 +59,6 @@ nvfx_resource_get_handle(struct pipe_screen *pscreen, void nvfx_init_resource_functions(struct pipe_context *pipe) { - pipe->get_transfer = nvfx_transfer_new; - pipe->transfer_map = nvfx_transfer_map; - pipe->transfer_flush_region = u_default_transfer_flush_region; - pipe->transfer_unmap = nvfx_transfer_unmap; - pipe->transfer_destroy = util_staging_transfer_destroy; - pipe->transfer_inline_write = u_default_transfer_inline_write; pipe->is_resource_referenced = nvfx_resource_is_referenced; } diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h index ff86f6d9cb6..583be4de2ae 100644 --- a/src/gallium/drivers/nvfx/nvfx_resource.h +++ b/src/gallium/drivers/nvfx/nvfx_resource.h @@ -17,8 +17,23 @@ struct nvfx_resource { struct nouveau_bo *bo; }; +static INLINE +struct nvfx_resource *nvfx_resource(struct pipe_resource *resource) +{ + return (struct nvfx_resource *)resource; +} + #define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) +#define NVFX_RESOURCE_FLAG_USER (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) + +/* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? */ +static INLINE boolean +nvfx_resource_mapped_by_gpu(struct pipe_resource *resource) +{ + return nvfx_resource(resource)->bo->handle; +} +/* is resource in VRAM? 
*/ static inline int nvfx_resource_on_gpu(struct pipe_resource* pr) { @@ -63,12 +78,6 @@ struct nvfx_surface { struct nvfx_miptree* temp; }; -static INLINE -struct nvfx_resource *nvfx_resource(struct pipe_resource *resource) -{ - return (struct nvfx_resource *)resource; -} - static INLINE struct nouveau_bo * nvfx_surface_buffer(struct pipe_surface *surf) { @@ -106,22 +115,6 @@ nvfx_miptree_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *template, struct winsys_handle *whandle); -struct pipe_resource * -nvfx_buffer_create(struct pipe_screen *pscreen, - const struct pipe_resource *template); - -void -nvfx_buffer_destroy(struct pipe_screen *pscreen, - struct pipe_resource *presource); - -struct pipe_resource * -nvfx_user_buffer_create(struct pipe_screen *screen, - void *ptr, - unsigned bytes, - unsigned usage); - - - void nvfx_miptree_surface_del(struct pipe_surface *ps); @@ -173,4 +166,58 @@ nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf); void nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf); +struct nvfx_buffer +{ + struct nvfx_resource base; + uint8_t* data; + unsigned size; + + /* the range of data not yet uploaded to the GPU bo */ + unsigned dirty_begin; + unsigned dirty_end; + + /* whether all transfers were unsynchronized */ + boolean dirty_unsynchronized; + + /* whether it would have been profitable to upload + * the latest updated data to the GPU immediately */ + boolean last_update_static; + + /* how many bytes we need to draw before we deem + * the buffer to be static + */ + long long bytes_to_draw_until_static; +}; + +static inline struct nvfx_buffer* nvfx_buffer(struct pipe_resource* pr) +{ + return (struct nvfx_buffer*)pr; +} + +/* this is an heuristic to determine whether we are better off uploading the + * buffer to the GPU, or just continuing pushing it on the FIFO + */ +static inline boolean nvfx_buffer_seems_static(struct nvfx_buffer* buffer) +{ + return buffer->last_update_static + || buffer->bytes_to_draw_until_static < 0; +} + +struct pipe_resource * +nvfx_buffer_create(struct pipe_screen *pscreen, + const struct pipe_resource *template); + +void +nvfx_buffer_destroy(struct pipe_screen *pscreen, + struct pipe_resource *presource); + +struct pipe_resource * +nvfx_user_buffer_create(struct pipe_screen *screen, + void *ptr, + unsigned bytes, + unsigned usage); + +void +nvfx_buffer_upload(struct nvfx_buffer* buffer); + #endif diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c index a1b8361a9a4..7e3caf8d2e3 100644 --- a/src/gallium/drivers/nvfx/nvfx_screen.c +++ b/src/gallium/drivers/nvfx/nvfx_screen.c @@ -163,11 +163,11 @@ nvfx_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param) } static boolean -nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, +nvfx_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, - unsigned tex_usage, unsigned geom_flags) + unsigned bind, unsigned geom_flags) { struct nvfx_screen *screen = nvfx_screen(pscreen); struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front; @@ -175,7 +175,7 @@ nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, if (sample_count > 1) return FALSE; - if (tex_usage & PIPE_BIND_RENDER_TARGET) { + if (bind & PIPE_BIND_RENDER_TARGET) { switch (format) { case PIPE_FORMAT_B8G8R8A8_UNORM: case PIPE_FORMAT_B8G8R8X8_UNORM: @@ -186,7 +186,7 @@ 
nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, } } - if (tex_usage & PIPE_BIND_DEPTH_STENCIL) { + if (bind & PIPE_BIND_DEPTH_STENCIL) { switch (format) { case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: @@ -201,7 +201,7 @@ nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, } } - if (tex_usage & PIPE_BIND_SAMPLER_VIEW) { + if (bind & PIPE_BIND_SAMPLER_VIEW) { struct nvfx_texture_format* tf = &nvfx_texture_formats[format]; if(util_format_is_s3tc(format) && !util_format_s3tc_enabled) return FALSE; @@ -218,6 +218,22 @@ nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, } } + // note that we do actually support everything through translate + if (bind & PIPE_BIND_VERTEX_BUFFER) { + unsigned type = nvfx_vertex_formats[format]; + if(!type) + return FALSE; + } + + if (bind & PIPE_BIND_INDEX_BUFFER) { + // 8-bit indices supported, but not in hardware index buffer + if(format != PIPE_FORMAT_R16_USCALED && format != PIPE_FORMAT_R32_USCALED) + return FALSE; + } + + if(bind & PIPE_BIND_STREAM_OUTPUT) + return FALSE; + return TRUE; } @@ -387,7 +403,7 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) pscreen->destroy = nvfx_screen_destroy; pscreen->get_param = nvfx_screen_get_param; pscreen->get_paramf = nvfx_screen_get_paramf; - pscreen->is_format_supported = nvfx_screen_surface_format_supported; + pscreen->is_format_supported = nvfx_screen_is_format_supported; pscreen->context_create = nvfx_create; switch (dev->chipset & 0xf0) { @@ -419,6 +435,11 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE); + screen->trace_draw = debug_get_bool_option("NVFX_TRACE_DRAW", FALSE); + + screen->buffer_allocation_cost = debug_get_num_option("NVFX_BUFFER_ALLOCATION_COST", 16384); + screen->inline_cost_per_hardware_cost = atof(debug_get_option("NVFX_INLINE_COST_PER_HARDWARE_COST", "1.0")); + screen->static_reuse_threshold = atof(debug_get_option("NVFX_STATIC_REUSE_THRESHOLD", "2.0")); screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen); diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h b/src/gallium/drivers/nvfx/nvfx_screen.h index 4dedbe9cb40..473a1127752 100644 --- a/src/gallium/drivers/nvfx/nvfx_screen.h +++ b/src/gallium/drivers/nvfx/nvfx_screen.h @@ -16,6 +16,7 @@ struct nvfx_screen { unsigned is_nv4x; /* either 0 or ~0 */ boolean force_swtnl; + boolean trace_draw; unsigned vertex_buffer_reloc_flags; unsigned index_buffer_reloc_flags; @@ -33,6 +34,18 @@ struct nvfx_screen { struct nouveau_resource *vp_data_heap; struct nv04_2d_context* eng2d; + + /* Once the amount of bytes drawn from the buffer reaches the updated size times this value, + * we will assume that the buffer will be drawn an huge number of times before the + * next modification + */ + float static_reuse_threshold; + + /* Cost of allocating a buffer in terms of the cost of copying a byte to an hardware buffer */ + unsigned buffer_allocation_cost; + + /* inline_cost/hardware_cost conversion ration */ + float inline_cost_per_hardware_cost; }; static INLINE struct nvfx_screen * diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c index d459f9a8801..25d29720a85 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.c +++ b/src/gallium/drivers/nvfx/nvfx_state.c @@ -441,83 +441,6 @@ nvfx_set_viewport_state(struct pipe_context *pipe, nvfx->draw_dirty |= NVFX_NEW_VIEWPORT; } -static void 
-nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count, - const struct pipe_vertex_buffer *vb) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - for(unsigned i = 0; i < count; ++i) - { - pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer); - nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset; - nvfx->vtxbuf[i].max_index = vb[i].max_index; - nvfx->vtxbuf[i].stride = vb[i].stride; - } - - for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i) - pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0); - - nvfx->vtxbuf_nr = count; - - nvfx->dirty |= NVFX_NEW_ARRAYS; - nvfx->draw_dirty |= NVFX_NEW_ARRAYS; -} - -static void -nvfx_set_index_buffer(struct pipe_context *pipe, - const struct pipe_index_buffer *ib) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - /* TODO make this more like a state */ - - if(ib) - { - pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer); - nvfx->idxbuf.index_size = ib->index_size; - nvfx->idxbuf.offset = ib->offset; - } - else - { - pipe_resource_reference(&nvfx->idxbuf.buffer, 0); - nvfx->idxbuf.index_size = 0; - nvfx->idxbuf.offset = 0; - } -} - -static void * -nvfx_vtxelts_state_create(struct pipe_context *pipe, - unsigned num_elements, - const struct pipe_vertex_element *elements) -{ - struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state); - - assert(num_elements < 16); /* not doing fallbacks yet */ - cso->num_elements = num_elements; - memcpy(cso->pipe, elements, num_elements * sizeof(*elements)); - -/* nvfx_vtxelt_construct(cso);*/ - - return (void *)cso; -} - -static void -nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso) -{ - FREE(hwcso); -} - -static void -nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - nvfx->vtxelt = hwcso; - nvfx->dirty |= NVFX_NEW_ARRAYS; - /*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/ -} - void nvfx_init_state_functions(struct nvfx_context *nvfx) { @@ -553,11 +476,4 @@ nvfx_init_state_functions(struct nvfx_context *nvfx) nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple; nvfx->pipe.set_scissor_state = nvfx_set_scissor_state; nvfx->pipe.set_viewport_state = nvfx_set_viewport_state; - - nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create; - nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete; - nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind; - - nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers; - nvfx->pipe.set_index_buffer = nvfx_set_index_buffer; } diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c index dc70f3de870..b9d18977919 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_emit.c +++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c @@ -8,6 +8,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; unsigned dirty; + unsigned still_dirty = 0; int all_swizzled = -1; boolean flush_tex_cache = FALSE; @@ -52,11 +53,19 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) return FALSE; } - if(dirty & (NVFX_NEW_ARRAYS)) + if(dirty & NVFX_NEW_ARRAYS) { if(!nvfx_vbo_validate(nvfx)) return FALSE; } + + if(dirty & NVFX_NEW_INDEX) + { + if(nvfx->use_index_buffer) + nvfx_idxbuf_validate(nvfx); + else + still_dirty = NVFX_NEW_INDEX; + } } else { @@ -64,7 +73,7 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP)) nvfx_vertprog_validate(nvfx); - if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG)) + 
if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_INDEX | NVFX_NEW_FRAGPROG)) nvfx_vtxfmt_validate(nvfx); } @@ -118,7 +127,24 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) OUT_RING(chan, 1); } } - nvfx->dirty = 0; + + nvfx->dirty = dirty & still_dirty; + + unsigned render_temps = nvfx->state.render_temps; + if(render_temps) + { + for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i) + { + if(render_temps & (1 << i)) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]), + (struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]); + } + + if(render_temps & 0x80) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf), + (struct util_dirty_surface*)nvfx->framebuffer.zsbuf); + } + return TRUE; } @@ -137,21 +163,6 @@ nvfx_state_emit(struct nvfx_context *nvfx) ; MARK_RING(chan, max_relocs * 2, max_relocs * 2); nvfx_state_relocate(nvfx); - - unsigned render_temps = nvfx->state.render_temps; - if(render_temps) - { - for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i) - { - if(render_temps & (1 << i)) - util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]), - (struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]); - } - - if(render_temps & 0x80) - util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf), - (struct util_dirty_surface*)nvfx->framebuffer.zsbuf); - } } void @@ -161,7 +172,11 @@ nvfx_state_relocate(struct nvfx_context *nvfx) nvfx_fragtex_relocate(nvfx); nvfx_fragprog_relocate(nvfx); if (nvfx->render_mode == HW) + { nvfx_vbo_relocate(nvfx); + if(nvfx->use_index_buffer) + nvfx_idxbuf_relocate(nvfx); + } } boolean diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c index 80b0f21575f..28bbd36c2e8 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_fb.c +++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c @@ -1,6 +1,5 @@ #include "nvfx_context.h" #include "nvfx_resource.h" -#include "nouveau/nouveau_util.h" #include "util/u_format.h" static inline boolean @@ -125,8 +124,8 @@ nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result) assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | - (log2i(fb->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | - (log2i(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); + (util_logbase2(fb->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | + (util_logbase2(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); } else rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c index 7efdd954b4b..135978ad274 100644 --- a/src/gallium/drivers/nvfx/nvfx_surface.c +++ b/src/gallium/drivers/nvfx/nvfx_surface.c @@ -36,7 +36,6 @@ #include "util/u_blitter.h" #include "nouveau/nouveau_winsys.h" -#include "nouveau/nouveau_util.h" #include "nouveau/nouveau_screen.h" #include "nvfx_context.h" #include "nvfx_screen.h" @@ -62,7 +61,7 @@ nvfx_region_set_format(struct nv04_region* rgn, enum pipe_format format) break; default: assert(util_is_pot(bits)); - int shift = log2i(bits) - 3; + int shift = util_logbase2(bits) - 3; assert(shift >= 2); rgn->bpps = 2; shift -= 2; @@ -365,25 +364,29 @@ nvfx_surface_copy_temp(struct pipe_context* pipe, struct pipe_surface* surf, int { struct nvfx_surface* ns = (struct nvfx_surface*)surf; struct pipe_subresource tempsr, surfsr; - struct pipe_resource *idxbuf_buffer; - unsigned idxbuf_format; + struct nvfx_context* 
nvfx = nvfx_context(pipe); + + // TODO: we really should do this validation before setting these variable in draw calls + unsigned use_vertex_buffers = nvfx->use_vertex_buffers; + boolean use_index_buffer = nvfx->use_index_buffer; + unsigned base_vertex = nvfx->base_vertex; tempsr.face = 0; tempsr.level = 0; surfsr.face = surf->face; surfsr.level = surf->level; - // TODO: do this properly, in blitter save - idxbuf_buffer = ((struct nvfx_context*)pipe)->idxbuf_buffer; - idxbuf_format = ((struct nvfx_context*)pipe)->idxbuf_format; - if(to_temp) nvfx_resource_copy_region(pipe, &ns->temp->base.base, tempsr, 0, 0, 0, surf->texture, surfsr, 0, 0, surf->zslice, surf->width, surf->height); else nvfx_resource_copy_region(pipe, surf->texture, surfsr, 0, 0, surf->zslice, &ns->temp->base.base, tempsr, 0, 0, 0, surf->width, surf->height); - ((struct nvfx_context*)pipe)->idxbuf_buffer = idxbuf_buffer; - ((struct nvfx_context*)pipe)->idxbuf_format = idxbuf_format; + nvfx->use_vertex_buffers = use_vertex_buffers; + nvfx->use_index_buffer = use_index_buffer; + nvfx->base_vertex = base_vertex; + + nvfx->dirty |= NVFX_NEW_ARRAYS; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; } void diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c index e9c3dd7e551..ca4462ef9dc 100644 --- a/src/gallium/drivers/nvfx/nvfx_transfer.c +++ b/src/gallium/drivers/nvfx/nvfx_transfer.c @@ -26,25 +26,44 @@ nvfx_transfer_new(struct pipe_context *pipe, unsigned usage, const struct pipe_box *box) { - struct nvfx_staging_transfer* tx; - bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR; - - tx = CALLOC_STRUCT(nvfx_staging_transfer); - if(!tx) - return NULL; - - util_staging_transfer_init(pipe, pt, sr, usage, box, direct, tx); + if((usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_DONTBLOCK)) == PIPE_TRANSFER_DONTBLOCK) + { + struct nouveau_bo* bo = ((struct nvfx_resource*)pt)->bo; + if(bo && nouveau_bo_busy(bo, NOUVEAU_BO_WR)) + return NULL; + } if(pt->target == PIPE_BUFFER) { - tx->base.base.slice_stride = tx->base.base.stride = ((struct nvfx_resource*)tx->base.staging_resource)->bo->size; - if(direct) - tx->offset = util_format_get_stride(pt->format, box->x); - else - tx->offset = 0; + // it would be nice if we could avoid all this ridiculous overhead... 
+ struct pipe_transfer* tx; + struct nvfx_buffer* buffer = nvfx_buffer(pt); + + tx = CALLOC_STRUCT(pipe_transfer); + if (!tx) + return NULL; + + pipe_resource_reference(&tx->resource, pt); + tx->sr = sr; + tx->usage = usage; + tx->box = *box; + + tx->slice_stride = tx->stride = util_format_get_stride(pt->format, box->width); + tx->data = buffer->data + util_format_get_stride(pt->format, box->x); + + return tx; } else { + struct nvfx_staging_transfer* tx; + bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR; + + tx = CALLOC_STRUCT(nvfx_staging_transfer); + if(!tx) + return NULL; + + util_staging_transfer_init(pipe, pt, sr, usage, box, direct, &tx->base); + if(direct) { tx->base.base.stride = nvfx_subresource_pitch(pt, sr.level); @@ -66,26 +85,132 @@ nvfx_transfer_new(struct pipe_context *pipe, } } +static void nvfx_buffer_dirty_interval(struct nvfx_buffer* buffer, unsigned begin, unsigned size, boolean unsynchronized) +{ + struct nvfx_screen* screen = nvfx_screen(buffer->base.base.screen); + buffer->last_update_static = buffer->bytes_to_draw_until_static < 0; + if(buffer->dirty_begin == buffer->dirty_end) + { + buffer->dirty_begin = begin; + buffer->dirty_end = begin + size; + buffer->dirty_unsynchronized = unsynchronized; + } + else + { + buffer->dirty_begin = MIN2(buffer->dirty_begin, begin); + buffer->dirty_end = MAX2(buffer->dirty_end, begin + size); + buffer->dirty_unsynchronized &= unsynchronized; + } + + if(unsynchronized) + { + // TODO: revisit this, it doesn't seem quite right + //printf("UNSYNC UPDATE %p %u %u\n", buffer, begin, size); + buffer->bytes_to_draw_until_static += size * screen->static_reuse_threshold; + } + else + buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; +} + +static void nvfx_transfer_flush_region( struct pipe_context *pipe, + struct pipe_transfer *ptx, + const struct pipe_box *box) +{ + if(ptx->resource->target == PIPE_BUFFER && (ptx->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + { + struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource); + nvfx_buffer_dirty_interval(buffer, + (uint8_t*)ptx->data - buffer->data + util_format_get_stride(buffer->base.base.format, box->x), + util_format_get_stride(buffer->base.base.format, box->width), + !!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); + } +} + +static void +nvfx_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx) +{ + if(ptx->resource->target == PIPE_BUFFER) + { + struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource); + if((ptx->usage & (PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT)) == PIPE_TRANSFER_WRITE) + nvfx_buffer_dirty_interval(buffer, + (uint8_t*)ptx->data - buffer->data, + ptx->stride, + !!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); + pipe_resource_reference(&ptx->resource, 0); + FREE(ptx); + } + else + util_staging_transfer_destroy(pipe, ptx); +} + void * nvfx_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx) { - struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; - if(!ptx->data) + if(ptx->resource->target == PIPE_BUFFER) + return ptx->data; + else { - struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; - uint8_t *map = nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage)); - ptx->data = map + tx->offset; + struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; + if(!ptx->data) + { + struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + uint8_t *map = 
nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage)); + ptx->data = map + tx->offset; + } + + ++tx->map_count; + return ptx->data; } - ++tx->map_count; - return ptx->data; } void nvfx_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx) { - struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; - struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + if(ptx->resource->target != PIPE_BUFFER) + { + struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; + struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + + if(!--tx->map_count) + { + nouveau_screen_bo_unmap(pipe->screen, mt->base.bo); + ptx->data = 0; + } + } +} + +static void nvfx_transfer_inline_write( struct pipe_context *pipe, + struct pipe_resource *pr, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned slice_stride) +{ + if(pr->target != PIPE_BUFFER) + { + u_default_transfer_inline_write(pipe, pr, sr, usage, box, data, stride, slice_stride); + } + else + { + struct nvfx_buffer* buffer = nvfx_buffer(pr); + unsigned begin = util_format_get_stride(pr->format, box->x); + unsigned size = util_format_get_stride(pr->format, box->width); + memcpy(buffer->data + begin, data, size); + nvfx_buffer_dirty_interval(buffer, begin, size, + !!(pr->flags & PIPE_TRANSFER_UNSYNCHRONIZED)); + } +} - if(!--tx->map_count) - nouveau_screen_bo_unmap(pipe->screen, mt->base.bo); +void +nvfx_init_transfer_functions(struct pipe_context *pipe) +{ + pipe->get_transfer = nvfx_transfer_new; + pipe->transfer_map = nvfx_transfer_map; + pipe->transfer_flush_region = nvfx_transfer_flush_region; + pipe->transfer_unmap = nvfx_transfer_unmap; + pipe->transfer_destroy = nvfx_transfer_destroy; + pipe->transfer_inline_write = nvfx_transfer_inline_write; } diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c index 4aa37938425..a6cd1256350 100644 --- a/src/gallium/drivers/nvfx/nvfx_vbo.c +++ b/src/gallium/drivers/nvfx/nvfx_vbo.c @@ -2,6 +2,7 @@ #include "pipe/p_state.h" #include "util/u_inlines.h" #include "util/u_format.h" +#include "translate/translate.h" #include "nvfx_context.h" #include "nvfx_state.h" @@ -10,646 +11,583 @@ #include "nouveau/nouveau_channel.h" #include "nouveau/nouveau_class.h" #include "nouveau/nouveau_pushbuf.h" -#include "nouveau/nouveau_util.h" -static INLINE int -nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +static inline unsigned +util_guess_unique_indices_count(unsigned mode, unsigned indices) { - switch (pipe) { - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_R32G32B32_FLOAT: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - *fmt = NV34TCL_VTXFMT_TYPE_FLOAT; - break; - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R16G16B16_FLOAT: - case PIPE_FORMAT_R16G16B16A16_FLOAT: - *fmt = NV34TCL_VTXFMT_TYPE_HALF; - break; - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8A8_UNORM: - *fmt = NV34TCL_VTXFMT_TYPE_UBYTE; - break; - case PIPE_FORMAT_R16_SSCALED: - case PIPE_FORMAT_R16G16_SSCALED: - case PIPE_FORMAT_R16G16B16_SSCALED: - case PIPE_FORMAT_R16G16B16A16_SSCALED: - *fmt = NV34TCL_VTXFMT_TYPE_USHORT; - break; - default: - NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe)); - return 1; + /* Euler's formula gives V = + * = E - F + 2 = + * = F * 
(polygon_edges / 2 - 1) + 2 = + * = F * (polygon_edges - 2) / 2 + 2 = + * = indices * (polygon_edges - 2) / (2 * indices_per_face) + 2 + * = indices * (1 / 2 - 1 / polygon_edges) + 2 + */ + switch(mode) + { + case PIPE_PRIM_LINES: + return indices >> 1; + case PIPE_PRIM_TRIANGLES: + { + // avoid an expensive division by 3 using the multiplicative inverse mod 2^32 + unsigned q; + unsigned inv3 = 2863311531; + indices >>= 1; + q = indices * inv3; + if(unlikely(q >= indices)) + { + q += inv3; + if(q >= indices) + q += inv3; + } + return indices + 2; + //return indices / 6 + 2; } - - switch (pipe) { - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16_SSCALED: - *ncomp = 1; - break; - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R16G16_SSCALED: - *ncomp = 2; - break; - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R32G32B32_FLOAT: - case PIPE_FORMAT_R16G16B16_FLOAT: - case PIPE_FORMAT_R16G16B16_SSCALED: - *ncomp = 3; - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16B16A16_FLOAT: - case PIPE_FORMAT_R16G16B16A16_SSCALED: - *ncomp = 4; - break; + // guess that indexed quads are created by successive connections, since a closed mesh seems unlikely + case PIPE_PRIM_QUADS: + return (indices >> 1) + 2; + // return (indices >> 2) + 2; // if it is a closed mesh default: - NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe)); - return 1; + return indices; } - - return 0; } -static boolean -nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib, - unsigned ib_size) +static unsigned nvfx_decide_upload_mode(struct pipe_context *pipe, const struct pipe_draw_info *info) { - unsigned type; - - if (!ib) { - nvfx->idxbuf_buffer = NULL; - nvfx->idxbuf_format = 0xdeadbeef; - return FALSE; + struct nvfx_context* nvfx = nvfx_context(pipe); + unsigned hardware_cost = 0; + unsigned inline_cost = 0; + unsigned unique_vertices; + unsigned upload_mode; + if (info->indexed) + unique_vertices = util_guess_unique_indices_count(info->mode, info->count); + else + unique_vertices = info->count; + + /* Here we try to figure out if we are better off writing vertex data directly on the FIFO, + * or create hardware buffer objects and pointing the hardware to them. + * + * This is done by computing the total memcpy cost of each option, ignoring uploads + * if we think that the buffer is static and thus the upload cost will be amortized over + * future draw calls. + * + * For instance, if everything looks static, we will always create buffer objects, while if + * everything is a user buffer and we are not doing indexed drawing, we never do. + * + * Other interesting cases are where a small user vertex buffer, but a huge user index buffer, + * where we will upload the vertex buffer, so that we can use hardware index lookup, and + * the opposite case, where we instead do index lookup in software to avoid uploading + * a huge amount of vertex data that is not going to be used. + * + * Otherwise, we generally move to the GPU the after it has been pushed + * NVFX_STATIC_BUFFER_MIN_REUSE_TIMES times to the GPU without having + * been updated with a transfer (or just the buffer having been destroyed). + * + * There is no special handling for user buffers, since applications can use + * OpenGL VBOs in a one-shot fashion. OpenGL 3/4 core profile forces this + * by the way. 
+ * + * Note that currently we don't support only putting some data on the FIFO, and + * some on vertex buffers (constant and instanced data is independent from this). + * + * nVidia doesn't seem to do this either, even though it should be at least + * doable with VTX_ATTR and possibly with VERTEX_DATA too if not indexed. + */ + + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + buffer->bytes_to_draw_until_static -= vbi->per_vertex_size * unique_vertices; + if (!nvfx_buffer_seems_static(buffer)) + { + hardware_cost += buffer->dirty_end - buffer->dirty_begin; + if (!buffer->base.bo) + hardware_cost += nvfx->screen->buffer_allocation_cost; + } + inline_cost += vbi->per_vertex_size * info->count; } - if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1) - return FALSE; + float best_index_cost_for_hardware_vertices_as_inline_cost = 0.0f; + boolean prefer_hardware_indices = FALSE; + unsigned index_inline_cost = 0; + unsigned index_hardware_cost = 0; - switch (ib_size) { - case 2: - type = NV34TCL_IDXBUF_FORMAT_TYPE_U16; - break; - case 4: - type = NV34TCL_IDXBUF_FORMAT_TYPE_U32; - break; - default: - return FALSE; - } + if (info->indexed) + { + index_inline_cost = nvfx->idxbuf.index_size * info->count; + if (nvfx->screen->index_buffer_reloc_flags + && (nvfx->idxbuf.index_size == 2 || nvfx->idxbuf.index_size == 4) + && !(nvfx->idxbuf.offset & (nvfx->idxbuf.index_size - 1))) + { + struct nvfx_buffer* buffer = nvfx_buffer(nvfx->idxbuf.buffer); + buffer->bytes_to_draw_until_static -= index_inline_cost; - if (ib != nvfx->idxbuf_buffer || - type != nvfx->idxbuf_format) { - nvfx->dirty |= NVFX_NEW_ARRAYS; - nvfx->idxbuf_buffer = ib; - nvfx->idxbuf_format = type; - } + prefer_hardware_indices = TRUE; - return TRUE; -} + if (!nvfx_buffer_seems_static(buffer)) + { + index_hardware_cost = buffer->dirty_end - buffer->dirty_begin; + if (!buffer->base.bo) + index_hardware_cost += nvfx->screen->buffer_allocation_cost; + } -// type must be floating point -static inline void -nvfx_vbo_static_attrib(struct nvfx_context *nvfx, - int attrib, struct pipe_vertex_element *ve, - struct pipe_vertex_buffer *vb, unsigned ncomp) -{ - struct pipe_transfer *transfer; - struct nouveau_channel* chan = nvfx->screen->base.channel; - void *map; - float *v; - - map = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer); - map = (uint8_t *) map + vb->buffer_offset + ve->src_offset; - - v = map; - - switch (ncomp) { - case 4: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - OUT_RING(chan, fui(v[2])); - OUT_RING(chan, fui(v[3])); - break; - case 3: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - OUT_RING(chan, fui(v[2])); - break; - case 2: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - break; - case 1: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1)); - OUT_RING(chan, fui(v[0])); - break; + if ((float) index_inline_cost < (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost) + { + best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_inline_cost; + } + else + { + 
best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost; + prefer_hardware_indices = TRUE; + } + } } - pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer); + /* let's finally figure out which of the 3 paths we want to take */ + if ((float) (inline_cost + index_inline_cost) > ((float) hardware_cost * nvfx->screen->inline_cost_per_hardware_cost + best_index_cost_for_hardware_vertices_as_inline_cost)) + upload_mode = 1 + prefer_hardware_indices; + else + upload_mode = 0; + +#ifdef DEBUG + if (unlikely(nvfx->screen->trace_draw)) + { + fprintf(stderr, "DRAW"); + if (info->indexed) + { + fprintf(stderr, "_IDX%u", nvfx->idxbuf.index_size); + if (info->index_bias) + fprintf(stderr, " biased %u", info->index_bias); + fprintf(stderr, " idxrange %u -> %u", info->min_index, info->max_index); + } + if (info->instance_count > 1) + fprintf(stderr, " %u instances from %u", info->instance_count, info->indexed); + fprintf(stderr, " start %u count %u prim %u", info->start, info->count, info->mode); + if (!upload_mode) + fprintf(stderr, " -> inline vertex data"); + else if (upload_mode == 2 || !info->indexed) + fprintf(stderr, " -> buffer range"); + else + fprintf(stderr, " -> inline indices"); + fprintf(stderr, " [ivtx %u hvtx %u iidx %u hidx %u bidx %f] <", inline_cost, hardware_cost, index_inline_cost, index_hardware_cost, best_index_cost_for_hardware_vertices_as_inline_cost); + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + if (i) + fprintf(stderr, ", "); + fprintf(stderr, "%p%s left %Li", buffer, buffer->last_update_static ? 
" static" : "", buffer->bytes_to_draw_until_static); + } + fprintf(stderr, ">\n"); + } +#endif + + return upload_mode; } -static void -nvfx_draw_arrays(struct pipe_context *pipe, - unsigned mode, unsigned start, unsigned count) +void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - unsigned restart = 0; - - nvfx_vbo_set_idxbuf(nvfx, NULL, 0); - if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { - nvfx_draw_elements_swtnl(pipe, NULL, 0, 0, - mode, start, count); - return; - } + unsigned upload_mode = 0; - while (count) { - unsigned vc, nr, avail; + if (!nvfx->vtxelt->needs_translate) + upload_mode = nvfx_decide_upload_mode(pipe, info); - nvfx_state_emit(nvfx); + nvfx->use_index_buffer = upload_mode > 1; - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ + if ((upload_mode > 0) != nvfx->use_vertex_buffers) + { + nvfx->use_vertex_buffers = (upload_mode > 0); + nvfx->dirty |= NVFX_NEW_ARRAYS; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; + } - vc = nouveau_vbuf_split(avail, 6, 256, - mode, start, count, &restart); - if (!vc) { - FIRE_RING(chan); - continue; + if (upload_mode > 0) + { + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + nvfx_buffer_upload(nvfx_buffer(vb->buffer)); } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + if (upload_mode > 1) + { + nvfx_buffer_upload(nvfx_buffer(nvfx->idxbuf.buffer)); - nr = (vc & 0xff); - if (nr) { - OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1)); - OUT_RING (chan, ((nr - 1) << 24) | start); - start += nr; + if (unlikely(info->index_bias != nvfx->base_vertex)) + { + nvfx->base_vertex = info->index_bias; + nvfx->dirty |= NVFX_NEW_ARRAYS; + } } - - nr = vc >> 8; - while (nr) { - unsigned push = nr > 2047 ? 
2047 : nr; - - nr -= push; - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push)); - while (push--) { - OUT_RING(chan, ((0x100 - 1) << 24) | start); - start += 0x100; + else + { + if (unlikely(info->start < nvfx->base_vertex && nvfx->base_vertex)) + { + nvfx->base_vertex = 0; + nvfx->dirty |= NVFX_NEW_ARRAYS; } } - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); - - count -= vc; - start = restart; } - pipe->flush(pipe, 0, NULL); + if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) + nvfx_draw_vbo_swtnl(pipe, info); + else + nvfx_push_vbo(pipe, info); } -static INLINE void -nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) +boolean +nvfx_vbo_validate(struct nvfx_context *nvfx) { - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; + struct nouveau_channel* chan = nvfx->screen->base.channel; + int i; + int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); + unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD; - while (count) { - uint8_t *elts = (uint8_t *)ib + start; - unsigned vc, push, restart = 0, avail; + if (!elements) + return TRUE; - nvfx_state_emit(nvfx); + MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2); + for(unsigned i = 0; i < nvfx->vtxelt->num_constant; ++i) + { + struct nvfx_low_frequency_element *ve = &nvfx->vtxelt->constant[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + float v[4]; + ve->fetch_rgba_float(v, buffer->data + vb->buffer_offset + ve->src_offset, 0, 0); + nvfx_emit_vtx_attr(chan, ve->idx, v, ve->ncomp); + } - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - vc = nouveau_vbuf_split(avail, 6, 2, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; + OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements)); + if(nvfx->use_vertex_buffers) + { + unsigned idx = 0; + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + if(idx != ve->idx) + { + assert(idx < ve->idx); + OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], ve->idx - idx); + idx = ve->idx; + } - if (vc & 1) { - OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); - OUT_RING (chan, elts[0]); - elts++; vc--; + OUT_RING(chan, nvfx->vtxelt->vtxfmt[idx] | (vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT)); + ++idx; } + if(idx != nvfx->vtxelt->num_elements) + OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], nvfx->vtxelt->num_elements - idx); + } + else + OUT_RINGp(chan, nvfx->vtxelt->vtxfmt, nvfx->vtxelt->num_elements); - while (vc) { - unsigned i; - - push = MIN2(vc, 2047 * 2); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); - for (i = 0; i < push; i+=2) - OUT_RING(chan, (elts[i+1] << 16) | elts[i]); + for(i = nvfx->vtxelt->num_elements; i < elements; ++i) + OUT_RING(chan, NV34TCL_VTXFMT_TYPE_32_FLOAT); - vc -= push; - elts += push; + if(nvfx->is_nv4x) { + unsigned i; + /* seems to be some kind of cache flushing */ + for(i = 0; i < 3; ++i) { + OUT_RING(chan, RING_3D(0x1718, 1)); + OUT_RING(chan, 0); } - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - 
OUT_RING (chan, 0); - - start = restart; } -} - -static INLINE void -nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) -{ - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - - while (count) { - uint16_t *elts = (uint16_t *)ib + start; - unsigned vc, push, restart = 0, avail; - nvfx_state_emit(nvfx); - - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - - vc = nouveau_vbuf_split(avail, 6, 2, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; + OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements)); + if(nvfx->use_vertex_buffers) + { + unsigned idx = 0; + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + for(; idx < ve->idx; ++idx) + OUT_RING(chan, 0); - if (vc & 1) { - OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); - OUT_RING (chan, elts[0]); - elts++; vc--; + OUT_RELOC(chan, bo, + vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); + ++idx; } - while (vc) { - unsigned i; - - push = MIN2(vc, 2047 * 2); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); - for (i = 0; i < push; i+=2) - OUT_RING(chan, (elts[i+1] << 16) | elts[i]); - - vc -= push; - elts += push; - } + for(; idx < elements; ++idx) + OUT_RING(chan, 0); + } + else + { + for (i = 0; i < elements; i++) + OUT_RING(chan, 0); + } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + OUT_RING(chan, RING_3D(0x1710, 1)); + OUT_RING(chan, 0); - start = restart; - } + nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; + return TRUE; } -static INLINE void -nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) +void +nvfx_vbo_relocate(struct nvfx_context *nvfx) { - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - - while (count) { - uint32_t *elts = (uint32_t *)ib + start; - unsigned vc, push, restart = 0, avail; - - nvfx_state_emit(nvfx); - - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - - vc = nouveau_vbuf_split(avail, 5, 1, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); - - while (vc) { - push = MIN2(vc, 2047); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push)); - OUT_RINGp (chan, elts, push); - - vc -= push; - elts += push; - } + if(!nvfx->use_vertex_buffers) + return; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + struct nouveau_channel* chan = nvfx->screen->base.channel; + unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; + int i; - start = restart; + MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3); + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = 
&nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; + + OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(ve->idx), 1), + vb_flags, 0, 0); + OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); } } static void -nvfx_draw_elements_inline(struct pipe_context *pipe, - struct pipe_resource *ib, - unsigned ib_size, int ib_bias, - unsigned mode, unsigned start, unsigned count) +nvfx_idxbuf_emit(struct nvfx_context* nvfx, unsigned ib_flags) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct pipe_transfer *transfer; - void *map; - - map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer); - if (!ib) { - NOUVEAU_ERR("failed mapping ib\n"); - return; - } + struct nouveau_channel* chan = nvfx->screen->base.channel; + unsigned ib_format = (nvfx->idxbuf.index_size == 2) ? NV34TCL_IDXBUF_FORMAT_TYPE_U16 : NV34TCL_IDXBUF_FORMAT_TYPE_U32; + struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf.buffer)->bo; + ib_flags |= nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD; - assert(ib_bias == 0); - - switch (ib_size) { - case 1: - nvfx_draw_elements_u08(nvfx, map, mode, start, count); - break; - case 2: - nvfx_draw_elements_u16(nvfx, map, mode, start, count); - break; - case 4: - nvfx_draw_elements_u32(nvfx, map, mode, start, count); - break; - default: - NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); - break; - } + assert(nvfx->screen->index_buffer_reloc_flags); - pipe_buffer_unmap(pipe, ib, transfer); + MARK_RING(chan, 3, 3); + if(ib_flags & NOUVEAU_BO_DUMMY) + OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), ib_flags, 0, 0); + else + OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2)); + OUT_RELOC(chan, bo, nvfx->idxbuf.offset + 1, ib_flags | NOUVEAU_BO_LOW, 0, 0); + OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR, + 0, NV34TCL_IDXBUF_FORMAT_DMA1); } -static void -nvfx_draw_elements_vbo(struct pipe_context *pipe, - unsigned mode, unsigned start, unsigned count) +void +nvfx_idxbuf_validate(struct nvfx_context* nvfx) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - unsigned restart = 0; - - while (count) { - unsigned nr, vc, avail; - - nvfx_state_emit(nvfx); + nvfx_idxbuf_emit(nvfx, 0); +} - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ +void +nvfx_idxbuf_relocate(struct nvfx_context* nvfx) +{ + nvfx_idxbuf_emit(nvfx, NOUVEAU_BO_DUMMY); +} - vc = nouveau_vbuf_split(avail, 6, 256, - mode, start, count, &restart); - if (!vc) { - FIRE_RING(chan); - continue; - } +unsigned nvfx_vertex_formats[PIPE_FORMAT_COUNT] = +{ + [PIPE_FORMAT_R32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32B32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16B16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8_UNORM] = 
NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8A8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8A8_USCALED] = NV34TCL_VTXFMT_TYPE_8_USCALED, + [PIPE_FORMAT_R16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16B16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16B16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, +}; + +static void * +nvfx_vtxelts_state_create(struct pipe_context *pipe, + unsigned num_elements, + const struct pipe_vertex_element *elements) +{ + struct nvfx_context* nvfx = nvfx_context(pipe); + struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state); + struct translate_key transkey; + unsigned per_vertex_size[16]; + memset(per_vertex_size, 0, sizeof(per_vertex_size)); + + unsigned vb_compacted_index[16]; + + assert(num_elements < 16); /* not doing fallbacks yet */ + + memcpy(cso->pipe, elements, num_elements * sizeof(elements[0])); + cso->num_elements = num_elements; + cso->needs_translate = FALSE; + + transkey.nr_elements = 0; + transkey.output_stride = 0; + + for(unsigned i = 0; i < num_elements; ++i) + { + const struct pipe_vertex_element* ve = &elements[i]; + if(!ve->instance_divisor) + per_vertex_size[ve->vertex_buffer_index] += util_format_get_stride(ve->src_format, 1); + } + + for(unsigned i = 0; i < 16; ++i) + { + if(per_vertex_size[i]) + { + unsigned idx = cso->num_per_vertex_buffer_infos++; + cso->per_vertex_buffer_info[idx].vertex_buffer_index = i; + cso->per_vertex_buffer_info[idx].per_vertex_size = per_vertex_size[i]; + vb_compacted_index[i] = idx; + } + } + + for(unsigned i = 0; i < num_elements; ++i) + { + const struct pipe_vertex_element* ve = &elements[i]; + unsigned type = nvfx_vertex_formats[ve->src_format]; + unsigned ncomp = util_format_get_nr_components(ve->src_format); - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + //if(ve->frequency != PIPE_ELEMENT_FREQUENCY_PER_VERTEX) + if(ve->instance_divisor) + { + struct nvfx_low_frequency_element* lfve; + cso->vtxfmt[i] = NV34TCL_VTXFMT_TYPE_32_FLOAT; + + //if(ve->frequency == PIPE_ELEMENT_FREQUENCY_CONSTANT) + if(0) + lfve = &cso->constant[cso->num_constant++]; + else + { + lfve = &cso->per_instance[cso->num_per_instance++].base; + ((struct nvfx_per_instance_element*)lfve)->instance_divisor = ve->instance_divisor; + } - nr = (vc & 0xff); - if (nr) { - OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1)); - OUT_RING (chan, ((nr - 1) << 24) | start); - start += nr; + lfve->idx = i; + lfve->vertex_buffer_index = ve->vertex_buffer_index; + lfve->src_offset = ve->src_offset; + lfve->fetch_rgba_float = util_format_description(ve->src_format)->fetch_rgba_float; + lfve->ncomp = ncomp; } - - nr = vc >> 8; - while (nr) { - unsigned push = nr > 2047 ? 
2047 : nr; - - nr -= push; - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push)); - while (push--) { - OUT_RING(chan, ((0x100 - 1) << 24) | start); - start += 0x100; + else + { + unsigned idx; + + idx = cso->num_per_vertex++; + cso->per_vertex[idx].idx = i; + cso->per_vertex[idx].vertex_buffer_index = ve->vertex_buffer_index; + cso->per_vertex[idx].src_offset = ve->src_offset; + + idx = transkey.nr_elements++; + transkey.element[idx].input_format = ve->src_format; + transkey.element[idx].input_buffer = vb_compacted_index[ve->vertex_buffer_index]; + transkey.element[idx].input_offset = ve->src_offset; + transkey.element[idx].instance_divisor = 0; + transkey.element[idx].type = TRANSLATE_ELEMENT_NORMAL; + if(type) + { + transkey.element[idx].output_format = ve->src_format; + cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type; + } + else + { + unsigned float32[4] = {PIPE_FORMAT_R32_FLOAT, PIPE_FORMAT_R32G32_FLOAT, PIPE_FORMAT_R32G32B32_FLOAT, PIPE_FORMAT_R32G32B32A32_FLOAT}; + transkey.element[idx].output_format = float32[ncomp - 1]; + cso->needs_translate = TRUE; + cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | NV34TCL_VTXFMT_TYPE_32_FLOAT; } + transkey.element[idx].output_offset = transkey.output_stride; + transkey.output_stride += (util_format_get_stride(transkey.element[idx].output_format, 1) + 3) & ~3; } + } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + cso->translate = translate_generic_create(&transkey); + cso->vertex_length = transkey.output_stride >> 2; + cso->max_vertices_per_packet = 2047 / cso->vertex_length; - count -= vc; - start = restart; - } + return (void *)cso; } static void -nvfx_draw_elements(struct pipe_context *pipe, - struct pipe_resource *indexBuffer, - unsigned indexSize, int indexBias, - unsigned mode, unsigned start, unsigned count) +nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso) { - struct nvfx_context *nvfx = nvfx_context(pipe); - boolean idxbuf; - - idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize); - if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { - nvfx_draw_elements_swtnl(pipe, - indexBuffer, indexSize, indexBias, - mode, start, count); - return; - } - - if (idxbuf) { - nvfx_draw_elements_vbo(pipe, mode, start, count); - } else { - nvfx_draw_elements_inline(pipe, - indexBuffer, indexSize, indexBias, - mode, start, count); - } - - pipe->flush(pipe, 0, NULL); + FREE(hwcso); } -void -nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +static void +nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso) { struct nvfx_context *nvfx = nvfx_context(pipe); - if (info->indexed && nvfx->idxbuf.buffer) { - unsigned offset; - - assert(nvfx->idxbuf.offset % nvfx->idxbuf.index_size == 0); - offset = nvfx->idxbuf.offset / nvfx->idxbuf.index_size; - - nvfx_draw_elements(pipe, - nvfx->idxbuf.buffer, - nvfx->idxbuf.index_size, - info->index_bias, - info->mode, - info->start + offset, - info->count); - } - else { - nvfx_draw_arrays(pipe, - info->mode, - info->start, - info->count); - } + nvfx->vtxelt = hwcso; + nvfx->use_vertex_buffers = -1; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; } -boolean -nvfx_vbo_validate(struct nvfx_context *nvfx) +static void +nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) { - struct nouveau_channel* chan = nvfx->screen->base.channel; - struct pipe_resource *ib = nvfx->idxbuf_buffer; - unsigned ib_format = nvfx->idxbuf_format; - int i; - int elements = 
MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); - uint32_t vtxfmt[16]; - unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD; - - if (!elements) - return TRUE; - - nvfx->vbo_bo = 0; - - MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2); - for (i = 0; i < nvfx->vtxelt->num_elements; i++) { - struct pipe_vertex_element *ve; - struct pipe_vertex_buffer *vb; - unsigned type, ncomp; - - ve = &nvfx->vtxelt->pipe[i]; - vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - - if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { - MARK_UNDO(chan); - nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS; - return FALSE; - } + struct nvfx_context *nvfx = nvfx_context(pipe); - if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) { - nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp); - vtxfmt[i] = type; - } else { - vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) | - (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type); - nvfx->vbo_bo |= (1 << i); - } + for(unsigned i = 0; i < count; ++i) + { + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer); + nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset; + nvfx->vtxbuf[i].max_index = vb[i].max_index; + nvfx->vtxbuf[i].stride = vb[i].stride; } - for(; i < elements; ++i) - vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT; - - OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements)); - OUT_RINGp(chan, vtxfmt, elements); - - if(nvfx->is_nv4x) { - unsigned i; - /* seems to be some kind of cache flushing */ - for(i = 0; i < 3; ++i) { - OUT_RING(chan, RING_3D(0x1718, 1)); - OUT_RING(chan, 0); - } - } + for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i) + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0); - OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements)); - for (i = 0; i < nvfx->vtxelt->num_elements; i++) { - struct pipe_vertex_element *ve; - struct pipe_vertex_buffer *vb; + nvfx->vtxbuf_nr = count; + nvfx->use_vertex_buffers = -1; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; +} - ve = &nvfx->vtxelt->pipe[i]; - vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; +static void +nvfx_set_index_buffer(struct pipe_context *pipe, + const struct pipe_index_buffer *ib) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); - if (!(nvfx->vbo_bo & (1 << i))) - OUT_RING(chan, 0); - else - { - struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RELOC(chan, bo, - vb->buffer_offset + ve->src_offset, - vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, - 0, NV34TCL_VTXBUF_ADDRESS_DMA1); - } + if(ib) + { + pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer); + nvfx->idxbuf.index_size = ib->index_size; + nvfx->idxbuf.offset = ib->offset; } - - for (; i < elements; i++) - OUT_RING(chan, 0); - - OUT_RING(chan, RING_3D(0x1710, 1)); - OUT_RING(chan, 0); - - if (ib) { - unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD; - struct nouveau_bo* bo = nvfx_resource(ib)->bo; - - assert(nvfx->screen->index_buffer_reloc_flags); - - OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2)); - OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR, - 0, NV34TCL_IDXBUF_FORMAT_DMA1); + else + { + pipe_resource_reference(&nvfx->idxbuf.buffer, 0); + nvfx->idxbuf.index_size = 0; + nvfx->idxbuf.offset = 0; } - nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; - return TRUE; + nvfx->dirty |= NVFX_NEW_INDEX; + nvfx->draw_dirty |= NVFX_NEW_INDEX; } void -nvfx_vbo_relocate(struct nvfx_context *nvfx) +nvfx_init_vbo_functions(struct nvfx_context *nvfx) { - struct nouveau_channel* chan = nvfx->screen->base.channel; - 
unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - int i; + nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers; + nvfx->pipe.set_index_buffer = nvfx_set_index_buffer; - MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3); - for(i = 0; i < nvfx->vtxelt->num_elements; ++i) { - if(nvfx->vbo_bo & (1 << i)) { - struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i]; - struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1), - vb_flags, 0, 0); - OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset, - vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, - 0, NV34TCL_VTXBUF_ADDRESS_DMA1); - } - } - - if(nvfx->idxbuf_buffer) - { - unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf_buffer)->bo; - - assert(nvfx->screen->index_buffer_reloc_flags); - - OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), - ib_flags, 0, 0); - OUT_RELOC(chan, bo, 0, - ib_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, nvfx->idxbuf_format, - ib_flags | NOUVEAU_BO_OR, - 0, NV34TCL_IDXBUF_FORMAT_DMA1); - } + nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create; + nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete; + nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind; } diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c index 24d9846310e..939d2b83aee 100644 --- a/src/gallium/drivers/nvfx/nvfx_vertprog.c +++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c @@ -10,6 +10,7 @@ #include "nvfx_context.h" #include "nvfx_state.h" +#include "nvfx_resource.h" /* TODO (at least...): * 1. Indexed consts + ARL @@ -874,7 +875,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) struct nouveau_grobj *eng3d = screen->eng3d; struct nvfx_vertex_program *vp; struct pipe_resource *constbuf; - struct pipe_transfer *transfer = NULL; boolean upload_code = FALSE, upload_data = FALSE; int i; @@ -983,11 +983,8 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) if (vp->nr_consts) { float *map = NULL; - if (constbuf) { - map = pipe_buffer_map(pipe, constbuf, - PIPE_TRANSFER_READ, - &transfer); - } + if (constbuf) + map = nvfx_buffer(constbuf)->data; for (i = 0; i < vp->nr_consts; i++) { struct nvfx_vertex_program_data *vpd = &vp->consts[i]; @@ -1005,9 +1002,6 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) OUT_RING (chan, i + vp->data->start); OUT_RINGp (chan, (uint32_t *)vpd->value, 4); } - - if (constbuf) - pipe_buffer_unmap(pipe, constbuf, transfer); } /* Upload vtxprog */ |
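
For reference, the decision made in nvfx_decide_upload_mode() above reduces to comparing the bytes a draw would push inline through the FIFO against the bytes that would have to be copied into hardware buffer objects, with the hardware side weighted by the screen's inline_cost_per_hardware_cost factor and skipped entirely for buffers that already look static. The following is a minimal, self-contained sketch of that comparison together with the Euler's-formula vertex estimate; it is an illustration only, not the driver's actual code path, and the names estimate_unique_vertices, choose_upload_mode, enum prim and the parameter names are hypothetical.

/* Hedged sketch of the cost heuristic described in the comment above.
 * All identifiers here are illustrative; the real driver works on
 * nvfx_buffer dirty ranges and screen tunables instead. */
#include <stdbool.h>

enum prim { PRIM_LINES, PRIM_TRIANGLES, PRIM_QUADS };

/* Euler's-formula guess: V = indices * (1/2 - 1/polygon_edges) + 2,
 * written with plain division instead of the multiplicative-inverse trick. */
static unsigned
estimate_unique_vertices(enum prim mode, unsigned indices)
{
	switch (mode) {
	case PRIM_LINES:     return indices / 2;
	case PRIM_TRIANGLES: return indices / 6 + 2; /* indices * (1/2 - 1/3) + 2 */
	case PRIM_QUADS:     return indices / 2 + 2; /* strip-of-quads guess */
	default:             return indices;
	}
}

/* Returns 0 = push everything inline on the FIFO,
 *         1 = hardware vertex buffers with inline indices,
 *         2 = hardware vertex and index buffers.
 * The *_inline_cost values are bytes written to the FIFO per draw; the
 * *_hw_cost values are the dirty bytes that would have to be uploaded
 * (zero for buffers that look static, since that upload is amortized). */
static unsigned
choose_upload_mode(unsigned vtx_inline_cost, unsigned vtx_hw_cost,
                   unsigned idx_inline_cost, unsigned idx_hw_cost,
                   float inline_cost_per_hw_cost)
{
	/* If the vertices go into hardware buffers, the indices are either
	 * pushed inline or uploaded too, whichever is cheaper. */
	float idx_cost_with_hw_vtx = (float)idx_inline_cost;
	bool hw_indices = false;

	if ((float)idx_hw_cost * inline_cost_per_hw_cost < idx_cost_with_hw_vtx) {
		idx_cost_with_hw_vtx = (float)idx_hw_cost * inline_cost_per_hw_cost;
		hw_indices = true;
	}

	if ((float)(vtx_inline_cost + idx_inline_cost) >
	    (float)vtx_hw_cost * inline_cost_per_hw_cost + idx_cost_with_hw_vtx)
		return hw_indices ? 2u : 1u;
	return 0u;
}

In this simplified form, a vertex buffer whose dirty range is empty (already uploaded and not rewritten since) contributes nothing to the hardware cost, so repeated draws from the same data naturally tip the comparison toward hardware vertex buffers; that is the behaviour the bytes_to_draw_until_static counters and nvfx_buffer_seems_static() checks in the patch are tracking.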