1 files changed, 402 insertions, 0 deletions
diff --git a/src/gallium/drivers/nvfx/nvfx_push.c b/src/gallium/drivers/nvfx/nvfx_push.c
new file mode 100644
index 00000000000..52e891c6678
--- /dev/null
+++ b/src/gallium/drivers/nvfx/nvfx_push.c
@@ -0,0 +1,402 @@
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+#include "util/u_inlines.h"
+#include "util/u_format.h"
+#include "util/u_split_prim.h"
+#include "translate/translate.h"
+
+#include "nvfx_context.h"
+#include "nvfx_resource.h"
+
+struct push_context {
+	struct nouveau_channel* chan;
+
+	void *idxbuf;
+	int32_t idxbias;
+
+	float edgeflag;
+	int edgeflag_attr;
+
+	unsigned vertex_length;
+	unsigned max_vertices_per_packet;
+
+	struct translate* translate;
+};
+
+static void
+emit_edgeflag(void *priv, boolean enabled)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+
+	OUT_RING(chan, RING_3D(NV34TCL_EDGEFLAG_ENABLE, 1));
+	OUT_RING(chan, enabled ? 1 : 0);
+}
+
+static void
+emit_vertices_lookup8(void *priv, unsigned start, unsigned count)
+{
+        struct push_context *ctx = priv;
+        uint8_t* elts = (uint8_t*)ctx->idxbuf + start;
+
+        while(count)
+        {
+                unsigned push = MIN2(count, ctx->max_vertices_per_packet);
+                unsigned length = push * ctx->vertex_length;
+
+                OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+                ctx->translate->run_elts8(ctx->translate, elts, push, 0, ctx->chan->cur);
+                ctx->chan->cur += length;
+
+                count -= push;
+                elts += push;
+        }
+}
+
+static void
+emit_vertices_lookup16(void *priv, unsigned start, unsigned count)
+{
+	struct push_context *ctx = priv;
+        uint16_t* elts = (uint16_t*)ctx->idxbuf + start;
+
+        while(count)
+        {
+                unsigned push = MIN2(count, ctx->max_vertices_per_packet);
+                unsigned length = push * ctx->vertex_length;
+
+                OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+                ctx->translate->run_elts16(ctx->translate, elts, push, 0, ctx->chan->cur);
+                ctx->chan->cur += length;
+
+                count -= push;
+                elts += push;
+        }
+}
+
+static void
+emit_vertices_lookup32(void *priv, unsigned start, unsigned count)
+{
+        struct push_context *ctx = priv;
+        uint32_t* elts = (uint32_t*)ctx->idxbuf + start;
+
+        while(count)
+        {
+                unsigned push = MIN2(count, ctx->max_vertices_per_packet);
+                unsigned length = push * ctx->vertex_length;
+
+                OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+                ctx->translate->run_elts(ctx->translate, elts, push, 0, ctx->chan->cur);
+                ctx->chan->cur += length;
+
+                count -= push;
+                elts += push;
+        }
+}
+
+static void
+emit_vertices(void *priv, unsigned start, unsigned count)
+{
+        struct push_context *ctx = priv;
+
+        while(count)
+        {
+		unsigned push = MIN2(count, ctx->max_vertices_per_packet);
+		unsigned length = push * ctx->vertex_length;
+
+		OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length));
+		ctx->translate->run(ctx->translate, start, push, 0, ctx->chan->cur);
+		ctx->chan->cur += length;
+
+		count -= push;
+		start += push;
+        }
+}
+
+static void
+emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	unsigned nr = (vc & 0xff);
+	if (nr) {
+		OUT_RING(chan, RING_3D(reg, 1));
+		OUT_RING  (chan, ((nr - 1) << 24) | start);
+		start += nr;
+	}
+
+	nr = vc >> 8;
+	while (nr) {
+		unsigned push = nr > 2047 ? 2047 : nr;
+
+		nr -= push;
+
+		OUT_RING(chan, RING_3D_NI(reg, push));
+		while (push--) {
+			OUT_RING(chan, ((0x100 - 1) << 24) | start);
+			start += 0x100;
+		}
+	}
+}
+
+static void
+emit_ib_ranges(void* priv, unsigned start, unsigned vc)
+{
+	emit_ranges(priv, start, vc, NV34TCL_VB_INDEX_BATCH);
+}
+
+static void
+emit_vb_ranges(void* priv, unsigned start, unsigned vc)
+{
+	emit_ranges(priv, start, vc, NV34TCL_VB_VERTEX_BATCH);
+}
+
+static INLINE void
+emit_elt8(void* priv, unsigned start, unsigned vc)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint8_t *elts = (uint8_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	if (vc & 1) {
+		OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+		OUT_RING  (chan, elts[0]);
+		elts++; vc--;
+	}
+
+	while (vc) {
+		unsigned i;
+		unsigned push = MIN2(vc, 2047 * 2);
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+		for (i = 0; i < push; i+=2)
+			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+static INLINE void
+emit_elt16(void* priv, unsigned start, unsigned vc)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint16_t *elts = (uint16_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	if (vc & 1) {
+		OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1));
+		OUT_RING  (chan, elts[0]);
+		elts++; vc--;
+	}
+
+	while (vc) {
+		unsigned i;
+		unsigned push = MIN2(vc, 2047 * 2);
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1));
+		for (i = 0; i < push; i+=2)
+			OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias));
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+static INLINE void
+emit_elt32(void* priv, unsigned start, unsigned vc)
+{
+	struct push_context* ctx = priv;
+	struct nouveau_channel *chan = ctx->chan;
+	uint32_t *elts = (uint32_t *)ctx->idxbuf + start;
+	int idxbias = ctx->idxbias;
+
+	while (vc) {
+		unsigned push = MIN2(vc, 2047);
+
+		OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push));
+		assert(AVAIL_RING(chan) >= push);
+		if(idxbias)
+		{
+			for(unsigned i = 0; i < push; ++i)
+				OUT_RING(chan, elts[i] + idxbias);
+		}
+		else
+			OUT_RINGp(chan, elts, push);
+
+		vc -= push;
+		elts += push;
+	}
+}
+
+void
+nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
+{
+	struct nvfx_context *nvfx = nvfx_context(pipe);
+	struct nouveau_channel *chan = nvfx->screen->base.channel;
+	struct push_context ctx;
+	struct util_split_prim s;
+	unsigned instances_left = info->instance_count;
+	int vtx_value;
+	unsigned hw_mode = nvgl_primitive(info->mode);
+	int i;
+	struct
+	{
+		uint8_t* map;
+		unsigned step;
+	} per_instance[16];
+	unsigned p_overhead = 0
+			+ 4 /* begin/end */
+			+ 4; /* potential edgeflag enable/disable */
+
+	ctx.chan = nvfx->screen->base.channel;
+	ctx.translate = nvfx->vtxelt->translate;
+	ctx.idxbuf = NULL;
+	ctx.vertex_length = nvfx->vtxelt->vertex_length;
+	ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet;
+	ctx.edgeflag = 0.5f;
+	// TODO: figure out if we really want to handle this, and do so in that case
+	ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in;
+
+	if(!nvfx->use_vertex_buffers)
+	{
+		for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
+		{
+			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
+			uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset;
+			if(info->indexed)
+				data += info->index_bias * vb->stride;
+			ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
+		}
+
+		if(ctx.edgeflag_attr < 16)
+			vtx_value = -(ctx.vertex_length + 3);  /* vertex data and edgeflag header and value */
+		else
+		{
+			p_overhead += 1; /* initial vertex_data header */
+			vtx_value = -ctx.vertex_length;  /* vertex data and edgeflag header and value */
+		}
+
+		if (info->indexed) {
+			// XXX: this case and is broken and probably need a new VTX_ATTR push path
+			if (nvfx->idxbuf.index_size == 1)
+				s.emit = emit_vertices_lookup8;
+			else if (nvfx->idxbuf.index_size == 2)
+				s.emit = emit_vertices_lookup16;
+			else
+				s.emit = emit_vertices_lookup32;
+		} else
+			s.emit = emit_vertices;
+	}
+	else
+	{
+		if(!info->indexed || nvfx->use_index_buffer)
+		{
+			s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges;
+			p_overhead += 3;
+			vtx_value = 0;
+		}
+		else if (nvfx->idxbuf.index_size == 4)
+		{
+			s.emit = emit_elt32;
+			p_overhead += 1;
+			vtx_value = 8;
+		}
+		else
+		{
+			s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8;
+			p_overhead += 3;
+			vtx_value = 7;
+		}
+	}
+
+	ctx.idxbias = info->index_bias;
+	if(nvfx->use_vertex_buffers)
+		ctx.idxbias -= nvfx->base_vertex;
+
+	/* map index buffer, if present */
+	if (info->indexed && !nvfx->use_index_buffer)
+		ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset;
+
+	s.priv = &ctx;
+	s.edge = emit_edgeflag;
+
+	for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
+	{
+		struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
+		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
+		float v[4];
+		per_instance[i].step = info->start_instance % ve->instance_divisor;
+		per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset;
+
+		nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
+
+		WAIT_RING(chan, 5);
+		nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+	}
+
+	/* per-instance loop */
+	while (instances_left--) {
+		int max_verts;
+		boolean done;
+
+		util_split_prim_init(&s, info->mode, info->start, info->count);
+		nvfx_state_emit(nvfx);
+		for(;;) {
+			max_verts  = AVAIL_RING(chan);
+			max_verts -= p_overhead;
+
+			/* if vtx_value < 0, each vertex is -vtx_value words long
+			 * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation)
+			 */
+			if(vtx_value < 0)
+			{
+				max_verts /= -vtx_value;
+				max_verts -= (max_verts >> 10); /* vertex data headers */
+			}
+			else
+			{
+				if(max_verts >= (1 << 23)) /* avoid overflow here */
+					max_verts = (1 << 23);
+				max_verts = (max_verts * 255) >> vtx_value;
+			}
+
+			//printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts);
+
+			if(max_verts >= 16)
+			{
+				OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+				OUT_RING(chan, hw_mode);
+				done = util_split_prim_next(&s, max_verts);
+				OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1));
+				OUT_RING(chan, 0);
+
+				if(done)
+					break;
+			}
+
+			FIRE_RING(chan);
+			nvfx_state_emit(nvfx);
+		}
+
+		/* set data for the next instance, if any changed */
+		for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
+		{
+			struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
+			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
+
+			if(++per_instance[i].step == ve->instance_divisor)
+			{
+				float v[4];
+				per_instance[i].map += vb->stride;
+				per_instance[i].step = 0;
+
+				nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);
+				WAIT_RING(chan, 5);
+				nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
+			}
+		}
+	}
+}