From a79521d497bc87309cadc49f3a414703497522bc Mon Sep 17 00:00:00 2001
From: Luca Barbieri <luca@luca-barbieri.com>
Date: Wed, 20 Jan 2010 09:04:37 +0100
Subject: nvfx: use dynamically sized rotating BO pool for fragment programs

Currently we use a single buffer for each fragment program, leading to
rendering synchronization. This patch uses a circular, singly linked
list of BOs, which grows dynamically when the next BO in it is busy.

Note that inline image transfers could be an alternative option: this
will be explored later.

This removes one of the big performance limitations of the current
driver.

We also stop using pipe_resource internally in favor of using nouveau_bo
directly.
---
 src/gallium/drivers/nvfx/nvfx_fragprog.c | 232 +++++++++++++++++--------------
 src/gallium/drivers/nvfx/nvfx_state.h    |  15 +-
 2 files changed, 137 insertions(+), 110 deletions(-)

(limited to 'src')

diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c
index 301ad82c08b..5fa825ad05d 100644
--- a/src/gallium/drivers/nvfx/nvfx_fragprog.c
+++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c
@@ -823,45 +823,18 @@ out_err:
 	FREE(fpc);
 }
 
-static void
-nvfx_fragprog_upload(struct nvfx_context *nvfx,
-		     struct nvfx_fragment_program *fp)
+static inline void
+nvfx_fp_memcpy(void* dst, const void* src, size_t len)
 {
-	struct pipe_context *pipe = &nvfx->pipe;
-	const uint32_t le = 1;
-
-#if 0
-	for (i = 0; i < fp->insn_len; i++) {
-		fflush(stdout); fflush(stderr);
-		NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]);
-		fflush(stdout); fflush(stderr);
+#ifndef WORDS_BIGENDIAN /* little-endian host: program words are copied verbatim */
+	memcpy(dst, src, len);
+#else /* big-endian host: swap the 16-bit halves of every 32-bit word */
+	size_t i; /* byte offset; callers in this file pass a len that is a multiple of 4 */
+	for(i = 0; i < len; i += 4) {
+		uint32_t v = *(const uint32_t*)((const char*)src + i); /* load the word itself, not its address */
+		*(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
 	}
 #endif
-
-	if ((*(const uint8_t *)&le)) {
-		/* Can do this with an inline transfer */
-		pipe_buffer_write(pipe,
-				  fp->buffer,
-				  0,
-				  fp->insn_len * sizeof fp->insn[0],
-				  fp->insn);
-	} else {
-		struct pipe_transfer *transfer;
-		uint32_t *map;
-		int i;
-
-		map = pipe_buffer_map(pipe, fp->buffer,
-				      PIPE_TRANSFER_WRITE,
-				      &transfer);
-	
-		/* Weird swapping for big-endian chips */
-		for (i = 0; i < fp->insn_len; i++) {
-			map[i] = ((fp->insn[i] & 0xffff) << 16) |
-				  ((fp->insn[i] >> 16) & 0xffff);
-		}
-
-		pipe_buffer_unmap(pipe, fp->buffer, transfer);
-	}
 }
 
 void
@@ -869,83 +842,118 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	struct nvfx_fragment_program *fp = nvfx->fragprog;
-	struct pipe_resource *constbuf =
-		nvfx->constbuf[PIPE_SHADER_FRAGMENT];
-	struct pipe_screen *pscreen = nvfx->pipe.screen;
-	boolean new_consts = FALSE;
+	boolean update = FALSE; /* set when a program copy must be (re)written and re-emitted */
 	int i;
 
-	if (fp->translated)
-		goto update_constants;
+	if (!fp->translated)
+	{
+		nvfx_fragprog_translate(nvfx, fp);
+		if (!fp->translated) {
+			static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
+			static int warned = 0;
+			if(!warned)
+			{
+				fprintf(stderr, "nvfx: failed to translate fragment program!\n");
+				warned = 1;
+			}
 
-	nvfx_fragprog_translate(nvfx, fp);
-	if (!fp->translated) {
-		static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0};
-		static int warned = 0;
-		if(!warned)
-		{
-			fprintf(stderr, "nvfx: failed to translate fragment program!\n");
-			warned = 1;
+			/* use dummy program: we cannot fail here */
+			fp->translated = TRUE;
+			fp->insn = malloc(sizeof(dummy));
+			memcpy(fp->insn, dummy, sizeof(dummy));
+			fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
 		}
+		update = TRUE;
 
-		/* use a dummy program: we cannot fail here */
-		fp->translated = TRUE;
-		fp->insn = malloc(sizeof(dummy));
-		memcpy(fp->insn, dummy, sizeof(dummy));
-		fp->insn_len = sizeof(dummy) / sizeof(dummy[0]);
-		return;
+		fp->prog_size = (fp->insn_len * 4 + 63) & ~63; /* bytes per program copy, rounded up to a multiple of 64 */
+
+		int min_size = 4096;
+		if(fp->prog_size >= min_size)
+			fp->progs_per_bo = 1;
+		else
+			fp->progs_per_bo = min_size / fp->prog_size;
+		fp->bo_prog_idx = fp->progs_per_bo - 1; /* last slot: the first update below moves on to a new BO */
 	}
 
-	fp->buffer = pipe_buffer_create(pscreen,
-					/* XXX: no alignment, maybe use a priv bind flag
-					 * 0x100,
-					 */
-					0, fp->insn_len * 4);
-	nvfx_fragprog_upload(nvfx, fp);
-
-update_constants:
-	if (fp->nr_consts) {
-		struct pipe_transfer *transfer;
-		float *map;
-
-		map = pipe_buffer_map(&nvfx->pipe, constbuf,
-				      PIPE_TRANSFER_READ,
-				      &transfer);
-
-		/* XXX: probably a bad idea to be reading back data
-		 * from a buffer the gpu has been using.  Not really
-		 * sure what this code is doing though, or how to
-		 * avoid it - kw.
-		 */
-		for (i = 0; i < fp->nr_consts; i++) {
-			struct nvfx_fragment_program_data *fpd = &fp->consts[i];
-			uint32_t *p = &fp->insn[fpd->offset];
-			uint32_t *cb = (uint32_t *)&map[fpd->index * 4];
-
-			if (!memcmp(p, cb, 4 * sizeof(float)))
-				continue;
-			memcpy(p, cb, 4 * sizeof(float));
-			new_consts = TRUE;
+	if (nvfx->dirty & NVFX_NEW_FRAGCONST)
+		update = TRUE;
+
+	if(update) {
+		++fp->bo_prog_idx; /* advance to the next program copy of the current BO */
+		if(fp->bo_prog_idx >= fp->progs_per_bo)
+		{
+			if(fp->fpbo && !nouveau_bo_busy(fp->fpbo->next->bo, NOUVEAU_BO_WR))
+			{
+				fp->fpbo = fp->fpbo->next;
+			}
+			else
+			{
+				struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program_bo) + fp->prog_size * fp->progs_per_bo, 16);
+				if(fp->fpbo)
+				{
+					fpbo->next = fp->fpbo->next;
+					fp->fpbo->next = fpbo;
+				}
+				else
+					fpbo->next = fpbo;
+				fp->fpbo = fpbo;
+				fpbo->bo = 0; /* NOTE(review): the allocation above and the BO creation/mapping below are not checked for failure */
+				nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo);
+				nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC);
+
+				char* map = fpbo->bo->map;
+				char* buf = fpbo->insn;
+				for(i = 0; i < fp->progs_per_bo; ++i)
+				{
+					memcpy(buf, fp->insn, fp->insn_len * 4); /* CPU-side shadow, compared against new constants below */
+					nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4);
+					map += fp->prog_size;
+					buf += fp->prog_size;
+				}
+			}
+			fp->bo_prog_idx = 0;
 		}
-		pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer);
 
-		if (new_consts)
-			nvfx_fragprog_upload(nvfx, fp);
+		int offset = fp->bo_prog_idx * fp->prog_size; /* byte offset of the selected program copy */
+
+		if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) {
+			struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT];
+			// TODO: avoid using transfers, just directly the buffer
+			struct pipe_transfer* transfer;
+			// TODO: does this check make any sense, or should we do this unconditionally?
+			uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer);
+			uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset);
+			uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset);
+			for (i = 0; i < fp->nr_consts; ++i) {
+				unsigned off = fp->consts[i].offset;
+				unsigned idx = fp->consts[i].index * 4;
+
+				/* TODO: is checking a good idea? */
+				if(memcmp(&buf[off], &map[idx], 4 * sizeof(uint32_t))) {
+					memcpy(&buf[off], &map[idx], 4 * sizeof(uint32_t));
+					nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t));
+				}
+			}
+			pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer);
+		}
 	}
 
-	MARK_RING(chan, 8, 1);
-	OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
-	OUT_RELOC(chan, nvfx_resource(fp->buffer)->bo, 0, NOUVEAU_BO_VRAM |
-		      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
-		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
-		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
-	OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1));
-	OUT_RING(chan, fp->fp_control);
-	if(!nvfx->is_nv4x) {
-		OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1));
-		OUT_RING(chan, (1<<16)|0x4);
-		OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1));
-		OUT_RING(chan, fp->samplers);
+	if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) {
+		int offset = fp->bo_prog_idx * fp->prog_size; /* byte offset of the active program copy */
+		MARK_RING(chan, 8, 1);
+		OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1));
+		OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM |
+			      NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW |
+			      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
+			      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
+		OUT_RING(chan, RING_3D(NV34TCL_FP_CONTROL, 1));
+		OUT_RING(chan, fp->fp_control);
+		if(!nvfx->is_nv4x) {
+			OUT_RING(chan, RING_3D(NV34TCL_FP_REG_CONTROL, 1));
+			OUT_RING(chan, (1<<16)|0x4);
+			OUT_RING(chan, RING_3D(NV34TCL_TX_UNITS_ENABLE, 1));
+			OUT_RING(chan, fp->samplers);
+		}
 	}
 }
 
@@ -954,12 +962,13 @@ nvfx_fragprog_relocate(struct nvfx_context *nvfx)
 {
 	struct nouveau_channel* chan = nvfx->screen->base.channel;
 	struct nvfx_fragment_program *fp = nvfx->fragprog;
-	struct nouveau_bo* bo = nvfx_resource(fp->buffer)->bo;
+	struct nouveau_bo* bo = fp->fpbo->bo;
+	int offset = fp->bo_prog_idx * fp->prog_size; /* byte offset of the active program copy within the BO */
 	unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART?
 	fp_flags |= NOUVEAU_BO_DUMMY;
 	MARK_RING(chan, 2, 2);
 	OUT_RELOC(chan, bo, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1), fp_flags, 0, 0);
-	OUT_RELOC(chan, bo, 0, fp_flags | NOUVEAU_BO_LOW |
+	OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW |
 		      NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0,
 		      NV34TCL_FP_ACTIVE_PROGRAM_DMA1);
 }
@@ -968,8 +977,19 @@ void
 nvfx_fragprog_destroy(struct nvfx_context *nvfx,
 		      struct nvfx_fragment_program *fp)
 {
-	if (fp->buffer)
-		pipe_resource_reference(&fp->buffer, NULL);
+	struct nvfx_fragment_program_bo* fpbo = fp->fpbo;
+	if(fpbo)
+	{
+		do /* the list is circular: release entries until we are back at the starting one */
+		{
+			struct nvfx_fragment_program_bo* next = fpbo->next; /* read before the entry is freed */
+			nouveau_bo_unmap(fpbo->bo);
+			nouveau_bo_ref(0, &fpbo->bo);
+			os_free_aligned(fpbo); /* entries come from os_malloc_aligned(), so plain free() is the wrong deallocator */
+			fpbo = next;
+		}
+		while(fpbo != fp->fpbo);
+	}
 
 	if (fp->insn_len)
 		FREE(fp->insn);
diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h
index 555513a6428..9ceb2577ecc 100644
--- a/src/gallium/drivers/nvfx/nvfx_state.h
+++ b/src/gallium/drivers/nvfx/nvfx_state.h
@@ -46,6 +46,12 @@ struct nvfx_fragment_program_data {
 	unsigned index;
 };
 
+struct nvfx_fragment_program_bo {
+	struct nvfx_fragment_program_bo* next; /* next entry of the circular list; a lone entry points to itself */
+	struct nouveau_bo* bo; /* GPU buffer holding progs_per_bo copies of the fragment program */
+	char insn[] __attribute__((aligned(16))); /* CPU-side shadow of the BO contents, allocated past the end of the struct */
+};
+
 struct nvfx_fragment_program {
 	struct pipe_shader_state pipe;
 	struct tgsi_shader_info info;
@@ -58,12 +64,13 @@ struct nvfx_fragment_program {
 
 	struct nvfx_fragment_program_data *consts;
 	unsigned nr_consts;
-	
-	/* XXX: just use a nouveau_bo for this? 
-	 */
-	struct pipe_resource *buffer;
 
 	uint32_t fp_control;
+
+	unsigned bo_prog_idx; /* index of the program copy currently in use inside fpbo's BO */
+	unsigned prog_size; /* size in bytes of one program copy (instruction bytes rounded up to a multiple of 64) */
+	unsigned progs_per_bo; /* number of program copies stored in each BO */
+	struct nvfx_fragment_program_bo* fpbo; /* current entry of the circular BO list; code checks it for NULL before the first BO exists */
 };
 
 
-- 
cgit v1.2.3