2 files changed, 129 insertions, 21 deletions
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index fd0e06e6c37..326724520b3 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -348,11 +348,12 @@ int check_read_slots(struct r600_bc *bc, struct r600_bc_alu *alu_first)
 }
 #endif
 
+/* CB constants start at 512, and get translated to a kcache index when ALU
+ * clauses are constructed. Note that we handle kcache constants the same way
+ * as (the now gone) cfile constants, is that really required? */
 static int is_const(int sel)
 {
-	if (sel > 255 && sel < 512)
-		return 1;
-	if (sel >= V_SQ_ALU_SRC_0 && sel <= V_SQ_ALU_SRC_LITERAL)
+	if (sel > 511 && sel < 4607)
 		return 1;
 	return 0;
 }
@@ -413,6 +414,120 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc, struct r600_bc_alu *al
 	return 0;
 }
 
+/* This code handles kcache lines as single blocks of 32 constants. We could
+ * probably do slightly better by recognizing that we actually have two
+ * consecutive lines of 16 constants, but the resulting code would also be
+ * somewhat more complicated. */
+static int r600_bc_alloc_kcache_lines(struct r600_bc *bc, struct r600_bc_alu *alu, int type)
+{
+	struct r600_bc_kcache *kcache = bc->cf_last->kcache;
+	unsigned int required_lines;
+	unsigned int free_lines = 0;
+	unsigned int cache_line[3];
+	unsigned int count = 0;
+	unsigned int i, j;
+	int r;
+
+	/* Collect required cache lines. */
+	for (i = 0; i < 3; ++i) {
+		bool found = false;
+		unsigned int line;
+
+		if (alu->src[i].sel < 512)
+			continue;
+
+		line = ((alu->src[i].sel - 512) / 32) * 2;
+
+		for (j = 0; j < count; ++j) {
+			if (cache_line[j] == line) {
+				found = true;
+				break;
+			}
+		}
+
+		if (!found)
+			cache_line[count++] = line;
+	}
+
+	/* This should never actually happen. */
+	if (count >= 3) return -ENOMEM;
+
+	for (i = 0; i < 2; ++i) {
+		if (kcache[i].mode == V_SQ_CF_KCACHE_NOP) {
+			++free_lines;
+		}
+	}
+
+	/* Filter lines pulled in by previous intructions. Note that this is
+	 * only for the required_lines count, we can't remove these from the
+	 * cache_line array since we may have to start a new ALU clause. */
+	for (i = 0, required_lines = count; i < count; ++i) {
+		for (j = 0; j < 2; ++j) {
+			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
+			    kcache[j].addr == cache_line[i]) {
+				--required_lines;
+				break;
+			}
+		}
+	}
+
+	/* Start a new ALU clause if needed. */
+	if (required_lines > free_lines) {
+		if ((r = r600_bc_add_cf(bc))) {
+			return r;
+		}
+		bc->cf_last->inst = (type << 3);
+		kcache = bc->cf_last->kcache;
+	}
+
+	/* Setup the kcache lines. */
+	for (i = 0; i < count; ++i) {
+		bool found = false;
+
+		for (j = 0; j < 2; ++j) {
+			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
+			    kcache[j].addr == cache_line[i]) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) continue;
+
+		for (j = 0; j < 2; ++j) {
+			if (kcache[j].mode == V_SQ_CF_KCACHE_NOP) {
+				kcache[j].bank = 0;
+				kcache[j].addr = cache_line[i];
+				kcache[j].mode = V_SQ_CF_KCACHE_LOCK_2;
+				break;
+			}
+		}
+	}
+
+	/* Alter the src operands to refer to the kcache. */
+	for (i = 0; i < 3; ++i) {
+		static const unsigned int base[] = {128, 160, 256, 288};
+		unsigned int line;
+
+		if (alu->src[i].sel < 512)
+			continue;
+
+		alu->src[i].sel -= 512;
+		line = (alu->src[i].sel / 32) * 2;
+
+		for (j = 0; j < 2; ++j) {
+			if (kcache[j].mode == V_SQ_CF_KCACHE_LOCK_2 &&
+			    kcache[j].addr == line) {
+				alu->src[i].sel &= 0x1f;
+				alu->src[i].sel += base[j];
+				break;
+			}
+		}
+	}
+
+	return 0;
+}
+
 int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type)
 {
 	struct r600_bc_alu *nalu = r600_bc_alu();
@@ -434,6 +549,14 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int
 		}
 		bc->cf_last->inst = (type << 3);
 	}
+
+	/* Setup the kcache for this ALU instruction. This will start a new
+	 * ALU clause if needed. */
+	if ((r = r600_bc_alloc_kcache_lines(bc, nalu, type))) {
+		free(nalu);
+		return r;
+	}
+
 	if (!bc->cf_last->curr_bs_head) {
 		bc->cf_last->curr_bs_head = nalu;
 		LIST_INITHEAD(&nalu->bs_list);
@@ -473,23 +596,6 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int
 	bc->cf_last->ndw += 2;
 	bc->ndw += 2;
 
-	/* The following configuration provides 64 128-bit constants.
-	 * Each cacheline holds 16 128-bit constants and each
-	 * kcache can lock 2 cachelines and there are 2 kcaches per
-	 * ALU clause for a max of 64 constants.
-	 * For supporting more than 64 constants, the code needs
-	 * to be broken down into multiple ALU clauses.
-	 */
-	/* select the constant buffer (0-15) for each kcache */
-	bc->cf_last->kcache[0].bank = 0;
-	bc->cf_last->kcache[1].bank = 0;
-	/* lock 2 cachelines per kcache; 4 total */
-	bc->cf_last->kcache[0].mode = V_SQ_CF_KCACHE_LOCK_2;
-	bc->cf_last->kcache[1].mode = V_SQ_CF_KCACHE_LOCK_2;
-	/* set the cacheline offsets for each kcache */
-	bc->cf_last->kcache[0].addr = 0;
-	bc->cf_last->kcache[1].addr = 2;
-
 	/* process cur ALU instructions for bank swizzle */
 	if (nalu->last) {
 		check_and_set_bank_swizzle(bc, bc->cf_last->curr_bs_head);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index bb5038c49b0..5c089f48896 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -540,7 +540,9 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s
 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
 						ctx.info.file_count[TGSI_FILE_OUTPUT];
 
-	ctx.file_offset[TGSI_FILE_CONSTANT] = 128;
+	/* Outside the GPR range. This will be translated to one of the
+	 * kcache banks later. */
+	ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
 
 	ctx.file_offset[TGSI_FILE_IMMEDIATE] = 253;
 	ctx.temp_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +