summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/r600/r600_asm.c
diff options
context:
space:
mode:
authorDave Airlie <[email protected]>2011-03-10 12:54:13 +1000
committerDave Airlie <[email protected]>2011-05-25 11:42:45 +1000
commit7779f6d1dffde2c0501e44adc342e52803de08d4 (patch)
treeb58cad97109e182584edc2441ce754bf06e7b504 /src/gallium/drivers/r600/r600_asm.c
parentd1b8f8e8b3b41ab6092fa3f18a4891a0198f64de (diff)
r600g: add initial cayman acceleration support.
Cayman is the RadeonHD 69xx series of GPUs. This adds support for 3D acceleration to the r600g driver. Major changes: Some context registers moved around - mainly MSAA and clipping/guardband related. GPR allocation is all dynamic no vertex cache - all unified in texture cache. 5-wide to 4-wide shader engines (no scalar or trans slot) - some changes to how instructions are placed into slots - removal of END_OF_PROGRAM bit in favour of END flow control clause - no vertex fetch clause - TC accepts vertex or texture Signed-off-by: Dave Airlie <[email protected]>
Diffstat (limited to 'src/gallium/drivers/r600/r600_asm.c')
-rw-r--r--src/gallium/drivers/r600/r600_asm.c133
1 files changed, 101 insertions, 32 deletions
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 033e84665f5..00572cbd5bd 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -94,6 +94,7 @@ static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r
}
break;
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
switch (alu->inst) {
case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP:
return 0;
@@ -226,6 +227,9 @@ int r600_bc_init(struct r600_bc *bc, enum radeon_family family)
case CHIP_CAICOS:
bc->chiprev = CHIPREV_EVERGREEN;
break;
+ case CHIP_CAYMAN:
+ bc->chiprev = CHIPREV_CAYMAN;
+ break;
default:
R600_ERR("unknown family %d\n", bc->family);
return -EINVAL;
@@ -334,6 +338,7 @@ static int is_alu_once_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLT_PUSH_INT ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETLE_PUSH_INT);
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
default:
return !alu->is_op3 && (
alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLE ||
@@ -384,6 +389,7 @@ static int is_alu_reduction_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4_IEEE ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX4);
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
default:
return !alu->is_op3 && (
alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE ||
@@ -401,6 +407,7 @@ static int is_alu_cube_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
return !alu->is_op3 &&
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
default:
return !alu->is_op3 &&
alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE;
@@ -417,6 +424,7 @@ static int is_alu_mova_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR ||
alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
default:
return !alu->is_op3 && (
alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT);
@@ -469,6 +477,7 @@ static int is_alu_trans_unit_inst(struct r600_bc *bc, struct r600_bc_alu *alu)
alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M2 ||
alu->inst == V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT_M4;
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
default:
if (!alu->is_op3)
/* Note that FLT_TO_INT_* instructions are vector-only instructions
@@ -514,13 +523,16 @@ static int assign_alu_units(struct r600_bc *bc, struct r600_bc_alu *alu_first,
{
struct r600_bc_alu *alu;
unsigned i, chan, trans;
+ int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
- for (i = 0; i < 5; i++)
+ for (i = 0; i < max_slots; i++)
assignment[i] = NULL;
for (alu = alu_first; alu; alu = LIST_ENTRY(struct r600_bc_alu, alu->list.next, list)) {
chan = alu->dst.chan;
- if (is_alu_trans_unit_inst(bc, alu))
+ if (max_slots == 4)
+ trans = 0;
+ else if (is_alu_trans_unit_inst(bc, alu))
trans = 1;
else if (is_alu_vec_unit_inst(bc, alu))
trans = 0;
@@ -719,8 +731,10 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc,
struct alu_bank_swizzle bs;
int bank_swizzle[5];
int i, r = 0, forced = 0;
- boolean scalar_only = true;
- for (i = 0; i < 5; i++) {
+ boolean scalar_only = bc->chiprev == CHIPREV_CAYMAN ? false : true;
+ int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
+
+ for (i = 0; i < max_slots; i++) {
if (slots[i] && slots[i]->bank_swizzle_force) {
slots[i]->bank_swizzle = slots[i]->bank_swizzle_force;
forced = 1;
@@ -737,6 +751,13 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc,
bank_swizzle[i] = SQ_ALU_VEC_012;
bank_swizzle[4] = SQ_ALU_SCL_210;
while(bank_swizzle[4] <= SQ_ALU_SCL_221) {
+
+ if (max_slots == 4) {
+ for (i = 0; i < max_slots; i++) {
+ if (bank_swizzle[i] == SQ_ALU_VEC_210)
+ return -1;
+ }
+ }
init_bank_swizzle(&bs);
if (scalar_only == false) {
for (i = 0; i < 4; i++) {
@@ -749,11 +770,11 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc,
} else
r = 0;
- if (!r && slots[4]) {
+ if (!r && slots[4] && max_slots == 5) {
r = check_scalar(bc, slots[4], &bs, bank_swizzle[4]);
}
if (!r) {
- for (i = 0; i < 5; i++) {
+ for (i = 0; i < max_slots; i++) {
if (slots[i])
slots[i]->bank_swizzle = bank_swizzle[i];
}
@@ -763,7 +784,7 @@ static int check_and_set_bank_swizzle(struct r600_bc *bc,
if (scalar_only) {
bank_swizzle[4]++;
} else {
- for (i = 0; i < 5; i++) {
+ for (i = 0; i < max_slots; i++) {
bank_swizzle[i]++;
if (bank_swizzle[i] <= SQ_ALU_VEC_210)
break;
@@ -783,12 +804,13 @@ static int replace_gpr_with_pv_ps(struct r600_bc *bc,
struct r600_bc_alu *prev[5];
int gpr[5], chan[5];
int i, j, r, src, num_src;
+ int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
r = assign_alu_units(bc, alu_prev, prev);
if (r)
return r;
- for (i = 0; i < 5; ++i) {
+ for (i = 0; i < max_slots; ++i) {
if(prev[i] && prev[i]->dst.write && !prev[i]->dst.rel) {
gpr[i] = prev[i]->dst.sel;
/* cube writes more than PV.X */
@@ -800,7 +822,7 @@ static int replace_gpr_with_pv_ps(struct r600_bc *bc,
gpr[i] = -1;
}
- for (i = 0; i < 5; ++i) {
+ for (i = 0; i < max_slots; ++i) {
struct r600_bc_alu *alu = slots[i];
if(!alu)
continue;
@@ -810,11 +832,13 @@ static int replace_gpr_with_pv_ps(struct r600_bc *bc,
if (!is_gpr(alu->src[src].sel) || alu->src[src].rel)
continue;
- if (alu->src[src].sel == gpr[4] &&
- alu->src[src].chan == chan[4]) {
- alu->src[src].sel = V_SQ_ALU_SRC_PS;
- alu->src[src].chan = 0;
- continue;
+ if (bc->chiprev < CHIPREV_CAYMAN) {
+ if (alu->src[src].sel == gpr[4] &&
+ alu->src[src].chan == chan[4]) {
+ alu->src[src].sel = V_SQ_ALU_SRC_PS;
+ alu->src[src].chan = 0;
+ continue;
+ }
}
for (j = 0; j < 4; ++j) {
@@ -922,12 +946,13 @@ static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
int i, j, r, src, num_src;
int num_once_inst = 0;
int have_mova = 0, have_rel = 0;
+ int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
r = assign_alu_units(bc, alu_prev, prev);
if (r)
return r;
- for (i = 0; i < 5; ++i) {
+ for (i = 0; i < max_slots; ++i) {
struct r600_bc_alu *alu;
/* check number of literals */
@@ -951,7 +976,7 @@ static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
result[i] = prev[i];
continue;
} else if (prev[i] && slots[i]) {
- if (result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
+ if (max_slots == 5 && result[4] == NULL && prev[4] == NULL && slots[4] == NULL) {
/* Trans unit is still free try to use it. */
if (is_alu_any_unit_inst(bc, slots[i])) {
result[i] = prev[i];
@@ -991,7 +1016,7 @@ static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
if (!is_gpr(alu->src[src].sel))
continue;
- for (j = 0; j < 5; ++j) {
+ for (j = 0; j < max_slots; ++j) {
if (!prev[j] || !prev[j]->dst.write)
continue;
@@ -1019,7 +1044,7 @@ static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
bc->cf_last->ndw -= align(prev_nliteral, 2);
/* sort instructions */
- for (i = 0; i < 5; ++i) {
+ for (i = 0; i < max_slots; ++i) {
slots[i] = result[i];
if (result[i]) {
LIST_DEL(&result[i]->list);
@@ -1032,7 +1057,7 @@ static int merge_inst_groups(struct r600_bc *bc, struct r600_bc_alu *slots[5],
LIST_ENTRY(struct r600_bc_alu, bc->cf_last->alu.prev, list)->last = 1;
/* determine new first instruction */
- for (i = 0; i < 5; ++i) {
+ for (i = 0; i < max_slots; ++i) {
if (result[i]) {
bc->cf_last->curr_bs_head = result[i];
break;
@@ -1225,6 +1250,7 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int
uint32_t literal[4];
unsigned nliteral;
struct r600_bc_alu *slots[5];
+ int max_slots = bc->chiprev == CHIPREV_CAYMAN ? 4 : 5;
r = assign_alu_units(bc, bc->cf_last->curr_bs_head, slots);
if (r)
return r;
@@ -1245,7 +1271,7 @@ int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int
if (r)
return r;
- for (i = 0, nliteral = 0; i < 5; i++) {
+ for (i = 0, nliteral = 0; i < max_slots; i++) {
if (slots[i]) {
r = r600_bc_alu_nliterals(bc, slots[i], literal, &nliteral);
if (r)
@@ -1282,6 +1308,7 @@ static unsigned r600_bc_num_tex_and_vtx_instructions(const struct r600_bc *bc)
return 16;
case CHIPREV_EVERGREEN:
+ case CHIPREV_CAYMAN:
return 64;
default:
@@ -1290,6 +1317,19 @@ static unsigned r600_bc_num_tex_and_vtx_instructions(const struct r600_bc *bc)
}
}
+static inline boolean last_inst_was_vtx_fetch(struct r600_bc *bc)
+{
+ if (bc->chiprev == CHIPREV_CAYMAN) {
+ if (bc->cf_last->inst != CM_V_SQ_CF_WORD1_SQ_CF_INST_TC)
+ return TRUE;
+ } else {
+ if (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
+ bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC)
+ return TRUE;
+ }
+ return FALSE;
+}
+
int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
{
struct r600_bc_vtx *nvtx = r600_bc_vtx();
@@ -1301,15 +1341,17 @@ int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx)
/* cf can contains only alu or only vtx or only tex */
if (bc->cf_last == NULL ||
- (bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX &&
- bc->cf_last->inst != V_SQ_CF_WORD1_SQ_CF_INST_VTX_TC) ||
- bc->force_add_cf) {
+ last_inst_was_vtx_fetch(bc) ||
+ bc->force_add_cf) {
r = r600_bc_add_cf(bc);
if (r) {
free(nvtx);
return r;
}
- bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
+ if (bc->chiprev == CHIPREV_CAYMAN)
+ bc->cf_last->inst = CM_V_SQ_CF_WORD1_SQ_CF_INST_TC;
+ else
+ bc->cf_last->inst = V_SQ_CF_WORD1_SQ_CF_INST_VTX;
}
LIST_ADDTAIL(&nvtx->list, &bc->cf_last->vtx);
/* each fetch use 4 dwords */
@@ -1379,14 +1421,21 @@ int r600_bc_add_cfinst(struct r600_bc *bc, int inst)
return 0;
}
+int cm_bc_add_cf_end(struct r600_bc *bc)
+{
+ return r600_bc_add_cfinst(bc, CM_V_SQ_CF_WORD1_SQ_CF_INST_END);
+}
+
/* common to all 3 families */
static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id)
{
- bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
+ bc->bytecode[id] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) |
S_SQ_VTX_WORD0_FETCH_TYPE(vtx->fetch_type) |
S_SQ_VTX_WORD0_SRC_GPR(vtx->src_gpr) |
- S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x) |
- S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
+ S_SQ_VTX_WORD0_SRC_SEL_X(vtx->src_sel_x);
+ if (bc->chiprev < CHIPREV_CAYMAN)
+ bc->bytecode[id] |= S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(vtx->mega_fetch_count);
+ id++;
bc->bytecode[id++] = S_SQ_VTX_WORD1_DST_SEL_X(vtx->dst_sel_x) |
S_SQ_VTX_WORD1_DST_SEL_Y(vtx->dst_sel_y) |
S_SQ_VTX_WORD1_DST_SEL_Z(vtx->dst_sel_z) |
@@ -1397,9 +1446,11 @@ static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsign
S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
- bc->bytecode[id++] = S_SQ_VTX_WORD2_OFFSET(vtx->offset) |
- S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian) |
- S_SQ_VTX_WORD2_MEGA_FETCH(1);
+ bc->bytecode[id] = S_SQ_VTX_WORD2_OFFSET(vtx->offset)|
+ S_SQ_VTX_WORD2_ENDIAN_SWAP(vtx->endian);
+ if (bc->chiprev < CHIPREV_CAYMAN)
+ bc->bytecode[id] |= S_SQ_VTX_WORD2_MEGA_FETCH(1);
+ id++;
bc->bytecode[id++] = 0;
return 0;
}
@@ -1601,6 +1652,7 @@ int r600_bc_build(struct r600_bc *bc)
case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
+ case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
break;
default:
R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
@@ -1616,7 +1668,7 @@ int r600_bc_build(struct r600_bc *bc)
return -ENOMEM;
LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) {
addr = cf->addr;
- if (bc->chiprev == CHIPREV_EVERGREEN)
+ if (bc->chiprev >= CHIPREV_EVERGREEN)
r = eg_bc_cf_build(bc, cf);
else
r = r600_bc_cf_build(bc, cf);
@@ -1640,6 +1692,7 @@ int r600_bc_build(struct r600_bc *bc)
break;
case CHIPREV_R700:
case CHIPREV_EVERGREEN: /* eg alu is same encoding as r700 */
+ case CHIPREV_CAYMAN: /* eg alu is same encoding as r700 */
r = r700_bc_alu_build(bc, alu, addr);
break;
default:
@@ -1668,6 +1721,14 @@ int r600_bc_build(struct r600_bc *bc)
}
break;
case V_SQ_CF_WORD1_SQ_CF_INST_TEX:
+ if (bc->chiprev == CHIPREV_CAYMAN) {
+ LIST_FOR_EACH_ENTRY(vtx, &cf->vtx, list) {
+ r = r600_bc_vtx_build(bc, vtx, addr);
+ if (r)
+ return r;
+ addr += 4;
+ }
+ }
LIST_FOR_EACH_ENTRY(tex, &cf->tex, list) {
r = r600_bc_tex_build(bc, tex, addr);
if (r)
@@ -1688,6 +1749,7 @@ int r600_bc_build(struct r600_bc *bc)
case V_SQ_CF_WORD1_SQ_CF_INST_POP:
case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
+ case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
break;
default:
R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst);
@@ -1752,6 +1814,9 @@ void r600_bc_dump(struct r600_bc *bc)
case 2:
chip = 'E';
break;
+ case 3:
+ chip = 'C';
+ break;
case 0:
default:
chip = '6';
@@ -1818,6 +1883,7 @@ void r600_bc_dump(struct r600_bc *bc)
case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK:
case V_SQ_CF_WORD1_SQ_CF_INST_CALL_FS:
case V_SQ_CF_WORD1_SQ_CF_INST_RETURN:
+ case CM_V_SQ_CF_WORD1_SQ_CF_INST_END:
fprintf(stderr, "%04d %08X CF ", id, bc->bytecode[id]);
fprintf(stderr, "ADDR:%d\n", cf->cf_addr);
id++;
@@ -1920,7 +1986,10 @@ void r600_bc_dump(struct r600_bc *bc)
fprintf(stderr, "%04d %08X ", id, bc->bytecode[id]);
fprintf(stderr, "SRC(GPR:%d ", vtx->src_gpr);
fprintf(stderr, "SEL_X:%d) ", vtx->src_sel_x);
- fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
+ if (bc->chiprev < CHIPREV_CAYMAN)
+ fprintf(stderr, "MEGA_FETCH_COUNT:%d ", vtx->mega_fetch_count);
+ else
+ fprintf(stderr, "SEL_Y:%d) ", 0);
fprintf(stderr, "DST(GPR:%d ", vtx->dst_gpr);
fprintf(stderr, "SEL_X:%d ", vtx->dst_sel_x);
fprintf(stderr, "SEL_Y:%d ", vtx->dst_sel_y);