diff options
-rw-r--r-- | src/gallium/drivers/r600/r600_asm.c | 95 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_asm.h | 9 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_shader.c | 2 | ||||
-rw-r--r-- | src/gallium/drivers/r600/r600_sq.h | 7 |
4 files changed, 106 insertions, 7 deletions
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 82347446fbd..b8d43c0d928 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -94,6 +94,7 @@ static inline unsigned int r600_bytecode_get_num_operands(struct r600_bytecode * case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR: + case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT: case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR: @@ -249,8 +250,18 @@ static struct r600_bytecode_tex *r600_bytecode_tex(void) return tex; } -void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class) +void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family) { + if ((chip_class == R600) && (family != CHIP_RV670)) + bc->ar_handling = AR_HANDLE_RV6XX; + else + bc->ar_handling = AR_HANDLE_NORMAL; + + if ((chip_class == R600) && (family != CHIP_RV670 && family != CHIP_RS780 && + family != CHIP_RS880)) + bc->r6xx_nop_after_rel_dst = 1; + else + bc->r6xx_nop_after_rel_dst = 0; LIST_INITHEAD(&bc->cf); bc->chip_class = chip_class; } @@ -441,7 +452,8 @@ static int is_alu_mova_inst(struct r600_bytecode *bc, struct r600_bytecode_alu * return !alu->is_op3 && ( alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA || alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR || - alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT); + alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_INT || + alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT); case EVERGREEN: case CAYMAN: default: @@ -457,7 +469,8 @@ static int is_alu_vec_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a case R600: case R700: return is_alu_reduction_inst(bc, alu) || - is_alu_mova_inst(bc, alu); + (is_alu_mova_inst(bc, alu) && + (alu->inst != 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT)); case EVERGREEN: case CAYMAN: default: @@ -478,6 +491,7 @@ static int is_alu_trans_unit_inst(struct r600_bytecode *bc, struct r600_bytecode case R700: if (!alu->is_op3) return alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ASHR_INT || + alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT || alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT || alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_UINT || alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT || @@ -547,6 +561,19 @@ static int is_alu_any_unit_inst(struct r600_bytecode *bc, struct r600_bytecode_a !is_alu_trans_unit_inst(bc, alu); } +static int is_nop_inst(struct r600_bytecode *bc, struct r600_bytecode_alu *alu) +{ + switch (bc->chip_class) { + case R600: + case R700: + return (!alu->is_op3 && alu->inst == V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP); + case EVERGREEN: + case CAYMAN: + default: + return (!alu->is_op3 && alu->inst == EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP); + } +} + static int assign_alu_units(struct r600_bytecode *bc, struct r600_bytecode_alu *alu_first, struct r600_bytecode_alu *assignment[5]) { @@ -1048,6 +1075,10 @@ static int merge_inst_groups(struct r600_bytecode *bc, struct r600_bytecode_alu alu = slots[i]; num_once_inst += is_alu_once_inst(bc, alu); + /* don't reschedule NOPs */ + if (is_nop_inst(bc, alu)) + return 0; + /* Let's check dst gpr. 
*/ if (alu->dst.rel) { if (have_mova) @@ -1236,12 +1267,60 @@ static int r600_bytecode_alloc_kcache_lines(struct r600_bytecode *bc, struct r60 return 0; } +static int insert_nop_r6xx(struct r600_bytecode *bc) +{ + struct r600_bytecode_alu alu; + int r, i; + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP; + alu.src[0].chan = i; + alu.dst.chan = i; + alu.last = (i == 3); + r = r600_bytecode_add_alu(bc, &alu); + if (r) + return r; + } + return 0; +} + +/* load AR register from gpr (bc->ar_reg) with MOVA_GPR_INT */ +static int load_ar_r6xx(struct r600_bytecode *bc) +{ + struct r600_bytecode_alu alu; + int r; + + if (bc->ar_loaded) + return 0; + + /* hack to avoid making MOVA the last instruction in the clause */ + if ((bc->cf_last->ndw>>1) >= 110) + bc->force_add_cf = 1; + + memset(&alu, 0, sizeof(alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_GPR_INT; + alu.src[0].sel = bc->ar_reg; + alu.last = 1; + alu.index_mode = INDEX_MODE_LOOP; + r = r600_bytecode_add_alu(bc, &alu); + if (r) + return r; + + /* no requirement to set uses waterfall on MOVA_GPR_INT */ + bc->ar_loaded = 1; + return 0; +} + /* load AR register from gpr (bc->ar_reg) with MOVA_INT */ static int load_ar(struct r600_bytecode *bc) { struct r600_bytecode_alu alu; int r; + if (bc->ar_handling) + return load_ar_r6xx(bc); + if (bc->ar_loaded) return 0; @@ -1376,6 +1455,10 @@ int r600_bytecode_add_alu_type(struct r600_bytecode *bc, const struct r600_bytec bc->cf_last->prev_bs_head = bc->cf_last->curr_bs_head; bc->cf_last->curr_bs_head = NULL; } + + if (nalu->dst.rel && bc->r6xx_nop_after_rel_dst) + insert_nop_r6xx(bc); + return 0; } @@ -1599,6 +1682,7 @@ static int r600_bytecode_alu_build(struct r600_bytecode *bc, struct r600_bytecod S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) | S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) | + S_SQ_ALU_WORD0_INDEX_MODE(alu->index_mode) | S_SQ_ALU_WORD0_LAST(alu->last); if 
(alu->is_op3) { @@ -2286,7 +2370,8 @@ void r600_bytecode_dump(struct r600_bytecode *bc) fprintf(stderr, "SRC1(SEL:%d ", alu->src[1].sel); fprintf(stderr, "REL:%d ", alu->src[1].rel); fprintf(stderr, "CHAN:%d ", alu->src[1].chan); - fprintf(stderr, "NEG:%d) ", alu->src[1].neg); + fprintf(stderr, "NEG:%d ", alu->src[1].neg); + fprintf(stderr, "IM:%d) ", alu->index_mode); fprintf(stderr, "LAST:%d)\n", alu->last); id++; fprintf(stderr, "%04d %08X %c ", id, bc->bytecode[id], alu->last ? '*' : ' '); @@ -2565,7 +2650,7 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru } memset(&bc, 0, sizeof(bc)); - r600_bytecode_init(&bc, rctx->chip_class); + r600_bytecode_init(&bc, rctx->chip_class, rctx->family); for (i = 0; i < ve->count; i++) { if (elements[i].instance_divisor > 1) { diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index d0ff75d6e2f..00f7e591ac3 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -54,6 +54,7 @@ struct r600_bytecode_alu { unsigned bank_swizzle; unsigned bank_swizzle_force; unsigned omod; + unsigned index_mode; }; struct r600_bytecode_tex { @@ -176,6 +177,10 @@ struct r600_cf_callstack { int max; }; +#define AR_HANDLE_NORMAL 0 +#define AR_HANDLE_RV6XX 1 /* except RV670 */ + + struct r600_bytecode { enum chip_class chip_class; int type; @@ -194,13 +199,15 @@ struct r600_bytecode { struct r600_cf_callstack callstack[SQ_MAX_CALL_DEPTH]; unsigned ar_loaded; unsigned ar_reg; + unsigned ar_handling; + unsigned r6xx_nop_after_rel_dst; }; /* eg_asm.c */ int eg_bytecode_cf_build(struct r600_bytecode *bc, struct r600_bytecode_cf *cf); /* r600_asm.c */ -void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class); +void r600_bytecode_init(struct r600_bytecode *bc, enum chip_class chip_class, enum radeon_family family); void r600_bytecode_clear(struct r600_bytecode *bc); int r600_bytecode_add_alu(struct r600_bytecode *bc, const struct 
r600_bytecode_alu *alu); int r600_bytecode_add_vtx(struct r600_bytecode *bc, const struct r600_bytecode_vtx *vtx); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 59d41cf57de..5819c2bc1a0 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -807,7 +807,7 @@ static int r600_shader_from_tgsi(struct r600_pipe_context * rctx, struct r600_pi ctx.bc = &shader->bc; ctx.shader = shader; - r600_bytecode_init(ctx.bc, rctx->chip_class); + r600_bytecode_init(ctx.bc, rctx->chip_class, rctx->family); ctx.tokens = tokens; tgsi_scan_shader(tokens, &ctx.info); tgsi_parse_init(&ctx.parse, tokens); diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h index b9c41260f56..4b2a19a07f7 100644 --- a/src/gallium/drivers/r600/r600_sq.h +++ b/src/gallium/drivers/r600/r600_sq.h @@ -471,4 +471,11 @@ #define SQ_ALU_SCL_122 0x00000001 #define SQ_ALU_SCL_212 0x00000002 #define SQ_ALU_SCL_221 0x00000003 + +#define INDEX_MODE_AR_X 0 +#define INDEX_MODE_AR_Y 1 +#define INDEX_MODE_AR_Z 2 +#define INDEX_MODE_AR_W 3 +#define INDEX_MODE_LOOP 4 + #endif |