diff options
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_defines.h | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_eu.h | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_eu_emit.c | 53 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 24 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 62 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 14 |
6 files changed, 148 insertions, 7 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 2b77ae62ff3..40571a4d54d 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -679,6 +679,7 @@ enum opcode { FS_OPCODE_VARYING_PULL_CONSTANT_LOAD, FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7, FS_OPCODE_MOV_DISPATCH_TO_FLAGS, + FS_OPCODE_DISCARD_JUMP, VS_OPCODE_URB_WRITE, VS_OPCODE_SCRATCH_READ, diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index adefcfd9bad..b2fa44864db 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -1032,6 +1032,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p); struct brw_instruction *brw_BREAK(struct brw_compile *p); struct brw_instruction *brw_CONT(struct brw_compile *p); struct brw_instruction *gen6_CONT(struct brw_compile *p); +struct brw_instruction *gen6_HALT(struct brw_compile *p); /* Forward jumps: */ void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx); diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index fb1255f728c..dd91a3087c8 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -1461,6 +1461,24 @@ struct brw_instruction *brw_CONT(struct brw_compile *p) return insn; } +struct brw_instruction *gen6_HALT(struct brw_compile *p) +{ + struct brw_instruction *insn; + + insn = next_insn(p, BRW_OPCODE_HALT); + brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); + brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */ + + if (p->compressed) { + insn->header.execution_size = BRW_EXECUTE_16; + } else { + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = BRW_EXECUTE_8; + } + return insn; +} + /* DO/WHILE loop: * * The DO/WHILE is just an unterminated loop -- break or continue are @@ -2302,8 +2320,8 @@ brw_find_next_block_end(struct brw_compile *p, int start) return ip; } } - assert(!"not reached"); - return start + 1; + + return 0; } /* There is no DO instruction on gen6, so to find the end of the loop @@ -2336,7 +2354,7 @@ brw_find_loop_end(struct brw_compile *p, int start) } /* After program generation, go back and update the UIP and JIP of - * BREAK and CONT instructions to their correct locations. + * BREAK, CONT, and HALT instructions to their correct locations. */ void brw_set_uip_jip(struct brw_compile *p) @@ -2360,24 +2378,45 @@ brw_set_uip_jip(struct brw_compile *p) continue; } + int block_end_ip = brw_find_next_block_end(p, ip); switch (insn->header.opcode) { case BRW_OPCODE_BREAK: - insn->bits3.break_cont.jip = - (brw_find_next_block_end(p, ip) - ip) / scale; + assert(block_end_ip != 0); + insn->bits3.break_cont.jip = (block_end_ip - ip) / scale; /* Gen7 UIP points to WHILE; Gen6 points just after it */ insn->bits3.break_cont.uip = (brw_find_loop_end(p, ip) - ip + (intel->gen == 6 ? 16 : 0)) / scale; break; case BRW_OPCODE_CONTINUE: - insn->bits3.break_cont.jip = - (brw_find_next_block_end(p, ip) - ip) / scale; + assert(block_end_ip != 0); + insn->bits3.break_cont.jip = (block_end_ip - ip) / scale; insn->bits3.break_cont.uip = (brw_find_loop_end(p, ip) - ip) / scale; assert(insn->bits3.break_cont.uip != 0); assert(insn->bits3.break_cont.jip != 0); break; + case BRW_OPCODE_HALT: + /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19): + * + * "In case of the halt instruction not inside any conditional + * code block, the value of <JIP> and <UIP> should be the + * same. In case of the halt instruction inside conditional code + * block, the <UIP> should be the end of the program, and the + * <JIP> should be end of the most inner conditional code block." + * + * The uip will have already been set by whoever set up the + * instruction. + */ + if (block_end_ip == 0) { + insn->bits3.break_cont.jip = insn->bits3.break_cont.uip; + } else { + insn->bits3.break_cont.jip = (block_end_ip - ip) / scale; + } + assert(insn->bits3.break_cont.uip != 0); + assert(insn->bits3.break_cont.jip != 0); + break; } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index b00755f953d..d4ddb478fd2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -129,6 +129,26 @@ static const fs_reg reg_undef; static const fs_reg reg_null_f(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_F); static const fs_reg reg_null_d(ARF, BRW_ARF_NULL, BRW_REGISTER_TYPE_D); +class ip_record : public exec_node { +public: + static void* operator new(size_t size, void *ctx) + { + void *node; + + node = rzalloc_size(ctx, size); + assert(node != NULL); + + return node; + } + + ip_record(int ip) + { + this->ip = ip; + } + + int ip; +}; + class fs_inst : public backend_instruction { public: /* Callers of this ralloc-based new need not call delete. It's @@ -516,6 +536,9 @@ private: struct brw_reg index, struct brw_reg offset); void generate_mov_dispatch_to_flags(fs_inst *inst); + void generate_discard_jump(fs_inst *inst); + + void patch_discard_jumps_to_fb_writes(); struct brw_context *brw; struct intel_context *intel; @@ -530,6 +553,7 @@ private: unsigned dispatch_width; /**< 8 or 16 */ + exec_list discard_halt_patches; bool dual_source_output; void *mem_ctx; }; diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp index f185eb52104..9a891414e62 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp @@ -61,12 +61,53 @@ fs_generator::~fs_generator() } void +fs_generator::patch_discard_jumps_to_fb_writes() +{ + if (intel->gen < 6 || this->discard_halt_patches.is_empty()) + return; + + /* There is a somewhat strange undocumented requirement of using + * HALT, according to the simulator. If some channel has HALTed to + * a particular UIP, then by the end of the program, every channel + * must have HALTed to that UIP. Furthermore, the tracking is a + * stack, so you can't do the final halt of a UIP after starting + * halting to a new UIP. + * + * Symptoms of not emitting this instruction on actual hardware + * included GPU hangs and sparkly rendering on the piglit discard + * tests. + */ + struct brw_instruction *last_halt = gen6_HALT(p); + last_halt->bits3.break_cont.uip = 2; + last_halt->bits3.break_cont.jip = 2; + + int ip = p->nr_insn; + + foreach_list(node, &this->discard_halt_patches) { + ip_record *patch_ip = (ip_record *)node; + struct brw_instruction *patch = &p->store[patch_ip->ip]; + + assert(patch->header.opcode == BRW_OPCODE_HALT); + /* HALT takes a half-instruction distance from the pre-incremented IP. */ + patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2; + } + + this->discard_halt_patches.make_empty(); +} + +void fs_generator::generate_fb_write(fs_inst *inst) { bool eot = inst->eot; struct brw_reg implied_header; uint32_t msg_control; + /* Note that the jumps emitted to this point mean that the g0 -> + * base_mrf setup must be inside of this function, so that we jump + * to a point containing it. + */ + patch_discard_jumps_to_fb_writes(); + /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied * move, here's g1. */ @@ -525,6 +566,23 @@ fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src } void +fs_generator::generate_discard_jump(fs_inst *inst) +{ + assert(intel->gen >= 6); + + /* This HALT will be patched up at FB write time to point UIP at the end of + * the program, and at brw_uip_jip() JIP will be set to the end of the + * current block (or the program). + */ + this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn)); + + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + gen6_HALT(p); + brw_pop_insn_state(p); +} + +void fs_generator::generate_spill(fs_inst *inst, struct brw_reg src) { assert(inst->mlen != 0); @@ -1085,6 +1143,10 @@ fs_generator::generate_code(exec_list *instructions) generate_mov_dispatch_to_flags(inst); break; + case FS_OPCODE_DISCARD_JUMP: + generate_discard_jump(inst); + break; + case SHADER_OPCODE_SHADER_TIME_ADD: brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 98cd064aceb..56d43309f9d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1446,6 +1446,20 @@ fs_visitor::visit(ir_discard *ir) BRW_CONDITIONAL_NZ)); cmp->predicate = BRW_PREDICATE_NORMAL; cmp->flag_subreg = 1; + + if (intel->gen >= 6) { + /* For performance, after a discard, jump to the end of the shader. + * However, many people will do foliage by discarding based on a + * texture's alpha mask, and then continue on to texture with the + * remaining pixels. To avoid trashing the derivatives for those + * texture samples, we'll only jump if all of the pixels in the subspan + * have been discarded. + */ + fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP); + discard_jump->flag_subreg = 1; + discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; + discard_jump->predicate_inverse = true; + } } void |