diff options
author | Tom Stellard <[email protected]> | 2011-05-12 23:13:21 -0700 |
---|---|---|
committer | Tom Stellard <[email protected]> | 2011-10-14 18:30:14 -0700 |
commit | 163629fd05166b78d70c2c26f4a922b296e8999d (patch) | |
tree | 81c7020a49c5c1a7f8ba7c5e91b2a8321e7cbf5a /src/gallium/drivers/r300/compiler | |
parent | 51fe9994bdec1625a27e12216d492916e64ab622 (diff) |
r300/compiler: Implement the texture semaphore
The texture semaphore allows for prefetching of texture data. On my
RV515, this increases the FPS of Lightsmark by 33% (This is with the
reg_rename pass enabled, which is enabled in the next commit).
There is a new env variable now called RADEON_TEX_GROUP, which allows
you to specify the maximum number of texture lookups to do at once.
The default is 8, but different values could produce better results
for various application / card combinations.
Diffstat (limited to 'src/gallium/drivers/r300/compiler')
5 files changed, 304 insertions, 47 deletions
diff --git a/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c b/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c index 1ceef3beeef..bfd4aa82b95 100644 --- a/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c +++ b/src/gallium/drivers/r300/compiler/r500_fragprog_emit.c @@ -262,7 +262,7 @@ static void emit_paired(struct r300_fragment_program_compiler *c, struct rc_pair } else { code->inst[ip].inst0 = R500_INST_TYPE_ALU; } - code->inst[ip].inst0 |= R500_INST_TEX_SEM_WAIT; + code->inst[ip].inst0 |= (inst->SemWait << R500_INST_TEX_SEM_WAIT_SHIFT); code->inst[ip].inst0 |= (inst->RGB.WriteMask << 11); code->inst[ip].inst0 |= inst->Alpha.WriteMask ? 1 << 14 : 0; @@ -380,9 +380,9 @@ static int emit_tex(struct r300_fragment_program_compiler *c, struct rc_sub_inst code->inst[ip].inst0 = R500_INST_TYPE_TEX | (inst->DstReg.WriteMask << 11) - | R500_INST_TEX_SEM_WAIT; + | (inst->TexSemWait << R500_INST_TEX_SEM_WAIT_SHIFT); code->inst[ip].inst1 = R500_TEX_ID(inst->TexSrcUnit) - | R500_TEX_SEM_ACQUIRE; + | (inst->TexSemAcquire << R500_TEX_SEM_ACQUIRE_SHIFT); if (inst->TexSrcTarget == RC_TEXTURE_RECT) code->inst[ip].inst1 |= R500_TEX_UNSCALED; @@ -650,6 +650,9 @@ void r500BuildFragmentProgramHwCode(struct radeon_compiler *c, void *user) } } + /* Make sure TEX_SEM_WAIT is set on the last instruction */ + code->inst[code->inst_end].inst0 |= R500_INST_TEX_SEM_WAIT; + if (code->max_temp_idx >= compiler->Base.max_temp_regs) rc_error(&compiler->Base, "Too many hardware temporaries used"); diff --git a/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c b/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c index ee868ccba3a..e03afb4c82f 100644 --- a/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c +++ b/src/gallium/drivers/r300/compiler/radeon_pair_schedule.c @@ -32,7 +32,10 @@ #include "radeon_compiler.h" #include "radeon_compiler_util.h" #include "radeon_dataflow.h" +#include "radeon_list.h" +#include "radeon_variable.h" +#include "util/u_debug.h" #define VERBOSE 0 @@ -65,6 +68,17 @@ struct schedule_instruction { * PairedInst references the alpha insturction's dependency information. */ struct schedule_instruction * PairedInst; + + /** This scheduler uses the value of Score to determine which + * instruction to schedule. Instructions with a higher value of Score + * will be scheduled first. */ + int Score; + + /** The number of components that read from a TEX instruction. */ + unsigned TexReadCount; + + /** For TEX instructions a list of readers */ + struct rc_list * TexReaders; }; @@ -131,6 +145,12 @@ struct schedule_state { struct schedule_instruction *ReadyAlpha; struct schedule_instruction *ReadyTEX; /*@}*/ + struct rc_list *PendingTEX; + + void (*CalcScore)(struct schedule_instruction *); + long max_tex_group; + unsigned PrevBlockHasTex:1; + unsigned TEXCount; }; static struct reg_value ** get_reg_valuep(struct schedule_state * s, @@ -147,6 +167,29 @@ static struct reg_value ** get_reg_valuep(struct schedule_state * s, return &s->Temporary[index].Values[chan]; } +static unsigned get_tex_read_count(struct schedule_instruction * sinst) +{ + unsigned tex_read_count = sinst->TexReadCount; + if (sinst->PairedInst) { + tex_read_count += sinst->PairedInst->TexReadCount; + } + return tex_read_count; +} + +#if VERBOSE +static void print_list(struct schedule_instruction * sinst) +{ + struct schedule_instruction * ptr; + for (ptr = sinst; ptr; ptr=ptr->NextReady) { + unsigned tex_read_count = get_tex_read_count(ptr); + unsigned score = sinst->Score; + fprintf(stderr,"%u (%d) [%u],", ptr->Instruction->IP, score, + tex_read_count); + } + fprintf(stderr, "\n"); +} +#endif + static void remove_inst_from_list(struct schedule_instruction ** list, struct schedule_instruction * inst) { @@ -172,17 +215,28 @@ static void add_inst_to_list(struct schedule_instruction ** list, struct schedul *list = inst; } -static void add_inst_to_list_end(struct schedule_instruction ** list, +static void add_inst_to_list_score(struct schedule_instruction ** list, struct schedule_instruction * inst) { - if(!*list){ + struct schedule_instruction * temp; + struct schedule_instruction * prev; + if (!*list) { *list = inst; - }else{ - struct schedule_instruction * temp = *list; - while(temp->NextReady){ - temp = temp->NextReady; - } - temp->NextReady = inst; + return; + } + temp = *list; + prev = NULL; + while(temp && inst->Score <= temp->Score) { + prev = temp; + temp = temp->NextReady; + } + + if (!prev) { + inst->NextReady = temp; + *list = inst; + } else { + prev->NextReady = inst; + inst->NextReady = temp; } } @@ -193,13 +247,13 @@ static void instruction_ready(struct schedule_state * s, struct schedule_instruc /* Adding Ready TEX instructions to the end of the "Ready List" helps * us emit TEX instructions in blocks without losing our place. */ if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) - add_inst_to_list_end(&s->ReadyTEX, sinst); + add_inst_to_list_score(&s->ReadyTEX, sinst); else if (sinst->Instruction->U.P.Alpha.Opcode == RC_OPCODE_NOP) - add_inst_to_list(&s->ReadyRGB, sinst); + add_inst_to_list_score(&s->ReadyRGB, sinst); else if (sinst->Instruction->U.P.RGB.Opcode == RC_OPCODE_NOP) - add_inst_to_list(&s->ReadyAlpha, sinst); + add_inst_to_list_score(&s->ReadyAlpha, sinst); else - add_inst_to_list(&s->ReadyFullALU, sinst); + add_inst_to_list_score(&s->ReadyFullALU, sinst); } static void decrease_dependencies(struct schedule_state * s, struct schedule_instruction * sinst) @@ -210,6 +264,68 @@ static void decrease_dependencies(struct schedule_state * s, struct schedule_ins instruction_ready(s, sinst); } +/* These functions provide different heuristics for scheduling instructions. + * The default is calc_score_readers. */ + +#if 0 + +static void calc_score_zero(struct schedule_instruction * sinst) +{ + sinst->Score = 0; +} + +static void calc_score_deps(struct schedule_instruction * sinst) +{ + int i; + sinst->Score = 0; + for (i = 0; i < sinst->NumWriteValues; i++) { + struct reg_value * v = sinst->WriteValues[i]; + if (v->NumReaders) { + struct reg_value_reader * r; + for (r = v->Readers; r; r = r->Next) { + if (r->Reader->NumDependencies == 1) { + sinst->Score += 100; + } + sinst->Score += r->Reader->NumDependencies; + } + } + } +} + +#endif + +#define NO_READ_TEX_SCORE (1 << 16) +#define NO_OUTPUT_SCORE (1 << 24) + +static void calc_score_readers(struct schedule_instruction * sinst) +{ + if (sinst->Instruction->Type == RC_INSTRUCTION_NORMAL) { + sinst->Score = 0; + } else { + sinst->Score = sinst->NumReadValues; + if (sinst->PairedInst) { + sinst->Score += sinst->PairedInst->NumReadValues; + } + if (get_tex_read_count(sinst) == 0) { + sinst->Score |= NO_READ_TEX_SCORE; + } + if (!sinst->Instruction->U.P.RGB.OutputWriteMask && + !sinst->Instruction->U.P.Alpha.OutputWriteMask) { + if (sinst->PairedInst) { + if (!sinst->PairedInst->Instruction->U.P. + RGB.OutputWriteMask + && !sinst->PairedInst->Instruction->U.P. + Alpha.OutputWriteMask) { + sinst->Score |= NO_OUTPUT_SCORE; + } + + } else { + sinst->Score |= NO_OUTPUT_SCORE; + } + } + } +} + /** * This function decreases the dependencies of the next instruction that * wants to write to each of sinst's read values. @@ -222,8 +338,9 @@ static void commit_update_reads(struct schedule_state * s, assert(v->NumReaders > 0); v->NumReaders--; if (!v->NumReaders) { - if (v->Next) + if (v->Next) { decrease_dependencies(s, v->Next->Writer); + } } } if (sinst->PairedInst) { @@ -256,13 +373,33 @@ static void commit_update_writes(struct schedule_state * s, } } +static void notify_sem_wait(struct schedule_state *s) +{ + struct rc_list * pend_ptr; + for (pend_ptr = s->PendingTEX; pend_ptr; pend_ptr = pend_ptr->Next) { + struct rc_list * read_ptr; + struct schedule_instruction * pending = pend_ptr->Item; + for (read_ptr = pending->TexReaders; read_ptr; + read_ptr = read_ptr->Next) { + struct schedule_instruction * reader = read_ptr->Item; + reader->TexReadCount--; + } + } + s->PendingTEX = NULL; +} + static void commit_alu_instruction(struct schedule_state * s, struct schedule_instruction * sinst) { - DBG("%i: commit\n", sinst->Instruction->IP); + DBG("%i: commit score = %d\n", sinst->Instruction->IP, sinst->Score); commit_update_reads(s, sinst); commit_update_writes(s, sinst); + + if (get_tex_read_count(sinst) > 0) { + sinst->Instruction->U.P.SemWait = 1; + notify_sem_wait(s); + } } /** @@ -277,6 +414,7 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo struct rc_instruction * inst_begin; assert(s->ReadyTEX); + notify_sem_wait(s); /* Node marker for R300 */ inst_begin = rc_insert_new_instruction(s->C, before->Prev); @@ -308,6 +446,12 @@ static void emit_all_tex(struct schedule_state * s, struct rc_instruction * befo while(readytex){ DBG("%i: commit TEX writes\n", readytex->Instruction->IP); commit_update_writes(s, readytex); + /* Set semaphore bits for last TEX instruction in the block */ + if (!readytex->NextReady) { + readytex->Instruction->U.I.TexSemAcquire = 1; + readytex->Instruction->U.I.TexSemWait = 1; + } + rc_list_add(&s->PendingTEX, rc_list(&s->C->Pool, readytex)); readytex = readytex->NextReady; } } @@ -491,6 +635,9 @@ static int destructive_merge_instructions( rgb->ALUResultCompare = alpha->ALUResultCompare; } + /* Copy SemWait */ + rgb->SemWait |= alpha->SemWait; + return 1; } @@ -824,14 +971,14 @@ static void pair_instructions(struct schedule_state * s) && convert_rgb_to_alpha(s, rgb_ptr)) { struct schedule_instruction * pair_ptr; + remove_inst_from_list(&s->ReadyRGB, rgb_ptr); + add_inst_to_list_score(&s->ReadyAlpha, rgb_ptr); + for (pair_ptr = s->ReadyRGB; pair_ptr; pair_ptr = pair_ptr->NextReady) { - if (pair_ptr == rgb_ptr) { - continue; - } if (merge_instructions(&pair_ptr->Instruction->U.P, &rgb_ptr->Instruction->U.P)) { - remove_inst_from_list(&s->ReadyRGB, rgb_ptr); + remove_inst_from_list(&s->ReadyAlpha, rgb_ptr); remove_inst_from_list(&s->ReadyRGB, pair_ptr); pair_ptr->PairedInst = rgb_ptr; @@ -849,6 +996,68 @@ static void pair_instructions(struct schedule_state * s) } } +static void update_max_score( + struct schedule_state * s, + struct schedule_instruction ** list, + int * max_score, + struct schedule_instruction ** max_inst_out, + struct schedule_instruction *** list_out) +{ + struct schedule_instruction * list_ptr; + for (list_ptr = *list; list_ptr; list_ptr = list_ptr->NextReady) { + int score; + s->CalcScore(list_ptr); + score = list_ptr->Score; + if (!*max_inst_out || score > *max_score) { + *max_score = score; + *max_inst_out = list_ptr; + *list_out = list; + } + } +} + +static void emit_instruction( + struct schedule_state * s, + struct rc_instruction * before) +{ + int max_score = -1; + struct schedule_instruction * max_inst = NULL; + struct schedule_instruction ** max_list = NULL; + unsigned tex_count = 0; + struct schedule_instruction * tex_ptr; + + pair_instructions(s); +#if VERBOSE + fprintf(stderr, "Full:\n"); + print_list(s->ReadyFullALU); + fprintf(stderr, "RGB:\n"); + print_list(s->ReadyRGB); + fprintf(stderr, "Alpha:\n"); + print_list(s->ReadyAlpha); + fprintf(stderr, "TEX:\n"); + print_list(s->ReadyTEX); +#endif + + for (tex_ptr = s->ReadyTEX; tex_ptr; tex_ptr = tex_ptr->NextReady) { + tex_count++; + } + update_max_score(s, &s->ReadyFullALU, &max_score, &max_inst, &max_list); + update_max_score(s, &s->ReadyRGB, &max_score, &max_inst, &max_list); + update_max_score(s, &s->ReadyAlpha, &max_score, &max_inst, &max_list); + + if (tex_count >= s->max_tex_group || max_score == -1 + || (s->TEXCount > 0 && tex_count == s->TEXCount)) { + emit_all_tex(s, before); + } else { + + + remove_inst_from_list(max_list, max_inst); + rc_insert_instruction(before->Prev, max_inst->Instruction); + commit_alu_instruction(s, max_inst); + + presub_nop(before->Prev); + } +} /** * Find a good ALU instruction or pair of ALU instruction and emit it. @@ -860,6 +1069,7 @@ static void pair_instructions(struct schedule_state * s) static void emit_one_alu(struct schedule_state *s, struct rc_instruction * before) { struct schedule_instruction * sinst; + int rgb_score = -1, alpha_score = -1; /* Try to merge RGB and Alpha instructions together. */ pair_instructions(s); @@ -871,6 +1081,12 @@ static void emit_one_alu(struct schedule_state *s, struct rc_instruction * befor commit_alu_instruction(s, sinst); } else { if (s->ReadyRGB) { + rgb_score = s->ReadyRGB->Score; + } + if (s->ReadyAlpha) { + alpha_score = s->ReadyAlpha->Score; + } + if (rgb_score > alpha_score) { sinst = s->ReadyRGB; s->ReadyRGB = s->ReadyRGB->NextReady; } else if (s->ReadyAlpha) { @@ -924,6 +1140,12 @@ static void scan_read(void * data, struct rc_instruction * inst, /* Only update the current instruction's dependencies if the * register it reads from has been written to in this block. */ if ((*v)->Writer) { + if ((*v)->Writer->Instruction->Type == + RC_INSTRUCTION_NORMAL) { + s->Current->TexReadCount++; + rc_list_add(&((*v)->Writer->TexReaders), + rc_list(&s->C->Pool, s->Current)); + } s->Current->NumDependencies++; } } @@ -977,22 +1199,33 @@ static void is_rgb_to_alpha_possible_normal( } -static void schedule_block(struct r300_fragment_program_compiler * c, +static void schedule_block(struct schedule_state * s, struct rc_instruction * begin, struct rc_instruction * end) { - struct schedule_state s; unsigned int ip; - memset(&s, 0, sizeof(s)); - s.C = &c->Base; - /* Scan instructions for data dependencies */ ip = 0; for(struct rc_instruction * inst = begin; inst != end; inst = inst->Next) { - s.Current = memory_pool_malloc(&c->Base.Pool, sizeof(*s.Current)); - memset(s.Current, 0, sizeof(struct schedule_instruction)); + s->Current = memory_pool_malloc(&s->C->Pool, sizeof(*s->Current)); + memset(s->Current, 0, sizeof(struct schedule_instruction)); + + if (inst->Type == RC_INSTRUCTION_NORMAL) { + const struct rc_opcode_info * info = + rc_get_opcode_info(inst->U.I.Opcode); + if (info->HasTexture) { + s->TEXCount++; + } + } - s.Current->Instruction = inst; + /* XXX: This causes SemWait to be set for all instructions in + * a block if the previous block contained a TEX instruction. + * We can do better here, but it will take a lot of work. */ + if (s->PrevBlockHasTex) { + s->Current->TexReadCount = 1; + } + + s->Current->Instruction = inst; inst->IP = ip++; DBG("%i: Scanning\n", inst->IP); @@ -1001,17 +1234,18 @@ static void schedule_block(struct r300_fragment_program_compiler * c, * counter-intuitive, to account for the case where an * instruction writes to the same register as it reads * from. */ - rc_for_all_writes_chan(inst, &scan_write, &s); - rc_for_all_reads_chan(inst, &scan_read, &s); + rc_for_all_writes_chan(inst, &scan_write, s); + rc_for_all_reads_chan(inst, &scan_read, s); - DBG("%i: Has %i dependencies\n", inst->IP, s.Current->NumDependencies); + DBG("%i: Has %i dependencies\n", inst->IP, s->Current->NumDependencies); - if (!s.Current->NumDependencies) - instruction_ready(&s, s.Current); + if (!s->Current->NumDependencies) { + instruction_ready(s, s->Current); + } /* Get global readers for possible RGB->Alpha conversion. */ - s.Current->GlobalReaders.ExitOnAbort = 1; - rc_get_readers(s.C, inst, &s.Current->GlobalReaders, + s->Current->GlobalReaders.ExitOnAbort = 1; + rc_get_readers(s->C, inst, &s->Current->GlobalReaders, is_rgb_to_alpha_possible_normal, is_rgb_to_alpha_possible, NULL); } @@ -1021,13 +1255,17 @@ static void schedule_block(struct r300_fragment_program_compiler * c, end->Prev = begin->Prev; /* Schedule instructions back */ - while(!s.C->Error && - (s.ReadyTEX || s.ReadyRGB || s.ReadyAlpha || s.ReadyFullALU)) { - if (s.ReadyTEX) - emit_all_tex(&s, end); + while(!s->C->Error && + (s->ReadyTEX || s->ReadyRGB || s->ReadyAlpha || s->ReadyFullALU)) { + if (s->C->is_r500) { + emit_instruction(s, end); + } else { + if (s->ReadyTEX) + emit_all_tex(s, end); - while(!s.C->Error && (s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)) - emit_one_alu(&s, end); + while(!s->C->Error && (s->ReadyFullALU || s->ReadyRGB || s->ReadyAlpha)) + emit_one_alu(s, end); + } } } @@ -1042,13 +1280,14 @@ static int is_controlflow(struct rc_instruction * inst) void rc_pair_schedule(struct radeon_compiler *cc, void *user) { - struct schedule_state s; - struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)cc; + struct schedule_state s; struct rc_instruction * inst = c->Base.Program.Instructions.Next; memset(&s, 0, sizeof(s)); s.C = &c->Base; + s.CalcScore = calc_score_readers; + s.max_tex_group = debug_get_num_option("RADEON_TEX_GROUP", 8); while(inst != &c->Base.Program.Instructions) { struct rc_instruction * first; @@ -1063,6 +1302,11 @@ void rc_pair_schedule(struct radeon_compiler *cc, void *user) inst = inst->Next; DBG("Schedule one block\n"); - schedule_block(c, first, inst); + memset(s.Temporary, 0, sizeof(s.Temporary)); + s.TEXCount = 0; + schedule_block(&s, first, inst); + if (s.PendingTEX) { + s.PrevBlockHasTex = 1; + } } } diff --git a/src/gallium/drivers/r300/compiler/radeon_program.h b/src/gallium/drivers/r300/compiler/radeon_program.h index 98e235310ba..e68be935de4 100644 --- a/src/gallium/drivers/r300/compiler/radeon_program.h +++ b/src/gallium/drivers/r300/compiler/radeon_program.h @@ -109,6 +109,10 @@ struct rc_sub_instruction { /** True if tex instruction should do shadow comparison */ unsigned int TexShadow:1; + /**/ + unsigned int TexSemWait:1; + unsigned int TexSemAcquire:1; + /**R500 Only. How to swizzle the result of a TEX lookup*/ unsigned int TexSwizzle:12; /*@}*/ diff --git a/src/gallium/drivers/r300/compiler/radeon_program_pair.h b/src/gallium/drivers/r300/compiler/radeon_program_pair.h index 0e380e76807..b6eb0ebe47d 100644 --- a/src/gallium/drivers/r300/compiler/radeon_program_pair.h +++ b/src/gallium/drivers/r300/compiler/radeon_program_pair.h @@ -89,6 +89,7 @@ struct rc_pair_instruction { unsigned int WriteALUResult:2; unsigned int ALUResultCompare:3; unsigned int Nop:1; + unsigned int SemWait:1; }; typedef void (*rc_pair_foreach_src_fn) diff --git a/src/gallium/drivers/r300/compiler/radeon_program_print.c b/src/gallium/drivers/r300/compiler/radeon_program_print.c index 38713a9e99b..dc40d7fd4f2 100644 --- a/src/gallium/drivers/r300/compiler/radeon_program_print.c +++ b/src/gallium/drivers/r300/compiler/radeon_program_print.c @@ -293,10 +293,12 @@ static void rc_print_normal_instruction(FILE * f, struct rc_instruction * inst, } if (opcode->HasTexture) { - fprintf(f, ", %s%s[%u]", + fprintf(f, ", %s%s[%u]%s%s", textarget_to_string(inst->U.I.TexSrcTarget), inst->U.I.TexShadow ? "SHADOW" : "", - inst->U.I.TexSrcUnit); + inst->U.I.TexSrcUnit, + inst->U.I.TexSemWait ? " SEM_WAIT" : "", + inst->U.I.TexSemAcquire ? " SEM_ACQUIRE" : ""); } fprintf(f, ";"); @@ -348,6 +350,9 @@ static void rc_print_pair_instruction(FILE * f, struct rc_instruction * fullinst presubtract_op_to_string( inst->Alpha.Src[RC_PAIR_PRESUB_SRC].Index)); } + if (inst->SemWait) { + fprintf(f, " SEM_WAIT"); + } fprintf(f, "\n"); if (inst->RGB.Opcode != RC_OPCODE_NOP) { |