diff options
Diffstat (limited to 'src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp')
-rw-r--r-- | src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp | 253 |
1 files changed, 253 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp index 6b8f767a3c0..04cbd402a18 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp @@ -68,6 +68,259 @@ TargetGM107::isOpSupported(operation op, DataType ty) const return true; } +// Return true when an instruction supports the reuse flag. When supported, the +// hardware will use the operand reuse cache introduced since Maxwell, which +// should try to reduce bank conflicts by caching values for the subsequent +// instructions. Note that the next instructions have to use the same GPR id in +// the same operand slot. +bool +TargetGM107::isReuseSupported(const Instruction *insn) const +{ + const OpClass cl = getOpClass(insn->op); + + // TODO: double-check! + switch (cl) { + case OPCLASS_ARITH: + case OPCLASS_COMPARE: + case OPCLASS_LOGIC: + case OPCLASS_MOVE: + case OPCLASS_SHIFT: + return true; + case OPCLASS_BITFIELD: + if (insn->op == OP_INSBF || insn->op == OP_EXTBF) + return true; + break; + default: + break; + } + return false; +} + +// Return true when an instruction requires to set up a barrier because it +// doesn't operate at a fixed latency. Variable latency instructions are memory +// operations, double precision operations, special function unit operations +// and other low throughput instructions. +bool +TargetGM107::isBarrierRequired(const Instruction *insn) const +{ + const OpClass cl = getOpClass(insn->op); + + if (insn->dType == TYPE_F64 || insn->sType == TYPE_F64) + return true; + + switch (cl) { + case OPCLASS_ATOMIC: + case OPCLASS_LOAD: + case OPCLASS_STORE: + case OPCLASS_SURFACE: + case OPCLASS_TEXTURE: + return true; + case OPCLASS_SFU: + switch (insn->op) { + case OP_COS: + case OP_EX2: + case OP_LG2: + case OP_LINTERP: + case OP_PINTERP: + case OP_RCP: + case OP_RSQ: + case OP_SIN: + return true; + default: + break; + } + break; + case OPCLASS_BITFIELD: + switch (insn->op) { + case OP_BFIND: + case OP_POPCNT: + return true; + default: + break; + } + break; + case OPCLASS_CONTROL: + switch (insn->op) { + case OP_EMIT: + case OP_RESTART: + return true; + default: + break; + } + break; + case OPCLASS_OTHER: + switch (insn->op) { + case OP_AFETCH: + case OP_PFETCH: + case OP_PIXLD: + case OP_RDSV: + case OP_SHFL: + return true; + default: + break; + } + break; + case OPCLASS_ARITH: + // TODO: IMUL/IMAD require barriers too, use of XMAD instead! + if ((insn->op == OP_MUL || insn->op == OP_MAD) && + !isFloatType(insn->dType)) + return true; + break; + case OPCLASS_CONVERT: + if (insn->def(0).getFile() != FILE_PREDICATE && + insn->src(0).getFile() != FILE_PREDICATE) + return true; + break; + default: + break; + } + return false; +} + +bool +TargetGM107::canDualIssue(const Instruction *a, const Instruction *b) const +{ + // TODO + return false; +} + +// Return the number of stall counts needed to complete a single instruction. +// On Maxwell GPUs, the pipeline depth is 6, but some instructions require +// different number of stall counts like memory operations. +int +TargetGM107::getLatency(const Instruction *insn) const +{ + // TODO: better values! This should be good enough for now though. + switch (insn->op) { + case OP_EMIT: + case OP_EXPORT: + case OP_PIXLD: + case OP_RESTART: + case OP_STORE: + case OP_SUSTB: + case OP_SUSTP: + return 1; + case OP_SHFL: + return 2; + case OP_ADD: + case OP_AND: + case OP_EXTBF: + case OP_FMA: + case OP_INSBF: + case OP_MAD: + case OP_MAX: + case OP_MIN: + case OP_MOV: + case OP_MUL: + case OP_NOT: + case OP_OR: + case OP_PREEX2: + case OP_PRESIN: + case OP_QUADOP: + case OP_SELP: + case OP_SET: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + case OP_SHL: + case OP_SHLADD: + case OP_SHR: + case OP_SLCT: + case OP_SUB: + case OP_VOTE: + case OP_XOR: + if (insn->dType != TYPE_F64) + return 6; + break; + case OP_ABS: + case OP_CEIL: + case OP_CVT: + case OP_FLOOR: + case OP_NEG: + case OP_SAT: + case OP_TRUNC: + if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE || + insn->src(0).getFile() == FILE_PREDICATE)) + return 6; + break; + case OP_BFIND: + case OP_COS: + case OP_EX2: + case OP_LG2: + case OP_POPCNT: + case OP_QUADON: + case OP_QUADPOP: + case OP_RCP: + case OP_RSQ: + case OP_SIN: + return 13; + default: + break; + } + // Use the maximum number of stall counts for other instructions. + return 15; +} + +// Return the operand read latency which is the number of stall counts before +// an instruction can read its sources. For memory operations like ATOM, LOAD +// and STORE, the memory access has to be indirect. +int +TargetGM107::getReadLatency(const Instruction *insn) const +{ + switch (insn->op) { + case OP_ABS: + case OP_BFIND: + case OP_CEIL: + case OP_COS: + case OP_EX2: + case OP_FLOOR: + case OP_LG2: + case OP_NEG: + case OP_POPCNT: + case OP_RCP: + case OP_RSQ: + case OP_SAT: + case OP_SIN: + case OP_SULDB: + case OP_SULDP: + case OP_SUREDB: + case OP_SUREDP: + case OP_SUSTB: + case OP_SUSTP: + case OP_TRUNC: + return 4; + case OP_CVT: + if (insn->def(0).getFile() != FILE_PREDICATE && + insn->src(0).getFile() != FILE_PREDICATE) + return 4; + break; + case OP_ATOM: + case OP_LOAD: + case OP_STORE: + if (insn->src(0).isIndirect(0)) { + switch (insn->src(0).getFile()) { + case FILE_MEMORY_SHARED: + case FILE_MEMORY_CONST: + return 2; + case FILE_MEMORY_GLOBAL: + case FILE_MEMORY_LOCAL: + return 4; + default: + break; + } + } + break; + case OP_EXPORT: + case OP_PFETCH: + case OP_SHFL: + case OP_VFETCH: + return 2; + default: + break; + } + return 0; +} + bool TargetGM107::runLegalizePass(Program *prog, CGStage stage) const { |