aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp')
-rw-r--r-- src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp 253
1 files changed, 253 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
index 6b8f767a3c0..04cbd402a18 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_gm107.cpp
@@ -68,6 +68,259 @@ TargetGM107::isOpSupported(operation op, DataType ty) const
return true;
}
+// Return true when an instruction supports the reuse flag. When supported, the
+// hardware will use the operand reuse cache introduced with Maxwell, which
+// is meant to reduce bank conflicts by caching values for the subsequent
+// instructions. Note that the next instructions have to use the same GPR id in
+// the same operand slot.
+bool
+TargetGM107::isReuseSupported(const Instruction *insn) const
+{
+ const OpClass cl = getOpClass(insn->op); // the decision is keyed on the op class
+
+ // TODO: double-check!
+ switch (cl) {
+ case OPCLASS_ARITH:
+ case OPCLASS_COMPARE:
+ case OPCLASS_LOGIC:
+ case OPCLASS_MOVE:
+ case OPCLASS_SHIFT:
+ return true; // every op of these classes is reported as reuse-capable
+ case OPCLASS_BITFIELD:
+ if (insn->op == OP_INSBF || insn->op == OP_EXTBF) // only these two bitfield ops qualify
+ return true;
+ break;
+ default:
+ break;
+ }
+ return false; // remaining bitfield ops and all other classes: no reuse flag
+}
+
+// Return true when an instruction requires a barrier because it does not
+// operate at a fixed latency. Variable latency instructions are memory
+// operations, double precision operations, special function unit operations
+// and other low throughput instructions.
+bool
+TargetGM107::isBarrierRequired(const Instruction *insn) const
+{
+ const OpClass cl = getOpClass(insn->op);
+
+ if (insn->dType == TYPE_F64 || insn->sType == TYPE_F64) // checked before the op class
+ return true; // an F64 destination or source type is sufficient on its own
+
+ switch (cl) {
+ case OPCLASS_ATOMIC:
+ case OPCLASS_LOAD:
+ case OPCLASS_STORE:
+ case OPCLASS_SURFACE:
+ case OPCLASS_TEXTURE:
+ return true; // every op of these classes needs a barrier
+ case OPCLASS_SFU:
+ switch (insn->op) { // only the SFU ops listed here need a barrier
+ case OP_COS:
+ case OP_EX2:
+ case OP_LG2:
+ case OP_LINTERP:
+ case OP_PINTERP:
+ case OP_RCP:
+ case OP_RSQ:
+ case OP_SIN:
+ return true;
+ default:
+ break;
+ }
+ break;
+ case OPCLASS_BITFIELD:
+ switch (insn->op) { // only BFIND and POPCNT among the bitfield ops
+ case OP_BFIND:
+ case OP_POPCNT:
+ return true;
+ default:
+ break;
+ }
+ break;
+ case OPCLASS_CONTROL:
+ switch (insn->op) { // only EMIT and RESTART among the control ops
+ case OP_EMIT:
+ case OP_RESTART:
+ return true;
+ default:
+ break;
+ }
+ break;
+ case OPCLASS_OTHER:
+ switch (insn->op) { // only the ops listed here among the "other" class
+ case OP_AFETCH:
+ case OP_PFETCH:
+ case OP_PIXLD:
+ case OP_RDSV:
+ case OP_SHFL:
+ return true;
+ default:
+ break;
+ }
+ break;
+ case OPCLASS_ARITH:
+ // TODO: integer MUL/MAD (IMUL/IMAD) require barriers too; use XMAD instead!
+ if ((insn->op == OP_MUL || insn->op == OP_MAD) &&
+ !isFloatType(insn->dType)) // i.e. MUL/MAD whose destination type is not floating point
+ return true;
+ break;
+ case OPCLASS_CONVERT:
+ if (insn->def(0).getFile() != FILE_PREDICATE &&
+ insn->src(0).getFile() != FILE_PREDICATE) // neither def 0 nor src 0 is a predicate
+ return true;
+ break;
+ default:
+ break;
+ }
+ return false; // anything not matched above is treated as fixed latency
+}
+
+bool
+TargetGM107::canDualIssue(const Instruction *a, const Instruction *b) const
+{
+ // TODO: not implemented yet; a and b are ignored and dual issue is never reported.
+ return false;
+}
+
+// Return the number of stall counts needed to complete a single instruction.
+// On Maxwell GPUs, the pipeline depth is 6, but some instructions, such as
+// memory operations, require a different number of stall counts.
+int
+TargetGM107::getLatency(const Instruction *insn) const
+{
+ // TODO: better values! This should be good enough for now though.
+ switch (insn->op) {
+ case OP_EMIT:
+ case OP_EXPORT:
+ case OP_PIXLD:
+ case OP_RESTART:
+ case OP_STORE:
+ case OP_SUSTB:
+ case OP_SUSTP:
+ return 1;
+ case OP_SHFL:
+ return 2;
+ case OP_ADD:
+ case OP_AND:
+ case OP_EXTBF:
+ case OP_FMA:
+ case OP_INSBF:
+ case OP_MAD:
+ case OP_MAX:
+ case OP_MIN:
+ case OP_MOV:
+ case OP_MUL:
+ case OP_NOT:
+ case OP_OR:
+ case OP_PREEX2:
+ case OP_PRESIN:
+ case OP_QUADOP:
+ case OP_SELP:
+ case OP_SET:
+ case OP_SET_AND:
+ case OP_SET_OR:
+ case OP_SET_XOR:
+ case OP_SHL:
+ case OP_SHLADD:
+ case OP_SHR:
+ case OP_SLCT:
+ case OP_SUB:
+ case OP_VOTE:
+ case OP_XOR:
+ if (insn->dType != TYPE_F64) // only the destination type is tested here
+ return 6;
+ break; // F64 destination: falls through to the maximum below
+ case OP_ABS:
+ case OP_CEIL:
+ case OP_CVT:
+ case OP_FLOOR:
+ case OP_NEG:
+ case OP_SAT:
+ case OP_TRUNC:
+ if (insn->op == OP_CVT && (insn->def(0).getFile() == FILE_PREDICATE ||
+ insn->src(0).getFile() == FILE_PREDICATE))
+ return 6; // only a CVT with a predicate as def 0 or src 0 gets 6
+ break; // the other ops of this group, and non-predicate CVT, get the maximum below
+ case OP_BFIND:
+ case OP_COS:
+ case OP_EX2:
+ case OP_LG2:
+ case OP_POPCNT:
+ case OP_QUADON:
+ case OP_QUADPOP:
+ case OP_RCP:
+ case OP_RSQ:
+ case OP_SIN:
+ return 13;
+ default:
+ break;
+ }
+ // Use the maximum number of stall counts for other instructions.
+ return 15;
+}
+
+// Return the operand read latency which is the number of stall counts before
+// an instruction can read its sources. ATOM, LOAD and STORE only get a non-zero
+// value when source 0 is indirect (isIndirect(0)); it then depends on its file.
+int
+TargetGM107::getReadLatency(const Instruction *insn) const
+{
+ switch (insn->op) {
+ case OP_ABS:
+ case OP_BFIND:
+ case OP_CEIL:
+ case OP_COS:
+ case OP_EX2:
+ case OP_FLOOR:
+ case OP_LG2:
+ case OP_NEG:
+ case OP_POPCNT:
+ case OP_RCP:
+ case OP_RSQ:
+ case OP_SAT:
+ case OP_SIN:
+ case OP_SULDB:
+ case OP_SULDP:
+ case OP_SUREDB:
+ case OP_SUREDP:
+ case OP_SUSTB:
+ case OP_SUSTP:
+ case OP_TRUNC:
+ return 4;
+ case OP_CVT:
+ if (insn->def(0).getFile() != FILE_PREDICATE &&
+ insn->src(0).getFile() != FILE_PREDICATE) // neither def 0 nor src 0 is a predicate
+ return 4;
+ break; // CVT involving a predicate: falls through to 0
+ case OP_ATOM:
+ case OP_LOAD:
+ case OP_STORE:
+ if (insn->src(0).isIndirect(0)) { // direct accesses skip this and return 0
+ switch (insn->src(0).getFile()) {
+ case FILE_MEMORY_SHARED:
+ case FILE_MEMORY_CONST:
+ return 2;
+ case FILE_MEMORY_GLOBAL:
+ case FILE_MEMORY_LOCAL:
+ return 4;
+ default:
+ break; // any other file: falls through to 0
+ }
+ }
+ break;
+ case OP_EXPORT:
+ case OP_PFETCH:
+ case OP_SHFL:
+ case OP_VFETCH:
+ return 2;
+ default:
+ break;
+ }
+ return 0; // no read latency for everything not matched above
+}
+
bool
TargetGM107::runLegalizePass(Program *prog, CGStage stage) const
{