nv50/ir: do not perform global membar for shared memory

Shared memory is local to a CTA, so we only need to wait for prior memory writes to become visible to other threads in the same CTA, not at the global level. This should speed up compute shaders which use shared memory.

Signed-off-by: Samuel Pitoiset <[email protected]>
Reviewed-by: Ilia Mirkin <[email protected]>
author: Samuel Pitoiset <[email protected]> 2016-10-24 21:41:11 +0200
committer: Samuel Pitoiset <[email protected]> 2016-10-24 22:51:54 +0200
commit: 6dbb8d12a8b78769b9803884fad5f0d9923023bc (patch)
tree: 3716f196fa05256bdbb652d641fc76cec6545a8e /src
parent: eed605a473554575305e1bf10c3641761a85feb9 (diff)
1 files changed, 4 insertions, 1 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index b47fc497c87..91cef81aa0d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -3561,12 +3561,15 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn)
       geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
       break;
    case TGSI_OPCODE_MEMBAR:
+   {
+      uint32_t level = tgsi.getSrc(0).getValueU32(0, info);
       geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
       geni->fixed = 1;
-      if (tgsi.getSrc(0).getValueU32(0, info) & TGSI_MEMBAR_THREAD_GROUP)
+      if (!(level & ~(TGSI_MEMBAR_THREAD_GROUP | TGSI_MEMBAR_SHARED)))
          geni->subOp = NV50_IR_SUBOP_MEMBAR(M, CTA);
       else
          geni->subOp = NV50_IR_SUBOP_MEMBAR(M, GL);
+   }
       break;
    case TGSI_OPCODE_ATOMUADD:
    case TGSI_OPCODE_ATOMXCHG:
author	Samuel Pitoiset <[email protected]>	2016-10-24 21:41:11 +0200
committer	Samuel Pitoiset <[email protected]>	2016-10-24 22:51:54 +0200
commit	6dbb8d12a8b78769b9803884fad5f0d9923023bc (patch)
tree	3716f196fa05256bdbb652d641fc76cec6545a8e /src
parent	eed605a473554575305e1bf10c3641761a85feb9 (diff)