30 files changed, 2026 insertions, 55 deletions
diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 83f81135590..31a93659647 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -64,6 +64,8 @@ NV50_C_SOURCES := \
 	nv50/nv50_3ddefs.xml.h \
 	nv50/nv50_3d.xml.h \
 	nv50/nv50_blit.h \
+	nv50/nv50_compute.c \
+	nv50/nv50_compute.xml.h \
 	nv50/nv50_context.c \
 	nv50/nv50_context.h \
 	nv50/nv50_defs.xml.h \
@@ -76,6 +78,10 @@ NV50_C_SOURCES := \
 	nv50/nv50_query.h \
 	nv50/nv50_query_hw.c \
 	nv50/nv50_query_hw.h \
+	nv50/nv50_query_hw_metric.c \
+	nv50/nv50_query_hw_metric.h \
+	nv50/nv50_query_hw_sm.c \
+	nv50/nv50_query_hw_sm.h \
 	nv50/nv50_resource.c \
 	nv50/nv50_resource.h \
 	nv50/nv50_screen.c \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 2a13e1086a0..9f84de03a4a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -2357,6 +2357,9 @@ CodeEmitterNVC0::emitInstruction(Instruction *insn)
    case OP_PFETCH:
       emitPFETCH(insn);
       break;
+   case OP_AFETCH:
+      emitAFETCH(insn);
+      break;
    case OP_EMIT:
    case OP_RESTART:
       emitOUT(insn);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 7859c8e79bd..41d2cc9167c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1573,10 +1573,28 @@ SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
 
    Instruction *st;
    if (slot->reg.file == FILE_MEMORY_LOCAL) {
-      st = new_Instruction(func, OP_STORE, ty);
-      st->setSrc(0, slot);
-      st->setSrc(1, lval);
       lval->noSpill = 1;
+      if (ty != TYPE_B96) {
+         st = new_Instruction(func, OP_STORE, ty);
+         st->setSrc(0, slot);
+         st->setSrc(1, lval);
+      } else {
+         st = new_Instruction(func, OP_SPLIT, ty);
+         st->setSrc(0, lval);
+         for (int d = 0; d < lval->reg.size / 4; ++d)
+            st->setDef(d, new_LValue(func, FILE_GPR));
+
+         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
+            Value *tmp = cloneShallow(func, slot);
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
+            s->setSrc(0, tmp);
+            s->setSrc(1, st->getDef(d));
+            defi->bb->insertAfter(defi, s);
+         }
+      }
    } else {
       st = new_Instruction(func, OP_CVT, ty);
       st->setDef(0, slot);
@@ -1596,7 +1614,27 @@ SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
    Instruction *ld;
    if (slot->reg.file == FILE_MEMORY_LOCAL) {
       lval->noSpill = 1;
-      ld = new_Instruction(func, OP_LOAD, ty);
+      if (ty != TYPE_B96) {
+         ld = new_Instruction(func, OP_LOAD, ty);
+      } else {
+         ld = new_Instruction(func, OP_MERGE, ty);
+         for (int d = 0; d < lval->reg.size / 4; ++d) {
+            Value *tmp = cloneShallow(func, slot);
+            LValue *val;
+            tmp->reg.size = 4;
+            tmp->reg.data.offset += 4 * d;
+
+            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
+            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
+            l->setSrc(0, tmp);
+            usei->bb->insertBefore(usei, l);
+            ld->setSrc(d, val);
+            val->noSpill = 1;
+         }
+         ld->setDef(0, lval);
+         usei->bb->insertBefore(usei, ld);
+         return lval;
+      }
    } else {
       ld = new_Instruction(func, OP_CVT, ty);
    }
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 68e69beb08f..1695553d793 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -657,8 +657,8 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
    if (buffer->base.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
                              PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
       buffer->domain = NOUVEAU_BO_GART;
-   } else if (buffer->base.bind &
-              (screen->vidmem_bindings & screen->sysmem_bindings)) {
+   } else if (buffer->base.bind == 0 || (buffer->base.bind &
+              (screen->vidmem_bindings & screen->sysmem_bindings))) {
       switch (buffer->base.usage) {
       case PIPE_USAGE_DEFAULT:
       case PIPE_USAGE_IMMUTABLE:
@@ -685,6 +685,10 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
       if (buffer->base.bind & screen->sysmem_bindings)
          buffer->domain = NOUVEAU_BO_GART;
    }
+   /* There can be very special situations where we want non-gpu-mapped
+    * buffers, but never through this interface.
+    */
+   assert(buffer->domain);
    ret = nouveau_buffer_allocate(screen, buffer, buffer->domain);
 
    if (ret == false)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
new file mode 100644
index 00000000000..6d23fd66945
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -0,0 +1,320 @@
+/*
+ * Copyright 2012 Francisco Jerez
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_compute.xml.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+int
+nv50_screen_compute_setup(struct nv50_screen *screen,
+                          struct nouveau_pushbuf *push)
+{
+   struct nouveau_device *dev = screen->base.device;
+   struct nouveau_object *chan = screen->base.channel;
+   struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data;
+   unsigned obj_class;
+   int i, ret;
+   /* Creates the compute object on the channel and emits its initial state. */
+   switch (dev->chipset & 0xf0) {
+   case 0x50:
+   case 0x80:
+   case 0x90:
+      obj_class = NV50_COMPUTE_CLASS;
+      break;
+   case 0xa0: /* only 0xa3, 0xa5 and 0xa8 get the NVA3 class */
+      switch (dev->chipset) {
+      case 0xa3:
+      case 0xa5:
+      case 0xa8:
+         obj_class = NVA3_COMPUTE_CLASS;
+         break;
+      default:
+         obj_class = NV50_COMPUTE_CLASS;
+         break;
+      }
+      break;
+   default:
+      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
+      return -1;
+   }
+   /* A non-zero result from nouveau_object_new() is returned unchanged. */
+   ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0,
+                            &screen->compute);
+   if (ret)
+      return ret;
+   /* Emit the new object's handle via NV01_SUBCHAN_OBJECT. */
+   BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->compute->handle);
+
+   BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->stack_bo->offset);
+   PUSH_DATA (push, screen->stack_bo->offset);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+   PUSH_DATA (push, 4);
+
+   BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+   PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
+   BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+   PUSH_DATA (push, 0x100);
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+   PUSH_DATA (push, fifo->vram);
+   /* Global areas 0-14: address 0, limit 0, linear mode. */
+   for (i = 0; i < 15; i++) {
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+      PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+   }
+   /* Global area 15: address 0, limit ~0, linear mode. */
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+   PUSH_DATA (push, ~0);
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+   PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+   PUSH_DATA (push, 7);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+   PUSH_DATA (push, 7);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   PUSH_DATA (push, 0);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+   PUSH_DATA (push, 0x54);
+   BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+   PUSH_DATA (push, 0);
+   /* TIC entries start at the beginning of screen->txc. */
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset);
+   PUSH_DATA (push, screen->txc->offset);
+   PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
+   /* TSC entries start 65536 bytes into screen->txc. */
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset + 65536);
+   PUSH_DATA (push, screen->txc->offset + 65536);
+   PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+   PUSH_DATA (push, fifo->vram);
+   /* Local memory starts 65536 bytes into screen->tls_bo. */
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->tls_bo->offset + 65536);
+   PUSH_DATA (push, screen->tls_bo->offset + 65536);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+   PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
+
+   return 0;
+}
+
+/* Translates the bound compute program on first use and uploads its code.
+ * Returns true if the program's mem is already set or the upload succeeds. */
+static bool
+nv50_compute_validate_program(struct nv50_context *nv50)
+{
+   struct nv50_program *cp = nv50->compprog;
+   struct nouveau_pushbuf *pb;
+
+   if (cp->mem)
+      return true;
+
+   if (!cp->translated)
+      cp->translated = nv50_program_translate(
+         cp, nv50->screen->base.device->chipset, &nv50->base.debug);
+   /* Give up if translation failed or produced no code at all. */
+   if (!cp->translated || unlikely(!cp->code_size))
+      return false;
+
+   if (!nv50_program_upload_code(nv50, cp))
+      return false;
+
+   /* Code was just uploaded: emit CODE_CB_FLUSH. */
+   pb = nv50->base.pushbuf;
+   BEGIN_NV04(pb, NV50_COMPUTE(CODE_CB_FLUSH), 1);
+   PUSH_DATA (pb, 0);
+   return true;
+}
+
+/* Adds each non-NULL global resident to bufctx_cp with NOUVEAU_BO_RDWR. */
+static void
+nv50_compute_validate_globals(struct nv50_context *nv50)
+{
+   unsigned n;
+   for (n = 0; n < nv50->global_residents.size / sizeof(struct pipe_resource *);
+        n++) {
+      struct pipe_resource **slot = util_dynarray_element(
+         &nv50->global_residents, struct pipe_resource *, n);
+      if (*slot)
+         nv50_add_bufctx_resident(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL,
+                                  nv04_resource(*slot), NOUVEAU_BO_RDWR);
+   }
+}
+
+/* Validates compute program, globals and pushbuf; false aborts the launch. */
+static bool
+nv50_compute_state_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *pb;
+
+   if (!nv50_compute_validate_program(nv50))
+      return false;
+   if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
+      nv50_compute_validate_globals(nv50);
+   /* TODO: validate textures, samplers, surfaces */
+
+   nv50_bufctx_fence(nv50->bufctx_cp, false);
+   pb = nv50->base.pushbuf;
+   nouveau_pushbuf_bufctx(pb, nv50->bufctx_cp);
+   if (unlikely(nouveau_pushbuf_validate(pb)))
+      return false;
+   if (unlikely(nv50->state.flushed))
+      nv50_bufctx_fence(nv50->bufctx_cp, true);
+   return true;
+}
+
+/* Programs USER_PARAM_COUNT, then streams `input` through USER_PARAM. Logs
+ * and returns early if staging fails; assert() is compiled out in release. */
+static void
+nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
+{
+   struct nv50_screen *screen = nv50->screen;
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+   struct nouveau_mm_allocation *mm;
+   struct nouveau_bo *bo = NULL;
+   unsigned offset, size = align(nv50->compprog->parm_size, 0x4);
+
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   PUSH_DATA (push, (size / 4) << 8);
+   if (!size)
+      return;
+   mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset);
+   if (!mm || nouveau_bo_map(bo, 0, screen->base.client)) {
+      NOUVEAU_ERR("Failed to stage compute input (%u bytes)\n", size);
+      if (mm)
+         nouveau_mm_free_work(mm);
+      nouveau_bo_ref(NULL, &bo);
+      return;
+   }
+   memcpy(bo->map + offset, input, size);
+   nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+   nouveau_pushbuf_bufctx(push, nv50->bufctx);
+   nouveau_pushbuf_validate(push);
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+   nouveau_pushbuf_data(push, bo, offset, size);
+   nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
+   nouveau_bo_ref(NULL, &bo);
+   nouveau_bufctx_reset(nv50->bufctx, 0);
+}
+
+/* Maps a kernel label to its code address (code_base + symbol offset). */
+static uint32_t
+nv50_compute_find_symbol(struct nv50_context *nv50, uint32_t label)
+{
+   const struct nv50_program *cp = nv50->compprog;
+   const struct nv50_ir_prog_symbol *table =
+      (const struct nv50_ir_prog_symbol *)cp->cp.syms;
+   unsigned n = 0;
+
+   while (n < cp->cp.num_syms && table[n].label != label)
+      n++;
+   /* no symbols or symbol not found: start at the program base */
+   return cp->code_base + (n < cp->cp.num_syms ? table[n].offset : 0);
+}
+
+/* Launches the kernel at `label` in the bound compute program. Uses
+ * block_layout[0..2], grid_layout[0..1] and the parameters in `input`. */
+void
+nv50_launch_grid(struct pipe_context *pipe,
+                 const uint *block_layout, const uint *grid_layout,
+                 uint32_t label, const void *input)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_program *cp = nv50->compprog;
+   unsigned threads = block_layout[0] * block_layout[1] * block_layout[2];
+
+   if (!nv50_compute_state_validate(nv50)) {
+      NOUVEAU_ERR("Failed to launch grid !\n");
+      return;
+   }
+   nv50_compute_upload_input(nv50, input);
+
+   /* entry point, shared memory size and register allocation */
+   BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+   PUSH_DATA (push, nv50_compute_find_symbol(nv50, label));
+   BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+   PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
+   BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+   PUSH_DATA (push, cp->max_gpr);
+
+   /* block setup: Y sits above X in one word, Z follows separately */
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+   PUSH_DATA (push, (block_layout[1] << 16) | block_layout[0]);
+   PUSH_DATA (push, block_layout[2]);
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+   PUSH_DATA (push, (1 << 16) | threads);
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+   PUSH_DATA (push, 1);
+   /* grid setup: only grid_layout[0] and grid_layout[1] are programmed */
+   BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+   PUSH_DATA (push, (grid_layout[1] << 16) | grid_layout[0]);
+   BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+   PUSH_DATA (push, 1);
+
+   /* kernel launching */
+   BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   PUSH_DATA (push, 0);
+
+   /* binding a compute shader clobbers fragment shader state */
+   nv50->dirty |= NV50_NEW_FRAGPROG;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
new file mode 100644
index 00000000000..268d11253b6
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
@@ -0,0 +1,444 @@
+#ifndef NV50_COMPUTE_XML
+#define NV50_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://github.com/envytools/envytools/
+git clone https://github.com/envytools/envytools.git
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/graph/g80_compute.xml (  14027 bytes, from 2015-02-14 02:01:36)
+- rnndb/copyright.xml         (   6456 bytes, from 2015-02-14 02:01:36)
+- rnndb/nvchipsets.xml        (   2833 bytes, from 2015-04-28 16:28:33)
+- rnndb/fifo/nv_object.xml    (  15390 bytes, from 2015-04-22 20:36:09)
+- rnndb/g80_defs.xml          (  18210 bytes, from 2015-10-19 20:49:59)
+
+Copyright (C) 2006-2015 by the following authors:
+- Artur Huillet <[email protected]> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <[email protected]> (koala_br)
+- Carlos Martin <[email protected]> (carlosmn)
+- Christoph Bumiller <[email protected]> (calim, chrisbmr)
+- Dawid Gajownik <[email protected]> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <[email protected]> (lumag)
+- EdB <[email protected]> (edb_)
+- Erik Waling <[email protected]> (erikwaling)
+- Francisco Jerez <[email protected]> (curro)
+- Ilia Mirkin <[email protected]> (imirkin)
+- jb17bsome <[email protected]> (jb17bsome)
+- Jeremy Kolb <[email protected]> (kjeremy)
+- Laurent Carlier <[email protected]> (lordheavy)
+- Luca Barbieri <[email protected]> (lb, lb1)
+- Maarten Maathuis <[email protected]> (stillunknown)
+- Marcin Kościelnicki <[email protected]> (mwk, koriakin)
+- Mark Carey <[email protected]> (careym)
+- Matthieu Castet <[email protected]> (mat-c)
+- nvidiaman <[email protected]> (nvidiaman)
+- Patrice Mandin <[email protected]> (pmandin, pmdata)
+- Pekka Paalanen <[email protected]> (pq, ppaalanen)
+- Peter Popov <[email protected]> (ironpeter)
+- Richard Hughes <[email protected]> (hughsient)
+- Rudi Cilibrasi <[email protected]> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <[email protected]> (leroutier)
+- Stephane Marchesin <[email protected]> (marcheu)
+- sturmflut <[email protected]> (sturmflut)
+- Sylvain Munaut <[email protected]>
+- Victor Stinner <[email protected]> (haypo)
+- Wladmir van der Laan <[email protected]> (miathan6)
+- Younes Manton <[email protected]> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV50_COMPUTE_DMA_NOTIFY					0x00000180
+
+#define NV50_COMPUTE_DMA_GLOBAL					0x000001a0
+
+#define NV50_COMPUTE_DMA_QUERY					0x000001a4
+
+#define NV50_COMPUTE_DMA_LOCAL					0x000001b8
+
+#define NV50_COMPUTE_DMA_STACK					0x000001bc
+
+#define NV50_COMPUTE_DMA_CODE_CB					0x000001c0
+
+#define NV50_COMPUTE_DMA_TSC					0x000001c4
+
+#define NV50_COMPUTE_DMA_TIC					0x000001c8
+
+#define NV50_COMPUTE_DMA_TEXTURE					0x000001cc
+
+#define NV50_COMPUTE_UNK0200					0x00000200
+#define NV50_COMPUTE_UNK0200_UNK1__MASK				0x0000ffff
+#define NV50_COMPUTE_UNK0200_UNK1__SHIFT				0
+#define NV50_COMPUTE_UNK0200_UNK2__MASK				0x00ff0000
+#define NV50_COMPUTE_UNK0200_UNK2__SHIFT				16
+
+#define NV50_COMPUTE_UNK0204					0x00000204
+
+#define NV50_COMPUTE_UNK0208					0x00000208
+
+#define NV50_COMPUTE_UNK020C					0x0000020c
+
+#define NV50_COMPUTE_CP_ADDRESS_HIGH				0x00000210
+
+#define NV50_COMPUTE_CP_ADDRESS_LOW				0x00000214
+
+#define NV50_COMPUTE_STACK_ADDRESS_HIGH				0x00000218
+
+#define NV50_COMPUTE_STACK_ADDRESS_LOW				0x0000021c
+
+#define NV50_COMPUTE_STACK_SIZE_LOG				0x00000220
+
+#define NV50_COMPUTE_CALL_LIMIT_LOG				0x00000224
+
+#define NV50_COMPUTE_UNK0228					0x00000228
+#define NV50_COMPUTE_UNK0228_UNK0				0x00000001
+#define NV50_COMPUTE_UNK0228_UNK4__MASK				0x00000ff0
+#define NV50_COMPUTE_UNK0228_UNK4__SHIFT				4
+#define NV50_COMPUTE_UNK0228_UNK12__MASK				0x000ff000
+#define NV50_COMPUTE_UNK0228_UNK12__SHIFT			12
+
+#define NV50_COMPUTE_TSC_ADDRESS_HIGH				0x0000022c
+
+#define NV50_COMPUTE_TSC_ADDRESS_LOW				0x00000230
+#define NV50_COMPUTE_TSC_ADDRESS_LOW__ALIGN			0x00000020
+
+#define NV50_COMPUTE_TSC_LIMIT					0x00000234
+#define NV50_COMPUTE_TSC_LIMIT__MAX				0x00001fff
+
+#define NV50_COMPUTE_CB_ADDR					0x00000238
+#define NV50_COMPUTE_CB_ADDR_ID__MASK				0x003fff00
+#define NV50_COMPUTE_CB_ADDR_ID__SHIFT				8
+#define NV50_COMPUTE_CB_ADDR_BUFFER__MASK			0x0000007f
+#define NV50_COMPUTE_CB_ADDR_BUFFER__SHIFT			0
+
+#define NV50_COMPUTE_CB_DATA(i0)				       (0x0000023c + 0x4*(i0))
+#define NV50_COMPUTE_CB_DATA__ESIZE				0x00000004
+#define NV50_COMPUTE_CB_DATA__LEN				0x00000010
+
+#define NV50_COMPUTE_TSC_FLUSH					0x0000027c
+#define NV50_COMPUTE_TSC_FLUSH_SPECIFIC				0x00000001
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__MASK			0x03fffff0
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__SHIFT			4
+
+#define NV50_COMPUTE_TIC_FLUSH					0x00000280
+#define NV50_COMPUTE_TIC_FLUSH_SPECIFIC				0x00000001
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__MASK			0x03fffff0
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__SHIFT			4
+
+#define NV50_COMPUTE_DELAY1					0x00000284
+
+#define NV50_COMPUTE_WATCHDOG_TIMER				0x00000288
+
+#define NV50_COMPUTE_DELAY2					0x0000028c
+
+#define NV50_COMPUTE_UNK0290					0x00000290
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_HIGH				0x00000294
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW				0x00000298
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW__ALIGN			0x00000100
+
+#define NV50_COMPUTE_LOCAL_SIZE_LOG				0x0000029c
+
+#define NV50_COMPUTE_UNK02A0					0x000002a0
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_HIGH				0x000002a4
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_LOW				0x000002a8
+
+#define NV50_COMPUTE_CB_DEF_SET					0x000002ac
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__MASK			0x0000ffff
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__SHIFT			0
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__MASK			0x007f0000
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__SHIFT			16
+
+#define NV50_COMPUTE_UNK02B0					0x000002b0
+
+#define NV50_COMPUTE_BLOCK_ALLOC					0x000002b4
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__MASK			0x0000ffff
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__SHIFT			0
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__MASK			0x00ff0000
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__SHIFT			16
+
+#define NV50_COMPUTE_LANES32_ENABLE				0x000002b8
+
+#define NV50_COMPUTE_UNK02BC					0x000002bc
+#define NV50_COMPUTE_UNK02BC_UNK1__MASK				0x00000007
+#define NV50_COMPUTE_UNK02BC_UNK1__SHIFT				0
+#define NV50_COMPUTE_UNK02BC_UNK2__MASK				0x00000070
+#define NV50_COMPUTE_UNK02BC_UNK2__SHIFT				4
+
+#define NV50_COMPUTE_CP_REG_ALLOC_TEMP				0x000002c0
+
+#define NV50_COMPUTE_TIC_ADDRESS_HIGH				0x000002c4
+
+#define NV50_COMPUTE_TIC_ADDRESS_LOW				0x000002c8
+
+#define NV50_COMPUTE_TIC_LIMIT					0x000002cc
+
+#define NV50_COMPUTE_MP_PM_SET(i0)			       (0x000002d0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_SET__ESIZE				0x00000004
+#define NV50_COMPUTE_MP_PM_SET__LEN				0x00000004
+
+#define NV50_COMPUTE_MP_PM_CONTROL(i0)			       (0x000002e0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_CONTROL__ESIZE			0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL__LEN				0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__MASK			0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__SHIFT			0
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP			0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP_PULSE		0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__MASK			0x00000070
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__SHIFT			4
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK0			0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK1			0x00000010
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK2			0x00000020
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK3			0x00000030
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK4			0x00000040
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK5			0x00000050
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__MASK			0x00ffff00
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__SHIFT			8
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__MASK			0xff000000
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__SHIFT			24
+
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE			0x000002f0
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_0		0x00000001
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_1		0x00000002
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_2		0x00000004
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_3		0x00000008
+
+#define NV50_COMPUTE_UNK02F4					0x000002f4
+
+#define NV50_COMPUTE_BLOCKDIM_LATCH				0x000002f8
+
+#define NV50_COMPUTE_LOCAL_WARPS_LOG_ALLOC			0x000002fc
+
+#define NV50_COMPUTE_LOCAL_WARPS_NO_CLAMP			0x00000300
+
+#define NV50_COMPUTE_STACK_WARPS_LOG_ALLOC			0x00000304
+
+#define NV50_COMPUTE_STACK_WARPS_NO_CLAMP			0x00000308
+
+#define NV50_COMPUTE_UNK030C					0x0000030c
+
+#define NV50_COMPUTE_QUERY_ADDRESS_HIGH				0x00000310
+
+#define NV50_COMPUTE_QUERY_ADDRESS_LOW				0x00000314
+
+#define NV50_COMPUTE_QUERY_SEQUENCE				0x00000318
+
+#define NV50_COMPUTE_QUERY_GET					0x0000031c
+#define NV50_COMPUTE_QUERY_GET_INTR				0x00000200
+#define NV50_COMPUTE_QUERY_GET_SHORT				0x00008000
+
+#define NV50_COMPUTE_COND_ADDRESS_HIGH				0x00000320
+
+#define NV50_COMPUTE_COND_ADDRESS_LOW				0x00000324
+
+#define NV50_COMPUTE_COND_MODE					0x00000328
+#define NV50_COMPUTE_COND_MODE_NEVER				0x00000000
+#define NV50_COMPUTE_COND_MODE_ALWAYS				0x00000001
+#define NV50_COMPUTE_COND_MODE_RES_NON_ZERO			0x00000002
+#define NV50_COMPUTE_COND_MODE_EQUAL				0x00000003
+#define NV50_COMPUTE_COND_MODE_NOT_EQUAL				0x00000004
+
+#define NV50_COMPUTE_UNK032C					0x0000032c
+
+#define NV50_COMPUTE_UNK0330					0x00000330
+
+#define NV50_COMPUTE_UNK0334(i0)				       (0x00000334 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0334__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0334__LEN				0x00000003
+
+#define NV50_COMPUTE_UNK0340(i0)				       (0x00000340 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0340__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0340__LEN				0x00000002
+
+#define NV50_COMPUTE_UNK0348(i0)				       (0x00000348 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0348__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0348__LEN				0x00000002
+
+#define NV50_COMPUTE_UNK0350(i0)				       (0x00000350 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0350__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0350__LEN				0x00000002
+
+#define NV50_COMPUTE_UNK0358					0x00000358
+
+#define NV50_COMPUTE_UNK035C					0x0000035c
+
+#define NV50_COMPUTE_UNK0360					0x00000360
+#define NV50_COMPUTE_UNK0360_UNK0__MASK				0x000000f0
+#define NV50_COMPUTE_UNK0360_UNK0__SHIFT				4
+#define NV50_COMPUTE_UNK0360_UNK1__MASK				0x00000f00
+#define NV50_COMPUTE_UNK0360_UNK1__SHIFT				8
+
+#define NV50_COMPUTE_UNK0364					0x00000364
+
+#define NV50_COMPUTE_LAUNCH					0x00000368
+
+#define NV50_COMPUTE_UNK036C					0x0000036c
+
+#define NV50_COMPUTE_UNK0370					0x00000370
+
+#define NV50_COMPUTE_USER_PARAM_COUNT				0x00000374
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__MASK			0x000000ff
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__SHIFT		0
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MASK		0x0000ff00
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__SHIFT		8
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MAX			0x00000040
+
+#define NV50_COMPUTE_LINKED_TSC					0x00000378
+
+#define NV50_COMPUTE_UNK037C					0x0000037c
+#define NV50_COMPUTE_UNK037C_ALWAYS_DERIV			0x00000001
+#define NV50_COMPUTE_UNK037C_UNK16				0x00010000
+
+#define NV50_COMPUTE_CODE_CB_FLUSH				0x00000380
+
+#define NV50_COMPUTE_UNK0384					0x00000384
+
+#define NV50_COMPUTE_GRIDID					0x00000388
+
+#define NV50_COMPUTE_UNK038C(i0)				       (0x0000038c + 0x4*(i0))
+#define NV50_COMPUTE_UNK038C__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK038C__LEN				0x00000003
+
+#define NV50_COMPUTE_WRCACHE_FLUSH				0x00000398
+
+#define NV50_COMPUTE_UNK039C(i0)				       (0x0000039c + 0x4*(i0))
+#define NV50_COMPUTE_UNK039C__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK039C__LEN				0x00000002
+
+#define NV50_COMPUTE_GRIDDIM					0x000003a4
+#define NV50_COMPUTE_GRIDDIM_X__MASK				0x0000ffff
+#define NV50_COMPUTE_GRIDDIM_X__SHIFT				0
+#define NV50_COMPUTE_GRIDDIM_Y__MASK				0xffff0000
+#define NV50_COMPUTE_GRIDDIM_Y__SHIFT				16
+
+#define NV50_COMPUTE_SHARED_SIZE					0x000003a8
+#define NV50_COMPUTE_SHARED_SIZE__MAX				0x00004000
+#define NV50_COMPUTE_SHARED_SIZE__ALIGN				0x00000040
+
+#define NV50_COMPUTE_BLOCKDIM_XY					0x000003ac
+#define NV50_COMPUTE_BLOCKDIM_XY_X__MASK				0x0000ffff
+#define NV50_COMPUTE_BLOCKDIM_XY_X__SHIFT			0
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__MASK				0xffff0000
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__SHIFT			16
+
+#define NV50_COMPUTE_BLOCKDIM_Z					0x000003b0
+#define NV50_COMPUTE_BLOCKDIM_Z__MIN				0x00000001
+#define NV50_COMPUTE_BLOCKDIM_Z__MAX				0x00000040
+
+#define NV50_COMPUTE_CP_START_ID					0x000003b4
+
+#define NV50_COMPUTE_REG_MODE					0x000003b8
+#define NV50_COMPUTE_REG_MODE_PACKED				0x00000001
+#define NV50_COMPUTE_REG_MODE_STRIPED				0x00000002
+
+#define NV50_COMPUTE_TEX_LIMITS					0x000003bc
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MASK		0x0000000f
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__SHIFT		0
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MIN		0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MAX		0x00000004
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MASK		0x000000f0
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__SHIFT		4
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MIN		0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MAX		0x00000007
+
+#define NV50_COMPUTE_BIND_TSC					0x000003c0
+#define NV50_COMPUTE_BIND_TSC_VALID				0x00000001
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__MASK			0x000000f0
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__SHIFT			4
+#define NV50_COMPUTE_BIND_TSC_TSC__MASK				0x001ff000
+#define NV50_COMPUTE_BIND_TSC_TSC__SHIFT				12
+
+#define NV50_COMPUTE_BIND_TIC					0x000003c4
+#define NV50_COMPUTE_BIND_TIC_VALID				0x00000001
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__MASK			0x000001fe
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__SHIFT			1
+#define NV50_COMPUTE_BIND_TIC_TIC__MASK				0x7ffffe00
+#define NV50_COMPUTE_BIND_TIC_TIC__SHIFT				9
+
+#define NV50_COMPUTE_SET_PROGRAM_CB				0x000003c8
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__MASK			0x00000f00
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__SHIFT			8
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__MASK			0x0007f000
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__SHIFT		12
+#define NV50_COMPUTE_SET_PROGRAM_CB_VALID			0x000000ff
+
+#define NV50_COMPUTE_UNK03CC					0x000003cc
+
+#define NV50_COMPUTE_TEX_CACHE_CTL				0x000003d0
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__MASK			0x00000030
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__SHIFT			4
+
+#define NV50_COMPUTE_UNK03D4					0x000003d4
+
+#define NV50_COMPUTE_UNK03D8					0x000003d8
+
+#define NV50_COMPUTE_UNK03DC					0x000003dc
+
+#define NV50_COMPUTE_UNK03E0					0x000003e0
+
+#define NV50_COMPUTE_UNK03E4					0x000003e4
+
+#define NVA3_COMPUTE_TEX_MISC					0x000003e8
+#define NVA3_COMPUTE_TEX_MISC_UNK1				0x00000001
+#define NVA3_COMPUTE_TEX_MISC_SEAMLESS_CUBE_MAP		0x00000002
+
+#define NV50_COMPUTE_GLOBAL(i0)				       (0x00000400 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL__ESIZE				0x00000020
+#define NV50_COMPUTE_GLOBAL__LEN					0x00000010
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_HIGH(i0)		       (0x00000400 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_LOW(i0)		       (0x00000404 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_PITCH(i0)			       (0x00000408 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_PITCH__MAX				0x00800000
+#define NV50_COMPUTE_GLOBAL_PITCH__ALIGN				0x00000100
+
+#define NV50_COMPUTE_GLOBAL_LIMIT(i0)			       (0x0000040c + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_MODE(i0)			       (0x00000410 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_MODE_LINEAR				0x00000001
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__MASK			0x000000f0
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__SHIFT			4
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__MASK			0x00000f00
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__SHIFT		8
+
+#define NV50_COMPUTE_USER_PARAM(i0)			       (0x00000600 + 0x4*(i0))
+#define NV50_COMPUTE_USER_PARAM__ESIZE				0x00000004
+#define NV50_COMPUTE_USER_PARAM__LEN				0x00000040
+
+#define NV50_COMPUTE_UNK0700(i0)				       (0x00000700 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0700__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0700__LEN				0x00000010
+
+
+#endif /* NV50_COMPUTE_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 7867c2df7f3..4874b77b1e1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -113,6 +113,7 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
 
    nouveau_bufctx_del(&nv50->bufctx_3d);
    nouveau_bufctx_del(&nv50->bufctx);
+   nouveau_bufctx_del(&nv50->bufctx_cp);
 
    util_unreference_framebuffer_state(&nv50->framebuffer);
 
@@ -131,6 +132,14 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
          if (!nv50->constbuf[s][i].user)
             pipe_resource_reference(&nv50->constbuf[s][i].u.buf, NULL);
    }
+
+   for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource **res = util_dynarray_element(
+         &nv50->global_residents, struct pipe_resource *, i);
+      pipe_resource_reference(res, NULL);
+   }
+   util_dynarray_fini(&nv50->global_residents);
 }
 
 static void
@@ -159,9 +168,10 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
                                  int ref)
 {
    struct nv50_context *nv50 = nv50_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
    unsigned s, i;
 
-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
       assert(nv50->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
       for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
          if (nv50->framebuffer.cbufs[i] &&
@@ -173,7 +183,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
          }
       }
    }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nv50->framebuffer.zsbuf &&
           nv50->framebuffer.zsbuf->texture == res) {
          nv50->dirty |= NV50_NEW_FRAMEBUFFER;
@@ -183,11 +193,11 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       }
    }
 
-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_SAMPLER_VIEW)) {
 
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
@@ -263,10 +273,13 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    nv50->base.pushbuf = screen->base.pushbuf;
    nv50->base.client = screen->base.client;
 
-   ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_COUNT,
-                            &nv50->bufctx_3d);
+   ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+   if (!ret)
+      ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_3D_COUNT,
+                               &nv50->bufctx_3d);
    if (!ret)
-      ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+      ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_CP_COUNT,
+                               &nv50->bufctx_cp);
    if (ret)
       goto out_err;
 
@@ -290,6 +303,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    pipe->draw_vbo = nv50_draw_vbo;
    pipe->clear = nv50_clear;
+   pipe->launch_grid = nv50_launch_grid;
 
    pipe->flush = nv50_flush;
    pipe->texture_barrier = nv50_texture_barrier;
@@ -335,19 +349,30 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+   if (screen->compute) {
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->stack_bo);
+   }
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
 
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
    BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
+   if (screen->compute)
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
 
    nv50->base.scratch.bo_size = 2 << 20;
 
+   util_dynarray_init(&nv50->global_residents);
+
    return pipe;
 
 out_err:
    if (nv50->bufctx_3d)
       nouveau_bufctx_del(&nv50->bufctx_3d);
+   if (nv50->bufctx_cp)
+      nouveau_bufctx_del(&nv50->bufctx_cp);
    if (nv50->bufctx)
       nouveau_bufctx_del(&nv50->bufctx);
    FREE(nv50->blit);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index fb74a9748a3..2cebcd99423 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -49,6 +49,10 @@
 #define NV50_NEW_MIN_SAMPLES  (1 << 22)
 #define NV50_NEW_CONTEXT      (1 << 31)
 
+#define NV50_NEW_CP_PROGRAM   (1 << 0)
+#define NV50_NEW_CP_GLOBALS   (1 << 1)
+
+/* 3d bufctx (during draw_vbo, blit_3d) */
 #define NV50_BIND_FB          0
 #define NV50_BIND_VERTEX      1
 #define NV50_BIND_VERTEX_TMP  2
@@ -58,7 +62,15 @@
 #define NV50_BIND_SO         53
 #define NV50_BIND_SCREEN     54
 #define NV50_BIND_TLS        55
-#define NV50_BIND_COUNT      56
+#define NV50_BIND_3D_COUNT   56
+
+/* compute bufctx (during launch_grid) */
+#define NV50_BIND_CP_GLOBAL   0
+#define NV50_BIND_CP_SCREEN   1
+#define NV50_BIND_CP_QUERY    2
+#define NV50_BIND_CP_COUNT    3
+
+/* bufctx for other operations */
 #define NV50_BIND_2D          0
 #define NV50_BIND_M2MF        0
 #define NV50_BIND_FENCE       1
@@ -101,8 +113,10 @@ struct nv50_context {
 
    struct nouveau_bufctx *bufctx_3d;
    struct nouveau_bufctx *bufctx;
+   struct nouveau_bufctx *bufctx_cp;
 
    uint32_t dirty;
+   uint32_t dirty_cp; /* dirty flags for compute state */
    bool cb_dirty;
 
    struct nv50_graph_state state;
@@ -115,6 +129,7 @@ struct nv50_context {
    struct nv50_program *vertprog;
    struct nv50_program *gmtyprog;
    struct nv50_program *fragprog;
+   struct nv50_program *compprog;
 
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
@@ -163,6 +178,8 @@ struct nv50_context {
    uint32_t cond_condmode; /* the calculated condition */
 
    struct nv50_blitctx *blit;
+
+   struct util_dynarray global_residents;
 };
 
 static inline struct nv50_context *
@@ -302,4 +319,9 @@ struct pipe_video_buffer *
 nv98_video_buffer_create(struct pipe_context *pipe,
                          const struct pipe_video_buffer *template);
 
+/* nv50_compute.c */
+void
+nv50_launch_grid(struct pipe_context *, const uint *, const uint *,
+                 uint32_t, const void *);
+
 #endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 89e7a338283..a4b8ddfda95 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -66,7 +66,6 @@ nv50_vertprog_assign_slots(struct nv50_ir_prog_info *info)
       case TGSI_SEMANTIC_VERTEXID:
          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID;
          prog->vp.attrs[2] |= NV50_3D_VP_GP_BUILTIN_ATTR_EN_VERTEX_ID_DRAW_ARRAYS_ADD_START;
-         prog->vp.vertexid = 1;
          continue;
       default:
          break;
@@ -259,6 +258,8 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
       return nv50_vertprog_assign_slots(info);
    case PIPE_SHADER_FRAGMENT:
       return nv50_fragprog_assign_slots(info);
+   case PIPE_SHADER_COMPUTE:
+      return 0;
    default:
       return -1;
    }
@@ -355,6 +356,9 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    prog->gp.has_layer = 0;
    prog->gp.has_viewport = 0;
 
+   if (prog->type == PIPE_SHADER_COMPUTE)
+      info->prop.cp.inputOffset = 0x10;
+
    info->driverPriv = prog;
 
 #ifdef DEBUG
@@ -378,6 +382,8 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    prog->max_gpr = MAX2(4, (info->bin.maxGPR >> 1) + 1);
    prog->tls_space = info->bin.tlsSpace;
 
+   prog->vp.need_vertex_id = info->io.vertexId < PIPE_MAX_SHADER_INPUTS;
+
    if (prog->type == PIPE_SHADER_FRAGMENT) {
       if (info->prop.fp.writesDepth) {
          prog->fp.flags[0] |= NV50_3D_FP_CONTROL_EXPORTS_Z;
@@ -401,6 +407,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
          break;
       }
       prog->gp.vert_count = info->prop.gp.maxVertices;
+   } else
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      prog->cp.syms = info->bin.syms;
+      prog->cp.num_syms = info->bin.numSyms;
    }
 
    if (prog->pipe.stream_output.num_outputs)
@@ -423,11 +433,13 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    struct nouveau_heap *heap;
    int ret;
    uint32_t size = align(prog->code_size, 0x40);
+   uint8_t prog_type;
 
    switch (prog->type) {
    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
+   case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
    default:
       assert(!"invalid program type");
       return false;
@@ -450,7 +462,14 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
          return false;
       }
    }
-   prog->code_base = prog->mem->start;
+
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      /* CP code must be uploaded in FP code segment. */
+      prog_type = 1;
+   } else {
+      prog->code_base = prog->mem->start;
+      prog_type = prog->type;
+   }
 
    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    if (ret < 0) {
@@ -468,7 +487,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
                             false /* flatshade */);
 
    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
-                       (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
+                       (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
 
    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
@@ -489,7 +508,7 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)
    FREE(p->code);
 
    FREE(p->fixups);
-
+   FREE(p->interps);
    FREE(p->so);
 
    memset(p, 0, sizeof(*p));
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index 7a33eb11d6d..1de5122a56e 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -76,9 +76,9 @@ struct nv50_program {
       ubyte psiz;        /* output slot of point size */
       ubyte bfc[2];      /* indices into varying for FFC (FP) or BFC (VP) */
       ubyte edgeflag;
-      ubyte vertexid;
       ubyte clpd[2];     /* output slot of clip distance[i]'s 1st component */
       ubyte clpd_nr;
+      bool need_vertex_id;
    } vp;
 
    struct {
@@ -98,6 +98,13 @@ struct nv50_program {
       ubyte viewportid; /* hw value of viewport index output */
    } gp;
 
+   struct {
+      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
+      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
+      void *syms;
+      unsigned num_syms;
+   } cp;
+
    void *fixups; /* relocation records */
    void *interps; /* interpolation records */
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_push.c b/src/gallium/drivers/nouveau/nv50/nv50_push.c
index f31eaa0e314..cbef95d07f6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_push.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_push.c
@@ -24,6 +24,10 @@ struct push_context {
    struct translate *translate;
 
    bool primitive_restart;
+
+   bool need_vertex_id;
+   int32_t index_bias;
+
    uint32_t prim;
    uint32_t restart_index;
    uint32_t instance_id;
@@ -74,6 +78,11 @@ emit_vertices_i08(struct push_context *ctx, unsigned start, unsigned count)
 
       size = ctx->vertex_words * nr;
 
+      if (unlikely(ctx->need_vertex_id)) {
+         BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+         PUSH_DATA (ctx->push, *elts + ctx->index_bias);
+      }
+
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
       ctx->translate->run_elts8(ctx->translate, elts, nr, 0, ctx->instance_id,
@@ -107,6 +116,11 @@ emit_vertices_i16(struct push_context *ctx, unsigned start, unsigned count)
 
       size = ctx->vertex_words * nr;
 
+      if (unlikely(ctx->need_vertex_id)) {
+         BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+         PUSH_DATA (ctx->push, *elts + ctx->index_bias);
+      }
+
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
       ctx->translate->run_elts16(ctx->translate, elts, nr, 0, ctx->instance_id,
@@ -140,6 +154,11 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
 
       size = ctx->vertex_words * nr;
 
+      if (unlikely(ctx->need_vertex_id)) {
+         BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+         PUSH_DATA (ctx->push, *elts + ctx->index_bias);
+      }
+
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
       ctx->translate->run_elts(ctx->translate, elts, nr, 0, ctx->instance_id,
@@ -161,10 +180,18 @@ emit_vertices_i32(struct push_context *ctx, unsigned start, unsigned count)
 static void
 emit_vertices_seq(struct push_context *ctx, unsigned start, unsigned count)
 {
+   uint32_t elts = 0;
+
    while (count) {
       unsigned push = MIN2(count, ctx->packet_vertex_limit);
       unsigned size = ctx->vertex_words * push;
 
+      if (unlikely(ctx->need_vertex_id)) {
+         /* For non-indexed draws, gl_VertexID goes up after each vertex. */
+         BEGIN_NV04(ctx->push, NV84_3D(VERTEX_ID_BASE), 1);
+         PUSH_DATA (ctx->push, elts++);
+      }
+
       BEGIN_NI04(ctx->push, NV50_3D(VERTEX_DATA), size);
 
       ctx->translate->run(ctx->translate, start, push, 0, ctx->instance_id,
@@ -216,7 +243,14 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
 
    ctx.push = nv50->base.pushbuf;
    ctx.translate = nv50->vertex->translate;
-   ctx.packet_vertex_limit = nv50->vertex->packet_vertex_limit;
+
+   ctx.need_vertex_id = nv50->screen->base.class_3d >= NV84_3D_CLASS &&
+      nv50->vertprog->vp.need_vertex_id && (nv50->vertex->num_elements < 32);
+   ctx.index_bias = info->index_bias;
+
+   /* For indexed draws, gl_VertexID must be emitted for every vertex. */
+   ctx.packet_vertex_limit =
+      ctx.need_vertex_id ? 1 : nv50->vertex->packet_vertex_limit;
    ctx.vertex_words = nv50->vertex->vertex_size;
 
    assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
@@ -307,4 +341,10 @@ nv50_push_vbo(struct nv50_context *nv50, const struct pipe_draw_info *info)
       ctx.instance_id++;
       ctx.prim |= NV50_3D_VERTEX_BEGIN_GL_INSTANCE_NEXT;
    }
+
+   if (unlikely(ctx.need_vertex_id)) {
+      /* Reset gl_VertexID so that future indexed draws are not confused. */
+      BEGIN_NV04(ctx.push, NV84_3D(VERTEX_ID_BASE), 1);
+      PUSH_DATA (ctx.push, nv50->state.index_bias);
+   }
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index dd9b85b7208..4cd3b615606 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -27,6 +27,8 @@
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_query.h"
 #include "nv50/nv50_query_hw.h"
+#include "nv50/nv50_query_hw_metric.h"
+#include "nv50/nv50_query_hw_sm.h"
 
 static struct pipe_query *
 nv50_create_query(struct pipe_context *pipe, unsigned type, unsigned index)
@@ -152,4 +154,79 @@ nv50_init_query_functions(struct nv50_context *nv50)
    pipe->end_query = nv50_end_query;
    pipe->get_query_result = nv50_get_query_result;
    pipe->render_condition = nv50_render_condition;
+   nv50->cond_condmode = NV50_3D_COND_MODE_ALWAYS;
+}
+
+int
+nv50_screen_get_driver_query_info(struct pipe_screen *pscreen,
+                                  unsigned id,
+                                  struct pipe_driver_query_info *info)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+   /* A NULL info pointer is used to ask for the number of driver queries. */
+   const int total = nv50_hw_get_driver_query_info(screen, 0, NULL);
+
+   if (info == NULL)
+      return total;
+
+   /* Pre-fill placeholder values before delegating the lookup of id. */
+   info->flags = 0;
+   info->group_id = -1;
+   info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
+   info->max_value.u64 = 0;
+   info->query_type = 0xdeadd01d;
+   info->name = "this_is_not_the_query_you_are_looking_for";
+
+   /* The hardware query layer fills in the entry when it knows id. */
+   return nv50_hw_get_driver_query_info(screen, id, info);
+}
+
+/* Describe driver query group id, or count the groups when info is NULL.
+ *
+ * Returns the group count for a NULL info; otherwise 1 when id names an
+ * available group and 0, with placeholder contents in info, when not.
+ */
+int
+nv50_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
+                                        unsigned id,
+                                        struct pipe_driver_query_group_info *info)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+   /* Both groups are only exposed with a compute object on NV84+. */
+   const bool have_groups =
+      screen->compute && screen->base.class_3d >= NV84_3D_CLASS;
+
+   /* A NULL info pointer asks for the number of available groups. */
+   if (info == NULL)
+      return have_groups ? 2 : 0;
+
+   if (have_groups) {
+      switch (id) {
+      case NV50_HW_SM_QUERY_GROUP:
+         info->name = "MP counters";
+
+         /* Because we can't expose the number of hardware counters needed
+          * for each different query, we don't want to allow more than one
+          * active query simultaneously to avoid failure when the maximum
+          * number of counters is reached. Note that these groups of GPU
+          * counters are currently only used by AMD_performance_monitor.
+          */
+         info->max_active_queries = 1;
+         info->num_queries = NV50_HW_SM_QUERY_COUNT;
+         return 1;
+      case NV50_HW_METRIC_QUERY_GROUP:
+         info->name = "Performance metrics";
+         info->max_active_queries = 1;
+         info->num_queries = NV50_HW_METRIC_QUERY_COUNT;
+         return 1;
+      default:
+         break;
+      }
+   }
+
+   /* user asked for info about non-existing query group */
+   info->name = "this_is_not_the_query_group_you_are_looking_for";
+   info->max_active_queries = 0;
+   info->num_queries = 0;
+   return 0;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.h b/src/gallium/drivers/nouveau/nv50/nv50_query.h
index d990285c857..bd4c0a386f6 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.h
@@ -28,6 +28,12 @@ nv50_query(struct pipe_query *pipe)
    return (struct nv50_query *)pipe;
 }
 
+/*
+ * Driver query groups:
+ */
+#define NV50_HW_SM_QUERY_GROUP       0
+#define NV50_HW_METRIC_QUERY_GROUP   1
+
 void nv50_init_query_functions(struct nv50_context *);
 
 #endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index 945ce7abe50..b6ebbbf1010 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -25,6 +25,8 @@
 
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_query_hw.h"
+#include "nv50/nv50_query_hw_metric.h"
+#include "nv50/nv50_query_hw_sm.h"
 #include "nv_object.xml.h"
 
 #define NV50_HW_QUERY_STATE_READY   0
@@ -41,7 +43,7 @@
 
 #define NV50_HW_QUERY_ALLOC_SPACE 256
 
-static bool
+bool
 nv50_hw_query_allocate(struct nv50_context *nv50, struct nv50_query *q,
                        int size)
 {
@@ -122,6 +124,9 @@ nv50_hw_begin_query(struct nv50_context *nv50, struct nv50_query *q)
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_hw_query *hq = nv50_hw_query(q);
 
+   if (hq->funcs && hq->funcs->begin_query)
+      return hq->funcs->begin_query(nv50, hq);
+
    /* For occlusion queries we have to change the storage, because a previous
     * query might set the initial render condition to false even *after* we re-
     * initialized it to true.
@@ -193,6 +198,11 @@ nv50_hw_end_query(struct nv50_context *nv50, struct nv50_query *q)
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_hw_query *hq = nv50_hw_query(q);
 
+   if (hq->funcs && hq->funcs->end_query) {
+      hq->funcs->end_query(nv50, hq);
+      return;
+   }
+
    hq->state = NV50_HW_QUERY_STATE_ENDED;
 
    switch (q->type) {
@@ -261,6 +271,9 @@ nv50_hw_get_query_result(struct nv50_context *nv50, struct nv50_query *q,
    uint64_t *data64 = (uint64_t *)hq->data;
    int i;
 
+   if (hq->funcs && hq->funcs->get_query_result)
+      return hq->funcs->get_query_result(nv50, hq, wait, result);
+
    if (hq->state != NV50_HW_QUERY_STATE_READY)
       nv50_hw_query_update(q);
 
@@ -331,6 +344,18 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
    struct nv50_hw_query *hq;
    struct nv50_query *q;
 
+   hq = nv50_hw_sm_create_query(nv50, type);
+   if (hq) {
+      hq->base.funcs = &hw_query_funcs;
+      return (struct nv50_query *)hq;
+   }
+
+   hq = nv50_hw_metric_create_query(nv50, type);
+   if (hq) {
+      hq->base.funcs = &hw_query_funcs;
+      return (struct nv50_query *)hq;
+   }
+
    hq = CALLOC_STRUCT(nv50_hw_query);
    if (!hq)
       return NULL;
@@ -375,6 +400,26 @@ nv50_hw_create_query(struct nv50_context *nv50, unsigned type, unsigned index)
    return q;
 }
 
+int
+nv50_hw_get_driver_query_info(struct nv50_screen *screen, unsigned id,
+                              struct pipe_driver_query_info *info)
+{
+   /* Count both providers; SM query ids come first, metric ids follow. */
+   const int sm_count = nv50_hw_sm_get_driver_query_info(screen, 0, NULL);
+   const int metric_count =
+      nv50_hw_metric_get_driver_query_info(screen, 0, NULL);
+
+   /* A NULL info pointer asks for the total number of HW driver queries. */
+   if (info == NULL)
+      return sm_count + metric_count;
+
+   if (id >= (unsigned)sm_count)
+      return nv50_hw_metric_get_driver_query_info(screen, id - sm_count,
+                                                  info);
+
+   return nv50_hw_sm_get_driver_query_info(screen, id, info);
+}
+
 void
 nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *push, uint16_t method,
                              struct nv50_query *q, unsigned result_offset)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
index 294c67de9a4..82ec6bd2d96 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.h
@@ -8,8 +8,19 @@
 
 #define NVA0_HW_QUERY_STREAM_OUTPUT_BUFFER_OFFSET (PIPE_QUERY_TYPES + 0)
 
+struct nv50_hw_query;
+
+struct nv50_hw_query_funcs { /* hooks reached via nv50_hw_query::funcs */
+   void (*destroy_query)(struct nv50_context *, struct nv50_hw_query *);
+   boolean (*begin_query)(struct nv50_context *, struct nv50_hw_query *);
+   void (*end_query)(struct nv50_context *, struct nv50_hw_query *);
+   boolean (*get_query_result)(struct nv50_context *, struct nv50_hw_query *,
+                               boolean, union pipe_query_result *);
+};
+
 struct nv50_hw_query {
    struct nv50_query base;
+   const struct nv50_hw_query_funcs *funcs;
    uint32_t *data;
    uint32_t sequence;
    struct nouveau_bo *bo;
@@ -31,6 +42,11 @@ nv50_hw_query(struct nv50_query *q)
 
 struct nv50_query *
 nv50_hw_create_query(struct nv50_context *, unsigned, unsigned);
+int
+nv50_hw_get_driver_query_info(struct nv50_screen *, unsigned,
+                              struct pipe_driver_query_info *);
+bool
+nv50_hw_query_allocate(struct nv50_context *, struct nv50_query *, int);
 void
 nv50_hw_query_pushbuf_submit(struct nouveau_pushbuf *, uint16_t,
                              struct nv50_query *, unsigned);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
new file mode 100644
index 00000000000..d1bccb94193
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw_metric.h"
+#include "nv50/nv50_query_hw_sm.h"
+
+/* === PERFORMANCE MONITORING METRICS for NV84+ === */
+static const char *nv50_hw_metric_names[] =
+{
+   "metric-branch_efficiency", /* NV50_HW_METRIC_QUERY_BRANCH_EFFICIENCY */
+};
+
+struct nv50_hw_metric_query_cfg {
+   uint32_t queries[4];   /* SM query types the metric is computed from */
+   uint32_t num_queries;  /* number of valid entries in queries[] */
+};
+
+#define _SM(n) NV50_HW_SM_QUERY(NV50_HW_SM_QUERY_ ##n) /* SM query type */
+#define _M(n, c) [NV50_HW_METRIC_QUERY_##n] = c /* table slot of metric n */
+
+/* ==== Compute capability 1.1 (G84+) ==== */
+static const struct nv50_hw_metric_query_cfg
+sm11_branch_efficiency =
+{
+   .queries[0]  = _SM(BRANCH),            /* first counter of the metric */
+   .queries[1]  = _SM(DIVERGENT_BRANCH),  /* second counter of the metric */
+   .num_queries = 2,
+};
+
+static const struct nv50_hw_metric_query_cfg *sm11_hw_metric_queries[] =
+{
+   _M(BRANCH_EFFICIENCY, &sm11_branch_efficiency),
+};
+
+#undef _SM
+#undef _M
+
+static const struct nv50_hw_metric_query_cfg *
+nv50_hw_metric_query_get_cfg(struct nv50_context *nv50,
+                             struct nv50_hw_query *hq)
+{
+   /* The table is indexed by the metric id relative to the first one. */
+   return sm11_hw_metric_queries[hq->base.type - NV50_HW_METRIC_QUERY(0)];
+}
+
+static void
+nv50_hw_metric_destroy_query(struct nv50_context *nv50,
+                             struct nv50_hw_query *hq)
+{
+   struct nv50_hw_metric_query *metric = nv50_hw_metric_query(hq);
+   struct nv50_hw_query **sub = metric->queries;
+   /* Destroy each sub-query via its own hook, then free the wrapper. */
+   for (; sub != metric->queries + metric->num_queries; ++sub)
+      (*sub)->funcs->destroy_query(nv50, *sub);
+   FREE(metric);
+}
+
+static boolean
+nv50_hw_metric_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+   struct nv50_hw_metric_query *metric = nv50_hw_metric_query(hq);
+   struct nv50_hw_query **sub = metric->queries;
+   boolean started = false; /* stays false when there are no sub-queries */
+   /* Begin the sub-queries in order and stop at the first failure. */
+   for (; sub != metric->queries + metric->num_queries; ++sub) {
+      started = (*sub)->funcs->begin_query(nv50, *sub);
+      if (!started)
+         break;
+   }
+   return started;
+}
+
+static void
+nv50_hw_metric_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+   struct nv50_hw_metric_query *metric = nv50_hw_metric_query(hq);
+   struct nv50_hw_query **sub = metric->queries;
+   /* Forward the end request to every sub-query, in array order. */
+   for (; sub != metric->queries + metric->num_queries; ++sub)
+      (*sub)->funcs->end_query(nv50, *sub);
+}
+
+static uint64_t
+sm11_hw_metric_calc_result(struct nv50_hw_query *hq, uint64_t res64[4])
+{
+   switch (hq->base.type - NV50_HW_METRIC_QUERY(0)) {
+   case NV50_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
+      /* (branch / (branch + divergent_branch)) * 100, from res64[0] and [1] */
+      if (res64[0] + res64[1])
+         return (res64[0] / (double)(res64[0] + res64[1])) * 100;
+      break;
+   default:
+      debug_printf("invalid metric type: %d\n",
+                   hq->base.type - NV50_HW_METRIC_QUERY(0));
+      break;
+   }
+   return 0; /* unknown metric, or neither counter was incremented */
+}
+
+static boolean
+nv50_hw_metric_get_query_result(struct nv50_context *nv50,
+                                struct nv50_hw_query *hq, boolean wait,
+                                union pipe_query_result *result)
+{
+   struct nv50_hw_metric_query *metric = nv50_hw_metric_query(hq);
+   uint64_t counters[4] = {};
+   boolean ok = false;
+   unsigned idx;
+
+   /* Read every sub-query's raw counter, bailing out on the first failure. */
+   for (idx = 0; idx < metric->num_queries; ++idx) {
+      struct nv50_hw_query *sub = metric->queries[idx];
+      union pipe_query_result raw = {};
+      ok = sub->funcs->get_query_result(nv50, sub, wait, &raw);
+      if (!ok)
+         return ok;
+      counters[idx] = *(uint64_t *)&raw;
+   }
+   *(uint64_t *)result = sm11_hw_metric_calc_result(hq, counters);
+   return ok;
+}
+
+static const struct nv50_hw_query_funcs hw_metric_query_funcs = {
+   .destroy_query = nv50_hw_metric_destroy_query,
+   .begin_query = nv50_hw_metric_begin_query,
+   .end_query = nv50_hw_metric_end_query,
+   .get_query_result = nv50_hw_metric_get_query_result,
+}; /* installed on metric queries by nv50_hw_metric_create_query() */
+
+struct nv50_hw_query *
+nv50_hw_metric_create_query(struct nv50_context *nv50, unsigned type)
+{
+   const struct nv50_hw_metric_query_cfg *cfg;
+   struct nv50_hw_metric_query *metric;
+   struct nv50_hw_query *hq;
+   unsigned idx = 0;
+
+   if (type > NV50_HW_METRIC_QUERY_LAST || type < NV50_HW_METRIC_QUERY(0))
+      return NULL;
+
+   metric = CALLOC_STRUCT(nv50_hw_metric_query);
+   if (metric == NULL)
+      return NULL;
+
+   hq = &metric->base;
+   hq->base.type = type;
+   hq->funcs = &hw_metric_query_funcs;
+
+   /* Create one SM sub-query per counter the metric is built from. */
+   cfg = nv50_hw_metric_query_get_cfg(nv50, hq);
+   for (; idx < cfg->num_queries; ++idx) {
+      metric->queries[idx] = nv50_hw_sm_create_query(nv50, cfg->queries[idx]);
+      if (metric->queries[idx] == NULL) {
+         /* Only the num_queries sub-queries created so far get destroyed. */
+         nv50_hw_metric_destroy_query(nv50, hq);
+         return NULL;
+      }
+      ++metric->num_queries;
+   }
+   return hq;
+}
+
+/* Describe metric query id in info, or return the metric query count when
+ * info is NULL. Returns 1 if info was filled in and 0 otherwise.
+ */
+int
+nv50_hw_metric_get_driver_query_info(struct nv50_screen *screen, unsigned id,
+                                     struct pipe_driver_query_info *info)
+{
+   /* Metrics are only exposed with a compute object on NV84+. */
+   const bool supported =
+      screen->compute && screen->base.class_3d >= NV84_3D_CLASS;
+   const int count = supported ? NV50_HW_METRIC_QUERY_COUNT : 0;
+
+   /* A NULL info pointer asks for the number of metric queries. */
+   if (info == NULL)
+      return count;
+
+   /* Ids at or beyond count do not name a metric; info is left untouched. */
+   if (id >= (unsigned)count)
+      return 0;
+
+   info->name = nv50_hw_metric_names[id];
+   info->query_type = NV50_HW_METRIC_QUERY(id);
+   info->group_id = NV50_HW_METRIC_QUERY_GROUP;
+   return 1;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h
new file mode 100644
index 00000000000..f8cfc04084f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.h
@@ -0,0 +1,34 @@
+#ifndef __NV50_QUERY_HW_METRIC_H__
+#define __NV50_QUERY_HW_METRIC_H__
+
+#include "nv50_query_hw.h"
+
+struct nv50_hw_metric_query {
+   struct nv50_hw_query base;        /* first member, see the cast helper */
+   struct nv50_hw_query *queries[4]; /* sub-queries the metric reads */
+   unsigned num_queries;             /* valid entries in queries[] */
+};
+
+static inline struct nv50_hw_metric_query *
+nv50_hw_metric_query(struct nv50_hw_query *hq)
+{
+   return (struct nv50_hw_metric_query *)hq; /* base is the first member */
+}
+
+/*
+ * Driver metric queries:
+ */
+#define NV50_HW_METRIC_QUERY(i)   (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
+#define NV50_HW_METRIC_QUERY_LAST  NV50_HW_METRIC_QUERY(NV50_HW_METRIC_QUERY_COUNT - 1)
+enum nv50_hw_metric_queries
+{
+    NV50_HW_METRIC_QUERY_BRANCH_EFFICIENCY = 0,
+    NV50_HW_METRIC_QUERY_COUNT /* number of metrics; not a metric itself */
+};
+
+struct nv50_hw_query *
+nv50_hw_metric_create_query(struct nv50_context *, unsigned);
+int
+nv50_hw_metric_get_driver_query_info(struct nv50_screen *, unsigned,
+                                     struct pipe_driver_query_info *);
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
new file mode 100644
index 00000000000..8453ce76095
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#define NV50_PUSH_EXPLICIT_SPACE_CHECKING
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_query_hw_sm.h"
+
+#include "nv_object.xml.h"
+#include "nv50/nv50_compute.xml.h"
+
+/* === PERFORMANCE MONITORING COUNTERS for NV84+ === */
+
+/* NOTE: same names as NV on purpose; indexed by enum nv50_hw_sm_queries */
+static const char *nv50_hw_sm_query_names[] =
+{
+   "branch",
+   "divergent_branch",
+   "instructions",
+   "prof_trigger_00",
+   "prof_trigger_01",
+   "prof_trigger_02",
+   "prof_trigger_03",
+   "prof_trigger_04",
+   "prof_trigger_05",
+   "prof_trigger_06",
+   "prof_trigger_07",
+   "sm_cta_launched",
+   "warp_serialize",
+};
+
+static const uint64_t nv50_read_hw_sm_counters_code[] =
+{
+   /* and b32 $r0 $r0 0x0000ffff
+    * add b32 $c0 $r0 $r0 $r0
+    * (lg $c0) ret
+    * mov $r0 $pm0                  ($r0..$r3 = MP counters 0..3)
+    * mov $r1 $pm1
+    * mov $r2 $pm2
+    * mov $r3 $pm3
+    * mov $r4 $physid
+    * ld $r5 b32 s[0x10]            (presumably input[0], buffer offset - TODO confirm)
+    * ld $r6 b32 s[0x14]            (presumably input[1], sequence - TODO confirm)
+    * and b32 $r4 $r4 0x000f0000
+    * shr u32 $r4 $r4 0x10
+    * mul $r4 u24 $r4 0x14          (0x14 bytes per MP record)
+    * add b32 $r5 $r5 $r4
+    * st b32 g15[$r5] $r0
+    * add b32 $r5 $r5 0x04
+    * st b32 g15[$r5] $r1
+    * add b32 $r5 $r5 0x04
+    * st b32 g15[$r5] $r2
+    * add b32 $r5 $r5 0x04
+    * st b32 g15[$r5] $r3
+    * add b32 $r5 $r5 0x04
+    * exit st b32 g15[$r5] $r6      (fifth dword of the record, stored last) */
+   0x00000fffd03f0001ULL,
+   0x040007c020000001ULL,
+   0x0000028030000003ULL,
+   0x6001078000000001ULL,
+   0x6001478000000005ULL,
+   0x6001878000000009ULL,
+   0x6001c7800000000dULL,
+   0x6000078000000011ULL,
+   0x4400c78010000815ULL,
+   0x4400c78010000a19ULL,
+   0x0000f003d0000811ULL,
+   0xe410078030100811ULL,
+   0x0000000340540811ULL,
+   0x0401078020000a15ULL,
+   0xa0c00780d00f0a01ULL,
+   0x0000000320048a15ULL,
+   0xa0c00780d00f0a05ULL,
+   0x0000000320048a15ULL,
+   0xa0c00780d00f0a09ULL,
+   0x0000000320048a15ULL,
+   0xa0c00780d00f0a0dULL,
+   0x0000000320048a15ULL,
+   0xa0c00781d00f0a19ULL,
+};
+
+struct nv50_hw_sm_counter_cfg
+{
+   uint32_t mode : 4;    /* LOGOP, LOGOP_PULSE; OR'ed unshifted into MP_PM_CONTROL */
+   uint32_t unit : 8;    /* UNK[0-5]; OR'ed unshifted into MP_PM_CONTROL */
+   uint32_t sig  : 8;    /* signal selection; shifted left by 24 when emitted */
+};
+
+struct nv50_hw_sm_query_cfg
+{
+   struct nv50_hw_sm_counter_cfg ctr[4]; /* per-counter setup, at most 4 */
+   uint8_t num_counters; /* number of ctr[] entries in use */
+};
+
+#define _Q(n, m, u, s) [NV50_HW_SM_QUERY_##n] = { { { NV50_COMPUTE_MP_PM_CONTROL_MODE_##m, NV50_COMPUTE_MP_PM_CONTROL_UNIT_##u, s, }, {}, {}, {} }, 1 }
+
+/* ==== Compute capability 1.1 (G84+): single-counter configs, indexed by enum nv50_hw_sm_queries ==== */
+static const struct nv50_hw_sm_query_cfg sm11_hw_sm_queries[] =
+{
+   _Q(BRANCH,           LOGOP, UNK4, 0x02),
+   _Q(DIVERGENT_BRANCH, LOGOP, UNK4, 0x09),
+   _Q(INSTRUCTIONS,     LOGOP, UNK4, 0x04),
+   _Q(PROF_TRIGGER_0,   LOGOP, UNK1, 0x26),
+   _Q(PROF_TRIGGER_1,   LOGOP, UNK1, 0x27),
+   _Q(PROF_TRIGGER_2,   LOGOP, UNK1, 0x28),
+   _Q(PROF_TRIGGER_3,   LOGOP, UNK1, 0x29),
+   _Q(PROF_TRIGGER_4,   LOGOP, UNK1, 0x2a),
+   _Q(PROF_TRIGGER_5,   LOGOP, UNK1, 0x2b),
+   _Q(PROF_TRIGGER_6,   LOGOP, UNK1, 0x2c),
+   _Q(PROF_TRIGGER_7,   LOGOP, UNK1, 0x2d),
+   _Q(SM_CTA_LAUNCHED,  LOGOP, UNK1, 0x33),
+   _Q(WARP_SERIALIZE,   LOGOP, UNK0, 0x0b),
+};
+
+static inline uint16_t nv50_hw_sm_get_func(uint8_t slot)
+{ /* returns the 16-bit value callers shift left by 8 into MP_PM_CONTROL */
+   switch (slot) { /* slot: MP counter index 0..3 */
+   case 0: return 0xaaaa; /* looks like a 4-input truth table passing input 0 - TODO confirm */
+   case 1: return 0xcccc;
+   case 2: return 0xf0f0;
+   case 3: return 0xff00;
+   }
+   return 0; /* any other slot value */
+}
+
+static const struct nv50_hw_sm_query_cfg *
+nv50_hw_sm_query_get_cfg(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{ /* 'nv50' is unused here; only the sm11 table exists */
+   struct nv50_query *q = &hq->base;
+   return &sm11_hw_sm_queries[q->type - NV50_HW_SM_QUERY(0)]; /* no range check: type must be an SM query type */
+}
+
+static void
+nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+   struct nv50_query *q = &hq->base;
+   q->funcs->destroy_query(nv50, q); /* NOTE(review): forwards to the base query hook; if that hook dispatches back via hq->funcs this recurses - confirm */
+}
+
+/* Start an SM counter query: claim one free MP counter slot per configured
+ * counter, clear the per-MP sequence words, then program and reset them. */
+static boolean
+nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+   struct nv50_screen *screen = nv50->screen;
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq);
+   const struct nv50_hw_sm_query_cfg *cfg = nv50_hw_sm_query_get_cfg(nv50, hq);
+   uint16_t func;
+   int i, c;
+
+   /* check if we have enough free counter slots */
+   if (screen->pm.num_hw_sm_active + cfg->num_counters > 4) {
+      NOUVEAU_ERR("Not enough free MP counter slots !\n");
+      return false;
+   }
+
+   assert(cfg->num_counters <= 4);
+   PUSH_SPACE(push, 4 * 4); /* up to 4 counters, 2 methods of 2 dwords each */
+
+   /* set sequence field to 0 (used to check if result is available) */
+   for (i = 0; i < screen->MPsInTP; ++i) {
+      const unsigned b = (0x14 / 4) * i; /* 5 dwords per MP record */
+      hq->data[b + 4] = 0; /* sequence is at byte 0x10, i.e. dword 4 */
+   }
+   hq->sequence++;
+
+   for (i = 0; i < cfg->num_counters; i++) {
+      screen->pm.num_hw_sm_active++;
+
+      /* find free counter slots */
+      for (c = 0; c < 4; ++c) {
+         if (!screen->pm.mp_counter[c]) {
+            hsq->ctr[i] = c;
+            screen->pm.mp_counter[c] = hsq;
+            break;
+         }
+      }
+
+      /* select func to aggregate counters */
+      func = nv50_hw_sm_get_func(c);
+
+      /* configure and reset the counter(s) */
+      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+      PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
+                        | cfg->ctr[i].unit | cfg->ctr[i].mode);
+      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1);
+      PUSH_DATA (push, 0);
+   }
+   return true;
+}
+
+static void
+nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
+{
+   struct nv50_screen *screen = nv50->screen;
+   struct pipe_context *pipe = &nv50->base.pipe;
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq);
+   uint32_t mask;
+   uint32_t input[3]; /* only input[0] and input[1] are filled in below */
+   const uint block[3] = { 32, 1, 1 };
+   const uint grid[3] = { screen->MPsInTP, screen->TPs, 1 };
+   int c;
+
+   if (unlikely(!screen->pm.prog)) { /* lazily create the counter read-back program */
+      struct nv50_program *prog = CALLOC_STRUCT(nv50_program); /* NOTE(review): result is not checked for NULL */
+      prog->type = PIPE_SHADER_COMPUTE;
+      prog->translated = true;
+      prog->max_gpr = 7;
+      prog->parm_size = 8; /* presumably bytes, i.e. the two 32-bit inputs - TODO confirm */
+      prog->code = (uint32_t *)nv50_read_hw_sm_counters_code;
+      prog->code_size = sizeof(nv50_read_hw_sm_counters_code);
+      screen->pm.prog = prog;
+   }
+
+   /* disable all counting */
+   PUSH_SPACE(push, 8);
+   for (c = 0; c < 4; c++) {
+      if (screen->pm.mp_counter[c]) {
+         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+         PUSH_DATA (push, 0);
+      }
+   }
+
+   /* release counters for this query */
+   for (c = 0; c < 4; c++) {
+      if (screen->pm.mp_counter[c] == hsq) {
+         screen->pm.num_hw_sm_active--;
+         screen->pm.mp_counter[c] = NULL;
+      }
+   }
+
+   BCTX_REFN_bo(nv50->bufctx_cp, CP_QUERY, NOUVEAU_BO_GART | NOUVEAU_BO_WR,
+                hq->bo);
+
+   PUSH_SPACE(push, 2);
+   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   PUSH_DATA (push, 0);
+
+   pipe->bind_compute_state(pipe, screen->pm.prog); /* replaces the currently bound compute program */
+   input[0] = hq->bo->offset + hq->base_offset; /* where the results are written */
+   input[1] = hq->sequence; /* compared against data[b + 4] when reading back */
+   pipe->launch_grid(pipe, block, grid, 0, input);
+
+   nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_QUERY);
+
+   /* re-activate other counters */
+   PUSH_SPACE(push, 8);
+   mask = 0; /* bit n set once counter slot n has been re-programmed */
+   for (c = 0; c < 4; c++) {
+      const struct nv50_hw_sm_query_cfg *cfg;
+      unsigned i;
+
+      hsq = screen->pm.mp_counter[c]; /* from here on 'hsq' is the owner of slot c, not this query */
+      if (!hsq)
+         continue;
+
+      cfg = nv50_hw_sm_query_get_cfg(nv50, &hsq->base);
+      for (i = 0; i < cfg->num_counters; i++) {
+         uint16_t func;
+
+         if (mask & (1 << hsq->ctr[i])) /* this owner was already handled via an earlier slot */
+            break;
+
+         mask |= 1 << hsq->ctr[i];
+         func  = nv50_hw_sm_get_func(hsq->ctr[i]);
+
+         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1);
+         PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
+                    | cfg->ctr[i].unit | cfg->ctr[i].mode);
+      }
+   }
+}
+
+static inline bool
+nv50_hw_sm_query_read_data(uint32_t count[32][4],
+                           struct nv50_context *nv50, bool wait,
+                           struct nv50_hw_query *hq,
+                           const struct nv50_hw_sm_query_cfg *cfg,
+                           unsigned mp_count)
+{ /* copies the query's counters for each MP into count[mp][counter]; false if not available */
+   struct nv50_hw_sm_query *hsq = nv50_hw_sm_query(hq);
+   unsigned p, c;
+
+   for (p = 0; p < mp_count; ++p) {
+      const unsigned b = (0x14 / 4) * p; /* first dword of MP p's 5-dword record */
+
+      for (c = 0; c < cfg->num_counters; ++c) {
+         if (hq->data[b + 4] != hq->sequence) { /* record not written for this sequence yet */
+            if (!wait)
+               return false;
+            if (nouveau_bo_wait(hq->bo, NOUVEAU_BO_RD, nv50->base.client))
+               return false;
+         }
+         count[p][c] = hq->data[b + hsq->ctr[c]]; /* dword of the slot assigned to counter c */
+      }
+   }
+   return true;
+}
+
+static boolean
+nv50_hw_sm_get_query_result(struct nv50_context *nv50, struct nv50_hw_query *hq,
+                            boolean wait, union pipe_query_result *result)
+{
+   uint32_t count[32][4]; /* [MP][counter] */
+   uint64_t value = 0;
+   unsigned mp_count = MIN2(nv50->screen->MPsInTP, 32); /* clamp to count[] rows */
+   unsigned p, c;
+   const struct nv50_hw_sm_query_cfg *cfg;
+   bool ret;
+
+   cfg = nv50_hw_sm_query_get_cfg(nv50, hq);
+
+   ret = nv50_hw_sm_query_read_data(count, nv50, wait, hq, cfg, mp_count);
+   if (!ret)
+      return false; /* 'result' is left untouched */
+
+   for (c = 0; c < cfg->num_counters; ++c) /* sum every counter over every MP */
+      for (p = 0; p < mp_count; ++p)
+         value += count[p][c];
+
+   /* We only count a single TP, and simply multiply by the total number of
+    * TPs to compute result over all TPs. This is inaccurate, but enough! */
+   value *= nv50->screen->TPs;
+
+   *(uint64_t *)result = value; /* stored as a raw 64-bit value at the start of the union */
+   return true;
+}
+
+static const struct nv50_hw_query_funcs hw_sm_query_funcs = { /* installed by nv50_hw_sm_create_query() */
+   .destroy_query = nv50_hw_sm_destroy_query,
+   .begin_query = nv50_hw_sm_begin_query,
+   .end_query = nv50_hw_sm_end_query,
+   .get_query_result = nv50_hw_sm_get_query_result,
+};
+
+struct nv50_hw_query *
+nv50_hw_sm_create_query(struct nv50_context *nv50, unsigned type)
+{ /* returns NULL for non-SM query types and on allocation failure */
+   struct nv50_hw_sm_query *hsq;
+   struct nv50_hw_query *hq;
+   unsigned space;
+
+   if (type < NV50_HW_SM_QUERY(0) || type > NV50_HW_SM_QUERY_LAST)
+      return NULL;
+
+   hsq = CALLOC_STRUCT(nv50_hw_sm_query);
+   if (!hsq)
+      return NULL;
+
+   hq = &hsq->base;
+   hq->funcs = &hw_sm_query_funcs;
+   hq->base.type = type;
+
+   /*
+    * for each MP:
+    * [00] = MP.C0
+    * [04] = MP.C1
+    * [08] = MP.C2
+    * [0c] = MP.C3
+    * [10] = MP.sequence
+    */
+   space = (4 + 1) * nv50->screen->MPsInTP * sizeof(uint32_t); /* 0x14 bytes per MP */
+
+   if (!nv50_hw_query_allocate(nv50, &hq->base, space)) {
+      FREE(hq); /* same address as hsq: 'base' is the first member */
+      return NULL;
+   }
+
+   return hq;
+}
+
+int
+nv50_hw_sm_get_driver_query_info(struct nv50_screen *screen, unsigned id,
+                                 struct pipe_driver_query_info *info)
+{ /* info == NULL: returns the number of SM queries; otherwise 1 if 'id' was filled in, else 0 */
+   int count = 0;
+
+   if (screen->compute) /* SM queries need the compute object and NV84+ */
+      if (screen->base.class_3d >= NV84_3D_CLASS)
+         count += NV50_HW_SM_QUERY_COUNT;
+
+   if (!info)
+      return count;
+
+   if (id < count) {
+      if (screen->compute) {
+         if (screen->base.class_3d >= NV84_3D_CLASS) {
+            info->name = nv50_hw_sm_query_names[id];
+            info->query_type = NV50_HW_SM_QUERY(id);
+            info->group_id = NV50_HW_SM_QUERY_GROUP;
+            return 1;
+         }
+      }
+   }
+   return 0;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h
new file mode 100644
index 00000000000..c1a1cd175e3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.h
@@ -0,0 +1,45 @@
+#ifndef __NV50_QUERY_HW_SM_H__
+#define __NV50_QUERY_HW_SM_H__
+
+#include "nv50_query_hw.h"
+
+struct nv50_hw_sm_query {
+   struct nv50_hw_query base; /* first member: nv50_hw_sm_query() casts */
+   uint8_t ctr[4]; /* ctr[i] = MP counter slot claimed for the query's i-th counter */
+};
+
+static inline struct nv50_hw_sm_query *
+nv50_hw_sm_query(struct nv50_hw_query *hq)
+{
+   return (struct nv50_hw_sm_query *)hq; /* downcast; relies on 'base' being the first member */
+}
+
+/*
+ * Performance counter queries: type = PIPE_QUERY_DRIVER_SPECIFIC + index.
+ */
+#define NV50_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
+#define NV50_HW_SM_QUERY_LAST   NV50_HW_SM_QUERY(NV50_HW_SM_QUERY_COUNT - 1)
+enum nv50_hw_sm_queries
+{
+   NV50_HW_SM_QUERY_BRANCH = 0,
+   NV50_HW_SM_QUERY_DIVERGENT_BRANCH,
+   NV50_HW_SM_QUERY_INSTRUCTIONS,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_0,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_1,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_2,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_3,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_4,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_5,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_6,
+   NV50_HW_SM_QUERY_PROF_TRIGGER_7,
+   NV50_HW_SM_QUERY_SM_CTA_LAUNCHED,
+   NV50_HW_SM_QUERY_WARP_SERIALIZE,
+   NV50_HW_SM_QUERY_COUNT, /* number of SM queries; must stay last */
+};
+
+struct nv50_hw_query *
+nv50_hw_sm_create_query(struct nv50_context *, unsigned);
+int
+nv50_hw_sm_get_driver_query_info(struct nv50_screen *, unsigned,
+                                 struct pipe_driver_query_info *);
+#endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index f47e998ab1e..1e4b75f18e0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -41,8 +41,6 @@
 
 #define THREADS_IN_WARP 32
 
-#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
-
 static boolean
 nv50_screen_is_format_supported(struct pipe_screen *pscreen,
                                 enum pipe_format format,
@@ -183,6 +181,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_COMPUTE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,7 +211,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
-   case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
@@ -251,6 +249,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_VERTEX:
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
+   case PIPE_SHADER_COMPUTE:
       break;
    default:
       return 0;
@@ -336,6 +335,52 @@ nv50_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
    return 0.0f;
 }
 
+static int
+nv50_screen_get_compute_param(struct pipe_screen *pscreen,
+                              enum pipe_compute_cap param, void *data)
+{ /* returns the byte size of the cap's value (0 if unknown); copies it to 'data' when non-NULL */
+   struct nv50_screen *screen = nv50_screen(pscreen);
+
+#define RET(x) do {                  \
+   if (data)                         \
+      memcpy(data, x, sizeof(x));    \
+   return sizeof(x);                 \
+} while (0)
+
+   switch (param) { /* every case returns through RET() with an array compound literal */
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      RET((uint64_t []) { 2 });
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      RET(((uint64_t []) { 65535, 65535 }));
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      RET(((uint64_t []) { 512, 512, 64 }));
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      RET((uint64_t []) { 512 });
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g0-15[] */
+      RET((uint64_t []) { 1ULL << 32 });
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
+      RET((uint64_t []) { 16 << 10 });
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
+      RET((uint64_t []) { 16 << 10 });
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
+      RET((uint64_t []) { 4096 });
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      RET((uint32_t []) { 32 });
+   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+      RET((uint64_t []) { 1ULL << 40 });
+   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+      RET((uint32_t []) { 0 });
+   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+      RET((uint32_t []) { screen->mp_count });
+   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+      RET((uint32_t []) { 512 }); /* FIXME: arbitrary limit */
+   default:
+      return 0; /* unknown cap: 'data' is not written */
+   }
+
+#undef RET
+}
+
 static void
 nv50_screen_destroy(struct pipe_screen *pscreen)
 {
@@ -377,6 +422,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
    nouveau_object_del(&screen->tesla);
    nouveau_object_del(&screen->eng2d);
    nouveau_object_del(&screen->m2mf);
+   nouveau_object_del(&screen->compute);
    nouveau_object_del(&screen->sync);
 
    nouveau_screen_fini(&screen->base);
@@ -640,7 +686,7 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
    PUSH_DATA (push, 0);
    if (screen->base.class_3d >= NV84_3D_CLASS) {
-      BEGIN_NV04(push, SUBC_3D(NV84_3D_VERTEX_ID_BASE), 1);
+      BEGIN_NV04(push, NV84_3D(VERTEX_ID_BASE), 1);
       PUSH_DATA (push, 0);
    }
 
@@ -742,6 +788,9 @@ nv50_screen_create(struct nouveau_device *dev)
    pscreen->get_param = nv50_screen_get_param;
    pscreen->get_shader_param = nv50_screen_get_shader_param;
    pscreen->get_paramf = nv50_screen_get_paramf;
+   pscreen->get_compute_param = nv50_screen_get_compute_param;
+   pscreen->get_driver_query_info = nv50_screen_get_driver_query_info;
+   pscreen->get_driver_query_group_info = nv50_screen_get_driver_query_group_info;
 
    nv50_screen_init_resource_functions(pscreen);
 
@@ -851,6 +900,8 @@ nv50_screen_create(struct nouveau_device *dev)
    screen->TPs = util_bitcount(value & 0xffff);
    screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
 
+   screen->mp_count = screen->TPs * screen->MPsInTP;
+
    stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
          STACK_WARPS_ALLOC * 64 * 8;
 
@@ -902,6 +953,12 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nv50_screen_init_hwctx(screen);
 
+   ret = nv50_screen_compute_setup(screen, screen->base.pushbuf);
+   if (ret) {
+      NOUVEAU_ERR("Failed to init compute context: %d\n", ret);
+      goto fail;
+   }
+
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index ce51f0fc254..2a4983d1020 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -23,6 +23,10 @@ struct nv50_context;
 
 #define NV50_MAX_VIEWPORTS 16
 
+#define NV50_MAX_GLOBALS 16
+
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
+
 struct nv50_blitter;
 
 struct nv50_graph_state {
@@ -66,6 +70,7 @@ struct nv50_screen {
    unsigned MPsInTP;
    unsigned max_tls_space;
    unsigned cur_tls_space;
+   unsigned mp_count;
 
    struct nouveau_heap *vp_code_heap;
    struct nouveau_heap *gp_code_heap;
@@ -90,9 +95,16 @@ struct nv50_screen {
       struct nouveau_bo *bo;
    } fence;
 
+   struct {
+      struct nv50_program *prog; /* compute state object to read MP counters */
+      struct nv50_hw_sm_query *mp_counter[4]; /* counter to query allocation */
+      uint8_t num_hw_sm_active;
+   } pm;
+
    struct nouveau_object *sync;
 
    struct nouveau_object *tesla;
+   struct nouveau_object *compute;
    struct nouveau_object *eng2d;
    struct nouveau_object *m2mf;
 };
@@ -103,12 +115,19 @@ nv50_screen(struct pipe_screen *screen)
    return (struct nv50_screen *)screen;
 }
 
+int nv50_screen_get_driver_query_info(struct pipe_screen *, unsigned,
+                                      struct pipe_driver_query_info *);
+int nv50_screen_get_driver_query_group_info(struct pipe_screen *, unsigned,
+                                            struct pipe_driver_query_group_info *);
+
 bool nv50_blitter_create(struct nv50_screen *);
 void nv50_blitter_destroy(struct nv50_screen *);
 
 int nv50_screen_tic_alloc(struct nv50_screen *, void *);
 int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
 
+int nv50_screen_compute_setup(struct nv50_screen *, struct nouveau_pushbuf *);
+
 static inline void
 nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index d27f12ca94b..b4ea08d4d13 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -792,6 +792,35 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     nv50->dirty |= NV50_NEW_GMTYPROG;
 }
 
+static void *
+nv50_cp_state_create(struct pipe_context *pipe,
+                     const struct pipe_compute_state *cso)
+{ /* returns a new nv50_program for the compute state, or NULL if the allocation fails */
+   struct nv50_program *prog;
+
+   prog = CALLOC_STRUCT(nv50_program);
+   if (!prog)
+      return NULL;
+   prog->type = PIPE_SHADER_COMPUTE;
+
+   prog->cp.smem_size = cso->req_local_mem;
+   prog->cp.lmem_size = cso->req_private_mem;
+   prog->parm_size = cso->req_input_mem;
+
+   prog->pipe.tokens = tgsi_dup_tokens((const struct tgsi_token *)cso->prog); /* NOTE(review): result is not checked */
+
+   return (void *)prog;
+}
+
+static void
+nv50_cp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+
+   nv50->compprog = hwcso; /* stored as-is, no NULL check */
+   nv50->dirty_cp |= NV50_NEW_CP_PROGRAM; /* flag the compute program as changed */
+}
+
 static void
 nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
                          struct pipe_constant_buffer *cb)
@@ -1134,6 +1163,70 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
       nv50->dirty |= NV50_NEW_STRMOUT;
 }
 
+static void
+nv50_set_compute_resources(struct pipe_context *pipe,
+                           unsigned start, unsigned nr,
+                           struct pipe_surface **resources)
+{ /* currently a no-op: all arguments are ignored */
+   /* TODO: bind surfaces */
+}
+
+static inline void
+nv50_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
+{ /* writes the buffer's 32-bit address to *phandle, or 0 if res is NULL or does not fit */
+   struct nv04_resource *buf = nv04_resource(res);
+   if (buf) {
+      uint64_t limit = (buf->address + buf->base.width0) - 1; /* address of the last byte */
+      if (limit < (1ULL << 32)) {
+         *phandle = (uint32_t)buf->address;
+      } else {
+         NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: "
+                     "resource not contained within 32-bit address space !\n");
+         *phandle = 0;
+      }
+   } else {
+      *phandle = 0;
+   }
+}
+
+/* Bind (or, with resources == NULL, unbind) 'nr' global buffers starting at
+ * slot 'start'; each bound buffer's 32-bit address is stored in *handles[i]. */
+static void
+nv50_set_global_bindings(struct pipe_context *pipe,
+                         unsigned start, unsigned nr,
+                         struct pipe_resource **resources,
+                         uint32_t **handles)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct pipe_resource **ptr;
+   unsigned i;
+   const unsigned end = start + nr;
+
+   if (nv50->global_residents.size <= (end * sizeof(struct pipe_resource *))) {
+      const unsigned old_size = nv50->global_residents.size;
+      const unsigned req_size = end * sizeof(struct pipe_resource *);
+      util_dynarray_resize(&nv50->global_residents, req_size);
+      memset((uint8_t *)nv50->global_residents.data + old_size, 0,
+             req_size - old_size);
+   }
+
+   ptr = util_dynarray_element(
+      &nv50->global_residents, struct pipe_resource *, start);
+   if (resources) {
+      for (i = 0; i < nr; ++i) {
+         pipe_resource_reference(&ptr[i], resources[i]);
+         nv50_set_global_handle(handles[i], resources[i]);
+      }
+   } else {
+      for (i = 0; i < nr; ++i)
+         pipe_resource_reference(&ptr[i], NULL);
+   }
+
+   nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL);
+   /* accumulate: '=' would drop pending bits such as NV50_NEW_CP_PROGRAM */
+   nv50->dirty_cp |= NV50_NEW_CP_GLOBALS;
+}
+
 void
 nv50_init_state_functions(struct nv50_context *nv50)
 {
@@ -1162,12 +1255,15 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->create_vs_state = nv50_vp_state_create;
    pipe->create_fs_state = nv50_fp_state_create;
    pipe->create_gs_state = nv50_gp_state_create;
+   pipe->create_compute_state = nv50_cp_state_create;
    pipe->bind_vs_state = nv50_vp_state_bind;
    pipe->bind_fs_state = nv50_fp_state_bind;
    pipe->bind_gs_state = nv50_gp_state_bind;
+   pipe->bind_compute_state = nv50_cp_state_bind;
    pipe->delete_vs_state = nv50_sp_state_delete;
    pipe->delete_fs_state = nv50_sp_state_delete;
    pipe->delete_gs_state = nv50_sp_state_delete;
+   pipe->delete_compute_state = nv50_sp_state_delete;
 
    pipe->set_blend_color = nv50_set_blend_color;
    pipe->set_stencil_ref = nv50_set_stencil_ref;
@@ -1191,6 +1287,9 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->stream_output_target_destroy = nv50_so_target_destroy;
    pipe->set_stream_output_targets = nv50_set_stream_output_targets;
 
+   pipe->set_global_binding = nv50_set_global_bindings;
+   pipe->set_compute_resources = nv50_set_compute_resources;
+
    nv50->sample_mask = ~0;
    nv50->min_samples = 1;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index b6181edf24f..02a759c23ad 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -503,8 +503,7 @@ static struct state_validate {
     { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
     { nv50_stream_output_validate, NV50_NEW_STRMOUT |
                                    NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS |
-                                   NV50_NEW_VERTPROG },
+    { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS },
     { nv50_validate_min_samples,   NV50_NEW_MIN_SAMPLES },
 };
 #define validate_list_len (sizeof(validate_list) / sizeof(validate_list[0]))
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 916a7d44a31..8ba19d2cc90 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -339,12 +339,18 @@ nv50_clear_render_target(struct pipe_context *pipe,
    PUSH_DATA (push, (width << 16) | dstx);
    PUSH_DATA (push, (height << 16) | dsty);
 
+   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+   PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
    BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
       PUSH_DATA (push, 0x3c |
                  (z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
    }
 
+   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+   PUSH_DATA (push, nv50->cond_condmode);
+
    nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }
 
@@ -415,12 +421,18 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    PUSH_DATA (push, (width << 16) | dstx);
    PUSH_DATA (push, (height << 16) | dsty);
 
+   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+   PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
    BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
       PUSH_DATA (push, mode |
                  (z << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
    }
 
+   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+   PUSH_DATA (push, nv50->cond_condmode);
+
    nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }
 
@@ -673,6 +685,9 @@ nv50_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, (width << 16));
    PUSH_DATA (push, (height << 16));
 
+   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+   PUSH_DATA (push, NV50_3D_COND_MODE_ALWAYS);
+
    BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1);
    PUSH_DATA (push, 0x3c);
 
@@ -690,6 +705,9 @@ nv50_clear_buffer(struct pipe_context *pipe,
       PUSH_DATA (push, 0x3c);
    }
 
+   BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
+   PUSH_DATA (push, nv50->cond_condmode);
+
    nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence);
    nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 9aa593f919e..85878d5fcc7 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -294,8 +294,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
    uint64_t addrs[PIPE_MAX_ATTRIBS];
    uint32_t limits[PIPE_MAX_ATTRIBS];
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
-   struct nv50_vertex_stateobj dummy = {};
-   struct nv50_vertex_stateobj *vertex = nv50->vertex ? nv50->vertex : &dummy;
+   struct nv50_vertex_stateobj *vertex = nv50->vertex;
    struct pipe_vertex_buffer *vb;
    struct nv50_vertex_element *ve;
    uint32_t mask;
@@ -303,14 +302,6 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
    unsigned i;
    const unsigned n = MAX2(vertex->num_elements, nv50->state.num_vtxelts);
 
-   /* A vertexid is not generated for inline data uploads. Have to use a
-    * VBO. This check must come after the vertprog has been validated,
-    * otherwise vertexid may be unset.
-    */
-   assert(nv50->vertprog->translated);
-   if (nv50->vertprog->vp.vertexid)
-      nv50->vbo_push_hint = 0;
-
    if (unlikely(vertex->need_conversion))
       nv50->vbo_fifo = ~0;
    else
@@ -487,7 +478,7 @@ nv50_draw_arrays(struct nv50_context *nv50,
       BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
       PUSH_DATA (push, 0);
       if (nv50->screen->base.class_3d >= NV84_3D_CLASS) {
-         BEGIN_NV04(push, SUBC_3D(NV84_3D_VERTEX_ID_BASE), 1);
+         BEGIN_NV04(push, NV84_3D(VERTEX_ID_BASE), 1);
          PUSH_DATA (push, 0);
       }
       nv50->state.index_bias = 0;
@@ -613,7 +604,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
       BEGIN_NV04(push, NV50_3D(VB_ELEMENT_BASE), 1);
       PUSH_DATA (push, index_bias);
       if (nv50->screen->base.class_3d >= NV84_3D_CLASS) {
-         BEGIN_NV04(push, SUBC_3D(NV84_3D_VERTEX_ID_BASE), 1);
+         BEGIN_NV04(push, NV84_3D(VERTEX_ID_BASE), 1);
          PUSH_DATA (push, index_bias);
       }
       nv50->state.index_bias = index_bias;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
index 76f1b41ea70..68002305d72 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -49,6 +49,7 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 
 #define SUBC_3D(m) 3, (m)
 #define NV50_3D(n) SUBC_3D(NV50_3D_##n)
+#define NV84_3D(n) SUBC_3D(NV84_3D_##n)
 #define NVA0_3D(n) SUBC_3D(NVA0_3D_##n)
 
 #define SUBC_2D(m) 4, (m)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 82ed5a1864e..162661ff2a7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -180,9 +180,10 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
                                  int ref)
 {
    struct nvc0_context *nvc0 = nvc0_context(&ctx->pipe);
+   unsigned bind = res->bind ? res->bind : PIPE_BIND_VERTEX_BUFFER;
    unsigned s, i;
 
-   if (res->bind & PIPE_BIND_RENDER_TARGET) {
+   if (bind & PIPE_BIND_RENDER_TARGET) {
       for (i = 0; i < nvc0->framebuffer.nr_cbufs; ++i) {
          if (nvc0->framebuffer.cbufs[i] &&
              nvc0->framebuffer.cbufs[i]->texture == res) {
@@ -193,7 +194,7 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
          }
       }
    }
-   if (res->bind & PIPE_BIND_DEPTH_STENCIL) {
+   if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nvc0->framebuffer.zsbuf &&
           nvc0->framebuffer.zsbuf->texture == res) {
          nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -203,12 +204,12 @@ nvc0_invalidate_resource_storage(struct nouveau_context *ctx,
       }
    }
 
-   if (res->bind & (PIPE_BIND_VERTEX_BUFFER |
-                    PIPE_BIND_INDEX_BUFFER |
-                    PIPE_BIND_CONSTANT_BUFFER |
-                    PIPE_BIND_STREAM_OUTPUT |
-                    PIPE_BIND_COMMAND_ARGS_BUFFER |
-                    PIPE_BIND_SAMPLER_VIEW)) {
+   if (bind & (PIPE_BIND_VERTEX_BUFFER |
+               PIPE_BIND_INDEX_BUFFER |
+               PIPE_BIND_CONSTANT_BUFFER |
+               PIPE_BIND_STREAM_OUTPUT |
+               PIPE_BIND_COMMAND_ARGS_BUFFER |
+               PIPE_BIND_SAMPLER_VIEW)) {
       for (i = 0; i < nvc0->num_vtxbufs; ++i) {
          if (nvc0->vtxbuf[i].buffer == res) {
             nvc0->dirty |= NVC0_NEW_ARRAYS;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index f53921092a5..d992b10a23c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -162,6 +162,7 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
    info->max_value.u64 = 0;
    info->type = PIPE_DRIVER_QUERY_TYPE_UINT64;
    info->group_id = -1;
+   info->flags = 0;
 
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
    if (id < num_sw_queries)
@@ -200,7 +201,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    if (id == NVC0_HW_SM_QUERY_GROUP) {
       if (screen->compute) {
          info->name = "MP counters";
-         info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
 
          /* Because we can't expose the number of hardware counters needed for
           * each different query, we don't want to allow more than one active
@@ -224,7 +224,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
       if (screen->compute) {
          if (screen->base.class_3d < NVE4_3D_CLASS) {
             info->name = "Performance metrics";
-            info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
             info->max_active_queries = 1;
             info->num_queries = NVC0_HW_METRIC_QUERY_COUNT;
             return 1;
@@ -234,7 +233,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
    else if (id == NVC0_SW_QUERY_DRV_STAT_GROUP) {
       info->name = "Driver statistics";
-      info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_CPU;
       info->max_active_queries = NVC0_SW_QUERY_DRV_STAT_COUNT;
       info->num_queries = NVC0_SW_QUERY_DRV_STAT_COUNT;
       return 1;
@@ -245,7 +243,6 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    info->name = "this_is_not_the_query_group_you_are_looking_for";
    info->max_active_queries = 0;
    info->num_queries = 0;
-   info->type = 0;
    return 0;
 }
 
@@ -260,4 +257,5 @@ nvc0_init_query_functions(struct nvc0_context *nvc0)
    pipe->end_query = nvc0_end_query;
    pipe->get_query_result = nvc0_get_query_result;
    pipe->render_condition = nvc0_render_condition;
+   nvc0->cond_condmode = NVC0_3D_COND_MODE_ALWAYS;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 44b222e5134..7962143d45a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -1014,14 +1014,15 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
       struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
       prog->type = PIPE_SHADER_COMPUTE;
       prog->translated = true;
-      prog->num_gprs = 14;
       prog->parm_size = 12;
       if (is_nve4) {
          prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
          prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
+         prog->num_gprs = 14;
       } else {
          prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
          prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
+         prog->num_gprs = 12;
       }
       screen->pm.prog = prog;
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index cdb1fc1145f..6a4ae5be2ab 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -341,12 +341,16 @@ nvc0_clear_render_target(struct pipe_context *pipe,
       nvc0_resource_fence(res, NOUVEAU_BO_WR);
    }
 
+   IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
    BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
       PUSH_DATA (push, 0x3c |
                  (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
    }
 
+   IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
    nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
 }
 
@@ -470,6 +474,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
    IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
    IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);
 
+   IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
    IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
 
    if (width * height != elements) {
@@ -486,6 +492,8 @@ nvc0_clear_buffer(struct pipe_context *pipe,
       IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
    }
 
+   IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
    nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence);
    nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr);
    nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
@@ -545,12 +553,16 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe,
    PUSH_DATA (push, dst->u.tex.first_layer);
    IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
 
+   IMMED_NVC0(push, NVC0_3D(COND_MODE), NVC0_3D_COND_MODE_ALWAYS);
+
    BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
       PUSH_DATA (push, mode |
                  (z << NVC0_3D_CLEAR_BUFFERS_LAYER__SHIFT));
    }
 
+   IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode);
+
    nvc0->dirty |= NVC0_NEW_FRAMEBUFFER;
 }