diff options
Diffstat (limited to 'src/gallium/drivers/nv50')
-rw-r--r-- | src/gallium/drivers/nv50/nv50_context.c | 40 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_context.h | 35 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_miptree.c | 79 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_program.c | 2299 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_program.h | 35 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_screen.c | 76 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_state.c | 32 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_state_validate.c | 135 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_surface.c | 6 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_tex.c | 220 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_texture.h | 15 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_transfer.c | 179 | ||||
-rw-r--r-- | src/gallium/drivers/nv50/nv50_vbo.c | 413 |
13 files changed, 2571 insertions, 993 deletions
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c index 6e8f4f9750d..219e7a78623 100644 --- a/src/gallium/drivers/nv50/nv50_context.c +++ b/src/gallium/drivers/nv50/nv50_context.c @@ -33,15 +33,9 @@ nv50_flush(struct pipe_context *pipe, unsigned flags, { struct nv50_context *nv50 = nv50_context(pipe); struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *eng2d = nv50->screen->eng2d; - /* We need this in the ddx for reliable composite, not sure what we're - * actually flushing. We generate all our own flushes with flags = 0. */ - WAIT_RING(chan, 3); - BEGIN_RING(chan, eng2d, 0x0110, 1); - OUT_RING (chan, 0); - - FIRE_RING(chan); + if (flags & PIPE_FLUSH_FRAME) + FIRE_RING(chan); } static void @@ -59,29 +53,6 @@ nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) { } -static unsigned int -nv50_is_texture_referenced( struct pipe_context *pipe, - struct pipe_texture *texture, - unsigned face, unsigned level) -{ - /** - * FIXME: Optimize. - */ - - return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE; -} - -static unsigned int -nv50_is_buffer_referenced( struct pipe_context *pipe, - struct pipe_buffer *buf) -{ - /** - * FIXME: Optimize. 
- */ - - return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE; -} - struct pipe_context * nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) { @@ -107,8 +78,11 @@ nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) nv50->pipe.flush = nv50_flush; - nv50->pipe.is_texture_referenced = nv50_is_texture_referenced; - nv50->pipe.is_buffer_referenced = nv50_is_buffer_referenced; + nv50->pipe.is_texture_referenced = nouveau_is_texture_referenced; + nv50->pipe.is_buffer_referenced = nouveau_is_buffer_referenced; + + screen->base.channel->user_private = nv50; + screen->base.channel->flush_notify = nv50_state_flush_notify; nv50_init_surface_functions(nv50); nv50_init_state_functions(nv50); diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h index 1e9e8e49bfb..4b0f0622953 100644 --- a/src/gallium/drivers/nv50/nv50_context.h +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -14,6 +14,7 @@ #include "nouveau/nouveau_winsys.h" #include "nouveau/nouveau_gldefs.h" #include "nouveau/nouveau_stateobj.h" +#include "nouveau/nouveau_context.h" #include "nv50_screen.h" #include "nv50_program.h" @@ -68,6 +69,18 @@ struct nv50_sampler_stateobj { unsigned tsc[8]; }; +static INLINE unsigned +get_tile_height(uint32_t tile_mode) +{ + return 1 << ((tile_mode & 0xf) + 2); +} + +static INLINE unsigned +get_tile_depth(uint32_t tile_mode) +{ + return 1 << (tile_mode >> 4); +} + struct nv50_miptree_level { int *image_offset; unsigned pitch; @@ -116,9 +129,11 @@ struct nv50_state { unsigned miptree_nr; struct nouveau_stateobj *vertprog; struct nouveau_stateobj *fragprog; + struct nouveau_stateobj *programs; struct nouveau_stateobj *vtxfmt; struct nouveau_stateobj *vtxbuf; struct nouveau_stateobj *vtxattr; + unsigned vtxelt_nr; }; struct nv50_context { @@ -151,6 +166,8 @@ struct nv50_context { unsigned sampler_nr; struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS]; unsigned miptree_nr; + + uint16_t vbo_fifo; }; static INLINE struct 
nv50_context * @@ -190,12 +207,28 @@ extern void nv50_clear(struct pipe_context *pipe, unsigned buffers, /* nv50_program.c */ extern void nv50_vertprog_validate(struct nv50_context *nv50); extern void nv50_fragprog_validate(struct nv50_context *nv50); -extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p); +extern void nv50_linkage_validate(struct nv50_context *nv50); +extern void nv50_program_destroy(struct nv50_context *nv50, + struct nv50_program *p); /* nv50_state_validate.c */ extern boolean nv50_state_validate(struct nv50_context *nv50); +extern void nv50_state_flush_notify(struct nouveau_channel *chan); + +extern void nv50_so_init_sifc(struct nv50_context *nv50, + struct nouveau_stateobj *so, + struct nouveau_bo *bo, unsigned reloc, + unsigned size); /* nv50_tex.c */ extern void nv50_tex_validate(struct nv50_context *); +/* nv50_transfer.c */ +extern void +nv50_upload_sifc(struct nv50_context *nv50, + struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc, + unsigned dst_format, int dst_w, int dst_h, int dst_pitch, + void *src, unsigned src_format, int src_pitch, + int x, int y, int w, int h, int cpp); + #endif diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c index 03b9243b828..9c20c5cc282 100644 --- a/src/gallium/drivers/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -26,6 +26,35 @@ #include "nv50_context.h" +/* The restrictions in tile mode selection probably aren't necessary. 
*/ +static INLINE uint32_t +get_tile_mode(unsigned ny, unsigned d) +{ + uint32_t tile_mode = 0x00; + + if (ny > 32) tile_mode = 0x04; /* height 64 tiles */ + else + if (ny > 16) tile_mode = 0x03; /* height 32 tiles */ + else + if (ny > 8) tile_mode = 0x02; /* height 16 tiles */ + else + if (ny > 4) tile_mode = 0x01; /* height 8 tiles */ + + if (d == 1) + return tile_mode; + else + if (tile_mode > 0x02) + tile_mode = 0x02; + + if (d > 16 && tile_mode < 0x02) + return tile_mode | 0x50; /* depth 32 tiles */ + if (d > 8) return tile_mode | 0x40; /* depth 16 tiles */ + if (d > 4) return tile_mode | 0x30; /* depth 8 tiles */ + if (d > 2) return tile_mode | 0x20; /* depth 4 tiles */ + + return tile_mode | 0x10; +} + static struct pipe_texture * nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) { @@ -33,8 +62,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_texture *pt = &mt->base.base; unsigned width = tmp->width[0], height = tmp->height[0]; - unsigned depth = tmp->depth[0]; - uint32_t tile_mode, tile_flags, tile_h; + unsigned depth = tmp->depth[0], image_alignment; + uint32_t tile_flags; int ret, i, l; *pt = *tmp; @@ -57,24 +86,8 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) break; } - if (pt->height[0] > 32) tile_mode = 4; - else if (pt->height[0] > 16) tile_mode = 3; - else if (pt->height[0] > 8) tile_mode = 2; - else if (pt->height[0] > 4) tile_mode = 1; - else tile_mode = 0; - tile_h = 1 << (tile_mode + 2); - - switch (pt->target) { - case PIPE_TEXTURE_3D: - mt->image_nr = pt->depth[0]; - break; - case PIPE_TEXTURE_CUBE: - mt->image_nr = 6; - break; - default: - mt->image_nr = 1; - break; - } + /* XXX: texture arrays */ + mt->image_nr = (pt->target == PIPE_TEXTURE_CUBE) ? 
6 : 1; for (l = 0; l <= pt->last_level; l++) { struct nv50_miptree_level *lvl = &mt->level[l]; @@ -86,33 +99,36 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp) pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); lvl->image_offset = CALLOC(mt->image_nr, sizeof(int)); - lvl->pitch = align(pt->width[l] * pt->block.size, 64); - lvl->tile_mode = tile_mode; + lvl->pitch = align(pt->nblocksx[l] * pt->block.size, 64); + lvl->tile_mode = get_tile_mode(pt->nblocksy[l], depth); width = MAX2(1, width >> 1); height = MAX2(1, height >> 1); depth = MAX2(1, depth >> 1); - - if (tile_mode && height <= (tile_h >> 1)) { - tile_mode--; - tile_h >>= 1; - } } + image_alignment = get_tile_height(mt->level[0].tile_mode) * 64; + image_alignment *= get_tile_depth(mt->level[0].tile_mode); + + /* NOTE the distinction between arrays of mip-mapped 2D textures and + * mip-mapped 3D textures. We can't use image_nr == depth for 3D mip. + */ for (i = 0; i < mt->image_nr; i++) { for (l = 0; l <= pt->last_level; l++) { struct nv50_miptree_level *lvl = &mt->level[l]; int size; - tile_h = 1 << (lvl->tile_mode + 2); + unsigned tile_h = get_tile_height(lvl->tile_mode); + unsigned tile_d = get_tile_depth(lvl->tile_mode); - size = align(pt->width[l], 8) * pt->block.size; - size = align(size, 64); - size *= align(pt->height[l], tile_h); + size = lvl->pitch; + size *= align(pt->nblocksy[l], tile_h); + size *= align(pt->depth[l], tile_d); lvl->image_offset[i] = mt->total_size; mt->total_size += size; } + mt->total_size = align(mt->total_size, image_alignment); } ret = nouveau_bo_new_tile(dev, NOUVEAU_BO_VRAM, 256, mt->total_size, @@ -148,6 +164,7 @@ nv50_miptree_blanket(struct pipe_screen *pscreen, const struct pipe_texture *pt, mt->image_nr = 1; mt->level[0].pitch = *stride; mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); + mt->level[0].tile_mode = bo->tile_mode; nouveau_bo_ref(bo, &mt->base.bo); return &mt->base.base; diff --git 
a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 4a838529de7..bf50982dd16 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -31,9 +31,12 @@ #include "nv50_context.h" -#define NV50_SU_MAX_TEMP 64 +#define NV50_SU_MAX_TEMP 127 +#define NV50_SU_MAX_ADDR 4 //#define NV50_PROGRAM_DUMP +/* $a5 and $a6 always seem to be 0, and using $a7 gives you noise */ + /* ARL - gallium craps itself on progs/vp/arl.txt * * MSB - Like MAD, but MUL+SUB @@ -79,22 +82,32 @@ struct nv50_reg { P_ATTR, P_RESULT, P_CONST, - P_IMMD + P_IMMD, + P_ADDR } type; int index; int hw; - int neg; + int mod; int rhw; /* result hw for FP outputs, or interpolant index */ int acc; /* instruction where this reg is last read (first insn == 1) */ }; +#define NV50_MOD_NEG 1 +#define NV50_MOD_ABS 2 +#define NV50_MOD_SAT 4 + +/* arbitrary limits */ +#define MAX_IF_DEPTH 4 +#define MAX_LOOP_DEPTH 4 + struct nv50_pc { struct nv50_program *p; /* hw resources */ struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; + struct nv50_reg r_addr[NV50_SU_MAX_ADDR]; /* tgsi resources */ struct nv50_reg *temp; @@ -108,15 +121,28 @@ struct nv50_pc { struct nv50_reg *immd; float *immd_buf; int immd_nr; + struct nv50_reg **addr; + int addr_nr; struct nv50_reg *temp_temp[16]; unsigned temp_temp_nr; + /* broadcast and destination replacement regs */ + struct nv50_reg *r_brdc; + struct nv50_reg *r_dst[4]; + unsigned interp_mode[32]; /* perspective interpolation registers */ struct nv50_reg *iv_p; struct nv50_reg *iv_c; + struct nv50_program_exec *if_cond; + struct nv50_program_exec *if_insn[MAX_IF_DEPTH]; + struct nv50_program_exec *br_join[MAX_IF_DEPTH]; + struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */ + int if_lvl, loop_lvl; + unsigned loop_pos[MAX_LOOP_DEPTH]; + /* current instruction and total number of insns */ unsigned insn_cur; unsigned insn_nr; @@ -124,6 +150,36 @@ struct nv50_pc { boolean allow32; }; +static INLINE 
void +ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw) +{ + reg->type = type; + reg->index = index; + reg->hw = hw; + reg->mod = 0; + reg->rhw = -1; + reg->acc = 0; +} + +static INLINE unsigned +popcnt4(uint32_t val) +{ + static const unsigned cnt[16] + = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 }; + return cnt[val & 0xf]; +} + +static void +terminate_mbb(struct nv50_pc *pc) +{ + int i; + + /* remove records of temporary address register values */ + for (i = 0; i < NV50_SU_MAX_ADDR; ++i) + if (pc->r_addr[i].index < 0) + pc->r_addr[i].rhw = -1; +} + static void alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) { @@ -173,6 +229,10 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) assert(0); } +/* XXX: For shaders that aren't executed linearly (e.g. shaders that + * contain loops), we need to assign all hw regs to TGSI TEMPs early, + * lest we risk temp_temps overwriting regs alloc'd "later". + */ static struct nv50_reg * alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) { @@ -184,11 +244,8 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) for (i = 0; i < NV50_SU_MAX_TEMP; i++) { if (!pc->r_temp[i]) { - r = CALLOC_STRUCT(nv50_reg); - r->type = P_TEMP; - r->index = -1; - r->hw = i; - r->rhw = -1; + r = MALLOC_STRUCT(nv50_reg); + ctor_reg(r, P_TEMP, -1, i); pc->r_temp[i] = r; return r; } @@ -254,10 +311,8 @@ alloc_temp4(struct nv50_pc *pc, struct nv50_reg *dst[4], int idx) return alloc_temp4(pc, dst, idx + 4); for (i = 0; i < 4; i++) { - dst[i] = CALLOC_STRUCT(nv50_reg); - dst[i]->type = P_TEMP; - dst[i]->index = -1; - dst[i]->hw = idx + i; + dst[i] = MALLOC_STRUCT(nv50_reg); + ctor_reg(dst[i], P_TEMP, -1, idx + i); pc->r_temp[idx + i] = dst[i]; } @@ -309,7 +364,7 @@ ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) static struct nv50_reg * alloc_immd(struct nv50_pc *pc, float f) { - struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); + struct nv50_reg *r = MALLOC_STRUCT(nv50_reg); unsigned hw; for (hw = 0; hw < 
pc->immd_nr * 4; hw++) @@ -319,9 +374,7 @@ alloc_immd(struct nv50_pc *pc, float f) if (hw == pc->immd_nr * 4) hw = ctor_immd(pc, f, -f, 0.5 * f, 0) * 4; - r->type = P_IMMD; - r->hw = hw; - r->index = -1; + ctor_reg(r, P_IMMD, -1, hw); return r; } @@ -403,14 +456,20 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) } alloc_reg(pc, dst); + if (dst->hw > 63) + set_long(pc, e); e->inst[0] |= (dst->hw << 2); } static INLINE void set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) { + unsigned val; float f = pc->immd_buf[imm->hw]; - unsigned val = fui(imm->neg ? -f : f); + + if (imm->mod & NV50_MOD_ABS) + f = fabsf(f); + val = fui((imm->mod & NV50_MOD_NEG) ? -f : f); set_long(pc, e); /*XXX: can't be predicated - bits overlap.. catch cases where both @@ -423,9 +482,96 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) e->inst[1] |= (val >> 6) << 2; } +static INLINE void +set_addr(struct nv50_program_exec *e, struct nv50_reg *a) +{ + assert(!(e->inst[0] & 0x0c000000)); + assert(!(e->inst[1] & 0x00000004)); + + e->inst[0] |= (a->hw & 3) << 26; + e->inst[1] |= (a->hw >> 2) << 2; +} + +static void +emit_add_addr_imm(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *src0, uint16_t src1_val) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xd0000000 | (src1_val << 9); + e->inst[1] = 0x20000000; + set_long(pc, e); + e->inst[0] |= dst->hw << 2; + if (src0) /* otherwise will add to $a0, which is always 0 */ + set_addr(e, src0); + + emit(pc, e); +} + +static struct nv50_reg * +alloc_addr(struct nv50_pc *pc, struct nv50_reg *ref) +{ + int i; + struct nv50_reg *a_tgsi = NULL, *a = NULL; + + if (!ref) { + /* allocate for TGSI address reg */ + for (i = 0; i < NV50_SU_MAX_ADDR; ++i) { + if (pc->r_addr[i].index >= 0) + continue; + if (pc->r_addr[i].rhw >= 0 && + pc->r_addr[i].acc == pc->insn_cur) + continue; + + pc->r_addr[i].rhw = -1; + pc->r_addr[i].index = i; + return 
&pc->r_addr[i]; + } + assert(0); + return NULL; + } + + /* Allocate and set an address reg so we can access 'ref'. + * + * If and r_addr has index < 0, it is not reserved for TGSI, + * and index will be the negative of the TGSI addr index the + * value in rhw is relative to, or -256 if rhw is an offset + * from 0. If rhw < 0, the reg has not been initialized. + */ + for (i = NV50_SU_MAX_ADDR - 1; i >= 0; --i) { + if (pc->r_addr[i].index >= 0) /* occupied for TGSI */ + continue; + if (pc->r_addr[i].rhw < 0) { /* unused */ + a = &pc->r_addr[i]; + continue; + } + if (!a && pc->r_addr[i].acc != pc->insn_cur) + a = &pc->r_addr[i]; + + if (ref->hw - pc->r_addr[i].rhw >= 128) + continue; + + if ((ref->acc >= 0 && pc->r_addr[i].index == -256) || + (ref->acc < 0 && -pc->r_addr[i].index == ref->index)) { + pc->r_addr[i].acc = pc->insn_cur; + return &pc->r_addr[i]; + } + } + assert(a); + + if (ref->acc < 0) + a_tgsi = pc->addr[ref->index]; + + emit_add_addr_imm(pc, a, a_tgsi, (ref->hw & ~0x7f) * 4); + + a->rhw = ref->hw & ~0x7f; + a->acc = pc->insn_cur; + a->index = a_tgsi ? -ref->index : -256; + return a; +} #define INTERP_LINEAR 0 -#define INTERP_FLAT 1 +#define INTERP_FLAT 1 #define INTERP_PERSPECTIVE 2 #define INTERP_CENTROID 4 @@ -463,10 +609,18 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, { set_long(pc, e); - e->param.index = src->hw; + e->param.index = src->hw & 127; e->param.shift = s; e->param.mask = m << (s % 32); + if (src->hw > 127) + set_addr(e, alloc_addr(pc, src)); + else + if (src->acc < 0) { + assert(src->type == P_CONST); + set_addr(e, pc->addr[src->index]); + } + e->inst[1] |= (((src->type == P_IMMD) ? 
0 : 1) << 22); } @@ -475,11 +629,13 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) { struct nv50_program_exec *e = exec(pc); - e->inst[0] |= 0x10000000; + e->inst[0] = 0x10000000; + if (!pc->allow32) + set_long(pc, e); set_dst(pc, dst, e); - if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) { + if (!is_long(e) && src->type == P_IMMD) { set_immd(pc, src, e); /*XXX: 32-bit, but steals part of "half" reg space - need to * catch and handle this case if/when we do half-regs @@ -496,6 +652,8 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) } alloc_reg(pc, src); + if (src->hw > 63) + set_long(pc, e); e->inst[0] |= (src->hw << 9); } @@ -543,6 +701,24 @@ check_swap_src_0_1(struct nv50_pc *pc, } static void +set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src, + struct nv50_program_exec *e) +{ + struct nv50_reg *temp; + + if (src->type != P_TEMP) { + temp = temp_temp(pc); + emit_mov(pc, temp, src); + src = temp; + } + + alloc_reg(pc, src); + if (src->hw > 63) + set_long(pc, e); + e->inst[0] |= (src->hw << 9); +} + +static void set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) { if (src->type == P_ATTR) { @@ -557,6 +733,8 @@ set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) } alloc_reg(pc, src); + if (src->hw > 63) + set_long(pc, e); e->inst[0] |= (src->hw << 9); } @@ -583,7 +761,9 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) } alloc_reg(pc, src); - e->inst[0] |= (src->hw << 16); + if (src->hw > 63) + set_long(pc, e); + e->inst[0] |= ((src->hw & 127) << 16); } static void @@ -611,7 +791,7 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) } alloc_reg(pc, src); - e->inst[1] |= (src->hw << 14); + e->inst[1] |= ((src->hw & 127) << 14); } static void @@ -629,12 +809,12 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, set_dst(pc, dst, e); 
set_src_0(pc, src0, e); if (src1->type == P_IMMD && !is_long(e)) { - if (src0->neg) + if (src0->mod & NV50_MOD_NEG) e->inst[0] |= 0x00008000; set_immd(pc, src1, e); } else { set_src_1(pc, src1, e); - if (src0->neg ^ src1->neg) { + if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) { if (is_long(e)) e->inst[1] |= 0x08000000; else @@ -651,13 +831,15 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst, { struct nv50_program_exec *e = exec(pc); - e->inst[0] |= 0xb0000000; + e->inst[0] = 0xb0000000; + alloc_reg(pc, src1); check_swap_src_0_1(pc, &src0, &src1); - if (!pc->allow32 || src0->neg || src1->neg) { + if (!pc->allow32 || (src0->mod | src1->mod) || src1->hw > 63) { set_long(pc, e); - e->inst[1] |= (src0->neg << 26) | (src1->neg << 27); + e->inst[1] |= ((src0->mod & NV50_MOD_NEG) << 26) | + ((src1->mod & NV50_MOD_NEG) << 27); } set_dst(pc, dst, e); @@ -674,6 +856,22 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst, } static void +emit_arl(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, + uint8_t s) +{ + struct nv50_program_exec *e = exec(pc); + + set_long(pc, e); + e->inst[1] |= 0xc0000000; + + e->inst[0] |= dst->hw << 2; + e->inst[0] |= s << 16; /* shift left */ + set_src_0_restricted(pc, src, e); + + emit(pc, e); +} + +static void emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1) { @@ -688,6 +886,11 @@ emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, set_src_0(pc, src0, e); set_src_1(pc, src1, e); + if (src0->mod & NV50_MOD_ABS) + e->inst[1] |= 0x00100000; + if (src1->mod & NV50_MOD_ABS) + e->inst[1] |= 0x00080000; + emit(pc, e); } @@ -695,9 +898,47 @@ static INLINE void emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1) { - src1->neg ^= 1; + assert(src0 != src1); + src1->mod ^= NV50_MOD_NEG; emit_add(pc, dst, src0, src1); - src1->neg ^= 1; + src1->mod ^= NV50_MOD_NEG; +} + +static void +emit_bitop2(struct nv50_pc *pc, 
struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1, unsigned op) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xd0000000; + set_long(pc, e); + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + + if (op != TGSI_OPCODE_AND && op != TGSI_OPCODE_OR && + op != TGSI_OPCODE_XOR) + assert(!"invalid bit op"); + + if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) { + set_immd(pc, src1, e); + if (op == TGSI_OPCODE_OR) + e->inst[0] |= 0x0100; + else + if (op == TGSI_OPCODE_XOR) + e->inst[0] |= 0x8000; + } else { + set_src_1(pc, src1, e); + e->inst[1] |= 0x04000000; /* 32 bit */ + if (op == TGSI_OPCODE_OR) + e->inst[1] |= 0x4000; + else + if (op == TGSI_OPCODE_XOR) + e->inst[1] |= 0x8000; + } + + emit(pc, e); } static void @@ -714,9 +955,9 @@ emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, set_src_1(pc, src1, e); set_src_2(pc, src2, e); - if (src0->neg ^ src1->neg) + if ((src0->mod ^ src1->mod) & NV50_MOD_NEG) e->inst[1] |= 0x04000000; - if (src2->neg) + if (src2->mod & NV50_MOD_NEG) e->inst[1] |= 0x08000000; emit(pc, e); @@ -726,9 +967,10 @@ static INLINE void emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1, struct nv50_reg *src2) { - src2->neg ^= 1; + assert(src2 != src0 && src2 != src1); + src2->mod ^= NV50_MOD_NEG; emit_mad(pc, dst, src0, src1, src2); - src2->neg ^= 1; + src2->mod ^= NV50_MOD_NEG; } static void @@ -744,7 +986,11 @@ emit_flop(struct nv50_pc *pc, unsigned sub, } set_dst(pc, dst, e); - set_src_0(pc, src, e); + + if (sub == 0 || sub == 2) + set_src_0_restricted(pc, src, e); + else + set_src_0(pc, src, e); emit(pc, e); } @@ -786,16 +1032,19 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) #define CVTOP_SAT 0x08 #define CVTOP_ABS 0x10 +/* 0x04 == 32 bit dst */ +/* 0x40 == dst is float */ +/* 0x80 == src is float */ #define CVT_F32_F32 0xc4 #define CVT_F32_S32 0x44 
-#define CVT_F32_U32 0x64 #define CVT_S32_F32 0x8c #define CVT_S32_S32 0x0c -#define CVT_F32_F32_ROP 0xcc +#define CVT_NEG 0x20 +#define CVT_RI 0x08 static void emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, - int wp, unsigned cop, unsigned fmt) + int wp, unsigned cvn, unsigned fmt) { struct nv50_program_exec *e; @@ -803,8 +1052,8 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, set_long(pc, e); e->inst[0] |= 0xa0000000; - e->inst[1] |= 0x00004000; - e->inst[1] |= (cop << 16); + e->inst[1] |= 0x00004000; /* 32 bit src */ + e->inst[1] |= (cvn << 16); e->inst[1] |= (fmt << 24); set_src_0(pc, src, e); @@ -821,53 +1070,85 @@ emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, emit(pc, e); } +/* nv50 Condition codes: + * 0x1 = LT + * 0x2 = EQ + * 0x3 = LE + * 0x4 = GT + * 0x5 = NE + * 0x6 = GE + * 0x7 = set condition code ? (used before bra.lt/le/gt/ge) + * 0x8 = unordered bit (allows NaN) + */ static void -emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst, +emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp, struct nv50_reg *src0, struct nv50_reg *src1) { + static const unsigned cc_swapped[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + struct nv50_program_exec *e = exec(pc); - unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; struct nv50_reg *rdst; - assert(c_op <= 7); + assert(ccode < 16); if (check_swap_src_0_1(pc, &src0, &src1)) - c_op = inv_cop[c_op]; + ccode = cc_swapped[ccode & 7] | (ccode & 8); rdst = dst; - if (dst->type != P_TEMP) + if (dst && dst->type != P_TEMP) dst = alloc_temp(pc, NULL); /* set.u32 */ set_long(pc, e); e->inst[0] |= 0xb0000000; - e->inst[1] |= (3 << 29); - e->inst[1] |= (c_op << 14); - /*XXX: breaks things, .u32 by default? - * decuda will disasm as .u16 and use .lo/.hi regs, but this - * doesn't seem to match what the hw actually does. - inst[1] |= 0x04000000; << breaks things.. .u32 by default? 
+ e->inst[1] |= 0x60000000 | (ccode << 14); + + /* XXX: decuda will disasm as .u16 and use .lo/.hi regs, but + * that doesn't seem to match what the hw actually does + e->inst[1] |= 0x04000000; << breaks things, u32 by default ? */ - set_dst(pc, dst, e); + + if (wp >= 0) + set_pred_wr(pc, 1, wp, e); + if (dst) + set_dst(pc, dst, e); + else { + e->inst[0] |= 0x000001fc; + e->inst[1] |= 0x00000008; + } + set_src_0(pc, src0, e); set_src_1(pc, src1, e); - emit(pc, e); - /* cvt.f32.u32 */ - e = exec(pc); - e->inst[0] = 0xa0000001; - e->inst[1] = 0x64014780; - set_dst(pc, rdst, e); - set_src_0(pc, dst, e); emit(pc, e); + pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */ - if (dst != rdst) + /* cvt.f32.u32/s32 (?) if we didn't only write the predicate */ + if (rdst) + emit_cvt(pc, rdst, dst, -1, CVTOP_ABS | CVTOP_RN, CVT_F32_S32); + if (rdst && rdst != dst) free_temp(pc, dst); } +static INLINE unsigned +map_tgsi_setop_cc(unsigned op) +{ + switch (op) { + case TGSI_OPCODE_SLT: return 0x1; + case TGSI_OPCODE_SGE: return 0x6; + case TGSI_OPCODE_SEQ: return 0x2; + case TGSI_OPCODE_SGT: return 0x4; + case TGSI_OPCODE_SLE: return 0x3; + case TGSI_OPCODE_SNE: return 0xd; + default: + assert(0); + return 0; + } +} + static INLINE void emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) { - emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP); + emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI); } static void @@ -890,6 +1171,12 @@ emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) emit_cvt(pc, dst, src, -1, CVTOP_ABS, CVT_F32_F32); } +static INLINE void +emit_sat(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + emit_cvt(pc, dst, src, -1, CVTOP_SAT, CVT_F32_F32); +} + static void emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, struct nv50_reg **src) @@ -944,20 +1231,10 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, FREE(one); } -static void +static 
INLINE void emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) { - struct nv50_program_exec *e = exec(pc); - - set_long(pc, e); - e->inst[0] |= 0xa0000000; /* delta */ - e->inst[1] |= (7 << 29); /* delta */ - e->inst[1] |= 0x04000000; /* negate arg0? probably not */ - e->inst[1] |= (1 << 14); /* src .f32 */ - set_dst(pc, dst, e); - set_src_0(pc, src, e); - - emit(pc, e); + emit_cvt(pc, dst, src, -1, CVTOP_RN, CVT_F32_F32 | CVT_NEG); } static void @@ -965,30 +1242,52 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src) { struct nv50_program_exec *e; const int r_pred = 1; + unsigned cvn = CVT_F32_F32; - /* Sets predicate reg ? */ - e = exec(pc); - e->inst[0] = 0xa00001fd; - e->inst[1] = 0xc4014788; - set_src_0(pc, src, e); - set_pred_wr(pc, 1, r_pred, e); - if (src->neg) - e->inst[1] |= 0x20000000; - emit(pc, e); + if (src->mod & NV50_MOD_NEG) + cvn |= CVT_NEG; + /* write predicate reg */ + emit_cvt(pc, NULL, src, r_pred, CVTOP_RN, cvn); - /* This is probably KILP */ + /* conditional discard */ e = exec(pc); - e->inst[0] = 0x000001fe; + e->inst[0] = 0x00000002; set_long(pc, e); - set_pred(pc, 1 /* LT? 
*/, r_pred, e); + set_pred(pc, 0x1 /* LT */, r_pred, e); emit(pc, e); } static void +load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4], + struct nv50_reg **src, boolean proj) +{ + int mod[3] = { src[0]->mod, src[1]->mod, src[2]->mod }; + + src[0]->mod |= NV50_MOD_ABS; + src[1]->mod |= NV50_MOD_ABS; + src[2]->mod |= NV50_MOD_ABS; + + emit_minmax(pc, 4, t[2], src[0], src[1]); + emit_minmax(pc, 4, t[2], src[2], t[2]); + + src[0]->mod = mod[0]; + src[1]->mod = mod[1]; + src[2]->mod = mod[2]; + + if (proj && 0 /* looks more correct without this */) + emit_mul(pc, t[2], t[2], src[3]); + emit_flop(pc, 0, t[2], t[2]); + + emit_mul(pc, t[0], src[0], t[2]); + emit_mul(pc, t[1], src[1], t[2]); + emit_mul(pc, t[2], src[2], t[2]); +} + +static void emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, struct nv50_reg **src, unsigned unit, unsigned type, boolean proj) { - struct nv50_reg *temp, *t[4]; + struct nv50_reg *t[4]; struct nv50_program_exec *e; unsigned c, mode, dim; @@ -1017,6 +1316,9 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, /* some cards need t[0]'s hw index to be a multiple of 4 */ alloc_temp4(pc, t, 0); + if (type == TGSI_TEXTURE_CUBE) { + load_cube_tex_coords(pc, t, src, proj); + } else if (proj) { if (src[0]->type == P_TEMP && src[0]->rhw != -1) { mode = pc->interp_mode[src[0]->index]; @@ -1041,17 +1343,8 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, */ } } else { - if (type == TGSI_TEXTURE_CUBE) { - temp = temp_temp(pc); - emit_minmax(pc, 4, temp, src[0], src[1]); - emit_minmax(pc, 4, temp, temp, src[2]); - emit_flop(pc, 0, temp, temp); - for (c = 0; c < 3; c++) - emit_mul(pc, t[c], src[c], temp); - } else { - for (c = 0; c < dim; c++) - emit_mov(pc, t[c], src[c]); - } + for (c = 0; c < dim; c++) + emit_mov(pc, t[c], src[c]); } e = exec(pc); @@ -1064,19 +1357,22 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, if (dim == 2) e->inst[0] |= 0x00400000; else - if (dim 
== 3) + if (dim == 3) { e->inst[0] |= 0x00800000; + if (type == TGSI_TEXTURE_CUBE) + e->inst[0] |= 0x08000000; + } e->inst[0] |= (mask & 0x3) << 25; e->inst[1] |= (mask & 0xc) << 12; emit(pc, e); - #if 1 - if (mask & 1) emit_mov(pc, dst[0], t[0]); - if (mask & 2) emit_mov(pc, dst[1], t[1]); - if (mask & 4) emit_mov(pc, dst[2], t[2]); - if (mask & 8) emit_mov(pc, dst[3], t[3]); + c = 0; + if (mask & 1) emit_mov(pc, dst[0], t[c++]); + if (mask & 2) emit_mov(pc, dst[1], t[c++]); + if (mask & 4) emit_mov(pc, dst[2], t[c++]); + if (mask & 8) emit_mov(pc, dst[3], t[c]); free_temp4(pc, t); #else @@ -1093,6 +1389,75 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, } static void +emit_branch(struct nv50_pc *pc, int pred, unsigned cc, + struct nv50_program_exec **join) +{ + struct nv50_program_exec *e = exec(pc); + + if (join) { + set_long(pc, e); + e->inst[0] |= 0xa0000002; + emit(pc, e); + *join = e; + e = exec(pc); + } + + set_long(pc, e); + e->inst[0] |= 0x10000002; + if (pred >= 0) + set_pred(pc, cc, pred, e); + emit(pc, e); +} + +static void +emit_nop(struct nv50_pc *pc) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xf0000000; + set_long(pc, e); + e->inst[1] = 0xe0000000; + emit(pc, e); +} + +static void +emit_ddx(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + assert(src->type == P_TEMP); + + e->inst[0] = 0xc0140000; + e->inst[1] = 0x89800000; + set_long(pc, e); + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_src_2(pc, src, e); + + emit(pc, e); +} + +static void +emit_ddy(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + assert(src->type == P_TEMP); + + if (!(src->mod & NV50_MOD_NEG)) /* ! 
double negation */ + emit_neg(pc, src, src); + + e->inst[0] = 0xc0150000; + e->inst[1] = 0x8a400000; + set_long(pc, e); + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_src_2(pc, src, e); + + emit(pc, e); +} + +static void convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) { unsigned q = 0, m = ~0; @@ -1140,10 +1505,14 @@ convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) e->inst[1] |= q; } +/* Some operations support an optional negation flag. */ static boolean negate_supported(const struct tgsi_full_instruction *insn, int i) { + int s; + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_DDY: case TGSI_OPCODE_DP3: case TGSI_OPCODE_DP4: case TGSI_OPCODE_MUL: @@ -1151,12 +1520,93 @@ negate_supported(const struct tgsi_full_instruction *insn, int i) case TGSI_OPCODE_ADD: case TGSI_OPCODE_SUB: case TGSI_OPCODE_MAD: - return TRUE; + break; case TGSI_OPCODE_POW: - return (i == 1) ? TRUE : FALSE; + if (i == 1) + break; + return FALSE; default: return FALSE; } + + /* Watch out for possible multiple uses of an nv50_reg, we + * can't use nv50_reg::neg in these cases. + */ + for (s = 0; s < insn->Instruction.NumSrcRegs; ++s) { + if (s == i) + continue; + if ((insn->FullSrcRegisters[s].SrcRegister.Index == + insn->FullSrcRegisters[i].SrcRegister.Index) && + (insn->FullSrcRegisters[s].SrcRegister.File == + insn->FullSrcRegisters[i].SrcRegister.File)) + return FALSE; + } + + return TRUE; +} + +/* Return a read mask for source registers deduced from opcode & write mask. */ +static unsigned +nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c) +{ + unsigned x, mask = insn->FullDstRegisters[0].DstRegister.WriteMask; + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + return (mask & 0x8) | ((mask & 0x7) ? 
0x1 : 0x0); + case TGSI_OPCODE_DP3: + return 0x7; + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_KIL: /* WriteMask ignored */ + return 0xf; + case TGSI_OPCODE_DST: + return mask & (c ? 0xa : 0x6); + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_SCS: + return 0x1; + case TGSI_OPCODE_LIT: + return 0xb; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXP: + { + const struct tgsi_instruction_ext_texture *tex; + + assert(insn->Instruction.Extended); + tex = &insn->InstructionExtTexture; + + mask = 0x7; + if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) + mask |= 0x8; + + switch (tex->Texture) { + case TGSI_TEXTURE_1D: + mask &= 0x9; + break; + case TGSI_TEXTURE_2D: + mask &= 0xb; + break; + default: + break; + } + } + return mask; + case TGSI_OPCODE_XPD: + x = 0; + if (mask & 1) x |= 0x6; + if (mask & 2) x |= 0x5; + if (mask & 4) x |= 0x3; + return x; + default: + break; + } + + return mask; } static struct nv50_reg * @@ -1167,6 +1617,16 @@ tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) return &pc->temp[dst->DstRegister.Index * 4 + c]; case TGSI_FILE_OUTPUT: return &pc->result[dst->DstRegister.Index * 4 + c]; + case TGSI_FILE_ADDRESS: + { + struct nv50_reg *r = pc->addr[dst->DstRegister.Index * 4 + c]; + if (!r) { + r = alloc_addr(pc, NULL); + pc->addr[dst->DstRegister.Index * 4 + c] = r; + } + assert(r); + return r; + } case TGSI_FILE_NULL: return NULL; default: @@ -1182,16 +1642,19 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, { struct nv50_reg *r = NULL; struct nv50_reg *temp; - unsigned sgn, c; + unsigned sgn, c, swz; + + if (src->SrcRegister.File != TGSI_FILE_CONSTANT) + assert(!src->SrcRegister.Indirect); sgn = tgsi_util_get_full_src_register_sign_mode(src, chan); - c = tgsi_util_get_full_src_register_extswizzle(src, chan); + c = tgsi_util_get_full_src_register_swizzle(src, chan); switch (c) 
{ - case TGSI_EXTSWIZZLE_X: - case TGSI_EXTSWIZZLE_Y: - case TGSI_EXTSWIZZLE_Z: - case TGSI_EXTSWIZZLE_W: + case TGSI_SWIZZLE_X: + case TGSI_SWIZZLE_Y: + case TGSI_SWIZZLE_Z: + case TGSI_SWIZZLE_W: switch (src->SrcRegister.File) { case TGSI_FILE_INPUT: r = &pc->attr[src->SrcRegister.Index * 4 + c]; @@ -1200,25 +1663,35 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, r = &pc->temp[src->SrcRegister.Index * 4 + c]; break; case TGSI_FILE_CONSTANT: - r = &pc->param[src->SrcRegister.Index * 4 + c]; + if (!src->SrcRegister.Indirect) { + r = &pc->param[src->SrcRegister.Index * 4 + c]; + break; + } + /* Indicate indirection by setting r->acc < 0 and + * use the index field to select the address reg. + */ + r = MALLOC_STRUCT(nv50_reg); + swz = tgsi_util_get_src_register_swizzle( + &src->SrcRegisterInd, 0); + ctor_reg(r, P_CONST, + src->SrcRegisterInd.Index * 4 + swz, + src->SrcRegister.Index * 4 + c); + r->acc = -1; break; case TGSI_FILE_IMMEDIATE: r = &pc->immd[src->SrcRegister.Index * 4 + c]; break; case TGSI_FILE_SAMPLER: break; + case TGSI_FILE_ADDRESS: + r = pc->addr[src->SrcRegister.Index * 4 + c]; + assert(r); + break; default: assert(0); break; } break; - case TGSI_EXTSWIZZLE_ZERO: - r = alloc_immd(pc, 0.0); - return r; - case TGSI_EXTSWIZZLE_ONE: - if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET) - return alloc_immd(pc, -1.0); - return alloc_immd(pc, 1.0); default: assert(0); break; @@ -1234,7 +1707,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, break; case TGSI_UTIL_SIGN_TOGGLE: if (neg) - r->neg = 1; + r->mod = NV50_MOD_NEG; else { temp = temp_temp(pc); emit_neg(pc, temp, r); @@ -1243,11 +1716,7 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, break; case TGSI_UTIL_SIGN_SET: temp = temp_temp(pc); - emit_abs(pc, temp, r); - if (neg) - temp->neg = 1; - else - emit_neg(pc, temp, temp); + emit_cvt(pc, temp, r, -1, CVTOP_ABS, CVT_F32_F32 | 
CVT_NEG); r = temp; break; default: @@ -1258,93 +1727,175 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src, return r; } -/* returns TRUE if instruction can overwrite sources before they're read */ +/* return TRUE for ops that produce only a single result */ static boolean -direct2dest_op(const struct tgsi_full_instruction *insn) +is_scalar_op(unsigned op) { - if (insn->Instruction.Saturate) - return FALSE; - - switch (insn->Instruction.Opcode) { + switch (op) { case TGSI_OPCODE_COS: + case TGSI_OPCODE_DP2: case TGSI_OPCODE_DP3: case TGSI_OPCODE_DP4: case TGSI_OPCODE_DPH: - case TGSI_OPCODE_KIL: - case TGSI_OPCODE_LIT: + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: case TGSI_OPCODE_POW: case TGSI_OPCODE_RCP: case TGSI_OPCODE_RSQ: - case TGSI_OPCODE_SCS: case TGSI_OPCODE_SIN: + /* + case TGSI_OPCODE_KIL: + case TGSI_OPCODE_LIT: + case TGSI_OPCODE_SCS: + */ + return TRUE; + default: + return FALSE; + } +} + +/* Returns a bitmask indicating which dst components depend + * on source s, component c (reverse of nv50_tgsi_src_mask). + */ +static unsigned +nv50_tgsi_dst_revdep(unsigned op, int s, int c) +{ + if (is_scalar_op(op)) + return 0x1; + + switch (op) { + case TGSI_OPCODE_DST: + return (1 << c) & (s ? 
0xa : 0x6); + case TGSI_OPCODE_XPD: + switch (c) { + case 0: return 0x6; + case 1: return 0x5; + case 2: return 0x3; + case 3: return 0x0; + default: + assert(0); + return 0x0; + } + case TGSI_OPCODE_LIT: + case TGSI_OPCODE_SCS: case TGSI_OPCODE_TEX: case TGSI_OPCODE_TXP: - return FALSE; + /* these take care of dangerous swizzles themselves */ + return 0x0; + case TGSI_OPCODE_IF: + case TGSI_OPCODE_KIL: + /* don't call this function for these ops */ + assert(0); + return 0; default: - return TRUE; + /* linear vector instruction */ + return (1 << c); } } +static INLINE boolean +has_pred(struct nv50_program_exec *e, unsigned cc) +{ + if (!is_long(e) || is_immd(e)) + return FALSE; + return ((e->inst[1] & 0x780) == (cc << 7)); +} + +/* on ENDIF see if we can do "@p0.neu single_op" instead of: + * join_at ENDIF + * @p0.eq bra ENDIF + * single_op + * ENDIF: nop.join + */ static boolean -nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) +nv50_kill_branch(struct nv50_pc *pc) { - const struct tgsi_full_instruction *inst = &tok->FullInstruction; - struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; + int lvl = pc->if_lvl; + + if (pc->if_insn[lvl]->next != pc->p->exec_tail) + return FALSE; + + /* if ccode == 'true', the BRA is from an ELSE and the predicate + * reg may no longer be valid, since we currently always use $p0 + */ + if (has_pred(pc->if_insn[lvl], 0xf)) + return FALSE; + assert(pc->if_insn[lvl] && pc->br_join[lvl]); + + /* We'll use the exec allocated for JOIN_AT (as we can't easily + * update prev's next); if exec_tail is BRK, update the pointer. 
+ */ + if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail) + pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl]; + + pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */ + + *pc->br_join[lvl] = *pc->p->exec_tail; + + FREE(pc->if_insn[lvl]); + FREE(pc->p->exec_tail); + + pc->p->exec_tail = pc->br_join[lvl]; + pc->p->exec_tail->next = NULL; + set_pred(pc, 0xd, 0, pc->p->exec_tail); + + return TRUE; +} + +static boolean +nv50_program_tx_insn(struct nv50_pc *pc, + const struct tgsi_full_instruction *inst) +{ + struct nv50_reg *rdst[4], *dst[4], *brdc, *src[3][4], *temp; unsigned mask, sat, unit; - boolean assimilate = FALSE; int i, c; mask = inst->FullDstRegisters[0].DstRegister.WriteMask; sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; + memset(src, 0, sizeof(src)); + for (c = 0; c < 4; c++) { - if (mask & (1 << c)) + if ((mask & (1 << c)) && !pc->r_dst[c]) dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); else - dst[c] = NULL; - rdst[c] = NULL; - src[0][c] = NULL; - src[1][c] = NULL; - src[2][c] = NULL; + dst[c] = pc->r_dst[c]; + rdst[c] = dst[c]; } for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { const struct tgsi_full_src_register *fs = &inst->FullSrcRegisters[i]; + unsigned src_mask; + boolean neg_supp; + + src_mask = nv50_tgsi_src_mask(inst, i); + neg_supp = negate_supported(inst, i); if (fs->SrcRegister.File == TGSI_FILE_SAMPLER) unit = fs->SrcRegister.Index; for (c = 0; c < 4; c++) - src[i][c] = tgsi_src(pc, c, fs, - negate_supported(inst, i)); + if (src_mask & (1 << c)) + src[i][c] = tgsi_src(pc, c, fs, neg_supp); } - if (sat) { - for (c = 0; c < 4; c++) { - rdst[c] = dst[c]; - dst[c] = temp_temp(pc); - } + brdc = temp = pc->r_brdc; + if (brdc && brdc->type != P_TEMP) { + temp = temp_temp(pc); + if (sat) + brdc = temp; } else - if (direct2dest_op(inst)) { + if (sat) { for (c = 0; c < 4; c++) { - if (!dst[c] || dst[c]->type != P_TEMP) + if (!(mask & (1 << c)) || dst[c]->type == P_TEMP) continue; - - for (i = c + 1; i < 4; i++) { - 
if (dst[c] == src[0][i] || - dst[c] == src[1][i] || - dst[c] == src[2][i]) - break; - } - if (i == 4) - continue; - - assimilate = TRUE; - rdst[c] = dst[c]; - dst[c] = alloc_temp(pc, NULL); + /* rdst[c] = dst[c]; */ /* done above */ + dst[c] = temp_temp(pc); } } + assert(brdc || !is_scalar_op(inst->Instruction.Opcode)); + switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ABS: for (c = 0; c < 4; c++) { @@ -1360,74 +1911,137 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) emit_add(pc, dst[c], src[0][c], src[1][c]); } break; - case TGSI_OPCODE_COS: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_OR: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_bitop2(pc, dst[c], src[0][c], src[1][c], + inst->Instruction.Opcode); + } + break; + case TGSI_OPCODE_ARL: + assert(src[0][0]); temp = temp_temp(pc); + emit_cvt(pc, temp, src[0][0], -1, CVTOP_FLOOR, CVT_S32_F32); + emit_arl(pc, dst[0], temp, 4); + break; + case TGSI_OPCODE_BGNLOOP: + pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size; + terminate_mbb(pc); + break; + case TGSI_OPCODE_BRK: + emit_branch(pc, -1, 0, NULL); + assert(pc->loop_lvl > 0); + pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail; + break; + case TGSI_OPCODE_CEIL: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_cvt(pc, dst[c], src[0][c], -1, + CVTOP_CEIL, CVT_F32_F32 | CVT_RI); + } + break; + case TGSI_OPCODE_CMP: + pc->allow32 = FALSE; + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_cvt(pc, NULL, src[0][c], 1, CVTOP_RN, CVT_F32_F32); + emit_mov(pc, dst[c], src[1][c]); + set_pred(pc, 0x1, 1, pc->p->exec_tail); /* @SF */ + emit_mov(pc, dst[c], src[2][c]); + set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */ + } + break; + case TGSI_OPCODE_COS: + if (mask & 8) { + emit_precossin(pc, temp, src[0][3]); + emit_flop(pc, 5, dst[3], temp); + if (!(mask &= 7)) + break; + if (temp == dst[3]) + temp = brdc = temp_temp(pc); + } emit_precossin(pc, 
temp, src[0][0]); - emit_flop(pc, 5, temp, temp); + emit_flop(pc, 5, brdc, temp); + break; + case TGSI_OPCODE_DDX: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_mov(pc, dst[c], temp); + emit_ddx(pc, dst[c], src[0][c]); } break; - case TGSI_OPCODE_DP3: - temp = temp_temp(pc); - emit_mul(pc, temp, src[0][0], src[1][0]); - emit_mad(pc, temp, src[0][1], src[1][1], temp); - emit_mad(pc, temp, src[0][2], src[1][2], temp); + case TGSI_OPCODE_DDY: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_mov(pc, dst[c], temp); + emit_ddy(pc, dst[c], src[0][c]); } break; + case TGSI_OPCODE_DP3: + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, brdc, src[0][2], src[1][2], temp); + break; case TGSI_OPCODE_DP4: - temp = temp_temp(pc); emit_mul(pc, temp, src[0][0], src[1][0]); emit_mad(pc, temp, src[0][1], src[1][1], temp); emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_mad(pc, temp, src[0][3], src[1][3], temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_mad(pc, brdc, src[0][3], src[1][3], temp); break; case TGSI_OPCODE_DPH: - temp = temp_temp(pc); emit_mul(pc, temp, src[0][0], src[1][0]); emit_mad(pc, temp, src[0][1], src[1][1], temp); emit_mad(pc, temp, src[0][2], src[1][2], temp); - emit_add(pc, temp, src[1][3], temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_add(pc, brdc, src[1][3], temp); break; case TGSI_OPCODE_DST: - { - struct nv50_reg *one = alloc_immd(pc, 1.0); - if (mask & (1 << 0)) - emit_mov(pc, dst[0], one); if (mask & (1 << 1)) emit_mul(pc, dst[1], src[0][1], src[1][1]); if (mask & (1 << 2)) emit_mov(pc, dst[2], src[0][2]); if (mask & (1 << 3)) emit_mov(pc, dst[3], src[1][3]); - FREE(one); - } + if (mask & (1 << 0)) + emit_mov_immdval(pc, dst[0], 1.0f); + break; + case TGSI_OPCODE_ELSE: + emit_branch(pc, -1, 0, NULL); + 
pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; + pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; + terminate_mbb(pc); + break; + case TGSI_OPCODE_ENDIF: + pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size; + + /* try to replace branch over 1 insn with a predicated insn */ + if (nv50_kill_branch(pc) == TRUE) + break; + + if (pc->br_join[pc->if_lvl]) { + pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size; + pc->br_join[pc->if_lvl] = NULL; + } + terminate_mbb(pc); + /* emit a NOP as join point, we could set it on the next + * one, but would have to make sure it is long and !immd + */ + emit_nop(pc); + pc->p->exec_tail->inst[1] |= 2; + break; + case TGSI_OPCODE_ENDLOOP: + emit_branch(pc, -1, 0, NULL); + pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl]; + pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size; + terminate_mbb(pc); break; case TGSI_OPCODE_EX2: - temp = temp_temp(pc); emit_preex2(pc, temp, src[0][0]); - emit_flop(pc, 6, temp, temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_flop(pc, 6, brdc, temp); break; case TGSI_OPCODE_FLR: for (c = 0; c < 4; c++) { @@ -1445,24 +2059,27 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) emit_sub(pc, dst[c], src[0][c], temp); } break; + case TGSI_OPCODE_IF: + /* emitting a join_at may not be necessary */ + assert(pc->if_lvl < MAX_IF_DEPTH); + /* set_pred_wr(pc, 1, 0, pc->if_cond); */ + emit_cvt(pc, NULL, src[0][0], 0, CVTOP_ABS | CVTOP_RN, + CVT_F32_F32); + emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]); + pc->if_insn[pc->if_lvl++] = pc->p->exec_tail; + terminate_mbb(pc); + break; case TGSI_OPCODE_KIL: emit_kil(pc, src[0][0]); emit_kil(pc, src[0][1]); emit_kil(pc, src[0][2]); emit_kil(pc, src[0][3]); - pc->p->cfg.fp.regs[2] |= 0x00100000; break; case TGSI_OPCODE_LIT: emit_lit(pc, &dst[0], mask, &src[0][0]); break; case TGSI_OPCODE_LG2: - temp = temp_temp(pc); - emit_flop(pc, 3, temp, 
src[0][0]); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_flop(pc, 3, brdc, src[0][0]); break; case TGSI_OPCODE_LRP: temp = temp_temp(pc); @@ -1495,7 +2112,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) } break; case TGSI_OPCODE_MOV: - case TGSI_OPCODE_SWZ: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; @@ -1510,31 +2126,18 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) } break; case TGSI_OPCODE_POW: - temp = temp_temp(pc); - emit_pow(pc, temp, src[0][0], src[1][0]); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); - } + emit_pow(pc, brdc, src[0][0], src[1][0]); break; case TGSI_OPCODE_RCP: - for (c = 3; c >= 0; c--) { - if (!(mask & (1 << c))) - continue; - emit_flop(pc, 0, dst[c], src[0][0]); - } + emit_flop(pc, 0, brdc, src[0][0]); break; case TGSI_OPCODE_RSQ: - for (c = 3; c >= 0; c--) { - if (!(mask & (1 << c))) - continue; - emit_flop(pc, 2, dst[c], src[0][0]); - } + emit_flop(pc, 2, brdc, src[0][0]); break; case TGSI_OPCODE_SCS: temp = temp_temp(pc); - emit_precossin(pc, temp, src[0][0]); + if (mask & 3) + emit_precossin(pc, temp, src[0][0]); if (mask & (1 << 0)) emit_flop(pc, 5, dst[0], temp); if (mask & (1 << 1)) @@ -1544,28 +2147,29 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) if (mask & (1 << 3)) emit_mov_immdval(pc, dst[3], 1.0); break; - case TGSI_OPCODE_SGE: - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_set(pc, 6, dst[c], src[0][c], src[1][c]); - } - break; case TGSI_OPCODE_SIN: - temp = temp_temp(pc); - emit_precossin(pc, temp, src[0][0]); - emit_flop(pc, 4, temp, temp); - for (c = 0; c < 4; c++) { - if (!(mask & (1 << c))) - continue; - emit_mov(pc, dst[c], temp); + if (mask & 8) { + emit_precossin(pc, temp, src[0][3]); + emit_flop(pc, 4, dst[3], temp); + if (!(mask &= 7)) + break; + if (temp == dst[3]) + 
temp = brdc = temp_temp(pc); } + emit_precossin(pc, temp, src[0][0]); + emit_flop(pc, 4, brdc, temp); break; case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + i = map_tgsi_setop_cc(inst->Instruction.Opcode); for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_set(pc, 1, dst[c], src[0][c], src[1][c]); + emit_set(pc, i, dst[c], -1, src[0][c], src[1][c]); } break; case TGSI_OPCODE_SUB: @@ -1583,6 +2187,14 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) emit_tex(pc, dst, mask, src[0], unit, inst->InstructionExtTexture.Texture, TRUE); break; + case TGSI_OPCODE_TRUNC: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_cvt(pc, dst[c], src[0][c], -1, + CVTOP_TRUNC, CVT_F32_F32 | CVT_RI); + } + break; case TGSI_OPCODE_XPD: temp = temp_temp(pc); if (mask & (1 << 0)) { @@ -1607,28 +2219,36 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) return FALSE; } + if (brdc) { + if (sat) + emit_sat(pc, brdc, brdc); + for (c = 0; c < 4; c++) + if ((mask & (1 << c)) && dst[c] != brdc) + emit_mov(pc, dst[c], brdc); + } else if (sat) { for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_cvt(pc, rdst[c], dst[c], -1, CVTOP_SAT, - CVT_F32_F32); + /* In this case we saturate later, and dst[c] won't + * be another temp_temp (and thus lost), since rdst + * already is TEMP (see above). 
*/ + if (rdst[c]->type == P_TEMP && rdst[c]->index < 0) + continue; + emit_sat(pc, rdst[c], dst[c]); } - } else if (assimilate) { - for (c = 0; c < 4; c++) - if (rdst[c]) - assimilate_temp(pc, rdst[c], dst[c]); } for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { for (c = 0; c < 4; c++) { if (!src[i][c]) continue; + src[i][c]->mod = 0; if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) FREE(src[i][c]); else - if (src[i][c]->acc == pc->insn_cur) - release_hw(pc, src[i][c]); + if (src[i][c]->acc < 0 && src[i][c]->type == P_CONST) + FREE(src[i][c]); /* indirect constant */ } } @@ -1636,180 +2256,284 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) return TRUE; } -/* Adjust a bitmask that indicates what components of a source are used, - * we use this in tx_prep so we only load interpolants that are needed. - */ -static void -insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask) -{ - const struct tgsi_instruction_ext_texture *tex; - - switch (insn->Instruction.Opcode) { - case TGSI_OPCODE_DP3: - *mask = 0x7; - break; - case TGSI_OPCODE_DP4: - case TGSI_OPCODE_DPH: - *mask = 0xF; - break; - case TGSI_OPCODE_LIT: - *mask = 0xB; - break; - case TGSI_OPCODE_RCP: - case TGSI_OPCODE_RSQ: - *mask = 0x1; - break; - case TGSI_OPCODE_TEX: - case TGSI_OPCODE_TXP: - assert(insn->Instruction.Extended); - tex = &insn->InstructionExtTexture; - - *mask = 0x7; - if (tex->Texture == TGSI_TEXTURE_1D) - *mask = 0x1; - else - if (tex->Texture == TGSI_TEXTURE_2D) - *mask = 0x3; - - if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) - *mask |= 0x8; - break; - default: - break; - } -} - static void -prep_inspect_insn(struct nv50_pc *pc, const union tgsi_full_token *tok, - unsigned *r_usage[2]) +prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn) { - const struct tgsi_full_instruction *insn; + struct nv50_reg *reg = NULL; const struct tgsi_full_src_register *src; const struct tgsi_dst_register *dst; + unsigned 
i, c, k, mask; - unsigned i, c, k, n, mask, *acc_p; - - insn = &tok->FullInstruction; dst = &insn->FullDstRegisters[0].DstRegister; mask = dst->WriteMask; - if (!r_usage[0]) - r_usage[0] = CALLOC(pc->temp_nr * 4, sizeof(unsigned)); - if (!r_usage[1]) - r_usage[1] = CALLOC(pc->attr_nr * 4, sizeof(unsigned)); + if (dst->File == TGSI_FILE_TEMPORARY) + reg = pc->temp; + else + if (dst->File == TGSI_FILE_OUTPUT) + reg = pc->result; - if (dst->File == TGSI_FILE_TEMPORARY) { + if (reg) { for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - r_usage[0][dst->Index * 4 + c] = pc->insn_nr; + reg[dst->Index * 4 + c].acc = pc->insn_nr; } } for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { src = &insn->FullSrcRegisters[i]; - switch (src->SrcRegister.File) { - case TGSI_FILE_TEMPORARY: - acc_p = r_usage[0]; - break; - case TGSI_FILE_INPUT: - acc_p = r_usage[1]; - break; - default: + if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) + reg = pc->temp; + else + if (src->SrcRegister.File == TGSI_FILE_INPUT) + reg = pc->attr; + else continue; - } - insn_adjust_mask(insn, &mask); + mask = nv50_tgsi_src_mask(insn, i); for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; + k = tgsi_util_get_full_src_register_swizzle(src, c); - k = tgsi_util_get_full_src_register_extswizzle(src, c); - switch (k) { - case TGSI_EXTSWIZZLE_X: - case TGSI_EXTSWIZZLE_Y: - case TGSI_EXTSWIZZLE_Z: - case TGSI_EXTSWIZZLE_W: - n = src->SrcRegister.Index * 4 + k; - acc_p[n] = pc->insn_nr; - break; - default: - break; - } + reg[src->SrcRegister.Index * 4 + k].acc = pc->insn_nr; } } } +/* Returns a bitmask indicating which dst components need to be + * written to temporaries first to avoid 'corrupting' sources. 
+ * + * m[i] (out) indicate component to write in the i-th position + * rdep[c] (in) bitmasks of dst[i] that require dst[c] as source + */ +static unsigned +nv50_revdep_reorder(unsigned m[4], unsigned rdep[4]) +{ + unsigned i, c, x, unsafe; + + for (c = 0; c < 4; c++) + m[c] = c; + + /* Swap as long as a dst component written earlier is depended on + * by one written later, but the next one isn't depended on by it. + */ + for (c = 0; c < 3; c++) { + if (rdep[m[c + 1]] & (1 << m[c])) + continue; /* if next one is depended on by us */ + for (i = c + 1; i < 4; i++) + /* if we are depended on by a later one */ + if (rdep[m[c]] & (1 << m[i])) + break; + if (i == 4) + continue; + /* now, swap */ + x = m[c]; + m[c] = m[c + 1]; + m[c + 1] = x; + + /* restart */ + c = 0; + } + + /* mark dependencies that could not be resolved by reordering */ + for (i = 0; i < 3; ++i) + for (c = i + 1; c < 4; ++c) + if (rdep[m[i]] & (1 << m[c])) + unsafe |= (1 << i); + + /* NOTE: $unsafe is with respect to order, not component */ + return unsafe; +} + +/* Select a suitable dst register for broadcasting scalar results, + * or return NULL if we have to allocate an extra TEMP. + * + * If e.g. only 1 component is written, we may also emit the final + * result to a write-only register. + */ +static struct nv50_reg * +tgsi_broadcast_dst(struct nv50_pc *pc, + const struct tgsi_full_dst_register *fd, unsigned mask) +{ + if (fd->DstRegister.File == TGSI_FILE_TEMPORARY) { + int c = ffs(~mask & fd->DstRegister.WriteMask); + if (c) + return tgsi_dst(pc, c - 1, fd); + } else { + int c = ffs(fd->DstRegister.WriteMask) - 1; + if ((1 << c) == fd->DstRegister.WriteMask) + return tgsi_dst(pc, c, fd); + } + + return NULL; +} + +/* Scan source swizzles and return a bitmask indicating dst regs that + * also occur among the src regs, and fill rdep for nv50_revdep_reoder. 
+ */ static unsigned -load_fp_attrib(struct nv50_pc *pc, int i, unsigned *acc, int *mid, - int *aid, int *p_oid) +nv50_tgsi_scan_swizzle(const struct tgsi_full_instruction *insn, + unsigned rdep[4]) { - struct nv50_reg *iv; - int oid, c, n; - unsigned mask = 0; + const struct tgsi_full_dst_register *fd = &insn->FullDstRegisters[0]; + const struct tgsi_full_src_register *fs; + unsigned i, deqs = 0; - iv = (pc->interp_mode[i] & INTERP_CENTROID) ? pc->iv_c : pc->iv_p; + for (i = 0; i < 4; ++i) + rdep[i] = 0; - for (c = 0, n = i * 4; c < 4; c++, n++) { - oid = (*p_oid)++; - pc->attr[n].type = P_TEMP; - pc->attr[n].index = i; + for (i = 0; i < insn->Instruction.NumSrcRegs; i++) { + unsigned chn, mask = nv50_tgsi_src_mask(insn, i); + boolean neg_supp = negate_supported(insn, i); - if (pc->attr[n].acc == acc[n]) + fs = &insn->FullSrcRegisters[i]; + if (fs->SrcRegister.File != fd->DstRegister.File || + fs->SrcRegister.Index != fd->DstRegister.Index) continue; - mask |= (1 << c); - pc->attr[n].acc = acc[n]; - pc->attr[n].rhw = pc->attr[n].hw = -1; - alloc_reg(pc, &pc->attr[n]); + for (chn = 0; chn < 4; ++chn) { + unsigned s, c; + + if (!(mask & (1 << chn))) /* src is not read */ + continue; + c = tgsi_util_get_full_src_register_swizzle(fs, chn); + s = tgsi_util_get_full_src_register_sign_mode(fs, chn); + + if (!(fd->DstRegister.WriteMask & (1 << c))) + continue; - pc->attr[n].rhw = (*aid)++; - emit_interp(pc, &pc->attr[n], iv, pc->interp_mode[i]); + /* no danger if src is copied to TEMP first */ + if ((s != TGSI_UTIL_SIGN_KEEP) && + (s != TGSI_UTIL_SIGN_TOGGLE || !neg_supp)) + continue; - pc->p->cfg.fp.map[(*mid) / 4] |= oid << (8 * ((*mid) % 4)); - (*mid)++; - pc->p->cfg.fp.regs[1] += 0x00010001; + rdep[c] |= nv50_tgsi_dst_revdep( + insn->Instruction.Opcode, i, chn); + deqs |= (1 << c); + } } - return mask; + return deqs; } static boolean -nv50_program_tx_prep(struct nv50_pc *pc) +nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) { - struct 
tgsi_parse_context p; - boolean ret = FALSE; - unsigned i, c; - unsigned fcol, bcol, fcrd, depr; + struct tgsi_full_instruction insn = tok->FullInstruction; + const struct tgsi_full_dst_register *fd; + unsigned i, deqs, rdep[4], m[4]; + + fd = &tok->FullInstruction.FullDstRegisters[0]; + deqs = nv50_tgsi_scan_swizzle(&insn, rdep); - /* count (centroid) perspective interpolations */ - unsigned centroid_loads = 0; - unsigned perspect_loads = 0; + if (is_scalar_op(insn.Instruction.Opcode)) { + pc->r_brdc = tgsi_broadcast_dst(pc, fd, deqs); + if (!pc->r_brdc) + pc->r_brdc = temp_temp(pc); + return nv50_program_tx_insn(pc, &insn); + } + pc->r_brdc = NULL; - /* track register access for temps and attrs */ - unsigned *r_usage[2]; - r_usage[0] = NULL; - r_usage[1] = NULL; + if (!deqs) + return nv50_program_tx_insn(pc, &insn); - depr = fcol = bcol = fcrd = 0xffff; + deqs = nv50_revdep_reorder(m, rdep); - if (pc->p->type == PIPE_SHADER_FRAGMENT) { - pc->p->cfg.fp.regs[0] = 0x01000404; - pc->p->cfg.fp.regs[1] = 0x00000400; + for (i = 0; i < 4; ++i) { + assert(pc->r_dst[m[i]] == NULL); + + insn.FullDstRegisters[0].DstRegister.WriteMask = + fd->DstRegister.WriteMask & (1 << m[i]); + + if (!insn.FullDstRegisters[0].DstRegister.WriteMask) + continue; + + if (deqs & (1 << i)) + pc->r_dst[m[i]] = alloc_temp(pc, NULL); + + if (!nv50_program_tx_insn(pc, &insn)) + return FALSE; } - tgsi_parse_init(&p, pc->p->pipe.tokens); - while (!tgsi_parse_end_of_tokens(&p)) { - const union tgsi_full_token *tok = &p.FullToken; + for (i = 0; i < 4; i++) { + struct nv50_reg *reg = pc->r_dst[i]; + if (!reg) + continue; + pc->r_dst[i] = NULL; + + if (insn.Instruction.Saturate == TGSI_SAT_ZERO_ONE) + emit_sat(pc, tgsi_dst(pc, i, fd), reg); + else + emit_mov(pc, tgsi_dst(pc, i, fd), reg); + free_temp(pc, reg); + } + + return TRUE; +} + +static void +load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg) +{ + struct nv50_reg *iv, **ppiv; + unsigned mode = pc->interp_mode[reg->index]; + + ppiv = (mode 
& INTERP_CENTROID) ? &pc->iv_c : &pc->iv_p; + iv = *ppiv; + + if ((mode & INTERP_PERSPECTIVE) && !iv) { + iv = *ppiv = alloc_temp(pc, NULL); + iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1; + + emit_interp(pc, iv, NULL, mode & INTERP_CENTROID); + emit_flop(pc, 0, iv, iv); + + /* XXX: when loading interpolants dynamically, move these + * to the program head, or make sure it can't be skipped. + */ + } + + emit_interp(pc, reg, iv, mode); +} + +/* The face input is always at v[255] (varying space), with a + * value of 0 for back-facing, and 0xffffffff for front-facing. + */ +static void +load_frontfacing(struct nv50_pc *pc, struct nv50_reg *a) +{ + struct nv50_reg *one = alloc_immd(pc, 1.0f); + + assert(a->rhw == -1); + alloc_reg(pc, a); /* do this before rhw is set */ + a->rhw = 255; + load_interpolant(pc, a); + emit_bitop2(pc, a, a, one, TGSI_OPCODE_AND); + + FREE(one); +} + +static boolean +nv50_program_tx_prep(struct nv50_pc *pc) +{ + struct tgsi_parse_context tp; + struct nv50_program *p = pc->p; + boolean ret = FALSE; + unsigned i, c, flat_nr = 0; - tgsi_parse_token(&p); + tgsi_parse_init(&tp, pc->p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&tp)) { + const union tgsi_full_token *tok = &tp.FullToken; + + tgsi_parse_token(&tp); switch (tok->Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: { const struct tgsi_full_immediate *imm = - &p.FullToken.FullImmediate; + &tp.FullToken.FullImmediate; ctor_immd(pc, imm->u[0].Float, imm->u[1].Float, @@ -1820,88 +2544,69 @@ nv50_program_tx_prep(struct nv50_pc *pc) case TGSI_TOKEN_TYPE_DECLARATION: { const struct tgsi_full_declaration *d; - unsigned last, first, mode; + unsigned si, last, first, mode; - d = &p.FullToken.FullDeclaration; + d = &tp.FullToken.FullDeclaration; first = d->DeclarationRange.First; last = d->DeclarationRange.Last; switch (d->Declaration.File) { case TGSI_FILE_TEMPORARY: - if (pc->temp_nr < (last + 1)) - pc->temp_nr = last + 1; break; case TGSI_FILE_OUTPUT: - if (pc->result_nr < (last + 1)) - 
pc->result_nr = last + 1; - - if (!d->Declaration.Semantic) + if (!d->Declaration.Semantic || + p->type == PIPE_SHADER_FRAGMENT) break; + si = d->Semantic.SemanticIndex; switch (d->Semantic.SemanticName) { - case TGSI_SEMANTIC_POSITION: - depr = first; - pc->p->cfg.fp.regs[2] |= 0x00000100; - pc->p->cfg.fp.regs[3] |= 0x00000011; + case TGSI_SEMANTIC_BCOLOR: + p->cfg.two_side[si].hw = first; + if (p->cfg.io_nr > first) + p->cfg.io_nr = first; + break; + case TGSI_SEMANTIC_PSIZE: + p->cfg.psiz = first; + if (p->cfg.io_nr > first) + p->cfg.io_nr = first; break; + /* + case TGSI_SEMANTIC_CLIP_DISTANCE: + p->cfg.clpd = MIN2(p->cfg.clpd, first); + break; + */ default: break; } - break; case TGSI_FILE_INPUT: { - if (pc->attr_nr < (last + 1)) - pc->attr_nr = last + 1; - - if (pc->p->type != PIPE_SHADER_FRAGMENT) + if (p->type != PIPE_SHADER_FRAGMENT) break; switch (d->Declaration.Interpolate) { case TGSI_INTERPOLATE_CONSTANT: mode = INTERP_FLAT; + flat_nr++; break; case TGSI_INTERPOLATE_PERSPECTIVE: mode = INTERP_PERSPECTIVE; + p->cfg.regs[1] |= 0x08 << 24; break; default: mode = INTERP_LINEAR; break; } - - if (d->Declaration.Semantic) { - switch (d->Semantic.SemanticName) { - case TGSI_SEMANTIC_POSITION: - fcrd = first; - break; - case TGSI_SEMANTIC_COLOR: - fcol = first; - mode = INTERP_PERSPECTIVE; - break; - case TGSI_SEMANTIC_BCOLOR: - bcol = first; - mode = INTERP_PERSPECTIVE; - break; - } - } - - if (d->Declaration.Centroid) { + if (d->Declaration.Centroid) mode |= INTERP_CENTROID; - if (mode & INTERP_PERSPECTIVE) - centroid_loads++; - } else - if (mode & INTERP_PERSPECTIVE) - perspect_loads++; assert(last < 32); for (i = first; i <= last; i++) pc->interp_mode[i] = mode; } break; + case TGSI_FILE_ADDRESS: case TGSI_FILE_CONSTANT: - if (pc->param_nr < (last + 1)) - pc->param_nr = last + 1; - break; case TGSI_FILE_SAMPLER: break; default: @@ -1913,182 +2618,169 @@ nv50_program_tx_prep(struct nv50_pc *pc) break; case TGSI_TOKEN_TYPE_INSTRUCTION: pc->insn_nr++; - 
prep_inspect_insn(pc, tok, r_usage); + prep_inspect_insn(pc, &tok->FullInstruction); break; default: break; } } - if (pc->temp_nr) { - pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); - if (!pc->temp) - goto out_err; + if (p->type == PIPE_SHADER_VERTEX) { + int rid = 0; - for (i = 0; i < pc->temp_nr; i++) { - for (c = 0; c < 4; c++) { - pc->temp[i*4+c].type = P_TEMP; - pc->temp[i*4+c].hw = -1; - pc->temp[i*4+c].rhw = -1; - pc->temp[i*4+c].index = i; - pc->temp[i*4+c].acc = r_usage[0][i*4+c]; + for (i = 0; i < pc->attr_nr * 4; ++i) { + if (pc->attr[i].acc) { + pc->attr[i].hw = rid++; + p->cfg.attr[i / 32] |= 1 << (i % 32); } } - } - if (pc->attr_nr) { - int oid = 4, mid = 4, aid = 0; - /* oid = VP output id - * aid = FP attribute/interpolant id - * mid = VP output mapping field ID - */ + for (i = 0, rid = 0; i < pc->result_nr; ++i) { + p->cfg.io[i].hw = rid; + p->cfg.io[i].id_vp = i; - pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); - if (!pc->attr) - goto out_err; - - if (pc->p->type == PIPE_SHADER_FRAGMENT) { - /* position should be loaded first */ - if (fcrd != 0xffff) { - unsigned mask; - mid = 0; - mask = load_fp_attrib(pc, fcrd, r_usage[1], - &mid, &aid, &oid); - oid = 0; - pc->p->cfg.fp.regs[1] |= (mask << 24); - pc->p->cfg.fp.map[0] = 0x04040404 * fcrd; - } - pc->p->cfg.fp.map[0] += 0x03020100; - - /* should do MAD fcrd.xy, fcrd, SOME_CONST, fcrd */ - - if (perspect_loads) { - pc->iv_p = alloc_temp(pc, NULL); - - if (!(pc->p->cfg.fp.regs[1] & 0x08000000)) { - pc->p->cfg.fp.regs[1] |= 0x08000000; - pc->iv_p->rhw = aid++; - emit_interp(pc, pc->iv_p, NULL, - INTERP_LINEAR); - emit_flop(pc, 0, pc->iv_p, pc->iv_p); - } else { - pc->iv_p->rhw = aid - 1; - emit_flop(pc, 0, pc->iv_p, - &pc->attr[fcrd * 4 + 3]); - } + for (c = 0; c < 4; ++c) { + int n = i * 4 + c; + if (!pc->result[n].acc) + continue; + pc->result[n].hw = rid++; + p->cfg.io[i].mask |= 1 << c; } + } - if (centroid_loads) { - pc->iv_c = alloc_temp(pc, NULL); - pc->iv_c->rhw = 
pc->iv_p ? aid - 1 : aid++; - emit_interp(pc, pc->iv_c, NULL, - INTERP_CENTROID); - emit_flop(pc, 0, pc->iv_c, pc->iv_c); - pc->p->cfg.fp.regs[1] |= 0x08000000; - } + for (c = 0; c < 2; ++c) + if (p->cfg.two_side[c].hw < 0x40) + p->cfg.two_side[c] = p->cfg.io[ + p->cfg.two_side[c].hw]; - for (c = 0; c < 4; c++) { - /* I don't know what these values do, but - * let's set them like the blob does: - */ - if (fcol != 0xffff && r_usage[1][fcol * 4 + c]) - pc->p->cfg.fp.regs[0] += 0x00010000; - if (bcol != 0xffff && r_usage[1][bcol * 4 + c]) - pc->p->cfg.fp.regs[0] += 0x00010000; - } + if (p->cfg.psiz < 0x40) + p->cfg.psiz = p->cfg.io[p->cfg.psiz].hw; + } else + if (p->type == PIPE_SHADER_FRAGMENT) { + int rid, aid; + unsigned n = 0, m = pc->attr_nr - flat_nr; - for (i = 0; i < pc->attr_nr; i++) - load_fp_attrib(pc, i, r_usage[1], - &mid, &aid, &oid); + pc->allow32 = TRUE; - if (pc->iv_p) - free_temp(pc, pc->iv_p); - if (pc->iv_c) - free_temp(pc, pc->iv_c); + int base = (TGSI_SEMANTIC_POSITION == + p->info.input_semantic_name[0]) ? 0 : 1; - pc->p->cfg.fp.high_map = (mid / 4); - pc->p->cfg.fp.high_map += ((mid % 4) ? 
1 : 0); - } else { - /* vertex program */ - for (i = 0; i < pc->attr_nr * 4; i++) { - pc->p->cfg.vp.attr[aid / 32] |= - (1 << (aid % 32)); - pc->attr[i].type = P_ATTR; - pc->attr[i].hw = aid++; - pc->attr[i].index = i / 4; + /* non-flat interpolants have to be mapped to + * the lower hardware IDs, so sort them: + */ + for (i = 0; i < pc->attr_nr; i++) { + if (pc->interp_mode[i] == INTERP_FLAT) { + p->cfg.io[m].id_vp = i + base; + p->cfg.io[m++].id_fp = i; + } else { + if (!(pc->interp_mode[i] & INTERP_PERSPECTIVE)) + p->cfg.io[n].linear = TRUE; + p->cfg.io[n].id_vp = i + base; + p->cfg.io[n++].id_fp = i; } } - } - if (pc->result_nr) { - int rid = 0; + if (!base) /* set w-coordinate mask from perspective interp */ + p->cfg.io[0].mask |= p->cfg.regs[1] >> 24; - pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); - if (!pc->result) - goto out_err; + aid = popcnt4( /* if fcrd isn't contained in cfg.io */ + base ? (p->cfg.regs[1] >> 24) : p->cfg.io[0].mask); - for (i = 0; i < pc->result_nr; i++) { - for (c = 0; c < 4; c++) { - if (pc->p->type == PIPE_SHADER_FRAGMENT) { - pc->result[i*4+c].type = P_TEMP; - pc->result[i*4+c].hw = -1; - pc->result[i*4+c].rhw = (i == depr) ? 
- -1 : rid++; - } else { - pc->result[i*4+c].type = P_RESULT; - pc->result[i*4+c].hw = rid++; - } - pc->result[i*4+c].index = i; + for (n = 0; n < pc->attr_nr; ++n) { + p->cfg.io[n].hw = rid = aid; + i = p->cfg.io[n].id_fp; + + if (p->info.input_semantic_name[n] == + TGSI_SEMANTIC_FACE) { + load_frontfacing(pc, &pc->attr[i * 4]); + continue; } - if (pc->p->type == PIPE_SHADER_FRAGMENT && - depr != 0xffff) { - pc->result[depr * 4 + 2].rhw = - (pc->result_nr - 1) * 4; + for (c = 0; c < 4; ++c) { + if (!pc->attr[i * 4 + c].acc) + continue; + pc->attr[i * 4 + c].rhw = rid++; + p->cfg.io[n].mask |= 1 << c; + + load_interpolant(pc, &pc->attr[i * 4 + c]); } + aid += popcnt4(p->cfg.io[n].mask); } - } - if (pc->param_nr) { - int rid = 0; + if (!base) + p->cfg.regs[1] |= p->cfg.io[0].mask << 24; - pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); - if (!pc->param) - goto out_err; + m = popcnt4(p->cfg.regs[1] >> 24); - for (i = 0; i < pc->param_nr; i++) { - for (c = 0; c < 4; c++) { - pc->param[i*4+c].type = P_CONST; - pc->param[i*4+c].hw = rid++; - pc->param[i*4+c].index = i; + /* set count of non-position inputs and of non-flat + * non-position inputs for FP_INTERPOLANT_CTRL + */ + p->cfg.regs[1] |= aid - m; + + if (flat_nr) { + i = p->cfg.io[pc->attr_nr - flat_nr].hw; + p->cfg.regs[1] |= (i - m) << 16; + } else + p->cfg.regs[1] |= p->cfg.regs[1] << 16; + + /* mark color semantic for light-twoside */ + n = 0x40; + for (i = 0; i < pc->attr_nr; i++) { + ubyte si, sn; + + sn = p->info.input_semantic_name[p->cfg.io[i].id_fp]; + si = p->info.input_semantic_index[p->cfg.io[i].id_fp]; + + if (sn == TGSI_SEMANTIC_COLOR) { + p->cfg.two_side[si] = p->cfg.io[i]; + + /* increase colour count */ + p->cfg.regs[0] += popcnt4( + p->cfg.two_side[si].mask) << 16; + + n = MIN2(n, p->cfg.io[i].hw - m); } } + if (n < 0x40) + p->cfg.regs[0] += n; + + /* Initialize FP results: + * FragDepth is always first TGSI and last hw output + */ + i = p->info.writes_z ? 
4 : 0; + for (rid = 0; i < pc->result_nr * 4; i++) + pc->result[i].rhw = rid++; + if (p->info.writes_z) + pc->result[2].rhw = rid; + + p->cfg.high_result = rid; + + /* separate/different colour results for MRTs ? */ + if (pc->result_nr - (p->info.writes_z ? 1 : 0) > 1) + p->cfg.regs[2] |= 1; } if (pc->immd_nr) { int rid = 0; - pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); + pc->immd = MALLOC(pc->immd_nr * 4 * sizeof(struct nv50_reg)); if (!pc->immd) goto out_err; for (i = 0; i < pc->immd_nr; i++) { - for (c = 0; c < 4; c++) { - pc->immd[i*4+c].type = P_IMMD; - pc->immd[i*4+c].hw = rid++; - pc->immd[i*4+c].index = i; - } + for (c = 0; c < 4; c++, rid++) + ctor_reg(&pc->immd[rid], P_IMMD, i, rid); } } ret = TRUE; out_err: - if (r_usage[0]) - FREE(r_usage[0]); - if (r_usage[1]) - FREE(r_usage[1]); + if (pc->iv_p) + free_temp(pc, pc->iv_p); + if (pc->iv_c) + free_temp(pc, pc->iv_c); - tgsi_parse_free(&p); + tgsi_parse_free(&tp); return ret; } @@ -2110,18 +2802,175 @@ free_nv50_pc(struct nv50_pc *pc) } static boolean +ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p) +{ + int i, c; + unsigned rtype[2] = { P_ATTR, P_RESULT }; + + pc->p = p; + pc->temp_nr = p->info.file_max[TGSI_FILE_TEMPORARY] + 1; + pc->attr_nr = p->info.file_max[TGSI_FILE_INPUT] + 1; + pc->result_nr = p->info.file_max[TGSI_FILE_OUTPUT] + 1; + pc->param_nr = p->info.file_max[TGSI_FILE_CONSTANT] + 1; + pc->addr_nr = p->info.file_max[TGSI_FILE_ADDRESS] + 1; + assert(pc->addr_nr <= 2); + + p->cfg.high_temp = 4; + + p->cfg.two_side[0].hw = 0x40; + p->cfg.two_side[1].hw = 0x40; + + switch (p->type) { + case PIPE_SHADER_VERTEX: + p->cfg.psiz = 0x40; + p->cfg.clpd = 0x40; + p->cfg.io_nr = pc->result_nr; + break; + case PIPE_SHADER_FRAGMENT: + rtype[0] = rtype[1] = P_TEMP; + + p->cfg.regs[0] = 0x01000004; + p->cfg.io_nr = pc->attr_nr; + + if (p->info.writes_z) { + p->cfg.regs[2] |= 0x00000100; + p->cfg.regs[3] |= 0x00000011; + } + if (p->info.uses_kill) + p->cfg.regs[2] |= 0x00100000; + 
break; + } + + if (pc->temp_nr) { + pc->temp = MALLOC(pc->temp_nr * 4 * sizeof(struct nv50_reg)); + if (!pc->temp) + return FALSE; + + for (i = 0; i < pc->temp_nr * 4; ++i) + ctor_reg(&pc->temp[i], P_TEMP, i / 4, -1); + } + + if (pc->attr_nr) { + pc->attr = MALLOC(pc->attr_nr * 4 * sizeof(struct nv50_reg)); + if (!pc->attr) + return FALSE; + + for (i = 0; i < pc->attr_nr * 4; ++i) + ctor_reg(&pc->attr[i], rtype[0], i / 4, -1); + } + + if (pc->result_nr) { + unsigned nr = pc->result_nr * 4; + + pc->result = MALLOC(nr * sizeof(struct nv50_reg)); + if (!pc->result) + return FALSE; + + for (i = 0; i < nr; ++i) + ctor_reg(&pc->result[i], rtype[1], i / 4, -1); + } + + if (pc->param_nr) { + int rid = 0; + + pc->param = MALLOC(pc->param_nr * 4 * sizeof(struct nv50_reg)); + if (!pc->param) + return FALSE; + + for (i = 0; i < pc->param_nr; ++i) + for (c = 0; c < 4; ++c, ++rid) + ctor_reg(&pc->param[rid], P_CONST, i, rid); + } + + if (pc->addr_nr) { + pc->addr = CALLOC(pc->addr_nr * 4, sizeof(struct nv50_reg *)); + if (!pc->addr) + return FALSE; + } + for (i = 0; i < NV50_SU_MAX_ADDR; ++i) + ctor_reg(&pc->r_addr[i], P_ADDR, -256, i + 1); + + return TRUE; +} + +static void +nv50_fp_move_results(struct nv50_pc *pc) +{ + struct nv50_reg reg; + unsigned i; + + ctor_reg(&reg, P_TEMP, -1, -1); + + for (i = 0; i < pc->result_nr * 4; ++i) { + if (pc->result[i].rhw < 0 || pc->result[i].hw < 0) + continue; + if (pc->result[i].rhw != pc->result[i].hw) { + reg.hw = pc->result[i].rhw; + emit_mov(pc, &reg, &pc->result[i]); + } + } +} + +static void +nv50_program_fixup_insns(struct nv50_pc *pc) +{ + struct nv50_program_exec *e, *prev = NULL, **bra_list; + unsigned i, n, pos; + + bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *)); + + /* Collect branch instructions, we need to adjust their offsets + * when converting 32 bit instructions to 64 bit ones + */ + for (n = 0, e = pc->p->exec_head; e; e = e->next) + if (e->param.index >= 0 && !e->param.mask) + bra_list[n++] = e; + + 
/* Make sure we don't have any single 32 bit instructions. */ + for (e = pc->p->exec_head, pos = 0; e; e = e->next) { + pos += is_long(e) ? 2 : 1; + + if ((pos & 1) && (!e->next || is_long(e->next))) { + for (i = 0; i < n; ++i) + if (bra_list[i]->param.index >= pos) + bra_list[i]->param.index += 1; + convert_to_long(pc, e); + ++pos; + } + if (e->next) + prev = e; + } + + assert(!is_immd(pc->p->exec_head)); + assert(!is_immd(pc->p->exec_tail)); + + /* last instruction must be long so it can have the end bit set */ + if (!is_long(pc->p->exec_tail)) { + convert_to_long(pc, pc->p->exec_tail); + if (prev) + convert_to_long(pc, prev); + } + assert(!(pc->p->exec_tail->inst[1] & 2)); + /* set the end-bit */ + pc->p->exec_tail->inst[1] |= 1; + + FREE(bra_list); +} + +static boolean nv50_program_tx(struct nv50_program *p) { struct tgsi_parse_context parse; struct nv50_pc *pc; - unsigned k; boolean ret; pc = CALLOC_STRUCT(nv50_pc); if (!pc) return FALSE; - pc->p = p; - pc->p->cfg.high_temp = 4; + + ret = ctor_nv50_pc(pc, p); + if (ret == FALSE) + goto out_cleanup; ret = nv50_program_tx_prep(pc); if (ret == FALSE) @@ -2141,7 +2990,7 @@ nv50_program_tx(struct nv50_program *p) switch (tok->Token.Type) { case TGSI_TOKEN_TYPE_INSTRUCTION: ++pc->insn_cur; - ret = nv50_program_tx_insn(pc, tok); + ret = nv50_tgsi_insn(pc, tok); if (ret == FALSE) goto out_err; break; @@ -2150,48 +2999,10 @@ nv50_program_tx(struct nv50_program *p) } } - if (p->type == PIPE_SHADER_FRAGMENT) { - struct nv50_reg out; + if (pc->p->type == PIPE_SHADER_FRAGMENT) + nv50_fp_move_results(pc); - out.type = P_TEMP; - for (k = 0; k < pc->result_nr * 4; k++) { - if (pc->result[k].rhw == -1) - continue; - if (pc->result[k].hw != pc->result[k].rhw) { - out.hw = pc->result[k].rhw; - emit_mov(pc, &out, &pc->result[k]); - } - if (pc->p->cfg.high_result < (pc->result[k].rhw + 1)) - pc->p->cfg.high_result = pc->result[k].rhw + 1; - } - } - - /* look for single half instructions and make them long */ - struct 
nv50_program_exec *e, *e_prev; - - for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { - if (!is_long(e)) - k++; - - if (!e->next || is_long(e->next)) { - if (k & 1) - convert_to_long(pc, e); - k = 0; - } - - if (e->next) - e_prev = e; - } - - if (!is_long(pc->p->exec_tail)) { - /* this may occur if moving FP results */ - assert(e_prev && !is_long(e_prev)); - convert_to_long(pc, e_prev); - convert_to_long(pc, pc->p->exec_tail); - } - - assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); - pc->p->exec_tail->inst[1] |= 0x00000001; + nv50_program_fixup_insns(pc); p->param_nr = pc->param_nr * 4; p->immd_nr = pc->immd_nr * 4; @@ -2258,30 +3069,19 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) p->immd_nr, NV50_CB_PMISC); } - if (!p->data[1] && p->param_nr) { - struct nouveau_resource *heap = - nv50->screen->parm_heap[p->type]; - - if (nouveau_resource_alloc(heap, p->param_nr, p, &p->data[1])) { - while (heap->next && heap->size < p->param_nr) { - struct nv50_program *evict = heap->next->priv; - nouveau_resource_free(&evict->data[1]); - } - - if (nouveau_resource_alloc(heap, p->param_nr, p, - &p->data[1])) - assert(0); - } - } + assert(p->param_nr <= 512); if (p->param_nr) { - unsigned cbuf = NV50_CB_PVP; + unsigned cb; float *map = pipe_buffer_map(pscreen, nv50->constbuf[p->type], PIPE_BUFFER_USAGE_CPU_READ); - if (p->type == PIPE_SHADER_FRAGMENT) - cbuf = NV50_CB_PFP; - nv50_program_upload_data(nv50, map, p->data[1]->start, - p->param_nr, cbuf); + + if (p->type == PIPE_SHADER_VERTEX) + cb = NV50_CB_PVP; + else + cb = NV50_CB_PFP; + + nv50_program_upload_data(nv50, map, 0, p->param_nr, cb); pipe_buffer_unmap(pscreen, nv50->constbuf[p->type]); } } @@ -2290,11 +3090,8 @@ static void nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) { struct nouveau_channel *chan = nv50->screen->base.channel; - struct nouveau_grobj *tesla = nv50->screen->tesla; struct nv50_program_exec *e; - struct 
nouveau_stateobj *so; - const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; - unsigned start, count, *up, *ptr; + uint32_t *up, i; boolean upload = FALSE; if (!p->bo) { @@ -2303,32 +3100,46 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) upload = TRUE; } - if ((p->data[0] && p->data[0]->start != p->data_start[0]) || - (p->data[1] && p->data[1]->start != p->data_start[1])) { - for (e = p->exec_head; e; e = e->next) { - unsigned ei, ci, bs; + if (p->data[0] && p->data[0]->start != p->data_start[0]) + upload = TRUE; - if (e->param.index < 0) - continue; + if (!upload) + return; + + up = MALLOC(p->exec_size * 4); + + for (i = 0, e = p->exec_head; e; e = e->next) { + unsigned ei, ci, bs; + + if (e->param.index >= 0 && e->param.mask) { bs = (e->inst[1] >> 22) & 0x07; assert(bs < 2); ei = e->param.shift >> 5; - ci = e->param.index + p->data[bs]->start; + ci = e->param.index; + if (bs == 0) + ci += p->data[bs]->start; e->inst[ei] &= ~e->param.mask; e->inst[ei] |= (ci << e->param.shift); + } else + if (e->param.index >= 0) { + /* zero mask means param is a jump/branch offset */ + assert(!(e->param.index & 1)); + /* seem to be 8 byte steps */ + ei = (e->param.index >> 1) + 0 /* START_ID */; + + e->inst[0] &= 0xf0000fff; + e->inst[0] |= ei << 12; } - if (p->data[0]) - p->data_start[0] = p->data[0]->start; - if (p->data[1]) - p->data_start[1] = p->data[1]->start; - - upload = TRUE; + up[i++] = e->inst[0]; + if (is_long(e)) + up[i++] = e->inst[1]; } + assert(i == p->exec_size); - if (!upload) - return; + if (p->data[0]) + p->data_start[0] = p->data[0]->start; #ifdef NV50_PROGRAM_DUMP NOUVEAU_ERR("-------\n"); @@ -2338,45 +3149,12 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) NOUVEAU_ERR("0x%08x\n", e->inst[1]); } #endif - - up = ptr = MALLOC(p->exec_size * 4); - for (e = p->exec_head; e; e = e->next) { - *(ptr++) = e->inst[0]; - if (is_long(e)) - *(ptr++) = e->inst[1]; - } - - so = so_new(4,2); - 
so_method(so, nv50->screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); - so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, p->bo, 0, flags | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); - - start = 0; count = p->exec_size; - while (count) { - struct nouveau_channel *chan = nv50->screen->base.channel; - unsigned nr; - - so_emit(chan, so); - - nr = MIN2(count, 2047); - nr = MIN2(chan->pushbuf->remaining, nr); - if (chan->pushbuf->remaining < (nr + 3)) { - FIRE_RING(chan); - continue; - } - - BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 1); - OUT_RING (chan, (start << 8) | NV50_CB_PUPLOAD); - BEGIN_RING(chan, tesla, NV50TCL_CB_DATA(0) | 0x40000000, nr); - OUT_RINGp (chan, up + start, nr); - - start += nr; - count -= nr; - } + nv50_upload_sifc(nv50, p->bo, 0, NOUVEAU_BO_VRAM, + NV50_2D_DST_FORMAT_R8_UNORM, 65536, 1, 262144, + up, NV50_2D_SIFC_FORMAT_R8_UNORM, 0, + 0, 0, p->exec_size * 4, 1, 1); FREE(up); - so_ref(NULL, &so); } void @@ -2402,8 +3180,8 @@ nv50_vertprog_validate(struct nv50_context *nv50) so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); so_method(so, tesla, NV50TCL_VP_ATTR_EN_0, 2); - so_data (so, p->cfg.vp.attr[0]); - so_data (so, p->cfg.vp.attr[1]); + so_data (so, p->cfg.attr[0]); + so_data (so, p->cfg.attr[1]); so_method(so, tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1); so_data (so, p->cfg.high_result); so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 2); @@ -2421,7 +3199,6 @@ nv50_fragprog_validate(struct nv50_context *nv50) struct nouveau_grobj *tesla = nv50->screen->tesla; struct nv50_program *p = nv50->fragprog; struct nouveau_stateobj *so; - unsigned i; if (!p->translated) { nv50_program_validate(nv50, p); @@ -2438,29 +3215,186 @@ nv50_fragprog_validate(struct nv50_context *nv50) NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, p->bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); - so_data (so, 
p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 */ - so_data (so, 0x00000004); - so_data (so, 0x00000000); - so_data (so, 0x00000000); - so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), p->cfg.fp.high_map); - for (i = 0; i < p->cfg.fp.high_map; i++) - so_data(so, p->cfg.fp.map[i]); - so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 2); - so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 */ + so_method(so, tesla, NV50TCL_FP_REG_ALLOC_TEMP, 1); so_data (so, p->cfg.high_temp); so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1); so_data (so, p->cfg.high_result); so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1); - so_data (so, p->cfg.fp.regs[2]); + so_data (so, p->cfg.regs[2]); so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1); - so_data (so, p->cfg.fp.regs[3]); + so_data (so, p->cfg.regs[3]); so_method(so, tesla, NV50TCL_FP_START_ID, 1); so_data (so, 0); /* program start offset */ so_ref(so, &nv50->state.fragprog); so_ref(NULL, &so); } +static void +nv50_pntc_replace(struct nv50_context *nv50, uint32_t pntc[8], unsigned base) +{ + struct nv50_program *fp = nv50->fragprog; + struct nv50_program *vp = nv50->vertprog; + unsigned i, c, m = base; + + /* XXX: This can't work correctly in all cases yet, we either + * have to create TGSI_SEMANTIC_PNTC or sprite_coord_mode has + * to be per FP input instead of per VP output + */ + memset(pntc, 0, 8 * sizeof(uint32_t)); + + for (i = 0; i < fp->cfg.io_nr; i++) { + uint8_t sn, si; + uint8_t j = fp->cfg.io[i].id_vp, k = fp->cfg.io[i].id_fp; + unsigned n = popcnt4(fp->cfg.io[i].mask); + + if (fp->info.input_semantic_name[k] != TGSI_SEMANTIC_GENERIC) { + m += n; + continue; + } + + sn = vp->info.input_semantic_name[j]; + si = vp->info.input_semantic_index[j]; + + if (j < fp->cfg.io_nr && sn == TGSI_SEMANTIC_GENERIC) { + ubyte mode = + nv50->rasterizer->pipe.sprite_coord_mode[si]; + + if (mode == PIPE_SPRITE_COORD_NONE) { + m += n; + continue; + } + } + + /* this is either PointCoord or replaced by sprite coords */ + for (c = 0; 
c < 4; c++) { + if (!(fp->cfg.io[i].mask & (1 << c))) + continue; + pntc[m / 8] |= (c + 1) << ((m % 8) * 4); + ++m; + } + } +} + +static int +nv50_sreg4_map(uint32_t *p_map, int mid, uint32_t lin[4], + struct nv50_sreg4 *fpi, struct nv50_sreg4 *vpo) +{ + int c; + uint8_t mv = vpo->mask, mf = fpi->mask, oid = vpo->hw; + uint8_t *map = (uint8_t *)p_map; + + for (c = 0; c < 4; ++c) { + if (mf & 1) { + if (fpi->linear == TRUE) + lin[mid / 32] |= 1 << (mid % 32); + map[mid++] = (mv & 1) ? oid : ((c == 3) ? 0x41 : 0x40); + } + + oid += mv & 1; + mf >>= 1; + mv >>= 1; + } + + return mid; +} + +void +nv50_linkage_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *vp = nv50->vertprog; + struct nv50_program *fp = nv50->fragprog; + struct nouveau_stateobj *so; + struct nv50_sreg4 dummy, *vpo; + int i, n, c, m = 0; + uint32_t map[16], lin[4], reg[5], pcrd[8]; + + memset(map, 0, sizeof(map)); + memset(lin, 0, sizeof(lin)); + + reg[1] = 0x00000004; /* low and high clip distance map ids */ + reg[2] = 0x00000000; /* layer index map id (disabled, GP only) */ + reg[3] = 0x00000000; /* point size map id & enable */ + reg[0] = fp->cfg.regs[0]; /* colour semantic reg */ + reg[4] = fp->cfg.regs[1]; /* interpolant info */ + + dummy.linear = FALSE; + dummy.mask = 0xf; /* map all components of HPOS */ + m = nv50_sreg4_map(map, m, lin, &dummy, &vp->cfg.io[0]); + + dummy.mask = 0x0; + + if (vp->cfg.clpd < 0x40) { + for (c = 0; c < vp->cfg.clpd_nr; ++c) + map[m++] = vp->cfg.clpd + c; + reg[1] = (m << 8); + } + + reg[0] |= m << 8; /* adjust BFC0 id */ + + /* if light_twoside is active, it seems FFC0_ID == BFC0_ID is bad */ + if (nv50->rasterizer->pipe.light_twoside) { + vpo = &vp->cfg.two_side[0]; + + m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[0], &vpo[0]); + m = nv50_sreg4_map(map, m, lin, &fp->cfg.two_side[1], &vpo[1]); + } + + reg[0] += m - 4; /* adjust FFC0 id */ + reg[4] |= m << 8; /* set mid where 'normal' FP inputs 
start */ + + i = 0; + if (fp->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) + i = 1; + for (; i < fp->cfg.io_nr; i++) { + ubyte sn = fp->info.input_semantic_name[fp->cfg.io[i].id_fp]; + ubyte si = fp->info.input_semantic_index[fp->cfg.io[i].id_fp]; + + n = fp->cfg.io[i].id_vp; + if (n >= vp->cfg.io_nr || + vp->info.output_semantic_name[n] != sn || + vp->info.output_semantic_index[n] != si) + vpo = &dummy; + else + vpo = &vp->cfg.io[n]; + + m = nv50_sreg4_map(map, m, lin, &fp->cfg.io[i], vpo); + } + + if (nv50->rasterizer->pipe.point_size_per_vertex) { + map[m / 4] |= vp->cfg.psiz << ((m % 4) * 8); + reg[3] = (m++ << 4) | 1; + } + + /* now fill the stateobj */ + so = so_new(64, 0); + + n = (m + 3) / 4; + so_method(so, tesla, NV50TCL_VP_RESULT_MAP_SIZE, 1); + so_data (so, m); + so_method(so, tesla, NV50TCL_VP_RESULT_MAP(0), n); + so_datap (so, map, n); + + so_method(so, tesla, NV50TCL_MAP_SEMANTIC_0, 4); + so_datap (so, reg, 4); + + so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1); + so_data (so, reg[4]); + + so_method(so, tesla, 0x1540, 4); + so_datap (so, lin, 4); + + if (nv50->rasterizer->pipe.point_sprite) { + nv50_pntc_replace(nv50, pcrd, (reg[4] >> 8) & 0xff); + + so_method(so, tesla, NV50TCL_POINT_COORD_REPLACE_MAP(0), 8); + so_datap (so, pcrd, 8); + } + + so_ref(so, &nv50->state.programs); + so_ref(NULL, &so); +} + void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) { @@ -2476,7 +3410,6 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) nouveau_bo_ref(NULL, &p->bo); nouveau_resource_free(&p->data[0]); - nouveau_resource_free(&p->data[1]); p->translated = 0; } diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 096e0476aab..d78dee083f1 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -15,6 +15,15 @@ struct nv50_program_exec { } param; }; +struct nv50_sreg4 { + uint8_t hw; + uint8_t id_vp; + uint8_t 
id_fp; + + uint8_t mask; + boolean linear; +}; + struct nv50_program { struct pipe_shader_state pipe; struct tgsi_shader_info info; @@ -24,8 +33,8 @@ struct nv50_program { struct nv50_program_exec *exec_head; struct nv50_program_exec *exec_tail; unsigned exec_size; - struct nouveau_resource *data[2]; - unsigned data_start[2]; + struct nouveau_resource *data[1]; + unsigned data_start[1]; struct nouveau_bo *bo; @@ -36,14 +45,20 @@ struct nv50_program { struct { unsigned high_temp; unsigned high_result; - struct { - unsigned attr[2]; - } vp; - struct { - unsigned regs[4]; - unsigned map[5]; - unsigned high_map; - } fp; + + uint32_t attr[2]; + uint32_t regs[4]; + + /* for VPs, io_nr doesn't count 'private' results (PSIZ etc.) */ + unsigned io_nr; + struct nv50_sreg4 io[PIPE_MAX_SHADER_OUTPUTS]; + + /* FP colour inputs, VP/GP back colour outputs */ + struct nv50_sreg4 two_side[2]; + + /* VP only */ + uint8_t clpd, clpd_nr; + uint8_t psiz; } cfg; }; diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c index c7f80a22037..c8d0f1e4d82 100644 --- a/src/gallium/drivers/nv50/nv50_screen.c +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -35,8 +35,14 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, { if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { switch (format) { + case PIPE_FORMAT_X8R8G8B8_UNORM: case PIPE_FORMAT_A8R8G8B8_UNORM: case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_R16G16B16A16_SNORM: + case PIPE_FORMAT_R16G16B16A16_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16_SNORM: + case PIPE_FORMAT_R16G16_UNORM: return TRUE; default: break; @@ -55,6 +61,9 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, } else { switch (format) { case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_A8R8G8B8_SRGB: + case PIPE_FORMAT_X8R8G8B8_SRGB: case PIPE_FORMAT_A1R5G5B5_UNORM: case PIPE_FORMAT_A4R4G4B4_UNORM: case PIPE_FORMAT_R5G6B5_UNORM: @@ -66,6 +75,13 @@ 
nv50_screen_is_format_supported(struct pipe_screen *pscreen, case PIPE_FORMAT_DXT1_RGBA: case PIPE_FORMAT_DXT3_RGBA: case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + case PIPE_FORMAT_R16G16B16A16_SNORM: + case PIPE_FORMAT_R16G16B16A16_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16_SNORM: + case PIPE_FORMAT_R16G16_UNORM: return TRUE; default: break; @@ -87,12 +103,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param) return 1; case PIPE_CAP_GLSL: return 0; - case PIPE_CAP_S3TC: - return 1; case PIPE_CAP_ANISOTROPIC_FILTER: return 1; case PIPE_CAP_POINT_SPRITE: - return 0; + return 1; case PIPE_CAP_MAX_RENDER_TARGETS: return 8; case PIPE_CAP_OCCLUSION_QUERY: @@ -218,7 +232,16 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) tesla_class = NV54TCL; break; case 0xa0: - tesla_class = NVA0TCL; + switch (chipset) { + case 0xa0: + case 0xaa: + case 0xac: + tesla_class = NVA0TCL; + break; + default: + tesla_class = 0x8597; + break; + } break; default: NOUVEAU_ERR("Not a known NV50 chipset: NV%02x\n", chipset); @@ -226,12 +249,6 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) return NULL; } - if (tesla_class == 0) { - NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset); - nv50_screen_destroy(pscreen); - return NULL; - } - ret = nouveau_grobj_alloc(chan, 0xbeef5097, tesla_class, &screen->tesla); if (ret) { @@ -292,6 +309,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) so_method(so, screen->tesla, 0x121c, 1); so_data (so, 1); + /* try to activate all/more lanes (threads) in a warp */ + so_method(so, screen->tesla, 0x1400, 1); + so_data (so, 0xf); + so_method(so, screen->tesla, 0x13bc, 1); so_data (so, 0x54); /* origin is top left (set to 1 for bottom left) */ @@ -301,7 +322,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) so_data (so, 8); /* constant buffers for immediates and VP/FP 
parameters */ - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4, + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (32 * 4) * 4, &screen->constbuf_misc[0]); if (ret) { nv50_screen_destroy(pscreen); @@ -309,7 +330,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } for (i = 0; i < 2; i++) { - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 128*4*4, + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, (128 * 4) * 4, &screen->constbuf_parm[i]); if (ret) { nv50_screen_destroy(pscreen); @@ -318,8 +339,8 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } if (nouveau_resource_init(&screen->immd_heap[0], 0, 128) || - nouveau_resource_init(&screen->parm_heap[0], 0, 128) || - nouveau_resource_init(&screen->parm_heap[1], 0, 128)) + nouveau_resource_init(&screen->parm_heap[0], 0, 512) || + nouveau_resource_init(&screen->parm_heap[1], 0, 512)) { NOUVEAU_ERR("Error initialising constant buffers.\n"); nv50_screen_destroy(pscreen); @@ -340,7 +361,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_PMISC << 16) | 0x00000800); + so_data (so, (NV50_CB_PMISC << 16) | 0x00000200); so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1); so_data (so, 0x00000001 | (NV50_CB_PMISC << 12)); so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1); @@ -364,48 +385,31 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1); so_data (so, 0x00000131 | (NV50_CB_PFP << 12)); - /* Texture sampler/image unit setup - we abuse the constant buffer - * upload mechanism for the moment to upload data to the tex config - * blocks. At some point we *may* want to go the NVIDIA way of doing - * things? 
- */ - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tic); + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tic); if (ret) { nv50_screen_destroy(pscreen); return NULL; } - so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); - so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_TIC << 16) | 0x0800); so_method(so, screen->tesla, NV50TCL_TIC_ADDRESS_HIGH, 3); so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, 0x00000800); + so_data (so, 0x000007ff); - ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 32*8*4, &screen->tsc); + ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 64*8*4, &screen->tsc); if (ret) { nv50_screen_destroy(pscreen); return NULL; } - so_method(so, screen->tesla, NV50TCL_CB_DEF_ADDRESS_HIGH, 3); - so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); - so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | - NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, (NV50_CB_TSC << 16) | 0x0800); so_method(so, screen->tesla, NV50TCL_TSC_ADDRESS_HIGH, 3); so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); - so_data (so, 0x00000800); + so_data (so, 0x00000000); /* Vertex array limits - max them out */ diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c index 4283808ed93..ffaa5e29d1c 100644 --- a/src/gallium/drivers/nv50/nv50_state.c +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -146,6 +146,7 @@ nv50_sampler_state_create(struct pipe_context *pipe, (wrap_mode(cso->wrap_r) << 6)); switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_ANISO: case 
PIPE_TEX_FILTER_LINEAR: tsc[1] |= NV50TSC_1_1_MAGF_LINEAR; break; @@ -156,6 +157,7 @@ nv50_sampler_state_create(struct pipe_context *pipe, } switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_ANISO: case PIPE_TEX_FILTER_LINEAR: tsc[1] |= NV50TSC_1_1_MINF_LINEAR; break; @@ -183,21 +185,15 @@ nv50_sampler_state_create(struct pipe_context *pipe, else if (cso->max_anisotropy >= 12.0) tsc[0] |= (6 << 20); - else - if (cso->max_anisotropy >= 10.0) - tsc[0] |= (5 << 20); - else - if (cso->max_anisotropy >= 8.0) - tsc[0] |= (4 << 20); - else - if (cso->max_anisotropy >= 6.0) - tsc[0] |= (3 << 20); - else - if (cso->max_anisotropy >= 4.0) - tsc[0] |= (2 << 20); - else - if (cso->max_anisotropy >= 2.0) - tsc[0] |= (1 << 20); + else { + tsc[0] |= (int)(cso->max_anisotropy * 0.5f) << 20; + + if (cso->max_anisotropy >= 4.0) + tsc[1] |= NV50TSC_1_1_UNKN_ANISO_35; + else + if (cso->max_anisotropy >= 2.0) + tsc[1] |= NV50TSC_1_1_UNKN_ANISO_15; + } if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { tsc[0] |= (1 << 8); @@ -276,6 +272,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe, so_method(so, tesla, 0x1684, 1); so_data (so, cso->flatshade_first ? 
0 : 1); + so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1); + so_data (so, cso->light_twoside); + so_method(so, tesla, NV50TCL_LINE_WIDTH, 1); so_data (so, fui(cso->line_width)); so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1); @@ -294,6 +293,9 @@ nv50_rasterizer_state_create(struct pipe_context *pipe, so_method(so, tesla, NV50TCL_POINT_SIZE, 1); so_data (so, fui(cso->point_size)); + so_method(so, tesla, NV50TCL_POINT_SPRITE_ENABLE, 1); + so_data (so, cso->point_sprite); + so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3); if (cso->front_winding == PIPE_WINDING_CCW) { so_data(so, nvgl_polygon_mode(cso->fill_ccw)); diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c index 344c2cf6dde..a13d64b7fa7 100644 --- a/src/gallium/drivers/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -23,6 +23,12 @@ #include "nv50_context.h" #include "nouveau/nouveau_stateobj.h" +#define NV50_CBUF_FORMAT_CASE(n) \ + case PIPE_FORMAT_##n: so_data(so, NV50TCL_RT_FORMAT_##n); break + +#define NV50_ZETA_FORMAT_CASE(n) \ + case PIPE_FORMAT_##n: so_data(so, NV50TCL_ZETA_FORMAT_##n); break + static void nv50_state_validate_fb(struct nv50_context *nv50) { @@ -31,6 +37,14 @@ nv50_state_validate_fb(struct nv50_context *nv50) struct pipe_framebuffer_state *fb = &nv50->framebuffer; unsigned i, w, h, gw = 0; + /* Set nr of active RTs. Don't know what 0xfac6880 does, but + * at least 0x880 was required to draw to more than 1 RT. + * In some special cases, 0xfac6880 is not used, we probably + * don't hit any of these though. 
+ */ + so_method(so, tesla, 0x121c, 1); + so_data (so, 0x0fac6880 | fb->nr_cbufs); + for (i = 0; i < fb->nr_cbufs; i++) { struct pipe_texture *pt = fb->cbufs[i]->texture; struct nouveau_bo *bo = nv50_miptree(pt)->base.bo; @@ -54,19 +68,22 @@ nv50_state_validate_fb(struct nv50_context *nv50) so_reloc (so, bo, fb->cbufs[i]->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); switch (fb->cbufs[i]->format) { - case PIPE_FORMAT_A8R8G8B8_UNORM: - so_data(so, NV50TCL_RT_FORMAT_A8R8G8B8_UNORM); - break; - case PIPE_FORMAT_R5G6B5_UNORM: - so_data(so, NV50TCL_RT_FORMAT_R5G6B5_UNORM); - break; + NV50_CBUF_FORMAT_CASE(A8R8G8B8_UNORM); + NV50_CBUF_FORMAT_CASE(X8R8G8B8_UNORM); + NV50_CBUF_FORMAT_CASE(R5G6B5_UNORM); + NV50_CBUF_FORMAT_CASE(R16G16B16A16_SNORM); + NV50_CBUF_FORMAT_CASE(R16G16B16A16_UNORM); + NV50_CBUF_FORMAT_CASE(R32G32B32A32_FLOAT); + NV50_CBUF_FORMAT_CASE(R16G16_SNORM); + NV50_CBUF_FORMAT_CASE(R16G16_UNORM); default: NOUVEAU_ERR("AIIII unknown format %s\n", pf_name(fb->cbufs[i]->format)); so_data(so, NV50TCL_RT_FORMAT_X8R8G8B8_UNORM); break; } - so_data(so, bo->tile_mode << 4); + so_data(so, nv50_miptree(pt)-> + level[fb->cbufs[i]->level].tile_mode << 4); so_data(so, 0x00000000); so_method(so, tesla, 0x1224, 1); @@ -92,25 +109,18 @@ nv50_state_validate_fb(struct nv50_context *nv50) so_reloc (so, bo, fb->zsbuf->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | NOUVEAU_BO_RDWR, 0, 0); switch (fb->zsbuf->format) { - case PIPE_FORMAT_Z32_FLOAT: - so_data(so, NV50TCL_ZETA_FORMAT_Z32_FLOAT); - break; - case PIPE_FORMAT_Z24S8_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_Z24S8_UNORM); - break; - case PIPE_FORMAT_X8Z24_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_X8Z24_UNORM); - break; - case PIPE_FORMAT_S8Z24_UNORM: - so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); - break; + NV50_ZETA_FORMAT_CASE(S8Z24_UNORM); + NV50_ZETA_FORMAT_CASE(X8Z24_UNORM); + NV50_ZETA_FORMAT_CASE(Z24S8_UNORM); + NV50_ZETA_FORMAT_CASE(Z32_FLOAT); default: NOUVEAU_ERR("AIIII unknown format %s\n", 
pf_name(fb->zsbuf->format)); so_data(so, NV50TCL_ZETA_FORMAT_S8Z24_UNORM); break; } - so_data(so, bo->tile_mode << 4); + so_data(so, nv50_miptree(pt)-> + level[fb->zsbuf->level].tile_mode << 4); so_data(so, 0x00000000); so_method(so, tesla, 0x1538, 1); @@ -119,6 +129,9 @@ nv50_state_validate_fb(struct nv50_context *nv50) so_data (so, fb->zsbuf->width); so_data (so, fb->zsbuf->height); so_data (so, 0x00010001); + } else { + so_method(so, tesla, 0x1538, 1); + so_data (so, 0); } so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2); @@ -187,6 +200,8 @@ nv50_state_emit(struct nv50_context *nv50) so_emit(chan, nv50->state.vertprog); if (nv50->state.dirty & NV50_NEW_FRAGPROG) so_emit(chan, nv50->state.fragprog); + if (nv50->state.dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG)) + so_emit(chan, nv50->state.programs); if (nv50->state.dirty & NV50_NEW_RASTERIZER) so_emit(chan, nv50->state.rast); if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR) @@ -208,6 +223,15 @@ nv50_state_emit(struct nv50_context *nv50) so_emit(chan, nv50->state.vtxattr); } nv50->state.dirty = 0; +} + +void +nv50_state_flush_notify(struct nouveau_channel *chan) +{ + struct nv50_context *nv50 = chan->user_private; + + if (nv50->state.tic_upload && !(nv50->dirty & NV50_NEW_TEXTURE)) + so_emit(chan, nv50->state.tic_upload); so_emit_reloc_markers(chan, nv50->state.fb); so_emit_reloc_markers(chan, nv50->state.vertprog); @@ -220,6 +244,7 @@ boolean nv50_state_validate(struct nv50_context *nv50) { struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; struct nouveau_stateobj *so; unsigned i; @@ -238,6 +263,9 @@ nv50_state_validate(struct nv50_context *nv50) if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB)) nv50_fragprog_validate(nv50); + if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG)) + nv50_linkage_validate(nv50); + if (nv50->dirty & NV50_NEW_RASTERIZER) so_ref(nv50->rasterizer->so, &nv50->state.rast); @@ -299,7 +327,7 @@ scissor_uptodate: goto 
viewport_uptodate; nv50->state.viewport_bypass = bypass; - so = so_new(12, 0); + so = so_new(14, 0); if (!bypass) { so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3); so_data (so, fui(nv50->viewport.translate[0])); @@ -312,12 +340,21 @@ scissor_uptodate: so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1); so_data (so, 1); + /* 0x0000 = remove whole primitive only (xyz) + * 0x1018 = remove whole primitive only (xy), clamp z + * 0x1080 = clip primitive (xyz) + * 0x1098 = clip primitive (xy), clamp z + */ + so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1); + so_data (so, 0x1080); /* no idea what 0f90 does */ so_method(so, tesla, 0x0f90, 1); so_data (so, 0); } else { so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1); so_data (so, 0); + so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1); + so_data (so, 0x0000); so_method(so, tesla, 0x0f90, 1); so_data (so, 1); } @@ -329,15 +366,25 @@ scissor_uptodate: viewport_uptodate: if (nv50->dirty & NV50_NEW_SAMPLER) { - int i; - - so = so_new(nv50->sampler_nr * 8 + 3, 0); - so_method(so, tesla, NV50TCL_CB_ADDR, 1); - so_data (so, NV50_CB_TSC); - so_method(so, tesla, NV50TCL_CB_DATA(0) | 0x40000000, - nv50->sampler_nr * 8); - for (i = 0; i < nv50->sampler_nr; i++) + unsigned i; + + so = so_new(nv50->sampler_nr * 9 + 23 + 4, 2); + + nv50_so_init_sifc(nv50, so, nv50->screen->tsc, NOUVEAU_BO_VRAM, + nv50->sampler_nr * 8 * 4); + + for (i = 0; i < nv50->sampler_nr; i++) { + if (!nv50->sampler[i]) + continue; + so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8); so_datap (so, nv50->sampler[i]->tsc, 8); + } + + so_method(so, tesla, 0x1440, 1); /* sync SIFC */ + so_data (so, 0); + so_method(so, tesla, 0x1334, 1); /* flush TSC */ + so_data (so, 0); + so_ref(so, &nv50->state.tsc_upload); so_ref(NULL, &so); } @@ -355,3 +402,33 @@ viewport_uptodate: return TRUE; } +void nv50_so_init_sifc(struct nv50_context *nv50, + struct nouveau_stateobj *so, + struct nouveau_bo *bo, unsigned reloc, unsigned size) +{ + struct 
nouveau_grobj *eng2d = nv50->screen->eng2d; + + so_method(so, eng2d, NV50_2D_DST_FORMAT, 2); + so_data (so, NV50_2D_DST_FORMAT_R8_UNORM); + so_data (so, 1); + so_method(so, eng2d, NV50_2D_DST_PITCH, 5); + so_data (so, 262144); + so_data (so, 65536); + so_data (so, 1); + so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, bo, 0, reloc | NOUVEAU_BO_WR | NOUVEAU_BO_LOW, 0, 0); + so_method(so, eng2d, NV50_2D_SIFC_UNK0800, 2); + so_data (so, 0); + so_data (so, NV50_2D_SIFC_FORMAT_R8_UNORM); + so_method(so, eng2d, NV50_2D_SIFC_WIDTH, 10); + so_data (so, size); + so_data (so, 1); + so_data (so, 0); + so_data (so, 1); + so_data (so, 0); + so_data (so, 1); + so_data (so, 0); + so_data (so, 0); + so_data (so, 0); + so_data (so, 0); +} diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c index b266324f58d..6bf6f773b0c 100644 --- a/src/gallium/drivers/nv50/nv50_surface.c +++ b/src/gallium/drivers/nv50/nv50_surface.c @@ -60,13 +60,13 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst) format = nv50_format(ps->format); if (format < 0) return 1; - + if (!bo->tile_flags) { BEGIN_RING(chan, eng2d, mthd, 2); OUT_RING (chan, format); OUT_RING (chan, 1); BEGIN_RING(chan, eng2d, mthd + 0x14, 5); - OUT_RING (chan, mt->level[0].pitch); + OUT_RING (chan, mt->level[ps->level].pitch); OUT_RING (chan, ps->width); OUT_RING (chan, ps->height); OUT_RELOCh(chan, bo, ps->offset, flags); @@ -75,7 +75,7 @@ nv50_surface_set(struct nv50_screen *screen, struct pipe_surface *ps, int dst) BEGIN_RING(chan, eng2d, mthd, 5); OUT_RING (chan, format); OUT_RING (chan, 0); - OUT_RING (chan, bo->tile_mode << 4); + OUT_RING (chan, mt->level[ps->level].tile_mode << 4); OUT_RING (chan, 1); OUT_RING (chan, 0); BEGIN_RING(chan, eng2d, mthd + 0x18, 4); diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c index 033cb50c115..2813f544770 100644 --- 
a/src/gallium/drivers/nv50/nv50_tex.c +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -25,109 +25,115 @@ #include "nouveau/nouveau_stateobj.h" +#define _MIXED(pf, t0, t1, t2, t3, cr, cg, cb, ca, f) \ +{ \ + PIPE_FORMAT_##pf, \ + NV50TIC_0_0_MAPR_##cr | NV50TIC_0_0_TYPER_##t0 | \ + NV50TIC_0_0_MAPG_##cg | NV50TIC_0_0_TYPEG_##t1 | \ + NV50TIC_0_0_MAPB_##cb | NV50TIC_0_0_TYPEB_##t2 | \ + NV50TIC_0_0_MAPA_##ca | NV50TIC_0_0_TYPEA_##t3 | \ + NV50TIC_0_0_FMT_##f \ +} + +#define _(pf, t, cr, cg, cb, ca, f) _MIXED(pf, t, t, t, t, cr, cg, cb, ca, f) + +struct nv50_texture_format { + enum pipe_format pf; + uint32_t hw; +}; + +#define NV50_TEX_FORMAT_LIST_SIZE \ + (sizeof(nv50_tex_format_list) / sizeof(struct nv50_texture_format)) + +static const struct nv50_texture_format nv50_tex_format_list[] = +{ + _(A8R8G8B8_UNORM, UNORM, C2, C1, C0, C3, 8_8_8_8), + _(A8R8G8B8_SRGB, UNORM, C2, C1, C0, C3, 8_8_8_8), + _(X8R8G8B8_UNORM, UNORM, C2, C1, C0, ONE, 8_8_8_8), + _(X8R8G8B8_SRGB, UNORM, C2, C1, C0, ONE, 8_8_8_8), + _(A1R5G5B5_UNORM, UNORM, C2, C1, C0, C3, 1_5_5_5), + _(A4R4G4B4_UNORM, UNORM, C2, C1, C0, C3, 4_4_4_4), + + _(R5G6B5_UNORM, UNORM, C2, C1, C0, ONE, 5_6_5), + + _(L8_UNORM, UNORM, C0, C0, C0, ONE, 8), + _(A8_UNORM, UNORM, ZERO, ZERO, ZERO, C0, 8), + _(I8_UNORM, UNORM, C0, C0, C0, C0, 8), + + _(A8L8_UNORM, UNORM, C0, C0, C0, C1, 8_8), + + _(DXT1_RGB, UNORM, C0, C1, C2, ONE, DXT1), + _(DXT1_RGBA, UNORM, C0, C1, C2, C3, DXT1), + _(DXT3_RGBA, UNORM, C0, C1, C2, C3, DXT3), + _(DXT5_RGBA, UNORM, C0, C1, C2, C3, DXT5), + + _MIXED(Z24S8_UNORM, UINT, UNORM, UINT, UINT, C1, C1, C1, ONE, 24_8), + + _(R16G16B16A16_SNORM, UNORM, C0, C1, C2, C3, 16_16_16_16), + _(R16G16B16A16_UNORM, SNORM, C0, C1, C2, C3, 16_16_16_16), + _(R32G32B32A32_FLOAT, FLOAT, C0, C1, C2, C3, 32_32_32_32), + + _(R16G16_SNORM, SNORM, C0, C1, ZERO, ONE, 16_16), + _(R16G16_UNORM, UNORM, C0, C1, ZERO, ONE, 16_16), + + _MIXED(Z32_FLOAT, FLOAT, UINT, UINT, UINT, C0, C0, C0, ONE, 32_DEPTH) + +}; + +#undef _ +#undef _MIXED 
+ static int nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so, struct nv50_miptree *mt, int unit) { - switch (mt->base.base.format) { - case PIPE_FORMAT_A8R8G8B8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8_8_8_8); - break; - case PIPE_FORMAT_A1R5G5B5_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_1_5_5_5); - break; - case PIPE_FORMAT_A4R4G4B4_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_4_4_4_4); - break; - case PIPE_FORMAT_R5G6B5_UNORM: - so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C2 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_5_6_5); - break; - case PIPE_FORMAT_L8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8); - break; - case PIPE_FORMAT_A8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8); - break; - case PIPE_FORMAT_I8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - 
NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8); - break; - case PIPE_FORMAT_A8L8_UNORM: - so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_8_8); - break; - case PIPE_FORMAT_DXT1_RGB: - so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT1); + unsigned i; + uint32_t mode; + + for (i = 0; i < NV50_TEX_FORMAT_LIST_SIZE; i++) + if (nv50_tex_format_list[i].pf == mt->base.base.format) + break; + if (i == NV50_TEX_FORMAT_LIST_SIZE) + return 1; + + if (nv50->sampler[unit]->normalized) + mode = 0x50001000 | (1 << 31); + else { + mode = 0x50001000 | (7 << 14); + assert(mt->base.base.target == PIPE_TEXTURE_2D); + } + + mode |= ((mt->base.bo->tile_mode & 0x0f) << 22) | + ((mt->base.bo->tile_mode & 0xf0) << 21); + + if (pf_type(mt->base.base.format) == PIPE_FORMAT_TYPE_SRGB) + mode |= 0x0400; + + switch (mt->base.base.target) { + case PIPE_TEXTURE_1D: break; - case PIPE_FORMAT_DXT1_RGBA: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT1); + case PIPE_TEXTURE_2D: + mode |= (1 << 14); break; - case PIPE_FORMAT_DXT3_RGBA: - so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT3); + case PIPE_TEXTURE_3D: + mode |= (2 << 14); break; - case PIPE_FORMAT_DXT5_RGBA: - so_data(so, NV50TIC_0_0_MAPA_C3 | 
NV50TIC_0_0_TYPEA_UNORM | - NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | - NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | - NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | - NV50TIC_0_0_FMT_DXT5); + case PIPE_TEXTURE_CUBE: + mode |= (3 << 14); break; default: - return 1; + assert(!"unsupported texture target"); + break; } + so_data (so, nv50_tex_format_list[i].hw); so_reloc(so, mt->base.bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW | - NOUVEAU_BO_RD, 0, 0); - if (nv50->sampler[unit]->normalized) - so_data (so, 0xd0005000 | mt->base.bo->tile_mode << 22); - else - so_data (so, 0x5001d000 | mt->base.bo->tile_mode << 22); + NOUVEAU_BO_RD, 0, 0); + so_data (so, mode); so_data (so, 0x00300000); - so_data (so, mt->base.base.width[0]); + so_data (so, mt->base.base.width[0] | (1 << 31)); so_data (so, (mt->base.base.last_level << 28) | - (mt->base.base.depth[0] << 16) | mt->base.base.height[0]); + (mt->base.base.depth[0] << 16) | mt->base.base.height[0]); so_data (so, 0x03000000); so_data (so, mt->base.base.last_level << 4); @@ -137,20 +143,24 @@ nv50_tex_construct(struct nv50_context *nv50, struct nouveau_stateobj *so, void nv50_tex_validate(struct nv50_context *nv50) { + struct nouveau_grobj *eng2d = nv50->screen->eng2d; struct nouveau_grobj *tesla = nv50->screen->tesla; struct nouveau_stateobj *so; - int unit, push; + unsigned i, unit, push; + + push = MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2 + 23 + 6; + so = so_new(nv50->miptree_nr * 9 + push, nv50->miptree_nr * 2 + 2); - push = nv50->miptree_nr * 9 + 2; - push += MAX2(nv50->miptree_nr, nv50->state.miptree_nr) * 2; + nv50_so_init_sifc(nv50, so, nv50->screen->tic, NOUVEAU_BO_VRAM, + nv50->miptree_nr * 8 * 4); - so = so_new(push, nv50->miptree_nr * 2); - so_method(so, tesla, NV50TCL_CB_ADDR, 1); - so_data (so, NV50_CB_TIC); - for (unit = 0; unit < nv50->miptree_nr; unit++) { + for (i = 0, unit = 0; unit < nv50->miptree_nr; ++unit) { struct nv50_miptree *mt = nv50->miptree[unit]; - so_method(so, tesla, 
NV50TCL_CB_DATA(0) | 0x40000000, 8); + if (!mt) + continue; + + so_method(so, eng2d, NV50_2D_SIFC_DATA | (2 << 29), 8); if (nv50_tex_construct(nv50, so, mt, unit)) { NOUVEAU_ERR("failed tex validate\n"); so_ref(NULL, &so); @@ -158,17 +168,25 @@ nv50_tex_validate(struct nv50_context *nv50) } so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1); - so_data (so, (unit << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) | - (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | - NV50TCL_SET_SAMPLER_TEX_VALID); + so_data (so, (i++ << NV50TCL_SET_SAMPLER_TEX_TIC_SHIFT) | + (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | + NV50TCL_SET_SAMPLER_TEX_VALID); } for (; unit < nv50->state.miptree_nr; unit++) { so_method(so, tesla, NV50TCL_SET_SAMPLER_TEX, 1); so_data (so, - (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0); + (unit << NV50TCL_SET_SAMPLER_TEX_SAMPLER_SHIFT) | 0); } + /* not sure if the following really do what I think: */ + so_method(so, tesla, 0x1440, 1); /* sync SIFC */ + so_data (so, 0); + so_method(so, tesla, 0x1330, 1); /* flush TIC */ + so_data (so, 0); + so_method(so, tesla, 0x1338, 1); /* flush texture caches */ + so_data (so, 0x20); + so_ref(so, &nv50->state.tic_upload); so_ref(NULL, &so); nv50->state.miptree_nr = nv50->miptree_nr; diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h index 207fb039f70..d531e611327 100644 --- a/src/gallium/drivers/nv50/nv50_texture.h +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -38,18 +38,26 @@ #define NV50TIC_0_0_TYPEA_MASK 0x00038000 #define NV50TIC_0_0_TYPEA_UNORM 0x00010000 #define NV50TIC_0_0_TYPEA_SNORM 0x00008000 +#define NV50TIC_0_0_TYPEA_SINT 0x00018000 +#define NV50TIC_0_0_TYPEA_UINT 0x00020000 #define NV50TIC_0_0_TYPEA_FLOAT 0x00038000 #define NV50TIC_0_0_TYPEB_MASK 0x00007000 #define NV50TIC_0_0_TYPEB_UNORM 0x00002000 #define NV50TIC_0_0_TYPEB_SNORM 0x00001000 +#define NV50TIC_0_0_TYPEB_SINT 0x00003000 +#define NV50TIC_0_0_TYPEB_UINT 0x00004000 #define NV50TIC_0_0_TYPEB_FLOAT 
0x00007000 #define NV50TIC_0_0_TYPEG_MASK 0x00000e00 #define NV50TIC_0_0_TYPEG_UNORM 0x00000400 #define NV50TIC_0_0_TYPEG_SNORM 0x00000200 +#define NV50TIC_0_0_TYPEG_SINT 0x00000600 +#define NV50TIC_0_0_TYPEG_UINT 0x00000800 #define NV50TIC_0_0_TYPEG_FLOAT 0x00000e00 #define NV50TIC_0_0_TYPER_MASK 0x000001c0 #define NV50TIC_0_0_TYPER_UNORM 0x00000080 #define NV50TIC_0_0_TYPER_SNORM 0x00000040 +#define NV50TIC_0_0_TYPER_SINT 0x000000c0 +#define NV50TIC_0_0_TYPER_UINT 0x00000100 #define NV50TIC_0_0_TYPER_FLOAT 0x000001c0 #define NV50TIC_0_0_FMT_MASK 0x0000003f #define NV50TIC_0_0_FMT_32_32_32_32 0x00000001 @@ -57,6 +65,7 @@ #define NV50TIC_0_0_FMT_32_32 0x00000004 #define NV50TIC_0_0_FMT_8_8_8_8 0x00000008 #define NV50TIC_0_0_FMT_2_10_10_10 0x00000009 +#define NV50TIC_0_0_FMT_16_16 0x0000000c #define NV50TIC_0_0_FMT_32 0x0000000f #define NV50TIC_0_0_FMT_4_4_4_4 0x00000012 /* #define NV50TIC_0_0_FMT_1_5_5_5 0x00000013 */ @@ -65,12 +74,16 @@ #define NV50TIC_0_0_FMT_8_8 0x00000018 #define NV50TIC_0_0_FMT_16 0x0000001b #define NV50TIC_0_0_FMT_8 0x0000001d +#define NV50TIC_0_0_FMT_5_9_9_9 0x00000020 #define NV50TIC_0_0_FMT_10_11_11 0x00000021 #define NV50TIC_0_0_FMT_DXT1 0x00000024 #define NV50TIC_0_0_FMT_DXT3 0x00000025 #define NV50TIC_0_0_FMT_DXT5 0x00000026 #define NV50TIC_0_0_FMT_RGTC1 0x00000027 #define NV50TIC_0_0_FMT_RGTC2 0x00000028 +#define NV50TIC_0_0_FMT_24_8 0x00000029 +#define NV50TIC_0_0_FMT_32_DEPTH 0x0000002f +#define NV50TIC_0_0_FMT_32_8 0x00000030 #define NV50TIC_0_1_OFFSET_LOW_MASK 0xffffffff #define NV50TIC_0_1_OFFSET_LOW_SHIFT 0 @@ -133,6 +146,8 @@ #define NV50TSC_1_1_MIPF_NEAREST 0x00000080 #define NV50TSC_1_1_MIPF_LINEAR 0x000000c0 #define NV50TSC_1_1_LOD_BIAS_MASK 0x01fff000 +#define NV50TSC_1_1_UNKN_ANISO_15 0x10000000 +#define NV50TSC_1_1_UNKN_ANISO_35 0x18000000 #define NV50TSC_1_2_MIN_LOD_MASK 0x00000f00 #define NV50TSC_1_2_MAX_LOD_MASK 0x00f00000 diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c 
index e9c3562194b..ea61357aaa6 100644 --- a/src/gallium/drivers/nv50/nv50_transfer.c +++ b/src/gallium/drivers/nv50/nv50_transfer.c @@ -12,6 +12,7 @@ struct nv50_transfer { int level_pitch; int level_width; int level_height; + int level_depth; int level_x; int level_y; }; @@ -20,10 +21,10 @@ static void nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, struct nouveau_bo *src_bo, unsigned src_offset, int src_pitch, unsigned src_tile_mode, - int sx, int sy, int sw, int sh, + int sx, int sy, int sw, int sh, int sd, struct nouveau_bo *dst_bo, unsigned dst_offset, int dst_pitch, unsigned dst_tile_mode, - int dx, int dy, int dw, int dh, + int dx, int dy, int dw, int dh, int dd, int cpp, int width, int height, unsigned src_reloc, unsigned dst_reloc) { @@ -51,7 +52,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, OUT_RING (chan, src_tile_mode << 4); OUT_RING (chan, sw * cpp); OUT_RING (chan, sh); - OUT_RING (chan, 1); + OUT_RING (chan, sd); OUT_RING (chan, 0); } @@ -70,7 +71,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, OUT_RING (chan, dst_tile_mode << 4); OUT_RING (chan, dw * cpp); OUT_RING (chan, dh); - OUT_RING (chan, 1); + OUT_RING (chan, dd); OUT_RING (chan, 0); } @@ -89,14 +90,14 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, if (src_bo->tile_flags) { BEGIN_RING(chan, m2mf, NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_IN, 1); - OUT_RING (chan, (sy << 16) | sx); + OUT_RING (chan, (sy << 16) | (sx * cpp)); } else { src_offset += (line_count * src_pitch); } if (dst_bo->tile_flags) { BEGIN_RING(chan, m2mf, NV50_MEMORY_TO_MEMORY_FORMAT_TILING_POSITION_OUT, 1); - OUT_RING (chan, (dy << 16) | dx); + OUT_RING (chan, (dy << 16) | (dx * cpp)); } else { dst_offset += (line_count * dst_pitch); } @@ -114,6 +115,20 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen, } } +static INLINE unsigned +get_zslice_offset(unsigned tile_mode, unsigned z, unsigned pitch, unsigned ny) +{ + unsigned tile_h = get_tile_height(tile_mode); + unsigned tile_d = 
get_tile_depth(tile_mode); + + /* pitch_2d == to next slice within this volume-tile */ + /* pitch_3d == to next slice in next 2D array of blocks */ + unsigned pitch_2d = tile_h * 64; + unsigned pitch_3d = tile_d * align(ny, tile_h) * pitch; + + return (z % tile_d) * pitch_2d + (z / tile_d) * pitch_3d; +} + static struct pipe_transfer * nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt, unsigned face, unsigned level, unsigned zslice, @@ -124,14 +139,11 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt, struct nv50_miptree *mt = nv50_miptree(pt); struct nv50_miptree_level *lvl = &mt->level[level]; struct nv50_transfer *tx; - unsigned image = 0; + unsigned nx, ny, image = 0; int ret; if (pt->target == PIPE_TEXTURE_CUBE) image = face; - else - if (pt->target == PIPE_TEXTURE_3D) - image = zslice; tx = CALLOC_STRUCT(nv50_transfer); if (!tx) @@ -142,34 +154,52 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt, tx->base.width = w; tx->base.height = h; tx->base.block = pt->block; - tx->base.nblocksx = pt->nblocksx[level]; - tx->base.nblocksy = pt->nblocksy[level]; - tx->base.stride = (w * pt->block.size); + if (!pt->nblocksx[level]) { + tx->base.nblocksx = pf_get_nblocksx(&pt->block, + pt->width[level]); + tx->base.nblocksy = pf_get_nblocksy(&pt->block, + pt->height[level]); + } else { + tx->base.nblocksx = pt->nblocksx[level]; + tx->base.nblocksy = pt->nblocksy[level]; + } + tx->base.stride = tx->base.nblocksx * pt->block.size; tx->base.usage = usage; tx->level_pitch = lvl->pitch; tx->level_width = mt->base.base.width[level]; tx->level_height = mt->base.base.height[level]; + tx->level_depth = mt->base.base.depth[level]; tx->level_offset = lvl->image_offset[image]; tx->level_tiling = lvl->tile_mode; - tx->level_x = x; - tx->level_y = y; + tx->level_x = pf_get_nblocksx(&tx->base.block, x); + tx->level_y = pf_get_nblocksy(&tx->base.block, y); ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 0, - w 
* pt->block.size * h, &tx->bo); + tx->base.nblocksy * tx->base.stride, &tx->bo); if (ret) { FREE(tx); return NULL; } - if (usage != PIPE_TRANSFER_WRITE) { + if (pt->target == PIPE_TEXTURE_3D) + tx->level_offset += get_zslice_offset(lvl->tile_mode, zslice, + lvl->pitch, + tx->base.nblocksy); + + if (usage & PIPE_TRANSFER_READ) { + nx = pf_get_nblocksx(&tx->base.block, tx->base.width); + ny = pf_get_nblocksy(&tx->base.block, tx->base.height); + nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset, tx->level_pitch, tx->level_tiling, x, y, - tx->level_width, tx->level_height, - tx->bo, 0, tx->base.stride, - tx->bo->tile_mode, 0, 0, - tx->base.width, tx->base.height, - tx->base.block.size, w, h, + tx->base.nblocksx, tx->base.nblocksy, + tx->level_depth, + tx->bo, 0, + tx->base.stride, tx->bo->tile_mode, + 0, 0, + tx->base.nblocksx, tx->base.nblocksy, 1, + tx->base.block.size, nx, ny, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART, NOUVEAU_BO_GART); } @@ -183,17 +213,22 @@ nv50_transfer_del(struct pipe_transfer *ptx) struct nv50_transfer *tx = (struct nv50_transfer *)ptx; struct nv50_miptree *mt = nv50_miptree(ptx->texture); - if (ptx->usage != PIPE_TRANSFER_READ) { + unsigned nx = pf_get_nblocksx(&tx->base.block, tx->base.width); + unsigned ny = pf_get_nblocksy(&tx->base.block, tx->base.height); + + if (ptx->usage & PIPE_TRANSFER_WRITE) { struct pipe_screen *pscreen = ptx->texture->screen; - nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride, - tx->bo->tile_mode, 0, 0, - tx->base.width, tx->base.height, + + nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, + tx->base.stride, tx->bo->tile_mode, + 0, 0, + tx->base.nblocksx, tx->base.nblocksy, 1, mt->base.bo, tx->level_offset, tx->level_pitch, tx->level_tiling, tx->level_x, tx->level_y, - tx->level_width, tx->level_height, - tx->base.block.size, tx->base.width, - tx->base.height, + tx->base.nblocksx, tx->base.nblocksy, + tx->level_depth, + tx->base.block.size, nx, ny, NOUVEAU_BO_GART, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART); } 
@@ -237,3 +272,89 @@ nv50_transfer_init_screen_functions(struct pipe_screen *pscreen) pscreen->transfer_map = nv50_transfer_map; pscreen->transfer_unmap = nv50_transfer_unmap; } + +void +nv50_upload_sifc(struct nv50_context *nv50, + struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc, + unsigned dst_format, int dst_w, int dst_h, int dst_pitch, + void *src, unsigned src_format, int src_pitch, + int x, int y, int w, int h, int cpp) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *eng2d = nv50->screen->eng2d; + struct nouveau_grobj *tesla = nv50->screen->tesla; + unsigned line_dwords = (w * cpp + 3) / 4; + + reloc |= NOUVEAU_BO_WR; + + WAIT_RING (chan, 32); + + if (bo->tile_flags) { + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5); + OUT_RING (chan, dst_format); + OUT_RING (chan, 0); + OUT_RING (chan, bo->tile_mode << 4); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + } else { + BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2); + OUT_RING (chan, dst_format); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1); + OUT_RING (chan, dst_pitch); + } + + BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 4); + OUT_RING (chan, dst_w); + OUT_RING (chan, dst_h); + OUT_RELOCh(chan, bo, dst_offset, reloc); + OUT_RELOCl(chan, bo, dst_offset, reloc); + + /* NV50_2D_OPERATION_SRCCOPY assumed already set */ + + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_UNK0800, 2); + OUT_RING (chan, 0); + OUT_RING (chan, src_format); + BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10); + OUT_RING (chan, w); + OUT_RING (chan, h); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, x); + OUT_RING (chan, 0); + OUT_RING (chan, y); + + while (h--) { + const uint32_t *p = src; + unsigned count = line_dwords; + + while (count) { + unsigned nr = MIN2(count, 1792); + + if (chan->pushbuf->remaining <= nr) { + FIRE_RING (chan); + + BEGIN_RING(chan, eng2d, + NV50_2D_DST_ADDRESS_HIGH, 2); + 
OUT_RELOCh(chan, bo, dst_offset, reloc); + OUT_RELOCl(chan, bo, dst_offset, reloc); + } + assert(chan->pushbuf->remaining > nr); + + BEGIN_RING(chan, eng2d, + NV50_2D_SIFC_DATA | (2 << 29), nr); + OUT_RINGp (chan, p, nr); + + p += nr; + count -= nr; + } + + src += src_pitch; + } + + BEGIN_RING(chan, tesla, 0x1440, 1); + OUT_RING (chan, 0); +} diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c index eeed148c7b9..db54380241f 100644 --- a/src/gallium/drivers/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -26,6 +26,18 @@ #include "nv50_context.h" +static boolean +nv50_push_elements_u08(struct nv50_context *, uint8_t *, unsigned); + +static boolean +nv50_push_elements_u16(struct nv50_context *, uint16_t *, unsigned); + +static boolean +nv50_push_elements_u32(struct nv50_context *, uint32_t *, unsigned); + +static boolean +nv50_push_arrays(struct nv50_context *, unsigned, unsigned); + static INLINE unsigned nv50_prim(unsigned mode) { @@ -132,6 +144,7 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, struct nv50_context *nv50 = nv50_context(pipe); struct nouveau_channel *chan = nv50->screen->tesla->channel; struct nouveau_grobj *tesla = nv50->screen->tesla; + boolean ret; nv50_state_validate(nv50); @@ -139,24 +152,25 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, OUT_RING (chan, 0); BEGIN_RING(chan, tesla, 0x142c, 1); OUT_RING (chan, 0); - BEGIN_RING(chan, tesla, 0x1440, 1); - OUT_RING (chan, 0); - BEGIN_RING(chan, tesla, 0x1334, 1); - OUT_RING (chan, 0); BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1); OUT_RING (chan, nv50_prim(mode)); - BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); - OUT_RING (chan, start); - OUT_RING (chan, count); + + if (nv50->vbo_fifo) + ret = nv50_push_arrays(nv50, start, count); + else { + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); + OUT_RING (chan, start); + OUT_RING (chan, count); + ret = TRUE; + } 
BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); OUT_RING (chan, 0); - pipe->flush(pipe, 0, NULL); - return TRUE; + return ret; } -static INLINE void +static INLINE boolean nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, unsigned start, unsigned count) { @@ -165,6 +179,9 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, map += start; + if (nv50->vbo_fifo) + return nv50_push_elements_u08(nv50, map, count); + if (count & 1) { BEGIN_RING(chan, tesla, 0x15e8, 1); OUT_RING (chan, map[0]); @@ -183,9 +200,10 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, count -= nr; map += nr; } + return TRUE; } -static INLINE void +static INLINE boolean nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, unsigned start, unsigned count) { @@ -194,6 +212,9 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, map += start; + if (nv50->vbo_fifo) + return nv50_push_elements_u16(nv50, map, count); + if (count & 1) { BEGIN_RING(chan, tesla, 0x15e8, 1); OUT_RING (chan, map[0]); @@ -212,9 +233,10 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, count -= nr; map += nr; } + return TRUE; } -static INLINE void +static INLINE boolean nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map, unsigned start, unsigned count) { @@ -223,6 +245,9 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map, map += start; + if (nv50->vbo_fifo) + return nv50_push_elements_u32(nv50, map, count); + while (count) { unsigned nr = count > 2047 ? 
2047 : count; @@ -232,6 +257,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map, count -= nr; map += nr; } + return TRUE; } boolean @@ -244,6 +270,7 @@ nv50_draw_elements(struct pipe_context *pipe, struct nouveau_grobj *tesla = nv50->screen->tesla; struct pipe_screen *pscreen = pipe->screen; void *map; + boolean ret; map = pipe_buffer_map(pscreen, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ); @@ -258,23 +285,25 @@ nv50_draw_elements(struct pipe_context *pipe, OUT_RING (chan, nv50_prim(mode)); switch (indexSize) { case 1: - nv50_draw_elements_inline_u08(nv50, map, start, count); + ret = nv50_draw_elements_inline_u08(nv50, map, start, count); break; case 2: - nv50_draw_elements_inline_u16(nv50, map, start, count); + ret = nv50_draw_elements_inline_u16(nv50, map, start, count); break; case 4: - nv50_draw_elements_inline_u32(nv50, map, start, count); + ret = nv50_draw_elements_inline_u32(nv50, map, start, count); break; default: assert(0); + ret = FALSE; + break; } BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1); OUT_RING (chan, 0); pipe_buffer_unmap(pscreen, indexBuffer); - pipe->flush(pipe, 0, NULL); - return TRUE; + + return ret; } static INLINE boolean @@ -341,17 +370,24 @@ nv50_vbo_validate(struct nv50_context *nv50) { struct nouveau_grobj *tesla = nv50->screen->tesla; struct nouveau_stateobj *vtxbuf, *vtxfmt, *vtxattr; - unsigned i; + unsigned i, n_ve; /* don't validate if Gallium took away our buffers */ if (nv50->vtxbuf_nr == 0) return; + nv50->vbo_fifo = 0; + + for (i = 0; i < nv50->vtxbuf_nr; ++i) + if (nv50->vtxbuf[i].stride && + !(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX)) + nv50->vbo_fifo = 0xffff; + + n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr); vtxattr = NULL; - vtxbuf = so_new(nv50->vtxelt_nr * 7, nv50->vtxelt_nr * 4); - vtxfmt = so_new(nv50->vtxelt_nr + 1, 0); - so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), - nv50->vtxelt_nr); + vtxbuf = so_new(n_ve * 7, nv50->vtxelt_nr * 4); + vtxfmt = so_new(n_ve 
+ 1, 0); + so_method(vtxfmt, tesla, NV50TCL_VERTEX_ARRAY_ATTRIB(0), n_ve); for (i = 0; i < nv50->vtxelt_nr; i++) { struct pipe_vertex_element *ve = &nv50->vtxelt[i]; @@ -367,10 +403,19 @@ nv50_vbo_validate(struct nv50_context *nv50) so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 1); so_data (vtxbuf, 0); + + nv50->vbo_fifo &= ~(1 << i); continue; } so_data(vtxfmt, hw | i); + if (nv50->vbo_fifo) { + so_method(vtxbuf, tesla, + NV50TCL_VERTEX_ARRAY_FORMAT(i), 1); + so_data (vtxbuf, 0); + continue; + } + so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 3); so_data (vtxbuf, 0x20000000 | vb->stride); so_reloc (vtxbuf, bo, vb->buffer_offset + @@ -389,6 +434,13 @@ nv50_vbo_validate(struct nv50_context *nv50) NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); } + for (; i < n_ve; ++i) { + so_data (vtxfmt, 0x7e080010); + + so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_FORMAT(i), 1); + so_data (vtxbuf, 0); + } + nv50->state.vtxelt_nr = nv50->vtxelt_nr; so_ref (vtxfmt, &nv50->state.vtxfmt); so_ref (vtxbuf, &nv50->state.vtxbuf); @@ -398,3 +450,320 @@ nv50_vbo_validate(struct nv50_context *nv50) so_ref (NULL, &vtxattr); } +typedef void (*pfn_push)(struct nouveau_channel *, void *); + +struct nv50_vbo_emitctx +{ + pfn_push push[16]; + void *map[16]; + unsigned stride[16]; + unsigned nr_ve; + unsigned vtx_dwords; + unsigned vtx_max; +}; + +static INLINE void +emit_vtx_next(struct nouveau_channel *chan, struct nv50_vbo_emitctx *emit) +{ + unsigned i; + + for (i = 0; i < emit->nr_ve; ++i) { + emit->push[i](chan, emit->map[i]); + emit->map[i] += emit->stride[i]; + } +} + +static INLINE void +emit_vtx(struct nouveau_channel *chan, struct nv50_vbo_emitctx *emit, + uint32_t vi) +{ + unsigned i; + + for (i = 0; i < emit->nr_ve; ++i) + emit->push[i](chan, emit->map[i] + emit->stride[i] * vi); +} + +static INLINE boolean +nv50_map_vbufs(struct nv50_context *nv50) +{ + int i; + + for (i = 0; i < nv50->vtxbuf_nr; ++i) { + struct pipe_vertex_buffer *vb 
= &nv50->vtxbuf[i]; + unsigned size, delta; + + if (nouveau_bo(vb->buffer)->map) + continue; + + size = vb->stride * (vb->max_index + 1); + delta = vb->buffer_offset; + + if (!size) + size = vb->buffer->size - vb->buffer_offset; + + if (nouveau_bo_map_range(nouveau_bo(vb->buffer), + delta, size, NOUVEAU_BO_RD)) + break; + } + + if (i == nv50->vtxbuf_nr) + return TRUE; + for (; i >= 0; --i) + nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer)); + return FALSE; +} + +static INLINE void +nv50_unmap_vbufs(struct nv50_context *nv50) +{ + unsigned i; + + for (i = 0; i < nv50->vtxbuf_nr; ++i) + if (nouveau_bo(nv50->vtxbuf[i].buffer)->map) + nouveau_bo_unmap(nouveau_bo(nv50->vtxbuf[i].buffer)); +} + +static void +emit_b32_1(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); +} + +static void +emit_b32_2(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); + OUT_RING(chan, v[1]); +} + +static void +emit_b32_3(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); + OUT_RING(chan, v[1]); + OUT_RING(chan, v[2]); +} + +static void +emit_b32_4(struct nouveau_channel *chan, void *data) +{ + uint32_t *v = data; + + OUT_RING(chan, v[0]); + OUT_RING(chan, v[1]); + OUT_RING(chan, v[2]); + OUT_RING(chan, v[3]); +} + +static void +emit_b16_1(struct nouveau_channel *chan, void *data) +{ + uint16_t *v = data; + + OUT_RING(chan, v[0]); +} + +static void +emit_b16_3(struct nouveau_channel *chan, void *data) +{ + uint16_t *v = data; + + OUT_RING(chan, (v[1] << 16) | v[0]); + OUT_RING(chan, v[2]); +} + +static void +emit_b08_1(struct nouveau_channel *chan, void *data) +{ + uint8_t *v = data; + + OUT_RING(chan, v[0]); +} + +static void +emit_b08_3(struct nouveau_channel *chan, void *data) +{ + uint8_t *v = data; + + OUT_RING(chan, (v[2] << 16) | (v[1] << 8) | v[0]); +} + +static boolean +emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit, + unsigned 
start) +{ + unsigned i; + + if (nv50_map_vbufs(nv50) == FALSE) + return FALSE; + + emit->nr_ve = 0; + emit->vtx_dwords = 0; + + for (i = 0; i < nv50->vtxelt_nr; ++i) { + struct pipe_vertex_element *ve; + struct pipe_vertex_buffer *vb; + unsigned n, type, size; + + ve = &nv50->vtxelt[i]; + vb = &nv50->vtxbuf[ve->vertex_buffer_index]; + if (!(nv50->vbo_fifo & (1 << i))) + continue; + n = emit->nr_ve++; + + emit->stride[n] = vb->stride; + emit->map[n] = nouveau_bo(vb->buffer)->map + + (start * vb->stride + ve->src_offset); + + type = pf_type(ve->src_format); + size = pf_size_x(ve->src_format) << pf_exp2(ve->src_format); + + assert(ve->nr_components > 0 && ve->nr_components <= 4); + + /* It shouldn't be necessary to push the implicit 1s + * for case 3 and size 8 cases 1, 2, 3. + */ + switch (size) { + default: + NOUVEAU_ERR("unsupported vtxelt size: %u\n", size); + return FALSE; + case 32: + switch (ve->nr_components) { + case 1: emit->push[n] = emit_b32_1; break; + case 2: emit->push[n] = emit_b32_2; break; + case 3: emit->push[n] = emit_b32_3; break; + case 4: emit->push[n] = emit_b32_4; break; + } + emit->vtx_dwords += ve->nr_components; + break; + case 16: + switch (ve->nr_components) { + case 1: emit->push[n] = emit_b16_1; break; + case 2: emit->push[n] = emit_b32_1; break; + case 3: emit->push[n] = emit_b16_3; break; + case 4: emit->push[n] = emit_b32_2; break; + } + emit->vtx_dwords += (ve->nr_components + 1) >> 1; + break; + case 8: + switch (ve->nr_components) { + case 1: emit->push[n] = emit_b08_1; break; + case 2: emit->push[n] = emit_b16_1; break; + case 3: emit->push[n] = emit_b08_3; break; + case 4: emit->push[n] = emit_b32_1; break; + } + emit->vtx_dwords += 1; + break; + } + } + + emit->vtx_max = 512 / emit->vtx_dwords; + + return TRUE; +} + +static boolean +nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + 
struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, start) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx_next(chan, &emit); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} + +static boolean +nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, 0) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx(chan, &emit, *map++); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} + +static boolean +nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, 0) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx(chan, &emit, *map++); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} + +static boolean +nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count) +{ + struct nouveau_channel *chan = nv50->screen->base.channel; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_vbo_emitctx emit; + + if (emit_prepare(nv50, &emit, 0) == FALSE) + return FALSE; + + while (count) { + unsigned i, dw, nr = MIN2(count, emit.vtx_max); + dw = nr * 
emit.vtx_dwords; + + BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw); + for (i = 0; i < nr; ++i) + emit_vtx(chan, &emit, *map++); + + count -= nr; + } + nv50_unmap_vbufs(nv50); + + return TRUE; +} |