diff options
author | Zack Rusin <[email protected]> | 2013-06-27 20:40:10 -0400 |
---|---|---|
committer | Zack Rusin <[email protected]> | 2013-06-28 05:21:20 -0400 |
commit | 1c2e5c223da28cdffe156b6b430fcdf638909021 (patch) | |
tree | f86833cf8b5b43134231309cc75932da000de080 /src/gallium/auxiliary/translate | |
parent | df4ab7974a825bf686f9dfa3474f3648e9a3ca66 (diff) |
draw/translate: fix instancing
We were incorrectly computing the buffer offset when using
instancing. The buffer offset is always equal to:
start_instance * stride + (instance_num / instance_divisor) *
stride
We were completely ignoring the start instance, quite
often producing instances that were completely wrong, e.g. if
start instance = 5, instance divisor = 2, then on the first
iteration it should be:
5 * stride, not (5/2) * stride as we'd have currently, and if
start instance = 1, instance divisor = 3, then on the first
iteration it should be:
1 * stride, not 0 as we'd have.
This fixes it and adjusts all the code to the changes.
Signed-off-by: Zack Rusin <[email protected]>
Diffstat (limited to 'src/gallium/auxiliary/translate')
-rw-r--r-- | src/gallium/auxiliary/translate/translate.h | 4 | ||||
-rw-r--r-- | src/gallium/auxiliary/translate/translate_generic.c | 17 | ||||
-rw-r--r-- | src/gallium/auxiliary/translate/translate_sse.c | 32 |
3 files changed, 42 insertions, 11 deletions
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index 850ef39ef21..1132114de9d 100644 --- a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -74,24 +74,28 @@ struct translate; typedef void (PIPE_CDECL *run_elts_func)(struct translate *, const unsigned *elts, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer); typedef void (PIPE_CDECL *run_elts16_func)(struct translate *, const uint16_t *elts, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer); typedef void (PIPE_CDECL *run_elts8_func)(struct translate *, const uint8_t *elts, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer); typedef void (PIPE_CDECL *run_func)(struct translate *, unsigned start, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer); diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 894c1684813..96e35b0eb41 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -607,6 +607,7 @@ static emit_func get_emit_func( enum pipe_format format ) static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg, unsigned elt, + unsigned start_instance, unsigned instance_id, void *vert ) { @@ -623,7 +624,9 @@ static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic * int copy_size; if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; + index = start_instance; + index += (instance_id - start_instance) / + tg->attrib[attr].instance_divisor; /* XXX we need to clamp the index here too, but to a * per-array max value, not the draw->pt.max_index value * that's being given to us via translate->set_buffer(). 
@@ -674,6 +677,7 @@ static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic * static void PIPE_CDECL generic_run_elts( struct translate *translate, const unsigned *elts, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer ) { @@ -682,7 +686,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, unsigned i; for (i = 0; i < count; i++) { - generic_run_one(tg, *elts++, instance_id, vert); + generic_run_one(tg, *elts++, start_instance, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -690,6 +694,7 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, static void PIPE_CDECL generic_run_elts16( struct translate *translate, const uint16_t *elts, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer ) { @@ -698,7 +703,7 @@ static void PIPE_CDECL generic_run_elts16( struct translate *translate, unsigned i; for (i = 0; i < count; i++) { - generic_run_one(tg, *elts++, instance_id, vert); + generic_run_one(tg, *elts++, start_instance, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -706,6 +711,7 @@ static void PIPE_CDECL generic_run_elts16( struct translate *translate, static void PIPE_CDECL generic_run_elts8( struct translate *translate, const uint8_t *elts, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer ) { @@ -714,7 +720,7 @@ static void PIPE_CDECL generic_run_elts8( struct translate *translate, unsigned i; for (i = 0; i < count; i++) { - generic_run_one(tg, *elts++, instance_id, vert); + generic_run_one(tg, *elts++, start_instance, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -722,6 +728,7 @@ static void PIPE_CDECL generic_run_elts8( struct translate *translate, static void PIPE_CDECL generic_run( struct translate *translate, unsigned start, unsigned count, + unsigned start_instance, unsigned instance_id, void *output_buffer ) { @@ -730,7 +737,7 @@ 
static void PIPE_CDECL generic_run( struct translate *translate, unsigned i; for (i = 0; i < count; i++) { - generic_run_one(tg, start + i, instance_id, vert); + generic_run_one(tg, start + i, start_instance, instance_id, vert); vert += tg->translate.key.output_stride; } } diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index c2dd42db96e..a4f7b243c13 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -112,6 +112,7 @@ struct translate_sse { boolean use_instancing; unsigned instance_id; + unsigned start_instance; /* these are actually known values, but putting them in a struct * like this is helpful to keep them in sync across the file. @@ -1061,6 +1062,8 @@ static boolean init_inputs( struct translate_sse *p, unsigned i; struct x86_reg instance_id = x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); + struct x86_reg start_instance = x86_make_disp(p->machine_EDI, + get_offset(p, &p->start_instance)); for (i = 0; i < p->nr_buffer_variants; i++) { struct translate_buffer_variant *variant = &p->buffer_variant[i]; @@ -1082,7 +1085,8 @@ static boolean init_inputs( struct translate_sse *p, * base_ptr + stride * index, where index depends on instance divisor */ if (variant->instance_divisor) { - /* Our index is instance ID divided by instance divisor. + /* Start with instance = instance_id + * which is true if divisor is 1. */ x86_mov(p->func, tmp_EAX, instance_id); @@ -1090,13 +1094,22 @@ static boolean init_inputs( struct translate_sse *p, struct x86_reg tmp_EDX = p->tmp2_EDX; struct x86_reg tmp_ECX = p->src_ECX; + /* instance_num = instance_id - start_instance */ + x86_mov(p->func, tmp_EDX, start_instance); + x86_sub(p->func, tmp_EAX, tmp_EDX); + /* TODO: Add x86_shr() to rtasm and use it whenever * instance divisor is power of two. 
*/ - x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor); x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ + + /* instance = (instance_id - start_instance) / divisor + + * start_instance + */ + x86_mov(p->func, tmp_EDX, start_instance); + x86_add(p->func, tmp_EAX, tmp_EDX); } /* XXX we need to clamp the index here too, but to a @@ -1312,17 +1325,24 @@ static boolean build_vertex_emit( struct translate_sse *p, x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); if(x86_target(p->func) != X86_32) - x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); else - x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6)); /* Load instance ID. */ - if (p->use_instancing) { + if (p->use_instancing) { x86_mov(p->func, - p->tmp_EAX, + p->tmp2_EDX, x86_fn_arg(p->func, 4)); x86_mov(p->func, + x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)), + p->tmp2_EDX); + + x86_mov(p->func, + p->tmp_EAX, + x86_fn_arg(p->func, 5)); + x86_mov(p->func, x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); } |