diff options
author | Christian König <[email protected]> | 2010-10-12 23:05:25 +0200 |
---|---|---|
committer | Christian König <[email protected]> | 2010-10-12 23:07:29 +0200 |
commit | 695cc370a280a637f411f5ff3877b3fd1c05e424 (patch) | |
tree | 69ae2a8fbecfa553faba59274688ffe11ee1a612 /src/gallium/auxiliary | |
parent | f3e34ba6fba76870b1c91a27adb706d1b87aeec8 (diff) | |
parent | 48156b87bc9d3e09ec34372d69504a787332ea0b (diff) |
Merge branch 'master' of ssh://git.freedesktop.org/git/mesa/mesa into pipe-video
Conflicts:
configure.ac
src/gallium/drivers/nvfx/Makefile
src/gallium/include/pipe/p_defines.h
src/gallium/include/pipe/p_screen.h
src/gallium/include/state_tracker/dri1_api.h
src/gallium/include/state_tracker/drm_api.h
src/gallium/winsys/nouveau/drm/nouveau_drm_api.c
Diffstat (limited to 'src/gallium/auxiliary')
245 files changed, 21512 insertions, 9056 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 16bb50fe168..05096b12a86 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -4,10 +4,11 @@ include $(TOP)/configs/current LIBNAME = gallium C_SOURCES = \ - cso_cache/cso_context.c \ cso_cache/cso_cache.c \ + cso_cache/cso_context.c \ cso_cache/cso_hash.c \ draw/draw_context.c \ + draw/draw_fs.c \ draw/draw_gs.c \ draw/draw_pipe.c \ draw/draw_pipe_aaline.c \ @@ -26,32 +27,32 @@ C_SOURCES = \ draw/draw_pipe_wide_line.c \ draw/draw_pipe_wide_point.c \ draw/draw_pt.c \ - draw/draw_pt_elts.c \ draw/draw_pt_emit.c \ draw/draw_pt_fetch.c \ draw/draw_pt_fetch_emit.c \ draw/draw_pt_fetch_shade_emit.c \ draw/draw_pt_fetch_shade_pipeline.c \ draw/draw_pt_post_vs.c \ + draw/draw_pt_so_emit.c \ draw/draw_pt_util.c \ - draw/draw_pt_varray.c \ - draw/draw_pt_vcache.c \ + draw/draw_pt_vsplit.c \ draw/draw_vertex.c \ draw/draw_vs.c \ - draw/draw_vs_varient.c \ draw/draw_vs_aos.c \ draw/draw_vs_aos_io.c \ draw/draw_vs_aos_machine.c \ draw/draw_vs_exec.c \ draw/draw_vs_ppc.c \ draw/draw_vs_sse.c \ + draw/draw_vs_varient.c \ indices/u_indices_gen.c \ indices/u_unfilled_gen.c \ os/os_misc.c \ + os/os_stream.c \ os/os_stream_log.c \ + os/os_stream_null.c \ os/os_stream_stdc.c \ os/os_stream_str.c \ - os/os_stream_null.c \ os/os_time.c \ pipebuffer/pb_buffer_fenced.c \ pipebuffer/pb_buffer_malloc.c \ @@ -64,17 +65,16 @@ C_SOURCES = \ pipebuffer/pb_bufmgr_slab.c \ pipebuffer/pb_validate.c \ rbug/rbug_connection.c \ + rbug/rbug_context.c \ rbug/rbug_core.c \ + rbug/rbug_demarshal.c \ rbug/rbug_texture.c \ - rbug/rbug_context.c \ rbug/rbug_shader.c \ - rbug/rbug_demarshal.c \ rtasm/rtasm_cpu.c \ rtasm/rtasm_execmem.c \ - rtasm/rtasm_x86sse.c \ rtasm/rtasm_ppc.c \ rtasm/rtasm_ppc_spe.c \ - tgsi/tgsi_sanity.c \ + rtasm/rtasm_x86sse.c \ tgsi/tgsi_build.c \ tgsi/tgsi_dump.c \ tgsi/tgsi_exec.c \ @@ -82,25 +82,29 @@ C_SOURCES = \ tgsi/tgsi_iterate.c \ tgsi/tgsi_parse.c \ tgsi/tgsi_ppc.c \ + tgsi/tgsi_sanity.c \ tgsi/tgsi_scan.c \ tgsi/tgsi_sse2.c \ tgsi/tgsi_text.c \ tgsi/tgsi_transform.c \ tgsi/tgsi_ureg.c \ tgsi/tgsi_util.c \ - translate/translate_generic.c \ - translate/translate_sse.c \ translate/translate.c \ translate/translate_cache.c \ + translate/translate_generic.c \ + translate/translate_sse.c \ util/u_debug.c \ - util/u_debug_symbol.c \ + util/u_debug_describe.c \ + util/u_debug_refcnt.c \ util/u_debug_stack.c \ + util/u_debug_symbol.c \ util/u_dump_defines.c \ util/u_dump_state.c \ util/u_bitmask.c \ util/u_blit.c \ util/u_blitter.c \ util/u_cache.c \ + util/u_caps.c \ util/u_cpu_detect.c \ util/u_dl.c \ util/u_draw_quad.c \ @@ -112,21 +116,26 @@ C_SOURCES = \ util/u_format_tests.c \ util/u_format_yuv.c \ util/u_format_zs.c \ + util/u_framebuffer.c \ util/u_gen_mipmap.c \ util/u_half.c \ util/u_handle_table.c \ - util/u_hash_table.c \ util/u_hash.c \ + util/u_hash_table.c \ + util/u_index_modify.c \ util/u_keymap.c \ util/u_linear.c \ + util/u_linkage.c \ util/u_network.c \ util/u_math.c \ + util/u_mempool.c \ util/u_mm.c \ util/u_rect.c \ util/u_ringbuffer.c \ util/u_sampler.c \ util/u_simple_shaders.c \ util/u_snprintf.c \ + util/u_staging.c \ util/u_surface.c \ util/u_surfaces.c \ util/u_texture.c \ @@ -142,28 +151,38 @@ C_SOURCES = \ GALLIVM_SOURCES = \ gallivm/lp_bld_arit.c \ + gallivm/lp_bld_assert.c \ + gallivm/lp_bld_bitarit.c \ gallivm/lp_bld_const.c \ gallivm/lp_bld_conv.c \ gallivm/lp_bld_debug.c \ gallivm/lp_bld_flow.c \ gallivm/lp_bld_format_aos.c \ gallivm/lp_bld_format_soa.c \ + gallivm/lp_bld_format_yuv.c \ + gallivm/lp_bld_gather.c \ gallivm/lp_bld_init.c \ gallivm/lp_bld_intr.c \ gallivm/lp_bld_logic.c \ gallivm/lp_bld_pack.c \ gallivm/lp_bld_printf.c \ + gallivm/lp_bld_quad.c \ gallivm/lp_bld_sample.c \ + gallivm/lp_bld_sample_aos.c \ gallivm/lp_bld_sample_soa.c \ gallivm/lp_bld_struct.c \ gallivm/lp_bld_swizzle.c \ + gallivm/lp_bld_tgsi_aos.c \ gallivm/lp_bld_tgsi_soa.c \ gallivm/lp_bld_type.c \ draw/draw_llvm.c \ - draw/draw_pt_fetch_shade_pipeline_llvm.c \ - draw/draw_llvm_translate.c + draw/draw_llvm_sample.c \ + draw/draw_llvm_translate.c \ + draw/draw_vs_llvm.c \ + draw/draw_pt_fetch_shade_pipeline_llvm.c -GALLIVM_CPP_SOURCES = +GALLIVM_CPP_SOURCES = \ + gallivm/lp_bld_misc.cpp GENERATED_SOURCES = \ indices/u_indices_gen.c \ diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index 6cea83060a9..a18f7c0b2a3 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -32,28 +32,30 @@ env.CodeGenerate( env.CodeGenerate( target = 'util/u_format_table.c', - script = 'util/u_format_table.py', - source = ['util/u_format.csv'], - command = 'python $SCRIPT $SOURCE > $TARGET' + script = '#src/gallium/auxiliary/util/u_format_table.py', + source = ['#src/gallium/auxiliary/util/u_format.csv'], + command = python_cmd + ' $SCRIPT $SOURCE > $TARGET' ) env.CodeGenerate( target = 'util/u_half.c', script = 'util/u_half.py', source = [], - command = 'python $SCRIPT > $TARGET' + command = python_cmd + ' $SCRIPT > $TARGET' ) env.Depends('util/u_format_table.c', [ - 'util/u_format_parse.py', + '#src/gallium/auxiliary/util/u_format_parse.py', 'util/u_format_pack.py', ]) source = [ - 'cso_cache/cso_context.c', 'cso_cache/cso_cache.c', + 'cso_cache/cso_context.c', 'cso_cache/cso_hash.c', 'draw/draw_context.c', + 'draw/draw_fs.c', + 'draw/draw_gs.c', 'draw/draw_pipe.c', 'draw/draw_pipe_aaline.c', 'draw/draw_pipe_aapoint.c', @@ -71,16 +73,15 @@ source = [ 'draw/draw_pipe_wide_line.c', 'draw/draw_pipe_wide_point.c', 'draw/draw_pt.c', - 'draw/draw_pt_elts.c', 'draw/draw_pt_emit.c', 'draw/draw_pt_fetch.c', 'draw/draw_pt_fetch_emit.c', 'draw/draw_pt_fetch_shade_emit.c', 'draw/draw_pt_fetch_shade_pipeline.c', 'draw/draw_pt_post_vs.c', + 'draw/draw_pt_so_emit.c', 'draw/draw_pt_util.c', - 'draw/draw_pt_varray.c', - 'draw/draw_pt_vcache.c', + 'draw/draw_pt_vsplit.c', 'draw/draw_vertex.c', 'draw/draw_vs.c', 'draw/draw_vs_aos.c', @@ -90,16 +91,16 @@ source = [ 'draw/draw_vs_ppc.c', 'draw/draw_vs_sse.c', 'draw/draw_vs_varient.c', - 'draw/draw_gs.c', #'indices/u_indices.c', #'indices/u_unfilled_indices.c', 'indices/u_indices_gen.c', 'indices/u_unfilled_gen.c', 'os/os_misc.c', + 'os/os_stream.c', 'os/os_stream_log.c', + 'os/os_stream_null.c', 'os/os_stream_stdc.c', 'os/os_stream_str.c', - 'os/os_stream_null.c', 'os/os_time.c', 'pipebuffer/pb_buffer_fenced.c', 'pipebuffer/pb_buffer_malloc.c', @@ -111,42 +112,45 @@ source = [ 'pipebuffer/pb_bufmgr_pool.c', 'pipebuffer/pb_bufmgr_slab.c', 'pipebuffer/pb_validate.c', + 'rbug/rbug_connection.c', + 'rbug/rbug_context.c', 'rbug/rbug_core.c', + 'rbug/rbug_demarshal.c', 'rbug/rbug_shader.c', - 'rbug/rbug_context.c', 'rbug/rbug_texture.c', - 'rbug/rbug_demarshal.c', - 'rbug/rbug_connection.c', 'rtasm/rtasm_cpu.c', 'rtasm/rtasm_execmem.c', - 'rtasm/rtasm_x86sse.c', 'rtasm/rtasm_ppc.c', 'rtasm/rtasm_ppc_spe.c', + 'rtasm/rtasm_x86sse.c', 'tgsi/tgsi_build.c', 'tgsi/tgsi_dump.c', 'tgsi/tgsi_exec.c', 'tgsi/tgsi_info.c', 'tgsi/tgsi_iterate.c', 'tgsi/tgsi_parse.c', + 'tgsi/tgsi_ppc.c', 'tgsi/tgsi_sanity.c', 'tgsi/tgsi_scan.c', - 'tgsi/tgsi_ppc.c', 'tgsi/tgsi_sse2.c', 'tgsi/tgsi_text.c', 'tgsi/tgsi_transform.c', 'tgsi/tgsi_ureg.c', 'tgsi/tgsi_util.c', - 'translate/translate_generic.c', - 'translate/translate_sse.c', 'translate/translate.c', 'translate/translate_cache.c', + 'translate/translate_generic.c', + 'translate/translate_sse.c', 'util/u_bitmask.c', 'util/u_blit.c', 'util/u_blitter.c', 'util/u_cache.c', + 'util/u_caps.c', 'util/u_cpu_detect.c', 'util/u_debug.c', + 'util/u_debug_describe.c', 'util/u_debug_memory.c', + 'util/u_debug_refcnt.c', 'util/u_debug_stack.c', 'util/u_debug_symbol.c', 'util/u_dump_defines.c', @@ -161,14 +165,19 @@ source = [ 'util/u_format_tests.c', 'util/u_format_yuv.c', 'util/u_format_zs.c', + 'util/u_framebuffer.c', 'util/u_gen_mipmap.c', 'util/u_half.c', 'util/u_handle_table.c', 'util/u_hash.c', 'util/u_hash_table.c', + 'util/u_index_modify.c', 'util/u_keymap.c', + 'util/u_linear.c', + 'util/u_linkage.c', 'util/u_network.c', 'util/u_math.c', + 'util/u_mempool.c', 'util/u_mm.c', 'util/u_rect.c', 'util/u_resource.c', @@ -176,6 +185,7 @@ source = [ 'util/u_sampler.c', 'util/u_simple_shaders.c', 'util/u_snprintf.c', + 'util/u_staging.c', 'util/u_surface.c', 'util/u_surfaces.c', 'util/u_texture.c', @@ -192,26 +202,36 @@ source = [ if env['llvm']: source += [ 'gallivm/lp_bld_arit.c', + 'gallivm/lp_bld_assert.c', + 'gallivm/lp_bld_bitarit.c', 'gallivm/lp_bld_const.c', 'gallivm/lp_bld_conv.c', 'gallivm/lp_bld_debug.c', 'gallivm/lp_bld_flow.c', 'gallivm/lp_bld_format_aos.c', 'gallivm/lp_bld_format_soa.c', + 'gallivm/lp_bld_format_yuv.c', + 'gallivm/lp_bld_gather.c', + 'gallivm/lp_bld_init.c', 'gallivm/lp_bld_intr.c', 'gallivm/lp_bld_logic.c', - 'gallivm/lp_bld_init.c', + 'gallivm/lp_bld_misc.cpp', 'gallivm/lp_bld_pack.c', 'gallivm/lp_bld_printf.c', + 'gallivm/lp_bld_quad.c', 'gallivm/lp_bld_sample.c', + 'gallivm/lp_bld_sample_aos.c', 'gallivm/lp_bld_sample_soa.c', 'gallivm/lp_bld_struct.c', 'gallivm/lp_bld_swizzle.c', + 'gallivm/lp_bld_tgsi_aos.c', 'gallivm/lp_bld_tgsi_soa.c', 'gallivm/lp_bld_type.c', 'draw/draw_llvm.c', + 'draw/draw_llvm_sample.c', + 'draw/draw_llvm_translate.c', 'draw/draw_pt_fetch_shade_pipeline_llvm.c', - 'draw/draw_llvm_translate.c' + 'draw/draw_vs_llvm.c' ] gallium = env.ConvenienceLibrary( diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index 6d0b4207986..58b022d531d 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -36,6 +36,7 @@ */ #include "pipe/p_state.h" +#include "util/u_framebuffer.h" #include "util/u_inlines.h" #include "util/u_math.h" #include "util/u_memory.h" @@ -98,14 +99,11 @@ struct cso_context { struct pipe_framebuffer_state fb, fb_saved; struct pipe_viewport_state vp, vp_saved; struct pipe_blend_color blend_color; + unsigned sample_mask; struct pipe_stencil_ref stencil_ref, stencil_ref_saved; }; -static void -free_framebuffer_state(struct pipe_framebuffer_state *fb); - - static boolean delete_blend_state(struct cso_context *ctx, void *state) { struct cso_blend *cso = (struct cso_blend *)state; @@ -291,6 +289,9 @@ void cso_release_all( struct cso_context *ctx ) ctx->pipe->bind_fs_state( ctx->pipe, NULL ); ctx->pipe->bind_vs_state( ctx->pipe, NULL ); ctx->pipe->bind_vertex_elements_state( ctx->pipe, NULL ); + ctx->pipe->set_fragment_sampler_views(ctx->pipe, 0, NULL); + if (ctx->pipe->set_vertex_sampler_views) + ctx->pipe->set_vertex_sampler_views(ctx->pipe, 0, NULL); } for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { @@ -303,8 +304,8 @@ void cso_release_all( struct cso_context *ctx ) pipe_sampler_view_reference(&ctx->vertex_sampler_views_saved[i], NULL); } - free_framebuffer_state(&ctx->fb); - free_framebuffer_state(&ctx->fb_saved); + util_unreference_framebuffer_state(&ctx->fb); + util_unreference_framebuffer_state(&ctx->fb_saved); if (ctx->cache) { cso_cache_delete( ctx->cache ); @@ -313,10 +314,13 @@ void cso_release_all( struct cso_context *ctx ) } +/** + * Free the CSO context. NOTE: the state tracker should have previously called + * cso_release_all(). + */ void cso_destroy_context( struct cso_context *ctx ) { if (ctx) { - /*cso_release_all( ctx );*/ FREE( ctx ); } } @@ -893,42 +897,11 @@ void cso_restore_vertex_shader(struct cso_context *ctx) } -/** - * Copy framebuffer state from src to dst with refcounting of surfaces. - */ -static void -copy_framebuffer_state(struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src) -{ - uint i; - - dst->width = src->width; - dst->height = src->height; - dst->nr_cbufs = src->nr_cbufs; - for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { - pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); - } - pipe_surface_reference(&dst->zsbuf, src->zsbuf); -} - - -static void -free_framebuffer_state(struct pipe_framebuffer_state *fb) -{ - uint i; - - for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { - pipe_surface_reference(&fb->cbufs[i], NULL); - } - pipe_surface_reference(&fb->zsbuf, NULL); -} - - enum pipe_error cso_set_framebuffer(struct cso_context *ctx, const struct pipe_framebuffer_state *fb) { if (memcmp(&ctx->fb, fb, sizeof(*fb)) != 0) { - copy_framebuffer_state(&ctx->fb, fb); + util_copy_framebuffer_state(&ctx->fb, fb); ctx->pipe->set_framebuffer_state(ctx->pipe, fb); } return PIPE_OK; @@ -936,15 +909,15 @@ enum pipe_error cso_set_framebuffer(struct cso_context *ctx, void cso_save_framebuffer(struct cso_context *ctx) { - copy_framebuffer_state(&ctx->fb_saved, &ctx->fb); + util_copy_framebuffer_state(&ctx->fb_saved, &ctx->fb); } void cso_restore_framebuffer(struct cso_context *ctx) { if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) { - copy_framebuffer_state(&ctx->fb, &ctx->fb_saved); + util_copy_framebuffer_state(&ctx->fb, &ctx->fb_saved); ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb); - free_framebuffer_state(&ctx->fb_saved); + util_unreference_framebuffer_state(&ctx->fb_saved); } } @@ -984,6 +957,16 @@ enum pipe_error cso_set_blend_color(struct cso_context *ctx, return PIPE_OK; } +enum pipe_error cso_set_sample_mask(struct cso_context *ctx, + unsigned sample_mask) +{ + if (ctx->sample_mask != sample_mask) { + ctx->sample_mask = sample_mask; + ctx->pipe->set_sample_mask(ctx->pipe, sample_mask); + } + return PIPE_OK; +} + enum pipe_error cso_set_stencil_ref(struct cso_context *ctx, const struct pipe_stencil_ref *sr) { @@ -1049,6 +1032,7 @@ static INLINE void clip_state_cpy(struct pipe_clip_state *dst, const struct pipe_clip_state *src) { + dst->depth_clamp = src->depth_clamp; dst->nr = src->nr; if (src->nr) { memcpy(dst->ucp, src->ucp, src->nr * sizeof(src->ucp[0])); @@ -1059,6 +1043,9 @@ static INLINE int clip_state_cmp(const struct pipe_clip_state *a, const struct pipe_clip_state *b) { + if (a->depth_clamp != b->depth_clamp) { + return 1; + } if (a->nr != b->nr) { return 1; } diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h index d6bcb1fe8f7..f0b07f73765 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.h +++ b/src/gallium/auxiliary/cso_cache/cso_context.h @@ -159,6 +159,8 @@ void cso_restore_viewport(struct cso_context *cso); enum pipe_error cso_set_blend_color(struct cso_context *cso, const struct pipe_blend_color *bc); +enum pipe_error cso_set_sample_mask(struct cso_context *cso, + unsigned stencil_mask); enum pipe_error cso_set_stencil_ref(struct cso_context *cso, const struct pipe_stencil_ref *sr); diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h new file mode 100644 index 00000000000..958ed20dc84 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h @@ -0,0 +1,114 @@ +/************************************************************************** + * + * Copyright 2010, VMware, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, + struct draw_vertex_info *info ) +{ + struct vertex_header *out = info->verts; + const float *scale = pvs->draw->viewport.scale; + const float *trans = pvs->draw->viewport.translate; + /* const */ float (*plane)[4] = pvs->draw->plane; + const unsigned pos = draw_current_shader_position_output(pvs->draw); + const unsigned ef = pvs->draw->vs.edgeflag_output; + const unsigned nr = pvs->draw->nr_planes; + const unsigned flags = (FLAGS); + unsigned need_pipeline = 0; + unsigned j; + + for (j = 0; j < info->count; j++) { + float *position = out->data[pos]; + unsigned mask = 0x0; + + initialize_vertex_header(out); + + if (flags & (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_HALF_Z | DO_CLIP_USER)) { + out->clip[0] = position[0]; + out->clip[1] = position[1]; + out->clip[2] = position[2]; + out->clip[3] = position[3]; + + /* Do the hardwired planes first: + */ + if (flags & DO_CLIP_XY) { + if (-position[0] + position[3] < 0) mask |= (1<<0); + if ( position[0] + position[3] < 0) mask |= (1<<1); + if (-position[1] + position[3] < 0) mask |= (1<<2); + if ( position[1] + position[3] < 0) mask |= (1<<3); + } + + /* Clip Z planes according to full cube, half cube or none. + */ + if (flags & DO_CLIP_FULL_Z) { + if ( position[2] + position[3] < 0) mask |= (1<<4); + if (-position[2] + position[3] < 0) mask |= (1<<5); + } + else if (flags & DO_CLIP_HALF_Z) { + if ( position[2] < 0) mask |= (1<<4); + if (-position[2] + position[3] < 0) mask |= (1<<5); + } + + if (flags & DO_CLIP_USER) { + unsigned i; + for (i = 6; i < nr; i++) { + if (dot4(position, plane[i]) < 0) + mask |= (1<<i); + } + } + + out->clipmask = mask; + need_pipeline |= out->clipmask; + } + + if ((flags & DO_VIEWPORT) && mask == 0) + { + /* divide by w */ + float w = 1.0f / position[3]; + + /* Viewport mapping */ + position[0] = position[0] * w * scale[0] + trans[0]; + position[1] = position[1] * w * scale[1] + trans[1]; + position[2] = position[2] * w * scale[2] + trans[2]; + position[3] = w; + } + + if ((flags & DO_EDGEFLAG) && ef) { + const float *edgeflag = out->data[ef]; + out->edgeflag = !(edgeflag[0] != 1.0f); + need_pipeline |= !out->edgeflag; + } + + out = (struct vertex_header *)( (char *)out + info->stride ); + } + + return need_pipeline != 0; +} + + +#undef FLAGS +#undef TAG diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 02abddf1491..032fcbbc70a 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -34,12 +34,33 @@ #include "pipe/p_context.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_cpu_detect.h" #include "draw_context.h" #include "draw_vs.h" #include "draw_gs.h" #if HAVE_LLVM #include "gallivm/lp_bld_init.h" +#include "draw_llvm.h" + +static boolean +draw_get_option_use_llvm(void) +{ + static boolean first = TRUE; + static boolean value; + if (first) { + first = FALSE; + value = debug_get_bool_option("DRAW_USE_LLVM", TRUE); + +#ifdef PIPE_ARCH_X86 + util_cpu_detect(); + /* require SSE2 due to LLVM PR6960. */ + if (!util_cpu_caps.has_sse2) + value = FALSE; +#endif + } + return value; +} #endif struct draw_context *draw_create( struct pipe_context *pipe ) @@ -49,9 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe ) goto fail; #if HAVE_LLVM - lp_build_init(); - assert(lp_build_engine); - draw->engine = lp_build_engine; + if(draw_get_option_use_llvm()) + { + lp_build_init(); + assert(lp_build_engine); + draw->engine = lp_build_engine; + draw->llvm = draw_llvm_create(draw); + } #endif if (!draw_init(draw)) @@ -81,6 +106,8 @@ boolean draw_init(struct draw_context *draw) ASSIGN_4V( draw->plane[4], 0, 0, 1, 1 ); /* yes these are correct */ ASSIGN_4V( draw->plane[5], 0, 0, -1, 1 ); /* mesa's a bit wonky */ draw->nr_planes = 6; + draw->clip_xy = 1; + draw->clip_z = 1; draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ @@ -132,6 +159,10 @@ void draw_destroy( struct draw_context *draw ) draw_pt_destroy( draw ); draw_vs_destroy( draw ); draw_gs_destroy( draw ); +#ifdef HAVE_LLVM + if(draw->llvm) + draw_llvm_destroy( draw->llvm ); +#endif FREE( draw ); } @@ -157,6 +188,14 @@ void draw_set_mrd(struct draw_context *draw, double mrd) } +static void update_clip_flags( struct draw_context *draw ) +{ + draw->clip_xy = !draw->driver.bypass_clip_xy; + draw->clip_z = (!draw->driver.bypass_clip_z && + !draw->depth_clamp); + draw->clip_user = (draw->nr_planes > 6); +} + /** * Register new primitive rasterization/rendering state. * This causes the drawing pipeline to be rebuilt. @@ -171,18 +210,25 @@ void draw_set_rasterizer_state( struct draw_context *draw, draw->rasterizer = raster; draw->rast_handle = rast_handle; - draw->bypass_clipping = draw->driver.bypass_clipping; - } + } } - +/* With a little more work, llvmpipe will be able to turn this off and + * do its own x/y clipping. + * + * Some hardware can turn off clipping altogether - in particular any + * hardware with a TNL unit can do its own clipping, even if it is + * relying on the draw module for some other reason. + */ void draw_set_driver_clipping( struct draw_context *draw, - boolean bypass_clipping ) + boolean bypass_clip_xy, + boolean bypass_clip_z ) { draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); - draw->driver.bypass_clipping = bypass_clipping; - draw->bypass_clipping = draw->driver.bypass_clipping; + draw->driver.bypass_clip_xy = bypass_clip_xy; + draw->driver.bypass_clip_z = bypass_clip_z; + update_clip_flags(draw); } @@ -211,6 +257,9 @@ void draw_set_clip_state( struct draw_context *draw, assert(clip->nr <= PIPE_MAX_CLIP_PLANES); memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0])); draw->nr_planes = 6 + clip->nr; + draw->depth_clamp = clip->depth_clamp; + + update_clip_flags(draw); } @@ -282,12 +331,19 @@ draw_set_mapped_constant_buffer(struct draw_context *draw, shader_type == PIPE_SHADER_GEOMETRY); debug_assert(slot < PIPE_MAX_CONSTANT_BUFFERS); - if (shader_type == PIPE_SHADER_VERTEX) { + switch (shader_type) { + case PIPE_SHADER_VERTEX: draw->pt.user.vs_constants[slot] = buffer; + draw->pt.user.vs_constants_size[slot] = size; draw_vs_set_constants(draw, slot, buffer, size); - } else if (shader_type == PIPE_SHADER_GEOMETRY) { + break; + case PIPE_SHADER_GEOMETRY: draw->pt.user.gs_constants[slot] = buffer; + draw->pt.user.gs_constants_size[slot] = size; draw_gs_set_constants(draw, slot, buffer, size); + break; + default: + assert(0 && "invalid shader type in draw_set_mapped_constant_buffer"); } } @@ -357,6 +413,42 @@ draw_set_force_passthrough( struct draw_context *draw, boolean enable ) } + +/** + * Allocate an extra vertex/geometry shader vertex attribute. + * This is used by some of the optional draw module stages such + * as wide_point which may need to allocate additional generic/texcoord + * attributes. + */ +int +draw_alloc_extra_vertex_attrib(struct draw_context *draw, + uint semantic_name, uint semantic_index) +{ + const int num_outputs = draw_current_shader_outputs(draw); + const int n = draw->extra_shader_outputs.num; + + assert(n < Elements(draw->extra_shader_outputs.semantic_name)); + + draw->extra_shader_outputs.semantic_name[n] = semantic_name; + draw->extra_shader_outputs.semantic_index[n] = semantic_index; + draw->extra_shader_outputs.slot[n] = num_outputs + n; + draw->extra_shader_outputs.num++; + + return draw->extra_shader_outputs.slot[n]; +} + + +/** + * Remove all extra vertex attributes that were allocated with + * draw_alloc_extra_vertex_attrib(). + */ +void +draw_remove_extra_vertex_attribs(struct draw_context *draw) +{ + draw->extra_shader_outputs.num = 0; +} + + /** * Ask the draw module for the location/slot of the given vertex attribute in * a post-transformed vertex. @@ -390,12 +482,12 @@ draw_find_shader_output(const struct draw_context *draw, return i; } - /* XXX there may be more than one extra vertex attrib. - * For example, simulated gl_FragCoord and gl_PointCoord. - */ - if (draw->extra_shader_outputs.semantic_name == semantic_name && - draw->extra_shader_outputs.semantic_index == semantic_index) { - return draw->extra_shader_outputs.slot; + /* Search the extra vertex attributes */ + for (i = 0; i < draw->extra_shader_outputs.num; i++) { + if (draw->extra_shader_outputs.semantic_name[i] == semantic_name && + draw->extra_shader_outputs.semantic_index[i] == semantic_index) { + return draw->extra_shader_outputs.slot[i]; + } } return 0; @@ -414,16 +506,18 @@ draw_find_shader_output(const struct draw_context *draw, uint draw_num_shader_outputs(const struct draw_context *draw) { - uint count = draw->vs.vertex_shader->info.num_outputs; + uint count; /* If a geometry shader is present, its outputs go to the * driver, else the vertex shader's outputs. */ if (draw->gs.geometry_shader) count = draw->gs.geometry_shader->info.num_outputs; + else + count = draw->vs.vertex_shader->info.num_outputs; + + count += draw->extra_shader_outputs.num; - if (draw->extra_shader_outputs.slot > 0) - count++; return count; } @@ -435,13 +529,18 @@ draw_num_shader_outputs(const struct draw_context *draw) */ void draw_texture_samplers(struct draw_context *draw, + uint shader, uint num_samplers, struct tgsi_sampler **samplers) { - draw->vs.num_samplers = num_samplers; - draw->vs.samplers = samplers; - draw->gs.num_samplers = num_samplers; - draw->gs.samplers = samplers; + if (shader == PIPE_SHADER_VERTEX) { + draw->vs.num_samplers = num_samplers; + draw->vs.samplers = samplers; + } else { + debug_assert(shader == PIPE_SHADER_GEOMETRY); + draw->gs.num_samplers = num_samplers; + draw->gs.samplers = samplers; + } } @@ -454,47 +553,28 @@ void draw_set_render( struct draw_context *draw, } - -/** - * Tell the drawing context about the index/element buffer to use - * (ala glDrawElements) - * If no element buffer is to be used (i.e. glDrawArrays) then this - * should be called with eltSize=0 and elements=NULL. - * - * \param draw the drawing context - * \param eltSize size of each element (1, 2 or 4 bytes) - * \param elements the element buffer ptr - */ void -draw_set_mapped_element_buffer_range( struct draw_context *draw, - unsigned eltSize, - int eltBias, - unsigned min_index, - unsigned max_index, - const void *elements ) +draw_set_index_buffer(struct draw_context *draw, + const struct pipe_index_buffer *ib) { - draw->pt.user.elts = elements; - draw->pt.user.eltSize = eltSize; - draw->pt.user.eltBias = eltBias; - draw->pt.user.min_index = min_index; - draw->pt.user.max_index = max_index; + if (ib) + memcpy(&draw->pt.index_buffer, ib, sizeof(draw->pt.index_buffer)); + else + memset(&draw->pt.index_buffer, 0, sizeof(draw->pt.index_buffer)); } +/** + * Tell drawing context where to find mapped index/element buffer. + */ void -draw_set_mapped_element_buffer( struct draw_context *draw, - unsigned eltSize, - int eltBias, - const void *elements ) +draw_set_mapped_index_buffer(struct draw_context *draw, + const void *elements) { - draw->pt.user.elts = elements; - draw->pt.user.eltSize = eltSize; - draw->pt.user.eltBias = eltBias; - draw->pt.user.min_index = 0; - draw->pt.user.max_index = 0xffffffff; + draw->pt.user.elts = elements; } - + /* Revamp me please: */ void draw_do_flush( struct draw_context *draw, unsigned flags ) @@ -566,7 +646,7 @@ draw_get_rasterizer_no_cull( struct draw_context *draw, memset(&rast, 0, sizeof(rast)); rast.scissor = scissor; rast.flatshade = flatshade; - rast.front_winding = PIPE_WINDING_CCW; + rast.front_ccw = 1; rast.gl_rasterization_rules = draw->rasterizer->gl_rasterization_rules; draw->rasterizer_no_cull[scissor][flatshade] = @@ -574,3 +654,82 @@ draw_get_rasterizer_no_cull( struct draw_context *draw, } return draw->rasterizer_no_cull[scissor][flatshade]; } + +void +draw_set_mapped_so_buffers(struct draw_context *draw, + void *buffers[PIPE_MAX_SO_BUFFERS], + unsigned num_buffers) +{ + int i; + + for (i = 0; i < num_buffers; ++i) { + draw->so.buffers[i] = buffers[i]; + } + draw->so.num_buffers = num_buffers; +} + +void +draw_set_so_state(struct draw_context *draw, + struct pipe_stream_output_state *state) +{ + memcpy(&draw->so.state, + state, + sizeof(struct pipe_stream_output_state)); +} + +void +draw_set_sampler_views(struct draw_context *draw, + struct pipe_sampler_view **views, + unsigned num) +{ + unsigned i; + + debug_assert(num <= PIPE_MAX_VERTEX_SAMPLERS); + + for (i = 0; i < num; ++i) + draw->sampler_views[i] = views[i]; + for (i = num; i < PIPE_MAX_VERTEX_SAMPLERS; ++i) + draw->sampler_views[i] = NULL; + + draw->num_sampler_views = num; +} + +void +draw_set_samplers(struct draw_context *draw, + struct pipe_sampler_state **samplers, + unsigned num) +{ + unsigned i; + + debug_assert(num <= PIPE_MAX_VERTEX_SAMPLERS); + + for (i = 0; i < num; ++i) + draw->samplers[i] = samplers[i]; + for (i = num; i < PIPE_MAX_VERTEX_SAMPLERS; ++i) + draw->samplers[i] = NULL; + + draw->num_samplers = num; + +#ifdef HAVE_LLVM + if (draw->llvm) + draw_llvm_set_sampler_state(draw); +#endif +} + +void +draw_set_mapped_texture(struct draw_context *draw, + unsigned sampler_idx, + uint32_t width, uint32_t height, uint32_t depth, + uint32_t last_level, + uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], + uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], + const void *data[DRAW_MAX_TEXTURE_LEVELS]) +{ +#ifdef HAVE_LLVM + if(draw->llvm) + draw_llvm_set_mapped_texture(draw, + sampler_idx, + width, height, depth, last_level, + row_stride, img_stride, data); +#endif +} diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index b905c2f2da6..1f27cbf488a 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -39,19 +39,24 @@ #include "pipe/p_state.h" +#include "tgsi/tgsi_exec.h" struct pipe_context; struct draw_context; struct draw_stage; struct draw_vertex_shader; struct draw_geometry_shader; +struct draw_fragment_shader; struct tgsi_sampler; +#define DRAW_MAX_TEXTURE_LEVELS 13 /* 4K x 4K for now */ struct draw_context *draw_create( struct pipe_context *pipe ); void draw_destroy( struct draw_context *draw ); +void draw_flush(struct draw_context *draw); + void draw_set_viewport_state( struct draw_context *draw, const struct pipe_viewport_state *viewport ); @@ -97,9 +102,27 @@ draw_num_shader_outputs(const struct draw_context *draw); void draw_texture_samplers(struct draw_context *draw, + uint shader_type, uint num_samplers, struct tgsi_sampler **samplers); +void +draw_set_sampler_views(struct draw_context *draw, + struct pipe_sampler_view **views, + unsigned num); +void +draw_set_samplers(struct draw_context *draw, + struct pipe_sampler_state **samplers, + unsigned num); + +void +draw_set_mapped_texture(struct draw_context *draw, + unsigned sampler_idx, + uint32_t width, uint32_t height, uint32_t depth, + uint32_t last_level, + uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], + uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], + const void *data[DRAW_MAX_TEXTURE_LEVELS]); /* @@ -116,6 +139,17 @@ void draw_delete_vertex_shader(struct draw_context *draw, /* + * Fragment shader functions + */ +struct draw_fragment_shader * +draw_create_fragment_shader(struct draw_context *draw, + const struct pipe_shader_state *shader); +void draw_bind_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dvs); +void draw_delete_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dvs); + +/* * Geometry shader functions */ struct draw_geometry_shader * @@ -139,18 +173,11 @@ void draw_set_vertex_elements(struct draw_context *draw, unsigned count, const struct pipe_vertex_element *elements); -void -draw_set_mapped_element_buffer_range( struct draw_context *draw, - unsigned eltSize, - int eltBias, - unsigned min_index, - unsigned max_index, - const void *elements ); - -void draw_set_mapped_element_buffer( struct draw_context *draw, - unsigned eltSize, - int eltBias, - const void *elements ); +void draw_set_index_buffer(struct draw_context *draw, + const struct pipe_index_buffer *ib); + +void draw_set_mapped_index_buffer(struct draw_context *draw, + const void *elements); void draw_set_mapped_vertex_buffer(struct draw_context *draw, unsigned attr, const void *buffer); @@ -162,11 +189,22 @@ draw_set_mapped_constant_buffer(struct draw_context *draw, const void *buffer, unsigned size); +void +draw_set_mapped_so_buffers(struct draw_context *draw, + void *buffers[PIPE_MAX_SO_BUFFERS], + unsigned num_buffers); +void +draw_set_so_state(struct draw_context *draw, + struct pipe_stream_output_state *state); + /*********************************************************************** - * draw_prim.c + * draw_pt.c */ +void draw_vbo(struct draw_context *draw, + const struct pipe_draw_info *info); + void draw_arrays(struct draw_context *draw, unsigned prim, unsigned start, unsigned count); @@ -178,8 +216,6 @@ draw_arrays_instanced(struct draw_context *draw, unsigned startInstance, unsigned instanceCount); -void draw_flush(struct draw_context *draw); - /******************************************************************************* * Driver backend interface @@ -189,7 +225,8 @@ void draw_set_render( struct draw_context *draw, struct vbuf_render *render ); void draw_set_driver_clipping( struct draw_context *draw, - boolean bypass_clipping ); + boolean bypass_clip_xy, + boolean bypass_clip_z ); void draw_set_force_passthrough( struct draw_context *draw, boolean enable ); @@ -201,4 +238,16 @@ boolean draw_need_pipeline(const struct draw_context *draw, const struct pipe_rasterizer_state *rasterizer, unsigned prim ); +static INLINE int +draw_get_shader_param(unsigned shader, enum pipe_cap param) +{ + switch(shader) { + case PIPE_SHADER_VERTEX: + case PIPE_SHADER_GEOMETRY: + return tgsi_exec_get_shader_param(param); + default: + return 0; + } +} + #endif /* DRAW_CONTEXT_H */ diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h new file mode 100644 index 00000000000..a142563af97 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h @@ -0,0 +1,431 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Keith Whitwell <[email protected]> + * Chia-I Wu <[email protected]> + */ + +/* these macros are optional */ +#ifndef LOCAL_VARS +#define LOCAL_VARS +#endif +#ifndef FUNC_ENTER +#define FUNC_ENTER do {} while (0) +#endif +#ifndef FUNC_EXIT +#define FUNC_EXIT do {} while (0) +#endif +#ifndef LINE_ADJ +#define LINE_ADJ(flags, a0, i0, i1, a1) LINE(flags, i0, i1) +#endif +#ifndef TRIANGLE_ADJ +#define TRIANGLE_ADJ(flags, i0, a0, i1, a1, i2, a2) TRIANGLE(flags, i0, i1, i2) +#endif + +static void +FUNC(FUNC_VARS) +{ + unsigned idx[6], i; + ushort flags; + LOCAL_VARS + + FUNC_ENTER; + + /* prim, prim_flags, count, and last_vertex_last should have been defined */ + if (0) { + debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n", + __FUNCTION__, prim, prim_flags, count, last_vertex_last); + } + + switch (prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < count; i++) { + idx[0] = GET_ELT(i); + POINT(idx[0]); + } + break; + + case PIPE_PRIM_LINES: + flags = DRAW_PIPE_RESET_STIPPLE; + for (i = 0; i + 1 < count; i += 2) { + idx[0] = GET_ELT(i); + idx[1] = GET_ELT(i + 1); + LINE(flags, idx[0], idx[1]); + } + break; + + case PIPE_PRIM_LINE_LOOP: + case PIPE_PRIM_LINE_STRIP: + if (count >= 2) { + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; + idx[1] = GET_ELT(0); + idx[2] = idx[1]; + + for (i = 1; i < count; i++, flags = 0) { + idx[0] = idx[1]; + idx[1] = GET_ELT(i); + LINE(flags, idx[0], idx[1]); + } + /* close the loop */ + if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags) + LINE(flags, idx[1], idx[2]); + } + break; + + case PIPE_PRIM_TRIANGLES: + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + for (i = 0; i + 2 < count; i += 3) { + idx[0] = GET_ELT(i); + idx[1] = GET_ELT(i + 1); + idx[2] = GET_ELT(i + 2); + TRIANGLE(flags, idx[0], idx[1], idx[2]); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + if (count >= 3) { + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + idx[1] = GET_ELT(0); + idx[2] = GET_ELT(1); + + if (last_vertex_last) { + for (i = 0; i + 2 < count; i++) { + idx[0] = idx[1]; + idx[1] = idx[2]; + idx[2] = GET_ELT(i + 2); + /* always emit idx[2] last */ + if (i & 1) + TRIANGLE(flags, idx[1], idx[0], idx[2]); + else + TRIANGLE(flags, idx[0], idx[1], idx[2]); + } + } + else { + for (i = 0; i + 2 < count; i++) { + idx[0] = idx[1]; + idx[1] = idx[2]; + idx[2] = GET_ELT(i + 2); + /* always emit idx[0] first */ + if (i & 1) + TRIANGLE(flags, idx[0], idx[2], idx[1]); + else + TRIANGLE(flags, idx[0], idx[1], idx[2]); + } + } + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + if (count >= 3) { + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + idx[0] = GET_ELT(0); + idx[2] = GET_ELT(1); + + /* idx[0] is neither the first nor the last vertex */ + if (last_vertex_last) { + for (i = 0; i + 2 < count; i++) { + idx[1] = idx[2]; + idx[2] = GET_ELT(i + 2); + /* always emit idx[2] last */ + TRIANGLE(flags, idx[0], idx[1], idx[2]); + } + } + else { + for (i = 0; i + 2 < count; i++) { + idx[1] = idx[2]; + idx[2] = GET_ELT(i + 2); + /* always emit idx[1] first */ + TRIANGLE(flags, idx[1], idx[2], idx[0]); + } + } + } + break; + + case PIPE_PRIM_QUADS: + if (last_vertex_last) { + for (i = 0; i + 3 < count; i += 4) { + idx[0] = GET_ELT(i); + idx[1] = GET_ELT(i + 1); + idx[2] = GET_ELT(i + 2); + idx[3] = GET_ELT(i + 3); + + flags = DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_2; + /* always emit idx[3] last */ + TRIANGLE(flags, idx[0], idx[1], idx[3]); + + flags = DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_1; + TRIANGLE(flags, idx[1], idx[2], idx[3]); + } + } + else { + for (i = 0; i + 3 < count; i += 4) { + idx[0] = GET_ELT(i); + idx[1] = GET_ELT(i + 1); + idx[2] = GET_ELT(i + 2); + idx[3] = GET_ELT(i + 3); + + flags = DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_1; + /* XXX should always emit idx[0] first */ + /* always emit idx[3] first */ + TRIANGLE(flags, idx[3], idx[0], idx[1]); + + flags = DRAW_PIPE_EDGE_FLAG_1 | + DRAW_PIPE_EDGE_FLAG_2; + TRIANGLE(flags, idx[3], idx[1], idx[2]); + } + } + break; + + case PIPE_PRIM_QUAD_STRIP: + if (count >= 4) { + idx[2] = GET_ELT(0); + idx[3] = GET_ELT(1); + + if (last_vertex_last) { + for (i = 0; i + 3 < count; i += 2) { + idx[0] = idx[2]; + idx[1] = idx[3]; + idx[2] = GET_ELT(i + 2); + idx[3] = GET_ELT(i + 3); + + /* always emit idx[3] last */ + flags = DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_2; + TRIANGLE(flags, idx[2], idx[0], idx[3]); + + flags = DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_1; + TRIANGLE(flags, idx[0], idx[1], idx[3]); + } + } + else { + for (i = 0; i + 3 < count; i += 2) { + idx[0] = idx[2]; + idx[1] = idx[3]; + idx[2] = GET_ELT(i + 2); + idx[3] = GET_ELT(i + 3); + + flags = DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_1; + /* XXX should always emit idx[0] first */ + /* always emit idx[3] first */ + TRIANGLE(flags, idx[3], idx[2], idx[0]); + + flags = DRAW_PIPE_EDGE_FLAG_1 | + DRAW_PIPE_EDGE_FLAG_2; + TRIANGLE(flags, idx[3], idx[0], idx[1]); + } + } + } + break; + + case PIPE_PRIM_POLYGON: + if (count >= 3) { + ushort edge_next, edge_finish; + + if (last_vertex_last) { + flags = (DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_0); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_2; + + edge_next = DRAW_PIPE_EDGE_FLAG_0; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_1; + } + else { + flags = (DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_1); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_0; + + edge_next = DRAW_PIPE_EDGE_FLAG_1; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2; + } + + idx[0] = GET_ELT(0); + idx[2] = GET_ELT(1); + + for (i = 0; i + 2 < count; i++, flags = edge_next) { + idx[1] = idx[2]; + idx[2] = GET_ELT(i + 2); + + if (i + 3 == count) + flags |= edge_finish; + + /* idx[0] is both the first and the last vertex */ + if (last_vertex_last) + TRIANGLE(flags, idx[1], idx[2], idx[0]); + else + TRIANGLE(flags, idx[0], idx[1], idx[2]); + } + } + break; + + case PIPE_PRIM_LINES_ADJACENCY: + flags = DRAW_PIPE_RESET_STIPPLE; + for (i = 0; i + 3 < count; i += 4) { + idx[0] = GET_ELT(i); + idx[1] = GET_ELT(i + 1); + idx[2] = GET_ELT(i + 2); + idx[3] = GET_ELT(i + 3); + LINE_ADJ(flags, idx[0], idx[1], idx[2], idx[3]); + } + break; + + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + if (count >= 4) { + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; + idx[1] = GET_ELT(0); + idx[2] = GET_ELT(1); + idx[3] = GET_ELT(2); + + for (i = 1; i + 2 < count; i++, flags = 0) { + idx[0] = idx[1]; + idx[1] = idx[2]; + idx[2] = idx[3]; + idx[3] = GET_ELT(i + 2); + LINE_ADJ(flags, idx[0], idx[1], idx[2], idx[3]); + } + } + break; + + case PIPE_PRIM_TRIANGLES_ADJACENCY: + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + for (i = 0; i + 5 < count; i += 6) { + idx[0] = GET_ELT(i); + idx[1] = GET_ELT(i + 1); + idx[2] = GET_ELT(i + 2); + idx[3] = GET_ELT(i + 3); + idx[4] = GET_ELT(i + 4); + idx[5] = GET_ELT(i + 5); + TRIANGLE_ADJ(flags, idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + if (count >= 6) { + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + idx[0] = GET_ELT(1); + idx[2] = GET_ELT(0); + idx[4] = GET_ELT(2); + idx[3] = GET_ELT(4); + + /* + * The vertices of the i-th triangle are stored in + * idx[0,2,4] = { 2*i, 2*i+2, 2*i+4 }; + * + * The adjacent vertices are stored in + * idx[1,3,5] = { 2*i-2, 2*i+6, 2*i+3 }. + * + * However, there are two exceptions: + * + * For the first triangle, idx[1] = 1; + * For the last triangle, idx[3] = 2*i+5. + */ + if (last_vertex_last) { + for (i = 0; i + 5 < count; i += 2) { + idx[1] = idx[0]; + + idx[0] = idx[2]; + idx[2] = idx[4]; + idx[4] = idx[3]; + + idx[3] = GET_ELT(i + ((i + 7 < count) ? 6 : 5)); + idx[5] = GET_ELT(i + 3); + + /* + * alternate the first two vertices (idx[0] and idx[2]) and the + * corresponding adjacent vertices (idx[3] and idx[5]) to have + * the correct orientation + */ + if (i & 2) { + TRIANGLE_ADJ(flags, + idx[2], idx[1], idx[0], idx[5], idx[4], idx[3]); + } + else { + TRIANGLE_ADJ(flags, + idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]); + } + } + } + else { + for (i = 0; i + 5 < count; i += 2) { + idx[1] = idx[0]; + + idx[0] = idx[2]; + idx[2] = idx[4]; + idx[4] = idx[3]; + + idx[3] = GET_ELT(i + ((i + 7 < count) ? 6 : 5)); + idx[5] = GET_ELT(i + 3); + + /* + * alternate the last two vertices (idx[2] and idx[4]) and the + * corresponding adjacent vertices (idx[1] and idx[5]) to have + * the correct orientation + */ + if (i & 2) { + TRIANGLE_ADJ(flags, + idx[0], idx[5], idx[4], idx[3], idx[2], idx[1]); + } + else { + TRIANGLE_ADJ(flags, + idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]); + } + } + } + } + break; + + default: + assert(0); + break; + } + + FUNC_EXIT; +} + +#undef LOCAL_VARS +#undef FUNC_ENTER +#undef FUNC_EXIT +#undef LINE_ADJ +#undef TRIANGLE_ADJ + +#undef FUNC +#undef FUNC_VARS +#undef GET_ELT +#undef POINT +#undef LINE +#undef TRIANGLE diff --git a/src/gallium/auxiliary/draw/draw_fs.c b/src/gallium/auxiliary/draw/draw_fs.c new file mode 100644 index 00000000000..1543bd86f17 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_fs.c @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_shader_tokens.h" + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_prim.h" + +#include "tgsi/tgsi_parse.h" + +#include "draw_fs.h" +#include "draw_private.h" +#include "draw_context.h" + + +struct draw_fragment_shader * +draw_create_fragment_shader(struct draw_context *draw, + const struct pipe_shader_state *shader) +{ + struct draw_fragment_shader *dfs; + + dfs = CALLOC_STRUCT(draw_fragment_shader); + if (dfs) { + dfs->base = *shader; + tgsi_scan_shader(shader->tokens, &dfs->info); + } + + return dfs; +} + + +void +draw_bind_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dfs) +{ + draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE); + + draw->fs.fragment_shader = dfs; +} + + +void +draw_delete_fragment_shader(struct draw_context *draw, + struct draw_fragment_shader *dfs) +{ + FREE(dfs); +} + diff --git a/src/gallium/auxiliary/draw/draw_fs.h b/src/gallium/auxiliary/draw/draw_fs.h new file mode 100644 index 00000000000..44995b8277f --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_fs.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef DRAW_FS_H +#define DRAW_FS_H + + +#include "tgsi/tgsi_scan.h" + + +struct draw_fragment_shader +{ + struct pipe_shader_state base; + struct tgsi_shader_info info; +}; + + +#endif /* DRAW_FS_H */ diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index 131deed43e4..50a03ac95a5 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2009 VMWare Inc. + * Copyright 2009 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -37,8 +37,8 @@ #include "util/u_math.h" #include "util/u_memory.h" +#include "util/u_prim.h" -#define MAX_PRIM_VERTICES 6 /* fixme: move it from here */ #define MAX_PRIMITIVES 64 @@ -75,6 +75,10 @@ draw_gs_set_constants(struct draw_context *draw, const void *constants, unsigned size) { + /* noop. added here for symmetry with the VS + * code and in case we'll ever want to allign + * the constants, e.g. when we'll change to a + * different interpreter */ } @@ -90,6 +94,7 @@ draw_create_geometry_shader(struct draw_context *draw, if (!gs) return NULL; + gs->draw = draw; gs->state = *state; gs->state.tokens = tgsi_dup_tokens(state->tokens); if (!gs->state.tokens) { @@ -112,7 +117,7 @@ draw_create_geometry_shader(struct draw_context *draw, TGSI_PROPERTY_GS_OUTPUT_PRIM) gs->output_primitive = gs->info.properties[i].data[0]; else if (gs->info.properties[i].name == - TGSI_PROPERTY_GS_MAX_VERTICES) + TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES) gs->max_output_vertices = gs->info.properties[i].data[0]; } @@ -154,128 +159,37 @@ void draw_delete_geometry_shader(struct draw_context *draw, FREE(dgs); } -static INLINE int num_vertices_for_prim(int prim) -{ - switch(prim) { - case PIPE_PRIM_POINTS: - return 1; - case PIPE_PRIM_LINES: - return 2; - case PIPE_PRIM_LINE_LOOP: - return 2; - case PIPE_PRIM_LINE_STRIP: - return 2; - case PIPE_PRIM_TRIANGLES: - return 3; - case PIPE_PRIM_TRIANGLE_STRIP: - return 3; - case PIPE_PRIM_TRIANGLE_FAN: - return 3; - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - return 4; - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - return 6; - default: - assert(!"Bad geometry shader input"); - return 0; - } -} - -static void draw_fetch_geometry_input(struct draw_geometry_shader *shader, - int start_primitive, - int num_primitives, - const float (*input_ptr)[4], - unsigned input_vertex_stride, - unsigned inputs_from_vs) -{ - struct tgsi_exec_machine *machine = shader->machine; - unsigned slot, vs_slot, k, j; - unsigned num_vertices = num_vertices_for_prim(shader->input_primitive); - int idx = 0; - - for (slot = 0, vs_slot = 0; slot < shader->info.num_inputs; slot++) { - /*debug_printf("Slot = %d (semantic = %d)\n", slot, - shader->info.input_semantic_name[slot]);*/ - if (shader->info.input_semantic_name[slot] == - TGSI_SEMANTIC_PRIMID) { - for (j = 0; j < num_primitives; ++j) { - machine->Inputs[idx].xyzw[0].f[j] = (float)start_primitive + j; - machine->Inputs[idx].xyzw[1].f[j] = (float)start_primitive + j; - machine->Inputs[idx].xyzw[2].f[j] = (float)start_primitive + j; - machine->Inputs[idx].xyzw[3].f[j] = (float)start_primitive + j; - } - ++idx; - } else { - for (j = 0; j < num_primitives; ++j) { - int vidx = idx; - const float (*prim_ptr)[4]; - /*debug_printf(" %d) Prim (num_verts = %d)\n", start_primitive + j, - num_vertices);*/ - prim_ptr = (const float (*)[4])( - (const char *)input_ptr + - (j * num_vertices * input_vertex_stride)); - - for (k = 0; k < num_vertices; ++k, ++vidx) { - const float (*input)[4]; - input = (const float (*)[4])( - (const char *)prim_ptr + (k * input_vertex_stride)); - vidx = k * TGSI_EXEC_MAX_INPUT_ATTRIBS + slot; - /*debug_printf("\t%d)(%d) Input vert:\n", vidx, k);*/ -#if 1 - assert(!util_is_inf_or_nan(input[vs_slot][0])); - assert(!util_is_inf_or_nan(input[vs_slot][1])); - assert(!util_is_inf_or_nan(input[vs_slot][2])); - assert(!util_is_inf_or_nan(input[vs_slot][3])); -#endif - machine->Inputs[vidx].xyzw[0].f[j] = input[vs_slot][0]; - machine->Inputs[vidx].xyzw[1].f[j] = input[vs_slot][1]; - machine->Inputs[vidx].xyzw[2].f[j] = input[vs_slot][2]; - machine->Inputs[vidx].xyzw[3].f[j] = input[vs_slot][3]; -#if 0 - debug_printf("\t\t%d %f %f %f %f\n", slot, - machine->Inputs[vidx].xyzw[0].f[j], - machine->Inputs[vidx].xyzw[1].f[j], - machine->Inputs[vidx].xyzw[2].f[j], - machine->Inputs[vidx].xyzw[3].f[j]); -#endif - } - } - ++vs_slot; - idx += num_vertices; - } - } -} - +/*#define DEBUG_OUTPUTS 1*/ static INLINE void draw_geometry_fetch_outputs(struct draw_geometry_shader *shader, int num_primitives, - float (*output)[4], - unsigned vertex_size) + float (**p_output)[4]) { struct tgsi_exec_machine *machine = shader->machine; unsigned prim_idx, j, slot; + float (*output)[4]; + + output = *p_output; /* Unswizzle all output results. */ - /* FIXME: handle all the primitives produced by the gs, not just - * the first one - unsigned prim_count = - mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0];*/ + for (prim_idx = 0; prim_idx < num_primitives; ++prim_idx) { - unsigned num_verts_per_prim = machine->Primitives[0]; + unsigned num_verts_per_prim = machine->Primitives[prim_idx]; + shader->primitive_lengths[prim_idx + shader->emitted_primitives] = + machine->Primitives[prim_idx]; + shader->emitted_vertices += num_verts_per_prim; for (j = 0; j < num_verts_per_prim; j++) { int idx = (prim_idx * num_verts_per_prim + j) * shader->info.num_outputs; #ifdef DEBUG_OUTPUTS - debug_printf("%d) Output vert:\n", idx); + debug_printf("%d) Output vert:\n", idx / shader->info.num_outputs); #endif for (slot = 0; slot < shader->info.num_outputs; slot++) { - output[slot][0] = machine->Outputs[idx + slot].xyzw[0].f[prim_idx]; - output[slot][1] = machine->Outputs[idx + slot].xyzw[1].f[prim_idx]; - output[slot][2] = machine->Outputs[idx + slot].xyzw[2].f[prim_idx]; - output[slot][3] = machine->Outputs[idx + slot].xyzw[3].f[prim_idx]; + output[slot][0] = machine->Outputs[idx + slot].xyzw[0].f[0]; + output[slot][1] = machine->Outputs[idx + slot].xyzw[1].f[0]; + output[slot][2] = machine->Outputs[idx + slot].xyzw[2].f[0]; + output[slot][3] = machine->Outputs[idx + slot].xyzw[3].f[0]; #ifdef DEBUG_OUTPUTS debug_printf("\t%d: %f %f %f %f\n", slot, output[slot][0], @@ -285,52 +199,276 @@ draw_geometry_fetch_outputs(struct draw_geometry_shader *shader, #endif debug_assert(!util_is_inf_or_nan(output[slot][0])); } - output = (float (*)[4])((char *)output + vertex_size); + output = (float (*)[4])((char *)output + shader->vertex_size); } } + *p_output = output; + shader->emitted_primitives += num_primitives; } -void draw_geometry_shader_run(struct draw_geometry_shader *shader, - const float (*input)[4], - float (*output)[4], - const void *constants[PIPE_MAX_CONSTANT_BUFFERS], - unsigned count, - unsigned input_stride, - unsigned vertex_size) +/*#define DEBUG_INPUTS 1*/ +static void draw_fetch_gs_input(struct draw_geometry_shader *shader, + unsigned *indices, + unsigned num_vertices, + unsigned prim_idx) { struct tgsi_exec_machine *machine = shader->machine; - unsigned int i; - unsigned num_vertices = num_vertices_for_prim(shader->input_primitive); - unsigned num_primitives = count/num_vertices; - unsigned inputs_from_vs = 0; + unsigned slot, vs_slot, i; + unsigned input_vertex_stride = shader->input_vertex_stride; + const float (*input_ptr)[4]; - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - machine->Consts[i] = constants[i]; - } + input_ptr = shader->input; - for (i = 0; i < shader->info.num_inputs; ++i) { - if (shader->info.input_semantic_name[i] != TGSI_SEMANTIC_PRIMID) - ++inputs_from_vs; + for (i = 0; i < num_vertices; ++i) { + const float (*input)[4]; +#if DEBUG_INPUTS + debug_printf("%d) vertex index = %d (prim idx = %d)\n", + i, indices[i], prim_idx); +#endif + input = (const float (*)[4])( + (const char *)input_ptr + (indices[i] * input_vertex_stride)); + for (slot = 0, vs_slot = 0; slot < shader->info.num_inputs; ++slot) { + unsigned idx = i * TGSI_EXEC_MAX_INPUT_ATTRIBS + slot; + if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_PRIMID) { + machine->Inputs[idx].xyzw[0].f[prim_idx] = + (float)shader->in_prim_idx; + machine->Inputs[idx].xyzw[1].f[prim_idx] = + (float)shader->in_prim_idx; + machine->Inputs[idx].xyzw[2].f[prim_idx] = + (float)shader->in_prim_idx; + machine->Inputs[idx].xyzw[3].f[prim_idx] = + (float)shader->in_prim_idx; + } else { +#if DEBUG_INPUTS + debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n", + slot, vs_slot, idx); +#endif +#if 1 + assert(!util_is_inf_or_nan(input[vs_slot][0])); + assert(!util_is_inf_or_nan(input[vs_slot][1])); + assert(!util_is_inf_or_nan(input[vs_slot][2])); + assert(!util_is_inf_or_nan(input[vs_slot][3])); +#endif + machine->Inputs[idx].xyzw[0].f[prim_idx] = input[vs_slot][0]; + machine->Inputs[idx].xyzw[1].f[prim_idx] = input[vs_slot][1]; + machine->Inputs[idx].xyzw[2].f[prim_idx] = input[vs_slot][2]; + machine->Inputs[idx].xyzw[3].f[prim_idx] = input[vs_slot][3]; +#if DEBUG_INPUTS + debug_printf("\t\t%f %f %f %f\n", + machine->Inputs[idx].xyzw[0].f[prim_idx], + machine->Inputs[idx].xyzw[1].f[prim_idx], + machine->Inputs[idx].xyzw[2].f[prim_idx], + machine->Inputs[idx].xyzw[3].f[prim_idx]); +#endif + ++vs_slot; + } + } } +} - for (i = 0; i < num_primitives; ++i) { - unsigned int max_primitives = 1; +static void gs_flush(struct draw_geometry_shader *shader, + unsigned input_primitives) +{ + unsigned out_prim_count; + struct tgsi_exec_machine *machine = shader->machine; + + debug_assert(input_primitives > 0 && + input_primitives < 4); - draw_fetch_geometry_input(shader, i, max_primitives, input, - input_stride, inputs_from_vs); + tgsi_set_exec_mask(machine, + 1, + input_primitives > 1, + input_primitives > 2, + input_primitives > 3); - tgsi_set_exec_mask(machine, - 1, - max_primitives > 1, - max_primitives > 2, - max_primitives > 3); + /* run interpreter */ + tgsi_exec_machine_run(machine); - /* run interpreter */ - tgsi_exec_machine_run(machine); + out_prim_count = + machine->Temps[TGSI_EXEC_TEMP_PRIMITIVE_I].xyzw[TGSI_EXEC_TEMP_PRIMITIVE_C].u[0]; + +#if 0 + debug_printf("PRIM emitted prims = %d (verts=%d), cur prim count = %d\n", + shader->emitted_primitives, shader->emitted_vertices, + out_prim_count); +#endif + draw_geometry_fetch_outputs(shader, out_prim_count, + &shader->tmp_output); +} + +static void gs_point(struct draw_geometry_shader *shader, + int idx) +{ + unsigned indices[1]; + + indices[0] = idx; + + draw_fetch_gs_input(shader, indices, 1, 0); + ++shader->in_prim_idx; + + gs_flush(shader, 1); +} + +static void gs_line(struct draw_geometry_shader *shader, + int i0, int i1) +{ + unsigned indices[2]; - draw_geometry_fetch_outputs(shader, max_primitives, - output, vertex_size); + indices[0] = i0; + indices[1] = i1; + + draw_fetch_gs_input(shader, indices, 2, 0); + ++shader->in_prim_idx; + + gs_flush(shader, 1); +} + +static void gs_line_adj(struct draw_geometry_shader *shader, + int i0, int i1, int i2, int i3) +{ + unsigned indices[4]; + + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + + draw_fetch_gs_input(shader, indices, 4, 0); + ++shader->in_prim_idx; + + gs_flush(shader, 1); +} + +static void gs_tri(struct draw_geometry_shader *shader, + int i0, int i1, int i2) +{ + unsigned indices[3]; + + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + + draw_fetch_gs_input(shader, indices, 3, 0); + ++shader->in_prim_idx; + + gs_flush(shader, 1); +} + +static void gs_tri_adj(struct draw_geometry_shader *shader, + int i0, int i1, int i2, + int i3, int i4, int i5) +{ + unsigned indices[6]; + + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + indices[3] = i3; + indices[4] = i4; + indices[5] = i5; + + draw_fetch_gs_input(shader, indices, 6, 0); + ++shader->in_prim_idx; + + gs_flush(shader, 1); +} + +#define FUNC gs_run +#define GET_ELT(idx) (idx) +#include "draw_gs_tmp.h" + + +#define FUNC gs_run_elts +#define LOCAL_VARS const ushort *elts = input_prims->elts; +#define GET_ELT(idx) (elts[idx]) +#include "draw_gs_tmp.h" + + +/** + * Execute geometry shader using TGSI interpreter. + */ +int draw_geometry_shader_run(struct draw_geometry_shader *shader, + const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS], + const struct draw_vertex_info *input_verts, + const struct draw_prim_info *input_prim, + struct draw_vertex_info *output_verts, + struct draw_prim_info *output_prims ) +{ + const float (*input)[4] = (const float (*)[4])input_verts->verts->data; + unsigned input_stride = input_verts->vertex_size; + unsigned vertex_size = input_verts->vertex_size; + struct tgsi_exec_machine *machine = shader->machine; + unsigned num_input_verts = input_prim->linear ? + input_verts->count : + input_prim->count; + unsigned num_in_primitives = + MAX2(u_gs_prims_for_vertices(input_prim->prim, num_input_verts), + u_gs_prims_for_vertices(shader->input_primitive, num_input_verts)); + unsigned max_out_prims = u_gs_prims_for_vertices(shader->output_primitive, + shader->max_output_vertices) + * num_in_primitives; + + output_verts->vertex_size = input_verts->vertex_size; + output_verts->stride = input_verts->vertex_size; + output_verts->verts = + (struct vertex_header *)MALLOC(input_verts->vertex_size * + num_in_primitives * + shader->max_output_vertices); + + +#if 0 + debug_printf("%s count = %d (in prims # = %d)\n", + __FUNCTION__, num_input_verts, num_in_primitives); + debug_printf("\tlinear = %d, prim_info->count = %d\n", + input_prim->linear, input_prim->count); + debug_printf("\tprimt pipe = %d, shader in = %d, shader out = %d, max out = %d\n", + input_prim->prim, shader->input_primitive, + shader->output_primitive, + shader->max_output_vertices); +#endif + + shader->emitted_vertices = 0; + shader->emitted_primitives = 0; + shader->vertex_size = vertex_size; + shader->tmp_output = (float (*)[4])output_verts->verts->data; + shader->in_prim_idx = 0; + shader->input_vertex_stride = input_stride; + shader->input = input; + if (shader->primitive_lengths) { + FREE(shader->primitive_lengths); } + shader->primitive_lengths = MALLOC(max_out_prims * sizeof(unsigned)); + + tgsi_exec_set_constant_buffers(machine, PIPE_MAX_CONSTANT_BUFFERS, + constants, constants_size); + + if (input_prim->linear) + gs_run(shader, input_prim, input_verts, + output_prims, output_verts); + else + gs_run_elts(shader, input_prim, input_verts, + output_prims, output_verts); + + /* Update prim_info: + */ + output_prims->linear = TRUE; + output_prims->elts = NULL; + output_prims->start = 0; + output_prims->count = shader->emitted_vertices; + output_prims->prim = shader->output_primitive; + output_prims->flags = 0x0; + output_prims->primitive_lengths = shader->primitive_lengths; + output_prims->primitive_count = shader->emitted_primitives; + output_verts->count = shader->emitted_vertices; + +#if 0 + debug_printf("GS finished, prims = %d, verts = %d\n", + output_prims->primitive_count, + output_verts->count); +#endif + + return shader->emitted_vertices; } void draw_geometry_shader_delete(struct draw_geometry_shader *shader) diff --git a/src/gallium/auxiliary/draw/draw_gs.h b/src/gallium/auxiliary/draw/draw_gs.h index d8eb2103433..67bc1aa73ff 100644 --- a/src/gallium/auxiliary/draw/draw_gs.h +++ b/src/gallium/auxiliary/draw/draw_gs.h @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2009 VMWare Inc. + * Copyright 2009 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -54,23 +54,37 @@ struct draw_geometry_shader { unsigned input_primitive; unsigned output_primitive; - /* Extracted from shader: - */ - const float (*immediates)[4]; + unsigned *primitive_lengths; + unsigned emitted_vertices; + unsigned emitted_primitives; + + float (*tmp_output)[4]; + unsigned vertex_size; + + unsigned in_prim_idx; + unsigned input_vertex_stride; + const float (*input)[4]; }; -void draw_geometry_shader_run(struct draw_geometry_shader *shader, - const float (*input)[4], - float (*output)[4], - const void *constants[PIPE_MAX_CONSTANT_BUFFERS], - unsigned count, - unsigned input_stride, - unsigned output_stride); +/* + * Returns the number of vertices emitted. + * The vertex shader can emit any number of vertices as long as it's + * smaller than the GS_MAX_OUTPUT_VERTICES shader property. + */ +int draw_geometry_shader_run(struct draw_geometry_shader *shader, + const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS], + const struct draw_vertex_info *input_verts, + const struct draw_prim_info *input_prim, + struct draw_vertex_info *output_verts, + struct draw_prim_info *output_prims ); void draw_geometry_shader_prepare(struct draw_geometry_shader *shader, struct draw_context *draw); void draw_geometry_shader_delete(struct draw_geometry_shader *shader); +int draw_gs_max_output_vertices(struct draw_geometry_shader *shader, + unsigned pipe_prim); #endif diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h new file mode 100644 index 00000000000..de7b02655a5 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h @@ -0,0 +1,32 @@ +#define FUNC_VARS struct draw_geometry_shader *gs, \ + const struct draw_prim_info *input_prims, \ + const struct draw_vertex_info *input_verts, \ + struct draw_prim_info *output_prims, \ + struct draw_vertex_info *output_verts + +#define FUNC_ENTER \ + /* declare more local vars */ \ + const unsigned prim = input_prims->prim; \ + const unsigned prim_flags = input_prims->flags; \ + const unsigned count = input_prims->count; \ + const boolean last_vertex_last = TRUE; \ + do { \ + debug_assert(input_prims->primitive_count == 1); \ + switch (prim) { \ + case PIPE_PRIM_QUADS: \ + case PIPE_PRIM_QUAD_STRIP: \ + case PIPE_PRIM_POLYGON: \ + debug_assert(!"unexpected primitive type in GS"); \ + return; \ + default: \ + break; \ + } \ + } while (0) \ + +#define POINT(i0) gs_point(gs,i0) +#define LINE(flags,i0,i1) gs_line(gs,i0,i1) +#define TRIANGLE(flags,i0,i1,i2) gs_tri(gs,i0,i1,i2) +#define LINE_ADJ(flags,i0,i1,i2,i3) gs_line_adj(gs,i0,i1,i2,i3) +#define TRIANGLE_ADJ(flags,i0,i1,i2,i3,i4,i5) gs_tri_adj(gs,i0,i1,i2,i3,i4,i5) + +#include "draw_decompose_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 27383221b9b..7fb86d7cb27 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -1,3 +1,30 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #include "draw_llvm.h" #include "draw_context.h" @@ -10,17 +37,21 @@ #include "gallivm/lp_bld_debug.h" #include "gallivm/lp_bld_tgsi.h" #include "gallivm/lp_bld_printf.h" +#include "gallivm/lp_bld_intr.h" +#include "gallivm/lp_bld_init.h" #include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" #include "util/u_cpu_detect.h" +#include "util/u_math.h" +#include "util/u_pointer.h" #include "util/u_string.h" #include <llvm-c/Transforms/Scalar.h> #define DEBUG_STORE 0 - /* generates the draw jit function */ static void draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *var); @@ -34,12 +65,24 @@ init_globals(struct draw_llvm *llvm) /* struct draw_jit_texture */ { - LLVMTypeRef elem_types[4]; + LLVMTypeRef elem_types[DRAW_JIT_TEXTURE_NUM_FIELDS]; elem_types[DRAW_JIT_TEXTURE_WIDTH] = LLVMInt32Type(); elem_types[DRAW_JIT_TEXTURE_HEIGHT] = LLVMInt32Type(); - elem_types[DRAW_JIT_TEXTURE_STRIDE] = LLVMInt32Type(); - elem_types[DRAW_JIT_TEXTURE_DATA] = LLVMPointerType(LLVMInt8Type(), 0); + elem_types[DRAW_JIT_TEXTURE_DEPTH] = LLVMInt32Type(); + elem_types[DRAW_JIT_TEXTURE_LAST_LEVEL] = LLVMInt32Type(); + elem_types[DRAW_JIT_TEXTURE_ROW_STRIDE] = + LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS); + elem_types[DRAW_JIT_TEXTURE_IMG_STRIDE] = + LLVMArrayType(LLVMInt32Type(), DRAW_MAX_TEXTURE_LEVELS); + elem_types[DRAW_JIT_TEXTURE_DATA] = + LLVMArrayType(LLVMPointerType(LLVMInt8Type(), 0), + DRAW_MAX_TEXTURE_LEVELS); + elem_types[DRAW_JIT_TEXTURE_MIN_LOD] = LLVMFloatType(); + elem_types[DRAW_JIT_TEXTURE_MAX_LOD] = LLVMFloatType(); + elem_types[DRAW_JIT_TEXTURE_LOD_BIAS] = LLVMFloatType(); + elem_types[DRAW_JIT_TEXTURE_BORDER_COLOR] = + LLVMArrayType(LLVMFloatType(), 4); texture_type = LLVMStructType(elem_types, Elements(elem_types), 0); @@ -49,12 +92,33 @@ init_globals(struct draw_llvm *llvm) LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, height, llvm->target, texture_type, DRAW_JIT_TEXTURE_HEIGHT); - LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, stride, + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, depth, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_DEPTH); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, last_level, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_LAST_LEVEL); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, row_stride, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_ROW_STRIDE); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, img_stride, llvm->target, texture_type, - DRAW_JIT_TEXTURE_STRIDE); + DRAW_JIT_TEXTURE_IMG_STRIDE); LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, data, llvm->target, texture_type, DRAW_JIT_TEXTURE_DATA); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, min_lod, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_MIN_LOD); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, max_lod, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_MAX_LOD); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, lod_bias, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_LOD_BIAS); + LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, border_color, + llvm->target, texture_type, + DRAW_JIT_TEXTURE_BORDER_COLOR); LP_CHECK_STRUCT_SIZE(struct draw_jit_texture, llvm->target, texture_type); @@ -69,7 +133,8 @@ init_globals(struct draw_llvm *llvm) elem_types[0] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */ elem_types[1] = LLVMPointerType(LLVMFloatType(), 0); /* vs_constants */ - elem_types[2] = LLVMArrayType(texture_type, PIPE_MAX_SAMPLERS); /* textures */ + elem_types[2] = LLVMArrayType(texture_type, + PIPE_MAX_VERTEX_SAMPLERS); /* textures */ context_type = LLVMStructType(elem_types, Elements(elem_types), 0); @@ -79,7 +144,7 @@ init_globals(struct draw_llvm *llvm) llvm->target, context_type, 1); LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, textures, llvm->target, context_type, - DRAW_JIT_CONTEXT_TEXTURES_INDEX); + DRAW_JIT_CTX_TEXTURES); LP_CHECK_STRUCT_SIZE(struct draw_jit_context, llvm->target, context_type); @@ -161,9 +226,11 @@ create_vertex_header(struct draw_llvm *llvm, int data_elems) struct draw_llvm * draw_llvm_create(struct draw_context *draw) { - struct draw_llvm *llvm = CALLOC_STRUCT( draw_llvm ); + struct draw_llvm *llvm; - util_cpu_detect(); + llvm = CALLOC_STRUCT( draw_llvm ); + if (!llvm) + return NULL; llvm->draw = draw; llvm->engine = draw->engine; @@ -179,27 +246,50 @@ draw_llvm_create(struct draw_context *draw) llvm->pass = LLVMCreateFunctionPassManager(llvm->provider); LLVMAddTargetData(llvm->target, llvm->pass); - /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, - * but there are more on SVN. */ - /* TODO: Add more passes */ - LLVMAddConstantPropagationPass(llvm->pass); - if(util_cpu_caps.has_sse4_1) { - /* FIXME: There is a bug in this pass, whereby the combination of fptosi - * and sitofp (necessary for trunc/floor/ceil/round implementation) - * somehow becomes invalid code. + + if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) { + /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, + * but there are more on SVN. */ + /* TODO: Add more passes */ + + LLVMAddCFGSimplificationPass(llvm->pass); + + if (HAVE_LLVM >= 0x207 && sizeof(void*) == 4) { + /* For LLVM >= 2.7 and 32-bit build, use this order of passes to + * avoid generating bad code. + * Test with piglit glsl-vs-sqrt-zero test. + */ + LLVMAddConstantPropagationPass(llvm->pass); + LLVMAddPromoteMemoryToRegisterPass(llvm->pass); + } + else { + LLVMAddPromoteMemoryToRegisterPass(llvm->pass); + LLVMAddConstantPropagationPass(llvm->pass); + } + + if(util_cpu_caps.has_sse4_1) { + /* FIXME: There is a bug in this pass, whereby the combination of fptosi + * and sitofp (necessary for trunc/floor/ceil/round implementation) + * somehow becomes invalid code. + */ + LLVMAddInstructionCombiningPass(llvm->pass); + } + LLVMAddGVNPass(llvm->pass); + } else { + /* We need at least this pass to prevent the backends to fail in + * unexpected ways. */ - LLVMAddInstructionCombiningPass(llvm->pass); + LLVMAddPromoteMemoryToRegisterPass(llvm->pass); } - LLVMAddPromoteMemoryToRegisterPass(llvm->pass); - LLVMAddGVNPass(llvm->pass); - LLVMAddCFGSimplificationPass(llvm->pass); init_globals(llvm); + if (gallivm_debug & GALLIVM_DEBUG_IR) { + LLVMDumpModule(llvm->module); + } -#if 0 - LLVMDumpModule(llvm->module); -#endif + llvm->nr_variants = 0; + make_empty_list(&llvm->vs_variants_list); return llvm; } @@ -207,21 +297,41 @@ draw_llvm_create(struct draw_context *draw) void draw_llvm_destroy(struct draw_llvm *llvm) { + LLVMDisposePassManager(llvm->pass); + FREE(llvm); } struct draw_llvm_variant * -draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs) +draw_llvm_create_variant(struct draw_llvm *llvm, + unsigned num_inputs, + const struct draw_llvm_variant_key *key) { - struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant)); + struct draw_llvm_variant *variant; + struct llvm_vertex_shader *shader = + llvm_vertex_shader(llvm->draw->vs.vertex_shader); - draw_llvm_make_variant_key(llvm, &variant->key); + variant = MALLOC(sizeof *variant + + shader->variant_key_size - + sizeof variant->key); + if (variant == NULL) + return NULL; + + variant->llvm = llvm; + + memcpy(&variant->key, key, shader->variant_key_size); llvm->vertex_header_ptr_type = create_vertex_header(llvm, num_inputs); draw_llvm_generate(llvm, variant); draw_llvm_generate_elts(llvm, variant); + variant->shader = shader; + variant->list_item_global.base = variant; + variant->list_item_local.base = variant; + /*variant->no = */shader->variants_created++; + variant->list_item_global.base = variant; + return variant; } @@ -230,11 +340,13 @@ generate_vs(struct draw_llvm *llvm, LLVMBuilderRef builder, LLVMValueRef (*outputs)[NUM_CHANNELS], const LLVMValueRef (*inputs)[NUM_CHANNELS], - LLVMValueRef context_ptr) + LLVMValueRef context_ptr, + struct lp_build_sampler_soa *draw_sampler) { const struct tgsi_token *tokens = llvm->draw->vs.vertex_shader->state.tokens; struct lp_type vs_type; LLVMValueRef consts_ptr = draw_jit_context_vs_constants(builder, context_ptr); + struct lp_build_sampler_soa *sampler = 0; memset(&vs_type, 0, sizeof vs_type); vs_type.floating = TRUE; /* floating point values */ @@ -246,7 +358,14 @@ generate_vs(struct draw_llvm *llvm, num_vs = 4; /* number of vertices per block */ #endif - /*tgsi_dump(tokens, 0);*/ + if (gallivm_debug & GALLIVM_DEBUG_IR) { + tgsi_dump(tokens, 0); + } + + if (llvm->draw->num_sampler_views && + llvm->draw->num_samplers) + sampler = draw_sampler; + lp_build_tgsi_soa(builder, tokens, vs_type, @@ -255,7 +374,7 @@ generate_vs(struct draw_llvm *llvm, NULL /*pos*/, inputs, outputs, - NULL/*sampler*/, + sampler, &llvm->draw->vs.vertex_shader->info); } @@ -283,7 +402,8 @@ generate_fetch(LLVMBuilderRef builder, LLVMValueRef *res, struct pipe_vertex_element *velem, LLVMValueRef vbuf, - LLVMValueRef index) + LLVMValueRef index, + LLVMValueRef instance_id) { LLVMValueRef indices = LLVMConstInt(LLVMInt64Type(), velem->vertex_buffer_index, 0); LLVMValueRef vbuffer_ptr = LLVMBuildGEP(builder, vbuffers_ptr, @@ -294,8 +414,15 @@ generate_fetch(LLVMBuilderRef builder, LLVMValueRef cond; LLVMValueRef stride; - cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, ""); + if (velem->instance_divisor) { + /* array index = instance_id / instance_divisor */ + index = LLVMBuildUDiv(builder, instance_id, + LLVMConstInt(LLVMInt32Type(), velem->instance_divisor, 0), + "instance_divisor"); + } + /* limit index to min(inex, vb_max_index) */ + cond = LLVMBuildICmp(builder, LLVMIntULE, index, vb_max_index, ""); index = LLVMBuildSelect(builder, cond, index, vb_max_index, ""); stride = LLVMBuildMul(builder, vb_stride, index, ""); @@ -563,20 +690,22 @@ convert_to_aos(LLVMBuilderRef builder, static void draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) { - LLVMTypeRef arg_types[7]; + LLVMTypeRef arg_types[8]; LLVMTypeRef func_type; LLVMValueRef context_ptr; LLVMBasicBlockRef block; LLVMBuilderRef builder; LLVMValueRef start, end, count, stride, step, io_itr; LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr; + LLVMValueRef instance_id; struct draw_context *draw = llvm->draw; unsigned i, j; struct lp_build_context bld; struct lp_build_loop_state lp_loop; - struct lp_type vs_type = lp_type_float_vec(32); const int max_vertices = 4; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; + void *code; + struct lp_build_sampler_soa *sampler = 0; arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ @@ -585,6 +714,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) arg_types[4] = LLVMInt32Type(); /* count */ arg_types[5] = LLVMInt32Type(); /* stride */ arg_types[6] = llvm->vb_ptr_type; /* pipe_vertex_buffer's */ + arg_types[7] = LLVMInt32Type(); /* instance_id */ func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); @@ -601,6 +731,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) count = LLVMGetParam(variant->function, 4); stride = LLVMGetParam(variant->function, 5); vb_ptr = LLVMGetParam(variant->function, 6); + instance_id = LLVMGetParam(variant->function, 7); lp_build_name(context_ptr, "context"); lp_build_name(io_ptr, "io"); @@ -609,6 +740,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) lp_build_name(count, "count"); lp_build_name(stride, "stride"); lp_build_name(vb_ptr, "vb"); + lp_build_name(instance_id, "instance_id"); /* * Function body @@ -618,12 +750,17 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) builder = LLVMCreateBuilder(); LLVMPositionBuilderAtEnd(builder, block); - lp_build_context_init(&bld, builder, vs_type); + lp_build_context_init(&bld, builder, lp_type_int(32)); end = lp_build_add(&bld, start, count); step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); + /* code generated texture sampling */ + sampler = draw_llvm_sampler_soa_create( + draw_llvm_variant_key_samplers(&variant->key), + context_ptr); + #if DEBUG_STORE lp_build_printf(builder, "start = %d, end = %d, step = %d\n", start, end, step); @@ -654,7 +791,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, ""); generate_fetch(builder, vbuffers_ptr, - &aos_attribs[j][i], velem, vb, true_index); + &aos_attribs[j][i], velem, vb, true_index, + instance_id); } } convert_to_soa(builder, aos_attribs, inputs, @@ -665,7 +803,8 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) builder, outputs, ptr_aos, - context_ptr); + context_ptr, + sampler); convert_to_aos(builder, io, outputs, draw->vs.vertex_shader->info.num_outputs, @@ -673,6 +812,13 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) } lp_build_loop_end_cond(builder, end, step, LLVMIntUGE, &lp_loop); + sampler->destroy(sampler); + +#ifdef PIPE_ARCH_X86 + /* Avoid corrupting the FPU stack on 32bit OSes. */ + lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); +#endif + LLVMBuildRetVoid(builder); LLVMDisposeBuilder(builder); @@ -682,41 +828,48 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) */ #ifdef DEBUG if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) { - LLVMDumpValue(variant->function); + lp_debug_dump_value(variant->function); assert(0); } #endif LLVMRunFunctionPassManager(llvm->pass, variant->function); - if (0) { - LLVMDumpValue(variant->function); + if (gallivm_debug & GALLIVM_DEBUG_IR) { + lp_debug_dump_value(variant->function); debug_printf("\n"); } - variant->jit_func = (draw_jit_vert_func)LLVMGetPointerToGlobal(llvm->draw->engine, variant->function); - if (0) - lp_disassemble(variant->jit_func); + code = LLVMGetPointerToGlobal(llvm->draw->engine, variant->function); + variant->jit_func = (draw_jit_vert_func)pointer_to_func(code); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) { + lp_disassemble(code); + } + lp_func_delete_body(variant->function); } static void draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *variant) { - LLVMTypeRef arg_types[7]; + LLVMTypeRef arg_types[8]; LLVMTypeRef func_type; LLVMValueRef context_ptr; LLVMBasicBlockRef block; LLVMBuilderRef builder; LLVMValueRef fetch_elts, fetch_count, stride, step, io_itr; LLVMValueRef io_ptr, vbuffers_ptr, vb_ptr; + LLVMValueRef instance_id; struct draw_context *draw = llvm->draw; unsigned i, j; struct lp_build_context bld; struct lp_build_loop_state lp_loop; - struct lp_type vs_type = lp_type_float_vec(32); const int max_vertices = 4; LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS]; + LLVMValueRef fetch_max; + void *code; + struct lp_build_sampler_soa *sampler = 0; arg_types[0] = llvm->context_ptr_type; /* context */ arg_types[1] = llvm->vertex_header_ptr_type; /* vertex_header */ @@ -725,14 +878,17 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian arg_types[4] = LLVMInt32Type(); /* fetch_count */ arg_types[5] = LLVMInt32Type(); /* stride */ arg_types[6] = llvm->vb_ptr_type; /* pipe_vertex_buffer's */ + arg_types[7] = LLVMInt32Type(); /* instance_id */ func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0); - variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", func_type); + variant->function_elts = LLVMAddFunction(llvm->module, "draw_llvm_shader_elts", + func_type); LLVMSetFunctionCallConv(variant->function_elts, LLVMCCallConv); for(i = 0; i < Elements(arg_types); ++i) if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind) - LLVMAddAttribute(LLVMGetParam(variant->function_elts, i), LLVMNoAliasAttribute); + LLVMAddAttribute(LLVMGetParam(variant->function_elts, i), + LLVMNoAliasAttribute); context_ptr = LLVMGetParam(variant->function_elts, 0); io_ptr = LLVMGetParam(variant->function_elts, 1); @@ -741,6 +897,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian fetch_count = LLVMGetParam(variant->function_elts, 4); stride = LLVMGetParam(variant->function_elts, 5); vb_ptr = LLVMGetParam(variant->function_elts, 6); + instance_id = LLVMGetParam(variant->function_elts, 7); lp_build_name(context_ptr, "context"); lp_build_name(io_ptr, "io"); @@ -749,6 +906,7 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian lp_build_name(fetch_count, "fetch_count"); lp_build_name(stride, "stride"); lp_build_name(vb_ptr, "vb"); + lp_build_name(instance_id, "instance_id"); /* * Function body @@ -758,10 +916,19 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian builder = LLVMCreateBuilder(); LLVMPositionBuilderAtEnd(builder, block); - lp_build_context_init(&bld, builder, vs_type); + lp_build_context_init(&bld, builder, lp_type_int(32)); step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); + /* code generated texture sampling */ + sampler = draw_llvm_sampler_soa_create( + draw_llvm_variant_key_samplers(&variant->key), + context_ptr); + + fetch_max = LLVMBuildSub(builder, fetch_count, + LLVMConstInt(LLVMInt32Type(), 1, 0), + "fetch_max"); + lp_build_loop_begin(builder, LLVMConstInt(LLVMInt32Type(), 0, 0), &lp_loop); { LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][NUM_CHANNELS]; @@ -780,8 +947,15 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian builder, lp_loop.counter, LLVMConstInt(LLVMInt32Type(), i, 0), ""); - LLVMValueRef fetch_ptr = LLVMBuildGEP(builder, fetch_elts, - &true_index, 1, ""); + LLVMValueRef fetch_ptr; + + /* make sure we're not out of bounds which can happen + * if fetch_count % 4 != 0, because on the last iteration + * a few of the 4 vertex fetches will be out of bounds */ + true_index = lp_build_min(&bld, true_index, fetch_max); + + fetch_ptr = LLVMBuildGEP(builder, fetch_elts, + &true_index, 1, ""); true_index = LLVMBuildLoad(builder, fetch_ptr, "fetch_elt"); for (j = 0; j < draw->pt.nr_vertex_elements; ++j) { struct pipe_vertex_element *velem = &draw->pt.vertex_element[j]; @@ -791,7 +965,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian LLVMValueRef vb = LLVMBuildGEP(builder, vb_ptr, &vb_index, 1, ""); generate_fetch(builder, vbuffers_ptr, - &aos_attribs[j][i], velem, vb, true_index); + &aos_attribs[j][i], velem, vb, true_index, + instance_id); } } convert_to_soa(builder, aos_attribs, inputs, @@ -802,7 +977,8 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian builder, outputs, ptr_aos, - context_ptr); + context_ptr, + sampler); convert_to_aos(builder, io, outputs, draw->vs.vertex_shader->info.num_outputs, @@ -810,6 +986,13 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian } lp_build_loop_end_cond(builder, fetch_count, step, LLVMIntUGE, &lp_loop); + sampler->destroy(sampler); + +#ifdef PIPE_ARCH_X86 + /* Avoid corrupting the FPU stack on 32bit OSes. */ + lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0); +#endif + LLVMBuildRetVoid(builder); LLVMDisposeBuilder(builder); @@ -819,37 +1002,136 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian */ #ifdef DEBUG if(LLVMVerifyFunction(variant->function_elts, LLVMPrintMessageAction)) { - LLVMDumpValue(variant->function_elts); + lp_debug_dump_value(variant->function_elts); assert(0); } #endif LLVMRunFunctionPassManager(llvm->pass, variant->function_elts); - if (0) { - LLVMDumpValue(variant->function_elts); + if (gallivm_debug & GALLIVM_DEBUG_IR) { + lp_debug_dump_value(variant->function_elts); debug_printf("\n"); } - variant->jit_func_elts = (draw_jit_vert_func_elts)LLVMGetPointerToGlobal( - llvm->draw->engine, variant->function_elts); - if (0) - lp_disassemble(variant->jit_func_elts); + code = LLVMGetPointerToGlobal(llvm->draw->engine, variant->function_elts); + variant->jit_func_elts = (draw_jit_vert_func_elts)pointer_to_func(code); + + if (gallivm_debug & GALLIVM_DEBUG_ASM) { + lp_disassemble(code); + } + lp_func_delete_body(variant->function_elts); } -void -draw_llvm_make_variant_key(struct draw_llvm *llvm, - struct draw_llvm_variant_key *key) + +struct draw_llvm_variant_key * +draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) { - memset(key, 0, sizeof(struct draw_llvm_variant_key)); + unsigned i; + struct draw_llvm_variant_key *key; + struct lp_sampler_static_state *sampler; + + key = (struct draw_llvm_variant_key *)store; + /* Presumably all variants of the shader should have the same + * number of vertex elements - ie the number of shader inputs. + */ key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements; + /* All variants of this shader will have the same value for + * nr_samplers. Not yet trying to compact away holes in the + * sampler array. + */ + key->nr_samplers = llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + + sampler = draw_llvm_variant_key_samplers(key); + memcpy(key->vertex_element, llvm->draw->pt.vertex_element, sizeof(struct pipe_vertex_element) * key->nr_vertex_elements); + + memset(sampler, 0, key->nr_samplers * sizeof *sampler); + + for (i = 0 ; i < key->nr_samplers; i++) { + lp_sampler_static_state(&sampler[i], + llvm->draw->sampler_views[i], + llvm->draw->samplers[i]); + } + + return key; +} + +void +draw_llvm_set_mapped_texture(struct draw_context *draw, + unsigned sampler_idx, + uint32_t width, uint32_t height, uint32_t depth, + uint32_t last_level, + uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], + uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], + const void *data[DRAW_MAX_TEXTURE_LEVELS]) +{ + unsigned j; + struct draw_jit_texture *jit_tex; + + assert(sampler_idx < PIPE_MAX_VERTEX_SAMPLERS); + + + jit_tex = &draw->llvm->jit_context.textures[sampler_idx]; + + jit_tex->width = width; + jit_tex->height = height; + jit_tex->depth = depth; + jit_tex->last_level = last_level; + + for (j = 0; j <= last_level; j++) { + jit_tex->data[j] = data[j]; + jit_tex->row_stride[j] = row_stride[j]; + jit_tex->img_stride[j] = img_stride[j]; + } +} + + +void +draw_llvm_set_sampler_state(struct draw_context *draw) +{ + unsigned i; + + for (i = 0; i < draw->num_samplers; i++) { + struct draw_jit_texture *jit_tex = &draw->llvm->jit_context.textures[i]; + + if (draw->samplers[i]) { + jit_tex->min_lod = draw->samplers[i]->min_lod; + jit_tex->max_lod = draw->samplers[i]->max_lod; + jit_tex->lod_bias = draw->samplers[i]->lod_bias; + COPY_4V(jit_tex->border_color, draw->samplers[i]->border_color); + } + } +} + + +void +draw_llvm_destroy_variant(struct draw_llvm_variant *variant) +{ + struct draw_llvm *llvm = variant->llvm; + struct draw_context *draw = llvm->draw; + + if (variant->function_elts) { + if (variant->function_elts) + LLVMFreeMachineCodeForFunction(draw->engine, + variant->function_elts); + LLVMDeleteFunction(variant->function_elts); + } + + if (variant->function) { + if (variant->function) + LLVMFreeMachineCodeForFunction(draw->engine, + variant->function); + LLVMDeleteFunction(variant->function); + } - memcpy(&key->vs, - &llvm->draw->vs.vertex_shader->state, - sizeof(struct pipe_shader_state)); + remove_from_list(&variant->list_item_local); + variant->shader->variants_cached--; + remove_from_list(&variant->list_item_global); + llvm->nr_variants--; + FREE(variant); } diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 58fee7f9d60..d0a68ae412d 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -1,28 +1,79 @@ -#ifndef HAVE_LLVM_H -#define HAVE_LLVM_H +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef DRAW_LLVM_H +#define DRAW_LLVM_H #include "draw/draw_private.h" +#include "draw/draw_vs.h" +#include "gallivm/lp_bld_sample.h" + #include "pipe/p_context.h" +#include "util/u_simple_list.h" #include <llvm-c/Core.h> #include <llvm-c/Analysis.h> #include <llvm-c/Target.h> #include <llvm-c/ExecutionEngine.h> +#define DRAW_MAX_TEXTURE_LEVELS 13 /* 4K x 4K for now */ + +struct draw_llvm; +struct llvm_vertex_shader; + struct draw_jit_texture { uint32_t width; uint32_t height; - uint32_t stride; - const void *data; + uint32_t depth; + uint32_t last_level; + uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS]; + const void *data[DRAW_MAX_TEXTURE_LEVELS]; + float min_lod; + float max_lod; + float lod_bias; + float border_color[4]; }; enum { DRAW_JIT_TEXTURE_WIDTH = 0, DRAW_JIT_TEXTURE_HEIGHT, - DRAW_JIT_TEXTURE_STRIDE, - DRAW_JIT_TEXTURE_DATA + DRAW_JIT_TEXTURE_DEPTH, + DRAW_JIT_TEXTURE_LAST_LEVEL, + DRAW_JIT_TEXTURE_ROW_STRIDE, + DRAW_JIT_TEXTURE_IMG_STRIDE, + DRAW_JIT_TEXTURE_DATA, + DRAW_JIT_TEXTURE_MIN_LOD, + DRAW_JIT_TEXTURE_MAX_LOD, + DRAW_JIT_TEXTURE_LOD_BIAS, + DRAW_JIT_TEXTURE_BORDER_COLOR, + DRAW_JIT_TEXTURE_NUM_FIELDS /* number of fields above */ }; enum { @@ -48,7 +99,7 @@ struct draw_jit_context const float *gs_constants; - struct draw_jit_texture textures[PIPE_MAX_SAMPLERS]; + struct draw_jit_texture textures[PIPE_MAX_VERTEX_SAMPLERS]; }; @@ -58,10 +109,10 @@ struct draw_jit_context #define draw_jit_context_gs_constants(_builder, _ptr) \ lp_build_struct_get(_builder, _ptr, 1, "gs_constants") -#define DRAW_JIT_CONTEXT_TEXTURES_INDEX 2 +#define DRAW_JIT_CTX_TEXTURES 2 #define draw_jit_context_textures(_builder, _ptr) \ - lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CONTEXT_TEXTURES_INDEX, "textures") + lp_build_struct_get_ptr(_builder, _ptr, DRAW_JIT_CTX_TEXTURES, "textures") @@ -92,7 +143,8 @@ typedef void unsigned start, unsigned count, unsigned stride, - struct pipe_vertex_buffer *vertex_buffers); + struct pipe_vertex_buffer *vertex_buffers, + unsigned instance_id); typedef void @@ -102,13 +154,89 @@ typedef void const unsigned *fetch_elts, unsigned fetch_count, unsigned stride, - struct pipe_vertex_buffer *vertex_buffers); + struct pipe_vertex_buffer *vertex_buffers, + unsigned instance_id); + +struct draw_llvm_variant_key +{ + unsigned nr_vertex_elements:16; + unsigned nr_samplers:16; + + /* Variable number of vertex elements: + */ + struct pipe_vertex_element vertex_element[1]; + + /* Followed by variable number of samplers: + */ +/* struct lp_sampler_static_state sampler; */ +}; + +#define DRAW_LLVM_MAX_VARIANT_KEY_SIZE \ + (sizeof(struct draw_llvm_variant_key) + \ + PIPE_MAX_VERTEX_SAMPLERS * sizeof(struct lp_sampler_static_state) + \ + (PIPE_MAX_ATTRIBS-1) * sizeof(struct pipe_vertex_element)) + + +static INLINE size_t +draw_llvm_variant_key_size(unsigned nr_vertex_elements, + unsigned nr_samplers) +{ + return (sizeof(struct draw_llvm_variant_key) + + nr_samplers * sizeof(struct lp_sampler_static_state) + + (nr_vertex_elements - 1) * sizeof(struct pipe_vertex_element)); +} + + +static INLINE struct lp_sampler_static_state * +draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key) +{ + return (struct lp_sampler_static_state *) + &key->vertex_element[key->nr_vertex_elements]; +} + + + +struct draw_llvm_variant_list_item +{ + struct draw_llvm_variant *base; + struct draw_llvm_variant_list_item *next, *prev; +}; + +struct draw_llvm_variant +{ + LLVMValueRef function; + LLVMValueRef function_elts; + draw_jit_vert_func jit_func; + draw_jit_vert_func_elts jit_func_elts; + + struct llvm_vertex_shader *shader; + + struct draw_llvm *llvm; + struct draw_llvm_variant_list_item list_item_global; + struct draw_llvm_variant_list_item list_item_local; + + /* key is variable-sized, must be last */ + struct draw_llvm_variant_key key; + /* key is variable-sized, must be last */ +}; + +struct llvm_vertex_shader { + struct draw_vertex_shader base; + + unsigned variant_key_size; + struct draw_llvm_variant_list_item variants; + unsigned variants_created; + unsigned variants_cached; +}; struct draw_llvm { struct draw_context *draw; struct draw_jit_context jit_context; + struct draw_llvm_variant_list_item vs_variants_list; + int nr_variants; + LLVMModuleRef module; LLVMExecutionEngineRef engine; LLVMModuleProviderRef provider; @@ -121,23 +249,12 @@ struct draw_llvm { LLVMTypeRef vb_ptr_type; }; -struct draw_llvm_variant_key -{ - struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; - unsigned nr_vertex_elements; - struct pipe_shader_state vs; -}; - -struct draw_llvm_variant +static INLINE struct llvm_vertex_shader * +llvm_vertex_shader(struct draw_vertex_shader *vs) { - struct draw_llvm_variant_key key; - LLVMValueRef function; - LLVMValueRef function_elts; - draw_jit_vert_func jit_func; - draw_jit_vert_func_elts jit_func_elts; + return (struct llvm_vertex_shader *)vs; +} - struct draw_llvm_variant *next; -}; struct draw_llvm * draw_llvm_create(struct draw_context *draw); @@ -146,14 +263,35 @@ void draw_llvm_destroy(struct draw_llvm *llvm); struct draw_llvm_variant * -draw_llvm_prepare(struct draw_llvm *llvm, int num_inputs); +draw_llvm_create_variant(struct draw_llvm *llvm, + unsigned num_vertex_header_attribs, + const struct draw_llvm_variant_key *key); void -draw_llvm_make_variant_key(struct draw_llvm *llvm, - struct draw_llvm_variant_key *key); +draw_llvm_destroy_variant(struct draw_llvm_variant *variant); + +struct draw_llvm_variant_key * +draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store); LLVMValueRef draw_llvm_translate_from(LLVMBuilderRef builder, LLVMValueRef vbuffer, enum pipe_format from_format); + +struct lp_build_sampler_soa * +draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state, + LLVMValueRef context_ptr); + +void +draw_llvm_set_sampler_state(struct draw_context *draw); + +void +draw_llvm_set_mapped_texture(struct draw_context *draw, + unsigned sampler_idx, + uint32_t width, uint32_t height, uint32_t depth, + uint32_t last_level, + uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS], + uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS], + const void *data[DRAW_MAX_TEXTURE_LEVELS]); + #endif diff --git a/src/gallium/auxiliary/draw/draw_llvm_sample.c b/src/gallium/auxiliary/draw/draw_llvm_sample.c new file mode 100644 index 00000000000..ac1fbb179c6 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_llvm_sample.c @@ -0,0 +1,223 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Texture sampling code generation + * @author Jose Fonseca <[email protected]> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_sample.h" +#include "gallivm/lp_bld_tgsi.h" + + +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "util/u_pointer.h" +#include "util/u_string.h" + +#include "draw_llvm.h" + + +/** + * This provides the bridge between the sampler state store in + * lp_jit_context and lp_jit_texture and the sampler code + * generator. It provides the texture layout information required by + * the texture sampler code generator in terms of the state stored in + * lp_jit_context and lp_jit_texture in runtime. + */ +struct draw_llvm_sampler_dynamic_state +{ + struct lp_sampler_dynamic_state base; + + const struct lp_sampler_static_state *static_state; + + LLVMValueRef context_ptr; +}; + + +/** + * This is the bridge between our sampler and the TGSI translator. + */ +struct draw_llvm_sampler_soa +{ + struct lp_build_sampler_soa base; + + struct draw_llvm_sampler_dynamic_state dynamic_state; +}; + + +/** + * Fetch the specified member of the lp_jit_texture structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +draw_llvm_texture_member(const struct lp_sampler_dynamic_state *base, + LLVMBuilderRef builder, + unsigned unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + struct draw_llvm_sampler_dynamic_state *state = + (struct draw_llvm_sampler_dynamic_state *)base; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + debug_assert(unit < PIPE_MAX_VERTEX_SAMPLERS); + + /* context[0] */ + indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + /* context[0].textures */ + indices[1] = LLVMConstInt(LLVMInt32Type(), DRAW_JIT_CTX_TEXTURES, 0); + /* context[0].textures[unit] */ + indices[2] = LLVMConstInt(LLVMInt32Type(), unit, 0); + /* context[0].textures[unit].member */ + indices[3] = LLVMConstInt(LLVMInt32Type(), member_index, 0); + + ptr = LLVMBuildGEP(builder, state->context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.texture%u.%s", unit, member_name); + + return res; +} + + +/** + * Helper macro to instantiate the functions that generate the code to + * fetch the members of lp_jit_texture to fulfill the sampler code + * generator requests. + * + * This complexity is the price we have to pay to keep the texture + * sampler code generator a reusable module without dependencies to + * llvmpipe internals. + */ +#define DRAW_LLVM_TEXTURE_MEMBER(_name, _index, _emit_load) \ + static LLVMValueRef \ + draw_llvm_texture_##_name( const struct lp_sampler_dynamic_state *base, \ + LLVMBuilderRef builder, \ + unsigned unit) \ + { \ + return draw_llvm_texture_member(base, builder, unit, _index, #_name, _emit_load ); \ + } + + +DRAW_LLVM_TEXTURE_MEMBER(width, DRAW_JIT_TEXTURE_WIDTH, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(height, DRAW_JIT_TEXTURE_HEIGHT, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(depth, DRAW_JIT_TEXTURE_DEPTH, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(last_level, DRAW_JIT_TEXTURE_LAST_LEVEL, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(row_stride, DRAW_JIT_TEXTURE_ROW_STRIDE, FALSE) +DRAW_LLVM_TEXTURE_MEMBER(img_stride, DRAW_JIT_TEXTURE_IMG_STRIDE, FALSE) +DRAW_LLVM_TEXTURE_MEMBER(data_ptr, DRAW_JIT_TEXTURE_DATA, FALSE) +DRAW_LLVM_TEXTURE_MEMBER(min_lod, DRAW_JIT_TEXTURE_MIN_LOD, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(max_lod, DRAW_JIT_TEXTURE_MAX_LOD, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(lod_bias, DRAW_JIT_TEXTURE_LOD_BIAS, TRUE) +DRAW_LLVM_TEXTURE_MEMBER(border_color, DRAW_JIT_TEXTURE_BORDER_COLOR, FALSE) + + +static void +draw_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) +{ + FREE(sampler); +} + + +/** + * Fetch filtered values from texture. + * The 'texel' parameter returns four vectors corresponding to R, G, B, A. + */ +static void +draw_llvm_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, + LLVMBuilderRef builder, + struct lp_type type, + unsigned unit, + unsigned num_coords, + const LLVMValueRef *coords, + const LLVMValueRef *ddx, + const LLVMValueRef *ddy, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *texel) +{ + struct draw_llvm_sampler_soa *sampler = (struct draw_llvm_sampler_soa *)base; + + assert(unit < PIPE_MAX_VERTEX_SAMPLERS); + + lp_build_sample_soa(builder, + &sampler->dynamic_state.static_state[unit], + &sampler->dynamic_state.base, + type, + unit, + num_coords, coords, + ddx, ddy, + lod_bias, explicit_lod, + texel); +} + + +struct lp_build_sampler_soa * +draw_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state, + LLVMValueRef context_ptr) +{ + struct draw_llvm_sampler_soa *sampler; + + sampler = CALLOC_STRUCT(draw_llvm_sampler_soa); + if(!sampler) + return NULL; + + sampler->base.destroy = draw_llvm_sampler_soa_destroy; + sampler->base.emit_fetch_texel = draw_llvm_sampler_soa_emit_fetch_texel; + sampler->dynamic_state.base.width = draw_llvm_texture_width; + sampler->dynamic_state.base.height = draw_llvm_texture_height; + sampler->dynamic_state.base.depth = draw_llvm_texture_depth; + sampler->dynamic_state.base.last_level = draw_llvm_texture_last_level; + sampler->dynamic_state.base.row_stride = draw_llvm_texture_row_stride; + sampler->dynamic_state.base.img_stride = draw_llvm_texture_img_stride; + sampler->dynamic_state.base.data_ptr = draw_llvm_texture_data_ptr; + sampler->dynamic_state.base.min_lod = draw_llvm_texture_min_lod; + sampler->dynamic_state.base.max_lod = draw_llvm_texture_max_lod; + sampler->dynamic_state.base.lod_bias = draw_llvm_texture_lod_bias; + sampler->dynamic_state.base.border_color = draw_llvm_texture_border_color; + sampler->dynamic_state.static_state = static_state; + sampler->dynamic_state.context_ptr = context_ptr; + + return &sampler->base; +} + diff --git a/src/gallium/auxiliary/draw/draw_llvm_translate.c b/src/gallium/auxiliary/draw/draw_llvm_translate.c index d7da7ed357d..5171327ce2d 100644 --- a/src/gallium/auxiliary/draw/draw_llvm_translate.c +++ b/src/gallium/auxiliary/draw/draw_llvm_translate.c @@ -3,10 +3,10 @@ #include "draw_llvm.h" -#include "gallivm/lp_bld_arit.h" #include "gallivm/lp_bld_struct.h" #include "gallivm/lp_bld_format.h" #include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_type.h" #include "util/u_memory.h" #include "util/u_format.h" @@ -466,6 +466,7 @@ draw_llvm_translate_from(LLVMBuilderRef builder, const struct util_format_description *format_desc; LLVMValueRef zero; int i; + struct lp_type type = lp_float32_vec4_type(); /* * The above can only cope with straight arrays: no bitfields, @@ -493,5 +494,5 @@ draw_llvm_translate_from(LLVMBuilderRef builder, format_desc = util_format_description(from_format); zero = LLVMConstNull(LLVMInt32Type()); - return lp_build_fetch_rgba_aos(builder, format_desc, vbuffer, zero, zero); + return lp_build_fetch_rgba_aos(builder, format_desc, type, vbuffer, zero, zero, zero); } diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c index 64c35025081..6206197dae9 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.c +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -170,58 +170,42 @@ static void do_triangle( struct draw_context *draw, * Set up macros for draw_pt_decompose.h template code. * This code uses vertex indexes / elements. */ -#define QUAD(i0,i1,i2,i3) \ - do_triangle( draw, \ - ( DRAW_PIPE_RESET_STIPPLE | \ - DRAW_PIPE_EDGE_FLAG_0 | \ - DRAW_PIPE_EDGE_FLAG_2 ), \ - verts + stride * elts[i0], \ - verts + stride * elts[i1], \ - verts + stride * elts[i3]); \ - do_triangle( draw, \ - ( DRAW_PIPE_EDGE_FLAG_0 | \ - DRAW_PIPE_EDGE_FLAG_1 ), \ - verts + stride * elts[i1], \ - verts + stride * elts[i2], \ - verts + stride * elts[i3]) - -#define TRIANGLE(flags,i0,i1,i2) \ - do_triangle( draw, \ - elts[i0], /* flags */ \ - verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (elts[i2] & ~DRAW_PIPE_FLAG_MASK) ); - -#define LINE(flags,i0,i1) \ - do_line( draw, \ - elts[i0], \ - verts + stride * (elts[i0] & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (elts[i1] & ~DRAW_PIPE_FLAG_MASK) ); + +#define TRIANGLE(flags,i0,i1,i2) \ + do { \ + do_triangle( draw, \ + flags, \ + verts + stride * (i0), \ + verts + stride * (i1), \ + verts + stride * (i2) ); \ + } while (0) + +#define LINE(flags,i0,i1) \ + do { \ + do_line( draw, \ + flags, \ + verts + stride * (i0), \ + verts + stride * (i1) ); \ + } while (0) #define POINT(i0) \ - do_point( draw, \ - verts + stride * elts[i0] ) + do { \ + do_point( draw, verts + stride * (i0) ); \ + } while (0) + +#define GET_ELT(idx) (elts[idx]) -#define FUNC pipe_run -#define ARGS \ +#define FUNC pipe_run_elts +#define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ - const ushort *elts - -#define LOCAL_VARS \ - char *verts = (char *)vertices; \ - boolean flatfirst = (draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ - unsigned i; \ - ushort flags - -#define FLUSH + const ushort *elts, \ + unsigned count #include "draw_pt_decompose.h" -#undef ARGS -#undef LOCAL_VARS @@ -238,78 +222,84 @@ static void do_triangle( struct draw_context *draw, * draw_vbuf.c code uses when it has to perform a flush. */ void draw_pipeline_run( struct draw_context *draw, - unsigned prim, - struct vertex_header *vertices, - unsigned vertex_count, - unsigned stride, - const ushort *elts, - unsigned count ) + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) { - char *verts = (char *)vertices; + unsigned i, start; + + draw->pipeline.verts = (char *)vert_info->verts; + draw->pipeline.vertex_stride = vert_info->stride; + draw->pipeline.vertex_count = vert_info->count; + + for (start = i = 0; + i < prim_info->primitive_count; + start += prim_info->primitive_lengths[i], i++) + { + const unsigned count = prim_info->primitive_lengths[i]; + +#if DEBUG + /* Warn if one of the element indexes go outside the vertex buffer */ + { + unsigned max_index = 0x0, i; + /* find the largest element index */ + for (i = 0; i < count; i++) { + unsigned int index = prim_info->elts[start + i]; + if (index > max_index) + max_index = index; + } + if (max_index >= vert_info->count) { + debug_printf("%s: max_index (%u) outside vertex buffer (%u)\n", + __FUNCTION__, + max_index, + vert_info->count); + } + } +#endif + + pipe_run_elts(draw, + prim_info->prim, + prim_info->flags, + vert_info->verts, + vert_info->stride, + prim_info->elts + start, + count); + } - draw->pipeline.verts = verts; - draw->pipeline.vertex_stride = stride; - draw->pipeline.vertex_count = vertex_count; - - pipe_run(draw, prim, vertices, stride, elts, count); - draw->pipeline.verts = NULL; draw->pipeline.vertex_count = 0; } - /* * Set up macros for draw_pt_decompose.h template code. - * This code is for non-indexed rendering (no elts). + * This code is for non-indexed (aka linear) rendering (no elts). */ -#define QUAD(i0,i1,i2,i3) \ - do_triangle( draw, \ - ( DRAW_PIPE_RESET_STIPPLE | \ - DRAW_PIPE_EDGE_FLAG_0 | \ - DRAW_PIPE_EDGE_FLAG_2 ), \ - verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (i1), \ - verts + stride * (i3)); \ - do_triangle( draw, \ - ( DRAW_PIPE_EDGE_FLAG_0 | \ - DRAW_PIPE_EDGE_FLAG_1 ), \ - verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (i2), \ - verts + stride * (i3)) - -#define TRIANGLE(flags,i0,i1,i2) \ - do_triangle( draw, \ - flags, /* flags */ \ - verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (i1), \ - verts + stride * (i2)) - -#define LINE(flags,i0,i1) \ - do_line( draw, \ - flags, \ - verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ - verts + stride * (i1)) -#define POINT(i0) \ - do_point( draw, \ - verts + stride * i0 ) +#define TRIANGLE(flags,i0,i1,i2) \ + do_triangle( draw, flags, \ + verts + stride * (i0), \ + verts + stride * (i1), \ + verts + stride * (i2) ) -#define FUNC pipe_run_linear -#define ARGS \ - struct draw_context *draw, \ - unsigned prim, \ - struct vertex_header *vertices, \ - unsigned stride +#define LINE(flags,i0,i1) \ + do_line( draw, flags, \ + verts + stride * (i0), \ + verts + stride * (i1) ) -#define LOCAL_VARS \ - char *verts = (char *)vertices; \ - boolean flatfirst = (draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ - unsigned i; \ - ushort flags +#define POINT(i0) \ + do_point( draw, verts + stride * (i0) ) -#define FLUSH + +#define GET_ELT(idx) (idx) + +#define FUNC pipe_run_linear +#define FUNC_VARS \ + struct draw_context *draw, \ + unsigned prim, \ + unsigned prim_flags, \ + struct vertex_header *vertices, \ + unsigned stride, \ + unsigned count #include "draw_pt_decompose.h" @@ -318,17 +308,32 @@ void draw_pipeline_run( struct draw_context *draw, * For drawing non-indexed primitives. */ void draw_pipeline_run_linear( struct draw_context *draw, - unsigned prim, - struct vertex_header *vertices, - unsigned count, - unsigned stride ) + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) { - char *verts = (char *)vertices; - draw->pipeline.verts = verts; - draw->pipeline.vertex_stride = stride; - draw->pipeline.vertex_count = count; - - pipe_run_linear(draw, prim, vertices, stride, count); + unsigned i, start; + + for (start = i = 0; + i < prim_info->primitive_count; + start += prim_info->primitive_lengths[i], i++) + { + unsigned count = prim_info->primitive_lengths[i]; + char *verts = ((char*)vert_info->verts) + + (start * vert_info->stride); + + draw->pipeline.verts = verts; + draw->pipeline.vertex_stride = vert_info->stride; + draw->pipeline.vertex_count = count; + + assert(count <= vert_info->count); + + pipe_run_linear(draw, + prim_info->prim, + prim_info->flags, + (struct vertex_header*)verts, + vert_info->stride, + count); + } draw->pipeline.verts = NULL; draw->pipeline.vertex_count = 0; diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index 4faf0a779ca..d1aba763098 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -373,8 +373,7 @@ generate_aaline_fs(struct aaline_stage *aaline) aaline->fs->sampler_unit = transform.freeSampler; - aaline->fs->aaline_fs - = aaline->driver_create_fs_state(pipe, &aaline_fs); + aaline->fs->aaline_fs = aaline->driver_create_fs_state(pipe, &aaline_fs); if (aaline->fs->aaline_fs == NULL) goto fail; @@ -425,7 +424,8 @@ aaline_create_texture(struct aaline_stage *aaline) /* Fill in mipmap images. * Basically each level is solid opaque, except for the outermost - * texels which are zero. Special case the 1x1 and 2x2 levels. + * texels which are zero. Special case the 1x1 and 2x2 levels + * (though, those levels shouldn't be used - see the max_lod setting). */ for (level = 0; level <= MAX_TEXTURE_LEVEL; level++) { struct pipe_transfer *transfer; @@ -497,7 +497,8 @@ aaline_create_sampler(struct aaline_stage *aaline) sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR; sampler.normalized_coords = 1; sampler.min_lod = 0.0f; - sampler.max_lod = MAX_TEXTURE_LEVEL; + /* avoid using the 1x1 and 2x2 mipmap levels */ + sampler.max_lod = MAX_TEXTURE_LEVEL - 2; aaline->sampler_cso = pipe->create_sampler_state(pipe, &sampler); if (aaline->sampler_cso == NULL) @@ -669,8 +670,8 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) assert(draw->rasterizer->line_smooth); - if (draw->rasterizer->line_width <= 3.0) - aaline->half_line_width = 1.5f; + if (draw->rasterizer->line_width <= 2.2) + aaline->half_line_width = 1.1f; else aaline->half_line_width = 0.5f * draw->rasterizer->line_width; @@ -687,10 +688,9 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) aaline->tex_slot = draw_current_shader_outputs(draw); aaline->pos_slot = draw_current_shader_position_output(draw);; - /* advertise the extra post-transformed vertex attribute */ - draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; - draw->extra_shader_outputs.semantic_index = aaline->fs->generic_attrib; - draw->extra_shader_outputs.slot = aaline->tex_slot; + /* allocate the extra post-transformed vertex attribute */ + (void) draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC, + aaline->fs->generic_attrib); /* how many samplers? */ /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */ @@ -743,7 +743,7 @@ aaline_flush(struct draw_stage *stage, unsigned flags) draw->suspend_flushing = FALSE; - draw->extra_shader_outputs.slot = 0; + draw_remove_extra_vertex_attribs(draw); } @@ -788,9 +788,6 @@ draw_aaline_stage(struct draw_context *draw) if (aaline == NULL) return NULL; - if (!draw_alloc_temp_verts( &aaline->stage, 8 )) - goto fail; - aaline->stage.draw = draw; aaline->stage.name = "aaline"; aaline->stage.next = NULL; @@ -801,11 +798,14 @@ draw_aaline_stage(struct draw_context *draw) aaline->stage.reset_stipple_counter = aaline_reset_stipple_counter; aaline->stage.destroy = aaline_destroy; + if (!draw_alloc_temp_verts( &aaline->stage, 8 )) + goto fail; + return aaline; fail: if (aaline) - aaline_destroy(&aaline->stage); + aaline->stage.destroy(&aaline->stage); return NULL; } diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c index bba6f50c020..5ea552f51c1 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c @@ -701,9 +701,9 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header) aapoint->pos_slot = draw_current_shader_position_output(draw); - draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; - draw->extra_shader_outputs.semantic_index = aapoint->fs->generic_attrib; - draw->extra_shader_outputs.slot = aapoint->tex_slot; + /* allocate the extra post-transformed vertex attribute */ + (void) draw_alloc_extra_vertex_attrib(draw, TGSI_SEMANTIC_GENERIC, + aapoint->fs->generic_attrib); /* find psize slot in post-transform vertex */ aapoint->psize_slot = -1; @@ -754,7 +754,7 @@ aapoint_flush(struct draw_stage *stage, unsigned flags) draw->suspend_flushing = FALSE; - draw->extra_shader_outputs.slot = 0; + draw_remove_extra_vertex_attribs(draw); } @@ -780,9 +780,6 @@ draw_aapoint_stage(struct draw_context *draw) if (aapoint == NULL) goto fail; - if (!draw_alloc_temp_verts( &aapoint->stage, 4 )) - goto fail; - aapoint->stage.draw = draw; aapoint->stage.name = "aapoint"; aapoint->stage.next = NULL; @@ -793,11 +790,14 @@ draw_aapoint_stage(struct draw_context *draw) aapoint->stage.reset_stipple_counter = aapoint_reset_stipple_counter; aapoint->stage.destroy = aapoint_destroy; + if (!draw_alloc_temp_verts( &aapoint->stage, 4 )) + goto fail; + return aapoint; fail: if (aapoint) - aapoint_destroy(&aapoint->stage); + aapoint->stage.destroy(&aapoint->stage); return NULL; diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index 51a6115ebf5..a10d8e9edc0 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -68,8 +68,7 @@ struct clip_stage { }; -/* This is a bit confusing: - */ +/** Cast wrapper */ static INLINE struct clip_stage *clip_stage( struct draw_stage *stage ) { return (struct clip_stage *)stage; @@ -81,18 +80,22 @@ static INLINE struct clip_stage *clip_stage( struct draw_stage *stage ) /* All attributes are float[4], so this is easy: */ -static void interp_attr( float *fdst, +static void interp_attr( float dst[4], float t, - const float *fin, - const float *fout ) + const float in[4], + const float out[4] ) { - fdst[0] = LINTERP( t, fout[0], fin[0] ); - fdst[1] = LINTERP( t, fout[1], fin[1] ); - fdst[2] = LINTERP( t, fout[2], fin[2] ); - fdst[3] = LINTERP( t, fout[3], fin[3] ); + dst[0] = LINTERP( t, out[0], in[0] ); + dst[1] = LINTERP( t, out[1], in[1] ); + dst[2] = LINTERP( t, out[2], in[2] ); + dst[3] = LINTERP( t, out[3], in[3] ); } +/** + * Copy front/back, primary/secondary colors from src vertex to dst vertex. + * Used when flat shading. + */ static void copy_colors( struct draw_stage *stage, struct vertex_header *dst, const struct vertex_header *src ) @@ -121,20 +124,17 @@ static void interp( const struct clip_stage *clip, /* Vertex header. */ - { - dst->clipmask = 0; - dst->edgeflag = 0; /* will get overwritten later */ - dst->pad = 0; - dst->vertex_id = UNDEFINED_VERTEX_ID; - } + dst->clipmask = 0; + dst->edgeflag = 0; /* will get overwritten later */ + dst->pad = 0; + dst->vertex_id = UNDEFINED_VERTEX_ID; - /* Clip coordinates: interpolate normally + /* Interpolate the clip-space coords. */ - { - interp_attr(dst->clip, t, in->clip, out->clip); - } + interp_attr(dst->clip, t, in->clip, out->clip); - /* Do the projective divide and insert window coordinates: + /* Do the projective divide and viewport transformation to get + * new window coordinates: */ { const float *pos = dst->clip; @@ -157,6 +157,10 @@ static void interp( const struct clip_stage *clip, } +/** + * Emit a post-clip polygon to the next pipeline stage. The polygon + * will be convex and the provoking vertex will always be vertex[0]. + */ static void emit_poly( struct draw_stage *stage, struct vertex_header **inlist, unsigned n, @@ -164,10 +168,18 @@ static void emit_poly( struct draw_stage *stage, { struct prim_header header; unsigned i; + ushort edge_first, edge_middle, edge_last; - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; + if (stage->draw->rasterizer->flatshade_first) { + edge_first = DRAW_PIPE_EDGE_FLAG_0; + edge_middle = DRAW_PIPE_EDGE_FLAG_1; + edge_last = DRAW_PIPE_EDGE_FLAG_2; + } + else { + edge_first = DRAW_PIPE_EDGE_FLAG_2; + edge_middle = DRAW_PIPE_EDGE_FLAG_0; + edge_last = DRAW_PIPE_EDGE_FLAG_1; + } /* later stages may need the determinant, but only the sign matters */ header.det = origPrim->det; @@ -175,9 +187,17 @@ static void emit_poly( struct draw_stage *stage, header.pad = 0; for (i = 2; i < n; i++, header.flags = edge_middle) { - header.v[0] = inlist[i-1]; - header.v[1] = inlist[i]; - header.v[2] = inlist[0]; /* keep in v[2] for flatshading */ + /* order the triangle verts to respect the provoking vertex mode */ + if (stage->draw->rasterizer->flatshade_first) { + header.v[0] = inlist[0]; /* the provoking vertex */ + header.v[1] = inlist[i-1]; + header.v[2] = inlist[i]; + } + else { + header.v[0] = inlist[i-1]; + header.v[1] = inlist[i]; + header.v[2] = inlist[0]; /* the provoking vertex */ + } if (i == n-1) header.flags |= edge_last; @@ -185,7 +205,8 @@ static void emit_poly( struct draw_stage *stage, if (0) { const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; uint j, k; - debug_printf("Clipped tri:\n"); + debug_printf("Clipped tri: (flat-shade-first = %d)\n", + stage->draw->rasterizer->flatshade_first); for (j = 0; j < 3; j++) { for (k = 0; k < vs->info.num_outputs; k++) { debug_printf(" Vert %d: Attr %d: %f %f %f %f\n", j, k, @@ -241,6 +262,9 @@ do_clip_tri( struct draw_stage *stage, clipmask &= ~(1<<plane_idx); + assert(n < MAX_CLIPPED_VERTICES); + if (n >= MAX_CLIPPED_VERTICES) + return; inlist[n] = inlist[0]; /* prevent rotation of vertices */ for (i = 1; i <= n; i++) { @@ -249,11 +273,23 @@ do_clip_tri( struct draw_stage *stage, float dp = dot4( vert->clip, plane ); if (!IS_NEGATIVE(dp_prev)) { + assert(outcount < MAX_CLIPPED_VERTICES); + if (outcount >= MAX_CLIPPED_VERTICES) + return; outlist[outcount++] = vert_prev; } if (DIFFERENT_SIGNS(dp, dp_prev)) { - struct vertex_header *new_vert = clipper->stage.tmp[tmpnr++]; + struct vertex_header *new_vert; + + assert(tmpnr < MAX_CLIPPED_VERTICES + 1); + if (tmpnr >= MAX_CLIPPED_VERTICES + 1) + return; + new_vert = clipper->stage.tmp[tmpnr++]; + + assert(outcount < MAX_CLIPPED_VERTICES); + if (outcount >= MAX_CLIPPED_VERTICES) + return; outlist[outcount++] = new_vert; if (IS_NEGATIVE(dp)) { @@ -291,18 +327,34 @@ do_clip_tri( struct draw_stage *stage, } } - /* If flat-shading, copy color to new provoking vertex. + /* If flat-shading, copy provoking vertex color to polygon vertex[0] */ - if (clipper->flat && inlist[0] != header->v[2]) { - inlist[0] = dup_vert(stage, inlist[0], tmpnr++); - - copy_colors(stage, inlist[0], header->v[2]); - } - - /* Emit the polygon as triangles to the setup stage: - */ - if (n >= 3) + if (n >= 3) { + if (clipper->flat) { + if (stage->draw->rasterizer->flatshade_first) { + if (inlist[0] != header->v[0]) { + assert(tmpnr < MAX_CLIPPED_VERTICES + 1); + if (tmpnr >= MAX_CLIPPED_VERTICES + 1) + return; + inlist[0] = dup_vert(stage, inlist[0], tmpnr++); + copy_colors(stage, inlist[0], header->v[0]); + } + } + else { + if (inlist[0] != header->v[2]) { + assert(tmpnr < MAX_CLIPPED_VERTICES + 1); + if (tmpnr >= MAX_CLIPPED_VERTICES + 1) + return; + inlist[0] = dup_vert(stage, inlist[0], tmpnr++); + copy_colors(stage, inlist[0], header->v[2]); + } + } + } + + /* Emit the polygon as triangles to the setup stage: + */ emit_poly( stage, inlist, n, header ); + } } @@ -492,9 +544,6 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw ) if (clipper == NULL) goto fail; - if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 )) - goto fail; - clipper->stage.draw = draw; clipper->stage.name = "clipper"; clipper->stage.point = clip_point; @@ -506,6 +555,9 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw ) clipper->plane = draw->plane; + if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 )) + goto fail; + return &clipper->stage; fail: diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c index dc66c65a56c..2f4d01d23ab 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_cull.c +++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c @@ -40,7 +40,8 @@ struct cull_stage { struct draw_stage stage; - unsigned winding; /**< which winding(s) to cull (one of PIPE_WINDING_x) */ + unsigned cull_face; /**< which face(s) to cull (one of PIPE_FACE_x) */ + unsigned front_ccw; }; @@ -73,9 +74,12 @@ static void cull_tri( struct draw_stage *stage, /* if det < 0 then Z points toward the camera and the triangle is * counter-clockwise winding. */ - unsigned winding = (header->det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW; + unsigned ccw = (header->det < 0); + unsigned face = ((ccw == cull_stage(stage)->front_ccw) ? + PIPE_FACE_FRONT : + PIPE_FACE_BACK); - if ((winding & cull_stage(stage)->winding) == 0) { + if ((face & cull_stage(stage)->cull_face) == 0) { /* triangle is not culled, pass to next stage */ stage->next->tri( stage->next, header ); } @@ -88,7 +92,8 @@ static void cull_first_tri( struct draw_stage *stage, { struct cull_stage *cull = cull_stage(stage); - cull->winding = stage->draw->rasterizer->cull_mode; + cull->cull_face = stage->draw->rasterizer->cull_face; + cull->front_ccw = stage->draw->rasterizer->front_ccw; stage->tri = cull_tri; stage->tri( stage, header ); @@ -124,9 +129,6 @@ struct draw_stage *draw_cull_stage( struct draw_context *draw ) if (cull == NULL) goto fail; - if (!draw_alloc_temp_verts( &cull->stage, 0 )) - goto fail; - cull->stage.draw = draw; cull->stage.name = "cull"; cull->stage.next = NULL; @@ -137,6 +139,9 @@ struct draw_stage *draw_cull_stage( struct draw_context *draw ) cull->stage.reset_stipple_counter = cull_reset_stipple_counter; cull->stage.destroy = cull_destroy; + if (!draw_alloc_temp_verts( &cull->stage, 0 )) + goto fail; + return &cull->stage; fail: diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c index 34afb1a0b60..693f2895aad 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c +++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c @@ -257,9 +257,6 @@ struct draw_stage *draw_flatshade_stage( struct draw_context *draw ) if (flatshade == NULL) goto fail; - if (!draw_alloc_temp_verts( &flatshade->stage, 2 )) - goto fail; - flatshade->stage.draw = draw; flatshade->stage.name = "flatshade"; flatshade->stage.next = NULL; @@ -270,6 +267,9 @@ struct draw_stage *draw_flatshade_stage( struct draw_context *draw ) flatshade->stage.reset_stipple_counter = flatshade_reset_stipple_counter; flatshade->stage.destroy = flatshade_destroy; + if (!draw_alloc_temp_verts( &flatshade->stage, 2 )) + goto fail; + return &flatshade->stage; fail: diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c index 8e321946ced..8afbbfa1569 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_offset.c +++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c @@ -161,9 +161,7 @@ struct draw_stage *draw_offset_stage( struct draw_context *draw ) { struct offset_stage *offset = CALLOC_STRUCT(offset_stage); if (offset == NULL) - return NULL; - - draw_alloc_temp_verts( &offset->stage, 3 ); + goto fail; offset->stage.draw = draw; offset->stage.name = "offset"; @@ -175,5 +173,14 @@ struct draw_stage *draw_offset_stage( struct draw_context *draw ) offset->stage.reset_stipple_counter = offset_reset_stipple_counter; offset->stage.destroy = offset_destroy; + if (!draw_alloc_temp_verts( &offset->stage, 3 )) + goto fail; + return &offset->stage; + +fail: + if (offset) + offset->stage.destroy( &offset->stage ); + + return NULL; } diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c index ef30db094fe..ed9a53e154d 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c @@ -363,8 +363,12 @@ generate_pstip_fs(struct pstip_stage *pstip) assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS); pstip->fs->pstip_fs = pstip->driver_create_fs_state(pstip->pipe, &pstip_fs); - + FREE((void *)pstip_fs.tokens); + + if (!pstip->fs->pstip_fs) + return FALSE; + return TRUE; } @@ -603,12 +607,15 @@ pstip_destroy(struct draw_stage *stage) } +/** Create a new polygon stipple drawing stage object */ static struct pstip_stage * -draw_pstip_stage(struct draw_context *draw) +draw_pstip_stage(struct draw_context *draw, struct pipe_context *pipe) { struct pstip_stage *pstip = CALLOC_STRUCT(pstip_stage); + if (pstip == NULL) + goto fail; - draw_alloc_temp_verts( &pstip->stage, 8 ); + pstip->pipe = pipe; pstip->stage.draw = draw; pstip->stage.name = "pstip"; @@ -620,7 +627,16 @@ draw_pstip_stage(struct draw_context *draw) pstip->stage.reset_stipple_counter = pstip_reset_stipple_counter; pstip->stage.destroy = pstip_destroy; + if (!draw_alloc_temp_verts( &pstip->stage, 8 )) + goto fail; + return pstip; + +fail: + if (pstip) + pstip->stage.destroy( &pstip->stage ); + + return NULL; } @@ -756,14 +772,12 @@ draw_install_pstipple_stage(struct draw_context *draw, /* * Create / install pgon stipple drawing / prim stage */ - pstip = draw_pstip_stage( draw ); + pstip = draw_pstip_stage( draw, pipe ); if (pstip == NULL) goto fail; draw->pipeline.pstipple = &pstip->stage; - pstip->pipe = pipe; - /* create special texture, sampler state */ if (!pstip_create_texture(pstip)) goto fail; diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index 70fbab9ea76..4b3f4e7ae11 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -235,8 +235,8 @@ stipple_destroy( struct draw_stage *stage ) struct draw_stage *draw_stipple_stage( struct draw_context *draw ) { struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage); - - draw_alloc_temp_verts( &stipple->stage, 2 ); + if (stipple == NULL) + goto fail; stipple->stage.draw = draw; stipple->stage.name = "stipple"; @@ -248,5 +248,14 @@ struct draw_stage *draw_stipple_stage( struct draw_context *draw ) stipple->stage.flush = stipple_flush; stipple->stage.destroy = stipple_destroy; + if (!draw_alloc_temp_verts( &stipple->stage, 2 )) + goto fail; + return &stipple->stage; + +fail: + if (stipple) + stipple->stage.destroy( &stipple->stage ); + + return NULL; } diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c index eef0238b157..9a3f3fee625 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c +++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c @@ -141,7 +141,7 @@ static void twoside_first_tri( struct draw_stage *stage, * if the triangle is back-facing (negative). * sign = -1 for CCW, +1 for CW */ - twoside->sign = (stage->draw->rasterizer->front_winding == PIPE_WINDING_CCW) ? -1.0f : 1.0f; + twoside->sign = stage->draw->rasterizer->front_ccw ? -1.0f : 1.0f; stage->tri = twoside_tri; stage->tri( stage, header ); @@ -177,9 +177,6 @@ struct draw_stage *draw_twoside_stage( struct draw_context *draw ) if (twoside == NULL) goto fail; - if (!draw_alloc_temp_verts( &twoside->stage, 3 )) - goto fail; - twoside->stage.draw = draw; twoside->stage.name = "twoside"; twoside->stage.next = NULL; @@ -190,6 +187,9 @@ struct draw_stage *draw_twoside_stage( struct draw_context *draw ) twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter; twoside->stage.destroy = twoside_destroy; + if (!draw_alloc_temp_verts( &twoside->stage, 3 )) + goto fail; + return &twoside->stage; fail: diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c index 03bb842e20a..d87741b91e7 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c +++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c @@ -105,6 +105,23 @@ static void lines( struct draw_stage *stage, } +/** For debugging */ +static void +print_header_flags(unsigned flags) +{ + debug_printf("header->flags = "); + if (flags & DRAW_PIPE_RESET_STIPPLE) + debug_printf("RESET_STIPPLE "); + if (flags & DRAW_PIPE_EDGE_FLAG_0) + debug_printf("EDGE_FLAG_0 "); + if (flags & DRAW_PIPE_EDGE_FLAG_1) + debug_printf("EDGE_FLAG_1 "); + if (flags & DRAW_PIPE_EDGE_FLAG_2) + debug_printf("EDGE_FLAG_2 "); + debug_printf("\n"); +} + + /* Unfilled tri: * * Note edgeflags in the vertex struct is not sufficient as we will @@ -117,8 +134,12 @@ static void unfilled_tri( struct draw_stage *stage, struct prim_header *header ) { struct unfilled_stage *unfilled = unfilled_stage(stage); - unsigned mode = unfilled->mode[header->det >= 0.0]; + unsigned cw = header->det >= 0.0; + unsigned mode = unfilled->mode[cw]; + if (0) + print_header_flags(header->flags); + switch (mode) { case PIPE_POLYGON_MODE_FILL: stage->next->tri( stage->next, header ); @@ -139,9 +160,10 @@ static void unfilled_first_tri( struct draw_stage *stage, struct prim_header *header ) { struct unfilled_stage *unfilled = unfilled_stage(stage); + const struct pipe_rasterizer_state *rast = stage->draw->rasterizer; - unfilled->mode[0] = stage->draw->rasterizer->fill_ccw; /* front */ - unfilled->mode[1] = stage->draw->rasterizer->fill_cw; /* back */ + unfilled->mode[0] = rast->front_ccw ? rast->fill_front : rast->fill_back; + unfilled->mode[1] = rast->front_ccw ? rast->fill_back : rast->fill_front; stage->tri = unfilled_tri; stage->tri( stage, header ); @@ -180,9 +202,6 @@ struct draw_stage *draw_unfilled_stage( struct draw_context *draw ) if (unfilled == NULL) goto fail; - if (!draw_alloc_temp_verts( &unfilled->stage, 0 )) - goto fail; - unfilled->stage.draw = draw; unfilled->stage.name = "unfilled"; unfilled->stage.next = NULL; @@ -194,6 +213,9 @@ struct draw_stage *draw_unfilled_stage( struct draw_context *draw ) unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter; unfilled->stage.destroy = unfilled_destroy; + if (!draw_alloc_temp_verts( &unfilled->stage, 0 )) + goto fail; + return &unfilled->stage; fail: diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c index 2a50af7a414..c575a8ac7ca 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_validate.c +++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c @@ -122,12 +122,14 @@ draw_need_pipeline(const struct draw_context *draw, return TRUE; /* unfilled polygons */ - if (rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL || - rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) + if (rasterizer->fill_front != PIPE_POLYGON_MODE_FILL || + rasterizer->fill_back != PIPE_POLYGON_MODE_FILL) return TRUE; /* polygon offset */ - if (rasterizer->offset_cw || rasterizer->offset_ccw) + if (rasterizer->offset_point || + rasterizer->offset_line || + rasterizer->offset_tri) return TRUE; /* two-side lighting */ @@ -170,7 +172,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) wide_lines = (rast->line_width > draw->pipeline.wide_line_threshold && !rast->line_smooth); - /* drawing large points? */ + /* drawing large/sprite points (but not AA points)? */ if (rast->sprite_coord_enable && draw->pipeline.point_sprite) wide_points = TRUE; else if (rast->point_smooth && draw->pipeline.aapoint) @@ -205,7 +207,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) precalc_flat = TRUE; } - if (wide_points || rast->sprite_coord_enable) { + if (wide_points) { draw->pipeline.wide_point->next = next; next = draw->pipeline.wide_point; } @@ -222,8 +224,8 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) next = draw->pipeline.pstipple; } - if (rast->fill_cw != PIPE_POLYGON_MODE_FILL || - rast->fill_ccw != PIPE_POLYGON_MODE_FILL) { + if (rast->fill_front != PIPE_POLYGON_MODE_FILL || + rast->fill_back != PIPE_POLYGON_MODE_FILL) { draw->pipeline.unfilled->next = next; next = draw->pipeline.unfilled; precalc_flat = TRUE; /* only needed for triangles really */ @@ -235,8 +237,9 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) next = draw->pipeline.flatshade; } - if (rast->offset_cw || - rast->offset_ccw) { + if (rast->offset_point || + rast->offset_line || + rast->offset_tri) { draw->pipeline.offset->next = next; next = draw->pipeline.offset; need_det = TRUE; @@ -255,14 +258,14 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) * to less work emitting vertices, smaller vertex buffers, etc. * It's difficult to say whether this will be true in general. */ - if (need_det || rast->cull_mode) { + if (need_det || rast->cull_face != PIPE_FACE_NONE) { draw->pipeline.cull->next = next; next = draw->pipeline.cull; } /* Clip stage */ - if (!draw->bypass_clipping) + if (draw->clip_xy || draw->clip_z || draw->clip_user) { draw->pipeline.clip->next = next; next = draw->pipeline.clip; diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index abbf6247ab8..58c5858734a 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -159,19 +159,8 @@ vbuf_tri( struct draw_stage *stage, check_space( vbuf, 3 ); - if (vbuf->stage.draw->rasterizer->flatshade_first) { - /* Put provoking vertex in position expected by the driver. - * Emit last provoking vertex in first pos. - * Swap verts 0 & 1 to preserve polygon winding. - */ - vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[2] ); - vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[0] ); - vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[1] ); - } - else { - for (i = 0; i < 3; i++) { - vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] ); - } + for (i = 0; i < 3; i++) { + vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] ); } } @@ -335,9 +324,9 @@ vbuf_flush_vertices( struct vbuf_stage *vbuf ) if (vbuf->nr_indices) { - vbuf->render->draw(vbuf->render, - vbuf->indices, - vbuf->nr_indices ); + vbuf->render->draw_elements(vbuf->render, + vbuf->indices, + vbuf->nr_indices ); vbuf->nr_indices = 0; } @@ -364,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf ) /* Allocate a new vertex buffer */ vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size; - /* even number */ - vbuf->max_vertices = vbuf->max_vertices & ~1; - if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID) vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1; diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c index ab167065815..98da9cfb999 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c @@ -77,8 +77,11 @@ static void wideline_line( struct draw_stage *stage, const float dx = fabsf(pos0[0] - pos2[0]); const float dy = fabsf(pos0[1] - pos2[1]); + const boolean gl_rasterization_rules = + stage->draw->rasterizer->gl_rasterization_rules; + /* small tweak to meet GL specification */ - const float bias = 0.125f; + const float bias = gl_rasterization_rules ? 0.125f : 0.0f; /* * Draw wide line as a quad (two tris) by "stretching" the line along @@ -92,19 +95,21 @@ static void wideline_line( struct draw_stage *stage, pos1[1] = pos1[1] + half_width - bias; pos2[1] = pos2[1] - half_width - bias; pos3[1] = pos3[1] + half_width - bias; - if (pos0[0] < pos2[0]) { - /* left to right line */ - pos0[0] -= 0.5f; - pos1[0] -= 0.5f; - pos2[0] -= 0.5f; - pos3[0] -= 0.5f; - } - else { - /* right to left line */ - pos0[0] += 0.5f; - pos1[0] += 0.5f; - pos2[0] += 0.5f; - pos3[0] += 0.5f; + if (gl_rasterization_rules) { + if (pos0[0] < pos2[0]) { + /* left to right line */ + pos0[0] -= 0.5f; + pos1[0] -= 0.5f; + pos2[0] -= 0.5f; + pos3[0] -= 0.5f; + } + else { + /* right to left line */ + pos0[0] += 0.5f; + pos1[0] += 0.5f; + pos2[0] += 0.5f; + pos3[0] += 0.5f; + } } } else { @@ -113,19 +118,21 @@ static void wideline_line( struct draw_stage *stage, pos1[0] = pos1[0] + half_width + bias; pos2[0] = pos2[0] - half_width + bias; pos3[0] = pos3[0] + half_width + bias; - if (pos0[1] < pos2[1]) { - /* top to bottom line */ - pos0[1] -= 0.5f; - pos1[1] -= 0.5f; - pos2[1] -= 0.5f; - pos3[1] -= 0.5f; - } - else { - /* bottom to top line */ - pos0[1] += 0.5f; - pos1[1] += 0.5f; - pos2[1] += 0.5f; - pos3[1] += 0.5f; + if (gl_rasterization_rules) { + if (pos0[1] < pos2[1]) { + /* top to bottom line */ + pos0[1] -= 0.5f; + pos1[1] -= 0.5f; + pos2[1] -= 0.5f; + pos3[1] -= 0.5f; + } + else { + /* bottom to top line */ + pos0[1] += 0.5f; + pos1[1] += 0.5f; + pos2[1] += 0.5f; + pos3[1] += 0.5f; + } } } @@ -195,8 +202,8 @@ static void wideline_destroy( struct draw_stage *stage ) struct draw_stage *draw_wide_line_stage( struct draw_context *draw ) { struct wideline_stage *wide = CALLOC_STRUCT(wideline_stage); - - draw_alloc_temp_verts( &wide->stage, 4 ); + if (wide == NULL) + goto fail; wide->stage.draw = draw; wide->stage.name = "wide-line"; @@ -208,5 +215,14 @@ struct draw_stage *draw_wide_line_stage( struct draw_context *draw ) wide->stage.reset_stipple_counter = wideline_reset_stipple_counter; wide->stage.destroy = wideline_destroy; + if (!draw_alloc_temp_verts( &wide->stage, 4 )) + goto fail; + return &wide->stage; + +fail: + if (wide) + wide->stage.destroy( &wide->stage ); + + return NULL; } diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c index a86fe19586c..3646c6a7145 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c @@ -57,26 +57,24 @@ #include "util/u_memory.h" #include "pipe/p_defines.h" #include "pipe/p_shader_tokens.h" +#include "draw_fs.h" #include "draw_vs.h" #include "draw_pipe.h" struct widepoint_stage { - struct draw_stage stage; + struct draw_stage stage; /**< base class */ float half_point_size; float xbias; float ybias; - uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS]; - uint texcoord_enable[PIPE_MAX_SHADER_OUTPUTS]; - uint num_texcoords; - uint texcoord_mode; + /** for automatic texcoord generation/replacement */ + uint num_texcoord_gen; + uint texcoord_gen_slot[PIPE_MAX_SHADER_OUTPUTS]; int psize_slot; - - int point_coord_fs_input; /**< input for pointcoord */ }; @@ -96,30 +94,20 @@ widepoint_stage( struct draw_stage *stage ) static void set_texcoords(const struct widepoint_stage *wide, struct vertex_header *v, const float tc[4]) { + const struct draw_context *draw = wide->stage.draw; + const struct pipe_rasterizer_state *rast = draw->rasterizer; + const uint texcoord_mode = rast->sprite_coord_mode; uint i; - for (i = 0; i < wide->num_texcoords; i++) { - if (wide->texcoord_enable[i]) { - uint j = wide->texcoord_slot[i]; - v->data[j][0] = tc[0]; - if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) - v->data[j][1] = 1.0f - tc[1]; - else - v->data[j][1] = tc[1]; - v->data[j][2] = tc[2]; - v->data[j][3] = tc[3]; - } - } - if (wide->point_coord_fs_input >= 0) { - /* put gl_PointCoord into the extra vertex slot */ - uint slot = wide->stage.draw->extra_shader_outputs.slot; + for (i = 0; i < wide->num_texcoord_gen; i++) { + const uint slot = wide->texcoord_gen_slot[i]; v->data[slot][0] = tc[0]; - if (wide->texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) + if (texcoord_mode == PIPE_SPRITE_COORD_LOWER_LEFT) v->data[slot][1] = 1.0f - tc[1]; else v->data[slot][1] = tc[1]; - v->data[slot][2] = 0.0F; - v->data[slot][3] = 1.0F; + v->data[slot][2] = tc[2]; + v->data[slot][3] = tc[3]; } } @@ -201,18 +189,9 @@ static void widepoint_point( struct draw_stage *stage, } -static int -find_pntc_input_attrib(struct draw_context *draw) -{ - /* Scan the fragment program's input decls to find the pointcoord - * attribute. The xy components will store the point coord. - */ - return 0; /* XXX fix this */ -} - - -static void widepoint_first_point( struct draw_stage *stage, - struct prim_header *header ) +static void +widepoint_first_point(struct draw_stage *stage, + struct prim_header *header) { struct widepoint_stage *wide = widepoint_stage(stage); struct draw_context *draw = stage->draw; @@ -226,6 +205,7 @@ static void widepoint_first_point( struct draw_stage *stage, if (rast->gl_rasterization_rules) { wide->xbias = 0.125; + wide->ybias = -0.125; } /* Disable triangle culling, stippling, unfilled mode etc. */ @@ -243,31 +223,49 @@ static void widepoint_first_point( struct draw_stage *stage, stage->point = draw_pipe_passthrough_point; } + draw_remove_extra_vertex_attribs(draw); + if (rast->point_quad_rasterization) { - /* find vertex shader texcoord outputs */ - const struct draw_vertex_shader *vs = draw->vs.vertex_shader; - uint i, j = 0; - wide->texcoord_mode = rast->sprite_coord_mode; - for (i = 0; i < vs->info.num_outputs; i++) { - if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { - wide->texcoord_slot[j] = i; - wide->texcoord_enable[j] = (rast->sprite_coord_enable >> j) & 1; - j++; + const struct draw_fragment_shader *fs = draw->fs.fragment_shader; + uint i; + + wide->num_texcoord_gen = 0; + + /* Loop over fragment shader inputs looking for generic inputs + * for which bit 'k' in sprite_coord_enable is set. + */ + for (i = 0; i < fs->info.num_inputs; i++) { + if (fs->info.input_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { + const int generic_index = fs->info.input_semantic_index[i]; + /* Note that sprite_coord enable is a bitfield of + * PIPE_MAX_SHADER_OUTPUTS bits. + */ + if (generic_index < PIPE_MAX_SHADER_OUTPUTS && + (rast->sprite_coord_enable & (1 << generic_index))) { + /* OK, this generic attribute needs to be replaced with a + * texcoord (see above). + */ + int slot = draw_find_shader_output(draw, + TGSI_SEMANTIC_GENERIC, + generic_index); + + if (slot > 0) { + /* there's already a post-vertex shader attribute + * for this fragment shader input attribute. + */ + } + else { + /* need to allocate a new post-vertex shader attribute */ + slot = draw_alloc_extra_vertex_attrib(draw, + TGSI_SEMANTIC_GENERIC, + generic_index); + } + + /* add this slot to the texcoord-gen list */ + wide->texcoord_gen_slot[wide->num_texcoord_gen++] = slot; + } } } - wide->num_texcoords = j; - - /* find fragment shader PointCoord input */ - wide->point_coord_fs_input = find_pntc_input_attrib(draw); - - /* setup extra vp output (point coord implemented as a texcoord) */ - draw->extra_shader_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; - draw->extra_shader_outputs.semantic_index = 0; - draw->extra_shader_outputs.slot = draw_current_shader_outputs(draw); - } - else { - wide->point_coord_fs_input = -1; - draw->extra_shader_outputs.slot = 0; } wide->psize_slot = -1; @@ -294,7 +292,8 @@ static void widepoint_flush( struct draw_stage *stage, unsigned flags ) stage->point = widepoint_first_point; stage->next->flush( stage->next, flags ); - stage->draw->extra_shader_outputs.slot = 0; + + draw_remove_extra_vertex_attribs(draw); /* restore original rasterizer state */ if (draw->rast_handle) { @@ -324,9 +323,6 @@ struct draw_stage *draw_wide_point_stage( struct draw_context *draw ) if (wide == NULL) goto fail; - if (!draw_alloc_temp_verts( &wide->stage, 4 )) - goto fail; - wide->stage.draw = draw; wide->stage.name = "wide-point"; wide->stage.next = NULL; @@ -337,6 +333,9 @@ struct draw_stage *draw_wide_point_stage( struct draw_context *draw ) wide->stage.reset_stipple_counter = widepoint_reset_stipple_counter; wide->stage.destroy = widepoint_destroy; + if (!draw_alloc_temp_verts( &wide->stage, 4 )) + goto fail; + return &wide->stage; fail: diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index a2bfb693c09..d417f825a0f 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -48,6 +48,7 @@ #ifdef HAVE_LLVM #include <llvm-c/ExecutionEngine.h> +struct draw_llvm; #endif @@ -81,6 +82,9 @@ struct vertex_header { #define UNDEFINED_VERTEX_ID 0xffff +/* maximum number of shader variants we can cache */ +#define DRAW_MAX_SHADER_VARIANTS 1024 + /** * Private context for the drawing module. */ @@ -136,8 +140,7 @@ struct draw_context } middle; struct { - struct draw_pt_front_end *vcache; - struct draw_pt_front_end *varray; + struct draw_pt_front_end *vsplit; } front; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; @@ -146,6 +149,8 @@ struct draw_context struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; unsigned nr_vertex_elements; + struct pipe_index_buffer index_buffer; + /* user-space vertex data, buffers */ struct { /** vertex element/index buffer (ex: glDrawElements) */ @@ -159,9 +164,11 @@ struct draw_context /** vertex arrays */ const void *vbuffer[PIPE_MAX_ATTRIBS]; - /** constant buffer (for vertex/geometry shader) */ + /** constant buffers (for vertex/geometry shader) */ const void *vs_constants[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned vs_constants_size[PIPE_MAX_CONSTANT_BUFFERS]; const void *gs_constants[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned gs_constants_size[PIPE_MAX_CONSTANT_BUFFERS]; } user; boolean test_fse; /* enable FSE even though its not correct (eg for softpipe) */ @@ -169,13 +176,19 @@ struct draw_context } pt; struct { - boolean bypass_clipping; - boolean bypass_vs; + boolean bypass_clip_xy; + boolean bypass_clip_z; } driver; boolean flushing; /**< debugging/sanity */ boolean suspend_flushing; /**< internally set */ - boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */ + + /* Flags set if API requires clipping in these planes and the + * driver doesn't indicate that it can do it for us. + */ + boolean clip_xy; + boolean clip_z; + boolean clip_user; boolean force_passthrough; /**< never clip or shade */ @@ -194,6 +207,7 @@ struct draw_context struct pipe_viewport_state viewport; boolean identity_viewport; + /** Vertex shader state */ struct { struct draw_vertex_shader *vertex_shader; uint num_vs_outputs; /**< convenience, from vertex_shader */ @@ -223,6 +237,7 @@ struct draw_context struct translate_cache *emit_cache; } vs; + /** Geometry shader state */ struct { struct draw_geometry_shader *geometry_shader; uint num_gs_outputs; /**< convenience, from geometry_shader */ @@ -235,17 +250,31 @@ struct draw_context struct tgsi_sampler **samplers; } gs; + /** Fragment shader state */ + struct { + struct draw_fragment_shader *fragment_shader; + } fs; + + /** Stream output (vertex feedback) state */ + struct { + struct pipe_stream_output_state state; + void *buffers[PIPE_MAX_SO_BUFFERS]; + uint num_buffers; + } so; + /* Clip derived state: */ float plane[12][4]; unsigned nr_planes; + boolean depth_clamp; /* If a prim stage introduces new vertex attributes, they'll be stored here */ struct { - uint semantic_name; - uint semantic_index; - int slot; + uint num; + uint semantic_name[10]; + uint semantic_index[10]; + uint slot[10]; } extra_shader_outputs; unsigned reduced_prim; @@ -253,12 +282,51 @@ struct draw_context unsigned instance_id; #ifdef HAVE_LLVM + struct draw_llvm *llvm; LLVMExecutionEngineRef engine; #endif + struct pipe_sampler_view *sampler_views[PIPE_MAX_VERTEX_SAMPLERS]; + unsigned num_sampler_views; + const struct pipe_sampler_state *samplers[PIPE_MAX_VERTEX_SAMPLERS]; + unsigned num_samplers; + void *driver_private; }; + +struct draw_fetch_info { + boolean linear; + unsigned start; + const unsigned *elts; + unsigned count; +}; + +struct draw_vertex_info { + struct vertex_header *verts; + unsigned vertex_size; + unsigned stride; + unsigned count; +}; + +/* these flags are set if the primitive is a segment of a larger one */ +#define DRAW_SPLIT_BEFORE 0x1 +#define DRAW_SPLIT_AFTER 0x2 + +struct draw_prim_info { + boolean linear; + unsigned start; + + const ushort *elts; + unsigned count; + + unsigned prim; + unsigned flags; + unsigned *primitive_lengths; + unsigned primitive_count; +}; + + /******************************************************************************* * Draw common initialization code */ @@ -300,6 +368,11 @@ void draw_gs_destroy( struct draw_context *draw ); uint draw_current_shader_outputs(const struct draw_context *draw); uint draw_current_shader_position_output(const struct draw_context *draw); +int draw_alloc_extra_vertex_attrib(struct draw_context *draw, + uint semantic_name, uint semantic_index); +void draw_remove_extra_vertex_attribs(struct draw_context *draw); + + /******************************************************************************* * Vertex processing (was passthrough) code: */ @@ -319,35 +392,24 @@ void draw_pipeline_destroy( struct draw_context *draw ); -/* We use the top few bits in the elts[] parameter to convey a little - * API information. This limits the number of vertices we can address - * to only 4096 -- if that becomes a problem, we can switch to 32-bit - * draw indices. - * - * These flags expected at first vertex of lines & triangles when - * unfilled and/or line stipple modes are operational. +/* + * These flags are used by the pipeline when unfilled and/or line stipple modes + * are operational. */ -#define DRAW_PIPE_MAX_VERTICES (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_0 (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_1 (0x2<<12) -#define DRAW_PIPE_EDGE_FLAG_2 (0x4<<12) -#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12) -#define DRAW_PIPE_RESET_STIPPLE (0x8<<12) -#define DRAW_PIPE_FLAG_MASK (0xf<<12) +#define DRAW_PIPE_EDGE_FLAG_0 0x1 +#define DRAW_PIPE_EDGE_FLAG_1 0x2 +#define DRAW_PIPE_EDGE_FLAG_2 0x4 +#define DRAW_PIPE_EDGE_FLAG_ALL 0x7 +#define DRAW_PIPE_RESET_STIPPLE 0x8 void draw_pipeline_run( struct draw_context *draw, - unsigned prim, - struct vertex_header *vertices, - unsigned vertex_count, - unsigned stride, - const ushort *elts, - unsigned count ); + const struct draw_vertex_info *vert, + const struct draw_prim_info *prim); void draw_pipeline_run_linear( struct draw_context *draw, - unsigned prim, - struct vertex_header *vertices, - unsigned count, - unsigned stride ); + const struct draw_vertex_info *vert, + const struct draw_prim_info *prim); + diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index b853f3a89f8..f44bf2507c6 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -31,30 +31,22 @@ */ #include "draw/draw_context.h" +#include "draw/draw_gs.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" +#include "draw/draw_vs.h" #include "tgsi/tgsi_dump.h" #include "util/u_math.h" #include "util/u_prim.h" +#include "util/u_format.h" +#include "util/u_draw.h" DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE) DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE) -#ifdef HAVE_LLVM -DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE) -#endif - -static unsigned trim( unsigned count, unsigned first, unsigned incr ) -{ - if (count < first) - return 0; - return count - (count - first) % incr; -} - - /* Overall we split things into: - * - frontend -- prepare fetch_elts, draw_elts - eg vcache + * - frontend -- prepare fetch_elts, draw_elts - eg vsplit * - middle -- fetch, shade, cliptest, viewport * - pipeline -- the prim pipeline: clipping, wide lines, etc * - backend -- the vbuf_render provided by the driver. @@ -74,29 +66,35 @@ draw_pt_arrays(struct draw_context *draw, { unsigned first, incr; draw_pt_split_prim(prim, &first, &incr); - count = trim(count, first, incr); + count = draw_pt_trim_count(count, first, incr); if (count < first) return TRUE; } if (!draw->force_passthrough) { + unsigned gs_out_prim = (draw->gs.geometry_shader ? + draw->gs.geometry_shader->output_primitive : + prim); + if (!draw->render) { opt |= PT_PIPELINE; } - + if (draw_need_pipeline(draw, draw->rasterizer, - prim)) { + gs_out_prim)) { opt |= PT_PIPELINE; } - if (!draw->bypass_clipping && !draw->pt.test_fse) { + if ((draw->clip_xy || + draw->clip_z || + draw->clip_user) && !draw->pt.test_fse) { opt |= PT_CLIPTEST; } - + opt |= PT_SHADE; } - + if (draw->pt.middle.llvm) { middle = draw->pt.middle.llvm; } else { @@ -108,22 +106,11 @@ draw_pt_arrays(struct draw_context *draw, middle = draw->pt.middle.general; } - - /* Pick the right frontend - */ - if (draw->pt.user.elts || (opt & PT_PIPELINE)) { - frontend = draw->pt.front.vcache; - } else { - frontend = draw->pt.front.varray; - } + frontend = draw->pt.front.vsplit; frontend->prepare( frontend, prim, middle, opt ); - frontend->run(frontend, - draw_pt_elt_func(draw), - draw_pt_elt_ptr(draw, start), - draw->pt.user.eltBias, - count); + frontend->run(frontend, start, count); frontend->finish( frontend ); @@ -136,12 +123,8 @@ boolean draw_pt_init( struct draw_context *draw ) draw->pt.test_fse = debug_get_option_draw_fse(); draw->pt.no_fse = debug_get_option_draw_no_fse(); - draw->pt.front.vcache = draw_pt_vcache( draw ); - if (!draw->pt.front.vcache) - return FALSE; - - draw->pt.front.varray = draw_pt_varray(draw); - if (!draw->pt.front.varray) + draw->pt.front.vsplit = draw_pt_vsplit(draw); + if (!draw->pt.front.vsplit) return FALSE; draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw ); @@ -157,7 +140,7 @@ boolean draw_pt_init( struct draw_context *draw ) return FALSE; #if HAVE_LLVM - if (debug_get_option_draw_use_llvm()) + if (draw->llvm) draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw ); #endif @@ -187,14 +170,9 @@ void draw_pt_destroy( struct draw_context *draw ) draw->pt.middle.fetch_shade_emit = NULL; } - if (draw->pt.front.vcache) { - draw->pt.front.vcache->destroy( draw->pt.front.vcache ); - draw->pt.front.vcache = NULL; - } - - if (draw->pt.front.varray) { - draw->pt.front.varray->destroy( draw->pt.front.varray ); - draw->pt.front.varray = NULL; + if (draw->pt.front.vsplit) { + draw->pt.front.vsplit->destroy( draw->pt.front.vsplit ); + draw->pt.front.vsplit = NULL; } } @@ -214,24 +192,29 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) uint ii = 0; uint j; - if (draw->pt.user.elts) { + if (draw->pt.user.eltSize) { + const char *elts; + /* indexed arrays */ + elts = (const char *) draw->pt.user.elts; + elts += draw->pt.index_buffer.offset; + switch (draw->pt.user.eltSize) { case 1: { - const ubyte *elem = (const ubyte *) draw->pt.user.elts; + const ubyte *elem = (const ubyte *) elts; ii = elem[start + i]; } break; case 2: { - const ushort *elem = (const ushort *) draw->pt.user.elts; + const ushort *elem = (const ushort *) elts; ii = elem[start + i]; } break; case 4: { - const uint *elem = (const uint *) draw->pt.user.elts; + const uint *elem = (const uint *) elts; ii = elem[start + i]; } break; @@ -252,6 +235,12 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) for (j = 0; j < draw->pt.nr_vertex_elements; j++) { uint buf = draw->pt.vertex_element[j].vertex_buffer_index; ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf]; + + if (draw->pt.vertex_element[j].instance_divisor) { + ii = draw->instance_id / draw->pt.vertex_element[j].instance_divisor; + } + + ptr += draw->pt.vertex_buffer[buf].buffer_offset; ptr += draw->pt.vertex_buffer[buf].stride * ii; ptr += draw->pt.vertex_element[j].src_offset; @@ -260,31 +249,38 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) case PIPE_FORMAT_R32_FLOAT: { float *v = (float *) ptr; - debug_printf("%f @ %p\n", v[0], (void *) v); + debug_printf("R %f @ %p\n", v[0], (void *) v); } break; case PIPE_FORMAT_R32G32_FLOAT: { float *v = (float *) ptr; - debug_printf("%f %f @ %p\n", v[0], v[1], (void *) v); + debug_printf("RG %f %f @ %p\n", v[0], v[1], (void *) v); } break; case PIPE_FORMAT_R32G32B32_FLOAT: { float *v = (float *) ptr; - debug_printf("%f %f %f @ %p\n", v[0], v[1], v[2], (void *) v); + debug_printf("RGB %f %f %f @ %p\n", v[0], v[1], v[2], (void *) v); } break; case PIPE_FORMAT_R32G32B32A32_FLOAT: { float *v = (float *) ptr; - debug_printf("%f %f %f %f @ %p\n", v[0], v[1], v[2], v[3], + debug_printf("RGBA %f %f %f %f @ %p\n", v[0], v[1], v[2], v[3], (void *) v); } break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + { + ubyte *u = (ubyte *) ptr; + debug_printf("BGRA %d %d %d %d @ %p\n", u[0], u[1], u[2], u[3], + (void *) u); + } + break; default: - debug_printf("other format (fix me)\n"); - ; + debug_printf("other format %s (fix me)\n", + util_format_name(draw->pt.vertex_element[j].src_format)); } } } @@ -292,11 +288,8 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) /** - * Draw vertex arrays - * This is the main entrypoint into the drawing module. - * \param prim one of PIPE_PRIM_x - * \param start index of first vertex to draw - * \param count number of vertices to draw + * Non-instanced drawing. + * \sa draw_arrays_instanced */ void draw_arrays(struct draw_context *draw, unsigned prim, @@ -305,6 +298,11 @@ draw_arrays(struct draw_context *draw, unsigned prim, draw_arrays_instanced(draw, prim, start, count, 0, 1); } + +/** + * Instanced drawing. + * \sa draw_vbo + */ void draw_arrays_instanced(struct draw_context *draw, unsigned mode, @@ -313,40 +311,90 @@ draw_arrays_instanced(struct draw_context *draw, unsigned startInstance, unsigned instanceCount) { - unsigned reduced_prim = u_reduced_prim(mode); + struct pipe_draw_info info; + + util_draw_init_info(&info); + + info.mode = mode; + info.start = start; + info.count = count; + info.start_instance = startInstance; + info.instance_count = instanceCount; + + info.indexed = (draw->pt.user.elts != NULL); + if (!info.indexed) { + info.min_index = start; + info.max_index = start + count - 1; + } + + draw_vbo(draw, &info); +} + + +/** + * Draw vertex arrays. + * This is the main entrypoint into the drawing module. If drawing an indexed + * primitive, the draw_set_index_buffer() and draw_set_mapped_index_buffer() + * functions should have already been called to specify the element/index + * buffer information. + */ +void +draw_vbo(struct draw_context *draw, + const struct pipe_draw_info *info) +{ + unsigned reduced_prim = u_reduced_prim(info->mode); unsigned instance; + assert(info->instance_count > 0); + if (info->indexed) + assert(draw->pt.user.elts); + + draw->pt.user.eltSize = + (info->indexed) ? draw->pt.index_buffer.index_size : 0; + + draw->pt.user.eltBias = info->index_bias; + draw->pt.user.min_index = info->min_index; + draw->pt.user.max_index = info->max_index; + if (reduced_prim != draw->reduced_prim) { draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE); draw->reduced_prim = reduced_prim; } if (0) - draw_print_arrays(draw, mode, start, MIN2(count, 20)); + debug_printf("draw_vbo(mode=%u start=%u count=%u):\n", + info->mode, info->start, info->count); -#if 0 - { - int i; - debug_printf("draw_arrays(mode=%u start=%u count=%u):\n", - mode, start, count); + if (0) tgsi_dump(draw->vs.vertex_shader->state.tokens, 0); + + if (0) { + unsigned int i; debug_printf("Elements:\n"); for (i = 0; i < draw->pt.nr_vertex_elements; i++) { - debug_printf(" format=%s\n", + debug_printf(" %u: src_offset=%u inst_div=%u vbuf=%u format=%s\n", + i, + draw->pt.vertex_element[i].src_offset, + draw->pt.vertex_element[i].instance_divisor, + draw->pt.vertex_element[i].vertex_buffer_index, util_format_name(draw->pt.vertex_element[i].src_format)); } debug_printf("Buffers:\n"); for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { - debug_printf(" stride=%u offset=%u ptr=%p\n", + debug_printf(" %u: stride=%u maxindex=%u offset=%u ptr=%p\n", + i, draw->pt.vertex_buffer[i].stride, + draw->pt.vertex_buffer[i].max_index, draw->pt.vertex_buffer[i].buffer_offset, draw->pt.user.vbuffer[i]); } } -#endif - for (instance = 0; instance < instanceCount; instance++) { - draw->instance_id = instance + startInstance; - draw_pt_arrays(draw, mode, start, count); + if (0) + draw_print_arrays(draw, info->mode, info->start, MIN2(info->count, 20)); + + for (instance = 0; instance < info->instance_count; instance++) { + draw->instance_id = instance + info->start_instance; + draw_pt_arrays(draw, info->mode, info->start, info->count); } } diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h index 3e3ea320cc0..5fbb4242915 100644 --- a/src/gallium/auxiliary/draw/draw_pt.h +++ b/src/gallium/auxiliary/draw/draw_pt.h @@ -35,10 +35,10 @@ #include "pipe/p_compiler.h" -typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx ); - struct draw_pt_middle_end; struct draw_context; +struct draw_prim_info; +struct draw_vertex_info; #define PT_SHADE 0x1 @@ -50,13 +50,18 @@ struct draw_context; /* The "front end" - prepare sets of fetch, draw elements for the * middle end. * - * Currenly one version of this: - * - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims - * Later: - * - varray, varray_split - * - velement, velement_split + * The fetch elements are indices to the vertices. The draw elements are + * indices to the fetched vertices. When both arrays of elements are both + * linear, middle->run_linear is called; When only the fetch elements are + * linear, middle->run_linear_elts is called; Otherwise, middle->run is + * called. + * + * When the number of the draw elements exceeds max_vertex of the middle end, + * the draw elements (as well as the fetch elements) are splitted and the + * middle end is called multiple times. * - * Currenly only using the vcache version. + * Currenly there is: + * - vsplit - catchall implementation, splits big prims */ struct draw_pt_front_end { void (*prepare)( struct draw_pt_front_end *, @@ -65,9 +70,7 @@ struct draw_pt_front_end { unsigned opt ); void (*run)( struct draw_pt_front_end *, - pt_elt_func elt_func, - const void *elt_ptr, - int elt_bias, + unsigned start, unsigned count ); void (*finish)( struct draw_pt_front_end * ); @@ -78,6 +81,8 @@ struct draw_pt_front_end { /* The "middle end" - prepares actual hardware vertices for the * hardware backend. * + * prim_flags is as defined by pipe_draw_info::flags. + * * Currently two versions of this: * - fetch, vertex shade, cliptest, prim-pipeline * - fetch, emit (ie passthrough) @@ -92,11 +97,13 @@ struct draw_pt_middle_end { const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); void (*run_linear)(struct draw_pt_middle_end *, unsigned start, - unsigned count); + unsigned count, + unsigned prim_flags ); /* Transform all vertices in a linear range and then draw them with * the supplied element list. May fail and return FALSE. @@ -105,7 +112,8 @@ struct draw_pt_middle_end { unsigned fetch_start, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); int (*get_max_vertex_count)( struct draw_pt_middle_end * ); @@ -120,19 +128,11 @@ struct vbuf_render; struct vertex_header; -/* Helper functions. - */ -pt_elt_func draw_pt_elt_func( struct draw_context *draw ); -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ); - /* Frontends: * - * Currently only the general-purpose vcache implementation, could add - * a special case for tiny vertex buffers. + * Currently only the general-purpose vsplit implementation. */ -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ); -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw); +struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw); /* Middle-ends: @@ -162,21 +162,31 @@ void draw_pt_emit_prepare( struct pt_emit *emit, unsigned *max_vertices ); void draw_pt_emit( struct pt_emit *emit, - const float (*vertex_data)[4], - unsigned vertex_count, - unsigned stride, - const ushort *elts, - unsigned count ); + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info); void draw_pt_emit_linear( struct pt_emit *emit, - const float (*vertex_data)[4], - unsigned stride, - unsigned count ); + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info); void draw_pt_emit_destroy( struct pt_emit *emit ); struct pt_emit *draw_pt_emit_create( struct draw_context *draw ); +/******************************************************************************* + * HW stream output emit: + */ +struct pt_so_emit; + +void draw_pt_so_emit_prepare( struct pt_so_emit *emit ); + +void draw_pt_so_emit( struct pt_so_emit *emit, + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info ); + +void draw_pt_so_emit_destroy( struct pt_so_emit *emit ); + +struct pt_so_emit *draw_pt_so_emit_create( struct draw_context *draw ); /******************************************************************************* * API vertex fetch: @@ -208,12 +218,12 @@ struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw ); struct pt_post_vs; boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, - struct vertex_header *pipeline_verts, - unsigned stride, - unsigned count ); + struct draw_vertex_info *info ); void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, - boolean bypass_clipping, + boolean clip_xy, + boolean clip_z, + boolean clip_user, boolean bypass_viewport, boolean opengl, boolean need_edgeflags ); @@ -227,6 +237,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ); * Utils: */ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr); +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr); #endif diff --git a/src/gallium/auxiliary/draw/draw_pt_decompose.h b/src/gallium/auxiliary/draw/draw_pt_decompose.h index 3c44f7c11ee..3127aad7310 100644 --- a/src/gallium/auxiliary/draw/draw_pt_decompose.h +++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h @@ -1,162 +1,7 @@ +#define LOCAL_VARS \ + char *verts = (char *) vertices; \ + const boolean last_vertex_last = \ + !(draw->rasterizer->flatshade && \ + draw->rasterizer->flatshade_first); - -static void FUNC( ARGS, - unsigned count ) -{ - LOCAL_VARS; - - switch (prim) { - case PIPE_PRIM_POINTS: - for (i = 0; i < count; i ++) { - POINT( (i + 0) ); - } - break; - - case PIPE_PRIM_LINES: - for (i = 0; i+1 < count; i += 2) { - LINE( DRAW_PIPE_RESET_STIPPLE, - (i + 0), - (i + 1)); - } - break; - - case PIPE_PRIM_LINE_LOOP: - if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; - - for (i = 1; i < count; i++, flags = 0) { - LINE( flags, - (i - 1), - (i )); - } - - LINE( flags, - (i - 1), - (0 )); - } - break; - - case PIPE_PRIM_LINE_STRIP: - flags = DRAW_PIPE_RESET_STIPPLE; - for (i = 1; i < count; i++, flags = 0) { - LINE( flags, - (i - 1), - (i )); - } - break; - - case PIPE_PRIM_TRIANGLES: - for (i = 0; i+2 < count; i += 3) { - if (flatfirst) { - /* put provoking vertex in last pos for clipper */ - TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - (i + 1), - (i + 2), - (i + 0 )); - } - else { - TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - (i + 0), - (i + 1), - (i + 2 )); - } - } - break; - - case PIPE_PRIM_TRIANGLE_STRIP: - if (flatfirst) { - for (i = 0; i+2 < count; i++) { - TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - (i + 1 + (i&1)), - (i + 2 - (i&1)), - (i + 0) ); - } - } - else { - for (i = 0; i+2 < count; i++) { - TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - (i + 0 + (i&1)), - (i + 1 - (i&1)), - (i + 2 )); - } - } - break; - - case PIPE_PRIM_TRIANGLE_FAN: - if (count >= 3) { - if (flatfirst) { - for (i = 0; i+2 < count; i++) { - TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - (i + 2), - 0, - (i + 1) ); - } - } - else { - for (i = 0; i+2 < count; i++) { - TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - (0), - (i + 1), - (i + 2 )); - } - } - } - break; - - - case PIPE_PRIM_QUADS: - for (i = 0; i+3 < count; i += 4) { - QUAD( (i + 0), - (i + 1), - (i + 2), - (i + 3)); - } - break; - - case PIPE_PRIM_QUAD_STRIP: - for (i = 0; i+3 < count; i += 2) { - QUAD( (i + 2), - (i + 0), - (i + 1), - (i + 3)); - } - break; - - case PIPE_PRIM_POLYGON: - { - /* These bitflags look a little odd because we submit the - * vertices as (1,2,0) to satisfy flatshade requirements. - */ - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; - - flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; - - for (i = 0; i+2 < count; i++, flags = edge_middle) { - - if (i + 3 == count) - flags |= edge_last; - - TRIANGLE( flags, - (i + 1), - (i + 2), - (0)); - } - } - break; - - default: - assert(0); - break; - } - - FLUSH; -} - - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC +#include "draw_decompose_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c deleted file mode 100644 index 88f4d9f495a..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_elts.c +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <[email protected]> - */ - -#include "draw/draw_pt.h" -#include "draw/draw_private.h" - -/* Neat get_elt func that also works for varrays drawing by encoding - * the start value into a pointer. - */ - -static unsigned elt_uint( const void *elts, unsigned idx ) -{ - return *(((const uint *)elts) + idx); -} - -static unsigned elt_ushort( const void *elts, unsigned idx ) -{ - return *(((const ushort *)elts) + idx); -} - -static unsigned elt_ubyte( const void *elts, unsigned idx ) -{ - return *(((const ubyte *)elts) + idx); -} - -static unsigned elt_vert( const void *elts, unsigned idx ) -{ - /* unsigned index is packed in the pointer */ - return (unsigned)(uintptr_t)elts + idx; -} - -pt_elt_func draw_pt_elt_func( struct draw_context *draw ) -{ - switch (draw->pt.user.eltSize) { - case 0: return &elt_vert; - case 1: return &elt_ubyte; - case 2: return &elt_ushort; - case 4: return &elt_uint; - default: return NULL; - } -} - -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ) -{ - const char *elts = draw->pt.user.elts; - - switch (draw->pt.user.eltSize) { - case 0: - return (const void *)(((const ubyte *)NULL) + start); - case 1: - return (const void *)(((const ubyte *)elts) + start); - case 2: - return (const void *)(((const ushort *)elts) + start); - case 4: - return (const void *)(((const uint *)elts) + start); - default: - return NULL; - } -} diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index ad48fa39a4f..c8dfc16911e 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -120,22 +120,21 @@ void draw_pt_emit_prepare( struct pt_emit *emit, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* even number */ - *max_vertices = *max_vertices & ~1; } void draw_pt_emit( struct pt_emit *emit, - const float (*vertex_data)[4], - unsigned vertex_count, - unsigned stride, - const ushort *elts, - unsigned count ) + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) { + const float (*vertex_data)[4] = (const float (*)[4])vert_info->verts->data; + unsigned vertex_count = vert_info->count; + unsigned stride = vert_info->stride; + const ushort *elts = prim_info->elts; struct draw_context *draw = emit->draw; struct translate *translate = emit->translate; struct vbuf_render *render = draw->render; + unsigned start, i; void *hw_verts; /* XXX: need to flush to get prim_vbuf.c to release its allocation?? @@ -145,11 +144,6 @@ void draw_pt_emit( struct pt_emit *emit, if (vertex_count == 0) return; - if (vertex_count >= UNDEFINED_VERTEX_ID) { - assert(0); - return; - } - /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... */ @@ -180,6 +174,7 @@ void draw_pt_emit( struct pt_emit *emit, 0, ~0); + /* fetch/translate vertex attribs to fill hw_verts[] */ translate->run( translate, 0, vertex_count, @@ -190,23 +185,31 @@ void draw_pt_emit( struct pt_emit *emit, 0, vertex_count - 1 ); - render->draw(render, - elts, - count); + for (start = i = 0; + i < prim_info->primitive_count; + start += prim_info->primitive_lengths[i], i++) + { + render->draw_elements(render, + elts + start, + prim_info->primitive_lengths[i]); + } render->release_vertices(render); } void draw_pt_emit_linear(struct pt_emit *emit, - const float (*vertex_data)[4], - unsigned stride, - unsigned count) + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) { + const float (*vertex_data)[4] = (const float (*)[4])vert_info->verts->data; + unsigned stride = vert_info->stride; + unsigned count = vert_info->count; struct draw_context *draw = emit->draw; struct translate *translate = emit->translate; struct vbuf_render *render = draw->render; void *hw_verts; + unsigned start, i; #if 0 debug_printf("Linear emit\n"); @@ -215,9 +218,6 @@ void draw_pt_emit_linear(struct pt_emit *emit, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... */ @@ -258,7 +258,14 @@ void draw_pt_emit_linear(struct pt_emit *emit, render->unmap_vertices( render, 0, count - 1 ); - render->draw_arrays(render, 0, count); + for (start = i = 0; + i < prim_info->primitive_count; + start += prim_info->primitive_lengths[i], i++) + { + render->draw_arrays(render, + start, + prim_info->primitive_lengths[i]); + } render->release_vertices(render); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c index a1347221b5d..ae12ee24bdc 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c @@ -29,7 +29,6 @@ #include "util/u_math.h" #include "draw/draw_context.h" #include "draw/draw_private.h" -#include "draw/draw_vbuf.h" #include "draw/draw_pt.h" #include "translate/translate.h" #include "translate/translate_cache.h" @@ -69,31 +68,12 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch, fetch->vertex_size = vertex_size; - /* Always emit/leave space for a vertex header. - * - * It's worth considering whether the vertex headers should contain - * a pointer to the 'data', rather than having it inline. - * Something to look at after we've fully switched over to the pt - * paths. + /* Leave the clipmask/edgeflags/pad/vertex_id untouched */ - { - /* Need to set header->vertex_id = 0xffff somehow. - */ - key.element[nr].type = TRANSLATE_ELEMENT_NORMAL; - key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT; - key.element[nr].input_buffer = draw->pt.nr_vertex_buffers; - key.element[nr].input_offset = 0; - key.element[nr].instance_divisor = 0; - key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT; - key.element[nr].output_offset = dst_offset; - dst_offset += 1 * sizeof(float); - nr++; - - - /* Just leave the clip[] array untouched. - */ - dst_offset += 4 * sizeof(float); - } + dst_offset += 1 * sizeof(float); + /* Just leave the clip[] array untouched. + */ + dst_offset += 4 * sizeof(float); if (instance_id_index != ~0) { num_extra_inputs++; @@ -132,26 +112,11 @@ void draw_pt_fetch_prepare( struct pt_fetch *fetch, key.nr_elements = nr; key.output_stride = vertex_size; - if (!fetch->translate || translate_key_compare(&fetch->translate->key, &key) != 0) { translate_key_sanitize(&key); fetch->translate = translate_cache_find(fetch->cache, &key); - - { - static struct vertex_header vh = { 0, - 1, - 0, - UNDEFINED_VERTEX_ID, - { .0f, .0f, .0f, .0f } }; - - fetch->translate->set_buffer(fetch->translate, - draw->pt.nr_vertex_buffers, - &vh, - 0, - ~0); - } } } diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index d7735bf1ac9..e706b7796f8 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -36,6 +36,7 @@ #include "draw/draw_vbuf.h" #include "draw/draw_vertex.h" #include "draw/draw_pt.h" +#include "draw/draw_gs.h" #include "translate/translate.h" #include "translate/translate_cache.h" @@ -100,9 +101,14 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, boolean ok; struct translate_key key; + unsigned gs_out_prim = (draw->gs.geometry_shader ? + draw->gs.geometry_shader->output_primitive : + prim); + + ok = draw->render->set_primitive( draw->render, - prim ); + gs_out_prim ); if (!ok) { assert(0); return; @@ -185,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. - */ - *max_vertices = *max_vertices & ~1; } @@ -204,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -214,11 +212,6 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (fetch_count >= UNDEFINED_VERTEX_ID) { - assert(0); - return; - } - draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)fetch_count ); @@ -254,9 +247,9 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, /* XXX: Draw arrays path to avoid re-emitting index list again and * again. */ - draw->render->draw( draw->render, - draw_elts, - draw_count ); + draw->render->draw_elements( draw->render, + draw_elts, + draw_count ); /* Done -- that was easy, wasn't it: */ @@ -267,7 +260,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -277,9 +271,6 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)count )) @@ -328,7 +319,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -338,9 +330,6 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - return FALSE; - if (!draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)count )) @@ -363,9 +352,9 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, /* XXX: Draw arrays path to avoid re-emitting index list again and * again. */ - draw->render->draw( draw->render, - draw_elts, - draw_count ); + draw->render->draw_elements( draw->render, + draw_elts, + draw_count ); /* Done -- that was easy, wasn't it: */ diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index cbb5b6c9605..7c198c6026d 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -67,9 +67,8 @@ struct fetch_shade_emit { - static void fse_prepare( struct draw_pt_middle_end *middle, - unsigned prim, + unsigned prim, unsigned opt, unsigned *max_vertices ) { @@ -79,9 +78,12 @@ static void fse_prepare( struct draw_pt_middle_end *middle, const struct vertex_info *vinfo; unsigned i; unsigned nr_vbs = 0; - - if (!draw->render->set_primitive( draw->render, + /* Can't support geometry shader on this path. + */ + assert(!draw->gs.geometry_shader); + + if (!draw->render->set_primitive( draw->render, prim )) { assert(0); return; @@ -90,7 +92,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, /* Must do this after set_primitive() above: */ fse->vinfo = vinfo = draw->render->get_vertex_info(draw->render); - fse->key.output_stride = vinfo->size * 4; @@ -101,7 +102,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.nr_inputs); /* inputs - fetch from api format */ fse->key.viewport = !draw->identity_viewport; - fse->key.clip = !draw->bypass_clipping; + fse->key.clip = draw->clip_xy || draw->clip_z || draw->clip_user; fse->key.const_vbuffers = 0; memset(fse->key.element, 0, @@ -174,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. - */ - *max_vertices = *max_vertices & ~1; - /* Probably need to do this somewhere (or fix exec shader not to * need it): */ @@ -196,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, static void fse_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -206,9 +199,6 @@ static void fse_run_linear( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)count )) @@ -264,7 +254,8 @@ fse_run(struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -274,9 +265,6 @@ fse_run(struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (fetch_count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)fetch_count )) @@ -307,9 +295,9 @@ fse_run(struct draw_pt_middle_end *middle, draw->render->unmap_vertices( draw->render, 0, (ushort)(fetch_count - 1) ); - draw->render->draw( draw->render, - draw_elts, - draw_count ); + draw->render->draw_elements( draw->render, + draw_elts, + draw_count ); draw->render->release_vertices( draw->render ); @@ -326,7 +314,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -336,9 +325,6 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - return FALSE; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)count )) @@ -357,9 +343,9 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, hw_verts ); - draw->render->draw( draw->render, - draw_elts, - draw_count ); + draw->render->draw_elements( draw->render, + draw_elts, + draw_count ); draw->render->unmap_vertices( draw->render, 0, (ushort)(count - 1) ); diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index da5106463a7..b72fd612451 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -27,6 +27,7 @@ #include "util/u_math.h" #include "util/u_memory.h" +#include "util/u_prim.h" #include "draw/draw_context.h" #include "draw/draw_vbuf.h" #include "draw/draw_vertex.h" @@ -40,16 +41,16 @@ struct fetch_pipeline_middle_end { struct draw_context *draw; struct pt_emit *emit; + struct pt_so_emit *so_emit; struct pt_fetch *fetch; struct pt_post_vs *post_vs; unsigned vertex_data_offset; unsigned vertex_size; - unsigned prim; + unsigned input_prim; unsigned opt; }; - static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, unsigned prim, unsigned opt, @@ -61,6 +62,10 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, unsigned i; unsigned instance_id_index = ~0; + unsigned gs_out_prim = (draw->gs.geometry_shader ? + draw->gs.geometry_shader->output_primitive : + prim); + /* Add one to num_outputs because the pipeline occasionally tags on * an additional texcoord, eg for AA lines. */ @@ -76,7 +81,7 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, } } - fpme->prim = prim; + fpme->input_prim = prim; fpme->opt = opt; /* Always leave room for the vertex header whether we need it or @@ -95,173 +100,177 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, * but gl vs dx9 clip spaces. */ draw_pt_post_vs_prepare( fpme->post_vs, - (boolean)draw->bypass_clipping, - (boolean)draw->identity_viewport, + draw->clip_xy, + draw->clip_z, + draw->clip_user, + draw->identity_viewport, (boolean)draw->rasterizer->gl_rasterization_rules, - (draw->vs.edgeflag_output ? true : false) ); + (draw->vs.edgeflag_output ? TRUE : FALSE) ); + + draw_pt_so_emit_prepare( fpme->so_emit ); if (!(opt & PT_PIPELINE)) { - draw_pt_emit_prepare( fpme->emit, - prim, + draw_pt_emit_prepare( fpme->emit, + gs_out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } - /* return even number */ - *max_vertices = *max_vertices & ~1; - /* No need to prepare the shader. */ vs->prepare(vs, draw); } - -static void fetch_pipeline_run( struct draw_pt_middle_end *middle, - const unsigned *fetch_elts, - unsigned fetch_count, - const ushort *draw_elts, - unsigned draw_count ) +static void fetch( struct pt_fetch *fetch, + const struct draw_fetch_info *fetch_info, + char *output) { - struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; - struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *vshader = draw->vs.vertex_shader; - struct draw_geometry_shader *gshader = draw->gs.geometry_shader; - unsigned opt = fpme->opt; - unsigned alloc_count = align( fetch_count, 4 ); - - struct vertex_header *pipeline_verts = - (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); - - if (!pipeline_verts) { - /* Not much we can do here - just skip the rendering. - */ - assert(0); - return; + if (fetch_info->linear) { + draw_pt_fetch_run_linear( fetch, + fetch_info->start, + fetch_info->count, + output ); } - - /* Fetch into our vertex buffer - */ - draw_pt_fetch_run( fpme->fetch, - fetch_elts, - fetch_count, - (char *)pipeline_verts ); - - /* Run the shader, note that this overwrites the data[] parts of - * the pipeline verts. - */ - if (opt & PT_SHADE) - { - vshader->run_linear(vshader, - (const float (*)[4])pipeline_verts->data, - ( float (*)[4])pipeline_verts->data, - draw->pt.user.vs_constants, - fetch_count, - fpme->vertex_size, - fpme->vertex_size); - if (gshader) - draw_geometry_shader_run(gshader, - (const float (*)[4])pipeline_verts->data, - ( float (*)[4])pipeline_verts->data, - draw->pt.user.gs_constants, - fetch_count, - fpme->vertex_size, - fpme->vertex_size); + else { + draw_pt_fetch_run( fetch, + fetch_info->elts, + fetch_info->count, + output ); } +} - if (draw_pt_post_vs_run( fpme->post_vs, - pipeline_verts, - fetch_count, - fpme->vertex_size )) - { - opt |= PT_PIPELINE; - } - /* Do we need to run the pipeline? - */ - if (opt & PT_PIPELINE) { +static void pipeline(struct fetch_pipeline_middle_end *fpme, + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) +{ + if (prim_info->linear) + draw_pipeline_run_linear( fpme->draw, + vert_info, + prim_info); + else draw_pipeline_run( fpme->draw, - fpme->prim, - pipeline_verts, - fetch_count, - fpme->vertex_size, - draw_elts, - draw_count ); + vert_info, + prim_info ); +} + +static void emit(struct pt_emit *emit, + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) +{ + if (prim_info->linear) { + draw_pt_emit_linear(emit, vert_info, prim_info); } else { - draw_pt_emit( fpme->emit, - (const float (*)[4])pipeline_verts->data, - fetch_count, - fpme->vertex_size, - draw_elts, - draw_count ); + draw_pt_emit(emit, vert_info, prim_info); } +} - FREE(pipeline_verts); +static void draw_vertex_shader_run(struct draw_vertex_shader *vshader, + const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS], + const struct draw_vertex_info *input_verts, + struct draw_vertex_info *output_verts ) +{ + output_verts->vertex_size = input_verts->vertex_size; + output_verts->stride = input_verts->vertex_size; + output_verts->count = input_verts->count; + output_verts->verts = + (struct vertex_header *)MALLOC(output_verts->vertex_size * + align(output_verts->count, 4)); + + vshader->run_linear(vshader, + (const float (*)[4])input_verts->verts->data, + ( float (*)[4])output_verts->verts->data, + constants, + const_size, + input_verts->count, + input_verts->vertex_size, + input_verts->vertex_size); } - -static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, - unsigned start, - unsigned count) +static void fetch_pipeline_generic( struct draw_pt_middle_end *middle, + const struct draw_fetch_info *fetch_info, + const struct draw_prim_info *prim_info ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *shader = draw->vs.vertex_shader; - struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader; + struct draw_vertex_shader *vshader = draw->vs.vertex_shader; + struct draw_geometry_shader *gshader = draw->gs.geometry_shader; + struct draw_prim_info gs_prim_info; + struct draw_vertex_info fetched_vert_info; + struct draw_vertex_info vs_vert_info; + struct draw_vertex_info gs_vert_info; + struct draw_vertex_info *vert_info; unsigned opt = fpme->opt; - unsigned alloc_count = align( count, 4 ); - struct vertex_header *pipeline_verts = - (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); - - if (!pipeline_verts) { - /* Not much we can do here - just skip the rendering. - */ + fetched_vert_info.count = fetch_info->count; + fetched_vert_info.vertex_size = fpme->vertex_size; + fetched_vert_info.stride = fpme->vertex_size; + fetched_vert_info.verts = + (struct vertex_header *)MALLOC(fpme->vertex_size * + align(fetch_info->count, 4)); + if (!fetched_vert_info.verts) { assert(0); return; } - /* Fetch into our vertex buffer + /* Fetch into our vertex buffer. + */ + fetch( fpme->fetch, fetch_info, (char *)fetched_vert_info.verts ); + + /* Finished with fetch: */ - draw_pt_fetch_run_linear( fpme->fetch, - start, - count, - (char *)pipeline_verts ); + fetch_info = NULL; + vert_info = &fetched_vert_info; /* Run the shader, note that this overwrites the data[] parts of * the pipeline verts. */ - if (opt & PT_SHADE) - { - shader->run_linear(shader, - (const float (*)[4])pipeline_verts->data, - ( float (*)[4])pipeline_verts->data, - draw->pt.user.vs_constants, - count, - fpme->vertex_size, - fpme->vertex_size); - - if (geometry_shader) - draw_geometry_shader_run(geometry_shader, - (const float (*)[4])pipeline_verts->data, - ( float (*)[4])pipeline_verts->data, - draw->pt.user.gs_constants, - count, - fpme->vertex_size, - fpme->vertex_size); + if (fpme->opt & PT_SHADE) { + draw_vertex_shader_run(vshader, + draw->pt.user.vs_constants, + draw->pt.user.vs_constants_size, + vert_info, + &vs_vert_info); + + FREE(vert_info->verts); + vert_info = &vs_vert_info; + } + + if ((fpme->opt & PT_SHADE) && gshader) { + draw_geometry_shader_run(gshader, + draw->pt.user.gs_constants, + draw->pt.user.gs_constants_size, + vert_info, + prim_info, + &gs_vert_info, + &gs_prim_info); + + FREE(vert_info->verts); + vert_info = &gs_vert_info; + prim_info = &gs_prim_info; } + + /* Stream output needs to be done before clipping. + * + * XXX: Stream output surely needs to respect the prim_info->elt + * lists. + */ + draw_pt_so_emit( fpme->so_emit, + vert_info, + prim_info ); + if (draw_pt_post_vs_run( fpme->post_vs, - pipeline_verts, - count, - fpme->vertex_size )) + vert_info )) { opt |= PT_PIPELINE; } @@ -269,102 +278,102 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, /* Do we need to run the pipeline? */ if (opt & PT_PIPELINE) { - draw_pipeline_run_linear( fpme->draw, - fpme->prim, - pipeline_verts, - count, - fpme->vertex_size); + pipeline( fpme, + vert_info, + prim_info ); } else { - draw_pt_emit_linear( fpme->emit, - (const float (*)[4])pipeline_verts->data, - fpme->vertex_size, - count ); + emit( fpme->emit, + vert_info, + prim_info ); } - - FREE(pipeline_verts); + FREE(vert_info->verts); } +static void fetch_pipeline_run( struct draw_pt_middle_end *middle, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count, + unsigned prim_flags ) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + struct draw_fetch_info fetch_info; + struct draw_prim_info prim_info; + + fetch_info.linear = FALSE; + fetch_info.start = 0; + fetch_info.elts = fetch_elts; + fetch_info.count = fetch_count; + + prim_info.linear = FALSE; + prim_info.start = 0; + prim_info.count = draw_count; + prim_info.elts = draw_elts; + prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; + prim_info.primitive_count = 1; + prim_info.primitive_lengths = &draw_count; + + fetch_pipeline_generic( middle, &fetch_info, &prim_info ); +} -static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle, - unsigned start, - unsigned count, - const ushort *draw_elts, - unsigned draw_count ) +static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count, + unsigned prim_flags) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; - struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *shader = draw->vs.vertex_shader; - struct draw_geometry_shader *geometry_shader = draw->gs.geometry_shader; - unsigned opt = fpme->opt; - unsigned alloc_count = align( count, 4 ); + struct draw_fetch_info fetch_info; + struct draw_prim_info prim_info; + + fetch_info.linear = TRUE; + fetch_info.start = start; + fetch_info.count = count; + fetch_info.elts = NULL; + + prim_info.linear = TRUE; + prim_info.start = 0; + prim_info.count = count; + prim_info.elts = NULL; + prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; + prim_info.primitive_count = 1; + prim_info.primitive_lengths = &count; + + fetch_pipeline_generic( middle, &fetch_info, &prim_info ); +} - struct vertex_header *pipeline_verts = - (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); - if (!pipeline_verts) - return FALSE; - /* Fetch into our vertex buffer - */ - draw_pt_fetch_run_linear( fpme->fetch, - start, - count, - (char *)pipeline_verts ); +static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count, + const ushort *draw_elts, + unsigned draw_count, + unsigned prim_flags ) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + struct draw_fetch_info fetch_info; + struct draw_prim_info prim_info; - /* Run the shader, note that this overwrites the data[] parts of - * the pipeline verts. - */ - if (opt & PT_SHADE) - { - shader->run_linear(shader, - (const float (*)[4])pipeline_verts->data, - ( float (*)[4])pipeline_verts->data, - draw->pt.user.vs_constants, - count, - fpme->vertex_size, - fpme->vertex_size); - - if (geometry_shader) - draw_geometry_shader_run(geometry_shader, - (const float (*)[4])pipeline_verts->data, - ( float (*)[4])pipeline_verts->data, - draw->pt.user.gs_constants, - count, - fpme->vertex_size, - fpme->vertex_size); - } + fetch_info.linear = TRUE; + fetch_info.start = start; + fetch_info.count = count; + fetch_info.elts = NULL; - if (draw_pt_post_vs_run( fpme->post_vs, - pipeline_verts, - count, - fpme->vertex_size )) - { - opt |= PT_PIPELINE; - } + prim_info.linear = FALSE; + prim_info.start = 0; + prim_info.count = draw_count; + prim_info.elts = draw_elts; + prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; + prim_info.primitive_count = 1; + prim_info.primitive_lengths = &draw_count; - /* Do we need to run the pipeline? - */ - if (opt & PT_PIPELINE) { - draw_pipeline_run( fpme->draw, - fpme->prim, - pipeline_verts, - count, - fpme->vertex_size, - draw_elts, - draw_count ); - } - else { - draw_pt_emit( fpme->emit, - (const float (*)[4])pipeline_verts->data, - count, - fpme->vertex_size, - draw_elts, - draw_count ); - } + fetch_pipeline_generic( middle, &fetch_info, &prim_info ); - FREE(pipeline_verts); return TRUE; } @@ -385,6 +394,9 @@ static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle ) if (fpme->emit) draw_pt_emit_destroy( fpme->emit ); + if (fpme->so_emit) + draw_pt_so_emit_destroy( fpme->so_emit ); + if (fpme->post_vs) draw_pt_post_vs_destroy( fpme->post_vs ); @@ -416,7 +428,11 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context * goto fail; fpme->emit = draw_pt_emit_create( draw ); - if (!fpme->emit) + if (!fpme->emit) + goto fail; + + fpme->so_emit = draw_pt_so_emit_create( draw ); + if (!fpme->so_emit) goto fail; return &fpme->base; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index d2a492f2b4c..77291e304e1 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2010 VMWare, Inc. + * Copyright 2010 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -28,11 +28,11 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "draw/draw_context.h" +#include "draw/draw_gs.h" #include "draw/draw_vbuf.h" #include "draw/draw_vertex.h" #include "draw/draw_pt.h" #include "draw/draw_vs.h" -#include "draw/draw_gs.h" #include "draw/draw_llvm.h" @@ -41,53 +41,59 @@ struct llvm_middle_end { struct draw_context *draw; struct pt_emit *emit; + struct pt_so_emit *so_emit; struct pt_fetch *fetch; struct pt_post_vs *post_vs; unsigned vertex_data_offset; unsigned vertex_size; - unsigned prim; + unsigned input_prim; unsigned opt; struct draw_llvm *llvm; - struct draw_llvm_variant *variants; struct draw_llvm_variant *current_variant; - int nr_variants; }; static void llvm_middle_end_prepare( struct draw_pt_middle_end *middle, - unsigned prim, + unsigned in_prim, unsigned opt, unsigned *max_vertices ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_context *draw = fpme->draw; - struct draw_vertex_shader *vs = draw->vs.vertex_shader; - struct draw_geometry_shader *gs = draw->gs.geometry_shader; - struct draw_llvm_variant_key key; + struct llvm_vertex_shader *shader = + llvm_vertex_shader(draw->vs.vertex_shader); + char store[DRAW_LLVM_MAX_VARIANT_KEY_SIZE]; + struct draw_llvm_variant_key *key; struct draw_llvm_variant *variant = NULL; + struct draw_llvm_variant_list_item *li; unsigned i; unsigned instance_id_index = ~0; + + unsigned out_prim = (draw->gs.geometry_shader ? + draw->gs.geometry_shader->output_primitive : + in_prim); + /* Add one to num_outputs because the pipeline occasionally tags on * an additional texcoord, eg for AA lines. */ - unsigned nr = MAX2( vs->info.num_inputs, - vs->info.num_outputs + 1 ); + unsigned nr = MAX2( shader->base.info.num_inputs, + shader->base.info.num_outputs + 1 ); /* Scan for instanceID system value. */ - for (i = 0; i < vs->info.num_inputs; i++) { - if (vs->info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) { + for (i = 0; i < shader->base.info.num_inputs; i++) { + if (shader->base.info.input_semantic_name[i] == TGSI_SEMANTIC_INSTANCEID) { instance_id_index = i; break; } } - fpme->prim = prim; + fpme->input_prim = in_prim; fpme->opt = opt; /* Always leave room for the vertex header whether we need it or @@ -97,57 +103,71 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float); - - draw_pt_fetch_prepare( fpme->fetch, - vs->info.num_inputs, - fpme->vertex_size, - instance_id_index ); - if (opt & PT_SHADE) { - vs->prepare(vs, draw); - draw_geometry_shader_prepare(gs, draw); - } - - /* XXX: it's not really gl rasterization rules we care about here, * but gl vs dx9 clip spaces. */ draw_pt_post_vs_prepare( fpme->post_vs, - (boolean)draw->bypass_clipping, - (boolean)(draw->identity_viewport), + draw->clip_xy, + draw->clip_z, + draw->clip_user, + draw->identity_viewport, (boolean)draw->rasterizer->gl_rasterization_rules, - (draw->vs.edgeflag_output ? true : false) ); + (draw->vs.edgeflag_output ? TRUE : FALSE) ); + + draw_pt_so_emit_prepare( fpme->so_emit ); if (!(opt & PT_PIPELINE)) { draw_pt_emit_prepare( fpme->emit, - prim, + out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } /* return even number */ *max_vertices = *max_vertices & ~1; + + key = draw_llvm_make_variant_key(fpme->llvm, store); - draw_llvm_make_variant_key(fpme->llvm, &key); - - variant = fpme->variants; - while(variant) { - if(memcmp(&variant->key, &key, sizeof key) == 0) + li = first_elem(&shader->variants); + while(!at_end(&shader->variants, li)) { + if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) { + variant = li->base; break; + } + li = next_elem(li); + } - variant = variant->next; + if (variant) { + move_to_head(&fpme->llvm->vs_variants_list, &variant->list_item_global); } + else { + unsigned i; + if (fpme->llvm->nr_variants >= DRAW_MAX_SHADER_VARIANTS) { + /* + * XXX: should we flush here ? + */ + for (i = 0; i < DRAW_MAX_SHADER_VARIANTS / 4; i++) { + struct draw_llvm_variant_list_item *item = + last_elem(&fpme->llvm->vs_variants_list); + draw_llvm_destroy_variant(item->base); + } + } + + variant = draw_llvm_create_variant(fpme->llvm, nr, key); - if (!variant) { - variant = draw_llvm_prepare(fpme->llvm, nr); - variant->next = fpme->variants; - fpme->variants = variant; - ++fpme->nr_variants; + if (variant) { + insert_at_head(&shader->variants, &variant->list_item_local); + insert_at_head(&fpme->llvm->vs_variants_list, &variant->list_item_global); + fpme->llvm->nr_variants++; + shader->variants_cached++; + } } + fpme->current_variant = variant; /*XXX we only support one constant buffer */ @@ -158,125 +178,174 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, } +static void pipeline(struct llvm_middle_end *llvm, + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) +{ + if (prim_info->linear) + draw_pipeline_run_linear( llvm->draw, + vert_info, + prim_info); + else + draw_pipeline_run( llvm->draw, + vert_info, + prim_info ); +} + +static void emit(struct pt_emit *emit, + const struct draw_vertex_info *vert_info, + const struct draw_prim_info *prim_info) +{ + if (prim_info->linear) { + draw_pt_emit_linear(emit, vert_info, prim_info); + } + else { + draw_pt_emit(emit, vert_info, prim_info); + } +} -static void llvm_middle_end_run( struct draw_pt_middle_end *middle, - const unsigned *fetch_elts, - unsigned fetch_count, - const ushort *draw_elts, - unsigned draw_count ) +static void +llvm_pipeline_generic( struct draw_pt_middle_end *middle, + const struct draw_fetch_info *fetch_info, + const struct draw_prim_info *prim_info ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_context *draw = fpme->draw; + struct draw_geometry_shader *gshader = draw->gs.geometry_shader; + struct draw_prim_info gs_prim_info; + struct draw_vertex_info llvm_vert_info; + struct draw_vertex_info gs_vert_info; + struct draw_vertex_info *vert_info; unsigned opt = fpme->opt; - unsigned alloc_count = align( fetch_count, 4 ); - - struct vertex_header *pipeline_verts = - (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); - if (!pipeline_verts) { - /* Not much we can do here - just skip the rendering. - */ + llvm_vert_info.count = fetch_info->count; + llvm_vert_info.vertex_size = fpme->vertex_size; + llvm_vert_info.stride = fpme->vertex_size; + llvm_vert_info.verts = + (struct vertex_header *)MALLOC(fpme->vertex_size * + align(fetch_info->count, 4)); + if (!llvm_vert_info.verts) { assert(0); return; } - fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context, - pipeline_verts, - (const char **)draw->pt.user.vbuffer, - fetch_elts, - fetch_count, - fpme->vertex_size, - draw->pt.vertex_buffer ); - - if (draw_pt_post_vs_run( fpme->post_vs, - pipeline_verts, - fetch_count, - fpme->vertex_size )) - { + if (fetch_info->linear) + fpme->current_variant->jit_func( &fpme->llvm->jit_context, + llvm_vert_info.verts, + (const char **)draw->pt.user.vbuffer, + fetch_info->start, + fetch_info->count, + fpme->vertex_size, + draw->pt.vertex_buffer, + draw->instance_id); + else + fpme->current_variant->jit_func_elts( &fpme->llvm->jit_context, + llvm_vert_info.verts, + (const char **)draw->pt.user.vbuffer, + fetch_info->elts, + fetch_info->count, + fpme->vertex_size, + draw->pt.vertex_buffer, + draw->instance_id); + + /* Finished with fetch and vs: + */ + fetch_info = NULL; + vert_info = &llvm_vert_info; + + + if ((opt & PT_SHADE) && gshader) { + draw_geometry_shader_run(gshader, + draw->pt.user.gs_constants, + draw->pt.user.gs_constants_size, + vert_info, + prim_info, + &gs_vert_info, + &gs_prim_info); + + FREE(vert_info->verts); + vert_info = &gs_vert_info; + prim_info = &gs_prim_info; + } + + /* stream output needs to be done before clipping */ + draw_pt_so_emit( fpme->so_emit, + vert_info, + prim_info ); + + if (draw_pt_post_vs_run( fpme->post_vs, vert_info )) { opt |= PT_PIPELINE; } /* Do we need to run the pipeline? */ if (opt & PT_PIPELINE) { - draw_pipeline_run( fpme->draw, - fpme->prim, - pipeline_verts, - fetch_count, - fpme->vertex_size, - draw_elts, - draw_count ); + pipeline( fpme, + vert_info, + prim_info ); } else { - draw_pt_emit( fpme->emit, - (const float (*)[4])pipeline_verts->data, - fetch_count, - fpme->vertex_size, - draw_elts, - draw_count ); + emit( fpme->emit, + vert_info, + prim_info ); } + FREE(vert_info->verts); +} - FREE(pipeline_verts); +static void llvm_middle_end_run( struct draw_pt_middle_end *middle, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count, + unsigned prim_flags ) +{ + struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; + struct draw_fetch_info fetch_info; + struct draw_prim_info prim_info; + + fetch_info.linear = FALSE; + fetch_info.start = 0; + fetch_info.elts = fetch_elts; + fetch_info.count = fetch_count; + + prim_info.linear = FALSE; + prim_info.start = 0; + prim_info.count = draw_count; + prim_info.elts = draw_elts; + prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; + prim_info.primitive_count = 1; + prim_info.primitive_lengths = &draw_count; + + llvm_pipeline_generic( middle, &fetch_info, &prim_info ); } static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; - struct draw_context *draw = fpme->draw; - unsigned opt = fpme->opt; - unsigned alloc_count = align( count, 4 ); - - struct vertex_header *pipeline_verts = - (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); - - if (!pipeline_verts) { - /* Not much we can do here - just skip the rendering. - */ - assert(0); - return; - } - -#if 0 - debug_printf("#### Pipeline = %p (data = %p)\n", - pipeline_verts, pipeline_verts->data); -#endif - fpme->current_variant->jit_func( &fpme->llvm->jit_context, - pipeline_verts, - (const char **)draw->pt.user.vbuffer, - start, - count, - fpme->vertex_size, - draw->pt.vertex_buffer ); - - if (draw_pt_post_vs_run( fpme->post_vs, - pipeline_verts, - count, - fpme->vertex_size )) - { - opt |= PT_PIPELINE; - } - - /* Do we need to run the pipeline? - */ - if (opt & PT_PIPELINE) { - draw_pipeline_run_linear( fpme->draw, - fpme->prim, - pipeline_verts, - count, - fpme->vertex_size); - } - else { - draw_pt_emit_linear( fpme->emit, - (const float (*)[4])pipeline_verts->data, - fpme->vertex_size, - count ); - } - - FREE(pipeline_verts); + struct draw_fetch_info fetch_info; + struct draw_prim_info prim_info; + + fetch_info.linear = TRUE; + fetch_info.start = start; + fetch_info.count = count; + fetch_info.elts = NULL; + + prim_info.linear = TRUE; + prim_info.start = 0; + prim_info.count = count; + prim_info.elts = NULL; + prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; + prim_info.primitive_count = 1; + prim_info.primitive_lengths = &count; + + llvm_pipeline_generic( middle, &fetch_info, &prim_info ); } @@ -286,56 +355,29 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; - struct draw_context *draw = fpme->draw; - unsigned opt = fpme->opt; - unsigned alloc_count = align( count, 4 ); - - struct vertex_header *pipeline_verts = - (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); - - if (!pipeline_verts) - return FALSE; - - fpme->current_variant->jit_func( &fpme->llvm->jit_context, - pipeline_verts, - (const char **)draw->pt.user.vbuffer, - start, - count, - fpme->vertex_size, - draw->pt.vertex_buffer ); - - if (draw_pt_post_vs_run( fpme->post_vs, - pipeline_verts, - count, - fpme->vertex_size )) - { - opt |= PT_PIPELINE; - } + struct draw_fetch_info fetch_info; + struct draw_prim_info prim_info; - /* Do we need to run the pipeline? - */ - if (opt & PT_PIPELINE) { - draw_pipeline_run( fpme->draw, - fpme->prim, - pipeline_verts, - count, - fpme->vertex_size, - draw_elts, - draw_count ); - } - else { - draw_pt_emit( fpme->emit, - (const float (*)[4])pipeline_verts->data, - count, - fpme->vertex_size, - draw_elts, - draw_count ); - } + fetch_info.linear = TRUE; + fetch_info.start = start; + fetch_info.count = count; + fetch_info.elts = NULL; + + prim_info.linear = FALSE; + prim_info.start = 0; + prim_info.count = draw_count; + prim_info.elts = draw_elts; + prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; + prim_info.primitive_count = 1; + prim_info.primitive_lengths = &draw_count; + + llvm_pipeline_generic( middle, &fetch_info, &prim_info ); - FREE(pipeline_verts); return TRUE; } @@ -356,17 +398,18 @@ static void llvm_middle_end_destroy( struct draw_pt_middle_end *middle ) if (fpme->emit) draw_pt_emit_destroy( fpme->emit ); + if (fpme->so_emit) + draw_pt_so_emit_destroy( fpme->so_emit ); + if (fpme->post_vs) draw_pt_post_vs_destroy( fpme->post_vs ); - if (fpme->llvm) - draw_llvm_destroy( fpme->llvm ); - FREE(middle); } -struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_context *draw ) +struct draw_pt_middle_end * +draw_pt_fetch_pipeline_or_emit_llvm(struct draw_context *draw) { struct llvm_middle_end *fpme = 0; @@ -398,13 +441,15 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit_llvm( struct draw_cont if (!fpme->emit) goto fail; - fpme->llvm = draw_llvm_create(draw); + fpme->so_emit = draw_pt_so_emit_create( draw ); + if (!fpme->so_emit) + goto fail; + + fpme->llvm = draw->llvm; if (!fpme->llvm) goto fail; - fpme->variants = NULL; fpme->current_variant = NULL; - fpme->nr_variants = 0; return &fpme->base; diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c index 5525dfc748d..769409cfd67 100644 --- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c +++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c @@ -26,22 +26,38 @@ **************************************************************************/ #include "util/u_memory.h" +#include "util/u_math.h" #include "pipe/p_context.h" #include "draw/draw_context.h" #include "draw/draw_private.h" -#include "draw/draw_vbuf.h" #include "draw/draw_pt.h" + +#define DO_CLIP_XY 0x1 +#define DO_CLIP_FULL_Z 0x2 +#define DO_CLIP_HALF_Z 0x4 +#define DO_CLIP_USER 0x8 +#define DO_VIEWPORT 0x10 +#define DO_EDGEFLAG 0x20 + + struct pt_post_vs { struct draw_context *draw; + unsigned flags; + boolean (*run)( struct pt_post_vs *pvs, - struct vertex_header *vertices, - unsigned count, - unsigned stride ); + struct draw_vertex_info *info ); }; - +static INLINE void +initialize_vertex_header(struct vertex_header *header) +{ + header->clipmask = 0; + header->edgeflag = 1; + header->pad = 0; + header->vertex_id = UNDEFINED_VERTEX_ID; +} static INLINE float dot4(const float *a, const float *b) @@ -52,215 +68,121 @@ dot4(const float *a, const float *b) a[3]*b[3]); } +#define FLAGS (0) +#define TAG(x) x##_none +#include "draw_cliptest_tmp.h" +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT) +#define TAG(x) x##_xy_fullz_viewport +#include "draw_cliptest_tmp.h" -static INLINE unsigned -compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr) -{ - unsigned mask = 0x0; - unsigned i; - -#if 0 - debug_printf("compute clipmask %f %f %f %f\n", - clip[0], clip[1], clip[2], clip[3]); - assert(clip[3] != 0.0); -#endif - - /* Do the hardwired planes first: - */ - if (-clip[0] + clip[3] < 0) mask |= (1<<0); - if ( clip[0] + clip[3] < 0) mask |= (1<<1); - if (-clip[1] + clip[3] < 0) mask |= (1<<2); - if ( clip[1] + clip[3] < 0) mask |= (1<<3); - if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */ - if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */ - - /* Followed by any remaining ones: - */ - for (i = 6; i < nr; i++) { - if (dot4(clip, plane[i]) < 0) - mask |= (1<<i); - } - - return mask; -} - - -/* The normal case - cliptest, rhw divide, viewport transform. - * - * Also handle identity viewport here at the expense of a few wasted - * instructions - */ -static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs, - struct vertex_header *vertices, - unsigned count, - unsigned stride ) -{ - struct vertex_header *out = vertices; - const float *scale = pvs->draw->viewport.scale; - const float *trans = pvs->draw->viewport.translate; - const unsigned pos = draw_current_shader_position_output(pvs->draw); - unsigned clipped = 0; - unsigned j; - - if (0) debug_printf("%s\n", __FUNCTION__); - - for (j = 0; j < count; j++) { - float *position = out->data[pos]; - -#if 0 - debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n", - j, out, position, position[0], position[1], position[2], position[3]); -#endif - - out->clip[0] = position[0]; - out->clip[1] = position[1]; - out->clip[2] = position[2]; - out->clip[3] = position[3]; - - out->vertex_id = 0xffff; - out->clipmask = compute_clipmask_gl(out->clip, - pvs->draw->plane, - pvs->draw->nr_planes); - clipped += out->clipmask; - - if (out->clipmask == 0) - { - /* divide by w */ - float w = 1.0f / position[3]; - - /* Viewport mapping */ - position[0] = position[0] * w * scale[0] + trans[0]; - position[1] = position[1] * w * scale[1] + trans[1]; - position[2] = position[2] * w * scale[2] + trans[2]; - position[3] = w; -#if 0 - debug_printf("post viewport: %f %f %f %f\n", - position[0], - position[1], - position[2], - position[3]); -#endif - } - - out = (struct vertex_header *)( (char *)out + stride ); - } - - return clipped != 0; -} - - +#define FLAGS (DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT) +#define TAG(x) x##_xy_halfz_viewport +#include "draw_cliptest_tmp.h" -/* As above plus edgeflags - */ -static boolean -post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs, - struct vertex_header *vertices, - unsigned count, - unsigned stride ) -{ - unsigned j; - boolean needpipe; +#define FLAGS (DO_CLIP_FULL_Z | DO_VIEWPORT) +#define TAG(x) x##_fullz_viewport +#include "draw_cliptest_tmp.h" - needpipe = post_vs_cliptest_viewport_gl( pvs, vertices, count, stride); +#define FLAGS (DO_CLIP_HALF_Z | DO_VIEWPORT) +#define TAG(x) x##_halfz_viewport +#include "draw_cliptest_tmp.h" - /* If present, copy edgeflag VS output into vertex header. - * Otherwise, leave header as is. - */ - if (pvs->draw->vs.edgeflag_output) { - struct vertex_header *out = vertices; - int ef = pvs->draw->vs.edgeflag_output; - - for (j = 0; j < count; j++) { - const float *edgeflag = out->data[ef]; - out->edgeflag = !(edgeflag[0] != 1.0f); - needpipe |= !out->edgeflag; - out = (struct vertex_header *)( (char *)out + stride ); - } - } - return needpipe; -} +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT) +#define TAG(x) x##_xy_fullz_user_viewport +#include "draw_cliptest_tmp.h" +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT | DO_EDGEFLAG) +#define TAG(x) x##_xy_fullz_user_viewport_edgeflag +#include "draw_cliptest_tmp.h" -/* If bypass_clipping is set, skip cliptest and rhw divide. +/* Don't want to create 64 versions of this function, so catch the + * less common ones here. This is looking like something which should + * be code-generated, perhaps appended to the end of the vertex + * shader. */ -static boolean post_vs_viewport( struct pt_post_vs *pvs, - struct vertex_header *vertices, - unsigned count, - unsigned stride ) -{ - struct vertex_header *out = vertices; - const float *scale = pvs->draw->viewport.scale; - const float *trans = pvs->draw->viewport.translate; - const unsigned pos = draw_current_shader_position_output(pvs->draw); - unsigned j; - - if (0) debug_printf("%s\n", __FUNCTION__); - for (j = 0; j < count; j++) { - float *position = out->data[pos]; - - /* Viewport mapping only, no cliptest/rhw divide - */ - position[0] = position[0] * scale[0] + trans[0]; - position[1] = position[1] * scale[1] + trans[1]; - position[2] = position[2] * scale[2] + trans[2]; - - out = (struct vertex_header *)((char *)out + stride); - } - - return FALSE; -} +#define FLAGS (pvs->flags) +#define TAG(x) x##_generic +#include "draw_cliptest_tmp.h" -/* If bypass_clipping is set and we have an identity viewport, nothing - * to do. - */ -static boolean post_vs_none( struct pt_post_vs *pvs, - struct vertex_header *vertices, - unsigned count, - unsigned stride ) -{ - if (0) debug_printf("%s\n", __FUNCTION__); - return FALSE; -} boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, - struct vertex_header *pipeline_verts, - unsigned count, - unsigned stride ) + struct draw_vertex_info *info ) { - return pvs->run( pvs, pipeline_verts, count, stride ); + return pvs->run( pvs, info ); } void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, - boolean bypass_clipping, + boolean clip_xy, + boolean clip_z, + boolean clip_user, boolean bypass_viewport, boolean opengl, boolean need_edgeflags ) { - if (!need_edgeflags) { - if (bypass_clipping) { - if (bypass_viewport) - pvs->run = post_vs_none; - else - pvs->run = post_vs_viewport; - } - else { - /* if (opengl) */ - pvs->run = post_vs_cliptest_viewport_gl; - } + pvs->flags = 0; + + if (clip_xy) + pvs->flags |= DO_CLIP_XY; + + if (clip_z && opengl) { + pvs->flags |= DO_CLIP_FULL_Z; + ASSIGN_4V( pvs->draw->plane[4], 0, 0, 1, 1 ); } - else { - /* If we need to copy edgeflags to the vertex header, it should - * mean we're running the primitive pipeline. Hence the bypass - * flags should be false. - */ - assert(!bypass_clipping); - assert(!bypass_viewport); - pvs->run = post_vs_cliptest_viewport_gl_edgeflag; + + if (clip_z && !opengl) { + pvs->flags |= DO_CLIP_HALF_Z; + ASSIGN_4V( pvs->draw->plane[4], 0, 0, 1, 0 ); + } + + if (clip_user) + pvs->flags |= DO_CLIP_USER; + + if (!bypass_viewport) + pvs->flags |= DO_VIEWPORT; + + if (need_edgeflags) + pvs->flags |= DO_EDGEFLAG; + + /* Now select the relevant function: + */ + switch (pvs->flags) { + case 0: + pvs->run = do_cliptest_none; + break; + + case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT: + pvs->run = do_cliptest_xy_fullz_viewport; + break; + + case DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT: + pvs->run = do_cliptest_xy_halfz_viewport; + break; + + case DO_CLIP_FULL_Z | DO_VIEWPORT: + pvs->run = do_cliptest_fullz_viewport; + break; + + case DO_CLIP_HALF_Z | DO_VIEWPORT: + pvs->run = do_cliptest_halfz_viewport; + break; + + case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT: + pvs->run = do_cliptest_xy_fullz_user_viewport; + break; + + case (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | + DO_VIEWPORT | DO_EDGEFLAG): + pvs->run = do_cliptest_xy_fullz_user_viewport_edgeflag; + break; + + default: + pvs->run = do_cliptest_generic; + break; } } @@ -272,7 +194,7 @@ struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw ) return NULL; pvs->draw = draw; - + return pvs; } diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c new file mode 100644 index 00000000000..c86bdd99a33 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c @@ -0,0 +1,293 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + +struct pt_so_emit { + struct draw_context *draw; + + void *buffers[PIPE_MAX_SO_BUFFERS]; + + unsigned input_vertex_stride; + const float (*inputs)[4]; + + boolean has_so; + + boolean single_buffer; + + unsigned emitted_primitives; + unsigned emitted_vertices; +}; + + +void draw_pt_so_emit_prepare(struct pt_so_emit *emit) +{ + struct draw_context *draw = emit->draw; + + emit->has_so = (draw->so.state.num_outputs > 0); + + /* if we have a state with outputs make sure we have + * buffers to output to */ + if (emit->has_so) { + boolean has_valid_buffer = FALSE; + unsigned i; + for (i = 0; i < draw->so.num_buffers; ++i) { + if (draw->so.buffers[i]) { + has_valid_buffer = TRUE; + break; + } + } + emit->has_so = has_valid_buffer; + } + + if (!emit->has_so) + return; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); +} + +static boolean +is_component_writable(unsigned mask, + unsigned compo) +{ + switch (mask) { + case TGSI_WRITEMASK_NONE: + return FALSE; + case TGSI_WRITEMASK_X: + return compo == 0; + case TGSI_WRITEMASK_Y: + return compo == 1; + case TGSI_WRITEMASK_XY: + return compo == 0 || compo == 1; + case TGSI_WRITEMASK_Z: + return compo == 2; + case TGSI_WRITEMASK_XZ: + return compo == 0 || compo == 2; + case TGSI_WRITEMASK_YZ: + return compo == 1 || compo == 2; + case TGSI_WRITEMASK_XYZ: + return compo == 0 || compo == 1 || compo == 2; + case TGSI_WRITEMASK_W: + return compo == 3; + case TGSI_WRITEMASK_XW: + return compo == 0 || compo == 3; + case TGSI_WRITEMASK_YW: + return compo == 1 || compo == 3; + case TGSI_WRITEMASK_XYW: + return compo == 0 || compo == 1 || compo == 3; + case TGSI_WRITEMASK_ZW: + return compo == 2 || compo == 3; + case TGSI_WRITEMASK_XZW: + return compo == 0 || compo == 1 || compo == 3; + case TGSI_WRITEMASK_YZW: + return compo == 1 || compo == 2 || compo == 4; + case TGSI_WRITEMASK_XYZW: + return compo < 4; + default: + debug_assert(!"Unknown writemask in stream out"); + return compo < 4; + } +} + +static void so_emit_prim(struct pt_so_emit *so, + unsigned *indices, + unsigned num_vertices) +{ + unsigned slot, i; + unsigned input_vertex_stride = so->input_vertex_stride; + struct draw_context *draw = so->draw; + const float (*input_ptr)[4]; + const struct pipe_stream_output_state *state = + &draw->so.state; + float **buffer = 0; + + input_ptr = so->inputs; + + for (i = 0; i < num_vertices; ++i) { + const float (*input)[4]; + unsigned total_written_compos = 0; + /*debug_printf("%d) vertex index = %d (prim idx = %d)\n", i, indices[i], prim_idx);*/ + input = (const float (*)[4])( + (const char *)input_ptr + (indices[i] * input_vertex_stride)); + for (slot = 0; slot < state->num_outputs; ++slot) { + unsigned idx = state->register_index[slot]; + unsigned writemask = state->register_mask[slot]; + unsigned written_compos = 0; + unsigned compo; + + buffer = (float**)&so->buffers[state->output_buffer[slot]]; + + /*debug_printf("\tSlot = %d, vs_slot = %d, idx = %d:\n", + slot, vs_slot, idx);*/ +#if 1 + assert(!util_is_inf_or_nan(input[idx][0])); + assert(!util_is_inf_or_nan(input[idx][1])); + assert(!util_is_inf_or_nan(input[idx][2])); + assert(!util_is_inf_or_nan(input[idx][3])); +#endif + for (compo = 0; compo < 4; ++compo) { + if (is_component_writable(writemask, compo)) { + float *buf = *buffer; + buf[written_compos++] = input[idx][compo]; + } + } +#if 0 + debug_printf("\t\t(writemask = %d)%f %f %f %f\n", + writemask, + input[idx][0], + input[idx][1], + input[idx][2], + input[idx][3]); +#endif + *buffer += written_compos; + total_written_compos += written_compos; + } + if (so->single_buffer) { + int stride = (int)state->stride - + sizeof(float) * total_written_compos; + + debug_assert(stride >= 0); + *buffer = (float*) (((char*)*buffer) + stride); + } + } + so->emitted_vertices += num_vertices; + ++so->emitted_primitives; +} + +static void so_point(struct pt_so_emit *so, int idx) +{ + unsigned indices[1]; + + indices[0] = idx; + + so_emit_prim(so, indices, 1); +} + +static void so_line(struct pt_so_emit *so, int i0, int i1) +{ + unsigned indices[2]; + + indices[0] = i0; + indices[1] = i1; + + so_emit_prim(so, indices, 2); +} + +static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2) +{ + unsigned indices[3]; + + indices[0] = i0; + indices[1] = i1; + indices[2] = i2; + + so_emit_prim(so, indices, 3); +} + + +#define FUNC so_run_linear +#define GET_ELT(idx) (start + (idx)) +#include "draw_so_emit_tmp.h" + + +#define FUNC so_run_elts +#define LOCAL_VARS const ushort *elts = input_prims->elts; +#define GET_ELT(idx) (elts[start + (idx)]) +#include "draw_so_emit_tmp.h" + + +void draw_pt_so_emit( struct pt_so_emit *emit, + const struct draw_vertex_info *input_verts, + const struct draw_prim_info *input_prims ) +{ + struct draw_context *draw = emit->draw; + struct vbuf_render *render = draw->render; + unsigned start, i; + + if (!emit->has_so) + return; + + emit->emitted_vertices = 0; + emit->emitted_primitives = 0; + emit->input_vertex_stride = input_verts->stride; + emit->inputs = (const float (*)[4])input_verts->verts->data; + for (i = 0; i < draw->so.num_buffers; ++i) { + emit->buffers[i] = draw->so.buffers[i]; + } + emit->single_buffer = TRUE; + for (i = 0; i < draw->so.state.num_outputs; ++i) { + if (draw->so.state.output_buffer[i] != 0) + emit->single_buffer = FALSE; + } + + /* XXX: need to flush to get prim_vbuf.c to release its allocation??*/ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + for (start = i = 0; i < input_prims->primitive_count; + start += input_prims->primitive_lengths[i], i++) + { + unsigned count = input_prims->primitive_lengths[i]; + + if (input_prims->linear) { + so_run_linear(emit, input_prims, input_verts, + start, count); + } else { + so_run_elts(emit, input_prims, input_verts, + start, count); + } + } + + render->set_stream_output_info(render, + emit->emitted_primitives, + emit->emitted_vertices); +} + + +struct pt_so_emit *draw_pt_so_emit_create( struct draw_context *draw ) +{ + struct pt_so_emit *emit = CALLOC_STRUCT(pt_so_emit); + if (!emit) + return NULL; + + emit->draw = draw; + + return emit; +} + +void draw_pt_so_emit_destroy( struct pt_so_emit *emit ) +{ + FREE(emit); +} diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c index 3236d38e6ab..513bbbed216 100644 --- a/src/gallium/auxiliary/draw/draw_pt_util.c +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -53,7 +53,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; case PIPE_PRIM_LINES_ADJACENCY: *first = 4; - *incr = 2; + *incr = 4; break; case PIPE_PRIM_LINE_STRIP_ADJACENCY: *first = 4; @@ -65,7 +65,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; case PIPE_PRIM_TRIANGLES_ADJACENCY: *first = 6; - *incr = 3; + *incr = 6; break; case PIPE_PRIM_TRIANGLE_STRIP: case PIPE_PRIM_TRIANGLE_FAN: @@ -75,7 +75,7 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: *first = 6; - *incr = 1; + *incr = 2; break; case PIPE_PRIM_QUADS: *first = 4; @@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) break; } } + +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr) +{ + if (count < first) + return 0; + return count - (count - first) % incr; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c deleted file mode 100644 index d0e16c9bc3c..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray.c +++ /dev/null @@ -1,193 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - -#include "util/u_math.h" -#include "util/u_memory.h" - -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - -#define FETCH_MAX 256 -#define DRAW_MAX (FETCH_MAX+8) - -struct varray_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned driver_fetch_max; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; -}; - - -static void varray_flush_linear(struct varray_frontend *varray, - unsigned start, unsigned count) -{ - if (count) { - assert(varray->middle->run_linear); - varray->middle->run_linear(varray->middle, start, count); - } -} - -static void varray_line_loop_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count, - boolean end ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 1) { - unsigned nr = 0, i; - - for (i = 0; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - if (end) - varray->fetch_elts[nr++] = start; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. linear */ - nr); - } -} - - - -static void varray_fan_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 2) { - unsigned nr = 0, i; - - if (segment_start != 0) - varray->fetch_elts[nr++] = start; - - for (i = 0 ; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. linear */ - nr); - } -} - - - - -#define FUNC varray_run -#include "draw_pt_varray_tmp_linear.h" - -static unsigned decompose_prim[PIPE_PRIM_POLYGON + 1] = { - PIPE_PRIM_POINTS, - PIPE_PRIM_LINES, - PIPE_PRIM_LINE_STRIP, /* decomposed LINELOOP */ - PIPE_PRIM_LINE_STRIP, - PIPE_PRIM_TRIANGLES, - PIPE_PRIM_TRIANGLE_STRIP, - PIPE_PRIM_TRIANGLE_FAN, - PIPE_PRIM_QUADS, - PIPE_PRIM_QUAD_STRIP, - PIPE_PRIM_POLYGON -}; - - - -static void varray_prepare(struct draw_pt_front_end *frontend, - unsigned prim, - struct draw_pt_middle_end *middle, - unsigned opt) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - - varray->base.run = varray_run; - - varray->input_prim = prim; - varray->output_prim = decompose_prim[prim]; - - varray->middle = middle; - middle->prepare(middle, varray->output_prim, opt, &varray->driver_fetch_max ); - - /* check that the max is even */ - assert((varray->driver_fetch_max & 1) == 0); - - varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max); -} - - - - -static void varray_finish(struct draw_pt_front_end *frontend) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - varray->middle->finish(varray->middle); - varray->middle = NULL; -} - -static void varray_destroy(struct draw_pt_front_end *frontend) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw) -{ - ushort i; - struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend); - if (varray == NULL) - return NULL; - - varray->base.prepare = varray_prepare; - varray->base.run = NULL; - varray->base.finish = varray_finish; - varray->base.destroy = varray_destroy; - varray->draw = draw; - - for (i = 0; i < DRAW_MAX; i++) { - varray->draw_elts[i] = i; - } - - return &varray->base; -} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h deleted file mode 100644 index 7c722457c3c..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h +++ /dev/null @@ -1,238 +0,0 @@ - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - struct draw_context *draw = varray->draw; - unsigned start = (unsigned)elts; - - boolean flatfirst = (draw->rasterizer->flatshade && - draw->rasterizer->flatshade_first); - unsigned i, j; - ushort flags; - unsigned first, incr; - - varray->fetch_start = start; - - draw_pt_split_prim(varray->input_prim, &first, &incr); - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i < end; i++) { - POINT(varray, i + 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+1 < end; i += 2) { - LINE(varray, DRAW_PIPE_RESET_STIPPLE, - i + 0, i + 1); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINE_LOOP: - if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; - - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - LINE(varray, flags, i - 1, 0); - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - case PIPE_PRIM_LINE_STRIP: - flags = DRAW_PIPE_RESET_STIPPLE; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i += 3) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLE_STRIP: - if (flatfirst) { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1 + (i&1), i + 2 - (i&1)); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - else { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i + 2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0 + (i&1), i + 1 - (i&1), i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - break; - - case PIPE_PRIM_TRIANGLE_FAN: - if (count >= 3) { - if (flatfirst) { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - else { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - } - break; - - case PIPE_PRIM_QUADS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 4) { - QUAD(varray, i + 0, i + 1, i + 2, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_QUAD_STRIP: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 2) { - QUAD(varray, i + 2, i + 0, i + 1, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - break; - - case PIPE_PRIM_POLYGON: - { - /* These bitflags look a little odd because we submit the - * vertices as (1,2,0) to satisfy flatshade requirements. - */ - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; - - flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++, flags = edge_middle) { - - if (i + 3 == count) - flags |= edge_last; - - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - default: - assert(0); - break; - } - - varray_flush(varray); -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h deleted file mode 100644 index a292346be95..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h +++ /dev/null @@ -1,98 +0,0 @@ -static unsigned trim( unsigned count, unsigned first, unsigned incr ) -{ - return count - (count - first) % incr; -} - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - unsigned start = (unsigned) ((char *) elts - (char *) NULL); - - unsigned j; - unsigned first, incr; - - assert(elt_bias == 0); - - draw_pt_split_prim(varray->input_prim, &first, &incr); - - /* Sanitize primitive length: - */ - count = trim(count, first, incr); - if (count < first) - return; - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_QUADS: - case PIPE_PRIM_QUAD_STRIP: - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr ); - varray_flush_linear(varray, start + j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - case PIPE_PRIM_LINE_LOOP: - /* Always have to decompose as we've stated that this will be - * emitted as a line-strip. - */ - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_line_loop_segment(varray, start, j, nr, nr == remaining); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - - case PIPE_PRIM_POLYGON: - case PIPE_PRIM_TRIANGLE_FAN: - if (count < varray->driver_fetch_max) { - varray_flush_linear(varray, start, count); - } - else { - for ( j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_fan_segment(varray, start, j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - } - break; - - default: - assert(0); - break; - } -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c deleted file mode 100644 index 37ffbac4f92..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache.c +++ /dev/null @@ -1,523 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <[email protected]> - */ - -#include "util/u_memory.h" -#include "util/u_prim.h" -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - - -#define CACHE_MAX 256 -#define FETCH_MAX 256 -#define DRAW_MAX (16*1024) - -struct vcache_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - unsigned in[CACHE_MAX]; - ushort out[CACHE_MAX]; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned draw_count; - unsigned fetch_count; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; - - unsigned middle_prim; - unsigned opt; -}; - -static INLINE void -vcache_flush( struct vcache_frontend *vcache ) -{ - if (vcache->middle_prim != vcache->output_prim) { - vcache->middle_prim = vcache->output_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - if (vcache->draw_count) { - vcache->middle->run( vcache->middle, - vcache->fetch_elts, - vcache->fetch_count, - vcache->draw_elts, - vcache->draw_count ); - } - - memset(vcache->in, ~0, sizeof(vcache->in)); - vcache->fetch_count = 0; - vcache->draw_count = 0; -} - -static INLINE void -vcache_check_flush( struct vcache_frontend *vcache ) -{ - if ( vcache->draw_count + 6 >= DRAW_MAX || - vcache->fetch_count + 4 >= FETCH_MAX ) - { - vcache_flush( vcache ); - } -} - - -static INLINE void -vcache_elt( struct vcache_frontend *vcache, - unsigned felt, - ushort flags ) -{ - unsigned idx = felt % CACHE_MAX; - - if (vcache->in[idx] != felt) { - assert(vcache->fetch_count < FETCH_MAX); - - vcache->in[idx] = felt; - vcache->out[idx] = (ushort)vcache->fetch_count; - vcache->fetch_elts[vcache->fetch_count++] = felt; - } - - vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags; -} - - - -static INLINE void -vcache_triangle( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - -static INLINE void -vcache_line( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_point( struct vcache_frontend *vcache, - unsigned i0 ) -{ - vcache_elt(vcache, i0, 0); - vcache_check_flush(vcache); -} - -static INLINE void -vcache_quad( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3 ) -{ - vcache_triangle( vcache, i0, i1, i3 ); - vcache_triangle( vcache, i1, i2, i3 ); -} - -static INLINE void -vcache_ef_quad( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1, - unsigned i2, - unsigned i3 ) -{ - if (vcache->draw->rasterizer->flatshade_first) { - vcache_triangle_flags( vcache, - ( DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_0 | - DRAW_PIPE_EDGE_FLAG_1 ), - i0, i1, i2 ); - - vcache_triangle_flags( vcache, - ( DRAW_PIPE_EDGE_FLAG_2 | - DRAW_PIPE_EDGE_FLAG_1 ), - i0, i2, i3 ); - } - else { - vcache_triangle_flags( vcache, - ( DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_0 | - DRAW_PIPE_EDGE_FLAG_2 ), - i0, i1, i3 ); - - vcache_triangle_flags( vcache, - ( DRAW_PIPE_EDGE_FLAG_0 | - DRAW_PIPE_EDGE_FLAG_1 ), - i1, i2, i3 ); - } -} - -/* At least for now, we're back to using a template include file for - * this. The two paths aren't too different though - it may be - * possible to reunify them. - */ -#define TRIANGLE(vc,flags,i0,i1,i2) vcache_triangle_flags(vc,flags,i0,i1,i2) -#define QUAD(vc,i0,i1,i2,i3) vcache_ef_quad(vc,i0,i1,i2,i3) -#define LINE(vc,flags,i0,i1) vcache_line_flags(vc,flags,i0,i1) -#define POINT(vc,i0) vcache_point(vc,i0) -#define FUNC vcache_run_extras -#include "draw_pt_vcache_tmp.h" - -#define TRIANGLE(vc,flags,i0,i1,i2) vcache_triangle(vc,i0,i1,i2) -#define QUAD(vc,i0,i1,i2,i3) vcache_quad(vc,i0,i1,i2,i3) -#define LINE(vc,flags,i0,i1) vcache_line(vc,i0,i1) -#define POINT(vc,i0) vcache_point(vc,i0) -#define FUNC vcache_run -#include "draw_pt_vcache_tmp.h" - -static INLINE void -rebase_uint_elts( const unsigned *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - -static INLINE void -rebase_ushort_elts( const ushort *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - -static INLINE void -rebase_ubyte_elts( const ubyte *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - - -static INLINE void -translate_uint_elts( const unsigned *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - -static INLINE void -translate_ushort_elts( const ushort *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - -static INLINE void -translate_ubyte_elts( const ubyte *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - - - -#if 0 -static INLINE enum pipe_format -format_from_get_elt( pt_elt_func get_elt ) -{ - switch (draw->pt.user.eltSize) { - case 1: return PIPE_FORMAT_R8_UNORM; - case 2: return PIPE_FORMAT_R16_UNORM; - case 4: return PIPE_FORMAT_R32_UNORM; - default: return PIPE_FORMAT_NONE; - } -} -#endif - -static INLINE void -vcache_check_run( struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned draw_count ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - struct draw_context *draw = vcache->draw; - unsigned min_index = draw->pt.user.min_index; - unsigned max_index = draw->pt.user.max_index; - unsigned index_size = draw->pt.user.eltSize; - unsigned fetch_count = max_index + 1 - min_index; - const ushort *transformed_elts; - ushort *storage = NULL; - boolean ok = FALSE; - - - if (0) debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, - vcache->fetch_max, - draw_count); - - if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES || - fetch_count >= UNDEFINED_VERTEX_ID || - fetch_count > draw_count) { - if (0) debug_printf("fail\n"); - goto fail; - } - - if (vcache->middle_prim != vcache->input_prim) { - vcache->middle_prim = vcache->input_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - - assert((elt_bias >= 0 && min_index + elt_bias >= min_index) || - (elt_bias < 0 && min_index + elt_bias < min_index)); - - if (min_index == 0 && - index_size == 2) - { - transformed_elts = (const ushort *)elts; - } - else - { - storage = MALLOC( draw_count * sizeof(ushort) ); - if (!storage) - goto fail; - - if (min_index == 0) { - switch(index_size) { - case 1: - translate_ubyte_elts( (const ubyte *)elts, - draw_count, - storage ); - break; - - case 2: - translate_ushort_elts( (const ushort *)elts, - draw_count, - storage ); - break; - - case 4: - translate_uint_elts( (const uint *)elts, - draw_count, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - else { - switch(index_size) { - case 1: - rebase_ubyte_elts( (const ubyte *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 2: - rebase_ushort_elts( (const ushort *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 4: - rebase_uint_elts( (const uint *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - transformed_elts = storage; - } - - if (fetch_count < UNDEFINED_VERTEX_ID) - ok = vcache->middle->run_linear_elts( vcache->middle, - min_index + elt_bias, /* start */ - fetch_count, - transformed_elts, - draw_count ); - - FREE(storage); - - if (ok) - return; - - debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n", - fetch_count, draw_count); - - fail: - vcache_run( frontend, get_elt, elts, elt_bias, draw_count ); -} - - - - -static void -vcache_prepare( struct draw_pt_front_end *frontend, - unsigned prim, - struct draw_pt_middle_end *middle, - unsigned opt ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - - if (opt & PT_PIPELINE) - { - vcache->base.run = vcache_run_extras; - } - else - { - vcache->base.run = vcache_check_run; - } - - vcache->input_prim = prim; - vcache->output_prim = u_reduced_prim(prim); - - vcache->middle = middle; - vcache->opt = opt; - - /* Have to run prepare here, but try and guess a good prim for - * doing so: - */ - vcache->middle_prim = (opt & PT_PIPELINE) ? vcache->output_prim : vcache->input_prim; - middle->prepare( middle, vcache->middle_prim, opt, &vcache->fetch_max ); -} - - - - -static void -vcache_finish( struct draw_pt_front_end *frontend ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - vcache->middle->finish( vcache->middle ); - vcache->middle = NULL; -} - -static void -vcache_destroy( struct draw_pt_front_end *frontend ) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ) -{ - struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend ); - if (vcache == NULL) - return NULL; - - vcache->base.prepare = vcache_prepare; - vcache->base.run = NULL; - vcache->base.finish = vcache_finish; - vcache->base.destroy = vcache_destroy; - vcache->draw = draw; - - memset(vcache->in, ~0, sizeof(vcache->in)); - - return &vcache->base; -} diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h deleted file mode 100644 index f7a63de3ba9..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h +++ /dev/null @@ -1,197 +0,0 @@ - - -static void FUNC( struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned count ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - struct draw_context *draw = vcache->draw; - - boolean flatfirst = (draw->rasterizer->flatshade && - draw->rasterizer->flatshade_first); - unsigned i; - ushort flags; - - if (0) debug_printf("%s %d\n", __FUNCTION__, count); - - - switch (vcache->input_prim) { - case PIPE_PRIM_POINTS: - for (i = 0; i < count; i ++) { - POINT( vcache, - get_elt(elts, i + 0) + elt_bias ); - } - break; - - case PIPE_PRIM_LINES: - for (i = 0; i+1 < count; i += 2) { - LINE( vcache, - DRAW_PIPE_RESET_STIPPLE, - get_elt(elts, i + 0) + elt_bias, - get_elt(elts, i + 1) + elt_bias); - } - break; - - case PIPE_PRIM_LINE_LOOP: - if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; - - for (i = 1; i < count; i++, flags = 0) { - LINE( vcache, - flags, - get_elt(elts, i - 1) + elt_bias, - get_elt(elts, i ) + elt_bias); - } - - LINE( vcache, - flags, - get_elt(elts, i - 1) + elt_bias, - get_elt(elts, 0 ) + elt_bias); - } - break; - - case PIPE_PRIM_LINE_STRIP: - flags = DRAW_PIPE_RESET_STIPPLE; - for (i = 1; i < count; i++, flags = 0) { - LINE( vcache, - flags, - get_elt(elts, i - 1) + elt_bias, - get_elt(elts, i ) + elt_bias); - } - break; - - case PIPE_PRIM_TRIANGLES: - for (i = 0; i+2 < count; i += 3) { - TRIANGLE( vcache, - DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - get_elt(elts, i + 0) + elt_bias, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 2 ) + elt_bias); - } - break; - - case PIPE_PRIM_TRIANGLE_STRIP: - if (flatfirst) { - for (i = 0; i+2 < count; i++) { - TRIANGLE( vcache, - DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - get_elt(elts, i + 0) + elt_bias, - get_elt(elts, i + 1 + (i&1)) + elt_bias, - get_elt(elts, i + 2 - (i&1)) + elt_bias); - } - } - else { - for (i = 0; i+2 < count; i++) { - TRIANGLE( vcache, - DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - get_elt(elts, i + 0 + (i&1)) + elt_bias, - get_elt(elts, i + 1 - (i&1)) + elt_bias, - get_elt(elts, i + 2 ) + elt_bias); - } - } - break; - - case PIPE_PRIM_TRIANGLE_FAN: - if (count >= 3) { - if (flatfirst) { - for (i = 0; i+2 < count; i++) { - TRIANGLE( vcache, - DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 2) + elt_bias, - get_elt(elts, 0 ) + elt_bias); - } - } - else { - for (i = 0; i+2 < count; i++) { - TRIANGLE( vcache, - DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - get_elt(elts, 0) + elt_bias, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 2 ) + elt_bias); - } - } - } - break; - - - case PIPE_PRIM_QUADS: - for (i = 0; i+3 < count; i += 4) { - QUAD( vcache, - get_elt(elts, i + 0) + elt_bias, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 2) + elt_bias, - get_elt(elts, i + 3) + elt_bias ); - } - break; - - case PIPE_PRIM_QUAD_STRIP: - for (i = 0; i+3 < count; i += 2) { - QUAD( vcache, - get_elt(elts, i + 2) + elt_bias, - get_elt(elts, i + 0) + elt_bias, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 3) + elt_bias ); - } - break; - - case PIPE_PRIM_POLYGON: - { - /* These bitflags look a little odd because we submit the - * vertices as (1,2,0) to satisfy flatshade requirements. - */ - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; - ushort edge_next, edge_finish; - - if (flatfirst) { - flags = DRAW_PIPE_RESET_STIPPLE | edge_middle | edge_last; - edge_next = edge_last; - edge_finish = edge_first; - } - else { - flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; - edge_next = edge_middle; - edge_finish = edge_last; - } - - for (i = 0; i+2 < count; i++, flags = edge_next) { - - if (i + 3 == count) - flags |= edge_finish; - - if (flatfirst) { - TRIANGLE( vcache, - flags, - get_elt(elts, 0) + elt_bias, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 2) + elt_bias ); - } - else { - TRIANGLE( vcache, - flags, - get_elt(elts, i + 1) + elt_bias, - get_elt(elts, i + 2) + elt_bias, - get_elt(elts, 0) + elt_bias); - } - } - } - break; - - default: - assert(0); - break; - } - - vcache_flush( vcache ); -} - - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c new file mode 100644 index 00000000000..a6875253094 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c @@ -0,0 +1,208 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +#define SEGMENT_SIZE 1024 +#define MAP_SIZE 256 + +struct vsplit_frontend { + struct draw_pt_front_end base; + struct draw_context *draw; + + unsigned prim; + + struct draw_pt_middle_end *middle; + + unsigned max_vertices; + ushort segment_size; + + /* buffers for splitting */ + unsigned fetch_elts[SEGMENT_SIZE]; + ushort draw_elts[SEGMENT_SIZE]; + ushort identity_draw_elts[SEGMENT_SIZE]; + + struct { + /* map a fetch element to a draw element */ + unsigned fetches[MAP_SIZE]; + ushort draws[MAP_SIZE]; + boolean has_max_fetch; + + ushort num_fetch_elts; + ushort num_draw_elts; + } cache; +}; + + +static void +vsplit_clear_cache(struct vsplit_frontend *vsplit) +{ + memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches)); + vsplit->cache.has_max_fetch = FALSE; + vsplit->cache.num_fetch_elts = 0; + vsplit->cache.num_draw_elts = 0; +} + +static void +vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags) +{ + vsplit->middle->run(vsplit->middle, + vsplit->fetch_elts, vsplit->cache.num_fetch_elts, + vsplit->draw_elts, vsplit->cache.num_draw_elts, flags); +} + +/** + * Add a fetch element and add it to the draw elements. + */ +static INLINE void +vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch) +{ + unsigned hash = fetch % MAP_SIZE; + + if (vsplit->cache.fetches[hash] != fetch) { + /* update cache */ + vsplit->cache.fetches[hash] = fetch; + vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts; + + /* add fetch */ + assert(vsplit->cache.num_fetch_elts < vsplit->segment_size); + vsplit->fetch_elts[vsplit->cache.num_fetch_elts++] = fetch; + } + + vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[hash]; +} + + +/** + * Add a fetch element and add it to the draw elements. The fetch element is + * in full range (uint). + */ +static INLINE void +vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch) +{ + /* special care for 0xffffffff */ + if (fetch == 0xffffffff && !vsplit->cache.has_max_fetch) { + unsigned hash = fetch % MAP_SIZE; + vsplit->cache.fetches[hash] = fetch - 1; /* force update */ + vsplit->cache.has_max_fetch = TRUE; + } + + vsplit_add_cache(vsplit, fetch); +} + + +#define FUNC vsplit_run_linear +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ubyte +#define ELT_TYPE ubyte +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ushort +#define ELT_TYPE ushort +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_uint +#define ELT_TYPE uint +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + + +static void vsplit_prepare(struct draw_pt_front_end *frontend, + unsigned in_prim, + struct draw_pt_middle_end *middle, + unsigned opt) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + + switch (vsplit->draw->pt.user.eltSize) { + case 0: + vsplit->base.run = vsplit_run_linear; + break; + case 1: + vsplit->base.run = vsplit_run_ubyte; + break; + case 2: + vsplit->base.run = vsplit_run_ushort; + break; + case 4: + vsplit->base.run = vsplit_run_uint; + break; + default: + assert(0); + break; + } + + /* split only */ + vsplit->prim = in_prim; + + vsplit->middle = middle; + middle->prepare(middle, vsplit->prim, opt, &vsplit->max_vertices); + + vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices); +} + + +static void vsplit_finish(struct draw_pt_front_end *frontend) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + vsplit->middle->finish(vsplit->middle); + vsplit->middle = NULL; +} + + +static void vsplit_destroy(struct draw_pt_front_end *frontend) +{ + FREE(frontend); +} + + +struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw) +{ + struct vsplit_frontend *vsplit = CALLOC_STRUCT(vsplit_frontend); + ushort i; + + if (!vsplit) + return NULL; + + vsplit->base.prepare = vsplit_prepare; + vsplit->base.run = NULL; + vsplit->base.finish = vsplit_finish; + vsplit->base.destroy = vsplit_destroy; + vsplit->draw = draw; + + for (i = 0; i < SEGMENT_SIZE; i++) + vsplit->identity_draw_elts[i] = i; + + return &vsplit->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h new file mode 100644 index 00000000000..3f66f962e11 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h @@ -0,0 +1,309 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#define CONCAT2(name, elt_type) name ## elt_type +#define CONCAT(name, elt_type) CONCAT2(name, elt_type) + +#ifdef ELT_TYPE + +/** + * Fetch all elements in [min_index, max_index] with bias, and use the + * (rebased) index buffer as the draw elements. + */ +static boolean +CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned istart, unsigned icount) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) + ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset); + const unsigned min_index = draw->pt.user.min_index; + const unsigned max_index = draw->pt.user.max_index; + const int elt_bias = draw->pt.user.eltBias; + unsigned fetch_start, fetch_count; + const ushort *draw_elts = NULL; + unsigned i; + + /* use the ib directly */ + if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) { + if (icount > vsplit->max_vertices) + return FALSE; + + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + assert(idx >= min_index && idx <= max_index); + } + draw_elts = (const ushort *) ib; + } + else { + /* have to go through vsplit->draw_elts */ + if (icount > vsplit->segment_size) + return FALSE; + } + + /* this is faster only when we fetch less elements than the normal path */ + if (max_index - min_index > icount - 1) + return FALSE; + + if (elt_bias < 0 && min_index < -elt_bias) + return FALSE; + + /* why this check? */ + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + if (draw->pt.vertex_element[i].instance_divisor) + return FALSE; + } + + fetch_start = min_index + elt_bias; + fetch_count = max_index - min_index + 1; + + if (!draw_elts) { + if (min_index == 0) { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) idx; + } + } + else { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) (idx - min_index); + } + } + + draw_elts = vsplit->draw_elts; + } + + return vsplit->middle->run_linear_elts(vsplit->middle, + fetch_start, fetch_count, + draw_elts, icount, 0x0); +} + +/** + * Use the cache to prepare the fetch and draw elements, and flush. + * + * When spoken is TRUE, ispoken replaces istart; When close is TRUE, iclose is + * appended. + */ +static INLINE void +CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, unsigned icount, + boolean spoken, unsigned ispoken, + boolean close, unsigned iclose) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) + ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset); + const int ibias = draw->pt.user.eltBias; + unsigned i; + + assert(icount + !!close <= vsplit->segment_size); + + vsplit_clear_cache(vsplit); + + spoken = !!spoken; + if (ibias == 0) { + if (spoken) + ADD_CACHE(vsplit, ib[ispoken]); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, ib[istart + i]); + + if (close) + ADD_CACHE(vsplit, ib[iclose]); + } + else if (ibias > 0) { + if (spoken) + ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias); + + if (close) + ADD_CACHE(vsplit, (uint) ib[iclose] + ibias); + } + else { + if (spoken) { + if (ib[ispoken] < -ibias) + return; + ADD_CACHE(vsplit, ib[ispoken] + ibias); + } + + for (i = spoken; i < icount; i++) { + if (ib[istart + i] < -ibias) + return; + ADD_CACHE(vsplit, ib[istart + i] + ibias); + } + + if (close) { + if (ib[iclose] < -ibias) + return; + ADD_CACHE(vsplit, ib[iclose] + ibias); + } + } + + vsplit_flush_cache(vsplit, flags); +} + +static void +CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount) +{ + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, FALSE, 0); +} + +static void +CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean close_loop = ((flags) == DRAW_SPLIT_BEFORE); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, close_loop, i0); +} + +static void +CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean use_spoken = (((flags) & DRAW_SPLIT_BEFORE) != 0); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, use_spoken, i0, FALSE, 0); +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->segment_size; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) \ + CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount) + +#else /* ELT_TYPE */ + +static void +vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount) +{ + assert(icount <= vsplit->max_vertices); + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); +} + +static void +vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean close_loop = (flags == DRAW_SPLIT_BEFORE); + unsigned nr; + + assert(icount + !!close_loop <= vsplit->segment_size); + + if (close_loop) { + for (nr = 0; nr < icount; nr++) + vsplit->fetch_elts[nr] = istart + nr; + vsplit->fetch_elts[nr++] = i0; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +static void +vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean use_spoken = ((flags & DRAW_SPLIT_BEFORE) != 0); + unsigned nr = 0, i; + + assert(icount + !!use_spoken <= vsplit->segment_size); + + if (use_spoken) { + vsplit->fetch_elts[nr++] = i0; + for (i = 1 ; i < icount; i++) + vsplit->fetch_elts[nr++] = istart + i; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->max_vertices; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) FALSE + +#define ELT_TYPE linear + +#endif /* ELT_TYPE */ + +#define FUNC_VARS \ + struct draw_pt_front_end *frontend, \ + unsigned start, \ + unsigned count + +#define SEGMENT_SIMPLE(flags, istart, icount) \ + CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount) + +#define SEGMENT_LOOP(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#define SEGMENT_FAN(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#include "draw_split_tmp.h" + +#undef CONCAT2 +#undef CONCAT + +#undef ELT_TYPE +#undef ADD_CACHE diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h new file mode 100644 index 00000000000..7fafde9d5e6 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h @@ -0,0 +1,31 @@ +#define FUNC_VARS \ + struct pt_so_emit *so, \ + const struct draw_prim_info *input_prims, \ + const struct draw_vertex_info *input_verts, \ + unsigned start, \ + unsigned count + +#define FUNC_ENTER \ + /* declare more local vars */ \ + const unsigned prim = input_prims->prim; \ + const unsigned prim_flags = input_prims->flags; \ + const boolean last_vertex_last = TRUE; \ + do { \ + debug_assert(input_prims->primitive_count == 1); \ + switch (prim) { \ + case PIPE_PRIM_LINES_ADJACENCY: \ + case PIPE_PRIM_LINE_STRIP_ADJACENCY: \ + case PIPE_PRIM_TRIANGLES_ADJACENCY: \ + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: \ + debug_assert(!"unexpected primitive type in stream output"); \ + return; \ + default: \ + break; \ + } \ + } while (0) \ + +#define POINT(i0) so_point(so,i0) +#define LINE(flags,i0,i1) so_line(so,i0,i1) +#define TRIANGLE(flags,i0,i1,i2) so_tri(so,i0,i1,i2) + +#include "draw_decompose_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h new file mode 100644 index 00000000000..47defc62b96 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_split_tmp.h @@ -0,0 +1,176 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +static void +FUNC(FUNC_VARS) +{ + unsigned first, incr; + LOCAL_VARS + + /* + * prim, start, count, and max_count_{simple,loop,fan} should have been + * defined + */ + if (0) { + debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, " + "max_count_loop %d, max_count_fan %d\n", + __FUNCTION__, prim, start, count, max_count_simple, + max_count_loop, max_count_fan); + } + + draw_pt_split_prim(prim, &first, &incr); + /* sanitize primitive length */ + count = draw_pt_trim_count(count, first, incr); + if (count < first) + return; + + /* try flushing the entire primitive */ + if (PRIMITIVE(start, count)) + return; + + /* must be able to at least flush two complete primitives */ + assert(max_count_simple >= first + incr && + max_count_loop >= first + incr && + max_count_fan >= first + incr); + + /* no splitting required */ + if (count <= max_count_simple) { + SEGMENT_SIMPLE(0x0, start, count); + } + else { + const unsigned rollback = first - incr; + unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max; + + /* + * Both count and seg_max below are explicitly trimmed. Because + * + * seg_start = N * (seg_max - rollback) = N' * incr, + * + * we have + * + * remaining = count - seg_start = first + N'' * incr. + * + * That is, remaining is implicitly trimmed. + */ + switch (prim) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + seg_max = + draw_pt_trim_count(MIN2(max_count_simple, count), first, incr); + if (prim == PIPE_PRIM_TRIANGLE_STRIP || + prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) { + /* make sure we flush even number of triangles at a time */ + if (seg_max < count && !(((seg_max - first) / incr) & 1)) + seg_max -= incr; + } + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_SIMPLE(flags, start + seg_start, seg_max); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_SIMPLE(flags, start + seg_start, remaining); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_LINE_LOOP: + seg_max = + draw_pt_trim_count(MIN2(max_count_loop, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_LOOP(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_LOOP(flags, start + seg_start, remaining, start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + seg_max = + draw_pt_trim_count(MIN2(max_count_fan, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_FAN(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_FAN(flags, start + seg_start, remaining, start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + default: + assert(0); + break; + } + } +} + +#undef FUNC +#undef FUNC_VARS +#undef LOCAL_VARS + +#undef PRIMITIVE +#undef SEGMENT_SIMPLE +#undef SEGMENT_LOOP +#undef SEGMENT_FAN diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h index cccd3bf4358..e32803c0720 100644 --- a/src/gallium/auxiliary/draw/draw_vbuf.h +++ b/src/gallium/auxiliary/draw/draw_vbuf.h @@ -98,14 +98,14 @@ struct vbuf_render { boolean (*set_primitive)( struct vbuf_render *, unsigned prim ); /** - * DrawElements, note indices are ushort. The driver must complete - * this call, if necessary splitting the index list itself. + * Draw indexed primitives. Note that indices are ushort. The driver + * must complete this call, if necessary splitting the index list itself. */ - void (*draw)( struct vbuf_render *, - const ushort *indices, - uint nr_indices ); + void (*draw_elements)( struct vbuf_render *, + const ushort *indices, + uint nr_indices ); - /* Draw Arrays path too. + /* Draw non-indexed primitives. */ void (*draw_arrays)( struct vbuf_render *, unsigned start, @@ -117,6 +117,14 @@ struct vbuf_render { void (*release_vertices)( struct vbuf_render * ); void (*destroy)( struct vbuf_render * ); + + + /** + * Called after writing data to the stream out buffers + */ + void (*set_stream_output_info)( struct vbuf_render *vbufr, + unsigned primitive_count, + unsigned vertices_count ); }; diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h index 3af31ffe126..e63cf5f4f98 100644 --- a/src/gallium/auxiliary/draw/draw_vertex.h +++ b/src/gallium/auxiliary/draw/draw_vertex.h @@ -166,7 +166,7 @@ static INLINE enum pipe_format draw_translate_vinfo_format(enum attrib_emit emit } } -static INLINE enum attrib_emit draw_translate_vinfo_size(enum attrib_emit emit) +static INLINE unsigned draw_translate_vinfo_size(enum attrib_emit emit) { switch (emit) { case EMIT_OMIT: diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c index c2832eefa2a..fb665b08fff 100644 --- a/src/gallium/auxiliary/draw/draw_vs.c +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -48,18 +48,30 @@ DEBUG_GET_ONCE_BOOL_OPTION(gallium_dump_vs, "GALLIUM_DUMP_VS", FALSE) + +/** + * Set a vertex shader constant buffer. + * \param slot which constant buffer in [0, PIPE_MAX_CONSTANT_BUFFERS-1] + * \param constants the mapped buffer + * \param size size of buffer in bytes + */ void draw_vs_set_constants(struct draw_context *draw, unsigned slot, const void *constants, unsigned size) { - if (((uintptr_t)constants) & 0xf) { + const int alignment = 16; + + /* check if buffer is 16-byte aligned */ + if (((uintptr_t)constants) & (alignment - 1)) { + /* if not, copy the constants into a new, 16-byte aligned buffer */ if (size > draw->vs.const_storage_size[slot]) { if (draw->vs.aligned_constant_storage[slot]) { align_free((void *)draw->vs.aligned_constant_storage[slot]); } - draw->vs.aligned_constant_storage[slot] = align_malloc(size, 16); + draw->vs.aligned_constant_storage[slot] = + align_malloc(size, alignment); } assert(constants); memcpy((void *)draw->vs.aligned_constant_storage[slot], @@ -85,18 +97,27 @@ struct draw_vertex_shader * draw_create_vertex_shader(struct draw_context *draw, const struct pipe_shader_state *shader) { - struct draw_vertex_shader *vs; + struct draw_vertex_shader *vs = NULL; if (draw->dump_vs) { tgsi_dump(shader->tokens, 0); } - vs = draw_create_vs_sse( draw, shader ); - if (!vs) { + if (!draw->pt.middle.llvm) { +#if defined(PIPE_ARCH_X86) + vs = draw_create_vs_sse( draw, shader ); +#elif defined(PIPE_ARCH_PPC) vs = draw_create_vs_ppc( draw, shader ); - if (!vs) { - vs = draw_create_vs_exec( draw, shader ); - } +#endif + } +#if HAVE_LLVM + else { + vs = draw_create_vs_llvm(draw, shader); + } +#endif + + if (!vs) { + vs = draw_create_vs_exec( draw, shader ); } if (vs) diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h index 6c7e94db433..f9a038788fb 100644 --- a/src/gallium/auxiliary/draw/draw_vs.h +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -133,7 +133,8 @@ struct draw_vertex_shader { void (*run_linear)( struct draw_vertex_shader *shader, const float (*input)[4], float (*output)[4], - const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS], unsigned count, unsigned input_stride, unsigned output_stride ); @@ -165,7 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw, const struct pipe_shader_state *templ); - struct draw_vs_varient_key; struct draw_vertex_shader; @@ -173,6 +173,11 @@ struct draw_vs_varient * draw_vs_create_varient_aos_sse( struct draw_vertex_shader *vs, const struct draw_vs_varient_key *key ); +#if HAVE_LLVM +struct draw_vertex_shader * +draw_create_vs_llvm(struct draw_context *draw, + const struct pipe_shader_state *state); +#endif /******************************************************************************** diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h index 1911242f825..68e8295b5e1 100644 --- a/src/gallium/auxiliary/draw/draw_vs_aos.h +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -32,6 +32,8 @@ #define DRAW_VS_AOS_H #include "pipe/p_config.h" +#include "tgsi/tgsi_exec.h" +#include "draw_vs.h" #ifdef PIPE_ARCH_X86 diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index bc34d390dae..dab3eb1ca8e 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -85,7 +85,8 @@ static void vs_exec_run_linear( struct draw_vertex_shader *shader, const float (*input)[4], float (*output)[4], - const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS], unsigned count, unsigned input_stride, unsigned output_stride ) @@ -95,9 +96,8 @@ vs_exec_run_linear( struct draw_vertex_shader *shader, unsigned int i, j; unsigned slot; - for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { - machine->Consts[i] = constants[i]; - } + tgsi_exec_set_constant_buffers(machine, PIPE_MAX_CONSTANT_BUFFERS, + constants, const_size); for (i = 0; i < count; i += MAX_TGSI_VERTICES) { unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c new file mode 100644 index 00000000000..fa9992db783 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -0,0 +1,127 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_screen.h" + +#include "draw_private.h" +#include "draw_context.h" +#include "draw_vs.h" +#include "draw_llvm.h" + +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" + +static void +vs_llvm_prepare(struct draw_vertex_shader *shader, + struct draw_context *draw) +{ + /*struct llvm_vertex_shader *evs = llvm_vertex_shader(shader);*/ +} + +static void +vs_llvm_run_linear( struct draw_vertex_shader *shader, + const float (*input)[4], + float (*output)[4], + const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + /* we should never get here since the entire pipeline is + * generated in draw_pt_fetch_shade_pipeline_llvm.c */ + debug_assert(0); +} + + +static void +vs_llvm_delete( struct draw_vertex_shader *dvs ) +{ + struct llvm_vertex_shader *shader = llvm_vertex_shader(dvs); + struct pipe_fence_handle *fence = NULL; + struct draw_llvm_variant_list_item *li; + struct pipe_context *pipe = dvs->draw->pipe; + + /* + * XXX: This might be not neccessary at all. + */ + pipe->flush(pipe, 0, &fence); + if (fence) { + pipe->screen->fence_finish(pipe->screen, fence, 0); + pipe->screen->fence_reference(pipe->screen, &fence, NULL); + } + + + li = first_elem(&shader->variants); + while(!at_end(&shader->variants, li)) { + struct draw_llvm_variant_list_item *next = next_elem(li); + draw_llvm_destroy_variant(li->base); + li = next; + } + + assert(shader->variants_cached == 0); + FREE((void*) dvs->state.tokens); + FREE( dvs ); +} + + +struct draw_vertex_shader * +draw_create_vs_llvm(struct draw_context *draw, + const struct pipe_shader_state *state) +{ + struct llvm_vertex_shader *vs = CALLOC_STRUCT( llvm_vertex_shader ); + + if (vs == NULL) + return NULL; + + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(state->tokens); + if (!vs->base.state.tokens) { + FREE(vs); + return NULL; + } + + tgsi_scan_shader(state->tokens, &vs->base.info); + + vs->variant_key_size = + draw_llvm_variant_key_size( + vs->base.info.file_max[TGSI_FILE_INPUT]+1, + vs->base.info.file_max[TGSI_FILE_SAMPLER]+1); + + vs->base.draw = draw; + vs->base.prepare = vs_llvm_prepare; + vs->base.run_linear = vs_llvm_run_linear; + vs->base.delete = vs_llvm_delete; + vs->base.create_varient = draw_vs_create_varient_generic; + + make_empty_list(&vs->variants); + + return &vs->base; +} diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c index d869eecec5e..5df84916c51 100644 --- a/src/gallium/auxiliary/draw/draw_vs_ppc.c +++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c @@ -125,7 +125,7 @@ vs_ppc_run_linear( struct draw_vertex_shader *base, */ shader->func(inputs_soa, outputs_soa, temps_soa, (float (*)[4]) shader->base.immediates, - (const float (*)[4])constants[0], + (float (*)[4])constants[0], ppc_builtin_constants); /* convert (up to) four output verts from SoA back to AoS format */ @@ -190,7 +190,7 @@ draw_create_vs_ppc(struct draw_context *draw, vs->base.create_varient = draw_vs_varient_aos_ppc; else #endif - vs->base.create_varient = draw_vs_varient_generic; + vs->base.create_varient = draw_vs_create_varient_generic; vs->base.prepare = vs_ppc_prepare; vs->base.run_linear = vs_ppc_run_linear; vs->base.delete = vs_ppc_delete; diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c index 14c95082a9d..0b0c6077c6f 100644 --- a/src/gallium/auxiliary/draw/draw_vs_sse.c +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -84,6 +84,7 @@ vs_sse_run_linear( struct draw_vertex_shader *base, const float (*input)[4], float (*output)[4], const void *constants[PIPE_MAX_CONSTANT_BUFFERS], + const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS], unsigned count, unsigned input_stride, unsigned output_stride ) diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c index 6eb26927f27..eacd1601877 100644 --- a/src/gallium/auxiliary/draw/draw_vs_varient.c +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -149,7 +149,8 @@ static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient, vsvg->base.vs->run_linear( vsvg->base.vs, temp_buffer, temp_buffer, - vsvg->base.vs->draw->pt.user.vs_constants, + vsvg->base.vs->draw->pt.user.vs_constants, + vsvg->base.vs->draw->pt.user.vs_constants_size, count, temp_vertex_stride, temp_vertex_stride); @@ -214,7 +215,8 @@ static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient, vsvg->base.vs->run_linear( vsvg->base.vs, temp_buffer, temp_buffer, - vsvg->base.vs->draw->pt.user.vs_constants, + vsvg->base.vs->draw->pt.user.vs_constants, + vsvg->base.vs->draw->pt.user.vs_constants_size, count, temp_vertex_stride, temp_vertex_stride); diff --git a/src/gallium/auxiliary/gallivm/f.cpp b/src/gallium/auxiliary/gallivm/f.cpp new file mode 100644 index 00000000000..5eb09c01ab3 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/f.cpp @@ -0,0 +1,85 @@ +/************************************************************************** + * + * (C) Copyright VMware, Inc 2010. + * (C) Copyright John Maddock 2006. + * Use, modification and distribution are subject to the + * Boost Software License, Version 1.0. (See accompanying file + * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + * + **************************************************************************/ + + +/* + * This file allows to compute the minimax polynomial coefficients we use + * for fast exp2/log2. + * + * How to use this source: + * + * - Download and abuild the NTL library from + * http://shoup.net/ntl/download.html + * + * - Download boost source code matching to your distro. + * + * - Goto libs/math/minimax and replace f.cpp with this file. + * + * - Build as + * + * g++ -o minimax -I /path/to/ntl/include main.cpp f.cpp /path/to/ntl/src/ntl.a -lboost_math_tr1 + * + * - Run as + * + * ./minimax + * + * - For example, to compute exp2 5th order polynomial between [0, 1] do: + * + * variant 1 + * range 0 1 + * order 5 0 + * steps 200 + * info + * + * - For more info see + * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html + */ + +#define L22 +#include <boost/math/bindings/rr.hpp> +#include <boost/math/tools/polynomial.hpp> + +#include <cmath> + + +boost::math::ntl::RR f(const boost::math::ntl::RR& x, int variant) +{ + static const boost::math::ntl::RR tiny = boost::math::tools::min_value<boost::math::ntl::RR>() * 64; + + switch(variant) + { + case 0: + // log2(x)/(x - 1) + return log(x)/log(2.0)/(x - 1.0); + + case 1: + // exp2(x) + return exp(x*log(2.0)); + } + + return 0; +} + + +void show_extra( + const boost::math::tools::polynomial<boost::math::ntl::RR>& n, + const boost::math::tools::polynomial<boost::math::ntl::RR>& d, + const boost::math::ntl::RR& x_offset, + const boost::math::ntl::RR& y_offset, + int variant) +{ + switch(variant) + { + default: + // do nothing here... + ; + } +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld.h b/src/gallium/auxiliary/gallivm/lp_bld.h index 2fa682f4009..8103bc917fc 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld.h +++ b/src/gallium/auxiliary/gallivm/lp_bld.h @@ -35,6 +35,17 @@ #define LP_BLD_H +/** + * @file + * LLVM IR building helpers interfaces. + * + * We use LLVM-C bindings for now. They are not documented, but follow the C++ + * interfaces very closely, and appear to be complete enough for code + * genration. See + * http://npcontemplation.blogspot.com/2008/06/secret-of-llvm-c-bindings.html + * for a standalone example. + */ + #include <llvm-c/Core.h> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 20ae958714b..64c468c14d4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2009 VMware, Inc. + * Copyright 2009-2010 VMware, Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -60,6 +60,11 @@ #include "lp_bld_arit.h" +#define EXP_POLY_DEGREE 3 + +#define LOG_POLY_DEGREE 5 + + /** * Generate min(a, b) * No checks for special case values of a or b = 1 or 0 are done. @@ -73,6 +78,9 @@ lp_build_min_simple(struct lp_build_context *bld, const char *intrinsic = NULL; LLVMValueRef cond; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + /* TODO: optimize the constant case */ if(type.width * type.length == 128) { @@ -119,6 +127,9 @@ lp_build_max_simple(struct lp_build_context *bld, const char *intrinsic = NULL; LLVMValueRef cond; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + /* TODO: optimize the constant case */ if(type.width * type.length == 128) { @@ -161,6 +172,8 @@ lp_build_comp(struct lp_build_context *bld, { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + if(a == bld->one) return bld->zero; if(a == bld->zero) @@ -174,9 +187,15 @@ lp_build_comp(struct lp_build_context *bld, } if(LLVMIsConstant(a)) - return LLVMConstSub(bld->one, a); + if (type.floating) + return LLVMConstFSub(bld->one, a); + else + return LLVMConstSub(bld->one, a); else - return LLVMBuildSub(bld->builder, bld->one, a, ""); + if (type.floating) + return LLVMBuildFSub(bld->builder, bld->one, a, ""); + else + return LLVMBuildSub(bld->builder, bld->one, a, ""); } @@ -191,6 +210,9 @@ lp_build_add(struct lp_build_context *bld, const struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == bld->zero) return b; if(b == bld->zero) @@ -218,9 +240,15 @@ lp_build_add(struct lp_build_context *bld, } if(LLVMIsConstant(a) && LLVMIsConstant(b)) - res = LLVMConstAdd(a, b); + if (type.floating) + res = LLVMConstFAdd(a, b); + else + res = LLVMConstAdd(a, b); else - res = LLVMBuildAdd(bld->builder, a, b, ""); + if (type.floating) + res = LLVMBuildFAdd(bld->builder, a, b, ""); + else + res = LLVMBuildAdd(bld->builder, a, b, ""); /* clamp to ceiling of 1.0 */ if(bld->type.norm && (bld->type.floating || bld->type.fixed)) @@ -232,20 +260,20 @@ lp_build_add(struct lp_build_context *bld, } -/** Return the sum of the elements of a */ +/** Return the scalar sum of the elements of a */ LLVMValueRef lp_build_sum_vector(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; LLVMValueRef index, res; - int i; + unsigned i; - if (a == bld->zero) - return bld->zero; - if (a == bld->undef) - return bld->undef; - assert(type.length > 1); + assert(lp_check_value(type, a)); + + if (type.length == 1) { + return a; + } assert(!bld->type.norm); @@ -254,9 +282,16 @@ lp_build_sum_vector(struct lp_build_context *bld, for (i = 1; i < type.length; i++) { index = LLVMConstInt(LLVMInt32Type(), i, 0); - res = LLVMBuildAdd(bld->builder, res, - LLVMBuildExtractElement(bld->builder, a, index, ""), - ""); + if (type.floating) + res = LLVMBuildFAdd(bld->builder, res, + LLVMBuildExtractElement(bld->builder, + a, index, ""), + ""); + else + res = LLVMBuildAdd(bld->builder, res, + LLVMBuildExtractElement(bld->builder, + a, index, ""), + ""); } return res; @@ -274,6 +309,9 @@ lp_build_sub(struct lp_build_context *bld, const struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(b == bld->zero) return a; if(a == bld->undef || b == bld->undef) @@ -301,9 +339,15 @@ lp_build_sub(struct lp_build_context *bld, } if(LLVMIsConstant(a) && LLVMIsConstant(b)) - res = LLVMConstSub(a, b); + if (type.floating) + res = LLVMConstFSub(a, b); + else + res = LLVMConstSub(a, b); else - res = LLVMBuildSub(bld->builder, a, b, ""); + if (type.floating) + res = LLVMBuildFSub(bld->builder, a, b, ""); + else + res = LLVMBuildSub(bld->builder, a, b, ""); if(bld->type.norm && (bld->type.floating || bld->type.fixed)) res = lp_build_max_simple(bld, res, bld->zero); @@ -361,6 +405,10 @@ lp_build_mul_u8n(LLVMBuilderRef builder, LLVMValueRef c8; LLVMValueRef ab; + assert(!i16_type.floating); + assert(lp_check_value(i16_type, a)); + assert(lp_check_value(i16_type, b)); + c8 = lp_build_const_int_vec(i16_type, 8); #if 0 @@ -396,6 +444,9 @@ lp_build_mul(struct lp_build_context *bld, LLVMValueRef shift; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == bld->zero) return bld->zero; if(a == bld->one) @@ -434,7 +485,10 @@ lp_build_mul(struct lp_build_context *bld, shift = NULL; if(LLVMIsConstant(a) && LLVMIsConstant(b)) { - res = LLVMConstMul(a, b); + if (type.floating) + res = LLVMConstFMul(a, b); + else + res = LLVMConstMul(a, b); if(shift) { if(type.sign) res = LLVMConstAShr(res, shift); @@ -443,7 +497,10 @@ lp_build_mul(struct lp_build_context *bld, } } else { - res = LLVMBuildMul(bld->builder, a, b, ""); + if (type.floating) + res = LLVMBuildFMul(bld->builder, a, b, ""); + else + res = LLVMBuildMul(bld->builder, a, b, ""); if(shift) { if(type.sign) res = LLVMBuildAShr(bld->builder, res, shift, ""); @@ -466,6 +523,8 @@ lp_build_mul_imm(struct lp_build_context *bld, { LLVMValueRef factor; + assert(lp_check_value(bld->type, a)); + if(b == 0) return bld->zero; @@ -473,12 +532,12 @@ lp_build_mul_imm(struct lp_build_context *bld, return a; if(b == -1) - return LLVMBuildNeg(bld->builder, a, ""); + return lp_build_negate(bld, a); if(b == 2 && bld->type.floating) return lp_build_add(bld, a, a); - if(util_is_pot(b)) { + if(util_is_power_of_two(b)) { unsigned shift = ffs(b) - 1; if(bld->type.floating) { @@ -519,6 +578,9 @@ lp_build_div(struct lp_build_context *bld, { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == bld->zero) return bld->zero; if(a == bld->one) @@ -530,44 +592,125 @@ lp_build_div(struct lp_build_context *bld, if(a == bld->undef || b == bld->undef) return bld->undef; - if(LLVMIsConstant(a) && LLVMIsConstant(b)) - return LLVMConstFDiv(a, b); + if(LLVMIsConstant(a) && LLVMIsConstant(b)) { + if (type.floating) + return LLVMConstFDiv(a, b); + else if (type.sign) + return LLVMConstSDiv(a, b); + else + return LLVMConstUDiv(a, b); + } if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) return lp_build_mul(bld, a, lp_build_rcp(bld, b)); - return LLVMBuildFDiv(bld->builder, a, b, ""); + if (type.floating) + return LLVMBuildFDiv(bld->builder, a, b, ""); + else if (type.sign) + return LLVMBuildSDiv(bld->builder, a, b, ""); + else + return LLVMBuildUDiv(bld->builder, a, b, ""); } /** - * Linear interpolation. - * - * This also works for integer values with a few caveats. + * Linear interpolation -- without any checks. * * @sa http://www.stereopsis.com/doubleblend.html */ -LLVMValueRef -lp_build_lerp(struct lp_build_context *bld, - LLVMValueRef x, - LLVMValueRef v0, - LLVMValueRef v1) +static INLINE LLVMValueRef +lp_build_lerp_simple(struct lp_build_context *bld, + LLVMValueRef x, + LLVMValueRef v0, + LLVMValueRef v1) { LLVMValueRef delta; LLVMValueRef res; + assert(lp_check_value(bld->type, x)); + assert(lp_check_value(bld->type, v0)); + assert(lp_check_value(bld->type, v1)); + delta = lp_build_sub(bld, v1, v0); res = lp_build_mul(bld, x, delta); res = lp_build_add(bld, v0, res); - if(bld->type.fixed) + if (bld->type.fixed) { /* XXX: This step is necessary for lerping 8bit colors stored on 16bits, * but it will be wrong for other uses. Basically we need a more * powerful lp_type, capable of further distinguishing the values * interpretation from the value storage. */ res = LLVMBuildAnd(bld->builder, res, lp_build_const_int_vec(bld->type, (1 << bld->type.width/2) - 1), ""); + } + + return res; +} + + +/** + * Linear interpolation. + */ +LLVMValueRef +lp_build_lerp(struct lp_build_context *bld, + LLVMValueRef x, + LLVMValueRef v0, + LLVMValueRef v1) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, x)); + assert(lp_check_value(type, v0)); + assert(lp_check_value(type, v1)); + + if (type.norm) { + struct lp_type wide_type; + struct lp_build_context wide_bld; + LLVMValueRef xl, xh, v0l, v0h, v1l, v1h, resl, resh; + LLVMValueRef shift; + + assert(type.length >= 2); + assert(!type.sign); + + /* + * Create a wider type, enough to hold the intermediate result of the + * multiplication. + */ + memset(&wide_type, 0, sizeof wide_type); + wide_type.fixed = TRUE; + wide_type.width = type.width*2; + wide_type.length = type.length/2; + + lp_build_context_init(&wide_bld, bld->builder, wide_type); + + lp_build_unpack2(bld->builder, type, wide_type, x, &xl, &xh); + lp_build_unpack2(bld->builder, type, wide_type, v0, &v0l, &v0h); + lp_build_unpack2(bld->builder, type, wide_type, v1, &v1l, &v1h); + + /* + * Scale x from [0, 255] to [0, 256] + */ + + shift = lp_build_const_int_vec(wide_type, type.width - 1); + + xl = lp_build_add(&wide_bld, xl, + LLVMBuildAShr(bld->builder, xl, shift, "")); + xh = lp_build_add(&wide_bld, xh, + LLVMBuildAShr(bld->builder, xh, shift, "")); + + /* + * Lerp both halves. + */ + + resl = lp_build_lerp_simple(&wide_bld, xl, v0l, v1l); + resh = lp_build_lerp_simple(&wide_bld, xh, v0h, v1h); + + res = lp_build_pack2(bld->builder, wide_type, type, resl, resh); + } else { + res = lp_build_lerp_simple(bld, x, v0, v1); + } return res; } @@ -597,6 +740,9 @@ lp_build_min(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { + assert(lp_check_value(bld->type, a)); + assert(lp_check_value(bld->type, b)); + if(a == bld->undef || b == bld->undef) return bld->undef; @@ -625,6 +771,9 @@ lp_build_max(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) { + assert(lp_check_value(bld->type, a)); + assert(lp_check_value(bld->type, b)); + if(a == bld->undef || b == bld->undef) return bld->undef; @@ -654,6 +803,10 @@ lp_build_clamp(struct lp_build_context *bld, LLVMValueRef min, LLVMValueRef max) { + assert(lp_check_value(bld->type, a)); + assert(lp_check_value(bld->type, min)); + assert(lp_check_value(bld->type, max)); + a = lp_build_min(bld, a, max); a = lp_build_max(bld, a, min); return a; @@ -670,31 +823,20 @@ lp_build_abs(struct lp_build_context *bld, const struct lp_type type = bld->type; LLVMTypeRef vec_type = lp_build_vec_type(type); + assert(lp_check_value(type, a)); + if(!type.sign) return a; if(type.floating) { /* Mask out the sign bit */ - if (type.length == 1) { - LLVMTypeRef int_type = LLVMIntType(type.width); - LLVMTypeRef float_type = LLVMFloatType(); - unsigned long long absMask = ~(1ULL << (type.width - 1)); - LLVMValueRef mask = LLVMConstInt(int_type, absMask, 0); - a = LLVMBuildBitCast(bld->builder, a, int_type, ""); - a = LLVMBuildAnd(bld->builder, a, mask, ""); - a = LLVMBuildBitCast(bld->builder, a, float_type, ""); - return a; - } - else { - /* vector of floats */ - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); - unsigned long long absMask = ~(1ULL << (type.width - 1)); - LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask)); - a = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - a = LLVMBuildAnd(bld->builder, a, mask, ""); - a = LLVMBuildBitCast(bld->builder, a, vec_type, ""); - return a; - } + LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + unsigned long long absMask = ~(1ULL << (type.width - 1)); + LLVMValueRef mask = lp_build_const_int_vec(type, ((unsigned long long) absMask)); + a = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + a = LLVMBuildAnd(bld->builder, a, mask, ""); + a = LLVMBuildBitCast(bld->builder, a, vec_type, ""); + return a; } if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) { @@ -716,7 +858,16 @@ LLVMValueRef lp_build_negate(struct lp_build_context *bld, LLVMValueRef a) { - return LLVMBuildNeg(bld->builder, a, ""); + assert(lp_check_value(bld->type, a)); + +#if HAVE_LLVM >= 0x0207 + if (bld->type.floating) + a = LLVMBuildFNeg(bld->builder, a, ""); + else +#endif + a = LLVMBuildNeg(bld->builder, a, ""); + + return a; } @@ -729,6 +880,8 @@ lp_build_sgn(struct lp_build_context *bld, LLVMValueRef cond; LLVMValueRef res; + assert(lp_check_value(type, a)); + /* Handle non-zero case */ if(!type.sign) { /* if not zero then sign must be positive */ @@ -742,17 +895,9 @@ lp_build_sgn(struct lp_build_context *bld, LLVMValueRef one; unsigned long long maskBit = (unsigned long long)1 << (type.width - 1); - if (type.length == 1) { - int_type = lp_build_int_elem_type(type); - vec_type = lp_build_elem_type(type); - mask = LLVMConstInt(int_type, maskBit, 0); - } - else { - /* vector */ - int_type = lp_build_int_vec_type(type); - vec_type = lp_build_vec_type(type); - mask = lp_build_const_int_vec(type, maskBit); - } + int_type = lp_build_int_vec_type(type); + vec_type = lp_build_vec_type(type); + mask = lp_build_const_int_vec(type, maskBit); /* Take the sign bit and add it to 1 constant */ sign = LLVMBuildBitCast(bld->builder, a, int_type, ""); @@ -795,6 +940,7 @@ lp_build_set_sign(struct lp_build_context *bld, LLVMValueRef val, res; assert(type.floating); + assert(lp_check_value(type, a)); /* val = reinterpret_cast<int>(a) */ val = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); @@ -819,21 +965,11 @@ lp_build_int_to_float(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; + LLVMTypeRef vec_type = lp_build_vec_type(type); assert(type.floating); - /*assert(lp_check_value(type, a));*/ - if (type.length == 1) { - LLVMTypeRef float_type = LLVMFloatType(); - return LLVMBuildSIToFP(bld->builder, a, float_type, ""); - } - else { - LLVMTypeRef vec_type = lp_build_vec_type(type); - /*LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);*/ - LLVMValueRef res; - res = LLVMBuildSIToFP(bld->builder, a, vec_type, ""); - return res; - } + return LLVMBuildSIToFP(bld->builder, a, vec_type, ""); } @@ -853,31 +989,75 @@ lp_build_round_sse41(struct lp_build_context *bld, enum lp_build_round_sse41_mode mode) { const struct lp_type type = bld->type; - LLVMTypeRef vec_type = lp_build_vec_type(type); + LLVMTypeRef i32t = LLVMInt32Type(); const char *intrinsic; + LLVMValueRef res; assert(type.floating); - assert(type.width*type.length == 128); + assert(lp_check_value(type, a)); assert(util_cpu_caps.has_sse4_1); - switch(type.width) { - case 32: - intrinsic = "llvm.x86.sse41.round.ps"; - break; - case 64: - intrinsic = "llvm.x86.sse41.round.pd"; - break; - default: - assert(0); - return bld->undef; + if (type.length == 1) { + LLVMTypeRef vec_type; + LLVMValueRef undef; + LLVMValueRef args[3]; + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ss"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.sd"; + break; + default: + assert(0); + return bld->undef; + } + + vec_type = LLVMVectorType(bld->elem_type, 4); + + undef = LLVMGetUndef(vec_type); + + args[0] = undef; + args[1] = LLVMBuildInsertElement(bld->builder, undef, a, index0, ""); + args[2] = LLVMConstInt(i32t, mode, 0); + + res = lp_build_intrinsic(bld->builder, intrinsic, + vec_type, args, Elements(args)); + + res = LLVMBuildExtractElement(bld->builder, res, index0, ""); + } + else { + assert(type.width*type.length == 128); + + switch(type.width) { + case 32: + intrinsic = "llvm.x86.sse41.round.ps"; + break; + case 64: + intrinsic = "llvm.x86.sse41.round.pd"; + break; + default: + assert(0); + return bld->undef; + } + + res = lp_build_intrinsic_binary(bld->builder, intrinsic, + bld->vec_type, a, + LLVMConstInt(i32t, mode, 0)); } - return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a, - LLVMConstInt(LLVMInt32Type(), mode, 0)); + return res; } +/** + * Return the integer part of a float (vector) value. The returned value is + * a float (vector). + * Ex: trunc(-1.5) = 1.0 + */ LLVMValueRef lp_build_trunc(struct lp_build_context *bld, LLVMValueRef a) @@ -887,8 +1067,10 @@ lp_build_trunc(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if(util_cpu_caps.has_sse4_1) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); @@ -900,6 +1082,12 @@ lp_build_trunc(struct lp_build_context *bld, } +/** + * Return float (vector) rounded to nearest integer (vector). The returned + * value is a float (vector). + * Ex: round(0.9) = 1.0 + * Ex: round(-1.5) = -2.0 + */ LLVMValueRef lp_build_round(struct lp_build_context *bld, LLVMValueRef a) @@ -909,8 +1097,10 @@ lp_build_round(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if(util_cpu_caps.has_sse4_1) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -921,6 +1111,11 @@ lp_build_round(struct lp_build_context *bld, } +/** + * Return floor of float (vector), result is a float (vector) + * Ex: floor(1.1) = 1.0 + * Ex: floor(-1.1) = -2.0 + */ LLVMValueRef lp_build_floor(struct lp_build_context *bld, LLVMValueRef a) @@ -928,16 +1123,12 @@ lp_build_floor(struct lp_build_context *bld, const struct lp_type type = bld->type; assert(type.floating); + assert(lp_check_value(type, a)); - if (type.length == 1) { - LLVMValueRef res; - res = lp_build_ifloor(bld, a); - res = LLVMBuildSIToFP(bld->builder, res, LLVMFloatType(), ""); - return res; - } - - if(util_cpu_caps.has_sse4_1) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -948,6 +1139,11 @@ lp_build_floor(struct lp_build_context *bld, } +/** + * Return ceiling of float (vector), returning float (vector). + * Ex: ceil( 1.1) = 2.0 + * Ex: ceil(-1.1) = -1.0 + */ LLVMValueRef lp_build_ceil(struct lp_build_context *bld, LLVMValueRef a) @@ -957,8 +1153,10 @@ lp_build_ceil(struct lp_build_context *bld, assert(type.floating); assert(lp_check_value(type, a)); - if(util_cpu_caps.has_sse4_1) + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); + } else { LLVMTypeRef vec_type = lp_build_vec_type(type); LLVMValueRef res; @@ -970,7 +1168,7 @@ lp_build_ceil(struct lp_build_context *bld, /** - * Return fractional part of 'a' computed as a - floor(f) + * Return fractional part of 'a' computed as a - floor(a) * Typically used in texture coord arithmetic. */ LLVMValueRef @@ -983,72 +1181,67 @@ lp_build_fract(struct lp_build_context *bld, /** - * Convert to integer, through whichever rounding method that's fastest, - * typically truncating toward zero. + * Return the integer part of a float (vector) value. The returned value is + * an integer (vector). + * Ex: itrunc(-1.5) = 1 */ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; + LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); assert(type.floating); + assert(lp_check_value(type, a)); - if (type.length == 1) { - LLVMTypeRef int_type = LLVMIntType(type.width); - return LLVMBuildFPToSI(bld->builder, a, int_type, ""); - } - else { - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); - assert(lp_check_value(type, a)); - return LLVMBuildFPToSI(bld->builder, a, int_vec_type, ""); - } + return LLVMBuildFPToSI(bld->builder, a, int_vec_type, ""); } /** - * Convert float[] to int[] with round(). + * Return float (vector) rounded to nearest integer (vector). The returned + * value is an integer (vector). + * Ex: iround(0.9) = 1 + * Ex: iround(-1.5) = -2 */ LLVMValueRef lp_build_iround(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); - if (type.length == 1) { - /* scalar float to int */ - LLVMTypeRef int_type = LLVMIntType(type.width); - /* XXX we want rounding here! */ - res = LLVMBuildFPToSI(bld->builder, a, int_type, ""); - return res; - } - assert(lp_check_value(type, a)); - if(util_cpu_caps.has_sse4_1) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST); } else { - LLVMTypeRef vec_type = lp_build_vec_type(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; LLVMValueRef half; - /* get sign bit */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - - /* sign * 0.5 */ half = lp_build_const_vec(type, 0.5); - half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); - half = LLVMBuildOr(bld->builder, sign, half, ""); - half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); - res = LLVMBuildAdd(bld->builder, a, half, ""); + if (type.sign) { + LLVMTypeRef vec_type = bld->vec_type; + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* get sign bit */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + + /* sign * 0.5 */ + half = LLVMBuildBitCast(bld->builder, half, int_vec_type, ""); + half = LLVMBuildOr(bld->builder, sign, half, ""); + half = LLVMBuildBitCast(bld->builder, half, vec_type, ""); + } + + res = LLVMBuildFAdd(bld->builder, a, half, ""); } res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); @@ -1058,89 +1251,156 @@ lp_build_iround(struct lp_build_context *bld, /** - * Convert float[] to int[] with floor(). + * Return floor of float (vector), result is an int (vector) + * Ex: ifloor(1.1) = 1.0 + * Ex: ifloor(-1.1) = -2.0 */ LLVMValueRef lp_build_ifloor(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); - - if (type.length == 1) { - /* scalar float to int */ - LLVMTypeRef int_type = LLVMIntType(type.width); - res = LLVMBuildFPToSI(bld->builder, a, int_type, ""); - return res; - } - assert(lp_check_value(type, a)); - if(util_cpu_caps.has_sse4_1) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR); } else { - /* Take the sign bit and add it to 1 constant */ - LLVMTypeRef vec_type = lp_build_vec_type(type); - unsigned mantissa = lp_mantissa(type); - LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); - LLVMValueRef sign; - LLVMValueRef offset; - - /* sign = a < 0 ? ~0 : 0 */ - sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - sign = LLVMBuildAnd(bld->builder, sign, mask, ""); - sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), ""); - lp_build_name(sign, "floor.sign"); - - /* offset = -0.99999(9)f */ - offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa)); - offset = LLVMConstBitCast(offset, int_vec_type); - - /* offset = a < 0 ? -0.99999(9)f : 0.0f */ - offset = LLVMBuildAnd(bld->builder, offset, sign, ""); - offset = LLVMBuildBitCast(bld->builder, offset, vec_type, ""); - lp_build_name(offset, "floor.offset"); - - res = LLVMBuildAdd(bld->builder, a, offset, ""); - lp_build_name(res, "floor.res"); + res = a; + + if (type.sign) { + /* Take the sign bit and add it to 1 constant */ + LLVMTypeRef vec_type = bld->vec_type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + LLVMValueRef offset; + + /* sign = a < 0 ? ~0 : 0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "ifloor.sign"); + + /* offset = -0.99999(9)f */ + offset = lp_build_const_vec(type, -(double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + offset = LLVMConstBitCast(offset, int_vec_type); + + /* offset = a < 0 ? offset : 0.0f */ + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "ifloor.offset"); + + res = LLVMBuildFAdd(bld->builder, res, offset, "ifloor.res"); + } } - res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); - lp_build_name(res, "floor"); + /* round to nearest (toward zero) */ + res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "ifloor.res"); return res; } +/** + * Return ceiling of float (vector), returning int (vector). + * Ex: iceil( 1.1) = 2 + * Ex: iceil(-1.1) = -1 + */ LLVMValueRef lp_build_iceil(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + LLVMTypeRef int_vec_type = bld->int_vec_type; LLVMValueRef res; assert(type.floating); assert(lp_check_value(type, a)); - if(util_cpu_caps.has_sse4_1) { + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL); } else { - assert(0); - res = bld->undef; + LLVMTypeRef vec_type = bld->vec_type; + unsigned mantissa = lp_mantissa(type); + LLVMValueRef offset; + + /* offset = 0.99999(9)f */ + offset = lp_build_const_vec(type, (double)(((unsigned long long)1 << mantissa) - 10)/((unsigned long long)1 << mantissa)); + + if (type.sign) { + LLVMValueRef mask = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); + LLVMValueRef sign; + + /* sign = a < 0 ? 0 : ~0 */ + sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + sign = LLVMBuildAnd(bld->builder, sign, mask, ""); + sign = LLVMBuildAShr(bld->builder, sign, lp_build_const_int_vec(type, type.width - 1), "iceil.sign"); + sign = LLVMBuildNot(bld->builder, sign, "iceil.not"); + + /* offset = a < 0 ? 0.0 : offset */ + offset = LLVMConstBitCast(offset, int_vec_type); + offset = LLVMBuildAnd(bld->builder, offset, sign, ""); + offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "iceil.offset"); + } + + res = LLVMBuildFAdd(bld->builder, a, offset, "iceil.res"); } - res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, ""); + /* round to nearest (toward zero) */ + res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "iceil.res"); return res; } +/** + * Combined ifloor() & fract(). + * + * Preferred to calling the functions separately, as it will ensure that the + * stratergy (floor() vs ifloor()) that results in less redundant work is used. + */ +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart) +{ + + + const struct lp_type type = bld->type; + LLVMValueRef ipart; + + assert(type.floating); + assert(lp_check_value(type, a)); + + if (util_cpu_caps.has_sse4_1 && + (type.length == 1 || type.width*type.length == 128)) { + /* + * floor() is easier. + */ + + ipart = lp_build_floor(bld, a); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + *out_ipart = LLVMBuildFPToSI(bld->builder, ipart, bld->int_vec_type, "ipart"); + } + else { + /* + * ifloor() is easier. + */ + + *out_ipart = lp_build_ifloor(bld, a); + ipart = LLVMBuildSIToFP(bld->builder, *out_ipart, bld->vec_type, "ipart"); + *out_fpart = LLVMBuildFSub(bld->builder, a, ipart, "fpart"); + } +} + + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a) @@ -1149,6 +1409,8 @@ lp_build_sqrt(struct lp_build_context *bld, LLVMTypeRef vec_type = lp_build_vec_type(type); char intrinsic[32]; + assert(lp_check_value(type, a)); + /* TODO: optimize the constant case */ /* TODO: optimize the constant case */ @@ -1159,12 +1421,44 @@ lp_build_sqrt(struct lp_build_context *bld, } +/** + * Do one Newton-Raphson step to improve reciprocate precision: + * + * x_{i+1} = x_i * (2 - a * x_i) + * + * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or + * +/-Inf, giving NaN instead. Certain applications rely on this behavior, + * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's + * halo. It would be necessary to clamp the argument to prevent this. + * + * See also: + * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division + * - http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static INLINE LLVMValueRef +lp_build_rcp_refine(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef rcp_a) +{ + LLVMValueRef two = lp_build_const_vec(bld->type, 2.0); + LLVMValueRef res; + + res = LLVMBuildFMul(bld->builder, a, rcp_a, ""); + res = LLVMBuildFSub(bld->builder, two, res, ""); + res = LLVMBuildFMul(bld->builder, rcp_a, res, ""); + + return res; +} + + LLVMValueRef lp_build_rcp(struct lp_build_context *bld, LLVMValueRef a) { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + if(a == bld->zero) return bld->undef; if(a == bld->one) @@ -1177,15 +1471,65 @@ lp_build_rcp(struct lp_build_context *bld, if(LLVMIsConstant(a)) return LLVMConstFDiv(bld->one, a); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) - /* FIXME: improve precision */ - return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a); + /* + * We don't use RCPPS because: + * - it only has 10bits of precision + * - it doesn't even get the reciprocate of 1.0 exactly + * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf + * - for recent processors the benefit over DIVPS is marginal, a case + * depedent + * + * We could still use it on certain processors if benchmarks show that the + * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for + * particular uses that require less workarounds. + */ + + if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; + LLVMValueRef res; + unsigned i; + + res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); + + for (i = 0; i < num_iterations; ++i) { + res = lp_build_rcp_refine(bld, a, res); + } + + return res; + } return LLVMBuildFDiv(bld->builder, bld->one, a, ""); } /** + * Do one Newton-Raphson step to improve rsqrt precision: + * + * x_{i+1} = 0.5 * x_i * (3.0 - a * x_i * x_i) + * + * See also: + * - http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static INLINE LLVMValueRef +lp_build_rsqrt_refine(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef rsqrt_a) +{ + LLVMValueRef half = lp_build_const_vec(bld->type, 0.5); + LLVMValueRef three = lp_build_const_vec(bld->type, 3.0); + LLVMValueRef res; + + res = LLVMBuildFMul(bld->builder, rsqrt_a, rsqrt_a, ""); + res = LLVMBuildFMul(bld->builder, a, res, ""); + res = LLVMBuildFSub(bld->builder, three, res, ""); + res = LLVMBuildFMul(bld->builder, rsqrt_a, res, ""); + res = LLVMBuildFMul(bld->builder, half, res, ""); + + return res; +} + + +/** * Generate 1/sqrt(a) */ LLVMValueRef @@ -1194,70 +1538,476 @@ lp_build_rsqrt(struct lp_build_context *bld, { const struct lp_type type = bld->type; + assert(lp_check_value(type, a)); + assert(type.floating); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) - return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a); + if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; + LLVMValueRef res; + unsigned i; + + res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); + + for (i = 0; i < num_iterations; ++i) { + res = lp_build_rsqrt_refine(bld, a, res); + } + + return res; + } return lp_build_rcp(bld, lp_build_sqrt(bld, a)); } +static inline LLVMValueRef +lp_build_const_v4si(unsigned long value) +{ + LLVMValueRef element = LLVMConstInt(LLVMInt32Type(), value, 0); + LLVMValueRef elements[4] = { element, element, element, element }; + return LLVMConstVector(elements, 4); +} + +static inline LLVMValueRef +lp_build_const_v4sf(float value) +{ + LLVMValueRef element = LLVMConstReal(LLVMFloatType(), value); + LLVMValueRef elements[4] = { element, element, element, element }; + return LLVMConstVector(elements, 4); +} + + /** - * Generate cos(a) + * Generate sin(a) using SSE2 */ LLVMValueRef -lp_build_cos(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_sin(struct lp_build_context *bld, + LLVMValueRef a) { -#ifdef PIPE_OS_WINDOWS + struct lp_type int_type = lp_int_type(bld->type); + LLVMBuilderRef b = bld->builder; + LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); + LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4); + /* - * FIXME: X86 backend translates llvm.cos.v4f32 to 4 calls to CRT's cosf() - * which is neither efficient nor does the CRT linkage work on Windows - * causing segmentation fault. So simply disable the code for now. + * take the absolute value, + * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); */ - return bld->one; -#else - const struct lp_type type = bld->type; - LLVMTypeRef vec_type = lp_build_vec_type(type); - char intrinsic[32]; - /* TODO: optimize the constant case */ + LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000); + LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si"); - assert(type.floating); - util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width); + LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); + LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs"); - return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a); -#endif + /* + * extract the sign bit (upper one) + * sign_bit = _mm_and_ps(sign_bit, *(v4sf*)_ps_sign_mask); + */ + LLVMValueRef sig_mask = lp_build_const_v4si(0x80000000); + LLVMValueRef sign_bit_i = LLVMBuildAnd(b, a_v4si, sig_mask, "sign_bit_i"); + + /* + * scale by 4/Pi + * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); + */ + + LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); + LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); + + /* + * store the integer part of y in mm0 + * emm2 = _mm_cvttps_epi32(y); + */ + + LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i"); + + /* + * j=(j+1) & (~1) (see the cephes sources) + * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); + */ + + LLVMValueRef all_one = lp_build_const_v4si(1); + LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); + /* + * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); + */ + LLVMValueRef inv_one = lp_build_const_v4si(~1); + LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); + + /* + * y = _mm_cvtepi32_ps(emm2); + */ + LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2"); + + /* get the swap sign flag + * emm0 = _mm_and_si128(emm2, *(v4si*)_pi32_4); + */ + LLVMValueRef pi32_4 = lp_build_const_v4si(4); + LLVMValueRef emm0_and = LLVMBuildAnd(b, emm2_add, pi32_4, "emm0_and"); + + /* + * emm2 = _mm_slli_epi32(emm0, 29); + */ + LLVMValueRef const_29 = lp_build_const_v4si(29); + LLVMValueRef swap_sign_bit = LLVMBuildShl(b, emm0_and, const_29, "swap_sign_bit"); + + /* + * get the polynom selection mask + * there is one polynom for 0 <= x <= Pi/4 + * and another one for Pi/4<x<=Pi/2 + * Both branches will be computed. + * + * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); + * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + */ + + LLVMValueRef pi32_2 = lp_build_const_v4si(2); + LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_and, pi32_2, "emm2_3"); + LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL, + emm2_3, lp_build_const_v4si(0)); + /* + * sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit); + */ + LLVMValueRef sign_bit_1 = LLVMBuildXor(b, sign_bit_i, swap_sign_bit, "sign_bit"); + + /* + * _PS_CONST(minus_cephes_DP1, -0.78515625); + * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); + * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); + */ + LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625); + LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4); + LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8); + + /* + * The magic pass: "Extended precision modular arithmetic" + * x = ((x - y * DP1) - y * DP2) - y * DP3; + * xmm1 = _mm_mul_ps(y, xmm1); + * xmm2 = _mm_mul_ps(y, xmm2); + * xmm3 = _mm_mul_ps(y, xmm3); + */ + LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); + LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); + LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); + + /* + * x = _mm_add_ps(x, xmm1); + * x = _mm_add_ps(x, xmm2); + * x = _mm_add_ps(x, xmm3); + */ + + LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); + LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); + LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); + + /* + * Evaluate the first polynom (0 <= x <= Pi/4) + * + * z = _mm_mul_ps(x,x); + */ + LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); + + /* + * _PS_CONST(coscof_p0, 2.443315711809948E-005); + * _PS_CONST(coscof_p1, -1.388731625493765E-003); + * _PS_CONST(coscof_p2, 4.166664568298827E-002); + */ + LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005); + LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003); + LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002); + + /* + * y = *(v4sf*)_ps_coscof_p0; + * y = _mm_mul_ps(y, z); + */ + LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); + LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); + LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); + LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); + LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); + LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); + + + /* + * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + * y = _mm_sub_ps(y, tmp); + * y = _mm_add_ps(y, *(v4sf*)_ps_1); + */ + LLVMValueRef half = lp_build_const_v4sf(0.5); + LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); + LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); + LLVMValueRef one = lp_build_const_v4sf(1.0); + LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); + + /* + * _PS_CONST(sincof_p0, -1.9515295891E-4); + * _PS_CONST(sincof_p1, 8.3321608736E-3); + * _PS_CONST(sincof_p2, -1.6666654611E-1); + */ + LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4); + LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3); + LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1); + + /* + * Evaluate the second polynom (Pi/4 <= x <= 0) + * + * y2 = *(v4sf*)_ps_sincof_p0; + * y2 = _mm_mul_ps(y2, z); + * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); + * y2 = _mm_mul_ps(y2, z); + * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); + * y2 = _mm_mul_ps(y2, z); + * y2 = _mm_mul_ps(y2, x); + * y2 = _mm_add_ps(y2, x); + */ + + LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); + LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); + LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); + LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); + LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); + LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); + LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); + + /* + * select the correct result from the two polynoms + * xmm3 = poly_mask; + * y2 = _mm_and_ps(xmm3, y2); //, xmm3); + * y = _mm_andnot_ps(xmm3, y); + * y = _mm_add_ps(y,y2); + */ + LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i"); + LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i"); + LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); + LLVMValueRef inv = lp_build_const_v4si(~0); + LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); + LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); + LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); + + /* + * update the sign + * y = _mm_xor_ps(y, sign_bit); + */ + LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit_1, "y_sin"); + LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result"); + return y_result; } /** - * Generate sin(a) + * Generate cos(a) using SSE2 */ LLVMValueRef -lp_build_sin(struct lp_build_context *bld, - LLVMValueRef a) +lp_build_cos(struct lp_build_context *bld, + LLVMValueRef a) { -#ifdef PIPE_OS_WINDOWS + struct lp_type int_type = lp_int_type(bld->type); + LLVMBuilderRef b = bld->builder; + LLVMTypeRef v4sf = LLVMVectorType(LLVMFloatType(), 4); + LLVMTypeRef v4si = LLVMVectorType(LLVMInt32Type(), 4); + /* - * FIXME: X86 backend translates llvm.sin.v4f32 to 4 calls to CRT's sinf() - * which is neither efficient nor does the CRT linkage work on Windows - * causing segmentation fault. So simply disable the code for now. + * take the absolute value, + * x = _mm_and_ps(x, *(v4sf*)_ps_inv_sign_mask); */ - return bld->zero; -#else - const struct lp_type type = bld->type; - LLVMTypeRef vec_type = lp_build_vec_type(type); - char intrinsic[32]; - /* TODO: optimize the constant case */ + LLVMValueRef inv_sig_mask = lp_build_const_v4si(~0x80000000); + LLVMValueRef a_v4si = LLVMBuildBitCast(b, a, v4si, "a_v4si"); - assert(type.floating); - util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width); + LLVMValueRef absi = LLVMBuildAnd(b, a_v4si, inv_sig_mask, "absi"); + LLVMValueRef x_abs = LLVMBuildBitCast(b, absi, v4sf, "x_abs"); - return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a); -#endif + /* + * scale by 4/Pi + * y = _mm_mul_ps(x, *(v4sf*)_ps_cephes_FOPI); + */ + + LLVMValueRef FOPi = lp_build_const_v4sf(1.27323954473516); + LLVMValueRef scale_y = LLVMBuildFMul(b, x_abs, FOPi, "scale_y"); + + /* + * store the integer part of y in mm0 + * emm2 = _mm_cvttps_epi32(y); + */ + + LLVMValueRef emm2_i = LLVMBuildFPToSI(b, scale_y, v4si, "emm2_i"); + + /* + * j=(j+1) & (~1) (see the cephes sources) + * emm2 = _mm_add_epi32(emm2, *(v4si*)_pi32_1); + */ + + LLVMValueRef all_one = lp_build_const_v4si(1); + LLVMValueRef emm2_add = LLVMBuildAdd(b, emm2_i, all_one, "emm2_add"); + /* + * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_inv1); + */ + LLVMValueRef inv_one = lp_build_const_v4si(~1); + LLVMValueRef emm2_and = LLVMBuildAnd(b, emm2_add, inv_one, "emm2_and"); + + /* + * y = _mm_cvtepi32_ps(emm2); + */ + LLVMValueRef y_2 = LLVMBuildSIToFP(b, emm2_and, v4sf, "y_2"); + + + /* + * emm2 = _mm_sub_epi32(emm2, *(v4si*)_pi32_2); + */ + LLVMValueRef const_2 = lp_build_const_v4si(2); + LLVMValueRef emm2_2 = LLVMBuildSub(b, emm2_and, const_2, "emm2_2"); + + + /* get the swap sign flag + * emm0 = _mm_andnot_si128(emm2, *(v4si*)_pi32_4); + */ + LLVMValueRef inv = lp_build_const_v4si(~0); + LLVMValueRef emm0_not = LLVMBuildXor(b, emm2_2, inv, "emm0_not"); + LLVMValueRef pi32_4 = lp_build_const_v4si(4); + LLVMValueRef emm0_and = LLVMBuildAnd(b, emm0_not, pi32_4, "emm0_and"); + + /* + * emm2 = _mm_slli_epi32(emm0, 29); + */ + LLVMValueRef const_29 = lp_build_const_v4si(29); + LLVMValueRef sign_bit = LLVMBuildShl(b, emm0_and, const_29, "sign_bit"); + + /* + * get the polynom selection mask + * there is one polynom for 0 <= x <= Pi/4 + * and another one for Pi/4<x<=Pi/2 + * Both branches will be computed. + * + * emm2 = _mm_and_si128(emm2, *(v4si*)_pi32_2); + * emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + */ + + LLVMValueRef pi32_2 = lp_build_const_v4si(2); + LLVMValueRef emm2_3 = LLVMBuildAnd(b, emm2_2, pi32_2, "emm2_3"); + LLVMValueRef poly_mask = lp_build_compare(b, int_type, PIPE_FUNC_EQUAL, + emm2_3, lp_build_const_v4si(0)); + + /* + * _PS_CONST(minus_cephes_DP1, -0.78515625); + * _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); + * _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8); + */ + LLVMValueRef DP1 = lp_build_const_v4sf(-0.78515625); + LLVMValueRef DP2 = lp_build_const_v4sf(-2.4187564849853515625e-4); + LLVMValueRef DP3 = lp_build_const_v4sf(-3.77489497744594108e-8); + + /* + * The magic pass: "Extended precision modular arithmetic" + * x = ((x - y * DP1) - y * DP2) - y * DP3; + * xmm1 = _mm_mul_ps(y, xmm1); + * xmm2 = _mm_mul_ps(y, xmm2); + * xmm3 = _mm_mul_ps(y, xmm3); + */ + LLVMValueRef xmm1 = LLVMBuildFMul(b, y_2, DP1, "xmm1"); + LLVMValueRef xmm2 = LLVMBuildFMul(b, y_2, DP2, "xmm2"); + LLVMValueRef xmm3 = LLVMBuildFMul(b, y_2, DP3, "xmm3"); + + /* + * x = _mm_add_ps(x, xmm1); + * x = _mm_add_ps(x, xmm2); + * x = _mm_add_ps(x, xmm3); + */ + + LLVMValueRef x_1 = LLVMBuildFAdd(b, x_abs, xmm1, "x_1"); + LLVMValueRef x_2 = LLVMBuildFAdd(b, x_1, xmm2, "x_2"); + LLVMValueRef x_3 = LLVMBuildFAdd(b, x_2, xmm3, "x_3"); + + /* + * Evaluate the first polynom (0 <= x <= Pi/4) + * + * z = _mm_mul_ps(x,x); + */ + LLVMValueRef z = LLVMBuildFMul(b, x_3, x_3, "z"); + + /* + * _PS_CONST(coscof_p0, 2.443315711809948E-005); + * _PS_CONST(coscof_p1, -1.388731625493765E-003); + * _PS_CONST(coscof_p2, 4.166664568298827E-002); + */ + LLVMValueRef coscof_p0 = lp_build_const_v4sf(2.443315711809948E-005); + LLVMValueRef coscof_p1 = lp_build_const_v4sf(-1.388731625493765E-003); + LLVMValueRef coscof_p2 = lp_build_const_v4sf(4.166664568298827E-002); + + /* + * y = *(v4sf*)_ps_coscof_p0; + * y = _mm_mul_ps(y, z); + */ + LLVMValueRef y_3 = LLVMBuildFMul(b, z, coscof_p0, "y_3"); + LLVMValueRef y_4 = LLVMBuildFAdd(b, y_3, coscof_p1, "y_4"); + LLVMValueRef y_5 = LLVMBuildFMul(b, y_4, z, "y_5"); + LLVMValueRef y_6 = LLVMBuildFAdd(b, y_5, coscof_p2, "y_6"); + LLVMValueRef y_7 = LLVMBuildFMul(b, y_6, z, "y_7"); + LLVMValueRef y_8 = LLVMBuildFMul(b, y_7, z, "y_8"); + + + /* + * tmp = _mm_mul_ps(z, *(v4sf*)_ps_0p5); + * y = _mm_sub_ps(y, tmp); + * y = _mm_add_ps(y, *(v4sf*)_ps_1); + */ + LLVMValueRef half = lp_build_const_v4sf(0.5); + LLVMValueRef tmp = LLVMBuildFMul(b, z, half, "tmp"); + LLVMValueRef y_9 = LLVMBuildFSub(b, y_8, tmp, "y_8"); + LLVMValueRef one = lp_build_const_v4sf(1.0); + LLVMValueRef y_10 = LLVMBuildFAdd(b, y_9, one, "y_9"); + + /* + * _PS_CONST(sincof_p0, -1.9515295891E-4); + * _PS_CONST(sincof_p1, 8.3321608736E-3); + * _PS_CONST(sincof_p2, -1.6666654611E-1); + */ + LLVMValueRef sincof_p0 = lp_build_const_v4sf(-1.9515295891E-4); + LLVMValueRef sincof_p1 = lp_build_const_v4sf(8.3321608736E-3); + LLVMValueRef sincof_p2 = lp_build_const_v4sf(-1.6666654611E-1); + + /* + * Evaluate the second polynom (Pi/4 <= x <= 0) + * + * y2 = *(v4sf*)_ps_sincof_p0; + * y2 = _mm_mul_ps(y2, z); + * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p1); + * y2 = _mm_mul_ps(y2, z); + * y2 = _mm_add_ps(y2, *(v4sf*)_ps_sincof_p2); + * y2 = _mm_mul_ps(y2, z); + * y2 = _mm_mul_ps(y2, x); + * y2 = _mm_add_ps(y2, x); + */ + + LLVMValueRef y2_3 = LLVMBuildFMul(b, z, sincof_p0, "y2_3"); + LLVMValueRef y2_4 = LLVMBuildFAdd(b, y2_3, sincof_p1, "y2_4"); + LLVMValueRef y2_5 = LLVMBuildFMul(b, y2_4, z, "y2_5"); + LLVMValueRef y2_6 = LLVMBuildFAdd(b, y2_5, sincof_p2, "y2_6"); + LLVMValueRef y2_7 = LLVMBuildFMul(b, y2_6, z, "y2_7"); + LLVMValueRef y2_8 = LLVMBuildFMul(b, y2_7, x_3, "y2_8"); + LLVMValueRef y2_9 = LLVMBuildFAdd(b, y2_8, x_3, "y2_9"); + + /* + * select the correct result from the two polynoms + * xmm3 = poly_mask; + * y2 = _mm_and_ps(xmm3, y2); //, xmm3); + * y = _mm_andnot_ps(xmm3, y); + * y = _mm_add_ps(y,y2); + */ + LLVMValueRef y2_i = LLVMBuildBitCast(b, y2_9, v4si, "y2_i"); + LLVMValueRef y_i = LLVMBuildBitCast(b, y_10, v4si, "y_i"); + LLVMValueRef y2_and = LLVMBuildAnd(b, y2_i, poly_mask, "y2_and"); + LLVMValueRef poly_mask_inv = LLVMBuildXor(b, poly_mask, inv, "poly_mask_inv"); + LLVMValueRef y_and = LLVMBuildAnd(b, y_i, poly_mask_inv, "y_and"); + LLVMValueRef y_combine = LLVMBuildAdd(b, y_and, y2_and, "y_combine"); + + /* + * update the sign + * y = _mm_xor_ps(y, sign_bit); + */ + LLVMValueRef y_sign = LLVMBuildXor(b, y_combine, sign_bit, "y_sin"); + LLVMValueRef y_result = LLVMBuildBitCast(b, y_sign, v4sf, "y_result"); + return y_result; } @@ -1270,9 +2020,11 @@ lp_build_pow(struct lp_build_context *bld, LLVMValueRef y) { /* TODO: optimize the constant case */ - if(LLVMIsConstant(x) && LLVMIsConstant(y)) + if (gallivm_debug & GALLIVM_DEBUG_PERF && + LLVMIsConstant(x) && LLVMIsConstant(y)) { debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__); + } return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y)); } @@ -1288,6 +2040,8 @@ lp_build_exp(struct lp_build_context *bld, /* log2(e) = 1/log(2) */ LLVMValueRef log2e = lp_build_const_vec(bld->type, 1.4426950408889634); + assert(lp_check_value(bld->type, x)); + return lp_build_mul(bld, log2e, lp_build_exp2(bld, x)); } @@ -1302,14 +2056,12 @@ lp_build_log(struct lp_build_context *bld, /* log(2) */ LLVMValueRef log2 = lp_build_const_vec(bld->type, 0.69314718055994529); + assert(lp_check_value(bld->type, x)); + return lp_build_mul(bld, log2, lp_build_exp2(bld, x)); } -#define EXP_POLY_DEGREE 3 -#define LOG_POLY_DEGREE 5 - - /** * Generate polynomial. * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2]. @@ -1321,22 +2073,22 @@ lp_build_polynomial(struct lp_build_context *bld, unsigned num_coeffs) { const struct lp_type type = bld->type; - LLVMTypeRef float_type = LLVMFloatType(); LLVMValueRef res = NULL; unsigned i; + assert(lp_check_value(bld->type, x)); + /* TODO: optimize the constant case */ - if(LLVMIsConstant(x)) + if (gallivm_debug & GALLIVM_DEBUG_PERF && + LLVMIsConstant(x)) { debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__); + } for (i = num_coeffs; i--; ) { LLVMValueRef coeff; - if (type.length == 1) - coeff = LLVMConstReal(float_type, coeffs[i]); - else - coeff = lp_build_const_vec(type, coeffs[i]); + coeff = lp_build_const_vec(type, coeffs[i]); if(res) res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res)); @@ -1352,17 +2104,31 @@ lp_build_polynomial(struct lp_build_context *bld, /** - * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[ + * Minimax polynomial fit of 2**x, in range [0, 1[ */ const double lp_build_exp2_polynomial[] = { #if EXP_POLY_DEGREE == 5 - 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3 + 0.999999999690134838155, + 0.583974334321735217258, + 0.164553105719676828492, + 0.0292811063701710962255, + 0.00354944426657875141846, + 0.000296253726543423377365 #elif EXP_POLY_DEGREE == 4 - 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2 + 1.00000001502262084505, + 0.563586057338685991394, + 0.150436017652442413623, + 0.0243220604213317927308, + 0.0025359088446580436489 #elif EXP_POLY_DEGREE == 3 - 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2 + 0.999925218562710312959, + 0.695833540494823811697, + 0.226067155427249155588, + 0.0780245226406372992967 #elif EXP_POLY_DEGREE == 2 - 1.0017247, 6.5763628e-1, 3.3718944e-1 + 1.00172476321474503578, + 0.657636275736077639316, + 0.33718943461968720704 #else #error #endif @@ -1385,28 +2151,31 @@ lp_build_exp2_approx(struct lp_build_context *bld, LLVMValueRef expfpart = NULL; LLVMValueRef res = NULL; + assert(lp_check_value(bld->type, x)); + if(p_exp2_int_part || p_frac_part || p_exp2) { /* TODO: optimize the constant case */ - if(LLVMIsConstant(x)) + if (gallivm_debug & GALLIVM_DEBUG_PERF && + LLVMIsConstant(x)) { debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__); + } assert(type.floating && type.width == 32); x = lp_build_min(bld, x, lp_build_const_vec(type, 129.0)); x = lp_build_max(bld, x, lp_build_const_vec(type, -126.99999)); - /* ipart = int(x - 0.5) */ - ipart = LLVMBuildSub(bld->builder, x, lp_build_const_vec(type, 0.5f), ""); - ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, ""); + /* ipart = floor(x) */ + ipart = lp_build_floor(bld, x); /* fpart = x - ipart */ - fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, ""); - fpart = LLVMBuildSub(bld->builder, x, fpart, ""); + fpart = LLVMBuildFSub(bld->builder, x, ipart, ""); } if(p_exp2_int_part || p_exp2) { /* expipart = (float) (1 << ipart) */ + ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, ""); expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_const_int_vec(type, 127), ""); expipart = LLVMBuildShl(bld->builder, expipart, lp_build_const_int_vec(type, 23), ""); expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, ""); @@ -1416,7 +2185,7 @@ lp_build_exp2_approx(struct lp_build_context *bld, expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial, Elements(lp_build_exp2_polynomial)); - res = LLVMBuildMul(bld->builder, expipart, expfpart, ""); + res = LLVMBuildFMul(bld->builder, expipart, expfpart, ""); } if(p_exp2_int_part) @@ -1447,13 +2216,27 @@ lp_build_exp2(struct lp_build_context *bld, */ const double lp_build_log2_polynomial[] = { #if LOG_POLY_DEGREE == 6 - 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313 + 3.11578814719469302614, + -3.32419399085241980044, + 2.59883907202499966007, + -1.23152682416275988241, + 0.318212422185251071475, + -0.0344359067839062357313 #elif LOG_POLY_DEGREE == 5 - 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533 + 2.8882704548164776201, + -2.52074962577807006663, + 1.48116647521213171641, + -0.465725644288844778798, + 0.0596515482674574969533 #elif LOG_POLY_DEGREE == 4 - 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454 + 2.61761038894603480148, + -1.75647175389045657003, + 0.688243882994381274313, + -0.107254423828329604454 #elif LOG_POLY_DEGREE == 3 - 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516 + 2.28330284476918490682, + -1.04913055217340124191, + 0.204446009836232697516 #else #error #endif @@ -1485,11 +2268,15 @@ lp_build_log2_approx(struct lp_build_context *bld, LLVMValueRef logmant = NULL; LLVMValueRef res = NULL; + assert(lp_check_value(bld->type, x)); + if(p_exp || p_floor_log2 || p_log2) { /* TODO: optimize the constant case */ - if(LLVMIsConstant(x)) + if (gallivm_debug & GALLIVM_DEBUG_PERF && + LLVMIsConstant(x)) { debug_printf("%s: inefficient/imprecise constant arithmetic\n", __FUNCTION__); + } assert(type.floating && type.width == 32); @@ -1515,9 +2302,9 @@ lp_build_log2_approx(struct lp_build_context *bld, Elements(lp_build_log2_polynomial)); /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ - logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), ""); + logmant = LLVMBuildFMul(bld->builder, logmant, LLVMBuildFSub(bld->builder, mant, bld->one, ""), ""); - res = LLVMBuildAdd(bld->builder, logmant, logexp, ""); + res = LLVMBuildFAdd(bld->builder, logmant, logexp, ""); } if(p_exp) { @@ -1533,89 +2320,90 @@ lp_build_log2_approx(struct lp_build_context *bld, } -/** scalar version of above function */ -static void -lp_build_float_log2_approx(struct lp_build_context *bld, - LLVMValueRef x, - LLVMValueRef *p_exp, - LLVMValueRef *p_floor_log2, - LLVMValueRef *p_log2) +LLVMValueRef +lp_build_log2(struct lp_build_context *bld, + LLVMValueRef x) +{ + LLVMValueRef res; + lp_build_log2_approx(bld, x, NULL, NULL, &res); + return res; +} + + +/** + * Faster (and less accurate) log2. + * + * log2(x) = floor(log2(x)) + frac(x) + * + * See http://www.flipcode.com/archives/Fast_log_Function.shtml + */ +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef x) { const struct lp_type type = bld->type; - LLVMTypeRef float_type = LLVMFloatType(); - LLVMTypeRef int_type = LLVMIntType(type.width); + LLVMTypeRef vec_type = bld->vec_type; + LLVMTypeRef int_vec_type = bld->int_vec_type; - LLVMValueRef expmask = LLVMConstInt(int_type, 0x7f800000, 0); - LLVMValueRef mantmask = LLVMConstInt(int_type, 0x007fffff, 0); - LLVMValueRef one = LLVMConstBitCast(bld->one, int_type); + unsigned mantissa = lp_mantissa(type); + LLVMValueRef mantmask = lp_build_const_int_vec(type, (1ULL << mantissa) - 1); + LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type); - LLVMValueRef i = NULL; - LLVMValueRef exp = NULL; - LLVMValueRef mant = NULL; - LLVMValueRef logexp = NULL; - LLVMValueRef logmant = NULL; - LLVMValueRef res = NULL; + LLVMValueRef ipart; + LLVMValueRef fpart; - if(p_exp || p_floor_log2 || p_log2) { - /* TODO: optimize the constant case */ - if(LLVMIsConstant(x)) - debug_printf("%s: inefficient/imprecise constant arithmetic\n", - __FUNCTION__); + assert(lp_check_value(bld->type, x)); - assert(type.floating && type.width == 32); + assert(type.floating); - i = LLVMBuildBitCast(bld->builder, x, int_type, ""); + x = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); - /* exp = (float) exponent(x) */ - exp = LLVMBuildAnd(bld->builder, i, expmask, ""); - } + /* ipart = floor(log2(x)) - 1 */ + ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), ""); + ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 128), ""); + ipart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, ""); - if(p_floor_log2 || p_log2) { - LLVMValueRef c23 = LLVMConstInt(int_type, 23, 0); - LLVMValueRef c127 = LLVMConstInt(int_type, 127, 0); - logexp = LLVMBuildLShr(bld->builder, exp, c23, ""); - logexp = LLVMBuildSub(bld->builder, logexp, c127, ""); - logexp = LLVMBuildSIToFP(bld->builder, logexp, float_type, ""); - } + /* fpart = 1.0 + frac(x) */ + fpart = LLVMBuildAnd(bld->builder, x, mantmask, ""); + fpart = LLVMBuildOr(bld->builder, fpart, one, ""); + fpart = LLVMBuildBitCast(bld->builder, fpart, vec_type, ""); - if(p_log2) { - /* mant = (float) mantissa(x) */ - mant = LLVMBuildAnd(bld->builder, i, mantmask, ""); - mant = LLVMBuildOr(bld->builder, mant, one, ""); - mant = LLVMBuildBitCast(bld->builder, mant, float_type, ""); + /* floor(log2(x)) + frac(x) */ + return LLVMBuildFAdd(bld->builder, ipart, fpart, ""); +} - logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial, - Elements(lp_build_log2_polynomial)); - /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ - logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildSub(bld->builder, mant, bld->one, ""), ""); +/** + * Fast implementation of iround(log2(x)). + * + * Not an approximation -- it should give accurate results all the time. + */ +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x) +{ + const struct lp_type type = bld->type; + LLVMTypeRef int_vec_type = bld->int_vec_type; - res = LLVMBuildAdd(bld->builder, logmant, logexp, ""); - } + unsigned mantissa = lp_mantissa(type); + LLVMValueRef sqrt2 = lp_build_const_vec(type, 1.4142135623730951); - if(p_exp) { - exp = LLVMBuildBitCast(bld->builder, exp, float_type, ""); - *p_exp = exp; - } + LLVMValueRef ipart; - if(p_floor_log2) - *p_floor_log2 = logexp; + assert(lp_check_value(bld->type, x)); - if(p_log2) - *p_log2 = res; -} + assert(type.floating); + /* x * 2^(0.5) i.e., add 0.5 to the log2(x) */ + x = LLVMBuildFMul(bld->builder, x, sqrt2, ""); -LLVMValueRef -lp_build_log2(struct lp_build_context *bld, - LLVMValueRef x) -{ - LLVMValueRef res; - if (bld->type.length == 1) { - lp_build_float_log2_approx(bld, x, NULL, NULL, &res); - } - else { - lp_build_log2_approx(bld, x, NULL, NULL, &res); - } - return res; + x = LLVMBuildBitCast(bld->builder, x, int_vec_type, ""); + + /* ipart = floor(log2(x) + 0.5) */ + ipart = LLVMBuildLShr(bld->builder, x, lp_build_const_int_vec(type, mantissa), ""); + ipart = LLVMBuildAnd(bld->builder, ipart, lp_build_const_int_vec(type, 255), ""); + ipart = LLVMBuildSub(bld->builder, ipart, lp_build_const_int_vec(type, 127), ""); + + return ipart; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h b/src/gallium/auxiliary/gallivm/lp_bld_arit.h index 31efa9921ce..8424384f8f7 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h @@ -171,6 +171,12 @@ LLVMValueRef lp_build_itrunc(struct lp_build_context *bld, LLVMValueRef a); +void +lp_build_ifloor_fract(struct lp_build_context *bld, + LLVMValueRef a, + LLVMValueRef *out_ipart, + LLVMValueRef *out_fpart); + LLVMValueRef lp_build_sqrt(struct lp_build_context *bld, LLVMValueRef a); @@ -212,6 +218,15 @@ LLVMValueRef lp_build_log2(struct lp_build_context *bld, LLVMValueRef a); +LLVMValueRef +lp_build_fast_log2(struct lp_build_context *bld, + LLVMValueRef a); + +LLVMValueRef +lp_build_ilog2(struct lp_build_context *bld, + LLVMValueRef x); + + void lp_build_exp2_approx(struct lp_build_context *bld, LLVMValueRef x, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_assert.c b/src/gallium/auxiliary/gallivm/lp_bld_assert.c new file mode 100644 index 00000000000..f2ebd868a8d --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_assert.c @@ -0,0 +1,101 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_debug.h" +#include "util/u_memory.h" +#include "lp_bld_assert.h" +#include "lp_bld_init.h" +#include "lp_bld_printf.h" + + +/** + * A call to lp_build_assert() will build a function call to this function. + */ +static void +lp_assert(int condition, const char *msg) +{ + if (!condition) { + debug_printf("LLVM assertion '%s' failed!\n", msg); + assert(condition); + } +} + + + +/** + * lp_build_assert. + * + * Build an assertion in LLVM IR by building a function call to the + * lp_assert() function above. + * + * \param condition should be an 'i1' or 'i32' value + * \param msg a string to print if the assertion fails. + */ +LLVMValueRef +lp_build_assert(LLVMBuilderRef builder, LLVMValueRef condition, + const char *msg) +{ + LLVMModuleRef module; + LLVMTypeRef arg_types[2]; + LLVMValueRef msg_string, assert_func, params[2], r; + + module = LLVMGetGlobalParent(LLVMGetBasicBlockParent( + LLVMGetInsertBlock(builder))); + + msg_string = lp_build_const_string_variable(module, msg, strlen(msg) + 1); + + arg_types[0] = LLVMInt32Type(); + arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0); + + /* lookup the lp_assert function */ + assert_func = LLVMGetNamedFunction(module, "lp_assert"); + + /* Create the assertion function if not found */ + if (!assert_func) { + LLVMTypeRef func_type = + LLVMFunctionType(LLVMVoidType(), arg_types, 2, 0); + + assert_func = LLVMAddFunction(module, "lp_assert", func_type); + LLVMSetFunctionCallConv(assert_func, LLVMCCallConv); + LLVMSetLinkage(assert_func, LLVMExternalLinkage); + LLVMAddGlobalMapping(lp_build_engine, assert_func, + func_to_pointer((func_pointer)lp_assert)); + } + assert(assert_func); + + /* build function call param list */ + params[0] = LLVMBuildZExt(builder, condition, arg_types[0], ""); + params[1] = LLVMBuildBitCast(builder, msg_string, arg_types[1], ""); + + /* check arg types */ + assert(LLVMTypeOf(params[0]) == arg_types[0]); + assert(LLVMTypeOf(params[1]) == arg_types[1]); + + r = LLVMBuildCall(builder, assert_func, params, 2, ""); + + return r; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_assert.h b/src/gallium/auxiliary/gallivm/lp_bld_assert.h new file mode 100644 index 00000000000..ddd879dc2c6 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_assert.h @@ -0,0 +1,41 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef LP_BLD_ASSERT_H +#define LP_BLD_ASSERT_H + + +#include "lp_bld.h" + + +LLVMValueRef +lp_build_assert(LLVMBuilderRef builder, LLVMValueRef condition, + const char *msg); + + +#endif + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c new file mode 100644 index 00000000000..706479b4d56 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.c @@ -0,0 +1,187 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#include "util/u_debug.h" + +#include "lp_bld_type.h" +#include "lp_bld_debug.h" +#include "lp_bld_const.h" +#include "lp_bld_bitarit.h" + + +/** + * Return (a | b) + */ +LLVMValueRef +lp_build_or(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + /* can't do bitwise ops on floating-point values */ + if (type.floating) { + a = LLVMBuildBitCast(bld->builder, a, bld->int_vec_type, ""); + b = LLVMBuildBitCast(bld->builder, b, bld->int_vec_type, ""); + } + + res = LLVMBuildOr(bld->builder, a, b, ""); + + if (type.floating) { + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + } + + return res; +} + + +/** + * Return (a & b) + */ +LLVMValueRef +lp_build_and(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + /* can't do bitwise ops on floating-point values */ + if (type.floating) { + a = LLVMBuildBitCast(bld->builder, a, bld->int_vec_type, ""); + b = LLVMBuildBitCast(bld->builder, b, bld->int_vec_type, ""); + } + + res = LLVMBuildAnd(bld->builder, a, b, ""); + + if (type.floating) { + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + } + + return res; +} + + +/** + * Return (a & ~b) + */ +LLVMValueRef +lp_build_andnot(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + /* can't do bitwise ops on floating-point values */ + if (type.floating) { + a = LLVMBuildBitCast(bld->builder, a, bld->int_vec_type, ""); + b = LLVMBuildBitCast(bld->builder, b, bld->int_vec_type, ""); + } + + res = LLVMBuildNot(bld->builder, b, ""); + res = LLVMBuildAnd(bld->builder, a, res, ""); + + if (type.floating) { + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); + } + + return res; +} + + +/** + * Shift left. + */ +LLVMValueRef +lp_build_shl(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(!type.floating); + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + res = LLVMBuildShl(bld->builder, a, b, ""); + + return res; +} + + +/** + * Shift right. + */ +LLVMValueRef +lp_build_shr(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) +{ + const struct lp_type type = bld->type; + LLVMValueRef res; + + assert(!type.floating); + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + if (type.sign) { + res = LLVMBuildAShr(bld->builder, a, b, ""); + } else { + res = LLVMBuildLShr(bld->builder, a, b, ""); + } + + return res; +} + + +/** + * Shift left with immediate. + */ +LLVMValueRef +lp_build_shl_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm) +{ + LLVMValueRef b = lp_build_const_int_vec(bld->type, imm); + assert(imm <= bld->type.width); + return lp_build_shl(bld, a, b); +} + + +/** + * Shift right with immediate. + */ +LLVMValueRef +lp_build_shr_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm) +{ + LLVMValueRef b = lp_build_const_int_vec(bld->type, imm); + assert(imm <= bld->type.width); + return lp_build_shr(bld, a, b); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h new file mode 100644 index 00000000000..5c5b9818519 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_bitarit.h @@ -0,0 +1,69 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Helper bitwise arithmetic functions. + * + * @author Jose Fonseca <[email protected]> + */ + + +#ifndef LP_BLD_BITARIT_H +#define LP_BLD_BITARIT_H + + +#include "gallivm/lp_bld.h" + + +struct lp_type; +struct lp_build_context; + + +LLVMValueRef +lp_build_or(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_and(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_andnot(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_shl(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_shr(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + +LLVMValueRef +lp_build_shl_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm); + +LLVMValueRef +lp_build_shr_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm); + + +#endif /* !LP_BLD_ARIT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.c b/src/gallium/auxiliary/gallivm/lp_bld_const.c index 57843e9a60c..dd839c0bea5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_const.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_const.c @@ -280,31 +280,45 @@ lp_build_one(struct lp_type type) /** - * Build constant-valued vector from a scalar value. + * Build constant-valued element from a scalar value. */ LLVMValueRef -lp_build_const_vec(struct lp_type type, - double val) +lp_build_const_elem(struct lp_type type, + double val) { LLVMTypeRef elem_type = lp_build_elem_type(type); - LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; - unsigned i; - - assert(type.length <= LP_MAX_VECTOR_LENGTH); + LLVMValueRef elem; if(type.floating) { - elems[0] = LLVMConstReal(elem_type, val); + elem = LLVMConstReal(elem_type, val); } else { double dscale = lp_const_scale(type); - elems[0] = LLVMConstInt(elem_type, val*dscale + 0.5, 0); + elem = LLVMConstInt(elem_type, val*dscale + 0.5, 0); } - for(i = 1; i < type.length; ++i) - elems[i] = elems[0]; + return elem; +} - return LLVMConstVector(elems, type.length); + +/** + * Build constant-valued vector from a scalar value. + */ +LLVMValueRef +lp_build_const_vec(struct lp_type type, + double val) +{ + if (type.length == 1) { + return lp_build_const_elem(type, val); + } else { + LLVMValueRef elems[LP_MAX_VECTOR_LENGTH]; + unsigned i; + elems[0] = lp_build_const_elem(type, val); + for(i = 1; i < type.length; ++i) + elems[i] = elems[0]; + return LLVMConstVector(elems, type.length); + } } @@ -321,6 +335,9 @@ lp_build_const_int_vec(struct lp_type type, for(i = 0; i < type.length; ++i) elems[i] = LLVMConstInt(elem_type, val, type.sign ? 1 : 0); + if (type.length == 1) + return elems[0]; + return LLVMConstVector(elems, type.length); } @@ -365,9 +382,12 @@ lp_build_const_aos(struct lp_type type, } +/** + * @param mask TGSI_WRITEMASK_xxx + */ LLVMValueRef lp_build_const_mask_aos(struct lp_type type, - const boolean cond[4]) + unsigned mask) { LLVMTypeRef elem_type = LLVMIntType(type.width); LLVMValueRef masks[LP_MAX_VECTOR_LENGTH]; @@ -375,9 +395,13 @@ lp_build_const_mask_aos(struct lp_type type, assert(type.length <= LP_MAX_VECTOR_LENGTH); - for(j = 0; j < type.length; j += 4) - for(i = 0; i < 4; ++i) - masks[j + i] = LLVMConstInt(elem_type, cond[i] ? ~0 : 0, 0); + for (j = 0; j < type.length; j += 4) { + for( i = 0; i < 4; ++i) { + masks[j + i] = LLVMConstInt(elem_type, + mask & (1 << i) ? ~0ULL : 0, + 1); + } + } return LLVMConstVector(masks, type.length); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_const.h b/src/gallium/auxiliary/gallivm/lp_bld_const.h index 9ca2f0664eb..6b1fc590c17 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_const.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_const.h @@ -85,6 +85,10 @@ lp_build_one(struct lp_type type); LLVMValueRef +lp_build_const_elem(struct lp_type type, + double val); + +LLVMValueRef lp_build_const_vec(struct lp_type type, double val); @@ -100,7 +104,15 @@ lp_build_const_aos(struct lp_type type, LLVMValueRef lp_build_const_mask_aos(struct lp_type type, - const boolean cond[4]); + unsigned mask); + + +static INLINE LLVMValueRef +lp_build_const_int32(int i) +{ + return LLVMConstInt(LLVMInt32Type(), i, 0); +} + #endif /* !LP_BLD_CONST_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c index 3f7f2ebde9c..127b13bc286 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c @@ -63,6 +63,7 @@ #include "util/u_debug.h" #include "util/u_math.h" +#include "util/u_cpu_detect.h" #include "lp_bld_type.h" #include "lp_bld_const.h" @@ -83,6 +84,9 @@ * * Although the result values can be scaled to an arbitrary bit width specified * by dst_width, the actual result type will have the same width. + * + * Ex: src = { float, float, float, float } + * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1]. */ LLVMValueRef lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, @@ -114,8 +118,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, scale = (double)mask/ubound; bias = (double)((unsigned long long)1 << (mantissa - n)); - res = LLVMBuildMul(builder, src, lp_build_const_vec(src_type, scale), ""); - res = LLVMBuildAdd(builder, res, lp_build_const_vec(src_type, bias), ""); + res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), ""); + res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), ""); res = LLVMBuildBitCast(builder, res, int_vec_type, ""); if(dst_width > n) { @@ -152,6 +156,8 @@ lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder, /** * Inverse of lp_build_clamped_float_to_unsigned_norm above. + * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1] + * return {float, float, float, float} with values in range [0, 1]. */ LLVMValueRef lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, @@ -170,6 +176,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, double scale; double bias; + assert(dst_type.floating); + mantissa = lp_mantissa(dst_type); n = MIN2(mantissa, src_width); @@ -194,8 +202,8 @@ lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, res = LLVMBuildBitCast(builder, res, vec_type, ""); - res = LLVMBuildSub(builder, res, bias_, ""); - res = LLVMBuildMul(builder, res, lp_build_const_vec(dst_type, scale), ""); + res = LLVMBuildFSub(builder, res, bias_, ""); + res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), ""); return res; } @@ -219,20 +227,116 @@ lp_build_conv(LLVMBuilderRef builder, unsigned num_tmps; unsigned i; - /* Register width must remain constant */ - assert(src_type.width * src_type.length == dst_type.width * dst_type.length); - /* We must not loose or gain channels. Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); assert(src_type.length <= LP_MAX_VECTOR_LENGTH); assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); + assert(num_srcs <= LP_MAX_VECTOR_LENGTH); + assert(num_dsts <= LP_MAX_VECTOR_LENGTH); tmp_type = src_type; - for(i = 0; i < num_srcs; ++i) + for(i = 0; i < num_srcs; ++i) { + assert(lp_check_value(src_type, src[i])); tmp[i] = src[i]; + } num_tmps = num_srcs; + + /* Special case 4x4f --> 1x16ub + */ + if (src_type.floating == 1 && + src_type.fixed == 0 && + src_type.sign == 1 && + src_type.norm == 0 && + src_type.width == 32 && + src_type.length == 4 && + + dst_type.floating == 0 && + dst_type.fixed == 0 && + dst_type.sign == 0 && + dst_type.norm == 1 && + dst_type.width == 8 && + dst_type.length == 16) + { + int i; + + for (i = 0; i < num_dsts; i++, src += 4) { + struct lp_type int16_type = dst_type; + struct lp_type int32_type = dst_type; + LLVMValueRef lo, hi; + LLVMValueRef src_int0; + LLVMValueRef src_int1; + LLVMValueRef src_int2; + LLVMValueRef src_int3; + LLVMTypeRef int16_vec_type; + LLVMTypeRef int32_vec_type; + LLVMTypeRef src_vec_type; + LLVMTypeRef dst_vec_type; + LLVMValueRef const_255f; + LLVMValueRef a, b, c, d; + + int16_type.width *= 2; + int16_type.length /= 2; + int16_type.sign = 1; + + int32_type.width *= 4; + int32_type.length /= 4; + int32_type.sign = 1; + + src_vec_type = lp_build_vec_type(src_type); + dst_vec_type = lp_build_vec_type(dst_type); + int16_vec_type = lp_build_vec_type(int16_type); + int32_vec_type = lp_build_vec_type(int32_type); + + const_255f = lp_build_const_vec(src_type, 255.0f); + + a = LLVMBuildFMul(builder, src[0], const_255f, ""); + b = LLVMBuildFMul(builder, src[1], const_255f, ""); + c = LLVMBuildFMul(builder, src[2], const_255f, ""); + d = LLVMBuildFMul(builder, src[3], const_255f, ""); + + /* lp_build_round generates excessively general code without + * sse4, so do rounding manually. + */ + if (!util_cpu_caps.has_sse4_1) { + LLVMValueRef const_half = lp_build_const_vec(src_type, 0.5f); + + a = LLVMBuildFAdd(builder, a, const_half, ""); + b = LLVMBuildFAdd(builder, b, const_half, ""); + c = LLVMBuildFAdd(builder, c, const_half, ""); + d = LLVMBuildFAdd(builder, d, const_half, ""); + + src_int0 = LLVMBuildFPToSI(builder, a, int32_vec_type, ""); + src_int1 = LLVMBuildFPToSI(builder, b, int32_vec_type, ""); + src_int2 = LLVMBuildFPToSI(builder, c, int32_vec_type, ""); + src_int3 = LLVMBuildFPToSI(builder, d, int32_vec_type, ""); + } + else { + struct lp_build_context bld; + + bld.builder = builder; + bld.type = src_type; + bld.vec_type = src_vec_type; + bld.int_elem_type = lp_build_elem_type(int32_type); + bld.int_vec_type = int32_vec_type; + bld.undef = lp_build_undef(src_type); + bld.zero = lp_build_zero(src_type); + bld.one = lp_build_one(src_type); + + src_int0 = lp_build_iround(&bld, a); + src_int1 = lp_build_iround(&bld, b); + src_int2 = lp_build_iround(&bld, c); + src_int3 = lp_build_iround(&bld, d); + } + + lo = lp_build_pack2(builder, int32_type, int16_type, src_int0, src_int1); + hi = lp_build_pack2(builder, int32_type, int16_type, src_int2, src_int3); + dst[i] = lp_build_pack2(builder, int16_type, dst_type, lo, hi); + } + return; + } + /* * Clamp if necessary */ @@ -290,7 +394,7 @@ lp_build_conv(LLVMBuilderRef builder, if (dst_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale); for(i = 0; i < num_tmps; ++i) - tmp[i] = LLVMBuildMul(builder, tmp[i], scale, ""); + tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } /* Use an equally sized integer for intermediate computations */ @@ -326,30 +430,25 @@ lp_build_conv(LLVMBuilderRef builder, /* * Truncate or expand bit width + * + * No data conversion should happen here, although the sign bits are + * crucial to avoid bad clamping. */ - assert(!tmp_type.floating || tmp_type.width == dst_type.width); + { + struct lp_type new_type; - if(tmp_type.width > dst_type.width) { - assert(num_dsts == 1); - tmp[0] = lp_build_pack(builder, tmp_type, dst_type, TRUE, tmp, num_tmps); - tmp_type.width = dst_type.width; - tmp_type.length = dst_type.length; - num_tmps = 1; - } + new_type = tmp_type; + new_type.sign = dst_type.sign; + new_type.width = dst_type.width; + new_type.length = dst_type.length; + + lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); - if(tmp_type.width < dst_type.width) { - assert(num_tmps == 1); - lp_build_unpack(builder, tmp_type, dst_type, tmp[0], tmp, num_dsts); - tmp_type.width = dst_type.width; - tmp_type.length = dst_type.length; + tmp_type = new_type; num_tmps = num_dsts; } - assert(tmp_type.width == dst_type.width); - assert(tmp_type.length == dst_type.length); - assert(num_tmps == num_dsts); - /* * Scale to the widest range */ @@ -390,7 +489,7 @@ lp_build_conv(LLVMBuilderRef builder, if (src_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale); for(i = 0; i < num_tmps; ++i) - tmp[i] = LLVMBuildMul(builder, tmp[i], scale, ""); + tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } } } @@ -406,8 +505,10 @@ lp_build_conv(LLVMBuilderRef builder, } } - for(i = 0; i < num_dsts; ++i) + for(i = 0; i < num_dsts; ++i) { dst[i] = tmp[i]; + assert(lp_check_value(dst_type, dst[i])); + } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c index 39dfc51e503..d3a5afff8c2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c @@ -46,7 +46,7 @@ boolean lp_check_alignment(const void *ptr, unsigned alignment) { - assert(util_is_pot(alignment)); + assert(util_is_power_of_two(alignment)); return ((uintptr_t)ptr & (alignment - 1)) == 0; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.h b/src/gallium/auxiliary/gallivm/lp_bld_debug.h index 7b010cbdb09..369c1bbf09a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.h @@ -36,6 +36,20 @@ #include "util/u_string.h" +#define GALLIVM_DEBUG_TGSI 0x1 +#define GALLIVM_DEBUG_IR 0x2 +#define GALLIVM_DEBUG_ASM 0x4 +#define GALLIVM_DEBUG_NO_OPT 0x8 +#define GALLIVM_DEBUG_PERF 0x10 + + +#ifdef DEBUG +extern unsigned gallivm_debug; +#else +#define gallivm_debug 0 +#endif + + static INLINE void lp_build_name(LLVMValueRef val, const char *format, ...) { @@ -53,6 +67,10 @@ lp_build_name(LLVMValueRef val, const char *format, ...) } +void +lp_debug_dump_value(LLVMValueRef value); + + boolean lp_check_alignment(const void *ptr, unsigned alignment); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.c b/src/gallium/auxiliary/gallivm/lp_bld_flow.c index 8f15b1d287d..5bc9c741a88 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.c @@ -38,7 +38,7 @@ #include "lp_bld_flow.h" -#define LP_BUILD_FLOW_MAX_VARIABLES 32 +#define LP_BUILD_FLOW_MAX_VARIABLES 64 #define LP_BUILD_FLOW_MAX_DEPTH 32 /** @@ -407,6 +407,7 @@ lp_build_flow_skip_cond_break(struct lp_build_flow_context *flow, /* for each variable, update the Phi node with a (variable, block) pair */ for(i = 0; i < skip->num_variables; ++i) { assert(*flow->variables[i]); + assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i])); LLVMAddIncoming(skip->phi[i], flow->variables[i], ¤t_block, 1); } @@ -433,6 +434,7 @@ lp_build_flow_skip_end(struct lp_build_flow_context *flow) /* add (variable, block) tuples to the phi nodes */ for(i = 0; i < skip->num_variables; ++i) { assert(*flow->variables[i]); + assert(LLVMTypeOf(skip->phi[i]) == LLVMTypeOf(*flow->variables[i])); LLVMAddIncoming(skip->phi[i], flow->variables[i], ¤t_block, 1); *flow->variables[i] = skip->phi[i]; } @@ -821,8 +823,11 @@ lp_build_alloca(LLVMBuilderRef builder, LLVMBuilderRef first_builder = LLVMCreateBuilder(); LLVMValueRef res; - LLVMPositionBuilderAtEnd(first_builder, first_block); - LLVMPositionBuilderBefore(first_builder, first_instr); + if (first_instr) { + LLVMPositionBuilderBefore(first_builder, first_instr); + } else { + LLVMPositionBuilderAtEnd(first_builder, first_block); + } res = LLVMBuildAlloca(first_builder, type, name); @@ -840,7 +845,7 @@ lp_build_alloca(LLVMBuilderRef builder, * first block may prevent the X86 backend from successfully align the stack as * required. * - * Also the scalarrepl pass is supossedly more powerful and can promote + * Also the scalarrepl pass is supposedly more powerful and can promote * arrays in many cases. * * See also: @@ -859,7 +864,11 @@ lp_build_array_alloca(LLVMBuilderRef builder, LLVMBuilderRef first_builder = LLVMCreateBuilder(); LLVMValueRef res; - LLVMPositionBuilderBefore(first_builder, first_instr); + if (first_instr) { + LLVMPositionBuilderBefore(first_builder, first_instr); + } else { + LLVMPositionBuilderAtEnd(first_builder, first_block); + } res = LLVMBuildArrayAlloca(first_builder, type, count, name); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 085937588ff..60e22d727ad 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -40,6 +40,7 @@ struct util_format_description; struct lp_type; +struct lp_build_context; /* @@ -47,9 +48,9 @@ struct lp_type; */ LLVMValueRef -lp_build_unpack_rgba_aos(LLVMBuilderRef builder, - const struct util_format_description *desc, - LLVMValueRef packed); +lp_build_format_swizzle_aos(const struct util_format_description *desc, + struct lp_build_context *bld, + LLVMValueRef unswizzled); LLVMValueRef lp_build_pack_rgba_aos(LLVMBuilderRef builder, @@ -59,7 +60,9 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, LLVMValueRef lp_build_fetch_rgba_aos(LLVMBuilderRef builder, const struct util_format_description *format_desc, - LLVMValueRef ptr, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j); @@ -70,17 +73,22 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, void lp_build_format_swizzle_soa(const struct util_format_description *format_desc, - struct lp_type type, - const LLVMValueRef *unswizzled, - LLVMValueRef *swizzled); + struct lp_build_context *bld, + const LLVMValueRef unswizzled[4], + LLVMValueRef swizzled_out[4]); void lp_build_unpack_rgba_soa(LLVMBuilderRef builder, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef packed, - LLVMValueRef *rgba); + LLVMValueRef rgba_out[4]); +void +lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder, + struct lp_type dst_type, + LLVMValueRef packed, + LLVMValueRef *rgba); void lp_build_fetch_rgba_soa(LLVMBuilderRef builder, @@ -90,7 +98,20 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, LLVMValueRef offsets, LLVMValueRef i, LLVMValueRef j, - LLVMValueRef *rgba); + LLVMValueRef rgba_out[4]); + +/* + * YUV + */ +LLVMValueRef +lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j); + #endif /* !LP_BLD_FORMAT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 6257e9a4047..6b9189e1da5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -38,33 +38,124 @@ #include "util/u_math.h" #include "util/u_string.h" +#include "lp_bld_arit.h" #include "lp_bld_init.h" #include "lp_bld_type.h" #include "lp_bld_flow.h" +#include "lp_bld_const.h" +#include "lp_bld_conv.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_gather.h" +#include "lp_bld_debug.h" #include "lp_bld_format.h" /** + * Basic swizzling. Rearrange the order of the unswizzled array elements + * according to the format description. PIPE_SWIZZLE_ZERO/ONE are supported + * too. + * Ex: if unswizzled[4] = {B, G, R, x}, then swizzled_out[4] = {R, G, B, 1}. + */ +LLVMValueRef +lp_build_format_swizzle_aos(const struct util_format_description *desc, + struct lp_build_context *bld, + LLVMValueRef unswizzled) +{ + unsigned char swizzles[4]; + unsigned chan; + + assert(bld->type.length % 4 == 0); + + for (chan = 0; chan < 4; ++chan) { + enum util_format_swizzle swizzle; + + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + /* + * For ZS formats do RGBA = ZZZ1 + */ + if (chan == 3) { + swizzle = UTIL_FORMAT_SWIZZLE_1; + } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { + swizzle = UTIL_FORMAT_SWIZZLE_0; + } else { + swizzle = desc->swizzle[0]; + } + } else { + swizzle = desc->swizzle[chan]; + } + swizzles[chan] = swizzle; + } + + return lp_build_swizzle_aos(bld, unswizzled, swizzles); +} + + +/** + * Whether the format matches the vector type, apart of swizzles. + */ +static INLINE boolean +format_matches_type(const struct util_format_description *desc, + struct lp_type type) +{ + enum util_format_type chan_type; + unsigned chan; + + assert(type.length % 4 == 0); + + if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || + desc->colorspace != UTIL_FORMAT_COLORSPACE_RGB || + desc->block.width != 1 || + desc->block.height != 1) { + return FALSE; + } + + if (type.floating) { + chan_type = UTIL_FORMAT_TYPE_FLOAT; + } else if (type.fixed) { + chan_type = UTIL_FORMAT_TYPE_FIXED; + } else if (type.sign) { + chan_type = UTIL_FORMAT_TYPE_SIGNED; + } else { + chan_type = UTIL_FORMAT_TYPE_UNSIGNED; + } + + for (chan = 0; chan < desc->nr_channels; ++chan) { + if (desc->channel[chan].size != type.width) { + return FALSE; + } + + if (desc->channel[chan].type != UTIL_FORMAT_TYPE_VOID) { + if (desc->channel[chan].type != chan_type || + desc->channel[chan].normalized != type.norm) { + return FALSE; + } + } + } + + return TRUE; +} + + +/** * Unpack a single pixel into its RGBA components. * - * @param packed integer. + * @param desc the pixel format for the packed pixel value + * @param packed integer pixel in a format such as PIPE_FORMAT_B8G8R8A8_UNORM * - * @return RGBA in a 4 floats vector. + * @return RGBA in a float[4] or ubyte[4] or ushort[4] vector. */ -LLVMValueRef -lp_build_unpack_rgba_aos(LLVMBuilderRef builder, - const struct util_format_description *desc, - LLVMValueRef packed) +static INLINE LLVMValueRef +lp_build_unpack_arith_rgba_aos(LLVMBuilderRef builder, + const struct util_format_description *desc, + LLVMValueRef packed) { LLVMValueRef shifted, casted, scaled, masked; LLVMValueRef shifts[4]; LLVMValueRef masks[4]; LLVMValueRef scales[4]; - LLVMValueRef swizzles[4]; - LLVMValueRef aux[4]; - bool normalized; - int empty_channel; - bool needs_uitofp; + + boolean normalized; + boolean needs_uitofp; unsigned shift; unsigned i; @@ -76,10 +167,12 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, /* Do the intermediate integer computations with 32bit integers since it * matches floating point size */ - if (desc->block.bits < 32) - packed = LLVMBuildZExt(builder, packed, LLVMInt32Type(), ""); + assert (LLVMTypeOf(packed) == LLVMInt32Type()); - /* Broadcast the packed value to all four channels */ + /* Broadcast the packed value to all four channels + * before: packed = BGRA + * after: packed = {BGRA, BGRA, BGRA, BGRA} + */ packed = LLVMBuildInsertElement(builder, LLVMGetUndef(LLVMVectorType(LLVMInt32Type(), 4)), packed, @@ -94,8 +187,9 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, /* Initialize vector constants */ normalized = FALSE; needs_uitofp = FALSE; - empty_channel = -1; shift = 0; + + /* Loop over 4 color components */ for (i = 0; i < 4; ++i) { unsigned bits = desc->channel[i].size; @@ -103,7 +197,6 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, shifts[i] = LLVMGetUndef(LLVMInt32Type()); masks[i] = LLVMConstNull(LLVMInt32Type()); scales[i] = LLVMConstNull(LLVMFloatType()); - empty_channel = i; } else { unsigned long long mask = (1ULL << bits) - 1; @@ -128,8 +221,13 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, shift += bits; } + /* Ex: convert packed = {BGRA, BGRA, BGRA, BGRA} + * into masked = {B, G, R, A} + */ shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), ""); masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), ""); + + if (!needs_uitofp) { /* UIToFP can't be expressed in SSE2 */ casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), ""); @@ -137,55 +235,17 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder, casted = LLVMBuildUIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), ""); } + /* At this point 'casted' may be a vector of floats such as + * {255.0, 255.0, 255.0, 255.0}. Next, if the pixel values are normalized + * we'll scale this to {1.0, 1.0, 1.0, 1.0}. + */ + if (normalized) - scaled = LLVMBuildMul(builder, casted, LLVMConstVector(scales, 4), ""); + scaled = LLVMBuildFMul(builder, casted, LLVMConstVector(scales, 4), ""); else scaled = casted; - for (i = 0; i < 4; ++i) - aux[i] = LLVMGetUndef(LLVMFloatType()); - - for (i = 0; i < 4; ++i) { - enum util_format_swizzle swizzle; - - if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { - /* - * For ZS formats do RGBA = ZZZ1 - */ - if (i == 3) { - swizzle = UTIL_FORMAT_SWIZZLE_1; - } else if (desc->swizzle[0] == UTIL_FORMAT_SWIZZLE_NONE) { - swizzle = UTIL_FORMAT_SWIZZLE_0; - } else { - swizzle = desc->swizzle[0]; - } - } else { - swizzle = desc->swizzle[i]; - } - - switch (swizzle) { - case UTIL_FORMAT_SWIZZLE_X: - case UTIL_FORMAT_SWIZZLE_Y: - case UTIL_FORMAT_SWIZZLE_Z: - case UTIL_FORMAT_SWIZZLE_W: - swizzles[i] = LLVMConstInt(LLVMInt32Type(), swizzle, 0); - break; - case UTIL_FORMAT_SWIZZLE_0: - assert(empty_channel >= 0); - swizzles[i] = LLVMConstInt(LLVMInt32Type(), empty_channel, 0); - break; - case UTIL_FORMAT_SWIZZLE_1: - swizzles[i] = LLVMConstInt(LLVMInt32Type(), 4, 0); - aux[0] = LLVMConstReal(LLVMFloatType(), 1.0); - break; - case UTIL_FORMAT_SWIZZLE_NONE: - swizzles[i] = LLVMGetUndef(LLVMFloatType()); - assert(0); - break; - } - } - - return LLVMBuildShuffleVector(builder, scaled, LLVMConstVector(aux, 4), LLVMConstVector(swizzles, 4), ""); + return scaled; } @@ -208,7 +268,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, LLVMValueRef shifted, casted, scaled, unswizzled; LLVMValueRef shifts[4]; LLVMValueRef scales[4]; - bool normalized; + boolean normalized; unsigned shift; unsigned i, j; @@ -263,7 +323,7 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, } if (normalized) - scaled = LLVMBuildMul(builder, unswizzled, LLVMConstVector(scales, 4), ""); + scaled = LLVMBuildFMul(builder, unswizzled, LLVMConstVector(scales, 4), ""); else scaled = unswizzled; @@ -292,44 +352,152 @@ lp_build_pack_rgba_aos(LLVMBuilderRef builder, } + + /** * Fetch a pixel into a 4 float AoS. * - * i and j are the sub-block pixel coordinates. + * \param format_desc describes format of the image we're fetching from + * \param ptr address of the pixel block (or the texel if uncompressed) + * \param i, j the sub-block pixel coordinates. For non-compressed formats + * these will always be (0, 0). + * \return a 4 element vector with the pixel's RGBA values. */ LLVMValueRef lp_build_fetch_rgba_aos(LLVMBuilderRef builder, const struct util_format_description *format_desc, - LLVMValueRef ptr, + struct lp_type type, + LLVMValueRef base_ptr, + LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j) { + unsigned num_pixels = type.length / 4; + struct lp_build_context bld; + + assert(type.length <= LP_MAX_VECTOR_LENGTH); + assert(type.length % 4 == 0); + + lp_build_context_init(&bld, builder, type); + + /* + * Trivial case + * + * The format matches the type (apart of a swizzle) so no need for + * scaling or converting. + */ + + if (format_matches_type(format_desc, type) && + format_desc->block.bits <= type.width * 4 && + util_is_power_of_two(format_desc->block.bits)) { + LLVMValueRef packed; + + /* + * The format matches the type (apart of a swizzle) so no need for + * scaling or converting. + */ + + packed = lp_build_gather(builder, type.length/4, + format_desc->block.bits, type.width*4, + base_ptr, offset); + + assert(format_desc->block.bits <= type.width * type.length); + + packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(type), ""); + + return lp_build_format_swizzle_aos(format_desc, &bld, packed); + } + + /* + * Bit arithmetic + */ if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && - util_is_pot(format_desc->block.bits) && + util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && !format_desc->is_mixed && (format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED || - format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) - { - LLVMValueRef packed; + format_desc->channel[1].type == UTIL_FORMAT_TYPE_UNSIGNED)) { + + LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef res; + unsigned k; + + /* + * Unpack a pixel at a time into a <4 x float> RGBA vector + */ + + for (k = 0; k < num_pixels; ++k) { + LLVMValueRef packed; + + packed = lp_build_gather_elem(builder, num_pixels, + format_desc->block.bits, 32, + base_ptr, offset, k); + + tmps[k] = lp_build_unpack_arith_rgba_aos(builder, format_desc, + packed); + } + + /* + * Type conversion. + * + * TODO: We could avoid floating conversion for integer to + * integer conversions. + */ - ptr = LLVMBuildBitCast(builder, ptr, - LLVMPointerType(LLVMIntType(format_desc->block.bits), 0) , - ""); + if (gallivm_debug & GALLIVM_DEBUG_PERF && !type.floating) { + debug_printf("%s: unpacking %s with floating point\n", + __FUNCTION__, format_desc->short_name); + } - packed = LLVMBuildLoad(builder, ptr, "packed"); + lp_build_conv(builder, + lp_float32_vec4_type(), + type, + tmps, num_pixels, &res, 1); - return lp_build_unpack_rgba_aos(builder, format_desc, packed); + return lp_build_format_swizzle_aos(format_desc, &bld, res); } - else if (format_desc->fetch_rgba_float) { + + /* + * YUV / subsampled formats + */ + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = num_pixels * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_subsampled_rgba_aos(builder, + format_desc, + num_pixels, + base_ptr, + offset, + i, j); + + lp_build_conv(builder, + tmp_type, type, + &tmp, 1, &tmp, 1); + + return tmp; + } + + /* + * Fallback to util_format_description::fetch_rgba_8unorm(). + */ + + if (format_desc->fetch_rgba_8unorm && + !type.floating && type.width == 8 && !type.sign && type.norm) { /* - * Fallback to calling util_format_description::fetch_rgba_float. + * Fallback to calling util_format_description::fetch_rgba_8unorm. * * This is definitely not the most efficient way of fetching pixels, as * we miss the opportunity to do vectorization, but this it is a @@ -339,13 +507,125 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); char name[256]; + LLVMTypeRef i8t = LLVMInt8Type(); + LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); + LLVMTypeRef i32t = LLVMInt32Type(); LLVMValueRef function; + LLVMValueRef tmp_ptr; LLVMValueRef tmp; - LLVMValueRef args[4]; + LLVMValueRef res; + unsigned k; + + util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_8unorm", + format_desc->short_name); + + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: falling back to %s\n", __FUNCTION__, name); + } + + /* + * Declare and bind format_desc->fetch_rgba_8unorm(). + */ + + function = LLVMGetNamedFunction(module, name); + if (!function) { + LLVMTypeRef ret_type; + LLVMTypeRef arg_types[4]; + LLVMTypeRef function_type; + + ret_type = LLVMVoidType(); + arg_types[0] = pi8t; + arg_types[1] = pi8t; + arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); + function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); + function = LLVMAddFunction(module, name, function_type); + + LLVMSetFunctionCallConv(function, LLVMCCallConv); + LLVMSetLinkage(function, LLVMExternalLinkage); + + assert(LLVMIsDeclaration(function)); + + LLVMAddGlobalMapping(lp_build_engine, function, + func_to_pointer((func_pointer)format_desc->fetch_rgba_8unorm)); + } + + tmp_ptr = lp_build_alloca(builder, i32t, ""); + + res = LLVMGetUndef(LLVMVectorType(i32t, num_pixels)); + + /* + * Invoke format_desc->fetch_rgba_8unorm() for each pixel and insert the result + * in the SoA vectors. + */ + + for (k = 0; k < num_pixels; ++k) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + LLVMValueRef args[4]; + + args[0] = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); + args[1] = lp_build_gather_elem_ptr(builder, num_pixels, + base_ptr, offset, k); + + if (num_pixels == 1) { + args[2] = i; + args[3] = j; + } + else { + args[2] = LLVMBuildExtractElement(builder, i, index, ""); + args[3] = LLVMBuildExtractElement(builder, j, index, ""); + } + + LLVMBuildCall(builder, function, args, Elements(args), ""); + + tmp = LLVMBuildLoad(builder, tmp_ptr, ""); + + if (num_pixels == 1) { + res = tmp; + } + else { + res = LLVMBuildInsertElement(builder, res, tmp, index, ""); + } + } + + /* Bitcast from <n x i32> to <4n x i8> */ + res = LLVMBuildBitCast(builder, res, bld.vec_type, ""); + + return res; + } + + + /* + * Fallback to util_format_description::fetch_rgba_float(). + */ + + if (format_desc->fetch_rgba_float) { + /* + * Fallback to calling util_format_description::fetch_rgba_float. + * + * This is definitely not the most efficient way of fetching pixels, as + * we miss the opportunity to do vectorization, but this it is a + * convenient for formats or scenarios for which there was no opportunity + * or incentive to optimize. + */ + + LLVMModuleRef module = LLVMGetGlobalParent(LLVMGetBasicBlockParent(LLVMGetInsertBlock(builder))); + char name[256]; + LLVMTypeRef f32t = LLVMFloatType(); + LLVMTypeRef f32x4t = LLVMVectorType(f32t, 4); + LLVMTypeRef pf32t = LLVMPointerType(f32t, 0); + LLVMValueRef function; + LLVMValueRef tmp_ptr; + LLVMValueRef tmps[LP_MAX_VECTOR_LENGTH/4]; + LLVMValueRef res; + unsigned k; util_snprintf(name, sizeof name, "util_format_%s_fetch_rgba_float", format_desc->short_name); + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: falling back to %s\n", __FUNCTION__, name); + } + /* * Declare and bind format_desc->fetch_rgba_float(). */ @@ -357,7 +637,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, LLVMTypeRef function_type; ret_type = LLVMVoidType(); - arg_types[0] = LLVMPointerType(LLVMFloatType(), 0); + arg_types[0] = pf32t; arg_types[1] = LLVMPointerType(LLVMInt8Type(), 0); arg_types[3] = arg_types[2] = LLVMIntType(sizeof(unsigned) * 8); function_type = LLVMFunctionType(ret_type, arg_types, Elements(arg_types), 0); @@ -368,28 +648,47 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, assert(LLVMIsDeclaration(function)); - LLVMAddGlobalMapping(lp_build_engine, function, format_desc->fetch_rgba_float); + LLVMAddGlobalMapping(lp_build_engine, function, + func_to_pointer((func_pointer)format_desc->fetch_rgba_float)); } - tmp = lp_build_alloca(builder, LLVMVectorType(LLVMFloatType(), 4), ""); + tmp_ptr = lp_build_alloca(builder, f32x4t, ""); /* * Invoke format_desc->fetch_rgba_float() for each pixel and insert the result * in the SoA vectors. */ - args[0] = LLVMBuildBitCast(builder, tmp, - LLVMPointerType(LLVMFloatType(), 0), ""); - args[1] = ptr; - args[2] = i; - args[3] = j; + for (k = 0; k < num_pixels; ++k) { + LLVMValueRef args[4]; - LLVMBuildCall(builder, function, args, 4, ""); + args[0] = LLVMBuildBitCast(builder, tmp_ptr, pf32t, ""); + args[1] = lp_build_gather_elem_ptr(builder, num_pixels, + base_ptr, offset, k); - return LLVMBuildLoad(builder, tmp, ""); - } - else { - assert(0); - return LLVMGetUndef(LLVMVectorType(LLVMFloatType(), 4)); + if (num_pixels == 1) { + args[2] = i; + args[3] = j; + } + else { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); + args[2] = LLVMBuildExtractElement(builder, i, index, ""); + args[3] = LLVMBuildExtractElement(builder, j, index, ""); + } + + LLVMBuildCall(builder, function, args, Elements(args), ""); + + tmps[k] = LLVMBuildLoad(builder, tmp_ptr, ""); + } + + lp_build_conv(builder, + lp_float32_vec4_type(), + type, + tmps, num_pixels, &res, 1); + + return res; } + + assert(0); + return lp_build_undef(type); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index 26b947b3b1c..ce7e54afc76 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -26,6 +26,8 @@ **************************************************************************/ +#include "pipe/p_defines.h" + #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_string.h" @@ -33,51 +35,39 @@ #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_conv.h" -#include "lp_bld_sample.h" /* for lp_build_gather */ +#include "lp_bld_swizzle.h" +#include "lp_bld_gather.h" +#include "lp_bld_debug.h" #include "lp_bld_format.h" -static LLVMValueRef -lp_build_format_swizzle_chan_soa(struct lp_type type, - const LLVMValueRef *unswizzled, - enum util_format_swizzle swizzle) -{ - switch (swizzle) { - case UTIL_FORMAT_SWIZZLE_X: - case UTIL_FORMAT_SWIZZLE_Y: - case UTIL_FORMAT_SWIZZLE_Z: - case UTIL_FORMAT_SWIZZLE_W: - return unswizzled[swizzle]; - case UTIL_FORMAT_SWIZZLE_0: - return lp_build_zero(type); - case UTIL_FORMAT_SWIZZLE_1: - return lp_build_one(type); - case UTIL_FORMAT_SWIZZLE_NONE: - return lp_build_undef(type); - default: - assert(0); - return lp_build_undef(type); - } -} - - void lp_build_format_swizzle_soa(const struct util_format_description *format_desc, - struct lp_type type, + struct lp_build_context *bld, const LLVMValueRef *unswizzled, - LLVMValueRef *swizzled) + LLVMValueRef swizzled_out[4]) { - if(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + assert(UTIL_FORMAT_SWIZZLE_0 == PIPE_SWIZZLE_ZERO); + assert(UTIL_FORMAT_SWIZZLE_1 == PIPE_SWIZZLE_ONE); + + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + /* + * Return zzz1 for depth-stencil formats. + * + * XXX: Allow to control the depth swizzle with an additional parameter, + * as the caller may wish another depth swizzle, or retain the stencil + * value. + */ enum util_format_swizzle swizzle = format_desc->swizzle[0]; - LLVMValueRef depth = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle); - swizzled[2] = swizzled[1] = swizzled[0] = depth; - swizzled[3] = lp_build_one(type); + LLVMValueRef depth = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); + swizzled_out[2] = swizzled_out[1] = swizzled_out[0] = depth; + swizzled_out[3] = bld->one; } else { unsigned chan; for (chan = 0; chan < 4; ++chan) { enum util_format_swizzle swizzle = format_desc->swizzle[chan]; - swizzled[chan] = lp_build_format_swizzle_chan_soa(type, unswizzled, swizzle); + swizzled_out[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, swizzle); } } } @@ -100,14 +90,20 @@ lp_build_format_swizzle_soa(const struct util_format_description *format_desc, * It requires that a packed pixel fits into an element of the output * channels. The common case is when converting pixel with a depth of 32 bit or * less into floats. + * + * \param format_desc the format of the 'packed' incoming pixel vector + * \param type the desired type for rgba_out (type.length = n, above) + * \param packed the incoming vector of packed pixels + * \param rgba_out returns the SoA R,G,B,A vectors */ void lp_build_unpack_rgba_soa(LLVMBuilderRef builder, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef packed, - LLVMValueRef *rgba) + LLVMValueRef rgba_out[4]) { + struct lp_build_context bld; LLVMValueRef inputs[4]; unsigned start; unsigned chan; @@ -120,11 +116,13 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, assert(type.floating); assert(type.width == 32); + lp_build_context_init(&bld, builder, type); + /* Decode the input vector components */ start = 0; for (chan = 0; chan < format_desc->nr_channels; ++chan) { - unsigned width = format_desc->channel[chan].size; - unsigned stop = start + width; + const unsigned width = format_desc->channel[chan].size; + const unsigned stop = start + width; LLVMValueRef input; input = packed; @@ -200,7 +198,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, if (format_desc->channel[chan].normalized) { double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); LLVMValueRef scale_val = lp_build_const_vec(type, scale); - input = LLVMBuildMul(builder, input, scale_val, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); } } else { @@ -230,7 +228,7 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); LLVMValueRef scale_val = lp_build_const_vec(type, scale); input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(type), ""); - input = LLVMBuildMul(builder, input, scale_val, ""); + input = LLVMBuildFMul(builder, input, scale_val, ""); } else { /* FIXME */ @@ -250,14 +248,59 @@ lp_build_unpack_rgba_soa(LLVMBuilderRef builder, start = stop; } - lp_build_format_swizzle_soa(format_desc, type, inputs, rgba); + lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); +} + + +void +lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder, + struct lp_type dst_type, + LLVMValueRef packed, + LLVMValueRef *rgba) +{ + LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff); + unsigned chan; + + packed = LLVMBuildBitCast(builder, packed, + lp_build_int_vec_type(dst_type), ""); + + /* Decode the input vector components */ + for (chan = 0; chan < 4; ++chan) { + unsigned start = chan*8; + unsigned stop = start + 8; + LLVMValueRef input; + + input = packed; + + if (start) + input = LLVMBuildLShr(builder, input, + lp_build_const_int_vec(dst_type, start), ""); + + if (stop < 32) + input = LLVMBuildAnd(builder, input, mask, ""); + + input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input); + + rgba[chan] = input; + } } + /** - * Fetch a pixel into a SoA. + * Fetch a texels from a texture, returning them in SoA layout. * - * i and j are the sub-block pixel coordinates. + * \param type the desired return type for 'rgba'. The vector length + * is the number of texels to fetch + * + * \param base_ptr points to start of the texture image block. For non- + * compressed formats, this simply points to the texel. + * For compressed formats, it points to the start of the + * compressed data block. + * + * \param i, j the sub-block pixel coordinates. For non-compressed formats + * these will always be (0,0). For compressed formats, i will + * be in [0, block_width-1] and j will be in [0, block_height-1]. */ void lp_build_fetch_rgba_soa(LLVMBuilderRef builder, @@ -267,7 +310,7 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, - LLVMValueRef *rgba) + LLVMValueRef rgba_out[4]) { if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && @@ -281,7 +324,7 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, { /* * The packed pixel fits into an element of the destination format. Put - * the packed pixels into a vector and estract each component for all + * the packed pixels into a vector and extract each component for all * vector elements in parallel. */ @@ -289,6 +332,7 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, /* * gather the texels from the texture + * Ex: packed = {BGRA, BGRA, BGRA, BGRA}. */ packed = lp_build_gather(builder, type.length, @@ -302,49 +346,86 @@ lp_build_fetch_rgba_soa(LLVMBuilderRef builder, lp_build_unpack_rgba_soa(builder, format_desc, type, - packed, rgba); + packed, rgba_out); + return; } - else { - /* - * Fallback to calling lp_build_fetch_rgba_aos for each pixel. - * - * This is not the most efficient way of fetching pixels, as - * we miss some opportunities to do vectorization, but this it is a - * convenient for formats or scenarios for which there was no opportunity - * or incentive to optimize. - */ + /* + * Try calling lp_build_fetch_rgba_aos for all pixels. + */ + + if (util_format_fits_8unorm(format_desc) && + type.floating && type.width == 32 && type.length == 4) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = type.length * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type, + base_ptr, offset, i, j); + + lp_build_rgba8_to_f32_soa(builder, + type, + tmp, + rgba_out); + + return; + } + + /* + * Fallback to calling lp_build_fetch_rgba_aos for each pixel. + * + * This is not the most efficient way of fetching pixels, as we + * miss some opportunities to do vectorization, but this is + * convenient for formats or scenarios for which there was no + * opportunity or incentive to optimize. + */ + + { unsigned k, chan; + struct lp_type tmp_type; - assert(type.floating); + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: scalar unpacking of %s\n", + __FUNCTION__, format_desc->short_name); + } + + tmp_type = type; + tmp_type.length = 4; for (chan = 0; chan < 4; ++chan) { - rgba[chan] = lp_build_undef(type); + rgba_out[chan] = lp_build_undef(type); } + /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), k, 0); LLVMValueRef offset_elem; - LLVMValueRef ptr; LLVMValueRef i_elem, j_elem; LLVMValueRef tmp; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); - ptr = LLVMBuildGEP(builder, base_ptr, &offset_elem, 1, ""); i_elem = LLVMBuildExtractElement(builder, i, index, ""); j_elem = LLVMBuildExtractElement(builder, j, index, ""); - tmp = lp_build_fetch_rgba_aos(builder, format_desc, ptr, i_elem, j_elem); + /* Get a single float[4]={R,G,B,A} pixel */ + tmp = lp_build_fetch_rgba_aos(builder, format_desc, tmp_type, + base_ptr, offset_elem, + i_elem, j_elem); /* - * AoS to SoA + * Insert the AoS tmp value channels into the SoA result vectors at + * position = 'index'. */ - for (chan = 0; chan < 4; ++chan) { LLVMValueRef chan_val = LLVMConstInt(LLVMInt32Type(), chan, 0), tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, ""); - rgba[chan] = LLVMBuildInsertElement(builder, rgba[chan], tmp_chan, index, ""); + rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan], + tmp_chan, index, ""); } } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c new file mode 100644 index 00000000000..2bce2895551 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_yuv.c @@ -0,0 +1,445 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +/** + * @file + * YUV pixel format manipulation. + * + * @author Jose Fonseca <[email protected]> + */ + + +#include "util/u_format.h" +#include "util/u_cpu_detect.h" + +#include "lp_bld_arit.h" +#include "lp_bld_type.h" +#include "lp_bld_const.h" +#include "lp_bld_conv.h" +#include "lp_bld_gather.h" +#include "lp_bld_format.h" +#include "lp_bld_logic.h" + +/** + * Extract Y, U, V channels from packed UYVY. + * @param packed is a <n x i32> vector with the packed UYVY blocks + * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1) + */ +static void +uyvy_to_yuv_soa(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i, + LLVMValueRef *y, + LLVMValueRef *u, + LLVMValueRef *v) +{ + struct lp_type type; + LLVMValueRef mask; + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + assert(lp_check_value(type, packed)); + assert(lp_check_value(type, i)); + + /* + * y = (uyvy >> (16*i + 8)) & 0xff + * u = (uyvy ) & 0xff + * v = (uyvy >> 16 ) & 0xff + */ + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * Avoid shift with per-element count. + * No support on x86, gets translated to roughly 5 instructions + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ + if (util_cpu_caps.has_sse2 && n == 4) { + LLVMValueRef sel, tmp, tmp2; + struct lp_build_context bld32; + + lp_build_context_init(&bld32, builder, type); + + tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); + tmp2 = LLVMBuildLShr(builder, tmp, lp_build_const_int_vec(type, 16), ""); + sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0)); + *y = lp_build_select(&bld32, sel, tmp, tmp2); + } else +#endif + { + LLVMValueRef shift; + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + } + + *u = packed; + *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); + + mask = lp_build_const_int_vec(type, 0xff); + + *y = LLVMBuildAnd(builder, *y, mask, "y"); + *u = LLVMBuildAnd(builder, *u, mask, "u"); + *v = LLVMBuildAnd(builder, *v, mask, "v"); +} + + +/** + * Extract Y, U, V channels from packed YUYV. + * @param packed is a <n x i32> vector with the packed YUYV blocks + * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1) + */ +static void +yuyv_to_yuv_soa(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i, + LLVMValueRef *y, + LLVMValueRef *u, + LLVMValueRef *v) +{ + struct lp_type type; + LLVMValueRef mask; + + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + assert(lp_check_value(type, packed)); + assert(lp_check_value(type, i)); + + /* + * y = (yuyv >> 16*i) & 0xff + * u = (yuyv >> 8 ) & 0xff + * v = (yuyv >> 24 ) & 0xff + */ + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * Avoid shift with per-element count. + * No support on x86, gets translated to roughly 5 instructions + * per element. Didn't measure performance but cuts shader size + * by quite a bit (less difference if cpu has no sse4.1 support). + */ + if (util_cpu_caps.has_sse2 && n == 4) { + LLVMValueRef sel, tmp; + struct lp_build_context bld32; + + lp_build_context_init(&bld32, builder, type); + + tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), ""); + sel = lp_build_compare(builder, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(type, 0)); + *y = lp_build_select(&bld32, sel, packed, tmp); + } else +#endif + { + LLVMValueRef shift; + shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), ""); + *y = LLVMBuildLShr(builder, packed, shift, ""); + } + + *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 8), ""); + *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 24), ""); + + mask = lp_build_const_int_vec(type, 0xff); + + *y = LLVMBuildAnd(builder, *y, mask, "y"); + *u = LLVMBuildAnd(builder, *u, mask, "u"); + *v = LLVMBuildAnd(builder, *v, mask, "v"); +} + + +static INLINE void +yuv_to_rgb_soa(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef y, LLVMValueRef u, LLVMValueRef v, + LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b) +{ + struct lp_type type; + struct lp_build_context bld; + + LLVMValueRef c0; + LLVMValueRef c8; + LLVMValueRef c16; + LLVMValueRef c128; + LLVMValueRef c255; + + LLVMValueRef cy; + LLVMValueRef cug; + LLVMValueRef cub; + LLVMValueRef cvr; + LLVMValueRef cvg; + + memset(&type, 0, sizeof type); + type.sign = TRUE; + type.width = 32; + type.length = n; + + lp_build_context_init(&bld, builder, type); + + assert(lp_check_value(type, y)); + assert(lp_check_value(type, u)); + assert(lp_check_value(type, v)); + + /* + * Constants + */ + + c0 = lp_build_const_int_vec(type, 0); + c8 = lp_build_const_int_vec(type, 8); + c16 = lp_build_const_int_vec(type, 16); + c128 = lp_build_const_int_vec(type, 128); + c255 = lp_build_const_int_vec(type, 255); + + cy = lp_build_const_int_vec(type, 298); + cug = lp_build_const_int_vec(type, -100); + cub = lp_build_const_int_vec(type, 516); + cvr = lp_build_const_int_vec(type, 409); + cvg = lp_build_const_int_vec(type, -208); + + /* + * y -= 16; + * u -= 128; + * v -= 128; + */ + + y = LLVMBuildSub(builder, y, c16, ""); + u = LLVMBuildSub(builder, u, c128, ""); + v = LLVMBuildSub(builder, v, c128, ""); + + /* + * r = 298 * _y + 409 * _v + 128; + * g = 298 * _y - 100 * _u - 208 * _v + 128; + * b = 298 * _y + 516 * _u + 128; + */ + + y = LLVMBuildMul(builder, y, cy, ""); + y = LLVMBuildAdd(builder, y, c128, ""); + + *r = LLVMBuildMul(builder, v, cvr, ""); + *g = LLVMBuildAdd(builder, + LLVMBuildMul(builder, u, cug, ""), + LLVMBuildMul(builder, v, cvg, ""), + ""); + *b = LLVMBuildMul(builder, u, cub, ""); + + *r = LLVMBuildAdd(builder, *r, y, ""); + *g = LLVMBuildAdd(builder, *g, y, ""); + *b = LLVMBuildAdd(builder, *b, y, ""); + + /* + * r >>= 8; + * g >>= 8; + * b >>= 8; + */ + + *r = LLVMBuildAShr(builder, *r, c8, "r"); + *g = LLVMBuildAShr(builder, *g, c8, "g"); + *b = LLVMBuildAShr(builder, *b, c8, "b"); + + /* + * Clamp + */ + + *r = lp_build_clamp(&bld, *r, c0, c255); + *g = lp_build_clamp(&bld, *g, c0, c255); + *b = lp_build_clamp(&bld, *b, c0, c255); +} + + +static LLVMValueRef +rgb_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef r, LLVMValueRef g, LLVMValueRef b) +{ + struct lp_type type; + LLVMValueRef a; + LLVMValueRef rgba; + + memset(&type, 0, sizeof type); + type.sign = TRUE; + type.width = 32; + type.length = n; + + assert(lp_check_value(type, r)); + assert(lp_check_value(type, g)); + assert(lp_check_value(type, b)); + + /* + * Make a 4 x unorm8 vector + */ + + r = r; + g = LLVMBuildShl(builder, g, lp_build_const_int_vec(type, 8), ""); + b = LLVMBuildShl(builder, b, lp_build_const_int_vec(type, 16), ""); + a = lp_build_const_int_vec(type, 0xff000000); + + rgba = r; + rgba = LLVMBuildOr(builder, rgba, g, ""); + rgba = LLVMBuildOr(builder, rgba, b, ""); + rgba = LLVMBuildOr(builder, rgba, a, ""); + + rgba = LLVMBuildBitCast(builder, rgba, + LLVMVectorType(LLVMInt8Type(), 4*n), ""); + + return rgba; +} + + +/** + * Convert from <n x i32> packed UYVY to <4n x i8> RGBA AoS + */ +static LLVMValueRef +uyvy_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef y, u, v; + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + uyvy_to_yuv_soa(builder, n, packed, i, &y, &u, &v); + yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * Convert from <n x i32> packed YUYV to <4n x i8> RGBA AoS + */ +static LLVMValueRef +yuyv_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef y, u, v; + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + yuyv_to_yuv_soa(builder, n, packed, i, &y, &u, &v); + yuv_to_rgb_soa(builder, n, y, u, v, &r, &g, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * Convert from <n x i32> packed RG_BG to <4n x i8> RGBA AoS + */ +static LLVMValueRef +rgbg_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + uyvy_to_yuv_soa(builder, n, packed, i, &g, &r, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * Convert from <n x i32> packed GR_GB to <4n x i8> RGBA AoS + */ +static LLVMValueRef +grgb_to_rgba_aos(LLVMBuilderRef builder, + unsigned n, + LLVMValueRef packed, + LLVMValueRef i) +{ + LLVMValueRef r, g, b; + LLVMValueRef rgba; + + yuyv_to_yuv_soa(builder, n, packed, i, &g, &r, &b); + rgba = rgb_to_rgba_aos(builder, n, r, g, b); + + return rgba; +} + + +/** + * @param n is the number of pixels processed + * @param packed is a <n x i32> vector with the packed YUYV blocks + * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1) + * @return a <4*n x i8> vector with the pixel RGBA values in AoS + */ +LLVMValueRef +lp_build_fetch_subsampled_rgba_aos(LLVMBuilderRef builder, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef packed; + LLVMValueRef rgba; + + assert(format_desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED); + assert(format_desc->block.bits == 32); + assert(format_desc->block.width == 2); + assert(format_desc->block.height == 1); + + packed = lp_build_gather(builder, n, 32, 32, base_ptr, offset); + + (void)j; + + switch (format_desc->format) { + case PIPE_FORMAT_UYVY: + rgba = uyvy_to_rgba_aos(builder, n, packed, i); + break; + case PIPE_FORMAT_YUYV: + rgba = yuyv_to_rgba_aos(builder, n, packed, i); + break; + case PIPE_FORMAT_R8G8_B8G8_UNORM: + rgba = rgbg_to_rgba_aos(builder, n, packed, i); + break; + case PIPE_FORMAT_G8R8_G8B8_UNORM: + rgba = grgb_to_rgba_aos(builder, n, packed, i); + break; + default: + assert(0); + rgba = LLVMGetUndef(LLVMVectorType(LLVMInt8Type(), 4*n)); + break; + } + + return rgba; +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.c b/src/gallium/auxiliary/gallivm/lp_bld_gather.c new file mode 100644 index 00000000000..d60472e0656 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.c @@ -0,0 +1,148 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#include "util/u_debug.h" +#include "lp_bld_debug.h" +#include "lp_bld_const.h" +#include "lp_bld_format.h" +#include "lp_bld_gather.h" + + +/** + * Get the pointer to one element from scatter positions in memory. + * + * @sa lp_build_gather() + */ +LLVMValueRef +lp_build_gather_elem_ptr(LLVMBuilderRef builder, + unsigned length, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i) +{ + LLVMValueRef offset; + LLVMValueRef ptr; + + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0)); + + if (length == 1) { + assert(i == 0); + offset = offsets; + } else { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + offset = LLVMBuildExtractElement(builder, offsets, index, ""); + } + + ptr = LLVMBuildGEP(builder, base_ptr, &offset, 1, ""); + + return ptr; +} + + +/** + * Gather one element from scatter positions in memory. + * + * @sa lp_build_gather() + */ +LLVMValueRef +lp_build_gather_elem(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i) +{ + LLVMTypeRef src_type = LLVMIntType(src_width); + LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); + LLVMTypeRef dst_elem_type = LLVMIntType(dst_width); + LLVMValueRef ptr; + LLVMValueRef res; + + assert(LLVMTypeOf(base_ptr) == LLVMPointerType(LLVMInt8Type(), 0)); + + ptr = lp_build_gather_elem_ptr(builder, length, base_ptr, offsets, i); + ptr = LLVMBuildBitCast(builder, ptr, src_ptr_type, ""); + res = LLVMBuildLoad(builder, ptr, ""); + + assert(src_width <= dst_width); + if (src_width > dst_width) + res = LLVMBuildTrunc(builder, res, dst_elem_type, ""); + if (src_width < dst_width) + res = LLVMBuildZExt(builder, res, dst_elem_type, ""); + + return res; +} + + +/** + * Gather elements from scatter positions in memory into a single vector. + * Use for fetching texels from a texture. + * For SSE, typical values are length=4, src_width=32, dst_width=32. + * + * @param length length of the offsets + * @param src_width src element width in bits + * @param dst_width result element width in bits (src will be expanded to fit) + * @param base_ptr base pointer, should be a i8 pointer type. + * @param offsets vector with offsets + */ +LLVMValueRef +lp_build_gather(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets) +{ + LLVMValueRef res; + + if (length == 1) { + /* Scalar */ + return lp_build_gather_elem(builder, length, + src_width, dst_width, + base_ptr, offsets, 0); + } else { + /* Vector */ + + LLVMTypeRef dst_elem_type = LLVMIntType(dst_width); + LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); + unsigned i; + + res = LLVMGetUndef(dst_vec_type); + for (i = 0; i < length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef elem; + elem = lp_build_gather_elem(builder, length, + src_width, dst_width, + base_ptr, offsets, i); + res = LLVMBuildInsertElement(builder, res, elem, index, ""); + } + } + + return res; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_gather.h b/src/gallium/auxiliary/gallivm/lp_bld_gather.h new file mode 100644 index 00000000000..131af8ea07e --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_gather.h @@ -0,0 +1,61 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#ifndef LP_BLD_GATHER_H_ +#define LP_BLD_GATHER_H_ + + +#include "gallivm/lp_bld.h" + + +LLVMValueRef +lp_build_gather_elem_ptr(LLVMBuilderRef builder, + unsigned length, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i); + +LLVMValueRef +lp_build_gather_elem(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets, + unsigned i); + +LLVMValueRef +lp_build_gather(LLVMBuilderRef builder, + unsigned length, + unsigned src_width, + unsigned dst_width, + LLVMValueRef base_ptr, + LLVMValueRef offsets); + + +#endif /* LP_BLD_GATHER_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c index 5067d0a164f..761f33b578d 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c @@ -29,18 +29,74 @@ #include "pipe/p_compiler.h" #include "util/u_cpu_detect.h" #include "util/u_debug.h" +#include "lp_bld_debug.h" #include "lp_bld_init.h" +#include <llvm-c/Transforms/Scalar.h> + + +#ifdef DEBUG +unsigned gallivm_debug = 0; + +static const struct debug_named_value lp_bld_debug_flags[] = { + { "tgsi", GALLIVM_DEBUG_TGSI, NULL }, + { "ir", GALLIVM_DEBUG_IR, NULL }, + { "asm", GALLIVM_DEBUG_ASM, NULL }, + { "nopt", GALLIVM_DEBUG_NO_OPT, NULL }, + { "perf", GALLIVM_DEBUG_PERF, NULL }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(gallivm_debug, "GALLIVM_DEBUG", lp_bld_debug_flags, 0) +#endif + LLVMModuleRef lp_build_module = NULL; LLVMExecutionEngineRef lp_build_engine = NULL; LLVMModuleProviderRef lp_build_provider = NULL; LLVMTargetDataRef lp_build_target = NULL; +LLVMPassManagerRef lp_build_pass = NULL; + + +/* + * Optimization values are: + * - 0: None (-O0) + * - 1: Less (-O1) + * - 2: Default (-O2, -Os) + * - 3: Aggressive (-O3) + * + * See also CodeGenOpt::Level in llvm/Target/TargetMachine.h + */ +enum LLVM_CodeGenOpt_Level { +#if HAVE_LLVM >= 0x207 + None, // -O0 + Less, // -O1 + Default, // -O2, -Os + Aggressive // -O3 +#else + Default, + None, + Aggressive +#endif +}; + + +extern void +lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE); + +extern void +lp_set_target_options(void); void lp_build_init(void) { +#ifdef DEBUG + gallivm_debug = debug_get_option_gallivm_debug(); +#endif + + lp_set_target_options(); + LLVMInitializeNativeTarget(); LLVMLinkInJIT(); @@ -52,18 +108,58 @@ lp_build_init(void) lp_build_provider = LLVMCreateModuleProviderForExistingModule(lp_build_module); if (!lp_build_engine) { + enum LLVM_CodeGenOpt_Level optlevel; char *error = NULL; - if (LLVMCreateJITCompiler(&lp_build_engine, lp_build_provider, 1, &error)) { + if (gallivm_debug & GALLIVM_DEBUG_NO_OPT) { + optlevel = None; + } + else { + optlevel = Default; + } + + if (LLVMCreateJITCompiler(&lp_build_engine, lp_build_provider, + (unsigned)optlevel, &error)) { _debug_printf("%s\n", error); LLVMDisposeMessage(error); assert(0); } + +#if defined(DEBUG) || defined(PROFILE) + lp_register_oprofile_jit_event_listener(lp_build_engine); +#endif } if (!lp_build_target) lp_build_target = LLVMGetExecutionEngineTargetData(lp_build_engine); + if (!lp_build_pass) { + lp_build_pass = LLVMCreateFunctionPassManager(lp_build_provider); + LLVMAddTargetData(lp_build_target, lp_build_pass); + + if ((gallivm_debug & GALLIVM_DEBUG_NO_OPT) == 0) { + /* These are the passes currently listed in llvm-c/Transforms/Scalar.h, + * but there are more on SVN. */ + /* TODO: Add more passes */ + LLVMAddCFGSimplificationPass(lp_build_pass); + LLVMAddPromoteMemoryToRegisterPass(lp_build_pass); + LLVMAddConstantPropagationPass(lp_build_pass); + if(util_cpu_caps.has_sse4_1) { + /* FIXME: There is a bug in this pass, whereby the combination of fptosi + * and sitofp (necessary for trunc/floor/ceil/round implementation) + * somehow becomes invalid code. + */ + LLVMAddInstructionCombiningPass(lp_build_pass); + } + LLVMAddGVNPass(lp_build_pass); + } else { + /* We need at least this pass to prevent the backends to fail in + * unexpected ways. + */ + LLVMAddPromoteMemoryToRegisterPass(lp_build_pass); + } + } + util_cpu_detect(); #if 0 diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 0ec2afcd1be..f26fdac4663 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -38,10 +38,13 @@ extern LLVMModuleRef lp_build_module; extern LLVMExecutionEngineRef lp_build_engine; extern LLVMModuleProviderRef lp_build_provider; extern LLVMTargetDataRef lp_build_target; +extern LLVMPassManagerRef lp_build_pass; void lp_build_init(void); +extern void +lp_func_delete_body(LLVMValueRef func); #endif /* !LP_BLD_INIT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h new file mode 100644 index 00000000000..369c8121b5c --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h @@ -0,0 +1,55 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#ifndef LP_BLD_LIMITS_H_ +#define LP_BLD_LIMITS_H_ + +/* + * TGSI translation limits. + * + * Some are slightly above SM 3.0 requirements to give some wiggle room to + * the state trackers. + */ + +#define LP_MAX_TGSI_TEMPS 256 + +#define LP_MAX_TGSI_ADDRS 16 + +#define LP_MAX_TGSI_IMMEDIATES 256 + +#define LP_MAX_TGSI_PREDS 16 + +/** + * Maximum control flow nesting + * + * SM3.0 requires 24 + */ +#define LP_MAX_TGSI_NESTING 32 + + +#endif /* LP_BLD_LIMITS_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index d13fa1a5d04..ce5d0214b43 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -34,11 +34,13 @@ #include "util/u_cpu_detect.h" +#include "util/u_memory.h" #include "util/u_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_intr.h" +#include "lp_bld_debug.h" #include "lp_bld_logic.h" @@ -82,15 +84,31 @@ lp_build_compare(LLVMBuilderRef builder, assert(func >= PIPE_FUNC_NEVER); assert(func <= PIPE_FUNC_ALWAYS); + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); if(func == PIPE_FUNC_NEVER) return zeros; if(func == PIPE_FUNC_ALWAYS) return ones; - /* TODO: optimize the constant case */ +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) + /* + * There are no unsigned integer comparison instructions in SSE. + */ - /* XXX: It is not clear if we should use the ordered or unordered operators */ + if (!type.floating && !type.sign && + type.width * type.length == 128 && + util_cpu_caps.has_sse2 && + (func == PIPE_FUNC_LESS || + func == PIPE_FUNC_LEQUAL || + func == PIPE_FUNC_GREATER || + func == PIPE_FUNC_GEQUAL) && + (gallivm_debug & GALLIVM_DEBUG_PERF)) { + debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n", + __FUNCTION__, type.length, type.width); + } +#endif #if HAVE_LLVM < 0x0207 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) @@ -187,12 +205,10 @@ lp_build_compare(LLVMBuilderRef builder, return lp_build_undef(type); } - /* There are no signed byte and unsigned word/dword comparison - * instructions. So flip the sign bit so that the results match. + /* There are no unsigned comparison instructions. So flip the sign bit + * so that the results match. */ - if(table[func].gt && - ((type.width == 8 && type.sign) || - (type.width != 8 && !type.sign))) { + if (table[func].gt && !type.sign) { LLVMValueRef msb = lp_build_const_int_vec(type, (unsigned long long)1 << (type.width - 1)); a = LLVMBuildXor(builder, a, msb, ""); b = LLVMBuildXor(builder, b, msb, ""); @@ -223,6 +239,8 @@ lp_build_compare(LLVMBuilderRef builder, #endif #endif /* HAVE_LLVM < 0x0207 */ + /* XXX: It is not clear if we should use the ordered or unordered operators */ + if(type.floating) { LLVMRealPredicate op; switch(func) { @@ -324,8 +342,10 @@ lp_build_compare(LLVMBuilderRef builder, res = LLVMGetUndef(int_vec_type); - debug_printf("%s: warning: using slow element-wise int" - " vector comparison\n", __FUNCTION__); + if (gallivm_debug & GALLIVM_DEBUG_PERF) { + debug_printf("%s: using slow element-wise int" + " vector comparison\n", __FUNCTION__); + } for(i = 0; i < type.length; ++i) { LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); @@ -364,9 +384,55 @@ lp_build_cmp(struct lp_build_context *bld, /** + * Return (mask & a) | (~mask & b); + */ +LLVMValueRef +lp_build_select_bitwise(struct lp_build_context *bld, + LLVMValueRef mask, + LLVMValueRef a, + LLVMValueRef b) +{ + struct lp_type type = bld->type; + LLVMValueRef res; + + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + + if (a == b) { + return a; + } + + if(type.floating) { + LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); + a = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); + b = LLVMBuildBitCast(bld->builder, b, int_vec_type, ""); + } + + a = LLVMBuildAnd(bld->builder, a, mask, ""); + + /* This often gets translated to PANDN, but sometimes the NOT is + * pre-computed and stored in another constant. The best strategy depends + * on available registers, so it is not a big deal -- hopefully LLVM does + * the right decision attending the rest of the program. + */ + b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), ""); + + res = LLVMBuildOr(bld->builder, a, b, ""); + + if(type.floating) { + LLVMTypeRef vec_type = lp_build_vec_type(type); + res = LLVMBuildBitCast(bld->builder, res, vec_type, ""); + } + + return res; +} + + +/** * Return mask ? a : b; * - * mask is a bitwise mask, composed of 0 or ~0 for each element. + * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value + * will yield unpredictable results. */ LLVMValueRef lp_build_select(struct lp_build_context *bld, @@ -377,6 +443,9 @@ lp_build_select(struct lp_build_context *bld, struct lp_type type = bld->type; LLVMValueRef res; + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == b) return a; @@ -384,49 +453,78 @@ lp_build_select(struct lp_build_context *bld, mask = LLVMBuildTrunc(bld->builder, mask, LLVMInt1Type(), ""); res = LLVMBuildSelect(bld->builder, mask, a, b, ""); } - else { - if(type.floating) { - LLVMTypeRef int_vec_type = lp_build_int_vec_type(type); - a = LLVMBuildBitCast(bld->builder, a, int_vec_type, ""); - b = LLVMBuildBitCast(bld->builder, b, int_vec_type, ""); + else if (util_cpu_caps.has_sse4_1 && + type.width * type.length == 128 && + !LLVMIsConstant(a) && + !LLVMIsConstant(b) && + !LLVMIsConstant(mask)) { + const char *intrinsic; + LLVMTypeRef arg_type; + LLVMValueRef args[3]; + + if (type.width == 64) { + intrinsic = "llvm.x86.sse41.blendvpd"; + arg_type = LLVMVectorType(LLVMDoubleType(), 2); + } else if (type.width == 32) { + intrinsic = "llvm.x86.sse41.blendvps"; + arg_type = LLVMVectorType(LLVMFloatType(), 4); + } else { + intrinsic = "llvm.x86.sse41.pblendvb"; + arg_type = LLVMVectorType(LLVMInt8Type(), 16); } - a = LLVMBuildAnd(bld->builder, a, mask, ""); + if (arg_type != bld->int_vec_type) { + mask = LLVMBuildBitCast(bld->builder, mask, arg_type, ""); + } - /* This often gets translated to PANDN, but sometimes the NOT is - * pre-computed and stored in another constant. The best strategy depends - * on available registers, so it is not a big deal -- hopefully LLVM does - * the right decision attending the rest of the program. - */ - b = LLVMBuildAnd(bld->builder, b, LLVMBuildNot(bld->builder, mask, ""), ""); + if (arg_type != bld->vec_type) { + a = LLVMBuildBitCast(bld->builder, a, arg_type, ""); + b = LLVMBuildBitCast(bld->builder, b, arg_type, ""); + } - res = LLVMBuildOr(bld->builder, a, b, ""); + args[0] = b; + args[1] = a; + args[2] = mask; - if(type.floating) { - LLVMTypeRef vec_type = lp_build_vec_type(type); - res = LLVMBuildBitCast(bld->builder, res, vec_type, ""); + res = lp_build_intrinsic(bld->builder, intrinsic, + arg_type, args, Elements(args)); + + if (arg_type != bld->vec_type) { + res = LLVMBuildBitCast(bld->builder, res, bld->vec_type, ""); } } + else { + res = lp_build_select_bitwise(bld, mask, a, b); + } return res; } +/** + * Return mask ? a : b; + * + * mask is a TGSI_WRITEMASK_xxx. + */ LLVMValueRef lp_build_select_aos(struct lp_build_context *bld, + unsigned mask, LLVMValueRef a, - LLVMValueRef b, - const boolean cond[4]) + LLVMValueRef b) { const struct lp_type type = bld->type; const unsigned n = type.length; unsigned i, j; + assert((mask & ~0xf) == 0); + assert(lp_check_value(type, a)); + assert(lp_check_value(type, b)); + if(a == b) return a; - if(cond[0] && cond[1] && cond[2] && cond[3]) + if((mask & 0xf) == 0xf) return a; - if(!cond[0] && !cond[1] && !cond[2] && !cond[3]) + if((mask & 0xf) == 0x0) return b; if(a == bld->undef || b == bld->undef) return bld->undef; @@ -449,7 +547,9 @@ lp_build_select_aos(struct lp_build_context *bld, for(j = 0; j < n; j += 4) for(i = 0; i < 4; ++i) - shuffles[j + i] = LLVMConstInt(elem_type, (cond[i] ? 0 : n) + j + i, 0); + shuffles[j + i] = LLVMConstInt(elem_type, + (mask & (1 << i) ? 0 : n) + j + i, + 0); return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), ""); } @@ -458,26 +558,17 @@ lp_build_select_aos(struct lp_build_context *bld, /* XXX: Unfortunately select of vectors do not work */ /* Use a select */ LLVMTypeRef elem_type = LLVMInt1Type(); - LLVMValueRef cond[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef cond_vec[LP_MAX_VECTOR_LENGTH]; for(j = 0; j < n; j += 4) for(i = 0; i < 4; ++i) - cond[j + i] = LLVMConstInt(elem_type, cond[i] ? 1 : 0, 0); + cond_vec[j + i] = LLVMConstInt(elem_type, + mask & (1 << i) ? 1 : 0, 0); - return LLVMBuildSelect(bld->builder, LLVMConstVector(cond, n), a, b, ""); + return LLVMBuildSelect(bld->builder, LLVMConstVector(cond_vec, n), a, b, ""); #else - LLVMValueRef mask = lp_build_const_mask_aos(type, cond); - return lp_build_select(bld, mask, a, b); + LLVMValueRef mask_vec = lp_build_const_mask_aos(type, mask); + return lp_build_select(bld, mask_vec, a, b); #endif } } - - -/** Return (a & ~b) */ -LLVMValueRef -lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b) -{ - b = LLVMBuildNot(bld->builder, b, ""); - b = LLVMBuildAnd(bld->builder, a, b, ""); - return b; -} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.h b/src/gallium/auxiliary/gallivm/lp_bld_logic.h index 29f9fc3b205..141fb92058a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.h @@ -63,6 +63,11 @@ lp_build_cmp(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); +LLVMValueRef +lp_build_select_bitwise(struct lp_build_context *bld, + LLVMValueRef mask, + LLVMValueRef a, + LLVMValueRef b); LLVMValueRef lp_build_select(struct lp_build_context *bld, @@ -72,13 +77,9 @@ lp_build_select(struct lp_build_context *bld, LLVMValueRef lp_build_select_aos(struct lp_build_context *bld, + unsigned mask, LLVMValueRef a, - LLVMValueRef b, - const boolean cond[4]); - - -LLVMValueRef -lp_build_andc(struct lp_build_context *bld, LLVMValueRef a, LLVMValueRef b); + LLVMValueRef b); #endif /* !LP_BLD_LOGIC_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp new file mode 100644 index 00000000000..48baf7c425c --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -0,0 +1,180 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS +#endif + +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif + +#include <llvm-c/Core.h> +#include <llvm-c/ExecutionEngine.h> +#include <llvm/Target/TargetOptions.h> +#include <llvm/ExecutionEngine/ExecutionEngine.h> +#include <llvm/ExecutionEngine/JITEventListener.h> +#include <llvm/Support/CommandLine.h> +#include <llvm/Support/PrettyStackTrace.h> + +#include "pipe/p_config.h" +#include "util/u_debug.h" + + +#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBDDED) + +#include "llvm/Support/raw_ostream.h" + +class raw_debug_ostream : + public llvm::raw_ostream +{ + uint64_t pos; + + void write_impl(const char *Ptr, size_t Size); + uint64_t current_pos() { return pos; } + uint64_t current_pos() const { return pos; } + +#if HAVE_LLVM >= 0x207 + uint64_t preferred_buffer_size() { return 512; } +#else + size_t preferred_buffer_size() { return 512; } +#endif +}; + + +void +raw_debug_ostream::write_impl(const char *Ptr, size_t Size) +{ + if (Size > 0) { + char *lastPtr = (char *)&Ptr[Size]; + char last = *lastPtr; + *lastPtr = 0; + _debug_printf("%*s", Size, Ptr); + *lastPtr = last; + pos += Size; + } +} + + +/** + * Same as LLVMDumpValue, but through our debugging channels. + */ +extern "C" void +lp_debug_dump_value(LLVMValueRef value) +{ + raw_debug_ostream os; + llvm::unwrap(value)->print(os); + os.flush(); +} + + +#else + + +extern "C" void +lp_debug_dump_value(LLVMValueRef value) +{ + LLVMDumpValue(value); +} + + +#endif + + +/** + * Register the engine with oprofile. + * + * This allows to see the LLVM IR function names in oprofile output. + * + * To actually work LLVM needs to be built with the --with-oprofile configure + * option. + * + * Also a oprofile:oprofile user:group is necessary. Which is not created by + * default on some distributions. + */ +extern "C" void +lp_register_oprofile_jit_event_listener(LLVMExecutionEngineRef EE) +{ + llvm::unwrap(EE)->RegisterJITEventListener(llvm::createOProfileJITEventListener()); +} + + +extern "C" void +lp_set_target_options(void) +{ +#if defined(DEBUG) +#if HAVE_LLVM >= 0x0207 + llvm::JITEmitDebugInfo = true; +#endif +#endif + +#if defined(DEBUG) || defined(PROFILE) + llvm::NoFramePointerElim = true; +#endif + + llvm::NoExcessFPPrecision = false; + + /* XXX: Investigate this */ +#if 0 + llvm::UnsafeFPMath = true; +#endif + + /* + * LLVM will generate MMX instructions for vectors <= 64 bits, leading to + * innefficient code, and in 32bit systems, to the corruption of the FPU + * stack given that it expects the user to generate the EMMS instructions. + * + * See also: + * - http://llvm.org/bugs/show_bug.cgi?id=3287 + * - http://l4.me.uk/post/2009/06/07/llvm-wrinkle-3-configuration-what-configuration/ + */ + static boolean first = TRUE; + if (first) { + static const char* options[] = { + "prog", + "-disable-mmx" + }; + llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options)); + first = FALSE; + } + + /* + * By default LLVM adds a signal handler to output a pretty stack trace. + * This signal handler is never removed, causing problems when unloading the + * shared object where the gallium driver resides. + */ + llvm::DisablePrettyStackTrace = true; +} + + +extern "C" void +lp_func_delete_body(LLVMValueRef FF) +{ + llvm::Function *func = llvm::unwrap<llvm::Function>(FF); + func->deleteBody(); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c index 186f8849b8d..f7eb7148ab8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c @@ -111,8 +111,6 @@ lp_build_const_pack_shuffle(unsigned n) assert(n <= LP_MAX_VECTOR_LENGTH); - /* TODO: cache results in a static table */ - for(i = 0; i < n; ++i) elems[i] = LLVMConstInt(LLVMInt32Type(), 2*i, 0); @@ -171,14 +169,13 @@ lp_build_unpack2(LLVMBuilderRef builder, msb = lp_build_zero(src_type); /* Interleave bits */ - if(util_cpu_caps.little_endian) { +#ifdef PIPE_ARCH_LITTLE_ENDIAN *dst_lo = lp_build_interleave2(builder, src_type, src, msb, 0); *dst_hi = lp_build_interleave2(builder, src_type, src, msb, 1); - } - else { +#else *dst_lo = lp_build_interleave2(builder, src_type, msb, src, 0); *dst_hi = lp_build_interleave2(builder, src_type, msb, src, 1); - } +#endif /* Cast the result into the new type (twice as wide) */ @@ -261,13 +258,14 @@ lp_build_pack2(LLVMBuilderRef builder, #endif LLVMTypeRef dst_vec_type = lp_build_vec_type(dst_type); LLVMValueRef shuffle; - LLVMValueRef res; + LLVMValueRef res = NULL; assert(!src_type.floating); assert(!dst_type.floating); assert(src_type.width == dst_type.width * 2); assert(src_type.length * 2 == dst_type.length); + /* Check for special cases first */ if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) { switch(src_type.width) { case 32: @@ -283,8 +281,8 @@ lp_build_pack2(LLVMBuilderRef builder, return lp_build_intrinsic_binary(builder, "llvm.x86.sse41.packusdw", dst_vec_type, lo, hi); } else { - assert(0); - return LLVMGetUndef(dst_vec_type); + /* use generic shuffle below */ + res = NULL; } } break; @@ -310,10 +308,13 @@ lp_build_pack2(LLVMBuilderRef builder, break; } - res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); - return res; + if (res) { + res = LLVMBuildBitCast(builder, res, dst_vec_type, ""); + return res; + } } + /* generic shuffle */ lo = LLVMBuildBitCast(builder, lo, dst_vec_type, ""); hi = LLVMBuildBitCast(builder, hi, dst_vec_type, ""); @@ -427,3 +428,123 @@ lp_build_pack(LLVMBuilderRef builder, return tmp[0]; } + + +/** + * Truncate or expand the bitwidth. + * + * NOTE: Getting the right sign flags is crucial here, as we employ some + * intrinsics that do saturation. + */ +void +lp_build_resize(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef *src, unsigned num_srcs, + LLVMValueRef *dst, unsigned num_dsts) +{ + LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + /* + * We don't support float <-> int conversion here. That must be done + * before/after calling this function. + */ + assert(src_type.floating == dst_type.floating); + + /* + * We don't support double <-> float conversion yet, although it could be + * added with little effort. + */ + assert((!src_type.floating && !dst_type.floating) || + src_type.width == dst_type.width); + + /* We must not loose or gain channels. Only precision */ + assert(src_type.length * num_srcs == dst_type.length * num_dsts); + + /* We don't support M:N conversion, only 1:N, M:1, or 1:1 */ + assert(num_srcs == 1 || num_dsts == 1); + + assert(src_type.length <= LP_MAX_VECTOR_LENGTH); + assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); + assert(num_srcs <= LP_MAX_VECTOR_LENGTH); + assert(num_dsts <= LP_MAX_VECTOR_LENGTH); + + if (src_type.width > dst_type.width) { + /* + * Truncate bit width. + */ + + assert(num_dsts == 1); + + if (src_type.width * src_type.length == dst_type.width * dst_type.length) { + /* + * Register width remains constant -- use vector packing intrinsics + */ + + tmp[0] = lp_build_pack(builder, src_type, dst_type, TRUE, src, num_srcs); + } + else { + /* + * Do it element-wise. + */ + + assert(src_type.length == dst_type.length); + tmp[0] = lp_build_undef(dst_type); + for (i = 0; i < dst_type.length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + val = LLVMBuildTrunc(builder, val, lp_build_elem_type(dst_type), ""); + tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + } + } + } + else if (src_type.width < dst_type.width) { + /* + * Expand bit width. + */ + + assert(num_srcs == 1); + + if (src_type.width * src_type.length == dst_type.width * dst_type.length) { + /* + * Register width remains constant -- use vector unpack intrinsics + */ + lp_build_unpack(builder, src_type, dst_type, src[0], tmp, num_dsts); + } + else { + /* + * Do it element-wise. + */ + + assert(src_type.length == dst_type.length); + tmp[0] = lp_build_undef(dst_type); + for (i = 0; i < dst_type.length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef val = LLVMBuildExtractElement(builder, src[0], index, ""); + + if (src_type.sign && dst_type.sign) { + val = LLVMBuildSExt(builder, val, lp_build_elem_type(dst_type), ""); + } else { + val = LLVMBuildZExt(builder, val, lp_build_elem_type(dst_type), ""); + } + tmp[0] = LLVMBuildInsertElement(builder, tmp[0], val, index, ""); + } + } + } + else { + /* + * No-op + */ + + assert(num_srcs == 1); + assert(num_dsts == 1); + + tmp[0] = src[0]; + } + + for(i = 0; i < num_dsts; ++i) + dst[i] = tmp[i]; +} + + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index 41adeed220c..e947b90d164 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -37,6 +37,8 @@ #define LP_BLD_PACK_H +#include "pipe/p_compiler.h" + #include "gallivm/lp_bld.h" @@ -92,4 +94,12 @@ lp_build_pack(LLVMBuilderRef builder, const LLVMValueRef *src, unsigned num_srcs); +void +lp_build_resize(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + const LLVMValueRef *src, unsigned num_srcs, + LLVMValueRef *dst, unsigned num_dsts); + + #endif /* !LP_BLD_PACK_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.c b/src/gallium/auxiliary/gallivm/lp_bld_quad.c new file mode 100644 index 00000000000..7b1088939b9 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.c @@ -0,0 +1,101 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#include "lp_bld_type.h" +#include "lp_bld_arit.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_quad.h" + + +static const unsigned char +swizzle_left[4] = { + LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_LEFT, + LP_BLD_QUAD_BOTTOM_LEFT, LP_BLD_QUAD_BOTTOM_LEFT +}; + +static const unsigned char +swizzle_right[4] = { + LP_BLD_QUAD_TOP_RIGHT, LP_BLD_QUAD_TOP_RIGHT, + LP_BLD_QUAD_BOTTOM_RIGHT, LP_BLD_QUAD_BOTTOM_RIGHT +}; + +static const unsigned char +swizzle_top[4] = { + LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_RIGHT, + LP_BLD_QUAD_TOP_LEFT, LP_BLD_QUAD_TOP_RIGHT +}; + +static const unsigned char +swizzle_bottom[4] = { + LP_BLD_QUAD_BOTTOM_LEFT, LP_BLD_QUAD_BOTTOM_RIGHT, + LP_BLD_QUAD_BOTTOM_LEFT, LP_BLD_QUAD_BOTTOM_RIGHT +}; + + +LLVMValueRef +lp_build_ddx(struct lp_build_context *bld, + LLVMValueRef a) +{ + LLVMValueRef a_left = lp_build_swizzle_aos(bld, a, swizzle_left); + LLVMValueRef a_right = lp_build_swizzle_aos(bld, a, swizzle_right); + return lp_build_sub(bld, a_right, a_left); +} + + +LLVMValueRef +lp_build_ddy(struct lp_build_context *bld, + LLVMValueRef a) +{ + LLVMValueRef a_top = lp_build_swizzle_aos(bld, a, swizzle_top); + LLVMValueRef a_bottom = lp_build_swizzle_aos(bld, a, swizzle_bottom); + return lp_build_sub(bld, a_bottom, a_top); +} + + +LLVMValueRef +lp_build_scalar_ddx(struct lp_build_context *bld, + LLVMValueRef a) +{ + LLVMValueRef idx_left = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0); + LLVMValueRef idx_right = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_RIGHT, 0); + LLVMValueRef a_left = LLVMBuildExtractElement(bld->builder, a, idx_left, ""); + LLVMValueRef a_right = LLVMBuildExtractElement(bld->builder, a, idx_right, ""); + return lp_build_sub(bld, a_right, a_left); +} + + +LLVMValueRef +lp_build_scalar_ddy(struct lp_build_context *bld, + LLVMValueRef a) +{ + LLVMValueRef idx_top = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_TOP_LEFT, 0); + LLVMValueRef idx_bottom = LLVMConstInt(LLVMInt32Type(), LP_BLD_QUAD_BOTTOM_LEFT, 0); + LLVMValueRef a_top = LLVMBuildExtractElement(bld->builder, a, idx_top, ""); + LLVMValueRef a_bottom = LLVMBuildExtractElement(bld->builder, a, idx_bottom, ""); + return lp_build_sub(bld, a_bottom, a_top); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_quad.h b/src/gallium/auxiliary/gallivm/lp_bld_quad.h new file mode 100644 index 00000000000..b7992912927 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_quad.h @@ -0,0 +1,96 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + + +#ifndef LP_BLD_QUAD_H_ +#define LP_BLD_QUAD_H_ + + +#include "gallivm/lp_bld.h" + + +struct lp_build_context; + + +/* + * Each quad is composed of four elements. + * + * ######### + * # 0 | 1 # + * #---+---# + * # 2 | 3 # + * ######### + */ + +enum lp_bld_quad { + LP_BLD_QUAD_TOP_LEFT = 0, + LP_BLD_QUAD_TOP_RIGHT = 1, + LP_BLD_QUAD_BOTTOM_LEFT = 2, + LP_BLD_QUAD_BOTTOM_RIGHT = 3 +}; + + +/* + * (Vector) derivates. + * + * More than one quad is supported. The only requirement is that the vector + * contains a whole number of quads: + * + * ######### ######### ... + * # 0 | 1 # # 4 | 5 # + * #---+---# #---+---# ... + * # 2 | 3 # # 6 | 7 # + * ######### ######### ... + */ + +LLVMValueRef +lp_build_ddx(struct lp_build_context *bld, + LLVMValueRef a); + + +LLVMValueRef +lp_build_ddy(struct lp_build_context *bld, + LLVMValueRef a); + + +/* + * Scalar derivatives. + * + * Same as getting the first value of above. + */ + +LLVMValueRef +lp_build_scalar_ddx(struct lp_build_context *bld, + LLVMValueRef a); + + +LLVMValueRef +lp_build_scalar_ddy(struct lp_build_context *bld, + LLVMValueRef a); + + +#endif /* LP_BLD_QUAD_H_ */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 195a4953ab1..7a64392d3c1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -36,12 +36,54 @@ #include "pipe/p_state.h" #include "util/u_format.h" #include "util/u_math.h" -#include "lp_bld_debug.h" -#include "lp_bld_const.h" #include "lp_bld_arit.h" -#include "lp_bld_type.h" -#include "lp_bld_format.h" +#include "lp_bld_const.h" +#include "lp_bld_debug.h" +#include "lp_bld_printf.h" +#include "lp_bld_flow.h" #include "lp_bld_sample.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_type.h" + + +/* + * Bri-linear factor. Use zero or any other number less than one to force + * tri-linear filtering. + */ +#define BRILINEAR_FACTOR 2 + + +/** + * Does the given texture wrap mode allow sampling the texture border color? + * XXX maybe move this into gallium util code. + */ +boolean +lp_sampler_wrap_mode_uses_border_color(unsigned mode, + unsigned min_img_filter, + unsigned mag_img_filter) +{ + switch (mode) { + case PIPE_TEX_WRAP_REPEAT: + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return FALSE; + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + if (min_img_filter == PIPE_TEX_FILTER_NEAREST && + mag_img_filter == PIPE_TEX_FILTER_NEAREST) { + return FALSE; + } else { + return TRUE; + } + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return TRUE; + default: + assert(0 && "unexpected wrap mode"); + return FALSE; + } +} /** @@ -83,34 +125,49 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->swizzle_a = view->swizzle_a; state->target = texture->target; - state->pot_width = util_is_pot(texture->width0); - state->pot_height = util_is_pot(texture->height0); - state->pot_depth = util_is_pot(texture->depth0); + state->pot_width = util_is_power_of_two(texture->width0); + state->pot_height = util_is_power_of_two(texture->height0); + state->pot_depth = util_is_power_of_two(texture->depth0); state->wrap_s = sampler->wrap_s; state->wrap_t = sampler->wrap_t; state->wrap_r = sampler->wrap_r; state->min_img_filter = sampler->min_img_filter; state->mag_img_filter = sampler->mag_img_filter; - if (texture->last_level) { + + if (view->last_level && sampler->max_lod > 0.0f) { state->min_mip_filter = sampler->min_mip_filter; } else { state->min_mip_filter = PIPE_TEX_MIPFILTER_NONE; } + if (state->min_mip_filter != PIPE_TEX_MIPFILTER_NONE) { + if (sampler->lod_bias != 0.0f) { + state->lod_bias_non_zero = 1; + } + + /* If min_lod == max_lod we can greatly simplify mipmap selection. + * This is a case that occurs during automatic mipmap generation. + */ + if (sampler->min_lod == sampler->max_lod) { + state->min_max_lod_equal = 1; + } else { + if (sampler->min_lod > 0.0f) { + state->apply_min_lod = 1; + } + + if (sampler->max_lod < (float)view->last_level) { + state->apply_max_lod = 1; + } + } + } + state->compare_mode = sampler->compare_mode; if (sampler->compare_mode != PIPE_TEX_COMPARE_NONE) { state->compare_func = sampler->compare_func; } state->normalized_coords = sampler->normalized_coords; - state->lod_bias = sampler->lod_bias; - state->min_lod = sampler->min_lod; - state->max_lod = sampler->max_lod; - state->border_color[0] = sampler->border_color[0]; - state->border_color[1] = sampler->border_color[1]; - state->border_color[2] = sampler->border_color[2]; - state->border_color[3] = sampler->border_color[3]; /* * FIXME: Handle the remainder of pipe_sampler_view. @@ -119,84 +176,817 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** - * Gather elements from scatter positions in memory into a single vector. + * Generate code to compute coordinate gradient (rho). + * \param ddx partial derivatives of (s, t, r, q) with respect to X + * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * + * XXX: The resulting rho is scalar, so we ignore all but the first element of + * derivatives that are passed by the shader. + */ +static LLVMValueRef +lp_build_rho(struct lp_build_sample_context *bld, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4]) +{ + struct lp_build_context *float_size_bld = &bld->float_size_bld; + struct lp_build_context *float_bld = &bld->float_bld; + const unsigned dims = bld->dims; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef index0 = LLVMConstInt(i32t, 0, 0); + LLVMValueRef index1 = LLVMConstInt(i32t, 1, 0); + LLVMValueRef index2 = LLVMConstInt(i32t, 2, 0); + LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; + LLVMValueRef rho_x, rho_y; + LLVMValueRef rho_vec; + LLVMValueRef float_size; + LLVMValueRef rho; + + dsdx = LLVMBuildExtractElement(bld->builder, ddx[0], index0, "dsdx"); + dsdy = LLVMBuildExtractElement(bld->builder, ddy[0], index0, "dsdy"); + + if (dims <= 1) { + rho_x = dsdx; + rho_y = dsdy; + } + else { + rho_x = float_size_bld->undef; + rho_y = float_size_bld->undef; + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dsdx, index0, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dsdy, index0, ""); + + dtdx = LLVMBuildExtractElement(bld->builder, ddx[1], index0, "dtdx"); + dtdy = LLVMBuildExtractElement(bld->builder, ddy[1], index0, "dtdy"); + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, dtdx, index1, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, dtdy, index1, ""); + + if (dims >= 3) { + drdx = LLVMBuildExtractElement(bld->builder, ddx[2], index0, "drdx"); + drdy = LLVMBuildExtractElement(bld->builder, ddy[2], index0, "drdy"); + + rho_x = LLVMBuildInsertElement(bld->builder, rho_x, drdx, index2, ""); + rho_y = LLVMBuildInsertElement(bld->builder, rho_y, drdy, index2, ""); + } + } + + rho_x = lp_build_abs(float_size_bld, rho_x); + rho_y = lp_build_abs(float_size_bld, rho_y); + + rho_vec = lp_build_max(float_size_bld, rho_x, rho_y); + + float_size = lp_build_int_to_float(float_size_bld, bld->int_size); + + rho_vec = lp_build_mul(float_size_bld, rho_vec, float_size); + + if (dims <= 1) { + rho = rho_vec; + } + else { + if (dims >= 2) { + LLVMValueRef rho_s, rho_t, rho_r; + + rho_s = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho_t = LLVMBuildExtractElement(bld->builder, rho_vec, index1, ""); + + rho = lp_build_max(float_bld, rho_s, rho_t); + + if (dims >= 3) { + rho_r = LLVMBuildExtractElement(bld->builder, rho_vec, index0, ""); + rho = lp_build_max(float_bld, rho, rho_r); + } + } + } + + return rho; +} + + +/* + * Bri-linear lod computation + * + * Use a piece-wise linear approximation of log2 such that: + * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc. + * - linear approximation for values in the neighborhood of 0.5, 1.5., etc, + * with the steepness specified in 'factor' + * - exact result for 0.5, 1.5, etc. + * + * + * 1.0 - /----* + * / + * / + * / + * 0.5 - * + * / + * / + * / + * 0.0 - *----/ + * + * | | + * 2^0 2^1 + * + * This is a technique also commonly used in hardware: + * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html + * + * TODO: For correctness, this should only be applied when texture is known to + * have regular mipmaps, i.e., mipmaps derived from the base level. * - * @param src_width src element width - * @param dst_width result element width (source will be expanded to fit) - * @param length length of the offsets, - * @param base_ptr base pointer, should be a i8 pointer type. - * @param offsets vector with offsets + * TODO: This could be done in fixed point, where applicable. + */ +static void +lp_build_brilinear_lod(struct lp_build_sample_context *bld, + LLVMValueRef lod, + double factor, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) +{ + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef lod_fpart; + float pre_offset = (factor - 0.5)/factor - 0.5; + float post_offset = 1 - factor; + + if (0) { + lp_build_printf(bld->builder, "lod = %f\n", lod); + } + + lod = lp_build_add(float_bld, lod, + lp_build_const_vec(float_bld->type, pre_offset)); + + lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, &lod_fpart); + + lod_fpart = lp_build_mul(float_bld, lod_fpart, + lp_build_const_vec(float_bld->type, factor)); + + lod_fpart = lp_build_add(float_bld, lod_fpart, + lp_build_const_vec(float_bld->type, post_offset)); + + /* + * It's not necessary to clamp lod_fpart since: + * - the above expression will never produce numbers greater than one. + * - the mip filtering branch is only taken if lod_fpart is positive + */ + + *out_lod_fpart = lod_fpart; + + if (0) { + lp_build_printf(bld->builder, "lod_ipart = %i\n", *out_lod_ipart); + lp_build_printf(bld->builder, "lod_fpart = %f\n\n", *out_lod_fpart); + } +} + + +/** + * Generate code to compute texture level of detail (lambda). + * \param ddx partial derivatives of (s, t, r, q) with respect to X + * \param ddy partial derivatives of (s, t, r, q) with respect to Y + * \param lod_bias optional float vector with the shader lod bias + * \param explicit_lod optional float vector with the explicit lod + * \param width scalar int texture width + * \param height scalar int texture height + * \param depth scalar int texture depth + * + * XXX: The resulting lod is scalar, so ignore all but the first element of + * derivatives, lod_bias, etc that are passed by the shader. + */ +void +lp_build_lod_selector(struct lp_build_sample_context *bld, + unsigned unit, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4], + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart) + +{ + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef lod; + + *out_lod_ipart = bld->int_bld.zero; + *out_lod_fpart = bld->float_bld.zero; + + if (bld->static_state->min_max_lod_equal) { + /* User is forcing sampling from a particular mipmap level. + * This is hit during mipmap generation. + */ + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = min_lod; + } + else { + LLVMValueRef sampler_lod_bias = + bld->dynamic_state->lod_bias(bld->dynamic_state, bld->builder, unit); + LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); + + if (explicit_lod) { + lod = LLVMBuildExtractElement(bld->builder, explicit_lod, + index0, ""); + } + else { + LLVMValueRef rho; + + rho = lp_build_rho(bld, ddx, ddy); + + /* compute lod = log2(rho) */ + if ((mip_filter == PIPE_TEX_MIPFILTER_NONE || + mip_filter == PIPE_TEX_MIPFILTER_NEAREST) && + !lod_bias && + !bld->static_state->lod_bias_non_zero && + !bld->static_state->apply_max_lod && + !bld->static_state->apply_min_lod) { + *out_lod_ipart = lp_build_ilog2(float_bld, rho); + *out_lod_fpart = bld->float_bld.zero; + return; + } + + if (0) { + lod = lp_build_log2(float_bld, rho); + } + else { + lod = lp_build_fast_log2(float_bld, rho); + } + + /* add shader lod bias */ + if (lod_bias) { + lod_bias = LLVMBuildExtractElement(bld->builder, lod_bias, + index0, ""); + lod = LLVMBuildFAdd(bld->builder, lod, lod_bias, "shader_lod_bias"); + } + } + + /* add sampler lod bias */ + if (bld->static_state->lod_bias_non_zero) + lod = LLVMBuildFAdd(bld->builder, lod, sampler_lod_bias, "sampler_lod_bias"); + + + /* clamp lod */ + if (bld->static_state->apply_max_lod) { + LLVMValueRef max_lod = + bld->dynamic_state->max_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_min(float_bld, lod, max_lod); + } + if (bld->static_state->apply_min_lod) { + LLVMValueRef min_lod = + bld->dynamic_state->min_lod(bld->dynamic_state, bld->builder, unit); + + lod = lp_build_max(float_bld, lod, min_lod); + } + } + + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + if (BRILINEAR_FACTOR > 1.0) { + lp_build_brilinear_lod(bld, lod, BRILINEAR_FACTOR, + out_lod_ipart, out_lod_fpart); + } + else { + lp_build_ifloor_fract(float_bld, lod, out_lod_ipart, out_lod_fpart); + } + + lp_build_name(*out_lod_ipart, "lod_ipart"); + lp_build_name(*out_lod_fpart, "lod_fpart"); + } + else { + *out_lod_ipart = lp_build_iround(float_bld, lod); + } + + return; +} + + +/** + * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer + * mipmap level index. + * Note: this is all scalar code. + * \param lod scalar float texture level of detail + * \param level_out returns integer + */ +void +lp_build_nearest_mip_level(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef lod_ipart, + LLVMValueRef *level_out) +{ + struct lp_build_context *int_bld = &bld->int_bld; + LLVMValueRef last_level, level; + + LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0); + + last_level = bld->dynamic_state->last_level(bld->dynamic_state, + bld->builder, unit); + + /* convert float lod to integer */ + level = lod_ipart; + + /* clamp level to legal range of levels */ + *level_out = lp_build_clamp(int_bld, level, zero, last_level); +} + + +/** + * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to + * two (adjacent) mipmap level indexes. Later, we'll sample from those + * two mipmap levels and interpolate between them. + */ +void +lp_build_linear_mip_levels(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef lod_ipart, + LLVMValueRef *lod_fpart_inout, + LLVMValueRef *level0_out, + LLVMValueRef *level1_out) +{ + LLVMBuilderRef builder = bld->builder; + struct lp_build_context *int_bld = &bld->int_bld; + struct lp_build_context *float_bld = &bld->float_bld; + LLVMValueRef last_level; + LLVMValueRef clamp_min; + LLVMValueRef clamp_max; + + *level0_out = lod_ipart; + *level1_out = lp_build_add(int_bld, lod_ipart, int_bld->one); + + last_level = bld->dynamic_state->last_level(bld->dynamic_state, + bld->builder, unit); + + /* + * Clamp both lod_ipart and lod_ipart + 1 to [0, last_level], with the + * minimum number of comparisons, and zeroing lod_fpart in the extreme + * ends in the process. + */ + + /* lod_ipart < 0 */ + clamp_min = LLVMBuildICmp(builder, LLVMIntSLT, + lod_ipart, int_bld->zero, + "clamp_lod_to_zero"); + + *level0_out = LLVMBuildSelect(builder, clamp_min, + int_bld->zero, *level0_out, ""); + + *level1_out = LLVMBuildSelect(builder, clamp_min, + int_bld->zero, *level1_out, ""); + + *lod_fpart_inout = LLVMBuildSelect(builder, clamp_min, + float_bld->zero, *lod_fpart_inout, ""); + + /* lod_ipart >= last_level */ + clamp_max = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, last_level, + "clamp_lod_to_last"); + + *level0_out = LLVMBuildSelect(builder, clamp_max, + last_level, *level0_out, ""); + + *level1_out = LLVMBuildSelect(builder, clamp_max, + last_level, *level1_out, ""); + + *lod_fpart_inout = LLVMBuildSelect(builder, clamp_max, + float_bld->zero, *lod_fpart_inout, ""); + + lp_build_name(*level0_out, "sampler%u_miplevel0", unit); + lp_build_name(*level1_out, "sampler%u_miplevel1", unit); + lp_build_name(*lod_fpart_inout, "sampler%u_mipweight", unit); +} + + +/** + * Return pointer to a single mipmap level. + * \param data_array array of pointers to mipmap levels + * \param level integer mipmap level */ LLVMValueRef -lp_build_gather(LLVMBuilderRef builder, - unsigned length, - unsigned src_width, - unsigned dst_width, - LLVMValueRef base_ptr, - LLVMValueRef offsets) -{ - LLVMTypeRef src_type = LLVMIntType(src_width); - LLVMTypeRef src_ptr_type = LLVMPointerType(src_type, 0); - LLVMTypeRef dst_elem_type = LLVMIntType(dst_width); - LLVMTypeRef dst_vec_type = LLVMVectorType(dst_elem_type, length); - LLVMValueRef res; - unsigned i; +lp_build_get_mipmap_level(struct lp_build_sample_context *bld, + LLVMValueRef level) +{ + LLVMValueRef indexes[2], data_ptr; + indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + indexes[1] = level; + data_ptr = LLVMBuildGEP(bld->builder, bld->data_array, indexes, 2, ""); + data_ptr = LLVMBuildLoad(bld->builder, data_ptr, ""); + return data_ptr; +} + + +LLVMValueRef +lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, + int level) +{ + LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0); + return lp_build_get_mipmap_level(bld, lvl); +} + + +/** + * Codegen equivalent for u_minify(). + * Return max(1, base_size >> level); + */ +static LLVMValueRef +lp_build_minify(struct lp_build_context *bld, + LLVMValueRef base_size, + LLVMValueRef level) +{ + assert(lp_check_value(bld->type, base_size)); + assert(lp_check_value(bld->type, level)); + + if (level == bld->zero) { + /* if we're using mipmap level zero, no minification is needed */ + return base_size; + } + else { + LLVMValueRef size = + LLVMBuildLShr(bld->builder, base_size, level, "minify"); + assert(bld->type.sign); + size = lp_build_max(bld, size, bld->one); + return size; + } +} - res = LLVMGetUndef(dst_vec_type); - for(i = 0; i < length; ++i) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); - LLVMValueRef elem_offset; - LLVMValueRef elem_ptr; - LLVMValueRef elem; - elem_offset = LLVMBuildExtractElement(builder, offsets, index, ""); - elem_ptr = LLVMBuildGEP(builder, base_ptr, &elem_offset, 1, ""); - elem_ptr = LLVMBuildBitCast(builder, elem_ptr, src_ptr_type, ""); - elem = LLVMBuildLoad(builder, elem_ptr, ""); +/** + * Dereference stride_array[mipmap_level] array to get a stride. + * Return stride as a vector. + */ +static LLVMValueRef +lp_build_get_level_stride_vec(struct lp_build_sample_context *bld, + LLVMValueRef stride_array, LLVMValueRef level) +{ + LLVMValueRef indexes[2], stride; + indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + indexes[1] = level; + stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, ""); + stride = LLVMBuildLoad(bld->builder, stride, ""); + stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride); + return stride; +} + + +/** + * When sampling a mipmap, we need to compute the width, height, depth + * of the source levels from the level indexes. This helper function + * does that. + */ +void +lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, + LLVMValueRef ilevel, + LLVMValueRef *out_width_vec, + LLVMValueRef *out_height_vec, + LLVMValueRef *out_depth_vec, + LLVMValueRef *row_stride_vec, + LLVMValueRef *img_stride_vec) +{ + const unsigned dims = bld->dims; + LLVMValueRef ilevel_vec; + LLVMValueRef size_vec; + LLVMTypeRef i32t = LLVMInt32Type(); + + ilevel_vec = lp_build_broadcast_scalar(&bld->int_size_bld, ilevel); + + /* + * Compute width, height, depth at mipmap level 'ilevel' + */ + size_vec = lp_build_minify(&bld->int_size_bld, bld->int_size, ilevel_vec); + + *out_width_vec = lp_build_extract_broadcast(bld->builder, + bld->int_size_type, + bld->int_coord_type, + size_vec, + LLVMConstInt(i32t, 0, 0)); + if (dims >= 2) { + + *out_height_vec = lp_build_extract_broadcast(bld->builder, + bld->int_size_type, + bld->int_coord_type, + size_vec, + LLVMConstInt(i32t, 1, 0)); + + *row_stride_vec = lp_build_get_level_stride_vec(bld, + bld->row_stride_array, + ilevel); + if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { + *img_stride_vec = lp_build_get_level_stride_vec(bld, + bld->img_stride_array, + ilevel); + if (dims == 3) { + *out_depth_vec = lp_build_extract_broadcast(bld->builder, + bld->int_size_type, + bld->int_coord_type, + size_vec, + LLVMConstInt(i32t, 2, 0)); + } + } + } +} + + + +/** Helper used by lp_build_cube_lookup() */ +static LLVMValueRef +lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) +{ + /* ima = -0.5 / abs(coord); */ + LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5); + LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); + LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord); + return ima; +} + + +/** + * Helper used by lp_build_cube_lookup() + * \param sign scalar +1 or -1 + * \param coord float vector + * \param ima float vector + */ +static LLVMValueRef +lp_build_cube_coord(struct lp_build_context *coord_bld, + LLVMValueRef sign, int negate_coord, + LLVMValueRef coord, LLVMValueRef ima) +{ + /* return negate(coord) * ima * sign + 0.5; */ + LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5); + LLVMValueRef res; + + assert(negate_coord == +1 || negate_coord == -1); - assert(src_width <= dst_width); - if(src_width > dst_width) - elem = LLVMBuildTrunc(builder, elem, dst_elem_type, ""); - if(src_width < dst_width) - elem = LLVMBuildZExt(builder, elem, dst_elem_type, ""); + if (negate_coord == -1) { + coord = lp_build_negate(coord_bld, coord); + } - res = LLVMBuildInsertElement(builder, res, elem, index, ""); + res = lp_build_mul(coord_bld, coord, ima); + if (sign) { + sign = lp_build_broadcast_scalar(coord_bld, sign); + res = lp_build_mul(coord_bld, res, sign); } + res = lp_build_add(coord_bld, res, half); return res; } +/** Helper used by lp_build_cube_lookup() + * Return (major_coord >= 0) ? pos_face : neg_face; + */ +static LLVMValueRef +lp_build_cube_face(struct lp_build_sample_context *bld, + LLVMValueRef major_coord, + unsigned pos_face, unsigned neg_face) +{ + LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE, + major_coord, + bld->float_bld.zero, ""); + LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0); + LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0); + LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, ""); + return res; +} + + + +/** + * Generate code to do cube face selection and compute per-face texcoords. + */ +void +lp_build_cube_lookup(struct lp_build_sample_context *bld, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *face, + LLVMValueRef *face_s, + LLVMValueRef *face_t) +{ + struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *coord_bld = &bld->coord_bld; + LLVMValueRef rx, ry, rz; + LLVMValueRef arx, ary, arz; + LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25); + LLVMValueRef arx_ge_ary, arx_ge_arz; + LLVMValueRef ary_ge_arx, ary_ge_arz; + LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; + LLVMValueRef rx_pos, ry_pos, rz_pos; + + assert(bld->coord_bld.type.length == 4); + + /* + * Use the average of the four pixel's texcoords to choose the face. + */ + rx = lp_build_mul(float_bld, c25, + lp_build_sum_vector(&bld->coord_bld, s)); + ry = lp_build_mul(float_bld, c25, + lp_build_sum_vector(&bld->coord_bld, t)); + rz = lp_build_mul(float_bld, c25, + lp_build_sum_vector(&bld->coord_bld, r)); + + arx = lp_build_abs(float_bld, rx); + ary = lp_build_abs(float_bld, ry); + arz = lp_build_abs(float_bld, rz); + + /* + * Compare sign/magnitude of rx,ry,rz to determine face + */ + arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, ""); + arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, ""); + ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, ""); + ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, ""); + + arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, ""); + ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); + + rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, ""); + ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, ""); + rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, ""); + + { + struct lp_build_flow_context *flow_ctx; + struct lp_build_if_state if_ctx; + + flow_ctx = lp_build_flow_create(bld->builder); + lp_build_flow_scope_begin(flow_ctx); + + *face_s = bld->coord_bld.undef; + *face_t = bld->coord_bld.undef; + *face = bld->int_bld.undef; + + lp_build_name(*face_s, "face_s"); + lp_build_name(*face_t, "face_t"); + lp_build_name(*face, "face"); + + lp_build_flow_scope_declare(flow_ctx, face_s); + lp_build_flow_scope_declare(flow_ctx, face_t); + lp_build_flow_scope_declare(flow_ctx, face); + + lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz); + { + /* +/- X face */ + LLVMValueRef sign = lp_build_sgn(float_bld, rx); + LLVMValueRef ima = lp_build_cube_ima(coord_bld, s); + *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima); + *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); + *face = lp_build_cube_face(bld, rx, + PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_NEG_X); + } + lp_build_else(&if_ctx); + { + struct lp_build_flow_context *flow_ctx2; + struct lp_build_if_state if_ctx2; + + LLVMValueRef face_s2 = bld->coord_bld.undef; + LLVMValueRef face_t2 = bld->coord_bld.undef; + LLVMValueRef face2 = bld->int_bld.undef; + + flow_ctx2 = lp_build_flow_create(bld->builder); + lp_build_flow_scope_begin(flow_ctx2); + lp_build_flow_scope_declare(flow_ctx2, &face_s2); + lp_build_flow_scope_declare(flow_ctx2, &face_t2); + lp_build_flow_scope_declare(flow_ctx2, &face2); + + ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); + + lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz); + { + /* +/- Y face */ + LLVMValueRef sign = lp_build_sgn(float_bld, ry); + LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); + face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); + face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima); + face2 = lp_build_cube_face(bld, ry, + PIPE_TEX_FACE_POS_Y, + PIPE_TEX_FACE_NEG_Y); + } + lp_build_else(&if_ctx2); + { + /* +/- Z face */ + LLVMValueRef sign = lp_build_sgn(float_bld, rz); + LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); + face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima); + face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); + face2 = lp_build_cube_face(bld, rz, + PIPE_TEX_FACE_POS_Z, + PIPE_TEX_FACE_NEG_Z); + } + lp_build_endif(&if_ctx2); + lp_build_flow_scope_end(flow_ctx2); + lp_build_flow_destroy(flow_ctx2); + *face_s = face_s2; + *face_t = face_t2; + *face = face2; + } + + lp_build_endif(&if_ctx); + lp_build_flow_scope_end(flow_ctx); + lp_build_flow_destroy(flow_ctx); + } +} + + +/** + * Compute the partial offset of a pixel block along an arbitrary axis. + * + * @param coord coordinate in pixels + * @param stride number of bytes between rows of successive pixel blocks + * @param block_length number of pixels in a pixels block along the coordinate + * axis + * @param out_offset resulting relative offset of the pixel block in bytes + * @param out_subcoord resulting sub-block pixel coordinate + */ +void +lp_build_sample_partial_offset(struct lp_build_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_subcoord) +{ + LLVMValueRef offset; + LLVMValueRef subcoord; + + if (block_length == 1) { + subcoord = bld->zero; + } + else { + /* + * Pixel blocks have power of two dimensions. LLVM should convert the + * rem/div to bit arithmetic. + * TODO: Verify this. + * It does indeed BUT it does transform it to scalar (and back) when doing so + * (using roughly extract, shift/and, mov, unpack) (llvm 2.7). + * The generated code looks seriously unfunny and is quite expensive. + */ +#if 0 + LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length); + subcoord = LLVMBuildURem(bld->builder, coord, block_width, ""); + coord = LLVMBuildUDiv(bld->builder, coord, block_width, ""); +#else + unsigned logbase2 = util_unsigned_logbase2(block_length); + LLVMValueRef block_shift = lp_build_const_int_vec(bld->type, logbase2); + LLVMValueRef block_mask = lp_build_const_int_vec(bld->type, block_length - 1); + subcoord = LLVMBuildAnd(bld->builder, coord, block_mask, ""); + coord = LLVMBuildLShr(bld->builder, coord, block_shift, ""); +#endif + } + + offset = lp_build_mul(bld, coord, stride); + + assert(out_offset); + assert(out_subcoord); + + *out_offset = offset; + *out_subcoord = subcoord; +} + + /** * Compute the offset of a pixel block. * - * x, y, z, y_stride, z_stride are vectors, and they refer to pixel blocks, as - * per format description, and not individual pixels. + * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. + * + * Returns the relative offset and i,j sub-block coordinates */ -LLVMValueRef +void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, LLVMValueRef y, LLVMValueRef z, LLVMValueRef y_stride, - LLVMValueRef z_stride) + LLVMValueRef z_stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_i, + LLVMValueRef *out_j) { LLVMValueRef x_stride; LLVMValueRef offset; x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8); - offset = lp_build_mul(bld, x, x_stride); + + lp_build_sample_partial_offset(bld, + format_desc->block.width, + x, x_stride, + &offset, out_i); if (y && y_stride) { - LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride); + LLVMValueRef y_offset; + lp_build_sample_partial_offset(bld, + format_desc->block.height, + y, y_stride, + &y_offset, out_j); offset = lp_build_add(bld, offset, y_offset); } + else { + *out_j = bld->zero; + } if (z && z_stride) { - LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride); + LLVMValueRef z_offset; + LLVMValueRef k; + lp_build_sample_partial_offset(bld, + 1, /* pixel blocks are always 2D */ + z, z_stride, + &z_offset, &k); offset = lp_build_add(bld, offset, z_offset); } - return offset; + *out_offset = offset; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 8ceb20473d5..d1a1aa143d8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -36,7 +36,12 @@ #define LP_BLD_SAMPLE_H +#include "pipe/p_format.h" +#include "util/u_debug.h" #include "gallivm/lp_bld.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_swizzle.h" + struct pipe_resource; struct pipe_sampler_view; @@ -56,14 +61,14 @@ struct lp_sampler_static_state { /* pipe_sampler_view's state */ enum pipe_format format; - unsigned swizzle_r:3; + unsigned swizzle_r:3; /**< PIPE_SWIZZLE_* */ unsigned swizzle_g:3; unsigned swizzle_b:3; unsigned swizzle_a:3; /* pipe_texture's state */ - unsigned target:3; - unsigned pot_width:1; + unsigned target:3; /**< PIPE_TEXTURE_* */ + unsigned pot_width:1; /**< is the width a power of two? */ unsigned pot_height:1; unsigned pot_depth:1; @@ -77,8 +82,10 @@ struct lp_sampler_static_state unsigned compare_mode:1; unsigned compare_func:3; unsigned normalized_coords:1; - float lod_bias, min_lod, max_lod; - float border_color[4]; + unsigned min_max_lod_equal:1; /**< min_lod == max_lod ? */ + unsigned lod_bias_non_zero:1; + unsigned apply_min_lod:1; /**< min_lod > 0 ? */ + unsigned apply_max_lod:1; /**< max_lod < last_level ? */ }; @@ -95,48 +102,191 @@ struct lp_sampler_static_state struct lp_sampler_dynamic_state { - /** Obtain the base texture width. */ + /** Obtain the base texture width (returns int32) */ LLVMValueRef - (*width)( struct lp_sampler_dynamic_state *state, + (*width)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); - /** Obtain the base texture height. */ + /** Obtain the base texture height (returns int32) */ LLVMValueRef - (*height)( struct lp_sampler_dynamic_state *state, + (*height)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); - /** Obtain the base texture depth. */ + /** Obtain the base texture depth (returns int32) */ LLVMValueRef - (*depth)( struct lp_sampler_dynamic_state *state, + (*depth)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); - /** Obtain the number of mipmap levels (minus one). */ + /** Obtain the number of mipmap levels minus one (returns int32) */ LLVMValueRef - (*last_level)( struct lp_sampler_dynamic_state *state, + (*last_level)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain stride in bytes between image rows/blocks (returns int32) */ LLVMValueRef - (*row_stride)( struct lp_sampler_dynamic_state *state, + (*row_stride)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain stride in bytes between image slices (returns int32) */ LLVMValueRef - (*img_stride)( struct lp_sampler_dynamic_state *state, + (*img_stride)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain pointer to array of pointers to mimpap levels */ LLVMValueRef - (*data_ptr)( struct lp_sampler_dynamic_state *state, + (*data_ptr)( const struct lp_sampler_dynamic_state *state, LLVMBuilderRef builder, unsigned unit); + /** Obtain texture min lod (returns float) */ + LLVMValueRef + (*min_lod)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); + + /** Obtain texture max lod (returns float) */ + LLVMValueRef + (*max_lod)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); + + /** Obtain texture lod bias (returns float) */ + LLVMValueRef + (*lod_bias)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); + + /** Obtain texture border color (returns ptr to float[4]) */ + LLVMValueRef + (*border_color)(const struct lp_sampler_dynamic_state *state, + LLVMBuilderRef builder, unsigned unit); +}; + + +/** + * Keep all information for sampling code generation in a single place. + */ +struct lp_build_sample_context +{ + LLVMBuilderRef builder; + + const struct lp_sampler_static_state *static_state; + + struct lp_sampler_dynamic_state *dynamic_state; + + const struct util_format_description *format_desc; + + /* See texture_dims() */ + unsigned dims; + + /** regular scalar float type */ + struct lp_type float_type; + struct lp_build_context float_bld; + + /** float vector type */ + struct lp_build_context float_vec_bld; + + /** regular scalar float type */ + struct lp_type int_type; + struct lp_build_context int_bld; + + /** Incoming coordinates type and build context */ + struct lp_type coord_type; + struct lp_build_context coord_bld; + + /** Unsigned integer coordinates */ + struct lp_type uint_coord_type; + struct lp_build_context uint_coord_bld; + + /** Signed integer coordinates */ + struct lp_type int_coord_type; + struct lp_build_context int_coord_bld; + + /** Unsigned integer texture size */ + struct lp_type int_size_type; + struct lp_build_context int_size_bld; + + /** Unsigned integer texture size */ + struct lp_type float_size_type; + struct lp_build_context float_size_bld; + + /** Output texels type and build context */ + struct lp_type texel_type; + struct lp_build_context texel_bld; + + /* Common dynamic state values */ + LLVMValueRef width; + LLVMValueRef height; + LLVMValueRef depth; + LLVMValueRef row_stride_array; + LLVMValueRef img_stride_array; + LLVMValueRef data_array; + + /** Integer vector with texture width, height, depth */ + LLVMValueRef int_size; }; + +/** + * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at + * this time. Return whether the given mode is supported by that function. + */ +static INLINE boolean +lp_is_simple_wrap_mode(unsigned mode) +{ + switch (mode) { + case PIPE_TEX_WRAP_REPEAT: + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return TRUE; + default: + return FALSE; + } +} + + +static INLINE void +apply_sampler_swizzle(struct lp_build_sample_context *bld, + LLVMValueRef *texel) +{ + unsigned char swizzles[4]; + + swizzles[0] = bld->static_state->swizzle_r; + swizzles[1] = bld->static_state->swizzle_g; + swizzles[2] = bld->static_state->swizzle_b; + swizzles[3] = bld->static_state->swizzle_a; + + lp_build_swizzle_soa_inplace(&bld->texel_bld, texel, swizzles); +} + + +static INLINE unsigned +texture_dims(enum pipe_texture_target tex) +{ + switch (tex) { + case PIPE_TEXTURE_1D: + return 1; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_CUBE: + return 2; + case PIPE_TEXTURE_3D: + return 3; + default: + assert(0 && "bad texture target in texture_dims()"); + return 2; + } +} + + +boolean +lp_sampler_wrap_mode_uses_border_color(unsigned mode, + unsigned min_img_filter, + unsigned mag_img_filter); + /** * Derive the sampler static state. */ @@ -146,23 +296,80 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, const struct pipe_sampler_state *sampler); -LLVMValueRef -lp_build_gather(LLVMBuilderRef builder, - unsigned length, - unsigned src_width, - unsigned dst_width, - LLVMValueRef base_ptr, - LLVMValueRef offsets); +void +lp_build_lod_selector(struct lp_build_sample_context *bld, + unsigned unit, + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4], + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + unsigned mip_filter, + LLVMValueRef *out_lod_ipart, + LLVMValueRef *out_lod_fpart); + +void +lp_build_nearest_mip_level(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef lod, + LLVMValueRef *level_out); +void +lp_build_linear_mip_levels(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef lod_ipart, + LLVMValueRef *lod_fpart_inout, + LLVMValueRef *level0_out, + LLVMValueRef *level1_out); + +LLVMValueRef +lp_build_get_mipmap_level(struct lp_build_sample_context *bld, + LLVMValueRef level); LLVMValueRef +lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, + int level); + + +void +lp_build_mipmap_level_sizes(struct lp_build_sample_context *bld, + LLVMValueRef ilevel, + LLVMValueRef *out_width_vec, + LLVMValueRef *out_height_vec, + LLVMValueRef *out_depth_vec, + LLVMValueRef *row_stride_vec, + LLVMValueRef *img_stride_vec); + + +void +lp_build_cube_lookup(struct lp_build_sample_context *bld, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *face, + LLVMValueRef *face_s, + LLVMValueRef *face_t); + + +void +lp_build_sample_partial_offset(struct lp_build_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_i); + + +void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, LLVMValueRef y, LLVMValueRef z, LLVMValueRef y_stride, - LLVMValueRef z_stride); + LLVMValueRef z_stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_i, + LLVMValueRef *out_j); void @@ -173,9 +380,15 @@ lp_build_sample_soa(LLVMBuilderRef builder, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - LLVMValueRef lodbias, - LLVMValueRef *texel); + const LLVMValueRef *ddx, + const LLVMValueRef *ddy, + LLVMValueRef lod_bias, + LLVMValueRef explicit_lod, + LLVMValueRef texel_out[4]); +void +lp_build_sample_nop(struct lp_type type, + LLVMValueRef texel_out[4]); #endif /* LP_BLD_SAMPLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c new file mode 100644 index 00000000000..e7410448c04 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -0,0 +1,1092 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Texture sampling -- SoA. + * + * @author Jose Fonseca <[email protected]> + * @author Brian Paul <[email protected]> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "util/u_debug.h" +#include "util/u_dump.h" +#include "util/u_memory.h" +#include "util/u_math.h" +#include "util/u_format.h" +#include "lp_bld_debug.h" +#include "lp_bld_type.h" +#include "lp_bld_const.h" +#include "lp_bld_conv.h" +#include "lp_bld_arit.h" +#include "lp_bld_logic.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_pack.h" +#include "lp_bld_flow.h" +#include "lp_bld_gather.h" +#include "lp_bld_format.h" +#include "lp_bld_sample.h" +#include "lp_bld_sample_aos.h" +#include "lp_bld_quad.h" + + +/** + * Build LLVM code for texture coord wrapping, for nearest filtering, + * for scaled integer texcoords. + * \param block_length is the length of the pixel block along the + * coordinate axis + * \param coord the incoming texcoord (s,t,r or q) scaled to the texture size + * \param length the texture size along one dimension + * \param stride pixel stride along the coordinate axis (in bytes) + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param out_offset byte offset for the wrapped coordinate + * \param out_i resulting sub-block pixel coordinate for coord0 + */ +static void +lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef length, + LLVMValueRef stride, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *out_offset, + LLVMValueRef *out_i) +{ + struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef length_minus_one; + + length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if(is_pot) + coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, ""); + else { + /* Add a bias to the texcoord to handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + coord = LLVMBuildAdd(bld->builder, coord, bias, ""); + coord = LLVMBuildURem(bld->builder, coord, length, ""); + } + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero); + coord = lp_build_min(int_coord_bld, coord, length_minus_one); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + } + + lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride, + out_offset, out_i); +} + + +/** + * Build LLVM code for texture coord wrapping, for linear filtering, + * for scaled integer texcoords. + * \param block_length is the length of the pixel block along the + * coordinate axis + * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size + * \param length the texture size along one dimension + * \param stride pixel stride along the coordinate axis (in bytes) + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param offset0 resulting relative offset for coord0 + * \param offset1 resulting relative offset for coord0 + 1 + * \param i0 resulting sub-block pixel coordinate for coord0 + * \param i1 resulting sub-block pixel coordinate for coord0 + 1 + */ +static void +lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord0, + LLVMValueRef length, + LLVMValueRef stride, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *offset0, + LLVMValueRef *offset1, + LLVMValueRef *i0, + LLVMValueRef *i1) +{ + struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef length_minus_one; + LLVMValueRef lmask, umask, mask; + + if (block_length != 1) { + /* + * If the pixel block covers more than one pixel then there is no easy + * way to calculate offset1 relative to offset0. Instead, compute them + * independently. + */ + + LLVMValueRef coord1; + + lp_build_sample_wrap_nearest_int(bld, + block_length, + coord0, + length, + stride, + is_pot, + wrap_mode, + offset0, i0); + + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + + lp_build_sample_wrap_nearest_int(bld, + block_length, + coord1, + length, + stride, + is_pot, + wrap_mode, + offset1, i1); + + return; + } + + /* + * Scalar pixels -- try to compute offset0 and offset1 with a single stride + * multiplication. + */ + + *i0 = uint_coord_bld->zero; + *i1 = uint_coord_bld->zero; + + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); + } + else { + /* Add a bias to the texcoord to handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + coord0 = LLVMBuildAdd(bld->builder, coord0, bias, ""); + coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); + } + + mask = lp_build_compare(bld->builder, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + + *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset1 = LLVMBuildAnd(bld->builder, + lp_build_add(uint_coord_bld, *offset0, stride), + mask, ""); + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type, + PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero); + umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type, + PIPE_FUNC_LESS, coord0, length_minus_one); + + coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero); + coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one); + + mask = LLVMBuildAnd(bld->builder, lmask, umask, ""); + + *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset1 = lp_build_add(uint_coord_bld, + *offset0, + LLVMBuildAnd(bld->builder, stride, mask, "")); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + *offset0 = uint_coord_bld->zero; + *offset1 = uint_coord_bld->zero; + break; + } +} + + +/** + * Sample a single texture image with nearest sampling. + * If sampling a cube texture, r = cube face in [0,5]. + * Return filtered color as two vectors of 16-bit fixed point values. + */ +static void +lp_build_sample_image_nearest(struct lp_build_sample_context *bld, + LLVMValueRef width_vec, + LLVMValueRef height_vec, + LLVMValueRef depth_vec, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMBuilderRef builder = bld->builder; + struct lp_build_context i32, h16, u8n; + LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; + LLVMValueRef i32_c8; + LLVMValueRef s_ipart, t_ipart, r_ipart; + LLVMValueRef x_stride; + LLVMValueRef x_offset, offset; + LLVMValueRef x_subcoord, y_subcoord, z_subcoord; + + lp_build_context_init(&i32, builder, lp_type_int_vec(32)); + lp_build_context_init(&h16, builder, lp_type_ufixed(16)); + lp_build_context_init(&u8n, builder, lp_type_unorm(8)); + + i32_vec_type = lp_build_vec_type(i32.type); + h16_vec_type = lp_build_vec_type(h16.type); + u8n_vec_type = lp_build_vec_type(u8n.type); + + if (bld->static_state->normalized_coords) { + /* s = s * width, t = t * height */ + LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); + LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec, + coord_vec_type, ""); + s = lp_build_mul(&bld->coord_bld, s, fp_width); + if (dims >= 2) { + LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec, + coord_vec_type, ""); + t = lp_build_mul(&bld->coord_bld, t, fp_height); + if (dims >= 3) { + LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec, + coord_vec_type, ""); + r = lp_build_mul(&bld->coord_bld, r, fp_depth); + } + } + } + + /* scale coords by 256 (8 fractional bits) */ + s = lp_build_mul_imm(&bld->coord_bld, s, 256); + if (dims >= 2) + t = lp_build_mul_imm(&bld->coord_bld, t, 256); + if (dims >= 3) + r = lp_build_mul_imm(&bld->coord_bld, r, 256); + + /* convert float to int */ + s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); + if (dims >= 2) + t = LLVMBuildFPToSI(builder, t, i32_vec_type, ""); + if (dims >= 3) + r = LLVMBuildFPToSI(builder, r, i32_vec_type, ""); + + /* compute floor (shift right 8) */ + i32_c8 = lp_build_const_int_vec(i32.type, 8); + s_ipart = LLVMBuildAShr(builder, s, i32_c8, ""); + if (dims >= 2) + t_ipart = LLVMBuildAShr(builder, t, i32_c8, ""); + if (dims >= 3) + r_ipart = LLVMBuildAShr(builder, r, i32_c8, ""); + + /* get pixel, row, image strides */ + x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + bld->format_desc->block.bits/8); + + /* Do texcoord wrapping, compute texel offset */ + lp_build_sample_wrap_nearest_int(bld, + bld->format_desc->block.width, + s_ipart, width_vec, x_stride, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_offset, &x_subcoord); + offset = x_offset; + if (dims >= 2) { + LLVMValueRef y_offset; + lp_build_sample_wrap_nearest_int(bld, + bld->format_desc->block.height, + t_ipart, height_vec, row_stride_vec, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_offset, &y_subcoord); + offset = lp_build_add(&bld->uint_coord_bld, offset, y_offset); + if (dims >= 3) { + LLVMValueRef z_offset; + lp_build_sample_wrap_nearest_int(bld, + 1, /* block length (depth) */ + r_ipart, depth_vec, img_stride_vec, + bld->static_state->pot_height, + bld->static_state->wrap_r, + &z_offset, &z_subcoord); + offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset); + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef z_offset; + /* The r coord is the cube face in [0,5] */ + z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec); + offset = lp_build_add(&bld->uint_coord_bld, offset, z_offset); + } + } + + /* + * Fetch the pixels as 4 x 32bit (rgba order might differ): + * + * rgba0 rgba1 rgba2 rgba3 + * + * bit cast them into 16 x u8 + * + * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + * + * unpack them into two 8 x i16: + * + * r0 g0 b0 a0 r1 g1 b1 a1 + * r2 g2 b2 a2 r3 g3 b3 a3 + * + * The higher 8 bits of the resulting elements will be zero. + */ + { + LLVMValueRef rgba8; + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->builder, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->builder, + bld->format_desc, + u8n.type, + data_ptr, offset, + x_subcoord, + y_subcoord); + } + + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(builder, u8n.type, h16.type, + rgba8, + colors_lo, colors_hi); + } +} + + +/** + * Sample a single texture image with (bi-)(tri-)linear sampling. + * Return filtered color as two vectors of 16-bit fixed point values. + */ +static void +lp_build_sample_image_linear(struct lp_build_sample_context *bld, + LLVMValueRef width_vec, + LLVMValueRef height_vec, + LLVMValueRef depth_vec, + LLVMValueRef row_stride_vec, + LLVMValueRef img_stride_vec, + LLVMValueRef data_ptr, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef *colors_lo, + LLVMValueRef *colors_hi) +{ + const unsigned dims = bld->dims; + LLVMBuilderRef builder = bld->builder; + struct lp_build_context i32, h16, u8n; + LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; + LLVMValueRef i32_c8, i32_c128, i32_c255; + LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; + LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi; + LLVMValueRef r_ipart, r_fpart, r_fpart_lo, r_fpart_hi; + LLVMValueRef x_stride, y_stride, z_stride; + LLVMValueRef x_offset0, x_offset1; + LLVMValueRef y_offset0, y_offset1; + LLVMValueRef z_offset0, z_offset1; + LLVMValueRef offset[2][2][2]; /* [z][y][x] */ + LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2]; + LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ + LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ + LLVMValueRef packed_lo, packed_hi; + unsigned x, y, z; + unsigned i, j, k; + unsigned numj, numk; + + lp_build_context_init(&i32, builder, lp_type_int_vec(32)); + lp_build_context_init(&h16, builder, lp_type_ufixed(16)); + lp_build_context_init(&u8n, builder, lp_type_unorm(8)); + + i32_vec_type = lp_build_vec_type(i32.type); + h16_vec_type = lp_build_vec_type(h16.type); + u8n_vec_type = lp_build_vec_type(u8n.type); + + if (bld->static_state->normalized_coords) { + /* s = s * width, t = t * height */ + LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); + LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width_vec, + coord_vec_type, ""); + s = lp_build_mul(&bld->coord_bld, s, fp_width); + if (dims >= 2) { + LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height_vec, + coord_vec_type, ""); + t = lp_build_mul(&bld->coord_bld, t, fp_height); + } + if (dims >= 3) { + LLVMValueRef fp_depth = LLVMBuildSIToFP(bld->builder, depth_vec, + coord_vec_type, ""); + r = lp_build_mul(&bld->coord_bld, r, fp_depth); + } + } + + /* scale coords by 256 (8 fractional bits) */ + s = lp_build_mul_imm(&bld->coord_bld, s, 256); + if (dims >= 2) + t = lp_build_mul_imm(&bld->coord_bld, t, 256); + if (dims >= 3) + r = lp_build_mul_imm(&bld->coord_bld, r, 256); + + /* convert float to int */ + s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); + if (dims >= 2) + t = LLVMBuildFPToSI(builder, t, i32_vec_type, ""); + if (dims >= 3) + r = LLVMBuildFPToSI(builder, r, i32_vec_type, ""); + + /* subtract 0.5 (add -128) */ + i32_c128 = lp_build_const_int_vec(i32.type, -128); + s = LLVMBuildAdd(builder, s, i32_c128, ""); + if (dims >= 2) { + t = LLVMBuildAdd(builder, t, i32_c128, ""); + } + if (dims >= 3) { + r = LLVMBuildAdd(builder, r, i32_c128, ""); + } + + /* compute floor (shift right 8) */ + i32_c8 = lp_build_const_int_vec(i32.type, 8); + s_ipart = LLVMBuildAShr(builder, s, i32_c8, ""); + if (dims >= 2) + t_ipart = LLVMBuildAShr(builder, t, i32_c8, ""); + if (dims >= 3) + r_ipart = LLVMBuildAShr(builder, r, i32_c8, ""); + + /* compute fractional part (AND with 0xff) */ + i32_c255 = lp_build_const_int_vec(i32.type, 255); + s_fpart = LLVMBuildAnd(builder, s, i32_c255, ""); + if (dims >= 2) + t_fpart = LLVMBuildAnd(builder, t, i32_c255, ""); + if (dims >= 3) + r_fpart = LLVMBuildAnd(builder, r, i32_c255, ""); + + /* get pixel, row and image strides */ + x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + bld->format_desc->block.bits/8); + y_stride = row_stride_vec; + z_stride = img_stride_vec; + + /* do texcoord wrapping and compute texel offsets */ + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.width, + s_ipart, width_vec, x_stride, + bld->static_state->pot_width, + bld->static_state->wrap_s, + &x_offset0, &x_offset1, + &x_subcoord[0], &x_subcoord[1]); + for (z = 0; z < 2; z++) { + for (y = 0; y < 2; y++) { + offset[z][y][0] = x_offset0; + offset[z][y][1] = x_offset1; + } + } + + if (dims >= 2) { + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.height, + t_ipart, height_vec, y_stride, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_offset0, &y_offset1, + &y_subcoord[0], &y_subcoord[1]); + + for (z = 0; z < 2; z++) { + for (x = 0; x < 2; x++) { + offset[z][0][x] = lp_build_add(&bld->uint_coord_bld, + offset[z][0][x], y_offset0); + offset[z][1][x] = lp_build_add(&bld->uint_coord_bld, + offset[z][1][x], y_offset1); + } + } + } + + if (dims >= 3) { + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.height, + r_ipart, depth_vec, z_stride, + bld->static_state->pot_depth, + bld->static_state->wrap_r, + &z_offset0, &z_offset1, + &z_subcoord[0], &z_subcoord[1]); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + offset[0][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[0][y][x], z_offset0); + offset[1][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[1][y][x], z_offset1); + } + } + } + else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef z_offset; + z_offset = lp_build_mul(&bld->uint_coord_bld, r, img_stride_vec); + for (y = 0; y < 2; y++) { + for (x = 0; x < 2; x++) { + /* The r coord is the cube face in [0,5] */ + offset[0][y][x] = lp_build_add(&bld->uint_coord_bld, + offset[0][y][x], z_offset); + } + } + } + + /* + * Transform 4 x i32 in + * + * s_fpart = {s0, s1, s2, s3} + * + * into 8 x i16 + * + * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} + * + * into two 8 x i16 + * + * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} + * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} + * + * and likewise for t_fpart. There is no risk of loosing precision here + * since the fractional parts only use the lower 8bits. + */ + s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); + if (dims >= 2) + t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); + if (dims >= 3) + r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); + + { + LLVMTypeRef elem_type = LLVMInt32Type(); + LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef shuffle_lo; + LLVMValueRef shuffle_hi; + + for (j = 0; j < h16.type.length; j += 4) { +#ifdef PIPE_ARCH_LITTLE_ENDIAN + unsigned subindex = 0; +#else + unsigned subindex = 1; +#endif + LLVMValueRef index; + + index = LLVMConstInt(elem_type, j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_lo[j + i] = index; + + index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); + for (i = 0; i < 4; ++i) + shuffles_hi[j + i] = index; + } + + shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); + shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); + + s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_lo, ""); + s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, + shuffle_hi, ""); + if (dims >= 2) { + t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_lo, ""); + t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, + shuffle_hi, ""); + } + if (dims >= 3) { + r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_lo, ""); + r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, + shuffle_hi, ""); + } + } + + /* + * Fetch the pixels as 4 x 32bit (rgba order might differ): + * + * rgba0 rgba1 rgba2 rgba3 + * + * bit cast them into 16 x u8 + * + * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + * + * unpack them into two 8 x i16: + * + * r0 g0 b0 a0 r1 g1 b1 a1 + * r2 g2 b2 a2 r3 g3 b3 a3 + * + * The higher 8 bits of the resulting elements will be zero. + */ + numj = 1 + (dims >= 2); + numk = 1 + (dims >= 3); + + for (k = 0; k < numk; k++) { + for (j = 0; j < numj; j++) { + for (i = 0; i < 2; i++) { + LLVMValueRef rgba8; + + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->builder, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset[k][j][i]); + + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->builder, + bld->format_desc, + u8n.type, + data_ptr, offset[k][j][i], + x_subcoord[i], + y_subcoord[j]); + } + + /* Expand one 4*rgba8 to two 2*rgba16 */ + lp_build_unpack2(builder, u8n.type, h16.type, + rgba8, + &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); + } + } + } + + /* + * Linear interpolation with 8.8 fixed point. + */ + if (dims == 1) { + /* 1-D lerp */ + packed_lo = lp_build_lerp(&h16, + s_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1]); + + packed_hi = lp_build_lerp(&h16, + s_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1]); + } + else { + /* 2-D lerp */ + packed_lo = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[0][0][0], + neighbors_lo[0][0][1], + neighbors_lo[0][1][0], + neighbors_lo[0][1][1]); + + packed_hi = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[0][0][0], + neighbors_hi[0][0][1], + neighbors_hi[0][1][0], + neighbors_hi[0][1][1]); + + if (dims >= 3) { + LLVMValueRef packed_lo2, packed_hi2; + + /* lerp in the second z slice */ + packed_lo2 = lp_build_lerp_2d(&h16, + s_fpart_lo, t_fpart_lo, + neighbors_lo[1][0][0], + neighbors_lo[1][0][1], + neighbors_lo[1][1][0], + neighbors_lo[1][1][1]); + + packed_hi2 = lp_build_lerp_2d(&h16, + s_fpart_hi, t_fpart_hi, + neighbors_hi[1][0][0], + neighbors_hi[1][0][1], + neighbors_hi[1][1][0], + neighbors_hi[1][1][1]); + /* interp between two z slices */ + packed_lo = lp_build_lerp(&h16, r_fpart_lo, + packed_lo, packed_lo2); + packed_hi = lp_build_lerp(&h16, r_fpart_hi, + packed_hi, packed_hi2); + } + } + + *colors_lo = packed_lo; + *colors_hi = packed_hi; +} + + +/** + * Sample the texture/mipmap using given image filter and mip filter. + * data0_ptr and data1_ptr point to the two mipmap levels to sample + * from. width0/1_vec, height0/1_vec, depth0/1_vec indicate their sizes. + * If we're using nearest miplevel sampling the '1' values will be null/unused. + */ +static void +lp_build_sample_mipmap(struct lp_build_sample_context *bld, + unsigned img_filter, + unsigned mip_filter, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, + LLVMValueRef lod_fpart, + LLVMValueRef colors_lo_var, + LLVMValueRef colors_hi_var) +{ + LLVMBuilderRef builder = bld->builder; + LLVMValueRef width0_vec; + LLVMValueRef width1_vec; + LLVMValueRef height0_vec; + LLVMValueRef height1_vec; + LLVMValueRef depth0_vec; + LLVMValueRef depth1_vec; + LLVMValueRef row_stride0_vec; + LLVMValueRef row_stride1_vec; + LLVMValueRef img_stride0_vec; + LLVMValueRef img_stride1_vec; + LLVMValueRef data_ptr0; + LLVMValueRef data_ptr1; + LLVMValueRef colors0_lo, colors0_hi; + LLVMValueRef colors1_lo, colors1_hi; + + + /* sample the first mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel0, + &width0_vec, &height0_vec, &depth0_vec, + &row_stride0_vec, &img_stride0_vec); + data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + width0_vec, height0_vec, depth0_vec, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + else { + assert(img_filter == PIPE_TEX_FILTER_LINEAR); + lp_build_sample_image_linear(bld, + width0_vec, height0_vec, depth0_vec, + row_stride0_vec, img_stride0_vec, + data_ptr0, s, t, r, + &colors0_lo, &colors0_hi); + } + + /* Store the first level's colors in the output variables */ + LLVMBuildStore(builder, colors0_lo, colors_lo_var); + LLVMBuildStore(builder, colors0_hi, colors_hi_var); + + if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { + LLVMValueRef h16_scale = LLVMConstReal(LLVMFloatType(), 256.0); + LLVMTypeRef i32_type = LLVMIntType(32); + struct lp_build_flow_context *flow_ctx; + struct lp_build_if_state if_ctx; + LLVMValueRef need_lerp; + + flow_ctx = lp_build_flow_create(builder); + + lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16_scale, ""); + lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32_type, "lod_fpart.fixed16"); + + /* need_lerp = lod_fpart > 0 */ + need_lerp = LLVMBuildICmp(builder, LLVMIntSGT, + lod_fpart, LLVMConstNull(i32_type), + "need_lerp"); + + lp_build_if(&if_ctx, flow_ctx, builder, need_lerp); + { + struct lp_build_context h16_bld; + + lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16)); + + /* sample the second mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel1, + &width1_vec, &height1_vec, &depth1_vec, + &row_stride1_vec, &img_stride1_vec); + data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, + width1_vec, height1_vec, depth1_vec, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + else { + lp_build_sample_image_linear(bld, + width1_vec, height1_vec, depth1_vec, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + &colors1_lo, &colors1_hi); + } + + /* interpolate samples from the two mipmap levels */ + + lod_fpart = LLVMBuildTrunc(builder, lod_fpart, h16_bld.elem_type, ""); + lod_fpart = lp_build_broadcast_scalar(&h16_bld, lod_fpart); + + colors0_lo = lp_build_lerp(&h16_bld, lod_fpart, + colors0_lo, colors1_lo); + colors0_hi = lp_build_lerp(&h16_bld, lod_fpart, + colors0_hi, colors1_hi); + + LLVMBuildStore(builder, colors0_lo, colors_lo_var); + LLVMBuildStore(builder, colors0_hi, colors_hi_var); + } + lp_build_endif(&if_ctx); + + lp_build_flow_destroy(flow_ctx); + } +} + + + +/** + * Texture sampling in AoS format. Used when sampling common 32-bit/texel + * formats. 1D/2D/3D/cube texture supported. All mipmap sampling modes + * but only limited texture coord wrap modes. + */ +void +lp_build_sample_aos(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + const LLVMValueRef *ddx, + const LLVMValueRef *ddy, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef texel_out[4]) +{ + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->builder; + const unsigned mip_filter = bld->static_state->min_mip_filter; + const unsigned min_filter = bld->static_state->min_img_filter; + const unsigned mag_filter = bld->static_state->mag_img_filter; + const unsigned dims = bld->dims; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; + LLVMValueRef ilevel0, ilevel1 = NULL; + LLVMValueRef packed, packed_lo, packed_hi; + LLVMValueRef unswizzled[4]; + LLVMValueRef face_ddx[4], face_ddy[4]; + struct lp_build_context h16_bld; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0); + + /* we only support the common/simple wrap modes at this time */ + assert(lp_is_simple_wrap_mode(bld->static_state->wrap_s)); + if (dims >= 2) + assert(lp_is_simple_wrap_mode(bld->static_state->wrap_t)); + if (dims >= 3) + assert(lp_is_simple_wrap_mode(bld->static_state->wrap_r)); + + + /* make 16-bit fixed-pt builder context */ + lp_build_context_init(&h16_bld, builder, lp_type_ufixed(16)); + + /* cube face selection, compute pre-face coords, etc. */ + if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef face, face_s, face_t; + lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); + s = face_s; /* vec */ + t = face_t; /* vec */ + /* use 'r' to indicate cube face */ + r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ + + /* recompute ddx, ddy using the new (s,t) face texcoords */ + face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[2] = NULL; + face_ddx[3] = NULL; + face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[2] = NULL; + face_ddy[3] = NULL; + ddx = face_ddx; + ddy = face_ddy; + } + + /* + * Compute the level of detail (float). + */ + if (min_filter != mag_filter || + mip_filter != PIPE_TEX_MIPFILTER_NONE) { + /* Need to compute lod either to choose mipmap levels or to + * distinguish between minification/magnification with one mipmap level. + */ + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = i32t_zero; + } + + /* + * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 + */ + switch (mip_filter) { + default: + assert(0 && "bad mip_filter value in lp_build_sample_aos()"); + /* fall-through */ + case PIPE_TEX_MIPFILTER_NONE: + /* always use mip level 0 */ + if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + /* XXX this is a work-around for an apparent bug in LLVM 2.7. + * We should be able to set ilevel0 = const(0) but that causes + * bad x86 code to be emitted. + */ + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + } + else { + ilevel0 = i32t_zero; + } + break; + case PIPE_TEX_MIPFILTER_NEAREST: + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + break; + case PIPE_TEX_MIPFILTER_LINEAR: + assert(lod_ipart); + assert(lod_fpart); + lp_build_linear_mip_levels(bld, unit, + lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + break; + } + + /* + * Get/interpolate texture colors. + */ + + packed_lo = lp_build_alloca(builder, h16_bld.vec_type, "packed_lo"); + packed_hi = lp_build_alloca(builder, h16_bld.vec_type, "packed_hi"); + + if (min_filter == mag_filter) { + /* no need to distinquish between minification and magnification */ + lp_build_sample_mipmap(bld, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + packed_lo, packed_hi); + } + else { + /* Emit conditional to choose min image filter or mag image filter + * depending on the lod being > 0 or <= 0, respectively. + */ + struct lp_build_flow_context *flow_ctx; + struct lp_build_if_state if_ctx; + LLVMValueRef minify; + + flow_ctx = lp_build_flow_create(builder); + + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); + + lp_build_if(&if_ctx, flow_ctx, builder, minify); + { + /* Use the minification filter */ + lp_build_sample_mipmap(bld, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + packed_lo, packed_hi); + } + lp_build_else(&if_ctx); + { + /* Use the magnification filter */ + lp_build_sample_mipmap(bld, + mag_filter, PIPE_TEX_MIPFILTER_NONE, + s, t, r, + i32t_zero, NULL, NULL, + packed_lo, packed_hi); + } + lp_build_endif(&if_ctx); + + lp_build_flow_destroy(flow_ctx); + } + + /* + * combine the values stored in 'packed_lo' and 'packed_hi' variables + * into 'packed' + */ + packed = lp_build_pack2(builder, + h16_bld.type, lp_type_unorm(8), + LLVMBuildLoad(builder, packed_lo, ""), + LLVMBuildLoad(builder, packed_hi, "")); + + /* + * Convert to SoA and swizzle. + */ + lp_build_rgba8_to_f32_soa(builder, + bld->texel_type, + packed, unswizzled); + + if (util_format_is_rgba8_variant(bld->format_desc)) { + lp_build_format_swizzle_soa(bld->format_desc, + &bld->texel_bld, + unswizzled, texel_out); + } + else { + texel_out[0] = unswizzled[0]; + texel_out[1] = unswizzled[1]; + texel_out[2] = unswizzled[2]; + texel_out[3] = unswizzled[3]; + } + + apply_sampler_swizzle(bld, texel_out); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h new file mode 100644 index 00000000000..5d9ecac4d50 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.h @@ -0,0 +1,56 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Texture sampling -- SoA. + * + * @author Jose Fonseca <[email protected]> + * @author Brian Paul <[email protected]> + */ + +#ifndef LP_BLD_SAMPLE_AOS_H +#define LP_BLD_SAMPLE_AOS_H + + +#include "lp_bld_sample.h" + + +void +lp_build_sample_aos(struct lp_build_sample_context *bld, + unsigned unit, + LLVMValueRef s, + LLVMValueRef t, + LLVMValueRef r, + const LLVMValueRef *ddx, + const LLVMValueRef *ddy, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef texel_out[4]); + + +#endif /* LP_BLD_SAMPLE_AOS_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 54ef921678d..b8cf938acfe 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -30,6 +30,7 @@ * Texture sampling -- SoA. * * @author Jose Fonseca <[email protected]> + * @author Brian Paul <[email protected]> */ #include "pipe/p_defines.h" @@ -39,204 +40,28 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_format.h" -#include "util/u_cpu_detect.h" #include "lp_bld_debug.h" #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_conv.h" #include "lp_bld_arit.h" +#include "lp_bld_bitarit.h" #include "lp_bld_logic.h" +#include "lp_bld_printf.h" #include "lp_bld_swizzle.h" -#include "lp_bld_pack.h" #include "lp_bld_flow.h" +#include "lp_bld_gather.h" #include "lp_bld_format.h" #include "lp_bld_sample.h" - - -/** - * Keep all information for sampling code generation in a single place. - */ -struct lp_build_sample_context -{ - LLVMBuilderRef builder; - - const struct lp_sampler_static_state *static_state; - - struct lp_sampler_dynamic_state *dynamic_state; - - const struct util_format_description *format_desc; - - /** regular scalar float type */ - struct lp_type float_type; - struct lp_build_context float_bld; - - /** regular scalar float type */ - struct lp_type int_type; - struct lp_build_context int_bld; - - /** Incoming coordinates type and build context */ - struct lp_type coord_type; - struct lp_build_context coord_bld; - - /** Unsigned integer coordinates */ - struct lp_type uint_coord_type; - struct lp_build_context uint_coord_bld; - - /** Signed integer coordinates */ - struct lp_type int_coord_type; - struct lp_build_context int_coord_bld; - - /** Output texels type and build context */ - struct lp_type texel_type; - struct lp_build_context texel_bld; -}; - - -/** - * Does the given texture wrap mode allow sampling the texture border color? - * XXX maybe move this into gallium util code. - */ -static boolean -wrap_mode_uses_border_color(unsigned mode) -{ - switch (mode) { - case PIPE_TEX_WRAP_REPEAT: - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - case PIPE_TEX_WRAP_MIRROR_REPEAT: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - return FALSE; - case PIPE_TEX_WRAP_CLAMP: - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - return TRUE; - default: - assert(0 && "unexpected wrap mode"); - return FALSE; - } -} - - -static LLVMValueRef -lp_build_get_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, LLVMValueRef level) -{ - LLVMValueRef indexes[2], data_ptr; - indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); - indexes[1] = level; - data_ptr = LLVMBuildGEP(bld->builder, data_array, indexes, 2, ""); - data_ptr = LLVMBuildLoad(bld->builder, data_ptr, ""); - return data_ptr; -} - - -static LLVMValueRef -lp_build_get_const_mipmap_level(struct lp_build_sample_context *bld, - LLVMValueRef data_array, int level) -{ - LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0); - return lp_build_get_mipmap_level(bld, data_array, lvl); -} - - -/** - * Dereference stride_array[mipmap_level] array to get a stride. - * Return stride as a vector. - */ -static LLVMValueRef -lp_build_get_level_stride_vec(struct lp_build_sample_context *bld, - LLVMValueRef stride_array, LLVMValueRef level) -{ - LLVMValueRef indexes[2], stride; - indexes[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); - indexes[1] = level; - stride = LLVMBuildGEP(bld->builder, stride_array, indexes, 2, ""); - stride = LLVMBuildLoad(bld->builder, stride, ""); - stride = lp_build_broadcast_scalar(&bld->int_coord_bld, stride); - return stride; -} - - -/** Dereference stride_array[0] array to get a stride (as vector). */ -static LLVMValueRef -lp_build_get_const_level_stride_vec(struct lp_build_sample_context *bld, - LLVMValueRef stride_array, int level) -{ - LLVMValueRef lvl = LLVMConstInt(LLVMInt32Type(), level, 0); - return lp_build_get_level_stride_vec(bld, stride_array, lvl); -} - - -static int -texture_dims(enum pipe_texture_target tex) -{ - switch (tex) { - case PIPE_TEXTURE_1D: - return 1; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_CUBE: - return 2; - case PIPE_TEXTURE_3D: - return 3; - default: - assert(0 && "bad texture target in texture_dims()"); - return 2; - } -} - - -static LLVMValueRef -lp_build_swizzle_chan_soa(struct lp_type type, - const LLVMValueRef *unswizzled, - enum util_format_swizzle swizzle) -{ - switch (swizzle) { - case PIPE_SWIZZLE_RED: - case PIPE_SWIZZLE_GREEN: - case PIPE_SWIZZLE_BLUE: - case PIPE_SWIZZLE_ALPHA: - return unswizzled[swizzle]; - case PIPE_SWIZZLE_ZERO: - return lp_build_zero(type); - case PIPE_SWIZZLE_ONE: - return lp_build_one(type); - default: - assert(0); - return lp_build_undef(type); - } -} - - -static void -lp_build_swizzle_soa(struct lp_build_sample_context *bld, - LLVMValueRef *texel) -{ - LLVMValueRef unswizzled[4]; - unsigned char swizzles[4]; - unsigned chan; - - for (chan = 0; chan < 4; ++chan) { - unswizzled[chan] = texel[chan]; - } - - swizzles[0] = bld->static_state->swizzle_r; - swizzles[1] = bld->static_state->swizzle_g; - swizzles[2] = bld->static_state->swizzle_b; - swizzles[3] = bld->static_state->swizzle_a; - - for (chan = 0; chan < 4; ++chan) { - unsigned swizzle = swizzles[chan]; - texel[chan] = lp_build_swizzle_chan_soa(bld->texel_type, - unswizzled, swizzle); - } -} - +#include "lp_bld_sample_aos.h" +#include "lp_bld_struct.h" +#include "lp_bld_quad.h" /** * Generate code to fetch a texel from a texture at int coords (x, y, z). * The computation depends on whether the texture is 1D, 2D or 3D. - * The result, texel, will be: + * The result, texel, will be float vectors: * texel[0] = red values * texel[1] = green values * texel[2] = blue values @@ -244,6 +69,7 @@ lp_build_swizzle_soa(struct lp_build_sample_context *bld, */ static void lp_build_sample_texel_soa(struct lp_build_sample_context *bld, + unsigned unit, LLVMValueRef width, LLVMValueRef height, LLVMValueRef depth, @@ -253,23 +79,29 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, LLVMValueRef y_stride, LLVMValueRef z_stride, LLVMValueRef data_ptr, - LLVMValueRef *texel) + LLVMValueRef texel_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const struct lp_sampler_static_state *static_state = bld->static_state; + const unsigned dims = bld->dims; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; LLVMValueRef offset; LLVMValueRef i, j; LLVMValueRef use_border = NULL; /* use_border = x < 0 || x >= width || y < 0 || y >= height */ - if (wrap_mode_uses_border_color(bld->static_state->wrap_s)) { + if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s, + static_state->min_img_filter, + static_state->mag_img_filter)) { LLVMValueRef b1, b2; b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero); b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width); use_border = LLVMBuildOr(bld->builder, b1, b2, "b1_or_b2"); } - if (dims >= 2 && wrap_mode_uses_border_color(bld->static_state->wrap_t)) { + if (dims >= 2 && + lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t, + static_state->min_img_filter, + static_state->mag_img_filter)) { LLVMValueRef b1, b2; b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero); b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height); @@ -282,7 +114,10 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } } - if (dims == 3 && wrap_mode_uses_border_color(bld->static_state->wrap_r)) { + if (dims == 3 && + lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r, + static_state->min_img_filter, + static_state->mag_img_filter)) { LLVMValueRef b1, b2; b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero); b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth); @@ -295,44 +130,30 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, } } - /* - * Describe the coordinates in terms of pixel blocks. - * - * TODO: pixel blocks are power of two. LLVM should convert rem/div to - * bit arithmetic. Verify this. - */ - - if (bld->format_desc->block.width == 1) { - i = bld->uint_coord_bld.zero; - } - else { - LLVMValueRef block_width = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.width); - i = LLVMBuildURem(bld->builder, x, block_width, ""); - x = LLVMBuildUDiv(bld->builder, x, block_width, ""); - } + /* convert x,y,z coords to linear offset from start of texture, in bytes */ + lp_build_sample_offset(&bld->uint_coord_bld, + bld->format_desc, + x, y, z, y_stride, z_stride, + &offset, &i, &j); - if (bld->format_desc->block.height == 1) { - j = bld->uint_coord_bld.zero; - } - else { - LLVMValueRef block_height = lp_build_const_int_vec(bld->uint_coord_bld.type, bld->format_desc->block.height); - j = LLVMBuildURem(bld->builder, y, block_height, ""); - y = LLVMBuildUDiv(bld->builder, y, block_height, ""); + if (use_border) { + /* If we can sample the border color, it means that texcoords may + * lie outside the bounds of the texture image. We need to do + * something to prevent reading out of bounds and causing a segfault. + * + * Simply AND the texture coords with !use_border. This will cause + * coords which are out of bounds to become zero. Zero's guaranteed + * to be inside the texture image. + */ + offset = lp_build_andnot(&bld->uint_coord_bld, offset, use_border); } - /* convert x,y,z coords to linear offset from start of texture, in bytes */ - offset = lp_build_sample_offset(&bld->uint_coord_bld, - bld->format_desc, - x, y, z, y_stride, z_stride); - lp_build_fetch_rgba_soa(bld->builder, bld->format_desc, bld->texel_type, data_ptr, offset, i, j, - texel); - - lp_build_swizzle_soa(bld, texel); + texel_out); /* * Note: if we find an app which frequently samples the texture border @@ -351,44 +172,22 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, if (use_border) { /* select texel color or border color depending on use_border */ + LLVMValueRef border_color_ptr = + bld->dynamic_state->border_color(bld->dynamic_state, + bld->builder, unit); int chan; for (chan = 0; chan < 4; chan++) { LLVMValueRef border_chan = - lp_build_const_vec(bld->texel_type, - bld->static_state->border_color[chan]); - texel[chan] = lp_build_select(&bld->texel_bld, use_border, - border_chan, texel[chan]); + lp_build_array_get(bld->builder, border_color_ptr, + lp_build_const_int32(chan)); + LLVMValueRef border_chan_vec = + lp_build_broadcast_scalar(&bld->float_vec_bld, border_chan); + texel_out[chan] = lp_build_select(&bld->texel_bld, use_border, + border_chan_vec, texel_out[chan]); } } -} - -static LLVMValueRef -lp_build_sample_packed(struct lp_build_sample_context *bld, - LLVMValueRef x, - LLVMValueRef y, - LLVMValueRef y_stride, - LLVMValueRef data_array) -{ - LLVMValueRef offset; - LLVMValueRef data_ptr; - - offset = lp_build_sample_offset(&bld->uint_coord_bld, - bld->format_desc, - x, y, NULL, y_stride, NULL); - - assert(bld->format_desc->block.width == 1); - assert(bld->format_desc->block.height == 1); - assert(bld->format_desc->block.bits <= bld->texel_type.width); - - /* get pointer to mipmap level 0 data */ - data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0); - - return lp_build_gather(bld->builder, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); + apply_sampler_swizzle(bld, texel_out); } @@ -426,81 +225,6 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, /** - * We only support a few wrap modes in lp_build_sample_wrap_int() at this time. - * Return whether the given mode is supported by that function. - */ -static boolean -is_simple_wrap_mode(unsigned mode) -{ - switch (mode) { - case PIPE_TEX_WRAP_REPEAT: - case PIPE_TEX_WRAP_CLAMP: - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - return TRUE; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - default: - return FALSE; - } -} - - -/** - * Build LLVM code for texture wrap mode, for scaled integer texcoords. - * \param coord the incoming texcoord (s,t,r or q) scaled to the texture size - * \param length the texture size along one dimension - * \param is_pot if TRUE, length is a power of two - * \param wrap_mode one of PIPE_TEX_WRAP_x - */ -static LLVMValueRef -lp_build_sample_wrap_int(struct lp_build_sample_context *bld, - LLVMValueRef coord, - LLVMValueRef length, - boolean is_pot, - unsigned wrap_mode) -{ - struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; - struct lp_build_context *int_coord_bld = &bld->int_coord_bld; - LLVMValueRef length_minus_one; - - length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); - - switch(wrap_mode) { - case PIPE_TEX_WRAP_REPEAT: - if(is_pot) - coord = LLVMBuildAnd(bld->builder, coord, length_minus_one, ""); - else - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ - coord = LLVMBuildURem(bld->builder, coord, length, ""); - break; - - case PIPE_TEX_WRAP_CLAMP: - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: - coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero); - coord = lp_build_min(int_coord_bld, coord, length_minus_one); - break; - - case PIPE_TEX_WRAP_MIRROR_REPEAT: - case PIPE_TEX_WRAP_MIRROR_CLAMP: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - /* FIXME */ - _debug_printf("llvmpipe: failed to translate texture wrap mode %s\n", - util_dump_tex_wrap(wrap_mode, TRUE)); - coord = lp_build_max(uint_coord_bld, coord, uint_coord_bld->zero); - coord = lp_build_min(uint_coord_bld, coord, length_minus_one); - break; - - default: - assert(0); - } - - return coord; -} - - -/** * Build LLVM code for texture wrap mode for linear filtering. * \param x0_out returns first integer texcoord * \param x1_out returns second integer texcoord @@ -519,11 +243,9 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, struct lp_build_context *coord_bld = &bld->coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; - LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0); LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5); LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); - LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one); LLVMValueRef coord0, coord1, weight; switch(wrap_mode) { @@ -531,19 +253,19 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, /* mul by size and subtract 0.5 */ coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(uint_coord_bld, coord0, uint_coord_bld->one); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); /* repeat wrap */ if (is_pot) { coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); coord1 = LLVMBuildAnd(bld->builder, coord1, length_minus_one, ""); } else { - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ + /* Add a bias to the texcoord to handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + coord0 = LLVMBuildAdd(bld->builder, coord0, bias, ""); + coord1 = LLVMBuildAdd(bld->builder, coord1, bias, ""); coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); coord1 = LLVMBuildURem(bld->builder, coord1, length, ""); } @@ -551,16 +273,18 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, case PIPE_TEX_WRAP_CLAMP: if (bld->static_state->normalized_coords) { + /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); } - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_clamp(coord_bld, coord, coord_bld->zero, - length_f_minus_one); - coord1 = lp_build_add(coord_bld, coord, coord_bld->one); - coord1 = lp_build_clamp(coord_bld, coord1, coord_bld->zero, - length_f_minus_one); - coord0 = lp_build_ifloor(coord_bld, coord0); - coord1 = lp_build_ifloor(coord_bld, coord1); + + /* clamp to [0, length] */ + coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f); + + coord = lp_build_sub(coord_bld, coord, half); + + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; case PIPE_TEX_WRAP_CLAMP_TO_EDGE: @@ -574,14 +298,12 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, else { LLVMValueRef min, max; /* clamp to [0.5, length - 0.5] */ - min = lp_build_const_vec(coord_bld->type, 0.5F); + min = half; max = lp_build_sub(coord_bld, length_f, min); coord = lp_build_clamp(coord_bld, coord, min, max); } - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* coord0 = floor(coord); */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); /* coord0 = max(coord0, 0) */ coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero); @@ -593,29 +315,16 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, { LLVMValueRef min, max; if (bld->static_state->normalized_coords) { - /* min = -1.0 / (2 * length) = -0.5 / length */ - min = lp_build_mul(coord_bld, - lp_build_const_vec(coord_bld->type, -0.5F), - lp_build_rcp(coord_bld, length_f)); - /* max = 1.0 - min */ - max = lp_build_sub(coord_bld, coord_bld->one, min); - /* coord = clamp(coord, min, max) */ - coord = lp_build_clamp(coord_bld, coord, min, max); - /* scale coord to length (and sub 0.5?) */ + /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_sub(coord_bld, coord, half); - } - else { - /* clamp to [-0.5, length + 0.5] */ - min = lp_build_const_vec(coord_bld->type, -0.5F); - max = lp_build_sub(coord_bld, length_f, min); - coord = lp_build_clamp(coord_bld, coord, min, max); - coord = lp_build_sub(coord_bld, coord, half); } - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - /* convert to int */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* clamp to [-0.5, length + 0.5] */ + min = lp_build_const_vec(coord_bld->type, -0.5F); + max = lp_build_sub(coord_bld, length_f, min); + coord = lp_build_clamp(coord_bld, coord, min, max); + coord = lp_build_sub(coord_bld, coord, half); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -628,11 +337,8 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, coord = lp_build_mul(coord_bld, coord, length_f); coord = lp_build_sub(coord_bld, coord, half); - /* compute lerp weight */ - weight = lp_build_fract(coord_bld, coord); - - /* convert to int coords */ - coord0 = lp_build_ifloor(coord_bld, coord); + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); /* coord0 = max(coord0, 0) */ @@ -642,37 +348,43 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, break; case PIPE_TEX_WRAP_MIRROR_CLAMP: - { - LLVMValueRef min, max; - /* min = 1.0 / (2 * length) */ - min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f)); - /* max = 1.0 - min */ - max = lp_build_sub(coord_bld, coord_bld->one, min); + coord = lp_build_abs(coord_bld, coord); - coord = lp_build_abs(coord_bld, coord); - coord = lp_build_clamp(coord_bld, coord, min, max); + if (bld->static_state->normalized_coords) { + /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); - if(0)coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); - coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } + + /* clamp to [0, length] */ + coord = lp_build_min(coord_bld, coord, length_f); + + coord = lp_build_sub(coord_bld, coord, half); + + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: { LLVMValueRef min, max; - /* min = 1.0 / (2 * length) */ - min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f)); - /* max = 1.0 - min */ - max = lp_build_sub(coord_bld, coord_bld->one, min); coord = lp_build_abs(coord_bld, coord); + + if (bld->static_state->normalized_coords) { + /* scale coord to length */ + coord = lp_build_mul(coord_bld, coord, length_f); + } + + /* clamp to [0.5, length - 0.5] */ + min = half; + max = lp_build_sub(coord_bld, length_f, min); coord = lp_build_clamp(coord_bld, coord, min, max); - coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -680,19 +392,23 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: { LLVMValueRef min, max; - /* min = -1.0 / (2 * length) = -0.5 / length */ - min = lp_build_mul(coord_bld, - lp_build_const_vec(coord_bld->type, -0.5F), - lp_build_rcp(coord_bld, length_f)); - /* max = 1.0 - min */ - max = lp_build_sub(coord_bld, coord_bld->one, min); coord = lp_build_abs(coord_bld, coord); + + if (bld->static_state->normalized_coords) { + /* scale coord to length */ + coord = lp_build_mul(coord_bld, coord, length_f); + } + + /* clamp to [-0.5, length + 0.5] */ + min = lp_build_negate(coord_bld, half); + max = lp_build_sub(coord_bld, length_f, min); coord = lp_build_clamp(coord_bld, coord, min, max); - coord = lp_build_mul(coord_bld, coord, length_f); + coord = lp_build_sub(coord_bld, coord, half); - weight = lp_build_fract(coord_bld, coord); - coord0 = lp_build_ifloor(coord_bld, coord); + + /* convert to int, compute lerp weight */ + lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight); coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); } break; @@ -713,7 +429,7 @@ lp_build_sample_wrap_linear(struct lp_build_sample_context *bld, /** * Build LLVM code for texture wrap mode for nearest filtering. * \param coord the incoming texcoord (nominally in [0,1]) - * \param length the texture size along one dimension, as int + * \param length the texture size along one dimension, as int vector * \param is_pot if TRUE, length is a power of two * \param wrap_mode one of PIPE_TEX_WRAP_x */ @@ -727,10 +443,8 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, struct lp_build_context *coord_bld = &bld->coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; - LLVMValueRef two = lp_build_const_vec(coord_bld->type, 2.0); LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length); LLVMValueRef length_minus_one = lp_build_sub(uint_coord_bld, length, uint_coord_bld->one); - LLVMValueRef length_f_minus_one = lp_build_sub(coord_bld, length_f, coord_bld->one); LLVMValueRef icoord; switch(wrap_mode) { @@ -739,127 +453,89 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, icoord = lp_build_ifloor(coord_bld, coord); if (is_pot) icoord = LLVMBuildAnd(bld->builder, icoord, length_minus_one, ""); - else - /* Signed remainder won't give the right results for negative - * dividends but unsigned remainder does.*/ + else { + /* Add a bias to the texcoord to handle negative coords */ + LLVMValueRef bias = lp_build_mul_imm(uint_coord_bld, length, 1024); + icoord = LLVMBuildAdd(bld->builder, icoord, bias, ""); icoord = LLVMBuildURem(bld->builder, icoord, length, ""); + } break; case PIPE_TEX_WRAP_CLAMP: - /* mul by size */ + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: if (bld->static_state->normalized_coords) { + /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); } + /* floor */ icoord = lp_build_ifloor(coord_bld, coord); - /* clamp to [0, size-1]. Note: int coord builder type */ + + /* clamp to [0, length - 1]. */ icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero, length_minus_one); break; - case PIPE_TEX_WRAP_CLAMP_TO_EDGE: - { - LLVMValueRef min, max; - if (bld->static_state->normalized_coords) { - /* min = 1.0 / (2 * length) */ - min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f)); - /* max = length - min */ - max = lp_build_sub(coord_bld, length_f, min); - /* scale coord to length */ - coord = lp_build_mul(coord_bld, coord, length_f); - } - else { - /* clamp to [0.5, length - 0.5] */ - min = lp_build_const_vec(coord_bld->type, 0.5F); - max = lp_build_sub(coord_bld, length_f, min); - } - /* coord = clamp(coord, min, max) */ - coord = lp_build_clamp(coord_bld, coord, min, max); - icoord = lp_build_ifloor(coord_bld, coord); - } - break; - case PIPE_TEX_WRAP_CLAMP_TO_BORDER: /* Note: this is the same as CLAMP_TO_EDGE, except min = -min */ { LLVMValueRef min, max; + if (bld->static_state->normalized_coords) { - /* min = -1.0 / (2 * length) = -0.5 / length */ - min = lp_build_mul(coord_bld, - lp_build_const_vec(coord_bld->type, -0.5F), - lp_build_rcp(coord_bld, length_f)); - /* max = length - min */ - max = lp_build_sub(coord_bld, length_f, min); /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); } - else { - /* clamp to [-0.5, length + 0.5] */ - min = lp_build_const_vec(coord_bld->type, -0.5F); - max = lp_build_sub(coord_bld, length_f, min); - } - /* coord = clamp(coord, min, max) */ - coord = lp_build_clamp(coord_bld, coord, min, max); + icoord = lp_build_ifloor(coord_bld, coord); + + /* clamp to [-1, length] */ + min = lp_build_negate(int_coord_bld, int_coord_bld->one); + max = length; + icoord = lp_build_clamp(int_coord_bld, icoord, min, max); } break; case PIPE_TEX_WRAP_MIRROR_REPEAT: - { - LLVMValueRef min, max; - /* min = 1.0 / (2 * length) */ - min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f)); - /* max = length - min */ - max = lp_build_sub(coord_bld, length_f, min); + /* compute mirror function */ + coord = lp_build_coord_mirror(bld, coord); - /* compute mirror function */ - coord = lp_build_coord_mirror(bld, coord); + /* scale coord to length */ + assert(bld->static_state->normalized_coords); + coord = lp_build_mul(coord_bld, coord, length_f); - /* scale coord to length */ - coord = lp_build_mul(coord_bld, coord, length_f); + icoord = lp_build_ifloor(coord_bld, coord); - /* coord = clamp(coord, min, max) */ - coord = lp_build_clamp(coord_bld, coord, min, max); - icoord = lp_build_ifloor(coord_bld, coord); - } + /* clamp to [0, length - 1] */ + icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); break; case PIPE_TEX_WRAP_MIRROR_CLAMP: - coord = lp_build_abs(coord_bld, coord); - coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f_minus_one); - icoord = lp_build_ifloor(coord_bld, coord); - break; - case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: - { - LLVMValueRef min, max; - /* min = 1.0 / (2 * length) */ - min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f)); - /* max = length - min */ - max = lp_build_sub(coord_bld, length_f, min); + coord = lp_build_abs(coord_bld, coord); - coord = lp_build_abs(coord_bld, coord); + if (bld->static_state->normalized_coords) { + /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_clamp(coord_bld, coord, min, max); - icoord = lp_build_ifloor(coord_bld, coord); } + + icoord = lp_build_ifloor(coord_bld, coord); + + /* clamp to [0, length - 1] */ + icoord = lp_build_min(int_coord_bld, icoord, length_minus_one); break; case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: - { - LLVMValueRef min, max; - /* min = 1.0 / (2 * length) */ - min = lp_build_rcp(coord_bld, lp_build_mul(coord_bld, two, length_f)); - min = lp_build_negate(coord_bld, min); - /* max = length - min */ - max = lp_build_sub(coord_bld, length_f, min); + coord = lp_build_abs(coord_bld, coord); - coord = lp_build_abs(coord_bld, coord); + if (bld->static_state->normalized_coords) { + /* scale coord to length */ coord = lp_build_mul(coord_bld, coord, length_f); - coord = lp_build_clamp(coord_bld, coord, min, max); - icoord = lp_build_ifloor(coord_bld, coord); } + + icoord = lp_build_ifloor(coord_bld, coord); + + /* clamp to [0, length] */ + icoord = lp_build_min(int_coord_bld, icoord, length); break; default: @@ -872,213 +548,12 @@ lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld, /** - * Codegen equivalent for u_minify(). - * Return max(1, base_size >> level); - */ -static LLVMValueRef -lp_build_minify(struct lp_build_sample_context *bld, - LLVMValueRef base_size, - LLVMValueRef level) -{ - LLVMValueRef size = LLVMBuildAShr(bld->builder, base_size, level, "minify"); - size = lp_build_max(&bld->int_coord_bld, size, bld->int_coord_bld.one); - return size; -} - - -/** - * Generate code to compute texture level of detail (lambda). - * \param s vector of texcoord s values - * \param t vector of texcoord t values - * \param r vector of texcoord r values - * \param shader_lod_bias vector float with the shader lod bias, - * \param width scalar int texture width - * \param height scalar int texture height - * \param depth scalar int texture depth - */ -static LLVMValueRef -lp_build_lod_selector(struct lp_build_sample_context *bld, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef r, - LLVMValueRef shader_lod_bias, - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth) - -{ - if (bld->static_state->min_lod == bld->static_state->max_lod) { - /* User is forcing sampling from a particular mipmap level. - * This is hit during mipmap generation. - */ - return LLVMConstReal(LLVMFloatType(), bld->static_state->min_lod); - } - else { - const int dims = texture_dims(bld->static_state->target); - struct lp_build_context *float_bld = &bld->float_bld; - LLVMValueRef sampler_lod_bias = LLVMConstReal(LLVMFloatType(), - bld->static_state->lod_bias); - LLVMValueRef min_lod = LLVMConstReal(LLVMFloatType(), - bld->static_state->min_lod); - LLVMValueRef max_lod = LLVMConstReal(LLVMFloatType(), - bld->static_state->max_lod); - - LLVMValueRef index0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - LLVMValueRef index1 = LLVMConstInt(LLVMInt32Type(), 1, 0); - LLVMValueRef index2 = LLVMConstInt(LLVMInt32Type(), 2, 0); - - LLVMValueRef s0, s1, s2; - LLVMValueRef t0, t1, t2; - LLVMValueRef r0, r1, r2; - LLVMValueRef dsdx, dsdy, dtdx, dtdy, drdx, drdy; - LLVMValueRef rho, lod; - - /* - * dsdx = abs(s[1] - s[0]); - * dsdy = abs(s[2] - s[0]); - * dtdx = abs(t[1] - t[0]); - * dtdy = abs(t[2] - t[0]); - * drdx = abs(r[1] - r[0]); - * drdy = abs(r[2] - r[0]); - * XXX we're assuming a four-element quad in 2x2 layout here. - */ - s0 = LLVMBuildExtractElement(bld->builder, s, index0, "s0"); - s1 = LLVMBuildExtractElement(bld->builder, s, index1, "s1"); - s2 = LLVMBuildExtractElement(bld->builder, s, index2, "s2"); - dsdx = LLVMBuildSub(bld->builder, s1, s0, ""); - dsdx = lp_build_abs(float_bld, dsdx); - dsdy = LLVMBuildSub(bld->builder, s2, s0, ""); - dsdy = lp_build_abs(float_bld, dsdy); - if (dims > 1) { - t0 = LLVMBuildExtractElement(bld->builder, t, index0, "t0"); - t1 = LLVMBuildExtractElement(bld->builder, t, index1, "t1"); - t2 = LLVMBuildExtractElement(bld->builder, t, index2, "t2"); - dtdx = LLVMBuildSub(bld->builder, t1, t0, ""); - dtdx = lp_build_abs(float_bld, dtdx); - dtdy = LLVMBuildSub(bld->builder, t2, t0, ""); - dtdy = lp_build_abs(float_bld, dtdy); - if (dims > 2) { - r0 = LLVMBuildExtractElement(bld->builder, r, index0, "r0"); - r1 = LLVMBuildExtractElement(bld->builder, r, index1, "r1"); - r2 = LLVMBuildExtractElement(bld->builder, r, index2, "r2"); - drdx = LLVMBuildSub(bld->builder, r1, r0, ""); - drdx = lp_build_abs(float_bld, drdx); - drdy = LLVMBuildSub(bld->builder, r2, r0, ""); - drdy = lp_build_abs(float_bld, drdy); - } - } - - /* Compute rho = max of all partial derivatives scaled by texture size. - * XXX this could be vectorized somewhat - */ - rho = LLVMBuildMul(bld->builder, - lp_build_max(float_bld, dsdx, dsdy), - lp_build_int_to_float(float_bld, width), ""); - if (dims > 1) { - LLVMValueRef max; - max = LLVMBuildMul(bld->builder, - lp_build_max(float_bld, dtdx, dtdy), - lp_build_int_to_float(float_bld, height), ""); - rho = lp_build_max(float_bld, rho, max); - if (dims > 2) { - max = LLVMBuildMul(bld->builder, - lp_build_max(float_bld, drdx, drdy), - lp_build_int_to_float(float_bld, depth), ""); - rho = lp_build_max(float_bld, rho, max); - } - } - - /* compute lod = log2(rho) */ - lod = lp_build_log2(float_bld, rho); - - /* add sampler lod bias */ - lod = LLVMBuildAdd(bld->builder, lod, sampler_lod_bias, "sampler LOD bias"); - - /* add shader lod bias */ - /* XXX for now we take only the first element since our lod is scalar */ - shader_lod_bias = LLVMBuildExtractElement(bld->builder, shader_lod_bias, - LLVMConstInt(LLVMInt32Type(), 0, 0), ""); - lod = LLVMBuildAdd(bld->builder, lod, shader_lod_bias, "shader LOD bias"); - - /* clamp lod */ - lod = lp_build_clamp(float_bld, lod, min_lod, max_lod); - - return lod; - } -} - - -/** - * For PIPE_TEX_MIPFILTER_NEAREST, convert float LOD to integer - * mipmap level index. - * Note: this is all scalar code. - * \param lod scalar float texture level of detail - * \param level_out returns integer - */ -static void -lp_build_nearest_mip_level(struct lp_build_sample_context *bld, - unsigned unit, - LLVMValueRef lod, - LLVMValueRef *level_out) -{ - struct lp_build_context *float_bld = &bld->float_bld; - struct lp_build_context *int_bld = &bld->int_bld; - LLVMValueRef last_level, level; - - LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0); - - last_level = bld->dynamic_state->last_level(bld->dynamic_state, - bld->builder, unit); - - /* convert float lod to integer */ - level = lp_build_iround(float_bld, lod); - - /* clamp level to legal range of levels */ - *level_out = lp_build_clamp(int_bld, level, zero, last_level); -} - - -/** - * For PIPE_TEX_MIPFILTER_LINEAR, convert float LOD to integer to - * two (adjacent) mipmap level indexes. Later, we'll sample from those - * two mipmap levels and interpolate between them. - */ -static void -lp_build_linear_mip_levels(struct lp_build_sample_context *bld, - unsigned unit, - LLVMValueRef lod, - LLVMValueRef *level0_out, - LLVMValueRef *level1_out, - LLVMValueRef *weight_out) -{ - struct lp_build_context *float_bld = &bld->float_bld; - struct lp_build_context *int_bld = &bld->int_bld; - LLVMValueRef last_level, level; - - last_level = bld->dynamic_state->last_level(bld->dynamic_state, - bld->builder, unit); - - /* convert float lod to integer */ - level = lp_build_ifloor(float_bld, lod); - - /* compute level 0 and clamp to legal range of levels */ - *level0_out = lp_build_clamp(int_bld, level, - int_bld->zero, - last_level); - /* compute level 1 and clamp to legal range of levels */ - *level1_out = lp_build_add(int_bld, *level0_out, int_bld->one); - *level1_out = lp_build_min(int_bld, *level1_out, last_level); - - *weight_out = lp_build_fract(float_bld, lod); -} - - -/** * Generate code to sample a mipmap level with nearest filtering. * If sampling a cube texture, r = cube face in [0,5]. */ static void lp_build_sample_image_nearest(struct lp_build_sample_context *bld, + unsigned unit, LLVMValueRef width_vec, LLVMValueRef height_vec, LLVMValueRef depth_vec, @@ -1090,7 +565,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef r, LLVMValueRef colors_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; LLVMValueRef x, y, z; /* @@ -1109,7 +584,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, if (dims == 3) { z = lp_build_sample_wrap_nearest(bld, r, depth_vec, - bld->static_state->pot_height, + bld->static_state->pot_depth, bld->static_state->wrap_r); lp_build_name(z, "tex.z.wrapped"); } @@ -1127,7 +602,8 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, /* * Get texture colors. */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x, y, z, row_stride_vec, img_stride_vec, data_ptr, colors_out); @@ -1140,6 +616,7 @@ lp_build_sample_image_nearest(struct lp_build_sample_context *bld, */ static void lp_build_sample_image_linear(struct lp_build_sample_context *bld, + unsigned unit, LLVMValueRef width_vec, LLVMValueRef height_vec, LLVMValueRef depth_vec, @@ -1151,7 +628,7 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef r, LLVMValueRef colors_out[4]) { - const int dims = texture_dims(bld->static_state->target); + const unsigned dims = bld->dims; LLVMValueRef x0, y0, z0, x1, y1, z1; LLVMValueRef s_fpart, t_fpart, r_fpart; LLVMValueRef neighbors[2][2][4]; @@ -1201,11 +678,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, * Get texture colors. */ /* get x0/x1 texels */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y0, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[0][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y0, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[0][1]); @@ -1223,11 +702,13 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef colors0[4]; /* get x0/x1 texels at y1 */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y1, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[1][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y1, z0, row_stride_vec, img_stride_vec, data_ptr, neighbors[1][1]); @@ -1247,19 +728,23 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef colors1[4]; /* get x0/x1/y0/y1 texels at z1 */ - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y0, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[0][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y0, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[0][1]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x0, y1, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[1][0]); - lp_build_sample_texel_soa(bld, width_vec, height_vec, depth_vec, + lp_build_sample_texel_soa(bld, unit, + width_vec, height_vec, depth_vec, x1, y1, z1, row_stride_vec, img_stride_vec, data_ptr, neighbors1[1][1]); @@ -1291,209 +776,6 @@ lp_build_sample_image_linear(struct lp_build_sample_context *bld, } -/** Helper used by lp_build_cube_lookup() */ -static LLVMValueRef -lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) -{ - /* ima = -0.5 / abs(coord); */ - LLVMValueRef negHalf = lp_build_const_vec(coord_bld->type, -0.5); - LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); - LLVMValueRef ima = lp_build_mul(coord_bld, negHalf, - lp_build_rcp(coord_bld, absCoord)); - return ima; -} - - -/** - * Helper used by lp_build_cube_lookup() - * \param sign scalar +1 or -1 - * \param coord float vector - * \param ima float vector - */ -static LLVMValueRef -lp_build_cube_coord(struct lp_build_context *coord_bld, - LLVMValueRef sign, int negate_coord, - LLVMValueRef coord, LLVMValueRef ima) -{ - /* return negate(coord) * ima * sign + 0.5; */ - LLVMValueRef half = lp_build_const_vec(coord_bld->type, 0.5); - LLVMValueRef res; - - assert(negate_coord == +1 || negate_coord == -1); - - if (negate_coord == -1) { - coord = lp_build_negate(coord_bld, coord); - } - - res = lp_build_mul(coord_bld, coord, ima); - if (sign) { - sign = lp_build_broadcast_scalar(coord_bld, sign); - res = lp_build_mul(coord_bld, res, sign); - } - res = lp_build_add(coord_bld, res, half); - - return res; -} - - -/** Helper used by lp_build_cube_lookup() - * Return (major_coord >= 0) ? pos_face : neg_face; - */ -static LLVMValueRef -lp_build_cube_face(struct lp_build_sample_context *bld, - LLVMValueRef major_coord, - unsigned pos_face, unsigned neg_face) -{ - LLVMValueRef cmp = LLVMBuildFCmp(bld->builder, LLVMRealUGE, - major_coord, - bld->float_bld.zero, ""); - LLVMValueRef pos = LLVMConstInt(LLVMInt32Type(), pos_face, 0); - LLVMValueRef neg = LLVMConstInt(LLVMInt32Type(), neg_face, 0); - LLVMValueRef res = LLVMBuildSelect(bld->builder, cmp, pos, neg, ""); - return res; -} - - - -/** - * Generate code to do cube face selection and per-face texcoords. - */ -static void -lp_build_cube_lookup(struct lp_build_sample_context *bld, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef r, - LLVMValueRef *face, - LLVMValueRef *face_s, - LLVMValueRef *face_t) -{ - struct lp_build_context *float_bld = &bld->float_bld; - struct lp_build_context *coord_bld = &bld->coord_bld; - LLVMValueRef rx, ry, rz; - LLVMValueRef arx, ary, arz; - LLVMValueRef c25 = LLVMConstReal(LLVMFloatType(), 0.25); - LLVMValueRef arx_ge_ary, arx_ge_arz; - LLVMValueRef ary_ge_arx, ary_ge_arz; - LLVMValueRef arx_ge_ary_arz, ary_ge_arx_arz; - LLVMValueRef rx_pos, ry_pos, rz_pos; - - assert(bld->coord_bld.type.length == 4); - - /* - * Use the average of the four pixel's texcoords to choose the face. - */ - rx = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, s)); - ry = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, t)); - rz = lp_build_mul(float_bld, c25, - lp_build_sum_vector(&bld->coord_bld, r)); - - arx = lp_build_abs(float_bld, rx); - ary = lp_build_abs(float_bld, ry); - arz = lp_build_abs(float_bld, rz); - - /* - * Compare sign/magnitude of rx,ry,rz to determine face - */ - arx_ge_ary = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, ary, ""); - arx_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, arx, arz, ""); - ary_ge_arx = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arx, ""); - ary_ge_arz = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ary, arz, ""); - - arx_ge_ary_arz = LLVMBuildAnd(bld->builder, arx_ge_ary, arx_ge_arz, ""); - ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); - - rx_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rx, float_bld->zero, ""); - ry_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, ry, float_bld->zero, ""); - rz_pos = LLVMBuildFCmp(bld->builder, LLVMRealUGE, rz, float_bld->zero, ""); - - { - struct lp_build_flow_context *flow_ctx; - struct lp_build_if_state if_ctx; - - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); - - *face_s = bld->coord_bld.undef; - *face_t = bld->coord_bld.undef; - *face = bld->int_bld.undef; - - lp_build_name(*face_s, "face_s"); - lp_build_name(*face_t, "face_t"); - lp_build_name(*face, "face"); - - lp_build_flow_scope_declare(flow_ctx, face_s); - lp_build_flow_scope_declare(flow_ctx, face_t); - lp_build_flow_scope_declare(flow_ctx, face); - - lp_build_if(&if_ctx, flow_ctx, bld->builder, arx_ge_ary_arz); - { - /* +/- X face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rx); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, s); - *face_s = lp_build_cube_coord(coord_bld, sign, +1, r, ima); - *face_t = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); - *face = lp_build_cube_face(bld, rx, - PIPE_TEX_FACE_POS_X, - PIPE_TEX_FACE_NEG_X); - } - lp_build_else(&if_ctx); - { - struct lp_build_flow_context *flow_ctx2; - struct lp_build_if_state if_ctx2; - - LLVMValueRef face_s2 = bld->coord_bld.undef; - LLVMValueRef face_t2 = bld->coord_bld.undef; - LLVMValueRef face2 = bld->int_bld.undef; - - flow_ctx2 = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx2); - lp_build_flow_scope_declare(flow_ctx2, &face_s2); - lp_build_flow_scope_declare(flow_ctx2, &face_t2); - lp_build_flow_scope_declare(flow_ctx2, &face2); - - ary_ge_arx_arz = LLVMBuildAnd(bld->builder, ary_ge_arx, ary_ge_arz, ""); - - lp_build_if(&if_ctx2, flow_ctx2, bld->builder, ary_ge_arx_arz); - { - /* +/- Y face */ - LLVMValueRef sign = lp_build_sgn(float_bld, ry); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, t); - face_s2 = lp_build_cube_coord(coord_bld, NULL, -1, s, ima); - face_t2 = lp_build_cube_coord(coord_bld, sign, -1, r, ima); - face2 = lp_build_cube_face(bld, ry, - PIPE_TEX_FACE_POS_Y, - PIPE_TEX_FACE_NEG_Y); - } - lp_build_else(&if_ctx2); - { - /* +/- Z face */ - LLVMValueRef sign = lp_build_sgn(float_bld, rz); - LLVMValueRef ima = lp_build_cube_ima(coord_bld, r); - face_s2 = lp_build_cube_coord(coord_bld, sign, -1, s, ima); - face_t2 = lp_build_cube_coord(coord_bld, NULL, +1, t, ima); - face2 = lp_build_cube_face(bld, rz, - PIPE_TEX_FACE_POS_Z, - PIPE_TEX_FACE_NEG_Z); - } - lp_build_endif(&if_ctx2); - lp_build_flow_scope_end(flow_ctx2); - lp_build_flow_destroy(flow_ctx2); - - *face_s = face_s2; - *face_t = face_t2; - *face = face2; - } - - lp_build_endif(&if_ctx); - lp_build_flow_scope_end(flow_ctx); - lp_build_flow_destroy(flow_ctx); - } -} - - - /** * Sample the texture/mipmap using given image filter and mip filter. * data0_ptr and data1_ptr point to the two mipmap levels to sample @@ -1502,72 +784,107 @@ lp_build_cube_lookup(struct lp_build_sample_context *bld, */ static void lp_build_sample_mipmap(struct lp_build_sample_context *bld, + unsigned unit, unsigned img_filter, unsigned mip_filter, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, + LLVMValueRef ilevel0, + LLVMValueRef ilevel1, LLVMValueRef lod_fpart, - LLVMValueRef width0_vec, - LLVMValueRef width1_vec, - LLVMValueRef height0_vec, - LLVMValueRef height1_vec, - LLVMValueRef depth0_vec, - LLVMValueRef depth1_vec, - LLVMValueRef row_stride0_vec, - LLVMValueRef row_stride1_vec, - LLVMValueRef img_stride0_vec, - LLVMValueRef img_stride1_vec, - LLVMValueRef data_ptr0, - LLVMValueRef data_ptr1, LLVMValueRef *colors_out) { + LLVMBuilderRef builder = bld->builder; + LLVMValueRef width0_vec; + LLVMValueRef width1_vec; + LLVMValueRef height0_vec; + LLVMValueRef height1_vec; + LLVMValueRef depth0_vec; + LLVMValueRef depth1_vec; + LLVMValueRef row_stride0_vec; + LLVMValueRef row_stride1_vec; + LLVMValueRef img_stride0_vec; + LLVMValueRef img_stride1_vec; + LLVMValueRef data_ptr0; + LLVMValueRef data_ptr1; LLVMValueRef colors0[4], colors1[4]; - int chan; + unsigned chan; + /* sample the first mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel0, + &width0_vec, &height0_vec, &depth0_vec, + &row_stride0_vec, &img_stride0_vec); + data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0); if (img_filter == PIPE_TEX_FILTER_NEAREST) { - lp_build_sample_image_nearest(bld, + lp_build_sample_image_nearest(bld, unit, width0_vec, height0_vec, depth0_vec, row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, colors0); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level, and interp */ - lp_build_sample_image_nearest(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, colors1); - } + data_ptr0, s, t, r, + colors0); } else { assert(img_filter == PIPE_TEX_FILTER_LINEAR); - - lp_build_sample_image_linear(bld, + lp_build_sample_image_linear(bld, unit, width0_vec, height0_vec, depth0_vec, row_stride0_vec, img_stride0_vec, - data_ptr0, s, t, r, colors0); - - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* sample the second mipmap level, and interp */ - lp_build_sample_image_linear(bld, - width1_vec, height1_vec, depth1_vec, - row_stride1_vec, img_stride1_vec, - data_ptr1, s, t, r, colors1); - } + data_ptr0, s, t, r, + colors0); + } + + /* Store the first level's colors in the output variables */ + for (chan = 0; chan < 4; chan++) { + LLVMBuildStore(builder, colors0[chan], colors_out[chan]); } if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* interpolate samples from the two mipmap levels */ - for (chan = 0; chan < 4; chan++) { - colors_out[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, + struct lp_build_flow_context *flow_ctx; + struct lp_build_if_state if_ctx; + LLVMValueRef need_lerp; + + flow_ctx = lp_build_flow_create(builder); + + /* need_lerp = lod_fpart > 0 */ + need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT, + lod_fpart, + bld->float_bld.zero, + "need_lerp"); + + lp_build_if(&if_ctx, flow_ctx, builder, need_lerp); + { + /* sample the second mipmap level */ + lp_build_mipmap_level_sizes(bld, ilevel1, + &width1_vec, &height1_vec, &depth1_vec, + &row_stride1_vec, &img_stride1_vec); + data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1); + if (img_filter == PIPE_TEX_FILTER_NEAREST) { + lp_build_sample_image_nearest(bld, unit, + width1_vec, height1_vec, depth1_vec, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + colors1); + } + else { + lp_build_sample_image_linear(bld, unit, + width1_vec, height1_vec, depth1_vec, + row_stride1_vec, img_stride1_vec, + data_ptr1, s, t, r, + colors1); + } + + /* interpolate samples from the two mipmap levels */ + + lod_fpart = lp_build_broadcast_scalar(&bld->texel_bld, lod_fpart); + + for (chan = 0; chan < 4; chan++) { + colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart, colors0[chan], colors1[chan]); + LLVMBuildStore(builder, colors0[chan], colors_out[chan]); + } } - } - else { - /* use first/only level's colors */ - for (chan = 0; chan < 4; chan++) { - colors_out[chan] = colors0[chan]; - } + lp_build_endif(&if_ctx); + + lp_build_flow_destroy(flow_ctx); } } @@ -1584,30 +901,24 @@ lp_build_sample_general(struct lp_build_sample_context *bld, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, - LLVMValueRef lodbias, - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef depth, - LLVMValueRef width_vec, - LLVMValueRef height_vec, - LLVMValueRef depth_vec, - LLVMValueRef row_stride_array, - LLVMValueRef img_stride_array, - LLVMValueRef data_array, + const LLVMValueRef *ddx, + const LLVMValueRef *ddy, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *colors_out) { - struct lp_build_context *float_bld = &bld->float_bld; + struct lp_build_context *int_bld = &bld->int_bld; + LLVMBuilderRef builder = bld->builder; const unsigned mip_filter = bld->static_state->min_mip_filter; const unsigned min_filter = bld->static_state->min_img_filter; const unsigned mag_filter = bld->static_state->mag_img_filter; - const int dims = texture_dims(bld->static_state->target); - LLVMValueRef lod = NULL, lod_fpart = NULL; - LLVMValueRef ilevel0, ilevel1 = NULL, ilevel0_vec, ilevel1_vec = NULL; - LLVMValueRef width0_vec = NULL, height0_vec = NULL, depth0_vec = NULL; - LLVMValueRef width1_vec = NULL, height1_vec = NULL, depth1_vec = NULL; - LLVMValueRef row_stride0_vec = NULL, row_stride1_vec = NULL; - LLVMValueRef img_stride0_vec = NULL, img_stride1_vec = NULL; - LLVMValueRef data_ptr0, data_ptr1 = NULL; + LLVMValueRef lod_ipart = NULL, lod_fpart = NULL; + LLVMValueRef ilevel0, ilevel1 = NULL; + LLVMValueRef face_ddx[4], face_ddy[4]; + LLVMValueRef texels[4]; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef i32t_zero = LLVMConstInt(i32t, 0, 0); + unsigned chan; /* printf("%s mip %d min %d mag %d\n", __FUNCTION__, @@ -1615,6 +926,30 @@ lp_build_sample_general(struct lp_build_sample_context *bld, */ /* + * Choose cube face, recompute texcoords and derivatives for the chosen face. + */ + if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + LLVMValueRef face, face_s, face_t; + lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); + s = face_s; /* vec */ + t = face_t; /* vec */ + /* use 'r' to indicate cube face */ + r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ + + /* recompute ddx, ddy using the new (s,t) face texcoords */ + face_ddx[0] = lp_build_ddx(&bld->coord_bld, s); + face_ddx[1] = lp_build_ddx(&bld->coord_bld, t); + face_ddx[2] = NULL; + face_ddx[3] = NULL; + face_ddy[0] = lp_build_ddy(&bld->coord_bld, s); + face_ddy[1] = lp_build_ddy(&bld->coord_bld, t); + face_ddy[2] = NULL; + face_ddy[3] = NULL; + ddx = face_ddx; + ddy = face_ddy; + } + + /* * Compute the level of detail (float). */ if (min_filter != mag_filter || @@ -1622,432 +957,173 @@ lp_build_sample_general(struct lp_build_sample_context *bld, /* Need to compute lod either to choose mipmap levels or to * distinguish between minification/magnification with one mipmap level. */ - lod = lp_build_lod_selector(bld, s, t, r, lodbias, width, height, depth); + lp_build_lod_selector(bld, unit, ddx, ddy, + lod_bias, explicit_lod, + mip_filter, + &lod_ipart, &lod_fpart); + } else { + lod_ipart = i32t_zero; } /* - * Compute integer mipmap level(s) to fetch texels from. + * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1 */ - if (mip_filter == PIPE_TEX_MIPFILTER_NONE) { + switch (mip_filter) { + default: + assert(0 && "bad mip_filter value in lp_build_sample_soa()"); + /* fall-through */ + case PIPE_TEX_MIPFILTER_NONE: /* always use mip level 0 */ - ilevel0 = LLVMConstInt(LLVMInt32Type(), 0, 0); - } - else { - if (mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { - lp_build_nearest_mip_level(bld, unit, lod, &ilevel0); + if (bld->static_state->target == PIPE_TEXTURE_CUBE) { + /* XXX this is a work-around for an apparent bug in LLVM 2.7. + * We should be able to set ilevel0 = const(0) but that causes + * bad x86 code to be emitted. + */ + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); } else { - assert(mip_filter == PIPE_TEX_MIPFILTER_LINEAR); - lp_build_linear_mip_levels(bld, unit, lod, &ilevel0, &ilevel1, - &lod_fpart); - lod_fpart = lp_build_broadcast_scalar(&bld->coord_bld, lod_fpart); - } - } - - /* - * Convert scalar integer mipmap levels into vectors. - */ - ilevel0_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) - ilevel1_vec = lp_build_broadcast_scalar(&bld->int_coord_bld, ilevel1); - - /* - * Compute width, height at mipmap level 'ilevel0' - */ - width0_vec = lp_build_minify(bld, width_vec, ilevel0_vec); - if (dims >= 2) { - height0_vec = lp_build_minify(bld, height_vec, ilevel0_vec); - row_stride0_vec = lp_build_get_level_stride_vec(bld, row_stride_array, - ilevel0); - if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { - img_stride0_vec = lp_build_get_level_stride_vec(bld, - img_stride_array, - ilevel0); - if (dims == 3) { - depth0_vec = lp_build_minify(bld, depth_vec, ilevel0_vec); - } - } - } - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - /* compute width, height, depth for second mipmap level at 'ilevel1' */ - width1_vec = lp_build_minify(bld, width_vec, ilevel1_vec); - if (dims >= 2) { - height1_vec = lp_build_minify(bld, height_vec, ilevel1_vec); - row_stride1_vec = lp_build_get_level_stride_vec(bld, row_stride_array, - ilevel1); - if (dims == 3 || bld->static_state->target == PIPE_TEXTURE_CUBE) { - img_stride1_vec = lp_build_get_level_stride_vec(bld, - img_stride_array, - ilevel1); - if (dims ==3) { - depth1_vec = lp_build_minify(bld, depth_vec, ilevel1_vec); - } - } + ilevel0 = i32t_zero; } + break; + case PIPE_TEX_MIPFILTER_NEAREST: + assert(lod_ipart); + lp_build_nearest_mip_level(bld, unit, lod_ipart, &ilevel0); + break; + case PIPE_TEX_MIPFILTER_LINEAR: + assert(lod_ipart); + assert(lod_fpart); + lp_build_linear_mip_levels(bld, unit, + lod_ipart, &lod_fpart, + &ilevel0, &ilevel1); + break; } /* - * Choose cube face, recompute per-face texcoords. + * Get/interpolate texture colors. */ - if (bld->static_state->target == PIPE_TEXTURE_CUBE) { - LLVMValueRef face, face_s, face_t; - lp_build_cube_lookup(bld, s, t, r, &face, &face_s, &face_t); - s = face_s; /* vec */ - t = face_t; /* vec */ - /* use 'r' to indicate cube face */ - r = lp_build_broadcast_scalar(&bld->int_coord_bld, face); /* vec */ - } - /* - * Get pointer(s) to image data for mipmap level(s). - */ - data_ptr0 = lp_build_get_mipmap_level(bld, data_array, ilevel0); - if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) { - data_ptr1 = lp_build_get_mipmap_level(bld, data_array, ilevel1); + for (chan = 0; chan < 4; ++chan) { + texels[chan] = lp_build_alloca(builder, bld->texel_bld.vec_type, ""); + lp_build_name(texels[chan], "sampler%u_texel_%c_var", unit, "xyzw"[chan]); } - /* - * Get/interpolate texture colors. - */ if (min_filter == mag_filter) { /* no need to distinquish between minification and magnification */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + lp_build_sample_mipmap(bld, unit, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + texels); } else { /* Emit conditional to choose min image filter or mag image filter - * depending on the lod being >0 or <= 0, respectively. + * depending on the lod being > 0 or <= 0, respectively. */ struct lp_build_flow_context *flow_ctx; struct lp_build_if_state if_ctx; LLVMValueRef minify; - flow_ctx = lp_build_flow_create(bld->builder); - lp_build_flow_scope_begin(flow_ctx); - - lp_build_flow_scope_declare(flow_ctx, &colors_out[0]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[1]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[2]); - lp_build_flow_scope_declare(flow_ctx, &colors_out[3]); + flow_ctx = lp_build_flow_create(builder); - /* minify = lod > 0.0 */ - minify = LLVMBuildFCmp(bld->builder, LLVMRealUGE, - lod, float_bld->zero, ""); + /* minify = lod >= 0.0 */ + minify = LLVMBuildICmp(builder, LLVMIntSGE, + lod_ipart, int_bld->zero, ""); - lp_build_if(&if_ctx, flow_ctx, bld->builder, minify); + lp_build_if(&if_ctx, flow_ctx, builder, minify); { /* Use the minification filter */ - lp_build_sample_mipmap(bld, min_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + lp_build_sample_mipmap(bld, unit, + min_filter, mip_filter, + s, t, r, + ilevel0, ilevel1, lod_fpart, + texels); } lp_build_else(&if_ctx); { /* Use the magnification filter */ - lp_build_sample_mipmap(bld, mag_filter, mip_filter, - s, t, r, lod_fpart, - width0_vec, width1_vec, - height0_vec, height1_vec, - depth0_vec, depth1_vec, - row_stride0_vec, row_stride1_vec, - img_stride0_vec, img_stride1_vec, - data_ptr0, data_ptr1, - colors_out); + lp_build_sample_mipmap(bld, unit, + mag_filter, PIPE_TEX_MIPFILTER_NONE, + s, t, r, + i32t_zero, NULL, NULL, + texels); } lp_build_endif(&if_ctx); - lp_build_flow_scope_end(flow_ctx); lp_build_flow_destroy(flow_ctx); } -} - - -static void -lp_build_rgba8_to_f32_soa(LLVMBuilderRef builder, - struct lp_type dst_type, - LLVMValueRef packed, - LLVMValueRef *rgba) -{ - LLVMValueRef mask = lp_build_const_int_vec(dst_type, 0xff); - unsigned chan; - - /* Decode the input vector components */ for (chan = 0; chan < 4; ++chan) { - unsigned start = chan*8; - unsigned stop = start + 8; - LLVMValueRef input; - - input = packed; - - if(start) - input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(dst_type, start), ""); - - if(stop < 32) - input = LLVMBuildAnd(builder, input, mask, ""); - - input = lp_build_unsigned_norm_to_float(builder, 8, dst_type, input); - - rgba[chan] = input; + colors_out[chan] = LLVMBuildLoad(builder, texels[chan], ""); + lp_build_name(colors_out[chan], "sampler%u_texel_%c", unit, "xyzw"[chan]); } } -static void -lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, - LLVMValueRef s, - LLVMValueRef t, - LLVMValueRef width, - LLVMValueRef height, - LLVMValueRef stride_array, - LLVMValueRef data_array, - LLVMValueRef *texel) -{ - LLVMBuilderRef builder = bld->builder; - struct lp_build_context i32, h16, u8n; - LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; - LLVMValueRef i32_c8, i32_c128, i32_c255; - LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; - LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi; - LLVMValueRef x0, x1; - LLVMValueRef y0, y1; - LLVMValueRef neighbors[2][2]; - LLVMValueRef neighbors_lo[2][2]; - LLVMValueRef neighbors_hi[2][2]; - LLVMValueRef packed, packed_lo, packed_hi; - LLVMValueRef unswizzled[4]; - LLVMValueRef stride; - - lp_build_context_init(&i32, builder, lp_type_int_vec(32)); - lp_build_context_init(&h16, builder, lp_type_ufixed(16)); - lp_build_context_init(&u8n, builder, lp_type_unorm(8)); - - i32_vec_type = lp_build_vec_type(i32.type); - h16_vec_type = lp_build_vec_type(h16.type); - u8n_vec_type = lp_build_vec_type(u8n.type); - - if (bld->static_state->normalized_coords) { - LLVMTypeRef coord_vec_type = lp_build_vec_type(bld->coord_type); - LLVMValueRef fp_width = LLVMBuildSIToFP(bld->builder, width, coord_vec_type, ""); - LLVMValueRef fp_height = LLVMBuildSIToFP(bld->builder, height, coord_vec_type, ""); - s = lp_build_mul(&bld->coord_bld, s, fp_width); - t = lp_build_mul(&bld->coord_bld, t, fp_height); - } - - /* scale coords by 256 (8 fractional bits) */ - s = lp_build_mul_imm(&bld->coord_bld, s, 256); - t = lp_build_mul_imm(&bld->coord_bld, t, 256); - - /* convert float to int */ - s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); - t = LLVMBuildFPToSI(builder, t, i32_vec_type, ""); - - /* subtract 0.5 (add -128) */ - i32_c128 = lp_build_const_int_vec(i32.type, -128); - s = LLVMBuildAdd(builder, s, i32_c128, ""); - t = LLVMBuildAdd(builder, t, i32_c128, ""); - - /* compute floor (shift right 8) */ - i32_c8 = lp_build_const_int_vec(i32.type, 8); - s_ipart = LLVMBuildAShr(builder, s, i32_c8, ""); - t_ipart = LLVMBuildAShr(builder, t, i32_c8, ""); - - /* compute fractional part (AND with 0xff) */ - i32_c255 = lp_build_const_int_vec(i32.type, 255); - s_fpart = LLVMBuildAnd(builder, s, i32_c255, ""); - t_fpart = LLVMBuildAnd(builder, t, i32_c255, ""); - - x0 = s_ipart; - y0 = t_ipart; - - x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one); - y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one); - - x0 = lp_build_sample_wrap_int(bld, x0, width, bld->static_state->pot_width, - bld->static_state->wrap_s); - y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height, - bld->static_state->wrap_t); - - x1 = lp_build_sample_wrap_int(bld, x1, width, bld->static_state->pot_width, - bld->static_state->wrap_s); - y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height, - bld->static_state->wrap_t); - - /* - * Transform 4 x i32 in - * - * s_fpart = {s0, s1, s2, s3} - * - * into 8 x i16 - * - * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} - * - * into two 8 x i16 - * - * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} - * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} - * - * and likewise for t_fpart. There is no risk of loosing precision here - * since the fractional parts only use the lower 8bits. - */ - - s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); - t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); - - { - LLVMTypeRef elem_type = LLVMInt32Type(); - LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; - LLVMValueRef shuffle_lo; - LLVMValueRef shuffle_hi; - unsigned i, j; - - for(j = 0; j < h16.type.length; j += 4) { - unsigned subindex = util_cpu_caps.little_endian ? 0 : 1; - LLVMValueRef index; - - index = LLVMConstInt(elem_type, j/2 + subindex, 0); - for(i = 0; i < 4; ++i) - shuffles_lo[j + i] = index; - - index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); - for(i = 0; i < 4; ++i) - shuffles_hi[j + i] = index; - } - - shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); - shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); - - s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, ""); - t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, ""); - s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, ""); - t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, ""); - } - - stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0); - - /* - * Fetch the pixels as 4 x 32bit (rgba order might differ): - * - * rgba0 rgba1 rgba2 rgba3 - * - * bit cast them into 16 x u8 - * - * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 - * - * unpack them into two 8 x i16: - * - * r0 g0 b0 a0 r1 g1 b1 a1 - * r2 g2 b2 a2 r3 g3 b3 a3 - * - * The higher 8 bits of the resulting elements will be zero. - */ - - neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array); - neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array); - neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array); - neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array); - - neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, ""); - neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, ""); - neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, ""); - neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, ""); - - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]); - - /* - * Linear interpolate with 8.8 fixed point. - */ - - packed_lo = lp_build_lerp_2d(&h16, - s_fpart_lo, t_fpart_lo, - neighbors_lo[0][0], - neighbors_lo[0][1], - neighbors_lo[1][0], - neighbors_lo[1][1]); - - packed_hi = lp_build_lerp_2d(&h16, - s_fpart_hi, t_fpart_hi, - neighbors_hi[0][0], - neighbors_hi[0][1], - neighbors_hi[1][0], - neighbors_hi[1][1]); - - packed = lp_build_pack2(builder, h16.type, u8n.type, packed_lo, packed_hi); - - /* - * Convert to SoA and swizzle. - */ - - packed = LLVMBuildBitCast(builder, packed, i32_vec_type, ""); - - lp_build_rgba8_to_f32_soa(bld->builder, - bld->texel_type, - packed, unswizzled); - - lp_build_format_swizzle_soa(bld->format_desc, - bld->texel_type, unswizzled, - texel); - - lp_build_swizzle_soa(bld, texel); -} - - +/** + * Do shadow test/comparison. + * \param p the texcoord Z (aka R, aka P) component + * \param texel the texel to compare against (use the X channel) + */ static void lp_build_sample_compare(struct lp_build_sample_context *bld, LLVMValueRef p, - LLVMValueRef *texel) + LLVMValueRef texel[4]) { struct lp_build_context *texel_bld = &bld->texel_bld; LLVMValueRef res; - unsigned chan; + const unsigned chan = 0; - if(bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE) + if (bld->static_state->compare_mode == PIPE_TEX_COMPARE_NONE) return; - /* TODO: Compare before swizzling, to avoid redundant computations */ - res = NULL; - for(chan = 0; chan < 4; ++chan) { - LLVMValueRef cmp; - cmp = lp_build_cmp(texel_bld, bld->static_state->compare_func, p, texel[chan]); - cmp = lp_build_select(texel_bld, cmp, texel_bld->one, texel_bld->zero); - - if(res) - res = lp_build_add(texel_bld, res, cmp); - else - res = cmp; + /* debug code */ + if (0) { + LLVMValueRef indx = lp_build_const_int32(0); + LLVMValueRef coord = LLVMBuildExtractElement(bld->builder, p, indx, ""); + LLVMValueRef tex = LLVMBuildExtractElement(bld->builder, + texel[chan], indx, ""); + lp_build_printf(bld->builder, "shadow compare coord %f to texture %f\n", + coord, tex); } - assert(res); - res = lp_build_mul(texel_bld, res, lp_build_const_vec(texel_bld->type, 0.25)); + /* result = (p FUNC texel) ? 1 : 0 */ + res = lp_build_cmp(texel_bld, bld->static_state->compare_func, + p, texel[chan]); + res = lp_build_select(texel_bld, res, texel_bld->one, texel_bld->zero); /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */ - for(chan = 0; chan < 3; ++chan) - texel[chan] = res; + texel[0] = + texel[1] = + texel[2] = res; texel[3] = texel_bld->one; } /** + * Just set texels to white instead of actually sampling the texture. + * For debugging. + */ +void +lp_build_sample_nop(struct lp_type type, + LLVMValueRef texel_out[4]) +{ + LLVMValueRef one = lp_build_one(type); + unsigned chan; + + for (chan = 0; chan < 4; chan++) { + texel_out[chan] = one; + } +} + + +/** * Build texture sampling code. * 'texel' will return a vector of four LLVMValueRefs corresponding to * R, G, B, A. * \param type vector float type to use for coords, etc. + * \param ddx partial derivatives of (s,t,r,q) with respect to x + * \param ddy partial derivatives of (s,t,r,q) with respect to y */ void lp_build_sample_soa(LLVMBuilderRef builder, @@ -2057,18 +1133,27 @@ lp_build_sample_soa(LLVMBuilderRef builder, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - LLVMValueRef lodbias, - LLVMValueRef *texel) + const LLVMValueRef ddx[4], + const LLVMValueRef ddy[4], + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef texel_out[4]) { + unsigned dims = texture_dims(static_state->target); struct lp_build_sample_context bld; - LLVMValueRef width, width_vec; - LLVMValueRef height, height_vec; - LLVMValueRef depth, depth_vec; - LLVMValueRef row_stride_array, img_stride_array; - LLVMValueRef data_array; + LLVMTypeRef i32t = LLVMInt32Type(); + LLVMValueRef s; LLVMValueRef t; LLVMValueRef r; + struct lp_type float_vec_type; + + if (0) { + enum pipe_format fmt = static_state->format; + debug_printf("Sample from %s\n", util_format_name(fmt)); + } + + assert(type.floating); /* Setup our build context */ memset(&bld, 0, sizeof bld); @@ -2076,57 +1161,90 @@ lp_build_sample_soa(LLVMBuilderRef builder, bld.static_state = static_state; bld.dynamic_state = dynamic_state; bld.format_desc = util_format_description(static_state->format); + bld.dims = dims; bld.float_type = lp_type_float(32); bld.int_type = lp_type_int(32); bld.coord_type = type; bld.uint_coord_type = lp_uint_type(type); bld.int_coord_type = lp_int_type(type); + bld.float_size_type = lp_type_float(32); + bld.float_size_type.length = dims > 1 ? 4 : 1; + bld.int_size_type = lp_int_type(bld.float_size_type); bld.texel_type = type; + float_vec_type = lp_type_float_vec(32); + lp_build_context_init(&bld.float_bld, builder, bld.float_type); + lp_build_context_init(&bld.float_vec_bld, builder, float_vec_type); lp_build_context_init(&bld.int_bld, builder, bld.int_type); lp_build_context_init(&bld.coord_bld, builder, bld.coord_type); lp_build_context_init(&bld.uint_coord_bld, builder, bld.uint_coord_type); lp_build_context_init(&bld.int_coord_bld, builder, bld.int_coord_type); + lp_build_context_init(&bld.int_size_bld, builder, bld.int_size_type); + lp_build_context_init(&bld.float_size_bld, builder, bld.float_size_type); lp_build_context_init(&bld.texel_bld, builder, bld.texel_type); /* Get the dynamic state */ - width = dynamic_state->width(dynamic_state, builder, unit); - height = dynamic_state->height(dynamic_state, builder, unit); - depth = dynamic_state->depth(dynamic_state, builder, unit); - row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit); - img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit); - data_array = dynamic_state->data_ptr(dynamic_state, builder, unit); + bld.width = dynamic_state->width(dynamic_state, builder, unit); + bld.height = dynamic_state->height(dynamic_state, builder, unit); + bld.depth = dynamic_state->depth(dynamic_state, builder, unit); + bld.row_stride_array = dynamic_state->row_stride(dynamic_state, builder, unit); + bld.img_stride_array = dynamic_state->img_stride(dynamic_state, builder, unit); + bld.data_array = dynamic_state->data_ptr(dynamic_state, builder, unit); /* Note that data_array is an array[level] of pointers to texture images */ s = coords[0]; t = coords[1]; r = coords[2]; - width_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, width); - height_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, height); - depth_vec = lp_build_broadcast_scalar(&bld.uint_coord_bld, depth); - - if (util_format_is_rgba8_variant(bld.format_desc) && - static_state->target == PIPE_TEXTURE_2D && - static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR && - static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR && - static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && - is_simple_wrap_mode(static_state->wrap_s) && - is_simple_wrap_mode(static_state->wrap_t)) { - /* special case */ - lp_build_sample_2d_linear_aos(&bld, s, t, width_vec, height_vec, - row_stride_array, data_array, texel); + /* width, height, depth as single int vector */ + if (dims <= 1) { + bld.int_size = bld.width; } else { - lp_build_sample_general(&bld, unit, s, t, r, lodbias, - width, height, depth, - width_vec, height_vec, depth_vec, - row_stride_array, img_stride_array, - data_array, - texel); + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_bld.undef, + bld.width, LLVMConstInt(i32t, 0, 0), ""); + if (dims >= 2) { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, + bld.height, LLVMConstInt(i32t, 1, 0), ""); + if (dims >= 3) { + bld.int_size = LLVMBuildInsertElement(builder, bld.int_size, + bld.depth, LLVMConstInt(i32t, 2, 0), ""); + } + } + } + + if (0) { + /* For debug: no-op texture sampling */ + lp_build_sample_nop(bld.texel_type, texel_out); + } + else if (util_format_fits_8unorm(bld.format_desc) && + lp_is_simple_wrap_mode(static_state->wrap_s) && + lp_is_simple_wrap_mode(static_state->wrap_t)) { + /* do sampling/filtering with fixed pt arithmetic */ + lp_build_sample_aos(&bld, unit, s, t, r, ddx, ddy, + lod_bias, explicit_lod, + texel_out); + } + + else { + if ((gallivm_debug & GALLIVM_DEBUG_PERF) && + util_format_fits_8unorm(bld.format_desc)) { + debug_printf("%s: using floating point linear filtering for %s\n", + __FUNCTION__, bld.format_desc->short_name); + debug_printf(" min_img %d mag_img %d mip %d wraps %d wrapt %d\n", + static_state->min_img_filter, + static_state->mag_img_filter, + static_state->min_mip_filter, + static_state->wrap_s, + static_state->wrap_t); + } + + lp_build_sample_general(&bld, unit, s, t, r, ddx, ddy, + lod_bias, explicit_lod, + texel_out); } - lp_build_sample_compare(&bld, r, texel); + lp_build_sample_compare(&bld, r, texel_out); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.c b/src/gallium/auxiliary/gallivm/lp_bld_struct.c index 3998ac374fe..4693c2de6f9 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_struct.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.c @@ -49,6 +49,8 @@ lp_build_struct_get_ptr(LLVMBuilderRef builder, { LLVMValueRef indices[2]; LLVMValueRef member_ptr; + assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMStructTypeKind); indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); indices[1] = LLVMConstInt(LLVMInt32Type(), member, 0); member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), ""); @@ -65,8 +67,91 @@ lp_build_struct_get(LLVMBuilderRef builder, { LLVMValueRef member_ptr; LLVMValueRef res; + assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMStructTypeKind); member_ptr = lp_build_struct_get_ptr(builder, ptr, member, name); res = LLVMBuildLoad(builder, member_ptr, ""); lp_build_name(res, "%s.%s", LLVMGetValueName(ptr), name); return res; } + + +LLVMValueRef +lp_build_array_get_ptr(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMValueRef indices[2]; + LLVMValueRef element_ptr; + assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMArrayTypeKind); + indices[0] = LLVMConstInt(LLVMInt32Type(), 0, 0); + indices[1] = index; + element_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), ""); +#ifdef DEBUG + lp_build_name(element_ptr, "&%s[%s]", + LLVMGetValueName(ptr), LLVMGetValueName(index)); +#endif + return element_ptr; +} + + +LLVMValueRef +lp_build_array_get(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMValueRef element_ptr; + LLVMValueRef res; + assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMArrayTypeKind); + element_ptr = lp_build_array_get_ptr(builder, ptr, index); + res = LLVMBuildLoad(builder, element_ptr, ""); +#ifdef DEBUG + lp_build_name(res, "%s[%s]", LLVMGetValueName(ptr), LLVMGetValueName(index)); +#endif + return res; +} + + +void +lp_build_array_set(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index, + LLVMValueRef value) +{ + LLVMValueRef element_ptr; + assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind); + assert(LLVMGetTypeKind(LLVMGetElementType(LLVMTypeOf(ptr))) == LLVMArrayTypeKind); + element_ptr = lp_build_array_get_ptr(builder, ptr, index); + LLVMBuildStore(builder, value, element_ptr); +} + + +LLVMValueRef +lp_build_pointer_get(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMValueRef element_ptr; + LLVMValueRef res; + assert(LLVMGetTypeKind(LLVMTypeOf(ptr)) == LLVMPointerTypeKind); + element_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); + res = LLVMBuildLoad(builder, element_ptr, ""); +#ifdef DEBUG + lp_build_name(res, "%s[%s]", LLVMGetValueName(ptr), LLVMGetValueName(index)); +#endif + return res; +} + + +void +lp_build_pointer_set(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index, + LLVMValueRef value) +{ + LLVMValueRef element_ptr; + element_ptr = LLVMBuildGEP(builder, ptr, &index, 1, ""); + LLVMBuildStore(builder, value, element_ptr); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_struct.h b/src/gallium/auxiliary/gallivm/lp_bld_struct.h index 147336edb4b..eb87a8eee9e 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_struct.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_struct.h @@ -71,5 +71,46 @@ lp_build_struct_get(LLVMBuilderRef builder, unsigned member, const char *name); +/** + * Get value pointer to an array element. + */ +LLVMValueRef +lp_build_array_get_ptr(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index); + +/** + * Get the value of an array element. + */ +LLVMValueRef +lp_build_array_get(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index); + +/** + * Set the value of an array element. + */ +void +lp_build_array_set(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index, + LLVMValueRef value); + +/** + * Get the value of an array element. + */ +LLVMValueRef +lp_build_pointer_get(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index); + +/** + * Set the value of an array element. + */ +void +lp_build_pointer_set(LLVMBuilderRef builder, + LLVMValueRef ptr, + LLVMValueRef index, + LLVMValueRef value); #endif /* !LP_BLD_STRUCT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c index 278c838eaca..4685a90e418 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.c @@ -60,28 +60,130 @@ lp_build_broadcast(LLVMBuilderRef builder, } +/** + * Broadcast + */ LLVMValueRef lp_build_broadcast_scalar(struct lp_build_context *bld, LLVMValueRef scalar) { const struct lp_type type = bld->type; + + assert(lp_check_elem_type(type, LLVMTypeOf(scalar))); + + if (type.length == 1) { + return scalar; + } + else { + LLVMValueRef res; + +#if HAVE_LLVM >= 0x207 + /* The shuffle vector is always made of int32 elements */ + struct lp_type i32_vec_type = lp_type_int_vec(32); + i32_vec_type.length = type.length; + + res = LLVMBuildInsertElement(bld->builder, bld->undef, scalar, + LLVMConstInt(LLVMInt32Type(), 0, 0), ""); + res = LLVMBuildShuffleVector(bld->builder, res, bld->undef, + lp_build_const_int_vec(i32_vec_type, 0), ""); +#else + /* XXX: The above path provokes a bug in LLVM 2.6 */ + unsigned i; + res = bld->undef; + for(i = 0; i < type.length; ++i) { + LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); + res = LLVMBuildInsertElement(bld->builder, res, scalar, index, ""); + } +#endif + return res; + } +} + + +/** + * Combined extract and broadcast (or a mere shuffle when the two types match) + */ +LLVMValueRef +lp_build_extract_broadcast(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef vector, + LLVMValueRef index) +{ + LLVMTypeRef i32t = LLVMInt32Type(); LLVMValueRef res; - unsigned i; - res = bld->undef; - for(i = 0; i < type.length; ++i) { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0); - res = LLVMBuildInsertElement(bld->builder, res, scalar, index, ""); + assert(src_type.floating == dst_type.floating); + assert(src_type.width == dst_type.width); + + assert(lp_check_value(src_type, vector)); + assert(LLVMTypeOf(index) == i32t); + + if (src_type.length == 1) { + if (dst_type.length == 1) { + /* + * Trivial scalar -> scalar. + */ + + res = vector; + } + else { + /* + * Broadcast scalar -> vector. + */ + + res = lp_build_broadcast(builder, + lp_build_vec_type(dst_type), + vector); + } + } + else { + if (dst_type.length == src_type.length) { + /* + * Special shuffle of the same size. + */ + + LLVMValueRef shuffle; + shuffle = lp_build_broadcast(builder, + LLVMVectorType(i32t, dst_type.length), + index); + res = LLVMBuildShuffleVector(builder, vector, + LLVMGetUndef(lp_build_vec_type(dst_type)), + shuffle, ""); + } + else { + LLVMValueRef scalar; + scalar = LLVMBuildExtractElement(builder, vector, index, ""); + if (dst_type.length == 1) { + /* + * Trivial extract scalar from vector. + */ + + res = scalar; + } + else { + /* + * General case of different sized vectors. + */ + + res = lp_build_broadcast(builder, + lp_build_vec_type(dst_type), + vector); + } + } } return res; } +/** + * Swizzle one channel into all other three channels. + */ LLVMValueRef -lp_build_broadcast_aos(struct lp_build_context *bld, - LLVMValueRef a, - unsigned channel) +lp_build_swizzle_scalar_aos(struct lp_build_context *bld, + LLVMValueRef a, + unsigned channel) { const struct lp_type type = bld->type; const unsigned n = type.length; @@ -93,7 +195,7 @@ lp_build_broadcast_aos(struct lp_build_context *bld, /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing * using shuffles here actually causes worst results. More investigation is * needed. */ - if (n <= 4) { + if (type.width >= 16) { /* * Shuffle. */ @@ -115,21 +217,25 @@ lp_build_broadcast_aos(struct lp_build_context *bld, * YY00 YY00 .... YY00 * YYYY YYYY .... YYYY <= output */ - struct lp_type type4 = type; + struct lp_type type4; const char shifts[4][2] = { { 1, 2}, {-1, 2}, { 1, -2}, {-1, -2} }; - boolean cond[4]; unsigned i; - memset(cond, 0, sizeof cond); - cond[channel] = 1; + a = LLVMBuildAnd(bld->builder, a, + lp_build_const_mask_aos(type, 1 << channel), ""); - a = LLVMBuildAnd(bld->builder, a, lp_build_const_mask_aos(type, cond), ""); + /* + * Build a type where each element is an integer that cover the four + * channels. + */ + type4 = type; + type4.floating = FALSE; type4.width *= 4; type4.length /= 4; @@ -159,81 +265,248 @@ lp_build_broadcast_aos(struct lp_build_context *bld, LLVMValueRef -lp_build_swizzle1_aos(struct lp_build_context *bld, - LLVMValueRef a, - const unsigned char swizzle[4]) +lp_build_swizzle_aos(struct lp_build_context *bld, + LLVMValueRef a, + const unsigned char swizzles[4]) { - const unsigned n = bld->type.length; + const struct lp_type type = bld->type; + const unsigned n = type.length; unsigned i, j; - if(a == bld->undef || a == bld->zero || a == bld->one) + if (swizzles[0] == PIPE_SWIZZLE_RED && + swizzles[1] == PIPE_SWIZZLE_GREEN && + swizzles[2] == PIPE_SWIZZLE_BLUE && + swizzles[3] == PIPE_SWIZZLE_ALPHA) { return a; + } - if(swizzle[0] == swizzle[1] && swizzle[1] == swizzle[2] && swizzle[2] == swizzle[3]) - return lp_build_broadcast_aos(bld, a, swizzle[0]); + if (swizzles[0] == swizzles[1] && + swizzles[1] == swizzles[2] && + swizzles[2] == swizzles[3]) { + switch (swizzles[0]) { + case PIPE_SWIZZLE_RED: + case PIPE_SWIZZLE_GREEN: + case PIPE_SWIZZLE_BLUE: + case PIPE_SWIZZLE_ALPHA: + return lp_build_swizzle_scalar_aos(bld, a, swizzles[0]); + case PIPE_SWIZZLE_ZERO: + return bld->zero; + case PIPE_SWIZZLE_ONE: + return bld->one; + default: + assert(0); + return bld->undef; + } + } - { + if (type.width >= 16) { /* * Shuffle. */ - LLVMTypeRef elem_type = LLVMInt32Type(); + LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(type)); + LLVMTypeRef i32t = LLVMInt32Type(); LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + LLVMValueRef aux[LP_MAX_VECTOR_LENGTH]; + + memset(aux, 0, sizeof aux); + + for(j = 0; j < n; j += 4) { + for(i = 0; i < 4; ++i) { + unsigned shuffle; + switch (swizzles[i]) { + default: + assert(0); + /* fall through */ + case PIPE_SWIZZLE_RED: + case PIPE_SWIZZLE_GREEN: + case PIPE_SWIZZLE_BLUE: + case PIPE_SWIZZLE_ALPHA: + shuffle = j + swizzles[i]; + break; + case PIPE_SWIZZLE_ZERO: + shuffle = type.length + 0; + if (!aux[0]) { + aux[0] = lp_build_const_elem(type, 0.0); + } + break; + case PIPE_SWIZZLE_ONE: + shuffle = type.length + 1; + if (!aux[1]) { + aux[1] = lp_build_const_elem(type, 1.0); + } + break; + } + shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0); + } + } - for(j = 0; j < n; j += 4) - for(i = 0; i < 4; ++i) - shuffles[j + i] = LLVMConstInt(elem_type, j + swizzle[i], 0); + for (i = 0; i < n; ++i) { + if (!aux[i]) { + aux[i] = undef; + } + } - return LLVMBuildShuffleVector(bld->builder, a, bld->undef, LLVMConstVector(shuffles, n), ""); + return LLVMBuildShuffleVector(bld->builder, a, + LLVMConstVector(aux, n), + LLVMConstVector(shuffles, n), ""); + } else { + /* + * Bit mask and shifts. + * + * For example, this will convert BGRA to RGBA by doing + * + * rgba = (bgra & 0x00ff0000) >> 16 + * | (bgra & 0xff00ff00) + * | (bgra & 0x000000ff) << 16 + * + * This is necessary not only for faster cause, but because X86 backend + * will refuse shuffles of <4 x i8> vectors + */ + LLVMValueRef res; + struct lp_type type4; + unsigned cond = 0; + unsigned chan; + int shift; + + /* + * Start with a mixture of 1 and 0. + */ + for (chan = 0; chan < 4; ++chan) { + if (swizzles[chan] == PIPE_SWIZZLE_ONE) { + cond |= 1 << chan; + } + } + res = lp_build_select_aos(bld, cond, bld->one, bld->zero); + + /* + * Build a type where each element is an integer that cover the four + * channels. + */ + type4 = type; + type4.floating = FALSE; + type4.width *= 4; + type4.length /= 4; + + a = LLVMBuildBitCast(bld->builder, a, lp_build_vec_type(type4), ""); + res = LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type4), ""); + + /* + * Mask and shift the channels, trying to group as many channels in the + * same shift as possible + */ + for (shift = -3; shift <= 3; ++shift) { + unsigned long long mask = 0; + + assert(type4.width <= sizeof(mask)*8); + + for (chan = 0; chan < 4; ++chan) { + /* FIXME: big endian */ + if (swizzles[chan] < 4 && + chan - swizzles[chan] == shift) { + mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width); + } + } + + if (mask) { + LLVMValueRef masked; + LLVMValueRef shifted; + + if (0) + debug_printf("shift = %i, mask = 0x%08llx\n", shift, mask); + + masked = LLVMBuildAnd(bld->builder, a, + lp_build_const_int_vec(type4, mask), ""); + if (shift > 0) { + shifted = LLVMBuildShl(bld->builder, masked, + lp_build_const_int_vec(type4, shift*type.width), ""); + } else if (shift < 0) { + shifted = LLVMBuildLShr(bld->builder, masked, + lp_build_const_int_vec(type4, -shift*type.width), ""); + } else { + shifted = masked; + } + + res = LLVMBuildOr(bld->builder, res, shifted, ""); + } + } + + return LLVMBuildBitCast(bld->builder, res, lp_build_vec_type(type), ""); } } +/** + * Extended swizzle of a single channel of a SoA vector. + * + * @param bld building context + * @param unswizzled array with the 4 unswizzled values + * @param swizzle one of the PIPE_SWIZZLE_* + * + * @return the swizzled value. + */ LLVMValueRef -lp_build_swizzle2_aos(struct lp_build_context *bld, - LLVMValueRef a, - LLVMValueRef b, - const unsigned char swizzle[4]) +lp_build_swizzle_soa_channel(struct lp_build_context *bld, + const LLVMValueRef *unswizzled, + unsigned swizzle) { - const unsigned n = bld->type.length; - unsigned i, j; + switch (swizzle) { + case PIPE_SWIZZLE_RED: + case PIPE_SWIZZLE_GREEN: + case PIPE_SWIZZLE_BLUE: + case PIPE_SWIZZLE_ALPHA: + return unswizzled[swizzle]; + case PIPE_SWIZZLE_ZERO: + return bld->zero; + case PIPE_SWIZZLE_ONE: + return bld->one; + default: + assert(0); + return bld->undef; + } +} - if(swizzle[0] < 4 && swizzle[1] < 4 && swizzle[2] < 4 && swizzle[3] < 4) - return lp_build_swizzle1_aos(bld, a, swizzle); - if(a == b) { - unsigned char swizzle1[4]; - swizzle1[0] = swizzle[0] % 4; - swizzle1[1] = swizzle[1] % 4; - swizzle1[2] = swizzle[2] % 4; - swizzle1[3] = swizzle[3] % 4; - return lp_build_swizzle1_aos(bld, a, swizzle1); - } +/** + * Extended swizzle of a SoA vector. + * + * @param bld building context + * @param unswizzled array with the 4 unswizzled values + * @param swizzles array of PIPE_SWIZZLE_* + * @param swizzled output swizzled values + */ +void +lp_build_swizzle_soa(struct lp_build_context *bld, + const LLVMValueRef *unswizzled, + const unsigned char swizzles[4], + LLVMValueRef *swizzled) +{ + unsigned chan; - if(swizzle[0] % 4 == 0 && - swizzle[1] % 4 == 1 && - swizzle[2] % 4 == 2 && - swizzle[3] % 4 == 3) { - boolean cond[4]; - cond[0] = swizzle[0] / 4; - cond[1] = swizzle[1] / 4; - cond[2] = swizzle[2] / 4; - cond[3] = swizzle[3] / 4; - return lp_build_select_aos(bld, a, b, cond); + for (chan = 0; chan < 4; ++chan) { + swizzled[chan] = lp_build_swizzle_soa_channel(bld, unswizzled, + swizzles[chan]); } +} - { - /* - * Shuffle. - */ - LLVMTypeRef elem_type = LLVMInt32Type(); - LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; - for(j = 0; j < n; j += 4) - for(i = 0; i < 4; ++i) - shuffles[j + i] = LLVMConstInt(elem_type, j + (swizzle[i] % 4) + (swizzle[i] / 4 * n), 0); +/** + * Do an extended swizzle of a SoA vector inplace. + * + * @param bld building context + * @param values intput/output array with the 4 values + * @param swizzles array of PIPE_SWIZZLE_* + */ +void +lp_build_swizzle_soa_inplace(struct lp_build_context *bld, + LLVMValueRef *values, + const unsigned char swizzles[4]) +{ + LLVMValueRef unswizzled[4]; + unsigned chan; - return LLVMBuildShuffleVector(bld->builder, a, b, LLVMConstVector(shuffles, n), ""); + for (chan = 0; chan < 4; ++chan) { + unswizzled[chan] = values[chan]; } -} - + lp_build_swizzle_soa(bld, unswizzled, swizzles, values); +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h index 138ca620e63..fdea8442aef 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_swizzle.h @@ -55,12 +55,20 @@ lp_build_broadcast_scalar(struct lp_build_context *bld, LLVMValueRef scalar); +LLVMValueRef +lp_build_extract_broadcast(LLVMBuilderRef builder, + struct lp_type src_type, + struct lp_type dst_type, + LLVMValueRef vector, + LLVMValueRef index); + + /** * Broadcast one channel of a vector composed of arrays of XYZW structures into * all four channel. */ LLVMValueRef -lp_build_broadcast_aos(struct lp_build_context *bld, +lp_build_swizzle_scalar_aos(struct lp_build_context *bld, LLVMValueRef a, unsigned channel); @@ -68,24 +76,31 @@ lp_build_broadcast_aos(struct lp_build_context *bld, /** * Swizzle a vector consisting of an array of XYZW structs. * - * @param swizzle is the in [0,4[ range. + * @param swizzles is the in [0,4[ range. */ LLVMValueRef -lp_build_swizzle1_aos(struct lp_build_context *bld, - LLVMValueRef a, - const unsigned char swizzle[4]); +lp_build_swizzle_aos(struct lp_build_context *bld, + LLVMValueRef a, + const unsigned char swizzles[4]); -/** - * Swizzle two vector consisting of an array of XYZW structs. - * - * @param swizzle is the in [0,8[ range. Values in [4,8[ range refer to b. - */ LLVMValueRef -lp_build_swizzle2_aos(struct lp_build_context *bld, - LLVMValueRef a, - LLVMValueRef b, - const unsigned char swizzle[4]); +lp_build_swizzle_soa_channel(struct lp_build_context *bld, + const LLVMValueRef *unswizzled, + unsigned swizzle); + + +void +lp_build_swizzle_soa(struct lp_build_context *bld, + const LLVMValueRef *unswizzled, + const unsigned char swizzles[4], + LLVMValueRef *swizzled); + + +void +lp_build_swizzle_soa_inplace(struct lp_build_context *bld, + LLVMValueRef *values, + const unsigned char swizzles[4]); #endif /* !LP_BLD_SWIZZLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 2eac5da6c69..97318b3456c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -45,6 +45,15 @@ struct lp_build_context; struct lp_build_mask_context; +enum lp_build_tex_modifier { + LP_BLD_TEX_MODIFIER_NONE = 0, + LP_BLD_TEX_MODIFIER_PROJECTED, + LP_BLD_TEX_MODIFIER_LOD_BIAS, + LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, + LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV +}; + + /** * Sampler code generation interface. * @@ -59,17 +68,34 @@ struct lp_build_sampler_soa (*destroy)( struct lp_build_sampler_soa *sampler ); void - (*emit_fetch_texel)( struct lp_build_sampler_soa *sampler, + (*emit_fetch_texel)( const struct lp_build_sampler_soa *sampler, LLVMBuilderRef builder, struct lp_type type, unsigned unit, unsigned num_coords, const LLVMValueRef *coords, - LLVMValueRef lodbias, + const LLVMValueRef *ddx, + const LLVMValueRef *ddy, + LLVMValueRef lod_bias, /* optional */ + LLVMValueRef explicit_lod, /* optional */ LLVMValueRef *texel); }; +struct lp_build_sampler_aos +{ + LLVMValueRef + (*emit_fetch_texel)( struct lp_build_sampler_aos *sampler, + struct lp_build_context *bld, + unsigned target, /* TGSI_TEXTURE_* */ + unsigned unit, + LLVMValueRef coords, + LLVMValueRef ddx, + LLVMValueRef ddy, + enum lp_build_tex_modifier modifier); +}; + + void lp_build_tgsi_soa(LLVMBuilderRef builder, const struct tgsi_token *tokens, @@ -80,7 +106,19 @@ lp_build_tgsi_soa(LLVMBuilderRef builder, const LLVMValueRef (*inputs)[4], LLVMValueRef (*outputs)[4], struct lp_build_sampler_soa *sampler, - struct tgsi_shader_info *info); + const struct tgsi_shader_info *info); + + +void +lp_build_tgsi_aos(LLVMBuilderRef builder, + const struct tgsi_token *tokens, + struct lp_type type, + const unsigned char swizzles[4], + LLVMValueRef consts_ptr, + const LLVMValueRef *inputs, + LLVMValueRef *outputs, + struct lp_build_sampler_aos *sampler, + const struct tgsi_shader_info *info); #endif /* LP_BLD_TGSI_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c new file mode 100644 index 00000000000..d5f963be58d --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c @@ -0,0 +1,1176 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * TGSI to LLVM IR translation -- AoS. + * + * FIXME: + * - No control flow support: the existing control flow code should be factored + * out into from the SoA code into a common module and shared. + * - No derivatives. Derivate logic should be pluggable, just like the samplers. + * + * @author Jose Fonseca <[email protected]> + */ + +#include "pipe/p_config.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_debug.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_info.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_scan.h" +#include "lp_bld_type.h" +#include "lp_bld_const.h" +#include "lp_bld_arit.h" +#include "lp_bld_logic.h" +#include "lp_bld_swizzle.h" +#include "lp_bld_flow.h" +#include "lp_bld_quad.h" +#include "lp_bld_tgsi.h" +#include "lp_bld_limits.h" +#include "lp_bld_debug.h" + + +#define LP_MAX_INSTRUCTIONS 256 + + +struct lp_build_tgsi_aos_context +{ + struct lp_build_context base; + + /* Builder for integer masks and indices */ + struct lp_build_context int_bld; + + /* + * AoS swizzle used: + * - swizzles[0] = red index + * - swizzles[1] = green index + * - swizzles[2] = blue index + * - swizzles[3] = alpha index + */ + unsigned char swizzles[4]; + unsigned char inv_swizzles[4]; + + LLVMValueRef consts_ptr; + const LLVMValueRef *inputs; + LLVMValueRef *outputs; + + struct lp_build_sampler_aos *sampler; + + LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES]; + LLVMValueRef temps[LP_MAX_TGSI_TEMPS]; + LLVMValueRef addr[LP_MAX_TGSI_ADDRS]; + LLVMValueRef preds[LP_MAX_TGSI_PREDS]; + + /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is + * set in the indirect_files field. + * The temps[] array above is unused then. + */ + LLVMValueRef temps_array; + + /** bitmask indicating which register files are accessed indirectly */ + unsigned indirect_files; + + struct tgsi_full_instruction *instructions; + uint max_instructions; +}; + + +/** + * Wrapper around lp_build_swizzle_aos which translates swizzles to another + * ordering. + */ +static LLVMValueRef +swizzle_aos(struct lp_build_tgsi_aos_context *bld, + LLVMValueRef a, + unsigned swizzle_x, + unsigned swizzle_y, + unsigned swizzle_z, + unsigned swizzle_w) +{ + unsigned char swizzles[4]; + + assert(swizzle_x < 4); + assert(swizzle_y < 4); + assert(swizzle_z < 4); + assert(swizzle_w < 4); + + swizzles[bld->inv_swizzles[0]] = bld->swizzles[swizzle_x]; + swizzles[bld->inv_swizzles[1]] = bld->swizzles[swizzle_y]; + swizzles[bld->inv_swizzles[2]] = bld->swizzles[swizzle_z]; + swizzles[bld->inv_swizzles[3]] = bld->swizzles[swizzle_w]; + + return lp_build_swizzle_aos(&bld->base, a, swizzles); +} + + +static LLVMValueRef +swizzle_scalar_aos(struct lp_build_tgsi_aos_context *bld, + LLVMValueRef a, + unsigned chan) +{ + chan = bld->swizzles[chan]; + return lp_build_swizzle_scalar_aos(&bld->base, a, chan); +} + + +/** + * Register fetch. + */ +static LLVMValueRef +emit_fetch( + struct lp_build_tgsi_aos_context *bld, + const struct tgsi_full_instruction *inst, + unsigned src_op) +{ + struct lp_type type = bld->base.type; + const struct tgsi_full_src_register *reg = &inst->Src[src_op]; + LLVMValueRef res; + unsigned chan; + + assert(!reg->Register.Indirect); + + /* + * Fetch the from the register file. + */ + + switch (reg->Register.File) { + case TGSI_FILE_CONSTANT: + /* + * Get the constants components + */ + + res = bld->base.undef; + for (chan = 0; chan < 4; ++chan) { + LLVMValueRef index; + LLVMValueRef scalar_ptr; + LLVMValueRef scalar; + LLVMValueRef swizzle; + + index = LLVMConstInt(LLVMInt32Type(), + reg->Register.Index*4 + chan, + 0); + + scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, + &index, 1, ""); + + scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, ""); + + lp_build_name(scalar, "const[%u].%c", reg->Register.Index, "xyzw"[chan]); + + /* + * NOTE: constants array is always assumed to be RGBA + */ + + swizzle = LLVMConstInt(LLVMInt32Type(), chan, 0); + + res = LLVMBuildInsertElement(bld->base.builder, res, scalar, swizzle, ""); + } + + /* + * Broadcast the first quaternion to all others. + * + * XXX: could be factored into a reusable function. + */ + + if (type.length > 4) { + LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; + unsigned i; + + for (chan = 0; chan < 4; ++chan) { + shuffles[chan] = LLVMConstInt(LLVMInt32Type(), chan, 0); + } + + for (i = 4; i < type.length; ++i) { + shuffles[i] = shuffles[i % 4]; + } + + res = LLVMBuildShuffleVector(bld->base.builder, + res, bld->base.undef, + LLVMConstVector(shuffles, type.length), + ""); + } + break; + + case TGSI_FILE_IMMEDIATE: + res = bld->immediates[reg->Register.Index]; + assert(res); + break; + + case TGSI_FILE_INPUT: + res = bld->inputs[reg->Register.Index]; + assert(res); + break; + + case TGSI_FILE_TEMPORARY: + { + LLVMValueRef temp_ptr; + temp_ptr = bld->temps[reg->Register.Index]; + res = LLVMBuildLoad(bld->base.builder, temp_ptr, ""); + if (!res) + return bld->base.undef; + } + break; + + default: + assert(0 && "invalid src register in emit_fetch()"); + return bld->base.undef; + } + + /* + * Apply sign modifier. + */ + + if (reg->Register.Absolute) { + res = lp_build_abs(&bld->base, res); + } + + if(reg->Register.Negate) { + res = lp_build_negate(&bld->base, res); + } + + /* + * Swizzle the argument + */ + + res = swizzle_aos(bld, res, + reg->Register.SwizzleX, + reg->Register.SwizzleY, + reg->Register.SwizzleZ, + reg->Register.SwizzleW); + + return res; +} + + +/** + * Register store. + */ +static void +emit_store( + struct lp_build_tgsi_aos_context *bld, + const struct tgsi_full_instruction *inst, + unsigned index, + LLVMValueRef value) +{ + const struct tgsi_full_dst_register *reg = &inst->Dst[index]; + LLVMValueRef mask = NULL; + LLVMValueRef ptr; + + /* + * Saturate the value + */ + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + + case TGSI_SAT_ZERO_ONE: + value = lp_build_max(&bld->base, value, bld->base.zero); + value = lp_build_min(&bld->base, value, bld->base.one); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + value = lp_build_max(&bld->base, value, lp_build_const_vec(bld->base.type, -1.0)); + value = lp_build_min(&bld->base, value, bld->base.one); + break; + + default: + assert(0); + } + + /* + * Translate the register file + */ + + assert(!reg->Register.Indirect); + + switch (reg->Register.File) { + case TGSI_FILE_OUTPUT: + ptr = bld->outputs[reg->Register.Index]; + break; + + case TGSI_FILE_TEMPORARY: + ptr = bld->temps[reg->Register.Index]; + break; + + case TGSI_FILE_ADDRESS: + ptr = bld->addr[reg->Indirect.Index]; + break; + + case TGSI_FILE_PREDICATE: + ptr = bld->preds[reg->Register.Index]; + break; + + default: + assert(0); + return; + } + + /* + * Predicate + */ + + if (inst->Instruction.Predicate) { + LLVMValueRef pred; + + assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS); + + pred = LLVMBuildLoad(bld->base.builder, + bld->preds[inst->Predicate.Index], ""); + + /* + * Convert the value to an integer mask. + */ + pred = lp_build_compare(bld->base.builder, + bld->base.type, + PIPE_FUNC_NOTEQUAL, + pred, + bld->base.zero); + + if (inst->Predicate.Negate) { + pred = LLVMBuildNot(bld->base.builder, pred, ""); + } + + pred = swizzle_aos(bld, pred, + inst->Predicate.SwizzleX, + inst->Predicate.SwizzleY, + inst->Predicate.SwizzleZ, + inst->Predicate.SwizzleW); + + if (mask) { + mask = LLVMBuildAnd(bld->base.builder, mask, pred, ""); + } else { + mask = pred; + } + } + + /* + * Writemask + */ + + if (reg->Register.WriteMask != TGSI_WRITEMASK_XYZW) { + LLVMValueRef writemask; + + writemask = lp_build_const_mask_aos(bld->base.type, reg->Register.WriteMask); + + if (mask) { + mask = LLVMBuildAnd(bld->base.builder, mask, writemask, ""); + } else { + mask = writemask; + } + } + + if (mask) { + LLVMValueRef orig_value; + + orig_value = LLVMBuildLoad(bld->base.builder, ptr, ""); + value = lp_build_select(&bld->base, + mask, value, orig_value); + } + + LLVMBuildStore(bld->base.builder, value, ptr); +} + + +/** + * High-level instruction translators. + */ + +static LLVMValueRef +emit_tex(struct lp_build_tgsi_aos_context *bld, + const struct tgsi_full_instruction *inst, + enum lp_build_tex_modifier modifier) +{ + unsigned target; + unsigned unit; + LLVMValueRef coords; + LLVMValueRef ddx; + LLVMValueRef ddy; + + if (!bld->sampler) { + _debug_printf("warning: found texture instruction but no sampler generator supplied\n"); + return bld->base.undef; + } + + target = inst->Texture.Texture; + + coords = emit_fetch( bld, inst, 0 ); + + if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + ddx = emit_fetch( bld, inst, 1 ); + ddy = emit_fetch( bld, inst, 2 ); + unit = inst->Src[3].Register.Index; + } else { +#if 0 + ddx = lp_build_ddx( &bld->base, coords ); + ddy = lp_build_ddy( &bld->base, coords ); +#else + /* TODO */ + ddx = bld->base.one; + ddy = bld->base.one; +#endif + unit = inst->Src[1].Register.Index; + } + + return bld->sampler->emit_fetch_texel(bld->sampler, + &bld->base, + target, unit, + coords, ddx, ddy, + modifier); +} + + +static void +emit_declaration( + struct lp_build_tgsi_aos_context *bld, + const struct tgsi_full_declaration *decl) +{ + LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type); + + unsigned first = decl->Range.First; + unsigned last = decl->Range.Last; + unsigned idx; + + for (idx = first; idx <= last; ++idx) { + switch (decl->Declaration.File) { + case TGSI_FILE_TEMPORARY: + assert(idx < LP_MAX_TGSI_TEMPS); + if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) { + LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(), + last + 1, 0); + bld->temps_array = lp_build_array_alloca(bld->base.builder, + vec_type, array_size, ""); + } else { + bld->temps[idx] = lp_build_alloca(bld->base.builder, + vec_type, ""); + } + break; + + case TGSI_FILE_OUTPUT: + bld->outputs[idx] = lp_build_alloca(bld->base.builder, + vec_type, ""); + break; + + case TGSI_FILE_ADDRESS: + assert(idx < LP_MAX_TGSI_ADDRS); + bld->addr[idx] = lp_build_alloca(bld->base.builder, + vec_type, ""); + break; + + case TGSI_FILE_PREDICATE: + assert(idx < LP_MAX_TGSI_PREDS); + bld->preds[idx] = lp_build_alloca(bld->base.builder, + vec_type, ""); + break; + + default: + /* don't need to declare other vars */ + break; + } + } +} + + +/** + * Emit LLVM for one TGSI instruction. + * \param return TRUE for success, FALSE otherwise + */ +static boolean +emit_instruction( + struct lp_build_tgsi_aos_context *bld, + const struct tgsi_full_instruction *inst, + const struct tgsi_opcode_info *info, + int *pc) +{ + LLVMValueRef src0, src1, src2; + LLVMValueRef tmp0, tmp1; + LLVMValueRef dst0; + + /* + * Stores and write masks are handled in a general fashion after the long + * instruction opcode switch statement. + * + * Although not stricitly necessary, we avoid generating instructions for + * channels which won't be stored, in cases where's that easy. For some + * complex instructions, like texture sampling, it is more convenient to + * assume a full writemask and then let LLVM optimization passes eliminate + * redundant code. + */ + + (*pc)++; + + assert(info->num_dst <= 1); + if (info->num_dst) { + dst0 = bld->base.undef; + } + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_floor(&bld->base, src0); + break; + + case TGSI_OPCODE_MOV: + dst0 = emit_fetch(bld, inst, 0); + break; + + case TGSI_OPCODE_LIT: + return FALSE; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_rcp(&bld->base, src0); + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + src0 = emit_fetch(bld, inst, 0); + tmp0 = lp_build_abs(&bld->base, src0); + dst0 = lp_build_rsqrt(&bld->base, tmp0); + break; + + case TGSI_OPCODE_EXP: + return FALSE; + + case TGSI_OPCODE_LOG: + return FALSE; + + case TGSI_OPCODE_MUL: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + dst0 = lp_build_mul(&bld->base, src0, src1); + break; + + case TGSI_OPCODE_ADD: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + dst0 = lp_build_add(&bld->base, src0, src1); + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + return FALSE; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + return FALSE; + + case TGSI_OPCODE_DST: + return FALSE; + + case TGSI_OPCODE_MIN: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + dst0 = lp_build_max(&bld->base, src0, src1); + break; + + case TGSI_OPCODE_MAX: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + dst0 = lp_build_max(&bld->base, src0, src1); + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_LESS, src0, src1); + dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero); + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_GEQUAL, src0, src1); + dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero); + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + src2 = emit_fetch(bld, inst, 2); + tmp0 = lp_build_mul(&bld->base, src0, src1); + dst0 = lp_build_add(&bld->base, tmp0, src2); + break; + + case TGSI_OPCODE_SUB: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + dst0 = lp_build_sub(&bld->base, src0, src1); + break; + + case TGSI_OPCODE_LRP: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + src2 = emit_fetch(bld, inst, 2); + tmp0 = lp_build_sub(&bld->base, src1, src2); + tmp0 = lp_build_mul(&bld->base, src0, tmp0); + dst0 = lp_build_add(&bld->base, tmp0, src2); + break; + + case TGSI_OPCODE_CND: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + src2 = emit_fetch(bld, inst, 2); + tmp1 = lp_build_const_vec(bld->base.type, 0.5); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, src2, tmp1); + dst0 = lp_build_select(&bld->base, tmp0, src0, src1); + break; + + case TGSI_OPCODE_DP2A: + return FALSE; + + case TGSI_OPCODE_FRC: + src0 = emit_fetch(bld, inst, 0); + tmp0 = lp_build_floor(&bld->base, src0); + dst0 = lp_build_sub(&bld->base, src0, tmp0); + break; + + case TGSI_OPCODE_CLAMP: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + src2 = emit_fetch(bld, inst, 2); + tmp0 = lp_build_max(&bld->base, src0, src1); + dst0 = lp_build_min(&bld->base, tmp0, src2); + break; + + case TGSI_OPCODE_FLR: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_floor(&bld->base, src0); + break; + + case TGSI_OPCODE_ROUND: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_round(&bld->base, src0); + break; + + case TGSI_OPCODE_EX2: + src0 = emit_fetch(bld, inst, 0); + tmp0 = lp_build_swizzle_scalar_aos(&bld->base, src0, TGSI_SWIZZLE_X); + dst0 = lp_build_exp2(&bld->base, tmp0); + break; + + case TGSI_OPCODE_LG2: + src0 = emit_fetch(bld, inst, 0); + tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); + dst0 = lp_build_log2(&bld->base, tmp0); + break; + + case TGSI_OPCODE_POW: + src0 = emit_fetch(bld, inst, 0); + src0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); + src1 = emit_fetch(bld, inst, 1); + src1 = swizzle_scalar_aos(bld, src1, TGSI_SWIZZLE_X); + dst0 = lp_build_pow(&bld->base, src0, src1); + break; + + case TGSI_OPCODE_XPD: + return FALSE; + + case TGSI_OPCODE_ABS: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_abs(&bld->base, src0); + break; + + case TGSI_OPCODE_RCC: + /* deprecated? */ + assert(0); + return FALSE; + + case TGSI_OPCODE_DPH: + return FALSE; + + case TGSI_OPCODE_COS: + src0 = emit_fetch(bld, inst, 0); + tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); + dst0 = lp_build_cos(&bld->base, tmp0); + break; + + case TGSI_OPCODE_DDX: + return FALSE; + + case TGSI_OPCODE_DDY: + return FALSE; + + case TGSI_OPCODE_KILP: + /* predicated kill */ + return FALSE; + + case TGSI_OPCODE_KIL: + /* conditional kill */ + return FALSE; + + case TGSI_OPCODE_PK2H: + return FALSE; + break; + + case TGSI_OPCODE_PK2US: + return FALSE; + break; + + case TGSI_OPCODE_PK4B: + return FALSE; + break; + + case TGSI_OPCODE_PK4UB: + return FALSE; + + case TGSI_OPCODE_RFL: + return FALSE; + + case TGSI_OPCODE_SEQ: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_EQUAL, src0, src1); + dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero); + break; + + case TGSI_OPCODE_SFL: + dst0 = bld->base.zero; + break; + + case TGSI_OPCODE_SGT: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_GREATER, src0, src1); + dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero); + break; + + case TGSI_OPCODE_SIN: + src0 = emit_fetch(bld, inst, 0); + tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); + dst0 = lp_build_sin(&bld->base, tmp0); + break; + + case TGSI_OPCODE_SLE: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_LEQUAL, src0, src1); + dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero); + break; + + case TGSI_OPCODE_SNE: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_NOTEQUAL, src0, src1); + dst0 = lp_build_select(&bld->base, tmp0, bld->base.one, bld->base.zero); + break; + + case TGSI_OPCODE_STR: + dst0 = bld->base.one; + break; + + case TGSI_OPCODE_TEX: + dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_NONE); + break; + + case TGSI_OPCODE_TXD: + dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV); + break; + + case TGSI_OPCODE_UP2H: + /* deprecated */ + assert (0); + return FALSE; + break; + + case TGSI_OPCODE_UP2US: + /* deprecated */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_UP4B: + /* deprecated */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_UP4UB: + /* deprecated */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_X2D: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_ARA: + /* deprecated */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_ARR: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_round(&bld->base, src0); + break; + + case TGSI_OPCODE_BRA: + /* deprecated */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_CAL: + return FALSE; + + case TGSI_OPCODE_RET: + return FALSE; + + case TGSI_OPCODE_END: + *pc = -1; + break; + + case TGSI_OPCODE_SSG: + /* TGSI_OPCODE_SGN */ + tmp0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_sgn(&bld->base, tmp0); + break; + + case TGSI_OPCODE_CMP: + src0 = emit_fetch(bld, inst, 0); + src1 = emit_fetch(bld, inst, 1); + src2 = emit_fetch(bld, inst, 2); + tmp0 = lp_build_cmp(&bld->base, PIPE_FUNC_LESS, src0, bld->base.zero); + dst0 = lp_build_select(&bld->base, tmp0, src1, src2); + break; + + case TGSI_OPCODE_SCS: + return FALSE; + + case TGSI_OPCODE_TXB: + dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS); + break; + + case TGSI_OPCODE_NRM: + /* fall-through */ + case TGSI_OPCODE_NRM4: + return FALSE; + + case TGSI_OPCODE_DIV: + /* deprecated */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_DP2: + return FALSE; + + case TGSI_OPCODE_TXL: + dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD); + break; + + case TGSI_OPCODE_TXP: + dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED); + break; + + case TGSI_OPCODE_BRK: + return FALSE; + + case TGSI_OPCODE_IF: + return FALSE; + + case TGSI_OPCODE_BGNLOOP: + return FALSE; + + case TGSI_OPCODE_BGNSUB: + return FALSE; + + case TGSI_OPCODE_ELSE: + return FALSE; + + case TGSI_OPCODE_ENDIF: + return FALSE; + + case TGSI_OPCODE_ENDLOOP: + return FALSE; + + case TGSI_OPCODE_ENDSUB: + return FALSE; + + case TGSI_OPCODE_PUSHA: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_POPA: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_CEIL: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_ceil(&bld->base, src0); + break; + + case TGSI_OPCODE_I2F: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_NOT: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_TRUNC: + src0 = emit_fetch(bld, inst, 0); + dst0 = lp_build_trunc(&bld->base, src0); + break; + + case TGSI_OPCODE_SHL: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_ISHR: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_AND: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_OR: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_MOD: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_XOR: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_SAD: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_TXF: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_TXQ: + /* deprecated? */ + assert(0); + return FALSE; + break; + + case TGSI_OPCODE_CONT: + return FALSE; + + case TGSI_OPCODE_EMIT: + return FALSE; + break; + + case TGSI_OPCODE_ENDPRIM: + return FALSE; + break; + + case TGSI_OPCODE_NOP: + break; + + default: + return FALSE; + } + + if (info->num_dst) { + emit_store(bld, inst, 0, dst0); + } + + return TRUE; +} + + +void +lp_build_tgsi_aos(LLVMBuilderRef builder, + const struct tgsi_token *tokens, + struct lp_type type, + const unsigned char swizzles[4], + LLVMValueRef consts_ptr, + const LLVMValueRef *inputs, + LLVMValueRef *outputs, + struct lp_build_sampler_aos *sampler, + const struct tgsi_shader_info *info) +{ + struct lp_build_tgsi_aos_context bld; + struct tgsi_parse_context parse; + uint num_immediates = 0; + uint num_instructions = 0; + unsigned chan; + int pc = 0; + + /* Setup build context */ + memset(&bld, 0, sizeof bld); + lp_build_context_init(&bld.base, builder, type); + lp_build_context_init(&bld.int_bld, builder, lp_int_type(type)); + + for (chan = 0; chan < 4; ++chan) { + bld.swizzles[chan] = swizzles[chan]; + bld.inv_swizzles[swizzles[chan]] = chan; + } + + bld.inputs = inputs; + bld.outputs = outputs; + bld.consts_ptr = consts_ptr; + bld.sampler = sampler; + bld.indirect_files = info->indirect_files; + bld.instructions = (struct tgsi_full_instruction *) + MALLOC(LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction)); + bld.max_instructions = LP_MAX_INSTRUCTIONS; + + if (!bld.instructions) { + return; + } + + tgsi_parse_init(&parse, tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch(parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* Inputs already interpolated */ + emit_declaration(&bld, &parse.FullToken.FullDeclaration); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + /* save expanded instruction */ + if (num_instructions == bld.max_instructions) { + struct tgsi_full_instruction *instructions; + instructions = REALLOC(bld.instructions, + bld.max_instructions + * sizeof(struct tgsi_full_instruction), + (bld.max_instructions + LP_MAX_INSTRUCTIONS) + * sizeof(struct tgsi_full_instruction)); + if (!instructions) { + break; + } + bld.instructions = instructions; + bld.max_instructions += LP_MAX_INSTRUCTIONS; + } + + memcpy(bld.instructions + num_instructions, + &parse.FullToken.FullInstruction, + sizeof(bld.instructions[0])); + + num_instructions++; + } + + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* simply copy the immediate values into the next immediates[] slot */ + { + const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1; + float imm[4]; + assert(size <= 4); + assert(num_immediates < LP_MAX_TGSI_IMMEDIATES); + for (chan = 0; chan < 4; ++chan) { + imm[chan] = 0.0f; + } + for (chan = 0; chan < size; ++chan) { + unsigned swizzle = bld.swizzles[chan]; + imm[swizzle] = parse.FullToken.FullImmediate.u[chan].Float; + } + bld.immediates[num_immediates] = + lp_build_const_aos(type, + imm[0], imm[1], imm[2], imm[3], + NULL); + num_immediates++; + } + break; + + case TGSI_TOKEN_TYPE_PROPERTY: + break; + + default: + assert(0); + } + } + + while (pc != -1) { + struct tgsi_full_instruction *instr = bld.instructions + pc; + const struct tgsi_opcode_info *opcode_info = + tgsi_get_opcode_info(instr->Instruction.Opcode); + if (!emit_instruction(&bld, instr, opcode_info, &pc)) + _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n", + opcode_info->mnemonic); + } + + if (0) { + LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); + LLVMValueRef function = LLVMGetBasicBlockParent(block); + debug_printf("11111111111111111111111111111 \n"); + tgsi_dump(tokens, 0); + lp_debug_dump_value(function); + debug_printf("2222222222222222222222222222 \n"); + } + tgsi_parse_free(&parse); + + if (0) { + LLVMModuleRef module = LLVMGetGlobalParent( + LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder))); + LLVMDumpModule(module); + } + + FREE(bld.instructions); +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index d3c769e28b8..441aebae298 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -45,22 +45,21 @@ #include "tgsi/tgsi_info.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" -#include "tgsi/tgsi_exec.h" #include "tgsi/tgsi_scan.h" #include "lp_bld_type.h" #include "lp_bld_const.h" #include "lp_bld_arit.h" +#include "lp_bld_bitarit.h" +#include "lp_bld_gather.h" #include "lp_bld_logic.h" #include "lp_bld_swizzle.h" #include "lp_bld_flow.h" +#include "lp_bld_quad.h" #include "lp_bld_tgsi.h" +#include "lp_bld_limits.h" #include "lp_bld_debug.h" -#define LP_MAX_TEMPS 256 -#define LP_MAX_IMMEDIATES 256 - - #define FOR_EACH_CHANNEL( CHAN )\ for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) @@ -78,13 +77,10 @@ #define CHAN_Y 1 #define CHAN_Z 2 #define CHAN_W 3 +#define NUM_CHANNELS 4 -#define QUAD_TOP_LEFT 0 -#define QUAD_TOP_RIGHT 1 -#define QUAD_BOTTOM_LEFT 2 -#define QUAD_BOTTOM_RIGHT 3 +#define LP_MAX_INSTRUCTIONS 256 -#define LP_TGSI_MAX_NESTING 16 struct lp_exec_mask { struct lp_build_context *bld; @@ -93,22 +89,28 @@ struct lp_exec_mask { LLVMTypeRef int_vec_type; - LLVMValueRef cond_stack[LP_TGSI_MAX_NESTING]; + LLVMValueRef cond_stack[LP_MAX_TGSI_NESTING]; int cond_stack_size; LLVMValueRef cond_mask; - LLVMValueRef break_stack[LP_TGSI_MAX_NESTING]; - int break_stack_size; - LLVMValueRef break_mask; - - LLVMValueRef cont_stack[LP_TGSI_MAX_NESTING]; - int cont_stack_size; + LLVMBasicBlockRef loop_block; LLVMValueRef cont_mask; - - LLVMBasicBlockRef loop_stack[LP_TGSI_MAX_NESTING]; + LLVMValueRef break_mask; + LLVMValueRef break_var; + struct { + LLVMBasicBlockRef loop_block; + LLVMValueRef cont_mask; + LLVMValueRef break_mask; + LLVMValueRef break_var; + } loop_stack[LP_MAX_TGSI_NESTING]; int loop_stack_size; - LLVMBasicBlockRef loop_block; + LLVMValueRef ret_mask; + struct { + int pc; + LLVMValueRef ret_mask; + } call_stack[LP_MAX_TGSI_NESTING]; + int call_stack_size; LLVMValueRef exec_mask; }; @@ -117,48 +119,36 @@ struct lp_build_tgsi_soa_context { struct lp_build_context base; + /* Builder for integer masks and indices */ + struct lp_build_context uint_bld; + LLVMValueRef consts_ptr; const LLVMValueRef *pos; const LLVMValueRef (*inputs)[NUM_CHANNELS]; LLVMValueRef (*outputs)[NUM_CHANNELS]; - struct lp_build_sampler_soa *sampler; + const struct lp_build_sampler_soa *sampler; - LLVMValueRef immediates[LP_MAX_IMMEDIATES][NUM_CHANNELS]; - LLVMValueRef temps[LP_MAX_TEMPS][NUM_CHANNELS]; - LLVMValueRef addr[LP_MAX_TEMPS][NUM_CHANNELS]; + LLVMValueRef immediates[LP_MAX_TGSI_IMMEDIATES][NUM_CHANNELS]; + LLVMValueRef temps[LP_MAX_TGSI_TEMPS][NUM_CHANNELS]; + LLVMValueRef addr[LP_MAX_TGSI_ADDRS][NUM_CHANNELS]; + LLVMValueRef preds[LP_MAX_TGSI_PREDS][NUM_CHANNELS]; - /* we allocate an array of temps if we have indirect - * addressing and then the temps above is unused */ + /* We allocate/use this array of temps if (1 << TGSI_FILE_TEMPORARY) is + * set in the indirect_files field. + * The temps[] array above is unused then. + */ LLVMValueRef temps_array; - boolean has_indirect_addressing; + + const struct tgsi_shader_info *info; + /** bitmask indicating which register files are accessed indirectly */ + unsigned indirect_files; struct lp_build_mask_context *mask; struct lp_exec_mask exec_mask; -}; - -static const unsigned char -swizzle_left[4] = { - QUAD_TOP_LEFT, QUAD_TOP_LEFT, - QUAD_BOTTOM_LEFT, QUAD_BOTTOM_LEFT -}; - -static const unsigned char -swizzle_right[4] = { - QUAD_TOP_RIGHT, QUAD_TOP_RIGHT, - QUAD_BOTTOM_RIGHT, QUAD_BOTTOM_RIGHT -}; -static const unsigned char -swizzle_top[4] = { - QUAD_TOP_LEFT, QUAD_TOP_RIGHT, - QUAD_TOP_LEFT, QUAD_TOP_RIGHT -}; - -static const unsigned char -swizzle_bottom[4] = { - QUAD_BOTTOM_LEFT, QUAD_BOTTOM_RIGHT, - QUAD_BOTTOM_LEFT, QUAD_BOTTOM_RIGHT + struct tgsi_full_instruction *instructions; + uint max_instructions; }; static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld) @@ -167,10 +157,11 @@ static void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context mask->has_mask = FALSE; mask->cond_stack_size = 0; mask->loop_stack_size = 0; - mask->break_stack_size = 0; - mask->cont_stack_size = 0; + mask->call_stack_size = 0; mask->int_vec_type = lp_build_int_vec_type(mask->bld->type); + mask->exec_mask = mask->ret_mask = mask->break_mask = mask->cont_mask = mask->cond_mask = + LLVMConstAllOnes(mask->int_vec_type); } static void lp_exec_mask_update(struct lp_exec_mask *mask) @@ -190,33 +181,47 @@ static void lp_exec_mask_update(struct lp_exec_mask *mask) } else mask->exec_mask = mask->cond_mask; + if (mask->call_stack_size) { + mask->exec_mask = LLVMBuildAnd(mask->bld->builder, + mask->exec_mask, + mask->ret_mask, + "callmask"); + } mask->has_mask = (mask->cond_stack_size > 0 || - mask->loop_stack_size > 0); + mask->loop_stack_size > 0 || + mask->call_stack_size > 0); } static void lp_exec_mask_cond_push(struct lp_exec_mask *mask, LLVMValueRef val) { + assert(mask->cond_stack_size < LP_MAX_TGSI_NESTING); + if (mask->cond_stack_size == 0) { + assert(mask->cond_mask == LLVMConstAllOnes(mask->int_vec_type)); + } mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask; - mask->cond_mask = LLVMBuildBitCast(mask->bld->builder, val, - mask->int_vec_type, ""); - + assert(LLVMTypeOf(val) == mask->int_vec_type); + mask->cond_mask = LLVMBuildAnd(mask->bld->builder, + mask->cond_mask, + val, + ""); lp_exec_mask_update(mask); } static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask) { - LLVMValueRef prev_mask = mask->cond_stack[mask->cond_stack_size - 1]; - LLVMValueRef inv_mask = LLVMBuildNot(mask->bld->builder, - mask->cond_mask, ""); - - /* means that we didn't have any mask before and that - * we were fully enabled */ - if (mask->cond_stack_size <= 1) { - prev_mask = LLVMConstAllOnes(mask->int_vec_type); + LLVMValueRef prev_mask; + LLVMValueRef inv_mask; + + assert(mask->cond_stack_size); + prev_mask = mask->cond_stack[mask->cond_stack_size - 1]; + if (mask->cond_stack_size == 1) { + assert(prev_mask == LLVMConstAllOnes(mask->int_vec_type)); } + inv_mask = LLVMBuildNot(mask->bld->builder, mask->cond_mask, ""); + mask->cond_mask = LLVMBuildAnd(mask->bld->builder, inv_mask, prev_mask, ""); @@ -225,27 +230,37 @@ static void lp_exec_mask_cond_invert(struct lp_exec_mask *mask) static void lp_exec_mask_cond_pop(struct lp_exec_mask *mask) { + assert(mask->cond_stack_size); mask->cond_mask = mask->cond_stack[--mask->cond_stack_size]; lp_exec_mask_update(mask); } static void lp_exec_bgnloop(struct lp_exec_mask *mask) { + if (mask->loop_stack_size == 0) { + assert(mask->loop_block == NULL); + assert(mask->cont_mask == LLVMConstAllOnes(mask->int_vec_type)); + assert(mask->break_mask == LLVMConstAllOnes(mask->int_vec_type)); + assert(mask->break_var == NULL); + } - if (mask->cont_stack_size == 0) - mask->cont_mask = LLVMConstAllOnes(mask->int_vec_type); - if (mask->break_stack_size == 0) - mask->break_mask = LLVMConstAllOnes(mask->int_vec_type); - if (mask->cond_stack_size == 0) - mask->cond_mask = LLVMConstAllOnes(mask->int_vec_type); + assert(mask->loop_stack_size < LP_MAX_TGSI_NESTING); + + mask->loop_stack[mask->loop_stack_size].loop_block = mask->loop_block; + mask->loop_stack[mask->loop_stack_size].cont_mask = mask->cont_mask; + mask->loop_stack[mask->loop_stack_size].break_mask = mask->break_mask; + mask->loop_stack[mask->loop_stack_size].break_var = mask->break_var; + ++mask->loop_stack_size; + + mask->break_var = lp_build_alloca(mask->bld->builder, mask->int_vec_type, ""); + LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var); - mask->break_stack[mask->break_stack_size++] = mask->break_mask; - mask->cont_stack[mask->cont_stack_size++] = mask->cont_mask; - mask->loop_stack[mask->loop_stack_size++] = mask->loop_block; mask->loop_block = lp_build_insert_new_block(mask->bld->builder, "bgnloop"); LLVMBuildBr(mask->bld->builder, mask->loop_block); LLVMPositionBuilderAtEnd(mask->bld->builder, mask->loop_block); + mask->break_mask = LLVMBuildLoad(mask->bld->builder, mask->break_var, ""); + lp_exec_mask_update(mask); } @@ -255,16 +270,9 @@ static void lp_exec_break(struct lp_exec_mask *mask) mask->exec_mask, "break"); - /* mask->break_stack_size > 1 implies that we encountered a break - * statemant already and if that's the case we want to make sure - * our mask is a combination of the previous break and the current - * execution mask */ - if (mask->break_stack_size > 1) { - mask->break_mask = LLVMBuildAnd(mask->bld->builder, - mask->break_mask, - exec_mask, "break_full"); - } else - mask->break_mask = exec_mask; + mask->break_mask = LLVMBuildAnd(mask->bld->builder, + mask->break_mask, + exec_mask, "break_full"); lp_exec_mask_update(mask); } @@ -275,12 +283,9 @@ static void lp_exec_continue(struct lp_exec_mask *mask) mask->exec_mask, ""); - if (mask->cont_stack_size > 1) { - mask->cont_mask = LLVMBuildAnd(mask->bld->builder, - mask->cont_mask, - exec_mask, ""); - } else - mask->cont_mask = exec_mask; + mask->cont_mask = LLVMBuildAnd(mask->bld->builder, + mask->cont_mask, + exec_mask, ""); lp_exec_mask_update(mask); } @@ -295,11 +300,24 @@ static void lp_exec_endloop(struct lp_exec_mask *mask) assert(mask->break_mask); + /* + * Restore the cont_mask, but don't pop + */ + assert(mask->loop_stack_size); + mask->cont_mask = mask->loop_stack[mask->loop_stack_size - 1].cont_mask; + lp_exec_mask_update(mask); + + /* + * Unlike the continue mask, the break_mask must be preserved across loop + * iterations + */ + LLVMBuildStore(mask->bld->builder, mask->break_mask, mask->break_var); + /* i1cond = (mask == 0) */ i1cond = LLVMBuildICmp( mask->bld->builder, LLVMIntNE, - LLVMBuildBitCast(mask->bld->builder, mask->break_mask, reg_type, ""), + LLVMBuildBitCast(mask->bld->builder, mask->exec_mask, reg_type, ""), LLVMConstNull(reg_type), ""); endloop = lp_build_insert_new_block(mask->bld->builder, "endloop"); @@ -309,15 +327,12 @@ static void lp_exec_endloop(struct lp_exec_mask *mask) LLVMPositionBuilderAtEnd(mask->bld->builder, endloop); - mask->loop_block = mask->loop_stack[--mask->loop_stack_size]; - /* pop the cont mask */ - if (mask->cont_stack_size) { - mask->cont_mask = mask->cont_stack[--mask->cont_stack_size]; - } - /* pop the break mask */ - if (mask->break_stack_size) { - mask->break_mask = mask->break_stack[--mask->break_stack_size]; - } + assert(mask->loop_stack_size); + --mask->loop_stack_size; + mask->loop_block = mask->loop_stack[mask->loop_stack_size].loop_block; + mask->cont_mask = mask->loop_stack[mask->loop_stack_size].cont_mask; + mask->break_mask = mask->loop_stack[mask->loop_stack_size].break_mask; + mask->break_var = mask->loop_stack[mask->loop_stack_size].break_var; lp_exec_mask_update(mask); } @@ -328,15 +343,25 @@ static void lp_exec_endloop(struct lp_exec_mask *mask) * (0 means don't store this bit, 1 means do store). */ static void lp_exec_mask_store(struct lp_exec_mask *mask, + LLVMValueRef pred, LLVMValueRef val, LLVMValueRef dst) { + /* Mix the predicate and execution mask */ if (mask->has_mask) { + if (pred) { + pred = LLVMBuildAnd(mask->bld->builder, pred, mask->exec_mask, ""); + } else { + pred = mask->exec_mask; + } + } + + if (pred) { LLVMValueRef real_val, dst_val; dst_val = LLVMBuildLoad(mask->bld->builder, dst, ""); real_val = lp_build_select(mask->bld, - mask->exec_mask, + pred, val, dst_val); LLVMBuildStore(mask->bld->builder, real_val, dst); @@ -344,44 +369,149 @@ static void lp_exec_mask_store(struct lp_exec_mask *mask, LLVMBuildStore(mask->bld->builder, val, dst); } +static void lp_exec_mask_call(struct lp_exec_mask *mask, + int func, + int *pc) +{ + assert(mask->call_stack_size < LP_MAX_TGSI_NESTING); + mask->call_stack[mask->call_stack_size].pc = *pc; + mask->call_stack[mask->call_stack_size].ret_mask = mask->ret_mask; + mask->call_stack_size++; + *pc = func; +} -static LLVMValueRef -emit_ddx(struct lp_build_tgsi_soa_context *bld, - LLVMValueRef src) +static void lp_exec_mask_ret(struct lp_exec_mask *mask, int *pc) { - LLVMValueRef src_left = lp_build_swizzle1_aos(&bld->base, src, swizzle_left); - LLVMValueRef src_right = lp_build_swizzle1_aos(&bld->base, src, swizzle_right); - return lp_build_sub(&bld->base, src_right, src_left); + LLVMValueRef exec_mask; + + if (mask->call_stack_size == 0) { + /* returning from main() */ + *pc = -1; + return; + } + exec_mask = LLVMBuildNot(mask->bld->builder, + mask->exec_mask, + "ret"); + + mask->ret_mask = LLVMBuildAnd(mask->bld->builder, + mask->ret_mask, + exec_mask, "ret_full"); + + lp_exec_mask_update(mask); } +static void lp_exec_mask_bgnsub(struct lp_exec_mask *mask) +{ +} -static LLVMValueRef -emit_ddy(struct lp_build_tgsi_soa_context *bld, - LLVMValueRef src) +static void lp_exec_mask_endsub(struct lp_exec_mask *mask, int *pc) { - LLVMValueRef src_top = lp_build_swizzle1_aos(&bld->base, src, swizzle_top); - LLVMValueRef src_bottom = lp_build_swizzle1_aos(&bld->base, src, swizzle_bottom); - return lp_build_sub(&bld->base, src_top, src_bottom); + assert(mask->call_stack_size); + mask->call_stack_size--; + *pc = mask->call_stack[mask->call_stack_size].pc; + mask->ret_mask = mask->call_stack[mask->call_stack_size].ret_mask; + lp_exec_mask_update(mask); } + +/** + * Return pointer to a temporary register channel (src or dest). + * Note that indirect addressing cannot be handled here. + * \param index which temporary register + * \param chan which channel of the temp register. + */ static LLVMValueRef get_temp_ptr(struct lp_build_tgsi_soa_context *bld, unsigned index, - unsigned swizzle, - boolean is_indirect, - LLVMValueRef addr) + unsigned chan) { - if (!bld->has_indirect_addressing) { - return bld->temps[index][swizzle]; - } else { - LLVMValueRef lindex = - LLVMConstInt(LLVMInt32Type(), index*4 + swizzle, 0); - if (is_indirect) - lindex = lp_build_add(&bld->base, lindex, addr); + assert(chan < 4); + if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) { + LLVMValueRef lindex = lp_build_const_int32(index * 4 + chan); return LLVMBuildGEP(bld->base.builder, bld->temps_array, &lindex, 1, ""); } + else { + return bld->temps[index][chan]; + } +} + + +/** + * Gather vector. + * XXX the lp_build_gather() function should be capable of doing this + * with a little work. + */ +static LLVMValueRef +build_gather(struct lp_build_tgsi_soa_context *bld, + LLVMValueRef base_ptr, + LLVMValueRef indexes) +{ + LLVMValueRef res = bld->base.undef; + unsigned i; + + /* + * Loop over elements of index_vec, load scalar value, insert it into 'res'. + */ + for (i = 0; i < bld->base.type.length; i++) { + LLVMValueRef ii = LLVMConstInt(LLVMInt32Type(), i, 0); + LLVMValueRef index = LLVMBuildExtractElement(bld->base.builder, + indexes, ii, ""); + LLVMValueRef scalar_ptr = LLVMBuildGEP(bld->base.builder, base_ptr, + &index, 1, ""); + LLVMValueRef scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, ""); + + res = LLVMBuildInsertElement(bld->base.builder, res, scalar, ii, ""); + } + + return res; } + +/** + * Read the current value of the ADDR register, convert the floats to + * ints, multiply by four and return the vector of offsets. + * The offsets will be used to index into the constant buffer or + * temporary register file. + */ +static LLVMValueRef +get_indirect_index(struct lp_build_tgsi_soa_context *bld, + unsigned reg_file, unsigned reg_index, + const struct tgsi_src_register *indirect_reg) +{ + struct lp_build_context *uint_bld = &bld->uint_bld; + /* always use X component of address register */ + unsigned swizzle = indirect_reg->SwizzleX; + LLVMValueRef base; + LLVMValueRef rel; + LLVMValueRef max_index; + LLVMValueRef index; + + assert(bld->indirect_files & (1 << reg_file)); + + base = lp_build_const_int_vec(uint_bld->type, reg_index); + + assert(swizzle < 4); + rel = LLVMBuildLoad(bld->base.builder, + bld->addr[indirect_reg->Index][swizzle], + "load addr reg"); + + /* for indexing we want integers */ + rel = LLVMBuildFPToSI(bld->base.builder, + rel, + uint_bld->vec_type, ""); + + index = lp_build_add(uint_bld, base, rel); + + max_index = lp_build_const_int_vec(uint_bld->type, + bld->info->file_max[reg_file]); + + assert(!uint_bld->type.sign); + index = lp_build_min(uint_bld, index, max_index); + + return index; +} + + /** * Register fetch. */ @@ -389,81 +519,102 @@ static LLVMValueRef emit_fetch( struct lp_build_tgsi_soa_context *bld, const struct tgsi_full_instruction *inst, - unsigned index, + unsigned src_op, const unsigned chan_index ) { - const struct tgsi_full_src_register *reg = &inst->Src[index]; - unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index ); + struct lp_build_context *uint_bld = &bld->uint_bld; + const struct tgsi_full_src_register *reg = &inst->Src[src_op]; + const unsigned swizzle = + tgsi_util_get_full_src_register_swizzle(reg, chan_index); LLVMValueRef res; - LLVMValueRef addr; + LLVMValueRef indirect_index = NULL; + + if (swizzle > 3) { + assert(0 && "invalid swizzle in emit_fetch()"); + return bld->base.undef; + } - switch (swizzle) { - case TGSI_SWIZZLE_X: - case TGSI_SWIZZLE_Y: - case TGSI_SWIZZLE_Z: - case TGSI_SWIZZLE_W: + if (reg->Register.Indirect) { + indirect_index = get_indirect_index(bld, + reg->Register.File, + reg->Register.Index, + ®->Indirect); + } else { + assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]); + } + switch (reg->Register.File) { + case TGSI_FILE_CONSTANT: if (reg->Register.Indirect) { - LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type); - unsigned swizzle = tgsi_util_get_src_register_swizzle( ®->Indirect, chan_index ); - addr = LLVMBuildLoad(bld->base.builder, - bld->addr[reg->Indirect.Index][swizzle], - ""); - /* for indexing we want integers */ - addr = LLVMBuildFPToSI(bld->base.builder, addr, - int_vec_type, ""); - addr = LLVMBuildExtractElement(bld->base.builder, - addr, LLVMConstInt(LLVMInt32Type(), 0, 0), - ""); - addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0)); - } - - switch (reg->Register.File) { - case TGSI_FILE_CONSTANT: { - LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), reg->Register.Index*4 + swizzle, 0); + LLVMValueRef swizzle_vec = + lp_build_const_int_vec(uint_bld->type, swizzle); + LLVMValueRef index_vec; /* index into the const buffer */ + + /* index_vec = indirect_index * 4 + swizzle */ + index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2); + index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec); + + /* Gather values from the constant buffer */ + res = build_gather(bld, bld->consts_ptr, index_vec); + } + else { + LLVMValueRef index; /* index into the const buffer */ LLVMValueRef scalar, scalar_ptr; - if (reg->Register.Indirect) { - /*lp_build_printf(bld->base.builder, - "\taddr = %d\n", addr);*/ - index = lp_build_add(&bld->base, index, addr); - } - scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, &index, 1, ""); + index = lp_build_const_int32(reg->Register.Index*4 + swizzle); + + scalar_ptr = LLVMBuildGEP(bld->base.builder, bld->consts_ptr, + &index, 1, ""); scalar = LLVMBuildLoad(bld->base.builder, scalar_ptr, ""); res = lp_build_broadcast_scalar(&bld->base, scalar); - break; } + break; - case TGSI_FILE_IMMEDIATE: - res = bld->immediates[reg->Register.Index][swizzle]; - assert(res); - break; + case TGSI_FILE_IMMEDIATE: + res = bld->immediates[reg->Register.Index][swizzle]; + assert(res); + break; - case TGSI_FILE_INPUT: - res = bld->inputs[reg->Register.Index][swizzle]; - assert(res); - break; + case TGSI_FILE_INPUT: + res = bld->inputs[reg->Register.Index][swizzle]; + assert(res); + break; - case TGSI_FILE_TEMPORARY: { - LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index, - swizzle, - reg->Register.Indirect, - addr); + case TGSI_FILE_TEMPORARY: + if (reg->Register.Indirect) { + LLVMValueRef swizzle_vec = + lp_build_const_int_vec(uint_bld->type, swizzle); + LLVMValueRef length_vec = + lp_build_const_int_vec(uint_bld->type, bld->base.type.length); + LLVMValueRef index_vec; /* index into the const buffer */ + LLVMValueRef temps_array; + LLVMTypeRef float4_ptr_type; + + /* index_vec = (indirect_index * 4 + swizzle) * length */ + index_vec = lp_build_shl_imm(uint_bld, indirect_index, 2); + index_vec = lp_build_add(uint_bld, index_vec, swizzle_vec); + index_vec = lp_build_mul(uint_bld, index_vec, length_vec); + + /* cast temps_array pointer to float* */ + float4_ptr_type = LLVMPointerType(LLVMFloatType(), 0); + temps_array = LLVMBuildBitCast(uint_bld->builder, bld->temps_array, + float4_ptr_type, ""); + + /* Gather values from the temporary register array */ + res = build_gather(bld, temps_array, index_vec); + } + else { + LLVMValueRef temp_ptr; + temp_ptr = get_temp_ptr(bld, reg->Register.Index, swizzle); res = LLVMBuildLoad(bld->base.builder, temp_ptr, ""); - if(!res) + if (!res) return bld->base.undef; - break; - } - - default: - assert( 0 ); - return bld->base.undef; } break; default: - assert( 0 ); + assert(0 && "invalid src register in emit_fetch()"); return bld->base.undef; } @@ -473,13 +624,10 @@ emit_fetch( break; case TGSI_UTIL_SIGN_SET: - /* TODO: Use bitwese OR for floating point */ res = lp_build_abs( &bld->base, res ); - res = LLVMBuildNeg( bld->base.builder, res, "" ); - break; - + /* fall through */ case TGSI_UTIL_SIGN_TOGGLE: - res = LLVMBuildNeg( bld->base.builder, res, "" ); + res = lp_build_negate( &bld->base, res ); break; case TGSI_UTIL_SIGN_KEEP: @@ -513,10 +661,77 @@ emit_fetch_deriv( /* TODO: use interpolation coeffs for inputs */ if(ddx) - *ddx = emit_ddx(bld, src); + *ddx = lp_build_ddx(&bld->base, src); if(ddy) - *ddy = emit_ddy(bld, src); + *ddy = lp_build_ddy(&bld->base, src); +} + + +/** + * Predicate. + */ +static void +emit_fetch_predicate( + struct lp_build_tgsi_soa_context *bld, + const struct tgsi_full_instruction *inst, + LLVMValueRef *pred) +{ + unsigned index; + unsigned char swizzles[4]; + LLVMValueRef unswizzled[4] = {NULL, NULL, NULL, NULL}; + LLVMValueRef value; + unsigned chan; + + if (!inst->Instruction.Predicate) { + FOR_EACH_CHANNEL( chan ) { + pred[chan] = NULL; + } + return; + } + + swizzles[0] = inst->Predicate.SwizzleX; + swizzles[1] = inst->Predicate.SwizzleY; + swizzles[2] = inst->Predicate.SwizzleZ; + swizzles[3] = inst->Predicate.SwizzleW; + + index = inst->Predicate.Index; + assert(index < LP_MAX_TGSI_PREDS); + + FOR_EACH_CHANNEL( chan ) { + unsigned swizzle = swizzles[chan]; + + /* + * Only fetch the predicate register channels that are actually listed + * in the swizzles + */ + if (!unswizzled[swizzle]) { + value = LLVMBuildLoad(bld->base.builder, + bld->preds[index][swizzle], ""); + + /* + * Convert the value to an integer mask. + * + * TODO: Short-circuit this comparison -- a D3D setp_xx instructions + * is needlessly causing two comparisons due to storing the intermediate + * result as float vector instead of an integer mask vector. + */ + value = lp_build_compare(bld->base.builder, + bld->base.type, + PIPE_FUNC_NOTEQUAL, + value, + bld->base.zero); + if (inst->Predicate.Negate) { + value = LLVMBuildNot(bld->base.builder, value, ""); + } + + unswizzled[swizzle] = value; + } else { + value = unswizzled[swizzle]; + } + + pred[chan] = value; + } } @@ -529,10 +744,11 @@ emit_store( const struct tgsi_full_instruction *inst, unsigned index, unsigned chan_index, + LLVMValueRef pred, LLVMValueRef value) { const struct tgsi_full_dst_register *reg = &inst->Dst[index]; - LLVMValueRef addr; + LLVMValueRef indirect_index = NULL; switch( inst->Instruction.Saturate ) { case TGSI_SAT_NONE: @@ -553,43 +769,41 @@ emit_store( } if (reg->Register.Indirect) { - LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->base.type); - unsigned swizzle = tgsi_util_get_src_register_swizzle( ®->Indirect, chan_index ); - addr = LLVMBuildLoad(bld->base.builder, - bld->addr[reg->Indirect.Index][swizzle], - ""); - /* for indexing we want integers */ - addr = LLVMBuildFPToSI(bld->base.builder, addr, - int_vec_type, ""); - addr = LLVMBuildExtractElement(bld->base.builder, - addr, LLVMConstInt(LLVMInt32Type(), 0, 0), - ""); - addr = lp_build_mul(&bld->base, addr, LLVMConstInt(LLVMInt32Type(), 4, 0)); + indirect_index = get_indirect_index(bld, + reg->Register.File, + reg->Register.Index, + ®->Indirect); + } else { + assert(reg->Register.Index <= bld->info->file_max[reg->Register.File]); } switch( reg->Register.File ) { case TGSI_FILE_OUTPUT: - lp_exec_mask_store(&bld->exec_mask, value, + lp_exec_mask_store(&bld->exec_mask, pred, value, bld->outputs[reg->Register.Index][chan_index]); break; - case TGSI_FILE_TEMPORARY: { - LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index, - chan_index, - reg->Register.Indirect, - addr); - lp_exec_mask_store(&bld->exec_mask, value, temp_ptr); + case TGSI_FILE_TEMPORARY: + if (reg->Register.Indirect) { + /* XXX not done yet */ + debug_printf("WARNING: LLVM scatter store of temp regs" + " not implemented\n"); + } + else { + LLVMValueRef temp_ptr = get_temp_ptr(bld, reg->Register.Index, + chan_index); + lp_exec_mask_store(&bld->exec_mask, pred, value, temp_ptr); + } break; - } case TGSI_FILE_ADDRESS: - lp_exec_mask_store(&bld->exec_mask, value, + lp_exec_mask_store(&bld->exec_mask, pred, value, bld->addr[reg->Indirect.Index][chan_index]); break; case TGSI_FILE_PREDICATE: - /* FIXME */ - assert(0); + lp_exec_mask_store(&bld->exec_mask, pred, value, + bld->preds[reg->Register.Index][chan_index]); break; default: @@ -602,21 +816,29 @@ emit_store( * High-level instruction translators. */ - static void emit_tex( struct lp_build_tgsi_soa_context *bld, const struct tgsi_full_instruction *inst, - boolean apply_lodbias, - boolean projected, + enum lp_build_tex_modifier modifier, LLVMValueRef *texel) { - const uint unit = inst->Src[1].Register.Index; - LLVMValueRef lodbias; + unsigned unit; + LLVMValueRef lod_bias, explicit_lod; LLVMValueRef oow = NULL; LLVMValueRef coords[3]; + LLVMValueRef ddx[3]; + LLVMValueRef ddy[3]; unsigned num_coords; unsigned i; + if (!bld->sampler) { + _debug_printf("warning: found texture instruction but no sampler generator supplied\n"); + for (i = 0; i < 4; i++) { + texel[i] = bld->base.undef; + } + return; + } + switch (inst->Texture.Texture) { case TGSI_TEXTURE_1D: num_coords = 1; @@ -637,29 +859,57 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, return; } - if(apply_lodbias) - lodbias = emit_fetch( bld, inst, 0, 3 ); - else - lodbias = bld->base.zero; + if (modifier == LP_BLD_TEX_MODIFIER_LOD_BIAS) { + lod_bias = emit_fetch( bld, inst, 0, 3 ); + explicit_lod = NULL; + } + else if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_LOD) { + lod_bias = NULL; + explicit_lod = emit_fetch( bld, inst, 0, 3 ); + } + else { + lod_bias = NULL; + explicit_lod = NULL; + } - if (projected) { + if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) { oow = emit_fetch( bld, inst, 0, 3 ); oow = lp_build_rcp(&bld->base, oow); } for (i = 0; i < num_coords; i++) { coords[i] = emit_fetch( bld, inst, 0, i ); - if (projected) + if (modifier == LP_BLD_TEX_MODIFIER_PROJECTED) coords[i] = lp_build_mul(&bld->base, coords[i], oow); } for (i = num_coords; i < 3; i++) { coords[i] = bld->base.undef; } + if (modifier == LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV) { + for (i = 0; i < num_coords; i++) { + ddx[i] = emit_fetch( bld, inst, 1, i ); + ddy[i] = emit_fetch( bld, inst, 2, i ); + } + unit = inst->Src[3].Register.Index; + } else { + for (i = 0; i < num_coords; i++) { + ddx[i] = lp_build_ddx( &bld->base, coords[i] ); + ddy[i] = lp_build_ddy( &bld->base, coords[i] ); + } + unit = inst->Src[1].Register.Index; + } + for (i = num_coords; i < 3; i++) { + ddx[i] = bld->base.undef; + ddy[i] = bld->base.undef; + } + bld->sampler->emit_fetch_texel(bld->sampler, bld->base.builder, bld->base.type, - unit, num_coords, coords, lodbias, + unit, num_coords, coords, + ddx, ddy, + lod_bias, explicit_lod, texel); } @@ -739,25 +989,27 @@ emit_kilp(struct lp_build_tgsi_soa_context *bld, lp_build_mask_update(bld->mask, mask); } -static int +static void emit_declaration( struct lp_build_tgsi_soa_context *bld, const struct tgsi_full_declaration *decl) { - LLVMTypeRef vec_type = lp_build_vec_type(bld->base.type); + LLVMTypeRef vec_type = bld->base.vec_type; unsigned first = decl->Range.First; unsigned last = decl->Range.Last; unsigned idx, i; for (idx = first; idx <= last; ++idx) { + assert(last <= bld->info->file_max[decl->Declaration.File]); switch (decl->Declaration.File) { case TGSI_FILE_TEMPORARY: - if (bld->has_indirect_addressing) { - LLVMValueRef val = LLVMConstInt(LLVMInt32Type(), - last*4 + 4, 0); + assert(idx < LP_MAX_TGSI_TEMPS); + if (bld->indirect_files & (1 << TGSI_FILE_TEMPORARY)) { + LLVMValueRef array_size = LLVMConstInt(LLVMInt32Type(), + last*4 + 4, 0); bld->temps_array = lp_build_array_alloca(bld->base.builder, - vec_type, val, ""); + vec_type, array_size, ""); } else { for (i = 0; i < NUM_CHANNELS; i++) bld->temps[idx][i] = lp_build_alloca(bld->base.builder, @@ -772,18 +1024,24 @@ emit_declaration( break; case TGSI_FILE_ADDRESS: + assert(idx < LP_MAX_TGSI_ADDRS); for (i = 0; i < NUM_CHANNELS; i++) bld->addr[idx][i] = lp_build_alloca(bld->base.builder, vec_type, ""); break; + case TGSI_FILE_PREDICATE: + assert(idx < LP_MAX_TGSI_PREDS); + for (i = 0; i < NUM_CHANNELS; i++) + bld->preds[idx][i] = lp_build_alloca(bld->base.builder, + vec_type, ""); + break; + default: /* don't need to declare other vars */ break; } } - - return TRUE; } @@ -795,7 +1053,8 @@ static boolean emit_instruction( struct lp_build_tgsi_soa_context *bld, const struct tgsi_full_instruction *inst, - const struct tgsi_opcode_info *info) + const struct tgsi_opcode_info *info, + int *pc) { unsigned chan_index; LLVMValueRef src0, src1, src2; @@ -819,8 +1078,10 @@ emit_instruction( * redundant code. */ + (*pc)++; + assert(info->num_dst <= 1); - if(info->num_dst) { + if (info->num_dst) { FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) { dst0[chan_index] = bld->base.undef; } @@ -1359,12 +1620,11 @@ emit_instruction( break; case TGSI_OPCODE_TEX: - emit_tex( bld, inst, FALSE, FALSE, dst0 ); + emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_NONE, dst0 ); break; case TGSI_OPCODE_TXD: - /* FIXME */ - return FALSE; + emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV, dst0 ); break; case TGSI_OPCODE_UP2H: @@ -1418,16 +1678,18 @@ emit_instruction( break; case TGSI_OPCODE_CAL: - /* FIXME */ - return FALSE; + lp_exec_mask_call(&bld->exec_mask, + inst->Label.Label, + pc); + break; case TGSI_OPCODE_RET: - /* FIXME */ - return FALSE; + lp_exec_mask_ret(&bld->exec_mask, pc); break; case TGSI_OPCODE_END: + *pc = -1; break; case TGSI_OPCODE_SSG: @@ -1466,7 +1728,7 @@ emit_instruction( break; case TGSI_OPCODE_TXB: - emit_tex( bld, inst, TRUE, FALSE, dst0 ); + emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS, dst0 ); break; case TGSI_OPCODE_NRM: @@ -1571,11 +1833,11 @@ emit_instruction( break; case TGSI_OPCODE_TXL: - emit_tex( bld, inst, TRUE, FALSE, dst0 ); + emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD, dst0 ); break; case TGSI_OPCODE_TXP: - emit_tex( bld, inst, FALSE, TRUE, dst0 ); + emit_tex( bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED, dst0 ); break; case TGSI_OPCODE_BRK: @@ -1593,6 +1855,10 @@ emit_instruction( lp_exec_bgnloop(&bld->exec_mask); break; + case TGSI_OPCODE_BGNSUB: + lp_exec_mask_bgnsub(&bld->exec_mask); + break; + case TGSI_OPCODE_ELSE: lp_exec_mask_cond_invert(&bld->exec_mask); break; @@ -1605,6 +1871,10 @@ emit_instruction( lp_exec_endloop(&bld->exec_mask); break; + case TGSI_OPCODE_ENDSUB: + lp_exec_mask_endsub(&bld->exec_mask, pc); + break; + case TGSI_OPCODE_PUSHA: /* deprecated? */ assert(0); @@ -1717,8 +1987,12 @@ emit_instruction( } if(info->num_dst) { + LLVMValueRef pred[NUM_CHANNELS]; + + emit_fetch_predicate( bld, inst, pred ); + FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) { - emit_store( bld, inst, 0, chan_index, dst0[chan_index]); + emit_store( bld, inst, 0, chan_index, pred[chan_index], dst0[chan_index]); } } @@ -1736,24 +2010,42 @@ lp_build_tgsi_soa(LLVMBuilderRef builder, const LLVMValueRef (*inputs)[NUM_CHANNELS], LLVMValueRef (*outputs)[NUM_CHANNELS], struct lp_build_sampler_soa *sampler, - struct tgsi_shader_info *info) + const struct tgsi_shader_info *info) { struct lp_build_tgsi_soa_context bld; struct tgsi_parse_context parse; uint num_immediates = 0; + uint num_instructions = 0; unsigned i; + int pc = 0; + + struct lp_type res_type; + + assert(type.length <= LP_MAX_VECTOR_LENGTH); + memset(&res_type, 0, sizeof res_type); + res_type.width = type.width; + res_type.length = type.length; + res_type.sign = 1; /* Setup build context */ memset(&bld, 0, sizeof bld); lp_build_context_init(&bld.base, builder, type); + lp_build_context_init(&bld.uint_bld, builder, lp_uint_type(type)); bld.mask = mask; bld.pos = pos; bld.inputs = inputs; bld.outputs = outputs; bld.consts_ptr = consts_ptr; bld.sampler = sampler; - bld.has_indirect_addressing = info->opcode_count[TGSI_OPCODE_ARR] > 0 || - info->opcode_count[TGSI_OPCODE_ARL] > 0; + bld.info = info; + bld.indirect_files = info->indirect_files; + bld.instructions = (struct tgsi_full_instruction *) + MALLOC( LP_MAX_INSTRUCTIONS * sizeof(struct tgsi_full_instruction) ); + bld.max_instructions = LP_MAX_INSTRUCTIONS; + + if (!bld.instructions) { + return; + } lp_exec_mask_init(&bld.exec_mask, &bld.base); @@ -1765,19 +2057,31 @@ lp_build_tgsi_soa(LLVMBuilderRef builder, switch( parse.FullToken.Token.Type ) { case TGSI_TOKEN_TYPE_DECLARATION: /* Inputs already interpolated */ - { - if (!emit_declaration( &bld, &parse.FullToken.FullDeclaration )) - _debug_printf("warning: failed to define LLVM variable\n"); - } + emit_declaration( &bld, &parse.FullToken.FullDeclaration ); break; case TGSI_TOKEN_TYPE_INSTRUCTION: { - unsigned opcode = parse.FullToken.FullInstruction.Instruction.Opcode; - const struct tgsi_opcode_info *opcode_info = tgsi_get_opcode_info(opcode); - if (!emit_instruction( &bld, &parse.FullToken.FullInstruction, opcode_info )) - _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n", - opcode_info->mnemonic); + /* save expanded instruction */ + if (num_instructions == bld.max_instructions) { + struct tgsi_full_instruction *instructions; + instructions = REALLOC(bld.instructions, + bld.max_instructions + * sizeof(struct tgsi_full_instruction), + (bld.max_instructions + LP_MAX_INSTRUCTIONS) + * sizeof(struct tgsi_full_instruction)); + if (!instructions) { + break; + } + bld.instructions = instructions; + bld.max_instructions += LP_MAX_INSTRUCTIONS; + } + + memcpy(bld.instructions + num_instructions, + &parse.FullToken.FullInstruction, + sizeof(bld.instructions[0])); + + num_instructions++; } break; @@ -1787,7 +2091,7 @@ lp_build_tgsi_soa(LLVMBuilderRef builder, { const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1; assert(size <= 4); - assert(num_immediates < LP_MAX_IMMEDIATES); + assert(num_immediates < LP_MAX_TGSI_IMMEDIATES); for( i = 0; i < size; ++i ) bld.immediates[num_immediates][i] = lp_build_const_vec(type, parse.FullToken.FullImmediate.u[i].Float); @@ -1804,14 +2108,33 @@ lp_build_tgsi_soa(LLVMBuilderRef builder, assert( 0 ); } } + + while (pc != -1) { + struct tgsi_full_instruction *instr = bld.instructions + pc; + const struct tgsi_opcode_info *opcode_info = + tgsi_get_opcode_info(instr->Instruction.Opcode); + if (!emit_instruction( &bld, instr, opcode_info, &pc )) + _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n", + opcode_info->mnemonic); + } + if (0) { LLVMBasicBlockRef block = LLVMGetInsertBlock(builder); LLVMValueRef function = LLVMGetBasicBlockParent(block); debug_printf("11111111111111111111111111111 \n"); tgsi_dump(tokens, 0); - LLVMDumpValue(function); + lp_debug_dump_value(function); debug_printf("2222222222222222222222222222 \n"); } tgsi_parse_free( &parse ); + + if (0) { + LLVMModuleRef module = LLVMGetGlobalParent( + LLVMGetBasicBlockParent(LLVMGetInsertBlock(bld.base.builder))); + LLVMDumpModule(module); + + } + + FREE( bld.instructions ); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.c b/src/gallium/auxiliary/gallivm/lp_bld_type.c index 796af88caad..06f1aae6dcc 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.c @@ -195,6 +195,7 @@ lp_uint_type(struct lp_type type) { struct lp_type res_type; + assert(type.length <= LP_MAX_VECTOR_LENGTH); memset(&res_type, 0, sizeof res_type); res_type.width = type.width; res_type.length = type.length; @@ -211,6 +212,7 @@ lp_int_type(struct lp_type type) { struct lp_type res_type; + assert(type.length <= LP_MAX_VECTOR_LENGTH); memset(&res_type, 0, sizeof res_type); res_type.width = type.width; res_type.length = type.length; @@ -238,6 +240,131 @@ lp_wider_type(struct lp_type type) } +/** + * Return the size of the LLVMType in bits. + * XXX this function doesn't necessarily handle all LLVM types. + */ +unsigned +lp_sizeof_llvm_type(LLVMTypeRef t) +{ + LLVMTypeKind k = LLVMGetTypeKind(t); + + switch (k) { + case LLVMIntegerTypeKind: + return LLVMGetIntTypeWidth(t); + case LLVMFloatTypeKind: + return 8 * sizeof(float); + case LLVMDoubleTypeKind: + return 8 * sizeof(double); + case LLVMVectorTypeKind: + { + LLVMTypeRef elem = LLVMGetElementType(t); + unsigned len = LLVMGetVectorSize(t); + return len * lp_sizeof_llvm_type(elem); + } + break; + case LLVMArrayTypeKind: + { + LLVMTypeRef elem = LLVMGetElementType(t); + unsigned len = LLVMGetArrayLength(t); + return len * lp_sizeof_llvm_type(elem); + } + break; + default: + assert(0 && "Unexpected type in lp_get_llvm_type_size()"); + return 0; + } +} + + +/** + * Return string name for a LLVMTypeKind. Useful for debugging. + */ +const char * +lp_typekind_name(LLVMTypeKind t) +{ + switch (t) { + case LLVMVoidTypeKind: + return "LLVMVoidTypeKind"; + case LLVMFloatTypeKind: + return "LLVMFloatTypeKind"; + case LLVMDoubleTypeKind: + return "LLVMDoubleTypeKind"; + case LLVMX86_FP80TypeKind: + return "LLVMX86_FP80TypeKind"; + case LLVMFP128TypeKind: + return "LLVMFP128TypeKind"; + case LLVMPPC_FP128TypeKind: + return "LLVMPPC_FP128TypeKind"; + case LLVMLabelTypeKind: + return "LLVMLabelTypeKind"; + case LLVMIntegerTypeKind: + return "LLVMIntegerTypeKind"; + case LLVMFunctionTypeKind: + return "LLVMFunctionTypeKind"; + case LLVMStructTypeKind: + return "LLVMStructTypeKind"; + case LLVMArrayTypeKind: + return "LLVMArrayTypeKind"; + case LLVMPointerTypeKind: + return "LLVMPointerTypeKind"; + case LLVMOpaqueTypeKind: + return "LLVMOpaqueTypeKind"; + case LLVMVectorTypeKind: + return "LLVMVectorTypeKind"; + case LLVMMetadataTypeKind: + return "LLVMMetadataTypeKind"; + /* Only in LLVM 2.7 and later??? + case LLVMUnionTypeKind: + return "LLVMUnionTypeKind"; + */ + default: + return "unknown LLVMTypeKind"; + } +} + + +/** + * Print an LLVMTypeRef. Like LLVMDumpValue(). For debugging. + */ +void +lp_dump_llvmtype(LLVMTypeRef t) +{ + LLVMTypeKind k = LLVMGetTypeKind(t); + + if (k == LLVMVectorTypeKind) { + LLVMTypeRef te = LLVMGetElementType(t); + LLVMTypeKind ke = LLVMGetTypeKind(te); + unsigned len = LLVMGetVectorSize(t); + if (ke == LLVMIntegerTypeKind) { + unsigned b = LLVMGetIntTypeWidth(te); + debug_printf("Vector [%u] of %u-bit Integer\n", len, b); + } + else { + debug_printf("Vector [%u] of %s\n", len, lp_typekind_name(ke)); + } + } + else if (k == LLVMArrayTypeKind) { + LLVMTypeRef te = LLVMGetElementType(t); + LLVMTypeKind ke = LLVMGetTypeKind(te); + unsigned len = LLVMGetArrayLength(t); + debug_printf("Array [%u] of %s\n", len, lp_typekind_name(ke)); + } + else if (k == LLVMIntegerTypeKind) { + unsigned b = LLVMGetIntTypeWidth(t); + debug_printf("%u-bit Integer\n", b); + } + else if (k == LLVMPointerTypeKind) { + LLVMTypeRef te = LLVMGetElementType(t); + debug_printf("Pointer to "); + lp_dump_llvmtype(te); + } + else { + debug_printf("%s\n", lp_typekind_name(k)); + } +} + + void lp_build_context_init(struct lp_build_context *bld, LLVMBuilderRef builder, @@ -245,7 +372,23 @@ lp_build_context_init(struct lp_build_context *bld, { bld->builder = builder; bld->type = type; - bld->undef = lp_build_undef(type); - bld->zero = lp_build_zero(type); + + bld->int_elem_type = lp_build_int_elem_type(type); + if (type.floating) + bld->elem_type = lp_build_elem_type(type); + else + bld->elem_type = bld->int_elem_type; + + if (type.length == 1) { + bld->int_vec_type = bld->int_elem_type; + bld->vec_type = bld->elem_type; + } + else { + bld->int_vec_type = LLVMVectorType(bld->int_elem_type, type.length); + bld->vec_type = LLVMVectorType(bld->elem_type, type.length); + } + + bld->undef = LLVMGetUndef(bld->vec_type); + bld->zero = LLVMConstNull(bld->vec_type); bld->one = lp_build_one(type); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index cd59d2faa66..fec1d3dfbc6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -128,6 +128,18 @@ struct lp_build_context */ struct lp_type type; + /** Same as lp_build_elem_type(type) */ + LLVMTypeRef elem_type; + + /** Same as lp_build_vec_type(type) */ + LLVMTypeRef vec_type; + + /** Same as lp_build_int_elem_type(type) */ + LLVMTypeRef int_elem_type; + + /** Same as lp_build_int_vec_type(type) */ + LLVMTypeRef int_vec_type; + /** Same as lp_build_undef(type) */ LLVMValueRef undef; @@ -304,6 +316,54 @@ LLVMTypeRef lp_build_int32_vec4_type(void); +static INLINE struct lp_type +lp_float32_vec4_type(void) +{ + struct lp_type type; + + memset(&type, 0, sizeof(type)); + type.floating = TRUE; + type.sign = TRUE; + type.norm = FALSE; + type.width = 32; + type.length = 4; + + return type; +} + + +static INLINE struct lp_type +lp_int32_vec4_type(void) +{ + struct lp_type type; + + memset(&type, 0, sizeof(type)); + type.floating = FALSE; + type.sign = TRUE; + type.norm = FALSE; + type.width = 32; + type.length = 4; + + return type; +} + + +static INLINE struct lp_type +lp_unorm8_vec4_type(void) +{ + struct lp_type type; + + memset(&type, 0, sizeof(type)); + type.floating = FALSE; + type.sign = FALSE; + type.norm = TRUE; + type.width = 8; + type.length = 4; + + return type; +} + + struct lp_type lp_uint_type(struct lp_type type); @@ -316,6 +376,18 @@ struct lp_type lp_wider_type(struct lp_type type); +unsigned +lp_sizeof_llvm_type(LLVMTypeRef t); + + +const char * +lp_typekind_name(LLVMTypeKind t); + + +void +lp_dump_llvmtype(LLVMTypeRef t); + + void lp_build_context_init(struct lp_build_context *bld, LLVMBuilderRef builder, diff --git a/src/gallium/auxiliary/indices/.gitignore b/src/gallium/auxiliary/indices/.gitignore new file mode 100644 index 00000000000..73740071451 --- /dev/null +++ b/src/gallium/auxiliary/indices/.gitignore @@ -0,0 +1,2 @@ +u_indices_gen.c +u_unfilled_gen.c diff --git a/src/gallium/auxiliary/os/os_stream.c b/src/gallium/auxiliary/os/os_stream.c new file mode 100644 index 00000000000..3c55fc00d92 --- /dev/null +++ b/src/gallium/auxiliary/os/os_stream.c @@ -0,0 +1,58 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_config.h" + +#include "os_stream.h" +#include "util/u_memory.h" +#include "util/u_string.h" + +int +os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + char buf[1024]; + int retval; + va_list ap2; + va_copy(ap2, ap); + retval = util_vsnprintf(buf, sizeof(buf), format, ap2); + va_end(ap2); + if(retval <= 0) + {} + else if(retval < sizeof(buf)) + stream->write(stream, buf, retval); + else + { + char* str = MALLOC(retval + 1); + if(!str) + return -1; + retval = util_vsnprintf(str, retval + 1, format, ap); + if(retval > 0) + stream->write(stream, str, retval); + FREE(str); + } + + return retval; +} diff --git a/src/gallium/auxiliary/os/os_stream.h b/src/gallium/auxiliary/os/os_stream.h index 693a0621e2d..6c6050bb028 100644 --- a/src/gallium/auxiliary/os/os_stream.h +++ b/src/gallium/auxiliary/os/os_stream.h @@ -50,6 +50,9 @@ struct os_stream void (*flush)(struct os_stream *stream); + + int + (*vprintf)(struct os_stream *stream, const char* format, va_list ap); }; @@ -90,6 +93,27 @@ os_stream_flush(struct os_stream *stream) stream->flush(stream); } +int +os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap); + +static INLINE int +os_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + return stream->vprintf(stream, format, ap); +} + +static INLINE int +os_stream_printf (struct os_stream* stream, const char *format, ...) +{ + int retval; + va_list args; + + va_start (args, format); + retval = stream->vprintf(stream, format, args); + va_end (args); + + return retval; +} struct os_stream * os_file_stream_create(const char *filename); @@ -118,5 +142,4 @@ os_str_stream_get_and_close(struct os_stream *stream); #define os_file_stream_create(_filename) os_null_stream_create() #endif - #endif /* _OS_STREAM_H_ */ diff --git a/src/gallium/auxiliary/os/os_stream_log.c b/src/gallium/auxiliary/os/os_stream_log.c index 7cc2028a22c..b01377c3468 100644 --- a/src/gallium/auxiliary/os/os_stream_log.c +++ b/src/gallium/auxiliary/os/os_stream_log.c @@ -73,7 +73,8 @@ static struct os_stream os_log_stream_struct = { &os_log_stream_close, &os_log_stream_write, - &os_log_stream_flush + &os_log_stream_flush, + &os_default_stream_vprintf, }; diff --git a/src/gallium/auxiliary/os/os_stream_null.c b/src/gallium/auxiliary/os/os_stream_null.c index 128c4e8f0e0..a549a789e62 100644 --- a/src/gallium/auxiliary/os/os_stream_null.c +++ b/src/gallium/auxiliary/os/os_stream_null.c @@ -56,12 +56,18 @@ os_null_stream_flush(struct os_stream *stream) (void)stream; } +static int +os_null_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + return 0; +} static struct os_stream os_null_stream = { &os_null_stream_close, &os_null_stream_write, - &os_null_stream_flush + &os_null_stream_flush, + &os_null_stream_vprintf }; diff --git a/src/gallium/auxiliary/os/os_stream_stdc.c b/src/gallium/auxiliary/os/os_stream_stdc.c index 9e7ed711076..37e7d063e2b 100644 --- a/src/gallium/auxiliary/os/os_stream_stdc.c +++ b/src/gallium/auxiliary/os/os_stream_stdc.c @@ -83,6 +83,14 @@ os_stdc_stream_flush(struct os_stream *_stream) fflush(stream->file); } +static int +os_stdc_stream_vprintf (struct os_stream* _stream, const char *format, va_list ap) +{ + struct os_stdc_stream *stream = os_stdc_stream(_stream); + + return vfprintf(stream->file, format, ap); +} + struct os_stream * os_file_stream_create(const char *filename) @@ -96,6 +104,7 @@ os_file_stream_create(const char *filename) stream->base.close = &os_stdc_stream_close; stream->base.write = &os_stdc_stream_write; stream->base.flush = &os_stdc_stream_flush; + stream->base.vprintf = &os_stdc_stream_vprintf; stream->file = fopen(filename, "w"); if(!stream->file) diff --git a/src/gallium/auxiliary/os/os_stream_str.c b/src/gallium/auxiliary/os/os_stream_str.c index b5c7270d2ae..be9478b2a17 100644 --- a/src/gallium/auxiliary/os/os_stream_str.c +++ b/src/gallium/auxiliary/os/os_stream_str.c @@ -118,6 +118,7 @@ os_str_stream_create(size_t size) stream->base.close = &os_str_stream_close; stream->base.write = &os_str_stream_write; stream->base.flush = &os_str_stream_flush; + stream->base.vprintf = &os_default_stream_vprintf; stream->str = os_malloc(size); if(!stream->str) diff --git a/src/gallium/auxiliary/os/os_thread.h b/src/gallium/auxiliary/os/os_thread.h index c09e8a7a76f..a084310d4ff 100644 --- a/src/gallium/auxiliary/os/os_thread.h +++ b/src/gallium/auxiliary/os/os_thread.h @@ -40,12 +40,11 @@ #include "util/u_debug.h" /* for assert */ -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN) #include <pthread.h> /* POSIX threads headers */ #include <stdio.h> /* for perror() */ -#define PIPE_THREAD_HAVE_CONDVAR /* pipe_thread */ @@ -168,19 +167,59 @@ typedef CRITICAL_SECTION pipe_mutex; #define pipe_mutex_unlock(mutex) \ LeaveCriticalSection(&mutex) +/* TODO: Need a macro to declare "I don't care about WinXP compatibilty" */ +#if 0 && defined (_WIN32_WINNT) && (_WIN32_WINNT >= 0x0600) +/* CONDITION_VARIABLE is only available on newer versions of Windows + * (Server 2008/Vista or later). + * http://msdn.microsoft.com/en-us/library/ms682052(VS.85).aspx + * + * pipe_condvar + */ +typedef CONDITION_VARIABLE pipe_condvar; + +#define pipe_static_condvar(cond) \ + /*static*/ pipe_condvar cond = CONDITION_VARIABLE_INIT + +#define pipe_condvar_init(cond) \ + InitializeConditionVariable(&(cond)) + +#define pipe_condvar_destroy(cond) \ + (void) cond /* nothing to do */ + +#define pipe_condvar_wait(cond, mutex) \ + SleepConditionVariableCS(&(cond), &(mutex), INFINITE) + +#define pipe_condvar_signal(cond) \ + WakeConditionVariable(&(cond)) + +#define pipe_condvar_broadcast(cond) \ + WakeAllConditionVariable(&(cond)) + +#else /* need compatibility with pre-Vista Win32 */ /* pipe_condvar (XXX FIX THIS) + * See http://www.cs.wustl.edu/~schmidt/win32-cv-1.html + * for potential pitfalls in implementation. */ -typedef unsigned pipe_condvar; +typedef DWORD pipe_condvar; + +#define pipe_static_condvar(cond) \ + /*static*/ pipe_condvar cond = 1 #define pipe_condvar_init(cond) \ - (void) cond + (void) (cond = 1) #define pipe_condvar_destroy(cond) \ (void) cond +/* Poor man's pthread_cond_wait(): + Just release the mutex and sleep for one millisecond. + The caller's while() loop does all the work. */ #define pipe_condvar_wait(cond, mutex) \ - (void) cond; (void) mutex + do { pipe_mutex_unlock(mutex); \ + Sleep(cond); \ + pipe_mutex_lock(mutex); \ + } while (0) #define pipe_condvar_signal(cond) \ (void) cond @@ -188,9 +227,12 @@ typedef unsigned pipe_condvar; #define pipe_condvar_broadcast(cond) \ (void) cond +#endif /* pre-Vista win32 */ #else +#include "os/os_time.h" + /** Dummy definitions */ typedef unsigned pipe_thread; @@ -214,7 +256,6 @@ static INLINE int pipe_thread_destroy( pipe_thread thread ) } typedef unsigned pipe_mutex; -typedef unsigned pipe_condvar; #define pipe_static_mutex(mutex) \ static pipe_mutex mutex = 0 @@ -231,17 +272,25 @@ typedef unsigned pipe_condvar; #define pipe_mutex_unlock(mutex) \ (void) mutex +typedef int64_t pipe_condvar; + #define pipe_static_condvar(condvar) \ - static unsigned condvar = 0 + static pipe_condvar condvar = 1000 #define pipe_condvar_init(condvar) \ - (void) condvar + (void) (condvar = 1000) #define pipe_condvar_destroy(condvar) \ (void) condvar +/* Poor man's pthread_cond_wait(): + Just release the mutex and sleep for one millisecond. + The caller's while() loop does all the work. */ #define pipe_condvar_wait(condvar, mutex) \ - (void) condvar + do { pipe_mutex_unlock(mutex); \ + os_time_sleep(condvar); \ + pipe_mutex_lock(mutex); \ + } while (0) #define pipe_condvar_signal(condvar) \ (void) condvar @@ -277,27 +326,7 @@ static INLINE void pipe_barrier_wait(pipe_barrier *barrier) } -#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) - -/* XXX FIX THIS */ -typedef unsigned pipe_barrier; - -static INLINE void pipe_barrier_init(pipe_barrier *barrier, unsigned count) -{ - /* XXX we could implement barriers with a mutex and condition var */ -} - -static INLINE void pipe_barrier_destroy(pipe_barrier *barrier) -{ -} - -static INLINE void pipe_barrier_wait(pipe_barrier *barrier) -{ - assert(0); -} - - -#else +#else /* If the OS doesn't have its own, implement barriers using a mutex and a condvar */ typedef struct { unsigned count; @@ -405,7 +434,7 @@ pipe_semaphore_wait(pipe_semaphore *sema) */ typedef struct { -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN) pthread_key_t key; #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) DWORD key; @@ -420,7 +449,7 @@ typedef struct { static INLINE void pipe_tsd_init(pipe_tsd *tsd) { -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN) if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) { perror("pthread_key_create(): failed to allocate key for thread specific data"); exit(-1); @@ -437,7 +466,7 @@ pipe_tsd_get(pipe_tsd *tsd) if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) { pipe_tsd_init(tsd); } -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN) return pthread_getspecific(tsd->key); #elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) assert(0); @@ -454,7 +483,7 @@ pipe_tsd_set(pipe_tsd *tsd, void *value) if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) { pipe_tsd_init(tsd); } -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_EMBEDDED) || defined(PIPE_OS_CYGWIN) if (pthread_setspecific(tsd->key, value) != 0) { perror("pthread_set_specific() failed"); exit(-1); diff --git a/src/gallium/auxiliary/os/os_time.c b/src/gallium/auxiliary/os/os_time.c index 6259142bec0..84907215fe6 100644 --- a/src/gallium/auxiliary/os/os_time.c +++ b/src/gallium/auxiliary/os/os_time.c @@ -37,7 +37,7 @@ #if !defined(PIPE_OS_EMBEDDED) -#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) +#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_CYGWIN) # include <sys/time.h> /* timeval */ #elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) # include <windows.h> diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h index a6c50dcf0c1..5a13f39849f 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h @@ -130,7 +130,7 @@ struct pb_vtbl * flags is bitmask of PB_USAGE_CPU_READ/WRITE. */ void *(*map)( struct pb_buffer *buf, - unsigned flags ); + unsigned flags, void *flush_ctx ); void (*unmap)( struct pb_buffer *buf ); @@ -164,13 +164,13 @@ struct pb_vtbl */ static INLINE void * pb_map(struct pb_buffer *buf, - unsigned flags) + unsigned flags, void *flush_ctx) { assert(buf); if(!buf) return NULL; assert(pipe_is_referenced(&buf->base.reference)); - return buf->vtbl->map(buf, flags); + return buf->vtbl->map(buf, flags, flush_ctx); } diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c index d6cf6405825..c310f28f51f 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c @@ -624,7 +624,7 @@ fenced_buffer_copy_storage_to_gpu_locked(struct fenced_buffer *fenced_buf) assert(fenced_buf->data); assert(fenced_buf->buffer); - map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_WRITE); + map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_WRITE, NULL); if(!map) return PIPE_ERROR; @@ -644,7 +644,7 @@ fenced_buffer_copy_storage_to_cpu_locked(struct fenced_buffer *fenced_buf) assert(fenced_buf->data); assert(fenced_buf->buffer); - map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_READ); + map = pb_map(fenced_buf->buffer, PB_USAGE_CPU_READ, NULL); if(!map) return PIPE_ERROR; @@ -674,7 +674,7 @@ fenced_buffer_destroy(struct pb_buffer *buf) static void * fenced_buffer_map(struct pb_buffer *buf, - unsigned flags) + unsigned flags, void *flush_ctx) { struct fenced_buffer *fenced_buf = fenced_buffer(buf); struct fenced_manager *fenced_mgr = fenced_buf->mgr; @@ -712,7 +712,7 @@ fenced_buffer_map(struct pb_buffer *buf, } if(fenced_buf->buffer) { - map = pb_map(fenced_buf->buffer, flags); + map = pb_map(fenced_buf->buffer, flags, flush_ctx); } else { assert(fenced_buf->data); diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c index b706f429be5..c2322eed19b 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c @@ -70,7 +70,8 @@ malloc_buffer_destroy(struct pb_buffer *buf) static void * malloc_buffer_map(struct pb_buffer *buf, - unsigned flags) + unsigned flags, + void *flush_ctx) { return malloc_buffer(buf)->data; } diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h index cec2524da2b..2ef02160f23 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h @@ -50,8 +50,7 @@ #define PB_BUFMGR_H_ -#include "pipe/p_compiler.h" -#include "pipe/p_defines.h" +#include "pb_buffer.h" #ifdef __cplusplus diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c index 88501e8d72d..a6eb4039621 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c @@ -167,10 +167,10 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf) static void * pb_cache_buffer_map(struct pb_buffer *_buf, - unsigned flags) + unsigned flags, void *flush_ctx) { struct pb_cache_buffer *buf = pb_cache_buffer(_buf); - return pb_map(buf->buffer, flags); + return pb_map(buf->buffer, flags, flush_ctx); } @@ -222,7 +222,7 @@ pb_cache_buffer_vtbl = { }; -static INLINE boolean +static INLINE int pb_cache_is_buffer_compat(struct pb_cache_buffer *buf, pb_size size, const struct pb_desc *desc) @@ -230,26 +230,26 @@ pb_cache_is_buffer_compat(struct pb_cache_buffer *buf, void *map; if(buf->base.base.size < size) - return FALSE; + return 0; /* be lenient with size */ if(buf->base.base.size >= 2*size) - return FALSE; + return 0; if(!pb_check_alignment(desc->alignment, buf->base.base.alignment)) - return FALSE; + return 0; if(!pb_check_usage(desc->usage, buf->base.base.usage)) - return FALSE; + return 0; - map = pb_map(buf->buffer, PB_USAGE_DONTBLOCK); + map = pb_map(buf->buffer, PB_USAGE_DONTBLOCK, NULL); if (!map) { - return FALSE; + return -1; } pb_unmap(buf->buffer); - return TRUE; + return 1; } @@ -263,7 +263,8 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr, struct pb_cache_buffer *curr_buf; struct list_head *curr, *next; int64_t now; - + int ret = 0; + pipe_mutex_lock(mgr->mutex); buf = NULL; @@ -274,25 +275,30 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr, now = os_time_get(); while(curr != &mgr->delayed) { curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); - if(!buf && pb_cache_is_buffer_compat(curr_buf, size, desc)) - buf = curr_buf; + if(!buf && (ret = pb_cache_is_buffer_compat(curr_buf, size, desc) > 0)) + buf = curr_buf; else if(os_time_timeout(curr_buf->start, curr_buf->end, now)) - _pb_cache_buffer_destroy(curr_buf); + _pb_cache_buffer_destroy(curr_buf); else /* This buffer (and all hereafter) are still hot in cache */ break; + if (ret == -1) + break; curr = next; next = curr->next; } /* keep searching in the hot buffers */ - if(!buf) { + if(!buf && ret != -1) { while(curr != &mgr->delayed) { curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); - if(pb_cache_is_buffer_compat(curr_buf, size, desc)) { + ret = pb_cache_is_buffer_compat(curr_buf, size, desc); + if (ret > 0) { buf = curr_buf; break; } + if (ret == -1) + break; /* no need to check the timeout here */ curr = next; next = curr->next; @@ -301,6 +307,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr, if(buf) { LIST_DEL(&buf->head); + --mgr->numDelayed; pipe_mutex_unlock(mgr->mutex); /* Increase refcount */ pipe_reference_init(&buf->base.base.reference, 1); diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c index 0dc5b31a754..7604e75af8d 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c @@ -158,7 +158,7 @@ pb_debug_buffer_fill(struct pb_debug_buffer *buf) { uint8_t *map; - map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE); + map = pb_map(buf->buffer, PB_USAGE_CPU_WRITE, NULL); assert(map); if(map) { fill_random_pattern(map, buf->underflow_size); @@ -181,7 +181,7 @@ pb_debug_buffer_check(struct pb_debug_buffer *buf) map = pb_map(buf->buffer, PB_USAGE_CPU_READ | - PB_USAGE_UNSYNCHRONIZED); + PB_USAGE_UNSYNCHRONIZED, NULL); assert(map); if(map) { boolean underflow, overflow; @@ -247,14 +247,14 @@ pb_debug_buffer_destroy(struct pb_buffer *_buf) static void * pb_debug_buffer_map(struct pb_buffer *_buf, - unsigned flags) + unsigned flags, void *flush_ctx) { struct pb_debug_buffer *buf = pb_debug_buffer(_buf); void *map; pb_debug_buffer_check(buf); - map = pb_map(buf->buffer, flags); + map = pb_map(buf->buffer, flags, flush_ctx); if(!map) return NULL; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c index faf7c352674..88da786216a 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c @@ -108,11 +108,14 @@ mm_buffer_destroy(struct pb_buffer *buf) static void * mm_buffer_map(struct pb_buffer *buf, - unsigned flags) + unsigned flags, + void *flush_ctx) { struct mm_buffer *mm_buf = mm_buffer(buf); struct mm_pb_manager *mm = mm_buf->mgr; + /* XXX: it will be necessary to remap here to propagate flush_ctx */ + return (unsigned char *) mm->map + mm_buf->block->ofs; } @@ -269,7 +272,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, mm->map = pb_map(mm->buffer, PB_USAGE_CPU_READ | - PB_USAGE_CPU_WRITE); + PB_USAGE_CPU_WRITE, NULL); if(!mm->map) goto failure; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c index 31f1ebbeb7c..694a092f3c2 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c @@ -103,13 +103,13 @@ pb_ondemand_buffer_destroy(struct pb_buffer *_buf) static void * pb_ondemand_buffer_map(struct pb_buffer *_buf, - unsigned flags) + unsigned flags, void *flush_ctx) { struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); if(buf->buffer) { assert(!buf->data); - return pb_map(buf->buffer, flags); + return pb_map(buf->buffer, flags, flush_ctx); } else { assert(buf->data); @@ -150,7 +150,7 @@ pb_ondemand_buffer_instantiate(struct pb_ondemand_buffer *buf) if(!buf->buffer) return PIPE_ERROR_OUT_OF_MEMORY; - map = pb_map(buf->buffer, PB_USAGE_CPU_READ); + map = pb_map(buf->buffer, PB_USAGE_CPU_READ, NULL); if(!map) { pb_reference(&buf->buffer, NULL); return PIPE_ERROR; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c index fdcce428784..2f7c7389ff4 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c @@ -118,12 +118,14 @@ pool_buffer_destroy(struct pb_buffer *buf) static void * -pool_buffer_map(struct pb_buffer *buf, unsigned flags) +pool_buffer_map(struct pb_buffer *buf, unsigned flags, void *flush_ctx) { struct pool_buffer *pool_buf = pool_buffer(buf); struct pool_pb_manager *pool = pool_buf->mgr; void *map; + /* XXX: it will be necessary to remap here to propagate flush_ctx */ + pipe_mutex_lock(pool->mutex); map = (unsigned char *) pool->map + pool_buf->start; pipe_mutex_unlock(pool->mutex); @@ -285,7 +287,7 @@ pool_bufmgr_create(struct pb_manager *provider, pool->map = pb_map(pool->buffer, PB_USAGE_CPU_READ | - PB_USAGE_CPU_WRITE); + PB_USAGE_CPU_WRITE, NULL); if(!pool->map) goto failure; diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c index 7a3305aaf37..176f9aa38aa 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c @@ -227,10 +227,13 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf) static void * pb_slab_buffer_map(struct pb_buffer *_buf, - unsigned flags) + unsigned flags, + void *flush_ctx) { struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + /* XXX: it will be necessary to remap here to propagate flush_ctx */ + ++buf->mapCount; return (void *) ((uint8_t *) buf->slab->virtual + buf->start); } @@ -316,7 +319,7 @@ pb_slab_create(struct pb_slab_manager *mgr) * through this address so it is required that the buffer is pinned. */ slab->virtual = pb_map(slab->bo, PB_USAGE_CPU_READ | - PB_USAGE_CPU_WRITE); + PB_USAGE_CPU_WRITE, NULL); if(!slab->virtual) { ret = PIPE_ERROR_OUT_OF_MEMORY; goto out_err1; diff --git a/src/gallium/auxiliary/rbug/README b/src/gallium/auxiliary/rbug/README index d984067893c..c5156438a1b 100644 --- a/src/gallium/auxiliary/rbug/README +++ b/src/gallium/auxiliary/rbug/README @@ -10,7 +10,7 @@ The code currently uses tcp and ip4v for connections. Information about driver integration can be found in: -src/gallium/drivers/trace/README +src/gallium/drivers/rbug/README for information about applications look in: diff --git a/src/gallium/auxiliary/rbug/rbug_context.c b/src/gallium/auxiliary/rbug/rbug_context.c index 1832425658f..a3fd7e8430e 100644 --- a/src/gallium/auxiliary/rbug/rbug_context.c +++ b/src/gallium/auxiliary/rbug/rbug_context.c @@ -480,7 +480,7 @@ struct rbug_proto_context_list * rbug_demarshal_context_list(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_LIST) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_LIST) return NULL; pos = 0; @@ -506,7 +506,7 @@ struct rbug_proto_context_info * rbug_demarshal_context_info(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_INFO) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_INFO) return NULL; pos = 0; @@ -533,7 +533,7 @@ struct rbug_proto_context_draw_block * rbug_demarshal_context_draw_block(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_BLOCK) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_BLOCK) return NULL; pos = 0; @@ -561,7 +561,7 @@ struct rbug_proto_context_draw_step * rbug_demarshal_context_draw_step(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_STEP) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_STEP) return NULL; pos = 0; @@ -589,7 +589,7 @@ struct rbug_proto_context_draw_unblock * rbug_demarshal_context_draw_unblock(str if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_UNBLOCK) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_UNBLOCK) return NULL; pos = 0; @@ -617,7 +617,7 @@ struct rbug_proto_context_draw_rule * rbug_demarshal_context_draw_rule(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_RULE) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_RULE) return NULL; pos = 0; @@ -649,7 +649,7 @@ struct rbug_proto_context_flush * rbug_demarshal_context_flush(struct rbug_proto if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_FLUSH) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_FLUSH) return NULL; pos = 0; @@ -677,7 +677,7 @@ struct rbug_proto_context_list_reply * rbug_demarshal_context_list_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_LIST_REPLY) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_LIST_REPLY) return NULL; pos = 0; @@ -705,7 +705,7 @@ struct rbug_proto_context_info_reply * rbug_demarshal_context_info_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_INFO_REPLY) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_INFO_REPLY) return NULL; pos = 0; @@ -739,7 +739,7 @@ struct rbug_proto_context_draw_blocked * rbug_demarshal_context_draw_blocked(str if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_CONTEXT_DRAW_BLOCKED) + if (header->opcode != (int32_t)RBUG_OP_CONTEXT_DRAW_BLOCKED) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rbug/rbug_core.c b/src/gallium/auxiliary/rbug/rbug_core.c index 876ae5a0ce6..1d47d13c9f3 100644 --- a/src/gallium/auxiliary/rbug/rbug_core.c +++ b/src/gallium/auxiliary/rbug/rbug_core.c @@ -233,7 +233,7 @@ struct rbug_proto_noop * rbug_demarshal_noop(struct rbug_proto_header *header) if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_NOOP) + if (header->opcode != (int32_t)RBUG_OP_NOOP) return NULL; pos = 0; @@ -259,7 +259,7 @@ struct rbug_proto_ping * rbug_demarshal_ping(struct rbug_proto_header *header) if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_PING) + if (header->opcode != (int32_t)RBUG_OP_PING) return NULL; pos = 0; @@ -285,7 +285,7 @@ struct rbug_proto_error * rbug_demarshal_error(struct rbug_proto_header *header) if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_ERROR) + if (header->opcode != (int32_t)RBUG_OP_ERROR) return NULL; pos = 0; @@ -312,7 +312,7 @@ struct rbug_proto_ping_reply * rbug_demarshal_ping_reply(struct rbug_proto_heade if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_PING_REPLY) + if (header->opcode != (int32_t)RBUG_OP_PING_REPLY) return NULL; pos = 0; @@ -339,7 +339,7 @@ struct rbug_proto_error_reply * rbug_demarshal_error_reply(struct rbug_proto_hea if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_ERROR_REPLY) + if (header->opcode != (int32_t)RBUG_OP_ERROR_REPLY) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rbug/rbug_demarshal.c b/src/gallium/auxiliary/rbug/rbug_demarshal.c index 47390fbcee7..06caa45469d 100644 --- a/src/gallium/auxiliary/rbug/rbug_demarshal.c +++ b/src/gallium/auxiliary/rbug/rbug_demarshal.c @@ -91,3 +91,67 @@ struct rbug_header * rbug_demarshal(struct rbug_proto_header *header) return NULL; } } + +const char* rbug_proto_get_name(enum rbug_opcode opcode) +{ + switch(opcode) { + case RBUG_OP_NOOP: + return "RBUG_OP_NOOP"; + case RBUG_OP_PING: + return "RBUG_OP_PING"; + case RBUG_OP_ERROR: + return "RBUG_OP_ERROR"; + case RBUG_OP_PING_REPLY: + return "RBUG_OP_PING_REPLY"; + case RBUG_OP_ERROR_REPLY: + return "RBUG_OP_ERROR_REPLY"; + case RBUG_OP_TEXTURE_LIST: + return "RBUG_OP_TEXTURE_LIST"; + case RBUG_OP_TEXTURE_INFO: + return "RBUG_OP_TEXTURE_INFO"; + case RBUG_OP_TEXTURE_WRITE: + return "RBUG_OP_TEXTURE_WRITE"; + case RBUG_OP_TEXTURE_READ: + return "RBUG_OP_TEXTURE_READ"; + case RBUG_OP_TEXTURE_LIST_REPLY: + return "RBUG_OP_TEXTURE_LIST_REPLY"; + case RBUG_OP_TEXTURE_INFO_REPLY: + return "RBUG_OP_TEXTURE_INFO_REPLY"; + case RBUG_OP_TEXTURE_READ_REPLY: + return "RBUG_OP_TEXTURE_READ_REPLY"; + case RBUG_OP_CONTEXT_LIST: + return "RBUG_OP_CONTEXT_LIST"; + case RBUG_OP_CONTEXT_INFO: + return "RBUG_OP_CONTEXT_INFO"; + case RBUG_OP_CONTEXT_DRAW_BLOCK: + return "RBUG_OP_CONTEXT_DRAW_BLOCK"; + case RBUG_OP_CONTEXT_DRAW_STEP: + return "RBUG_OP_CONTEXT_DRAW_STEP"; + case RBUG_OP_CONTEXT_DRAW_UNBLOCK: + return "RBUG_OP_CONTEXT_DRAW_UNBLOCK"; + case RBUG_OP_CONTEXT_DRAW_RULE: + return "RBUG_OP_CONTEXT_DRAW_RULE"; + case RBUG_OP_CONTEXT_FLUSH: + return "RBUG_OP_CONTEXT_FLUSH"; + case RBUG_OP_CONTEXT_LIST_REPLY: + return "RBUG_OP_CONTEXT_LIST_REPLY"; + case RBUG_OP_CONTEXT_INFO_REPLY: + return "RBUG_OP_CONTEXT_INFO_REPLY"; + case RBUG_OP_CONTEXT_DRAW_BLOCKED: + return "RBUG_OP_CONTEXT_DRAW_BLOCKED"; + case RBUG_OP_SHADER_LIST: + return "RBUG_OP_SHADER_LIST"; + case RBUG_OP_SHADER_INFO: + return "RBUG_OP_SHADER_INFO"; + case RBUG_OP_SHADER_DISABLE: + return "RBUG_OP_SHADER_DISABLE"; + case RBUG_OP_SHADER_REPLACE: + return "RBUG_OP_SHADER_REPLACE"; + case RBUG_OP_SHADER_LIST_REPLY: + return "RBUG_OP_SHADER_LIST_REPLY"; + case RBUG_OP_SHADER_INFO_REPLY: + return "RBUG_OP_SHADER_INFO_REPLY"; + default: + return NULL; + } +} diff --git a/src/gallium/auxiliary/rbug/rbug_proto.h b/src/gallium/auxiliary/rbug/rbug_proto.h index 4f3eb75dc4d..2fce725bc9e 100644 --- a/src/gallium/auxiliary/rbug/rbug_proto.h +++ b/src/gallium/auxiliary/rbug/rbug_proto.h @@ -91,4 +91,9 @@ struct rbug_proto_header */ struct rbug_connection; +/** + * Get printable string for opcode. + */ +const char* rbug_proto_get_name(enum rbug_opcode opcode); + #endif diff --git a/src/gallium/auxiliary/rbug/rbug_shader.c b/src/gallium/auxiliary/rbug/rbug_shader.c index fccd2f55efd..1742941cc17 100644 --- a/src/gallium/auxiliary/rbug/rbug_shader.c +++ b/src/gallium/auxiliary/rbug/rbug_shader.c @@ -305,7 +305,7 @@ struct rbug_proto_shader_list * rbug_demarshal_shader_list(struct rbug_proto_hea if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_LIST) + if (header->opcode != (int32_t)RBUG_OP_SHADER_LIST) return NULL; pos = 0; @@ -332,7 +332,7 @@ struct rbug_proto_shader_info * rbug_demarshal_shader_info(struct rbug_proto_hea if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_INFO) + if (header->opcode != (int32_t)RBUG_OP_SHADER_INFO) return NULL; pos = 0; @@ -360,7 +360,7 @@ struct rbug_proto_shader_disable * rbug_demarshal_shader_disable(struct rbug_pro if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_DISABLE) + if (header->opcode != (int32_t)RBUG_OP_SHADER_DISABLE) return NULL; pos = 0; @@ -389,7 +389,7 @@ struct rbug_proto_shader_replace * rbug_demarshal_shader_replace(struct rbug_pro if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_REPLACE) + if (header->opcode != (int32_t)RBUG_OP_SHADER_REPLACE) return NULL; pos = 0; @@ -418,7 +418,7 @@ struct rbug_proto_shader_list_reply * rbug_demarshal_shader_list_reply(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_LIST_REPLY) + if (header->opcode != (int32_t)RBUG_OP_SHADER_LIST_REPLY) return NULL; pos = 0; @@ -446,7 +446,7 @@ struct rbug_proto_shader_info_reply * rbug_demarshal_shader_info_reply(struct rb if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_SHADER_INFO_REPLY) + if (header->opcode != (int32_t)RBUG_OP_SHADER_INFO_REPLY) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rbug/rbug_texture.c b/src/gallium/auxiliary/rbug/rbug_texture.c index 5a918fe6bc0..2ad577915e8 100644 --- a/src/gallium/auxiliary/rbug/rbug_texture.c +++ b/src/gallium/auxiliary/rbug/rbug_texture.c @@ -417,7 +417,7 @@ struct rbug_proto_texture_list * rbug_demarshal_texture_list(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_LIST) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_LIST) return NULL; pos = 0; @@ -443,7 +443,7 @@ struct rbug_proto_texture_info * rbug_demarshal_texture_info(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_INFO) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_INFO) return NULL; pos = 0; @@ -470,7 +470,7 @@ struct rbug_proto_texture_write * rbug_demarshal_texture_write(struct rbug_proto if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_WRITE) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_WRITE) return NULL; pos = 0; @@ -506,7 +506,7 @@ struct rbug_proto_texture_read * rbug_demarshal_texture_read(struct rbug_proto_h if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_READ) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_READ) return NULL; pos = 0; @@ -540,7 +540,7 @@ struct rbug_proto_texture_list_reply * rbug_demarshal_texture_list_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_LIST_REPLY) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_LIST_REPLY) return NULL; pos = 0; @@ -568,7 +568,7 @@ struct rbug_proto_texture_info_reply * rbug_demarshal_texture_info_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_INFO_REPLY) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_INFO_REPLY) return NULL; pos = 0; @@ -606,7 +606,7 @@ struct rbug_proto_texture_read_reply * rbug_demarshal_texture_read_reply(struct if (!header) return NULL; - if (header->opcode != (int16_t)RBUG_OP_TEXTURE_READ_REPLY) + if (header->opcode != (int32_t)RBUG_OP_TEXTURE_READ_REPLY) return NULL; pos = 0; diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c index 2e15751e508..0461c815504 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c +++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c @@ -30,7 +30,7 @@ #include "rtasm_cpu.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) static boolean rtasm_sse_enabled(void) { static boolean firsttime = 1; @@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void) int rtasm_cpu_has_sse(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; @@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void) int rtasm_cpu_has_sse2(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c index 65d5ce795be..fbde1d191a4 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c +++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c @@ -58,7 +58,6 @@ #include <unistd.h> #include <sys/mman.h> -#include "os/os_thread.h" #include "util/u_mm.h" #define EXEC_HEAP_SIZE (10*1024*1024) diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 7595214bdf2..75b0f6a68ea 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -22,8 +22,9 @@ **************************************************************************/ #include "pipe/p_config.h" +#include "util/u_cpu_detect.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "pipe/p_compiler.h" #include "util/u_debug.h" @@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p, assert(reg.mod == mod_REG); + /* TODO: support extended x86-64 registers */ + assert(reg.idx < 8); + assert(regmem.idx < 8); + val |= regmem.mod << 6; /* mod field */ val |= reg.idx << 3; /* reg field */ val |= regmem.idx; /* r/m field */ @@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p ) */ +void x64_rexw(struct x86_function *p) +{ + if(x86_target(p) != X86_32) + emit_1ub(p, 0x48); +} + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ) @@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) emit_1i(p, imm); } +void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + x86_mov_reg_imm(p, dst, imm); + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_1i(p, imm); + } +} + +void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm ) +{ + DUMP_RI( dst, imm ); + emit_1ub(p, 0x66); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb8 + dst.idx); + emit_2ub(p, imm & 0xff, imm >> 8); + } + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_2ub(p, imm & 0xff, imm >> 8); + } +} + +void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb0 + dst.idx); + emit_1ub(p, imm); + } + else + { + emit_1ub(p, 0xc6); + emit_modrm_noreg(p, 0, dst); + emit_1ub(p, imm); + } +} + /** * Immediate group 1 instructions. */ @@ -520,7 +577,7 @@ void x86_push( struct x86_function *p, } - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } void x86_push_imm32( struct x86_function *p, @@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p, emit_1ub(p, 0x68); emit_1i(p, imm32); - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } @@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p, DUMP_R( reg ); assert(reg.mod == mod_REG); emit_1ub(p, 0x58 + reg.idx); - p->stack_offset -= 4; + p->stack_offset -= sizeof(void*); } void x86_inc( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x40 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x40 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 0, reg); } void x86_dec( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x48 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x48 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 1, reg); } void x86_ret( struct x86_function *p ) @@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p, struct x86_reg src ) { DUMP_RR( dst, src ); + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + uint8_t rex = 0x40; + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + emit_1ub(p, rex); + } + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov16( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x66); emit_op_modrm( p, 0x8b, 0x89, dst, src ); } +void x86_mov8( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x8a, 0x88, dst, src ); +} + +void x64_mov64( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + uint8_t rex = 0x48; + DUMP_RR( dst, src ); + assert(x86_target(p) != X86_32); + + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + } + emit_1ub(p, rex); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb6); + emit_modrm(p, dst, src); +} + +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb7); + emit_modrm(p, dst, src); +} + void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -680,6 +820,61 @@ void x86_div( struct x86_function *p, emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src); } +void x86_bswap( struct x86_function *p, struct x86_reg reg ) +{ + DUMP_R(reg); + assert(reg.file == file_REG32); + assert(reg.mod == mod_REG); + emit_2ub(p, 0x0f, 0xc8 + reg.idx); +} + +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 5, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 5, reg); + emit_1ub(p, imm); + } +} + +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 7, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 7, reg); + emit_1ub(p, imm); + } +} + +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 4, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 4, reg); + emit_1ub(p, imm); + } +} /*********************************************************************** @@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p, * SSE2 instructions */ +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + if(dst.mod == mod_REG && dst.file == file_REG32) + { + emit_1ub(p, 0x7e); + emit_modrm(p, src, dst); + } + else + { + emit_op_modrm(p, 0x6e, 0x7e, dst, src); + } +} + +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + switch (dst.mod) { + case mod_REG: + emit_3ub(p, 0xf3, 0x0f, 0x7e); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_3ub(p, 0x66, 0x0f, 0xd6); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf3, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf2, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x28, 0x29, dst, src); +} + /** * Perform a reduced swizzle: */ @@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p, emit_1ub(p, shuf); } +void sse2_pshuflw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf2, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_pshufhw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf3, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_cvtsd2ss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xf2, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + +void sse2_cvtpd2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x61); + emit_modrm( p, dst, src ); +} + +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x62); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x6c); + emit_modrm( p, dst, src ); +} + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_3ub(p, 0x66, 0x0f, 0xeb); + emit_modrm(p, dst, src); +} void sse2_rcpps( struct x86_function *p, struct x86_reg dst, @@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p, emit_modrm( p, dst, src ); } -void sse2_movd( struct x86_function *p, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( dst, src ); - emit_2ub(p, 0x66, X86_TWOB); - emit_op_modrm( p, 0x6e, 0x7e, dst, src ); -} - - - - /*********************************************************************** * x87 instructions */ @@ -1702,23 +2087,80 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p ) } -/* Retreive a reference to one of the function arguments, taking into - * account any push/pop activity: - */ struct x86_reg x86_fn_arg( struct x86_function *p, - unsigned arg ) + unsigned arg ) { - return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + switch(x86_target(p)) + { + case X86_64_WIN64_ABI: + /* Microsoft uses a different calling convention than the rest of the world */ + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_CX); + case 2: + return x86_make_reg(file_REG32, reg_DX); + case 3: + return x86_make_reg(file_REG32, reg_R8); + case 4: + return x86_make_reg(file_REG32, reg_R9); + default: + /* Win64 allocates stack slots as if it pushed the first 4 arguments too */ + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + arg * 8); + } + case X86_64_STD_ABI: + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_DI); + case 2: + return x86_make_reg(file_REG32, reg_SI); + case 3: + return x86_make_reg(file_REG32, reg_DX); + case 4: + return x86_make_reg(file_REG32, reg_CX); + case 5: + return x86_make_reg(file_REG32, reg_R8); + case 6: + return x86_make_reg(file_REG32, reg_R9); + default: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + (arg - 6) * 8); /* ??? */ + } + case X86_32: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), p->stack_offset + arg * 4); /* ??? */ + default: + abort(); + } } +static void x86_init_func_common( struct x86_function *p ) +{ + util_cpu_detect(); + p->caps = 0; + if(util_cpu_caps.has_mmx) + p->caps |= X86_MMX; + if(util_cpu_caps.has_mmx2) + p->caps |= X86_MMX2; + if(util_cpu_caps.has_sse) + p->caps |= X86_SSE; + if(util_cpu_caps.has_sse2) + p->caps |= X86_SSE2; + if(util_cpu_caps.has_sse3) + p->caps |= X86_SSE3; + if(util_cpu_caps.has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + DUMP_START(); +} void x86_init_func( struct x86_function *p ) { p->size = 0; p->store = NULL; - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_init_func_size( struct x86_function *p, unsigned code_size ) @@ -1728,8 +2170,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size ) if (p->store == NULL) { p->store = p->error_overflow; } - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_release_func( struct x86_function *p ) @@ -1743,20 +2184,35 @@ void x86_release_func( struct x86_function *p ) } -void (*x86_get_func( struct x86_function *p ))(void) +static INLINE x86_func +voidptr_to_x86_func(void *v) +{ + union { + void *v; + x86_func f; + } u; + assert(sizeof(u.v) == sizeof(u.f)); + u.v = v; + return u.f; +} + + +x86_func x86_get_func( struct x86_function *p ) { DUMP_END(); if (DISASSEM && p->store) debug_printf("disassemble %p %p\n", p->store, p->csr); if (p->store == p->error_overflow) - return (void (*)(void)) NULL; + return voidptr_to_x86_func(NULL); else - return (void (*)(void)) p->store; + return voidptr_to_x86_func(p->store); } #else +void x86sse_dummy( void ); + void x86sse_dummy( void ) { } diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index 319b836ffb1..2b9678b1765 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -24,22 +24,31 @@ #ifndef _RTASM_X86SSE_H_ #define _RTASM_X86SSE_H_ +#include "pipe/p_compiler.h" #include "pipe/p_config.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) /* It is up to the caller to ensure that instructions issued are * suitable for the host cpu. There are no checks made in this module * for mmx/sse/sse2 support on the cpu. */ struct x86_reg { - unsigned file:3; - unsigned idx:3; + unsigned file:2; + unsigned idx:4; unsigned mod:2; /* mod_REG if this is just a register */ int disp:24; /* only +/- 23bits of offset - should be enough... */ }; +#define X86_MMX 1 +#define X86_MMX2 2 +#define X86_SSE 4 +#define X86_SSE2 8 +#define X86_SSE3 0x10 +#define X86_SSE4_1 0x20 + struct x86_function { + unsigned caps; unsigned size; unsigned char *store; unsigned char *csr; @@ -75,7 +84,15 @@ enum x86_reg_name { reg_SP, reg_BP, reg_SI, - reg_DI + reg_DI, + reg_R8, + reg_R9, + reg_R10, + reg_R11, + reg_R12, + reg_R13, + reg_R14, + reg_R15 }; @@ -102,14 +119,42 @@ enum sse_cc { #define cc_Z cc_E #define cc_NZ cc_NE + +/** generic pointer to function */ +typedef void (*x86_func)(void); + + /* Begin/end/retrieve function creation: */ +enum x86_target +{ + X86_32, + X86_64_STD_ABI, + X86_64_WIN64_ABI +}; + +/* make this read a member of x86_function if target != host is desired */ +static INLINE enum x86_target x86_target( struct x86_function* p ) +{ +#ifdef PIPE_ARCH_X86 + return X86_32; +#elif defined(_WIN64) + return X86_64_WIN64_ABI; +#elif defined(PIPE_ARCH_X86_64) + return X86_64_STD_ABI; +#endif +} + +static INLINE unsigned x86_target_caps( struct x86_function* p ) +{ + return p->caps; +} void x86_init_func( struct x86_function *p ); void x86_init_func_size( struct x86_function *p, unsigned code_size ); void x86_release_func( struct x86_function *p ); -void (*x86_get_func( struct x86_function *p ))( void ); +x86_func x86_get_func( struct x86_function *p ); /* Debugging: */ @@ -133,6 +178,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg ); */ int x86_get_label( struct x86_function *p ); +void x64_rexw(struct x86_function *p); + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ); @@ -173,18 +220,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, unsigned char shuf ); +void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); @@ -222,7 +305,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); -void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -232,6 +314,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg ); void x86_inc( struct x86_function *p, struct x86_reg reg ); void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov8( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm ); +void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm ); +void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm ); void x86_mul( struct x86_function *p, struct x86_reg src ); void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -245,7 +335,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_sahf( struct x86_function *p ); void x86_div( struct x86_function *p, struct x86_reg src ); - +void x86_bswap( struct x86_function *p, struct x86_reg src ); +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); void x86_cdecl_caller_push_regs( struct x86_function *p ); void x86_cdecl_caller_pop_regs( struct x86_function *p ); diff --git a/src/gallium/auxiliary/target-helpers/inline_debug_helper.h b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h new file mode 100644 index 00000000000..0433da6141d --- /dev/null +++ b/src/gallium/auxiliary/target-helpers/inline_debug_helper.h @@ -0,0 +1,44 @@ + +#ifndef INLINE_DEBUG_HELPER_H +#define INLINE_DEBUG_HELPER_H + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" + + +/* Helper function to wrap a screen with + * one or more debug driver: rbug, trace. + */ + +#ifdef GALLIUM_TRACE +#include "trace/tr_public.h" +#endif + +#ifdef GALLIUM_RBUG +#include "rbug/rbug_public.h" +#endif + +#ifdef GALLIUM_GALAHAD +#include "galahad/glhd_public.h" +#endif + +static INLINE struct pipe_screen * +debug_screen_wrap(struct pipe_screen *screen) +{ + +#if defined(GALLIUM_RBUG) + screen = rbug_screen_create(screen); +#endif + +#if defined(GALLIUM_TRACE) + screen = trace_screen_create(screen); +#endif + +#if defined(GALLIUM_GALAHAD) + screen = galahad_screen_create(screen); +#endif + + return screen; +} + +#endif diff --git a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h new file mode 100644 index 00000000000..036c1ee48a8 --- /dev/null +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h @@ -0,0 +1,63 @@ + +#ifndef INLINE_SW_HELPER_H +#define INLINE_SW_HELPER_H + +#include "pipe/p_compiler.h" +#include "util/u_debug.h" +#include "state_tracker/sw_winsys.h" + + +/* Helper function to choose and instantiate one of the software rasterizers: + * cell, llvmpipe, softpipe. + */ + +#ifdef GALLIUM_SOFTPIPE +#include "softpipe/sp_public.h" +#endif + +#ifdef GALLIUM_LLVMPIPE +#include "llvmpipe/lp_public.h" +#endif + +#ifdef GALLIUM_CELL +#include "cell/ppu/cell_public.h" +#endif + +static INLINE struct pipe_screen * +sw_screen_create(struct sw_winsys *winsys) +{ + const char *default_driver; + const char *driver; + struct pipe_screen *screen = NULL; + +#if defined(GALLIUM_CELL) + default_driver = "cell"; +#elif defined(GALLIUM_LLVMPIPE) + default_driver = "llvmpipe"; +#elif defined(GALLIUM_SOFTPIPE) + default_driver = "softpipe"; +#else + default_driver = ""; +#endif + + driver = debug_get_option("GALLIUM_DRIVER", default_driver); + +#if defined(GALLIUM_CELL) + if (screen == NULL && strcmp(driver, "cell") == 0) + screen = cell_create_screen(winsys); +#endif + +#if defined(GALLIUM_LLVMPIPE) + if (screen == NULL && strcmp(driver, "llvmpipe") == 0) + screen = llvmpipe_create_screen(winsys); +#endif + +#if defined(GALLIUM_SOFTPIPE) + if (screen == NULL) + screen = softpipe_create_screen(winsys); +#endif + + return screen; +} + +#endif diff --git a/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h new file mode 100644 index 00000000000..0b4e7404034 --- /dev/null +++ b/src/gallium/auxiliary/target-helpers/inline_wrapper_sw_helper.h @@ -0,0 +1,34 @@ + +#ifndef INLINE_WRAPPER_SW_HELPER_H +#define INLINE_WRAPPER_SW_HELPER_H + +#include "target-helpers/inline_sw_helper.h" +#include "sw/wrapper/wrapper_sw_winsys.h" + +/** + * Try to wrap a hw screen with a software screen. + * On failure will return given screen. + */ +static INLINE struct pipe_screen * +sw_screen_wrap(struct pipe_screen *screen) +{ + struct sw_winsys *sws; + struct pipe_screen *sw_screen; + + sws = wrapper_sw_winsys_warp_pipe_screen(screen); + if (!sws) + goto err; + + sw_screen = sw_screen_create(sws); + if (sw_screen == screen) + goto err_winsys; + + return sw_screen; + +err_winsys: + sws->destroy(sws); +err: + return screen; +} + +#endif diff --git a/src/gallium/auxiliary/target-helpers/wrap_screen.c b/src/gallium/auxiliary/target-helpers/wrap_screen.c index eb475123198..df5d56a53c9 100644 --- a/src/gallium/auxiliary/target-helpers/wrap_screen.c +++ b/src/gallium/auxiliary/target-helpers/wrap_screen.c @@ -33,6 +33,7 @@ #include "target-helpers/wrap_screen.h" #include "trace/tr_public.h" +#include "rbug/rbug_public.h" #include "identity/id_public.h" #include "util/u_debug.h" @@ -56,6 +57,9 @@ gallium_wrap_screen( struct pipe_screen *screen ) /* Trace does its own checking if it should run */ screen = trace_screen_create(screen); + /* Rbug does its own checking if it should run */ + screen = rbug_screen_create(screen); + return screen; } diff --git a/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt b/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt deleted file mode 100644 index 5d9eed92580..00000000000 --- a/src/gallium/auxiliary/tgsi/tgsi-instruction-set.txt +++ /dev/null @@ -1,1127 +0,0 @@ -TGSI Instruction Specification -============================== -============================== - - -1 Instruction Set Operations -============================= - - -1.1 GL_NV_vertex_program -------------------------- - - -1.1.1 ARL - Address Register Load - - dst.x = floor(src.x) - dst.y = floor(src.y) - dst.z = floor(src.z) - dst.w = floor(src.w) - - -1.1.2 MOV - Move - - dst.x = src.x - dst.y = src.y - dst.z = src.z - dst.w = src.w - - -1.1.3 LIT - Light Coefficients - - dst.x = 1.0 - dst.y = max(src.x, 0.0) - dst.z = (src.x > 0.0) ? pow(max(src.y, 0.0), clamp(src.w, -128.0, 128.0)) : 0.0 - dst.w = 1.0 - - -1.1.4 RCP - Reciprocal - - dst.x = 1.0 / src.x - dst.y = 1.0 / src.x - dst.z = 1.0 / src.x - dst.w = 1.0 / src.x - - -1.1.5 RSQ - Reciprocal Square Root - - dst.x = 1.0 / sqrt(abs(src.x)) - dst.y = 1.0 / sqrt(abs(src.x)) - dst.z = 1.0 / sqrt(abs(src.x)) - dst.w = 1.0 / sqrt(abs(src.x)) - - -1.1.6 EXP - Approximate Exponential Base 2 - - dst.x = pow(2.0, floor(src.x)) - dst.y = src.x - floor(src.x) - dst.z = pow(2.0, src.x) - dst.w = 1.0 - - -1.1.7 LOG - Approximate Logarithm Base 2 - - dst.x = floor(lg2(abs(src.x))) - dst.y = abs(src.x) / pow(2.0, floor(lg2(abs(src.x)))) - dst.z = lg2(abs(src.x)) - dst.w = 1.0 - - -1.1.8 MUL - Multiply - - dst.x = src0.x * src1.x - dst.y = src0.y * src1.y - dst.z = src0.z * src1.z - dst.w = src0.w * src1.w - - -1.1.9 ADD - Add - - dst.x = src0.x + src1.x - dst.y = src0.y + src1.y - dst.z = src0.z + src1.z - dst.w = src0.w + src1.w - - -1.1.10 DP3 - 3-component Dot Product - - dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z - dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z - dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z - dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z - - -1.1.11 DP4 - 4-component Dot Product - - dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w - dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w - dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w - dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src0.w * src1.w - - -1.1.12 DST - Distance Vector - - dst.x = 1.0 - dst.y = src0.y * src1.y - dst.z = src0.z - dst.w = src1.w - - -1.1.13 MIN - Minimum - - dst.x = min(src0.x, src1.x) - dst.y = min(src0.y, src1.y) - dst.z = min(src0.z, src1.z) - dst.w = min(src0.w, src1.w) - - -1.1.14 MAX - Maximum - - dst.x = max(src0.x, src1.x) - dst.y = max(src0.y, src1.y) - dst.z = max(src0.z, src1.z) - dst.w = max(src0.w, src1.w) - - -1.1.15 SLT - Set On Less Than - - dst.x = (src0.x < src1.x) ? 1.0 : 0.0 - dst.y = (src0.y < src1.y) ? 1.0 : 0.0 - dst.z = (src0.z < src1.z) ? 1.0 : 0.0 - dst.w = (src0.w < src1.w) ? 1.0 : 0.0 - - -1.1.16 SGE - Set On Greater Equal Than - - dst.x = (src0.x >= src1.x) ? 1.0 : 0.0 - dst.y = (src0.y >= src1.y) ? 1.0 : 0.0 - dst.z = (src0.z >= src1.z) ? 1.0 : 0.0 - dst.w = (src0.w >= src1.w) ? 1.0 : 0.0 - - -1.1.17 MAD - Multiply And Add - - dst.x = src0.x * src1.x + src2.x - dst.y = src0.y * src1.y + src2.y - dst.z = src0.z * src1.z + src2.z - dst.w = src0.w * src1.w + src2.w - - -1.2 GL_ATI_fragment_shader ---------------------------- - - -1.2.1 SUB - Subtract - - dst.x = src0.x - src1.x - dst.y = src0.y - src1.y - dst.z = src0.z - src1.z - dst.w = src0.w - src1.w - - -1.2.2 DOT3 - 3-component Dot Product - - Alias for DP3. - - -1.2.3 DOT4 - 4-component Dot Product - - Alias for DP4. - - -1.2.4 LERP - Linear Interpolate - - dst.x = src0.x * (src1.x - src2.x) + src2.x - dst.y = src0.y * (src1.y - src2.y) + src2.y - dst.z = src0.z * (src1.z - src2.z) + src2.z - dst.w = src0.w * (src1.w - src2.w) + src2.w - - -1.2.5 CND - Condition - - dst.x = (src2.x > 0.5) ? src0.x : src1.x - dst.y = (src2.y > 0.5) ? src0.y : src1.y - dst.z = (src2.z > 0.5) ? src0.z : src1.z - dst.w = (src2.w > 0.5) ? src0.w : src1.w - - -1.2.6 CND0 - Condition Zero - - Removed. Use (CMP src2, src1, src0) instead. - -1.2.7 DOT2ADD - 2-component Dot Product And Add - - dst.x = src0.x * src1.x + src0.y * src1.y + src2.x - dst.y = src0.x * src1.x + src0.y * src1.y + src2.x - dst.z = src0.x * src1.x + src0.y * src1.y + src2.x - dst.w = src0.x * src1.x + src0.y * src1.y + src2.x - - -1.3 GL_EXT_vertex_shader -------------------------- - - -1.3.1 INDEX - Array Lookup - - Considered for removal from language. - - -1.3.2 NEGATE - Negate - - Considered for removal from language. - - -1.3.3 MADD - Multiply And Add - - Alias for MAD. - - -1.3.4 FRAC - Fraction - - dst.x = src.x - floor(src.x) - dst.y = src.y - floor(src.y) - dst.z = src.z - floor(src.z) - dst.w = src.w - floor(src.w) - - -1.3.5 SETGE - Set On Greater Equal - - Alias for SGE. - - -1.3.6 SETLT - Set On Less Than - - Alias for SLT. - - -1.3.7 CLAMP - Clamp - - dst.x = clamp(src0.x, src1.x, src2.x) - dst.y = clamp(src0.y, src1.y, src2.y) - dst.z = clamp(src0.z, src1.z, src2.z) - dst.w = clamp(src0.w, src1.w, src2.w) - - -1.3.8 FLOOR - Floor - - dst.x = floor(src.x) - dst.y = floor(src.y) - dst.z = floor(src.z) - dst.w = floor(src.w) - - -1.3.9 ROUND - Round - - dst.x = round(src.x) - dst.y = round(src.y) - dst.z = round(src.z) - dst.w = round(src.w) - - -1.3.10 EXPBASE2 - Exponential Base 2 - - dst.x = pow(2.0, src.x) - dst.y = pow(2.0, src.x) - dst.z = pow(2.0, src.x) - dst.w = pow(2.0, src.x) - - -1.3.11 LOGBASE2 - Logarithm Base 2 - - dst.x = lg2(src.x) - dst.y = lg2(src.x) - dst.z = lg2(src.x) - dst.w = lg2(src.x) - - -1.3.12 POWER - Power - - dst.x = pow(src0.x, src1.x) - dst.y = pow(src0.x, src1.x) - dst.z = pow(src0.x, src1.x) - dst.w = pow(src0.x, src1.x) - - -1.3.13 RECIP - Reciprocal - - Alias for RCP. - - -1.3.14 RECIPSQRT - Reciprocal Square Root - - Alias for RSQ. - - -1.3.15 CROSSPRODUCT - Cross Product - - dst.x = src0.y * src1.z - src1.y * src0.z - dst.y = src0.z * src1.x - src1.z * src0.x - dst.z = src0.x * src1.y - src1.x * src0.y - dst.w = 1.0 - - -1.3.16 MULTIPLYMATRIX - Multiply Matrix - - Considered for removal from language. - - -1.4 GL_NV_vertex_program1_1 ----------------------------- - - -1.4.1 ABS - Absolute - - dst.x = abs(src.x) - dst.y = abs(src.y) - dst.z = abs(src.z) - dst.w = abs(src.w) - - -1.4.2 RCC - Reciprocal Clamped - - dst.x = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020) - dst.y = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020) - dst.z = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020) - dst.w = (1.0 / src.x) > 0.0 ? clamp(1.0 / src.x, 5.42101e-020, 1.884467e+019) : clamp(1.0 / src.x, -1.884467e+019, -5.42101e-020) - - -1.4.3 DPH - Homogeneous Dot Product - - dst.x = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w - dst.y = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w - dst.z = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w - dst.w = src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w - - -1.5 GL_NV_fragment_program ---------------------------- - - -1.5.1 COS - Cosine - - dst.x = cos(src.x) - dst.y = cos(src.x) - dst.z = cos(src.x) - dst.w = cos(src.w) - - -1.5.2 DDX - Derivative Relative To X - - dst.x = partialx(src.x) - dst.y = partialx(src.y) - dst.z = partialx(src.z) - dst.w = partialx(src.w) - - -1.5.3 DDY - Derivative Relative To Y - - dst.x = partialy(src.x) - dst.y = partialy(src.y) - dst.z = partialy(src.z) - dst.w = partialy(src.w) - - -1.5.4 EX2 - Exponential Base 2 - - Alias for EXPBASE2. - - -1.5.5 FLR - Floor - - Alias for FLOOR. - - -1.5.6 FRC - Fraction - - Alias for FRAC. - - -1.5.7 KILP - Predicated Discard - - discard - - -1.5.8 LG2 - Logarithm Base 2 - - Alias for LOGBASE2. - - -1.5.9 LRP - Linear Interpolate - - Alias for LERP. - - -1.5.10 PK2H - Pack Two 16-bit Floats - - TBD - - -1.5.11 PK2US - Pack Two Unsigned 16-bit Scalars - - TBD - - -1.5.12 PK4B - Pack Four Signed 8-bit Scalars - - TBD - - -1.5.13 PK4UB - Pack Four Unsigned 8-bit Scalars - - TBD - - -1.5.14 POW - Power - - Alias for POWER. - - -1.5.15 RFL - Reflection Vector - - dst.x = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.x - src1.x - dst.y = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.y - src1.y - dst.z = 2.0 * (src0.x * src1.x + src0.y * src1.y + src0.z * src1.z) / (src0.x * src0.x + src0.y * src0.y + src0.z * src0.z) * src0.z - src1.z - dst.w = 1.0 - - -1.5.16 SEQ - Set On Equal - - dst.x = (src0.x == src1.x) ? 1.0 : 0.0 - dst.y = (src0.y == src1.y) ? 1.0 : 0.0 - dst.z = (src0.z == src1.z) ? 1.0 : 0.0 - dst.w = (src0.w == src1.w) ? 1.0 : 0.0 - - -1.5.17 SFL - Set On False - - dst.x = 0.0 - dst.y = 0.0 - dst.z = 0.0 - dst.w = 0.0 - - -1.5.18 SGT - Set On Greater Than - - dst.x = (src0.x > src1.x) ? 1.0 : 0.0 - dst.y = (src0.y > src1.y) ? 1.0 : 0.0 - dst.z = (src0.z > src1.z) ? 1.0 : 0.0 - dst.w = (src0.w > src1.w) ? 1.0 : 0.0 - - -1.5.19 SIN - Sine - - dst.x = sin(src.x) - dst.y = sin(src.x) - dst.z = sin(src.x) - dst.w = sin(src.w) - - -1.5.20 SLE - Set On Less Equal Than - - dst.x = (src0.x <= src1.x) ? 1.0 : 0.0 - dst.y = (src0.y <= src1.y) ? 1.0 : 0.0 - dst.z = (src0.z <= src1.z) ? 1.0 : 0.0 - dst.w = (src0.w <= src1.w) ? 1.0 : 0.0 - - -1.5.21 SNE - Set On Not Equal - - dst.x = (src0.x != src1.x) ? 1.0 : 0.0 - dst.y = (src0.y != src1.y) ? 1.0 : 0.0 - dst.z = (src0.z != src1.z) ? 1.0 : 0.0 - dst.w = (src0.w != src1.w) ? 1.0 : 0.0 - - -1.5.22 STR - Set On True - - dst.x = 1.0 - dst.y = 1.0 - dst.z = 1.0 - dst.w = 1.0 - - -1.5.23 TEX - Texture Lookup - - TBD - - -1.5.24 TXD - Texture Lookup with Derivatives - - TBD - - -1.5.25 TXP - Projective Texture Lookup - - TBD - - -1.5.26 UP2H - Unpack Two 16-Bit Floats - - TBD - - -1.5.27 UP2US - Unpack Two Unsigned 16-Bit Scalars - - TBD - - -1.5.28 UP4B - Unpack Four Signed 8-Bit Values - - TBD - - -1.5.29 UP4UB - Unpack Four Unsigned 8-Bit Scalars - - TBD - - -1.5.30 X2D - 2D Coordinate Transformation - - dst.x = src0.x + src1.x * src2.x + src1.y * src2.y - dst.y = src0.y + src1.x * src2.z + src1.y * src2.w - dst.z = src0.x + src1.x * src2.x + src1.y * src2.y - dst.w = src0.y + src1.x * src2.z + src1.y * src2.w - - -1.6 GL_NV_vertex_program2 --------------------------- - - -1.6.1 ARA - Address Register Add - - TBD - - -1.6.2 ARR - Address Register Load With Round - - dst.x = round(src.x) - dst.y = round(src.y) - dst.z = round(src.z) - dst.w = round(src.w) - - -1.6.3 BRA - Branch - - pc = target - - -1.6.4 CAL - Subroutine Call - - push(pc) - pc = target - - -1.6.5 RET - Subroutine Call Return - - pc = pop() - - -1.6.6 SSG - Set Sign - - dst.x = (src.x > 0.0) ? 1.0 : (src.x < 0.0) ? -1.0 : 0.0 - dst.y = (src.y > 0.0) ? 1.0 : (src.y < 0.0) ? -1.0 : 0.0 - dst.z = (src.z > 0.0) ? 1.0 : (src.z < 0.0) ? -1.0 : 0.0 - dst.w = (src.w > 0.0) ? 1.0 : (src.w < 0.0) ? -1.0 : 0.0 - - -1.7 GL_ARB_vertex_program --------------------------- - - -1.7.1 SWZ - Extended Swizzle - - dst.x = src.x - dst.y = src.y - dst.z = src.z - dst.w = src.w - - -1.7.2 XPD - Cross Product - - Alias for CROSSPRODUCT. - - -1.8 GL_ARB_fragment_program ----------------------------- - - -1.8.1 CMP - Compare - - dst.x = (src0.x < 0.0) ? src1.x : src2.x - dst.y = (src0.y < 0.0) ? src1.y : src2.y - dst.z = (src0.z < 0.0) ? src1.z : src2.z - dst.w = (src0.w < 0.0) ? src1.w : src2.w - - -1.8.2 KIL - Conditional Discard - - if (src.x < 0.0 || src.y < 0.0 || src.z < 0.0 || src.w < 0.0) - discard - endif - - -1.8.3 SCS - Sine Cosine - - dst.x = cos(src.x) - dst.y = sin(src.x) - dst.z = 0.0 - dst.y = 1.0 - - -1.8.4 TXB - Texture Lookup With Bias - - TBD - - -1.9 GL_NV_fragment_program2 ----------------------------- - - -1.9.1 NRM - 3-component Vector Normalise - - dst.x = src.x / (src.x * src.x + src.y * src.y + src.z * src.z) - dst.y = src.y / (src.x * src.x + src.y * src.y + src.z * src.z) - dst.z = src.z / (src.x * src.x + src.y * src.y + src.z * src.z) - dst.w = 1.0 - - -1.9.2 DIV - Divide - - dst.x = src0.x / src1.x - dst.y = src0.y / src1.y - dst.z = src0.z / src1.z - dst.w = src0.w / src1.w - - -1.9.3 DP2 - 2-component Dot Product - - dst.x = src0.x * src1.x + src0.y * src1.y - dst.y = src0.x * src1.x + src0.y * src1.y - dst.z = src0.x * src1.x + src0.y * src1.y - dst.w = src0.x * src1.x + src0.y * src1.y - - -1.9.4 DP2A - 2-component Dot Product And Add - - Alias for DOT2ADD. - - -1.9.5 TXL - Texture Lookup With LOD - - TBD - - -1.9.6 BRK - Break - - TBD - - -1.9.7 IF - If - - TBD - - -1.9.10 ELSE - Else - - TBD - - -1.9.11 ENDIF - End If - - TBD - - -1.10 GL_NV_vertex_program3 ---------------------------- - - -1.10.1 PUSHA - Push Address Register On Stack - - push(src.x) - push(src.y) - push(src.z) - push(src.w) - - -1.10.2 POPA - Pop Address Register From Stack - - dst.w = pop() - dst.z = pop() - dst.y = pop() - dst.x = pop() - - -1.11 GL_NV_gpu_program4 ------------------------- - - -1.11.1 CEIL - Ceiling - - dst.x = ceil(src.x) - dst.y = ceil(src.y) - dst.z = ceil(src.z) - dst.w = ceil(src.w) - - -1.11.2 I2F - Integer To Float - - dst.x = (float) src.x - dst.y = (float) src.y - dst.z = (float) src.z - dst.w = (float) src.w - - -1.11.3 NOT - Bitwise Not - - dst.x = ~src.x - dst.y = ~src.y - dst.z = ~src.z - dst.w = ~src.w - - -1.11.4 TRUNC - Truncate - - dst.x = trunc(src.x) - dst.y = trunc(src.y) - dst.z = trunc(src.z) - dst.w = trunc(src.w) - - -1.11.5 SHL - Shift Left - - dst.x = src0.x << src1.x - dst.y = src0.y << src1.x - dst.z = src0.z << src1.x - dst.w = src0.w << src1.x - - -1.11.6 SHR - Shift Right - - dst.x = src0.x >> src1.x - dst.y = src0.y >> src1.x - dst.z = src0.z >> src1.x - dst.w = src0.w >> src1.x - - -1.11.7 AND - Bitwise And - - dst.x = src0.x & src1.x - dst.y = src0.y & src1.y - dst.z = src0.z & src1.z - dst.w = src0.w & src1.w - - -1.11.8 OR - Bitwise Or - - dst.x = src0.x | src1.x - dst.y = src0.y | src1.y - dst.z = src0.z | src1.z - dst.w = src0.w | src1.w - - -1.11.9 MOD - Modulus - - dst.x = src0.x % src1.x - dst.y = src0.y % src1.y - dst.z = src0.z % src1.z - dst.w = src0.w % src1.w - - -1.11.10 XOR - Bitwise Xor - - dst.x = src0.x ^ src1.x - dst.y = src0.y ^ src1.y - dst.z = src0.z ^ src1.z - dst.w = src0.w ^ src1.w - - -1.11.11 SAD - Sum Of Absolute Differences - - dst.x = abs(src0.x - src1.x) + src2.x - dst.y = abs(src0.y - src1.y) + src2.y - dst.z = abs(src0.z - src1.z) + src2.z - dst.w = abs(src0.w - src1.w) + src2.w - - -1.11.12 TXF - Texel Fetch - - TBD - - -1.11.13 TXQ - Texture Size Query - - TBD - - -1.11.14 CONT - Continue - - TBD - - -1.12 GL_NV_geometry_program4 ------------------------------ - - -1.12.1 EMIT - Emit - - TBD - - -1.12.2 ENDPRIM - End Primitive - - TBD - - -1.13 GLSL ----------- - - -1.13.1 BGNLOOP - Begin a Loop - - TBD - - -1.13.2 BGNSUB - Begin Subroutine - - TBD - - -1.13.3 ENDLOOP - End a Loop - - TBD - - -1.13.4 ENDSUB - End Subroutine - - TBD - - -1.13.5 INT - Truncate - - Alias for TRUNC. - - -1.13.6 NOISE1 - 1D Noise - - TBD - - -1.13.7 NOISE2 - 2D Noise - - TBD - - -1.13.8 NOISE3 - 3D Noise - - TBD - - -1.13.9 NOISE4 - 4D Noise - - TBD - - -1.13.10 NOP - No Operation - - Do nothing. - - -1.14 ps_1_1 ------------- - - -1.14.1 TEXKILL - Conditional Discard - - Alias for KIL. - - -1.15 ps_1_4 ------------- - - -1.15.1 TEXLD - Texture Lookup - - Alias for TEX. - - -1.16 ps_2_0 ------------- - - -1.16.1 M4X4 - Multiply Matrix - - Alias for MULTIPLYMATRIX. - - -1.16.2 M4X3 - Multiply Matrix - - Considered for removal from language. - - -1.16.3 M3X4 - Multiply Matrix - - Considered for removal from language. - - -1.16.4 M3X3 - Multiply Matrix - - Considered for removal from language. - - -1.16.5 M3X2 - Multiply Matrix - - Considered for removal from language. - - -1.16.6 CRS - Cross Product - - Alias for XPD. - - -1.16.7 NRM4 - 4-component Vector Normalise - - dst.x = src.x / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w) - dst.y = src.y / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w) - dst.z = src.z / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w) - dst.w = src.w / (src.x * src.x + src.y * src.y + src.z * src.z + src.w * src.w) - - -1.16.8 SINCOS - Sine Cosine - - Alias for SCS. - - -1.16.9 TEXLDB - Texture Lookup With Bias - - Alias for TXB. - - -1.16.10 DP2ADD - 2-component Dot Product And Add - - Alias for DP2A. - - -1.17 ps_2_x ------------- - - -1.17.1 CALL - Subroutine Call - - Alias for CAL. - - -1.17.2 CALLNZ - Subroutine Call If Not Zero - - TBD - - -1.17.3 IFC - If - - TBD - - -1.17.4 BREAK - Break - - Alias for BRK. - - -1.17.5 BREAKC - Break Conditional - - TBD - - -1.17.6 DSX - Derivative Relative To X - - Alias for DDX. - - -1.17.7 DSY - Derivative Relative To Y - - Alias for DDY. - - -1.17.8 TEXLDD - Texture Lookup with Derivatives - - Alias for TXD. - - -1.18 vs_1_1 ------------- - - -1.18.1 EXPP - Approximate Exponential Base 2 - - Use EXP. See also 1.19.3. - - -1.18.2 LOGP - Logarithm Base 2 - - Use LOG. See also 1.19.4. - - -1.19 vs_2_0 ------------- - - -1.19.1 SGN - Set Sign - - Alias for SSG. - - -1.19.2 MOVA - Move Address Register - - Alias for ARR. - - -1.19.3 EXPP - Approximate Exponential Base 2 - - Use EX2. - - -1.19.4 LOGP - Logarithm Base 2 - - Use LG2. - - -2 Explanation of symbols used -============================== - - -2.1 Functions --------------- - - - abs(x) Absolute value of x. - |x| - (x < 0.0) ? -x : x - - ceil(x) Ceiling of x. - - clamp(x,y,z) Clamp x between y and z. - (x < y) ? y : (x > z) ? z : x - - cos(x) Cosine of x. - - floor(x) Floor of x. - - lg2(x) Logarithm base 2 of x. - - max(x,y) Maximum of x and y. - (x > y) ? x : y - - min(x,y) Minimum of x and y. - (x < y) ? x : y - - partialx(x) Derivative of x relative to fragment's X. - - partialy(x) Derivative of x relative to fragment's Y. - - pop() Pop from stack. - - pow(x,y) Raise x to power of y. - - push(x) Push x on stack. - - round(x) Round x. - - sin(x) Sine of x. - - sqrt(x) Square root of x. - - trunc(x) Truncate x. - - -2.2 Keywords -------------- - - - discard Discard fragment. - - dst First destination register. - - dst0 First destination register. - - pc Program counter. - - src First source register. - - src0 First source register. - - src1 Second source register. - - src2 Third source register. - - target Label of target instruction. - - -3 Other tokens -=============== - - -3.1 Declaration Semantic -------------------------- - - - Follows Declaration token if Semantic bit is set. - - Since its purpose is to link a shader with other stages of the pipeline, - it is valid to follow only those Declaration tokens that declare a register - either in INPUT or OUTPUT file. - - SemanticName field contains the semantic name of the register being declared. - There is no default value. - - SemanticIndex is an optional subscript that can be used to distinguish - different register declarations with the same semantic name. The default value - is 0. - - The meanings of the individual semantic names are explained in the following - sections. - - -3.1.1 FACE - - Valid only in a fragment shader INPUT declaration. - - FACE.x is negative when the primitive is back facing. FACE.x is positive - when the primitive is front facing. diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index 0890078cd05..6dbedf15ca8 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -164,6 +164,7 @@ tgsi_default_full_declaration( void ) full_declaration.Declaration = tgsi_default_declaration(); full_declaration.Range = tgsi_default_declaration_range(); full_declaration.Semantic = tgsi_default_declaration_semantic(); + full_declaration.ImmediateData.u = NULL; return full_declaration; } @@ -180,7 +181,7 @@ tgsi_build_full_declaration( struct tgsi_declaration_range *dr; if( maxsize <= size ) - return 0; + return 0; declaration = (struct tgsi_declaration *) &tokens[size]; size++; @@ -235,6 +236,24 @@ tgsi_build_full_declaration( header ); } + if (full_decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) { + unsigned i, j; + union tgsi_immediate_data *data; + + for (i = 0; i <= dr->Last; ++i) { + for (j = 0; j < 4; ++j) { + unsigned idx = i*4 + j; + if (maxsize <= size) + return 0; + data = (union tgsi_immediate_data *) &tokens[size]; + ++size; + + *data = full_decl->ImmediateData.u[idx]; + declaration_grow( declaration, header ); + } + } + } + return size; } @@ -613,6 +632,7 @@ tgsi_build_full_instruction( reg->Register.File, reg->Register.WriteMask, reg->Register.Indirect, + reg->Register.Dimension, reg->Register.Index, instruction, header ); @@ -640,6 +660,46 @@ tgsi_build_full_instruction( instruction, header ); } + + if( reg->Register.Dimension ) { + struct tgsi_dimension *dim; + + assert( !reg->Dimension.Dimension ); + + if( maxsize <= size ) + return 0; + dim = (struct tgsi_dimension *) &tokens[size]; + size++; + + *dim = tgsi_build_dimension( + reg->Dimension.Indirect, + reg->Dimension.Index, + instruction, + header ); + + if( reg->Dimension.Indirect ) { + struct tgsi_src_register *ind; + + if( maxsize <= size ) + return 0; + ind = (struct tgsi_src_register *) &tokens[size]; + size++; + + *ind = tgsi_build_src_register( + reg->DimIndirect.File, + reg->DimIndirect.SwizzleX, + reg->DimIndirect.SwizzleY, + reg->DimIndirect.SwizzleZ, + reg->DimIndirect.SwizzleW, + reg->DimIndirect.Negate, + reg->DimIndirect.Absolute, + reg->DimIndirect.Indirect, + reg->DimIndirect.Dimension, + reg->DimIndirect.Index, + instruction, + header ); + } + } } for( i = 0; i < full_inst->Instruction.NumSrcRegs; i++ ) { @@ -959,6 +1019,7 @@ tgsi_build_dst_register( unsigned file, unsigned mask, unsigned indirect, + unsigned dimension, int index, struct tgsi_instruction *instruction, struct tgsi_header *header ) @@ -974,6 +1035,7 @@ tgsi_build_dst_register( dst_register.WriteMask = mask; dst_register.Index = index; dst_register.Indirect = indirect; + dst_register.Dimension = dimension; instruction_grow( instruction, header ); @@ -987,6 +1049,8 @@ tgsi_default_full_dst_register( void ) full_dst_register.Register = tgsi_default_dst_register(); full_dst_register.Indirect = tgsi_default_src_register(); + full_dst_register.Dimension = tgsi_default_dimension(); + full_dst_register.DimIndirect = tgsi_default_src_register(); return full_dst_register; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h index 13d7f5272d6..112107a0881 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.h +++ b/src/gallium/auxiliary/tgsi/tgsi_build.h @@ -263,6 +263,7 @@ tgsi_build_dst_register( unsigned file, unsigned mask, unsigned indirect, + unsigned dimension, int index, struct tgsi_instruction *instruction, struct tgsi_header *header ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index 83000200189..f71ffb70308 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -56,7 +56,7 @@ dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...) va_list ap; (void)ctx; va_start(ap, format); - debug_vprintf(format, ap); + _debug_vprintf(format, ap); va_end(ap); } @@ -100,9 +100,10 @@ static const char *file_names[TGSI_FILE_COUNT] = "SAMP", "ADDR", "IMM", - "LOOP", "PRED", - "SV" + "SV", + "IMMX", + "TEMPX" }; static const char *interpolate_names[] = @@ -175,7 +176,11 @@ static const char *primitive_names[] = "TRIANGLE_FAN", "QUADS", "QUAD_STRIP", - "POLYGON" + "POLYGON", + "LINES_ADJACENCY", + "LINE_STRIP_ADJACENCY", + "TRIANGLES_ADJACENCY", + "TRIANGLE_STRIP_ADJACENCY" }; static const char *fs_coord_origin_names[] = @@ -192,29 +197,30 @@ static const char *fs_coord_pixel_center_names[] = static void -_dump_register_dst( - struct dump_ctx *ctx, - uint file, - int index) -{ - ENM( file, file_names ); - - CHR( '[' ); - SID( index ); - CHR( ']' ); -} - - -static void _dump_register_src( struct dump_ctx *ctx, const struct tgsi_full_src_register *src ) { ENM(src->Register.File, file_names); if (src->Register.Dimension) { - CHR('['); - SID(src->Dimension.Index); - CHR(']'); + if (src->Dimension.Indirect) { + CHR( '[' ); + ENM( src->DimIndirect.File, file_names ); + CHR( '[' ); + SID( src->DimIndirect.Index ); + TXT( "]." ); + ENM( src->DimIndirect.SwizzleX, swizzle_names ); + if (src->Dimension.Index != 0) { + if (src->Dimension.Index > 0) + CHR( '+' ); + SID( src->Dimension.Index ); + } + CHR( ']' ); + } else { + CHR('['); + SID(src->Dimension.Index); + CHR(']'); + } } if (src->Register.Indirect) { CHR( '[' ); @@ -236,30 +242,52 @@ _dump_register_src( } } + static void -_dump_register_ind( +_dump_register_dst( struct dump_ctx *ctx, - uint file, - int index, - uint ind_file, - int ind_index, - uint ind_swizzle ) + const struct tgsi_full_dst_register *dst ) { - ENM( file, file_names ); - CHR( '[' ); - ENM( ind_file, file_names ); - CHR( '[' ); - SID( ind_index ); - TXT( "]." ); - ENM( ind_swizzle, swizzle_names ); - if (index != 0) { - if (index > 0) - CHR( '+' ); - SID( index ); + ENM(dst->Register.File, file_names); + if (dst->Register.Dimension) { + if (dst->Dimension.Indirect) { + CHR( '[' ); + ENM( dst->DimIndirect.File, file_names ); + CHR( '[' ); + SID( dst->DimIndirect.Index ); + TXT( "]." ); + ENM( dst->DimIndirect.SwizzleX, swizzle_names ); + if (dst->Dimension.Index != 0) { + if (dst->Dimension.Index > 0) + CHR( '+' ); + SID( dst->Dimension.Index ); + } + CHR( ']' ); + } else { + CHR('['); + SID(dst->Dimension.Index); + CHR(']'); + } + } + if (dst->Register.Indirect) { + CHR( '[' ); + ENM( dst->Indirect.File, file_names ); + CHR( '[' ); + SID( dst->Indirect.Index ); + TXT( "]." ); + ENM( dst->Indirect.SwizzleX, swizzle_names ); + if (dst->Register.Index != 0) { + if (dst->Register.Index > 0) + CHR( '+' ); + SID( dst->Register.Index ); + } + CHR( ']' ); + } else { + CHR( '[' ); + SID( dst->Register.Index ); + CHR( ']' ); } - CHR( ']' ); } - static void _dump_writemask( struct dump_ctx *ctx, @@ -278,6 +306,39 @@ _dump_writemask( } } +static void +dump_imm_data(struct tgsi_iterate_context *iter, + union tgsi_immediate_data *data, + unsigned num_tokens, + unsigned data_type) +{ + struct dump_ctx *ctx = (struct dump_ctx *)iter; + unsigned i ; + + TXT( " {" ); + + assert( num_tokens <= 4 ); + for (i = 0; i < num_tokens; i++) { + switch (data_type) { + case TGSI_IMM_FLOAT32: + FLT( data[i].Float ); + break; + case TGSI_IMM_UINT32: + UID(data[i].Uint); + break; + case TGSI_IMM_INT32: + SID(data[i].Int); + break; + default: + assert( 0 ); + } + + if (i < num_tokens - 1) + TXT( ", " ); + } + TXT( "}" ); +} + static boolean iter_declaration( struct tgsi_iterate_context *iter, @@ -358,6 +419,43 @@ iter_declaration( } } + if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) { + unsigned i; + char range_indent[4]; + + TXT(" {"); + + if (decl->Range.Last < 10) + range_indent[0] = '\0'; + else if (decl->Range.Last < 100) { + range_indent[0] = ' '; + range_indent[1] = '\0'; + } else if (decl->Range.Last < 1000) { + range_indent[0] = ' '; + range_indent[1] = ' '; + range_indent[2] = '\0'; + } else { + range_indent[0] = ' '; + range_indent[1] = ' '; + range_indent[2] = ' '; + range_indent[3] = '\0'; + } + + dump_imm_data(iter, decl->ImmediateData.u, + 4, TGSI_IMM_FLOAT32); + for(i = 1; i <= decl->Range.Last; ++i) { + /* indent by strlen of: + * "DCL IMMX[0..1] {" */ + CHR('\n'); + TXT( " " ); + TXT( range_indent ); + dump_imm_data(iter, decl->ImmediateData.u + i, + 4, TGSI_IMM_FLOAT32); + } + + TXT(" }"); + } + EOL(); return TRUE; @@ -431,33 +529,11 @@ iter_immediate( { struct dump_ctx *ctx = (struct dump_ctx *) iter; - uint i; - TXT( "IMM " ); ENM( imm->Immediate.DataType, immediate_type_names ); - TXT( " { " ); - - assert( imm->Immediate.NrTokens <= 4 + 1 ); - for (i = 0; i < imm->Immediate.NrTokens - 1; i++) { - switch (imm->Immediate.DataType) { - case TGSI_IMM_FLOAT32: - FLT( imm->u[i].Float ); - break; - case TGSI_IMM_UINT32: - UID(imm->u[i].Uint); - break; - case TGSI_IMM_INT32: - SID(imm->u[i].Int); - break; - default: - assert( 0 ); - } - - if (i < imm->Immediate.NrTokens - 2) - TXT( ", " ); - } - TXT( " }" ); + dump_imm_data(iter, imm->u, imm->Immediate.NrTokens - 1, + imm->Immediate.DataType); EOL(); @@ -488,12 +564,36 @@ iter_instruction( INSTID( instno ); TXT( ": " ); - + ctx->indent -= info->pre_dedent; for(i = 0; (int)i < ctx->indent; ++i) TXT( " " ); ctx->indent += info->post_indent; - + + if (inst->Instruction.Predicate) { + CHR( '(' ); + + if (inst->Predicate.Negate) + CHR( '!' ); + + TXT( "PRED[" ); + SID( inst->Predicate.Index ); + CHR( ']' ); + + if (inst->Predicate.SwizzleX != TGSI_SWIZZLE_X || + inst->Predicate.SwizzleY != TGSI_SWIZZLE_Y || + inst->Predicate.SwizzleZ != TGSI_SWIZZLE_Z || + inst->Predicate.SwizzleW != TGSI_SWIZZLE_W) { + CHR( '.' ); + ENM( inst->Predicate.SwizzleX, swizzle_names ); + ENM( inst->Predicate.SwizzleY, swizzle_names ); + ENM( inst->Predicate.SwizzleZ, swizzle_names ); + ENM( inst->Predicate.SwizzleW, swizzle_names ); + } + + TXT( ") " ); + } + TXT( info->mnemonic ); switch (inst->Instruction.Saturate) { @@ -516,21 +616,7 @@ iter_instruction( CHR( ',' ); CHR( ' ' ); - if (dst->Register.Indirect) { - _dump_register_ind( - ctx, - dst->Register.File, - dst->Register.Index, - dst->Indirect.File, - dst->Indirect.Index, - dst->Indirect.SwizzleX ); - } - else { - _dump_register_dst( - ctx, - dst->Register.File, - dst->Register.Index ); - } + _dump_register_dst( ctx, dst ); _dump_writemask( ctx, dst->Register.WriteMask ); first_reg = FALSE; diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h index 4cd27317b36..dd78b361007 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.h +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h @@ -28,6 +28,7 @@ #ifndef TGSI_DUMP_H #define TGSI_DUMP_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 82eac05dc4d..3a71540506d 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -557,6 +557,23 @@ print_temp(const struct tgsi_exec_machine *mach, uint index) #endif +void +tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach, + unsigned num_bufs, + const void **bufs, + const unsigned *buf_sizes) +{ + unsigned i; + + for (i = 0; i < num_bufs; i++) { + mach->Consts[i] = bufs[i]; + mach->ConstsSize[i] = buf_sizes[i]; + } +} + + + + /** * Check if there's a potential src/dst register data dependency when * using SOA execution. @@ -588,8 +605,10 @@ tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst) for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { if ((inst->Src[i].Register.File == inst->Dst[0].Register.File) && - (inst->Src[i].Register.Index == - inst->Dst[0].Register.Index)) { + ((inst->Src[i].Register.Index == + inst->Dst[0].Register.Index) || + inst->Src[i].Register.Indirect || + inst->Dst[0].Register.Indirect)) { /* loop over dest channels */ uint channelsWritten = 0x0; FOR_EACH_ENABLED_CHANNEL(*inst, chan) { @@ -621,12 +640,10 @@ tgsi_exec_machine_bind_shader( { uint k; struct tgsi_parse_context parse; - struct tgsi_exec_labels *labels = &mach->Labels; struct tgsi_full_instruction *instructions; struct tgsi_full_declaration *declarations; uint maxInstructions = 10, numInstructions = 0; uint maxDeclarations = 10, numDeclarations = 0; - uint instno = 0; #if 0 tgsi_dump(tokens, 0); @@ -634,9 +651,30 @@ tgsi_exec_machine_bind_shader( util_init_math(); + if (numSamplers) { + assert(samplers); + } + mach->Tokens = tokens; mach->Samplers = samplers; + if (!tokens) { + /* unbind and free all */ + if (mach->Declarations) { + FREE( mach->Declarations ); + } + mach->Declarations = NULL; + mach->NumDeclarations = 0; + + if (mach->Instructions) { + FREE( mach->Instructions ); + } + mach->Instructions = NULL; + mach->NumInstructions = 0; + + return; + } + k = tgsi_parse_init (&parse, mach->Tokens); if (k != TGSI_PARSE_OK) { debug_printf( "Problem parsing!\n" ); @@ -645,7 +683,6 @@ tgsi_exec_machine_bind_shader( mach->Processor = parse.FullHeader.Processor.Processor; mach->ImmLimit = 0; - labels->count = 0; declarations = (struct tgsi_full_declaration *) MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) ); @@ -663,7 +700,6 @@ tgsi_exec_machine_bind_shader( } while( !tgsi_parse_end_of_tokens( &parse ) ) { - uint pointer = parse.Position; uint i; tgsi_parse_token( &parse ); @@ -686,6 +722,19 @@ tgsi_exec_machine_bind_shader( ++mach->NumOutputs; } } + if (parse.FullToken.FullDeclaration.Declaration.File == + TGSI_FILE_IMMEDIATE_ARRAY) { + unsigned reg; + struct tgsi_full_declaration *decl = + &parse.FullToken.FullDeclaration; + debug_assert(decl->Range.Last < TGSI_EXEC_NUM_IMMEDIATES); + for (reg = decl->Range.First; reg <= decl->Range.Last; ++reg) { + for( i = 0; i < 4; i++ ) { + int idx = reg * 4 + i; + mach->ImmArray[reg][i] = decl->ImmediateData.u[idx].Float; + } + } + } memcpy(declarations + numDeclarations, &parse.FullToken.FullDeclaration, sizeof(declarations[0])); @@ -707,11 +756,6 @@ tgsi_exec_machine_bind_shader( break; case TGSI_TOKEN_TYPE_INSTRUCTION: - assert( labels->count < MAX_LABELS ); - - labels->labels[labels->count][0] = instno; - labels->labels[labels->count][1] = pointer; - labels->count++; /* save expanded instruction */ if (numInstructions == maxInstructions) { @@ -801,7 +845,9 @@ void tgsi_exec_machine_destroy(struct tgsi_exec_machine *mach) { if (mach) { - FREE(mach->Instructions); + if (mach->Instructions) + FREE(mach->Instructions); + if (mach->Declarations) FREE(mach->Declarations); } @@ -1017,6 +1063,8 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach, { uint i; + assert(swizzle < 4); + switch (file) { case TGSI_FILE_CONSTANT: for (i = 0; i < QUAD_SIZE; i++) { @@ -1026,9 +1074,23 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach, if (index->i[i] < 0) { chan->u[i] = 0; } else { - const uint *p = (const uint *)mach->Consts[index2D->i[i]]; - - chan->u[i] = p[index->i[i] * 4 + swizzle]; + /* NOTE: copying the const value as a uint instead of float */ + const uint constbuf = index2D->i[i]; + const uint *buf = (const uint *)mach->Consts[constbuf]; + const int pos = index->i[i] * 4 + swizzle; + /* const buffer bounds check */ + if (pos < 0 || pos >= mach->ConstsSize[constbuf]) { + if (0) { + /* Debug: print warning */ + static int count = 0; + if (count++ < 100) + debug_printf("TGSI Exec: const buffer index %d" + " out of bounds\n", pos); + } + chan->u[i] = 0; + } + else + chan->u[i] = buf[pos]; } } break; @@ -1036,8 +1098,16 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach, case TGSI_FILE_INPUT: case TGSI_FILE_SYSTEM_VALUE: for (i = 0; i < QUAD_SIZE; i++) { - /* XXX: 2D indexing */ - chan->u[i] = mach->Inputs[index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]].xyzw[swizzle].u[i]; + /* + if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) { + debug_printf("Fetching Input[%d] (2d=%d, 1d=%d)\n", + index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i], + index2D->i[i], index->i[i]); + }*/ + int pos = index2D->i[i] * TGSI_EXEC_MAX_INPUT_ATTRIBS + index->i[i]; + assert(pos >= 0); + assert(pos < Elements(mach->Inputs)); + chan->u[i] = mach->Inputs[pos].xyzw[swizzle].u[i]; } break; @@ -1050,6 +1120,16 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach, } break; + case TGSI_FILE_TEMPORARY_ARRAY: + for (i = 0; i < QUAD_SIZE; i++) { + assert(index->i[i] < TGSI_EXEC_NUM_TEMPS); + assert(index2D->i[i] < TGSI_EXEC_NUM_TEMP_ARRAYS); + + chan->u[i] = + mach->TempArray[index2D->i[i]][index->i[i]].xyzw[swizzle].u[i]; + } + break; + case TGSI_FILE_IMMEDIATE: for (i = 0; i < QUAD_SIZE; i++) { assert(index->i[i] >= 0 && index->i[i] < (int)mach->ImmLimit); @@ -1059,6 +1139,14 @@ fetch_src_file_channel(const struct tgsi_exec_machine *mach, } break; + case TGSI_FILE_IMMEDIATE_ARRAY: + for (i = 0; i < QUAD_SIZE; i++) { + assert(index2D->i[i] == 0); + + chan->f[i] = mach->ImmArray[index->i[i]][swizzle]; + } + break; + case TGSI_FILE_ADDRESS: for (i = 0; i < QUAD_SIZE; i++) { assert(index->i[i] >= 0); @@ -1139,7 +1227,7 @@ fetch_source(const struct tgsi_exec_machine *mach, index2.i[1] = index2.i[2] = index2.i[3] = reg->Indirect.Index; - + assert(reg->Indirect.File == TGSI_FILE_ADDRESS); /* get current value of address register[swizzle] */ swizzle = tgsi_util_get_src_register_swizzle( ®->Indirect, CHAN_X ); fetch_src_file_channel(mach, @@ -1270,6 +1358,7 @@ store_dest(struct tgsi_exec_machine *mach, uint i; union tgsi_exec_channel null; union tgsi_exec_channel *dst; + union tgsi_exec_channel index2D; uint execmask = mach->ExecMask; int offset = 0; /* indirection offset */ int index; @@ -1315,6 +1404,77 @@ store_dest(struct tgsi_exec_machine *mach, offset = indir_index.i[0]; } + /* There is an extra source register that is a second + * subscript to a register file. Effectively it means that + * the register file is actually a 2D array of registers. + * + * file[3][1], + * where: + * [3] = Dimension.Index + */ + if (reg->Register.Dimension) { + index2D.i[0] = + index2D.i[1] = + index2D.i[2] = + index2D.i[3] = reg->Dimension.Index; + + /* Again, the second subscript index can be addressed indirectly + * identically to the first one. + * Nothing stops us from indirectly addressing the indirect register, + * but there is no need for that, so we won't exercise it. + * + * file[ind[4].y+3][1], + * where: + * ind = DimIndirect.File + * [4] = DimIndirect.Index + * .y = DimIndirect.SwizzleX + */ + if (reg->Dimension.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + const uint execmask = mach->ExecMask; + unsigned swizzle; + uint i; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->DimIndirect.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->DimIndirect, CHAN_X ); + fetch_src_file_channel(mach, + reg->DimIndirect.File, + swizzle, + &index2, + &ZeroVec, + &indir_index); + + index2D.i[0] += indir_index.i[0]; + index2D.i[1] += indir_index.i[1]; + index2D.i[2] += indir_index.i[2]; + index2D.i[3] += indir_index.i[3]; + + /* for disabled execution channels, zero-out the index to + * avoid using a potential garbage value. + */ + for (i = 0; i < QUAD_SIZE; i++) { + if ((execmask & (1 << i)) == 0) { + index2D.i[i] = 0; + } + } + } + + /* If by any chance there was a need for a 3D array of register + * files, we would have to check whether Dimension is followed + * by a dimension register and continue the saga. + */ + } else { + index2D.i[0] = + index2D.i[1] = + index2D.i[2] = + index2D.i[3] = 0; + } + switch (reg->Register.File) { case TGSI_FILE_NULL: dst = &null; @@ -1341,16 +1501,19 @@ store_dest(struct tgsi_exec_machine *mach, dst = &mach->Temps[offset + index].xyzw[chan_index]; break; - case TGSI_FILE_ADDRESS: + case TGSI_FILE_TEMPORARY_ARRAY: index = reg->Register.Index; - dst = &mach->Addrs[index].xyzw[chan_index]; + assert( index < TGSI_EXEC_NUM_TEMPS ); + assert( index2D.i[0] < TGSI_EXEC_NUM_TEMP_ARRAYS ); + /* XXX we use index2D.i[0] here but somehow we might + * end up with someone trying to store indirectly in + * different buffers */ + dst = &mach->TempArray[index2D.i[0]][offset + index].xyzw[chan_index]; break; - case TGSI_FILE_LOOP: - assert(reg->Register.Index == 0); - assert(mach->LoopCounterStackTop > 0); - assert(chan_index == CHAN_X); - dst = &mach->LoopCounterStack[mach->LoopCounterStackTop - 1].xyzw[chan_index]; + case TGSI_FILE_ADDRESS: + index = reg->Register.Index; + dst = &mach->Addrs[index].xyzw[chan_index]; break; case TGSI_FILE_PREDICATE: @@ -1533,6 +1696,19 @@ emit_primitive(struct tgsi_exec_machine *mach) } } +static void +conditional_emit_primitive(struct tgsi_exec_machine *mach) +{ + if (TGSI_PROCESSOR_GEOMETRY == mach->Processor) { + int emitted_verts = + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]; + if (emitted_verts) { + emit_primitive(mach); + } + } +} + + /* * Fetch four texture samples using STR texture coordinates. */ @@ -3065,6 +3241,8 @@ exec_instruction( if (mach->CallStackTop == 0) { /* returning from main() */ + mach->CondStackTop = 0; + mach->LoopStackTop = 0; *pc = -1; return; } @@ -3133,7 +3311,7 @@ exec_instruction( break; case TGSI_OPCODE_DIV: - assert( 0 ); + exec_vector_binary(mach, inst, micro_div, TGSI_EXEC_DATA_FLOAT, TGSI_EXEC_DATA_FLOAT); break; case TGSI_OPCODE_DP2: @@ -3182,6 +3360,9 @@ exec_instruction( break; case TGSI_OPCODE_END: + /* make sure we end primitives which haven't + * been explicitly emitted */ + conditional_emit_primitive(mach); /* halt execution */ *pc = -1; break; @@ -3590,6 +3771,9 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) } #endif + /* Strictly speaking, these assertions aren't really needed but they + * can potentially catch some bugs in the control flow code. + */ assert(mach->CondStackTop == 0); assert(mach->LoopStackTop == 0); assert(mach->ContStackTop == 0); diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h index a22873e4c2b..9d62c1d7e7e 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h @@ -37,8 +37,6 @@ extern "C" { #endif -#define MAX_LABELS (4 * 1024) /**< basically, max instructions */ - #define NUM_CHANNELS 4 /* R,G,B,A */ #define QUAD_SIZE 4 /* 4 pixel/quad */ @@ -93,18 +91,9 @@ struct tgsi_sampler float rgba[NUM_CHANNELS][QUAD_SIZE]); }; -/** - * For branching/calling subroutines. - */ -struct tgsi_exec_labels -{ - unsigned labels[MAX_LABELS][2]; - unsigned count; -}; - - #define TGSI_EXEC_NUM_TEMPS 128 #define TGSI_EXEC_NUM_IMMEDIATES 256 +#define TGSI_EXEC_NUM_TEMP_ARRAYS 8 /* * Locations of various utility registers (_I = Index, _C = Channel) @@ -186,10 +175,11 @@ struct tgsi_exec_labels -#define TGSI_EXEC_MAX_COND_NESTING 32 -#define TGSI_EXEC_MAX_LOOP_NESTING 32 -#define TGSI_EXEC_MAX_SWITCH_NESTING 32 -#define TGSI_EXEC_MAX_CALL_NESTING 32 +#define TGSI_EXEC_MAX_NESTING 32 +#define TGSI_EXEC_MAX_COND_NESTING TGSI_EXEC_MAX_NESTING +#define TGSI_EXEC_MAX_LOOP_NESTING TGSI_EXEC_MAX_NESTING +#define TGSI_EXEC_MAX_SWITCH_NESTING TGSI_EXEC_MAX_NESTING +#define TGSI_EXEC_MAX_CALL_NESTING TGSI_EXEC_MAX_NESTING /* The maximum number of input attributes per vertex. For 2D * input register files, this is the stride between two 1D @@ -248,9 +238,12 @@ struct tgsi_exec_machine */ struct tgsi_exec_vector Temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_TEMP_EXTRAS]; + struct tgsi_exec_vector TempArray[TGSI_EXEC_NUM_TEMP_ARRAYS][TGSI_EXEC_NUM_TEMPS]; float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + float ImmArray[TGSI_EXEC_NUM_IMMEDIATES][4]; + struct tgsi_exec_vector Inputs[TGSI_MAX_PRIM_VERTICES * PIPE_MAX_ATTRIBS]; struct tgsi_exec_vector Outputs[TGSI_MAX_TOTAL_VERTICES]; @@ -260,7 +253,10 @@ struct tgsi_exec_machine struct tgsi_sampler **Samplers; unsigned ImmLimit; + const void *Consts[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned ConstsSize[PIPE_MAX_CONSTANT_BUFFERS]; + const struct tgsi_token *Tokens; /**< Declarations, instructions */ unsigned Processor; /**< TGSI_PROCESSOR_x */ @@ -299,10 +295,6 @@ struct tgsi_exec_machine uint LoopLabelStack[TGSI_EXEC_MAX_LOOP_NESTING]; int LoopLabelStackTop; - /** Loop counter stack (x = index, y = counter, z = step) */ - struct tgsi_exec_vector LoopCounterStack[TGSI_EXEC_MAX_LOOP_NESTING]; - int LoopCounterStackTop; - /** Loop continue mask stack (see comments in tgsi_exec.c) */ uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; int ContStackTop; @@ -328,7 +320,6 @@ struct tgsi_exec_machine struct tgsi_full_declaration *Declarations; uint NumDeclarations; - struct tgsi_exec_labels Labels; }; struct tgsi_exec_machine * @@ -379,6 +370,43 @@ tgsi_set_exec_mask(struct tgsi_exec_machine *mach, } +extern void +tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach, + unsigned num_bufs, + const void **bufs, + const unsigned *buf_sizes); + + +static INLINE int +tgsi_exec_get_shader_param(enum pipe_shader_cap param) +{ + switch(param) { + case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS: + case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS: + return INT_MAX; + case PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH: + return TGSI_EXEC_MAX_NESTING; + case PIPE_SHADER_CAP_MAX_INPUTS: + return TGSI_EXEC_MAX_INPUT_ATTRIBS; + case PIPE_SHADER_CAP_MAX_CONSTS: + return TGSI_EXEC_MAX_CONST_BUFFER; + case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: + return PIPE_MAX_CONSTANT_BUFFERS; + case PIPE_SHADER_CAP_MAX_TEMPS: + return TGSI_EXEC_NUM_TEMPS; + case PIPE_SHADER_CAP_MAX_ADDRS: + return TGSI_EXEC_NUM_ADDRS; + case PIPE_SHADER_CAP_MAX_PREDS: + return TGSI_EXEC_NUM_PREDS; + case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: + return 1; + default: + return 0; + } +} + #if defined __cplusplus } /* extern "C" */ #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c index cfa2f631bd8..e59e964ffa7 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.c +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c @@ -205,3 +205,18 @@ tgsi_get_opcode_name( uint opcode ) return info->mnemonic; } + +const char * +tgsi_get_processor_name( uint processor ) +{ + switch (processor) { + case TGSI_PROCESSOR_VERTEX: + return "vertex shader"; + case TGSI_PROCESSOR_FRAGMENT: + return "fragment shader"; + case TGSI_PROCESSOR_GEOMETRY: + return "geometry shader"; + default: + return "unknown shader type!"; + } +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h index 74713c3b98a..1992d11bbe8 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.h +++ b/src/gallium/auxiliary/tgsi/tgsi_info.h @@ -28,6 +28,7 @@ #ifndef TGSI_INFO_H #define TGSI_INFO_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus @@ -52,6 +53,9 @@ tgsi_get_opcode_info( uint opcode ); const char * tgsi_get_opcode_name( uint opcode ); +const char * +tgsi_get_processor_name( uint processor ); + #if defined __cplusplus } diff --git a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h index e472947507d..b3123ed016d 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h +++ b/src/gallium/auxiliary/tgsi/tgsi_opcode_tmp.h @@ -163,6 +163,10 @@ OP12(USGE) OP12(USHR) OP12(USLT) OP12(USNE) +OP01(SWITCH) +OP01(CASE) +OP00(DEFAULT) +OP00(ENDSWITCH) #undef OP00 diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index 7e19e1fe36f..1891203abe1 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -117,6 +117,17 @@ tgsi_parse_token( next_token( ctx, &decl->Semantic ); } + if (decl->Declaration.File == TGSI_FILE_IMMEDIATE_ARRAY) { + unsigned i, j; + decl->ImmediateData.u = (union tgsi_immediate_data*) + &ctx->Tokens[ctx->Position]; + for (i = 0; i <= decl->Range.Last; ++i) { + for (j = 0; j < 4; ++j) { + ctx->Position++; + } + } + } + break; } @@ -181,11 +192,6 @@ tgsi_parse_token( next_token( ctx, &inst->Dst[i].Register ); - /* - * No support for indirect or multi-dimensional addressing. - */ - assert( !inst->Dst[i].Register.Dimension ); - if( inst->Dst[i].Register.Indirect ) { next_token( ctx, &inst->Dst[i].Indirect ); @@ -195,6 +201,24 @@ tgsi_parse_token( assert( !inst->Dst[i].Indirect.Dimension ); assert( !inst->Dst[i].Indirect.Indirect ); } + if( inst->Dst[i].Register.Dimension ) { + next_token( ctx, &inst->Dst[i].Dimension ); + + /* + * No support for multi-dimensional addressing. + */ + assert( !inst->Dst[i].Dimension.Dimension ); + + if( inst->Dst[i].Dimension.Indirect ) { + next_token( ctx, &inst->Dst[i].DimIndirect ); + + /* + * No support for indirect or multi-dimensional addressing. + */ + assert( !inst->Dst[i].Indirect.Indirect ); + assert( !inst->Dst[i].Indirect.Dimension ); + } + } } assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS ); @@ -258,17 +282,6 @@ tgsi_parse_token( } -unsigned -tgsi_num_tokens(const struct tgsi_token *tokens) -{ - struct tgsi_parse_context ctx; - if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) { - unsigned len = (ctx.FullHeader.Header.HeaderSize + - ctx.FullHeader.Header.BodySize); - return len; - } - return 0; -} /** @@ -295,3 +308,19 @@ tgsi_alloc_tokens(unsigned num_tokens) unsigned bytes = num_tokens * sizeof(struct tgsi_token); return (struct tgsi_token *) MALLOC(bytes); } + + +void +tgsi_dump_tokens(const struct tgsi_token *tokens) +{ + const unsigned *dwords = (const unsigned *)tokens; + int nr = tgsi_num_tokens(tokens); + int i; + + assert(sizeof(*tokens) == sizeof(unsigned)); + + debug_printf("const unsigned tokens[%d] = {\n", nr); + for (i = 0; i < nr; i++) + debug_printf("0x%08x,\n", dwords[i]); + debug_printf("};\n"); +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h index b45ccee2f63..d4df5851764 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.h +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h @@ -28,6 +28,7 @@ #ifndef TGSI_PARSE_H #define TGSI_PARSE_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus @@ -44,6 +45,8 @@ struct tgsi_full_dst_register { struct tgsi_dst_register Register; struct tgsi_src_register Indirect; + struct tgsi_dimension Dimension; + struct tgsi_src_register DimIndirect; }; struct tgsi_full_src_register @@ -54,12 +57,18 @@ struct tgsi_full_src_register struct tgsi_src_register DimIndirect; }; +struct tgsi_immediate_array_data +{ + union tgsi_immediate_data *u; +}; + struct tgsi_full_declaration { struct tgsi_declaration Declaration; struct tgsi_declaration_range Range; struct tgsi_declaration_dimension Dim; struct tgsi_declaration_semantic Semantic; + struct tgsi_immediate_array_data ImmediateData; }; struct tgsi_full_immediate @@ -124,8 +133,15 @@ void tgsi_parse_token( struct tgsi_parse_context *ctx ); -unsigned -tgsi_num_tokens(const struct tgsi_token *tokens); +static INLINE unsigned +tgsi_num_tokens(const struct tgsi_token *tokens) +{ + struct tgsi_header header = *(const struct tgsi_header *) tokens; + return header.HeaderSize + header.BodySize; +} + +void +tgsi_dump_tokens(const struct tgsi_token *tokens); struct tgsi_token * tgsi_dup_tokens(const struct tgsi_token *tokens); diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c index ad553c71a57..3521847b619 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ppc.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c @@ -1366,4 +1366,12 @@ tgsi_emit_ppc(const struct tgsi_token *tokens, return ok; } +#else + +void ppc_dummy_func(void); + +void ppc_dummy_func(void) +{ +} + #endif /* PIPE_ARCH_PPC */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c index 76b7564cc36..acbff103efe 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c @@ -33,6 +33,10 @@ #include "tgsi_info.h" #include "tgsi_iterate.h" + +DEBUG_GET_ONCE_BOOL_OPTION(print_sanity, "TGSI_PRINT_SANITY", FALSE) + + typedef struct { uint file : 28; /* max 2 dimensions */ @@ -54,6 +58,8 @@ struct sanity_check_ctx uint errors; uint warnings; uint implied_array_size; + + boolean print; }; static INLINE unsigned @@ -90,9 +96,18 @@ static void scan_register_dst(scan_register *reg, struct tgsi_full_dst_register *dst) { - fill_scan_register1d(reg, - dst->Register.File, - dst->Register.Index); + if (dst->Register.Dimension) { + /*FIXME: right now we don't support indirect + * multidimensional addressing */ + fill_scan_register2d(reg, + dst->Register.File, + dst->Register.Index, + dst->Dimension.Index); + } else { + fill_scan_register1d(reg, + dst->Register.File, + dst->Register.Index); + } } static void @@ -102,7 +117,6 @@ scan_register_src(scan_register *reg, if (src->Register.Dimension) { /*FIXME: right now we don't support indirect * multidimensional addressing */ - debug_assert(!src->Dimension.Indirect); fill_scan_register2d(reg, src->Register.File, src->Register.Index, @@ -140,6 +154,9 @@ report_error( { va_list args; + if (!ctx->print) + return; + debug_printf( "Error : " ); va_start( args, format ); _debug_vprintf( format, args ); @@ -156,6 +173,9 @@ report_warning( { va_list args; + if (!ctx->print) + return; + debug_printf( "Warning: " ); va_start( args, format ); _debug_vprintf( format, args ); @@ -235,8 +255,10 @@ static const char *file_names[TGSI_FILE_COUNT] = "SAMP", "ADDR", "IMM", - "LOOP", - "PRED" + "PRED", + "SV", + "IMMX", + "TEMPX" }; static boolean @@ -529,6 +551,7 @@ tgsi_sanity_check( ctx.errors = 0; ctx.warnings = 0; ctx.implied_array_size = 0; + ctx.print = debug_get_option_print_sanity(); if (!tgsi_iterate_shader( tokens, &ctx.iter )) return FALSE; diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h index 52263ff8832..73f0f414e3f 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h @@ -35,7 +35,8 @@ extern "C" { #endif /* Check the given token stream for errors and common mistakes. - * Diagnostic messages are printed out to the debug output. + * Diagnostic messages are printed out to the debug output, and is + * controlled by the debug option TGSI_PRINT_SANITY (default false). * Returns TRUE if there are no errors, even though there could be some warnings. */ boolean diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index 232fc537c1d..90198a4f604 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -36,6 +36,7 @@ #include "util/u_math.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" #include "tgsi/tgsi_scan.h" @@ -84,31 +85,46 @@ tgsi_scan_shader(const struct tgsi_token *tokens, { const struct tgsi_full_instruction *fullinst = &parse.FullToken.FullInstruction; + uint i; assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST); info->opcode_count[fullinst->Instruction.Opcode]++; - /* special case: scan fragment shaders for use of the fog - * input/attribute. The X component is fog, the Y component - * is the front/back-face flag. - */ - if (procType == TGSI_PROCESSOR_FRAGMENT) { - uint i; - for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) { - const struct tgsi_full_src_register *src = - &fullinst->Src[i]; - if (src->Register.File == TGSI_FILE_INPUT || - src->Register.File == TGSI_FILE_SYSTEM_VALUE) { - const int ind = src->Register.Index; - if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FOG) { - info->uses_fogcoord = TRUE; - } - else if (info->input_semantic_name[ind] == TGSI_SEMANTIC_FACE) { - info->uses_frontfacing = TRUE; + for (i = 0; i < fullinst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *src = + &fullinst->Src[i]; + int ind = src->Register.Index; + + /* Mark which inputs are effectively used */ + if (src->Register.File == TGSI_FILE_INPUT) { + unsigned usage_mask; + usage_mask = tgsi_util_get_inst_usage_mask(fullinst, i); + if (src->Register.Indirect) { + for (ind = 0; ind < info->num_inputs; ++ind) { + info->input_usage_mask[ind] |= usage_mask; } + } else { + assert(ind >= 0); + assert(ind < PIPE_MAX_SHADER_INPUTS); + info->input_usage_mask[ind] |= usage_mask; } } + + /* check for indirect register reads */ + if (src->Register.Indirect) { + info->indirect_files |= (1 << src->Register.File); + } } + + /* check for indirect register writes */ + for (i = 0; i < fullinst->Instruction.NumDstRegs; i++) { + const struct tgsi_full_dst_register *dst = &fullinst->Dst[i]; + if (dst->Register.Indirect) { + info->indirect_files |= (1 << dst->Register.File); + } + } + + info->num_instructions++; } break; diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index 741aa7d5c42..f8aa90cf065 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -45,6 +45,7 @@ struct tgsi_shader_info ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; ubyte input_interpolate[PIPE_MAX_SHADER_INPUTS]; + ubyte input_usage_mask[PIPE_MAX_SHADER_INPUTS]; ubyte input_cylindrical_wrap[PIPE_MAX_SHADER_INPUTS]; ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; @@ -54,14 +55,19 @@ struct tgsi_shader_info int file_max[TGSI_FILE_COUNT]; /**< highest index of declared registers */ uint immediate_count; /**< number of immediates declared */ + uint num_instructions; uint opcode_count[TGSI_OPCODE_LAST]; /**< opcode histogram */ boolean writes_z; /**< does fragment shader write Z value? */ boolean writes_edgeflag; /**< vertex shader outputs edgeflag */ boolean uses_kill; /**< KIL or KILP instruction used? */ - boolean uses_fogcoord; /**< fragment shader uses fog coord? */ - boolean uses_frontfacing; /**< fragment shader uses front/back-face flag? */ + + /** + * Bitmask indicating which register files are accessed with + * indirect addressing. The bits are (1 << TGSI_FILE_x), etc. + */ + unsigned indirect_files; struct { unsigned name; diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c index 1071298b497..086d983a73a 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -1244,16 +1244,9 @@ emit_sub( make_xmm( xmm_src ) ); } - - - - - - /** * Register fetch. */ - static void emit_fetch( struct x86_function *func, @@ -1338,7 +1331,6 @@ emit_fetch( /** * Register store. */ - static void emit_store( struct x86_function *func, @@ -1455,7 +1447,6 @@ fetch_texel( struct tgsi_sampler **sampler, /** * High-level instruction translators. */ - static void emit_tex( struct x86_function *func, const struct tgsi_full_instruction *inst, @@ -1507,7 +1498,6 @@ emit_tex( struct x86_function *func, get_temp( TEMP_R0, 3 ), make_xmm( 3 ) ); - if (projected) { FETCH( func, *inst, 3, 0, 3 ); @@ -1535,7 +1525,6 @@ emit_tex( struct x86_function *func, args[0] = get_temp( TEMP_R0, 0 ); args[1] = get_sampler_ptr( unit ); - emit_func_call( func, 0, args, @@ -1569,7 +1558,8 @@ emit_kil( /* This mask stores component bits that were already tested. Note that * we test if the value is less than zero, so 1.0 and 0.0 need not to be - * tested. */ + * tested. + */ uniquemask = 0; FOR_EACH_CHANNEL( chan_index ) { @@ -1715,22 +1705,26 @@ emit_cmp( /** - * Check if inst src/dest regs use indirect addressing into temporary - * register file. + * Check if inst src/dest regs use indirect addressing into temporary, + * input or output register files. */ static boolean -indirect_temp_reference(const struct tgsi_full_instruction *inst) +indirect_reg_reference(const struct tgsi_full_instruction *inst) { uint i; for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { const struct tgsi_full_src_register *reg = &inst->Src[i]; - if (reg->Register.File == TGSI_FILE_TEMPORARY && + if ((reg->Register.File == TGSI_FILE_TEMPORARY || + reg->Register.File == TGSI_FILE_INPUT || + reg->Register.File == TGSI_FILE_OUTPUT) && reg->Register.Indirect) return TRUE; } for (i = 0; i < inst->Instruction.NumDstRegs; i++) { const struct tgsi_full_dst_register *reg = &inst->Dst[i]; - if (reg->Register.File == TGSI_FILE_TEMPORARY && + if ((reg->Register.File == TGSI_FILE_TEMPORARY || + reg->Register.File == TGSI_FILE_INPUT || + reg->Register.File == TGSI_FILE_OUTPUT) && reg->Register.Indirect) return TRUE; } @@ -1746,7 +1740,7 @@ emit_instruction( unsigned chan_index; /* we can't handle indirect addressing into temp register file yet */ - if (indirect_temp_reference(inst)) + if (indirect_reg_reference(inst)) return FALSE; switch (inst->Instruction.Opcode) { @@ -1929,20 +1923,32 @@ emit_instruction( break; case TGSI_OPCODE_MUL: + /* do all fetches and adds, storing results in temp regs */ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - emit_mul( func, 0, 1 ); - STORE( func, *inst, 0, 0, chan_index ); + int r = chan_index + 1; + FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */ + FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */ + emit_mul( func, r, 0 ); /* xmm[r] = xmm[r] * xmm[0] */ + } + /* do all stores of the temp regs */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + int r = chan_index + 1; + STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */ } break; case TGSI_OPCODE_ADD: + /* do all fetches and adds, storing results in temp regs */ FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { - FETCH( func, *inst, 0, 0, chan_index ); - FETCH( func, *inst, 1, 1, chan_index ); - emit_add( func, 0, 1 ); - STORE( func, *inst, 0, 0, chan_index ); + int r = chan_index + 1; + FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */ + FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */ + emit_add( func, r, 0 ); /* xmm[r] = xmm[r] + xmm[0] */ + } + /* do all stores of the temp regs */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + int r = chan_index + 1; + STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */ } break; @@ -2697,8 +2703,7 @@ static void aos_to_soa( struct x86_function *func, struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX ); struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX ); struct x86_reg stride = x86_make_reg( file_REG32, reg_DX ); - int inner_loop; - + int loop_top, loop_exit_fixup; /* Save EBX */ x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); @@ -2711,8 +2716,11 @@ static void aos_to_soa( struct x86_function *func, x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) ); x86_mov( func, stride, x86_fn_arg( func, arg_stride ) ); - /* do */ - inner_loop = x86_get_label( func ); + /* while (num_inputs != 0) */ + loop_top = x86_get_label( func ); + x86_cmp_imm( func, num_inputs, 0 ); + loop_exit_fixup = x86_jcc_forward( func, cc_E ); + { x86_push( func, aos_input ); sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); @@ -2744,9 +2752,10 @@ static void aos_to_soa( struct x86_function *func, x86_lea( func, aos_input, x86_make_disp(aos_input, 16) ); x86_lea( func, soa_input, x86_make_disp(soa_input, 64) ); } - /* while --num_inputs */ + /* --num_inputs */ x86_dec( func, num_inputs ); - x86_jcc( func, cc_NE, inner_loop ); + x86_jmp( func, loop_top ); + x86_fixup_fwd_jump( func, loop_exit_fixup ); /* Restore EBX */ x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); @@ -2816,6 +2825,61 @@ static void soa_to_aos( struct x86_function *func, x86_pop( func, x86_make_reg( file_REG32, reg_BX ) ); } + +/** + * Check if the instructions dst register is the same as any src + * register and warn if there's a posible SOA dependency. + */ +static boolean +check_soa_dependencies(const struct tgsi_full_instruction *inst) +{ + uint opcode = inst->Instruction.Opcode; + + /* XXX: we only handle src/dst aliasing in a few opcodes currently. + * Need to use an additional temporay to hold the result in the + * cases where the code is too opaque to fix. + */ + + switch (opcode) { + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_EXP: + case TGSI_OPCODE_LOG: + case TGSI_OPCODE_DP3: + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DP2A: + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_XPD: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXP: + case TGSI_OPCODE_NRM: + case TGSI_OPCODE_NRM4: + case TGSI_OPCODE_DP2: + /* OK - these opcodes correctly handle SOA dependencies */ + return TRUE; + default: + if (!tgsi_check_soa_dependencies(inst)) + return TRUE; + + debug_printf("Warning: src/dst aliasing in instruction" + " is not handled:\n"); + debug_printf("Warning: "); + tgsi_dump_instruction(inst, 1); + + return FALSE; + } +} + + /** * Translate a TGSI vertex/fragment shader to SSE2 code. * Slightly different things are done for vertex vs. fragment shaders. @@ -2885,7 +2949,6 @@ tgsi_emit_sse2( x86_make_disp( get_machine_base(), Offset( struct tgsi_exec_machine, Samplers ) ) ); - while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { tgsi_parse_token( &parse ); @@ -2905,27 +2968,15 @@ tgsi_emit_sse2( if (!ok) { uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode; + uint proc = parse.FullHeader.Processor.Processor; debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n", opcode, tgsi_get_opcode_name(opcode), - parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ? - "vertex shader" : "fragment shader"); + tgsi_get_processor_name(proc)); } - if (tgsi_check_soa_dependencies(&parse.FullToken.FullInstruction)) { - uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode; - - /* XXX: we only handle src/dst aliasing in a few opcodes - * currently. Need to use an additional temporay to hold - * the result in the cases where the code is too opaque to - * fix. - */ - if (opcode != TGSI_OPCODE_MOV) { - debug_printf("Warning: src/dst aliasing in instruction" - " is not handled:\n"); - tgsi_dump_instruction(&parse.FullToken.FullInstruction, 1); - } - } + if (ok) + ok = check_soa_dependencies(&parse.FullToken.FullInstruction); break; case TGSI_TOKEN_TYPE_IMMEDIATE: @@ -2982,4 +3033,3 @@ tgsi_emit_sse2( } #endif /* PIPE_ARCH_X86 */ - diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h index d81ee3d00ec..00aa8b84fe9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h @@ -32,9 +32,12 @@ extern "C" { #endif +#include "pipe/p_compiler.h" + +struct tgsi_exec_machine; +struct tgsi_interp_coef; struct tgsi_token; struct x86_function; -struct tgsi_interp_coef; unsigned tgsi_emit_sse2( diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c index 0b468a9184e..b01d2ff4689 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_text.c +++ b/src/gallium/auxiliary/tgsi/tgsi_text.c @@ -58,7 +58,7 @@ static boolean is_digit_alpha_underscore( const char *cur ) static char uprcase( char c ) { if (c >= 'a' && c <= 'z') - return c += 'A' - 'a'; + return c + 'A' - 'a'; return c; } @@ -138,6 +138,7 @@ static boolean parse_identifier( const char **pcur, char *ret ) ret[i++] = *cur++; while (is_alpha_underscore( cur )) ret[i++] = *cur++; + ret[i++] = '\0'; *pcur = cur; return TRUE; } @@ -278,9 +279,10 @@ static const char *file_names[TGSI_FILE_COUNT] = "SAMP", "ADDR", "IMM", - "LOOP", "PRED", - "SV" + "SV", + "IMMX", + "TEMPX" }; static boolean @@ -345,12 +347,68 @@ parse_opt_writemask( return TRUE; } + +/* <register_file_bracket> ::= <file> `[' + */ +static boolean +parse_register_file_bracket( + struct translate_ctx *ctx, + uint *file ) +{ + if (!parse_file( &ctx->cur, file )) { + report_error( ctx, "Unknown register file" ); + return FALSE; + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '[') { + report_error( ctx, "Expected `['" ); + return FALSE; + } + ctx->cur++; + return TRUE; +} + +/* <register_file_bracket_index> ::= <register_file_bracket> <uint> + */ +static boolean +parse_register_file_bracket_index( + struct translate_ctx *ctx, + uint *file, + int *index ) +{ + uint uindex; + + if (!parse_register_file_bracket( ctx, file )) + return FALSE; + eat_opt_white( &ctx->cur ); + if (!parse_uint( &ctx->cur, &uindex )) { + report_error( ctx, "Expected literal unsigned integer" ); + return FALSE; + } + *index = (int) uindex; + return TRUE; +} + +/* Parse simple 1d register operand. + * <register_dst> ::= <register_file_bracket_index> `]' + */ static boolean -parse_register_dst( struct translate_ctx *ctx, - uint *file, - int *index ); +parse_register_1d(struct translate_ctx *ctx, + uint *file, + int *index ) +{ + if (!parse_register_file_bracket_index( ctx, file, index )) + return FALSE; + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ']') { + report_error( ctx, "Expected `]'" ); + return FALSE; + } + ctx->cur++; + return TRUE; +} -struct parsed_src_bracket { +struct parsed_bracket { int index; uint ind_file; @@ -360,21 +418,21 @@ struct parsed_src_bracket { static boolean -parse_register_src_bracket( +parse_register_bracket( struct translate_ctx *ctx, - struct parsed_src_bracket *brackets) + struct parsed_bracket *brackets) { const char *cur; uint uindex; - memset(brackets, 0, sizeof(struct parsed_src_bracket)); + memset(brackets, 0, sizeof(struct parsed_bracket)); eat_opt_white( &ctx->cur ); cur = ctx->cur; if (parse_file( &cur, &brackets->ind_file )) { - if (!parse_register_dst( ctx, &brackets->ind_file, - &brackets->ind_index )) + if (!parse_register_1d( ctx, &brackets->ind_file, + &brackets->ind_index )) return FALSE; eat_opt_white( &ctx->cur ); @@ -443,7 +501,7 @@ parse_register_src_bracket( static boolean parse_opt_register_src_bracket( struct translate_ctx *ctx, - struct parsed_src_bracket *brackets, + struct parsed_bracket *brackets, int *parsed_brackets) { const char *cur = ctx->cur; @@ -455,7 +513,7 @@ parse_opt_register_src_bracket( ++cur; ctx->cur = cur; - if (!parse_register_src_bracket(ctx, brackets)) + if (!parse_register_bracket(ctx, brackets)) return FALSE; *parsed_brackets = 1; @@ -464,46 +522,6 @@ parse_opt_register_src_bracket( return TRUE; } -/* <register_file_bracket> ::= <file> `[' - */ -static boolean -parse_register_file_bracket( - struct translate_ctx *ctx, - uint *file ) -{ - if (!parse_file( &ctx->cur, file )) { - report_error( ctx, "Unknown register file" ); - return FALSE; - } - eat_opt_white( &ctx->cur ); - if (*ctx->cur != '[') { - report_error( ctx, "Expected `['" ); - return FALSE; - } - ctx->cur++; - return TRUE; -} - -/* <register_file_bracket_index> ::= <register_file_bracket> <uint> - */ -static boolean -parse_register_file_bracket_index( - struct translate_ctx *ctx, - uint *file, - int *index ) -{ - uint uindex; - - if (!parse_register_file_bracket( ctx, file )) - return FALSE; - eat_opt_white( &ctx->cur ); - if (!parse_uint( &ctx->cur, &uindex )) { - report_error( ctx, "Expected literal unsigned integer" ); - return FALSE; - } - *index = (int) uindex; - return TRUE; -} /* Parse source register operand. * <register_src> ::= <register_file_bracket_index> `]' | @@ -515,13 +533,12 @@ static boolean parse_register_src( struct translate_ctx *ctx, uint *file, - struct parsed_src_bracket *brackets) + struct parsed_bracket *brackets) { - brackets->ind_comp = TGSI_SWIZZLE_X; if (!parse_register_file_bracket( ctx, file )) return FALSE; - if (!parse_register_src_bracket( ctx, brackets )) + if (!parse_register_bracket( ctx, brackets )) return FALSE; return TRUE; @@ -629,23 +646,19 @@ parse_register_dcl( } -/* Parse destination register operand. - * <register_dst> ::= <register_file_bracket_index> `]' - */ +/* Parse destination register operand.*/ static boolean parse_register_dst( struct translate_ctx *ctx, uint *file, - int *index ) + struct parsed_bracket *brackets) { - if (!parse_register_file_bracket_index( ctx, file, index )) - return FALSE; - eat_opt_white( &ctx->cur ); - if (*ctx->cur != ']') { - report_error( ctx, "Expected `]'" ); + brackets->ind_comp = TGSI_SWIZZLE_X; + if (!parse_register_file_bracket( ctx, file )) return FALSE; - } - ctx->cur++; + if (!parse_register_bracket( ctx, brackets )) + return FALSE; + return TRUE; } @@ -655,11 +668,14 @@ parse_dst_operand( struct tgsi_full_dst_register *dst ) { uint file; - int index; uint writemask; const char *cur; + struct parsed_bracket bracket[2]; + int parsed_opt_brackets; - if (!parse_register_dst( ctx, &file, &index )) + if (!parse_register_dst( ctx, &file, &bracket[0] )) + return FALSE; + if (!parse_opt_register_src_bracket(ctx, &bracket[1], &parsed_opt_brackets)) return FALSE; cur = ctx->cur; @@ -669,8 +685,24 @@ parse_dst_operand( return FALSE; dst->Register.File = file; - dst->Register.Index = index; + if (parsed_opt_brackets) { + dst->Register.Dimension = 1; + dst->Dimension.Indirect = 0; + dst->Dimension.Dimension = 0; + dst->Dimension.Index = bracket[0].index; + bracket[0] = bracket[1]; + } + dst->Register.Index = bracket[0].index; dst->Register.WriteMask = writemask; + if (bracket[0].ind_file != TGSI_FILE_NULL) { + dst->Register.Indirect = 1; + dst->Indirect.File = bracket[0].ind_file; + dst->Indirect.Index = bracket[0].ind_index; + dst->Indirect.SwizzleX = bracket[0].ind_comp; + dst->Indirect.SwizzleY = bracket[0].ind_comp; + dst->Indirect.SwizzleZ = bracket[0].ind_comp; + dst->Indirect.SwizzleW = bracket[0].ind_comp; + } return TRUE; } @@ -700,7 +732,7 @@ parse_optional_swizzle( else if (uprcase( *cur ) == 'W') swizzle[i] = TGSI_SWIZZLE_W; else { - report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" ); + report_error( ctx, "Expected register swizzle component `x', `y', `z' or `w'" ); return FALSE; } cur++; @@ -719,7 +751,7 @@ parse_src_operand( uint file; uint swizzle[4]; boolean parsed_swizzle; - struct parsed_src_bracket bracket[2]; + struct parsed_bracket bracket[2]; int parsed_opt_brackets; if (*ctx->cur == '-') { @@ -816,6 +848,45 @@ parse_instruction( struct tgsi_full_instruction inst; uint advance; + inst = tgsi_default_full_instruction(); + + /* Parse predicate. + */ + eat_opt_white( &ctx->cur ); + if (*ctx->cur == '(') { + uint file; + int index; + uint swizzle[4]; + boolean parsed_swizzle; + + inst.Instruction.Predicate = 1; + + ctx->cur++; + if (*ctx->cur == '!') { + ctx->cur++; + inst.Predicate.Negate = 1; + } + + if (!parse_register_1d( ctx, &file, &index )) + return FALSE; + + if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle )) { + if (parsed_swizzle) { + inst.Predicate.SwizzleX = swizzle[0]; + inst.Predicate.SwizzleY = swizzle[1]; + inst.Predicate.SwizzleZ = swizzle[2]; + inst.Predicate.SwizzleW = swizzle[3]; + } + } + + if (*ctx->cur != ')') { + report_error( ctx, "Expected `)'" ); + return FALSE; + } + + ctx->cur++; + } + /* Parse instruction name. */ eat_opt_white( &ctx->cur ); @@ -849,7 +920,6 @@ parse_instruction( return FALSE; } - inst = tgsi_default_full_instruction(); inst.Instruction.Opcode = i; inst.Instruction.Saturate = saturate; inst.Instruction.NumDstRegs = info->num_dst; @@ -947,6 +1017,45 @@ static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] = "PERSPECTIVE" }; + +/* parses a 4-touple of the form {x, y, z, w} + * where x, y, z, w are numbers */ +static boolean parse_immediate_data(struct translate_ctx *ctx, + float *values) +{ + unsigned i; + + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '{') { + report_error( ctx, "Expected `{'" ); + return FALSE; + } + ctx->cur++; + for (i = 0; i < 4; i++) { + eat_opt_white( &ctx->cur ); + if (i > 0) { + if (*ctx->cur != ',') { + report_error( ctx, "Expected `,'" ); + return FALSE; + } + ctx->cur++; + eat_opt_white( &ctx->cur ); + } + if (!parse_float( &ctx->cur, &values[i] )) { + report_error( ctx, "Expected literal floating point" ); + return FALSE; + } + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '}') { + report_error( ctx, "Expected `}'" ); + return FALSE; + } + ctx->cur++; + + return TRUE; +} + static boolean parse_declaration( struct translate_ctx *ctx ) { struct tgsi_full_declaration decl; @@ -956,6 +1065,8 @@ static boolean parse_declaration( struct translate_ctx *ctx ) uint writemask; const char *cur; uint advance; + boolean is_vs_input; + boolean is_imm_array; assert(Elements(semantic_names) == TGSI_SEMANTIC_COUNT); assert(Elements(interpolate_names) == TGSI_INTERPOLATE_COUNT); @@ -984,9 +1095,13 @@ static boolean parse_declaration( struct translate_ctx *ctx ) decl.Dim.Index2D = brackets[0].first; } + is_vs_input = (file == TGSI_FILE_INPUT && + ctx->processor == TGSI_PROCESSOR_VERTEX); + is_imm_array = (file == TGSI_FILE_IMMEDIATE_ARRAY); + cur = ctx->cur; eat_opt_white( &cur ); - if (*cur == ',') { + if (*cur == ',' && !is_vs_input) { uint i; cur++; @@ -1025,11 +1140,49 @@ static boolean parse_declaration( struct translate_ctx *ctx ) break; } } + } else if (is_imm_array) { + unsigned i; + float *vals_itr; + /* we have our immediate data */ + if (*cur != '{') { + report_error( ctx, "Immediate array without data" ); + return FALSE; + } + ++cur; + ctx->cur = cur; + + decl.ImmediateData.u = + MALLOC(sizeof(union tgsi_immediate_data) * 4 * + (decl.Range.Last + 1)); + vals_itr = (float*)decl.ImmediateData.u; + for (i = 0; i <= decl.Range.Last; ++i) { + if (!parse_immediate_data(ctx, vals_itr)) { + FREE(decl.ImmediateData.u); + return FALSE; + } + vals_itr += 4; + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ',') { + if (i != decl.Range.Last) { + report_error( ctx, "Not enough data in immediate array!" ); + FREE(decl.ImmediateData.u); + return FALSE; + } + } else + ++ctx->cur; + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '}') { + FREE(decl.ImmediateData.u); + report_error( ctx, "Immediate array data missing closing '}'" ); + return FALSE; + } + ++ctx->cur; } cur = ctx->cur; eat_opt_white( &cur ); - if (*cur == ',') { + if (*cur == ',' && !is_vs_input) { uint i; cur++; @@ -1055,6 +1208,10 @@ static boolean parse_declaration( struct translate_ctx *ctx ) ctx->tokens_cur, ctx->header, (uint) (ctx->tokens_end - ctx->tokens_cur) ); + + if (is_imm_array) + FREE(decl.ImmediateData.u); + if (advance == 0) return FALSE; ctx->tokens_cur += advance; @@ -1065,7 +1222,6 @@ static boolean parse_declaration( struct translate_ctx *ctx ) static boolean parse_immediate( struct translate_ctx *ctx ) { struct tgsi_full_immediate imm; - uint i; float values[4]; uint advance; @@ -1073,37 +1229,13 @@ static boolean parse_immediate( struct translate_ctx *ctx ) report_error( ctx, "Syntax error" ); return FALSE; } - if (!str_match_no_case( &ctx->cur, "FLT32" ) || is_digit_alpha_underscore( ctx->cur )) { + if (!str_match_no_case( &ctx->cur, "FLT32" ) || + is_digit_alpha_underscore( ctx->cur )) { report_error( ctx, "Expected `FLT32'" ); return FALSE; } - eat_opt_white( &ctx->cur ); - if (*ctx->cur != '{') { - report_error( ctx, "Expected `{'" ); - return FALSE; - } - ctx->cur++; - for (i = 0; i < 4; i++) { - eat_opt_white( &ctx->cur ); - if (i > 0) { - if (*ctx->cur != ',') { - report_error( ctx, "Expected `,'" ); - return FALSE; - } - ctx->cur++; - eat_opt_white( &ctx->cur ); - } - if (!parse_float( &ctx->cur, &values[i] )) { - report_error( ctx, "Expected literal floating point" ); - return FALSE; - } - } - eat_opt_white( &ctx->cur ); - if (*ctx->cur != '}') { - report_error( ctx, "Expected `}'" ); - return FALSE; - } - ctx->cur++; + + parse_immediate_data(ctx, values); imm = tgsi_default_full_immediate(); imm.Immediate.NrTokens += 4; diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index f725405ade1..7d13a17bdbc 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -74,7 +74,6 @@ struct ureg_tokens { #define UREG_MAX_IMMEDIATE 32 #define UREG_MAX_TEMP 256 #define UREG_MAX_ADDR 2 -#define UREG_MAX_LOOP 1 #define UREG_MAX_PRED 1 struct const_decl { @@ -97,7 +96,8 @@ struct ureg_program unsigned semantic_name; unsigned semantic_index; unsigned interp; - unsigned cylindrical_wrap; + unsigned char cylindrical_wrap; + unsigned char centroid; } fs_input[UREG_MAX_INPUT]; unsigned nr_fs_inputs; @@ -151,7 +151,6 @@ struct ureg_program unsigned nr_addrs; unsigned nr_preds; - unsigned nr_loops; unsigned nr_instructions; struct ureg_tokens domain[2]; @@ -288,11 +287,12 @@ ureg_property_fs_coord_pixel_center(struct ureg_program *ureg, struct ureg_src -ureg_DECL_fs_input_cyl(struct ureg_program *ureg, +ureg_DECL_fs_input_cyl_centroid(struct ureg_program *ureg, unsigned semantic_name, unsigned semantic_index, unsigned interp_mode, - unsigned cylindrical_wrap) + unsigned cylindrical_wrap, + unsigned centroid) { unsigned i; @@ -308,6 +308,7 @@ ureg_DECL_fs_input_cyl(struct ureg_program *ureg, ureg->fs_input[i].semantic_index = semantic_index; ureg->fs_input[i].interp = interp_mode; ureg->fs_input[i].cylindrical_wrap = cylindrical_wrap; + ureg->fs_input[i].centroid = centroid; ureg->nr_fs_inputs++; } else { set_bad(ureg); @@ -537,19 +538,6 @@ struct ureg_dst ureg_DECL_address( struct ureg_program *ureg ) return ureg_dst_register( TGSI_FILE_ADDRESS, 0 ); } -/* Allocate a new loop register. - */ -struct ureg_dst -ureg_DECL_loop(struct ureg_program *ureg) -{ - if (ureg->nr_loops < UREG_MAX_LOOP) { - return ureg_dst_register(TGSI_FILE_LOOP, ureg->nr_loops++); - } - - assert(0); - return ureg_dst_register(TGSI_FILE_LOOP, 0); -} - /* Allocate a new predicate register. */ struct ureg_dst @@ -747,11 +735,12 @@ ureg_DECL_immediate_int( struct ureg_program *ureg, } -void +void ureg_emit_src( struct ureg_program *ureg, struct ureg_src src ) { - unsigned size = 1 + (src.Indirect ? 1 : 0) + (src.Dimension ? 1 : 0); + unsigned size = 1 + (src.Indirect ? 1 : 0) + + (src.Dimension ? (src.DimIndirect ? 2 : 1) : 0); union tgsi_any_token *out = get_tokens( ureg, DOMAIN_INSN, size ); unsigned n = 0; @@ -784,11 +773,27 @@ ureg_emit_src( struct ureg_program *ureg, } if (src.Dimension) { - out[0].src.Dimension = 1; - out[n].dim.Indirect = 0; - out[n].dim.Dimension = 0; - out[n].dim.Padding = 0; - out[n].dim.Index = src.DimensionIndex; + if (src.DimIndirect) { + out[0].src.Dimension = 1; + out[n].dim.Indirect = 1; + out[n].dim.Dimension = 0; + out[n].dim.Padding = 0; + out[n].dim.Index = src.DimensionIndex; + n++; + out[n].value = 0; + out[n].src.File = src.DimIndFile; + out[n].src.SwizzleX = src.DimIndSwizzle; + out[n].src.SwizzleY = src.DimIndSwizzle; + out[n].src.SwizzleZ = src.DimIndSwizzle; + out[n].src.SwizzleW = src.DimIndSwizzle; + out[n].src.Index = src.DimIndIndex; + } else { + out[0].src.Dimension = 1; + out[n].dim.Indirect = 0; + out[n].dim.Dimension = 0; + out[n].dim.Padding = 0; + out[n].dim.Index = src.DimensionIndex; + } n++; } @@ -1124,7 +1129,8 @@ emit_decl_fs(struct ureg_program *ureg, unsigned semantic_name, unsigned semantic_index, unsigned interpolate, - unsigned cylindrical_wrap) + unsigned cylindrical_wrap, + unsigned centroid) { union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3); @@ -1136,6 +1142,7 @@ emit_decl_fs(struct ureg_program *ureg, out[0].decl.Interpolate = interpolate; out[0].decl.Semantic = 1; out[0].decl.CylindricalWrap = cylindrical_wrap; + out[0].decl.Centroid = centroid; out[1].value = 0; out[1].decl_range.First = index; @@ -1251,7 +1258,7 @@ static void emit_decls( struct ureg_program *ureg ) assert(ureg->processor == TGSI_PROCESSOR_GEOMETRY); emit_property(ureg, - TGSI_PROPERTY_GS_MAX_VERTICES, + TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES, ureg->property_gs_max_vertices); } @@ -1285,7 +1292,8 @@ static void emit_decls( struct ureg_program *ureg ) ureg->fs_input[i].semantic_name, ureg->fs_input[i].semantic_index, ureg->fs_input[i].interp, - ureg->fs_input[i].cylindrical_wrap); + ureg->fs_input[i].cylindrical_wrap, + ureg->fs_input[i].centroid); } } else { for (i = 0; i < ureg->nr_gs_inputs; i++) { @@ -1356,13 +1364,6 @@ static void emit_decls( struct ureg_program *ureg ) 0, ureg->nr_addrs ); } - if (ureg->nr_loops) { - emit_decl_range(ureg, - TGSI_FILE_LOOP, - 0, - ureg->nr_loops); - } - if (ureg->nr_preds) { emit_decl_range(ureg, TGSI_FILE_PREDICATE, @@ -1489,6 +1490,12 @@ const struct tgsi_token *ureg_get_tokens( struct ureg_program *ureg, } +void ureg_free_tokens( const struct tgsi_token *tokens ) +{ + FREE((struct tgsi_token *)tokens); +} + + struct ureg_program *ureg_create( unsigned processor ) { struct ureg_program *ureg = CALLOC_STRUCT( ureg_program ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h index 0130a77aadb..acc463200a6 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h @@ -49,14 +49,18 @@ struct ureg_src unsigned SwizzleZ : 2; /* TGSI_SWIZZLE_ */ unsigned SwizzleW : 2; /* TGSI_SWIZZLE_ */ unsigned Indirect : 1; /* BOOL */ + unsigned DimIndirect : 1; /* BOOL */ unsigned Dimension : 1; /* BOOL */ unsigned Absolute : 1; /* BOOL */ unsigned Negate : 1; /* BOOL */ int Index : 16; /* SINT */ - unsigned IndirectFile : 4; /* TGSI_FILE_ */ - int IndirectIndex : 16; /* SINT */ - unsigned IndirectSwizzle : 2; /* TGSI_SWIZZLE_ */ - int DimensionIndex : 16; /* SINT */ + unsigned IndirectFile : 4; /* TGSI_FILE_ */ + int IndirectIndex : 16; /* SINT */ + unsigned IndirectSwizzle : 2; /* TGSI_SWIZZLE_ */ + int DimensionIndex : 16; /* SINT */ + unsigned DimIndFile : 4; /* TGSI_FILE_ */ + int DimIndIndex : 16; /* SINT */ + unsigned DimIndSwizzle : 2; /* TGSI_SWIZZLE_ */ }; /* Very similar to a tgsi_dst_register, removing unsupported fields @@ -104,6 +108,10 @@ ureg_get_tokens( struct ureg_program *ureg, unsigned *nr_tokens ); +/* Free the tokens created by ureg_get_tokens() */ +void ureg_free_tokens( const struct tgsi_token *tokens ); + + void ureg_destroy( struct ureg_program * ); @@ -150,11 +158,27 @@ ureg_property_fs_coord_pixel_center(struct ureg_program *ureg, */ struct ureg_src -ureg_DECL_fs_input_cyl(struct ureg_program *, +ureg_DECL_fs_input_cyl_centroid(struct ureg_program *, unsigned semantic_name, unsigned semantic_index, unsigned interp_mode, - unsigned cylindrical_wrap); + unsigned cylindrical_wrap, + unsigned centroid); + +static INLINE struct ureg_src +ureg_DECL_fs_input_cyl(struct ureg_program *ureg, + unsigned semantic_name, + unsigned semantic_index, + unsigned interp_mode, + unsigned cylindrical_wrap) +{ + return ureg_DECL_fs_input_cyl_centroid(ureg, + semantic_name, + semantic_index, + interp_mode, + cylindrical_wrap, + 0); +} static INLINE struct ureg_src ureg_DECL_fs_input(struct ureg_program *ureg, @@ -162,11 +186,11 @@ ureg_DECL_fs_input(struct ureg_program *ureg, unsigned semantic_index, unsigned interp_mode) { - return ureg_DECL_fs_input_cyl(ureg, + return ureg_DECL_fs_input_cyl_centroid(ureg, semantic_name, semantic_index, interp_mode, - 0); + 0, 0); } struct ureg_src @@ -231,9 +255,6 @@ struct ureg_dst ureg_DECL_address( struct ureg_program * ); struct ureg_dst -ureg_DECL_loop( struct ureg_program * ); - -struct ureg_dst ureg_DECL_predicate(struct ureg_program *); /* Supply an index to the sampler declaration as this is the hook to @@ -820,15 +841,31 @@ ureg_src_indirect( struct ureg_src reg, struct ureg_src addr ) return reg; } -static INLINE struct ureg_src +static INLINE struct ureg_src ureg_src_dimension( struct ureg_src reg, int index ) { assert(reg.File != TGSI_FILE_NULL); reg.Dimension = 1; + reg.DimIndirect = 0; reg.DimensionIndex = index; return reg; } + +static INLINE struct ureg_src +ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr, + int index ) +{ + assert(reg.File != TGSI_FILE_NULL); + reg.Dimension = 1; + reg.DimIndirect = 1; + reg.DimensionIndex = index; + reg.DimIndFile = addr.File; + reg.DimIndIndex = addr.Index; + reg.DimIndSwizzle = addr.SwizzleX; + return reg; +} + static INLINE struct ureg_dst ureg_dst( struct ureg_src src ) { @@ -873,6 +910,10 @@ ureg_src_register(unsigned file, src.Negate = 0; src.Dimension = 0; src.DimensionIndex = 0; + src.DimIndirect = 0; + src.DimIndFile = TGSI_FILE_NULL; + src.DimIndIndex = 0; + src.DimIndSwizzle = 0; return src; } @@ -896,6 +937,10 @@ ureg_src( struct ureg_dst dst ) src.Negate = 0; src.Dimension = 0; src.DimensionIndex = 0; + src.DimIndirect = 0; + src.DimIndFile = TGSI_FILE_NULL; + src.DimIndIndex = 0; + src.DimIndSwizzle = 0; return src; } @@ -943,7 +988,11 @@ ureg_src_undef( void ) src.Negate = 0; src.Dimension = 0; src.DimensionIndex = 0; - + src.DimIndirect = 0; + src.DimIndFile = TGSI_FILE_NULL; + src.DimIndIndex = 0; + src.DimIndSwizzle = 0; + return src; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c index 0a7e4105a80..08e7e89bd67 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.c +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c @@ -163,3 +163,150 @@ tgsi_util_set_full_src_register_sign_mode( assert( 0 ); } } + +/** + * Determine which channels of the specificed src register are effectively + * used by this instruction. + */ +unsigned +tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst, + unsigned src_idx) +{ + const struct tgsi_full_src_register *src = &inst->Src[src_idx]; + unsigned write_mask = inst->Dst[0].Register.WriteMask; + unsigned read_mask; + unsigned usage_mask; + unsigned chan; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_ARL: + case TGSI_OPCODE_ARR: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_DIV: + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_LRP: + case TGSI_OPCODE_CND: + case TGSI_OPCODE_FRC: + case TGSI_OPCODE_CEIL: + case TGSI_OPCODE_CLAMP: + case TGSI_OPCODE_FLR: + case TGSI_OPCODE_ROUND: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDY: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SSG: + case TGSI_OPCODE_CMP: + case TGSI_OPCODE_TRUNC: + case TGSI_OPCODE_NOT: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_SAD: + /* Channel-wise operations */ + read_mask = write_mask; + break; + + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: + case TGSI_OPCODE_RCC: + read_mask = TGSI_WRITEMASK_X; + break; + + case TGSI_OPCODE_SCS: + read_mask = write_mask & TGSI_WRITEMASK_XY ? TGSI_WRITEMASK_X : 0; + break; + + case TGSI_OPCODE_EXP: + case TGSI_OPCODE_LOG: + read_mask = write_mask & TGSI_WRITEMASK_XYZ ? TGSI_WRITEMASK_X : 0; + break; + + case TGSI_OPCODE_DP2A: + read_mask = src_idx == 2 ? TGSI_WRITEMASK_X : TGSI_WRITEMASK_XY; + break; + + case TGSI_OPCODE_DP2: + read_mask = TGSI_WRITEMASK_XY; + break; + + case TGSI_OPCODE_DP3: + read_mask = TGSI_WRITEMASK_XYZ; + break; + + case TGSI_OPCODE_DP4: + read_mask = TGSI_WRITEMASK_XYZW; + break; + + case TGSI_OPCODE_DPH: + read_mask = src_idx == 0 ? TGSI_WRITEMASK_XYZ : TGSI_WRITEMASK_XYZW; + break; + + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXD: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + if (src_idx == 0) { + /* Note that the SHADOW variants use the Z component too */ + switch (inst->Texture.Texture) { + case TGSI_TEXTURE_1D: + read_mask = TGSI_WRITEMASK_X; + break; + case TGSI_TEXTURE_SHADOW1D: + read_mask = TGSI_WRITEMASK_XZ; + break; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + read_mask = TGSI_WRITEMASK_XY; + break; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + read_mask = TGSI_WRITEMASK_XYZ; + break; + + default: + assert(0); + read_mask = 0; + } + + if (inst->Instruction.Opcode != TGSI_OPCODE_TEX) { + read_mask |= TGSI_WRITEMASK_W; + } + } else { + /* A safe approximation */ + read_mask = TGSI_WRITEMASK_XYZW; + } + break; + + default: + /* Assume all channels are read */ + read_mask = TGSI_WRITEMASK_XYZW; + break; + } + + usage_mask = 0; + for (chan = 0; chan < 4; ++chan) { + if (read_mask & (1 << chan)) { + usage_mask |= 1 << tgsi_util_get_full_src_register_swizzle(src, chan); + } + } + + return usage_mask; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h index 19ee2e7cf2a..04702ba9826 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.h +++ b/src/gallium/auxiliary/tgsi/tgsi_util.h @@ -34,6 +34,7 @@ extern "C" { struct tgsi_src_register; struct tgsi_full_src_register; +struct tgsi_full_instruction; void * tgsi_align_128bit( @@ -71,6 +72,10 @@ tgsi_util_set_full_src_register_sign_mode( struct tgsi_full_src_register *reg, unsigned sign_mode ); +unsigned +tgsi_util_get_inst_usage_mask(const struct tgsi_full_instruction *inst, + unsigned src_idx); + #if defined __cplusplus } #endif diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c index a9b7253bf44..73287b667db 100644 --- a/src/gallium/auxiliary/translate/translate.c +++ b/src/gallium/auxiliary/translate/translate.c @@ -38,7 +38,7 @@ struct translate *translate_create( const struct translate_key *key ) { struct translate *translate = NULL; -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) translate = translate_sse2_create( key ); if (translate) return translate; @@ -48,3 +48,8 @@ struct translate *translate_create( const struct translate_key *key ) return translate_generic_create( key ); } + +boolean translate_is_output_format_supported(enum pipe_format format) +{ + return translate_generic_is_output_format_supported(format); +} diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index edd95e07882..a75380228b1 100644 --- a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -85,6 +85,18 @@ struct translate { unsigned instance_id, void *output_buffer); + void (PIPE_CDECL *run_elts16)( struct translate *, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + + void (PIPE_CDECL *run_elts8)( struct translate *, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + void (PIPE_CDECL *run)( struct translate *, unsigned start, unsigned count, @@ -105,6 +117,8 @@ struct translate *translate_lookup_or_create( struct translate_context *tctx, struct translate *translate_create( const struct translate_key *key ); +boolean translate_is_output_format_supported(enum pipe_format format); + static INLINE int translate_keysize( const struct translate_key *key ) { return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element); @@ -138,5 +152,6 @@ struct translate *translate_sse2_create( const struct translate_key *key ); struct translate *translate_generic_create( const struct translate_key *key ); +boolean translate_generic_is_output_format_supported(enum pipe_format format); #endif diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index a9272fbb491..ad809db720d 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -31,6 +31,7 @@ */ #include "util/u_memory.h" +#include "util/u_format.h" #include "util/u_math.h" #include "pipe/p_state.h" #include "translate.h" @@ -38,7 +39,9 @@ #define DRAW_DBG 0 -typedef void (*fetch_func)(const void *ptr, float *attrib); +typedef void (*fetch_func)(float *dst, + const uint8_t *src, + unsigned i, unsigned j); typedef void (*emit_func)(const float *attrib, void *ptr); @@ -57,10 +60,18 @@ struct translate_generic { emit_func emit; unsigned output_offset; - char *input_ptr; + const uint8_t *input_ptr; unsigned input_stride; unsigned max_index; + /* this value is set to -1 if this is a normal element with output_format != input_format: + * in this case, u_format is used to do a full conversion + * + * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids: + * in this case, memcpy is used to copy this amount of bytes + */ + int copy_size; + } attrib[PIPE_MAX_ATTRIBS]; unsigned nr_attrib; @@ -79,22 +90,7 @@ static struct translate_generic *translate_generic( struct translate *translate * This is probably needed/dupliocated elsewhere, eg format * conversion, texture sampling etc. */ -#define ATTRIB( NAME, SZ, TYPE, FROM, TO ) \ -static void \ -fetch_##NAME(const void *ptr, float *attrib) \ -{ \ - const float defaults[4] = { 0.0f,0.0f,0.0f,1.0f }; \ - unsigned i; \ - \ - for (i = 0; i < SZ; i++) { \ - attrib[i] = FROM(i); \ - } \ - \ - for (; i < 4; i++) { \ - attrib[i] = defaults[i]; \ - } \ -} \ - \ +#define ATTRIB( NAME, SZ, TYPE, TO ) \ static void \ emit_##NAME(const float *attrib, void *ptr) \ { \ @@ -107,27 +103,6 @@ emit_##NAME(const float *attrib, void *ptr) \ } -#define FROM_64_FLOAT(i) ((float) ((double *) ptr)[i]) -#define FROM_32_FLOAT(i) (((float *) ptr)[i]) - -#define FROM_8_USCALED(i) ((float) ((unsigned char *) ptr)[i]) -#define FROM_16_USCALED(i) ((float) ((unsigned short *) ptr)[i]) -#define FROM_32_USCALED(i) ((float) ((unsigned int *) ptr)[i]) - -#define FROM_8_SSCALED(i) ((float) ((char *) ptr)[i]) -#define FROM_16_SSCALED(i) ((float) ((short *) ptr)[i]) -#define FROM_32_SSCALED(i) ((float) ((int *) ptr)[i]) - -#define FROM_8_UNORM(i) ((float) ((unsigned char *) ptr)[i] / 255.0f) -#define FROM_16_UNORM(i) ((float) ((unsigned short *) ptr)[i] / 65535.0f) -#define FROM_32_UNORM(i) ((float) ((unsigned int *) ptr)[i] / 4294967295.0f) - -#define FROM_8_SNORM(i) ((float) ((char *) ptr)[i] / 127.0f) -#define FROM_16_SNORM(i) ((float) ((short *) ptr)[i] / 32767.0f) -#define FROM_32_SNORM(i) ((float) ((int *) ptr)[i] / 2147483647.0f) - -#define FROM_32_FIXED(i) (((int *) ptr)[i] / 65536.0f) - #define TO_64_FLOAT(x) ((double) x) #define TO_32_FLOAT(x) (x) @@ -150,94 +125,84 @@ emit_##NAME(const float *attrib, void *ptr) \ #define TO_32_FIXED(x) ((int) (x * 65536.0f)) - -ATTRIB( R64G64B64A64_FLOAT, 4, double, FROM_64_FLOAT, TO_64_FLOAT ) -ATTRIB( R64G64B64_FLOAT, 3, double, FROM_64_FLOAT, TO_64_FLOAT ) -ATTRIB( R64G64_FLOAT, 2, double, FROM_64_FLOAT, TO_64_FLOAT ) -ATTRIB( R64_FLOAT, 1, double, FROM_64_FLOAT, TO_64_FLOAT ) - -ATTRIB( R32G32B32A32_FLOAT, 4, float, FROM_32_FLOAT, TO_32_FLOAT ) -ATTRIB( R32G32B32_FLOAT, 3, float, FROM_32_FLOAT, TO_32_FLOAT ) -ATTRIB( R32G32_FLOAT, 2, float, FROM_32_FLOAT, TO_32_FLOAT ) -ATTRIB( R32_FLOAT, 1, float, FROM_32_FLOAT, TO_32_FLOAT ) - -ATTRIB( R32G32B32A32_USCALED, 4, unsigned, FROM_32_USCALED, TO_32_USCALED ) -ATTRIB( R32G32B32_USCALED, 3, unsigned, FROM_32_USCALED, TO_32_USCALED ) -ATTRIB( R32G32_USCALED, 2, unsigned, FROM_32_USCALED, TO_32_USCALED ) -ATTRIB( R32_USCALED, 1, unsigned, FROM_32_USCALED, TO_32_USCALED ) - -ATTRIB( R32G32B32A32_SSCALED, 4, int, FROM_32_SSCALED, TO_32_SSCALED ) -ATTRIB( R32G32B32_SSCALED, 3, int, FROM_32_SSCALED, TO_32_SSCALED ) -ATTRIB( R32G32_SSCALED, 2, int, FROM_32_SSCALED, TO_32_SSCALED ) -ATTRIB( R32_SSCALED, 1, int, FROM_32_SSCALED, TO_32_SSCALED ) - -ATTRIB( R32G32B32A32_UNORM, 4, unsigned, FROM_32_UNORM, TO_32_UNORM ) -ATTRIB( R32G32B32_UNORM, 3, unsigned, FROM_32_UNORM, TO_32_UNORM ) -ATTRIB( R32G32_UNORM, 2, unsigned, FROM_32_UNORM, TO_32_UNORM ) -ATTRIB( R32_UNORM, 1, unsigned, FROM_32_UNORM, TO_32_UNORM ) - -ATTRIB( R32G32B32A32_SNORM, 4, int, FROM_32_SNORM, TO_32_SNORM ) -ATTRIB( R32G32B32_SNORM, 3, int, FROM_32_SNORM, TO_32_SNORM ) -ATTRIB( R32G32_SNORM, 2, int, FROM_32_SNORM, TO_32_SNORM ) -ATTRIB( R32_SNORM, 1, int, FROM_32_SNORM, TO_32_SNORM ) - -ATTRIB( R16G16B16A16_USCALED, 4, ushort, FROM_16_USCALED, TO_16_USCALED ) -ATTRIB( R16G16B16_USCALED, 3, ushort, FROM_16_USCALED, TO_16_USCALED ) -ATTRIB( R16G16_USCALED, 2, ushort, FROM_16_USCALED, TO_16_USCALED ) -ATTRIB( R16_USCALED, 1, ushort, FROM_16_USCALED, TO_16_USCALED ) - -ATTRIB( R16G16B16A16_SSCALED, 4, short, FROM_16_SSCALED, TO_16_SSCALED ) -ATTRIB( R16G16B16_SSCALED, 3, short, FROM_16_SSCALED, TO_16_SSCALED ) -ATTRIB( R16G16_SSCALED, 2, short, FROM_16_SSCALED, TO_16_SSCALED ) -ATTRIB( R16_SSCALED, 1, short, FROM_16_SSCALED, TO_16_SSCALED ) - -ATTRIB( R16G16B16A16_UNORM, 4, ushort, FROM_16_UNORM, TO_16_UNORM ) -ATTRIB( R16G16B16_UNORM, 3, ushort, FROM_16_UNORM, TO_16_UNORM ) -ATTRIB( R16G16_UNORM, 2, ushort, FROM_16_UNORM, TO_16_UNORM ) -ATTRIB( R16_UNORM, 1, ushort, FROM_16_UNORM, TO_16_UNORM ) - -ATTRIB( R16G16B16A16_SNORM, 4, short, FROM_16_SNORM, TO_16_SNORM ) -ATTRIB( R16G16B16_SNORM, 3, short, FROM_16_SNORM, TO_16_SNORM ) -ATTRIB( R16G16_SNORM, 2, short, FROM_16_SNORM, TO_16_SNORM ) -ATTRIB( R16_SNORM, 1, short, FROM_16_SNORM, TO_16_SNORM ) - -ATTRIB( R8G8B8A8_USCALED, 4, ubyte, FROM_8_USCALED, TO_8_USCALED ) -ATTRIB( R8G8B8_USCALED, 3, ubyte, FROM_8_USCALED, TO_8_USCALED ) -ATTRIB( R8G8_USCALED, 2, ubyte, FROM_8_USCALED, TO_8_USCALED ) -ATTRIB( R8_USCALED, 1, ubyte, FROM_8_USCALED, TO_8_USCALED ) - -ATTRIB( R8G8B8A8_SSCALED, 4, char, FROM_8_SSCALED, TO_8_SSCALED ) -ATTRIB( R8G8B8_SSCALED, 3, char, FROM_8_SSCALED, TO_8_SSCALED ) -ATTRIB( R8G8_SSCALED, 2, char, FROM_8_SSCALED, TO_8_SSCALED ) -ATTRIB( R8_SSCALED, 1, char, FROM_8_SSCALED, TO_8_SSCALED ) - -ATTRIB( R8G8B8A8_UNORM, 4, ubyte, FROM_8_UNORM, TO_8_UNORM ) -ATTRIB( R8G8B8_UNORM, 3, ubyte, FROM_8_UNORM, TO_8_UNORM ) -ATTRIB( R8G8_UNORM, 2, ubyte, FROM_8_UNORM, TO_8_UNORM ) -ATTRIB( R8_UNORM, 1, ubyte, FROM_8_UNORM, TO_8_UNORM ) - -ATTRIB( R8G8B8A8_SNORM, 4, char, FROM_8_SNORM, TO_8_SNORM ) -ATTRIB( R8G8B8_SNORM, 3, char, FROM_8_SNORM, TO_8_SNORM ) -ATTRIB( R8G8_SNORM, 2, char, FROM_8_SNORM, TO_8_SNORM ) -ATTRIB( R8_SNORM, 1, char, FROM_8_SNORM, TO_8_SNORM ) - -ATTRIB( A8R8G8B8_UNORM, 4, ubyte, FROM_8_UNORM, TO_8_UNORM ) -/*ATTRIB( R8G8B8A8_UNORM, 4, ubyte, FROM_8_UNORM, TO_8_UNORM )*/ - -ATTRIB( R32G32B32A32_FIXED, 4, int, FROM_32_FIXED, TO_32_FIXED ) -ATTRIB( R32G32B32_FIXED, 3, int, FROM_32_FIXED, TO_32_FIXED ) -ATTRIB( R32G32_FIXED, 2, int, FROM_32_FIXED, TO_32_FIXED ) -ATTRIB( R32_FIXED, 1, int, FROM_32_FIXED, TO_32_FIXED ) - - +ATTRIB( R64G64B64A64_FLOAT, 4, double, TO_64_FLOAT ) +ATTRIB( R64G64B64_FLOAT, 3, double, TO_64_FLOAT ) +ATTRIB( R64G64_FLOAT, 2, double, TO_64_FLOAT ) +ATTRIB( R64_FLOAT, 1, double, TO_64_FLOAT ) + +ATTRIB( R32G32B32A32_FLOAT, 4, float, TO_32_FLOAT ) +ATTRIB( R32G32B32_FLOAT, 3, float, TO_32_FLOAT ) +ATTRIB( R32G32_FLOAT, 2, float, TO_32_FLOAT ) +ATTRIB( R32_FLOAT, 1, float, TO_32_FLOAT ) + +ATTRIB( R32G32B32A32_USCALED, 4, unsigned, TO_32_USCALED ) +ATTRIB( R32G32B32_USCALED, 3, unsigned, TO_32_USCALED ) +ATTRIB( R32G32_USCALED, 2, unsigned, TO_32_USCALED ) +ATTRIB( R32_USCALED, 1, unsigned, TO_32_USCALED ) + +ATTRIB( R32G32B32A32_SSCALED, 4, int, TO_32_SSCALED ) +ATTRIB( R32G32B32_SSCALED, 3, int, TO_32_SSCALED ) +ATTRIB( R32G32_SSCALED, 2, int, TO_32_SSCALED ) +ATTRIB( R32_SSCALED, 1, int, TO_32_SSCALED ) + +ATTRIB( R32G32B32A32_UNORM, 4, unsigned, TO_32_UNORM ) +ATTRIB( R32G32B32_UNORM, 3, unsigned, TO_32_UNORM ) +ATTRIB( R32G32_UNORM, 2, unsigned, TO_32_UNORM ) +ATTRIB( R32_UNORM, 1, unsigned, TO_32_UNORM ) + +ATTRIB( R32G32B32A32_SNORM, 4, int, TO_32_SNORM ) +ATTRIB( R32G32B32_SNORM, 3, int, TO_32_SNORM ) +ATTRIB( R32G32_SNORM, 2, int, TO_32_SNORM ) +ATTRIB( R32_SNORM, 1, int, TO_32_SNORM ) + +ATTRIB( R16G16B16A16_USCALED, 4, ushort, TO_16_USCALED ) +ATTRIB( R16G16B16_USCALED, 3, ushort, TO_16_USCALED ) +ATTRIB( R16G16_USCALED, 2, ushort, TO_16_USCALED ) +ATTRIB( R16_USCALED, 1, ushort, TO_16_USCALED ) + +ATTRIB( R16G16B16A16_SSCALED, 4, short, TO_16_SSCALED ) +ATTRIB( R16G16B16_SSCALED, 3, short, TO_16_SSCALED ) +ATTRIB( R16G16_SSCALED, 2, short, TO_16_SSCALED ) +ATTRIB( R16_SSCALED, 1, short, TO_16_SSCALED ) + +ATTRIB( R16G16B16A16_UNORM, 4, ushort, TO_16_UNORM ) +ATTRIB( R16G16B16_UNORM, 3, ushort, TO_16_UNORM ) +ATTRIB( R16G16_UNORM, 2, ushort, TO_16_UNORM ) +ATTRIB( R16_UNORM, 1, ushort, TO_16_UNORM ) + +ATTRIB( R16G16B16A16_SNORM, 4, short, TO_16_SNORM ) +ATTRIB( R16G16B16_SNORM, 3, short, TO_16_SNORM ) +ATTRIB( R16G16_SNORM, 2, short, TO_16_SNORM ) +ATTRIB( R16_SNORM, 1, short, TO_16_SNORM ) + +ATTRIB( R8G8B8A8_USCALED, 4, ubyte, TO_8_USCALED ) +ATTRIB( R8G8B8_USCALED, 3, ubyte, TO_8_USCALED ) +ATTRIB( R8G8_USCALED, 2, ubyte, TO_8_USCALED ) +ATTRIB( R8_USCALED, 1, ubyte, TO_8_USCALED ) + +ATTRIB( R8G8B8A8_SSCALED, 4, char, TO_8_SSCALED ) +ATTRIB( R8G8B8_SSCALED, 3, char, TO_8_SSCALED ) +ATTRIB( R8G8_SSCALED, 2, char, TO_8_SSCALED ) +ATTRIB( R8_SSCALED, 1, char, TO_8_SSCALED ) + +ATTRIB( R8G8B8A8_UNORM, 4, ubyte, TO_8_UNORM ) +ATTRIB( R8G8B8_UNORM, 3, ubyte, TO_8_UNORM ) +ATTRIB( R8G8_UNORM, 2, ubyte, TO_8_UNORM ) +ATTRIB( R8_UNORM, 1, ubyte, TO_8_UNORM ) + +ATTRIB( R8G8B8A8_SNORM, 4, char, TO_8_SNORM ) +ATTRIB( R8G8B8_SNORM, 3, char, TO_8_SNORM ) +ATTRIB( R8G8_SNORM, 2, char, TO_8_SNORM ) +ATTRIB( R8_SNORM, 1, char, TO_8_SNORM ) static void -fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib) +emit_A8R8G8B8_UNORM( const float *attrib, void *ptr) { - attrib[2] = FROM_8_UNORM(0); - attrib[1] = FROM_8_UNORM(1); - attrib[0] = FROM_8_UNORM(2); - attrib[3] = FROM_8_UNORM(3); + ubyte *out = (ubyte *)ptr; + out[0] = TO_8_UNORM(attrib[3]); + out[1] = TO_8_UNORM(attrib[0]); + out[2] = TO_8_UNORM(attrib[1]); + out[3] = TO_8_UNORM(attrib[2]); } static void @@ -251,181 +216,13 @@ emit_B8G8R8A8_UNORM( const float *attrib, void *ptr) } static void -fetch_NULL( const void *ptr, float *attrib ) -{ - attrib[0] = 0; - attrib[1] = 0; - attrib[2] = 0; - attrib[3] = 1; -} - -static void emit_NULL( const float *attrib, void *ptr ) { /* do nothing is the only sensible option */ } -static fetch_func get_fetch_func( enum pipe_format format ) -{ - switch (format) { - case PIPE_FORMAT_R64_FLOAT: - return &fetch_R64_FLOAT; - case PIPE_FORMAT_R64G64_FLOAT: - return &fetch_R64G64_FLOAT; - case PIPE_FORMAT_R64G64B64_FLOAT: - return &fetch_R64G64B64_FLOAT; - case PIPE_FORMAT_R64G64B64A64_FLOAT: - return &fetch_R64G64B64A64_FLOAT; - - case PIPE_FORMAT_R32_FLOAT: - return &fetch_R32_FLOAT; - case PIPE_FORMAT_R32G32_FLOAT: - return &fetch_R32G32_FLOAT; - case PIPE_FORMAT_R32G32B32_FLOAT: - return &fetch_R32G32B32_FLOAT; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - return &fetch_R32G32B32A32_FLOAT; - - case PIPE_FORMAT_R32_UNORM: - return &fetch_R32_UNORM; - case PIPE_FORMAT_R32G32_UNORM: - return &fetch_R32G32_UNORM; - case PIPE_FORMAT_R32G32B32_UNORM: - return &fetch_R32G32B32_UNORM; - case PIPE_FORMAT_R32G32B32A32_UNORM: - return &fetch_R32G32B32A32_UNORM; - - case PIPE_FORMAT_R32_USCALED: - return &fetch_R32_USCALED; - case PIPE_FORMAT_R32G32_USCALED: - return &fetch_R32G32_USCALED; - case PIPE_FORMAT_R32G32B32_USCALED: - return &fetch_R32G32B32_USCALED; - case PIPE_FORMAT_R32G32B32A32_USCALED: - return &fetch_R32G32B32A32_USCALED; - - case PIPE_FORMAT_R32_SNORM: - return &fetch_R32_SNORM; - case PIPE_FORMAT_R32G32_SNORM: - return &fetch_R32G32_SNORM; - case PIPE_FORMAT_R32G32B32_SNORM: - return &fetch_R32G32B32_SNORM; - case PIPE_FORMAT_R32G32B32A32_SNORM: - return &fetch_R32G32B32A32_SNORM; - - case PIPE_FORMAT_R32_SSCALED: - return &fetch_R32_SSCALED; - case PIPE_FORMAT_R32G32_SSCALED: - return &fetch_R32G32_SSCALED; - case PIPE_FORMAT_R32G32B32_SSCALED: - return &fetch_R32G32B32_SSCALED; - case PIPE_FORMAT_R32G32B32A32_SSCALED: - return &fetch_R32G32B32A32_SSCALED; - - case PIPE_FORMAT_R16_UNORM: - return &fetch_R16_UNORM; - case PIPE_FORMAT_R16G16_UNORM: - return &fetch_R16G16_UNORM; - case PIPE_FORMAT_R16G16B16_UNORM: - return &fetch_R16G16B16_UNORM; - case PIPE_FORMAT_R16G16B16A16_UNORM: - return &fetch_R16G16B16A16_UNORM; - - case PIPE_FORMAT_R16_USCALED: - return &fetch_R16_USCALED; - case PIPE_FORMAT_R16G16_USCALED: - return &fetch_R16G16_USCALED; - case PIPE_FORMAT_R16G16B16_USCALED: - return &fetch_R16G16B16_USCALED; - case PIPE_FORMAT_R16G16B16A16_USCALED: - return &fetch_R16G16B16A16_USCALED; - - case PIPE_FORMAT_R16_SNORM: - return &fetch_R16_SNORM; - case PIPE_FORMAT_R16G16_SNORM: - return &fetch_R16G16_SNORM; - case PIPE_FORMAT_R16G16B16_SNORM: - return &fetch_R16G16B16_SNORM; - case PIPE_FORMAT_R16G16B16A16_SNORM: - return &fetch_R16G16B16A16_SNORM; - - case PIPE_FORMAT_R16_SSCALED: - return &fetch_R16_SSCALED; - case PIPE_FORMAT_R16G16_SSCALED: - return &fetch_R16G16_SSCALED; - case PIPE_FORMAT_R16G16B16_SSCALED: - return &fetch_R16G16B16_SSCALED; - case PIPE_FORMAT_R16G16B16A16_SSCALED: - return &fetch_R16G16B16A16_SSCALED; - - case PIPE_FORMAT_R8_UNORM: - return &fetch_R8_UNORM; - case PIPE_FORMAT_R8G8_UNORM: - return &fetch_R8G8_UNORM; - case PIPE_FORMAT_R8G8B8_UNORM: - return &fetch_R8G8B8_UNORM; - case PIPE_FORMAT_R8G8B8A8_UNORM: - return &fetch_R8G8B8A8_UNORM; - - case PIPE_FORMAT_R8_USCALED: - return &fetch_R8_USCALED; - case PIPE_FORMAT_R8G8_USCALED: - return &fetch_R8G8_USCALED; - case PIPE_FORMAT_R8G8B8_USCALED: - return &fetch_R8G8B8_USCALED; - case PIPE_FORMAT_R8G8B8A8_USCALED: - return &fetch_R8G8B8A8_USCALED; - - case PIPE_FORMAT_R8_SNORM: - return &fetch_R8_SNORM; - case PIPE_FORMAT_R8G8_SNORM: - return &fetch_R8G8_SNORM; - case PIPE_FORMAT_R8G8B8_SNORM: - return &fetch_R8G8B8_SNORM; - case PIPE_FORMAT_R8G8B8A8_SNORM: - return &fetch_R8G8B8A8_SNORM; - - case PIPE_FORMAT_R8_SSCALED: - return &fetch_R8_SSCALED; - case PIPE_FORMAT_R8G8_SSCALED: - return &fetch_R8G8_SSCALED; - case PIPE_FORMAT_R8G8B8_SSCALED: - return &fetch_R8G8B8_SSCALED; - case PIPE_FORMAT_R8G8B8A8_SSCALED: - return &fetch_R8G8B8A8_SSCALED; - - case PIPE_FORMAT_B8G8R8A8_UNORM: - return &fetch_B8G8R8A8_UNORM; - - case PIPE_FORMAT_A8R8G8B8_UNORM: - return &fetch_A8R8G8B8_UNORM; - - case PIPE_FORMAT_R32_FIXED: - return &fetch_R32_FIXED; - case PIPE_FORMAT_R32G32_FIXED: - return &fetch_R32G32_FIXED; - case PIPE_FORMAT_R32G32B32_FIXED: - return &fetch_R32G32B32_FIXED; - case PIPE_FORMAT_R32G32B32A32_FIXED: - return &fetch_R32G32B32A32_FIXED; - - default: - assert(0); - return &fetch_NULL; - } -} - - - - static emit_func get_emit_func( enum pipe_format format ) { - /* silence warnings */ - (void) emit_R32G32B32A32_FIXED; - (void) emit_R32G32B32_FIXED; - (void) emit_R32G32_FIXED; - (void) emit_R32_FIXED; - switch (format) { case PIPE_FORMAT_R64_FLOAT: return &emit_R64_FLOAT; @@ -565,60 +362,116 @@ static emit_func get_emit_func( enum pipe_format format ) } } - - -/** - * Fetch vertex attributes for 'count' vertices. - */ -static void PIPE_CDECL generic_run_elts( struct translate *translate, - const unsigned *elts, - unsigned count, +static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg, + unsigned elt, unsigned instance_id, - void *output_buffer ) + void *vert ) { - struct translate_generic *tg = translate_generic(translate); - char *vert = output_buffer; unsigned nr_attrs = tg->nr_attrib; unsigned attr; - unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ - for (i = 0; i < count; i++) { - unsigned elt = *elts++; + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset; - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - const char *src; + if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { + const uint8_t *src; unsigned index; - - char *dst = (vert + - tg->attrib[attr].output_offset); + int copy_size; if (tg->attrib[attr].instance_divisor) { index = instance_id / tg->attrib[attr].instance_divisor; - } else { + } + else { index = elt; } + /* clamp to void going out of bounds */ index = MIN2(index, tg->attrib[attr].max_index); src = tg->attrib[attr].input_ptr + tg->attrib[attr].input_stride * index; - tg->attrib[attr].fetch( src, data ); + copy_size = tg->attrib[attr].copy_size; + if(likely(copy_size >= 0)) + memcpy(dst, src, copy_size); + else + { + tg->attrib[attr].fetch( data, src, 0, 0 ); + + if (0) + debug_printf("Fetch linear attr %d from %p stride %d index %d: " + " %f, %f, %f, %f \n", + attr, + tg->attrib[attr].input_ptr, + tg->attrib[attr].input_stride, + index, + data[0], data[1],data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } + } else { + if(likely(tg->attrib[attr].copy_size >= 0)) + memcpy(data, &instance_id, 4); + else + { + data[0] = (float)instance_id; + tg->attrib[attr].emit( data, dst ); + } + } + } +} + +/** + * Fetch vertex attributes for 'count' vertices. + */ +static void PIPE_CDECL generic_run_elts( struct translate *translate, + const unsigned *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; + + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} - if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n", - i, elt, attr, data[0], data[1], data[2], data[3]); +static void PIPE_CDECL generic_run_elts16( struct translate *translate, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; - tg->attrib[attr].emit( data, dst ); - } - + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); vert += tg->translate.key.output_stride; } } +static void PIPE_CDECL generic_run_elts8( struct translate *translate, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} static void PIPE_CDECL generic_run( struct translate *translate, unsigned start, @@ -628,44 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - unsigned elt = start + i; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - - char *dst = (vert + - tg->attrib[attr].output_offset); - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const char *src; - - if (tg->attrib[attr].instance_divisor) { - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * - (instance_id / tg->attrib[attr].instance_divisor); - } else { - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * elt; - } - - tg->attrib[attr].fetch( src, data ); - } else { - data[0] = (float)instance_id; - } - - if (0) debug_printf("vert %d attr %d: %f %f %f %f\n", - i, attr, data[0], data[1], data[2], data[3]); - - tg->attrib[attr].emit( data, dst ); - } - + generic_run_one(tg, start + i, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -683,7 +502,7 @@ static void generic_set_buffer( struct translate *translate, for (i = 0; i < tg->nr_attrib; i++) { if (tg->attrib[i].buffer == buf) { - tg->attrib[i].input_ptr = ((char *)ptr + + tg->attrib[i].input_ptr = ((const uint8_t *)ptr + tg->attrib[i].input_offset); tg->attrib[i].input_stride = stride; tg->attrib[i].max_index = max_index; @@ -711,19 +530,46 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->translate.release = generic_release; tg->translate.set_buffer = generic_set_buffer; tg->translate.run_elts = generic_run_elts; + tg->translate.run_elts16 = generic_run_elts16; + tg->translate.run_elts8 = generic_run_elts8; tg->translate.run = generic_run; for (i = 0; i < key->nr_elements; i++) { + const struct util_format_description *format_desc = + util_format_description(key->element[i].input_format); + + assert(format_desc); + assert(format_desc->fetch_rgba_float); + tg->attrib[i].type = key->element[i].type; - tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format); + tg->attrib[i].fetch = format_desc->fetch_rgba_float; tg->attrib[i].buffer = key->element[i].input_buffer; tg->attrib[i].input_offset = key->element[i].input_offset; tg->attrib[i].instance_divisor = key->element[i].instance_divisor; - tg->attrib[i].emit = get_emit_func(key->element[i].output_format); tg->attrib[i].output_offset = key->element[i].output_offset; + tg->attrib[i].copy_size = -1; + if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) + { + if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED + || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED) + tg->attrib[i].copy_size = 4; + } + else + { + if(key->element[i].input_format == key->element[i].output_format + && format_desc->block.width == 1 + && format_desc->block.height == 1 + && !(format_desc->block.bits & 7)) + tg->attrib[i].copy_size = format_desc->block.bits >> 3; + } + + if(tg->attrib[i].copy_size < 0) + tg->attrib[i].emit = get_emit_func(key->element[i].output_format); + else + tg->attrib[i].emit = NULL; } tg->nr_attrib = key->nr_elements; @@ -731,3 +577,83 @@ struct translate *translate_generic_create( const struct translate_key *key ) return &tg->translate; } + +boolean translate_generic_is_output_format_supported(enum pipe_format format) +{ + switch(format) + { + case PIPE_FORMAT_R64G64B64A64_FLOAT: return TRUE; + case PIPE_FORMAT_R64G64B64_FLOAT: return TRUE; + case PIPE_FORMAT_R64G64_FLOAT: return TRUE; + case PIPE_FORMAT_R64_FLOAT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_FLOAT: return TRUE; + case PIPE_FORMAT_R32G32B32_FLOAT: return TRUE; + case PIPE_FORMAT_R32G32_FLOAT: return TRUE; + case PIPE_FORMAT_R32_FLOAT: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_USCALED: return TRUE; + case PIPE_FORMAT_R32G32B32_USCALED: return TRUE; + case PIPE_FORMAT_R32G32_USCALED: return TRUE; + case PIPE_FORMAT_R32_USCALED: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SSCALED: return TRUE; + case PIPE_FORMAT_R32G32B32_SSCALED: return TRUE; + case PIPE_FORMAT_R32G32_SSCALED: return TRUE; + case PIPE_FORMAT_R32_SSCALED: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_UNORM: return TRUE; + case PIPE_FORMAT_R32G32B32_UNORM: return TRUE; + case PIPE_FORMAT_R32G32_UNORM: return TRUE; + case PIPE_FORMAT_R32_UNORM: return TRUE; + + case PIPE_FORMAT_R32G32B32A32_SNORM: return TRUE; + case PIPE_FORMAT_R32G32B32_SNORM: return TRUE; + case PIPE_FORMAT_R32G32_SNORM: return TRUE; + case PIPE_FORMAT_R32_SNORM: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_USCALED: return TRUE; + case PIPE_FORMAT_R16G16B16_USCALED: return TRUE; + case PIPE_FORMAT_R16G16_USCALED: return TRUE; + case PIPE_FORMAT_R16_USCALED: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SSCALED: return TRUE; + case PIPE_FORMAT_R16G16B16_SSCALED: return TRUE; + case PIPE_FORMAT_R16G16_SSCALED: return TRUE; + case PIPE_FORMAT_R16_SSCALED: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_UNORM: return TRUE; + case PIPE_FORMAT_R16G16B16_UNORM: return TRUE; + case PIPE_FORMAT_R16G16_UNORM: return TRUE; + case PIPE_FORMAT_R16_UNORM: return TRUE; + + case PIPE_FORMAT_R16G16B16A16_SNORM: return TRUE; + case PIPE_FORMAT_R16G16B16_SNORM: return TRUE; + case PIPE_FORMAT_R16G16_SNORM: return TRUE; + case PIPE_FORMAT_R16_SNORM: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_USCALED: return TRUE; + case PIPE_FORMAT_R8G8B8_USCALED: return TRUE; + case PIPE_FORMAT_R8G8_USCALED: return TRUE; + case PIPE_FORMAT_R8_USCALED: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SSCALED: return TRUE; + case PIPE_FORMAT_R8G8B8_SSCALED: return TRUE; + case PIPE_FORMAT_R8G8_SSCALED: return TRUE; + case PIPE_FORMAT_R8_SSCALED: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_UNORM: return TRUE; + case PIPE_FORMAT_R8G8B8_UNORM: return TRUE; + case PIPE_FORMAT_R8G8_UNORM: return TRUE; + case PIPE_FORMAT_R8_UNORM: return TRUE; + + case PIPE_FORMAT_R8G8B8A8_SNORM: return TRUE; + case PIPE_FORMAT_R8G8B8_SNORM: return TRUE; + case PIPE_FORMAT_R8G8_SNORM: return TRUE; + case PIPE_FORMAT_R8_SNORM: return TRUE; + + case PIPE_FORMAT_A8R8G8B8_UNORM: return TRUE; + case PIPE_FORMAT_B8G8R8A8_UNORM: return TRUE; + default: return FALSE; + } +} diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index ef3aa674a34..f8bf5b46692 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -30,11 +30,12 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_format.h" #include "translate.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "rtasm/rtasm_cpu.h" #include "rtasm/rtasm_x86sse.h" @@ -46,21 +47,9 @@ #define W 3 -typedef void (PIPE_CDECL *run_func)( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer); - -typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - struct translate_buffer { const void *base_ptr; - unsigned stride; + uintptr_t stride; unsigned max_index; }; @@ -73,21 +62,43 @@ struct translate_buffer_varient { #define ELEMENT_BUFFER_INSTANCE_ID 1001 +#define NUM_CONSTS 7 + +enum +{ + CONST_IDENTITY, + CONST_INV_127, + CONST_INV_255, + CONST_INV_32767, + CONST_INV_65535, + CONST_INV_2147483647, + CONST_255 +}; + +#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} +static float consts[NUM_CONSTS][4] = { + {0, 0, 0, 1}, + C(1.0 / 127.0), + C(1.0 / 255.0), + C(1.0 / 32767.0), + C(1.0 / 65535.0), + C(1.0 / 2147483647.0), + C(255.0) +}; +#undef C struct translate_sse { struct translate translate; struct x86_function linear_func; struct x86_function elt_func; + struct x86_function elt16_func; + struct x86_function elt8_func; struct x86_function *func; - boolean loaded_identity; - boolean loaded_255; - boolean loaded_inv_255; - - float identity[4]; - float float_255[4]; - float inv_255[4]; + PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; + int8_t reg_to_const[16]; + int8_t const_to_reg[NUM_CONSTS]; struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; @@ -102,17 +113,16 @@ struct translate_sse { boolean use_instancing; unsigned instance_id; - run_func gen_run; - run_elts_func gen_run_elts; - /* these are actually known values, but putting them in a struct * like this is helpful to keep them in sync across the file. */ struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ + struct x86_reg tmp2_EDX; + struct x86_reg src_ECX; + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg machine_EDI; + struct x86_reg outbuf_EBX; + struct x86_reg count_EBP; /* decrements to zero */ }; static int get_offset( const void *a, const void *b ) @@ -120,281 +130,950 @@ static int get_offset( const void *a, const void *b ) return (const char *)b - (const char *)a; } +static struct x86_reg get_const( struct translate_sse *p, unsigned id) +{ + struct x86_reg reg; + unsigned i; + if(p->const_to_reg[id] >= 0) + return x86_make_reg(file_XMM, p->const_to_reg[id]); -static struct x86_reg get_identity( struct translate_sse *p ) -{ - struct x86_reg reg = x86_make_reg(file_XMM, 6); - - if (!p->loaded_identity) { - p->loaded_identity = TRUE; - p->identity[0] = 0; - p->identity[1] = 0; - p->identity[2] = 0; - p->identity[3] = 1; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->identity[0]))); + for(i = 2; i < 8; ++i) + { + if(p->reg_to_const[i] < 0) + break; } + /* TODO: be smarter here */ + if(i == 8) + --i; + + reg = x86_make_reg(file_XMM, i); + + if(p->reg_to_const[i] >= 0) + p->const_to_reg[p->reg_to_const[i]] = -1; + + p->reg_to_const[i] = id; + p->const_to_reg[id] = i; + + /* TODO: this should happen outside the loop, if possible */ + sse_movaps(p->func, reg, + x86_make_disp(p->machine_EDI, + get_offset(p, &p->consts[id][0]))); + return reg; } -static struct x86_reg get_255( struct translate_sse *p ) +/* load the data in a SSE2 register, padding with zeros */ +static boolean emit_load_sse2( struct translate_sse *p, + struct x86_reg data, + struct x86_reg src, + unsigned size) { - struct x86_reg reg = x86_make_reg(file_XMM, 7); - - if (!p->loaded_255) { - p->loaded_255 = TRUE; - p->float_255[0] = - p->float_255[1] = - p->float_255[2] = - p->float_255[3] = 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->float_255[0]))); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + switch(size) + { + case 1: + x86_movzx8(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 2: + x86_movzx16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 3: + x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); + x86_shl_imm(p->func, tmp, 16); + x86_mov16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 4: + sse2_movd(p->func, data, src); + break; + case 6: + sse2_movd(p->func, data, src); + x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); + sse2_movd(p->func, tmpXMM, tmp); + sse2_punpckldq(p->func, data, tmpXMM); + break; + case 8: + sse2_movq(p->func, data, src); + break; + case 12: + sse2_movq(p->func, data, src); + sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); + sse2_punpcklqdq(p->func, data, tmpXMM); + break; + case 16: + sse2_movdqu(p->func, data, src); + break; + default: + return FALSE; } - - return reg; + return TRUE; } -static struct x86_reg get_inv_255( struct translate_sse *p ) +/* this value can be passed for the out_chans argument */ +#define CHANNELS_0001 5 + +/* this function will load #chans float values, and will + * pad the register with zeroes at least up to out_chans. + * + * If out_chans is set to CHANNELS_0001, then the fourth + * value will be padded with 1. Only pass this value if + * chans < 4 or results are undefined. + */ +static void emit_load_float32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) { - struct x86_reg reg = x86_make_reg(file_XMM, 5); - - if (!p->loaded_inv_255) { - p->loaded_inv_255 = TRUE; - p->inv_255[0] = - p->inv_255[1] = - p->inv_255[2] = - p->inv_255[3] = 1.0f / 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->inv_255[0]))); + switch(chans) + { + case 1: + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 2: + /* 0 0 0 1 + * a b 0 1 + */ + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + sse_movlps(p->func, data, arg0); + break; + case 3: + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 if out_chans == CHANNELS_0001 + * 0 0 c 0/1 + * a b c 0/1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); + break; + case 4: + sse_movups(p->func, data, arg0); + break; } - - return reg; } +/* this function behaves like emit_load_float32, but loads + 64-bit floating point numbers, converting them to 32-bit + ones */ +static void emit_load_float64to32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + switch(chans) + { + case 1: + sse2_movsd(p->func, data, arg0); + if(out_chans > 1) + sse2_cvtpd2ps(p->func, data, data); + else + sse2_cvtsd2ss(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + break; + case 2: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 3: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + if(out_chans > 3) + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + else + sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 4: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + break; + } +} -static void emit_load_R32G32B32A32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) { - sse_movups(p->func, data, arg0); + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, dst_gpr, src_gpr); + else + { + /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movq(p->func, dst_xmm, src_xmm); + else + sse_movlps(p->func, dst_xmm, src_xmm); + } } -static void emit_load_R32G32B32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) { - /* Have to jump through some hoops: - * - * c 0 0 0 - * c 0 0 1 - * 0 0 c 1 - * a b c 1 - */ - sse_movss(p->func, data, x86_make_disp(arg0, 8)); - sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); - sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst_gpr, dst_xmm, src, src); } -static void emit_load_R32G32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) { - /* 0 0 0 1 - * a b 0 1 - */ - sse_movups(p->func, data, get_identity(p) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst, dst, src_gpr, src_xmm); } +static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) +{ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movdqu(p->func, dst, src); + else + sse_movups(p->func, dst, src); +} -static void emit_load_R32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, + * but may or may not be good on older processors + * TODO: may perhaps want to use non-temporal stores here if possible + */ +static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) { - /* a 0 0 0 - * a 0 0 1 - */ - sse_movss(p->func, data, arg0); - sse_orps(p->func, data, get_identity(p) ); + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); + struct x86_reg dataGPR = p->tmp_EAX; + struct x86_reg dataGPR2 = p->tmp2_EDX; + + if(size < 8) + { + switch (size) + { + case 1: + x86_mov8(p->func, dataGPR, src); + x86_mov8(p->func, dst, dataGPR); + break; + case 2: + x86_mov16(p->func, dataGPR, src); + x86_mov16(p->func, dst, dataGPR); + break; + case 3: + x86_mov16(p->func, dataGPR, src); + x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); + x86_mov16(p->func, dst, dataGPR); + x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); + break; + case 4: + x86_mov(p->func, dataGPR, src); + x86_mov(p->func, dst, dataGPR); + break; + case 6: + x86_mov(p->func, dataGPR, src); + x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); + x86_mov(p->func, dst, dataGPR); + x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); + break; + } + } + else if(!(x86_target_caps(p->func) & X86_SSE)) + { + unsigned i = 0; + assert((size & 3) == 0); + for(i = 0; i < size; i += 4) + { + x86_mov(p->func, dataGPR, x86_make_disp(src, i)); + x86_mov(p->func, x86_make_disp(dst, i), dataGPR); + } + } + else + { + switch(size) + { + case 8: + emit_load64(p, dataGPR, dataXMM, src); + emit_store64(p, dst, dataGPR, dataXMM); + break; + case 12: + emit_load64(p, dataGPR2, dataXMM, src); + x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); + emit_store64(p, dst, dataGPR2, dataXMM); + x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); + break; + case 16: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dst, dataXMM); + break; + case 24: + emit_mov128(p, dataXMM, src); + emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); + break; + case 32: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); + break; + default: + assert(0); + } + } } +static boolean translate_attr_convert( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) -static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg data, - struct x86_reg src ) { + const struct util_format_description* input_desc = util_format_description(a->input_format); + const struct util_format_description* output_desc = util_format_description(a->output_format); + unsigned i; + boolean id_swizzle = TRUE; + unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned needed_chans = 0; + unsigned imms[2] = {0, 0x3f800000}; - /* Load and unpack twice: - */ - sse_movss(p->func, data, src); - sse2_punpcklbw(p->func, data, get_identity(p)); - sse2_punpcklbw(p->func, data, get_identity(p)); + if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + return FALSE; - /* Convert to float: - */ - sse2_cvtdq2ps(p->func, data, data); + if(input_desc->channel[0].size & 7) + return FALSE; + if(input_desc->colorspace != output_desc->colorspace) + return FALSE; - /* Scale by 1/255.0 - */ - sse_mulps(p->func, data, get_inv_255(p)); -} + for(i = 1; i < input_desc->nr_channels; ++i) + { + if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + return FALSE; + } + for(i = 1; i < output_desc->nr_channels; ++i) + { + if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + return FALSE; + } + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(output_desc->swizzle[i] < 4) + swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; + } + if((x86_target_caps(p->func) & X86_SSE) && (0 + || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); -static void emit_store_R32G32B32A32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movups(p->func, dest, dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R32G32B32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Emit two, shuffle, emit one. - */ - sse_movlps(p->func, dest, dataXMM); - sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ - sse_movss(p->func, x86_make_disp(dest,8), dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } -static void emit_store_R32G32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movlps(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovzx */ + switch(input_desc->channel[0].size) + { + case 8: + /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 32: /* we lose precision here */ + sse2_psrld_imm(p->func, dataXMM, 1); + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_255); + break; + case 16: + factor = get_const(p, CONST_INV_65535); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + else if(input_desc->channel[0].size == 32) + sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovsx */ + switch(input_desc->channel[0].size) + { + case 8: + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 24); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 16); + break; + case 32: /* we lose precision here */ + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_127); + break; + case 16: + factor = get_const(p, CONST_INV_32767); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + break; + + break; + case UTIL_FORMAT_TYPE_FLOAT: + if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) + return FALSE; + if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) + { + swizzle[3] = UTIL_FORMAT_SWIZZLE_W; + needed_chans = CHANNELS_0001; + } + switch(input_desc->channel[0].size) + { + case 32: + emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + case 64: /* we lose precision here */ + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + default: + return FALSE; + } + break; + default: + return FALSE; + } -static void emit_store_R32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movss(p->func, dest, dataXMM); -} + if(!id_swizzle) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse_movups(p->func, dst, dataXMM); + else + { + if(output_desc->nr_channels >= 2 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse_movlps(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + sse_movss(p->func, dst, dataXMM); + else + x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 2) + { + if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + if(output_desc->nr_channels >= 3) + { + if(output_desc->nr_channels >= 4 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); + else + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); + sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + } + return TRUE; + } + else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 + && output_desc->channel[0].normalized == input_desc->channel[0].normalized + && (0 + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + unsigned imms[2] = {0, 1}; + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Scale by 255.0 - */ - sse_mulps(p->func, dataXMM, get_255(p)); + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } - /* Pack and emit: - */ - sse2_cvtps2dq(p->func, dataXMM, dataXMM); - sse2_packssdw(p->func, dataXMM, dataXMM); - sse2_packuswb(p->func, dataXMM, dataXMM); - sse_movss(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(input_desc->channel[0].normalized) + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + sse2_psrlw_imm(p->func, dataXMM, 1); + } + else + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(input_desc->channel[0].normalized) + { + sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, tmpXMM, dataXMM); + sse2_psllw_imm(p->func, dataXMM, 9); + sse2_psrlw_imm(p->func, dataXMM, 8); + sse2_por(p->func, tmpXMM, dataXMM); + sse2_psrlw_imm(p->func, dataXMM, 7); + sse2_por(p->func, tmpXMM, dataXMM); + { + struct x86_reg t = dataXMM; + dataXMM = tmpXMM; + tmpXMM = t; + } + } + else + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psraw_imm(p->func, dataXMM, 8); + } + break; + default: + assert(0); + } + if(output_desc->channel[0].normalized) + imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; + if(!id_swizzle) + sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse2_movq(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse2_movd(p->func, dst, dataXMM); + else + { + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, dst, tmp); + if(output_desc->nr_channels >= 2) + x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + else + { + if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + if(output_desc->nr_channels >= 2) + { + sse2_movd(p->func, tmp, dataXMM); + x86_shr_imm(p->func, tmp, 16); + x86_mov16(p->func, x86_make_disp(dst, 2), tmp); + } + } + } + if(output_desc->nr_channels >= 3) + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 4), tmp); + if(output_desc->nr_channels >= 4) + { + x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + else + { + if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + sse2_psrlq_imm(p->func, dataXMM, 48); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 6), tmp); + } + } + } + } + } + return TRUE; + } + else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) + { + struct x86_reg tmp = p->tmp_EAX; + unsigned i; + if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) + { + /* TODO: support movbe */ + x86_mov(p->func, tmp, src); + x86_bswap(p->func, tmp); + x86_mov(p->func, dst, tmp); + return TRUE; + } -/* Extended swizzles? Maybe later. - */ -static void emit_swizzle( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg src, - unsigned char shuffle ) -{ - sse_shufps(p->func, dest, src, shuffle); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + switch(output_desc->channel[0].size) + { + case 8: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[0].normalized ? 0xff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[0].normalized ? 0x7f : 1; + break; + default: + return FALSE; + } + } + x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); + } + else + { + x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); + x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); + } + break; + case 16: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3c00; + break; + default: + return FALSE; + } + } + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); + } + else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); + else + { + x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); + x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); + } + break; + case 32: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3f800000; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); + x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); + } + break; + case 64: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned l = 0; + unsigned h = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + h = output_desc->channel[1].normalized ? 0xffffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + h = output_desc->channel[1].normalized ? 0x7fffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + h = 0x3ff00000; + l = 0; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); + x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); + } + else + { + if(x86_target_caps(p->func) & X86_SSE) + { + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); + emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); + x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); + } + } + break; + default: + return FALSE; + } + } + return TRUE; + } + /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ + else if((x86_target_caps(p->func) & X86_SSE2) && + a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 + || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM + || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + /* load */ + sse_movups(p->func, dataXMM, src); -static boolean translate_attr( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg srcECX, - struct x86_reg dstEAX) -{ - struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); - switch (a->input_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_load_R32G32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - break; - default: - return FALSE; - } + /* scale by 255.0 */ + sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); - switch (a->output_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_store_R32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_store_R32G32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_store_R32G32B32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_store_R32G32B32A32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - default: - return FALSE; + /* pack and emit */ + sse2_cvtps2dq(p->func, dataXMM, dataXMM); + sse2_packssdw(p->func, dataXMM, dataXMM); + sse2_packuswb(p->func, dataXMM, dataXMM); + sse2_movd(p->func, dst, dataXMM); + + return TRUE; } - return TRUE; + return FALSE; } +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) +{ + if(a->input_format == a->output_format) + { + emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); + return TRUE; + } + + return translate_attr_convert(p, a, src, dst); +} static boolean init_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { unsigned i; - struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + struct x86_reg instance_id = x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; - if (linear || varient->instance_divisor) { - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + if (!index_size || varient->instance_divisor) { + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); - struct x86_reg elt = p->idx_EBX; + struct x86_reg elt = p->idx_ESI; struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: @@ -406,20 +1085,16 @@ static boolean init_inputs( struct translate_sse *p, x86_mov(p->func, tmp_EAX, instance_id); if (varient->instance_divisor != 1) { - struct x86_reg tmp_EDX = p->machine_EDX; - struct x86_reg tmp_ECX = p->outbuf_ECX; + struct x86_reg tmp_EDX = p->tmp2_EDX; + struct x86_reg tmp_ECX = p->src_ECX; /* TODO: Add x86_shr() to rtasm and use it whenever * instance divisor is power of two. */ - x86_push(p->func, tmp_EDX); - x86_push(p->func, tmp_ECX); x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ - x86_pop(p->func, tmp_ECX); - x86_pop(p->func, tmp_EDX); } } else { x86_mov(p->func, tmp_EAX, elt); @@ -430,16 +1105,23 @@ static boolean init_inputs( struct translate_sse *p, */ x86_imul(p->func, tmp_EAX, buf_stride); + x64_rexw(p->func); x86_add(p->func, tmp_EAX, buf_base_ptr); /* In the linear case, keep the buffer pointer instead of the * index number. */ - if (linear && p->nr_buffer_varients == 1) + if (!index_size && p->nr_buffer_varients == 1) + { + x64_rexw(p->func); x86_mov(p->func, elt, tmp_EAX); + } else + { + x64_rexw(p->func); x86_mov(p->func, buf_ptr, tmp_EAX); + } } } @@ -448,44 +1130,57 @@ static boolean init_inputs( struct translate_sse *p, static struct x86_reg get_buffer_ptr( struct translate_sse *p, - boolean linear, + unsigned index_size, unsigned var_idx, struct x86_reg elt ) { if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { - return x86_make_disp(p->machine_EDX, + return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); } - if (linear && p->nr_buffer_varients == 1) { - return p->idx_EBX; + if (!index_size && p->nr_buffer_varients == 1) { + return p->idx_ESI; } - else if (linear || p->buffer_varient[var_idx].instance_divisor) { - struct x86_reg ptr = p->tmp_EAX; + else if (!index_size || p->buffer_varient[var_idx].instance_divisor) { + struct x86_reg ptr = p->src_ECX; struct x86_reg buf_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer_varient[var_idx].ptr)); + x64_rexw(p->func); x86_mov(p->func, ptr, buf_ptr); return ptr; } else { - struct x86_reg ptr = p->tmp_EAX; + struct x86_reg ptr = p->src_ECX; const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx]; struct x86_reg buf_stride = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); struct x86_reg buf_base_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].base_ptr)); /* Calculate pointer to current attrib: */ - x86_mov(p->func, ptr, buf_stride); - x86_imul(p->func, ptr, elt); + switch(index_size) + { + case 1: + x86_movzx8(p->func, ptr, elt); + break; + case 2: + x86_movzx16(p->func, ptr, elt); + break; + case 4: + x86_mov(p->func, ptr, elt); + break; + } + x86_imul(p->func, ptr, buf_stride); + x64_rexw(p->func); x86_add(p->func, ptr, buf_base_ptr); return ptr; } @@ -494,39 +1189,43 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, static boolean incr_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { - if (linear && p->nr_buffer_varients == 1) { - struct x86_reg stride = x86_make_disp(p->machine_EDX, + if (!index_size && p->nr_buffer_varients == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride)); if (p->buffer_varient[0].instance_divisor == 0) { - x86_add(p->func, p->idx_EBX, stride); - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); + x64_rexw(p->func); + x86_add(p->func, p->idx_ESI, stride); + sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); } } - else if (linear) { + else if (!index_size) { unsigned i; /* Is this worthwhile?? */ for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); if (varient->instance_divisor == 0) { - x86_mov(p->func, p->tmp_EAX, buf_ptr); - x86_add(p->func, p->tmp_EAX, buf_stride); + x86_mov(p->func, p->tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, p->tmp_EAX, buf_ptr); if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x64_rexw(p->func); x86_mov(p->func, buf_ptr, p->tmp_EAX); } } } else { - x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); + x64_rexw(p->func); + x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); } return TRUE; @@ -551,35 +1250,52 @@ static boolean incr_inputs( struct translate_sse *p, */ static boolean build_vertex_emit( struct translate_sse *p, struct x86_function *func, - boolean linear ) + unsigned index_size ) { int fixup, label; unsigned j; + memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); + memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); - p->idx_EBX = x86_make_reg(file_REG32, reg_BX); - p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - p->machine_EDX = x86_make_reg(file_REG32, reg_DX); - p->count_ESI = x86_make_reg(file_REG32, reg_SI); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->src_ECX = x86_make_reg(file_REG32, reg_CX); p->func = func; - p->loaded_inv_255 = FALSE; - p->loaded_255 = FALSE; - p->loaded_identity = FALSE; x86_init_func(p->func); - /* Push a few regs? - */ - x86_push(p->func, p->idx_EBX); - x86_push(p->func, p->count_ESI); + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + } - /* Load arguments into regs: - */ - x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); - x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); - x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); + x86_push(p->func, p->outbuf_EBX); + x86_push(p->func, p->count_EBP); + +/* on non-Win64 x86-64, these are already in the right registers */ + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_push(p->func, p->machine_EDI); + x86_push(p->func, p->idx_ESI); + + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + } + + x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); + + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + else + x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); /* Load instance ID. */ @@ -588,25 +1304,25 @@ static boolean build_vertex_emit( struct translate_sse *p, p->tmp_EAX, x86_fn_arg(p->func, 4)); x86_mov(p->func, - x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), + x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); } /* Get vertex count, compare to zero */ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); - x86_cmp(p->func, p->count_ESI, p->tmp_EAX); + x86_cmp(p->func, p->count_EBP, p->tmp_EAX); fixup = x86_jcc_forward(p->func, cc_E); /* always load, needed or not: */ - init_inputs(p, linear); + init_inputs(p, index_size); /* Note address for loop jump */ label = x86_get_label(p->func); { - struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); + struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); int last_varient = -1; struct x86_reg vb; @@ -618,30 +1334,31 @@ static boolean build_vertex_emit( struct translate_sse *p, */ if (varient != last_varient) { last_varient = varient; - vb = get_buffer_ptr(p, linear, varient, elt); + vb = get_buffer_ptr(p, index_size, varient, elt); } if (!translate_attr( p, a, x86_make_disp(vb, a->input_offset), - x86_make_disp(p->outbuf_ECX, a->output_offset))) + x86_make_disp(p->outbuf_EBX, a->output_offset))) return FALSE; } /* Next output vertex: */ + x64_rexw(p->func); x86_lea(p->func, - p->outbuf_ECX, - x86_make_disp(p->outbuf_ECX, + p->outbuf_EBX, + x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); /* Incr index */ - incr_inputs( p, linear ); + incr_inputs( p, index_size ); } /* decr count, loop if not zero */ - x86_dec(p->func, p->count_ESI); + x86_dec(p->func, p->count_EBP); x86_jcc(p->func, cc_NZ, label); /* Exit mmx state? @@ -656,8 +1373,20 @@ static boolean build_vertex_emit( struct translate_sse *p, /* Pop regs and return */ - x86_pop(p->func, p->count_ESI); - x86_pop(p->func, p->idx_EBX); + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_pop(p->func, p->idx_ESI); + x86_pop(p->func, p->machine_EDI); + } + + x86_pop(p->func, p->count_EBP); + x86_pop(p->func, p->outbuf_EBX); + + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); + sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); + } x86_ret(p->func); return TRUE; @@ -697,37 +1426,7 @@ static void translate_sse_release( struct translate *translate ) x86_release_func( &p->linear_func ); x86_release_func( &p->elt_func ); - FREE(p); -} - -static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run_elts( translate, - elts, - count, - instance_id, - output_buffer); -} - -static void PIPE_CDECL translate_sse_run( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run( translate, - start, - count, - instance_id, - output_buffer); + os_free_aligned(p); } @@ -736,18 +1435,19 @@ struct translate *translate_sse2_create( const struct translate_key *key ) struct translate_sse *p = NULL; unsigned i; - if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) + /* this is misnamed, it actually refers to whether rtasm is enabled or not */ + if (!rtasm_cpu_has_sse()) goto fail; - p = CALLOC_STRUCT( translate_sse ); + p = os_malloc_aligned(sizeof(struct translate_sse), 16); if (p == NULL) goto fail; + memset(p, 0, sizeof(*p)); + memcpy(p->consts, consts, sizeof(consts)); p->translate.key = *key; p->translate.release = translate_sse_release; p->translate.set_buffer = translate_sse_set_buffer; - p->translate.run_elts = translate_sse_run_elts; - p->translate.run = translate_sse_run; for (i = 0; i < key->nr_elements; i++) { if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { @@ -783,18 +1483,32 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); - if (!build_vertex_emit(p, &p->linear_func, TRUE)) + if (!build_vertex_emit(p, &p->linear_func, 0)) + goto fail; + + if (!build_vertex_emit(p, &p->elt_func, 4)) + goto fail; + + if (!build_vertex_emit(p, &p->elt16_func, 2)) + goto fail; + + if (!build_vertex_emit(p, &p->elt8_func, 1)) + goto fail; + + p->translate.run = (void*)x86_get_func(&p->linear_func); + if (p->translate.run == NULL) goto fail; - if (!build_vertex_emit(p, &p->elt_func, FALSE)) + p->translate.run_elts = (void*)x86_get_func(&p->elt_func); + if (p->translate.run_elts == NULL) goto fail; - p->gen_run = (run_func)x86_get_func(&p->linear_func); - if (p->gen_run == NULL) + p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func); + if (p->translate.run_elts16 == NULL) goto fail; - p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); - if (p->gen_run_elts == NULL) + p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func); + if (p->translate.run_elts8 == NULL) goto fail; return &p->translate; diff --git a/src/gallium/auxiliary/util/u_atomic.h b/src/gallium/auxiliary/util/u_atomic.h index a1568233906..8434491a421 100644 --- a/src/gallium/auxiliary/util/u_atomic.h +++ b/src/gallium/auxiliary/util/u_atomic.h @@ -29,6 +29,8 @@ #define PIPE_ATOMIC_ASM_MSVC_X86 #elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86)) #define PIPE_ATOMIC_ASM_GCC_X86 +#elif (defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86_64)) +#define PIPE_ATOMIC_ASM_GCC_X86_64 #elif defined(PIPE_CC_GCC) && (PIPE_CC_GCC_VERSION >= 401) #define PIPE_ATOMIC_GCC_INTRINSIC #else @@ -36,6 +38,51 @@ #endif +#if defined(PIPE_ATOMIC_ASM_GCC_X86_64) +#define PIPE_ATOMIC "GCC x86_64 assembly" + +#ifdef __cplusplus +extern "C" { +#endif + +#define p_atomic_set(_v, _i) (*(_v) = (_i)) +#define p_atomic_read(_v) (*(_v)) + +static INLINE boolean +p_atomic_dec_zero(int32_t *v) +{ + unsigned char c; + + __asm__ __volatile__("lock; decl %0; sete %1":"+m"(*v), "=qm"(c) + ::"memory"); + + return c != 0; +} + +static INLINE void +p_atomic_inc(int32_t *v) +{ + __asm__ __volatile__("lock; incl %0":"+m"(*v)); +} + +static INLINE void +p_atomic_dec(int32_t *v) +{ + __asm__ __volatile__("lock; decl %0":"+m"(*v)); +} + +static INLINE int32_t +p_atomic_cmpxchg(int32_t *v, int32_t old, int32_t _new) +{ + return __sync_val_compare_and_swap(v, old, _new); +} + +#ifdef __cplusplus +} +#endif + +#endif /* PIPE_ATOMIC_ASM_GCC_X86_64 */ + #if defined(PIPE_ATOMIC_ASM_GCC_X86) diff --git a/src/gallium/auxiliary/util/u_bitmask.h b/src/gallium/auxiliary/util/u_bitmask.h index 87f1110296a..98b85ddecd5 100644 --- a/src/gallium/auxiliary/util/u_bitmask.h +++ b/src/gallium/auxiliary/util/u_bitmask.h @@ -36,6 +36,9 @@ #define U_HANDLE_BITMASK_H_ +#include "pipe/p_compiler.h" + + #ifdef __cplusplus extern "C" { #endif diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index e45310b9bb7..dfb142b9e1c 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ b/src/gallium/auxiliary/util/u_blit.c @@ -47,8 +47,6 @@ #include "util/u_memory.h" #include "util/u_sampler.h" #include "util/u_simple_shaders.h" -#include "util/u_surface.h" -#include "util/u_rect.h" #include "cso_cache/cso_context.h" @@ -59,15 +57,18 @@ struct blit_state struct cso_context *cso; struct pipe_blend_state blend; - struct pipe_depth_stencil_alpha_state depthstencil; + struct pipe_depth_stencil_alpha_state depthstencil_keep; + struct pipe_depth_stencil_alpha_state depthstencil_write; struct pipe_rasterizer_state rasterizer; struct pipe_sampler_state sampler; struct pipe_viewport_state viewport; struct pipe_clip_state clip; struct pipe_vertex_element velem[2]; + enum pipe_texture_target internal_target; void *vs; void *fs[TGSI_WRITEMASK_XYZW + 1]; + void *fs_depth; struct pipe_resource *vbuf; /**< quad vertices */ unsigned vbuf_slot; @@ -98,12 +99,15 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->blend.rt[0].colormask = PIPE_MASK_RGBA; /* no-op depth/stencil/alpha */ - memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil)); + memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep)); + memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write)); + ctx->depthstencil_write.depth.enabled = 1; + ctx->depthstencil_write.depth.writemask = 1; + ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS; /* rasterizer */ memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer)); - ctx->rasterizer.front_winding = PIPE_WINDING_CW; - ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; + ctx->rasterizer.cull_face = PIPE_FACE_NONE; ctx->rasterizer.gl_rasterization_rules = 1; /* samplers */ @@ -114,7 +118,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; ctx->sampler.min_img_filter = 0; /* set later */ ctx->sampler.mag_img_filter = 0; /* set later */ - ctx->sampler.normalized_coords = 1; /* vertex elements state */ memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2); @@ -138,7 +141,8 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) /* fragment shader */ ctx->fs[TGSI_WRITEMASK_XYZW] = - util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D); + util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR); ctx->vbuf = NULL; /* init vertex data that doesn't change */ @@ -148,6 +152,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->vertices[i][1][3] = 1.0f; /* q */ } + if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES)) + ctx->internal_target = PIPE_TEXTURE_2D; + else + ctx->internal_target = PIPE_TEXTURE_RECT; + return ctx; } @@ -167,6 +176,9 @@ util_destroy_blit(struct blit_state *ctx) if (ctx->fs[i]) pipe->delete_fs_state(pipe, ctx->fs[i]); + if (ctx->fs_depth) + pipe->delete_fs_state(pipe, ctx->fs_depth); + pipe_resource_reference(&ctx->vbuf, NULL); FREE(ctx); @@ -192,7 +204,6 @@ get_next_slot( struct blit_state *ctx ) return ctx->vbuf_slot++ * sizeof ctx->vertices; } - @@ -275,14 +286,15 @@ regions_overlap(int srcX0, int srcY0, * \param writemask controls which channels in the dest surface are sourced * from the src surface. Disabled channels are sourced * from (0,0,0,1). - * XXX need some control over blitting Z and/or stencil. + * XXX need some control over blitting stencil. */ void util_blit_pixels_writemask(struct blit_state *ctx, - struct pipe_surface *src, - struct pipe_sampler_view *src_sampler_view, + struct pipe_resource *src_tex, + struct pipe_subresource srcsub, int srcX0, int srcY0, int srcX1, int srcY1, + int srcZ0, struct pipe_surface *dst, int dstX0, int dstY0, int dstX1, int dstY1, @@ -292,23 +304,25 @@ util_blit_pixels_writemask(struct blit_state *ctx, struct pipe_context *pipe = ctx->pipe; struct pipe_screen *screen = pipe->screen; struct pipe_sampler_view *sampler_view = NULL; + struct pipe_sampler_view sv_templ; struct pipe_framebuffer_state fb; const int srcW = abs(srcX1 - srcX0); const int srcH = abs(srcY1 - srcY0); unsigned offset; - boolean overlap; + boolean overlap, dst_is_depth; float s0, t0, s1, t1; + boolean normalized; assert(filter == PIPE_TEX_MIPFILTER_NEAREST || filter == PIPE_TEX_MIPFILTER_LINEAR); - assert(screen->is_format_supported(screen, src->format, PIPE_TEXTURE_2D, - PIPE_BIND_SAMPLER_VIEW, 0)); - assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, - PIPE_BIND_RENDER_TARGET, 0)); + assert(srcsub.level <= src_tex->last_level); /* do the regions overlap? */ - overlap = util_same_surface(src, dst) && + overlap = src_tex == dst->texture && + dst->face == srcsub.face && + dst->level == srcsub.level && + dst->zslice == srcZ0 && regions_overlap(srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1); @@ -317,8 +331,7 @@ util_blit_pixels_writemask(struct blit_state *ctx, * no overlapping. * Filter mode should not matter since there's no stretching. */ - if (pipe->surface_copy && - dst->format == src->format && + if (dst->format == src_tex->format && srcX0 < srcX1 && dstX0 < dstX1 && srcY0 < srcY1 && @@ -326,29 +339,36 @@ util_blit_pixels_writemask(struct blit_state *ctx, (dstX1 - dstX0) == (srcX1 - srcX0) && (dstY1 - dstY0) == (srcY1 - srcY0) && !overlap) { - pipe->surface_copy(pipe, - dst, dstX0, dstY0, /* dest */ - src, srcX0, srcY0, /* src */ - srcW, srcH); /* size */ + struct pipe_subresource subdst; + subdst.face = dst->face; + subdst.level = dst->level; + pipe->resource_copy_region(pipe, + dst->texture, subdst, + dstX0, dstY0, dst->zslice,/* dest */ + src_tex, srcsub, + srcX0, srcY0, srcZ0,/* src */ + srcW, srcH); /* size */ return; } - - assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, - PIPE_BIND_RENDER_TARGET, 0)); /* Create a temporary texture when src and dest alias or when src - * is anything other than a single-level 2d texture. + * is anything other than a 2d texture. + * XXX should just use appropriate shader to access 1d / 3d slice / cube face, + * much like the u_blitter code does (should be pretty trivial). * * This can still be improved upon. */ - if (util_same_surface(src, dst) || - src->texture->target != PIPE_TEXTURE_2D || - src->texture->last_level != 0) + if ((src_tex == dst->texture && + dst->face == srcsub.face && + dst->level == srcsub.level && + dst->zslice == srcZ0) || + (src_tex->target != PIPE_TEXTURE_2D && + src_tex->target != PIPE_TEXTURE_RECT)) { struct pipe_resource texTemp; struct pipe_resource *tex; struct pipe_sampler_view sv_templ; - struct pipe_surface *texSurf; + struct pipe_subresource texsub; const int srcLeft = MIN2(srcX0, srcX1); const int srcTop = MIN2(srcY0, srcY1); @@ -368,8 +388,8 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* create temp texture */ memset(&texTemp, 0, sizeof(texTemp)); - texTemp.target = PIPE_TEXTURE_2D; - texTemp.format = src->format; + texTemp.target = ctx->internal_target; + texTemp.format = src_tex->format; texTemp.last_level = 0; texTemp.width0 = srcW; texTemp.height0 = srcH; @@ -380,50 +400,69 @@ util_blit_pixels_writemask(struct blit_state *ctx, if (!tex) return; + texsub.face = 0; + texsub.level = 0; + /* load temp texture */ + pipe->resource_copy_region(pipe, + tex, texsub, 0, 0, 0, /* dest */ + src_tex, srcsub, srcLeft, srcTop, srcZ0, /* src */ + srcW, srcH); /* size */ + + normalized = tex->target != PIPE_TEXTURE_RECT; + if(normalized) { + s0 = 0.0f; + s1 = 1.0f; + t0 = 0.0f; + t1 = 1.0f; + } + else { + s0 = 0; + s1 = srcW; + t0 = 0; + t1 = srcH; + } + u_sampler_view_default_template(&sv_templ, tex, tex->format); + sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ); - sampler_view = ctx->pipe->create_sampler_view(ctx->pipe, tex, &sv_templ); if (!sampler_view) { pipe_resource_reference(&tex, NULL); return; } - - texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, - PIPE_BIND_BLIT_DESTINATION); - - /* load temp texture */ - if (pipe->surface_copy) { - pipe->surface_copy(pipe, - texSurf, 0, 0, /* dest */ - src, srcLeft, srcTop, /* src */ - srcW, srcH); /* size */ - } else { - util_surface_copy(pipe, FALSE, - texSurf, 0, 0, /* dest */ - src, srcLeft, srcTop, /* src */ - srcW, srcH); /* size */ - } - - /* free the surface, update the texture if necessary. - */ - pipe_surface_reference(&texSurf, NULL); - s0 = 0.0f; - s1 = 1.0f; - t0 = 0.0f; - t1 = 1.0f; - pipe_resource_reference(&tex, NULL); } else { - pipe_sampler_view_reference(&sampler_view, src_sampler_view); - s0 = srcX0 / (float)src->texture->width0; - s1 = srcX1 / (float)src->texture->width0; - t0 = srcY0 / (float)src->texture->height0; - t1 = srcY1 / (float)src->texture->height0; + u_sampler_view_default_template(&sv_templ, src_tex, src_tex->format); + sv_templ.first_level = sv_templ.last_level = srcsub.level; + sampler_view = pipe->create_sampler_view(pipe, src_tex, &sv_templ); + + if (!sampler_view) { + return; + } + + s0 = srcX0; + s1 = srcX1; + t0 = srcY0; + t1 = srcY1; + normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT; + if(normalized) + { + s0 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level)); + s1 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level)); + t0 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + t1 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + } } - + dst_is_depth = util_format_is_depth_or_stencil(dst->format); + assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target, + sampler_view->texture->nr_samples, + PIPE_BIND_SAMPLER_VIEW, 0)); + assert(screen->is_format_supported(screen, dst->format, ctx->internal_target, + dst->texture->nr_samples, + dst_is_depth ? PIPE_BIND_DEPTH_STENCIL : + PIPE_BIND_RENDER_TARGET, 0)); /* save state (restored below) */ cso_save_blend(ctx->cso); cso_save_depth_stencil_alpha(ctx->cso); @@ -439,14 +478,20 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* set misc state we care about */ cso_set_blend(ctx->cso, &ctx->blend); - cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_depth_stencil_alpha(ctx->cso, + dst_is_depth ? &ctx->depthstencil_write : + &ctx->depthstencil_keep); cso_set_rasterizer(ctx->cso, &ctx->rasterizer); cso_set_clip(ctx->cso, &ctx->clip); cso_set_vertex_elements(ctx->cso, 2, ctx->velem); /* sampler */ + ctx->sampler.normalized_coords = normalized; ctx->sampler.min_img_filter = filter; ctx->sampler.mag_img_filter = filter; + /* we've limited this already with the sampler view but you never know... */ + ctx->sampler.min_lod = srcsub.level; + ctx->sampler.max_lod = srcsub.level; cso_single_sampler(ctx->cso, 0, &ctx->sampler); cso_single_sampler_done(ctx->cso); @@ -464,21 +509,35 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* texture */ cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view); - if (ctx->fs[writemask] == NULL) - ctx->fs[writemask] = - util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D, - writemask); - /* shaders */ - cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]); + if (dst_is_depth) { + if (ctx->fs_depth == NULL) + ctx->fs_depth = + util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth); + } else { + if (ctx->fs[writemask] == NULL) + ctx->fs[writemask] = + util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR, + writemask); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]); + } cso_set_vertex_shader_handle(ctx->cso, ctx->vs); /* drawing dest */ memset(&fb, 0, sizeof(fb)); fb.width = dst->width; fb.height = dst->height; - fb.nr_cbufs = 1; - fb.cbufs[0] = dst; + if (dst_is_depth) { + fb.zsbuf = dst; + } else { + fb.nr_cbufs = 1; + fb.cbufs[0] = dst; + } cso_set_framebuffer(ctx->cso, &fb); /* draw quad */ @@ -515,18 +574,21 @@ util_blit_pixels_writemask(struct blit_state *ctx, void util_blit_pixels(struct blit_state *ctx, - struct pipe_surface *src, - struct pipe_sampler_view *src_sampler_view, + struct pipe_resource *src_tex, + struct pipe_subresource srcsub, int srcX0, int srcY0, int srcX1, int srcY1, + int srcZ, struct pipe_surface *dst, int dstX0, int dstY0, int dstX1, int dstY1, float z, uint filter ) { - util_blit_pixels_writemask( ctx, src, src_sampler_view, + util_blit_pixels_writemask( ctx, src_tex, + srcsub, srcX0, srcY0, srcX1, srcY1, + srcZ, dst, dstX0, dstY0, dstX1, dstY1, @@ -548,7 +610,6 @@ void util_blit_flush( struct blit_state *ctx ) /** * Copy pixel block from src texture to dst surface. - * Overlapping regions are acceptable. * * XXX Should support selection of level. * XXX need some control over blitting Z and/or stencil. @@ -563,6 +624,7 @@ util_blit_pixels_tex(struct blit_state *ctx, int dstX1, int dstY1, float z, uint filter) { + boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT; struct pipe_framebuffer_state fb; float s0, t0, s1, t1; unsigned offset; @@ -575,13 +637,22 @@ util_blit_pixels_tex(struct blit_state *ctx, assert(tex->width0 != 0); assert(tex->height0 != 0); - s0 = srcX0 / (float)tex->width0; - s1 = srcX1 / (float)tex->width0; - t0 = srcY0 / (float)tex->height0; - t1 = srcY1 / (float)tex->height0; + s0 = srcX0; + s1 = srcX1; + t0 = srcY0; + t1 = srcY1; + + if(normalized) + { + s0 /= (float)tex->width0; + s1 /= (float)tex->width0; + t0 /= (float)tex->height0; + t1 /= (float)tex->height0; + } assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format, PIPE_TEXTURE_2D, + dst->texture->nr_samples, PIPE_BIND_RENDER_TARGET, 0)); @@ -599,12 +670,13 @@ util_blit_pixels_tex(struct blit_state *ctx, /* set misc state we care about */ cso_set_blend(ctx->cso, &ctx->blend); - cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep); cso_set_rasterizer(ctx->cso, &ctx->rasterizer); cso_set_clip(ctx->cso, &ctx->clip); cso_set_vertex_elements(ctx->cso, 2, ctx->velem); /* sampler */ + ctx->sampler.normalized_coords = normalized; ctx->sampler.min_img_filter = filter; ctx->sampler.mag_img_filter = filter; cso_single_sampler(ctx->cso, 0, &ctx->sampler); diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h index 464ff9aaced..b8a0dfce13f 100644 --- a/src/gallium/auxiliary/util/u_blit.h +++ b/src/gallium/auxiliary/util/u_blit.h @@ -30,18 +30,20 @@ #define U_BLIT_H +#include "pipe/p_compiler.h" + + #ifdef __cplusplus extern "C" { #endif +struct cso_context; struct pipe_context; -struct pipe_surface; struct pipe_resource; -struct cso_context; - - -struct blit_state; +struct pipe_sampler_view; +struct pipe_subresource; +struct pipe_surface; extern struct blit_state * @@ -52,10 +54,11 @@ util_destroy_blit(struct blit_state *ctx); extern void util_blit_pixels(struct blit_state *ctx, - struct pipe_surface *src, - struct pipe_sampler_view *src_sampler_view, + struct pipe_resource *src_tex, + struct pipe_subresource srcsub, int srcX0, int srcY0, int srcX1, int srcY1, + int srcZ0, struct pipe_surface *dst, int dstX0, int dstY0, int dstX1, int dstY1, @@ -63,10 +66,11 @@ util_blit_pixels(struct blit_state *ctx, void util_blit_pixels_writemask(struct blit_state *ctx, - struct pipe_surface *src, - struct pipe_sampler_view *src_sampler_view, + struct pipe_resource *src_tex, + struct pipe_subresource srcsub, int srcX0, int srcY0, int srcX1, int srcY1, + int srcZ0, struct pipe_surface *dst, int dstX0, int dstY0, int dstX1, int dstY1, diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index 956aedc8a15..a163f93cb82 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -26,8 +26,8 @@ /** * @file - * Blitter utility to facilitate acceleration of the clear, surface_copy, - * and surface_fill functions. + * Blitter utility to facilitate acceleration of the clear, clear_render_target, clear_depth_stencil + * resource_copy_region functions. * * @author Marek Olšák */ @@ -43,19 +43,17 @@ #include "util/u_math.h" #include "util/u_blitter.h" #include "util/u_draw_quad.h" -#include "util/u_pack_color.h" -#include "util/u_rect.h" #include "util/u_sampler.h" #include "util/u_simple_shaders.h" +#include "util/u_surface.h" #include "util/u_texture.h" #define INVALID_PTR ((void*)~0) struct blitter_context_priv { - struct blitter_context blitter; + struct blitter_context base; - struct pipe_context *pipe; /**< pipe context */ struct pipe_resource *vbuf; /**< quad */ float vertices[4][2][4]; /**< {pos, color} or {pos, texcoord} */ @@ -69,8 +67,8 @@ struct blitter_context_priv void *vs_tex; /**< Vertex shader which passes {pos, texcoord} to the output.*/ /* Fragment shaders. */ - /* FS which outputs a color to multiple color buffers. */ - void *fs_col[PIPE_MAX_COLOR_BUFS]; + /* The shader at index i outputs color to color buffers 0,1,...,i-1. */ + void *fs_col[PIPE_MAX_COLOR_BUFS+1]; /* FS which outputs a color from a texture, where the index is PIPE_TEXTURE_* to be sampled. */ @@ -88,24 +86,36 @@ struct blitter_context_priv void *dsa_write_depth_stencil; void *dsa_write_depth_keep_stencil; void *dsa_keep_depth_stencil; + void *dsa_keep_depth_write_stencil; + void *dsa_flush_depth_stencil; void *velem_state; /* Sampler state for clamping to a miplevel. */ - void *sampler_state[PIPE_MAX_TEXTURE_LEVELS]; + void *sampler_state[PIPE_MAX_TEXTURE_LEVELS * 2]; /* Rasterizer state. */ void *rs_state; - struct pipe_sampler_view *sampler_view; - /* Viewport state. */ struct pipe_viewport_state viewport; /* Clip state. */ struct pipe_clip_state clip; + + /* Destination surface dimensions. */ + unsigned dst_width; + unsigned dst_height; }; +static void blitter_draw_rectangle(struct blitter_context *blitter, + unsigned x, unsigned y, + unsigned width, unsigned height, + float depth, + enum blitter_attrib_type type, + const float attrib[4]); + + struct blitter_context *util_blitter_create(struct pipe_context *pipe) { struct blitter_context_priv *ctx; @@ -120,19 +130,20 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) if (!ctx) return NULL; - ctx->pipe = pipe; + ctx->base.pipe = pipe; + ctx->base.draw_rectangle = blitter_draw_rectangle; /* init state objects for them to be considered invalid */ - ctx->blitter.saved_blend_state = INVALID_PTR; - ctx->blitter.saved_dsa_state = INVALID_PTR; - ctx->blitter.saved_rs_state = INVALID_PTR; - ctx->blitter.saved_fs = INVALID_PTR; - ctx->blitter.saved_vs = INVALID_PTR; - ctx->blitter.saved_velem_state = INVALID_PTR; - ctx->blitter.saved_fb_state.nr_cbufs = ~0; - ctx->blitter.saved_num_sampler_views = ~0; - ctx->blitter.saved_num_sampler_states = ~0; - ctx->blitter.saved_num_vertex_buffers = ~0; + ctx->base.saved_blend_state = INVALID_PTR; + ctx->base.saved_dsa_state = INVALID_PTR; + ctx->base.saved_rs_state = INVALID_PTR; + ctx->base.saved_fs = INVALID_PTR; + ctx->base.saved_vs = INVALID_PTR; + ctx->base.saved_velem_state = INVALID_PTR; + ctx->base.saved_fb_state.nr_cbufs = ~0; + ctx->base.saved_num_sampler_views = ~0; + ctx->base.saved_num_sampler_states = ~0; + ctx->base.saved_num_vertex_buffers = ~0; /* blend state objects */ memset(&blend, 0, sizeof(blend)); @@ -146,6 +157,10 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) ctx->dsa_keep_depth_stencil = pipe->create_depth_stencil_alpha_state(pipe, &dsa); + dsa.depth.writemask = 1; + ctx->dsa_flush_depth_stencil = + pipe->create_depth_stencil_alpha_state(pipe, &dsa); + dsa.depth.enabled = 1; dsa.depth.writemask = 1; dsa.depth.func = PIPE_FUNC_ALWAYS; @@ -161,8 +176,12 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) dsa.stencil[0].writemask = 0xff; ctx->dsa_write_depth_stencil = pipe->create_depth_stencil_alpha_state(pipe, &dsa); - /* The DSA state objects which write depth and stencil are created - * on-demand. */ + + + dsa.depth.enabled = 0; + dsa.depth.writemask = 0; + ctx->dsa_keep_depth_write_stencil = + pipe->create_depth_stencil_alpha_state(pipe, &dsa); /* sampler state */ sampler_state = &ctx->template_sampler_state; @@ -175,8 +194,7 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) /* rasterizer state */ memset(&rs_state, 0, sizeof(rs_state)); - rs_state.front_winding = PIPE_WINDING_CW; - rs_state.cull_mode = PIPE_WINDING_NONE; + rs_state.cull_face = PIPE_FACE_NONE; rs_state.gl_rasterization_rules = 1; rs_state.flatshade = 1; ctx->rs_state = pipe->create_rasterizer_state(pipe, &rs_state); @@ -216,17 +234,17 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) ctx->vertices[i][0][3] = 1; /*v.w*/ /* create the vertex buffer */ - ctx->vbuf = pipe_buffer_create(ctx->pipe->screen, + ctx->vbuf = pipe_buffer_create(ctx->base.pipe->screen, PIPE_BIND_VERTEX_BUFFER, sizeof(ctx->vertices)); - return &ctx->blitter; + return &ctx->base; } void util_blitter_destroy(struct blitter_context *blitter) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = blitter->pipe; int i; pipe->delete_blend_state(pipe, ctx->blend_write_color); @@ -235,6 +253,8 @@ void util_blitter_destroy(struct blitter_context *blitter) pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_keep_stencil); pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil); + pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil); + pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_flush_depth_stencil); pipe->delete_rasterizer_state(pipe, ctx->rs_state); pipe->delete_vs_state(pipe, ctx->vs_col); @@ -248,18 +268,14 @@ void util_blitter_destroy(struct blitter_context *blitter) pipe->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]); } - for (i = 0; i < PIPE_MAX_COLOR_BUFS && ctx->fs_col[i]; i++) + for (i = 0; i <= PIPE_MAX_COLOR_BUFS; i++) if (ctx->fs_col[i]) pipe->delete_fs_state(pipe, ctx->fs_col[i]); - for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) + for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS * 2; i++) if (ctx->sampler_state[i]) pipe->delete_sampler_state(pipe, ctx->sampler_state[i]); - if (ctx->sampler_view) { - pipe_sampler_view_reference(&ctx->sampler_view, NULL); - } - pipe_resource_reference(&ctx->vbuf, NULL); FREE(ctx); } @@ -267,104 +283,117 @@ void util_blitter_destroy(struct blitter_context *blitter) static void blitter_check_saved_CSOs(struct blitter_context_priv *ctx) { /* make sure these CSOs have been saved */ - assert(ctx->blitter.saved_blend_state != INVALID_PTR && - ctx->blitter.saved_dsa_state != INVALID_PTR && - ctx->blitter.saved_rs_state != INVALID_PTR && - ctx->blitter.saved_fs != INVALID_PTR && - ctx->blitter.saved_vs != INVALID_PTR && - ctx->blitter.saved_velem_state != INVALID_PTR); + assert(ctx->base.saved_blend_state != INVALID_PTR && + ctx->base.saved_dsa_state != INVALID_PTR && + ctx->base.saved_rs_state != INVALID_PTR && + ctx->base.saved_fs != INVALID_PTR && + ctx->base.saved_vs != INVALID_PTR && + ctx->base.saved_velem_state != INVALID_PTR); } static void blitter_restore_CSOs(struct blitter_context_priv *ctx) { - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; + unsigned i; /* restore the state objects which are always required to be saved */ - pipe->bind_blend_state(pipe, ctx->blitter.saved_blend_state); - pipe->bind_depth_stencil_alpha_state(pipe, ctx->blitter.saved_dsa_state); - pipe->bind_rasterizer_state(pipe, ctx->blitter.saved_rs_state); - pipe->bind_fs_state(pipe, ctx->blitter.saved_fs); - pipe->bind_vs_state(pipe, ctx->blitter.saved_vs); - pipe->bind_vertex_elements_state(pipe, ctx->blitter.saved_velem_state); + pipe->bind_blend_state(pipe, ctx->base.saved_blend_state); + pipe->bind_depth_stencil_alpha_state(pipe, ctx->base.saved_dsa_state); + pipe->bind_rasterizer_state(pipe, ctx->base.saved_rs_state); + pipe->bind_fs_state(pipe, ctx->base.saved_fs); + pipe->bind_vs_state(pipe, ctx->base.saved_vs); + pipe->bind_vertex_elements_state(pipe, ctx->base.saved_velem_state); - ctx->blitter.saved_blend_state = INVALID_PTR; - ctx->blitter.saved_dsa_state = INVALID_PTR; - ctx->blitter.saved_rs_state = INVALID_PTR; - ctx->blitter.saved_fs = INVALID_PTR; - ctx->blitter.saved_vs = INVALID_PTR; - ctx->blitter.saved_velem_state = INVALID_PTR; + ctx->base.saved_blend_state = INVALID_PTR; + ctx->base.saved_dsa_state = INVALID_PTR; + ctx->base.saved_rs_state = INVALID_PTR; + ctx->base.saved_fs = INVALID_PTR; + ctx->base.saved_vs = INVALID_PTR; + ctx->base.saved_velem_state = INVALID_PTR; - pipe->set_stencil_ref(pipe, &ctx->blitter.saved_stencil_ref); + pipe->set_stencil_ref(pipe, &ctx->base.saved_stencil_ref); - pipe->set_viewport_state(pipe, &ctx->blitter.saved_viewport); - pipe->set_clip_state(pipe, &ctx->blitter.saved_clip); + pipe->set_viewport_state(pipe, &ctx->base.saved_viewport); + pipe->set_clip_state(pipe, &ctx->base.saved_clip); /* restore the state objects which are required to be saved before copy/fill */ - if (ctx->blitter.saved_fb_state.nr_cbufs != ~0) { - pipe->set_framebuffer_state(pipe, &ctx->blitter.saved_fb_state); - ctx->blitter.saved_fb_state.nr_cbufs = ~0; + if (ctx->base.saved_fb_state.nr_cbufs != ~0) { + pipe->set_framebuffer_state(pipe, &ctx->base.saved_fb_state); + util_unreference_framebuffer_state(&ctx->base.saved_fb_state); + ctx->base.saved_fb_state.nr_cbufs = ~0; } - if (ctx->blitter.saved_num_sampler_states != ~0) { + if (ctx->base.saved_num_sampler_states != ~0) { pipe->bind_fragment_sampler_states(pipe, - ctx->blitter.saved_num_sampler_states, - ctx->blitter.saved_sampler_states); - ctx->blitter.saved_num_sampler_states = ~0; + ctx->base.saved_num_sampler_states, + ctx->base.saved_sampler_states); + ctx->base.saved_num_sampler_states = ~0; } - if (ctx->blitter.saved_num_sampler_views != ~0) { + if (ctx->base.saved_num_sampler_views != ~0) { pipe->set_fragment_sampler_views(pipe, - ctx->blitter.saved_num_sampler_views, - ctx->blitter.saved_sampler_views); - ctx->blitter.saved_num_sampler_views = ~0; + ctx->base.saved_num_sampler_views, + ctx->base.saved_sampler_views); + + for (i = 0; i < ctx->base.saved_num_sampler_views; i++) + pipe_sampler_view_reference(&ctx->base.saved_sampler_views[i], + NULL); + + ctx->base.saved_num_sampler_views = ~0; } - if (ctx->blitter.saved_num_vertex_buffers != ~0) { + if (ctx->base.saved_num_vertex_buffers != ~0) { pipe->set_vertex_buffers(pipe, - ctx->blitter.saved_num_vertex_buffers, - ctx->blitter.saved_vertex_buffers); - ctx->blitter.saved_num_vertex_buffers = ~0; + ctx->base.saved_num_vertex_buffers, + ctx->base.saved_vertex_buffers); + + for (i = 0; i < ctx->base.saved_num_vertex_buffers; i++) { + if (ctx->base.saved_vertex_buffers[i].buffer) { + pipe_resource_reference(&ctx->base.saved_vertex_buffers[i].buffer, + NULL); + } + } + ctx->base.saved_num_vertex_buffers = ~0; } } static void blitter_set_rectangle(struct blitter_context_priv *ctx, unsigned x1, unsigned y1, unsigned x2, unsigned y2, - unsigned width, unsigned height, float depth) { int i; /* set vertex positions */ - ctx->vertices[0][0][0] = (float)x1 / width * 2.0f - 1.0f; /*v0.x*/ - ctx->vertices[0][0][1] = (float)y1 / height * 2.0f - 1.0f; /*v0.y*/ + ctx->vertices[0][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v0.x*/ + ctx->vertices[0][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v0.y*/ - ctx->vertices[1][0][0] = (float)x2 / width * 2.0f - 1.0f; /*v1.x*/ - ctx->vertices[1][0][1] = (float)y1 / height * 2.0f - 1.0f; /*v1.y*/ + ctx->vertices[1][0][0] = (float)x2 / ctx->dst_width * 2.0f - 1.0f; /*v1.x*/ + ctx->vertices[1][0][1] = (float)y1 / ctx->dst_height * 2.0f - 1.0f; /*v1.y*/ - ctx->vertices[2][0][0] = (float)x2 / width * 2.0f - 1.0f; /*v2.x*/ - ctx->vertices[2][0][1] = (float)y2 / height * 2.0f - 1.0f; /*v2.y*/ + ctx->vertices[2][0][0] = (float)x2 / ctx->dst_width * 2.0f - 1.0f; /*v2.x*/ + ctx->vertices[2][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v2.y*/ - ctx->vertices[3][0][0] = (float)x1 / width * 2.0f - 1.0f; /*v3.x*/ - ctx->vertices[3][0][1] = (float)y2 / height * 2.0f - 1.0f; /*v3.y*/ + ctx->vertices[3][0][0] = (float)x1 / ctx->dst_width * 2.0f - 1.0f; /*v3.x*/ + ctx->vertices[3][0][1] = (float)y2 / ctx->dst_height * 2.0f - 1.0f; /*v3.y*/ for (i = 0; i < 4; i++) ctx->vertices[i][0][2] = depth; /*z*/ /* viewport */ - ctx->viewport.scale[0] = 0.5f * width; - ctx->viewport.scale[1] = 0.5f * height; + ctx->viewport.scale[0] = 0.5f * ctx->dst_width; + ctx->viewport.scale[1] = 0.5f * ctx->dst_height; ctx->viewport.scale[2] = 1.0f; ctx->viewport.scale[3] = 1.0f; - ctx->viewport.translate[0] = 0.5f * width; - ctx->viewport.translate[1] = 0.5f * height; + ctx->viewport.translate[0] = 0.5f * ctx->dst_width; + ctx->viewport.translate[1] = 0.5f * ctx->dst_height; ctx->viewport.translate[2] = 0.0f; ctx->viewport.translate[3] = 0.0f; - ctx->pipe->set_viewport_state(ctx->pipe, &ctx->viewport); + ctx->base.pipe->set_viewport_state(ctx->base.pipe, &ctx->viewport); /* clip */ - ctx->pipe->set_clip_state(ctx->pipe, &ctx->clip); + ctx->base.pipe->set_clip_state(ctx->base.pipe, &ctx->clip); } static void blitter_set_clear_color(struct blitter_context_priv *ctx, @@ -372,36 +401,72 @@ static void blitter_set_clear_color(struct blitter_context_priv *ctx, { int i; - for (i = 0; i < 4; i++) { - ctx->vertices[i][1][0] = rgba[0]; - ctx->vertices[i][1][1] = rgba[1]; - ctx->vertices[i][1][2] = rgba[2]; - ctx->vertices[i][1][3] = rgba[3]; + if (rgba) { + for (i = 0; i < 4; i++) { + ctx->vertices[i][1][0] = rgba[0]; + ctx->vertices[i][1][1] = rgba[1]; + ctx->vertices[i][1][2] = rgba[2]; + ctx->vertices[i][1][3] = rgba[3]; + } + } else { + for (i = 0; i < 4; i++) { + ctx->vertices[i][1][0] = 0; + ctx->vertices[i][1][1] = 0; + ctx->vertices[i][1][2] = 0; + ctx->vertices[i][1][3] = 0; + } } } -static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx, - struct pipe_surface *surf, +static void get_texcoords(struct pipe_resource *src, + struct pipe_subresource subsrc, unsigned x1, unsigned y1, - unsigned x2, unsigned y2) + unsigned x2, unsigned y2, + boolean normalized, float out[4]) { - int i; - float s1 = x1 / (float)surf->width; - float t1 = y1 / (float)surf->height; - float s2 = x2 / (float)surf->width; - float t2 = y2 / (float)surf->height; - - ctx->vertices[0][1][0] = s1; /*t0.s*/ - ctx->vertices[0][1][1] = t1; /*t0.t*/ + if(normalized) + { + out[0] = x1 / (float)u_minify(src->width0, subsrc.level); + out[1] = y1 / (float)u_minify(src->height0, subsrc.level); + out[2] = x2 / (float)u_minify(src->width0, subsrc.level); + out[3] = y2 / (float)u_minify(src->height0, subsrc.level); + } + else + { + out[0] = x1; + out[1] = y1; + out[2] = x2; + out[3] = y2; + } +} - ctx->vertices[1][1][0] = s2; /*t1.s*/ - ctx->vertices[1][1][1] = t1; /*t1.t*/ +static void set_texcoords_in_vertices(const float coord[4], + float *out, unsigned stride) +{ + out[0] = coord[0]; /*t0.s*/ + out[1] = coord[1]; /*t0.t*/ + out += stride; + out[0] = coord[2]; /*t1.s*/ + out[1] = coord[1]; /*t1.t*/ + out += stride; + out[0] = coord[2]; /*t2.s*/ + out[1] = coord[3]; /*t2.t*/ + out += stride; + out[0] = coord[0]; /*t3.s*/ + out[1] = coord[3]; /*t3.t*/ +} - ctx->vertices[2][1][0] = s2; /*t2.s*/ - ctx->vertices[2][1][1] = t2; /*t2.t*/ +static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx, + struct pipe_resource *src, + struct pipe_subresource subsrc, + unsigned x1, unsigned y1, + unsigned x2, unsigned y2) +{ + unsigned i; + float coord[4]; - ctx->vertices[3][1][0] = s1; /*t3.s*/ - ctx->vertices[3][1][1] = t2; /*t3.t*/ + get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord); + set_texcoords_in_vertices(coord, &ctx->vertices[0][1][0], 8); for (i = 0; i < 4; i++) { ctx->vertices[i][1][2] = 0; /*r*/ @@ -410,42 +475,35 @@ static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx, } static void blitter_set_texcoords_3d(struct blitter_context_priv *ctx, - struct pipe_surface *surf, + struct pipe_resource *src, + struct pipe_subresource subsrc, + unsigned zslice, unsigned x1, unsigned y1, unsigned x2, unsigned y2) { int i; - float depth = u_minify(surf->texture->depth0, surf->level); - float r = surf->zslice / depth; + float r = zslice / (float)u_minify(src->depth0, subsrc.level); - blitter_set_texcoords_2d(ctx, surf, x1, y1, x2, y2); + blitter_set_texcoords_2d(ctx, src, subsrc, x1, y1, x2, y2); for (i = 0; i < 4; i++) ctx->vertices[i][1][2] = r; /*r*/ } static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx, - struct pipe_surface *surf, + struct pipe_resource *src, + struct pipe_subresource subsrc, unsigned x1, unsigned y1, unsigned x2, unsigned y2) { int i; - float s1 = x1 / (float)surf->width; - float t1 = y1 / (float)surf->height; - float s2 = x2 / (float)surf->width; - float t2 = y2 / (float)surf->height; + float coord[4]; float st[4][2]; - st[0][0] = s1; - st[0][1] = t1; - st[1][0] = s2; - st[1][1] = t1; - st[2][0] = s2; - st[2][1] = t2; - st[3][0] = s1; - st[3][1] = t2; + get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord); + set_texcoords_in_vertices(coord, &st[0][0], 2); - util_map_texcoords2d_onto_cubemap(surf->face, + util_map_texcoords2d_onto_cubemap(subsrc.face, /* pointer, stride in floats */ &st[0][0], 2, &ctx->vertices[0][1][0], 8); @@ -454,9 +512,16 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx, ctx->vertices[i][1][3] = 1; /*q*/ } +static void blitter_set_dst_dimensions(struct blitter_context_priv *ctx, + unsigned width, unsigned height) +{ + ctx->dst_width = width; + ctx->dst_height = height; +} + static void blitter_draw_quad(struct blitter_context_priv *ctx) { - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; /* write vertices and draw them */ pipe_buffer_write(pipe, ctx->vbuf, @@ -469,72 +534,79 @@ static void blitter_draw_quad(struct blitter_context_priv *ctx) static INLINE void **blitter_get_sampler_state(struct blitter_context_priv *ctx, - int miplevel) + int miplevel, boolean normalized) { - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; struct pipe_sampler_state *sampler_state = &ctx->template_sampler_state; assert(miplevel < PIPE_MAX_TEXTURE_LEVELS); /* Create the sampler state on-demand. */ - if (!ctx->sampler_state[miplevel]) { + if (!ctx->sampler_state[miplevel * 2 + normalized]) { sampler_state->lod_bias = miplevel; sampler_state->min_lod = miplevel; sampler_state->max_lod = miplevel; + sampler_state->normalized_coords = normalized; - ctx->sampler_state[miplevel] = pipe->create_sampler_state(pipe, + ctx->sampler_state[miplevel * 2 + normalized] = pipe->create_sampler_state(pipe, sampler_state); } /* Return void** so that it can be passed to bind_fragment_sampler_states * directly. */ - return &ctx->sampler_state[miplevel]; + return &ctx->sampler_state[miplevel * 2 + normalized]; } static INLINE void *blitter_get_fs_col(struct blitter_context_priv *ctx, unsigned num_cbufs) { - struct pipe_context *pipe = ctx->pipe; - unsigned index = num_cbufs ? num_cbufs - 1 : 0; + struct pipe_context *pipe = ctx->base.pipe; assert(num_cbufs <= PIPE_MAX_COLOR_BUFS); - if (!ctx->fs_col[index]) - ctx->fs_col[index] = + if (!ctx->fs_col[num_cbufs]) + ctx->fs_col[num_cbufs] = util_make_fragment_clonecolor_shader(pipe, num_cbufs); - return ctx->fs_col[index]; + return ctx->fs_col[num_cbufs]; } +/** Convert PIPE_TEXTURE_x to TGSI_TEXTURE_x */ +static unsigned +pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target) +{ + switch (pipe_tex_target) { + case PIPE_TEXTURE_1D: + return TGSI_TEXTURE_1D; + case PIPE_TEXTURE_2D: + return TGSI_TEXTURE_2D; + case PIPE_TEXTURE_RECT: + return TGSI_TEXTURE_RECT; + case PIPE_TEXTURE_3D: + return TGSI_TEXTURE_3D; + case PIPE_TEXTURE_CUBE: + return TGSI_TEXTURE_CUBE; + default: + assert(0 && "unexpected texture target"); + return TGSI_TEXTURE_UNKNOWN; + } +} + + static INLINE void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx, unsigned tex_target) { - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; assert(tex_target < PIPE_MAX_TEXTURE_TYPES); /* Create the fragment shader on-demand. */ if (!ctx->fs_texfetch_col[tex_target]) { - switch (tex_target) { - case PIPE_TEXTURE_1D: - ctx->fs_texfetch_col[PIPE_TEXTURE_1D] = - util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_1D); - break; - case PIPE_TEXTURE_2D: - ctx->fs_texfetch_col[PIPE_TEXTURE_2D] = - util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D); - break; - case PIPE_TEXTURE_3D: - ctx->fs_texfetch_col[PIPE_TEXTURE_3D] = - util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_3D); - break; - case PIPE_TEXTURE_CUBE: - ctx->fs_texfetch_col[PIPE_TEXTURE_CUBE] = - util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE); - break; - default:; - } + unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target); + + ctx->fs_texfetch_col[tex_target] = + util_make_fragment_tex_shader(pipe, tgsi_tex, TGSI_INTERPOLATE_LINEAR); } return ctx->fs_texfetch_col[tex_target]; @@ -544,36 +616,47 @@ static INLINE void *blitter_get_fs_texfetch_depth(struct blitter_context_priv *ctx, unsigned tex_target) { - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; assert(tex_target < PIPE_MAX_TEXTURE_TYPES); /* Create the fragment shader on-demand. */ if (!ctx->fs_texfetch_depth[tex_target]) { - switch (tex_target) { - case PIPE_TEXTURE_1D: - ctx->fs_texfetch_depth[PIPE_TEXTURE_1D] = - util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_1D); - break; - case PIPE_TEXTURE_2D: - ctx->fs_texfetch_depth[PIPE_TEXTURE_2D] = - util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D); - break; - case PIPE_TEXTURE_3D: - ctx->fs_texfetch_depth[PIPE_TEXTURE_3D] = - util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_3D); - break; - case PIPE_TEXTURE_CUBE: - ctx->fs_texfetch_depth[PIPE_TEXTURE_CUBE] = - util_make_fragment_tex_shader_writedepth(pipe,TGSI_TEXTURE_CUBE); - break; - default:; - } + unsigned tgsi_tex = pipe_tex_to_tgsi_tex(tex_target); + + ctx->fs_texfetch_depth[tex_target] = + util_make_fragment_tex_shader_writedepth(pipe, tgsi_tex, + TGSI_INTERPOLATE_LINEAR); } return ctx->fs_texfetch_depth[tex_target]; } +static void blitter_draw_rectangle(struct blitter_context *blitter, + unsigned x1, unsigned y1, + unsigned x2, unsigned y2, + float depth, + enum blitter_attrib_type type, + const float attrib[4]) +{ + struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; + + switch (type) { + case UTIL_BLITTER_ATTRIB_COLOR: + blitter_set_clear_color(ctx, attrib); + break; + + case UTIL_BLITTER_ATTRIB_TEXCOORD: + set_texcoords_in_vertices(attrib, &ctx->vertices[0][1][0], 8); + break; + + default:; + } + + blitter_set_rectangle(ctx, x1, y1, x2, y2, depth); + blitter_draw_quad(ctx); +} + void util_blitter_clear(struct blitter_context *blitter, unsigned width, unsigned height, unsigned num_cbufs, @@ -582,7 +665,7 @@ void util_blitter_clear(struct blitter_context *blitter, double depth, unsigned stencil) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; struct pipe_stencil_ref sr = { { 0 } }; assert(num_cbufs <= PIPE_MAX_COLOR_BUFS); @@ -595,11 +678,19 @@ void util_blitter_clear(struct blitter_context *blitter, else pipe->bind_blend_state(pipe, ctx->blend_keep_color); - if (clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) { + if ((clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) { sr.ref_value[0] = stencil & 0xff; pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil); pipe->set_stencil_ref(pipe, &sr); } + else if (clear_buffers & PIPE_CLEAR_DEPTH) { + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_keep_stencil); + } + else if (clear_buffers & PIPE_CLEAR_STENCIL) { + sr.ref_value[0] = stencil & 0xff; + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil); + pipe->set_stencil_ref(pipe, &sr); + } else pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil); @@ -608,244 +699,284 @@ void util_blitter_clear(struct blitter_context *blitter, pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, num_cbufs)); pipe->bind_vs_state(pipe, ctx->vs_col); - blitter_set_clear_color(ctx, rgba); - blitter_set_rectangle(ctx, 0, 0, width, height, width, height, depth); - blitter_draw_quad(ctx); + blitter_set_dst_dimensions(ctx, width, height); + blitter->draw_rectangle(blitter, 0, 0, width, height, depth, + UTIL_BLITTER_ATTRIB_COLOR, rgba); blitter_restore_CSOs(ctx); } -static boolean -is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2, - unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2) +static +boolean is_overlap(unsigned sx1, unsigned sx2, unsigned sy1, unsigned sy2, + unsigned dx1, unsigned dx2, unsigned dy1, unsigned dy2) { - if (sx1 >= dx2 || sx2 <= dx1 || sy1 >= dy2 || sy2 <= dy1) { - return FALSE; - } else { - return TRUE; - } + return sx1 < dx2 && sx2 > dx1 && sy1 < dy2 && sy2 > dy1; } -static void util_blitter_do_copy(struct blitter_context *blitter, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - struct pipe_surface *src, - unsigned srcx, unsigned srcy, - unsigned width, unsigned height, - boolean is_depth) +void util_blitter_copy_region(struct blitter_context *blitter, + struct pipe_resource *dst, + struct pipe_subresource subdst, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + struct pipe_subresource subsrc, + unsigned srcx, unsigned srcy, unsigned srcz, + unsigned width, unsigned height, + boolean ignore_stencil) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; - struct pipe_context *pipe = ctx->pipe; + struct pipe_context *pipe = ctx->base.pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_surface *dstsurf; struct pipe_framebuffer_state fb_state; struct pipe_sampler_view viewTempl, *view; + unsigned bind; + boolean is_stencil, is_depth; + boolean normalized; + + /* Give up if textures are not set. */ + assert(dst && src); + if (!dst || !src) + return; + /* Sanity checks. */ + if (dst == src) { + assert(!is_overlap(srcx, srcx + width, srcy, srcy + height, + dstx, dstx + width, dsty, dsty + height)); + } else { + assert(dst->format == src->format); + } + assert(src->target < PIPE_MAX_TEXTURE_TYPES); + + /* Is this a ZS format? */ + is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0; + is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0; + + if (is_depth || is_stencil) + bind = PIPE_BIND_DEPTH_STENCIL; + else + bind = PIPE_BIND_RENDER_TARGET; + + /* Check if we can sample from and render to the surfaces. */ + /* (assuming copying a stencil buffer is not possible) */ + if ((!ignore_stencil && is_stencil) || + !screen->is_format_supported(screen, dst->format, dst->target, + dst->nr_samples, bind, 0) || + !screen->is_format_supported(screen, src->format, src->target, + src->nr_samples, PIPE_BIND_SAMPLER_VIEW, 0)) { + util_resource_copy_region(pipe, dst, subdst, dstx, dsty, dstz, + src, subsrc, srcx, srcy, srcz, width, height); + return; + } + + /* Get surfaces. */ + dstsurf = screen->get_tex_surface(screen, dst, + subdst.face, subdst.level, dstz, + bind); + + /* Check whether the states are properly saved. */ + blitter_check_saved_CSOs(ctx); assert(blitter->saved_fb_state.nr_cbufs != ~0); assert(blitter->saved_num_sampler_views != ~0); assert(blitter->saved_num_sampler_states != ~0); - assert(src->texture->target < PIPE_MAX_TEXTURE_TYPES); - /* bind CSOs */ - fb_state.width = dst->width; - fb_state.height = dst->height; + /* Initialize framebuffer state. */ + fb_state.width = dstsurf->width; + fb_state.height = dstsurf->height; if (is_depth) { pipe->bind_blend_state(pipe, ctx->blend_keep_color); pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_keep_stencil); pipe->bind_fs_state(pipe, - blitter_get_fs_texfetch_depth(ctx, src->texture->target)); + blitter_get_fs_texfetch_depth(ctx, src->target)); fb_state.nr_cbufs = 0; - fb_state.zsbuf = dst; + fb_state.zsbuf = dstsurf; } else { pipe->bind_blend_state(pipe, ctx->blend_write_color); pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil); pipe->bind_fs_state(pipe, - blitter_get_fs_texfetch_col(ctx, src->texture->target)); + blitter_get_fs_texfetch_col(ctx, src->target)); fb_state.nr_cbufs = 1; - fb_state.cbufs[0] = dst; + fb_state.cbufs[0] = dstsurf; fb_state.zsbuf = 0; } - u_sampler_view_default_template(&viewTempl, - src->texture, - src->texture->format); - view = pipe->create_sampler_view(pipe, - src->texture, - &viewTempl); + normalized = src->target != PIPE_TEXTURE_RECT; - if (ctx->sampler_view) { - pipe_sampler_view_reference(&ctx->sampler_view, NULL); - } - ctx->sampler_view = view; + /* Initialize sampler view. */ + u_sampler_view_default_template(&viewTempl, src, src->format); + view = pipe->create_sampler_view(pipe, src, &viewTempl); + /* Set rasterizer state, shaders, and textures. */ pipe->bind_rasterizer_state(pipe, ctx->rs_state); pipe->bind_vs_state(pipe, ctx->vs_tex); pipe->bind_fragment_sampler_states(pipe, 1, - blitter_get_sampler_state(ctx, src->level)); + blitter_get_sampler_state(ctx, subsrc.level, normalized)); pipe->bind_vertex_elements_state(pipe, ctx->velem_state); pipe->set_fragment_sampler_views(pipe, 1, &view); pipe->set_framebuffer_state(pipe, &fb_state); - /* set texture coordinates */ - switch (src->texture->target) { + blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height); + + switch (src->target) { + /* Draw the quad with the draw_rectangle callback. */ case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: - blitter_set_texcoords_2d(ctx, src, srcx, srcy, - srcx+width, srcy+height); + case PIPE_TEXTURE_RECT: + { + /* Set texture coordinates. */ + float coord[4]; + get_texcoords(src, subsrc, srcx, srcy, + srcx+width, srcy+height, normalized, coord); + + /* Draw. */ + blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0, + UTIL_BLITTER_ATTRIB_TEXCOORD, coord); + } break; + + /* Draw the quad with the generic codepath. */ case PIPE_TEXTURE_3D: - blitter_set_texcoords_3d(ctx, src, srcx, srcy, - srcx+width, srcy+height); - break; case PIPE_TEXTURE_CUBE: - blitter_set_texcoords_cube(ctx, src, srcx, srcy, - srcx+width, srcy+height); + /* Set texture coordinates. */ + if (src->target == PIPE_TEXTURE_3D) + blitter_set_texcoords_3d(ctx, src, subsrc, srcz, + srcx, srcy, srcx+width, srcy+height); + else + blitter_set_texcoords_cube(ctx, src, subsrc, + srcx, srcy, srcx+width, srcy+height); + + /* Draw. */ + blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, 0); + blitter_draw_quad(ctx); break; + default: assert(0); + return; } - blitter_set_rectangle(ctx, dstx, dsty, dstx+width, dsty+height, dst->width, dst->height, 0); - blitter_draw_quad(ctx); + blitter_restore_CSOs(ctx); + pipe_surface_reference(&dstsurf, NULL); + pipe_sampler_view_reference(&view, NULL); } -static void util_blitter_overlap_copy(struct blitter_context *blitter, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - struct pipe_surface *src, - unsigned srcx, unsigned srcy, - unsigned width, unsigned height) +/* Clear a region of a color surface to a constant value. */ +void util_blitter_clear_render_target(struct blitter_context *blitter, + struct pipe_surface *dstsurf, + const float *rgba, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; - struct pipe_context *pipe = ctx->pipe; - struct pipe_screen *screen = pipe->screen; + struct pipe_context *pipe = ctx->base.pipe; + struct pipe_framebuffer_state fb_state; - struct pipe_resource texTemp; - struct pipe_resource *texture; - struct pipe_surface *tex_surf; + assert(dstsurf->texture); + if (!dstsurf->texture) + return; - /* check whether the states are properly saved */ + /* check the saved state */ blitter_check_saved_CSOs(ctx); + assert(blitter->saved_fb_state.nr_cbufs != ~0); - memset(&texTemp, 0, sizeof(texTemp)); - texTemp.target = PIPE_TEXTURE_2D; - texTemp.format = dst->texture->format; /* XXX verify supported by driver! */ - texTemp.last_level = 0; - texTemp.width0 = width; - texTemp.height0 = height; - texTemp.depth0 = 1; + /* bind CSOs */ + pipe->bind_blend_state(pipe, ctx->blend_write_color); + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil); + pipe->bind_rasterizer_state(pipe, ctx->rs_state); + pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1)); + pipe->bind_vs_state(pipe, ctx->vs_col); + pipe->bind_vertex_elements_state(pipe, ctx->velem_state); - texture = screen->resource_create(screen, &texTemp); - if (!texture) - return; + /* set a framebuffer state */ + fb_state.width = dstsurf->width; + fb_state.height = dstsurf->height; + fb_state.nr_cbufs = 1; + fb_state.cbufs[0] = dstsurf; + fb_state.zsbuf = 0; + pipe->set_framebuffer_state(pipe, &fb_state); - tex_surf = screen->get_tex_surface(screen, texture, 0, 0, 0, - PIPE_BIND_BLIT_SOURCE | - PIPE_BIND_BLIT_DESTINATION); - - /* blit from the src to the temp */ - util_blitter_do_copy(blitter, tex_surf, 0, 0, - src, srcx, srcy, - width, height, - FALSE); - util_blitter_do_copy(blitter, dst, dstx, dsty, - tex_surf, 0, 0, - width, height, - FALSE); - pipe_surface_reference(&tex_surf, NULL); - pipe_resource_reference(&texture, NULL); + blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height); + blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0, + UTIL_BLITTER_ATTRIB_COLOR, rgba); blitter_restore_CSOs(ctx); } -void util_blitter_copy(struct blitter_context *blitter, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - struct pipe_surface *src, - unsigned srcx, unsigned srcy, - unsigned width, unsigned height, - boolean ignore_stencil) +/* Clear a region of a depth stencil surface. */ +void util_blitter_clear_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *dstsurf, + unsigned clear_flags, + double depth, + unsigned stencil, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; - struct pipe_context *pipe = ctx->pipe; - struct pipe_screen *screen = pipe->screen; - boolean is_stencil, is_depth; - unsigned dst_tex_usage; + struct pipe_context *pipe = ctx->base.pipe; + struct pipe_framebuffer_state fb_state; + struct pipe_stencil_ref sr = { { 0 } }; - /* give up if textures are not set */ - assert(dst->texture && src->texture); - if (!dst->texture || !src->texture) + assert(dstsurf->texture); + if (!dstsurf->texture) return; - if (dst->texture == src->texture) { - if (is_overlap(srcx, srcx + width, srcy, srcy + height, - dstx, dstx + width, dsty, dsty + height)) { - util_blitter_overlap_copy(blitter, dst, dstx, dsty, src, srcx, srcy, - width, height); - return; - } - } - - is_depth = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 0) != 0; - is_stencil = util_format_get_component_bits(src->format, UTIL_FORMAT_COLORSPACE_ZS, 1) != 0; - dst_tex_usage = is_depth || is_stencil ? PIPE_BIND_DEPTH_STENCIL : - PIPE_BIND_RENDER_TARGET; + /* check the saved state */ + blitter_check_saved_CSOs(ctx); + assert(blitter->saved_fb_state.nr_cbufs != ~0); - /* check if we can sample from and render to the surfaces */ - /* (assuming copying a stencil buffer is not possible) */ - if ((!ignore_stencil && is_stencil) || - !screen->is_format_supported(screen, dst->format, dst->texture->target, - dst_tex_usage, 0) || - !screen->is_format_supported(screen, src->format, src->texture->target, - PIPE_BIND_SAMPLER_VIEW, 0)) { - util_surface_copy(pipe, FALSE, dst, dstx, dsty, src, srcx, srcy, - width, height); - return; + /* bind CSOs */ + pipe->bind_blend_state(pipe, ctx->blend_keep_color); + if ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) { + sr.ref_value[0] = stencil & 0xff; + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil); + pipe->set_stencil_ref(pipe, &sr); + } + else if (clear_flags & PIPE_CLEAR_DEPTH) { + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_keep_stencil); + } + else if (clear_flags & PIPE_CLEAR_STENCIL) { + sr.ref_value[0] = stencil & 0xff; + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil); + pipe->set_stencil_ref(pipe, &sr); } + else + /* hmm that should be illegal probably, or make it a no-op somewhere */ + pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil); - /* check whether the states are properly saved */ - blitter_check_saved_CSOs(ctx); - util_blitter_do_copy(blitter, - dst, dstx, dsty, - src, srcx, srcy, - width, height, is_depth); + pipe->bind_rasterizer_state(pipe, ctx->rs_state); + pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0)); + pipe->bind_vs_state(pipe, ctx->vs_col); + pipe->bind_vertex_elements_state(pipe, ctx->velem_state); + + /* set a framebuffer state */ + fb_state.width = dstsurf->width; + fb_state.height = dstsurf->height; + fb_state.nr_cbufs = 0; + fb_state.cbufs[0] = 0; + fb_state.zsbuf = dstsurf; + pipe->set_framebuffer_state(pipe, &fb_state); + + blitter_set_dst_dimensions(ctx, dstsurf->width, dstsurf->height); + blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, depth, + UTIL_BLITTER_ATTRIB_NONE, NULL); blitter_restore_CSOs(ctx); } -void util_blitter_fill(struct blitter_context *blitter, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - unsigned value) +/* draw a rectangle across a region using a custom dsa stage - for r600g */ +void util_blitter_custom_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *zsurf, + struct pipe_surface *cbsurf, + void *dsa_stage, float depth) { struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; - struct pipe_context *pipe = ctx->pipe; - struct pipe_screen *screen = pipe->screen; + struct pipe_context *pipe = ctx->base.pipe; struct pipe_framebuffer_state fb_state; - float rgba[4]; - ubyte ub_rgba[4] = {0}; - union util_color color; - int i; - - assert(dst->texture); - if (!dst->texture) - return; - /* check if we can render to the surface */ - if (util_format_is_depth_or_stencil(dst->format) || /* unlikely, but you never know */ - !screen->is_format_supported(screen, dst->format, dst->texture->target, - PIPE_BIND_RENDER_TARGET, 0)) { - util_surface_fill(pipe, dst, dstx, dsty, width, height, value); + assert(zsurf->texture); + if (!zsurf->texture) return; - } - - /* unpack the color */ - color.ui = value; - util_unpack_color_ub(dst->format, &color, - ub_rgba, ub_rgba+1, ub_rgba+2, ub_rgba+3); - for (i = 0; i < 4; i++) - rgba[i] = ubyte_to_float(ub_rgba[i]); /* check the saved state */ blitter_check_saved_CSOs(ctx); @@ -853,22 +984,38 @@ void util_blitter_fill(struct blitter_context *blitter, /* bind CSOs */ pipe->bind_blend_state(pipe, ctx->blend_write_color); - pipe->bind_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_stencil); + pipe->bind_depth_stencil_alpha_state(pipe, dsa_stage); + pipe->bind_rasterizer_state(pipe, ctx->rs_state); - pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 1)); + pipe->bind_fs_state(pipe, blitter_get_fs_col(ctx, 0)); pipe->bind_vs_state(pipe, ctx->vs_col); pipe->bind_vertex_elements_state(pipe, ctx->velem_state); /* set a framebuffer state */ - fb_state.width = dst->width; - fb_state.height = dst->height; + fb_state.width = zsurf->width; + fb_state.height = zsurf->height; fb_state.nr_cbufs = 1; - fb_state.cbufs[0] = dst; - fb_state.zsbuf = 0; + if (cbsurf) { + fb_state.cbufs[0] = cbsurf; + fb_state.nr_cbufs = 1; + } else { + fb_state.cbufs[0] = NULL; + fb_state.nr_cbufs = 0; + } + fb_state.zsbuf = zsurf; pipe->set_framebuffer_state(pipe, &fb_state); - blitter_set_clear_color(ctx, rgba); - blitter_set_rectangle(ctx, 0, 0, width, height, dst->width, dst->height, 0); - blitter_draw_quad(ctx); + blitter_set_dst_dimensions(ctx, zsurf->width, zsurf->height); + blitter->draw_rectangle(blitter, 0, 0, zsurf->width, zsurf->height, depth, + UTIL_BLITTER_ATTRIB_NONE, NULL); blitter_restore_CSOs(ctx); } + +/* flush a region of a depth stencil surface for r300g */ +void util_blitter_flush_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *dstsurf) +{ + struct blitter_context_priv *ctx = (struct blitter_context_priv*)blitter; + util_blitter_custom_depth_stencil(blitter, dstsurf, NULL, + ctx->dsa_flush_depth_stencil, 0.0f); +} diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h index f6e3ce4874e..f9f96f25c77 100644 --- a/src/gallium/auxiliary/util/u_blitter.h +++ b/src/gallium/auxiliary/util/u_blitter.h @@ -27,6 +27,8 @@ #ifndef U_BLITTER_H #define U_BLITTER_H +#include "util/u_framebuffer.h" +#include "util/u_inlines.h" #include "util/u_memory.h" #include "pipe/p_state.h" @@ -38,9 +40,48 @@ extern "C" { struct pipe_context; +enum blitter_attrib_type { + UTIL_BLITTER_ATTRIB_NONE, + UTIL_BLITTER_ATTRIB_COLOR, + UTIL_BLITTER_ATTRIB_TEXCOORD +}; + struct blitter_context { + /** + * Draw a rectangle. + * + * \param x1 An X coordinate of the top-left corner. + * \param y1 A Y coordinate of the top-left corner. + * \param x2 An X coordinate of the bottom-right corner. + * \param y2 A Y coordinate of the bottom-right corner. + * \param depth A depth which the rectangle is rendered at. + * + * \param type Semantics of the attributes "attrib". + * If type is UTIL_BLITTER_ATTRIB_NONE, ignore them. + * If type is UTIL_BLITTER_ATTRIB_COLOR, the attributes + * make up a constant RGBA color, and should go to the COLOR0 + * varying slot of a fragment shader. + * If type is UTIL_BLITTER_ATTRIB_TEXCOORD, {a1, a2} and + * {a3, a4} specify top-left and bottom-right texture + * coordinates of the rectangle, respectively, and should go + * to the GENERIC0 varying slot of a fragment shader. + * + * \param attrib See type. + * + * \note A driver may optionally override this callback to implement + * a specialized hardware path for drawing a rectangle, e.g. using + * a rectangular point sprite. + */ + void (*draw_rectangle)(struct blitter_context *blitter, + unsigned x1, unsigned y1, unsigned x2, unsigned y2, + float depth, + enum blitter_attrib_type type, + const float attrib[4]); + /* Private members, really. */ + struct pipe_context *pipe; /**< pipe context */ + void *saved_blend_state; /**< blend state */ void *saved_dsa_state; /**< depth stencil alpha state */ void *saved_velem_state; /**< vertex elements state */ @@ -72,6 +113,15 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe); */ void util_blitter_destroy(struct blitter_context *blitter); +/** + * Return the pipe context associated with a blitter context. + */ +static INLINE +struct pipe_context *util_blitter_get_pipe(struct blitter_context *blitter) +{ + return blitter->pipe; +} + /* * These CSOs must be saved before any of the following functions is called: * - blend state @@ -112,52 +162,56 @@ void util_blitter_clear(struct blitter_context *blitter, * - fragment sampler states * - fragment sampler textures */ -void util_blitter_copy(struct blitter_context *blitter, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - struct pipe_surface *src, - unsigned srcx, unsigned srcy, - unsigned width, unsigned height, - boolean ignore_stencil); +void util_blitter_copy_region(struct blitter_context *blitter, + struct pipe_resource *dst, + struct pipe_subresource subdst, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *src, + struct pipe_subresource subsrc, + unsigned srcx, unsigned srcy, unsigned srcz, + unsigned width, unsigned height, + boolean ignore_stencil); /** - * Fill a region of a surface with a constant value. - * - * If the surface cannot be rendered to or it's a depth-stencil format, - * a software fallback path is taken. + * Clear a region of a (color) surface to a constant value. * * These states must be saved in the blitter in addition to the state objects * already required to be saved: * - framebuffer state */ -void util_blitter_fill(struct blitter_context *blitter, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, - unsigned value); +void util_blitter_clear_render_target(struct blitter_context *blitter, + struct pipe_surface *dst, + const float *rgba, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height); /** - * Copy all pixels from one surface to another. + * Clear a region of a depth-stencil surface, both stencil and depth + * or only one of them if this is a combined depth-stencil surface. * - * The rules are the same as in util_blitter_copy with the addition that - * surfaces must have the same size. + * These states must be saved in the blitter in addition to the state objects + * already required to be saved: + * - framebuffer state */ -static INLINE -void util_blitter_copy_surface(struct blitter_context *blitter, - struct pipe_surface *dst, - struct pipe_surface *src, - boolean ignore_stencil) -{ - assert(dst->width == src->width && dst->height == src->height); - - util_blitter_copy(blitter, dst, 0, 0, src, 0, 0, src->width, src->height, - ignore_stencil); -} - +void util_blitter_clear_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *dst, + unsigned clear_flags, + double depth, + unsigned stencil, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height); + +void util_blitter_flush_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *dstsurf); + +void util_blitter_custom_depth_stencil(struct blitter_context *blitter, + struct pipe_surface *zsurf, + struct pipe_surface *cbsurf, + void *dsa_stage, float depth); /* The functions below should be used to save currently bound constant state * objects inside a driver. The objects are automatically restored at the end - * of the util_blitter_{clear, fill, copy, copy_surface} functions and then + * of the util_blitter_{clear, copy_region, fill_region} functions and then * forgotten. * * CSOs not listed here are not affected by util_blitter. */ @@ -213,9 +267,10 @@ void util_blitter_save_vertex_shader(struct blitter_context *blitter, static INLINE void util_blitter_save_framebuffer(struct blitter_context *blitter, - struct pipe_framebuffer_state *state) + const struct pipe_framebuffer_state *state) { - blitter->saved_fb_state = *state; + blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. */ + util_copy_framebuffer_state(&blitter->saved_fb_state, state); } static INLINE @@ -250,12 +305,13 @@ util_blitter_save_fragment_sampler_views(struct blitter_context *blitter, int num_views, struct pipe_sampler_view **views) { + unsigned i; assert(num_views <= Elements(blitter->saved_sampler_views)); blitter->saved_num_sampler_views = num_views; - memcpy(blitter->saved_sampler_views, - views, - num_views * sizeof(struct pipe_sampler_view *)); + for (i = 0; i < num_views; i++) + pipe_sampler_view_reference(&blitter->saved_sampler_views[i], + views[i]); } static INLINE void @@ -263,9 +319,18 @@ util_blitter_save_vertex_buffers(struct blitter_context *blitter, int num_vertex_buffers, struct pipe_vertex_buffer *vertex_buffers) { + unsigned i; assert(num_vertex_buffers <= Elements(blitter->saved_vertex_buffers)); blitter->saved_num_vertex_buffers = num_vertex_buffers; + + for (i = 0; i < num_vertex_buffers; i++) { + if (vertex_buffers[i].buffer) { + pipe_resource_reference(&blitter->saved_vertex_buffers[i].buffer, + vertex_buffers[i].buffer); + } + } + memcpy(blitter->saved_vertex_buffers, vertex_buffers, num_vertex_buffers * sizeof(struct pipe_vertex_buffer)); diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h index 919967b55a7..e9c71743fc8 100644 --- a/src/gallium/auxiliary/util/u_box.h +++ b/src/gallium/auxiliary/util/u_box.h @@ -60,6 +60,25 @@ void u_box_2d_zslice( unsigned x, box->depth = 1; } + +static INLINE +void u_box_3d( unsigned x, + unsigned y, + unsigned z, + unsigned w, + unsigned h, + unsigned d, + struct pipe_box *box ) +{ + box->x = x; + box->y = y; + box->z = z; + box->width = w; + box->height = h; + box->depth = d; +} + + static INLINE struct pipe_subresource u_subresource( unsigned face, unsigned level ) diff --git a/src/gallium/auxiliary/util/u_caps.c b/src/gallium/auxiliary/util/u_caps.c new file mode 100644 index 00000000000..e209a98b706 --- /dev/null +++ b/src/gallium/auxiliary/util/u_caps.c @@ -0,0 +1,271 @@ +/************************************************************************** + * + * Copyright 2010 Vmware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_screen.h" +#include "util/u_format.h" +#include "util/u_debug.h" +#include "u_caps.h" + +/** + * Iterates over a list of caps checks as defined in u_caps.h. Should + * all checks pass returns TRUE and out is set to the last element of + * the list (TERMINATE). Should any check fail returns FALSE and set + * out to the index of the start of the first failing check. + */ +boolean +util_check_caps_out(struct pipe_screen *screen, const unsigned *list, int *out) +{ + int i, tmpi; + float tmpf; + + for (i = 0; list[i];) { + switch(list[i++]) { + case UTIL_CAPS_CHECK_CAP: + if (!screen->get_param(screen, list[i++])) { + *out = i - 2; + return FALSE; + } + break; + case UTIL_CAPS_CHECK_INT: + tmpi = screen->get_param(screen, list[i++]); + if (tmpi < (int)list[i++]) { + *out = i - 3; + return FALSE; + } + break; + case UTIL_CAPS_CHECK_FLOAT: + tmpf = screen->get_paramf(screen, list[i++]); + if (tmpf < (float)list[i++]) { + *out = i - 3; + return FALSE; + } + break; + case UTIL_CAPS_CHECK_FORMAT: + if (!screen->is_format_supported(screen, + list[i++], + PIPE_TEXTURE_2D, + 0, + PIPE_BIND_SAMPLER_VIEW, + 0)) { + *out = i - 2; + return FALSE; + } + break; + case UTIL_CAPS_CHECK_SHADER: + tmpi = screen->get_shader_param(screen, list[i] >> 24, list[i] & ((1 << 24) - 1)); + ++i; + if (tmpi < (int)list[i++]) { + *out = i - 3; + return FALSE; + } + break; + case UTIL_CAPS_CHECK_UNIMPLEMENTED: + *out = i - 1; + return FALSE; + default: + assert(!"Unsupported check"); + return FALSE; + } + } + + *out = i; + return TRUE; +} + +/** + * Iterates over a list of caps checks as defined in u_caps.h. + * Returns TRUE if all caps checks pass returns FALSE otherwise. + */ +boolean +util_check_caps(struct pipe_screen *screen, const unsigned *list) +{ + int out; + return util_check_caps_out(screen, list, &out); +} + + +/* + * Below follows some demo lists. + * + * None of these lists are exhausting lists of what is + * actually needed to support said API and more here for + * as example on how to uses the above functions. Especially + * for DX10 and DX11 where Gallium is missing features. + */ + +/* DX 9_1 */ +static unsigned caps_dx_9_1[] = { + UTIL_CHECK_INT(MAX_RENDER_TARGETS, 1), + UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 12), /* 2048 */ + UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9), /* 256 */ + UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10), /* 512 */ + UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 2), + UTIL_CHECK_TERMINATE +}; + +/* DX 9_2 */ +static unsigned caps_dx_9_2[] = { + UTIL_CHECK_CAP(OCCLUSION_QUERY), + UTIL_CHECK_CAP(BLEND_EQUATION_SEPARATE), + UTIL_CHECK_INT(MAX_RENDER_TARGETS, 1), + UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 12), /* 2048 */ + UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9), /* 256 */ + UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10), /* 512 */ + UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16), + UTIL_CHECK_TERMINATE +}; + +/* DX 9_3 */ +static unsigned caps_dx_9_3[] = { + UTIL_CHECK_CAP(SM3), + //UTIL_CHECK_CAP(INSTANCING), + UTIL_CHECK_CAP(OCCLUSION_QUERY), + UTIL_CHECK_INT(MAX_RENDER_TARGETS, 4), + UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 13), /* 4096 */ + UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 9), /* 256 */ + UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 10), /* 512 */ + UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16), + UTIL_CHECK_TERMINATE +}; + +/* DX 10 */ +static unsigned caps_dx_10[] = { + UTIL_CHECK_CAP(SM3), + //UTIL_CHECK_CAP(INSTANCING), + UTIL_CHECK_CAP(OCCLUSION_QUERY), + UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8), + UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 14), /* 8192 */ + UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 12), /* 2048 */ + UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 14), /* 8192 */ + UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16), + UTIL_CHECK_UNIMPLEMENTED, /* XXX Unimplemented features in Gallium */ + UTIL_CHECK_TERMINATE +}; + +/* DX11 */ +static unsigned caps_dx_11[] = { + UTIL_CHECK_CAP(SM3), + //UTIL_CHECK_CAP(INSTANCING), + UTIL_CHECK_CAP(OCCLUSION_QUERY), + UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8), + UTIL_CHECK_INT(MAX_TEXTURE_2D_LEVELS, 14), /* 16384 */ + UTIL_CHECK_INT(MAX_TEXTURE_3D_LEVELS, 12), /* 2048 */ + UTIL_CHECK_INT(MAX_TEXTURE_CUBE_LEVELS, 14), /* 16384 */ + UTIL_CHECK_FLOAT(MAX_TEXTURE_ANISOTROPY, 16), + UTIL_CHECK_FORMAT(B8G8R8A8_UNORM), + UTIL_CHECK_UNIMPLEMENTED, /* XXX Unimplemented features in Gallium */ + UTIL_CHECK_TERMINATE +}; + +/* OpenGL 2.1 */ +static unsigned caps_opengl_2_1[] = { + UTIL_CHECK_CAP(GLSL), + UTIL_CHECK_CAP(OCCLUSION_QUERY), + UTIL_CHECK_CAP(TWO_SIDED_STENCIL), + UTIL_CHECK_CAP(BLEND_EQUATION_SEPARATE), + UTIL_CHECK_INT(MAX_RENDER_TARGETS, 2), + UTIL_CHECK_TERMINATE +}; + +/* OpenGL 3.0 */ +/* UTIL_CHECK_INT(MAX_RENDER_TARGETS, 8), */ + +/* Shader Model 3 */ +static unsigned caps_sm3[] = { + UTIL_CHECK_SHADER(FRAGMENT, MAX_INSTRUCTIONS, 512), + UTIL_CHECK_SHADER(FRAGMENT, MAX_INPUTS, 10), + UTIL_CHECK_SHADER(FRAGMENT, MAX_TEMPS, 32), + UTIL_CHECK_SHADER(FRAGMENT, MAX_ADDRS, 1), + UTIL_CHECK_SHADER(FRAGMENT, MAX_CONSTS, 224), + + UTIL_CHECK_SHADER(VERTEX, MAX_INSTRUCTIONS, 512), + UTIL_CHECK_SHADER(VERTEX, MAX_INPUTS, 16), + UTIL_CHECK_SHADER(VERTEX, MAX_TEMPS, 32), + UTIL_CHECK_SHADER(VERTEX, MAX_ADDRS, 2), + UTIL_CHECK_SHADER(VERTEX, MAX_CONSTS, 256), + + UTIL_CHECK_TERMINATE +}; + +/** + * Demo function which checks against theoretical caps needed for different APIs. + */ +void util_caps_demo_print(struct pipe_screen *screen) +{ + struct { + char* name; + unsigned *list; + } list[] = { + {"DX 9.1", caps_dx_9_1}, + {"DX 9.2", caps_dx_9_2}, + {"DX 9.3", caps_dx_9_3}, + {"DX 10", caps_dx_10}, + {"DX 11", caps_dx_11}, + {"OpenGL 2.1", caps_opengl_2_1}, +/* {"OpenGL 3.0", caps_opengl_3_0},*/ + {"SM3", caps_sm3}, + {NULL, NULL} + }; + int i, out = 0; + + for (i = 0; list[i].name; i++) { + if (util_check_caps_out(screen, list[i].list, &out)) { + debug_printf("%s: %s yes\n", __FUNCTION__, list[i].name); + continue; + } + switch (list[i].list[out]) { + case UTIL_CAPS_CHECK_CAP: + debug_printf("%s: %s no (cap %u not supported)\n", __FUNCTION__, + list[i].name, + list[i].list[out + 1]); + break; + case UTIL_CAPS_CHECK_INT: + debug_printf("%s: %s no (cap %u less then %u)\n", __FUNCTION__, + list[i].name, + list[i].list[out + 1], + list[i].list[out + 2]); + break; + case UTIL_CAPS_CHECK_FLOAT: + debug_printf("%s: %s no (cap %u less then %f)\n", __FUNCTION__, + list[i].name, + list[i].list[out + 1], + (double)(int)list[i].list[out + 2]); + break; + case UTIL_CAPS_CHECK_FORMAT: + debug_printf("%s: %s no (format %s not supported)\n", __FUNCTION__, + list[i].name, + util_format_name(list[i].list[out + 1]) + 12); + break; + case UTIL_CAPS_CHECK_UNIMPLEMENTED: + debug_printf("%s: %s no (not implemented in gallium or state tracker)\n", + __FUNCTION__, list[i].name); + break; + default: + assert(!"Unsupported check"); + } + } +} diff --git a/src/gallium/auxiliary/util/u_caps.h b/src/gallium/auxiliary/util/u_caps.h new file mode 100644 index 00000000000..7bd23800414 --- /dev/null +++ b/src/gallium/auxiliary/util/u_caps.h @@ -0,0 +1,71 @@ +/************************************************************************** + * + * Copyright 2010 Vmware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_CAPS_H +#define U_CAPS_H + +#include "pipe/p_compiler.h" + +struct pipe_screen; + +enum u_caps_check_enum { + UTIL_CAPS_CHECK_TERMINATE = 0, + UTIL_CAPS_CHECK_CAP, + UTIL_CAPS_CHECK_INT, + UTIL_CAPS_CHECK_FLOAT, + UTIL_CAPS_CHECK_FORMAT, + UTIL_CAPS_CHECK_SHADER, + UTIL_CAPS_CHECK_UNIMPLEMENTED, +}; + +#define UTIL_CHECK_CAP(cap) \ + UTIL_CAPS_CHECK_CAP, PIPE_CAP_##cap + +#define UTIL_CHECK_INT(cap, higher) \ + UTIL_CAPS_CHECK_INT, PIPE_CAP_##cap, (unsigned)(higher) + +/* Floats currently lose precision */ +#define UTIL_CHECK_FLOAT(cap, higher) \ + UTIL_CAPS_CHECK_FLOAT, PIPE_CAP_##cap, (unsigned)(int)(higher) + +#define UTIL_CHECK_FORMAT(format) \ + UTIL_CAPS_CHECK_FORMAT, PIPE_FORMAT_##format + +#define UTIL_CHECK_SHADER(shader, cap, higher) \ + UTIL_CAPS_CHECK_SHADER, (PIPE_SHADER_##shader << 24) | PIPE_SHADER_CAP_##cap, (unsigned)(higher) + +#define UTIL_CHECK_UNIMPLEMENTED \ + UTIL_CAPS_CHECK_UNIMPLEMENTED + +#define UTIL_CHECK_TERMINATE \ + UTIL_CAPS_CHECK_TERMINATE + +boolean util_check_caps(struct pipe_screen *screen, const unsigned *list); +boolean util_check_caps_out(struct pipe_screen *screen, const unsigned *list, int *out); +void util_caps_demo_print(struct pipe_screen *screen); + +#endif diff --git a/src/gallium/auxiliary/util/u_clear.h b/src/gallium/auxiliary/util/u_clear.h index 2c32db61756..ad69df3f898 100644 --- a/src/gallium/auxiliary/util/u_clear.h +++ b/src/gallium/auxiliary/util/u_clear.h @@ -31,8 +31,6 @@ #include "pipe/p_context.h" #include "pipe/p_state.h" -#include "util/u_pack_color.h" -#include "util/u_rect.h" /** @@ -45,26 +43,17 @@ util_clear(struct pipe_context *pipe, const float *rgba, double depth, unsigned stencil) { if (buffers & PIPE_CLEAR_COLOR) { - struct pipe_surface *ps = framebuffer->cbufs[0]; - union util_color uc; - - util_pack_color(rgba, ps->format, &uc); - if (pipe->surface_fill) { - pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui); - } else { - util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, uc.ui); + unsigned i; + for (i = 0; i < framebuffer->nr_cbufs; i++) { + struct pipe_surface *ps = framebuffer->cbufs[i]; + pipe->clear_render_target(pipe, ps, rgba, 0, 0, ps->width, ps->height); } } if (buffers & PIPE_CLEAR_DEPTHSTENCIL) { struct pipe_surface *ps = framebuffer->zsbuf; - - if (pipe->surface_fill) { - pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, - util_pack_z_stencil(ps->format, depth, stencil)); - } else { - util_surface_fill(pipe, ps, 0, 0, ps->width, ps->height, - util_pack_z_stencil(ps->format, depth, stencil)); - } + pipe->clear_depth_stencil(pipe, ps, buffers & PIPE_CLEAR_DEPTHSTENCIL, + depth, stencil, + 0, 0, ps->width, ps->height); } } diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index a08241971ca..32519b148b6 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -38,7 +38,7 @@ #include "u_cpu_detect.h" #if defined(PIPE_ARCH_PPC) -#if defined(PIPE_OS_DARWIN) +#if defined(PIPE_OS_APPLE) #include <sys/sysctl.h> #else #include <signal.h> @@ -73,66 +73,19 @@ #endif -struct util_cpu_caps util_cpu_caps; - -static int has_cpuid(void); - -#if defined(PIPE_ARCH_X86) - -/* The sigill handlers */ -#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/ -static void -sigill_handler_sse(int signal, struct sigcontext sc) -{ - /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1" - * instructions are 3 bytes long. We must increment the instruction - * pointer manually to avoid repeated execution of the offending - * instruction. - * - * If the SIGILL is caused by a divide-by-zero when unmasked - * exceptions aren't supported, the SIMD FPU status and control - * word will be restored at the end of the test, so we don't need - * to worry about doing it here. Besides, we may not be able to... - */ - sc.eip += 3; - - util_cpu_caps.has_sse=0; -} +#ifdef DEBUG +DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE) +#endif -static void -sigfpe_handler_sse(int signal, struct sigcontext sc) -{ - if (sc.fpstate->magic != 0xffff) { - /* Our signal context has the extended FPU state, so reset the - * divide-by-zero exception mask and clear the divide-by-zero - * exception bit. - */ - sc.fpstate->mxcsr |= 0x00000200; - sc.fpstate->mxcsr &= 0xfffffffb; - } else { - /* If we ever get here, we're completely hosed. - */ - } -} -#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */ -#if defined(PIPE_OS_WINDOWS) -static LONG CALLBACK -win32_sig_handler_sse(EXCEPTION_POINTERS* ep) -{ - if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){ - ep->ContextRecord->Eip +=3; - util_cpu_caps.has_sse=0; - return EXCEPTION_CONTINUE_EXECUTION; - } - return EXCEPTION_CONTINUE_SEARCH; -} -#endif /* PIPE_OS_WINDOWS */ +struct util_cpu_caps util_cpu_caps; -#endif /* PIPE_ARCH_X86 */ +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) +static int has_cpuid(void); +#endif -#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_DARWIN) +#if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) static jmp_buf __lv_powerpc_jmpbuf; static volatile sig_atomic_t __lv_powerpc_canjump = 0; @@ -153,7 +106,7 @@ sigill_handler(int sig) static void check_os_altivec_support(void) { -#if defined(PIPE_OS_DARWIN) +#if defined(PIPE_OS_APPLE) int sels[2] = {CTL_HW, HW_VECTORUNIT}; int has_vu = 0; int len = sizeof (has_vu); @@ -166,8 +119,8 @@ check_os_altivec_support(void) util_cpu_caps.has_altivec = 1; } } -#else /* !PIPE_OS_DARWIN */ - /* no Darwin, do it the brute-force way */ +#else /* !PIPE_OS_APPLE */ + /* not on Apple/Darwin, do it the brute-force way */ /* this is borrowed from the libmpeg2 library */ signal(SIGILL, sigill_handler); if (setjmp(__lv_powerpc_jmpbuf)) { @@ -184,127 +137,12 @@ check_os_altivec_support(void) signal(SIGILL, SIG_DFL); util_cpu_caps.has_altivec = 1; } -#endif /* PIPE_OS_DARWIN */ +#endif /* !PIPE_OS_APPLE */ } #endif /* PIPE_ARCH_PPC */ -/* If we're running on a processor that can do SSE, let's see if we - * are allowed to or not. This will catch 2.4.0 or later kernels that - * haven't been configured for a Pentium III but are running on one, - * and RedHat patched 2.2 kernels that have broken exception handling - * support for user space apps that do SSE. - */ -#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) -static void -check_os_katmai_support(void) -{ -#if defined(PIPE_ARCH_X86) -#if defined(PIPE_OS_FREEBSD) - int has_sse=0, ret; - int len = sizeof (has_sse); - - ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0); - if (ret || !has_sse) - util_cpu_caps.has_sse=0; - -#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) - int has_sse, has_sse2, ret, mib[2]; - int varlen; - - mib[0] = CTL_MACHDEP; - mib[1] = CPU_SSE; - varlen = sizeof (has_sse); - - ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0); - if (ret < 0 || !has_sse) { - util_cpu_caps.has_sse = 0; - } else { - util_cpu_caps.has_sse = 1; - } - - mib[1] = CPU_SSE2; - varlen = sizeof (has_sse2); - ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0); - if (ret < 0 || !has_sse2) { - util_cpu_caps.has_sse2 = 0; - } else { - util_cpu_caps.has_sse2 = 1; - } - util_cpu_caps.has_sse = 0; /* FIXME ?!?!? */ - -#elif defined(PIPE_OS_WINDOWS) - LPTOP_LEVEL_EXCEPTION_FILTER exc_fil; - if (util_cpu_caps.has_sse) { - exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse); -#if defined(PIPE_CC_GCC) - __asm __volatile ("xorps %xmm0, %xmm0"); -#elif defined(PIPE_CC_MSVC) - __asm { - xorps xmm0, xmm0 /* executing SSE instruction */ - } -#else -#error Unsupported compiler -#endif - SetUnhandledExceptionFilter(exc_fil); - } -#elif defined(PIPE_OS_LINUX) - struct sigaction saved_sigill; - struct sigaction saved_sigfpe; - - /* Save the original signal handlers. - */ - sigaction(SIGILL, NULL, &saved_sigill); - sigaction(SIGFPE, NULL, &saved_sigfpe); - - signal(SIGILL, (void (*)(int))sigill_handler_sse); - signal(SIGFPE, (void (*)(int))sigfpe_handler_sse); - - /* Emulate test for OSFXSR in CR4. The OS will set this bit if it - * supports the extended FPU save and restore required for SSE. If - * we execute an SSE instruction on a PIII and get a SIGILL, the OS - * doesn't support Streaming SIMD Exceptions, even if the processor - * does. - */ - if (util_cpu_caps.has_sse) { - __asm __volatile ("xorps %xmm1, %xmm0"); - } - - /* Emulate test for OSXMMEXCPT in CR4. The OS will set this bit if - * it supports unmasked SIMD FPU exceptions. If we unmask the - * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS - * doesn't support unmasked SIMD FPU exceptions. If we get a SIGFPE - * as expected, we're okay but we need to clean up after it. - * - * Are we being too stringent in our requirement that the OS support - * unmasked exceptions? Certain RedHat 2.2 kernels enable SSE by - * setting CR4.OSFXSR but don't support unmasked exceptions. Win98 - * doesn't even support them. We at least know the user-space SSE - * support is good in kernels that do support unmasked exceptions, - * and therefore to be safe I'm going to leave this test in here. - */ - if (util_cpu_caps.has_sse) { - /* test_os_katmai_exception_support(); */ - } - - /* Restore the original signal handlers. - */ - sigaction(SIGILL, &saved_sigill, NULL); - sigaction(SIGFPE, &saved_sigfpe, NULL); - -#else - /* We can't use POSIX signal handling to test the availability of - * SSE, so we disable it by default. - */ - util_cpu_caps.has_sse = 0; -#endif /* __linux__ */ -#endif - -#if defined(PIPE_ARCH_X86_64) - util_cpu_caps.has_sse = 1; -#endif -} - +#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) static int has_cpuid(void) { #if defined(PIPE_ARCH_X86) @@ -385,23 +223,6 @@ util_cpu_detect(void) memset(&util_cpu_caps, 0, sizeof util_cpu_caps); - /* Check for arch type */ -#if defined(PIPE_ARCH_MIPS) - util_cpu_caps.arch = UTIL_CPU_ARCH_MIPS; -#elif defined(PIPE_ARCH_ALPHA) - util_cpu_caps.arch = UTIL_CPU_ARCH_ALPHA; -#elif defined(PIPE_ARCH_SPARC) - util_cpu_caps.arch = UTIL_CPU_ARCH_SPARC; -#elif defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) - util_cpu_caps.arch = UTIL_CPU_ARCH_X86; - util_cpu_caps.little_endian = 1; -#elif defined(PIPE_ARCH_PPC) - util_cpu_caps.arch = UTIL_CPU_ARCH_POWERPC; - util_cpu_caps.little_endian = 0; -#else - util_cpu_caps.arch = UTIL_CPU_ARCH_UNKNOWN; -#endif - /* Count the number of CPUs in system */ #if defined(PIPE_OS_WINDOWS) { @@ -480,9 +301,6 @@ util_cpu_detect(void) util_cpu_caps.cacheline = regs2[2] & 0xFF; } - if (util_cpu_caps.has_sse) - check_os_katmai_support(); - if (!util_cpu_caps.has_sse) { util_cpu_caps.has_sse2 = 0; util_cpu_caps.has_sse3 = 0; @@ -497,23 +315,24 @@ util_cpu_detect(void) #endif /* PIPE_ARCH_PPC */ #ifdef DEBUG - debug_printf("util_cpu_caps.arch = %i\n", util_cpu_caps.arch); - debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); - - debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); - debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); - - debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); - debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); - debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); - debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); - debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); - debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3); - debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3); - debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); - debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); - debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); - debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); + if (debug_get_option_dump_cpu()) { + debug_printf("util_cpu_caps.nr_cpus = %u\n", util_cpu_caps.nr_cpus); + + debug_printf("util_cpu_caps.x86_cpu_type = %u\n", util_cpu_caps.x86_cpu_type); + debug_printf("util_cpu_caps.cacheline = %u\n", util_cpu_caps.cacheline); + + debug_printf("util_cpu_caps.has_tsc = %u\n", util_cpu_caps.has_tsc); + debug_printf("util_cpu_caps.has_mmx = %u\n", util_cpu_caps.has_mmx); + debug_printf("util_cpu_caps.has_mmx2 = %u\n", util_cpu_caps.has_mmx2); + debug_printf("util_cpu_caps.has_sse = %u\n", util_cpu_caps.has_sse); + debug_printf("util_cpu_caps.has_sse2 = %u\n", util_cpu_caps.has_sse2); + debug_printf("util_cpu_caps.has_sse3 = %u\n", util_cpu_caps.has_sse3); + debug_printf("util_cpu_caps.has_ssse3 = %u\n", util_cpu_caps.has_ssse3); + debug_printf("util_cpu_caps.has_sse4_1 = %u\n", util_cpu_caps.has_sse4_1); + debug_printf("util_cpu_caps.has_3dnow = %u\n", util_cpu_caps.has_3dnow); + debug_printf("util_cpu_caps.has_3dnow_ext = %u\n", util_cpu_caps.has_3dnow_ext); + debug_printf("util_cpu_caps.has_altivec = %u\n", util_cpu_caps.has_altivec); + } #endif util_cpu_detect_initialized = TRUE; diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h index 4b3dc39c342..f3bef0993c7 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.h +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -36,26 +36,15 @@ #define _UTIL_CPU_DETECT_H #include "pipe/p_compiler.h" - -enum util_cpu_arch { - UTIL_CPU_ARCH_UNKNOWN = 0, - UTIL_CPU_ARCH_MIPS, - UTIL_CPU_ARCH_ALPHA, - UTIL_CPU_ARCH_SPARC, - UTIL_CPU_ARCH_X86, - UTIL_CPU_ARCH_POWERPC -}; +#include "pipe/p_config.h" struct util_cpu_caps { - enum util_cpu_arch arch; unsigned nr_cpus; /* Feature flags */ int x86_cpu_type; unsigned cacheline; - unsigned little_endian:1; - unsigned has_tsc:1; unsigned has_mmx:1; unsigned has_mmx2:1; diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c index 0de38e791d6..504e6d2a18f 100644 --- a/src/gallium/auxiliary/util/u_debug.c +++ b/src/gallium/auxiliary/util/u_debug.c @@ -42,6 +42,7 @@ #include "util/u_tile.h" #include "util/u_prim.h" +#include <limits.h> /* CHAR_BIT */ void _debug_vprintf(const char *format, va_list ap) { @@ -87,7 +88,7 @@ debug_get_option_should_print(void) * but its cool since we set first to false */ first = FALSE; - value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", TRUE); + value = debug_get_bool_option("GALLIUM_PRINT_OPTIONS", FALSE); /* XXX should we print this option? Currently it wont */ return value; } @@ -123,8 +124,12 @@ debug_get_bool_option(const char *name, boolean dfault) result = FALSE; else if(!util_strcmp(str, "f")) result = FALSE; + else if(!util_strcmp(str, "F")) + result = FALSE; else if(!util_strcmp(str, "false")) result = FALSE; + else if(!util_strcmp(str, "FALSE")) + result = FALSE; else result = TRUE; @@ -177,16 +182,21 @@ debug_get_flags_option(const char *name, { unsigned long result; const char *str; + const struct debug_named_value *orig = flags; + int namealign = 0; str = os_get_option(name); if(!str) result = dfault; else if (!util_strcmp(str, "help")) { result = dfault; - while (flags->name) { - debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value); - flags++; - } + _debug_printf("%s: help for %s:\n", __FUNCTION__, name); + for (; flags->name; ++flags) + namealign = MAX2(namealign, strlen(flags->name)); + for (flags = orig; flags->name; ++flags) + _debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name, + (int)sizeof(unsigned long)*CHAR_BIT/4, flags->value, + flags->desc ? " " : "", flags->desc ? flags->desc : ""); } else { result = 0; diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h index e8ff2773e69..1c9624ea3ed 100644 --- a/src/gallium/auxiliary/util/u_debug.h +++ b/src/gallium/auxiliary/util/u_debug.h @@ -230,6 +230,7 @@ struct debug_named_value { const char *name; unsigned long value; + const char *desc; }; @@ -252,8 +253,9 @@ struct debug_named_value * ... * @endcode */ -#define DEBUG_NAMED_VALUE(__symbol) {#__symbol, (unsigned long)__symbol} -#define DEBUG_NAMED_VALUE_END {NULL, 0} +#define DEBUG_NAMED_VALUE(__symbol) DEBUG_NAMED_VALUE_WITH_DESCRIPTION(__symbol, NULL) +#define DEBUG_NAMED_VALUE_WITH_DESCRIPTION(__symbol, __desc) {#__symbol, (unsigned long)__symbol, __desc} +#define DEBUG_NAMED_VALUE_END {NULL, 0, NULL} /** diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c new file mode 100644 index 00000000000..1c90ff31069 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_describe.c @@ -0,0 +1,81 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <pipe/p_state.h> +#include <util/u_format.h> +#include <util/u_debug_describe.h> +#include <util/u_string.h> + +void +debug_describe_reference(char* buf, const struct pipe_reference*ptr) +{ + strcpy(buf, "pipe_object"); +} + +void +debug_describe_resource(char* buf, const struct pipe_resource *ptr) +{ + switch(ptr->target) + { + case PIPE_BUFFER: + util_sprintf(buf, "pipe_buffer<%u>", (unsigned)util_format_get_stride(ptr->format, ptr->width0)); + break; + case PIPE_TEXTURE_1D: + util_sprintf(buf, "pipe_texture1d<%u,%s,%u>", ptr->width0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_2D: + util_sprintf(buf, "pipe_texture2d<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_RECT: + util_sprintf(buf, "pipe_texture_rect<%u,%u,%s>", ptr->width0, ptr->height0, util_format_short_name(ptr->format)); + break; + case PIPE_TEXTURE_CUBE: + util_sprintf(buf, "pipe_texture_cube<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_3D: + util_sprintf(buf, "pipe_texture3d<%u,%u,%u,%s,%u>", ptr->width0, ptr->height0, ptr->depth0, util_format_short_name(ptr->format), ptr->last_level); + break; + default: + util_sprintf(buf, "pipe_martian_resource<%u>", ptr->target); + break; + } +} + +void +debug_describe_surface(char* buf, const struct pipe_surface *ptr) +{ + char res[128]; + debug_describe_resource(res, ptr->texture); + util_sprintf(buf, "pipe_surface<%s,%u,%u,%u>", res, ptr->face, ptr->level, ptr->zslice); +} + +void +debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr) +{ + char res[128]; + debug_describe_resource(res, ptr->texture); + util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format)); +} diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h new file mode 100644 index 00000000000..26d1f803bf0 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_describe.h @@ -0,0 +1,49 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_DEBUG_DESCRIBE_H_ +#define U_DEBUG_DESCRIBE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_reference; +struct pipe_resource; +struct pipe_surface; +struct pipe_sampler_view; + +/* a 256-byte buffer is necessary and sufficient */ +void debug_describe_reference(char* buf, const struct pipe_reference*ptr); +void debug_describe_resource(char* buf, const struct pipe_resource *ptr); +void debug_describe_surface(char* buf, const struct pipe_surface *ptr); +void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr); + +#ifdef __cplusplus +} +#endif + +#endif /* U_DEBUG_DESCRIBE_H_ */ diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c new file mode 100644 index 00000000000..40a26c9c697 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_refcnt.c @@ -0,0 +1,181 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)) + +/* see http://www.mozilla.org/performance/refcnt-balancer.html for what do with the output + * on Linux, use tools/addr2line.sh to postprocess it before anything else + **/ +#include <util/u_debug.h> +#include <util/u_debug_refcnt.h> +#include <util/u_debug_stack.h> +#include <util/u_debug_symbol.h> +#include <util/u_string.h> +#include <util/u_hash_table.h> +#include <os/os_thread.h> +#include <os/os_stream.h> + +int debug_refcnt_state; + +struct os_stream* stream; + +/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */ +static pipe_mutex serials_mutex; +static struct util_hash_table* serials_hash; +static unsigned serials_last; + +static unsigned hash_ptr(void* p) +{ + return (unsigned)(uintptr_t)p; +} + +static int compare_ptr(void* a, void* b) +{ + if(a == b) + return 0; + else if(a < b) + return -1; + else + return 1; +} + +static boolean debug_serial(void* p, unsigned* pserial) +{ + unsigned serial; + boolean found = TRUE; + pipe_mutex_lock(serials_mutex); + if(!serials_hash) + serials_hash = util_hash_table_create(hash_ptr, compare_ptr); + serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p); + if(!serial) + { + /* time to stop logging... (you'll have a 100 GB logfile at least at this point) + * TODO: avoid this + */ + serial = ++serials_last; + if(!serial) + { + debug_error("More than 2^32 objects detected, aborting.\n"); + os_abort(); + } + + util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial); + found = FALSE; + } + pipe_mutex_unlock(serials_mutex); + *pserial = serial; + return found; +} + +static void debug_serial_delete(void* p) +{ + pipe_mutex_lock(serials_mutex); + util_hash_table_remove(serials_hash, p); + pipe_mutex_unlock(serials_mutex); +} + +#define STACK_LEN 64 + +static void dump_stack(const char* symbols[STACK_LEN]) +{ + unsigned i; + for(i = 0; i < STACK_LEN; ++i) + { + if(symbols[i]) + os_stream_printf(stream, "%s\n", symbols[i]); + } + os_stream_write(stream, "\n", 1); +} + +void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ + if(debug_refcnt_state < 0) + return; + + if(!debug_refcnt_state) + { + const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); + if(filename && filename[0]) + stream = os_file_stream_create(filename); + + if(stream) + debug_refcnt_state = 1; + else + debug_refcnt_state = -1; + } + + if(debug_refcnt_state > 0) + { + struct debug_stack_frame frames[STACK_LEN]; + const char* symbols[STACK_LEN]; + char buf[1024]; + + unsigned i; + unsigned refcnt = p->count; + unsigned serial; + boolean existing = debug_serial((void*)p, &serial); + + debug_backtrace_capture(frames, 1, STACK_LEN); + for(i = 0; i < STACK_LEN; ++i) + { + if(frames[i].function) + symbols[i] = debug_symbol_name_cached(frames[i].function); + else + symbols[i] = 0; + } + + get_desc(buf, p); + + if(!existing) + { + os_stream_printf(stream, "<%s> %p %u Create\n", buf, p, serial); + dump_stack(symbols); + + /* this is there to provide a gradual change even if we don't see the initialization */ + for(i = 1; i <= refcnt - change; ++i) + { + os_stream_printf(stream, "<%s> %p %u AddRef %u\n", buf, p, serial, i); + dump_stack(symbols); + } + } + + if(change) + { + os_stream_printf(stream, "<%s> %p %u %s %u\n", buf, p, serial, change > 0 ? "AddRef" : "Release", refcnt); + dump_stack(symbols); + } + + if(!refcnt) + { + debug_serial_delete((void*)p); + os_stream_printf(stream, "<%s> %p %u Destroy\n", buf, p, serial); + dump_stack(symbols); + } + + os_stream_flush(stream); + } +} +#endif diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h new file mode 100644 index 00000000000..bea2d1c478a --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_refcnt.h @@ -0,0 +1,63 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_DEBUG_REFCNT_H_ +#define U_DEBUG_REFCNT_H_ + +#include <pipe/p_config.h> +#include <pipe/p_state.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*); + +#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)) + +extern int debug_refcnt_state; + +void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change); + +static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ + if (debug_refcnt_state >= 0) + debug_reference_slowpath(p, get_desc, change); +} + +#else + +static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* U_DEBUG_REFCNT_H_ */ diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c index 417d0cf04c9..332952af88b 100644 --- a/src/gallium/auxiliary/util/u_debug_symbol.c +++ b/src/gallium/auxiliary/util/u_debug_symbol.c @@ -33,9 +33,12 @@ */ #include "pipe/p_compiler.h" +#include "os/os_thread.h" +#include "u_string.h" #include "u_debug.h" #include "u_debug_symbol.h" +#include "u_hash_table.h" #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86) @@ -67,21 +70,6 @@ BOOL WINAPI j_SymInitialize(HANDLE hProcess, PSTR UserSearchPath, BOOL fInvadePr return FALSE; } -typedef BOOL (WINAPI *PFNSYMCLEANUP)(HANDLE); -static PFNSYMCLEANUP pfnSymCleanup = NULL; - -static -BOOL WINAPI j_SymCleanup(HANDLE hProcess) -{ - if( - (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) && - (pfnSymCleanup || (pfnSymCleanup = (PFNSYMCLEANUP) GetProcAddress(hModule_Imagehlp, "SymCleanup"))) - ) - return pfnSymCleanup(hProcess); - else - return FALSE; -} - typedef DWORD (WINAPI *PFNSYMSETOPTIONS)(DWORD); static PFNSYMSETOPTIONS pfnSymSetOptions = NULL; @@ -97,36 +85,6 @@ DWORD WINAPI j_SymSetOptions(DWORD SymOptions) return FALSE; } -typedef BOOL (WINAPI *PFNSYMUNDNAME)(PIMAGEHLP_SYMBOL, PSTR, DWORD); -static PFNSYMUNDNAME pfnSymUnDName = NULL; - -static -BOOL WINAPI j_SymUnDName(PIMAGEHLP_SYMBOL Symbol, PSTR UnDecName, DWORD UnDecNameLength) -{ - if( - (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) && - (pfnSymUnDName || (pfnSymUnDName = (PFNSYMUNDNAME) GetProcAddress(hModule_Imagehlp, "SymUnDName"))) - ) - return pfnSymUnDName(Symbol, UnDecName, UnDecNameLength); - else - return FALSE; -} - -typedef PFUNCTION_TABLE_ACCESS_ROUTINE PFNSYMFUNCTIONTABLEACCESS; -static PFNSYMFUNCTIONTABLEACCESS pfnSymFunctionTableAccess = NULL; - -static -PVOID WINAPI j_SymFunctionTableAccess(HANDLE hProcess, DWORD AddrBase) -{ - if( - (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) && - (pfnSymFunctionTableAccess || (pfnSymFunctionTableAccess = (PFNSYMFUNCTIONTABLEACCESS) GetProcAddress(hModule_Imagehlp, "SymFunctionTableAccess"))) - ) - return pfnSymFunctionTableAccess(hProcess, AddrBase); - else - return NULL; -} - typedef PGET_MODULE_BASE_ROUTINE PFNSYMGETMODULEBASE; static PFNSYMGETMODULEBASE pfnSymGetModuleBase = NULL; @@ -142,41 +100,6 @@ DWORD WINAPI j_SymGetModuleBase(HANDLE hProcess, DWORD dwAddr) return 0; } -typedef BOOL (WINAPI *PFNSTACKWALK)(DWORD, HANDLE, HANDLE, LPSTACKFRAME, LPVOID, PREAD_PROCESS_MEMORY_ROUTINE, PFUNCTION_TABLE_ACCESS_ROUTINE, PGET_MODULE_BASE_ROUTINE, PTRANSLATE_ADDRESS_ROUTINE); -static PFNSTACKWALK pfnStackWalk = NULL; - -static -BOOL WINAPI j_StackWalk( - DWORD MachineType, - HANDLE hProcess, - HANDLE hThread, - LPSTACKFRAME StackFrame, - PVOID ContextRecord, - PREAD_PROCESS_MEMORY_ROUTINE ReadMemoryRoutine, - PFUNCTION_TABLE_ACCESS_ROUTINE FunctionTableAccessRoutine, - PGET_MODULE_BASE_ROUTINE GetModuleBaseRoutine, - PTRANSLATE_ADDRESS_ROUTINE TranslateAddress -) -{ - if( - (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) && - (pfnStackWalk || (pfnStackWalk = (PFNSTACKWALK) GetProcAddress(hModule_Imagehlp, "StackWalk"))) - ) - return pfnStackWalk( - MachineType, - hProcess, - hThread, - StackFrame, - ContextRecord, - ReadMemoryRoutine, - FunctionTableAccessRoutine, - GetModuleBaseRoutine, - TranslateAddress - ); - else - return FALSE; -} - typedef BOOL (WINAPI *PFNSYMGETSYMFROMADDR)(HANDLE, DWORD, LPDWORD, PIMAGEHLP_SYMBOL); static PFNSYMGETSYMFROMADDR pfnSymGetSymFromAddr = NULL; @@ -192,24 +115,9 @@ BOOL WINAPI j_SymGetSymFromAddr(HANDLE hProcess, DWORD Address, PDWORD Displacem return FALSE; } -typedef BOOL (WINAPI *PFNSYMGETLINEFROMADDR)(HANDLE, DWORD, LPDWORD, PIMAGEHLP_LINE); -static PFNSYMGETLINEFROMADDR pfnSymGetLineFromAddr = NULL; -static -BOOL WINAPI j_SymGetLineFromAddr(HANDLE hProcess, DWORD dwAddr, PDWORD pdwDisplacement, PIMAGEHLP_LINE Line) -{ - if( - (hModule_Imagehlp || (hModule_Imagehlp = LoadLibraryA("IMAGEHLP.DLL"))) && - (pfnSymGetLineFromAddr || (pfnSymGetLineFromAddr = (PFNSYMGETLINEFROMADDR) GetProcAddress(hModule_Imagehlp, "SymGetLineFromAddr"))) - ) - return pfnSymGetLineFromAddr(hProcess, dwAddr, pdwDisplacement, Line); - else - return FALSE; -} - - -static INLINE boolean -debug_symbol_print_imagehlp(const void *addr) +static INLINE void +debug_symbol_name_imagehlp(const void *addr, char* buf, unsigned size) { HANDLE hProcess; BYTE symbolBuffer[1024]; @@ -226,25 +134,95 @@ debug_symbol_print_imagehlp(const void *addr) if(j_SymInitialize(hProcess, NULL, TRUE)) bSymInitialized = TRUE; } - + if(!j_SymGetSymFromAddr(hProcess, (DWORD)addr, &dwDisplacement, pSymbol)) - return FALSE; + buf[0] = 0; + else + { + strncpy(buf, pSymbol->Name, size); + buf[size - 1] = 0; + } +} +#endif - debug_printf("\t%s\n", pSymbol->Name); +#ifdef __GLIBC__ +#include <execinfo.h> - return TRUE; - +/* This can only provide dynamic symbols, or binary offsets into a file. + * + * To fix this, post-process the output with tools/addr2line.sh + */ +static INLINE void +debug_symbol_name_glibc(const void *addr, char* buf, unsigned size) +{ + char** syms = backtrace_symbols((void**)&addr, 1); + strncpy(buf, syms[0], size); + buf[size - 1] = 0; + free(syms); } #endif - void -debug_symbol_print(const void *addr) +debug_symbol_name(const void *addr, char* buf, unsigned size) { #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86) - if(debug_symbol_print_imagehlp(addr)) + debug_symbol_name_imagehlp(addr, buf, size); + if(buf[0]) return; #endif - - debug_printf("\t%p\n", addr); + +#ifdef __GLIBC__ + debug_symbol_name_glibc(addr, buf, size); + if(buf[0]) + return; +#endif + + util_snprintf(buf, size, "%p", addr); + buf[size - 1] = 0; +} + +void +debug_symbol_print(const void *addr) +{ + char buf[1024]; + debug_symbol_name(addr, buf, sizeof(buf)); + debug_printf("\t%s\n", buf); +} + +struct util_hash_table* symbols_hash; +pipe_mutex symbols_mutex; + +static unsigned hash_ptr(void* p) +{ + return (unsigned)(uintptr_t)p; +} + +static int compare_ptr(void* a, void* b) +{ + if(a == b) + return 0; + else if(a < b) + return -1; + else + return 1; +} + +const char* +debug_symbol_name_cached(const void *addr) +{ + const char* name; + pipe_mutex_lock(symbols_mutex); + if(!symbols_hash) + symbols_hash = util_hash_table_create(hash_ptr, compare_ptr); + name = util_hash_table_get(symbols_hash, (void*)addr); + if(!name) + { + char buf[1024]; + debug_symbol_name(addr, buf, sizeof(buf)); + name = strdup(buf); + + util_hash_table_set(symbols_hash, (void*)addr, (void*)name); + } + pipe_mutex_unlock(symbols_mutex); + return name; } diff --git a/src/gallium/auxiliary/util/u_debug_symbol.h b/src/gallium/auxiliary/util/u_debug_symbol.h index 021586987b6..b247706c2a0 100644 --- a/src/gallium/auxiliary/util/u_debug_symbol.h +++ b/src/gallium/auxiliary/util/u_debug_symbol.h @@ -43,8 +43,13 @@ extern "C" { void -debug_symbol_print(const void *addr); +debug_symbol_name(const void *addr, char* buf, unsigned size); + +const char* +debug_symbol_name_cached(const void *addr); +void +debug_symbol_print(const void *addr); #ifdef __cplusplus } diff --git a/src/gallium/auxiliary/util/u_dirty_flags.h b/src/gallium/auxiliary/util/u_dirty_flags.h new file mode 100644 index 00000000000..7e1be45ad5a --- /dev/null +++ b/src/gallium/auxiliary/util/u_dirty_flags.h @@ -0,0 +1,28 @@ +#ifndef U_DIRTY_FLAGS_H +#define U_DIRTY_FLAGS_H + +/* Here's a convenient list of dirty flags to use in a driver. Either + * include it directly or use it as a starting point for your own + * list. + */ +#define U_NEW_VIEWPORT 0x1 +#define U_NEW_RASTERIZER 0x2 +#define U_NEW_FS 0x4 +#define U_NEW_FS_CONSTANTS 0x8 +#define U_NEW_FS_SAMPLER_VIEW 0x10 +#define U_NEW_FS_SAMPLER_STATES 0x20 +#define U_NEW_VS 0x40 +#define U_NEW_VS_CONSTANTS 0x80 +#define U_NEW_VS_SAMPLER_VIEW 0x100 +#define U_NEW_VS_SAMPLER_STATES 0x200 +#define U_NEW_BLEND 0x400 +#define U_NEW_CLIP 0x800 +#define U_NEW_SCISSOR 0x1000 +#define U_NEW_POLYGON_STIPPLE 0x2000 +#define U_NEW_FRAMEBUFFER 0x4000 +#define U_NEW_VERTEX_ELEMENTS 0x8000 +#define U_NEW_VERTEX_BUFFER 0x10000 +#define U_NEW_QUERY 0x20000 +#define U_NEW_DEPTH_STENCIL 0x40000 + +#endif diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h index 99f260bf967..fd1bbe5ffdf 100644 --- a/src/gallium/auxiliary/util/u_dirty_surfaces.h +++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h @@ -1,9 +1,39 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef U_DIRTY_SURFACES_H_ #define U_DIRTY_SURFACES_H_ +#include "pipe/p_state.h" + #include "util/u_double_list.h" #include "util/u_math.h" +struct pipe_context; + typedef void (*util_dirty_surface_flush_t) (struct pipe_context *, struct pipe_surface *); struct util_dirty_surfaces diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h index 2853b447c61..80a00ed6796 100644 --- a/src/gallium/auxiliary/util/u_dl.h +++ b/src/gallium/auxiliary/util/u_dl.h @@ -35,10 +35,13 @@ #if defined(PIPE_OS_WINDOWS) # define UTIL_DL_EXT ".dll" +# define UTIL_DL_PREFIX "" #elif defined(PIPE_OS_APPLE) # define UTIL_DL_EXT ".dylib" +# define UTIL_DL_PREFIX "lib" #else # define UTIL_DL_EXT ".so" +# define UTIL_DL_PREFIX "lib" #endif diff --git a/src/gallium/auxiliary/util/u_double_list.h b/src/gallium/auxiliary/util/u_double_list.h index 53bb1342ddc..42adb1f0699 100644 --- a/src/gallium/auxiliary/util/u_double_list.h +++ b/src/gallium/auxiliary/util/u_double_list.h @@ -98,5 +98,20 @@ struct list_head #define LIST_IS_EMPTY(__list) \ ((__list)->next == (__list)) - +#ifndef container_of +#define container_of(ptr, sample, member) \ + (void *)((char *)(ptr) \ + - ((char *)&(sample)->member - (char *)(sample))) +#endif + +#define LIST_FOR_EACH_ENTRY(pos, head, member) \ + for (pos = container_of((head)->next, pos, member); \ + &pos->member != (head); \ + pos = container_of(pos->member.next, pos, member)) + +#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member) \ + for (pos = container_of((head)->next, pos, member), \ + storage = container_of(pos->member.next, pos, member); \ + &pos->member != (head); \ + pos = storage, storage = container_of(storage->member.next, storage, member)) #endif /*_U_DOUBLE_LIST_H_*/ diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h new file mode 100644 index 00000000000..f06d09ef91d --- /dev/null +++ b/src/gallium/auxiliary/util/u_draw.h @@ -0,0 +1,139 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_DRAW_H +#define U_DRAW_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + + +static INLINE void +util_draw_init_info(struct pipe_draw_info *info) +{ + memset(info, 0, sizeof(*info)); + info->instance_count = 1; + info->max_index = 0xffffffff; +} + + +static INLINE void +util_draw_arrays(struct pipe_context *pipe, uint mode, uint start, uint count) +{ + struct pipe_draw_info info; + + util_draw_init_info(&info); + info.mode = mode; + info.start = start; + info.count = count; + info.min_index = start; + info.max_index = start + count - 1; + + pipe->draw_vbo(pipe, &info); +} + +static INLINE void +util_draw_elements(struct pipe_context *pipe, int index_bias, + uint mode, uint start, uint count) +{ + struct pipe_draw_info info; + + util_draw_init_info(&info); + info.indexed = TRUE; + info.mode = mode; + info.start = start; + info.count = count; + info.index_bias = index_bias; + + pipe->draw_vbo(pipe, &info); +} + +static INLINE void +util_draw_arrays_instanced(struct pipe_context *pipe, + uint mode, uint start, uint count, + uint start_instance, + uint instance_count) +{ + struct pipe_draw_info info; + + util_draw_init_info(&info); + info.mode = mode; + info.start = start; + info.count = count; + info.start_instance = start_instance; + info.instance_count = instance_count; + info.min_index = start; + info.max_index = start + count - 1; + + pipe->draw_vbo(pipe, &info); +} + +static INLINE void +util_draw_elements_instanced(struct pipe_context *pipe, + int index_bias, + uint mode, uint start, uint count, + uint start_instance, + uint instance_count) +{ + struct pipe_draw_info info; + + util_draw_init_info(&info); + info.indexed = TRUE; + info.mode = mode; + info.start = start; + info.count = count; + info.index_bias = index_bias; + info.start_instance = start_instance; + info.instance_count = instance_count; + + pipe->draw_vbo(pipe, &info); +} + +static INLINE void +util_draw_range_elements(struct pipe_context *pipe, + int index_bias, + uint min_index, + uint max_index, + uint mode, uint start, uint count) +{ + struct pipe_draw_info info; + + util_draw_init_info(&info); + info.indexed = TRUE; + info.mode = mode; + info.start = start; + info.count = count; + info.index_bias = index_bias; + info.min_index = min_index; + info.max_index = max_index; + + pipe->draw_vbo(pipe, &info); +} + +#endif diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c index b37b48b5aef..0b6dc5880f3 100644 --- a/src/gallium/auxiliary/util/u_draw_quad.c +++ b/src/gallium/auxiliary/util/u_draw_quad.c @@ -60,7 +60,7 @@ util_draw_vertex_buffer(struct pipe_context *pipe, /* note: vertex elements already set by caller */ /* draw */ - pipe->draw_arrays(pipe, prim_type, 0, num_verts); + util_draw_arrays(pipe, prim_type, 0, num_verts); } diff --git a/src/gallium/auxiliary/util/u_draw_quad.h b/src/gallium/auxiliary/util/u_draw_quad.h index 42eb1844289..52994fe05c3 100644 --- a/src/gallium/auxiliary/util/u_draw_quad.h +++ b/src/gallium/auxiliary/util/u_draw_quad.h @@ -29,12 +29,18 @@ #define U_DRAWQUAD_H +#include "pipe/p_compiler.h" +#include "pipe/p_context.h" + + #ifdef __cplusplus extern "C" { #endif struct pipe_resource; +#include "util/u_draw.h" + extern void util_draw_vertex_buffer(struct pipe_context *pipe, struct pipe_resource *vbuf, uint offset, diff --git a/src/gallium/auxiliary/util/u_dump.h b/src/gallium/auxiliary/util/u_dump.h index bdc73ac47d2..49536c0d593 100644 --- a/src/gallium/auxiliary/util/u_dump.h +++ b/src/gallium/auxiliary/util/u_dump.h @@ -71,9 +71,15 @@ const char * util_dump_blend_func(unsigned value, boolean shortened); const char * +util_dump_logicop(unsigned value, boolean shortened); + +const char * util_dump_func(unsigned value, boolean shortened); const char * +util_dump_stencil_op(unsigned value, boolean shortened); + +const char * util_dump_tex_target(unsigned value, boolean shortened); const char * diff --git a/src/gallium/auxiliary/util/u_dump_defines.c b/src/gallium/auxiliary/util/u_dump_defines.c index 96a22563473..692d4447c66 100644 --- a/src/gallium/auxiliary/util/u_dump_defines.c +++ b/src/gallium/auxiliary/util/u_dump_defines.c @@ -160,6 +160,49 @@ DEFINE_UTIL_DUMP_CONTINUOUS(blend_func) static const char * +util_dump_logicop_names[] = { + "PIPE_LOGICOP_CLEAR", + "PIPE_LOGICOP_NOR", + "PIPE_LOGICOP_AND_INVERTED", + "PIPE_LOGICOP_COPY_INVERTED", + "PIPE_LOGICOP_AND_REVERSE", + "PIPE_LOGICOP_INVERT", + "PIPE_LOGICOP_XOR", + "PIPE_LOGICOP_NAND", + "PIPE_LOGICOP_AND", + "PIPE_LOGICOP_EQUIV", + "PIPE_LOGICOP_NOOP", + "PIPE_LOGICOP_OR_INVERTED", + "PIPE_LOGICOP_COPY", + "PIPE_LOGICOP_OR_REVERSE", + "PIPE_LOGICOP_OR", + "PIPE_LOGICOP_SET" +}; + +static const char * +util_dump_logicop_short_names[] = { + "clear", + "nor", + "and_inverted", + "copy_inverted", + "and_reverse", + "invert", + "xor", + "nand", + "and", + "equiv", + "noop", + "or_inverted", + "copy", + "or_reverse", + "or", + "set" +}; + +DEFINE_UTIL_DUMP_CONTINUOUS(logicop) + + +static const char * util_dump_func_names[] = { "PIPE_FUNC_NEVER", "PIPE_FUNC_LESS", @@ -187,7 +230,35 @@ DEFINE_UTIL_DUMP_CONTINUOUS(func) static const char * +util_dump_stencil_op_names[] = { + "PIPE_STENCIL_OP_KEEP", + "PIPE_STENCIL_OP_ZERO", + "PIPE_STENCIL_OP_REPLACE", + "PIPE_STENCIL_OP_INCR", + "PIPE_STENCIL_OP_DECR", + "PIPE_STENCIL_OP_INCR_WRAP", + "PIPE_STENCIL_OP_DECR_WRAP", + "PIPE_STENCIL_OP_INVERT" +}; + +static const char * +util_dump_stencil_op_short_names[] = { + "keep", + "zero", + "replace", + "incr", + "decr", + "incr_wrap", + "decr_wrap", + "invert" +}; + +DEFINE_UTIL_DUMP_CONTINUOUS(stencil_op) + + +static const char * util_dump_tex_target_names[] = { + "PIPE_BUFFER", "PIPE_TEXTURE_1D", "PIPE_TEXTURE_2D", "PIPE_TEXTURE_3D", @@ -196,6 +267,7 @@ util_dump_tex_target_names[] = { static const char * util_dump_tex_target_short_names[] = { + "buffer", "1d", "2d", "3d", diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c index 2ce643e90cd..cda5b8ba512 100644 --- a/src/gallium/auxiliary/util/u_dump_state.c +++ b/src/gallium/auxiliary/util/u_dump_state.c @@ -300,12 +300,13 @@ util_dump_rasterizer_state(struct os_stream *stream, const struct pipe_rasterize util_dump_member(stream, bool, state, flatshade); util_dump_member(stream, bool, state, light_twoside); - util_dump_member(stream, uint, state, front_winding); - util_dump_member(stream, uint, state, cull_mode); - util_dump_member(stream, uint, state, fill_cw); - util_dump_member(stream, uint, state, fill_ccw); - util_dump_member(stream, bool, state, offset_cw); - util_dump_member(stream, bool, state, offset_ccw); + util_dump_member(stream, uint, state, front_ccw); + util_dump_member(stream, uint, state, cull_face); + util_dump_member(stream, uint, state, fill_front); + util_dump_member(stream, uint, state, fill_back); + util_dump_member(stream, bool, state, offset_point); + util_dump_member(stream, bool, state, offset_line); + util_dump_member(stream, bool, state, offset_tri); util_dump_member(stream, bool, state, scissor); util_dump_member(stream, bool, state, poly_smooth); util_dump_member(stream, bool, state, poly_stipple_enable); diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h index 9d1c1713a7c..980cadf22d1 100644 --- a/src/gallium/auxiliary/util/u_dynarray.h +++ b/src/gallium/auxiliary/util/u_dynarray.h @@ -106,6 +106,9 @@ util_dynarray_trim(struct util_dynarray *buf) #define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type))) #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type) #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type)) +#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx)) +#define util_dynarray_begin(buf) ((buf)->data) +#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size)) #endif /* U_DYNARRAY_H */ diff --git a/src/gallium/auxiliary/util/u_format.c b/src/gallium/auxiliary/util/u_format.c index c50c807eb89..4896faa12bf 100644 --- a/src/gallium/auxiliary/util/u_format.c +++ b/src/gallium/auxiliary/util/u_format.c @@ -120,11 +120,67 @@ util_format_write_4ub(enum pipe_format format, const uint8_t *src, unsigned src_ } -static INLINE boolean +boolean +util_is_format_compatible(const struct util_format_description *src_desc, + const struct util_format_description *dst_desc) +{ + unsigned chan; + + if (src_desc->format == dst_desc->format) { + return TRUE; + } + + if (src_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN || + dst_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) { + return FALSE; + } + + if (src_desc->block.bits != dst_desc->block.bits || + src_desc->nr_channels != dst_desc->nr_channels || + src_desc->colorspace != dst_desc->colorspace) { + return FALSE; + } + + for (chan = 0; chan < 4; ++chan) { + if (src_desc->channel[chan].size != + dst_desc->channel[chan].size) { + return FALSE; + } + } + + for (chan = 0; chan < 4; ++chan) { + enum util_format_swizzle swizzle = dst_desc->swizzle[chan]; + + if (swizzle < 4) { + if (src_desc->swizzle[chan] != swizzle) { + return FALSE; + } + if ((src_desc->channel[swizzle].type != + dst_desc->channel[swizzle].type) || + (src_desc->channel[swizzle].normalized != + dst_desc->channel[swizzle].normalized)) { + return FALSE; + } + } + } + + return TRUE; +} + + +boolean util_format_fits_8unorm(const struct util_format_description *format_desc) { unsigned chan; + /* + * After linearized sRGB values require more than 8bits. + */ + + if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { + return FALSE; + } + switch (format_desc->layout) { case UTIL_FORMAT_LAYOUT_S3TC: @@ -189,11 +245,14 @@ util_format_translate(enum pipe_format dst_format, const struct util_format_description *src_format_desc; uint8_t *dst_row; const uint8_t *src_row; - unsigned y_step; + unsigned x_step, y_step; unsigned dst_step; unsigned src_step; - if (dst_format == src_format) { + dst_format_desc = util_format_description(dst_format); + src_format_desc = util_format_description(src_format); + + if (util_is_format_compatible(src_format_desc, dst_format_desc)) { /* * Trivial case. */ @@ -204,9 +263,6 @@ util_format_translate(enum pipe_format dst_format, return; } - dst_format_desc = util_format_description(dst_format); - src_format_desc = util_format_description(src_format); - assert(dst_x % dst_format_desc->block.width == 0); assert(dst_y % dst_format_desc->block.height == 0); assert(src_x % src_format_desc->block.width == 0); @@ -221,6 +277,7 @@ util_format_translate(enum pipe_format dst_format, */ y_step = MAX2(dst_format_desc->block.height, src_format_desc->block.height); + x_step = MAX2(dst_format_desc->block.width, src_format_desc->block.width); assert(y_step % dst_format_desc->block.height == 0); assert(y_step % src_format_desc->block.height == 0); @@ -237,7 +294,7 @@ util_format_translate(enum pipe_format dst_format, unsigned tmp_stride; uint8_t *tmp_row; - tmp_stride = width * 4 * sizeof *tmp_row; + tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row; tmp_row = MALLOC(y_step * tmp_stride); if (!tmp_row) return; @@ -262,7 +319,7 @@ util_format_translate(enum pipe_format dst_format, unsigned tmp_stride; float *tmp_row; - tmp_stride = width * 4 * sizeof *tmp_row; + tmp_stride = MAX2(width, x_step) * 4 * sizeof *tmp_row; tmp_row = MALLOC(y_step * tmp_stride); if (!tmp_row) return; diff --git a/src/gallium/auxiliary/util/u_format.h b/src/gallium/auxiliary/util/u_format.h index 605b13bd114..03b73c0e98f 100644 --- a/src/gallium/auxiliary/util/u_format.h +++ b/src/gallium/auxiliary/util/u_format.h @@ -213,6 +213,16 @@ struct util_format_description unsigned width, unsigned height); /** + * Fetch a single pixel (i, j) from a block. + * + * XXX: Only defined for a very few select formats. + */ + void + (*fetch_rgba_8unorm)(uint8_t *dst, + const uint8_t *src, + unsigned i, unsigned j); + + /** * Unpack pixel blocks to R32G32B32A32_FLOAT. * Note: strides are in bytes. * @@ -332,12 +342,60 @@ util_format_name(enum pipe_format format) assert(desc); if (!desc) { - return "???"; + return "PIPE_FORMAT_???"; } return desc->name; } +static INLINE const char * +util_format_short_name(enum pipe_format format) +{ + const struct util_format_description *desc = util_format_description(format); + + assert(desc); + if (!desc) { + return "???"; + } + + return desc->short_name; +} + +/** + * Whether this format is plain, see UTIL_FORMAT_LAYOUT_PLAIN for more info. + */ +static INLINE boolean +util_format_is_plain(enum pipe_format format) +{ + const struct util_format_description *desc = util_format_description(format); + + if (!format) { + return FALSE; + } + + return desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ? TRUE : FALSE; +} + +static INLINE boolean +util_format_is_compressed(enum pipe_format format) +{ + const struct util_format_description *desc = util_format_description(format); + + assert(desc); + if (!desc) { + return FALSE; + } + + switch (desc->layout) { + case UTIL_FORMAT_LAYOUT_S3TC: + case UTIL_FORMAT_LAYOUT_RGTC: + /* XXX add other formats in the future */ + return TRUE; + default: + return FALSE; + } +} + static INLINE boolean util_format_is_s3tc(enum pipe_format format) { @@ -382,6 +440,48 @@ util_format_is_depth_and_stencil(enum pipe_format format) desc->swizzle[1] != UTIL_FORMAT_SWIZZLE_NONE) ? TRUE : FALSE; } + +/** + * Give the RGBA colormask of the channels that can be represented in this + * format. + * + * That is, the channels whose values are preserved. + */ +static INLINE unsigned +util_format_colormask(const struct util_format_description *desc) +{ + unsigned colormask; + unsigned chan; + + switch (desc->colorspace) { + case UTIL_FORMAT_COLORSPACE_RGB: + case UTIL_FORMAT_COLORSPACE_SRGB: + case UTIL_FORMAT_COLORSPACE_YUV: + colormask = 0; + for (chan = 0; chan < 4; ++chan) { + if (desc->swizzle[chan] < 4) { + colormask |= (1 << chan); + } + } + return colormask; + case UTIL_FORMAT_COLORSPACE_ZS: + return 0; + default: + assert(0); + return 0; + } +} + + +/** + * Whether the src format can be blitted to destation format with a simple + * memcpy. + */ +boolean +util_is_format_compatible(const struct util_format_description *src_desc, + const struct util_format_description *dst_desc); + + /** * Whether this format is a rgab8 variant. * @@ -573,6 +673,44 @@ util_format_has_alpha(enum pipe_format format) } /** + * Return the matching SRGB format, or PIPE_FORMAT_NONE if none. + */ +static INLINE enum pipe_format +util_format_srgb(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_L8_UNORM: + return PIPE_FORMAT_L8_SRGB; + case PIPE_FORMAT_L8A8_UNORM: + return PIPE_FORMAT_L8A8_SRGB; + case PIPE_FORMAT_R8G8B8_UNORM: + return PIPE_FORMAT_R8G8B8_SRGB; + case PIPE_FORMAT_A8B8G8R8_UNORM: + return PIPE_FORMAT_A8B8G8R8_SRGB; + case PIPE_FORMAT_X8B8G8R8_UNORM: + return PIPE_FORMAT_X8B8G8R8_SRGB; + case PIPE_FORMAT_B8G8R8A8_UNORM: + return PIPE_FORMAT_B8G8R8A8_SRGB; + case PIPE_FORMAT_B8G8R8X8_UNORM: + return PIPE_FORMAT_B8G8R8X8_SRGB; + case PIPE_FORMAT_A8R8G8B8_UNORM: + return PIPE_FORMAT_A8R8G8B8_SRGB; + case PIPE_FORMAT_X8R8G8B8_UNORM: + return PIPE_FORMAT_X8R8G8B8_SRGB; + case PIPE_FORMAT_DXT1_RGB: + return PIPE_FORMAT_DXT1_SRGB; + case PIPE_FORMAT_DXT1_RGBA: + return PIPE_FORMAT_DXT1_SRGBA; + case PIPE_FORMAT_DXT3_RGBA: + return PIPE_FORMAT_DXT3_SRGBA; + case PIPE_FORMAT_DXT5_RGBA: + return PIPE_FORMAT_DXT5_SRGBA; + default: + return PIPE_FORMAT_NONE; + } +} + +/** * Return the number of components stored. * Formats with block size != 1x1 will always have 1 component (the block). */ @@ -615,6 +753,9 @@ util_format_write_4ub(enum pipe_format format, * Generic format conversion; */ +boolean +util_format_fits_8unorm(const struct util_format_description *format_desc); + void util_format_translate(enum pipe_format dst_format, void *dst, unsigned dst_stride, diff --git a/src/gallium/auxiliary/util/u_format_other.c b/src/gallium/auxiliary/util/u_format_other.c index 723fa8c3bf9..fa42ec37138 100644 --- a/src/gallium/auxiliary/util/u_format_other.c +++ b/src/gallium/auxiliary/util/u_format_other.c @@ -121,6 +121,15 @@ util_format_r1_unorm_pack_rgba_8unorm(uint8_t *dst_row, unsigned dst_stride, * A.k.a. D3DFMT_CxV8U8 */ +static uint8_t +r8g8bx_derive(int16_t r, int16_t g) +{ + /* Derive blue from red and green components. + * Apparently, we must always use integers to perform calculations, + * otherwise the results won't match D3D's CxV8U8 definition. + */ + return (uint8_t)sqrtf(0x7f * 0x7f - r * r - g * g) * 0xff / 0x7f; +} void util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, @@ -145,7 +154,7 @@ util_format_r8g8bx_snorm_unpack_rgba_float(float *dst_row, unsigned dst_stride, dst[0] = (float)(r * (1.0f/0x7f)); /* r */ dst[1] = (float)(g * (1.0f/0x7f)); /* g */ - dst[2] = sqrtf(1.0f - dst[0] * dst[0] - dst[1] * dst[1]); /* b */ + dst[2] = r8g8bx_derive(r, g) * (1.0f/0xff); /* b */ dst[3] = 1.0f; /* a */ dst += 4; } @@ -177,7 +186,7 @@ util_format_r8g8bx_snorm_unpack_rgba_8unorm(uint8_t *dst_row, unsigned dst_strid dst[0] = (uint8_t)(((uint16_t)MAX2(r, 0)) * 0xff / 0x7f); /* r */ dst[1] = (uint8_t)(((uint16_t)MAX2(g, 0)) * 0xff / 0x7f); /* g */ - dst[2] = (uint8_t)sqrtf(0x7f*0x7f - r * r - g * g) * 0xff / 0x7f; /* b */ + dst[2] = r8g8bx_derive(r, g); /* b */ dst[3] = 255; /* a */ dst += 4; } @@ -262,6 +271,6 @@ util_format_r8g8bx_snorm_fetch_rgba_float(float *dst, const uint8_t *src, dst[0] = r * (1.0f/0x7f); /* r */ dst[1] = g * (1.0f/0x7f); /* g */ - dst[2] = sqrtf(1.0f - dst[0] * dst[0] - dst[1] * dst[1]); /* b */ + dst[2] = r8g8bx_derive(r, g) * (1.0f/0xff); /* b */ dst[3] = 1.0f; /* a */ } diff --git a/src/gallium/auxiliary/util/u_format_pack.py b/src/gallium/auxiliary/util/u_format_pack.py index 0c1bbc84c17..6d0016c0ad8 100644 --- a/src/gallium/auxiliary/util/u_format_pack.py +++ b/src/gallium/auxiliary/util/u_format_pack.py @@ -37,9 +37,6 @@ ''' -import sys -import math - from u_format_parse import * diff --git a/src/gallium/auxiliary/util/u_format_parse.py b/src/gallium/auxiliary/util/u_format_parse.py index 7076c676aaf..ddb9f2443d9 100755 --- a/src/gallium/auxiliary/util/u_format_parse.py +++ b/src/gallium/auxiliary/util/u_format_parse.py @@ -43,7 +43,7 @@ ZS = 'zs' def is_pot(x): - return (x & (x - 1)) == 0; + return (x & (x - 1)) == 0 VERY_LARGE = 99999999999999999999999 diff --git a/src/gallium/auxiliary/util/u_format_s3tc.c b/src/gallium/auxiliary/util/u_format_s3tc.c index 5b279b8fe26..bb989c29d81 100644 --- a/src/gallium/auxiliary/util/u_format_s3tc.c +++ b/src/gallium/auxiliary/util/u_format_s3tc.c @@ -120,7 +120,7 @@ util_format_s3tc_init(void) library = util_dl_open(DXTN_LIBNAME); if (!library) { debug_printf("couldn't open " DXTN_LIBNAME ", software DXTn " - "compression/decompression unavailable"); + "compression/decompression unavailable\n"); return; } @@ -142,7 +142,7 @@ util_format_s3tc_init(void) !util_format_dxtn_pack) { debug_printf("couldn't reference all symbols in " DXTN_LIBNAME ", software DXTn compression/decompression " - "unavailable"); + "unavailable\n"); util_dl_close(library); return; } diff --git a/src/gallium/auxiliary/util/u_format_srgb.py b/src/gallium/auxiliary/util/u_format_srgb.py index a4c76dc00b3..3e8000f3687 100644 --- a/src/gallium/auxiliary/util/u_format_srgb.py +++ b/src/gallium/auxiliary/util/u_format_srgb.py @@ -39,7 +39,6 @@ ''' -import sys import math diff --git a/src/gallium/auxiliary/util/u_format_table.py b/src/gallium/auxiliary/util/u_format_table.py index ae9a5981973..f0b407b8b8e 100755 --- a/src/gallium/auxiliary/util/u_format_table.py +++ b/src/gallium/auxiliary/util/u_format_table.py @@ -132,12 +132,17 @@ def write_format_table(formats): if format.colorspace != ZS: print " &util_format_%s_unpack_rgba_8unorm," % format.short_name() print " &util_format_%s_pack_rgba_8unorm," % format.short_name() + if format.layout == 's3tc': + print " &util_format_%s_fetch_rgba_8unorm," % format.short_name() + else: + print " NULL, /* fetch_rgba_8unorm */" print " &util_format_%s_unpack_rgba_float," % format.short_name() print " &util_format_%s_pack_rgba_float," % format.short_name() print " &util_format_%s_fetch_rgba_float," % format.short_name() else: print " NULL, /* unpack_rgba_8unorm */" print " NULL, /* pack_rgba_8unorm */" + print " NULL, /* fetch_rgba_8unorm */" print " NULL, /* unpack_rgba_float */" print " NULL, /* pack_rgba_float */" print " NULL, /* fetch_rgba_float */" diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c new file mode 100644 index 00000000000..7803ec6a8b5 --- /dev/null +++ b/src/gallium/auxiliary/util/u_framebuffer.c @@ -0,0 +1,148 @@ +/************************************************************************** + * + * Copyright 2009-2010 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Framebuffer utility functions. + * + * @author Brian Paul + */ + + +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_inlines.h" + +#include "util/u_memory.h" +#include "util/u_framebuffer.h" + + +/** + * Compare pipe_framebuffer_state objects. + * \return TRUE if same, FALSE if different + */ +boolean +util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst, + const struct pipe_framebuffer_state *src) +{ + unsigned i; + + if (dst->width != src->width || + dst->height != src->height) + return FALSE; + + for (i = 0; i < Elements(src->cbufs); i++) { + if (dst->cbufs[i] != src->cbufs[i]) { + return FALSE; + } + } + + if (dst->nr_cbufs != src->nr_cbufs) { + return FALSE; + } + + if (dst->zsbuf != src->zsbuf) { + return FALSE; + } + + return TRUE; +} + + +/** + * Copy framebuffer state from src to dst, updating refcounts. + */ +void +util_copy_framebuffer_state(struct pipe_framebuffer_state *dst, + const struct pipe_framebuffer_state *src) +{ + unsigned i; + + dst->width = src->width; + dst->height = src->height; + + for (i = 0; i < src->nr_cbufs; i++) + pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); + + for (i = src->nr_cbufs; i < dst->nr_cbufs; i++) + pipe_surface_reference(&dst->cbufs[i], NULL); + + dst->nr_cbufs = src->nr_cbufs; + + pipe_surface_reference(&dst->zsbuf, src->zsbuf); +} + + +void +util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb) +{ + unsigned i; + + for (i = 0; i < fb->nr_cbufs; i++) { + pipe_surface_reference(&fb->cbufs[i], NULL); + } + + pipe_surface_reference(&fb->zsbuf, NULL); + + fb->width = fb->height = 0; + fb->nr_cbufs = 0; +} + + +/* Where multiple sizes are allowed for framebuffer surfaces, find the + * minimum width and height of all bound surfaces. + */ +boolean +util_framebuffer_min_size(const struct pipe_framebuffer_state *fb, + unsigned *width, + unsigned *height) +{ + unsigned w = ~0; + unsigned h = ~0; + unsigned i; + + for (i = 0; i < fb->nr_cbufs; i++) { + w = MIN2(w, fb->cbufs[i]->width); + h = MIN2(h, fb->cbufs[i]->height); + } + + if (fb->zsbuf) { + w = MIN2(w, fb->zsbuf->width); + h = MIN2(h, fb->zsbuf->height); + } + + if (w == ~0) { + *width = 0; + *height = 0; + return FALSE; + } + else { + *width = w; + *height = h; + return TRUE; + } +} diff --git a/src/gallium/auxiliary/util/u_framebuffer.h b/src/gallium/auxiliary/util/u_framebuffer.h new file mode 100644 index 00000000000..e7dc1e9e41d --- /dev/null +++ b/src/gallium/auxiliary/util/u_framebuffer.h @@ -0,0 +1,54 @@ +/************************************************************************** + * + * Copyright 2009-2010 VMware, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef U_FRAMEBUFFER_H +#define U_FRAMEBUFFER_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" + + +extern boolean +util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst, + const struct pipe_framebuffer_state *src); + +extern void +util_copy_framebuffer_state(struct pipe_framebuffer_state *dst, + const struct pipe_framebuffer_state *src); + + +extern void +util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb); + + +extern boolean +util_framebuffer_min_size(const struct pipe_framebuffer_state *fb, + unsigned *width, + unsigned *height); + +#endif /* U_FRAMEBUFFER_H */ diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index eee6030ddcc..6a931a95819 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -1255,6 +1255,7 @@ fallback_gen_mipmap(struct gen_mipmap_state *ctx, make_1d_mipmap(ctx, pt, face, baseLevel, lastLevel); break; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_CUBE: make_2d_mipmap(ctx, pt, face, baseLevel, lastLevel); break; @@ -1295,8 +1296,7 @@ util_create_gen_mipmap(struct pipe_context *pipe, /* rasterizer */ memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer)); - ctx->rasterizer.front_winding = PIPE_WINDING_CW; - ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; + ctx->rasterizer.cull_face = PIPE_FACE_NONE; ctx->rasterizer.gl_rasterization_rules = 1; /* sampler state */ @@ -1328,8 +1328,10 @@ util_create_gen_mipmap(struct pipe_context *pipe, } /* fragment shader */ - ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D); - ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE); + ctx->fs2d = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR); + ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE, + TGSI_INTERPOLATE_LINEAR); /* vertex data that doesn't change */ for (i = 0; i < 4; i++) { @@ -1494,7 +1496,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx, /* check if we can render in the texture's format */ if (!screen->is_format_supported(screen, psv->format, PIPE_TEXTURE_2D, - PIPE_BIND_RENDER_TARGET, 0)) { + pt->nr_samples, PIPE_BIND_RENDER_TARGET, 0)) { fallback_gen_mipmap(ctx, pt, face, baseLevel, lastLevel); return; } diff --git a/src/gallium/auxiliary/util/u_half.py b/src/gallium/auxiliary/util/u_half.py index 8007482e971..915cf3b9273 100644 --- a/src/gallium/auxiliary/util/u_half.py +++ b/src/gallium/auxiliary/util/u_half.py @@ -83,11 +83,11 @@ for i in xrange(1, 1024): # normalize number while (m & 0x00800000) == 0: - e -= 0x00800000; - m <<= 1; + e -= 0x00800000 + m <<= 1 - m &= ~0x00800000; - e += 0x38800000; + m &= ~0x00800000 + e += 0x38800000 value(m | e) # normals diff --git a/src/gallium/auxiliary/util/u_index_modify.c b/src/gallium/auxiliary/util/u_index_modify.c new file mode 100644 index 00000000000..65b079ed537 --- /dev/null +++ b/src/gallium/auxiliary/util/u_index_modify.c @@ -0,0 +1,127 @@ +/* + * Copyright 2010 Marek Olšák <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "pipe/p_context.h" +#include "util/u_index_modify.h" +#include "util/u_inlines.h" + +void util_shorten_ubyte_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, + unsigned count) +{ + struct pipe_screen* screen = context->screen; + struct pipe_resource* new_elts; + unsigned char *in_map; + unsigned short *out_map; + struct pipe_transfer *src_transfer, *dst_transfer; + unsigned i; + + new_elts = pipe_buffer_create(screen, + PIPE_BIND_INDEX_BUFFER, + 2 * count); + + in_map = pipe_buffer_map(context, *elts, PIPE_TRANSFER_READ, &src_transfer); + out_map = pipe_buffer_map(context, new_elts, PIPE_TRANSFER_WRITE, &dst_transfer); + + in_map += start; + + for (i = 0; i < count; i++) { + *out_map = (unsigned short)(*in_map + index_bias); + in_map++; + out_map++; + } + + pipe_buffer_unmap(context, *elts, src_transfer); + pipe_buffer_unmap(context, new_elts, dst_transfer); + + *elts = new_elts; +} + +void util_rebuild_ushort_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count) +{ + struct pipe_transfer *in_transfer = NULL; + struct pipe_transfer *out_transfer = NULL; + struct pipe_resource *new_elts; + unsigned short *in_map; + unsigned short *out_map; + unsigned i; + + new_elts = pipe_buffer_create(context->screen, + PIPE_BIND_INDEX_BUFFER, + 2 * count); + + in_map = pipe_buffer_map(context, *elts, + PIPE_TRANSFER_READ, &in_transfer); + out_map = pipe_buffer_map(context, new_elts, + PIPE_TRANSFER_WRITE, &out_transfer); + + in_map += start; + for (i = 0; i < count; i++) { + *out_map = (unsigned short)(*in_map + index_bias); + in_map++; + out_map++; + } + + pipe_buffer_unmap(context, *elts, in_transfer); + pipe_buffer_unmap(context, new_elts, out_transfer); + + *elts = new_elts; +} + +void util_rebuild_uint_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count) +{ + struct pipe_transfer *in_transfer = NULL; + struct pipe_transfer *out_transfer = NULL; + struct pipe_resource *new_elts; + unsigned int *in_map; + unsigned int *out_map; + unsigned i; + + new_elts = pipe_buffer_create(context->screen, + PIPE_BIND_INDEX_BUFFER, + 2 * count); + + in_map = pipe_buffer_map(context, *elts, + PIPE_TRANSFER_READ, &in_transfer); + out_map = pipe_buffer_map(context, new_elts, + PIPE_TRANSFER_WRITE, &out_transfer); + + in_map += start; + for (i = 0; i < count; i++) { + *out_map = (unsigned int)(*in_map + index_bias); + in_map++; + out_map++; + } + + pipe_buffer_unmap(context, *elts, in_transfer); + pipe_buffer_unmap(context, new_elts, out_transfer); + + *elts = new_elts; +} diff --git a/src/gallium/auxiliary/util/u_index_modify.h b/src/gallium/auxiliary/util/u_index_modify.h new file mode 100644 index 00000000000..01a6cae94fc --- /dev/null +++ b/src/gallium/auxiliary/util/u_index_modify.h @@ -0,0 +1,41 @@ +/* + * Copyright 2010 Marek Olšák <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#ifndef UTIL_INDEX_MODIFY_H +#define UTIL_INDEX_MODIFY_H + +void util_shorten_ubyte_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, + unsigned count); + +void util_rebuild_ushort_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count); + +void util_rebuild_uint_elts(struct pipe_context *context, + struct pipe_resource **elts, + int index_bias, + unsigned start, unsigned count); +#endif diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index a48689ee8be..6ed39561fbe 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -33,6 +33,8 @@ #include "pipe/p_state.h" #include "pipe/p_screen.h" #include "util/u_debug.h" +#include "util/u_debug_describe.h" +#include "util/u_debug_refcnt.h" #include "util/u_atomic.h" #include "util/u_box.h" #include "util/u_math.h" @@ -67,7 +69,9 @@ pipe_is_referenced(struct pipe_reference *reference) * \return TRUE if the object's refcount hits zero and should be destroyed. */ static INLINE boolean -pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) +pipe_reference_described(struct pipe_reference *ptr, + struct pipe_reference *reference, + debug_reference_descriptor get_desc) { boolean destroy = FALSE; @@ -76,6 +80,7 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) if (reference) { assert(pipe_is_referenced(reference)); p_atomic_inc(&reference->count); + debug_reference(reference, get_desc, 1); } if (ptr) { @@ -83,41 +88,49 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) if (p_atomic_dec_zero(&ptr->count)) { destroy = TRUE; } + debug_reference(ptr, get_desc, -1); } } return destroy; } +static INLINE boolean +pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) +{ + return pipe_reference_described(ptr, reference, + (debug_reference_descriptor)debug_describe_reference); +} static INLINE void pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf) { struct pipe_surface *old_surf = *ptr; - if (pipe_reference(&(*ptr)->reference, &surf->reference)) + if (pipe_reference_described(&(*ptr)->reference, &surf->reference, + (debug_reference_descriptor)debug_describe_surface)) old_surf->texture->screen->tex_surface_destroy(old_surf); *ptr = surf; } - static INLINE void pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex) { struct pipe_resource *old_tex = *ptr; - if (pipe_reference(&(*ptr)->reference, &tex->reference)) + if (pipe_reference_described(&(*ptr)->reference, &tex->reference, + (debug_reference_descriptor)debug_describe_resource)) old_tex->screen->resource_destroy(old_tex->screen, old_tex); *ptr = tex; } - static INLINE void pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view) { struct pipe_sampler_view *old_view = *ptr; - if (pipe_reference(&(*ptr)->reference, &view->reference)) + if (pipe_reference_described(&(*ptr)->reference, &view->reference, + (debug_reference_descriptor)debug_describe_sampler_view)) old_view->context->sampler_view_destroy(old_view->context, old_view); *ptr = view; } @@ -369,6 +382,23 @@ pipe_transfer_destroy( struct pipe_context *context, } +static INLINE boolean util_get_offset( + const struct pipe_rasterizer_state *templ, + unsigned fill_mode) +{ + switch(fill_mode) { + case PIPE_POLYGON_MODE_POINT: + return templ->offset_point; + case PIPE_POLYGON_MODE_LINE: + return templ->offset_line; + case PIPE_POLYGON_MODE_FILL: + return templ->offset_tri; + default: + assert(0); + return FALSE; + } +} + #ifdef __cplusplus } #endif diff --git a/src/gallium/auxiliary/util/u_linear.h b/src/gallium/auxiliary/util/u_linear.h index 42c40b2aa75..81ffc9fb27d 100644 --- a/src/gallium/auxiliary/util/u_linear.h +++ b/src/gallium/auxiliary/util/u_linear.h @@ -33,6 +33,7 @@ #ifndef U_LINEAR_H #define U_LINEAR_H +#include "pipe/p_compiler.h" #include "pipe/p_format.h" struct u_linear_format_block diff --git a/src/gallium/auxiliary/util/u_linkage.c b/src/gallium/auxiliary/util/u_linkage.c new file mode 100644 index 00000000000..2f6f41ba843 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linkage.c @@ -0,0 +1,149 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_debug.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_linkage.h" + +/* we must only record the registers that are actually used, not just declared */ +static INLINE boolean +util_semantic_set_test_and_set(struct util_semantic_set *set, unsigned value) +{ + unsigned mask = 1 << (value % (sizeof(long) * 8)); + unsigned long *p = &set->masks[value / (sizeof(long) * 8)]; + unsigned long v = *p & mask; + *p |= mask; + return !!v; +} + +unsigned +util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file) +{ + struct tgsi_shader_info info; + struct tgsi_parse_context parse; + unsigned count = 0; + ubyte *semantic_name; + ubyte *semantic_index; + + tgsi_scan_shader(tokens, &info); + + if(file == TGSI_FILE_INPUT) + { + semantic_name = info.input_semantic_name; + semantic_index = info.input_semantic_index; + } + else if(file == TGSI_FILE_OUTPUT) + { + semantic_name = info.output_semantic_name; + semantic_index = info.output_semantic_index; + } + else + { + assert(0); + semantic_name = NULL; + semantic_index = NULL; + } + + tgsi_parse_init(&parse, tokens); + + memset(set->masks, 0, sizeof(set->masks)); + while(!tgsi_parse_end_of_tokens(&parse)) + { + tgsi_parse_token(&parse); + + if(parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) + { + const struct tgsi_full_instruction *finst = &parse.FullToken.FullInstruction; + unsigned i; + for(i = 0; i < finst->Instruction.NumDstRegs; ++i) + { + if(finst->Dst[i].Register.File == file) + { + unsigned idx = finst->Dst[i].Register.Index; + if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC) + { + if(!util_semantic_set_test_and_set(set, semantic_index[idx])) + ++count; + } + } + } + + for(i = 0; i < finst->Instruction.NumSrcRegs; ++i) + { + if(finst->Src[i].Register.File == file) + { + unsigned idx = finst->Src[i].Register.Index; + if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC) + { + if(!util_semantic_set_test_and_set(set, semantic_index[idx])) + ++count; + } + } + } + } + } + tgsi_parse_free(&parse); + + return count; +} + +#define UTIL_SEMANTIC_SET_FOR_EACH(i, set) for(i = 0; i < 256; ++i) if(set->masks[i / (sizeof(long) * 8)] & (1 << (i % (sizeof(long) * 8)))) + +void +util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots) +{ + int first = -1; + int last = -1; + unsigned i; + + memset(layout, 0xff, num_slots); + + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + { + if(first < 0) + first = i; + last = i; + } + + if(last < efficient_slots) + { + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[i] = i; + } + else if((last - first) < efficient_slots) + { + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[i - first] = i; + } + else + { + unsigned idx = 0; + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[idx++] = i; + } +} diff --git a/src/gallium/auxiliary/util/u_linkage.h b/src/gallium/auxiliary/util/u_linkage.h new file mode 100644 index 00000000000..4720e0ee603 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linkage.h @@ -0,0 +1,66 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_LINKAGE_H_ +#define U_LINKAGE_H_ + +#include "pipe/p_compiler.h" +#include "pipe/p_shader_tokens.h" + +struct util_semantic_set +{ + unsigned long masks[256 / 8 / sizeof(unsigned long)]; +}; + +static INLINE bool +util_semantic_set_contains(struct util_semantic_set *set, unsigned char value) +{ + return !!(set->masks[value / (sizeof(long) * 8)] & (1 << (value / (sizeof(long) * 8)))); +} + +unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file); + +/* efficient_slots is the number of slots such that hardware performance is + * the same for using that amount, with holes, or less slots but with less + * holes. + * + * num_slots is the size of the layout array and hardware limit instead. + * + * efficient_slots == 0 or efficient_solts == num_slots are typical settings. + */ +void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots); + +static INLINE void +util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots) +{ + int i; + memset(table, 0xff, sizeof(table)); + + for(i = 0; i < num_slots; ++i) + table[layout[i]] = first_slot_value + i; +} + +#endif /* U_LINKAGE_H_ */ diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index d1ec13def30..69a76814945 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -168,6 +168,9 @@ static INLINE float logf( float f ) #undef logf #define logf(x) ((float)log((double)(x))) #endif /* logf */ + +#define isfinite(x) _finite((double)(x)) +#define isnan(x) _isnan((double)(x)) #endif static INLINE double log2( double x ) @@ -335,26 +338,25 @@ util_iround(float f) } - /** - * Test if x is NaN or +/- infinity. + * Approximate floating point comparison */ static INLINE boolean -util_is_inf_or_nan(float x) +util_is_approx(float a, float b, float tol) { - union fi tmp; - tmp.f = x; - return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31); + return fabs(b - a) <= tol; } /** - * Test whether x is a power of two. + * Test if x is NaN or +/- infinity. */ static INLINE boolean -util_is_pot(unsigned x) +util_is_inf_or_nan(float x) { - return (x & (x - 1)) == 0; + union fi tmp; + tmp.f = x; + return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31); } @@ -554,13 +556,30 @@ util_bswap16(uint16_t n) #define MIN3( A, B, C ) MIN2( MIN2( A, B ), C ) #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C ) +#define MIN4( A, B, C, D ) MIN2( MIN2( A, B ), MIN2(C, D) ) +#define MAX4( A, B, C, D ) MAX2( MAX2( A, B ), MAX2(C, D) ) + +/** + * Align a value, only works pot alignemnts. + */ static INLINE int align(int value, int alignment) { return (value + alignment - 1) & ~(alignment - 1); } +/** + * Works like align but on npot alignments. + */ +static INLINE size_t +util_align_npot(size_t value, size_t alignment) +{ + if (value % alignment) + return value + (alignment - (value % alignment)); + return value; +} + static INLINE unsigned u_minify(unsigned value, unsigned levels) { diff --git a/src/gallium/auxiliary/util/u_mempool.c b/src/gallium/auxiliary/util/u_mempool.c new file mode 100644 index 00000000000..1f336b39a1a --- /dev/null +++ b/src/gallium/auxiliary/util/u_mempool.c @@ -0,0 +1,169 @@ +/* + * Copyright 2010 Marek Olšák <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +#include "util/u_mempool.h" + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_simple_list.h" + +#include <stdio.h> + +#define UTIL_MEMPOOL_MAGIC 0xcafe4321 + +/* The block is either allocated memory or free space. */ +struct util_mempool_block { + /* The header. */ + /* The first next free block. */ + struct util_mempool_block *next_free; + + intptr_t magic; + + /* Memory after the last member is dedicated to the block itself. + * The allocated size is always larger than this structure. */ +}; + +static struct util_mempool_block * +util_mempool_get_block(struct util_mempool *pool, + struct util_mempool_page *page, unsigned index) +{ + return (struct util_mempool_block*) + ((uint8_t*)page + sizeof(struct util_mempool_page) + + (pool->block_size * index)); +} + +static void util_mempool_add_new_page(struct util_mempool *pool) +{ + struct util_mempool_page *page; + struct util_mempool_block *block; + int i; + + page = MALLOC(pool->page_size); + insert_at_tail(&pool->list, page); + + /* Mark all blocks as free. */ + for (i = 0; i < pool->num_blocks-1; i++) { + block = util_mempool_get_block(pool, page, i); + block->next_free = util_mempool_get_block(pool, page, i+1); + block->magic = UTIL_MEMPOOL_MAGIC; + } + + block = util_mempool_get_block(pool, page, pool->num_blocks-1); + block->next_free = pool->first_free; + block->magic = UTIL_MEMPOOL_MAGIC; + pool->first_free = util_mempool_get_block(pool, page, 0); + pool->num_pages++; + +#if 0 + fprintf(stderr, "New page! Num of pages: %i\n", pool->num_pages); +#endif +} + +static void *util_mempool_malloc_st(struct util_mempool *pool) +{ + struct util_mempool_block *block; + + if (!pool->first_free) + util_mempool_add_new_page(pool); + + block = pool->first_free; + assert(block->magic == UTIL_MEMPOOL_MAGIC); + pool->first_free = block->next_free; + + return (uint8_t*)block + sizeof(struct util_mempool_block); +} + +static void util_mempool_free_st(struct util_mempool *pool, void *ptr) +{ + struct util_mempool_block *block = + (struct util_mempool_block*) + ((uint8_t*)ptr - sizeof(struct util_mempool_block)); + + assert(block->magic == UTIL_MEMPOOL_MAGIC); + block->next_free = pool->first_free; + pool->first_free = block; +} + +static void *util_mempool_malloc_mt(struct util_mempool *pool) +{ + void *mem; + + pipe_mutex_lock(pool->mutex); + mem = util_mempool_malloc_st(pool); + pipe_mutex_unlock(pool->mutex); + return mem; +} + +static void util_mempool_free_mt(struct util_mempool *pool, void *ptr) +{ + pipe_mutex_lock(pool->mutex); + util_mempool_free_st(pool, ptr); + pipe_mutex_unlock(pool->mutex); +} + +void util_mempool_set_thread_safety(struct util_mempool *pool, + enum util_mempool_threading threading) +{ + pool->threading = threading; + + if (threading) { + pool->malloc = util_mempool_malloc_mt; + pool->free = util_mempool_free_mt; + } else { + pool->malloc = util_mempool_malloc_st; + pool->free = util_mempool_free_st; + } +} + +void util_mempool_create(struct util_mempool *pool, + unsigned item_size, + unsigned num_blocks, + enum util_mempool_threading threading) +{ + item_size = align(item_size, sizeof(intptr_t)); + + pool->num_pages = 0; + pool->num_blocks = num_blocks; + pool->block_size = sizeof(struct util_mempool_block) + item_size; + pool->block_size = align(pool->block_size, sizeof(intptr_t)); + pool->page_size = sizeof(struct util_mempool_page) + + num_blocks * pool->block_size; + pool->first_free = NULL; + + make_empty_list(&pool->list); + + pipe_mutex_init(pool->mutex); + + util_mempool_set_thread_safety(pool, threading); +} + +void util_mempool_destroy(struct util_mempool *pool) +{ + struct util_mempool_page *page, *temp; + + foreach_s(page, temp, &pool->list) { + remove_from_list(page); + FREE(page); + } + + pipe_mutex_destroy(pool->mutex); +} diff --git a/src/gallium/auxiliary/util/u_mempool.h b/src/gallium/auxiliary/util/u_mempool.h new file mode 100644 index 00000000000..a5b5d6a9b7c --- /dev/null +++ b/src/gallium/auxiliary/util/u_mempool.h @@ -0,0 +1,87 @@ +/* + * Copyright 2010 Marek Olšák <[email protected]> + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/** + * @file + * Simple memory pool for equally sized memory allocations. + * util_mempool_malloc and util_mempool_free are in O(1). + * + * Good for allocations which have very low lifetime and are allocated + * and freed very often. Use a profiler first! + * + * Candidates: get_transfer, user_buffer_create + * + * @author Marek Olšák + */ + +#ifndef U_MEMPOOL_H +#define U_MEMPOOL_H + +#include "os/os_thread.h" + +enum util_mempool_threading { + UTIL_MEMPOOL_SINGLETHREADED = FALSE, + UTIL_MEMPOOL_MULTITHREADED = TRUE +}; + +/* The page is an array of blocks (allocations). */ +struct util_mempool_page { + /* The header (linked-list pointers). */ + struct util_mempool_page *prev, *next; + + /* Memory after the last member is dedicated to the page itself. + * The allocated size is always larger than this structure. */ +}; + +struct util_mempool { + /* Public members. */ + void *(*malloc)(struct util_mempool *pool); + void (*free)(struct util_mempool *pool, void *ptr); + + /* Private members. */ + struct util_mempool_block *first_free; + + struct util_mempool_page list; + + unsigned block_size; + unsigned page_size; + unsigned num_blocks; + unsigned num_pages; + enum util_mempool_threading threading; + + pipe_mutex mutex; +}; + +void util_mempool_create(struct util_mempool *pool, + unsigned item_size, + unsigned num_blocks, + enum util_mempool_threading threading); + +void util_mempool_destroy(struct util_mempool *pool); + +void util_mempool_set_thread_safety(struct util_mempool *pool, + enum util_mempool_threading threading); + +#define util_mempool_malloc(pool) (pool)->malloc(pool) +#define util_mempool_free(pool, ptr) (pool)->free(pool, ptr) + +#endif diff --git a/src/gallium/auxiliary/util/u_network.c b/src/gallium/auxiliary/util/u_network.c index 87ee0e47685..77f2c5fc7de 100644 --- a/src/gallium/auxiliary/util/u_network.c +++ b/src/gallium/auxiliary/util/u_network.c @@ -6,7 +6,7 @@ #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) # include <winsock2.h> # include <windows.h> -#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE) +#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_APPLE) || defined(PIPE_OS_CYGWIN) # include <sys/socket.h> # include <netinet/in.h> # include <unistd.h> diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index 3ebef9fb749..5378f2d782f 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -42,12 +42,18 @@ #include "util/u_math.h" - +/** + * Helper union for packing pixel values. + * Will often contain values in formats which are too complex to be described + * in simple terms, hence might just effectively contain a number of bytes. + * Must be big enough to hold data for all formats (currently 256 bits). + */ union util_color { ubyte ub; ushort us; uint ui; float f[4]; + double d[4]; }; /** @@ -388,7 +394,7 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color * return; case PIPE_FORMAT_B4G4R4A4_UNORM: { - uc->ub = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); + uc->us = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); } return; case PIPE_FORMAT_A8_UNORM: @@ -425,13 +431,65 @@ util_pack_color(const float rgba[4], enum pipe_format format, union util_color * } } +/* Integer versions of util_pack_z and util_pack_z_stencil - useful for + * constructing clear masks. + */ +static INLINE uint32_t +util_pack_mask_z(enum pipe_format format, uint32_t z) +{ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + return z & 0xffff; + case PIPE_FORMAT_Z32_UNORM: + case PIPE_FORMAT_Z32_FLOAT: + return z; + case PIPE_FORMAT_Z24_UNORM_S8_USCALED: + case PIPE_FORMAT_Z24X8_UNORM: + return z & 0xffffff; + case PIPE_FORMAT_S8_USCALED_Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + return (z & 0xffffff) << 8; + case PIPE_FORMAT_S8_USCALED: + return 0; + default: + debug_print_format("gallium: unhandled format in util_pack_mask_z()", format); + assert(0); + return 0; + } +} + +static INLINE uint32_t +util_pack_mask_z_stencil(enum pipe_format format, uint32_t z, uint8_t s) +{ + uint32_t packed = util_pack_mask_z(format, z); + + switch (format) { + case PIPE_FORMAT_Z24_UNORM_S8_USCALED: + packed |= (uint32_t)s << 24; + break; + case PIPE_FORMAT_S8_USCALED_Z24_UNORM: + packed |= s; + break; + case PIPE_FORMAT_S8_USCALED: + packed |= s; + break; + default: + break; + } + + return packed; +} + + /** * Note: it's assumed that z is in [0,1] */ -static INLINE uint +static INLINE uint32_t util_pack_z(enum pipe_format format, double z) { + union fi fui; + if (z == 0.0) return 0; @@ -439,24 +497,25 @@ util_pack_z(enum pipe_format format, double z) case PIPE_FORMAT_Z16_UNORM: if (z == 1.0) return 0xffff; - return (uint) (z * 0xffff); + return (uint32_t) (z * 0xffff); case PIPE_FORMAT_Z32_UNORM: /* special-case to avoid overflow */ if (z == 1.0) return 0xffffffff; - return (uint) (z * 0xffffffff); + return (uint32_t) (z * 0xffffffff); case PIPE_FORMAT_Z32_FLOAT: - return (uint)z; + fui.f = (float)z; + return fui.ui; case PIPE_FORMAT_Z24_UNORM_S8_USCALED: case PIPE_FORMAT_Z24X8_UNORM: if (z == 1.0) return 0xffffff; - return (uint) (z * 0xffffff); + return (uint32_t) (z * 0xffffff); case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: if (z == 1.0) return 0xffffff00; - return ((uint) (z * 0xffffff)) << 8; + return ((uint32_t) (z * 0xffffff)) << 8; case PIPE_FORMAT_S8_USCALED: /* this case can get it via util_pack_z_stencil() */ return 0; @@ -472,14 +531,14 @@ util_pack_z(enum pipe_format format, double z) * Pack Z and/or stencil values into a 32-bit value described by format. * Note: it's assumed that z is in [0,1] and s in [0,255] */ -static INLINE uint -util_pack_z_stencil(enum pipe_format format, double z, uint s) +static INLINE uint32_t +util_pack_z_stencil(enum pipe_format format, double z, uint8_t s) { - unsigned packed = util_pack_z(format, z); + uint32_t packed = util_pack_z(format, z); switch (format) { case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - packed |= s << 24; + packed |= (uint32_t)s << 24; break; case PIPE_FORMAT_S8_USCALED_Z24_UNORM: packed |= s; diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h index e1af9f11cb9..cce0c7430e7 100644 --- a/src/gallium/auxiliary/util/u_pointer.h +++ b/src/gallium/auxiliary/util/u_pointer.h @@ -98,6 +98,29 @@ align16( void *unaligned ) return align_pointer( unaligned, 16 ); } +typedef void (*func_pointer)(void); + +static INLINE func_pointer +pointer_to_func( void *p ) +{ + union { + void *p; + func_pointer f; + } pf; + pf.p = p; + return pf.f; +} + +static INLINE void * +func_to_pointer( func_pointer f ) +{ + union { + void *p; + func_pointer f; + } pf; + pf.f = f; + return pf.p; +} #ifdef __cplusplus diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h index 64390e13851..3c851f73401 100644 --- a/src/gallium/auxiliary/util/u_prim.h +++ b/src/gallium/auxiliary/util/u_prim.h @@ -108,6 +108,20 @@ static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr ) ok = (*nr >= 4); *nr -= (*nr % 2); break; + case PIPE_PRIM_LINES_ADJACENCY: + ok = (*nr >= 4); + *nr -= (*nr % 4); + break; + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + ok = (*nr >= 4); + break; + case PIPE_PRIM_TRIANGLES_ADJACENCY: + ok = (*nr >= 6); + *nr -= (*nr % 5); + break; + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + ok = (*nr >= 4); + break; default: ok = 0; break; @@ -169,6 +183,52 @@ u_vertices_per_prim(int primitive) } } +/** + * Returns the number of decomposed primitives for the given + * vertex count. + * Geometry shader is invoked once for each triangle in + * triangle strip, triangle fans and triangles and once + * for each line in line strip, line loop, lines. + */ +static INLINE unsigned +u_gs_prims_for_vertices(int primitive, int vertices) +{ + switch(primitive) { + case PIPE_PRIM_POINTS: + return vertices; + case PIPE_PRIM_LINES: + return vertices / 2; + case PIPE_PRIM_LINE_LOOP: + return vertices; + case PIPE_PRIM_LINE_STRIP: + return vertices - 1; + case PIPE_PRIM_TRIANGLES: + return vertices / 3; + case PIPE_PRIM_TRIANGLE_STRIP: + return vertices - 2; + case PIPE_PRIM_TRIANGLE_FAN: + return vertices - 2; + case PIPE_PRIM_LINES_ADJACENCY: + return vertices / 2; + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + return vertices - 1; + case PIPE_PRIM_TRIANGLES_ADJACENCY: + return vertices / 3; + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return vertices - 2; + + /* following primitives should never be used + * with geometry shaders abd their size is + * undefined */ + case PIPE_PRIM_POLYGON: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + default: + debug_printf("Unrecognized geometry shader primitive"); + return 3; + } +} + const char *u_prim_name( unsigned pipe_prim ); #endif diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c index 098cdfd58b1..56fcfac0693 100644 --- a/src/gallium/auxiliary/util/u_rect.c +++ b/src/gallium/auxiliary/util/u_rect.c @@ -30,13 +30,9 @@ */ -#include "pipe/p_defines.h" -#include "pipe/p_format.h" -#include "pipe/p_context.h" -#include "pipe/p_screen.h" #include "util/u_format.h" -#include "util/u_inlines.h" #include "util/u_rect.h" +#include "util/u_pack_color.h" /** @@ -99,7 +95,7 @@ util_fill_rect(ubyte * dst, unsigned dst_y, unsigned width, unsigned height, - uint32_t value) + union util_color *uc) { unsigned i, j; unsigned width_size; @@ -115,193 +111,54 @@ util_fill_rect(ubyte * dst, dst_y /= blockheight; width = (width + blockwidth - 1)/blockwidth; height = (height + blockheight - 1)/blockheight; - + dst += dst_x * blocksize; dst += dst_y * dst_stride; width_size = width * blocksize; - + switch (blocksize) { case 1: if(dst_stride == width_size) - memset(dst, (ubyte) value, height * width_size); + memset(dst, uc->ub, height * width_size); else { - for (i = 0; i < height; i++) { - memset(dst, (ubyte) value, width_size); - dst += dst_stride; - } + for (i = 0; i < height; i++) { + memset(dst, uc->ub, width_size); + dst += dst_stride; + } } break; case 2: for (i = 0; i < height; i++) { - uint16_t *row = (uint16_t *)dst; - for (j = 0; j < width; j++) - *row++ = (uint16_t) value; - dst += dst_stride; + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = uc->us; + dst += dst_stride; } break; case 4: for (i = 0; i < height; i++) { - uint32_t *row = (uint32_t *)dst; - for (j = 0; j < width; j++) - *row++ = value; - dst += dst_stride; + uint32_t *row = (uint32_t *)dst; + for (j = 0; j < width; j++) + *row++ = uc->ui; + dst += dst_stride; } break; - default: - assert(0); - break; - } -} - - - -/** - * Fallback function for pipe->surface_copy(). - * Note: (X,Y)=(0,0) is always the upper-left corner. - * if do_flip, flip the image vertically on its way from src rect to dst rect. - * XXX should probably put this in new u_surface.c file... - */ -void -util_surface_copy(struct pipe_context *pipe, - boolean do_flip, - struct pipe_surface *dst, - unsigned dst_x, unsigned dst_y, - struct pipe_surface *src, - unsigned src_x, unsigned src_y, - unsigned w, unsigned h) -{ - struct pipe_transfer *src_trans, *dst_trans; - void *dst_map; - const void *src_map; - enum pipe_format src_format, dst_format; - - assert(src->texture && dst->texture); - if (!src->texture || !dst->texture) - return; - - src_format = src->texture->format; - dst_format = dst->texture->format; - - src_trans = pipe_get_transfer(pipe, - src->texture, - src->face, - src->level, - src->zslice, - PIPE_TRANSFER_READ, - src_x, src_y, w, h); - - dst_trans = pipe_get_transfer(pipe, - dst->texture, - dst->face, - dst->level, - dst->zslice, - PIPE_TRANSFER_WRITE, - dst_x, dst_y, w, h); - - assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format)); - assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format)); - assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format)); - - src_map = pipe->transfer_map(pipe, src_trans); - dst_map = pipe->transfer_map(pipe, dst_trans); - - assert(src_map); - assert(dst_map); - - if (src_map && dst_map) { - /* If do_flip, invert src_y position and pass negative src stride */ - util_copy_rect(dst_map, - dst_format, - dst_trans->stride, - 0, 0, - w, h, - src_map, - do_flip ? -(int) src_trans->stride : src_trans->stride, - 0, - do_flip ? h - 1 : 0); - } - - pipe->transfer_unmap(pipe, src_trans); - pipe->transfer_unmap(pipe, dst_trans); - - pipe->transfer_destroy(pipe, src_trans); - pipe->transfer_destroy(pipe, dst_trans); -} - - - -#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8)) - - -/** - * Fallback for pipe->surface_fill() function. - * XXX should probably put this in new u_surface.c file... - */ -void -util_surface_fill(struct pipe_context *pipe, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, unsigned value) -{ - struct pipe_transfer *dst_trans; - void *dst_map; - - assert(dst->texture); - if (!dst->texture) - return; - dst_trans = pipe_get_transfer(pipe, - dst->texture, - dst->face, - dst->level, - dst->zslice, - PIPE_TRANSFER_WRITE, - dstx, dsty, width, height); - - dst_map = pipe->transfer_map(pipe, dst_trans); - - assert(dst_map); - - if (dst_map) { - assert(dst_trans->stride > 0); - - switch (util_format_get_blocksize(dst->texture->format)) { - case 1: - case 2: - case 4: - util_fill_rect(dst_map, dst->texture->format, - dst_trans->stride, - 0, 0, width, height, value); - break; - case 8: - { - /* expand the 4-byte clear value to an 8-byte value */ - ushort *row = (ushort *) dst_map; - ushort val0 = UBYTE_TO_USHORT((value >> 0) & 0xff); - ushort val1 = UBYTE_TO_USHORT((value >> 8) & 0xff); - ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff); - ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff); - unsigned i, j; - val0 = (val0 << 8) | val0; - val1 = (val1 << 8) | val1; - val2 = (val2 << 8) | val2; - val3 = (val3 << 8) | val3; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - row[j*4+0] = val0; - row[j*4+1] = val1; - row[j*4+2] = val2; - row[j*4+3] = val3; - } - row += dst_trans->stride/2; - } + case 8: + case 12: + case 16: + case 24: + case 32: + for (i = 0; i < height; i++) { + ubyte *row = dst; + for (j = 0; j < width; j++) { + memcpy(row, uc, blocksize); + row += blocksize; + } + dst += dst_stride; } break; - default: - assert(0); - break; - } + default: + assert(0); + break; } - - pipe->transfer_unmap(pipe, dst_trans); - pipe->transfer_destroy(pipe, dst_trans); } diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h index b44d821904b..4cb90d3c316 100644 --- a/src/gallium/auxiliary/util/u_rect.h +++ b/src/gallium/auxiliary/util/u_rect.h @@ -26,20 +26,67 @@ **************************************************************************/ -/** - * Pipe copy/fill rect helpers. +#ifndef U_RECT_H +#define U_RECT_H + +#include "pipe/p_compiler.h" + +struct u_rect { + int x0, x1; + int y0, y1; +}; + +/* Do two rectangles intersect? */ +static INLINE boolean +u_rect_test_intersection(const struct u_rect *a, + const struct u_rect *b) +{ + return (!(a->x1 < b->x0 || + b->x1 < a->x0 || + a->y1 < b->y0 || + b->y1 < a->y0)); +} +/* Find the intersection of two rectangles known to intersect. + */ +static INLINE void +u_rect_find_intersection(const struct u_rect *a, + struct u_rect *b) +{ + /* Caller should verify intersection exists before calling. + */ + if (b->x0 < a->x0) b->x0 = a->x0; + if (b->x1 > a->x1) b->x1 = a->x1; + if (b->y0 < a->y0) b->y0 = a->y0; + if (b->y1 > a->y1) b->y1 = a->y1; +} -#ifndef U_RECT_H -#define U_RECT_H +static INLINE void +u_rect_possible_intersection(const struct u_rect *a, + struct u_rect *b) +{ + if (u_rect_test_intersection(a,b)) { + u_rect_find_intersection(a,b); + } + else { + b->x0 = b->x1 = b->y0 = b->y1 = 0; + } +} #include "pipe/p_format.h" +#include "util/u_pack_color.h" + -struct pipe_context; -struct pipe_surface; +/********************************************************************** + * Pipe copy/fill rect helpers. + */ + +/* These really should move to a different file: + */ +#include "pipe/p_format.h" extern void util_copy_rect(ubyte * dst, enum pipe_format format, @@ -50,23 +97,7 @@ util_copy_rect(ubyte * dst, enum pipe_format format, extern void util_fill_rect(ubyte * dst, enum pipe_format format, unsigned dst_stride, unsigned dst_x, unsigned dst_y, - unsigned width, unsigned height, uint32_t value); - - -extern void -util_surface_copy(struct pipe_context *pipe, - boolean do_flip, - struct pipe_surface *dst, - unsigned dst_x, unsigned dst_y, - struct pipe_surface *src, - unsigned src_x, unsigned src_y, - unsigned w, unsigned h); - -extern void -util_surface_fill(struct pipe_context *pipe, - struct pipe_surface *dst, - unsigned dstx, unsigned dsty, - unsigned width, unsigned height, unsigned value); + unsigned width, unsigned height, union util_color *uc); #endif /* U_RECT_H */ diff --git a/src/gallium/auxiliary/util/u_simple_list.h b/src/gallium/auxiliary/util/u_simple_list.h index f5f43b0faa2..fe59771371b 100644 --- a/src/gallium/auxiliary/util/u_simple_list.h +++ b/src/gallium/auxiliary/util/u_simple_list.h @@ -46,6 +46,8 @@ do { \ (elem)->next->prev = (elem)->prev; \ (elem)->prev->next = (elem)->next; \ + (elem)->next = elem; \ + (elem)->prev = elem; \ } while (0) /** diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c index 019dda767d0..58ef68377fc 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -37,6 +37,7 @@ #include "pipe/p_context.h" #include "pipe/p_shader_tokens.h" +#include "pipe/p_state.h" #include "util/u_simple_shaders.h" #include "util/u_debug.h" #include "tgsi/tgsi_ureg.h" @@ -87,10 +88,15 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe, * MOV OUT[0], IMM[0] // (if writemask != 0xf) * TEX OUT[0].writemask, IN[0], SAMP[0], 2D; * END; + * + * \param tex_target one of PIPE_TEXTURE_x + * \parma interp_mode either TGSI_INTERPOLATE_LINEAR or PERSPECTIVE + * \param writemask mask of TGSI_WRITEMASK_x */ void * util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, unsigned tex_target, + unsigned interp_mode, unsigned writemask ) { struct ureg_program *ureg; @@ -98,6 +104,9 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, struct ureg_src tex; struct ureg_dst out; + assert(interp_mode == TGSI_INTERPOLATE_LINEAR || + interp_mode == TGSI_INTERPOLATE_PERSPECTIVE); + ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT ); if (ureg == NULL) return NULL; @@ -106,7 +115,7 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, tex = ureg_DECL_fs_input( ureg, TGSI_SEMANTIC_GENERIC, 0, - TGSI_INTERPOLATE_PERSPECTIVE ); + interp_mode ); out = ureg_DECL_output( ureg, TGSI_SEMANTIC_COLOR, @@ -133,10 +142,12 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, * \param tex_target one of PIPE_TEXTURE_x */ void * -util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target ) +util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target, + unsigned interp_mode) { return util_make_fragment_tex_shader_writemask( pipe, tex_target, + interp_mode, TGSI_WRITEMASK_XYZW ); } @@ -147,7 +158,8 @@ util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target ) */ void * util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe, - unsigned tex_target) + unsigned tex_target, + unsigned interp_mode) { struct ureg_program *ureg; struct ureg_src sampler; @@ -163,7 +175,7 @@ util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe, tex = ureg_DECL_fs_input( ureg, TGSI_SEMANTIC_GENERIC, 0, - TGSI_INTERPOLATE_PERSPECTIVE ); + interp_mode ); out = ureg_DECL_output( ureg, TGSI_SEMANTIC_COLOR, diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h index 6e760942e25..4aa34bc4757 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.h +++ b/src/gallium/auxiliary/util/u_simple_shaders.h @@ -52,15 +52,18 @@ util_make_vertex_passthrough_shader(struct pipe_context *pipe, extern void * util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, unsigned tex_target, + unsigned interp_mode, unsigned writemask); extern void * -util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target); +util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target, + unsigned interp_mode); extern void * util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe, - unsigned tex_target); + unsigned tex_target, + unsigned interp_mode); extern void * diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h new file mode 100644 index 00000000000..7f80fc12700 --- /dev/null +++ b/src/gallium/auxiliary/util/u_split_prim.h @@ -0,0 +1,114 @@ +/* Originally written by Ben Skeggs for the nv50 driver*/ + +#ifndef U_SPLIT_PRIM_H +#define U_SPLIT_PRIM_H + +#include "pipe/p_defines.h" +#include "pipe/p_compiler.h" + +#include "util/u_debug.h" + +struct util_split_prim { + void *priv; + void (*emit)(void *priv, unsigned start, unsigned count); + void (*edge)(void *priv, boolean enabled); + + unsigned mode; + unsigned start; + unsigned p_start; + unsigned p_end; + + uint repeat_first:1; + uint close_first:1; + uint edgeflag_off:1; +}; + +static INLINE void +util_split_prim_init(struct util_split_prim *s, + unsigned mode, unsigned start, unsigned count) +{ + if (mode == PIPE_PRIM_LINE_LOOP) { + s->mode = PIPE_PRIM_LINE_STRIP; + s->close_first = 1; + } else { + s->mode = mode; + s->close_first = 0; + } + s->start = start; + s->p_start = start; + s->p_end = start + count; + s->edgeflag_off = 0; + s->repeat_first = 0; +} + +static INLINE boolean +util_split_prim_next(struct util_split_prim *s, unsigned max_verts) +{ + int repeat = 0; + + if (s->repeat_first) { + s->emit(s->priv, s->start, 1); + max_verts--; + if (s->edgeflag_off) { + s->edge(s->priv, TRUE); + s->edgeflag_off = FALSE; + } + } + + if ((s->p_end - s->p_start) + s->close_first <= max_verts) { + s->emit(s->priv, s->p_start, s->p_end - s->p_start); + if (s->close_first) + s->emit(s->priv, s->start, 1); + return TRUE; + } + + switch (s->mode) { + case PIPE_PRIM_LINES: + max_verts &= ~1; + break; + case PIPE_PRIM_LINE_STRIP: + repeat = 1; + break; + case PIPE_PRIM_POLYGON: + max_verts--; + s->emit(s->priv, s->p_start, max_verts); + s->edge(s->priv, FALSE); + s->emit(s->priv, s->p_start + max_verts, 1); + s->p_start += max_verts; + s->repeat_first = TRUE; + s->edgeflag_off = TRUE; + return FALSE; + case PIPE_PRIM_TRIANGLES: + max_verts = max_verts - (max_verts % 3); + break; + case PIPE_PRIM_TRIANGLE_STRIP: + /* to ensure winding stays correct, always split + * on an even number of generated triangles + */ + max_verts = max_verts & ~1; + repeat = 2; + break; + case PIPE_PRIM_TRIANGLE_FAN: + s->repeat_first = TRUE; + repeat = 1; + break; + case PIPE_PRIM_QUADS: + max_verts &= ~3; + break; + case PIPE_PRIM_QUAD_STRIP: + max_verts &= ~1; + repeat = 2; + break; + case PIPE_PRIM_POINTS: + break; + default: + /* TODO: implement adjacency primitives */ + assert(0); + } + + s->emit (s->priv, s->p_start, max_verts); + s->p_start += (max_verts - repeat); + return FALSE; +} + +#endif /* U_SPLIT_PRIM_H */ diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h index e2a8491e62c..03198c91da4 100644 --- a/src/gallium/auxiliary/util/u_sse.h +++ b/src/gallium/auxiliary/util/u_sse.h @@ -41,7 +41,6 @@ #if defined(PIPE_ARCH_SSE) -#include <xmmintrin.h> #include <emmintrin.h> @@ -72,6 +71,33 @@ _mm_castps_si128(__m128 a) #endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ + +#if defined(PIPE_ARCH_SSSE3) + +#include <tmmintrin.h> + +#else /* !PIPE_ARCH_SSSE3 */ + +/** + * Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases + * where -mssse3 is not supported/enabled. + * + * MSVC will never get in here as its intrinsics support do not rely on + * compiler command line options. + */ +static __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8(__m128i a, __m128i mask) +{ + __m128i result; + __asm__("pshufb %1, %0" + : "=x" (result) + : "xm" (mask), "0" (a)); + return result; +} + +#endif /* !PIPE_ARCH_SSSE3 */ + + #endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ #endif /* U_SSE_H_ */ diff --git a/src/gallium/auxiliary/util/u_staging.c b/src/gallium/auxiliary/util/u_staging.c new file mode 100644 index 00000000000..c5d68f8df86 --- /dev/null +++ b/src/gallium/auxiliary/util/u_staging.c @@ -0,0 +1,117 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_staging.h" +#include "pipe/p_context.h" +#include "util/u_memory.h" +#include "util/u_inlines.h" + +static void +util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigned height, unsigned depth, struct pipe_resource *template) +{ + memset(template, 0, sizeof(struct pipe_resource)); + if(pt->target != PIPE_BUFFER && depth <= 1) + template->target = PIPE_TEXTURE_RECT; + else + template->target = pt->target; + template->format = pt->format; + template->width0 = width; + template->height0 = height; + template->depth0 = depth; + template->last_level = 0; + template->nr_samples = pt->nr_samples; + template->bind = 0; + template->usage = PIPE_USAGE_STAGING; + template->flags = 0; +} + +struct util_staging_transfer * +util_staging_transfer_init(struct pipe_context *pipe, + struct pipe_resource *pt, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + bool direct, struct util_staging_transfer *tx) +{ + struct pipe_screen *pscreen = pipe->screen; + + struct pipe_resource staging_resource_template; + + pipe_resource_reference(&tx->base.resource, pt); + tx->base.sr = sr; + tx->base.usage = usage; + tx->base.box = *box; + + if (direct) + { + tx->staging_resource = pt; + return tx; + } + + util_staging_resource_template(pt, box->width, box->height, box->depth, &staging_resource_template); + tx->staging_resource = pscreen->resource_create(pscreen, &staging_resource_template); + if (!tx->staging_resource) + { + pipe_resource_reference(&tx->base.resource, NULL); + FREE(tx); + return NULL; + } + + if (usage & PIPE_TRANSFER_READ) + { + struct pipe_subresource dstsr; + unsigned zi; + dstsr.face = 0; + dstsr.level = 0; + for(zi = 0; zi < box->depth; ++zi) + pipe->resource_copy_region(pipe, tx->staging_resource, dstsr, 0, 0, 0, tx->base.resource, sr, box->x, box->y, box->z + zi, box->width, box->height); + } + + return tx; +} + +void +util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx) +{ + struct util_staging_transfer *tx = (struct util_staging_transfer *)ptx; + + if (tx->staging_resource != tx->base.resource) + { + if(tx->base.usage & PIPE_TRANSFER_WRITE) { + struct pipe_subresource srcsr; + unsigned zi; + srcsr.face = 0; + srcsr.level = 0; + for(zi = 0; zi < tx->base.box.depth; ++zi) + pipe->resource_copy_region(pipe, tx->base.resource, tx->base.sr, tx->base.box.x, tx->base.box.y, tx->base.box.z + zi, tx->staging_resource, srcsr, 0, 0, 0, tx->base.box.width, tx->base.box.height); + } + + pipe_resource_reference(&tx->staging_resource, NULL); + } + + pipe_resource_reference(&ptx->resource, NULL); + FREE(ptx); +} diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h new file mode 100644 index 00000000000..1aab78cc881 --- /dev/null +++ b/src/gallium/auxiliary/util/u_staging.h @@ -0,0 +1,63 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Direct3D 10/11 has no concept of transfers. Applications instead + * create resources with a STAGING or DYNAMIC usage, copy between them + * and the real resource and use Map to map the STAGING/DYNAMIC resource. + * + * This util module allows to implement Gallium drivers as a Direct3D + * driver would be implemented: transfers allocate a resource with + * PIPE_USAGE_STAGING, and copy the data between it and the real resource + * with resource_copy_region. + */ + +#ifndef U_STAGING_H +#define U_STAGING_H + +#include "pipe/p_state.h" + +struct util_staging_transfer { + struct pipe_transfer base; + + /* if direct, same as base.resource, otherwise the temporary staging resource */ + struct pipe_resource *staging_resource; +}; + +/* user must be stride, slice_stride and offset */ +/* pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING should be a good value to pass for direct */ +/* staging resource is currently created with PIPE_USAGE_STAGING */ +struct util_staging_transfer * +util_staging_transfer_init(struct pipe_context *pipe, + struct pipe_resource *pt, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + bool direct, struct util_staging_transfer *tx); + +void +util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx); + +#endif diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c index 42440d0d673..f78b6838a72 100644 --- a/src/gallium/auxiliary/util/u_surface.c +++ b/src/gallium/auxiliary/util/u_surface.c @@ -32,13 +32,15 @@ */ +#include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "pipe/p_state.h" -#include "pipe/p_defines.h" -#include "util/u_inlines.h" -#include "util/u_memory.h" +#include "util/u_format.h" +#include "util/u_inlines.h" +#include "util/u_rect.h" #include "util/u_surface.h" +#include "util/u_pack_color.h" /** @@ -68,7 +70,7 @@ util_create_rgba_surface(struct pipe_screen *screen, /* Choose surface format */ for (i = 0; rgbaFormats[i]; i++) { if (screen->is_format_supported(screen, rgbaFormats[i], - target, bind, 0)) { + target, 0, bind, 0)) { format = rgbaFormats[i]; break; } @@ -118,70 +120,231 @@ util_destroy_rgba_surface(struct pipe_resource *texture, /** - * Compare pipe_framebuffer_state objects. - * \return TRUE if same, FALSE if different + * Fallback function for pipe->resource_copy_region(). + * Note: (X,Y)=(0,0) is always the upper-left corner. */ -boolean -util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src) +void +util_resource_copy_region(struct pipe_context *pipe, + struct pipe_resource *dst, + struct pipe_subresource subdst, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + struct pipe_subresource subsrc, + unsigned src_x, unsigned src_y, unsigned src_z, + unsigned w, unsigned h) { - unsigned i; + struct pipe_transfer *src_trans, *dst_trans; + void *dst_map; + const void *src_map; + enum pipe_format src_format, dst_format; + + assert(src && dst); + if (!src || !dst) + return; + + src_format = src->format; + dst_format = dst->format; + + src_trans = pipe_get_transfer(pipe, + src, + subsrc.face, + subsrc.level, + src_z, + PIPE_TRANSFER_READ, + src_x, src_y, w, h); + + dst_trans = pipe_get_transfer(pipe, + dst, + subdst.face, + subdst.level, + src_z, + PIPE_TRANSFER_WRITE, + dst_x, dst_y, w, h); + + assert(util_format_get_blocksize(dst_format) == util_format_get_blocksize(src_format)); + assert(util_format_get_blockwidth(dst_format) == util_format_get_blockwidth(src_format)); + assert(util_format_get_blockheight(dst_format) == util_format_get_blockheight(src_format)); + + src_map = pipe->transfer_map(pipe, src_trans); + dst_map = pipe->transfer_map(pipe, dst_trans); + + assert(src_map); + assert(dst_map); + + if (src_map && dst_map) { + util_copy_rect(dst_map, + dst_format, + dst_trans->stride, + 0, 0, + w, h, + src_map, + src_trans->stride, + 0, + 0); + } - if (dst->width != src->width || - dst->height != src->height) - return FALSE; + pipe->transfer_unmap(pipe, src_trans); + pipe->transfer_unmap(pipe, dst_trans); - for (i = 0; i < Elements(src->cbufs); i++) { - if (dst->cbufs[i] != src->cbufs[i]) { - return FALSE; - } - } + pipe->transfer_destroy(pipe, src_trans); + pipe->transfer_destroy(pipe, dst_trans); +} - if (dst->nr_cbufs != src->nr_cbufs) { - return FALSE; - } - if (dst->zsbuf != src->zsbuf) { - return FALSE; - } - return TRUE; -} +#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8)) /** - * Copy framebuffer state from src to dst, updating refcounts. + * Fallback for pipe->clear_render_target() function. + * XXX this looks too hackish to be really useful. + * cpp > 4 looks like a gross hack at best... + * Plus can't use these transfer fallbacks when clearing + * multisampled surfaces for instance. */ void -util_copy_framebuffer_state(struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src) +util_clear_render_target(struct pipe_context *pipe, + struct pipe_surface *dst, + const float *rgba, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height) { - unsigned i; + struct pipe_transfer *dst_trans; + void *dst_map; + union util_color uc; - dst->width = src->width; - dst->height = src->height; + assert(dst->texture); + if (!dst->texture) + return; - for (i = 0; i < Elements(src->cbufs); i++) { - pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); - } + dst_trans = pipe_get_transfer(pipe, + dst->texture, + dst->face, + dst->level, + dst->zslice, + PIPE_TRANSFER_WRITE, + dstx, dsty, width, height); - dst->nr_cbufs = src->nr_cbufs; + dst_map = pipe->transfer_map(pipe, dst_trans); - pipe_surface_reference(&dst->zsbuf, src->zsbuf); -} + assert(dst_map); + + if (dst_map) { + assert(dst_trans->stride > 0); + util_pack_color(rgba, dst->texture->format, &uc); + util_fill_rect(dst_map, dst->texture->format, + dst_trans->stride, + 0, 0, width, height, &uc); + } + + pipe->transfer_unmap(pipe, dst_trans); + pipe->transfer_destroy(pipe, dst_trans); +} +/** + * Fallback for pipe->clear_stencil() function. + * sw fallback doesn't look terribly useful here. + * Plus can't use these transfer fallbacks when clearing + * multisampled surfaces for instance. + */ void -util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb) +util_clear_depth_stencil(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned clear_flags, + double depth, + unsigned stencil, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height) { - unsigned i; - - for (i = 0; i < fb->nr_cbufs; i++) { - pipe_surface_reference(&fb->cbufs[i], NULL); + struct pipe_transfer *dst_trans; + ubyte *dst_map; + boolean need_rmw = FALSE; + + if ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) && + ((clear_flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) && + util_format_is_depth_and_stencil(dst->format)) + need_rmw = TRUE; + + assert(dst->texture); + if (!dst->texture) + return; + dst_trans = pipe_get_transfer(pipe, + dst->texture, + dst->face, + dst->level, + dst->zslice, + (need_rmw ? PIPE_TRANSFER_READ_WRITE : + PIPE_TRANSFER_WRITE), + dstx, dsty, width, height); + + dst_map = pipe->transfer_map(pipe, dst_trans); + + assert(dst_map); + + if (dst_map) { + unsigned dst_stride = dst_trans->stride; + unsigned zstencil = util_pack_z_stencil(dst->texture->format, depth, stencil); + unsigned i, j; + assert(dst_trans->stride > 0); + + switch (util_format_get_blocksize(dst->format)) { + case 1: + assert(dst->format == PIPE_FORMAT_S8_USCALED); + if(dst_stride == width) + memset(dst_map, (ubyte) zstencil, height * width); + else { + for (i = 0; i < height; i++) { + memset(dst_map, (ubyte) zstencil, width); + dst_map += dst_stride; + } + } + break; + case 2: + assert(dst->format == PIPE_FORMAT_Z16_UNORM); + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst_map; + for (j = 0; j < width; j++) + *row++ = (uint16_t) zstencil; + dst_map += dst_stride; + } + break; + case 4: + if (!need_rmw) { + for (i = 0; i < height; i++) { + uint32_t *row = (uint32_t *)dst_map; + for (j = 0; j < width; j++) + *row++ = zstencil; + dst_map += dst_stride; + } + } + else { + uint32_t dst_mask; + if (dst->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED) + dst_mask = 0xffffff00; + else { + assert(dst->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM); + dst_mask = 0xffffff; + } + if (clear_flags & PIPE_CLEAR_DEPTH) + dst_mask = ~dst_mask; + for (i = 0; i < height; i++) { + uint32_t *row = (uint32_t *)dst_map; + for (j = 0; j < width; j++) { + uint32_t tmp = *row & dst_mask; + *row++ = tmp | (zstencil & ~dst_mask); + } + dst_map += dst_stride; + } + } + break; + case 8: + default: + assert(0); + break; + } } - pipe_surface_reference(&fb->zsbuf, NULL); - - fb->width = fb->height = 0; - fb->nr_cbufs = 0; + pipe->transfer_unmap(pipe, dst_trans); + pipe->transfer_destroy(pipe, dst_trans); } diff --git a/src/gallium/auxiliary/util/u_surface.h b/src/gallium/auxiliary/util/u_surface.h index 119fcd4ce8e..6cd12af3a8b 100644 --- a/src/gallium/auxiliary/util/u_surface.h +++ b/src/gallium/auxiliary/util/u_surface.h @@ -33,23 +33,6 @@ #include "pipe/p_state.h" -/** - * Are s1 and s2 the same surface? - * Surfaces are basically views into textures so check if the two surfaces - * name the same part of the same texture. - */ -static INLINE boolean -util_same_surface(const struct pipe_surface *s1, const struct pipe_surface *s2) -{ - return (s1->texture == s2->texture && - s1->face == s2->face && - s1->level == s2->level && - s1->zslice == s2->zslice); -} - - - - extern boolean util_create_rgba_surface(struct pipe_screen *screen, uint width, uint height, uint bind, @@ -62,17 +45,32 @@ util_destroy_rgba_surface(struct pipe_resource *texture, struct pipe_surface *surface); -extern boolean -util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src); extern void -util_copy_framebuffer_state(struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src); +util_resource_copy_region(struct pipe_context *pipe, + struct pipe_resource *dst, + struct pipe_subresource subdst, + unsigned dst_x, unsigned dst_y, unsigned dst_z, + struct pipe_resource *src, + struct pipe_subresource subsrc, + unsigned src_x, unsigned src_y, unsigned src_z, + unsigned w, unsigned h); +extern void +util_clear_render_target(struct pipe_context *pipe, + struct pipe_surface *dst, + const float *rgba, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height); extern void -util_unreference_framebuffer_state(struct pipe_framebuffer_state *fb); +util_clear_depth_stencil(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned clear_flags, + double depth, + unsigned stencil, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height); #endif /* U_SURFACE_H */ diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c index 668da8c5c27..404e1219952 100644 --- a/src/gallium/auxiliary/util/u_surfaces.c +++ b/src/gallium/auxiliary/util/u_surfaces.c @@ -1,42 +1,50 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #include "u_surfaces.h" #include "util/u_hash_table.h" #include "util/u_inlines.h" #include "util/u_memory.h" -/* TODO: ouch, util_hash_table should do these by default when passed a null function pointer - * this indirect function call is quite bad - */ -static unsigned -hash(void *key) -{ - return (unsigned)(uintptr_t)key; -} - -static int -compare(void *key1, void *key2) -{ - return (unsigned)(uintptr_t)key1 - (unsigned)(uintptr_t)key2; -} - struct pipe_surface * util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags) { struct pipe_surface *ps; - void *key = NULL; if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - { /* or 2D array */ - if(!us->u.table) - us->u.table = util_hash_table_create(hash, compare); - key = (void *)(((zslice + face) << 8) | level); - /* TODO: ouch, should have a get-reference function... - * also, shouldn't allocate a two-pointer structure for each item... */ - ps = util_hash_table_get(us->u.table, key); + { /* or 2D array */ + if(!us->u.hash) + us->u.hash = cso_hash_create(); + + ps = cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level)); } else { if(!us->u.array) - us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *)); + us->u.array = CALLOC(pt->last_level + 1, sizeof(struct pipe_surface *)); ps = us->u.array[level]; } @@ -54,7 +62,7 @@ util_surfaces_do_get(struct util_surfaces *us, unsigned surface_struct_size, str ps->offset = ~0; if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - util_hash_table_set(us->u.table, key, ps); + cso_hash_insert(us->u.hash, ((zslice + face) << 8) | level, ps); else us->u.array[level] = ps; @@ -66,47 +74,44 @@ util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps) { struct pipe_resource *pt = ps->texture; if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - { /* or 2D array */ - void* key = (void*)(uintptr_t)(((ps->zslice + ps->face) << 8) | ps->level); - util_hash_table_remove(us->u.table, key); + { /* or 2D array */ + cso_hash_erase(us->u.hash, cso_hash_find(us->u.hash, ((ps->zslice + ps->face) << 8) | ps->level)); } else us->u.array[ps->level] = 0; } -static enum pipe_error -util_surfaces_destroy_callback(void *key, void *value, void *data) -{ - void (*destroy_surface) (struct pipe_surface * ps) = data; - destroy_surface((struct pipe_surface *)value); - return PIPE_OK; -} - void util_surfaces_destroy(struct util_surfaces *us, struct pipe_resource *pt, void (*destroy_surface) (struct pipe_surface *)) { if(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE) - { /* or 2D array */ - if(us->u.table) + { /* or 2D array */ + if(us->u.hash) { - util_hash_table_foreach(us->u.table, util_surfaces_destroy_callback, destroy_surface); - util_hash_table_destroy(us->u.table); - us->u.table = NULL; + struct cso_hash_iter iter; + iter = cso_hash_first_node(us->u.hash); + while (!cso_hash_iter_is_null(iter)) { + destroy_surface(cso_hash_iter_data(iter)); + iter = cso_hash_iter_next(iter); + } + + cso_hash_delete(us->u.hash); + us->u.hash = NULL; } } else { if(us->u.array) { - unsigned i; - for(i = 0; i < pt->last_level; ++i) - { - struct pipe_surface *ps = us->u.array[i]; - if(ps) - destroy_surface(ps); - } - FREE(us->u.array); - us->u.array = NULL; + unsigned i; + for(i = 0; i <= pt->last_level; ++i) + { + struct pipe_surface *ps = us->u.array[i]; + if(ps) + destroy_surface(ps); + } + FREE(us->u.array); + us->u.array = NULL; } } } diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h index 0195bf5afba..17d8a5d3a5b 100644 --- a/src/gallium/auxiliary/util/u_surfaces.h +++ b/src/gallium/auxiliary/util/u_surfaces.h @@ -1,18 +1,44 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef U_SURFACES_H_ #define U_SURFACES_H_ #include "pipe/p_compiler.h" #include "pipe/p_state.h" #include "util/u_atomic.h" - -struct util_hash_table; +#include "cso_cache/cso_hash.h" struct util_surfaces { union { - struct util_hash_table *table; + struct cso_hash *hash; struct pipe_surface **array; + void* pv; } u; }; @@ -22,7 +48,7 @@ struct pipe_surface *util_surfaces_do_get(struct util_surfaces *us, unsigned sur static INLINE struct pipe_surface * util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags) { - if(likely(pt->target == PIPE_TEXTURE_2D && us->u.array)) + if(likely((pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT) && us->u.array)) { struct pipe_surface *ps = us->u.array[level]; if(ps) @@ -35,12 +61,24 @@ util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct return util_surfaces_do_get(us, surface_struct_size, pscreen, pt, face, level, zslice, flags); } +static INLINE struct pipe_surface * +util_surfaces_peek(struct util_surfaces *us, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice) +{ + if(!us->u.pv) + return 0; + + if(unlikely(pt->target == PIPE_TEXTURE_3D || pt->target == PIPE_TEXTURE_CUBE)) + return cso_hash_iter_data(cso_hash_find(us->u.hash, ((zslice + face) << 8) | level)); + else + return us->u.array[level]; +} + void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps); static INLINE void util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps) { - if(likely(ps->texture->target == PIPE_TEXTURE_2D)) + if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT)) { us->u.array[ps->level] = 0; return; diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h index 986eee07435..558351d0ce5 100644 --- a/src/gallium/auxiliary/util/u_tile.h +++ b/src/gallium/auxiliary/util/u_tile.h @@ -29,7 +29,10 @@ #define P_TILE_H #include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +struct pipe_context; struct pipe_transfer; /** diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c index bedace3b1dc..69f6fab9504 100644 --- a/src/gallium/auxiliary/util/u_transfer.c +++ b/src/gallium/auxiliary/util/u_transfer.c @@ -35,12 +35,12 @@ void u_default_transfer_inline_write( struct pipe_context *pipe, util_copy_rect(map, resource->format, - transfer->stride, /* bytes? */ + transfer->stride, /* bytes */ 0, 0, box->width, box->height, data, - box->width, /* bytes? texels? */ + stride, /* bytes */ 0, 0); out: diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h index eb07945d15f..e3a38730f21 100644 --- a/src/gallium/auxiliary/util/u_transfer.h +++ b/src/gallium/auxiliary/util/u_transfer.h @@ -8,6 +8,7 @@ #include "pipe/p_state.h" struct pipe_context; +struct winsys_handle; boolean u_default_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *resource, diff --git a/src/gallium/auxiliary/util/u_upload_mgr.c b/src/gallium/auxiliary/util/u_upload_mgr.c index 75d44432d9e..af229e61a00 100644 --- a/src/gallium/auxiliary/util/u_upload_mgr.c +++ b/src/gallium/auxiliary/util/u_upload_mgr.c @@ -59,6 +59,8 @@ struct u_upload_mgr *u_upload_create( struct pipe_context *pipe, unsigned usage ) { struct u_upload_mgr *upload = CALLOC_STRUCT( u_upload_mgr ); + if (!upload) + return NULL; upload->pipe = pipe; upload->default_size = default_size; diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h index a124924fc80..de016df02e0 100644 --- a/src/gallium/auxiliary/util/u_upload_mgr.h +++ b/src/gallium/auxiliary/util/u_upload_mgr.h @@ -32,11 +32,8 @@ #ifndef U_UPLOAD_MGR_H #define U_UPLOAD_MGR_H -#include "pipe/p_defines.h" - -struct pipe_screen; +struct pipe_context; struct pipe_resource; -struct u_upload_mgr; struct u_upload_mgr *u_upload_create( struct pipe_context *pipe, |