diff options
author | Christoph Bumiller <[email protected]> | 2010-09-02 18:31:49 +0200 |
---|---|---|
committer | Christoph Bumiller <[email protected]> | 2010-09-02 18:31:49 +0200 |
commit | 222d2f2ac2c7d93cbc0643082c78278ad2c8cfce (patch) | |
tree | b79152c238022b2a901201c22e5809ac520732bf /src/gallium | |
parent | 443abc80db9e1a288ce770e76cccd43664348098 (diff) | |
parent | e73c5501b2fe20290d1b691c85a5d82ac3a0431c (diff) |
Merge remote branch 'origin/master' into nv50-compiler
Conflicts:
src/gallium/drivers/nv50/nv50_program.c
Diffstat (limited to 'src/gallium')
303 files changed, 18498 insertions, 8515 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile index 9544e90a965..eb86d83d2a2 100644 --- a/src/gallium/auxiliary/Makefile +++ b/src/gallium/auxiliary/Makefile @@ -4,8 +4,8 @@ include $(TOP)/configs/current LIBNAME = gallium C_SOURCES = \ - cso_cache/cso_context.c \ cso_cache/cso_cache.c \ + cso_cache/cso_context.c \ cso_cache/cso_hash.c \ draw/draw_context.c \ draw/draw_gs.c \ @@ -26,7 +26,6 @@ C_SOURCES = \ draw/draw_pipe_wide_line.c \ draw/draw_pipe_wide_point.c \ draw/draw_pt.c \ - draw/draw_pt_elts.c \ draw/draw_pt_emit.c \ draw/draw_pt_fetch.c \ draw/draw_pt_fetch_emit.c \ @@ -35,24 +34,24 @@ C_SOURCES = \ draw/draw_pt_post_vs.c \ draw/draw_pt_so_emit.c \ draw/draw_pt_util.c \ - draw/draw_pt_varray.c \ - draw/draw_pt_vcache.c \ + draw/draw_pt_vsplit.c \ draw/draw_vertex.c \ draw/draw_vs.c \ - draw/draw_vs_varient.c \ draw/draw_vs_aos.c \ draw/draw_vs_aos_io.c \ draw/draw_vs_aos_machine.c \ draw/draw_vs_exec.c \ draw/draw_vs_ppc.c \ draw/draw_vs_sse.c \ + draw/draw_vs_varient.c \ indices/u_indices_gen.c \ indices/u_unfilled_gen.c \ os/os_misc.c \ + os/os_stream.c \ os/os_stream_log.c \ + os/os_stream_null.c \ os/os_stream_stdc.c \ os/os_stream_str.c \ - os/os_stream_null.c \ os/os_time.c \ pipebuffer/pb_buffer_fenced.c \ pipebuffer/pb_buffer_malloc.c \ @@ -65,17 +64,16 @@ C_SOURCES = \ pipebuffer/pb_bufmgr_slab.c \ pipebuffer/pb_validate.c \ rbug/rbug_connection.c \ + rbug/rbug_context.c \ rbug/rbug_core.c \ + rbug/rbug_demarshal.c \ rbug/rbug_texture.c \ - rbug/rbug_context.c \ rbug/rbug_shader.c \ - rbug/rbug_demarshal.c \ rtasm/rtasm_cpu.c \ rtasm/rtasm_execmem.c \ - rtasm/rtasm_x86sse.c \ rtasm/rtasm_ppc.c \ rtasm/rtasm_ppc_spe.c \ - tgsi/tgsi_sanity.c \ + rtasm/rtasm_x86sse.c \ tgsi/tgsi_build.c \ tgsi/tgsi_dump.c \ tgsi/tgsi_exec.c \ @@ -83,19 +81,22 @@ C_SOURCES = \ tgsi/tgsi_iterate.c \ tgsi/tgsi_parse.c \ tgsi/tgsi_ppc.c \ + tgsi/tgsi_sanity.c \ tgsi/tgsi_scan.c \ tgsi/tgsi_sse2.c \ tgsi/tgsi_text.c \ tgsi/tgsi_transform.c 
\ tgsi/tgsi_ureg.c \ tgsi/tgsi_util.c \ - translate/translate_generic.c \ - translate/translate_sse.c \ translate/translate.c \ translate/translate_cache.c \ + translate/translate_generic.c \ + translate/translate_sse.c \ util/u_debug.c \ - util/u_debug_symbol.c \ + util/u_debug_describe.c \ + util/u_debug_refcnt.c \ util/u_debug_stack.c \ + util/u_debug_symbol.c \ util/u_dump_defines.c \ util/u_dump_state.c \ util/u_bitmask.c \ @@ -118,10 +119,11 @@ C_SOURCES = \ util/u_gen_mipmap.c \ util/u_half.c \ util/u_handle_table.c \ - util/u_hash_table.c \ util/u_hash.c \ + util/u_hash_table.c \ util/u_keymap.c \ util/u_linear.c \ + util/u_linkage.c \ util/u_network.c \ util/u_math.c \ util/u_mempool.c \ @@ -172,10 +174,10 @@ GALLIVM_SOURCES = \ gallivm/lp_bld_tgsi_soa.c \ gallivm/lp_bld_type.c \ draw/draw_llvm.c \ - draw/draw_vs_llvm.c \ - draw/draw_pt_fetch_shade_pipeline_llvm.c \ + draw/draw_llvm_sample.c \ draw/draw_llvm_translate.c \ - draw/draw_llvm_sample.c + draw/draw_vs_llvm.c \ + draw/draw_pt_fetch_shade_pipeline_llvm.c GALLIVM_CPP_SOURCES = \ gallivm/lp_bld_misc.cpp diff --git a/src/gallium/auxiliary/SConscript b/src/gallium/auxiliary/SConscript index 3124e20ce84..6210ada990e 100644 --- a/src/gallium/auxiliary/SConscript +++ b/src/gallium/auxiliary/SConscript @@ -50,10 +50,11 @@ env.Depends('util/u_format_table.c', [ ]) source = [ - 'cso_cache/cso_context.c', 'cso_cache/cso_cache.c', + 'cso_cache/cso_context.c', 'cso_cache/cso_hash.c', 'draw/draw_context.c', + 'draw/draw_gs.c', 'draw/draw_pipe.c', 'draw/draw_pipe_aaline.c', 'draw/draw_pipe_aapoint.c', @@ -71,7 +72,6 @@ source = [ 'draw/draw_pipe_wide_line.c', 'draw/draw_pipe_wide_point.c', 'draw/draw_pt.c', - 'draw/draw_pt_elts.c', 'draw/draw_pt_emit.c', 'draw/draw_pt_fetch.c', 'draw/draw_pt_fetch_emit.c', @@ -80,8 +80,7 @@ source = [ 'draw/draw_pt_post_vs.c', 'draw/draw_pt_so_emit.c', 'draw/draw_pt_util.c', - 'draw/draw_pt_varray.c', - 'draw/draw_pt_vcache.c', + 'draw/draw_pt_vsplit.c', 'draw/draw_vertex.c', 
'draw/draw_vs.c', 'draw/draw_vs_aos.c', @@ -91,16 +90,16 @@ source = [ 'draw/draw_vs_ppc.c', 'draw/draw_vs_sse.c', 'draw/draw_vs_varient.c', - 'draw/draw_gs.c', #'indices/u_indices.c', #'indices/u_unfilled_indices.c', 'indices/u_indices_gen.c', 'indices/u_unfilled_gen.c', 'os/os_misc.c', + 'os/os_stream.c', 'os/os_stream_log.c', + 'os/os_stream_null.c', 'os/os_stream_stdc.c', 'os/os_stream_str.c', - 'os/os_stream_null.c', 'os/os_time.c', 'pipebuffer/pb_buffer_fenced.c', 'pipebuffer/pb_buffer_malloc.c', @@ -112,35 +111,35 @@ source = [ 'pipebuffer/pb_bufmgr_pool.c', 'pipebuffer/pb_bufmgr_slab.c', 'pipebuffer/pb_validate.c', + 'rbug/rbug_connection.c', + 'rbug/rbug_context.c', 'rbug/rbug_core.c', + 'rbug/rbug_demarshal.c', 'rbug/rbug_shader.c', - 'rbug/rbug_context.c', 'rbug/rbug_texture.c', - 'rbug/rbug_demarshal.c', - 'rbug/rbug_connection.c', 'rtasm/rtasm_cpu.c', 'rtasm/rtasm_execmem.c', - 'rtasm/rtasm_x86sse.c', 'rtasm/rtasm_ppc.c', 'rtasm/rtasm_ppc_spe.c', + 'rtasm/rtasm_x86sse.c', 'tgsi/tgsi_build.c', 'tgsi/tgsi_dump.c', 'tgsi/tgsi_exec.c', 'tgsi/tgsi_info.c', 'tgsi/tgsi_iterate.c', 'tgsi/tgsi_parse.c', + 'tgsi/tgsi_ppc.c', 'tgsi/tgsi_sanity.c', 'tgsi/tgsi_scan.c', - 'tgsi/tgsi_ppc.c', 'tgsi/tgsi_sse2.c', 'tgsi/tgsi_text.c', 'tgsi/tgsi_transform.c', 'tgsi/tgsi_ureg.c', 'tgsi/tgsi_util.c', - 'translate/translate_generic.c', - 'translate/translate_sse.c', 'translate/translate.c', 'translate/translate_cache.c', + 'translate/translate_generic.c', + 'translate/translate_sse.c', 'util/u_bitmask.c', 'util/u_blit.c', 'util/u_blitter.c', @@ -148,7 +147,9 @@ source = [ 'util/u_caps.c', 'util/u_cpu_detect.c', 'util/u_debug.c', + 'util/u_debug_describe.c', 'util/u_debug_memory.c', + 'util/u_debug_refcnt.c', 'util/u_debug_stack.c', 'util/u_debug_symbol.c', 'util/u_dump_defines.c', @@ -170,6 +171,8 @@ source = [ 'util/u_hash.c', 'util/u_hash_table.c', 'util/u_keymap.c', + 'util/u_linear.c', + 'util/u_linkage.c', 'util/u_network.c', 'util/u_math.c', 'util/u_mempool.c', @@ 
-208,9 +211,9 @@ if env['llvm']: 'gallivm/lp_bld_format_soa.c', 'gallivm/lp_bld_format_yuv.c', 'gallivm/lp_bld_gather.c', + 'gallivm/lp_bld_init.c', 'gallivm/lp_bld_intr.c', 'gallivm/lp_bld_logic.c', - 'gallivm/lp_bld_init.c', 'gallivm/lp_bld_misc.cpp', 'gallivm/lp_bld_pack.c', 'gallivm/lp_bld_printf.c', @@ -222,10 +225,10 @@ if env['llvm']: 'gallivm/lp_bld_tgsi_soa.c', 'gallivm/lp_bld_type.c', 'draw/draw_llvm.c', - 'draw/draw_pt_fetch_shade_pipeline_llvm.c', + 'draw/draw_llvm_sample.c', 'draw/draw_llvm_translate.c', - 'draw/draw_vs_llvm.c', - 'draw/draw_llvm_sample.c' + 'draw/draw_pt_fetch_shade_pipeline_llvm.c', + 'draw/draw_vs_llvm.c' ] gallium = env.ConvenienceLibrary( diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h new file mode 100644 index 00000000000..958ed20dc84 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h @@ -0,0 +1,114 @@ +/************************************************************************** + * + * Copyright 2010, VMware, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, + struct draw_vertex_info *info ) +{ + struct vertex_header *out = info->verts; + const float *scale = pvs->draw->viewport.scale; + const float *trans = pvs->draw->viewport.translate; + /* const */ float (*plane)[4] = pvs->draw->plane; + const unsigned pos = draw_current_shader_position_output(pvs->draw); + const unsigned ef = pvs->draw->vs.edgeflag_output; + const unsigned nr = pvs->draw->nr_planes; + const unsigned flags = (FLAGS); + unsigned need_pipeline = 0; + unsigned j; + + for (j = 0; j < info->count; j++) { + float *position = out->data[pos]; + unsigned mask = 0x0; + + initialize_vertex_header(out); + + if (flags & (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_HALF_Z | DO_CLIP_USER)) { + out->clip[0] = position[0]; + out->clip[1] = position[1]; + out->clip[2] = position[2]; + out->clip[3] = position[3]; + + /* Do the hardwired planes first: + */ + if (flags & DO_CLIP_XY) { + if (-position[0] + position[3] < 0) mask |= (1<<0); + if ( position[0] + position[3] < 0) mask |= (1<<1); + if (-position[1] + position[3] < 0) mask |= (1<<2); + if ( position[1] + position[3] < 0) mask |= (1<<3); + } + + /* Clip Z planes according to full cube, half cube or none. 
+ */ + if (flags & DO_CLIP_FULL_Z) { + if ( position[2] + position[3] < 0) mask |= (1<<4); + if (-position[2] + position[3] < 0) mask |= (1<<5); + } + else if (flags & DO_CLIP_HALF_Z) { + if ( position[2] < 0) mask |= (1<<4); + if (-position[2] + position[3] < 0) mask |= (1<<5); + } + + if (flags & DO_CLIP_USER) { + unsigned i; + for (i = 6; i < nr; i++) { + if (dot4(position, plane[i]) < 0) + mask |= (1<<i); + } + } + + out->clipmask = mask; + need_pipeline |= out->clipmask; + } + + if ((flags & DO_VIEWPORT) && mask == 0) + { + /* divide by w */ + float w = 1.0f / position[3]; + + /* Viewport mapping */ + position[0] = position[0] * w * scale[0] + trans[0]; + position[1] = position[1] * w * scale[1] + trans[1]; + position[2] = position[2] * w * scale[2] + trans[2]; + position[3] = w; + } + + if ((flags & DO_EDGEFLAG) && ef) { + const float *edgeflag = out->data[ef]; + out->edgeflag = !(edgeflag[0] != 1.0f); + need_pipeline |= !out->edgeflag; + } + + out = (struct vertex_header *)( (char *)out + info->stride ); + } + + return need_pipeline != 0; +} + + +#undef FLAGS +#undef TAG diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 995b675b9a1..937b0934798 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -34,6 +34,7 @@ #include "pipe/p_context.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_cpu_detect.h" #include "draw_context.h" #include "draw_vs.h" #include "draw_gs.h" @@ -41,6 +42,25 @@ #if HAVE_LLVM #include "gallivm/lp_bld_init.h" #include "draw_llvm.h" + +static boolean +draw_get_option_use_llvm(void) +{ + static boolean first = TRUE; + static boolean value; + if (first) { + first = FALSE; + value = debug_get_bool_option("DRAW_USE_LLVM", TRUE); + +#ifdef PIPE_ARCH_X86 + util_cpu_detect(); + /* require SSE2 due to LLVM PR6960. 
*/ + if (!util_cpu_caps.has_sse2) + value = FALSE; +#endif + } + return value; +} #endif struct draw_context *draw_create( struct pipe_context *pipe ) @@ -50,10 +70,13 @@ struct draw_context *draw_create( struct pipe_context *pipe ) goto fail; #if HAVE_LLVM - lp_build_init(); - assert(lp_build_engine); - draw->engine = lp_build_engine; - draw->llvm = draw_llvm_create(draw); + if(draw_get_option_use_llvm()) + { + lp_build_init(); + assert(lp_build_engine); + draw->engine = lp_build_engine; + draw->llvm = draw_llvm_create(draw); + } #endif if (!draw_init(draw)) @@ -83,6 +106,8 @@ boolean draw_init(struct draw_context *draw) ASSIGN_4V( draw->plane[4], 0, 0, 1, 1 ); /* yes these are correct */ ASSIGN_4V( draw->plane[5], 0, 0, -1, 1 ); /* mesa's a bit wonky */ draw->nr_planes = 6; + draw->clip_xy = 1; + draw->clip_z = 1; draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ @@ -135,7 +160,8 @@ void draw_destroy( struct draw_context *draw ) draw_vs_destroy( draw ); draw_gs_destroy( draw ); #ifdef HAVE_LLVM - draw_llvm_destroy( draw->llvm ); + if(draw->llvm) + draw_llvm_destroy( draw->llvm ); #endif FREE( draw ); @@ -162,6 +188,14 @@ void draw_set_mrd(struct draw_context *draw, double mrd) } +static void update_clip_flags( struct draw_context *draw ) +{ + draw->clip_xy = !draw->driver.bypass_clip_xy; + draw->clip_z = (!draw->driver.bypass_clip_z && + !draw->depth_clamp); + draw->clip_user = (draw->nr_planes > 6); +} + /** * Register new primitive rasterization/rendering state. * This causes the drawing pipeline to be rebuilt. @@ -176,18 +210,25 @@ void draw_set_rasterizer_state( struct draw_context *draw, draw->rasterizer = raster; draw->rast_handle = rast_handle; - draw->bypass_clipping = draw->driver.bypass_clipping; - } + } } - +/* With a little more work, llvmpipe will be able to turn this off and + * do its own x/y clipping. 
+ * + * Some hardware can turn off clipping altogether - in particular any + * hardware with a TNL unit can do its own clipping, even if it is + * relying on the draw module for some other reason. + */ void draw_set_driver_clipping( struct draw_context *draw, - boolean bypass_clipping ) + boolean bypass_clip_xy, + boolean bypass_clip_z ) { draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); - draw->driver.bypass_clipping = bypass_clipping; - draw->bypass_clipping = draw->driver.bypass_clipping; + draw->driver.bypass_clip_xy = bypass_clip_xy; + draw->driver.bypass_clip_z = bypass_clip_z; + update_clip_flags(draw); } @@ -217,6 +258,8 @@ void draw_set_clip_state( struct draw_context *draw, memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0])); draw->nr_planes = 6 + clip->nr; draw->depth_clamp = clip->depth_clamp; + + update_clip_flags(draw); } @@ -472,47 +515,28 @@ void draw_set_render( struct draw_context *draw, } - -/** - * Tell the drawing context about the index/element buffer to use - * (ala glDrawElements) - * If no element buffer is to be used (i.e. glDrawArrays) then this - * should be called with eltSize=0 and elements=NULL. - * - * \param draw the drawing context - * \param eltSize size of each element (1, 2 or 4 bytes) - * \param elements the element buffer ptr - */ void -draw_set_mapped_element_buffer_range( struct draw_context *draw, - unsigned eltSize, - int eltBias, - unsigned min_index, - unsigned max_index, - const void *elements ) +draw_set_index_buffer(struct draw_context *draw, + const struct pipe_index_buffer *ib) { - draw->pt.user.elts = elements; - draw->pt.user.eltSize = eltSize; - draw->pt.user.eltBias = eltBias; - draw->pt.user.min_index = min_index; - draw->pt.user.max_index = max_index; + if (ib) + memcpy(&draw->pt.index_buffer, ib, sizeof(draw->pt.index_buffer)); + else + memset(&draw->pt.index_buffer, 0, sizeof(draw->pt.index_buffer)); } +/** + * Tell drawing context where to find mapped index/element buffer. 
+ */ void -draw_set_mapped_element_buffer( struct draw_context *draw, - unsigned eltSize, - int eltBias, - const void *elements ) +draw_set_mapped_index_buffer(struct draw_context *draw, + const void *elements) { - draw->pt.user.elts = elements; - draw->pt.user.eltSize = eltSize; - draw->pt.user.eltBias = eltBias; - draw->pt.user.min_index = 0; - draw->pt.user.max_index = 0xffffffff; + draw->pt.user.elts = elements; } - + /* Revamp me please: */ void draw_do_flush( struct draw_context *draw, unsigned flags ) @@ -659,7 +683,8 @@ draw_set_mapped_texture(struct draw_context *draw, const void *data[DRAW_MAX_TEXTURE_LEVELS]) { #ifdef HAVE_LLVM - draw_llvm_set_mapped_texture(draw, + if(draw->llvm) + draw_llvm_set_mapped_texture(draw, sampler_idx, width, height, depth, last_level, row_stride, img_stride, data); diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index 116716af6f0..4c780e4dcb4 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -160,18 +160,11 @@ void draw_set_vertex_elements(struct draw_context *draw, unsigned count, const struct pipe_vertex_element *elements); -void -draw_set_mapped_element_buffer_range( struct draw_context *draw, - unsigned eltSize, - int eltBias, - unsigned min_index, - unsigned max_index, - const void *elements ); - -void draw_set_mapped_element_buffer( struct draw_context *draw, - unsigned eltSize, - int eltBias, - const void *elements ); +void draw_set_index_buffer(struct draw_context *draw, + const struct pipe_index_buffer *ib); + +void draw_set_mapped_index_buffer(struct draw_context *draw, + const void *elements); void draw_set_mapped_vertex_buffer(struct draw_context *draw, unsigned attr, const void *buffer); @@ -196,6 +189,9 @@ draw_set_so_state(struct draw_context *draw, * draw_pt.c */ +void draw_vbo(struct draw_context *draw, + const struct pipe_draw_info *info); + void draw_arrays(struct draw_context *draw, unsigned prim, 
unsigned start, unsigned count); @@ -216,7 +212,8 @@ void draw_set_render( struct draw_context *draw, struct vbuf_render *render ); void draw_set_driver_clipping( struct draw_context *draw, - boolean bypass_clipping ); + boolean bypass_clip_xy, + boolean bypass_clip_z ); void draw_set_force_passthrough( struct draw_context *draw, boolean enable ); diff --git a/src/gallium/auxiliary/draw/draw_decompose_tmp.h b/src/gallium/auxiliary/draw/draw_decompose_tmp.h index a52d2b50588..a142563af97 100644 --- a/src/gallium/auxiliary/draw/draw_decompose_tmp.h +++ b/src/gallium/auxiliary/draw/draw_decompose_tmp.h @@ -54,10 +54,10 @@ FUNC(FUNC_VARS) FUNC_ENTER; - /* prim, count, and last_vertex_last should have been defined */ + /* prim, prim_flags, count, and last_vertex_last should have been defined */ if (0) { - debug_printf("%s: prim 0x%x, count %d, last_vertex_last %d\n", - __FUNCTION__, prim, count, last_vertex_last); + debug_printf("%s: prim 0x%x, prim_flags 0x%x, count %d, last_vertex_last %d\n", + __FUNCTION__, prim, prim_flags, count, last_vertex_last); } switch (prim) { @@ -80,7 +80,7 @@ FUNC(FUNC_VARS) case PIPE_PRIM_LINE_LOOP: case PIPE_PRIM_LINE_STRIP: if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; idx[1] = GET_ELT(0); idx[2] = idx[1]; @@ -90,7 +90,7 @@ FUNC(FUNC_VARS) LINE(flags, idx[0], idx[1]); } /* close the loop */ - if (prim == PIPE_PRIM_LINE_LOOP) + if (prim == PIPE_PRIM_LINE_LOOP && !prim_flags) LINE(flags, idx[1], idx[2]); } break; @@ -255,17 +255,23 @@ FUNC(FUNC_VARS) if (last_vertex_last) { flags = (DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_2 | DRAW_PIPE_EDGE_FLAG_0); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_2; + edge_next = DRAW_PIPE_EDGE_FLAG_0; - edge_finish = DRAW_PIPE_EDGE_FLAG_1; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 
0 : DRAW_PIPE_EDGE_FLAG_1; } else { flags = (DRAW_PIPE_RESET_STIPPLE | - DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1); + if (!(prim_flags & DRAW_SPLIT_BEFORE)) + flags |= DRAW_PIPE_EDGE_FLAG_0; + edge_next = DRAW_PIPE_EDGE_FLAG_1; - edge_finish = DRAW_PIPE_EDGE_FLAG_2; + edge_finish = + (prim_flags & DRAW_SPLIT_AFTER) ? 0 : DRAW_PIPE_EDGE_FLAG_2; } idx[0] = GET_ELT(0); @@ -300,7 +306,7 @@ FUNC(FUNC_VARS) case PIPE_PRIM_LINE_STRIP_ADJACENCY: if (count >= 4) { - flags = DRAW_PIPE_RESET_STIPPLE; + flags = (prim_flags & DRAW_SPLIT_BEFORE) ? 0 : DRAW_PIPE_RESET_STIPPLE; idx[1] = GET_ELT(0); idx[2] = GET_ELT(1); idx[3] = GET_ELT(2); diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index 4a1013e79a5..50a03ac95a5 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -380,7 +380,7 @@ static void gs_tri_adj(struct draw_geometry_shader *shader, #define FUNC gs_run_elts #define LOCAL_VARS const ushort *elts = input_prims->elts; -#define GET_ELT(idx) (elts[idx] & ~DRAW_PIPE_FLAG_MASK) +#define GET_ELT(idx) (elts[idx]) #include "draw_gs_tmp.h" @@ -457,6 +457,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader, output_prims->start = 0; output_prims->count = shader->emitted_vertices; output_prims->prim = shader->output_primitive; + output_prims->flags = 0x0; output_prims->primitive_lengths = shader->primitive_lengths; output_prims->primitive_count = shader->emitted_primitives; output_verts->count = shader->emitted_vertices; diff --git a/src/gallium/auxiliary/draw/draw_gs_tmp.h b/src/gallium/auxiliary/draw/draw_gs_tmp.h index 4a17af0dea3..de7b02655a5 100644 --- a/src/gallium/auxiliary/draw/draw_gs_tmp.h +++ b/src/gallium/auxiliary/draw/draw_gs_tmp.h @@ -6,12 +6,10 @@ #define FUNC_ENTER \ /* declare more local vars */ \ - struct draw_context *draw = gs->draw; \ const unsigned prim = input_prims->prim; \ + const unsigned prim_flags = input_prims->flags; \ const unsigned count = 
input_prims->count; \ - const boolean last_vertex_last = \ - !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ + const boolean last_vertex_last = TRUE; \ do { \ debug_assert(input_prims->primitive_count == 1); \ switch (prim) { \ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index 8d53601d195..8759c38cabb 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -210,13 +210,6 @@ draw_llvm_create(struct draw_context *draw) { struct draw_llvm *llvm; -#ifdef PIPE_ARCH_X86 - util_cpu_detect(); - /* require SSE2 due to LLVM PR6960. */ - if (!util_cpu_caps.has_sse2) - return NULL; -#endif - llvm = CALLOC_STRUCT( draw_llvm ); if (!llvm) return NULL; @@ -292,15 +285,23 @@ draw_llvm_destroy(struct draw_llvm *llvm) } struct draw_llvm_variant * -draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs) +draw_llvm_create_variant(struct draw_llvm *llvm, + unsigned num_inputs, + const struct draw_llvm_variant_key *key) { - struct draw_llvm_variant *variant = MALLOC(sizeof(struct draw_llvm_variant)); + struct draw_llvm_variant *variant; struct llvm_vertex_shader *shader = llvm_vertex_shader(llvm->draw->vs.vertex_shader); + variant = MALLOC(sizeof *variant + + shader->variant_key_size - + sizeof variant->key); + if (variant == NULL) + return NULL; + variant->llvm = llvm; - draw_llvm_make_variant_key(llvm, &variant->key); + memcpy(&variant->key, key, shader->variant_key_size); llvm->vertex_header_ptr_type = create_vertex_header(llvm, num_inputs); @@ -738,8 +739,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant) step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); /* code generated texture sampling */ - sampler = draw_llvm_sampler_soa_create(variant->key.sampler, - context_ptr); + sampler = draw_llvm_sampler_soa_create( + draw_llvm_variant_key_samplers(&variant->key), + context_ptr); #if DEBUG_STORE lp_build_printf(builder, 
"start = %d, end = %d, step = %d\n", @@ -901,8 +903,9 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian step = LLVMConstInt(LLVMInt32Type(), max_vertices, 0); /* code generated texture sampling */ - sampler = draw_llvm_sampler_soa_create(variant->key.sampler, - context_ptr); + sampler = draw_llvm_sampler_soa_create( + draw_llvm_variant_key_samplers(&variant->key), + context_ptr); fetch_max = LLVMBuildSub(builder, fetch_count, LLVMConstInt(LLVMInt32Type(), 1, 0), @@ -1002,35 +1005,42 @@ draw_llvm_generate_elts(struct draw_llvm *llvm, struct draw_llvm_variant *varian lp_func_delete_body(variant->function_elts); } -void -draw_llvm_make_variant_key(struct draw_llvm *llvm, - struct draw_llvm_variant_key *key) + +struct draw_llvm_variant_key * +draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) { unsigned i; + struct draw_llvm_variant_key *key; + struct lp_sampler_static_state *sampler; - memset(key, 0, sizeof(struct draw_llvm_variant_key)); + key = (struct draw_llvm_variant_key *)store; + /* Presumably all variants of the shader should have the same + * number of vertex elements - ie the number of shader inputs. + */ key->nr_vertex_elements = llvm->draw->pt.nr_vertex_elements; + /* All variants of this shader will have the same value for + * nr_samplers. Not yet trying to compact away holes in the + * sampler array. 
+ */ + key->nr_samplers = llvm->draw->vs.vertex_shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + + sampler = draw_llvm_variant_key_samplers(key); + memcpy(key->vertex_element, llvm->draw->pt.vertex_element, sizeof(struct pipe_vertex_element) * key->nr_vertex_elements); + + memset(sampler, 0, key->nr_samplers * sizeof *sampler); - memcpy(&key->vs, - &llvm->draw->vs.vertex_shader->state, - sizeof(struct pipe_shader_state)); - - /* if the driver implemented the sampling hooks then - * setup our sampling state */ - if (llvm->draw->num_sampler_views && llvm->draw->num_samplers) { - for(i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; ++i) { - struct draw_vertex_shader *shader = llvm->draw->vs.vertex_shader; - if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) - lp_sampler_static_state(&key->sampler[i], - llvm->draw->sampler_views[i], - llvm->draw->samplers[i]); - } + for (i = 0 ; i < key->nr_samplers; i++) { + lp_sampler_static_state(&sampler[i], + llvm->draw->sampler_views[i], + llvm->draw->samplers[i]); } + + return key; } void diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index 4addb47d2d8..6196b2f983f 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -151,12 +151,43 @@ typedef void struct draw_llvm_variant_key { - struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; - unsigned nr_vertex_elements; - struct pipe_shader_state vs; - struct lp_sampler_static_state sampler[PIPE_MAX_VERTEX_SAMPLERS]; + unsigned nr_vertex_elements:16; + unsigned nr_samplers:16; + + /* Variable number of vertex elements: + */ + struct pipe_vertex_element vertex_element[1]; + + /* Followed by variable number of samplers: + */ +/* struct lp_sampler_static_state sampler; */ }; +#define DRAW_LLVM_MAX_VARIANT_KEY_SIZE \ + (sizeof(struct draw_llvm_variant_key) + \ + PIPE_MAX_VERTEX_SAMPLERS * sizeof(struct lp_sampler_static_state) + \ + (PIPE_MAX_ATTRIBS-1) * sizeof(struct pipe_vertex_element)) + + 
+static INLINE size_t +draw_llvm_variant_key_size(unsigned nr_vertex_elements, + unsigned nr_samplers) +{ + return (sizeof(struct draw_llvm_variant_key) + + nr_samplers * sizeof(struct lp_sampler_static_state) + + (nr_vertex_elements - 1) * sizeof(struct pipe_vertex_element)); +} + + +static INLINE struct lp_sampler_static_state * +draw_llvm_variant_key_samplers(struct draw_llvm_variant_key *key) +{ + return (struct lp_sampler_static_state *) + &key->vertex_element[key->nr_vertex_elements]; +} + + + struct draw_llvm_variant_list_item { struct draw_llvm_variant *base; @@ -165,7 +196,6 @@ struct draw_llvm_variant_list_item struct draw_llvm_variant { - struct draw_llvm_variant_key key; LLVMValueRef function; LLVMValueRef function_elts; draw_jit_vert_func jit_func; @@ -176,11 +206,16 @@ struct draw_llvm_variant struct draw_llvm *llvm; struct draw_llvm_variant_list_item list_item_global; struct draw_llvm_variant_list_item list_item_local; + + /* key is variable-sized, must be last */ + struct draw_llvm_variant_key key; + /* key is variable-sized, must be last */ }; struct llvm_vertex_shader { struct draw_vertex_shader base; + unsigned variant_key_size; struct draw_llvm_variant_list_item variants; unsigned variants_created; unsigned variants_cached; @@ -220,14 +255,15 @@ void draw_llvm_destroy(struct draw_llvm *llvm); struct draw_llvm_variant * -draw_llvm_create_variant(struct draw_llvm *llvm, int num_inputs); +draw_llvm_create_variant(struct draw_llvm *llvm, + unsigned num_vertex_header_attribs, + const struct draw_llvm_variant_key *key); void draw_llvm_destroy_variant(struct draw_llvm_variant *variant); -void -draw_llvm_make_variant_key(struct draw_llvm *llvm, - struct draw_llvm_variant_key *key); +struct draw_llvm_variant_key * +draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store); LLVMValueRef draw_llvm_translate_from(LLVMBuilderRef builder, diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c index 
58995e07248..6206197dae9 100644 --- a/src/gallium/auxiliary/draw/draw_pipe.c +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -169,35 +169,27 @@ static void do_triangle( struct draw_context *draw, /* * Set up macros for draw_pt_decompose.h template code. * This code uses vertex indexes / elements. - * - * Flags are needed by the stipple and unfilled stages. When the two stages - * are active, vcache_run_extras is called and the flags are stored in the - * higher bits of i0. Otherwise, flags do not matter. */ #define TRIANGLE(flags,i0,i1,i2) \ do { \ - assert(!((i1) & DRAW_PIPE_FLAG_MASK)); \ - assert(!((i2) & DRAW_PIPE_FLAG_MASK)); \ do_triangle( draw, \ - i0, /* flags */ \ - verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK), \ + flags, \ + verts + stride * (i0), \ verts + stride * (i1), \ verts + stride * (i2) ); \ } while (0) #define LINE(flags,i0,i1) \ do { \ - assert(!((i1) & DRAW_PIPE_FLAG_MASK)); \ do_line( draw, \ - i0, /* flags */ \ - verts + stride * (i0 & ~DRAW_PIPE_FLAG_MASK), \ + flags, \ + verts + stride * (i0), \ verts + stride * (i1) ); \ } while (0) #define POINT(i0) \ do { \ - assert(!((i0) & DRAW_PIPE_FLAG_MASK)); \ do_point( draw, verts + stride * (i0) ); \ } while (0) @@ -207,6 +199,7 @@ static void do_triangle( struct draw_context *draw, #define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ const ushort *elts, \ @@ -245,22 +238,27 @@ void draw_pipeline_run( struct draw_context *draw, const unsigned count = prim_info->primitive_lengths[i]; #if DEBUG - /* make sure none of the element indexes go outside the vertex buffer */ + /* Warn if one of the element indexes go outside the vertex buffer */ { unsigned max_index = 0x0, i; /* find the largest element index */ for (i = 0; i < count; i++) { - unsigned int index = (prim_info->elts[start + i] - & ~DRAW_PIPE_FLAG_MASK); + unsigned int index = prim_info->elts[start + i]; if (index > max_index) max_index = index; } - 
assert(max_index <= vert_info->count); + if (max_index >= vert_info->count) { + debug_printf("%s: max_index (%u) outside vertex buffer (%u)\n", + __FUNCTION__, + max_index, + vert_info->count); + } } #endif pipe_run_elts(draw, prim_info->prim, + prim_info->flags, vert_info->verts, vert_info->stride, prim_info->elts + start, @@ -298,6 +296,7 @@ void draw_pipeline_run( struct draw_context *draw, #define FUNC_VARS \ struct draw_context *draw, \ unsigned prim, \ + unsigned prim_flags, \ struct vertex_header *vertices, \ unsigned stride, \ unsigned count @@ -330,6 +329,7 @@ void draw_pipeline_run_linear( struct draw_context *draw, pipe_run_linear(draw, prim_info->prim, + prim_info->flags, (struct vertex_header*)verts, vert_info->stride, count); diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c index eafa29276ff..8b925439876 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_validate.c +++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c @@ -265,7 +265,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage ) /* Clip stage */ - if (!draw->bypass_clipping) + if (draw->clip_xy || draw->clip_z || draw->clip_user) { draw->pipeline.clip->next = next; next = draw->pipeline.clip; diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c index 3c93c9014a6..58c5858734a 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -353,9 +353,6 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf ) /* Allocate a new vertex buffer */ vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size; - /* even number */ - vbuf->max_vertices = vbuf->max_vertices & ~1; - if(vbuf->max_vertices >= UNDEFINED_VERTEX_ID) vbuf->max_vertices = UNDEFINED_VERTEX_ID - 1; diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 397d4bf653c..362f563ba6a 100644 --- 
a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -140,8 +140,7 @@ struct draw_context } middle; struct { - struct draw_pt_front_end *vcache; - struct draw_pt_front_end *varray; + struct draw_pt_front_end *vsplit; } front; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; @@ -150,6 +149,8 @@ struct draw_context struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; unsigned nr_vertex_elements; + struct pipe_index_buffer index_buffer; + /* user-space vertex data, buffers */ struct { /** vertex element/index buffer (ex: glDrawElements) */ @@ -175,13 +176,19 @@ struct draw_context } pt; struct { - boolean bypass_clipping; - boolean bypass_vs; + boolean bypass_clip_xy; + boolean bypass_clip_z; } driver; boolean flushing; /**< debugging/sanity */ boolean suspend_flushing; /**< internally set */ - boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */ + + /* Flags set if API requires clipping in these planes and the + * driver doesn't indicate that it can do it for us. + */ + boolean clip_xy; + boolean clip_z; + boolean clip_user; boolean force_passthrough; /**< never clip or shade */ @@ -296,6 +303,10 @@ struct draw_vertex_info { unsigned count; }; +/* these flags are set if the primitive is a segment of a larger one */ +#define DRAW_SPLIT_BEFORE 0x1 +#define DRAW_SPLIT_AFTER 0x2 + struct draw_prim_info { boolean linear; unsigned start; @@ -304,6 +315,7 @@ struct draw_prim_info { unsigned count; unsigned prim; + unsigned flags; unsigned *primitive_lengths; unsigned primitive_count; }; @@ -369,21 +381,15 @@ void draw_pipeline_destroy( struct draw_context *draw ); -/* We use the top few bits in the elts[] parameter to convey a little - * API information. This limits the number of vertices we can address - * to only 4096 -- if that becomes a problem, we can switch to 32-bit - * draw indices. 
- * - * These flags expected at first vertex of lines & triangles when - * unfilled and/or line stipple modes are operational. +/* + * These flags are used by the pipeline when unfilled and/or line stipple modes + * are operational. */ -#define DRAW_PIPE_MAX_VERTICES (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_0 (0x1<<12) -#define DRAW_PIPE_EDGE_FLAG_1 (0x2<<12) -#define DRAW_PIPE_EDGE_FLAG_2 (0x4<<12) -#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12) -#define DRAW_PIPE_RESET_STIPPLE (0x8<<12) -#define DRAW_PIPE_FLAG_MASK (0xf<<12) +#define DRAW_PIPE_EDGE_FLAG_0 0x1 +#define DRAW_PIPE_EDGE_FLAG_1 0x2 +#define DRAW_PIPE_EDGE_FLAG_2 0x4 +#define DRAW_PIPE_EDGE_FLAG_ALL 0x7 +#define DRAW_PIPE_RESET_STIPPLE 0x8 void draw_pipeline_run( struct draw_context *draw, const struct draw_vertex_info *vert, diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c index 248927505da..f44bf2507c6 100644 --- a/src/gallium/auxiliary/draw/draw_pt.c +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -39,25 +39,14 @@ #include "util/u_math.h" #include "util/u_prim.h" #include "util/u_format.h" +#include "util/u_draw.h" DEBUG_GET_ONCE_BOOL_OPTION(draw_fse, "DRAW_FSE", FALSE) DEBUG_GET_ONCE_BOOL_OPTION(draw_no_fse, "DRAW_NO_FSE", FALSE) -#ifdef HAVE_LLVM -DEBUG_GET_ONCE_BOOL_OPTION(draw_use_llvm, "DRAW_USE_LLVM", TRUE) -#endif - -static unsigned trim( unsigned count, unsigned first, unsigned incr ) -{ - if (count < first) - return 0; - return count - (count - first) % incr; -} - - /* Overall we split things into: - * - frontend -- prepare fetch_elts, draw_elts - eg vcache + * - frontend -- prepare fetch_elts, draw_elts - eg vsplit * - middle -- fetch, shade, cliptest, viewport * - pipeline -- the prim pipeline: clipping, wide lines, etc * - backend -- the vbuf_render provided by the driver. 
@@ -77,7 +66,7 @@ draw_pt_arrays(struct draw_context *draw, { unsigned first, incr; draw_pt_split_prim(prim, &first, &incr); - count = trim(count, first, incr); + count = draw_pt_trim_count(count, first, incr); if (count < first) return TRUE; } @@ -97,7 +86,9 @@ draw_pt_arrays(struct draw_context *draw, opt |= PT_PIPELINE; } - if (!draw->bypass_clipping && !draw->pt.test_fse) { + if ((draw->clip_xy || + draw->clip_z || + draw->clip_user) && !draw->pt.test_fse) { opt |= PT_CLIPTEST; } @@ -115,22 +106,11 @@ draw_pt_arrays(struct draw_context *draw, middle = draw->pt.middle.general; } - - /* Pick the right frontend - */ - if (draw->pt.user.elts || (opt & PT_PIPELINE)) { - frontend = draw->pt.front.vcache; - } else { - frontend = draw->pt.front.varray; - } + frontend = draw->pt.front.vsplit; frontend->prepare( frontend, prim, middle, opt ); - frontend->run(frontend, - draw_pt_elt_func(draw), - draw_pt_elt_ptr(draw, start), - draw->pt.user.eltBias, - count); + frontend->run(frontend, start, count); frontend->finish( frontend ); @@ -143,12 +123,8 @@ boolean draw_pt_init( struct draw_context *draw ) draw->pt.test_fse = debug_get_option_draw_fse(); draw->pt.no_fse = debug_get_option_draw_no_fse(); - draw->pt.front.vcache = draw_pt_vcache( draw ); - if (!draw->pt.front.vcache) - return FALSE; - - draw->pt.front.varray = draw_pt_varray(draw); - if (!draw->pt.front.varray) + draw->pt.front.vsplit = draw_pt_vsplit(draw); + if (!draw->pt.front.vsplit) return FALSE; draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw ); @@ -164,7 +140,7 @@ boolean draw_pt_init( struct draw_context *draw ) return FALSE; #if HAVE_LLVM - if (debug_get_option_draw_use_llvm()) + if (draw->llvm) draw->pt.middle.llvm = draw_pt_fetch_pipeline_or_emit_llvm( draw ); #endif @@ -194,14 +170,9 @@ void draw_pt_destroy( struct draw_context *draw ) draw->pt.middle.fetch_shade_emit = NULL; } - if (draw->pt.front.vcache) { - draw->pt.front.vcache->destroy( draw->pt.front.vcache ); - draw->pt.front.vcache = 
NULL; - } - - if (draw->pt.front.varray) { - draw->pt.front.varray->destroy( draw->pt.front.varray ); - draw->pt.front.varray = NULL; + if (draw->pt.front.vsplit) { + draw->pt.front.vsplit->destroy( draw->pt.front.vsplit ); + draw->pt.front.vsplit = NULL; } } @@ -221,24 +192,29 @@ draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) uint ii = 0; uint j; - if (draw->pt.user.elts) { + if (draw->pt.user.eltSize) { + const char *elts; + /* indexed arrays */ + elts = (const char *) draw->pt.user.elts; + elts += draw->pt.index_buffer.offset; + switch (draw->pt.user.eltSize) { case 1: { - const ubyte *elem = (const ubyte *) draw->pt.user.elts; + const ubyte *elem = (const ubyte *) elts; ii = elem[start + i]; } break; case 2: { - const ushort *elem = (const ushort *) draw->pt.user.elts; + const ushort *elem = (const ushort *) elts; ii = elem[start + i]; } break; case 4: { - const uint *elem = (const uint *) draw->pt.user.elts; + const uint *elem = (const uint *) elts; ii = elem[start + i]; } break; @@ -324,17 +300,8 @@ draw_arrays(struct draw_context *draw, unsigned prim, /** - * Draw vertex arrays. - * This is the main entrypoint into the drawing module. - * If drawing an indexed primitive, the draw_set_mapped_element_buffer_range() - * function should have already been called to specify the element/index buffer - * information. - * - * \param prim one of PIPE_PRIM_x - * \param start index of first vertex to draw - * \param count number of vertices to draw - * \param startInstance number for the first primitive instance (usually 0). - * \param instanceCount number of instances to draw (1=non-instanced) + * Instanced drawing. 
+ * \sa draw_vbo */ void draw_arrays_instanced(struct draw_context *draw, @@ -344,10 +311,50 @@ draw_arrays_instanced(struct draw_context *draw, unsigned startInstance, unsigned instanceCount) { - unsigned reduced_prim = u_reduced_prim(mode); + struct pipe_draw_info info; + + util_draw_init_info(&info); + + info.mode = mode; + info.start = start; + info.count = count; + info.start_instance = startInstance; + info.instance_count = instanceCount; + + info.indexed = (draw->pt.user.elts != NULL); + if (!info.indexed) { + info.min_index = start; + info.max_index = start + count - 1; + } + + draw_vbo(draw, &info); +} + + +/** + * Draw vertex arrays. + * This is the main entrypoint into the drawing module. If drawing an indexed + * primitive, the draw_set_index_buffer() and draw_set_mapped_index_buffer() + * functions should have already been called to specify the element/index + * buffer information. + */ +void +draw_vbo(struct draw_context *draw, + const struct pipe_draw_info *info) +{ + unsigned reduced_prim = u_reduced_prim(info->mode); unsigned instance; - assert(instanceCount > 0); + assert(info->instance_count > 0); + if (info->indexed) + assert(draw->pt.user.elts); + + draw->pt.user.eltSize = + (info->indexed) ? 
draw->pt.index_buffer.index_size : 0; + + draw->pt.user.eltBias = info->index_bias; + draw->pt.user.min_index = info->min_index; + draw->pt.user.max_index = info->max_index; if (reduced_prim != draw->reduced_prim) { draw_do_flush(draw, DRAW_FLUSH_STATE_CHANGE); @@ -355,8 +362,8 @@ draw_arrays_instanced(struct draw_context *draw, } if (0) - debug_printf("draw_arrays(mode=%u start=%u count=%u):\n", - mode, start, count); + debug_printf("draw_vbo(mode=%u start=%u count=%u):\n", + info->mode, info->start, info->count); if (0) tgsi_dump(draw->vs.vertex_shader->state.tokens, 0); @@ -384,10 +391,10 @@ draw_arrays_instanced(struct draw_context *draw, } if (0) - draw_print_arrays(draw, mode, start, MIN2(count, 20)); + draw_print_arrays(draw, info->mode, info->start, MIN2(info->count, 20)); - for (instance = 0; instance < instanceCount; instance++) { - draw->instance_id = instance + startInstance; - draw_pt_arrays(draw, mode, start, count); + for (instance = 0; instance < info->instance_count; instance++) { + draw->instance_id = instance + info->start_instance; + draw_pt_arrays(draw, info->mode, info->start, info->count); } } diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h index 44356fba4c5..5fbb4242915 100644 --- a/src/gallium/auxiliary/draw/draw_pt.h +++ b/src/gallium/auxiliary/draw/draw_pt.h @@ -35,8 +35,6 @@ #include "pipe/p_compiler.h" -typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx ); - struct draw_pt_middle_end; struct draw_context; struct draw_prim_info; @@ -52,13 +50,18 @@ struct draw_vertex_info; /* The "front end" - prepare sets of fetch, draw elements for the * middle end. * - * Currenly one version of this: - * - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims - * Later: - * - varray, varray_split - * - velement, velement_split + * The fetch elements are indices to the vertices. The draw elements are + * indices to the fetched vertices. 
When both arrays of elements are + * linear, middle->run_linear is called; when only the fetch elements are + * linear, middle->run_linear_elts is called; otherwise, middle->run is + * called. + * + * When the number of the draw elements exceeds max_vertex of the middle end, + * the draw elements (as well as the fetch elements) are split and the + * middle end is called multiple times. * - * Currenly only using the vcache version. + * Currently there is: + * - vsplit - catchall implementation, splits big prims */ struct draw_pt_front_end { void (*prepare)( struct draw_pt_front_end *, @@ -67,9 +70,7 @@ struct draw_pt_front_end { unsigned opt ); void (*run)( struct draw_pt_front_end *, - pt_elt_func elt_func, - const void *elt_ptr, - int elt_bias, + unsigned start, unsigned count ); void (*finish)( struct draw_pt_front_end * ); @@ -80,6 +81,8 @@ struct draw_pt_front_end { /* The "middle end" - prepares actual hardware vertices for the * hardware backend. * + * prim_flags is as defined by pipe_draw_info::flags. + * * Currently two versions of this: * - fetch, vertex shade, cliptest, prim-pipeline * - fetch, emit (ie passthrough) @@ -94,11 +97,13 @@ struct draw_pt_middle_end { const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); void (*run_linear)(struct draw_pt_middle_end *, unsigned start, - unsigned count); + unsigned count, + unsigned prim_flags ); /* Transform all vertices in a linear range and then draw them with * the supplied element list. May fail and return FALSE. @@ -107,7 +112,8 @@ struct draw_pt_middle_end { unsigned fetch_start, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ); + unsigned draw_count, + unsigned prim_flags ); int (*get_max_vertex_count)( struct draw_pt_middle_end * ); @@ -122,19 +128,11 @@ struct vbuf_render; struct vertex_header; -/* Helper functions.
- */ -pt_elt_func draw_pt_elt_func( struct draw_context *draw ); -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ); - /* Frontends: * - * Currently only the general-purpose vcache implementation, could add - * a special case for tiny vertex buffers. + * Currently only the general-purpose vsplit implementation. */ -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ); -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw); +struct draw_pt_front_end *draw_pt_vsplit(struct draw_context *draw); /* Middle-ends: @@ -223,7 +221,9 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, struct draw_vertex_info *info ); void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, - boolean bypass_clipping, + boolean clip_xy, + boolean clip_z, + boolean clip_user, boolean bypass_viewport, boolean opengl, boolean need_edgeflags ); @@ -237,6 +237,7 @@ void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ); * Utils: */ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr); +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr); #endif diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c deleted file mode 100644 index 88f4d9f495a..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_elts.c +++ /dev/null @@ -1,89 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <[email protected]> - */ - -#include "draw/draw_pt.h" -#include "draw/draw_private.h" - -/* Neat get_elt func that also works for varrays drawing by encoding - * the start value into a pointer. 
- */ - -static unsigned elt_uint( const void *elts, unsigned idx ) -{ - return *(((const uint *)elts) + idx); -} - -static unsigned elt_ushort( const void *elts, unsigned idx ) -{ - return *(((const ushort *)elts) + idx); -} - -static unsigned elt_ubyte( const void *elts, unsigned idx ) -{ - return *(((const ubyte *)elts) + idx); -} - -static unsigned elt_vert( const void *elts, unsigned idx ) -{ - /* unsigned index is packed in the pointer */ - return (unsigned)(uintptr_t)elts + idx; -} - -pt_elt_func draw_pt_elt_func( struct draw_context *draw ) -{ - switch (draw->pt.user.eltSize) { - case 0: return &elt_vert; - case 1: return &elt_ubyte; - case 2: return &elt_ushort; - case 4: return &elt_uint; - default: return NULL; - } -} - -const void *draw_pt_elt_ptr( struct draw_context *draw, - unsigned start ) -{ - const char *elts = draw->pt.user.elts; - - switch (draw->pt.user.eltSize) { - case 0: - return (const void *)(((const ubyte *)NULL) + start); - case 1: - return (const void *)(((const ubyte *)elts) + start); - case 2: - return (const void *)(((const ushort *)elts) + start); - case 4: - return (const void *)(((const uint *)elts) + start); - default: - return NULL; - } -} diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index 5568fbb9f88..c8dfc16911e 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -120,9 +120,6 @@ void draw_pt_emit_prepare( struct pt_emit *emit, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* even number */ - *max_vertices = *max_vertices & ~1; } @@ -147,11 +144,6 @@ void draw_pt_emit( struct pt_emit *emit, if (vertex_count == 0) return; - if (vertex_count >= UNDEFINED_VERTEX_ID) { - assert(0); - return; - } - /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... 
*/ @@ -226,9 +218,6 @@ void draw_pt_emit_linear(struct pt_emit *emit, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... */ diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c index 5c8af17c8e3..e706b7796f8 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -191,15 +191,6 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. 
- */ - *max_vertices = *max_vertices & ~1; } @@ -210,7 +201,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -220,11 +212,6 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (fetch_count >= UNDEFINED_VERTEX_ID) { - assert(0); - return; - } - draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)fetch_count ); @@ -273,7 +260,8 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle, static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -283,9 +271,6 @@ static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, (ushort)count )) @@ -334,7 +319,8 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; struct draw_context *draw = feme->draw; @@ -344,9 +330,6 @@ static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - return FALSE; - if (!draw->render->allocate_vertices( draw->render, (ushort)feme->translate->key.output_stride, 
(ushort)count )) diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c index b8270280b64..7c198c6026d 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -102,7 +102,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle, fse->key.nr_inputs); /* inputs - fetch from api format */ fse->key.viewport = !draw->identity_viewport; - fse->key.clip = !draw->bypass_clipping; + fse->key.clip = draw->clip_xy || draw->clip_z || draw->clip_user; fse->key.const_vbuffers = 0; memset(fse->key.element, 0, @@ -175,15 +175,6 @@ static void fse_prepare( struct draw_pt_middle_end *middle, *max_vertices = (draw->render->max_vertex_buffer_bytes / (vinfo->size * 4)); - /* Return an even number of verts. - * This prevents "parity" errors when splitting long triangle strips which - * can lead to front/back culling mix-ups. - * Every other triangle in a strip has an alternate front/back orientation - * so splitting at an odd position can cause the orientation of subsequent - * triangles to get reversed. 
- */ - *max_vertices = *max_vertices & ~1; - /* Probably need to do this somewhere (or fix exec shader not to * need it): */ @@ -197,7 +188,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle, static void fse_run_linear( struct draw_pt_middle_end *middle, unsigned start, - unsigned count ) + unsigned count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -207,9 +199,6 @@ static void fse_run_linear( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)count )) @@ -265,7 +254,8 @@ fse_run(struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -275,9 +265,6 @@ fse_run(struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (fetch_count >= UNDEFINED_VERTEX_ID) - goto fail; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)fetch_count )) @@ -327,7 +314,8 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; struct draw_context *draw = fse->draw; @@ -337,9 +325,6 @@ static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, */ draw_do_flush( draw, DRAW_FLUSH_BACKEND ); - if (count >= UNDEFINED_VERTEX_ID) - return FALSE; - if (!draw->render->allocate_vertices( draw->render, (ushort)fse->key.output_stride, (ushort)count )) diff --git 
a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c index 5b16c3788e5..b72fd612451 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -100,8 +100,10 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, * but gl vs dx9 clip spaces. */ draw_pt_post_vs_prepare( fpme->post_vs, - (boolean)draw->bypass_clipping, - (boolean)draw->identity_viewport, + draw->clip_xy, + draw->clip_z, + draw->clip_user, + draw->identity_viewport, (boolean)draw->rasterizer->gl_rasterization_rules, (draw->vs.edgeflag_output ? TRUE : FALSE) ); @@ -112,16 +114,13 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, gs_out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } - /* return even number */ - *max_vertices = *max_vertices & ~1; - /* No need to prepare the shader. 
*/ vs->prepare(vs, draw); @@ -295,7 +294,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -311,6 +311,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; @@ -320,7 +321,8 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle, static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -336,6 +338,7 @@ static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, prim_info.count = count; prim_info.elts = NULL; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &count; @@ -348,7 +351,8 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -364,6 +368,7 @@ static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; diff --git 
a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 4b99bee86a0..77291e304e1 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -66,7 +66,8 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, struct draw_context *draw = fpme->draw; struct llvm_vertex_shader *shader = llvm_vertex_shader(draw->vs.vertex_shader); - struct draw_llvm_variant_key key; + char store[DRAW_LLVM_MAX_VARIANT_KEY_SIZE]; + struct draw_llvm_variant_key *key; struct draw_llvm_variant *variant = NULL; struct draw_llvm_variant_list_item *li; unsigned i; @@ -106,8 +107,10 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, * but gl vs dx9 clip spaces. */ draw_pt_post_vs_prepare( fpme->post_vs, - (boolean)draw->bypass_clipping, - (boolean)(draw->identity_viewport), + draw->clip_xy, + draw->clip_z, + draw->clip_user, + draw->identity_viewport, (boolean)draw->rasterizer->gl_rasterization_rules, (draw->vs.edgeflag_output ? 
TRUE : FALSE) ); @@ -118,21 +121,21 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, out_prim, max_vertices ); - *max_vertices = MAX2( *max_vertices, - DRAW_PIPE_MAX_VERTICES ); + *max_vertices = MAX2( *max_vertices, 4096 ); } else { - *max_vertices = DRAW_PIPE_MAX_VERTICES; + /* limit max fetches by limiting max_vertices */ + *max_vertices = 4096; } /* return even number */ *max_vertices = *max_vertices & ~1; - - draw_llvm_make_variant_key(fpme->llvm, &key); + + key = draw_llvm_make_variant_key(fpme->llvm, store); li = first_elem(&shader->variants); while(!at_end(&shader->variants, li)) { - if(memcmp(&li->base->key, &key, sizeof key) == 0) { + if(memcmp(&li->base->key, key, shader->variant_key_size) == 0) { variant = li->base; break; } @@ -155,7 +158,7 @@ llvm_middle_end_prepare( struct draw_pt_middle_end *middle, } } - variant = draw_llvm_create_variant(fpme->llvm, nr); + variant = draw_llvm_create_variant(fpme->llvm, nr, key); if (variant) { insert_at_head(&shader->variants, &variant->list_item_local); @@ -294,7 +297,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, const unsigned *fetch_elts, unsigned fetch_count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -310,6 +314,7 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; @@ -319,7 +324,8 @@ static void llvm_middle_end_run( struct draw_pt_middle_end *middle, static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, unsigned start, - unsigned count) + unsigned count, + unsigned prim_flags) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; 
@@ -335,6 +341,7 @@ static void llvm_middle_end_linear_run( struct draw_pt_middle_end *middle, prim_info.count = count; prim_info.elts = NULL; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &count; @@ -348,7 +355,8 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, unsigned start, unsigned count, const ushort *draw_elts, - unsigned draw_count ) + unsigned draw_count, + unsigned prim_flags ) { struct llvm_middle_end *fpme = (struct llvm_middle_end *)middle; struct draw_fetch_info fetch_info; @@ -364,6 +372,7 @@ llvm_middle_end_linear_run_elts( struct draw_pt_middle_end *middle, prim_info.count = draw_count; prim_info.elts = draw_elts; prim_info.prim = fpme->input_prim; + prim_info.flags = prim_flags; prim_info.primitive_count = 1; prim_info.primitive_lengths = &draw_count; diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c index 308f927b778..769409cfd67 100644 --- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c +++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c @@ -26,14 +26,26 @@ **************************************************************************/ #include "util/u_memory.h" +#include "util/u_math.h" #include "pipe/p_context.h" #include "draw/draw_context.h" #include "draw/draw_private.h" #include "draw/draw_pt.h" + +#define DO_CLIP_XY 0x1 +#define DO_CLIP_FULL_Z 0x2 +#define DO_CLIP_HALF_Z 0x4 +#define DO_CLIP_USER 0x8 +#define DO_VIEWPORT 0x10 +#define DO_EDGEFLAG 0x20 + + struct pt_post_vs { struct draw_context *draw; + unsigned flags; + boolean (*run)( struct pt_post_vs *pvs, struct draw_vertex_info *info ); }; @@ -56,186 +68,47 @@ dot4(const float *a, const float *b) a[3]*b[3]); } -static INLINE unsigned -compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr, - boolean clip_depth) -{ - unsigned mask = 0x0; - unsigned i; +#define FLAGS (0) +#define TAG(x) x##_none +#include 
"draw_cliptest_tmp.h" -#if 0 - debug_printf("compute clipmask %f %f %f %f\n", - clip[0], clip[1], clip[2], clip[3]); - assert(clip[3] != 0.0); -#endif +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT) +#define TAG(x) x##_xy_fullz_viewport +#include "draw_cliptest_tmp.h" - /* Do the hardwired planes first: - */ - if (-clip[0] + clip[3] < 0) mask |= (1<<0); - if ( clip[0] + clip[3] < 0) mask |= (1<<1); - if (-clip[1] + clip[3] < 0) mask |= (1<<2); - if ( clip[1] + clip[3] < 0) mask |= (1<<3); - if (clip_depth) { - if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */ - if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */ - } +#define FLAGS (DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT) +#define TAG(x) x##_xy_halfz_viewport +#include "draw_cliptest_tmp.h" - /* Followed by any remaining ones: - */ - for (i = 6; i < nr; i++) { - if (dot4(clip, plane[i]) < 0) - mask |= (1<<i); - } +#define FLAGS (DO_CLIP_FULL_Z | DO_VIEWPORT) +#define TAG(x) x##_fullz_viewport +#include "draw_cliptest_tmp.h" - return mask; -} +#define FLAGS (DO_CLIP_HALF_Z | DO_VIEWPORT) +#define TAG(x) x##_halfz_viewport +#include "draw_cliptest_tmp.h" +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT) +#define TAG(x) x##_xy_fullz_user_viewport +#include "draw_cliptest_tmp.h" -/* The normal case - cliptest, rhw divide, viewport transform. 
- * - * Also handle identity viewport here at the expense of a few wasted - * instructions - */ -static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs, - struct draw_vertex_info *info ) -{ - struct vertex_header *out = info->verts; - const float *scale = pvs->draw->viewport.scale; - const float *trans = pvs->draw->viewport.translate; - const unsigned pos = draw_current_shader_position_output(pvs->draw); - unsigned clipped = 0; - unsigned j; - - if (0) debug_printf("%s count, %d\n", __FUNCTION__, info->count); - - for (j = 0; j < info->count; j++) { - float *position = out->data[pos]; - - initialize_vertex_header(out); -#if 0 - debug_printf("%d) io = %p, data = %p = [%f, %f, %f, %f]\n", - j, out, position, position[0], position[1], position[2], position[3]); -#endif - - out->clip[0] = position[0]; - out->clip[1] = position[1]; - out->clip[2] = position[2]; - out->clip[3] = position[3]; - - out->vertex_id = 0xffff; - /* Disable depth clipping if depth clamping is enabled. */ - out->clipmask = compute_clipmask_gl(out->clip, - pvs->draw->plane, - pvs->draw->nr_planes, - !pvs->draw->depth_clamp); - clipped += out->clipmask; - - if (out->clipmask == 0) - { - /* divide by w */ - float w = 1.0f / position[3]; - - /* Viewport mapping */ - position[0] = position[0] * w * scale[0] + trans[0]; - position[1] = position[1] * w * scale[1] + trans[1]; - position[2] = position[2] * w * scale[2] + trans[2]; - position[3] = w; -#if 0 - debug_printf("post viewport: %f %f %f %f\n", - position[0], - position[1], - position[2], - position[3]); -#endif - } - - out = (struct vertex_header *)( (char *)out + info->stride ); - } - - return clipped != 0; -} +#define FLAGS (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT | DO_EDGEFLAG) +#define TAG(x) x##_xy_fullz_user_viewport_edgeflag +#include "draw_cliptest_tmp.h" -/* As above plus edgeflags +/* Don't want to create 64 versions of this function, so catch the + * less common ones here. 
This is looking like something which should + * be code-generated, perhaps appended to the end of the vertex + * shader. */ -static boolean -post_vs_cliptest_viewport_gl_edgeflag(struct pt_post_vs *pvs, - struct draw_vertex_info *info) -{ - unsigned j; - boolean needpipe; - - needpipe = post_vs_cliptest_viewport_gl(pvs, info); - - /* If present, copy edgeflag VS output into vertex header. - * Otherwise, leave header as is. - */ - if (pvs->draw->vs.edgeflag_output) { - struct vertex_header *out = info->verts; - int ef = pvs->draw->vs.edgeflag_output; - - for (j = 0; j < info->count; j++) { - const float *edgeflag = out->data[ef]; - out->edgeflag = !(edgeflag[0] != 1.0f); - needpipe |= !out->edgeflag; - out = (struct vertex_header *)( (char *)out + info->stride ); - } - } - return needpipe; -} - +#define FLAGS (pvs->flags) +#define TAG(x) x##_generic +#include "draw_cliptest_tmp.h" -/* If bypass_clipping is set, skip cliptest and rhw divide. - */ -static boolean post_vs_viewport( struct pt_post_vs *pvs, - struct draw_vertex_info *info ) -{ - struct vertex_header *out = info->verts; - const float *scale = pvs->draw->viewport.scale; - const float *trans = pvs->draw->viewport.translate; - const unsigned pos = draw_current_shader_position_output(pvs->draw); - unsigned j; - - if (0) debug_printf("%s\n", __FUNCTION__); - for (j = 0; j < info->count; j++) { - float *position = out->data[pos]; - - initialize_vertex_header(out); - /* Viewport mapping only, no cliptest/rhw divide - */ - position[0] = position[0] * scale[0] + trans[0]; - position[1] = position[1] * scale[1] + trans[1]; - position[2] = position[2] * scale[2] + trans[2]; - - out = (struct vertex_header *)((char *)out + info->stride); - } - - return FALSE; -} - - -/* If bypass_clipping is set and we have an identity viewport, nothing - * to do. 
- */ -static boolean post_vs_none( struct pt_post_vs *pvs, - struct draw_vertex_info *info ) -{ - struct vertex_header *out = info->verts; - unsigned j; - - if (0) debug_printf("%s\n", __FUNCTION__); - /* just initialize the vertex_id in all headers */ - for (j = 0; j < info->count; j++) { - initialize_vertex_header(out); - - out = (struct vertex_header *)((char *)out + info->stride); - } - return FALSE; -} - boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, struct draw_vertex_info *info ) { @@ -244,31 +117,72 @@ boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, - boolean bypass_clipping, + boolean clip_xy, + boolean clip_z, + boolean clip_user, boolean bypass_viewport, boolean opengl, boolean need_edgeflags ) { - if (!need_edgeflags) { - if (bypass_clipping) { - if (bypass_viewport) - pvs->run = post_vs_none; - else - pvs->run = post_vs_viewport; - } - else { - /* if (opengl) */ - pvs->run = post_vs_cliptest_viewport_gl; - } + pvs->flags = 0; + + if (clip_xy) + pvs->flags |= DO_CLIP_XY; + + if (clip_z && opengl) { + pvs->flags |= DO_CLIP_FULL_Z; + ASSIGN_4V( pvs->draw->plane[4], 0, 0, 1, 1 ); + } + + if (clip_z && !opengl) { + pvs->flags |= DO_CLIP_HALF_Z; + ASSIGN_4V( pvs->draw->plane[4], 0, 0, 1, 0 ); } - else { - /* If we need to copy edgeflags to the vertex header, it should - * mean we're running the primitive pipeline. Hence the bypass - * flags should be false. 
- */ - assert(!bypass_clipping); - assert(!bypass_viewport); - pvs->run = post_vs_cliptest_viewport_gl_edgeflag; + + if (clip_user) + pvs->flags |= DO_CLIP_USER; + + if (!bypass_viewport) + pvs->flags |= DO_VIEWPORT; + + if (need_edgeflags) + pvs->flags |= DO_EDGEFLAG; + + /* Now select the relevant function: + */ + switch (pvs->flags) { + case 0: + pvs->run = do_cliptest_none; + break; + + case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_VIEWPORT: + pvs->run = do_cliptest_xy_fullz_viewport; + break; + + case DO_CLIP_XY | DO_CLIP_HALF_Z | DO_VIEWPORT: + pvs->run = do_cliptest_xy_halfz_viewport; + break; + + case DO_CLIP_FULL_Z | DO_VIEWPORT: + pvs->run = do_cliptest_fullz_viewport; + break; + + case DO_CLIP_HALF_Z | DO_VIEWPORT: + pvs->run = do_cliptest_halfz_viewport; + break; + + case DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | DO_VIEWPORT: + pvs->run = do_cliptest_xy_fullz_user_viewport; + break; + + case (DO_CLIP_XY | DO_CLIP_FULL_Z | DO_CLIP_USER | + DO_VIEWPORT | DO_EDGEFLAG): + pvs->run = do_cliptest_xy_fullz_user_viewport_edgeflag; + break; + + default: + pvs->run = do_cliptest_generic; + break; } } diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c index f7f4f24d354..c86bdd99a33 100644 --- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c @@ -225,7 +225,7 @@ static void so_tri(struct pt_so_emit *so, int i0, int i1, int i2) #define FUNC so_run_elts #define LOCAL_VARS const ushort *elts = input_prims->elts; -#define GET_ELT(idx) (elts[start + (idx)] & ~DRAW_PIPE_FLAG_MASK) +#define GET_ELT(idx) (elts[start + (idx)]) #include "draw_so_emit_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c index 182a597cca2..513bbbed216 100644 --- a/src/gallium/auxiliary/draw/draw_pt_util.c +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -92,3 +92,10 @@ void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) 
break; } } + +unsigned draw_pt_trim_count(unsigned count, unsigned first, unsigned incr) +{ + if (count < first) + return 0; + return count - (count - first) % incr; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c deleted file mode 100644 index cd7bb7bf253..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray.c +++ /dev/null @@ -1,200 +0,0 @@ -/************************************************************************** - * - * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - **************************************************************************/ - -#include "util/u_math.h" -#include "util/u_memory.h" - -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - -#define FETCH_MAX 256 -#define DRAW_MAX (FETCH_MAX+8) - -struct varray_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned driver_fetch_max; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; -}; - - -static void varray_flush_linear(struct varray_frontend *varray, - unsigned start, unsigned count) -{ - if (count) { - assert(varray->middle->run_linear); - varray->middle->run_linear(varray->middle, start, count); - } -} - -static void varray_line_loop_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count, - boolean end ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 1) { - unsigned nr = 0, i; - - for (i = 0; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - if (end) - varray->fetch_elts[nr++] = start; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. linear */ - nr); - } -} - - - -static void varray_fan_segment(struct varray_frontend *varray, - unsigned start, - unsigned segment_start, - unsigned segment_count ) -{ - assert(segment_count < varray->fetch_max); - if (segment_count >= 2) { - unsigned nr = 0, i; - - if (segment_start != 0) - varray->fetch_elts[nr++] = start; - - for (i = 0 ; i < segment_count; i++) - varray->fetch_elts[nr++] = start + segment_start + i; - - assert(nr <= FETCH_MAX); - - varray->middle->run(varray->middle, - varray->fetch_elts, - nr, - varray->draw_elts, /* ie. 
linear */ - nr); - } -} - - - - -#define FUNC varray_run -#include "draw_pt_varray_tmp_linear.h" - -static unsigned decompose_prim[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY + 1] = { - PIPE_PRIM_POINTS, - PIPE_PRIM_LINES, - PIPE_PRIM_LINE_STRIP, /* decomposed LINELOOP */ - PIPE_PRIM_LINE_STRIP, - PIPE_PRIM_TRIANGLES, - PIPE_PRIM_TRIANGLE_STRIP, - PIPE_PRIM_TRIANGLE_FAN, - PIPE_PRIM_QUADS, - PIPE_PRIM_QUAD_STRIP, - PIPE_PRIM_POLYGON, - PIPE_PRIM_LINES_ADJACENCY, - PIPE_PRIM_LINE_STRIP_ADJACENCY, - PIPE_PRIM_TRIANGLES_ADJACENCY, - PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY -}; - - - -static void varray_prepare(struct draw_pt_front_end *frontend, - unsigned in_prim, - struct draw_pt_middle_end *middle, - unsigned opt) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - - varray->base.run = varray_run; - - varray->input_prim = in_prim; - assert(in_prim < Elements(decompose_prim)); - varray->output_prim = decompose_prim[in_prim]; - - varray->middle = middle; - middle->prepare(middle, - varray->output_prim, - opt, &varray->driver_fetch_max ); - - /* check that the max is even */ - assert((varray->driver_fetch_max & 1) == 0); - - varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max); -} - - - - -static void varray_finish(struct draw_pt_front_end *frontend) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - varray->middle->finish(varray->middle); - varray->middle = NULL; -} - -static void varray_destroy(struct draw_pt_front_end *frontend) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw) -{ - ushort i; - struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend); - if (varray == NULL) - return NULL; - - varray->base.prepare = varray_prepare; - varray->base.run = NULL; - varray->base.finish = varray_finish; - varray->base.destroy = varray_destroy; - varray->draw = draw; - - for (i = 0; i < DRAW_MAX; i++) { - varray->draw_elts[i] = i; - } - - return &varray->base; -} diff --git 
a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h deleted file mode 100644 index 7c722457c3c..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h +++ /dev/null @@ -1,238 +0,0 @@ - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - struct draw_context *draw = varray->draw; - unsigned start = (unsigned)elts; - - boolean flatfirst = (draw->rasterizer->flatshade && - draw->rasterizer->flatshade_first); - unsigned i, j; - ushort flags; - unsigned first, incr; - - varray->fetch_start = start; - - draw_pt_split_prim(varray->input_prim, &first, &incr); - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i < end; i++) { - POINT(varray, i + 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+1 < end; i += 2) { - LINE(varray, DRAW_PIPE_RESET_STIPPLE, - i + 0, i + 1); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_LINE_LOOP: - if (count >= 2) { - flags = DRAW_PIPE_RESET_STIPPLE; - - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - LINE(varray, flags, i - 1, 0); - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - case PIPE_PRIM_LINE_STRIP: - flags = DRAW_PIPE_RESET_STIPPLE; - for (j = 0; j + first <= count; j += i) { - unsigned end = 
MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 1; i < end; i++, flags = 0) { - LINE(varray, flags, i - 1, i); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLES: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i += 3) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_TRIANGLE_STRIP: - if (flatfirst) { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0, i + 1 + (i&1), i + 2 - (i&1)); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - else { - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i + 2 < end; i++) { - TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, - i + 0 + (i&1), i + 1 - (i&1), i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - } - break; - - case PIPE_PRIM_TRIANGLE_FAN: - if (count >= 3) { - if (flatfirst) { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - else { - flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end 
% incr); - for (i = 0; i+2 < end; i++) { - TRIANGLE(varray, flags, 0, i + 1, i + 2); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - } - break; - - case PIPE_PRIM_QUADS: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 4) { - QUAD(varray, i + 0, i + 1, i + 2, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - break; - - case PIPE_PRIM_QUAD_STRIP: - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+3 < end; i += 2) { - QUAD(varray, i + 2, i + 0, i + 1, i + 3); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - if (j + first + i <= count) { - varray->fetch_start -= 2; - i -= 2; - } - } - break; - - case PIPE_PRIM_POLYGON: - { - /* These bitflags look a little odd because we submit the - * vertices as (1,2,0) to satisfy flatshade requirements. - */ - const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; - const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; - const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; - - flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; - for (j = 0; j + first <= count; j += i) { - unsigned end = MIN2(FETCH_MAX, count - j); - end -= (end % incr); - for (i = 0; i+2 < end; i++, flags = edge_middle) { - - if (i + 3 == count) - flags |= edge_last; - - TRIANGLE(varray, flags, i + 1, i + 2, 0); - } - i = end; - fetch_init(varray, end); - varray_flush(varray); - } - } - break; - - default: - assert(0); - break; - } - - varray_flush(varray); -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h deleted file mode 100644 index 55e43b2a714..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h +++ /dev/null @@ -1,103 +0,0 @@ -static unsigned trim( 
unsigned count, unsigned first, unsigned incr ) -{ - /* - * count either has been trimmed in draw_pt_arrays or is set to - * (driver)_fetch_max which is hopefully always larger than first. - */ - assert(count >= first); - return count - (count - first) % incr; -} - -static void FUNC(struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned count) -{ - struct varray_frontend *varray = (struct varray_frontend *)frontend; - unsigned start = (unsigned) ((char *) elts - (char *) NULL); - - unsigned j; - unsigned first, incr; - - assert(elt_bias == 0); - - draw_pt_split_prim(varray->input_prim, &first, &incr); - - /* Sanitize primitive length: - */ - count = trim(count, first, incr); - if (count < first) - return; - -#if 0 - debug_printf("%s (%d) %d/%d\n", __FUNCTION__, - varray->input_prim, - start, count); -#endif - - switch (varray->input_prim) { - case PIPE_PRIM_POINTS: - case PIPE_PRIM_LINES: - case PIPE_PRIM_TRIANGLES: - case PIPE_PRIM_LINE_STRIP: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_QUADS: - case PIPE_PRIM_QUAD_STRIP: - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr ); - varray_flush_linear(varray, start + j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - case PIPE_PRIM_LINE_LOOP: - /* Always have to decompose as we've stated that this will be - * emitted as a line-strip. 
- */ - for (j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_line_loop_segment(varray, start, j, nr, nr == remaining); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - break; - - - case PIPE_PRIM_POLYGON: - case PIPE_PRIM_TRIANGLE_FAN: - if (count < varray->driver_fetch_max) { - varray_flush_linear(varray, start, count); - } - else { - for ( j = 0; j < count;) { - unsigned remaining = count - j; - unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); - varray_fan_segment(varray, start, j, nr); - j += nr; - if (nr != remaining) - j -= (first - incr); - } - } - break; - - default: - assert(0); - break; - } -} - -#undef TRIANGLE -#undef QUAD -#undef POINT -#undef LINE -#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c deleted file mode 100644 index a848b54f7d2..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache.c +++ /dev/null @@ -1,610 +0,0 @@ -/************************************************************************** - * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. - * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR - * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - **************************************************************************/ - - /* - * Authors: - * Keith Whitwell <[email protected]> - */ - -#include "util/u_memory.h" -#include "util/u_prim.h" -#include "draw/draw_context.h" -#include "draw/draw_private.h" -#include "draw/draw_pt.h" - - -#define CACHE_MAX 256 -#define FETCH_MAX 256 -#define DRAW_MAX (16*1024) - - -struct vcache_frontend { - struct draw_pt_front_end base; - struct draw_context *draw; - - unsigned in[CACHE_MAX]; - ushort out[CACHE_MAX]; - - ushort draw_elts[DRAW_MAX]; - unsigned fetch_elts[FETCH_MAX]; - - unsigned draw_count; - unsigned fetch_count; - unsigned fetch_max; - - struct draw_pt_middle_end *middle; - - unsigned input_prim; - unsigned output_prim; - - unsigned middle_prim; - unsigned opt; -}; - - -static INLINE void -vcache_flush( struct vcache_frontend *vcache ) -{ - if (vcache->middle_prim != vcache->output_prim) { - vcache->middle_prim = vcache->output_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - if (vcache->draw_count) { - vcache->middle->run( vcache->middle, - vcache->fetch_elts, - vcache->fetch_count, - vcache->draw_elts, - vcache->draw_count ); - } - - memset(vcache->in, ~0, sizeof(vcache->in)); - vcache->fetch_count = 0; - vcache->draw_count = 0; -} - - -static INLINE void -vcache_check_flush( struct vcache_frontend *vcache ) -{ - if (vcache->draw_count + 6 >= DRAW_MAX || - vcache->fetch_count + 6 >= FETCH_MAX) 
{ - vcache_flush( vcache ); - } -} - - -static INLINE void -vcache_elt( struct vcache_frontend *vcache, - unsigned felt, - ushort flags ) -{ - unsigned idx = felt % CACHE_MAX; - - if (vcache->in[idx] != felt) { - assert(vcache->fetch_count < FETCH_MAX); - - vcache->in[idx] = felt; - vcache->out[idx] = (ushort)vcache->fetch_count; - vcache->fetch_elts[vcache->fetch_count++] = felt; - } - - vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags; -} - - - -static INLINE void -vcache_triangle( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1, - unsigned i2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, i2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line( struct vcache_frontend *vcache, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_flags( struct vcache_frontend *vcache, - ushort flags, - unsigned i0, - unsigned i1 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_point( struct vcache_frontend *vcache, - unsigned i0 ) -{ - vcache_elt(vcache, i0, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_adj_flags( struct vcache_frontend *vcache, - unsigned flags, - unsigned a0, unsigned i0, unsigned i1, unsigned a1 ) -{ - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_line_adj( struct vcache_frontend *vcache, - unsigned a0, unsigned i0, unsigned i1, unsigned a1 ) -{ - 
vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_adj_flags( struct vcache_frontend *vcache, - unsigned flags, - unsigned i0, unsigned a0, - unsigned i1, unsigned a1, - unsigned i2, unsigned a2 ) -{ - vcache_elt(vcache, i0, flags); - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_elt(vcache, i2, 0); - vcache_elt(vcache, a2, 0); - vcache_check_flush(vcache); -} - - -static INLINE void -vcache_triangle_adj( struct vcache_frontend *vcache, - unsigned i0, unsigned a0, - unsigned i1, unsigned a1, - unsigned i2, unsigned a2 ) -{ - vcache_elt(vcache, i0, 0); - vcache_elt(vcache, a0, 0); - vcache_elt(vcache, i1, 0); - vcache_elt(vcache, a1, 0); - vcache_elt(vcache, i2, 0); - vcache_elt(vcache, a2, 0); - vcache_check_flush(vcache); -} - - -/* At least for now, we're back to using a template include file for - * this. The two paths aren't too different though - it may be - * possible to reunify them. 
- */ -#define TRIANGLE(flags,i0,i1,i2) vcache_triangle_flags(vcache,flags,i0,i1,i2) -#define LINE(flags,i0,i1) vcache_line_flags(vcache,flags,i0,i1) -#define POINT(i0) vcache_point(vcache,i0) -#define LINE_ADJ(flags,a0,i0,i1,a1) \ - vcache_line_adj_flags(vcache,flags,a0,i0,i1,a1) -#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \ - vcache_triangle_adj_flags(vcache,flags,i0,a0,i1,a1,i2,a2) -#define FUNC vcache_run_extras -#include "draw_pt_vcache_tmp.h" - -#define TRIANGLE(flags,i0,i1,i2) vcache_triangle(vcache,i0,i1,i2) -#define LINE(flags,i0,i1) vcache_line(vcache,i0,i1) -#define POINT(i0) vcache_point(vcache,i0) -#define LINE_ADJ(flags,a0,i0,i1,a1) \ - vcache_line_adj(vcache,a0,i0,i1,a1) -#define TRIANGLE_ADJ(flags,i0,a0,i1,a1,i2,a2) \ - vcache_triangle_adj(vcache,i0,a0,i1,a1,i2,a2) -#define FUNC vcache_run -#include "draw_pt_vcache_tmp.h" - -static INLINE void -rebase_uint_elts( const unsigned *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -rebase_ushort_elts( const ushort *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -rebase_ubyte_elts( const ubyte *src, - unsigned count, - int delta, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i] + delta); -} - - -static INLINE void -translate_uint_elts( const unsigned *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - -static INLINE void -translate_ushort_elts( const ushort *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - -static INLINE void -translate_ubyte_elts( const ubyte *src, - unsigned count, - ushort *dest ) -{ - unsigned i; - for (i = 0; i < count; i++) - dest[i] = (ushort)(src[i]); -} - - - - 
-#if 0 -static INLINE enum pipe_format -format_from_get_elt( pt_elt_func get_elt ) -{ - switch (draw->pt.user.eltSize) { - case 1: return PIPE_FORMAT_R8_UNORM; - case 2: return PIPE_FORMAT_R16_UNORM; - case 4: return PIPE_FORMAT_R32_UNORM; - default: return PIPE_FORMAT_NONE; - } -} -#endif - - -/** - * Check if any vertex attributes use instance divisors. - * Note that instance divisors complicate vertex fetching so we need - * to take the vcache path when they're in use. - */ -static boolean -any_instance_divisors(const struct draw_context *draw) -{ - uint i; - - for (i = 0; i < draw->pt.nr_vertex_elements; i++) { - uint div = draw->pt.vertex_element[i].instance_divisor; - if (div) - return TRUE; - } - return FALSE; -} - - -static INLINE void -vcache_check_run( struct draw_pt_front_end *frontend, - pt_elt_func get_elt, - const void *elts, - int elt_bias, - unsigned draw_count ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - struct draw_context *draw = vcache->draw; - const unsigned min_index = draw->pt.user.min_index; - const unsigned max_index = draw->pt.user.max_index; - const unsigned index_size = draw->pt.user.eltSize; - unsigned fetch_count; - const ushort *transformed_elts; - ushort *storage = NULL; - boolean ok = FALSE; - - /* debug: verify indexes are in range [min_index, max_index] */ - if (0) { - unsigned i; - for (i = 0; i < draw_count; i++) { - if (index_size == 1) { - assert( ((const ubyte *) elts)[i] >= min_index); - assert( ((const ubyte *) elts)[i] <= max_index); - } - else if (index_size == 2) { - assert( ((const ushort *) elts)[i] >= min_index); - assert( ((const ushort *) elts)[i] <= max_index); - } - else { - assert(index_size == 4); - assert( ((const uint *) elts)[i] >= min_index); - assert( ((const uint *) elts)[i] <= max_index); - } - } - } - - /* Note: max_index is frequently 0xffffffff so we have to be sure - * that any arithmetic involving max_index doesn't overflow! 
- */ - if (max_index >= (unsigned) DRAW_PIPE_MAX_VERTICES) - goto fail; - - if (any_instance_divisors(draw)) - goto fail; - - fetch_count = max_index + 1 - min_index; - - if (0) - debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, - vcache->fetch_max, - draw_count); - - if (elt_bias + max_index >= DRAW_PIPE_MAX_VERTICES || - fetch_count >= UNDEFINED_VERTEX_ID || - fetch_count > draw_count) { - if (0) debug_printf("fail\n"); - goto fail; - } - - if (vcache->middle_prim != vcache->input_prim) { - vcache->middle_prim = vcache->input_prim; - vcache->middle->prepare( vcache->middle, - vcache->middle_prim, - vcache->opt, - &vcache->fetch_max ); - } - - assert((elt_bias >= 0 && min_index + elt_bias >= min_index) || - (elt_bias < 0 && min_index + elt_bias < min_index)); - - if (min_index == 0 && - index_size == 2) { - transformed_elts = (const ushort *)elts; - } - else { - storage = MALLOC( draw_count * sizeof(ushort) ); - if (!storage) - goto fail; - - if (min_index == 0) { - switch(index_size) { - case 1: - translate_ubyte_elts( (const ubyte *)elts, - draw_count, - storage ); - break; - - case 2: - translate_ushort_elts( (const ushort *)elts, - draw_count, - storage ); - break; - - case 4: - translate_uint_elts( (const uint *)elts, - draw_count, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - else { - switch(index_size) { - case 1: - rebase_ubyte_elts( (const ubyte *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 2: - rebase_ushort_elts( (const ushort *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - case 4: - rebase_uint_elts( (const uint *)elts, - draw_count, - 0 - (int)min_index, - storage ); - break; - - default: - assert(0); - FREE(storage); - return; - } - } - transformed_elts = storage; - } - - if (fetch_count < UNDEFINED_VERTEX_ID) - ok = vcache->middle->run_linear_elts( vcache->middle, - min_index + elt_bias, /* start */ - fetch_count, - transformed_elts, - 
draw_count ); - - FREE(storage); - - if (ok) - return; - - debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n", - fetch_count, draw_count); - -fail: - vcache_run( frontend, get_elt, elts, elt_bias, draw_count ); -} - - - - -static void -vcache_prepare( struct draw_pt_front_end *frontend, - unsigned in_prim, - struct draw_pt_middle_end *middle, - unsigned opt ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - - if (opt & PT_PIPELINE) { - vcache->base.run = vcache_run_extras; - } - else { - vcache->base.run = vcache_check_run; - } - - /* VCache will always emit the reduced version of its input - * primitive, ie STRIP/FANS become TRIS, etc. - * - * This is not to be confused with what the GS might be up to, - * which is a separate issue. - */ - vcache->input_prim = in_prim; - switch (in_prim) { - case PIPE_PRIM_LINES_ADJACENCY: - case PIPE_PRIM_LINE_STRIP_ADJACENCY: - vcache->output_prim = PIPE_PRIM_LINES_ADJACENCY; - break; - case PIPE_PRIM_TRIANGLES_ADJACENCY: - case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: - vcache->output_prim = PIPE_PRIM_TRIANGLES_ADJACENCY; - break; - default: - vcache->output_prim = u_reduced_prim(in_prim); - } - - vcache->middle = middle; - vcache->opt = opt; - - /* Have to run prepare here, but try and guess a good prim for - * doing so: - */ - vcache->middle_prim = (opt & PT_PIPELINE) - ? 
vcache->output_prim : vcache->input_prim; - - middle->prepare( middle, - vcache->middle_prim, - opt, &vcache->fetch_max ); -} - - -static void -vcache_finish( struct draw_pt_front_end *frontend ) -{ - struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; - vcache->middle->finish( vcache->middle ); - vcache->middle = NULL; -} - - -static void -vcache_destroy( struct draw_pt_front_end *frontend ) -{ - FREE(frontend); -} - - -struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ) -{ - struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend ); - if (vcache == NULL) - return NULL; - - vcache->base.prepare = vcache_prepare; - vcache->base.run = NULL; - vcache->base.finish = vcache_finish; - vcache->base.destroy = vcache_destroy; - vcache->draw = draw; - - memset(vcache->in, ~0, sizeof(vcache->in)); - - return &vcache->base; -} diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h deleted file mode 100644 index 1a3748d5f0b..00000000000 --- a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h +++ /dev/null @@ -1,19 +0,0 @@ -#define FUNC_VARS \ - struct draw_pt_front_end *frontend, \ - pt_elt_func get_elt, \ - const void *elts, \ - int elt_bias, \ - unsigned count - -#define LOCAL_VARS \ - struct vcache_frontend *vcache = (struct vcache_frontend *) frontend; \ - struct draw_context *draw = vcache->draw; \ - const unsigned prim = vcache->input_prim; \ - const boolean last_vertex_last = !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); - -#define GET_ELT(idx) (get_elt(elts, idx) + elt_bias) - -#define FUNC_EXIT do { vcache_flush(vcache); } while (0) - -#include "draw_decompose_tmp.h" diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit.c b/src/gallium/auxiliary/draw/draw_pt_vsplit.c new file mode 100644 index 00000000000..a6875253094 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit.c @@ -0,0 +1,208 @@ +/* + * Mesa 3-D graphics library + * Version: 
7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +#define SEGMENT_SIZE 1024 +#define MAP_SIZE 256 + +struct vsplit_frontend { + struct draw_pt_front_end base; + struct draw_context *draw; + + unsigned prim; + + struct draw_pt_middle_end *middle; + + unsigned max_vertices; + ushort segment_size; + + /* buffers for splitting */ + unsigned fetch_elts[SEGMENT_SIZE]; + ushort draw_elts[SEGMENT_SIZE]; + ushort identity_draw_elts[SEGMENT_SIZE]; + + struct { + /* map a fetch element to a draw element */ + unsigned fetches[MAP_SIZE]; + ushort draws[MAP_SIZE]; + boolean has_max_fetch; + + ushort num_fetch_elts; + ushort num_draw_elts; + } cache; +}; + + +static void +vsplit_clear_cache(struct vsplit_frontend *vsplit) +{ + memset(vsplit->cache.fetches, 0xff, sizeof(vsplit->cache.fetches)); + vsplit->cache.has_max_fetch = FALSE; + vsplit->cache.num_fetch_elts = 0; + vsplit->cache.num_draw_elts = 0; +} + +static void +vsplit_flush_cache(struct vsplit_frontend *vsplit, unsigned flags) +{ + vsplit->middle->run(vsplit->middle, + vsplit->fetch_elts, vsplit->cache.num_fetch_elts, + vsplit->draw_elts, vsplit->cache.num_draw_elts, flags); +} + +/** + * Add a fetch element and add it to the draw elements. + */ +static INLINE void +vsplit_add_cache(struct vsplit_frontend *vsplit, unsigned fetch) +{ + unsigned hash = fetch % MAP_SIZE; + + if (vsplit->cache.fetches[hash] != fetch) { + /* update cache */ + vsplit->cache.fetches[hash] = fetch; + vsplit->cache.draws[hash] = vsplit->cache.num_fetch_elts; + + /* add fetch */ + assert(vsplit->cache.num_fetch_elts < vsplit->segment_size); + vsplit->fetch_elts[vsplit->cache.num_fetch_elts++] = fetch; + } + + vsplit->draw_elts[vsplit->cache.num_draw_elts++] = vsplit->cache.draws[hash]; +} + + +/** + * Add a fetch element and add it to the draw elements. The fetch element is + * in full range (uint). 
+ */ +static INLINE void +vsplit_add_cache_uint(struct vsplit_frontend *vsplit, unsigned fetch) +{ + /* special care for 0xffffffff */ + if (fetch == 0xffffffff && !vsplit->cache.has_max_fetch) { + unsigned hash = fetch % MAP_SIZE; + vsplit->cache.fetches[hash] = fetch - 1; /* force update */ + vsplit->cache.has_max_fetch = TRUE; + } + + vsplit_add_cache(vsplit, fetch); +} + + +#define FUNC vsplit_run_linear +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ubyte +#define ELT_TYPE ubyte +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_ushort +#define ELT_TYPE ushort +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + +#define FUNC vsplit_run_uint +#define ELT_TYPE uint +#define ADD_CACHE(vsplit, fetch) vsplit_add_cache_uint(vsplit, fetch) +#include "draw_pt_vsplit_tmp.h" + + +static void vsplit_prepare(struct draw_pt_front_end *frontend, + unsigned in_prim, + struct draw_pt_middle_end *middle, + unsigned opt) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + + switch (vsplit->draw->pt.user.eltSize) { + case 0: + vsplit->base.run = vsplit_run_linear; + break; + case 1: + vsplit->base.run = vsplit_run_ubyte; + break; + case 2: + vsplit->base.run = vsplit_run_ushort; + break; + case 4: + vsplit->base.run = vsplit_run_uint; + break; + default: + assert(0); + break; + } + + /* split only */ + vsplit->prim = in_prim; + + vsplit->middle = middle; + middle->prepare(middle, vsplit->prim, opt, &vsplit->max_vertices); + + vsplit->segment_size = MIN2(SEGMENT_SIZE, vsplit->max_vertices); +} + + +static void vsplit_finish(struct draw_pt_front_end *frontend) +{ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; + vsplit->middle->finish(vsplit->middle); + vsplit->middle = NULL; +} + + +static void vsplit_destroy(struct draw_pt_front_end *frontend) +{ + FREE(frontend); +} + + +struct draw_pt_front_end 
*draw_pt_vsplit(struct draw_context *draw) +{ + struct vsplit_frontend *vsplit = CALLOC_STRUCT(vsplit_frontend); + ushort i; + + if (!vsplit) + return NULL; + + vsplit->base.prepare = vsplit_prepare; + vsplit->base.run = NULL; + vsplit->base.finish = vsplit_finish; + vsplit->base.destroy = vsplit_destroy; + vsplit->draw = draw; + + for (i = 0; i < SEGMENT_SIZE; i++) + vsplit->identity_draw_elts[i] = i; + + return &vsplit->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h new file mode 100644 index 00000000000..3f66f962e11 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vsplit_tmp.h @@ -0,0 +1,309 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#define CONCAT2(name, elt_type) name ## elt_type +#define CONCAT(name, elt_type) CONCAT2(name, elt_type) + +#ifdef ELT_TYPE + +/** + * Fetch all elements in [min_index, max_index] with bias, and use the + * (rebased) index buffer as the draw elements. + */ +static boolean +CONCAT(vsplit_primitive_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned istart, unsigned icount) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) + ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset); + const unsigned min_index = draw->pt.user.min_index; + const unsigned max_index = draw->pt.user.max_index; + const int elt_bias = draw->pt.user.eltBias; + unsigned fetch_start, fetch_count; + const ushort *draw_elts = NULL; + unsigned i; + + /* use the ib directly */ + if (min_index == 0 && sizeof(ib[0]) == sizeof(draw_elts[0])) { + if (icount > vsplit->max_vertices) + return FALSE; + + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + assert(idx >= min_index && idx <= max_index); + } + draw_elts = (const ushort *) ib; + } + else { + /* have to go through vsplit->draw_elts */ + if (icount > vsplit->segment_size) + return FALSE; + } + + /* this is faster only when we fetch less elements than the normal path */ + if (max_index - min_index > icount - 1) + return FALSE; + + if (elt_bias < 0 && min_index < -elt_bias) + return FALSE; + + /* why this check? 
*/ + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + if (draw->pt.vertex_element[i].instance_divisor) + return FALSE; + } + + fetch_start = min_index + elt_bias; + fetch_count = max_index - min_index + 1; + + if (!draw_elts) { + if (min_index == 0) { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) idx; + } + } + else { + for (i = 0; i < icount; i++) { + ELT_TYPE idx = ib[istart + i]; + + assert(idx >= min_index && idx <= max_index); + vsplit->draw_elts[i] = (ushort) (idx - min_index); + } + } + + draw_elts = vsplit->draw_elts; + } + + return vsplit->middle->run_linear_elts(vsplit->middle, + fetch_start, fetch_count, + draw_elts, icount, 0x0); +} + +/** + * Use the cache to prepare the fetch and draw elements, and flush. + * + * When spoken is TRUE, ispoken replaces istart; When close is TRUE, iclose is + * appended. + */ +static INLINE void +CONCAT(vsplit_segment_cache_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, unsigned icount, + boolean spoken, unsigned ispoken, + boolean close, unsigned iclose) +{ + struct draw_context *draw = vsplit->draw; + const ELT_TYPE *ib = (const ELT_TYPE *) + ((const char *) draw->pt.user.elts + draw->pt.index_buffer.offset); + const int ibias = draw->pt.user.eltBias; + unsigned i; + + assert(icount + !!close <= vsplit->segment_size); + + vsplit_clear_cache(vsplit); + + spoken = !!spoken; + if (ibias == 0) { + if (spoken) + ADD_CACHE(vsplit, ib[ispoken]); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, ib[istart + i]); + + if (close) + ADD_CACHE(vsplit, ib[iclose]); + } + else if (ibias > 0) { + if (spoken) + ADD_CACHE(vsplit, (uint) ib[ispoken] + ibias); + + for (i = spoken; i < icount; i++) + ADD_CACHE(vsplit, (uint) ib[istart + i] + ibias); + + if (close) + ADD_CACHE(vsplit, (uint) ib[iclose] + ibias); + } + else { + if (spoken) { + if (ib[ispoken] < -ibias) + return; + 
ADD_CACHE(vsplit, ib[ispoken] + ibias); + } + + for (i = spoken; i < icount; i++) { + if (ib[istart + i] < -ibias) + return; + ADD_CACHE(vsplit, ib[istart + i] + ibias); + } + + if (close) { + if (ib[iclose] < -ibias) + return; + ADD_CACHE(vsplit, ib[iclose] + ibias); + } + } + + vsplit_flush_cache(vsplit, flags); +} + +static void +CONCAT(vsplit_segment_simple_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount) +{ + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, FALSE, 0); +} + +static void +CONCAT(vsplit_segment_loop_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean close_loop = ((flags) == DRAW_SPLIT_BEFORE); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, FALSE, 0, close_loop, i0); +} + +static void +CONCAT(vsplit_segment_fan_, ELT_TYPE)(struct vsplit_frontend *vsplit, + unsigned flags, + unsigned istart, + unsigned icount, + unsigned i0) +{ + const boolean use_spoken = (((flags) & DRAW_SPLIT_BEFORE) != 0); + + CONCAT(vsplit_segment_cache_, ELT_TYPE)(vsplit, + flags, istart, icount, use_spoken, i0, FALSE, 0); +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->segment_size; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) \ + CONCAT(vsplit_primitive_, ELT_TYPE)(vsplit, istart, icount) + +#else /* ELT_TYPE */ + +static void +vsplit_segment_simple_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount) +{ + assert(icount <= vsplit->max_vertices); + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); +} + +static void +vsplit_segment_loop_linear(struct vsplit_frontend *vsplit, unsigned 
flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean close_loop = (flags == DRAW_SPLIT_BEFORE); + unsigned nr; + + assert(icount + !!close_loop <= vsplit->segment_size); + + if (close_loop) { + for (nr = 0; nr < icount; nr++) + vsplit->fetch_elts[nr] = istart + nr; + vsplit->fetch_elts[nr++] = i0; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +static void +vsplit_segment_fan_linear(struct vsplit_frontend *vsplit, unsigned flags, + unsigned istart, unsigned icount, unsigned i0) +{ + boolean use_spoken = ((flags & DRAW_SPLIT_BEFORE) != 0); + unsigned nr = 0, i; + + assert(icount + !!use_spoken <= vsplit->segment_size); + + if (use_spoken) { + vsplit->fetch_elts[nr++] = i0; + for (i = 1 ; i < icount; i++) + vsplit->fetch_elts[nr++] = istart + i; + + vsplit->middle->run(vsplit->middle, vsplit->fetch_elts, nr, + vsplit->identity_draw_elts, nr, flags); + } + else { + vsplit->middle->run_linear(vsplit->middle, istart, icount, flags); + } +} + +#define LOCAL_VARS \ + struct vsplit_frontend *vsplit = (struct vsplit_frontend *) frontend; \ + const unsigned prim = vsplit->prim; \ + const unsigned max_count_simple = vsplit->max_vertices; \ + const unsigned max_count_loop = vsplit->segment_size - 1; \ + const unsigned max_count_fan = vsplit->segment_size; + +#define PRIMITIVE(istart, icount) FALSE + +#define ELT_TYPE linear + +#endif /* ELT_TYPE */ + +#define FUNC_VARS \ + struct draw_pt_front_end *frontend, \ + unsigned start, \ + unsigned count + +#define SEGMENT_SIMPLE(flags, istart, icount) \ + CONCAT(vsplit_segment_simple_, ELT_TYPE)(vsplit, flags, istart, icount) + +#define SEGMENT_LOOP(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_loop_, ELT_TYPE)(vsplit, flags, istart, icount, i0) + +#define SEGMENT_FAN(flags, istart, icount, i0) \ + CONCAT(vsplit_segment_fan_, ELT_TYPE)(vsplit, flags, istart, 
icount, i0) + +#include "draw_split_tmp.h" + +#undef CONCAT2 +#undef CONCAT + +#undef ELT_TYPE +#undef ADD_CACHE diff --git a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h index 6d8937a0b41..7fafde9d5e6 100644 --- a/src/gallium/auxiliary/draw/draw_so_emit_tmp.h +++ b/src/gallium/auxiliary/draw/draw_so_emit_tmp.h @@ -7,11 +7,9 @@ #define FUNC_ENTER \ /* declare more local vars */ \ - struct draw_context *draw = so->draw; \ const unsigned prim = input_prims->prim; \ - const boolean last_vertex_last = \ - !(draw->rasterizer->flatshade && \ - draw->rasterizer->flatshade_first); \ + const unsigned prim_flags = input_prims->flags; \ + const boolean last_vertex_last = TRUE; \ do { \ debug_assert(input_prims->primitive_count == 1); \ switch (prim) { \ diff --git a/src/gallium/auxiliary/draw/draw_split_tmp.h b/src/gallium/auxiliary/draw/draw_split_tmp.h new file mode 100644 index 00000000000..47defc62b96 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_split_tmp.h @@ -0,0 +1,176 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +static void +FUNC(FUNC_VARS) +{ + unsigned first, incr; + LOCAL_VARS + + /* + * prim, start, count, and max_count_{simple,loop,fan} should have been + * defined + */ + if (0) { + debug_printf("%s: prim 0x%x, start %d, count %d, max_count_simple %d, " + "max_count_loop %d, max_count_fan %d\n", + __FUNCTION__, prim, start, count, max_count_simple, + max_count_loop, max_count_fan); + } + + draw_pt_split_prim(prim, &first, &incr); + /* sanitize primitive length */ + count = draw_pt_trim_count(count, first, incr); + if (count < first) + return; + + /* try flushing the entire primitive */ + if (PRIMITIVE(start, count)) + return; + + /* must be able to at least flush two complete primitives */ + assert(max_count_simple >= first + incr && + max_count_loop >= first + incr && + max_count_fan >= first + incr); + + /* no splitting required */ + if (count <= max_count_simple) { + SEGMENT_SIMPLE(0x0, start, count); + } + else { + const unsigned rollback = first - incr; + unsigned flags = DRAW_SPLIT_AFTER, seg_start = 0, seg_max; + + /* + * Both count and seg_max below are explicitly trimmed. Because + * + * seg_start = N * (seg_max - rollback) = N' * incr, + * + * we have + * + * remaining = count - seg_start = first + N'' * incr. + * + * That is, remaining is implicitly trimmed. 
+ */ + switch (prim) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + case PIPE_PRIM_LINES_ADJACENCY: + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + case PIPE_PRIM_TRIANGLES_ADJACENCY: + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + seg_max = + draw_pt_trim_count(MIN2(max_count_simple, count), first, incr); + if (prim == PIPE_PRIM_TRIANGLE_STRIP || + prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY) { + /* make sure we flush even number of triangles at a time */ + if (seg_max < count && !(((seg_max - first) / incr) & 1)) + seg_max -= incr; + } + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_SIMPLE(flags, start + seg_start, seg_max); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_SIMPLE(flags, start + seg_start, remaining); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_LINE_LOOP: + seg_max = + draw_pt_trim_count(MIN2(max_count_loop, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_LOOP(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_LOOP(flags, start + seg_start, remaining, start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + seg_max = + draw_pt_trim_count(MIN2(max_count_fan, count), first, incr); + + do { + const unsigned remaining = count - seg_start; + + if (remaining > seg_max) { + SEGMENT_FAN(flags, start + seg_start, seg_max, start); + seg_start += seg_max - rollback; + + flags |= DRAW_SPLIT_BEFORE; + } + else { + flags &= ~DRAW_SPLIT_AFTER; + + SEGMENT_FAN(flags, start + seg_start, remaining, 
start); + seg_start += remaining; + } + } while (seg_start < count); + break; + + default: + assert(0); + break; + } + } +} + +#undef FUNC +#undef FUNC_VARS +#undef LOCAL_VARS + +#undef PRIMITIVE +#undef SEGMENT_SIMPLE +#undef SEGMENT_LOOP +#undef SEGMENT_FAN diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c index d13ad24fff0..fa9992db783 100644 --- a/src/gallium/auxiliary/draw/draw_vs_llvm.c +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -28,6 +28,7 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "pipe/p_shader_tokens.h" +#include "pipe/p_screen.h" #include "draw_private.h" #include "draw_context.h" @@ -109,6 +110,11 @@ draw_create_vs_llvm(struct draw_context *draw, tgsi_scan_shader(state->tokens, &vs->base.info); + vs->variant_key_size = + draw_llvm_variant_key_size( + vs->base.info.file_max[TGSI_FILE_INPUT]+1, + vs->base.info.file_max[TGSI_FILE_SAMPLER]+1); + vs->base.draw = draw; vs->base.prepare = vs_llvm_prepare; vs->base.run_linear = vs_llvm_run_linear; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index 7b35dd4bb49..e0d30be98d9 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -59,14 +59,6 @@ #include "lp_bld_arit.h" -/* - * XXX: Increasing eliminates some artifacts, but adds others, most - * noticeably corruption in the Earth halo in Google Earth. 
- */ -#define RCP_NEWTON_STEPS 0 - -#define RSQRT_NEWTON_STEPS 0 - #define EXP_POLY_DEGREE 3 #define LOG_POLY_DEGREE 5 @@ -267,7 +259,7 @@ lp_build_add(struct lp_build_context *bld, } -/** Return the sum of the elements of a */ +/** Return the scalar sum of the elements of a */ LLVMValueRef lp_build_sum_vector(struct lp_build_context *bld, LLVMValueRef a) @@ -278,11 +270,9 @@ lp_build_sum_vector(struct lp_build_context *bld, assert(lp_check_value(type, a)); - if (a == bld->zero) - return bld->zero; - if (a == bld->undef) - return bld->undef; - assert(type.length > 1); + if (type.length == 1) { + return a; + } assert(!bld->type.norm); @@ -546,7 +536,7 @@ lp_build_mul_imm(struct lp_build_context *bld, if(b == 2 && bld->type.floating) return lp_build_add(bld, a, a); - if(util_is_pot(b)) { + if(util_is_power_of_two(b)) { unsigned shift = ffs(b) - 1; if(bld->type.floating) { @@ -1266,6 +1256,11 @@ lp_build_sqrt(struct lp_build_context *bld, * * x_{i+1} = x_i * (2 - a * x_i) * + * XXX: Unfortunately this won't give IEEE-754 conformant results for 0 or + * +/-Inf, giving NaN instead. Certain applications rely on this behavior, + * such as Google Earth, which does RCP(RSQRT(0.0) when drawing the Earth's + * halo. It would be necessary to clamp the argument to prevent this. 
+ * * See also: * - http://en.wikipedia.org/wiki/Division_(digital)#Newton.E2.80.93Raphson_division * - http://softwarecommunity.intel.com/articles/eng/1818.htm @@ -1306,13 +1301,27 @@ lp_build_rcp(struct lp_build_context *bld, if(LLVMIsConstant(a)) return LLVMConstFDiv(bld->one, a); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + /* + * We don't use RCPPS because: + * - it only has 10bits of precision + * - it doesn't even get the reciprocate of 1.0 exactly + * - doing Newton-Rapshon steps yields wrong (NaN) values for 0.0 or Inf + * - for recent processors the benefit over DIVPS is marginal, a case + * depedent + * + * We could still use it on certain processors if benchmarks show that the + * RCPPS plus necessary workarounds are still preferrable to DIVPS; or for + * particular uses that require less workarounds. + */ + + if (FALSE && util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", bld->vec_type, a); - for (i = 0; i < RCP_NEWTON_STEPS; ++i) { + for (i = 0; i < num_iterations; ++i) { res = lp_build_rcp_refine(bld, a, res); } @@ -1363,13 +1372,14 @@ lp_build_rsqrt(struct lp_build_context *bld, assert(type.floating); - if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + if (util_cpu_caps.has_sse && type.width == 32 && type.length == 4) { + const unsigned num_iterations = 0; LLVMValueRef res; unsigned i; res = lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", bld->vec_type, a); - for (i = 0; i < RSQRT_NEWTON_STEPS; ++i) { + for (i = 0; i < num_iterations; ++i) { res = lp_build_rsqrt_refine(bld, a, res); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.c b/src/gallium/auxiliary/gallivm/lp_bld_debug.c index 39dfc51e503..d3a5afff8c2 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.c @@ -46,7 +46,7 
@@ boolean lp_check_alignment(const void *ptr, unsigned alignment) { - assert(util_is_pot(alignment)); + assert(util_is_power_of_two(alignment)); return ((uintptr_t)ptr & (alignment - 1)) == 0; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index 247cb83ce6c..92123e09d32 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -388,7 +388,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, if (format_matches_type(format_desc, type) && format_desc->block.bits <= type.width * 4 && - util_is_pot(format_desc->block.bits)) { + util_is_power_of_two(format_desc->block.bits)) { LLVMValueRef packed; /* @@ -416,7 +416,7 @@ lp_build_fetch_rgba_aos(LLVMBuilderRef builder, format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && - util_is_pot(format_desc->block.bits) && + util_is_power_of_two(format_desc->block.bits) && format_desc->block.bits <= 32 && format_desc->is_bitmask && !format_desc->is_mixed && diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp index 6d5410d9701..48baf7c425c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp @@ -40,6 +40,7 @@ #include <llvm/ExecutionEngine/ExecutionEngine.h> #include <llvm/ExecutionEngine/JITEventListener.h> #include <llvm/Support/CommandLine.h> +#include <llvm/Support/PrettyStackTrace.h> #include "pipe/p_config.h" #include "util/u_debug.h" @@ -143,7 +144,6 @@ lp_set_target_options(void) llvm::UnsafeFPMath = true; #endif -#if 0 /* * LLVM will generate MMX instructions for vectors <= 64 bits, leading to * innefficient code, and in 32bit systems, to the corruption of the FPU @@ -152,10 +152,8 @@ lp_set_target_options(void) * See also: * - http://llvm.org/bugs/show_bug.cgi?id=3287 * - 
http://l4.me.uk/post/2009/06/07/llvm-wrinkle-3-configuration-what-configuration/ - * - * XXX: Unfortunately this is not working. */ - static boolean first = FALSE; + static boolean first = TRUE; if (first) { static const char* options[] = { "prog", @@ -164,7 +162,13 @@ lp_set_target_options(void) llvm::cl::ParseCommandLineOptions(2, const_cast<char**>(options)); first = FALSE; } -#endif + + /* + * By default LLVM adds a signal handler to output a pretty stack trace. + * This signal handler is never removed, causing problems when unloading the + * shared object where the gallium driver resides. + */ + llvm::DisablePrettyStackTrace = true; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h index e470082b977..e947b90d164 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h @@ -37,6 +37,8 @@ #define LP_BLD_PACK_H +#include "pipe/p_compiler.h" + #include "gallivm/lp_bld.h" diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c index 0fd014ab9b3..259b1142e3c 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c @@ -82,9 +82,9 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, state->swizzle_a = view->swizzle_a; state->target = texture->target; - state->pot_width = util_is_pot(texture->width0); - state->pot_height = util_is_pot(texture->height0); - state->pot_depth = util_is_pot(texture->depth0); + state->pot_width = util_is_power_of_two(texture->width0); + state->pot_height = util_is_power_of_two(texture->height0); + state->pot_depth = util_is_power_of_two(texture->depth0); state->wrap_s = sampler->wrap_s; state->wrap_t = sampler->wrap_t; @@ -124,6 +124,52 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, /** + * Compute the partial offset of a pixel block along an arbitrary axis. 
+ * + * @param coord coordinate in pixels + * @param stride number of bytes between rows of successive pixel blocks + * @param block_length number of pixels in a pixels block along the coordinate + * axis + * @param out_offset resulting relative offset of the pixel block in bytes + * @param out_subcoord resulting sub-block pixel coordinate + */ +void +lp_build_sample_partial_offset(struct lp_build_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef stride, + LLVMValueRef *out_offset, + LLVMValueRef *out_subcoord) +{ + LLVMValueRef offset; + LLVMValueRef subcoord; + + if (block_length == 1) { + subcoord = bld->zero; + } + else { + /* + * Pixel blocks have power of two dimensions. LLVM should convert the + * rem/div to bit arithmetic. + * TODO: Verify this. + */ + + LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length); + subcoord = LLVMBuildURem(bld->builder, coord, block_width, ""); + coord = LLVMBuildUDiv(bld->builder, coord, block_width, ""); + } + + offset = lp_build_mul(bld, coord, stride); + + assert(out_offset); + assert(out_subcoord); + + *out_offset = offset; + *out_subcoord = subcoord; +} + + +/** * Compute the offset of a pixel block. * * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. @@ -144,48 +190,35 @@ lp_build_sample_offset(struct lp_build_context *bld, { LLVMValueRef x_stride; LLVMValueRef offset; - LLVMValueRef i; - LLVMValueRef j; - - /* - * Describe the coordinates in terms of pixel blocks. - * - * TODO: pixel blocks are power of two. LLVM should convert rem/div to - * bit arithmetic. Verify this. 
- */ - - if (format_desc->block.width == 1) { - i = bld->zero; - } - else { - LLVMValueRef block_width = lp_build_const_int_vec(bld->type, format_desc->block.width); - i = LLVMBuildURem(bld->builder, x, block_width, ""); - x = LLVMBuildUDiv(bld->builder, x, block_width, ""); - } - - if (format_desc->block.height == 1) { - j = bld->zero; - } - else { - LLVMValueRef block_height = lp_build_const_int_vec(bld->type, format_desc->block.height); - j = LLVMBuildURem(bld->builder, y, block_height, ""); - y = LLVMBuildUDiv(bld->builder, y, block_height, ""); - } x_stride = lp_build_const_vec(bld->type, format_desc->block.bits/8); - offset = lp_build_mul(bld, x, x_stride); + + lp_build_sample_partial_offset(bld, + format_desc->block.width, + x, x_stride, + &offset, out_i); if (y && y_stride) { - LLVMValueRef y_offset = lp_build_mul(bld, y, y_stride); + LLVMValueRef y_offset; + lp_build_sample_partial_offset(bld, + format_desc->block.height, + y, y_stride, + &y_offset, out_j); offset = lp_build_add(bld, offset, y_offset); } + else { + *out_j = bld->zero; + } if (z && z_stride) { - LLVMValueRef z_offset = lp_build_mul(bld, z, z_stride); + LLVMValueRef z_offset; + LLVMValueRef k; + lp_build_sample_partial_offset(bld, + 1, /* pixel blocks are always 2D */ + z, z_stride, + &z_offset, &k); offset = lp_build_add(bld, offset, z_offset); } *out_offset = offset; - *out_i = i; - *out_j = j; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index 5b8f478094b..caafc4eca04 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -36,6 +36,8 @@ #define LP_BLD_SAMPLE_H +#include "pipe/p_format.h" + #include "gallivm/lp_bld.h" struct pipe_resource; @@ -147,6 +149,15 @@ lp_sampler_static_state(struct lp_sampler_static_state *state, void +lp_build_sample_partial_offset(struct lp_build_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef stride, + 
LLVMValueRef *out_offset, + LLVMValueRef *out_i); + + +void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 806c7d56a87..1f39d9c98b5 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -176,6 +176,7 @@ texture_dims(enum pipe_texture_target tex) case PIPE_TEXTURE_1D: return 1; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_CUBE: return 2; case PIPE_TEXTURE_3D: @@ -322,59 +323,6 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, /** - * Fetch the texels as <4n x i8> in AoS form. - */ -static LLVMValueRef -lp_build_sample_packed(struct lp_build_sample_context *bld, - LLVMValueRef x, - LLVMValueRef y, - LLVMValueRef y_stride, - LLVMValueRef data_array) -{ - LLVMValueRef offset, i, j; - LLVMValueRef data_ptr; - LLVMValueRef res; - - /* convert x,y,z coords to linear offset from start of texture, in bytes */ - lp_build_sample_offset(&bld->uint_coord_bld, - bld->format_desc, - x, y, NULL, y_stride, NULL, - &offset, &i, &j); - - /* get pointer to mipmap level 0 data */ - data_ptr = lp_build_get_const_mipmap_level(bld, data_array, 0); - - if (util_format_is_rgba8_variant(bld->format_desc)) { - /* Just fetch the data directly without swizzling */ - assert(bld->format_desc->block.width == 1); - assert(bld->format_desc->block.height == 1); - assert(bld->format_desc->block.bits <= bld->texel_type.width); - - res = lp_build_gather(bld->builder, - bld->texel_type.length, - bld->format_desc->block.bits, - bld->texel_type.width, - data_ptr, offset); - } - else { - struct lp_type type; - - assert(bld->texel_type.width == 32); - - memset(&type, 0, sizeof type); - type.width = 8; - type.length = bld->texel_type.length*4; - type.norm = TRUE; - - res = lp_build_fetch_rgba_aos(bld->builder, 
bld->format_desc, type, - data_ptr, offset, i, j); - } - - return res; -} - - -/** * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes. */ static LLVMValueRef @@ -408,7 +356,7 @@ lp_build_coord_mirror(struct lp_build_sample_context *bld, /** - * We only support a few wrap modes in lp_build_sample_wrap_int() at this time. + * We only support a few wrap modes in lp_build_sample_wrap_linear_int() at this time. * Return whether the given mode is supported by that function. */ static boolean @@ -430,13 +378,18 @@ is_simple_wrap_mode(unsigned mode) * \param length the texture size along one dimension * \param is_pot if TRUE, length is a power of two * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param i0 resulting sub-block pixel coordinate for coord0 */ -static LLVMValueRef -lp_build_sample_wrap_int(struct lp_build_sample_context *bld, - LLVMValueRef coord, - LLVMValueRef length, - boolean is_pot, - unsigned wrap_mode) +static void +lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord, + LLVMValueRef length, + LLVMValueRef stride, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *out_offset, + LLVMValueRef *out_i) { struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; struct lp_build_context *int_coord_bld = &bld->int_coord_bld; @@ -469,7 +422,134 @@ lp_build_sample_wrap_int(struct lp_build_sample_context *bld, assert(0); } - return coord; + lp_build_sample_partial_offset(uint_coord_bld, block_length, coord, stride, + out_offset, out_i); +} + + +/** + * Build LLVM code for texture wrap mode, for scaled integer texcoords. 
+ * \param coord0 the incoming texcoord (s,t,r or q) scaled to the texture size + * \param length the texture size along one dimension + * \param stride pixel stride along the coordinate axis + * \param block_length is the length of the pixel block along the + * coordinate axis + * \param is_pot if TRUE, length is a power of two + * \param wrap_mode one of PIPE_TEX_WRAP_x + * \param offset0 resulting relative offset for coord0 + * \param offset1 resulting relative offset for coord0 + 1 + * \param i0 resulting sub-block pixel coordinate for coord0 + * \param i1 resulting sub-block pixel coordinate for coord0 + 1 + */ +static void +lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld, + unsigned block_length, + LLVMValueRef coord0, + LLVMValueRef length, + LLVMValueRef stride, + boolean is_pot, + unsigned wrap_mode, + LLVMValueRef *offset0, + LLVMValueRef *offset1, + LLVMValueRef *i0, + LLVMValueRef *i1) +{ + struct lp_build_context *uint_coord_bld = &bld->uint_coord_bld; + struct lp_build_context *int_coord_bld = &bld->int_coord_bld; + LLVMValueRef length_minus_one; + LLVMValueRef lmask, umask, mask; + + if (block_length != 1) { + /* + * If the pixel block covers more than one pixel then there is no easy + * way to calculate offset1 relative to offset0. Instead, compute them + * independently. + */ + + LLVMValueRef coord1; + + lp_build_sample_wrap_nearest_int(bld, + block_length, + coord0, + length, + stride, + is_pot, + wrap_mode, + offset0, i0); + + coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one); + + lp_build_sample_wrap_nearest_int(bld, + block_length, + coord1, + length, + stride, + is_pot, + wrap_mode, + offset1, i1); + + return; + } + + /* + * Scalar pixels -- try to compute offset0 and offset1 with a single stride + * multiplication. 
+ */ + + *i0 = uint_coord_bld->zero; + *i1 = uint_coord_bld->zero; + + length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one); + + switch(wrap_mode) { + case PIPE_TEX_WRAP_REPEAT: + if (is_pot) { + coord0 = LLVMBuildAnd(bld->builder, coord0, length_minus_one, ""); + } + else { + /* Signed remainder won't give the right results for negative + * dividends but unsigned remainder does.*/ + coord0 = LLVMBuildURem(bld->builder, coord0, length, ""); + } + + mask = lp_build_compare(bld->builder, int_coord_bld->type, + PIPE_FUNC_NOTEQUAL, coord0, length_minus_one); + + *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset1 = LLVMBuildAnd(bld->builder, + lp_build_add(uint_coord_bld, *offset0, stride), + mask, ""); + break; + + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + lmask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type, + PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero); + umask = lp_build_compare(int_coord_bld->builder, int_coord_bld->type, + PIPE_FUNC_LESS, coord0, length_minus_one); + + coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero); + coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one); + + mask = LLVMBuildAnd(bld->builder, lmask, umask, ""); + + *offset0 = lp_build_mul(uint_coord_bld, coord0, stride); + *offset1 = lp_build_add(uint_coord_bld, + *offset0, + LLVMBuildAnd(bld->builder, stride, mask, "")); + break; + + case PIPE_TEX_WRAP_CLAMP: + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_REPEAT: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + default: + assert(0); + *offset0 = uint_coord_bld->zero; + *offset1 = uint_coord_bld->zero; + break; + } } @@ -1740,16 +1820,21 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, LLVMValueRef i32_c8, i32_c128, i32_c255; LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; LLVMValueRef t_ipart, t_fpart, t_fpart_lo, t_fpart_hi; - 
LLVMValueRef x0, x1; - LLVMValueRef y0, y1; - LLVMValueRef neighbors[2][2]; + LLVMValueRef data_ptr; + LLVMValueRef x_stride, y_stride; + LLVMValueRef x_offset0, x_offset1; + LLVMValueRef y_offset0, y_offset1; + LLVMValueRef offset[2][2]; + LLVMValueRef x_subcoord[2], y_subcoord[2]; LLVMValueRef neighbors_lo[2][2]; LLVMValueRef neighbors_hi[2][2]; LLVMValueRef packed, packed_lo, packed_hi; LLVMValueRef unswizzled[4]; - LLVMValueRef stride; + const unsigned level = 0; + unsigned i, j; - assert(bld->static_state->target == PIPE_TEXTURE_2D); + assert(bld->static_state->target == PIPE_TEXTURE_2D + || bld->static_state->target == PIPE_TEXTURE_RECT); assert(bld->static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR); assert(bld->static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR); assert(bld->static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE); @@ -1793,21 +1878,30 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, s_fpart = LLVMBuildAnd(builder, s, i32_c255, ""); t_fpart = LLVMBuildAnd(builder, t, i32_c255, ""); - x0 = s_ipart; - y0 = t_ipart; - - x1 = lp_build_add(&bld->int_coord_bld, x0, bld->int_coord_bld.one); - y1 = lp_build_add(&bld->int_coord_bld, y0, bld->int_coord_bld.one); - - x0 = lp_build_sample_wrap_int(bld, x0, width, bld->static_state->pot_width, - bld->static_state->wrap_s); - y0 = lp_build_sample_wrap_int(bld, y0, height, bld->static_state->pot_height, - bld->static_state->wrap_t); - - x1 = lp_build_sample_wrap_int(bld, x1, width, bld->static_state->pot_width, - bld->static_state->wrap_s); - y1 = lp_build_sample_wrap_int(bld, y1, height, bld->static_state->pot_height, - bld->static_state->wrap_t); + x_stride = lp_build_const_vec(bld->uint_coord_bld.type, + bld->format_desc->block.bits/8); + + y_stride = lp_build_get_const_level_stride_vec(bld, stride_array, level); + + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.width, + s_ipart, width, x_stride, + bld->static_state->pot_width, + bld->static_state->wrap_s, 
+ &x_offset0, &x_offset1, + &x_subcoord[0], &x_subcoord[1]); + lp_build_sample_wrap_linear_int(bld, + bld->format_desc->block.height, + t_ipart, height, y_stride, + bld->static_state->pot_height, + bld->static_state->wrap_t, + &y_offset0, &y_offset1, + &y_subcoord[0], &y_subcoord[1]); + + offset[0][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset0); + offset[0][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset0); + offset[1][0] = lp_build_add(&bld->uint_coord_bld, x_offset0, y_offset1); + offset[1][1] = lp_build_add(&bld->uint_coord_bld, x_offset1, y_offset1); /* * Transform 4 x i32 in @@ -1836,7 +1930,6 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; LLVMValueRef shuffle_lo; LLVMValueRef shuffle_hi; - unsigned i, j; for(j = 0; j < h16.type.length; j += 4) { #ifdef PIPE_ARCH_LITTLE_ENDIAN @@ -1864,7 +1957,10 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, ""); } - stride = lp_build_get_const_level_stride_vec(bld, stride_array, 0); + /* + * get pointer to mipmap level 0 data + */ + data_ptr = lp_build_get_const_mipmap_level(bld, data_array, level); /* * Fetch the pixels as 4 x 32bit (rgba order might differ): @@ -1883,20 +1979,38 @@ lp_build_sample_2d_linear_aos(struct lp_build_sample_context *bld, * The higher 8 bits of the resulting elements will be zero. 
*/ - neighbors[0][0] = lp_build_sample_packed(bld, x0, y0, stride, data_array); - neighbors[0][1] = lp_build_sample_packed(bld, x1, y0, stride, data_array); - neighbors[1][0] = lp_build_sample_packed(bld, x0, y1, stride, data_array); - neighbors[1][1] = lp_build_sample_packed(bld, x1, y1, stride, data_array); + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + LLVMValueRef rgba8; - neighbors[0][0] = LLVMBuildBitCast(builder, neighbors[0][0], u8n_vec_type, ""); - neighbors[0][1] = LLVMBuildBitCast(builder, neighbors[0][1], u8n_vec_type, ""); - neighbors[1][0] = LLVMBuildBitCast(builder, neighbors[1][0], u8n_vec_type, ""); - neighbors[1][1] = LLVMBuildBitCast(builder, neighbors[1][1], u8n_vec_type, ""); + if (util_format_is_rgba8_variant(bld->format_desc)) { + /* + * Given the format is a rgba8, just read the pixels as is, + * without any swizzling. Swizzling will be done later. + */ + rgba8 = lp_build_gather(bld->builder, + bld->texel_type.length, + bld->format_desc->block.bits, + bld->texel_type.width, + data_ptr, offset[j][i]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][0], &neighbors_lo[0][0], &neighbors_hi[0][0]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[0][1], &neighbors_lo[0][1], &neighbors_hi[0][1]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][0], &neighbors_lo[1][0], &neighbors_hi[1][0]); - lp_build_unpack2(builder, u8n.type, h16.type, neighbors[1][1], &neighbors_lo[1][1], &neighbors_hi[1][1]); + rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); + + } + else { + rgba8 = lp_build_fetch_rgba_aos(bld->builder, + bld->format_desc, + u8n.type, + data_ptr, offset[j][i], + x_subcoord[i], + y_subcoord[j]); + } + + lp_build_unpack2(builder, u8n.type, h16.type, + rgba8, + &neighbors_lo[j][i], &neighbors_hi[j][i]); + } + } /* * Linear interpolate with 8.8 fixed point. 
@@ -2077,7 +2191,8 @@ lp_build_sample_soa(LLVMBuilderRef builder, } else if (util_format_fits_8unorm(bld.format_desc) && bld.format_desc->nr_channels > 1 && - static_state->target == PIPE_TEXTURE_2D && + (static_state->target == PIPE_TEXTURE_2D || + static_state->target == PIPE_TEXTURE_RECT) && static_state->min_img_filter == PIPE_TEX_FILTER_LINEAR && static_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR && static_state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index 0aa64affacc..0e07f7f3f38 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -200,8 +200,10 @@ static void lp_exec_mask_cond_push(struct lp_exec_mask *mask, } mask->cond_stack[mask->cond_stack_size++] = mask->cond_mask; assert(LLVMTypeOf(val) == mask->int_vec_type); - mask->cond_mask = val; - + mask->cond_mask = LLVMBuildAnd(mask->bld->builder, + mask->cond_mask, + val, + ""); lp_exec_mask_update(mask); } @@ -802,7 +804,7 @@ emit_store( case TGSI_FILE_PREDICATE: lp_exec_mask_store(&bld->exec_mask, pred, value, - bld->preds[index][chan_index]); + bld->preds[reg->Register.Index][chan_index]); break; default: diff --git a/src/gallium/auxiliary/gallivm/lp_bld_type.h b/src/gallium/auxiliary/gallivm/lp_bld_type.h index 3ffe916f8e4..fec1d3dfbc6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_type.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_type.h @@ -128,16 +128,16 @@ struct lp_build_context */ struct lp_type type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_elem_type(type) */ LLVMTypeRef elem_type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_vec_type(type) */ LLVMTypeRef vec_type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_int_elem_type(type) */ LLVMTypeRef int_elem_type; - /** Same as lp_build_undef(type) */ + /** Same as lp_build_int_vec_type(type) */ LLVMTypeRef 
int_vec_type; /** Same as lp_build_undef(type) */ diff --git a/src/gallium/auxiliary/os/os_stream.c b/src/gallium/auxiliary/os/os_stream.c new file mode 100644 index 00000000000..3c55fc00d92 --- /dev/null +++ b/src/gallium/auxiliary/os/os_stream.c @@ -0,0 +1,58 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "pipe/p_config.h" + +#include "os_stream.h" +#include "util/u_memory.h" +#include "util/u_string.h" + +int +os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + char buf[1024]; + int retval; + va_list ap2; + va_copy(ap2, ap); + retval = util_vsnprintf(buf, sizeof(buf), format, ap2); + va_end(ap2); + if(retval <= 0) + {} + else if(retval < sizeof(buf)) + stream->write(stream, buf, retval); + else + { + char* str = MALLOC(retval + 1); + if(!str) + return -1; + retval = util_vsnprintf(str, retval + 1, format, ap); + if(retval > 0) + stream->write(stream, str, retval); + FREE(str); + } + + return retval; +} diff --git a/src/gallium/auxiliary/os/os_stream.h b/src/gallium/auxiliary/os/os_stream.h index 693a0621e2d..6c6050bb028 100644 --- a/src/gallium/auxiliary/os/os_stream.h +++ b/src/gallium/auxiliary/os/os_stream.h @@ -50,6 +50,9 @@ struct os_stream void (*flush)(struct os_stream *stream); + + int + (*vprintf)(struct os_stream *stream, const char* format, va_list ap); }; @@ -90,6 +93,27 @@ os_stream_flush(struct os_stream *stream) stream->flush(stream); } +int +os_default_stream_vprintf (struct os_stream* stream, const char *format, va_list ap); + +static INLINE int +os_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + return stream->vprintf(stream, format, ap); +} + +static INLINE int +os_stream_printf (struct os_stream* stream, const char *format, ...) 
+{ + int retval; + va_list args; + + va_start (args, format); + retval = stream->vprintf(stream, format, args); + va_end (args); + + return retval; +} struct os_stream * os_file_stream_create(const char *filename); @@ -118,5 +142,4 @@ os_str_stream_get_and_close(struct os_stream *stream); #define os_file_stream_create(_filename) os_null_stream_create() #endif - #endif /* _OS_STREAM_H_ */ diff --git a/src/gallium/auxiliary/os/os_stream_log.c b/src/gallium/auxiliary/os/os_stream_log.c index 7cc2028a22c..b01377c3468 100644 --- a/src/gallium/auxiliary/os/os_stream_log.c +++ b/src/gallium/auxiliary/os/os_stream_log.c @@ -73,7 +73,8 @@ static struct os_stream os_log_stream_struct = { &os_log_stream_close, &os_log_stream_write, - &os_log_stream_flush + &os_log_stream_flush, + &os_default_stream_vprintf, }; diff --git a/src/gallium/auxiliary/os/os_stream_null.c b/src/gallium/auxiliary/os/os_stream_null.c index 128c4e8f0e0..a549a789e62 100644 --- a/src/gallium/auxiliary/os/os_stream_null.c +++ b/src/gallium/auxiliary/os/os_stream_null.c @@ -56,12 +56,18 @@ os_null_stream_flush(struct os_stream *stream) (void)stream; } +static int +os_null_stream_vprintf (struct os_stream* stream, const char *format, va_list ap) +{ + return 0; +} static struct os_stream os_null_stream = { &os_null_stream_close, &os_null_stream_write, - &os_null_stream_flush + &os_null_stream_flush, + &os_null_stream_vprintf }; diff --git a/src/gallium/auxiliary/os/os_stream_stdc.c b/src/gallium/auxiliary/os/os_stream_stdc.c index 9e7ed711076..37e7d063e2b 100644 --- a/src/gallium/auxiliary/os/os_stream_stdc.c +++ b/src/gallium/auxiliary/os/os_stream_stdc.c @@ -83,6 +83,14 @@ os_stdc_stream_flush(struct os_stream *_stream) fflush(stream->file); } +static int +os_stdc_stream_vprintf (struct os_stream* _stream, const char *format, va_list ap) +{ + struct os_stdc_stream *stream = os_stdc_stream(_stream); + + return vfprintf(stream->file, format, ap); +} + struct os_stream * os_file_stream_create(const char 
*filename) @@ -96,6 +104,7 @@ os_file_stream_create(const char *filename) stream->base.close = &os_stdc_stream_close; stream->base.write = &os_stdc_stream_write; stream->base.flush = &os_stdc_stream_flush; + stream->base.vprintf = &os_stdc_stream_vprintf; stream->file = fopen(filename, "w"); if(!stream->file) diff --git a/src/gallium/auxiliary/os/os_stream_str.c b/src/gallium/auxiliary/os/os_stream_str.c index b5c7270d2ae..be9478b2a17 100644 --- a/src/gallium/auxiliary/os/os_stream_str.c +++ b/src/gallium/auxiliary/os/os_stream_str.c @@ -118,6 +118,7 @@ os_str_stream_create(size_t size) stream->base.close = &os_str_stream_close; stream->base.write = &os_str_stream_write; stream->base.flush = &os_str_stream_flush; + stream->base.vprintf = &os_default_stream_vprintf; stream->str = os_malloc(size); if(!stream->str) diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h index cec2524da2b..2ef02160f23 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h @@ -50,8 +50,7 @@ #define PB_BUFMGR_H_ -#include "pipe/p_compiler.h" -#include "pipe/p_defines.h" +#include "pb_buffer.h" #ifdef __cplusplus diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c index 2e15751e508..0461c815504 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c +++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c @@ -30,7 +30,7 @@ #include "rtasm_cpu.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) static boolean rtasm_sse_enabled(void) { static boolean firsttime = 1; @@ -49,7 +49,7 @@ static boolean rtasm_sse_enabled(void) int rtasm_cpu_has_sse(void) { /* FIXME: actually detect this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; @@ -59,7 +59,7 @@ int rtasm_cpu_has_sse(void) int rtasm_cpu_has_sse2(void) { /* FIXME: actually detect 
this at run-time */ -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) return rtasm_sse_enabled(); #else return 0; diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c index 9f70b73698a..75b0f6a68ea 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -22,8 +22,9 @@ **************************************************************************/ #include "pipe/p_config.h" +#include "util/u_cpu_detect.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "pipe/p_compiler.h" #include "util/u_debug.h" @@ -231,6 +232,10 @@ static void emit_modrm( struct x86_function *p, assert(reg.mod == mod_REG); + /* TODO: support extended x86-64 registers */ + assert(reg.idx < 8); + assert(regmem.idx < 8); + val |= regmem.mod << 6; /* mod field */ val |= reg.idx << 3; /* reg field */ val |= regmem.idx; /* r/m field */ @@ -363,6 +368,12 @@ int x86_get_label( struct x86_function *p ) */ +void x64_rexw(struct x86_function *p) +{ + if(x86_target(p) != X86_32) + emit_1ub(p, 0x48); +} + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ) @@ -449,6 +460,52 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) emit_1i(p, imm); } +void x86_mov_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + x86_mov_reg_imm(p, dst, imm); + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_1i(p, imm); + } +} + +void x86_mov16_imm( struct x86_function *p, struct x86_reg dst, uint16_t imm ) +{ + DUMP_RI( dst, imm ); + emit_1ub(p, 0x66); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb8 + dst.idx); + emit_2ub(p, imm & 0xff, imm >> 8); + } + else + { + emit_1ub(p, 0xc7); + emit_modrm_noreg(p, 0, dst); + emit_2ub(p, imm & 0xff, imm >> 8); + } +} + +void x86_mov8_imm( struct x86_function *p, struct x86_reg dst, uint8_t imm ) +{ 
+ DUMP_RI( dst, imm ); + if(dst.mod == mod_REG) + { + emit_1ub(p, 0xb0 + dst.idx); + emit_1ub(p, imm); + } + else + { + emit_1ub(p, 0xc6); + emit_modrm_noreg(p, 0, dst); + emit_1ub(p, imm); + } +} + /** * Immediate group 1 instructions. */ @@ -520,7 +577,7 @@ void x86_push( struct x86_function *p, } - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } void x86_push_imm32( struct x86_function *p, @@ -530,7 +587,7 @@ void x86_push_imm32( struct x86_function *p, emit_1ub(p, 0x68); emit_1i(p, imm32); - p->stack_offset += 4; + p->stack_offset += sizeof(void*); } @@ -540,23 +597,33 @@ void x86_pop( struct x86_function *p, DUMP_R( reg ); assert(reg.mod == mod_REG); emit_1ub(p, 0x58 + reg.idx); - p->stack_offset -= 4; + p->stack_offset -= sizeof(void*); } void x86_inc( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x40 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x40 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 0, reg); } void x86_dec( struct x86_function *p, struct x86_reg reg ) { DUMP_R( reg ); - assert(reg.mod == mod_REG); - emit_1ub(p, 0x48 + reg.idx); + if(x86_target(p) == X86_32 && reg.mod == mod_REG) + { + emit_1ub(p, 0x48 + reg.idx); + return; + } + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 1, reg); } void x86_ret( struct x86_function *p ) @@ -583,9 +650,82 @@ void x86_mov( struct x86_function *p, struct x86_reg src ) { DUMP_RR( dst, src ); + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + uint8_t rex = 0x40; + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + emit_1ub(p, rex); + } + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov16( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 
0x66); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_mov8( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x8a, 0x88, dst, src ); +} + +void x64_mov64( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + uint8_t rex = 0x48; + DUMP_RR( dst, src ); + assert(x86_target(p) != X86_32); + + /* special hack for reading arguments until we support x86-64 registers everywhere */ + if(src.mod == mod_REG && dst.mod == mod_REG && (src.idx >= 8 || dst.idx >= 8)) + { + if(dst.idx >= 8) + { + rex |= 4; + dst.idx -= 8; + } + if(src.idx >= 8) + { + rex |= 1; + src.idx -= 8; + } + } + emit_1ub(p, rex); emit_op_modrm( p, 0x8b, 0x89, dst, src ); } +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb6); + emit_modrm(p, dst, src); +} + +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x0f, 0xb7); + emit_modrm(p, dst, src); +} + void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -680,6 +820,61 @@ void x86_div( struct x86_function *p, emit_op_modrm(p, 0xf7, 0, x86_make_reg(file_REG32, 6), src); } +void x86_bswap( struct x86_function *p, struct x86_reg reg ) +{ + DUMP_R(reg); + assert(reg.file == file_REG32); + assert(reg.mod == mod_REG); + emit_2ub(p, 0x0f, 0xc8 + reg.idx); +} + +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 5, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 5, reg); + emit_1ub(p, imm); + } +} + +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 7, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 7, reg); + emit_1ub(p, imm); + } +} + 
+void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ) +{ + DUMP_RI(reg, imm); + if(imm == 1) + { + emit_1ub(p, 0xd1); + emit_modrm_noreg(p, 4, reg); + } + else + { + emit_1ub(p, 0xc1); + emit_modrm_noreg(p, 4, reg); + emit_1ub(p, imm); + } +} /*********************************************************************** @@ -1013,6 +1208,77 @@ void sse_movmskps( struct x86_function *p, * SSE2 instructions */ +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + if(dst.mod == mod_REG && dst.file == file_REG32) + { + emit_1ub(p, 0x7e); + emit_modrm(p, src, dst); + } + else + { + emit_op_modrm(p, 0x6e, 0x7e, dst, src); + } +} + +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + switch (dst.mod) { + case mod_REG: + emit_3ub(p, 0xf3, 0x0f, 0x7e); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_3ub(p, 0x66, 0x0f, 0xd6); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf3, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x6f, 0x7f, dst, src); +} + +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0xf2, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 0x0f); + emit_op_modrm(p, 0x10, 0x11, dst, src); +} + +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_2ub(p, 0x66, 
0x0f); + emit_op_modrm(p, 0x28, 0x29, dst, src); +} + /** * Perform a reduced swizzle: */ @@ -1027,6 +1293,28 @@ void sse2_pshufd( struct x86_function *p, emit_1ub(p, shuf); } +void sse2_pshuflw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf2, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_pshufhw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0xf3, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1045,6 +1333,24 @@ void sse2_cvtps2dq( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_cvtsd2ss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xf2, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + +void sse2_cvtpd2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x5a); + emit_modrm( p, dst, src ); +} + void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) @@ -1081,6 +1387,97 @@ void sse2_punpcklbw( struct x86_function *p, emit_modrm( p, dst, src ); } +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x61); + emit_modrm( p, dst, src ); +} + +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x62); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, 0x0f, 0x6c); + emit_modrm( p, dst, src ); +} + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + 
DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psllq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 6, dst); + emit_1ub(p, imm); +} + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x73); + emit_modrm_noreg(p, 2, dst); + emit_1ub(p, imm); +} + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x71); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ) +{ + DUMP_RI(dst, imm); + emit_3ub(p, 0x66, 0x0f, 0x72); + emit_modrm_noreg(p, 4, dst); + emit_1ub(p, imm); +} + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR(dst, src); + emit_3ub(p, 0x66, 0x0f, 0xeb); + emit_modrm(p, dst, src); +} void sse2_rcpps( struct x86_function *p, struct x86_reg dst, @@ -1100,18 +1497,6 @@ void sse2_rcpss( struct x86_function *p, emit_modrm( p, dst, src ); } -void sse2_movd( struct x86_function *p, - struct x86_reg dst, - struct x86_reg src ) -{ - DUMP_RR( dst, src ); - emit_2ub(p, 0x66, X86_TWOB); - emit_op_modrm( p, 0x6e, 0x7e, dst, src ); -} - - - - 
/*********************************************************************** * x87 instructions */ @@ -1702,23 +2087,80 @@ void x86_cdecl_caller_pop_regs( struct x86_function *p ) } -/* Retreive a reference to one of the function arguments, taking into - * account any push/pop activity: - */ struct x86_reg x86_fn_arg( struct x86_function *p, - unsigned arg ) + unsigned arg ) { - return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + switch(x86_target(p)) + { + case X86_64_WIN64_ABI: + /* Microsoft uses a different calling convention than the rest of the world */ + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_CX); + case 2: + return x86_make_reg(file_REG32, reg_DX); + case 3: + return x86_make_reg(file_REG32, reg_R8); + case 4: + return x86_make_reg(file_REG32, reg_R9); + default: + /* Win64 allocates stack slots as if it pushed the first 4 arguments too */ + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + arg * 8); + } + case X86_64_STD_ABI: + switch(arg) + { + case 1: + return x86_make_reg(file_REG32, reg_DI); + case 2: + return x86_make_reg(file_REG32, reg_SI); + case 3: + return x86_make_reg(file_REG32, reg_DX); + case 4: + return x86_make_reg(file_REG32, reg_CX); + case 5: + return x86_make_reg(file_REG32, reg_R8); + case 6: + return x86_make_reg(file_REG32, reg_R9); + default: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + (arg - 6) * 8); /* ??? */ + } + case X86_32: + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), p->stack_offset + arg * 4); /* ??? 
*/ + default: + abort(); + } } +static void x86_init_func_common( struct x86_function *p ) +{ + util_cpu_detect(); + p->caps = 0; + if(util_cpu_caps.has_mmx) + p->caps |= X86_MMX; + if(util_cpu_caps.has_mmx2) + p->caps |= X86_MMX2; + if(util_cpu_caps.has_sse) + p->caps |= X86_SSE; + if(util_cpu_caps.has_sse2) + p->caps |= X86_SSE2; + if(util_cpu_caps.has_sse3) + p->caps |= X86_SSE3; + if(util_cpu_caps.has_sse4_1) + p->caps |= X86_SSE4_1; + p->csr = p->store; + DUMP_START(); +} void x86_init_func( struct x86_function *p ) { p->size = 0; p->store = NULL; - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_init_func_size( struct x86_function *p, unsigned code_size ) @@ -1728,8 +2170,7 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size ) if (p->store == NULL) { p->store = p->error_overflow; } - p->csr = p->store; - DUMP_START(); + x86_init_func_common(p); } void x86_release_func( struct x86_function *p ) diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h index 6208e8f707f..2b9678b1765 100644 --- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -24,22 +24,31 @@ #ifndef _RTASM_X86SSE_H_ #define _RTASM_X86SSE_H_ +#include "pipe/p_compiler.h" #include "pipe/p_config.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) /* It is up to the caller to ensure that instructions issued are * suitable for the host cpu. There are no checks made in this module * for mmx/sse/sse2 support on the cpu. */ struct x86_reg { - unsigned file:3; - unsigned idx:3; + unsigned file:2; + unsigned idx:4; unsigned mod:2; /* mod_REG if this is just a register */ int disp:24; /* only +/- 23bits of offset - should be enough... 
*/ }; +#define X86_MMX 1 +#define X86_MMX2 2 +#define X86_SSE 4 +#define X86_SSE2 8 +#define X86_SSE3 0x10 +#define X86_SSE4_1 0x20 + struct x86_function { + unsigned caps; unsigned size; unsigned char *store; unsigned char *csr; @@ -75,7 +84,15 @@ enum x86_reg_name { reg_SP, reg_BP, reg_SI, - reg_DI + reg_DI, + reg_R8, + reg_R9, + reg_R10, + reg_R11, + reg_R12, + reg_R13, + reg_R14, + reg_R15 }; @@ -110,6 +127,29 @@ typedef void (*x86_func)(void); /* Begin/end/retrieve function creation: */ +enum x86_target +{ + X86_32, + X86_64_STD_ABI, + X86_64_WIN64_ABI +}; + +/* make this read a member of x86_function if target != host is desired */ +static INLINE enum x86_target x86_target( struct x86_function* p ) +{ +#ifdef PIPE_ARCH_X86 + return X86_32; +#elif defined(_WIN64) + return X86_64_WIN64_ABI; +#elif defined(PIPE_ARCH_X86_64) + return X86_64_STD_ABI; +#endif +} + +static INLINE unsigned x86_target_caps( struct x86_function* p ) +{ + return p->caps; +} void x86_init_func( struct x86_function *p ); void x86_init_func_size( struct x86_function *p, unsigned code_size ); @@ -138,6 +178,8 @@ struct x86_reg x86_get_base_reg( struct x86_reg reg ); */ int x86_get_label( struct x86_function *p ); +void x64_rexw(struct x86_function *p); + void x86_jcc( struct x86_function *p, enum x86_cc cc, int label ); @@ -178,18 +220,54 @@ void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqu( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movdqa( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movsd( struct x86_function *p, struct x86_reg dst, struct 
x86_reg src ); +void sse2_movupd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movapd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtsd2ss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtpd2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, unsigned char shuf ); +void sse2_pshuflw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklwd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpckldq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_punpcklqdq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_psllw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_pslld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psllq_imm( 
struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psrlw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrld_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrlq_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_psraw_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); +void sse2_psrad_imm( struct x86_function *p, struct x86_reg dst, unsigned imm ); + +void sse2_por( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_pshuflw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufhw( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dst, struct x86_reg src, uint8_t imm ); void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); @@ -227,7 +305,6 @@ void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); -void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void sse_movmskps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -237,6 +314,14 @@ void x86_dec( struct x86_function *p, struct x86_reg reg ); void x86_inc( struct x86_function *p, struct x86_reg reg ); void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x64_mov64( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov8( 
struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov16( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx8(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_movzx16(struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov_imm(struct x86_function *p, struct x86_reg dst, int imm ); +void x86_mov8_imm(struct x86_function *p, struct x86_reg dst, uint8_t imm ); +void x86_mov16_imm(struct x86_function *p, struct x86_reg dst, uint16_t imm ); void x86_mul( struct x86_function *p, struct x86_reg src ); void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); @@ -250,7 +335,10 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); void x86_sahf( struct x86_function *p ); void x86_div( struct x86_function *p, struct x86_reg src ); - +void x86_bswap( struct x86_function *p, struct x86_reg src ); +void x86_shr_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_sar_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); +void x86_shl_imm( struct x86_function *p, struct x86_reg reg, unsigned imm ); void x86_cdecl_caller_push_regs( struct x86_function *p ); void x86_cdecl_caller_pop_regs( struct x86_function *p ); diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h index 4cd27317b36..dd78b361007 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.h +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h @@ -28,6 +28,7 @@ #ifndef TGSI_DUMP_H #define TGSI_DUMP_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 298f3d0a8bb..0757f05dfab 100644 --- 
a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -3239,6 +3239,8 @@ exec_instruction( if (mach->CallStackTop == 0) { /* returning from main() */ + mach->CondStackTop = 0; + mach->LoopStackTop = 0; *pc = -1; return; } @@ -3767,6 +3769,9 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) } #endif + /* Strictly speaking, these assertions aren't really needed but they + * can potentially catch some bugs in the control flow code. + */ assert(mach->CondStackTop == 0); assert(mach->LoopStackTop == 0); assert(mach->ContStackTop == 0); diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h index 50248884fd0..1992d11bbe8 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.h +++ b/src/gallium/auxiliary/tgsi/tgsi_info.h @@ -28,6 +28,7 @@ #ifndef TGSI_INFO_H #define TGSI_INFO_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c index db9a3422203..1891203abe1 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.c +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -282,17 +282,6 @@ tgsi_parse_token( } -unsigned -tgsi_num_tokens(const struct tgsi_token *tokens) -{ - struct tgsi_parse_context ctx; - if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) { - unsigned len = (ctx.FullHeader.Header.HeaderSize + - ctx.FullHeader.Header.BodySize); - return len; - } - return 0; -} /** @@ -319,3 +308,19 @@ tgsi_alloc_tokens(unsigned num_tokens) unsigned bytes = num_tokens * sizeof(struct tgsi_token); return (struct tgsi_token *) MALLOC(bytes); } + + +void +tgsi_dump_tokens(const struct tgsi_token *tokens) +{ + const unsigned *dwords = (const unsigned *)tokens; + int nr = tgsi_num_tokens(tokens); + int i; + + assert(sizeof(*tokens) == sizeof(unsigned)); + + debug_printf("const unsigned tokens[%d] = {\n", nr); + for (i = 0; i < nr; i++) + debug_printf("0x%08x,\n", dwords[i]); + 
debug_printf("};\n"); +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h index 36de8807b44..d4df5851764 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_parse.h +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h @@ -28,6 +28,7 @@ #ifndef TGSI_PARSE_H #define TGSI_PARSE_H +#include "pipe/p_compiler.h" #include "pipe/p_shader_tokens.h" #if defined __cplusplus @@ -132,8 +133,15 @@ void tgsi_parse_token( struct tgsi_parse_context *ctx ); -unsigned -tgsi_num_tokens(const struct tgsi_token *tokens); +static INLINE unsigned +tgsi_num_tokens(const struct tgsi_token *tokens) +{ + struct tgsi_header header = *(const struct tgsi_header *) tokens; + return header.HeaderSize + header.BodySize; +} + +void +tgsi_dump_tokens(const struct tgsi_token *tokens); struct tgsi_token * tgsi_dup_tokens(const struct tgsi_token *tokens); diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h index d81ee3d00ec..00aa8b84fe9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h @@ -32,9 +32,12 @@ extern "C" { #endif +#include "pipe/p_compiler.h" + +struct tgsi_exec_machine; +struct tgsi_interp_coef; struct tgsi_token; struct x86_function; -struct tgsi_interp_coef; unsigned tgsi_emit_sse2( diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c index fe638e211fa..73287b667db 100644 --- a/src/gallium/auxiliary/translate/translate.c +++ b/src/gallium/auxiliary/translate/translate.c @@ -38,7 +38,7 @@ struct translate *translate_create( const struct translate_key *key ) { struct translate *translate = NULL; -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) translate = translate_sse2_create( key ); if (translate) return translate; diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h index eb6f2cc4862..a75380228b1 100644 --- 
a/src/gallium/auxiliary/translate/translate.h +++ b/src/gallium/auxiliary/translate/translate.h @@ -85,6 +85,18 @@ struct translate { unsigned instance_id, void *output_buffer); + void (PIPE_CDECL *run_elts16)( struct translate *, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + + void (PIPE_CDECL *run_elts8)( struct translate *, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer); + void (PIPE_CDECL *run)( struct translate *, unsigned start, unsigned count, diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c index 42cfd763e9c..ad809db720d 100644 --- a/src/gallium/auxiliary/translate/translate_generic.c +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -64,6 +64,14 @@ struct translate_generic { unsigned input_stride; unsigned max_index; + /* this value is set to -1 if this is a normal element with output_format != input_format: + * in this case, u_format is used to do a full conversion + * + * this value is set to the format size in bytes if output_format == input_format or for 32-bit instance ids: + * in this case, memcpy is used to copy this amount of bytes + */ + int copy_size; + } attrib[PIPE_MAX_ATTRIBS]; unsigned nr_attrib; @@ -354,7 +362,65 @@ static emit_func get_emit_func( enum pipe_format format ) } } +static ALWAYS_INLINE void PIPE_CDECL generic_run_one( struct translate_generic *tg, + unsigned elt, + unsigned instance_id, + void *vert ) +{ + unsigned nr_attrs = tg->nr_attrib; + unsigned attr; + + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + uint8_t *dst = (uint8_t *)vert + tg->attrib[attr].output_offset; + + if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { + const uint8_t *src; + unsigned index; + int copy_size; + + if (tg->attrib[attr].instance_divisor) { + index = instance_id / tg->attrib[attr].instance_divisor; + } + else { + index = elt; + } + + /* clamp to avoid going out
of bounds */ + index = MIN2(index, tg->attrib[attr].max_index); + src = tg->attrib[attr].input_ptr + + tg->attrib[attr].input_stride * index; + + copy_size = tg->attrib[attr].copy_size; + if(likely(copy_size >= 0)) + memcpy(dst, src, copy_size); + else + { + tg->attrib[attr].fetch( data, src, 0, 0 ); + + if (0) + debug_printf("Fetch linear attr %d from %p stride %d index %d: " + " %f, %f, %f, %f \n", + attr, + tg->attrib[attr].input_ptr, + tg->attrib[attr].input_stride, + index, + data[0], data[1],data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } + } else { + if(likely(tg->attrib[attr].copy_size >= 0)) + memcpy(dst, &instance_id, 4); /* write the raw 32-bit id to the output slot, not the scratch array */ + else + { + data[0] = (float)instance_id; + tg->attrib[attr].emit( data, dst ); + } + } + } +} /** * Fetch vertex attributes for 'count' vertices. @@ -367,62 +433,45 @@ static void PIPE_CDECL generic_run_elts( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - const unsigned elt = *elts++; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - char *dst = vert + tg->attrib[attr].output_offset; - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const uint8_t *src; - unsigned index; - - if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; - } else { - index = elt; - } - - /* clamp to void going out of bounds */ - index = MIN2(index, tg->attrib[attr].max_index); - - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * index; - - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch elt attr %d from %p stride %d div %u max %u index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, 
- tg->attrib[attr].instance_divisor, - tg->attrib[attr].max_index,
- index, - data[0], data[1],data[2], data[3]); - } else { - data[0] = (float)instance_id; - } + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} - if (0) - debug_printf("vert %d/%d attr %d: %f %f %f %f\n", - i, elt, attr, data[0], data[1], data[2], data[3]); +static void PIPE_CDECL generic_run_elts16( struct translate *translate, + const uint16_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; - tg->attrib[attr].emit( data, dst ); - } + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); vert += tg->translate.key.output_stride; } } +static void PIPE_CDECL generic_run_elts8( struct translate *translate, + const uint8_t *elts, + unsigned count, + unsigned instance_id, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned i; + for (i = 0; i < count; i++) { + generic_run_one(tg, *elts++, instance_id, vert); + vert += tg->translate.key.output_stride; + } +} static void PIPE_CDECL generic_run( struct translate *translate, unsigned start, @@ -432,57 +481,10 @@ static void PIPE_CDECL generic_run( struct translate *translate, { struct translate_generic *tg = translate_generic(translate); char *vert = output_buffer; - unsigned nr_attrs = tg->nr_attrib; - unsigned attr; unsigned i; - /* loop over vertex attributes (vertex shader inputs) - */ for (i = 0; i < count; i++) { - unsigned elt = start + i; - - for (attr = 0; attr < nr_attrs; attr++) { - float data[4]; - char *dst = vert + tg->attrib[attr].output_offset; - - if (tg->attrib[attr].type == TRANSLATE_ELEMENT_NORMAL) { - const uint8_t *src; - unsigned index; - - if (tg->attrib[attr].instance_divisor) { - index = instance_id / tg->attrib[attr].instance_divisor; - } - else { - index = elt; - } - - /* clamp to void going out of bounds */ - 
index = MIN2(index, tg->attrib[attr].max_index); - - src = tg->attrib[attr].input_ptr + - tg->attrib[attr].input_stride * index; - - tg->attrib[attr].fetch( data, src, 0, 0 ); - - if (0) - debug_printf("Fetch linear attr %d from %p stride %d index %d: " - " %f, %f, %f, %f \n", - attr, - tg->attrib[attr].input_ptr, - tg->attrib[attr].input_stride, - index, - data[0], data[1],data[2], data[3]); - } else { - data[0] = (float)instance_id; - } - - if (0) - debug_printf("vert %d attr %d: %f %f %f %f\n", - i, attr, data[0], data[1], data[2], data[3]); - - tg->attrib[attr].emit( data, dst ); - } - + generic_run_one(tg, start + i, instance_id, vert); vert += tg->translate.key.output_stride; } } @@ -528,6 +530,8 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->translate.release = generic_release; tg->translate.set_buffer = generic_set_buffer; tg->translate.run_elts = generic_run_elts; + tg->translate.run_elts16 = generic_run_elts16; + tg->translate.run_elts8 = generic_run_elts8; tg->translate.run = generic_run; for (i = 0; i < key->nr_elements; i++) { @@ -544,9 +548,28 @@ struct translate *translate_generic_create( const struct translate_key *key ) tg->attrib[i].input_offset = key->element[i].input_offset; tg->attrib[i].instance_divisor = key->element[i].instance_divisor; - tg->attrib[i].emit = get_emit_func(key->element[i].output_format); tg->attrib[i].output_offset = key->element[i].output_offset; + tg->attrib[i].copy_size = -1; + if (tg->attrib[i].type == TRANSLATE_ELEMENT_INSTANCE_ID) + { + if(key->element[i].output_format == PIPE_FORMAT_R32_USCALED + || key->element[i].output_format == PIPE_FORMAT_R32_SSCALED) + tg->attrib[i].copy_size = 4; + } + else + { + if(key->element[i].input_format == key->element[i].output_format + && format_desc->block.width == 1 + && format_desc->block.height == 1 + && !(format_desc->block.bits & 7)) + tg->attrib[i].copy_size = format_desc->block.bits >> 3; + } + + if(tg->attrib[i].copy_size < 0) + 
tg->attrib[i].emit = get_emit_func(key->element[i].output_format); + else + tg->attrib[i].emit = NULL; } tg->nr_attrib = key->nr_elements; diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c index ef3aa674a34..f8bf5b46692 100644 --- a/src/gallium/auxiliary/translate/translate_sse.c +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -30,11 +30,12 @@ #include "pipe/p_compiler.h" #include "util/u_memory.h" #include "util/u_math.h" +#include "util/u_format.h" #include "translate.h" -#if defined(PIPE_ARCH_X86) +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) #include "rtasm/rtasm_cpu.h" #include "rtasm/rtasm_x86sse.h" @@ -46,21 +47,9 @@ #define W 3 -typedef void (PIPE_CDECL *run_func)( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer); - -typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer); - struct translate_buffer { const void *base_ptr; - unsigned stride; + uintptr_t stride; unsigned max_index; }; @@ -73,21 +62,43 @@ struct translate_buffer_varient { #define ELEMENT_BUFFER_INSTANCE_ID 1001 +#define NUM_CONSTS 7 + +enum +{ + CONST_IDENTITY, + CONST_INV_127, + CONST_INV_255, + CONST_INV_32767, + CONST_INV_65535, + CONST_INV_2147483647, + CONST_255 +}; + +#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)} +static float consts[NUM_CONSTS][4] = { + {0, 0, 0, 1}, + C(1.0 / 127.0), + C(1.0 / 255.0), + C(1.0 / 32767.0), + C(1.0 / 65535.0), + C(1.0 / 2147483647.0), + C(255.0) +}; +#undef C struct translate_sse { struct translate translate; struct x86_function linear_func; struct x86_function elt_func; + struct x86_function elt16_func; + struct x86_function elt8_func; struct x86_function *func; - boolean loaded_identity; - boolean loaded_255; - boolean loaded_inv_255; - - float identity[4]; - float float_255[4]; - float 
inv_255[4]; + PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4]; + int8_t reg_to_const[16]; + int8_t const_to_reg[NUM_CONSTS]; struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; unsigned nr_buffers; @@ -102,17 +113,16 @@ struct translate_sse { boolean use_instancing; unsigned instance_id; - run_func gen_run; - run_elts_func gen_run_elts; - /* these are actually known values, but putting them in a struct * like this is helpful to keep them in sync across the file. */ struct x86_reg tmp_EAX; - struct x86_reg idx_EBX; /* either start+i or &elt[i] */ - struct x86_reg outbuf_ECX; - struct x86_reg machine_EDX; - struct x86_reg count_ESI; /* decrements to zero */ + struct x86_reg tmp2_EDX; + struct x86_reg src_ECX; + struct x86_reg idx_ESI; /* either start+i or &elt[i] */ + struct x86_reg machine_EDI; + struct x86_reg outbuf_EBX; + struct x86_reg count_EBP; /* decrements to zero */ }; static int get_offset( const void *a, const void *b ) @@ -120,281 +130,950 @@ static int get_offset( const void *a, const void *b ) return (const char *)b - (const char *)a; } +static struct x86_reg get_const( struct translate_sse *p, unsigned id) +{ + struct x86_reg reg; + unsigned i; + if(p->const_to_reg[id] >= 0) + return x86_make_reg(file_XMM, p->const_to_reg[id]); -static struct x86_reg get_identity( struct translate_sse *p ) -{ - struct x86_reg reg = x86_make_reg(file_XMM, 6); - - if (!p->loaded_identity) { - p->loaded_identity = TRUE; - p->identity[0] = 0; - p->identity[1] = 0; - p->identity[2] = 0; - p->identity[3] = 1; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->identity[0]))); + for(i = 2; i < 8; ++i) + { + if(p->reg_to_const[i] < 0) + break; } + /* TODO: be smarter here */ + if(i == 8) + --i; + + reg = x86_make_reg(file_XMM, i); + + if(p->reg_to_const[i] >= 0) + p->const_to_reg[p->reg_to_const[i]] = -1; + + p->reg_to_const[i] = id; + p->const_to_reg[id] = i; + + /* TODO: this should happen outside the loop, if possible */ + sse_movaps(p->func, reg, 
+ x86_make_disp(p->machine_EDI, + get_offset(p, &p->consts[id][0]))); + return reg; } -static struct x86_reg get_255( struct translate_sse *p ) +/* load the data in a SSE2 register, padding with zeros */ +static boolean emit_load_sse2( struct translate_sse *p, + struct x86_reg data, + struct x86_reg src, + unsigned size) { - struct x86_reg reg = x86_make_reg(file_XMM, 7); - - if (!p->loaded_255) { - p->loaded_255 = TRUE; - p->float_255[0] = - p->float_255[1] = - p->float_255[2] = - p->float_255[3] = 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->float_255[0]))); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + switch(size) + { + case 1: + x86_movzx8(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 2: + x86_movzx16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 3: + x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); + x86_shl_imm(p->func, tmp, 16); + x86_mov16(p->func, tmp, src); + sse2_movd(p->func, data, tmp); + break; + case 4: + sse2_movd(p->func, data, src); + break; + case 6: + sse2_movd(p->func, data, src); + x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); + sse2_movd(p->func, tmpXMM, tmp); + sse2_punpckldq(p->func, data, tmpXMM); + break; + case 8: + sse2_movq(p->func, data, src); + break; + case 12: + sse2_movq(p->func, data, src); + sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); + sse2_punpcklqdq(p->func, data, tmpXMM); + break; + case 16: + sse2_movdqu(p->func, data, src); + break; + default: + return FALSE; } - - return reg; + return TRUE; } -static struct x86_reg get_inv_255( struct translate_sse *p ) +/* this value can be passed for the out_chans argument */ +#define CHANNELS_0001 5 + +/* this function will load #chans float values, and will + * pad the register with zeroes at least up to out_chans. + * + * If out_chans is set to CHANNELS_0001, then the fourth + * value will be padded with 1. 
Only pass this value if + * chans < 4 or results are undefined. + */ +static void emit_load_float32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) { - struct x86_reg reg = x86_make_reg(file_XMM, 5); - - if (!p->loaded_inv_255) { - p->loaded_inv_255 = TRUE; - p->inv_255[0] = - p->inv_255[1] = - p->inv_255[2] = - p->inv_255[3] = 1.0f / 255.0f; - - sse_movups(p->func, reg, - x86_make_disp(p->machine_EDX, - get_offset(p, &p->inv_255[0]))); + switch(chans) + { + case 1: + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 2: + /* 0 0 0 1 + * a b 0 1 + */ + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + sse_movlps(p->func, data, arg0); + break; + case 3: + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 if out_chans == CHANNELS_0001 + * 0 0 c 0/1 + * a b c 0/1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); + break; + case 4: + sse_movups(p->func, data, arg0); + break; } - - return reg; } +/* this function behaves like emit_load_float32, but loads + 64-bit floating point numbers, converting them to 32-bit + ones */ +static void emit_load_float64to32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0, + unsigned out_chans, + unsigned chans) +{ + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + switch(chans) + { + case 1: + sse2_movsd(p->func, data, arg0); + if(out_chans > 1) + sse2_cvtpd2ps(p->func, data, data); + else + sse2_cvtsd2ss(p->func, data, data); + if(out_chans == CHANNELS_0001) + 
sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + break; + case 2: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + if(out_chans == CHANNELS_0001) + sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) ); + else if(out_chans > 2) + sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 3: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + if(out_chans > 3) + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + else + sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + if(out_chans == CHANNELS_0001) + sse_orps(p->func, data, get_const(p, CONST_IDENTITY) ); + break; + case 4: + sse2_movupd(p->func, data, arg0); + sse2_cvtpd2ps(p->func, data, data); + sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16)); + sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM); + sse_movlhps(p->func, data, tmpXMM); + break; + } +} -static void emit_load_R32G32B32A32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) { - sse_movups(p->func, data, arg0); + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, dst_gpr, src_gpr); + else + { + /* TODO: when/on which CPUs is SSE2 actually better than SSE? 
*/ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movq(p->func, dst_xmm, src_xmm); + else + sse_movlps(p->func, dst_xmm, src_xmm); + } } -static void emit_load_R32G32B32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src) { - /* Have to jump through some hoops: - * - * c 0 0 0 - * c 0 0 1 - * 0 0 c 1 - * a b c 1 - */ - sse_movss(p->func, data, x86_make_disp(arg0, 8)); - sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); - sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst_gpr, dst_xmm, src, src); } -static void emit_load_R32G32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm) { - /* 0 0 0 1 - * a b 0 1 - */ - sse_movups(p->func, data, get_identity(p) ); - sse_movlps(p->func, data, arg0); + emit_mov64(p, dst, dst, src_gpr, src_xmm); } +static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) +{ + if(x86_target_caps(p->func) & X86_SSE2) + sse2_movdqu(p->func, dst, src); + else + sse_movups(p->func, dst, src); +} -static void emit_load_R32( struct translate_sse *p, - struct x86_reg data, - struct x86_reg arg0 ) +/* TODO: this uses unaligned accesses liberally, which is great on Nehalem, + * but may or may not be good on older processors + * TODO: may perhaps want to use non-temporal stores here if possible + */ +static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size) { - /* a 0 0 0 - * a 0 0 1 - */ - sse_movss(p->func, data, arg0); - sse_orps(p->func, data, get_identity(p) ); + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); + struct x86_reg dataGPR = p->tmp_EAX; + struct x86_reg dataGPR2 = 
p->tmp2_EDX; + + if(size < 8) + { + switch (size) + { + case 1: + x86_mov8(p->func, dataGPR, src); + x86_mov8(p->func, dst, dataGPR); + break; + case 2: + x86_mov16(p->func, dataGPR, src); + x86_mov16(p->func, dst, dataGPR); + break; + case 3: + x86_mov16(p->func, dataGPR, src); + x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); + x86_mov16(p->func, dst, dataGPR); + x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); + break; + case 4: + x86_mov(p->func, dataGPR, src); + x86_mov(p->func, dst, dataGPR); + break; + case 6: + x86_mov(p->func, dataGPR, src); + x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); + x86_mov(p->func, dst, dataGPR); + x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); + break; + } + } + else if(!(x86_target_caps(p->func) & X86_SSE)) + { + unsigned i = 0; + assert((size & 3) == 0); + for(i = 0; i < size; i += 4) + { + x86_mov(p->func, dataGPR, x86_make_disp(src, i)); + x86_mov(p->func, x86_make_disp(dst, i), dataGPR); + } + } + else + { + switch(size) + { + case 8: + emit_load64(p, dataGPR, dataXMM, src); + emit_store64(p, dst, dataGPR, dataXMM); + break; + case 12: + emit_load64(p, dataGPR2, dataXMM, src); + x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); + emit_store64(p, dst, dataGPR2, dataXMM); + x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); + break; + case 16: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dst, dataXMM); + break; + case 24: + emit_mov128(p, dataXMM, src); + emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); + break; + case 32: + emit_mov128(p, dataXMM, src); + emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); + emit_mov128(p, dst, dataXMM); + emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); + break; + default: + assert(0); + } + } } +static boolean translate_attr_convert( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg src, + struct x86_reg dst) -static void 
emit_load_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg data, - struct x86_reg src ) { + const struct util_format_description* input_desc = util_format_description(a->input_format); + const struct util_format_description* output_desc = util_format_description(a->output_format); + unsigned i; + boolean id_swizzle = TRUE; + unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; + unsigned needed_chans = 0; + unsigned imms[2] = {0, 0x3f800000}; - /* Load and unpack twice: - */ - sse_movss(p->func, data, src); - sse2_punpcklbw(p->func, data, get_identity(p)); - sse2_punpcklbw(p->func, data, get_identity(p)); + if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) + return FALSE; - /* Convert to float: - */ - sse2_cvtdq2ps(p->func, data, data); + if(input_desc->channel[0].size & 7) + return FALSE; + if(input_desc->colorspace != output_desc->colorspace) + return FALSE; - /* Scale by 1/255.0 - */ - sse_mulps(p->func, data, get_inv_255(p)); -} + for(i = 1; i < input_desc->nr_channels; ++i) + { + if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) + return FALSE; + } + for(i = 1; i < output_desc->nr_channels; ++i) + { + if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) + return FALSE; + } + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(output_desc->swizzle[i] < 4) + swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; + } + if((x86_target_caps(p->func) & X86_SSE) && (0 + || a->output_format == PIPE_FORMAT_R32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT + || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); -static void emit_store_R32G32B32A32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movups(p->func, 
dest, dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R32G32B32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Emit two, shuffle, emit one. - */ - sse_movlps(p->func, dest, dataXMM); - sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ - sse_movss(p->func, x86_make_disp(dest,8), dataXMM); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } -static void emit_store_R32G32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movlps(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovzx */ + switch(input_desc->channel[0].size) + { + case 8: + /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case 32: /* we lose precision here */ + sse2_psrld_imm(p->func, dataXMM, 1); + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_255); + break; + case 16: + factor = get_const(p, CONST_INV_65535); + break; + case 32: + factor = get_const(p, 
CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + else if(input_desc->channel[0].size == 32) + sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + /* TODO: add support for SSE4.1 pmovsx */ + switch(input_desc->channel[0].size) + { + case 8: + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 24); + break; + case 16: + sse2_punpcklwd(p->func, dataXMM, dataXMM); + sse2_psrad_imm(p->func, dataXMM, 16); + break; + case 32: /* we lose precision here */ + break; + default: + return FALSE; + } + sse2_cvtdq2ps(p->func, dataXMM, dataXMM); + if(input_desc->channel[0].normalized) + { + struct x86_reg factor; + switch(input_desc->channel[0].size) + { + case 8: + factor = get_const(p, CONST_INV_127); + break; + case 16: + factor = get_const(p, CONST_INV_32767); + break; + case 32: + factor = get_const(p, CONST_INV_2147483647); + break; + default: + assert(0); + factor.disp = 0; + factor.file = 0; + factor.idx = 0; + factor.mod = 0; + break; + } + sse_mulps(p->func, dataXMM, factor); + } + break; + + break; + case UTIL_FORMAT_TYPE_FLOAT: + if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) + return FALSE; + if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) + { + swizzle[3] = UTIL_FORMAT_SWIZZLE_W; + needed_chans = CHANNELS_0001; + } + switch(input_desc->channel[0].size) + { + case 32: + emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + case 64: /* we lose precision here */ + if(!(x86_target_caps(p->func) & X86_SSE2)) + return FALSE; + 
emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); + break; + default: + return FALSE; + } + break; + default: + return FALSE; + } -static void emit_store_R32( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - sse_movss(p->func, dest, dataXMM); -} + if(!id_swizzle) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse_movups(p->func, dst, dataXMM); + else + { + if(output_desc->nr_channels >= 2 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse_movlps(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + sse_movss(p->func, dst, dataXMM); + else + x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 2) + { + if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + if(output_desc->nr_channels >= 3) + { + if(output_desc->nr_channels >= 4 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); + else + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); + sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); + } + else + x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); + sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); + } + else + 
x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + } + return TRUE; + } + else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 + && output_desc->channel[0].normalized == input_desc->channel[0].normalized + && (0 + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); + struct x86_reg tmp = p->tmp_EAX; + unsigned imms[2] = {0, 1}; + + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) + swizzle[i] = i; + } -static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg dataXMM ) -{ - /* Scale by 255.0 - */ - sse_mulps(p->func, dataXMM, get_255(p)); + for(i = 0; i < output_desc->nr_channels; ++i) + { + if(swizzle[i] < 4) + needed_chans = MAX2(needed_chans, swizzle[i] + 1); + if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) + id_swizzle = FALSE; + } - /* Pack and emit: - */ - sse2_cvtps2dq(p->func, dataXMM, dataXMM); - sse2_packssdw(p->func, dataXMM, dataXMM); - sse2_packuswb(p->func, dataXMM, dataXMM); - sse_movss(p->func, dest, dataXMM); -} + if(needed_chans > 0) + { + emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); + + switch(input_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + if(input_desc->channel[0].normalized) + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) + 
sse2_psrlw_imm(p->func, dataXMM, 1); + } + else + sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); + break; + case UTIL_FORMAT_TYPE_SIGNED: + if(input_desc->channel[0].normalized) + { + sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); + sse2_punpcklbw(p->func, tmpXMM, dataXMM); + sse2_psllw_imm(p->func, dataXMM, 9); + sse2_psrlw_imm(p->func, dataXMM, 8); + sse2_por(p->func, tmpXMM, dataXMM); + sse2_psrlw_imm(p->func, dataXMM, 7); + sse2_por(p->func, tmpXMM, dataXMM); + { + struct x86_reg t = dataXMM; + dataXMM = tmpXMM; + tmpXMM = t; + } + } + else + { + sse2_punpcklbw(p->func, dataXMM, dataXMM); + sse2_psraw_imm(p->func, dataXMM, 8); + } + break; + default: + assert(0); + } + if(output_desc->channel[0].normalized) + imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; + if(!id_swizzle) + sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); + } + if(output_desc->nr_channels >= 4 + && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 + && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 + ) + sse2_movq(p->func, dst, dataXMM); + else + { + if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) + sse2_movd(p->func, dst, dataXMM); + else + { + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, dst, tmp); + if(output_desc->nr_channels >= 2) + x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); + } + } + else + { + if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); + if(output_desc->nr_channels >= 2) + { + sse2_movd(p->func, tmp, dataXMM); + 
x86_shr_imm(p->func, tmp, 16); + x86_mov16(p->func, x86_make_disp(dst, 2), tmp); + } + } + } + if(output_desc->nr_channels >= 3) + { + if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) + { + if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); + } + else + { + sse2_psrlq_imm(p->func, dataXMM, 32); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 4), tmp); + if(output_desc->nr_channels >= 4) + { + x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); + } + } + } + else + { + if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) + x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + else + { + x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); + + if(output_desc->nr_channels >= 4) + { + sse2_psrlq_imm(p->func, dataXMM, 48); + sse2_movd(p->func, tmp, dataXMM); + x86_mov16(p->func, x86_make_disp(dst, 6), tmp); + } + } + } + } + } + return TRUE; + } + else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) + { + struct x86_reg tmp = p->tmp_EAX; + unsigned i; + if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 + && swizzle[0] == UTIL_FORMAT_SWIZZLE_W + && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z + && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y + && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) + { + /* TODO: support movbe */ + x86_mov(p->func, tmp, src); + x86_bswap(p->func, tmp); + x86_mov(p->func, dst, tmp); + return TRUE; + } -/* Extended swizzles? Maybe later. 
- */ -static void emit_swizzle( struct translate_sse *p, - struct x86_reg dest, - struct x86_reg src, - unsigned char shuffle ) -{ - sse_shufps(p->func, dest, src, shuffle); -} + for(i = 0; i < output_desc->nr_channels; ++i) + { + switch(output_desc->channel[0].size) + { + case 8: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[0].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[0].normalized ? 0xff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[0].normalized ? 0x7f : 1; + break; + default: + return FALSE; + } + } + x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); + } + else + { + x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); + x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); + } + break; + case 16: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 0x7fff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3c00; + break; + default: + return FALSE; + } + } + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); + } + else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) + x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); + else + { + x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); + x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); + } + break; + case 32: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned v = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + v = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + v = output_desc->channel[1].normalized ? 
0x7fffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + v = 0x3f800000; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); + x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); + } + break; + case 64: + if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) + { + unsigned l = 0; + unsigned h = 0; + if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) + { + switch(output_desc->channel[1].type) + { + case UTIL_FORMAT_TYPE_UNSIGNED: + h = output_desc->channel[1].normalized ? 0xffffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_SIGNED: + h = output_desc->channel[1].normalized ? 0x7fffffff : 0; + l = output_desc->channel[1].normalized ? 0xffffffff : 1; + break; + case UTIL_FORMAT_TYPE_FLOAT: + h = 0x3ff00000; + l = 0; + break; + default: + return FALSE; + } + } + x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); + x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); + } + else + { + if(x86_target_caps(p->func) & X86_SSE) + { + struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); + emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); + emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); + } + else + { + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); + x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); + x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); + x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); + } + } + break; + default: + return FALSE; + } + } + return TRUE; + } + /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ + else if((x86_target_caps(p->func) & X86_SSE2) && + a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 + || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM + || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM + )) + { + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + /* load */ + sse_movups(p->func, dataXMM, src); -static boolean 
translate_attr( struct translate_sse *p, - const struct translate_element *a, - struct x86_reg srcECX, - struct x86_reg dstEAX) -{ - struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) + sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); - switch (a->input_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_load_R32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_load_R32G32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_load_R32G32B32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_load_R32G32B32A32(p, dataXMM, srcECX); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); - break; - default: - return FALSE; - } + /* scale by 255.0 */ + sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); - switch (a->output_format) { - case PIPE_FORMAT_R32_FLOAT: - emit_store_R32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32_FLOAT: - emit_store_R32G32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32_FLOAT: - emit_store_R32G32B32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R32G32B32A32_FLOAT: - emit_store_R32G32B32A32(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); - break; - default: - return FALSE; + /* pack and emit */ + sse2_cvtps2dq(p->func, dataXMM, dataXMM); + sse2_packssdw(p->func, dataXMM, dataXMM); + sse2_packuswb(p->func, dataXMM, dataXMM); + sse2_movd(p->func, dst, dataXMM); + + return TRUE; } - return TRUE; + return FALSE; } +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct 
x86_reg src, + struct x86_reg dst) +{ + if(a->input_format == a->output_format) + { + emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); + return TRUE; + } + + return translate_attr_convert(p, a, src, dst); +} static boolean init_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { unsigned i; - struct x86_reg instance_id = x86_make_disp(p->machine_EDX, + struct x86_reg instance_id = x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; - if (linear || varient->instance_divisor) { - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + if (!index_size || varient->instance_divisor) { + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride)); - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr)); - struct x86_reg elt = p->idx_EBX; + struct x86_reg elt = p->idx_ESI; struct x86_reg tmp_EAX = p->tmp_EAX; /* Calculate pointer to first attrib: @@ -406,20 +1085,16 @@ static boolean init_inputs( struct translate_sse *p, x86_mov(p->func, tmp_EAX, instance_id); if (varient->instance_divisor != 1) { - struct x86_reg tmp_EDX = p->machine_EDX; - struct x86_reg tmp_ECX = p->outbuf_ECX; + struct x86_reg tmp_EDX = p->tmp2_EDX; + struct x86_reg tmp_ECX = p->src_ECX; /* TODO: Add x86_shr() to rtasm and use it whenever * instance divisor is power of two. 
*/ - x86_push(p->func, tmp_EDX); - x86_push(p->func, tmp_ECX); x86_xor(p->func, tmp_EDX, tmp_EDX); x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ - x86_pop(p->func, tmp_ECX); - x86_pop(p->func, tmp_EDX); } } else { x86_mov(p->func, tmp_EAX, elt); @@ -430,16 +1105,23 @@ static boolean init_inputs( struct translate_sse *p, */ x86_imul(p->func, tmp_EAX, buf_stride); + x64_rexw(p->func); x86_add(p->func, tmp_EAX, buf_base_ptr); /* In the linear case, keep the buffer pointer instead of the * index number. */ - if (linear && p->nr_buffer_varients == 1) + if (!index_size && p->nr_buffer_varients == 1) + { + x64_rexw(p->func); x86_mov(p->func, elt, tmp_EAX); + } else + { + x64_rexw(p->func); x86_mov(p->func, buf_ptr, tmp_EAX); + } } } @@ -448,44 +1130,57 @@ static boolean init_inputs( struct translate_sse *p, static struct x86_reg get_buffer_ptr( struct translate_sse *p, - boolean linear, + unsigned index_size, unsigned var_idx, struct x86_reg elt ) { if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) { - return x86_make_disp(p->machine_EDX, + return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)); } - if (linear && p->nr_buffer_varients == 1) { - return p->idx_EBX; + if (!index_size && p->nr_buffer_varients == 1) { + return p->idx_ESI; } - else if (linear || p->buffer_varient[var_idx].instance_divisor) { - struct x86_reg ptr = p->tmp_EAX; + else if (!index_size || p->buffer_varient[var_idx].instance_divisor) { + struct x86_reg ptr = p->src_ECX; struct x86_reg buf_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer_varient[var_idx].ptr)); + x64_rexw(p->func); x86_mov(p->func, ptr, buf_ptr); return ptr; } else { - struct x86_reg ptr = p->tmp_EAX; + struct x86_reg ptr = p->src_ECX; const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx]; struct x86_reg buf_stride = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, 
get_offset(p, &p->buffer[varient->buffer_index].stride)); struct x86_reg buf_base_ptr = - x86_make_disp(p->machine_EDX, + x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].base_ptr)); /* Calculate pointer to current attrib: */ - x86_mov(p->func, ptr, buf_stride); - x86_imul(p->func, ptr, elt); + switch(index_size) + { + case 1: + x86_movzx8(p->func, ptr, elt); + break; + case 2: + x86_movzx16(p->func, ptr, elt); + break; + case 4: + x86_mov(p->func, ptr, elt); + break; + } + x86_imul(p->func, ptr, buf_stride); + x64_rexw(p->func); x86_add(p->func, ptr, buf_base_ptr); return ptr; } @@ -494,39 +1189,43 @@ static struct x86_reg get_buffer_ptr( struct translate_sse *p, static boolean incr_inputs( struct translate_sse *p, - boolean linear ) + unsigned index_size ) { - if (linear && p->nr_buffer_varients == 1) { - struct x86_reg stride = x86_make_disp(p->machine_EDX, + if (!index_size && p->nr_buffer_varients == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[0].stride)); if (p->buffer_varient[0].instance_divisor == 0) { - x86_add(p->func, p->idx_EBX, stride); - sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); + x64_rexw(p->func); + x86_add(p->func, p->idx_ESI, stride); + sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192)); } } - else if (linear) { + else if (!index_size) { unsigned i; /* Is this worthwhile?? 
*/ for (i = 0; i < p->nr_buffer_varients; i++) { struct translate_buffer_varient *varient = &p->buffer_varient[i]; - struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI, get_offset(p, &varient->ptr)); - struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + struct x86_reg buf_stride = x86_make_disp(p->machine_EDI, get_offset(p, &p->buffer[varient->buffer_index].stride)); if (varient->instance_divisor == 0) { - x86_mov(p->func, p->tmp_EAX, buf_ptr); - x86_add(p->func, p->tmp_EAX, buf_stride); + x86_mov(p->func, p->tmp_EAX, buf_stride); + x64_rexw(p->func); + x86_add(p->func, p->tmp_EAX, buf_ptr); if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x64_rexw(p->func); x86_mov(p->func, buf_ptr, p->tmp_EAX); } } } else { - x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); + x64_rexw(p->func); + x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size)); } return TRUE; @@ -551,35 +1250,52 @@ static boolean incr_inputs( struct translate_sse *p, */ static boolean build_vertex_emit( struct translate_sse *p, struct x86_function *func, - boolean linear ) + unsigned index_size ) { int fixup, label; unsigned j; + memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const)); + memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg)); + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); - p->idx_EBX = x86_make_reg(file_REG32, reg_BX); - p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); - p->machine_EDX = x86_make_reg(file_REG32, reg_DX); - p->count_ESI = x86_make_reg(file_REG32, reg_SI); + p->idx_ESI = x86_make_reg(file_REG32, reg_SI); + p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX); + p->machine_EDI = x86_make_reg(file_REG32, reg_DI); + p->count_EBP = x86_make_reg(file_REG32, reg_BP); + p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX); + p->src_ECX = x86_make_reg(file_REG32, reg_CX); p->func = func; - p->loaded_inv_255 = FALSE; - p->loaded_255 = FALSE; - p->loaded_identity = FALSE; 
x86_init_func(p->func); - /* Push a few regs? - */ - x86_push(p->func, p->idx_EBX); - x86_push(p->func, p->count_ESI); + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */ + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6)); + sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7)); + } - /* Load arguments into regs: - */ - x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); - x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); - x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); - x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); + x86_push(p->func, p->outbuf_EBX); + x86_push(p->func, p->count_EBP); + +/* on non-Win64 x86-64, these are already in the right registers */ + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_push(p->func, p->machine_EDI); + x86_push(p->func, p->idx_ESI); + + x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2)); + } + + x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3)); + + if(x86_target(p->func) != X86_32) + x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); + else + x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5)); /* Load instance ID. 
*/ @@ -588,25 +1304,25 @@ static boolean build_vertex_emit( struct translate_sse *p, p->tmp_EAX, x86_fn_arg(p->func, 4)); x86_mov(p->func, - x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), + x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)), p->tmp_EAX); } /* Get vertex count, compare to zero */ x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); - x86_cmp(p->func, p->count_ESI, p->tmp_EAX); + x86_cmp(p->func, p->count_EBP, p->tmp_EAX); fixup = x86_jcc_forward(p->func, cc_E); /* always load, needed or not: */ - init_inputs(p, linear); + init_inputs(p, index_size); /* Note address for loop jump */ label = x86_get_label(p->func); { - struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); + struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI); int last_varient = -1; struct x86_reg vb; @@ -618,30 +1334,31 @@ static boolean build_vertex_emit( struct translate_sse *p, */ if (varient != last_varient) { last_varient = varient; - vb = get_buffer_ptr(p, linear, varient, elt); + vb = get_buffer_ptr(p, index_size, varient, elt); } if (!translate_attr( p, a, x86_make_disp(vb, a->input_offset), - x86_make_disp(p->outbuf_ECX, a->output_offset))) + x86_make_disp(p->outbuf_EBX, a->output_offset))) return FALSE; } /* Next output vertex: */ + x64_rexw(p->func); x86_lea(p->func, - p->outbuf_ECX, - x86_make_disp(p->outbuf_ECX, + p->outbuf_EBX, + x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride)); /* Incr index */ - incr_inputs( p, linear ); + incr_inputs( p, index_size ); } /* decr count, loop if not zero */ - x86_dec(p->func, p->count_ESI); + x86_dec(p->func, p->count_EBP); x86_jcc(p->func, cc_NZ, label); /* Exit mmx state? 
@@ -656,8 +1373,20 @@ static boolean build_vertex_emit( struct translate_sse *p, /* Pop regs and return */ - x86_pop(p->func, p->count_ESI); - x86_pop(p->func, p->idx_EBX); + if(x86_target(p->func) != X86_64_STD_ABI) + { + x86_pop(p->func, p->idx_ESI); + x86_pop(p->func, p->machine_EDI); + } + + x86_pop(p->func, p->count_EBP); + x86_pop(p->func, p->outbuf_EBX); + + if(x86_target(p->func) == X86_64_WIN64_ABI) + { + sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8)); + sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24)); + } x86_ret(p->func); return TRUE; @@ -697,37 +1426,7 @@ static void translate_sse_release( struct translate *translate ) x86_release_func( &p->linear_func ); x86_release_func( &p->elt_func ); - FREE(p); -} - -static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, - const unsigned *elts, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run_elts( translate, - elts, - count, - instance_id, - output_buffer); -} - -static void PIPE_CDECL translate_sse_run( struct translate *translate, - unsigned start, - unsigned count, - unsigned instance_id, - void *output_buffer ) -{ - struct translate_sse *p = (struct translate_sse *)translate; - - p->gen_run( translate, - start, - count, - instance_id, - output_buffer); + os_free_aligned(p); } @@ -736,18 +1435,19 @@ struct translate *translate_sse2_create( const struct translate_key *key ) struct translate_sse *p = NULL; unsigned i; - if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) + /* this is misnamed, it actually refers to whether rtasm is enabled or not */ + if (!rtasm_cpu_has_sse()) goto fail; - p = CALLOC_STRUCT( translate_sse ); + p = os_malloc_aligned(sizeof(struct translate_sse), 16); if (p == NULL) goto fail; + memset(p, 0, sizeof(*p)); + memcpy(p->consts, consts, sizeof(consts)); 
p->translate.key = *key; p->translate.release = translate_sse_release; p->translate.set_buffer = translate_sse_set_buffer; - p->translate.run_elts = translate_sse_run_elts; - p->translate.run = translate_sse_run; for (i = 0; i < key->nr_elements; i++) { if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { @@ -783,18 +1483,32 @@ struct translate *translate_sse2_create( const struct translate_key *key ) if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); - if (!build_vertex_emit(p, &p->linear_func, TRUE)) + if (!build_vertex_emit(p, &p->linear_func, 0)) + goto fail; + + if (!build_vertex_emit(p, &p->elt_func, 4)) + goto fail; + + if (!build_vertex_emit(p, &p->elt16_func, 2)) + goto fail; + + if (!build_vertex_emit(p, &p->elt8_func, 1)) + goto fail; + + p->translate.run = (void*)x86_get_func(&p->linear_func); + if (p->translate.run == NULL) goto fail; - if (!build_vertex_emit(p, &p->elt_func, FALSE)) + p->translate.run_elts = (void*)x86_get_func(&p->elt_func); + if (p->translate.run_elts == NULL) goto fail; - p->gen_run = (run_func)x86_get_func(&p->linear_func); - if (p->gen_run == NULL) + p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func); + if (p->translate.run_elts16 == NULL) goto fail; - p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); - if (p->gen_run_elts == NULL) + p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func); + if (p->translate.run_elts8 == NULL) goto fail; return &p->translate; diff --git a/src/gallium/auxiliary/util/u_bitmask.h b/src/gallium/auxiliary/util/u_bitmask.h index 87f1110296a..98b85ddecd5 100644 --- a/src/gallium/auxiliary/util/u_bitmask.h +++ b/src/gallium/auxiliary/util/u_bitmask.h @@ -36,6 +36,9 @@ #define U_HANDLE_BITMASK_H_ +#include "pipe/p_compiler.h" + + #ifdef __cplusplus extern "C" { #endif diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c index 97fa99ec65d..dfb142b9e1c 100644 --- a/src/gallium/auxiliary/util/u_blit.c +++ 
b/src/gallium/auxiliary/util/u_blit.c @@ -42,6 +42,7 @@ #include "util/u_blit.h" #include "util/u_draw_quad.h" +#include "util/u_format.h" #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_sampler.h" @@ -56,15 +57,18 @@ struct blit_state struct cso_context *cso; struct pipe_blend_state blend; - struct pipe_depth_stencil_alpha_state depthstencil; + struct pipe_depth_stencil_alpha_state depthstencil_keep; + struct pipe_depth_stencil_alpha_state depthstencil_write; struct pipe_rasterizer_state rasterizer; struct pipe_sampler_state sampler; struct pipe_viewport_state viewport; struct pipe_clip_state clip; struct pipe_vertex_element velem[2]; + enum pipe_texture_target internal_target; void *vs; void *fs[TGSI_WRITEMASK_XYZW + 1]; + void *fs_depth; struct pipe_resource *vbuf; /**< quad vertices */ unsigned vbuf_slot; @@ -95,7 +99,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->blend.rt[0].colormask = PIPE_MASK_RGBA; /* no-op depth/stencil/alpha */ - memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil)); + memset(&ctx->depthstencil_keep, 0, sizeof(ctx->depthstencil_keep)); + memset(&ctx->depthstencil_write, 0, sizeof(ctx->depthstencil_write)); + ctx->depthstencil_write.depth.enabled = 1; + ctx->depthstencil_write.depth.writemask = 1; + ctx->depthstencil_write.depth.func = PIPE_FUNC_ALWAYS; /* rasterizer */ memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer)); @@ -110,7 +118,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; ctx->sampler.min_img_filter = 0; /* set later */ ctx->sampler.mag_img_filter = 0; /* set later */ - ctx->sampler.normalized_coords = 1; /* vertex elements state */ memset(&ctx->velem[0], 0, sizeof(ctx->velem[0]) * 2); @@ -145,6 +152,11 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso) ctx->vertices[i][1][3] = 1.0f; /* q */ } + if(pipe->screen->get_param(pipe->screen, PIPE_CAP_NPOT_TEXTURES)) + 
ctx->internal_target = PIPE_TEXTURE_2D; + else + ctx->internal_target = PIPE_TEXTURE_RECT; + return ctx; } @@ -164,6 +176,9 @@ util_destroy_blit(struct blit_state *ctx) if (ctx->fs[i]) pipe->delete_fs_state(pipe, ctx->fs[i]); + if (ctx->fs_depth) + pipe->delete_fs_state(pipe, ctx->fs_depth); + pipe_resource_reference(&ctx->vbuf, NULL); FREE(ctx); @@ -271,7 +286,7 @@ regions_overlap(int srcX0, int srcY0, * \param writemask controls which channels in the dest surface are sourced * from the src surface. Disabled channels are sourced * from (0,0,0,1). - * XXX need some control over blitting Z and/or stencil. + * XXX need some control over blitting stencil. */ void util_blit_pixels_writemask(struct blit_state *ctx, @@ -294,8 +309,9 @@ util_blit_pixels_writemask(struct blit_state *ctx, const int srcW = abs(srcX1 - srcX0); const int srcH = abs(srcY1 - srcY0); unsigned offset; - boolean overlap; + boolean overlap, dst_is_depth; float s0, t0, s1, t1; + boolean normalized; assert(filter == PIPE_TEX_MIPFILTER_NEAREST || filter == PIPE_TEX_MIPFILTER_LINEAR); @@ -335,7 +351,6 @@ util_blit_pixels_writemask(struct blit_state *ctx, return; } - /* Create a temporary texture when src and dest alias or when src * is anything other than a 2d texture. 
* XXX should just use appropriate shader to access 1d / 3d slice / cube face, @@ -347,7 +362,8 @@ util_blit_pixels_writemask(struct blit_state *ctx, dst->face == srcsub.face && dst->level == srcsub.level && dst->zslice == srcZ0) || - src_tex->target != PIPE_TEXTURE_2D) + (src_tex->target != PIPE_TEXTURE_2D && + src_tex->target != PIPE_TEXTURE_RECT)) { struct pipe_resource texTemp; struct pipe_resource *tex; @@ -372,7 +388,7 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* create temp texture */ memset(&texTemp, 0, sizeof(texTemp)); - texTemp.target = PIPE_TEXTURE_2D; + texTemp.target = ctx->internal_target; texTemp.format = src_tex->format; texTemp.last_level = 0; texTemp.width0 = srcW; @@ -392,10 +408,19 @@ util_blit_pixels_writemask(struct blit_state *ctx, src_tex, srcsub, srcLeft, srcTop, srcZ0, /* src */ srcW, srcH); /* size */ - s0 = 0.0f; - s1 = 1.0f; - t0 = 0.0f; - t1 = 1.0f; + normalized = tex->target != PIPE_TEXTURE_RECT; + if(normalized) { + s0 = 0.0f; + s1 = 1.0f; + t0 = 0.0f; + t1 = 1.0f; + } + else { + s0 = 0; + s1 = srcW; + t0 = 0; + t1 = srcH; + } u_sampler_view_default_template(&sv_templ, tex, tex->format); sampler_view = pipe->create_sampler_view(pipe, tex, &sv_templ); @@ -415,20 +440,29 @@ util_blit_pixels_writemask(struct blit_state *ctx, return; } - s0 = srcX0 / (float)(u_minify(sampler_view->texture->width0, srcsub.level)); - s1 = srcX1 / (float)(u_minify(sampler_view->texture->width0, srcsub.level)); - t0 = srcY0 / (float)(u_minify(sampler_view->texture->height0, srcsub.level)); - t1 = srcY1 / (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + s0 = srcX0; + s1 = srcX1; + t0 = srcY0; + t1 = srcY1; + normalized = sampler_view->texture->target != PIPE_TEXTURE_RECT; + if(normalized) + { + s0 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level)); + s1 /= (float)(u_minify(sampler_view->texture->width0, srcsub.level)); + t0 /= (float)(u_minify(sampler_view->texture->height0, srcsub.level)); + t1 /= 
(float)(u_minify(sampler_view->texture->height0, srcsub.level)); + } } + dst_is_depth = util_format_is_depth_or_stencil(dst->format); - assert(screen->is_format_supported(screen, sampler_view->format, PIPE_TEXTURE_2D, + assert(screen->is_format_supported(screen, sampler_view->format, ctx->internal_target, sampler_view->texture->nr_samples, PIPE_BIND_SAMPLER_VIEW, 0)); - assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, + assert(screen->is_format_supported(screen, dst->format, ctx->internal_target, dst->texture->nr_samples, - PIPE_BIND_RENDER_TARGET, 0)); - + dst_is_depth ? PIPE_BIND_DEPTH_STENCIL : + PIPE_BIND_RENDER_TARGET, 0)); /* save state (restored below) */ cso_save_blend(ctx->cso); cso_save_depth_stencil_alpha(ctx->cso); @@ -444,12 +478,15 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* set misc state we care about */ cso_set_blend(ctx->cso, &ctx->blend); - cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_depth_stencil_alpha(ctx->cso, + dst_is_depth ? &ctx->depthstencil_write : + &ctx->depthstencil_keep); cso_set_rasterizer(ctx->cso, &ctx->rasterizer); cso_set_clip(ctx->cso, &ctx->clip); cso_set_vertex_elements(ctx->cso, 2, ctx->velem); /* sampler */ + ctx->sampler.normalized_coords = normalized; ctx->sampler.min_img_filter = filter; ctx->sampler.mag_img_filter = filter; /* we've limited this already with the sampler view but you never know... 
*/ @@ -472,22 +509,35 @@ util_blit_pixels_writemask(struct blit_state *ctx, /* texture */ cso_set_fragment_sampler_views(ctx->cso, 1, &sampler_view); - if (ctx->fs[writemask] == NULL) - ctx->fs[writemask] = - util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D, - TGSI_INTERPOLATE_LINEAR, - writemask); - /* shaders */ - cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]); + if (dst_is_depth) { + if (ctx->fs_depth == NULL) + ctx->fs_depth = + util_make_fragment_tex_shader_writedepth(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs_depth); + } else { + if (ctx->fs[writemask] == NULL) + ctx->fs[writemask] = + util_make_fragment_tex_shader_writemask(pipe, TGSI_TEXTURE_2D, + TGSI_INTERPOLATE_LINEAR, + writemask); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs[writemask]); + } cso_set_vertex_shader_handle(ctx->cso, ctx->vs); /* drawing dest */ memset(&fb, 0, sizeof(fb)); fb.width = dst->width; fb.height = dst->height; - fb.nr_cbufs = 1; - fb.cbufs[0] = dst; + if (dst_is_depth) { + fb.zsbuf = dst; + } else { + fb.nr_cbufs = 1; + fb.cbufs[0] = dst; + } cso_set_framebuffer(ctx->cso, &fb); /* draw quad */ @@ -574,6 +624,7 @@ util_blit_pixels_tex(struct blit_state *ctx, int dstX1, int dstY1, float z, uint filter) { + boolean normalized = src_sampler_view->texture->target != PIPE_TEXTURE_RECT; struct pipe_framebuffer_state fb; float s0, t0, s1, t1; unsigned offset; @@ -586,10 +637,18 @@ util_blit_pixels_tex(struct blit_state *ctx, assert(tex->width0 != 0); assert(tex->height0 != 0); - s0 = srcX0 / (float)tex->width0; - s1 = srcX1 / (float)tex->width0; - t0 = srcY0 / (float)tex->height0; - t1 = srcY1 / (float)tex->height0; + s0 = srcX0; + s1 = srcX1; + t0 = srcY0; + t1 = srcY1; + + if(normalized) + { + s0 /= (float)tex->width0; + s1 /= (float)tex->width0; + t0 /= (float)tex->height0; + t1 /= (float)tex->height0; + } assert(ctx->pipe->screen->is_format_supported(ctx->pipe->screen, dst->format, 
PIPE_TEXTURE_2D, @@ -611,12 +670,13 @@ util_blit_pixels_tex(struct blit_state *ctx, /* set misc state we care about */ cso_set_blend(ctx->cso, &ctx->blend); - cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil_keep); cso_set_rasterizer(ctx->cso, &ctx->rasterizer); cso_set_clip(ctx->cso, &ctx->clip); cso_set_vertex_elements(ctx->cso, 2, ctx->velem); /* sampler */ + ctx->sampler.normalized_coords = normalized; ctx->sampler.min_img_filter = filter; ctx->sampler.mag_img_filter = filter; cso_single_sampler(ctx->cso, 0, &ctx->sampler); diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h index ef95134f324..b8a0dfce13f 100644 --- a/src/gallium/auxiliary/util/u_blit.h +++ b/src/gallium/auxiliary/util/u_blit.h @@ -30,18 +30,20 @@ #define U_BLIT_H +#include "pipe/p_compiler.h" + + #ifdef __cplusplus extern "C" { #endif +struct cso_context; struct pipe_context; -struct pipe_surface; struct pipe_resource; -struct cso_context; - - -struct blit_state; +struct pipe_sampler_view; +struct pipe_subresource; +struct pipe_surface; extern struct blit_state * diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index b5b86b72142..f93ef26ae73 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -92,7 +92,7 @@ struct blitter_context_priv void *velem_state; /* Sampler state for clamping to a miplevel. */ - void *sampler_state[PIPE_MAX_TEXTURE_LEVELS]; + void *sampler_state[PIPE_MAX_TEXTURE_LEVELS * 2]; /* Rasterizer state. 
*/ void *rs_state; @@ -254,6 +254,7 @@ void util_blitter_destroy(struct blitter_context *blitter) ctx->dsa_write_depth_keep_stencil); pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_write_depth_stencil); pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_keep_depth_write_stencil); + pipe->delete_depth_stencil_alpha_state(pipe, ctx->dsa_flush_depth_stencil); pipe->delete_rasterizer_state(pipe, ctx->rs_state); pipe->delete_vs_state(pipe, ctx->vs_col); @@ -271,7 +272,7 @@ void util_blitter_destroy(struct blitter_context *blitter) if (ctx->fs_col[i]) pipe->delete_fs_state(pipe, ctx->fs_col[i]); - for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) + for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS * 2; i++) if (ctx->sampler_state[i]) pipe->delete_sampler_state(pipe, ctx->sampler_state[i]); @@ -319,7 +320,7 @@ static void blitter_restore_CSOs(struct blitter_context_priv *ctx) */ if (ctx->base.saved_fb_state.nr_cbufs != ~0) { pipe->set_framebuffer_state(pipe, &ctx->base.saved_fb_state); - util_assign_framebuffer_state(&ctx->base.saved_fb_state, NULL); + util_unreference_framebuffer_state(&ctx->base.saved_fb_state); ctx->base.saved_fb_state.nr_cbufs = ~0; } @@ -417,16 +418,26 @@ static void blitter_set_clear_color(struct blitter_context_priv *ctx, } } -static void get_normalized_texcoords(struct pipe_resource *src, +static void get_texcoords(struct pipe_resource *src, struct pipe_subresource subsrc, unsigned x1, unsigned y1, unsigned x2, unsigned y2, - float out[4]) + boolean normalized, float out[4]) { - out[0] = x1 / (float)u_minify(src->width0, subsrc.level); - out[1] = y1 / (float)u_minify(src->height0, subsrc.level); - out[2] = x2 / (float)u_minify(src->width0, subsrc.level); - out[3] = y2 / (float)u_minify(src->height0, subsrc.level); + if(normalized) + { + out[0] = x1 / (float)u_minify(src->width0, subsrc.level); + out[1] = y1 / (float)u_minify(src->height0, subsrc.level); + out[2] = x2 / (float)u_minify(src->width0, subsrc.level); + out[3] = y2 / 
(float)u_minify(src->height0, subsrc.level); + } + else + { + out[0] = x1; + out[1] = y1; + out[2] = x2; + out[3] = y2; + } } static void set_texcoords_in_vertices(const float coord[4], @@ -454,7 +465,7 @@ static void blitter_set_texcoords_2d(struct blitter_context_priv *ctx, unsigned i; float coord[4]; - get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord); + get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord); set_texcoords_in_vertices(coord, &ctx->vertices[0][1][0], 8); for (i = 0; i < 4; i++) { @@ -489,7 +500,7 @@ static void blitter_set_texcoords_cube(struct blitter_context_priv *ctx, float coord[4]; float st[4][2]; - get_normalized_texcoords(src, subsrc, x1, y1, x2, y2, coord); + get_texcoords(src, subsrc, x1, y1, x2, y2, TRUE, coord); set_texcoords_in_vertices(coord, &st[0][0], 2); util_map_texcoords2d_onto_cubemap(subsrc.face, @@ -523,7 +534,7 @@ static void blitter_draw_quad(struct blitter_context_priv *ctx) static INLINE void **blitter_get_sampler_state(struct blitter_context_priv *ctx, - int miplevel) + int miplevel, boolean normalized) { struct pipe_context *pipe = ctx->base.pipe; struct pipe_sampler_state *sampler_state = &ctx->template_sampler_state; @@ -531,18 +542,19 @@ void **blitter_get_sampler_state(struct blitter_context_priv *ctx, assert(miplevel < PIPE_MAX_TEXTURE_LEVELS); /* Create the sampler state on-demand. */ - if (!ctx->sampler_state[miplevel]) { + if (!ctx->sampler_state[miplevel * 2 + normalized]) { sampler_state->lod_bias = miplevel; sampler_state->min_lod = miplevel; sampler_state->max_lod = miplevel; + sampler_state->normalized_coords = normalized; - ctx->sampler_state[miplevel] = pipe->create_sampler_state(pipe, + ctx->sampler_state[miplevel * 2 + normalized] = pipe->create_sampler_state(pipe, sampler_state); } /* Return void** so that it can be passed to bind_fragment_sampler_states * directly. 
*/ - return &ctx->sampler_state[miplevel]; + return &ctx->sampler_state[miplevel * 2 + normalized]; } static INLINE @@ -568,6 +580,8 @@ pipe_tex_to_tgsi_tex(enum pipe_texture_target pipe_tex_target) return TGSI_TEXTURE_1D; case PIPE_TEXTURE_2D: return TGSI_TEXTURE_2D; + case PIPE_TEXTURE_RECT: + return TGSI_TEXTURE_RECT; case PIPE_TEXTURE_3D: return TGSI_TEXTURE_3D; case PIPE_TEXTURE_CUBE: @@ -716,6 +730,7 @@ void util_blitter_copy_region(struct blitter_context *blitter, struct pipe_sampler_view viewTempl, *view; unsigned bind; boolean is_stencil, is_depth; + boolean normalized; /* Give up if textures are not set. */ assert(dst && src); @@ -787,6 +802,8 @@ void util_blitter_copy_region(struct blitter_context *blitter, fb_state.zsbuf = 0; } + normalized = src->target != PIPE_TEXTURE_RECT; + /* Initialize sampler view. */ u_sampler_view_default_template(&viewTempl, src, src->format); view = pipe->create_sampler_view(pipe, src, &viewTempl); @@ -795,7 +812,7 @@ void util_blitter_copy_region(struct blitter_context *blitter, pipe->bind_rasterizer_state(pipe, ctx->rs_state); pipe->bind_vs_state(pipe, ctx->vs_tex); pipe->bind_fragment_sampler_states(pipe, 1, - blitter_get_sampler_state(ctx, subsrc.level)); + blitter_get_sampler_state(ctx, subsrc.level, normalized)); pipe->bind_vertex_elements_state(pipe, ctx->velem_state); pipe->set_fragment_sampler_views(pipe, 1, &view); pipe->set_framebuffer_state(pipe, &fb_state); @@ -806,11 +823,12 @@ void util_blitter_copy_region(struct blitter_context *blitter, /* Draw the quad with the draw_rectangle callback. */ case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: { /* Set texture coordinates. */ float coord[4]; - get_normalized_texcoords(src, subsrc, srcx, srcy, - srcx+width, srcy+height, coord); + get_texcoords(src, subsrc, srcx, srcy, + srcx+width, srcy+height, normalized, coord); /* Draw. 
*/ blitter->draw_rectangle(blitter, dstx, dsty, dstx+width, dsty+height, 0, diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h index f316587dea0..e33d2e283f8 100644 --- a/src/gallium/auxiliary/util/u_blitter.h +++ b/src/gallium/auxiliary/util/u_blitter.h @@ -27,6 +27,7 @@ #ifndef U_BLITTER_H #define U_BLITTER_H +#include "util/u_framebuffer.h" #include "util/u_inlines.h" #include "util/u_memory.h" @@ -258,45 +259,12 @@ void util_blitter_save_vertex_shader(struct blitter_context *blitter, blitter->saved_vs = vs; } -/* XXX This should probably be moved elsewhere. */ -static INLINE -void util_assign_framebuffer_state(struct pipe_framebuffer_state *dst, - const struct pipe_framebuffer_state *src) -{ - unsigned i; - - if (src) { - /* Reference all surfaces. */ - for (i = 0; i < src->nr_cbufs; i++) { - pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); - } - for (; i < dst->nr_cbufs; i++) { - pipe_surface_reference(&dst->cbufs[i], NULL); - } - - pipe_surface_reference(&dst->zsbuf, src->zsbuf); - - dst->nr_cbufs = src->nr_cbufs; - dst->width = src->width; - dst->height = src->height; - } else { - /* Set all surfaces to NULL. */ - for (i = 0; i < dst->nr_cbufs; i++) { - pipe_surface_reference(&dst->cbufs[i], NULL); - } - - pipe_surface_reference(&dst->zsbuf, NULL); - - dst->nr_cbufs = 0; - } -} - static INLINE void util_blitter_save_framebuffer(struct blitter_context *blitter, const struct pipe_framebuffer_state *state) { blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. 
*/ - util_assign_framebuffer_state(&blitter->saved_fb_state, state); + util_copy_framebuffer_state(&blitter->saved_fb_state, state); } static INLINE diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c index 50563513072..32519b148b6 100644 --- a/src/gallium/auxiliary/util/u_cpu_detect.c +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -73,7 +73,9 @@ #endif +#ifdef DEBUG DEBUG_GET_ONCE_BOOL_OPTION(dump_cpu, "GALLIUM_DUMP_CPU", FALSE) +#endif struct util_cpu_caps util_cpu_caps; @@ -83,61 +85,6 @@ static int has_cpuid(void); #endif -#if defined(PIPE_ARCH_X86) - -/* The sigill handlers */ -#if defined(PIPE_OS_LINUX) /*&& defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)*/ -static void -sigill_handler_sse(int signal, struct sigcontext sc) -{ - /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1" - * instructions are 3 bytes long. We must increment the instruction - * pointer manually to avoid repeated execution of the offending - * instruction. - * - * If the SIGILL is caused by a divide-by-zero when unmasked - * exceptions aren't supported, the SIMD FPU status and control - * word will be restored at the end of the test, so we don't need - * to worry about doing it here. Besides, we may not be able to... - */ - sc.eip += 3; - - util_cpu_caps.has_sse=0; -} - -static void -sigfpe_handler_sse(int signal, struct sigcontext sc) -{ - if (sc.fpstate->magic != 0xffff) { - /* Our signal context has the extended FPU state, so reset the - * divide-by-zero exception mask and clear the divide-by-zero - * exception bit. - */ - sc.fpstate->mxcsr |= 0x00000200; - sc.fpstate->mxcsr &= 0xfffffffb; - } else { - /* If we ever get here, we're completely hosed. 
- */ - } -} -#endif /* PIPE_OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */ - -#if defined(PIPE_OS_WINDOWS) -static LONG CALLBACK -win32_sig_handler_sse(EXCEPTION_POINTERS* ep) -{ - if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){ - ep->ContextRecord->Eip +=3; - util_cpu_caps.has_sse=0; - return EXCEPTION_CONTINUE_EXECUTION; - } - return EXCEPTION_CONTINUE_SEARCH; -} -#endif /* PIPE_OS_WINDOWS */ - -#endif /* PIPE_ARCH_X86 */ - - #if defined(PIPE_ARCH_PPC) && !defined(PIPE_OS_APPLE) static jmp_buf __lv_powerpc_jmpbuf; static volatile sig_atomic_t __lv_powerpc_canjump = 0; @@ -194,123 +141,8 @@ check_os_altivec_support(void) } #endif /* PIPE_ARCH_PPC */ -/* If we're running on a processor that can do SSE, let's see if we - * are allowed to or not. This will catch 2.4.0 or later kernels that - * haven't been configured for a Pentium III but are running on one, - * and RedHat patched 2.2 kernels that have broken exception handling - * support for user space apps that do SSE. - */ -#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) -static void -check_os_katmai_support(void) -{ -#if defined(PIPE_ARCH_X86) -#if defined(PIPE_OS_FREEBSD) - int has_sse=0, ret; - int len = sizeof (has_sse); - - ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0); - if (ret || !has_sse) - util_cpu_caps.has_sse=0; - -#elif defined(PIPE_OS_NETBSD) || defined(PIPE_OS_OPENBSD) - int has_sse, has_sse2, ret, mib[2]; - int varlen; - - mib[0] = CTL_MACHDEP; - mib[1] = CPU_SSE; - varlen = sizeof (has_sse); - - ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0); - if (ret < 0 || !has_sse) { - util_cpu_caps.has_sse = 0; - } else { - util_cpu_caps.has_sse = 1; - } - - mib[1] = CPU_SSE2; - varlen = sizeof (has_sse2); - ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0); - if (ret < 0 || !has_sse2) { - util_cpu_caps.has_sse2 = 0; - } else { - util_cpu_caps.has_sse2 = 1; - } - util_cpu_caps.has_sse = 0; /* FIXME ?!?!? 
*/ - -#elif defined(PIPE_OS_WINDOWS) - LPTOP_LEVEL_EXCEPTION_FILTER exc_fil; - if (util_cpu_caps.has_sse) { - exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse); -#if defined(PIPE_CC_GCC) - __asm __volatile ("xorps %xmm0, %xmm0"); -#elif defined(PIPE_CC_MSVC) - __asm { - xorps xmm0, xmm0 /* executing SSE instruction */ - } -#else -#error Unsupported compiler -#endif - SetUnhandledExceptionFilter(exc_fil); - } -#elif defined(PIPE_OS_LINUX) - struct sigaction saved_sigill; - struct sigaction saved_sigfpe; - - /* Save the original signal handlers. - */ - sigaction(SIGILL, NULL, &saved_sigill); - sigaction(SIGFPE, NULL, &saved_sigfpe); - - signal(SIGILL, (void (*)(int))sigill_handler_sse); - signal(SIGFPE, (void (*)(int))sigfpe_handler_sse); - - /* Emulate test for OSFXSR in CR4. The OS will set this bit if it - * supports the extended FPU save and restore required for SSE. If - * we execute an SSE instruction on a PIII and get a SIGILL, the OS - * doesn't support Streaming SIMD Exceptions, even if the processor - * does. - */ - if (util_cpu_caps.has_sse) { - __asm __volatile ("xorps %xmm1, %xmm0"); - } - - /* Emulate test for OSXMMEXCPT in CR4. The OS will set this bit if - * it supports unmasked SIMD FPU exceptions. If we unmask the - * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS - * doesn't support unmasked SIMD FPU exceptions. If we get a SIGFPE - * as expected, we're okay but we need to clean up after it. - * - * Are we being too stringent in our requirement that the OS support - * unmasked exceptions? Certain RedHat 2.2 kernels enable SSE by - * setting CR4.OSFXSR but don't support unmasked exceptions. Win98 - * doesn't even support them. We at least know the user-space SSE - * support is good in kernels that do support unmasked exceptions, - * and therefore to be safe I'm going to leave this test in here. - */ - if (util_cpu_caps.has_sse) { - /* test_os_katmai_exception_support(); */ - } - - /* Restore the original signal handlers. 
- */ - sigaction(SIGILL, &saved_sigill, NULL); - sigaction(SIGFPE, &saved_sigfpe, NULL); - -#else - /* We can't use POSIX signal handling to test the availability of - * SSE, so we disable it by default. - */ - util_cpu_caps.has_sse = 0; -#endif /* __linux__ */ -#endif - -#if defined(PIPE_ARCH_X86_64) - util_cpu_caps.has_sse = 1; -#endif -} - +#if defined(PIPE_ARCH_X86) || defined (PIPE_ARCH_X86_64) static int has_cpuid(void) { #if defined(PIPE_ARCH_X86) @@ -469,9 +301,6 @@ util_cpu_detect(void) util_cpu_caps.cacheline = regs2[2] & 0xFF; } - if (util_cpu_caps.has_sse) - check_os_katmai_support(); - if (!util_cpu_caps.has_sse) { util_cpu_caps.has_sse2 = 0; util_cpu_caps.has_sse3 = 0; diff --git a/src/gallium/auxiliary/util/u_debug_describe.c b/src/gallium/auxiliary/util/u_debug_describe.c new file mode 100644 index 00000000000..1c90ff31069 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_describe.c @@ -0,0 +1,81 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <pipe/p_state.h> +#include <util/u_format.h> +#include <util/u_debug_describe.h> +#include <util/u_string.h> + +void +debug_describe_reference(char* buf, const struct pipe_reference*ptr) +{ + strcpy(buf, "pipe_object"); +} + +void +debug_describe_resource(char* buf, const struct pipe_resource *ptr) +{ + switch(ptr->target) + { + case PIPE_BUFFER: + util_sprintf(buf, "pipe_buffer<%u>", (unsigned)util_format_get_stride(ptr->format, ptr->width0)); + break; + case PIPE_TEXTURE_1D: + util_sprintf(buf, "pipe_texture1d<%u,%s,%u>", ptr->width0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_2D: + util_sprintf(buf, "pipe_texture2d<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_RECT: + util_sprintf(buf, "pipe_texture_rect<%u,%u,%s>", ptr->width0, ptr->height0, util_format_short_name(ptr->format)); + break; + case PIPE_TEXTURE_CUBE: + util_sprintf(buf, "pipe_texture_cube<%u,%u,%s,%u>", ptr->width0, ptr->height0, util_format_short_name(ptr->format), ptr->last_level); + break; + case PIPE_TEXTURE_3D: + util_sprintf(buf, "pipe_texture3d<%u,%u,%u,%s,%u>", ptr->width0, ptr->height0, ptr->depth0, util_format_short_name(ptr->format), ptr->last_level); + break; + default: + util_sprintf(buf, "pipe_martian_resource<%u>", ptr->target); + break; + } +} + +void +debug_describe_surface(char* buf, const struct pipe_surface *ptr) +{ + char res[128]; + debug_describe_resource(res, ptr->texture); + util_sprintf(buf, "pipe_surface<%s,%u,%u,%u>", res, ptr->face, ptr->level, ptr->zslice); +} + +void 
+debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr) +{ + char res[128]; + debug_describe_resource(res, ptr->texture); + util_sprintf(buf, "pipe_sampler_view<%s,%s>", res, util_format_short_name(ptr->format)); +} diff --git a/src/gallium/auxiliary/util/u_debug_describe.h b/src/gallium/auxiliary/util/u_debug_describe.h new file mode 100644 index 00000000000..26d1f803bf0 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_describe.h @@ -0,0 +1,49 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef U_DEBUG_DESCRIBE_H_ +#define U_DEBUG_DESCRIBE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_reference; +struct pipe_resource; +struct pipe_surface; +struct pipe_sampler_view; + +/* a 256-byte buffer is necessary and sufficient */ +void debug_describe_reference(char* buf, const struct pipe_reference*ptr); +void debug_describe_resource(char* buf, const struct pipe_resource *ptr); +void debug_describe_surface(char* buf, const struct pipe_surface *ptr); +void debug_describe_sampler_view(char* buf, const struct pipe_sampler_view *ptr); + +#ifdef __cplusplus +} +#endif + +#endif /* U_DEBUG_DESCRIBE_H_ */ diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c new file mode 100644 index 00000000000..40a26c9c697 --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_refcnt.c @@ -0,0 +1,181 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)) + +/* see http://www.mozilla.org/performance/refcnt-balancer.html for what do with the output + * on Linux, use tools/addr2line.sh to postprocess it before anything else + **/ +#include <util/u_debug.h> +#include <util/u_debug_refcnt.h> +#include <util/u_debug_stack.h> +#include <util/u_debug_symbol.h> +#include <util/u_string.h> +#include <util/u_hash_table.h> +#include <os/os_thread.h> +#include <os/os_stream.h> + +int debug_refcnt_state; + +struct os_stream* stream; + +/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */ +static pipe_mutex serials_mutex; +static struct util_hash_table* serials_hash; +static unsigned serials_last; + +static unsigned hash_ptr(void* p) +{ + return (unsigned)(uintptr_t)p; +} + +static int compare_ptr(void* a, void* b) +{ + if(a == b) + return 0; + else if(a < b) + return -1; + else + return 1; +} + +static boolean debug_serial(void* p, unsigned* pserial) +{ + unsigned serial; + boolean found = TRUE; + pipe_mutex_lock(serials_mutex); + if(!serials_hash) + serials_hash = util_hash_table_create(hash_ptr, compare_ptr); + serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p); + if(!serial) + { + /* time to stop logging... 
(you'll have a 100 GB logfile at least at this point) + * TODO: avoid this + */ + serial = ++serials_last; + if(!serial) + { + debug_error("More than 2^32 objects detected, aborting.\n"); + os_abort(); + } + + util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial); + found = FALSE; + } + pipe_mutex_unlock(serials_mutex); + *pserial = serial; + return found; +} + +static void debug_serial_delete(void* p) +{ + pipe_mutex_lock(serials_mutex); + util_hash_table_remove(serials_hash, p); + pipe_mutex_unlock(serials_mutex); +} + +#define STACK_LEN 64 + +static void dump_stack(const char* symbols[STACK_LEN]) +{ + unsigned i; + for(i = 0; i < STACK_LEN; ++i) + { + if(symbols[i]) + os_stream_printf(stream, "%s\n", symbols[i]); + } + os_stream_write(stream, "\n", 1); +} + +void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ + if(debug_refcnt_state < 0) + return; + + if(!debug_refcnt_state) + { + const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); + if(filename && filename[0]) + stream = os_file_stream_create(filename); + + if(stream) + debug_refcnt_state = 1; + else + debug_refcnt_state = -1; + } + + if(debug_refcnt_state > 0) + { + struct debug_stack_frame frames[STACK_LEN]; + const char* symbols[STACK_LEN]; + char buf[1024]; + + unsigned i; + unsigned refcnt = p->count; + unsigned serial; + boolean existing = debug_serial((void*)p, &serial); + + debug_backtrace_capture(frames, 1, STACK_LEN); + for(i = 0; i < STACK_LEN; ++i) + { + if(frames[i].function) + symbols[i] = debug_symbol_name_cached(frames[i].function); + else + symbols[i] = 0; + } + + get_desc(buf, p); + + if(!existing) + { + os_stream_printf(stream, "<%s> %p %u Create\n", buf, p, serial); + dump_stack(symbols); + + /* this is there to provide a gradual change even if we don't see the initialization */ + for(i = 1; i <= refcnt - change; ++i) + { + os_stream_printf(stream, "<%s> %p %u AddRef %u\n", buf, p, serial, i); + 
dump_stack(symbols); + } + } + + if(change) + { + os_stream_printf(stream, "<%s> %p %u %s %u\n", buf, p, serial, change > 0 ? "AddRef" : "Release", refcnt); + dump_stack(symbols); + } + + if(!refcnt) + { + debug_serial_delete((void*)p); + os_stream_printf(stream, "<%s> %p %u Destroy\n", buf, p, serial); + dump_stack(symbols); + } + + os_stream_flush(stream); + } +} +#endif diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h new file mode 100644 index 00000000000..bea2d1c478a --- /dev/null +++ b/src/gallium/auxiliary/util/u_debug_refcnt.h @@ -0,0 +1,63 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef U_DEBUG_REFCNT_H_ +#define U_DEBUG_REFCNT_H_ + +#include <pipe/p_config.h> +#include <pipe/p_state.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*); + +#if defined(DEBUG) && (!defined(PIPE_OS_WINDOWS) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)) + +extern int debug_refcnt_state; + +void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change); + +static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ + if (debug_refcnt_state >= 0) + debug_reference_slowpath(p, get_desc, change); +} + +#else + +static INLINE void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +{ +} + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* U_DEBUG_REFCNT_H_ */ diff --git a/src/gallium/auxiliary/util/u_debug_symbol.c b/src/gallium/auxiliary/util/u_debug_symbol.c index 6e250575d66..332952af88b 100644 --- a/src/gallium/auxiliary/util/u_debug_symbol.c +++ b/src/gallium/auxiliary/util/u_debug_symbol.c @@ -33,9 +33,12 @@ */ #include "pipe/p_compiler.h" +#include "os/os_thread.h" +#include "u_string.h" #include "u_debug.h" #include "u_debug_symbol.h" +#include "u_hash_table.h" #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86) @@ -113,8 +116,8 @@ BOOL WINAPI j_SymGetSymFromAddr(HANDLE hProcess, DWORD Address, PDWORD Displacem } -static INLINE boolean -debug_symbol_print_imagehlp(const void *addr) +static INLINE void +debug_symbol_name_imagehlp(const void *addr, char* buf, unsigned size) { HANDLE hProcess; BYTE symbolBuffer[1024]; @@ -131,25 +134,95 @@ debug_symbol_print_imagehlp(const void *addr) if(j_SymInitialize(hProcess, NULL, TRUE)) bSymInitialized = TRUE; } - + if(!j_SymGetSymFromAddr(hProcess, (DWORD)addr, &dwDisplacement, pSymbol)) - return 
FALSE; + buf[0] = 0; + else + { + strncpy(buf, pSymbol->Name, size); + buf[size - 1] = 0; + } +} +#endif - debug_printf("\t%s\n", pSymbol->Name); +#ifdef __GLIBC__ +#include <execinfo.h> - return TRUE; - +/* This can only provide dynamic symbols, or binary offsets into a file. + * + * To fix this, post-process the output with tools/addr2line.sh + */ +static INLINE void +debug_symbol_name_glibc(const void *addr, char* buf, unsigned size) +{ + char** syms = backtrace_symbols((void**)&addr, 1); + strncpy(buf, syms[0], size); + buf[size - 1] = 0; + free(syms); } #endif - void -debug_symbol_print(const void *addr) +debug_symbol_name(const void *addr, char* buf, unsigned size) { #if defined(PIPE_SUBSYSTEM_WINDOWS_USER) && defined(PIPE_ARCH_X86) - if(debug_symbol_print_imagehlp(addr)) + debug_symbol_name_imagehlp(addr, buf, size); + if(buf[0]) return; #endif - - debug_printf("\t%p\n", addr); + +#ifdef __GLIBC__ + debug_symbol_name_glibc(addr, buf, size); + if(buf[0]) + return; +#endif + + util_snprintf(buf, size, "%p", addr); + buf[size - 1] = 0; +} + +void +debug_symbol_print(const void *addr) +{ + char buf[1024]; + debug_symbol_name(addr, buf, sizeof(buf)); + debug_printf("\t%s\n", buf); +} + +struct util_hash_table* symbols_hash; +pipe_mutex symbols_mutex; + +static unsigned hash_ptr(void* p) +{ + return (unsigned)(uintptr_t)p; +} + +static int compare_ptr(void* a, void* b) +{ + if(a == b) + return 0; + else if(a < b) + return -1; + else + return 1; +} + +const char* +debug_symbol_name_cached(const void *addr) +{ + const char* name; + pipe_mutex_lock(symbols_mutex); + if(!symbols_hash) + symbols_hash = util_hash_table_create(hash_ptr, compare_ptr); + name = util_hash_table_get(symbols_hash, (void*)addr); + if(!name) + { + char buf[1024]; + debug_symbol_name(addr, buf, sizeof(buf)); + name = strdup(buf); + + util_hash_table_set(symbols_hash, (void*)addr, (void*)name); + } + pipe_mutex_unlock(symbols_mutex); + return name; } diff --git 
a/src/gallium/auxiliary/util/u_debug_symbol.h b/src/gallium/auxiliary/util/u_debug_symbol.h index 021586987b6..b247706c2a0 100644 --- a/src/gallium/auxiliary/util/u_debug_symbol.h +++ b/src/gallium/auxiliary/util/u_debug_symbol.h @@ -43,8 +43,13 @@ extern "C" { void -debug_symbol_print(const void *addr); +debug_symbol_name(const void *addr, char* buf, unsigned size); + +const char* +debug_symbol_name_cached(const void *addr); +void +debug_symbol_print(const void *addr); #ifdef __cplusplus } diff --git a/src/gallium/auxiliary/util/u_dirty_surfaces.h b/src/gallium/auxiliary/util/u_dirty_surfaces.h index 99f260bf967..fd1bbe5ffdf 100644 --- a/src/gallium/auxiliary/util/u_dirty_surfaces.h +++ b/src/gallium/auxiliary/util/u_dirty_surfaces.h @@ -1,9 +1,39 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef U_DIRTY_SURFACES_H_ #define U_DIRTY_SURFACES_H_ +#include "pipe/p_state.h" + #include "util/u_double_list.h" #include "util/u_math.h" +struct pipe_context; + typedef void (*util_dirty_surface_flush_t) (struct pipe_context *, struct pipe_surface *); struct util_dirty_surfaces diff --git a/src/gallium/auxiliary/util/u_draw.h b/src/gallium/auxiliary/util/u_draw.h index 2a91ea0f9ae..f06d09ef91d 100644 --- a/src/gallium/auxiliary/util/u_draw.h +++ b/src/gallium/auxiliary/util/u_draw.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" static INLINE void diff --git a/src/gallium/auxiliary/util/u_dynarray.h b/src/gallium/auxiliary/util/u_dynarray.h index 9d1c1713a7c..980cadf22d1 100644 --- a/src/gallium/auxiliary/util/u_dynarray.h +++ b/src/gallium/auxiliary/util/u_dynarray.h @@ -106,6 +106,9 @@ util_dynarray_trim(struct util_dynarray *buf) #define util_dynarray_pop_ptr(buf, type) (type*)((char*)(buf)->data + ((buf)->size -= sizeof(type))) #define util_dynarray_pop(buf, type) *util_dynarray_pop_ptr(buf, type) #define util_dynarray_contains(buf, type) ((buf)->size >= sizeof(type)) +#define util_dynarray_element(buf, type, idx) ((type*)(buf)->data + (idx)) +#define util_dynarray_begin(buf) ((buf)->data) +#define util_dynarray_end(buf) ((void*)util_dynarray_element((buf), char, (buf)->size)) #endif /* U_DYNARRAY_H */ diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c index b7fe2d3003a..6a931a95819 100644 --- a/src/gallium/auxiliary/util/u_gen_mipmap.c +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ 
-1255,6 +1255,7 @@ fallback_gen_mipmap(struct gen_mipmap_state *ctx, make_1d_mipmap(ctx, pt, face, baseLevel, lastLevel); break; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_CUBE: make_2d_mipmap(ctx, pt, face, baseLevel, lastLevel); break; diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index 540305c1465..78473bf35ac 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -33,6 +33,8 @@ #include "pipe/p_state.h" #include "pipe/p_screen.h" #include "util/u_debug.h" +#include "util/u_debug_describe.h" +#include "util/u_debug_refcnt.h" #include "util/u_atomic.h" #include "util/u_box.h" #include "util/u_math.h" @@ -67,7 +69,9 @@ pipe_is_referenced(struct pipe_reference *reference) * \return TRUE if the object's refcount hits zero and should be destroyed. */ static INLINE boolean -pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) +pipe_reference_described(struct pipe_reference *ptr, + struct pipe_reference *reference, + debug_reference_descriptor get_desc) { boolean destroy = FALSE; @@ -76,6 +80,7 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) if (reference) { assert(pipe_is_referenced(reference)); p_atomic_inc(&reference->count); + debug_reference(reference, get_desc, 1); } if (ptr) { @@ -83,41 +88,49 @@ pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) if (p_atomic_dec_zero(&ptr->count)) { destroy = TRUE; } + debug_reference(ptr, get_desc, -1); } } return destroy; } +static INLINE boolean +pipe_reference(struct pipe_reference *ptr, struct pipe_reference *reference) +{ + return pipe_reference_described(ptr, reference, + (debug_reference_descriptor)debug_describe_reference); +} static INLINE void pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf) { struct pipe_surface *old_surf = *ptr; - if (pipe_reference(&(*ptr)->reference, &surf->reference)) + if 
(pipe_reference_described(&(*ptr)->reference, &surf->reference, + (debug_reference_descriptor)debug_describe_surface)) old_surf->texture->screen->tex_surface_destroy(old_surf); *ptr = surf; } - static INLINE void pipe_resource_reference(struct pipe_resource **ptr, struct pipe_resource *tex) { struct pipe_resource *old_tex = *ptr; - if (pipe_reference(&(*ptr)->reference, &tex->reference)) + if (pipe_reference_described(&(*ptr)->reference, &tex->reference, + (debug_reference_descriptor)debug_describe_resource)) old_tex->screen->resource_destroy(old_tex->screen, old_tex); *ptr = tex; } - static INLINE void pipe_sampler_view_reference(struct pipe_sampler_view **ptr, struct pipe_sampler_view *view) { struct pipe_sampler_view *old_view = *ptr; - if (pipe_reference(&(*ptr)->reference, &view->reference)) + if (pipe_reference_described(&(*ptr)->reference, &view->reference, + (debug_reference_descriptor)debug_describe_sampler_view)) old_view->context->sampler_view_destroy(old_view->context, old_view); *ptr = view; } diff --git a/src/gallium/auxiliary/util/u_linkage.c b/src/gallium/auxiliary/util/u_linkage.c new file mode 100644 index 00000000000..2f6f41ba843 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linkage.c @@ -0,0 +1,149 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_debug.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_linkage.h" + +/* we must only record the registers that are actually used, not just declared */ +static INLINE boolean +util_semantic_set_test_and_set(struct util_semantic_set *set, unsigned value) +{ + unsigned mask = 1 << (value % (sizeof(long) * 8)); + unsigned long *p = &set->masks[value / (sizeof(long) * 8)]; + unsigned long v = *p & mask; + *p |= mask; + return !!v; +} + +unsigned +util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file) +{ + struct tgsi_shader_info info; + struct tgsi_parse_context parse; + unsigned count = 0; + ubyte *semantic_name; + ubyte *semantic_index; + + tgsi_scan_shader(tokens, &info); + + if(file == TGSI_FILE_INPUT) + { + semantic_name = info.input_semantic_name; + semantic_index = info.input_semantic_index; + } + else if(file == TGSI_FILE_OUTPUT) + { + semantic_name = info.output_semantic_name; + semantic_index = info.output_semantic_index; + } + else + { + assert(0); + semantic_name = NULL; + semantic_index = NULL; + } + + tgsi_parse_init(&parse, tokens); + + memset(set->masks, 0, sizeof(set->masks)); + while(!tgsi_parse_end_of_tokens(&parse)) + { + tgsi_parse_token(&parse); + + if(parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) + { + 
const struct tgsi_full_instruction *finst = &parse.FullToken.FullInstruction; + unsigned i; + for(i = 0; i < finst->Instruction.NumDstRegs; ++i) + { + if(finst->Dst[i].Register.File == file) + { + unsigned idx = finst->Dst[i].Register.Index; + if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC) + { + if(!util_semantic_set_test_and_set(set, semantic_index[idx])) + ++count; + } + } + } + + for(i = 0; i < finst->Instruction.NumSrcRegs; ++i) + { + if(finst->Src[i].Register.File == file) + { + unsigned idx = finst->Src[i].Register.Index; + if(semantic_name[idx] == TGSI_SEMANTIC_GENERIC) + { + if(!util_semantic_set_test_and_set(set, semantic_index[idx])) + ++count; + } + } + } + } + } + tgsi_parse_free(&parse); + + return count; +} + +#define UTIL_SEMANTIC_SET_FOR_EACH(i, set) for(i = 0; i < 256; ++i) if(set->masks[i / (sizeof(long) * 8)] & (1 << (i % (sizeof(long) * 8)))) + +void +util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots) +{ + int first = -1; + int last = -1; + unsigned i; + + memset(layout, 0xff, num_slots); + + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + { + if(first < 0) + first = i; + last = i; + } + + if(last < efficient_slots) + { + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[i] = i; + } + else if((last - first) < efficient_slots) + { + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[i - first] = i; + } + else + { + unsigned idx = 0; + UTIL_SEMANTIC_SET_FOR_EACH(i, set) + layout[idx++] = i; + } +} diff --git a/src/gallium/auxiliary/util/u_linkage.h b/src/gallium/auxiliary/util/u_linkage.h new file mode 100644 index 00000000000..4720e0ee603 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linkage.h @@ -0,0 +1,66 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), 
to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_LINKAGE_H_ +#define U_LINKAGE_H_ + +#include "pipe/p_compiler.h" +#include "pipe/p_shader_tokens.h" + +struct util_semantic_set +{ + unsigned long masks[256 / 8 / sizeof(unsigned long)]; +}; + +static INLINE bool +util_semantic_set_contains(struct util_semantic_set *set, unsigned char value) +{ + return !!(set->masks[value / (sizeof(long) * 8)] & (1 << (value / (sizeof(long) * 8)))); +} + +unsigned util_semantic_set_from_program_file(struct util_semantic_set *set, const struct tgsi_token *tokens, enum tgsi_file_type file); + +/* efficient_slots is the number of slots such that hardware performance is + * the same for using that amount, with holes, or less slots but with less + * holes. + * + * num_slots is the size of the layout array and hardware limit instead. + * + * efficient_slots == 0 or efficient_solts == num_slots are typical settings. 
+ */ +void util_semantic_layout_from_set(unsigned char *layout, const struct util_semantic_set *set, unsigned efficient_slots, unsigned num_slots); + +static INLINE void +util_semantic_table_from_layout(unsigned char *table, unsigned char *layout, unsigned char first_slot_value, unsigned char num_slots) +{ + int i; + memset(table, 0xff, sizeof(table)); + + for(i = 0; i < num_slots; ++i) + table[layout[i]] = first_slot_value + i; +} + +#endif /* U_LINKAGE_H_ */ diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h index fe19466436a..69a76814945 100644 --- a/src/gallium/auxiliary/util/u_math.h +++ b/src/gallium/auxiliary/util/u_math.h @@ -361,16 +361,6 @@ util_is_inf_or_nan(float x) /** - * Test whether x is a power of two. - */ -static INLINE boolean -util_is_pot(unsigned x) -{ - return (x & (x - 1)) == 0; -} - - -/** * Find first bit set in word. Least significant bit is 1. * Return 0 if no bits set. */ @@ -566,6 +556,9 @@ util_bswap16(uint16_t n) #define MIN3( A, B, C ) MIN2( MIN2( A, B ), C ) #define MAX3( A, B, C ) MAX2( MAX2( A, B ), C ) +#define MIN4( A, B, C, D ) MIN2( MIN2( A, B ), MIN2(C, D) ) +#define MAX4( A, B, C, D ) MAX2( MAX2( A, B ), MAX2(C, D) ) + /** * Align a value, only works pot alignemnts. diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h index 5f113f742b1..aae8b8bdf18 100644 --- a/src/gallium/auxiliary/util/u_pack_color.h +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -42,12 +42,18 @@ #include "util/u_math.h" - +/** + * Helper union for packing pixel values. + * Will often contain values in formats which are too complex to be described + * in simple terms, hence might just effectively contain a number of bytes. + * Must be big enough to hold data for all formats (currently 256 bits). 
+ */ union util_color { ubyte ub; ushort us; uint ui; float f[4]; + double d[4]; }; /** diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c index 9bbcf1c8c49..56fcfac0693 100644 --- a/src/gallium/auxiliary/util/u_rect.c +++ b/src/gallium/auxiliary/util/u_rect.c @@ -32,6 +32,7 @@ #include "util/u_format.h" #include "util/u_rect.h" +#include "util/u_pack_color.h" /** @@ -94,7 +95,7 @@ util_fill_rect(ubyte * dst, unsigned dst_y, unsigned width, unsigned height, - uint32_t value) + union util_color *uc) { unsigned i, j; unsigned width_size; @@ -110,40 +111,54 @@ util_fill_rect(ubyte * dst, dst_y /= blockheight; width = (width + blockwidth - 1)/blockwidth; height = (height + blockheight - 1)/blockheight; - + dst += dst_x * blocksize; dst += dst_y * dst_stride; width_size = width * blocksize; - + switch (blocksize) { case 1: if(dst_stride == width_size) - memset(dst, (ubyte) value, height * width_size); + memset(dst, uc->ub, height * width_size); else { - for (i = 0; i < height; i++) { - memset(dst, (ubyte) value, width_size); - dst += dst_stride; - } + for (i = 0; i < height; i++) { + memset(dst, uc->ub, width_size); + dst += dst_stride; + } } break; case 2: for (i = 0; i < height; i++) { - uint16_t *row = (uint16_t *)dst; - for (j = 0; j < width; j++) - *row++ = (uint16_t) value; - dst += dst_stride; + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = uc->us; + dst += dst_stride; } break; case 4: for (i = 0; i < height; i++) { - uint32_t *row = (uint32_t *)dst; - for (j = 0; j < width; j++) - *row++ = value; - dst += dst_stride; + uint32_t *row = (uint32_t *)dst; + for (j = 0; j < width; j++) + *row++ = uc->ui; + dst += dst_stride; + } + break; + case 8: + case 12: + case 16: + case 24: + case 32: + for (i = 0; i < height; i++) { + ubyte *row = dst; + for (j = 0; j < width; j++) { + memcpy(row, uc, blocksize); + row += blocksize; + } + dst += dst_stride; } break; default: - assert(0); - break; + assert(0); + 
break; } } diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h index 40d57e662d7..4cb90d3c316 100644 --- a/src/gallium/auxiliary/util/u_rect.h +++ b/src/gallium/auxiliary/util/u_rect.h @@ -26,17 +26,67 @@ **************************************************************************/ -/** - * Pipe copy/fill rect helpers. +#ifndef U_RECT_H +#define U_RECT_H + +#include "pipe/p_compiler.h" + +struct u_rect { + int x0, x1; + int y0, y1; +}; + +/* Do two rectangles intersect? */ +static INLINE boolean +u_rect_test_intersection(const struct u_rect *a, + const struct u_rect *b) +{ + return (!(a->x1 < b->x0 || + b->x1 < a->x0 || + a->y1 < b->y0 || + b->y1 < a->y0)); +} +/* Find the intersection of two rectangles known to intersect. + */ +static INLINE void +u_rect_find_intersection(const struct u_rect *a, + struct u_rect *b) +{ + /* Caller should verify intersection exists before calling. + */ + if (b->x0 < a->x0) b->x0 = a->x0; + if (b->x1 > a->x1) b->x1 = a->x1; + if (b->y0 < a->y0) b->y0 = a->y0; + if (b->y1 > a->y1) b->y1 = a->y1; +} -#ifndef U_RECT_H -#define U_RECT_H +static INLINE void +u_rect_possible_intersection(const struct u_rect *a, + struct u_rect *b) +{ + if (u_rect_test_intersection(a,b)) { + u_rect_find_intersection(a,b); + } + else { + b->x0 = b->x1 = b->y0 = b->y1 = 0; + } +} #include "pipe/p_format.h" +#include "util/u_pack_color.h" + + + +/********************************************************************** + * Pipe copy/fill rect helpers. 
+ */ +/* These really should move to a different file: + */ +#include "pipe/p_format.h" extern void util_copy_rect(ubyte * dst, enum pipe_format format, @@ -47,7 +97,7 @@ util_copy_rect(ubyte * dst, enum pipe_format format, extern void util_fill_rect(ubyte * dst, enum pipe_format format, unsigned dst_stride, unsigned dst_x, unsigned dst_y, - unsigned width, unsigned height, uint32_t value); + unsigned width, unsigned height, union util_color *uc); #endif /* U_RECT_H */ diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c index 5b682f496cb..58ef68377fc 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -37,6 +37,7 @@ #include "pipe/p_context.h" #include "pipe/p_shader_tokens.h" +#include "pipe/p_state.h" #include "util/u_simple_shaders.h" #include "util/u_debug.h" #include "tgsi/tgsi_ureg.h" diff --git a/src/gallium/auxiliary/util/u_split_prim.h b/src/gallium/auxiliary/util/u_split_prim.h index 206e1ec3118..7f80fc12700 100644 --- a/src/gallium/auxiliary/util/u_split_prim.h +++ b/src/gallium/auxiliary/util/u_split_prim.h @@ -1,5 +1,12 @@ /* Originally written by Ben Skeggs for the nv50 driver*/ -#include <pipe/p_defines.h> + +#ifndef U_SPLIT_PRIM_H +#define U_SPLIT_PRIM_H + +#include "pipe/p_defines.h" +#include "pipe/p_compiler.h" + +#include "util/u_debug.h" struct util_split_prim { void *priv; @@ -48,7 +55,7 @@ util_split_prim_next(struct util_split_prim *s, unsigned max_verts) } } - if (s->p_start + s->close_first + max_verts >= s->p_end) { + if ((s->p_end - s->p_start) + s->close_first <= max_verts) { s->emit(s->priv, s->p_start, s->p_end - s->p_start); if (s->close_first) s->emit(s->priv, s->start, 1); @@ -103,3 +110,5 @@ util_split_prim_next(struct util_split_prim *s, unsigned max_verts) s->p_start += (max_verts - repeat); return FALSE; } + +#endif /* U_SPLIT_PRIM_H */ diff --git a/src/gallium/auxiliary/util/u_staging.c 
b/src/gallium/auxiliary/util/u_staging.c index 607c31f5ee7..c5d68f8df86 100644 --- a/src/gallium/auxiliary/util/u_staging.c +++ b/src/gallium/auxiliary/util/u_staging.c @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + #include "util/u_staging.h" #include "pipe/p_context.h" #include "util/u_memory.h" @@ -8,7 +34,7 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne { memset(template, 0, sizeof(struct pipe_resource)); if(pt->target != PIPE_BUFFER && depth <= 1) - template->target = PIPE_TEXTURE_2D; + template->target = PIPE_TEXTURE_RECT; else template->target = pt->target; template->format = pt->format; @@ -23,20 +49,16 @@ util_staging_resource_template(struct pipe_resource *pt, unsigned width, unsigne } struct util_staging_transfer * -util_staging_transfer_new(struct pipe_context *pipe, +util_staging_transfer_init(struct pipe_context *pipe, struct pipe_resource *pt, struct pipe_subresource sr, unsigned usage, const struct pipe_box *box, - bool direct) + bool direct, struct util_staging_transfer *tx) { struct pipe_screen *pscreen = pipe->screen; - struct util_staging_transfer *tx; - struct pipe_resource staging_resource_template; - tx = CALLOC_STRUCT(util_staging_transfer); - if (!tx) - return NULL; + struct pipe_resource staging_resource_template; pipe_resource_reference(&tx->base.resource, pt); tx->base.sr = sr; diff --git a/src/gallium/auxiliary/util/u_staging.h b/src/gallium/auxiliary/util/u_staging.h index 602faa2971d..1aab78cc881 100644 --- a/src/gallium/auxiliary/util/u_staging.h +++ b/src/gallium/auxiliary/util/u_staging.h @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished 
to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + /* Direct3D 10/11 has no concept of transfers. Applications instead * create resources with a STAGING or DYNAMIC usage, copy between them * and the real resource and use Map to map the STAGING/DYNAMIC resource. 
@@ -21,15 +47,15 @@ struct util_staging_transfer { }; /* user must be stride, slice_stride and offset */ -/* pt->usage == PIPE_USAGE_DYNAMIC should be a good value to pass for direct */ -/* staging resource is currently created with PIPE_USAGE_DYNAMIC */ +/* pt->usage == PIPE_USAGE_DYNAMIC || pt->usage == PIPE_USAGE_STAGING should be a good value to pass for direct */ +/* staging resource is currently created with PIPE_USAGE_STAGING */ struct util_staging_transfer * -util_staging_transfer_new(struct pipe_context *pipe, +util_staging_transfer_init(struct pipe_context *pipe, struct pipe_resource *pt, struct pipe_subresource sr, unsigned usage, const struct pipe_box *box, - bool direct); + bool direct, struct util_staging_transfer *tx); void util_staging_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx); diff --git a/src/gallium/auxiliary/util/u_surface.c b/src/gallium/auxiliary/util/u_surface.c index cab7691c705..af99163b2ed 100644 --- a/src/gallium/auxiliary/util/u_surface.c +++ b/src/gallium/auxiliary/util/u_surface.c @@ -216,7 +216,7 @@ util_clear_render_target(struct pipe_context *pipe, assert(dst->texture); if (!dst->texture) return; - util_pack_color(rgba, dst->texture->format, &uc); + dst_trans = pipe_get_transfer(pipe, dst->texture, dst->face, @@ -232,46 +232,10 @@ util_clear_render_target(struct pipe_context *pipe, if (dst_map) { assert(dst_trans->stride > 0); - switch (util_format_get_blocksize(dst->texture->format)) { - case 1: - case 2: - case 4: - util_pack_color(rgba, dst->texture->format, &uc); - util_fill_rect(dst_map, dst->texture->format, - dst_trans->stride, - 0, 0, width, height, uc.ui); - break; - case 8: - { - /* expand the 4-byte clear value to an 8-byte value */ - /* should probably not convert back from ubyte but not - sure what this code really achieved since it doesn't even - check for format type... 
*/ - ushort *row = (ushort *) dst_map; - ushort val0 = UBYTE_TO_USHORT((uc.ui >> 0) & 0xff); - ushort val1 = UBYTE_TO_USHORT((uc.ui >> 8) & 0xff); - ushort val2 = UBYTE_TO_USHORT((uc.ui >> 16) & 0xff); - ushort val3 = UBYTE_TO_USHORT((uc.ui >> 24) & 0xff); - unsigned i, j; - val0 = (val0 << 8) | val0; - val1 = (val1 << 8) | val1; - val2 = (val2 << 8) | val2; - val3 = (val3 << 8) | val3; - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - row[j*4+0] = val0; - row[j*4+1] = val1; - row[j*4+2] = val2; - row[j*4+3] = val3; - } - row += dst_trans->stride/2; - } - } - break; - default: - assert(0); - break; - } + util_pack_color(rgba, dst->texture->format, &uc); + util_fill_rect(dst_map, dst->texture->format, + dst_trans->stride, + 0, 0, width, height, &uc); } pipe->transfer_unmap(pipe, dst_trans); diff --git a/src/gallium/auxiliary/util/u_surfaces.c b/src/gallium/auxiliary/util/u_surfaces.c index 7733ad24d0d..404e1219952 100644 --- a/src/gallium/auxiliary/util/u_surfaces.c +++ b/src/gallium/auxiliary/util/u_surfaces.c @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #include "u_surfaces.h" #include "util/u_hash_table.h" #include "util/u_inlines.h" diff --git a/src/gallium/auxiliary/util/u_surfaces.h b/src/gallium/auxiliary/util/u_surfaces.h index af978c70579..17d8a5d3a5b 100644 --- a/src/gallium/auxiliary/util/u_surfaces.h +++ b/src/gallium/auxiliary/util/u_surfaces.h @@ -1,3 +1,29 @@ +/************************************************************************** + * + * Copyright 2010 Luca Barbieri + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef U_SURFACES_H_ #define U_SURFACES_H_ @@ -22,7 +48,7 @@ struct pipe_surface *util_surfaces_do_get(struct util_surfaces *us, unsigned sur static INLINE struct pipe_surface * util_surfaces_get(struct util_surfaces *us, unsigned surface_struct_size, struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags) { - if(likely(pt->target == PIPE_TEXTURE_2D && us->u.array)) + if(likely((pt->target == PIPE_TEXTURE_2D || pt->target == PIPE_TEXTURE_RECT) && us->u.array)) { struct pipe_surface *ps = us->u.array[level]; if(ps) @@ -52,7 +78,7 @@ void util_surfaces_do_detach(struct util_surfaces *us, struct pipe_surface *ps); static INLINE void util_surfaces_detach(struct util_surfaces *us, struct pipe_surface *ps) { - if(likely(ps->texture->target == PIPE_TEXTURE_2D)) + if(likely(ps->texture->target == PIPE_TEXTURE_2D || ps->texture->target == PIPE_TEXTURE_RECT)) { us->u.array[ps->level] = 0; return; diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h index 986eee07435..558351d0ce5 100644 --- a/src/gallium/auxiliary/util/u_tile.h +++ b/src/gallium/auxiliary/util/u_tile.h @@ -29,7 +29,10 @@ #define P_TILE_H #include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +struct pipe_context; struct pipe_transfer; /** diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h index eb07945d15f..e3a38730f21 100644 --- a/src/gallium/auxiliary/util/u_transfer.h +++ b/src/gallium/auxiliary/util/u_transfer.h @@ -8,6 +8,7 @@ #include "pipe/p_state.h" struct pipe_context; 
+struct winsys_handle; boolean u_default_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *resource, diff --git a/src/gallium/auxiliary/util/u_upload_mgr.h b/src/gallium/auxiliary/util/u_upload_mgr.h index a124924fc80..de016df02e0 100644 --- a/src/gallium/auxiliary/util/u_upload_mgr.h +++ b/src/gallium/auxiliary/util/u_upload_mgr.h @@ -32,11 +32,8 @@ #ifndef U_UPLOAD_MGR_H #define U_UPLOAD_MGR_H -#include "pipe/p_defines.h" - -struct pipe_screen; +struct pipe_context; struct pipe_resource; -struct u_upload_mgr; struct u_upload_mgr *u_upload_create( struct pipe_context *pipe, diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst index f241411a002..8250c30f2ab 100644 --- a/src/gallium/docs/source/context.rst +++ b/src/gallium/docs/source/context.rst @@ -63,7 +63,9 @@ objects. They all follow simple, one-method binding calls, e.g. * ``set_scissor_state`` sets the bounds for the scissor test, which culls pixels before blending to render targets. If the :ref:`Rasterizer` does not have the scissor test enabled, then the scissor bounds never need to - be set since they will not be used. + be set since they will not be used. Note that scissor xmin and ymin are + inclusive, but xmax and ymax are exclusive. The inclusive ranges in x + and y would be [xmin..xmax-1] and [ymin..ymax-1]. * ``set_viewport_state`` diff --git a/src/gallium/docs/source/debugging.rst b/src/gallium/docs/source/debugging.rst index 42bda5aee93..e081cbf74e1 100644 --- a/src/gallium/docs/source/debugging.rst +++ b/src/gallium/docs/source/debugging.rst @@ -21,6 +21,10 @@ This option controls if the debug variables should be printed to stderr. This is probably the most useful variable, since it allows you to find which variables a driver uses. +.. envvar:: GALLIUM_GALAHAD <bool> (false) + +Controls if the :ref:`galahad` sanity checker module should be used. + .. envvar:: GALLIUM_RBUG <bool> (false) Controls if the :ref:`rbug` should be used. 
diff --git a/src/gallium/docs/source/distro.rst b/src/gallium/docs/source/distro.rst index 70d75b51e65..08c8eab890a 100644 --- a/src/gallium/docs/source/distro.rst +++ b/src/gallium/docs/source/distro.rst @@ -79,6 +79,15 @@ Rbug Wrapper driver. :ref:`rbug` driver used with stand alone rbug-gui. +.. _galahad: + +Galahad +^^^^^^^ + +Wrapper driver. Sanity checker for the internal gallium state. Normally +a driver should n't have to sanity check the input it gets from a state +tracker. Any wrong state received should be perceived as a state tracker bug. + State Trackers -------------- diff --git a/src/gallium/docs/source/index.rst b/src/gallium/docs/source/index.rst index 6c19842dac4..2a73e3ab59d 100644 --- a/src/gallium/docs/source/index.rst +++ b/src/gallium/docs/source/index.rst @@ -15,6 +15,7 @@ Contents: debugging tgsi screen + resources context cso distro diff --git a/src/gallium/docs/source/resources.rst b/src/gallium/docs/source/resources.rst new file mode 100644 index 00000000000..c8a5766821b --- /dev/null +++ b/src/gallium/docs/source/resources.rst @@ -0,0 +1,195 @@ +Resources and derived objects +============================= + +Resources represent objects that hold data: textures and buffers. + +They are mostly modelled after the resources in Direct3D 10/11, but with a +different transfer/update mechanism, and more features for OpenGL support. + +Resources can be used in several ways, and it is required to specify all planned uses through an appropriate set of bind flags. + +TODO: write much more on resources + +Transfers +--------- + +Transfers are the mechanism used to access resources with the CPU. + +OpenGL: OpenGL supports mapping buffers and has inline transfer functions for both buffers and textures + +D3D11: D3D11 lacks transfers, but has special resource types that are mappable to the CPU address space + +TODO: write much more on transfers + +Resource targets +---------------- + +Resource targets determine the type of a resource. 
+ +Note that drivers may not actually have the restrictions listed regarding +coordinate normalization and wrap modes, and in fact efficient OpenCL +support will probably require drivers that don't have any of them, which +will probably be advertised with an appropriate cap. + +TODO: document all targets. Note that both 3D and cube have restrictions +that depend on the hardware generation. + +TODO: can buffers have a non-R8 format? + +PIPE_BUFFER +^^^^^^^^^^^ + +Buffer resource: can be used as a vertex, index, constant buffer (appropriate bind flags must be requested). + +They can be bound to stream output if supported. +TODO: what about the restrictions lifted by the several later GL transform feedback extensions? How does one advertise that in Gallium? + +They can be also be bound to a shader stage as usual. +TODO: are all drivers supposed to support this? how does this work exactly? are there size limits? + +They can be also be bound to the framebuffer as usual. +TODO: are all drivers supposed to support this? how does this work exactly? are there size limits? +TODO: is there any chance of supporting GL pixel buffer object acceleration with this? + +- depth0 must be 1 +- last_level must be 0 +- TODO: what about normalization? +- TODO: wrap modes/other sampling state? +- TODO: are arbitrary formats supported? in which cases? + +OpenGL: vertex buffers in GL 1.5 or GL_ARB_vertex_buffer_object + +- Binding to stream out requires GL 3.0 or GL_NV_transform_feedback +- Binding as constant buffers requires GL 3.1 or GL_ARB_uniform_buffer_object +- Binding to a sampling stage requires GL 3.1 or GL_ARB_texture_buffer_object +- TODO: can they be bound to an FBO? + +D3D11: buffer resources +- Binding to a render target requires D3D_FEATURE_LEVEL_10_0 + +PIPE_TEXTURE_1D +^^^^^^^^^^^^^^^ +1D surface accessed with normalized coordinates. 
+ +UNIMPLEMENTED: 1D texture arrays not supported + +- If PIPE_CAP_NPOT_TEXTURES is not supported, + width must be a power of two +- height0 must be 1 +- depth0 must be 1 +- Mipmaps can be used +- Must use normalized coordinates + +OpenGL: GL_TEXTURE_1D in GL 1.0 + +- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two + +D3D11: 1D textures in D3D_FEATURE_LEVEL_10_0 + +PIPE_TEXTURE_RECT +^^^^^^^^^^^^^^^^^ +2D surface with OpenGL GL_TEXTURE_RECTANGLE semantics. + +- depth0 must be 1 +- last_level must be 0 +- Must use unnormalized coordinates +- Must use a clamp wrap mode + +OpenGL: GL_TEXTURE_RECTANGLE in GL 3.1 or GL_ARB_texture_rectangle or GL_NV_texture_rectangle + +OpenCL: can create OpenCL images based on this, that can then be sampled arbitrarily + +D3D11: not supported (only PIPE_TEXTURE_2D with normalized coordinates is supported) + +PIPE_TEXTURE_2D +^^^^^^^^^^^^^^^ +2D surface accessed with normalized coordinates. + +UNIMPLEMENTED: 2D texture arrays not supported + +- If PIPE_CAP_NPOT_TEXTURES is not supported, + width and height must be powers of two +- depth0 must be 1 +- Mipmaps can be used +- Must use normalized coordinates +- No special restrictions on wrap modes + +OpenGL: GL_TEXTURE_2D in GL 1.0 + +- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two + +OpenCL: can create OpenCL images based on this, that can then be sampled arbitrarily + +D3D11: 2D textures + +- PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_9_3 + +PIPE_TEXTURE_3D +^^^^^^^^^^^^^^^ + +3-dimensional array of texels. +Mipmap dimensions are reduced in all 3 coordinates. 
+ +- If PIPE_CAP_NPOT_TEXTURES is not supported, + width, height and depth must be powers of two +- Must use normalized coordinates + +OpenGL: GL_TEXTURE_3D in GL 1.2 or GL_EXT_texture3D + +- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two + +D3D11: 3D textures + +- PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_10_0 + +PIPE_TEXTURE_CUBE +^^^^^^^^^^^^^^^^^ + +Cube maps consist of 6 2D faces. +The 6 surfaces form an imaginary cube, and sampling happens by mapping an +input 3-vector to the point of the cube surface in that direction. + +Sampling may be optionally seamless, resulting in filtering taking samples +from multiple surfaces near to the edge. +UNIMPLEMENTED: seamless cube map sampling not supported + +UNIMPLEMENTED: cube map arrays not supported + +- Width and height must be equal +- If PIPE_CAP_NPOT_TEXTURES is not supported, + width and height must be powers of two +- Must use normalized coordinates + +OpenGL: GL_TEXTURE_CUBE_MAP in GL 1.3 or EXT_texture_cube_map + +- PIPE_CAP_NPOT_TEXTURES is equivalent to GL 2.0 or GL_ARB_texture_non_power_of_two +- Seamless cube maps require GL 3.2 or GL_ARB_seamless_cube_map or GL_AMD_seamless_cubemap_per_texture +- Cube map arrays require GL 4.0 or GL_ARB_texture_cube_map_array + +D3D11: 2D array textures with the D3D11_RESOURCE_MISC_TEXTURECUBE flag + +- PIPE_CAP_NPOT_TEXTURES is equivalent to D3D_FEATURE_LEVEL_10_0 +- Cube map arrays require D3D_FEATURE_LEVEL_10_1 +- TODO: are (non)seamless cube maps supported in D3D11? how? + +Surfaces +-------- + +Surfaces are views of a resource that can be bound as a framebuffer to serve as the render target or depth buffer. + +TODO: write much more on surfaces + +OpenGL: FBOs are collections of surfaces in GL 3.0 or GL_ARB_framebuffer_object + +D3D11: render target views and depth/stencil views + +Sampler views +------------- + +Sampler views are views of a resource that can be bound to a pipeline stage to be sampled from shaders. 
+ +TODO: write much more on sampler views + +OpenGL: texture objects are actually sampler view and resource in a single unit + +D3D11: shader resource views diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c index 4adef5b8c07..a367fa3fe15 100644 --- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c @@ -78,20 +78,13 @@ cell_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) draw_set_mapped_vertex_buffer(draw, i, buf); } /* Map index buffer, if present */ - if (info->indexed && cell->index_buffer.buffer) { + if (info->indexed && cell->index_buffer.buffer) mapped_indices = cell_resource(cell->index_buffer.buffer)->data; - mapped_indices += cell->index_buffer.offset; - } - draw_set_mapped_element_buffer_range(draw, (mapped_indices) ? - lp->index_buffer.index_size : 0, - info->index_bias, - info->min_index, - info->max_index, - mapped_indices); + draw_set_mapped_index_buffer(draw, mapped_indices); /* draw! 
*/ - draw_arrays(draw, info->mode, info->start, info->count); + draw_vbo(draw, info); /* * unmap vertex/index buffers - will cause draw module to flush @@ -100,7 +93,7 @@ cell_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) draw_set_mapped_vertex_buffer(draw, i, NULL); } if (mapped_indices) { - draw_set_mapped_element_buffer(draw, 0, 0, NULL); + draw_set_mapped_index_buffer(draw, NULL); } /* diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c index 4e3701cd0ac..a065d68b5a6 100644 --- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c +++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c @@ -102,7 +102,7 @@ cell_set_index_buffer(struct pipe_context *pipe, else memset(&cell->index_buffer, 0, sizeof(cell->index_buffer)); - /* TODO make this more like a state */ + draw_set_index_buffer(cell->draw, ib); } diff --git a/src/gallium/drivers/galahad/glhd_context.c b/src/gallium/drivers/galahad/glhd_context.c index fe14a287efb..383c4489261 100644 --- a/src/gallium/drivers/galahad/glhd_context.c +++ b/src/gallium/drivers/galahad/glhd_context.c @@ -185,6 +185,12 @@ galahad_bind_fragment_sampler_states(struct pipe_context *_pipe, struct galahad_context *glhd_pipe = galahad_context(_pipe); struct pipe_context *pipe = glhd_pipe->pipe; + if (num_samplers > PIPE_MAX_SAMPLERS) { + glhd_error("%u fragment samplers requested, " + "but only %u are permitted by API", + num_samplers, PIPE_MAX_SAMPLERS); + } + pipe->bind_fragment_sampler_states(pipe, num_samplers, samplers); @@ -198,6 +204,12 @@ galahad_bind_vertex_sampler_states(struct pipe_context *_pipe, struct galahad_context *glhd_pipe = galahad_context(_pipe); struct pipe_context *pipe = glhd_pipe->pipe; + if (num_samplers > PIPE_MAX_VERTEX_SAMPLERS) { + glhd_error("%u vertex samplers requested, " + "but only %u are permitted by API", + num_samplers, PIPE_MAX_VERTEX_SAMPLERS); + } + pipe->bind_vertex_sampler_states(pipe, num_samplers, samplers); @@ 
-447,6 +459,19 @@ galahad_set_constant_buffer(struct pipe_context *_pipe, struct pipe_resource *unwrapped_resource; struct pipe_resource *resource = NULL; + if (shader >= PIPE_SHADER_TYPES) { + glhd_error("Unknown shader type %u", shader); + } + + if (index && + index >= + pipe->screen->get_param(pipe->screen, PIPE_CAP_MAX_CONST_BUFFERS)) { + glhd_error("Access to constant buffer %u requested, " + "but only %d are supported", + index, + pipe->screen->get_param(pipe->screen, PIPE_CAP_MAX_CONST_BUFFERS)); + } + /* XXX hmm? unwrap the input state */ if (_resource) { unwrapped_resource = galahad_resource_unwrap(_resource); @@ -972,5 +997,7 @@ galahad_context_create(struct pipe_screen *_screen, struct pipe_context *pipe) glhd_pipe->pipe = pipe; + glhd_warn("Created context %p", glhd_pipe); + return &glhd_pipe->base; } diff --git a/src/gallium/drivers/galahad/glhd_screen.c b/src/gallium/drivers/galahad/glhd_screen.c index 4117485702e..75e4c2d82e9 100644 --- a/src/gallium/drivers/galahad/glhd_screen.c +++ b/src/gallium/drivers/galahad/glhd_screen.c @@ -30,6 +30,7 @@ #include "pipe/p_screen.h" #include "pipe/p_state.h" #include "util/u_memory.h" +#include "util/u_math.h" #include "glhd_public.h" #include "glhd_screen.h" @@ -134,6 +135,33 @@ galahad_screen_resource_create(struct pipe_screen *_screen, struct pipe_screen *screen = glhd_screen->screen; struct pipe_resource *result; + if (templat->target >= PIPE_MAX_TEXTURE_TYPES) + glhd_warn("Received bogus resource target %d", templat->target); + + if(templat->target != PIPE_TEXTURE_RECT && templat->target != PIPE_BUFFER && !screen->get_param(screen, PIPE_CAP_NPOT_TEXTURES)) + { + if(!util_is_power_of_two(templat->width0) || !util_is_power_of_two(templat->height0)) + glhd_warn("Requested NPOT (%ux%u) non-rectangle texture without NPOT support", templat->width0, templat->height0); + } + + if(templat->target == PIPE_TEXTURE_RECT && templat->last_level) + glhd_warn("Rectangle textures cannot have mipmaps, but last_level = %u", 
templat->last_level); + + if(templat->target == PIPE_BUFFER && templat->last_level) + glhd_warn("Buffers cannot have mipmaps, but last_level = %u", templat->last_level); + + if(templat->target != PIPE_TEXTURE_3D && templat->depth0 != 1) + glhd_warn("Only 3D textures can have depth != 1, but received target %u and depth %u", templat->target, templat->depth0); + + if(templat->target == PIPE_TEXTURE_1D && templat->height0 != 1) + glhd_warn("1D textures must have height 1 but got asked for height %u", templat->height0); + + if(templat->target == PIPE_BUFFER && templat->height0 != 1) + glhd_warn("Buffers must have height 1 but got asked for height %u", templat->height0); + + if(templat->target == PIPE_TEXTURE_CUBE && templat->width0 != templat->height0) + glhd_warn("Cube maps must be square, but got asked for %ux%u", templat->width0, templat->height0); + result = screen->resource_create(screen, templat); @@ -330,5 +358,7 @@ galahad_screen_create(struct pipe_screen *screen) glhd_screen->screen = screen; + glhd_warn("Created screen %p", glhd_screen); + return &glhd_screen->base; } diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c index 2beb9e3091f..847dd6dd47e 100644 --- a/src/gallium/drivers/i915/i915_context.c +++ b/src/gallium/drivers/i915/i915_context.c @@ -66,18 +66,9 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) /* * Map index buffer, if present */ - if (info->indexed && i915->index_buffer.buffer) { - char *indices = (char *) i915_buffer(i915->index_buffer.buffer)->data; - mapped_indices = (void *) (indices + i915->index_buffer.offset); - } - - draw_set_mapped_element_buffer_range(draw, (mapped_indices) ? 
- i915->index_buffer.index_size : 0, - info->index_bias, - info->min_index, - info->max_index, - mapped_indices); - + if (info->indexed && i915->index_buffer.buffer) + mapped_indices = i915_buffer(i915->index_buffer.buffer)->data; + draw_set_mapped_index_buffer(draw, mapped_indices); draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, i915->current.constants[PIPE_SHADER_VERTEX], @@ -87,7 +78,7 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) /* * Do the drawing */ - draw_arrays(i915->draw, info->mode, info->start, info->count); + draw_vbo(i915->draw, info); /* * unmap vertex/index buffers @@ -96,9 +87,8 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) draw_set_mapped_vertex_buffer(draw, i, NULL); } - if (mapped_indices) { - draw_set_mapped_element_buffer(draw, 0, 0, NULL); - } + if (mapped_indices) + draw_set_mapped_index_buffer(draw, NULL); } diff --git a/src/gallium/drivers/i915/i915_resource_texture.c b/src/gallium/drivers/i915/i915_resource_texture.c index 752ddaae7b1..c5c6179b169 100644 --- a/src/gallium/drivers/i915/i915_resource_texture.c +++ b/src/gallium/drivers/i915/i915_resource_texture.c @@ -360,6 +360,7 @@ i915_texture_layout(struct i915_texture * tex) switch (pt->target) { case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: if (!i9x5_special_layout(tex)) i915_texture_layout_2d(tex); break; @@ -605,6 +606,7 @@ i945_texture_layout(struct i915_texture * tex) switch (pt->target) { case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: if (!i9x5_special_layout(tex)) i945_texture_layout_2d(tex); break; @@ -829,7 +831,8 @@ i915_texture_from_handle(struct pipe_screen * screen, buffer = iws->buffer_from_handle(iws, whandle, &stride); /* Only supports one type */ - if (template->target != PIPE_TEXTURE_2D || + if ((template->target != PIPE_TEXTURE_2D && + template->target != PIPE_TEXTURE_RECT) || template->last_level != 0 || template->depth0 != 1) { return NULL; 
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c index 385c3b2d2d3..bbfcff6bc4d 100644 --- a/src/gallium/drivers/i915/i915_state.c +++ b/src/gallium/drivers/i915/i915_state.c @@ -294,8 +294,6 @@ static void i915_bind_sampler_states(struct pipe_context *pipe, struct i915_context *i915 = i915_context(pipe); unsigned i; - assert(num <= PIPE_MAX_SAMPLERS); - /* Check for no-op */ if (num == i915->num_samplers && !memcmp(i915->sampler, sampler, num * sizeof(void *))) @@ -529,9 +527,6 @@ static void i915_set_constant_buffer(struct pipe_context *pipe, struct i915_context *i915 = i915_context(pipe); draw_flush(i915->draw); - assert(shader < PIPE_SHADER_TYPES); - assert(index == 0); - /* Make a copy of shader constants. * During fragment program translation we may add additional * constants to the array. @@ -822,7 +817,8 @@ static void i915_set_index_buffer(struct pipe_context *pipe, else memset(&i915->index_buffer, 0, sizeof(i915->index_buffer)); - /* TODO make this more like a state */ + /* pass-through to draw module */ + draw_set_index_buffer(i915->draw, ib); } static void diff --git a/src/gallium/drivers/i965/brw_batchbuffer.c b/src/gallium/drivers/i965/brw_batchbuffer.c index 8b3f46f2c16..e80067f3b19 100644 --- a/src/gallium/drivers/i965/brw_batchbuffer.c +++ b/src/gallium/drivers/i965/brw_batchbuffer.c @@ -162,7 +162,7 @@ brw_batchbuffer_emit_reloc(struct brw_batchbuffer *batch, if (batch->ptr - batch->map > batch->buf->size) { debug_printf("bad relocation ptr %p map %p offset %li size %i\n", - batch->ptr, batch->map, batch->ptr - batch->map, batch->buf->size); + batch->ptr, batch->map, (long) (batch->ptr - batch->map), batch->buf->size); return PIPE_ERROR_OUT_OF_MEMORY; } diff --git a/src/gallium/drivers/i965/brw_resource_texture.c b/src/gallium/drivers/i965/brw_resource_texture.c index ffd0f38672c..3860d18a7a2 100644 --- a/src/gallium/drivers/i965/brw_resource_texture.c +++ b/src/gallium/drivers/i965/brw_resource_texture.c 
@@ -66,6 +66,7 @@ static GLuint translate_tex_target( unsigned target ) return BRW_SURFACE_1D; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: return BRW_SURFACE_2D; case PIPE_TEXTURE_3D: @@ -498,7 +499,8 @@ brw_texture_from_handle(struct pipe_screen *screen, unsigned pitch; GLuint format; - if (template->target != PIPE_TEXTURE_2D || + if ((template->target != PIPE_TEXTURE_2D + && template->target != PIPE_TEXTURE_RECT) || template->last_level != 0 || template->depth0 != 1) return NULL; diff --git a/src/gallium/drivers/i965/brw_wm_debug.c b/src/gallium/drivers/i965/brw_wm_debug.c index e2767264e7e..1b2aa93befc 100644 --- a/src/gallium/drivers/i965/brw_wm_debug.c +++ b/src/gallium/drivers/i965/brw_wm_debug.c @@ -101,16 +101,16 @@ void brw_wm_print_value( struct brw_wm_compile *c, debug_printf("undef"); else if( value - c->vreg >= 0 && value - c->vreg < BRW_WM_MAX_VREG) - debug_printf("r%d", value - c->vreg); + debug_printf("r%ld", (long) (value - c->vreg)); else if (value - c->creg >= 0 && value - c->creg < BRW_WM_MAX_PARAM) - debug_printf("c%d", value - c->creg); + debug_printf("c%ld", (long) (value - c->creg)); else if (value - c->payload.input_interp >= 0 && value - c->payload.input_interp < PIPE_MAX_SHADER_INPUTS) - debug_printf("i%d", value - c->payload.input_interp); + debug_printf("i%ld", (long) (value - c->payload.input_interp)); else if (value - c->payload.depth >= 0 && value - c->payload.depth < PIPE_MAX_SHADER_INPUTS) - debug_printf("d%d", value - c->payload.depth); + debug_printf("d%ld", (long) (value - c->payload.depth)); else debug_printf("?"); } diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile index 2892b62920e..dec874623e5 100644 --- a/src/gallium/drivers/llvmpipe/Makefile +++ b/src/gallium/drivers/llvmpipe/Makefile @@ -27,6 +27,8 @@ C_SOURCES = \ lp_scene_queue.c \ lp_screen.c \ lp_setup.c \ + lp_setup_coef.c \ + lp_setup_coef_intrin.c \ lp_setup_line.c \ lp_setup_point.c \ lp_setup_tri.c \ diff --git 
a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript index 5583fca38e6..8d57db72cfb 100644 --- a/src/gallium/drivers/llvmpipe/SConscript +++ b/src/gallium/drivers/llvmpipe/SConscript @@ -63,6 +63,8 @@ llvmpipe = env.ConvenienceLibrary( 'lp_setup_line.c', 'lp_setup_point.c', 'lp_setup_tri.c', + 'lp_setup_coef.c', + 'lp_setup_coef_intrin.c', 'lp_setup_vbuf.c', 'lp_state_blend.c', 'lp_state_clip.c', diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c index 7543bd7b2b0..39f2c6085ef 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.c +++ b/src/gallium/drivers/llvmpipe/lp_context.c @@ -85,6 +85,14 @@ static void llvmpipe_destroy( struct pipe_context *pipe ) align_free( llvmpipe ); } +static void +do_flush( struct pipe_context *pipe, + unsigned flags, + struct pipe_fence_handle **fence) +{ + llvmpipe_flush(pipe, flags, fence, __FUNCTION__); +} + struct pipe_context * llvmpipe_create_context( struct pipe_screen *screen, void *priv ) @@ -109,7 +117,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) llvmpipe->pipe.destroy = llvmpipe_destroy; llvmpipe->pipe.set_framebuffer_state = llvmpipe_set_framebuffer_state; llvmpipe->pipe.clear = llvmpipe_clear; - llvmpipe->pipe.flush = llvmpipe_flush; + llvmpipe->pipe.flush = do_flush; llvmpipe_init_blend_funcs(llvmpipe); llvmpipe_init_clip_funcs(llvmpipe); @@ -147,9 +155,13 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv ) draw_install_aapoint_stage(llvmpipe->draw, &llvmpipe->pipe); draw_install_pstipple_stage(llvmpipe->draw, &llvmpipe->pipe); - /* convert points and lines into triangles: */ - draw_wide_point_threshold(llvmpipe->draw, 0.0); - draw_wide_line_threshold(llvmpipe->draw, 0.0); + /* convert points and lines into triangles: + * (otherwise, draw points and lines natively) + */ + draw_wide_point_sprites(llvmpipe->draw, FALSE); + draw_enable_point_sprites(llvmpipe->draw, FALSE); + 
draw_wide_point_threshold(llvmpipe->draw, 10000.0); + draw_wide_line_threshold(llvmpipe->draw, 10000.0); #if USE_DRAW_STAGE_PSTIPPLE /* Do polygon stipple w/ texture map + frag prog? */ diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index 50f9091c3ca..34fa20e204a 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -101,6 +101,9 @@ struct llvmpipe_context { /** Vertex format */ struct vertex_info vertex_info; + + /** Which vertex shader output slot contains point size */ + int psize_slot; /** Fragment shader input interpolation info */ unsigned num_inputs; diff --git a/src/gallium/drivers/llvmpipe/lp_debug.h b/src/gallium/drivers/llvmpipe/lp_debug.h index 92fb2b3ee5b..a928ee38bec 100644 --- a/src/gallium/drivers/llvmpipe/lp_debug.h +++ b/src/gallium/drivers/llvmpipe/lp_debug.h @@ -46,6 +46,8 @@ st_print_current(void); #define DEBUG_SHOW_TILES 0x200 #define DEBUG_SHOW_SUBTILES 0x400 #define DEBUG_COUNTERS 0x800 +#define DEBUG_SCENE 0x1000 +#define DEBUG_FENCE 0x2000 #ifdef DEBUG diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c index e73b431cb4d..3af5c8d5c55 100644 --- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c +++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c @@ -68,25 +68,17 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) } /* Map index buffer, if present */ - if (info->indexed && lp->index_buffer.buffer) { - char *indices = (char *) llvmpipe_resource_data(lp->index_buffer.buffer); - mapped_indices = (void *) (indices + lp->index_buffer.offset); - } + if (info->indexed && lp->index_buffer.buffer) + mapped_indices = llvmpipe_resource_data(lp->index_buffer.buffer); - draw_set_mapped_element_buffer_range(draw, (mapped_indices) ? 
- lp->index_buffer.index_size : 0, - info->index_bias, - info->min_index, - info->max_index, - mapped_indices); + draw_set_mapped_index_buffer(draw, mapped_indices); llvmpipe_prepare_vertex_sampling(lp, lp->num_vertex_sampler_views, lp->vertex_sampler_views); /* draw! */ - draw_arrays_instanced(draw, info->mode, info->start, info->count, - info->start_instance, info->instance_count); + draw_vbo(draw, info); /* * unmap vertex/index buffers @@ -95,7 +87,7 @@ llvmpipe_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) draw_set_mapped_vertex_buffer(draw, i, NULL); } if (mapped_indices) { - draw_set_mapped_element_buffer(draw, 0, 0, NULL); + draw_set_mapped_index_buffer(draw, NULL); } llvmpipe_cleanup_vertex_sampling(lp); diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c index f9805e5d688..3a55e76bc35 100644 --- a/src/gallium/drivers/llvmpipe/lp_fence.c +++ b/src/gallium/drivers/llvmpipe/lp_fence.c @@ -44,6 +44,7 @@ struct lp_fence * lp_fence_create(unsigned rank) { + static int fence_id; struct lp_fence *fence = CALLOC_STRUCT(lp_fence); pipe_reference_init(&fence->reference, 1); @@ -51,8 +52,12 @@ lp_fence_create(unsigned rank) pipe_mutex_init(fence->mutex); pipe_condvar_init(fence->signalled); + fence->id = fence_id++; fence->rank = rank; + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, fence->id); + return fence; } @@ -61,6 +66,9 @@ lp_fence_create(unsigned rank) void lp_fence_destroy(struct lp_fence *fence) { + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, fence->id); + pipe_mutex_destroy(fence->mutex); pipe_condvar_destroy(fence->signalled); FREE(fence); @@ -68,82 +76,49 @@ lp_fence_destroy(struct lp_fence *fence) /** - * For reference counting. - * This is a Gallium API function. 
- */ -static void -llvmpipe_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence) -{ - struct lp_fence **old = (struct lp_fence **) ptr; - struct lp_fence *f = (struct lp_fence *) fence; - - lp_fence_reference(old, f); -} - - -/** - * Has the fence been executed/finished? - * This is a Gallium API function. - */ -static int -llvmpipe_fence_signalled(struct pipe_screen *screen, - struct pipe_fence_handle *fence, - unsigned flag) -{ - struct lp_fence *f = (struct lp_fence *) fence; - - return f->count == f->rank; -} - - -/** - * Wait for the fence to finish. - * This is a Gallium API function. - */ -static int -llvmpipe_fence_finish(struct pipe_screen *screen, - struct pipe_fence_handle *fence_handle, - unsigned flag) -{ - struct lp_fence *fence = (struct lp_fence *) fence_handle; - - pipe_mutex_lock(fence->mutex); - while (fence->count < fence->rank) { - pipe_condvar_wait(fence->signalled, fence->mutex); - } - pipe_mutex_unlock(fence->mutex); - - return 0; -} - - -/** * Called by the rendering threads to increment the fence counter. * When the counter == the rank, the fence is finished. 
*/ void lp_fence_signal(struct lp_fence *fence) { + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, fence->id); + pipe_mutex_lock(fence->mutex); fence->count++; assert(fence->count <= fence->rank); - LP_DBG(DEBUG_RAST, "%s count=%u rank=%u\n", __FUNCTION__, - fence->count, fence->rank); + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s count=%u rank=%u\n", __FUNCTION__, + fence->count, fence->rank); - pipe_condvar_signal(fence->signalled); + /* Wakeup all threads waiting on the mutex: + */ + pipe_condvar_broadcast(fence->signalled); pipe_mutex_unlock(fence->mutex); } +boolean +lp_fence_signalled(struct lp_fence *f) +{ + return f->count == f->rank; +} void -llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen) +lp_fence_wait(struct lp_fence *f) { - screen->fence_reference = llvmpipe_fence_reference; - screen->fence_signalled = llvmpipe_fence_signalled; - screen->fence_finish = llvmpipe_fence_finish; + if (LP_DEBUG & DEBUG_FENCE) + debug_printf("%s %d\n", __FUNCTION__, f->id); + + pipe_mutex_lock(f->mutex); + assert(f->issued); + while (f->count < f->rank) { + pipe_condvar_wait(f->signalled, f->mutex); + } + pipe_mutex_unlock(f->mutex); } + + diff --git a/src/gallium/drivers/llvmpipe/lp_fence.h b/src/gallium/drivers/llvmpipe/lp_fence.h index 13358fb99f2..3c591187801 100644 --- a/src/gallium/drivers/llvmpipe/lp_fence.h +++ b/src/gallium/drivers/llvmpipe/lp_fence.h @@ -41,10 +41,12 @@ struct pipe_screen; struct lp_fence { struct pipe_reference reference; + unsigned id; pipe_mutex mutex; pipe_condvar signalled; + boolean issued; unsigned rank; unsigned count; }; @@ -57,6 +59,11 @@ lp_fence_create(unsigned rank); void lp_fence_signal(struct lp_fence *fence); +boolean +lp_fence_signalled(struct lp_fence *fence); + +void +lp_fence_wait(struct lp_fence *fence); void llvmpipe_init_screen_fence_funcs(struct pipe_screen *screen); @@ -78,5 +85,11 @@ lp_fence_reference(struct lp_fence **ptr, *ptr = f; } +static INLINE boolean +lp_fence_issued(const 
struct lp_fence *fence) +{ + return fence->issued; +} + #endif /* LP_FENCE_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_flush.c b/src/gallium/drivers/llvmpipe/lp_flush.c index 845292f4ab2..e2c723b7a87 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.c +++ b/src/gallium/drivers/llvmpipe/lp_flush.c @@ -31,6 +31,7 @@ #include "pipe/p_defines.h" +#include "pipe/p_screen.h" #include "util/u_string.h" #include "draw/draw_context.h" #include "lp_flush.h" @@ -45,14 +46,15 @@ void llvmpipe_flush( struct pipe_context *pipe, unsigned flags, - struct pipe_fence_handle **fence ) + struct pipe_fence_handle **fence, + const char *reason) { struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); draw_flush(llvmpipe->draw); /* ask the setup module to flush */ - lp_setup_flush(llvmpipe->setup, flags, fence); + lp_setup_flush(llvmpipe->setup, flags, fence, reason); /* Enable to dump BMPs of the color/depth buffers each frame */ if (0) { @@ -76,6 +78,17 @@ llvmpipe_flush( struct pipe_context *pipe, } } +void +llvmpipe_finish( struct pipe_context *pipe, + const char *reason ) +{ + struct pipe_fence_handle *fence = NULL; + llvmpipe_flush(pipe, 0, &fence, reason); + if (fence) { + pipe->screen->fence_finish(pipe->screen, fence, 0); + pipe->screen->fence_reference(pipe->screen, &fence, NULL); + } +} /** * Flush context if necessary. @@ -93,7 +106,8 @@ llvmpipe_flush_resource(struct pipe_context *pipe, unsigned flush_flags, boolean read_only, boolean cpu_access, - boolean do_not_block) + boolean do_not_block, + const char *reason) { unsigned referenced; @@ -106,31 +120,16 @@ llvmpipe_flush_resource(struct pipe_context *pipe, /* * Flush and wait. */ - - struct pipe_fence_handle *fence = NULL; - if (do_not_block) return FALSE; - /* - * Do the unswizzling in parallel. - * - * XXX: Don't abuse the PIPE_FLUSH_FRAME flag for this. 
- */ - flush_flags |= PIPE_FLUSH_FRAME; - - llvmpipe_flush(pipe, flush_flags, &fence); - - if (fence) { - pipe->screen->fence_finish(pipe->screen, fence, 0); - pipe->screen->fence_reference(pipe->screen, &fence, NULL); - } + llvmpipe_finish(pipe, reason); } else { /* * Just flush. */ - llvmpipe_flush(pipe, flush_flags, NULL); + llvmpipe_flush(pipe, flush_flags, NULL, reason); } } diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h index 7b605681a93..bb538b2bd83 100644 --- a/src/gallium/drivers/llvmpipe/lp_flush.h +++ b/src/gallium/drivers/llvmpipe/lp_flush.h @@ -34,8 +34,14 @@ struct pipe_context; struct pipe_fence_handle; void -llvmpipe_flush(struct pipe_context *pipe, unsigned flags, - struct pipe_fence_handle **fence); +llvmpipe_flush(struct pipe_context *pipe, + unsigned flags, + struct pipe_fence_handle **fence, + const char *reason); + +void +llvmpipe_finish( struct pipe_context *pipe, + const char *reason ); boolean llvmpipe_flush_resource(struct pipe_context *pipe, @@ -45,6 +51,7 @@ llvmpipe_flush_resource(struct pipe_context *pipe, unsigned flush_flags, boolean read_only, boolean cpu_access, - boolean do_not_block); + boolean do_not_block, + const char *reason); #endif diff --git a/src/gallium/drivers/llvmpipe/lp_perf.c b/src/gallium/drivers/llvmpipe/lp_perf.c index 083e7e30a5b..e22532f25c1 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.c +++ b/src/gallium/drivers/llvmpipe/lp_perf.c @@ -46,7 +46,7 @@ lp_print_counters(void) { if (LP_DEBUG & DEBUG_COUNTERS) { unsigned total_64, total_16, total_4; - float p1, p2, p3, p4; + float p1, p2, p3, p5, p6; debug_printf("llvmpipe: nr_triangles: %9u\n", lp_count.nr_tris); debug_printf("llvmpipe: nr_culled_triangles: %9u\n", lp_count.nr_culled_tris); @@ -58,11 +58,15 @@ lp_print_counters(void) p1 = 100.0 * (float) lp_count.nr_empty_64 / (float) total_64; p2 = 100.0 * (float) lp_count.nr_fully_covered_64 / (float) total_64; p3 = 100.0 * (float) 
lp_count.nr_partially_covered_64 / (float) total_64; - p4 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64; + p5 = 100.0 * (float) lp_count.nr_shade_opaque_64 / (float) total_64; + p6 = 100.0 * (float) lp_count.nr_shade_64 / (float) total_64; debug_printf("llvmpipe: nr_64x64: %9u\n", total_64); debug_printf("llvmpipe: nr_fully_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_64, p2, total_64); - debug_printf("llvmpipe: nr_shade_opaque_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p4, total_64); + debug_printf("llvmpipe: nr_shade_opaque_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_opaque_64, p5, total_64); + debug_printf("llvmpipe: nr_pure_shade_opaque: %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_opaque_64, 0.0, lp_count.nr_shade_opaque_64); + debug_printf("llvmpipe: nr_shade_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_shade_64, p6, total_64); + debug_printf("llvmpipe: nr_pure_shade: %9u (%3.0f%% of %u)\n", lp_count.nr_pure_shade_64, 0.0, lp_count.nr_shade_64); debug_printf("llvmpipe: nr_partially_covered_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_64, p3, total_64); debug_printf("llvmpipe: nr_empty_64x64: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_64, p1, total_64); @@ -79,12 +83,17 @@ lp_print_counters(void) debug_printf("llvmpipe: nr_partially_covered_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_16, p3, total_16); debug_printf("llvmpipe: nr_empty_16x16: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_16, p1, total_16); - total_4 = (lp_count.nr_empty_4 + lp_count.nr_non_empty_4); + total_4 = (lp_count.nr_empty_4 + + lp_count.nr_fully_covered_4 + + lp_count.nr_partially_covered_4); p1 = 100.0 * (float) lp_count.nr_empty_4 / (float) total_4; - p2 = 100.0 * (float) lp_count.nr_non_empty_4 / (float) total_4; + p2 = 100.0 * (float) lp_count.nr_fully_covered_4 / (float) total_4; + p3 = 100.0 * (float) lp_count.nr_partially_covered_4 / (float) total_4; - debug_printf("llvmpipe: nr_4x4: %9u\n", 
total_4); + debug_printf("llvmpipe: nr_tri_4x4: %9u\n", total_4); + debug_printf("llvmpipe: nr_fully_covered_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_fully_covered_4, p2, total_4); + debug_printf("llvmpipe: nr_partially_covered_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_partially_covered_4, p3, total_4); debug_printf("llvmpipe: nr_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_empty_4, p1, total_4); debug_printf("llvmpipe: nr_non_empty_4x4: %9u (%3.0f%% of %u)\n", lp_count.nr_non_empty_4, p2, total_4); diff --git a/src/gallium/drivers/llvmpipe/lp_perf.h b/src/gallium/drivers/llvmpipe/lp_perf.h index 4774f645508..c28652fc305 100644 --- a/src/gallium/drivers/llvmpipe/lp_perf.h +++ b/src/gallium/drivers/llvmpipe/lp_perf.h @@ -44,11 +44,16 @@ struct lp_counters unsigned nr_empty_64; unsigned nr_fully_covered_64; unsigned nr_partially_covered_64; + unsigned nr_pure_shade_opaque_64; + unsigned nr_pure_shade_64; + unsigned nr_shade_64; unsigned nr_shade_opaque_64; unsigned nr_empty_16; unsigned nr_fully_covered_16; unsigned nr_partially_covered_16; unsigned nr_empty_4; + unsigned nr_fully_covered_4; + unsigned nr_partially_covered_4; unsigned nr_non_empty_4; unsigned nr_llvm_compiles; int64_t llvm_compile_time; /**< total, in microseconds */ @@ -66,9 +71,11 @@ extern struct lp_counters lp_count; #ifdef DEBUG #define LP_COUNT(counter) lp_count.counter++ #define LP_COUNT_ADD(counter, incr) lp_count.counter += (incr) +#define LP_COUNT_GET(counter) (lp_count.counter) #else #define LP_COUNT(counter) #define LP_COUNT_ADD(counter, incr) (void) incr +#define LP_COUNT_GET(counter) 0 #endif diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c index 02eeaf64871..67fd797af22 100644 --- a/src/gallium/drivers/llvmpipe/lp_query.c +++ b/src/gallium/drivers/llvmpipe/lp_query.c @@ -35,9 +35,8 @@ #include "util/u_memory.h" #include "lp_context.h" #include "lp_flush.h" +#include "lp_fence.h" #include "lp_query.h" -#include "lp_rast.h" -#include 
"lp_rast_priv.h" #include "lp_state.h" @@ -69,12 +68,7 @@ llvmpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q) struct llvmpipe_query *pq = llvmpipe_query(q); /* query might still be in process if we never waited for the result */ if (!pq->done) { - struct pipe_fence_handle *fence = NULL; - llvmpipe_flush(pipe, 0, &fence); - if (fence) { - pipe->screen->fence_finish(pipe->screen, fence, 0); - pipe->screen->fence_reference(pipe->screen, &fence, NULL); - } + llvmpipe_finish(pipe, __FUNCTION__); } pipe_mutex_destroy(pq->mutex); @@ -93,16 +87,11 @@ llvmpipe_get_query_result(struct pipe_context *pipe, if (!pq->done) { if (wait) { - struct pipe_fence_handle *fence = NULL; - llvmpipe_flush(pipe, 0, &fence); - if (fence) { - pipe->screen->fence_finish(pipe->screen, fence, 0); - pipe->screen->fence_reference(pipe->screen, &fence, NULL); - } + llvmpipe_finish(pipe, __FUNCTION__); } /* this is a bit inconsequent but should be ok */ else { - llvmpipe_flush(pipe, 0, NULL); + llvmpipe_flush(pipe, 0, NULL, __FUNCTION__); } } @@ -125,12 +114,7 @@ llvmpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q) * frame of rendering. */ if (pq->binned) { - struct pipe_fence_handle *fence; - llvmpipe_flush(pipe, 0, &fence); - if (fence) { - pipe->screen->fence_finish(pipe->screen, fence, 0); - pipe->screen->fence_reference(pipe->screen, &fence, NULL); - } + llvmpipe_finish(pipe, __FUNCTION__); } lp_setup_begin_query(llvmpipe->setup, pq); diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index 3215d0f6525..b1c306bbe94 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -316,43 +316,6 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task, } -/** - * Load tile color from the framebuffer surface. - * This is a bin command called during bin processing. 
- */ -#if 0 -void -lp_rast_load_color(struct lp_rasterizer_task *task, - const union lp_rast_cmd_arg arg) -{ - struct lp_rasterizer *rast = task->rast; - unsigned buf; - enum lp_texture_usage usage; - - LP_DBG(DEBUG_RAST, "%s at %u, %u\n", __FUNCTION__, x, y); - - if (scene->has_color_clear) - usage = LP_TEX_USAGE_WRITE_ALL; - else - usage = LP_TEX_USAGE_READ_WRITE; - - /* Get pointers to color tile(s). - * This will convert linear data to tiled if needed. - */ - for (buf = 0; buf < rast->state.nr_cbufs; buf++) { - struct pipe_surface *cbuf = rast->curr_scene->fb.cbufs[buf]; - struct llvmpipe_texture *lpt; - assert(cbuf); - lpt = llvmpipe_texture(cbuf->texture); - task->color_tiles[buf] = llvmpipe_get_texture_tile(lpt, - cbuf->face + cbuf->zslice, - cbuf->level, - usage, - task->x, task->y); - assert(task->color_tiles[buf]); - } -} -#endif /** diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 44319a0ad6f..b4564ef33bd 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -120,7 +120,7 @@ struct lp_rast_triangle { float v[3][2]; #endif - struct lp_rast_plane plane[7]; /* NOTE: may allocate fewer planes */ + struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */ }; @@ -236,6 +236,8 @@ void lp_rast_triangle_6( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); void lp_rast_triangle_7( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); +void lp_rast_triangle_8( struct lp_rasterizer_task *, + const union lp_rast_cmd_arg ); void lp_rast_shade_tile( struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); @@ -256,5 +258,9 @@ void lp_rast_begin_query(struct lp_rasterizer_task *, void lp_rast_end_query(struct lp_rasterizer_task *, const union lp_rast_cmd_arg ); +void +lp_rast_triangle_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg); + #endif diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c 
b/src/gallium/drivers/llvmpipe/lp_rast_tri.c index 980c18c0240..dbaa8e023a4 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c @@ -67,7 +67,7 @@ block_full_16(struct lp_rasterizer_task *task, block_full_4(task, tri, x + ix, y + iy); } - +#if !defined(PIPE_ARCH_SSE) static INLINE unsigned build_mask(int c, int dcdx, int dcdy) { @@ -98,6 +98,7 @@ build_mask(int c, int dcdx, int dcdy) return mask; } + static INLINE unsigned build_mask_linear(int c, int dcdx, int dcdy) { @@ -129,6 +130,137 @@ build_mask_linear(int c, int dcdx, int dcdy) } +static INLINE void +build_masks(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + *outmask |= build_mask_linear(c, dcdx, dcdy); + *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); +} + +#else +#include <emmintrin.h> +#include "util/u_sse.h" + + +static INLINE void +build_masks(int c, + int cdiff, + int dcdx, + int dcdy, + unsigned *outmask, + unsigned *partmask) +{ + __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = _mm_set1_epi32(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + { + __m128i cstep01, cstep23, result; + + cstep01 = _mm_packs_epi32(cstep0, cstep1); + cstep23 = _mm_packs_epi32(cstep2, cstep3); + result = _mm_packs_epi16(cstep01, cstep23); + + *outmask |= _mm_movemask_epi8(result); + } + + + { + __m128i cio4 = _mm_set1_epi32(cdiff); + __m128i cstep01, cstep23, result; + + cstep0 = _mm_add_epi32(cstep0, cio4); + cstep1 = _mm_add_epi32(cstep1, cio4); + cstep2 = _mm_add_epi32(cstep2, cio4); + cstep3 = _mm_add_epi32(cstep3, cio4); + + cstep01 = _mm_packs_epi32(cstep0, cstep1); + cstep23 = _mm_packs_epi32(cstep2, cstep3); + result = _mm_packs_epi16(cstep01, cstep23); + + *partmask |= _mm_movemask_epi8(result); + } +} + + +static INLINE unsigned 
+build_mask_linear(int c, int dcdx, int dcdy) +{ + __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); + __m128i xdcdy = _mm_set1_epi32(dcdy); + + /* Get values across the quad + */ + __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); + __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); + __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); + + /* pack pairs of results into epi16 + */ + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + + /* pack into epi8, preserving sign bits + */ + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* extract sign bits to create mask + */ + return _mm_movemask_epi8(result); +} + +static INLINE unsigned +build_mask(int c, int dcdx, int dcdy) +{ + __m128i step = _mm_setr_epi32(0, dcdx, dcdy, dcdx + dcdy); + __m128i c0 = _mm_set1_epi32(c); + + /* Get values across the quad + */ + __m128i cstep0 = _mm_add_epi32(c0, step); + + /* Scale up step for moving between quads. + */ + __m128i step4 = _mm_add_epi32(step, step); + + /* Get values for the remaining quads: + */ + __m128i cstep1 = _mm_add_epi32(cstep0, + _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1))); + __m128i cstep2 = _mm_add_epi32(cstep0, + _mm_shuffle_epi32(step4, _MM_SHUFFLE(2,2,2,2))); + __m128i cstep3 = _mm_add_epi32(cstep2, + _mm_shuffle_epi32(step4, _MM_SHUFFLE(1,1,1,1))); + + /* pack pairs of results into epi16 + */ + __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); + __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); + + /* pack into epi8, preserving sign bits + */ + __m128i result = _mm_packs_epi16(cstep01, cstep23); + + /* extract sign bits to create mask + */ + return _mm_movemask_epi8(result); +} + +#endif + + + + #define TAG(x) x##_1 #define NR_PLANES 1 #include "lp_rast_tri_tmp.h" @@ -157,3 +289,92 @@ build_mask_linear(int c, int dcdx, int dcdy) #define NR_PLANES 7 #include "lp_rast_tri_tmp.h" +#define TAG(x) x##_8 +#define NR_PLANES 8 +#include "lp_rast_tri_tmp.h" + + +/* Special case for 3 plane 
triangle which is contained entirely + * within a 16x16 block. + */ +void +lp_rast_triangle_3_16(struct lp_rasterizer_task *task, + const union lp_rast_cmd_arg arg) +{ + const struct lp_rast_triangle *tri = arg.triangle.tri; + const struct lp_rast_plane *plane = tri->plane; + unsigned mask = arg.triangle.plane_mask; + const int x = task->x + (mask & 0xf) * 16; + const int y = task->y + (mask >> 4) * 16; + unsigned outmask, inmask, partmask, partial_mask; + unsigned j; + int c[3]; + + outmask = 0; /* outside one or more trivial reject planes */ + partmask = 0; /* outside one or more trivial accept planes */ + + for (j = 0; j < 3; j++) { + c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x; + + { + const int dcdx = -plane[j].dcdx * 4; + const int dcdy = plane[j].dcdy * 4; + const int cox = plane[j].eo * 4; + const int cio = plane[j].ei * 4 - 1; + + build_masks(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ + } + } + + if (outmask == 0xffff) + return; + + /* Mask of sub-blocks which are inside all trivial accept planes: + */ + inmask = ~partmask & 0xffff; + + /* Mask of sub-blocks which are inside all trivial reject planes, + * but outside at least one trivial accept plane: + */ + partial_mask = partmask & ~outmask; + + assert((partial_mask & inmask) == 0); + + /* Iterate over partials: + */ + while (partial_mask) { + int i = ffs(partial_mask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + int cx[3]; + + partial_mask &= ~(1 << i); + + for (j = 0; j < 3; j++) + cx[j] = (c[j] + - plane[j].dcdx * ix + + plane[j].dcdy * iy); + + do_block_4_3(task, tri, plane, px, py, cx); + } + + /* Iterate over fulls: + */ + while (inmask) { + int i = ffs(inmask) - 1; + int ix = (i & 3) * 4; + int iy = (i >> 2) * 4; + int px = x + ix; + int py = y + iy; + + inmask &= ~(1 << i); + + block_full_4(task, tri, px, py); + } +} diff --git 
a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h index 43f72d8ca8f..99a0bae45db 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h +++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h @@ -32,7 +32,7 @@ /** - * Prototype for a 7 plane rasterizer function. Will codegenerate + * Prototype for a 8 plane rasterizer function. Will codegenerate * several of these. * * XXX: Varients for more/fewer planes. @@ -81,11 +81,14 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, for (j = 0; j < NR_PLANES; j++) { const int dcdx = -plane[j].dcdx * 4; const int dcdy = plane[j].dcdy * 4; - const int cox = c[j] + plane[j].eo * 4; - const int cio = c[j] + plane[j].ei * 4 - 1; - - outmask |= build_mask_linear(cox, dcdx, dcdy); - partmask |= build_mask_linear(cio, dcdx, dcdy); + const int cox = plane[j].eo * 4; + const int cio = plane[j].ei * 4 - 1; + + build_masks(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ } if (outmask == 0xffff) @@ -102,6 +105,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, assert((partial_mask & inmask) == 0); + LP_COUNT_ADD(nr_empty_4, util_bitcount(0xffff & ~(partial_mask | inmask))); + /* Iterate over partials: */ while (partial_mask) { @@ -114,6 +119,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, partial_mask &= ~(1 << i); + LP_COUNT(nr_partially_covered_4); + for (j = 0; j < NR_PLANES; j++) cx[j] = (c[j] - plane[j].dcdx * ix @@ -133,6 +140,7 @@ TAG(do_block_16)(struct lp_rasterizer_task *task, inmask &= ~(1 << i); + LP_COUNT(nr_fully_covered_4); block_full_4(task, tri, px, py); } } @@ -166,11 +174,14 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, { const int dcdx = -plane[j].dcdx * 16; const int dcdy = plane[j].dcdy * 16; - const int cox = c[j] + plane[j].eo * 16; - const int cio = c[j] + plane[j].ei * 16 - 1; - - outmask |= build_mask_linear(cox, dcdx, dcdy); - partmask |= 
build_mask_linear(cio, dcdx, dcdy); + const int cox = plane[j].eo * 16; + const int cio = plane[j].ei * 16 - 1; + + build_masks(c[j] + cox, + cio - cox, + dcdx, dcdy, + &outmask, /* sign bits from c[i][0..15] + cox */ + &partmask); /* sign bits from c[i][0..15] + cio */ } j++; @@ -190,6 +201,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task, assert((partial_mask & inmask) == 0); + LP_COUNT_ADD(nr_empty_16, util_bitcount(0xffff & ~(partial_mask | inmask))); + /* Iterate over partials: */ while (partial_mask) { diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c index f88a759fe70..15a09b71006 100644 --- a/src/gallium/drivers/llvmpipe/lp_scene.c +++ b/src/gallium/drivers/llvmpipe/lp_scene.c @@ -163,12 +163,15 @@ lp_scene_reset(struct lp_scene *scene ) /* Free all but last binner command lists: */ - for (i = 0; i < TILES_X; i++) { - for (j = 0; j < TILES_Y; j++) { + for (i = 0; i < scene->tiles_x; i++) { + for (j = 0; j < scene->tiles_y; j++) { lp_scene_bin_reset(scene, i, j); } } + /* If there are any bins which weren't cleared by the loop above, + * they will be caught (on debug builds at least) by this assert: + */ assert(lp_scene_is_empty(scene)); /* Free all but last binned data block: diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 167cb2ee2e0..1e65a91fc67 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -61,6 +61,8 @@ static const struct debug_named_value lp_debug_flags[] = { { "show_tiles", DEBUG_SHOW_TILES, NULL }, { "show_subtiles", DEBUG_SHOW_SUBTILES, NULL }, { "counters", DEBUG_COUNTERS, NULL }, + { "scene", DEBUG_SCENE, NULL }, + { "fence", DEBUG_FENCE, NULL }, DEBUG_NAMED_VALUE_END }; #endif @@ -87,7 +89,14 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: return PIPE_MAX_SAMPLERS; case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: - return 
PIPE_MAX_VERTEX_SAMPLERS; + /* At this time, the draw module and llvmpipe driver only + * support vertex shader texture lookups when LLVM is enabled in + * the draw module. + */ + if (debug_get_bool_option("DRAW_USE_LLVM", TRUE)) + return PIPE_MAX_VERTEX_SAMPLERS; + else + return 0; case PIPE_CAP_MAX_COMBINED_SAMPLERS: return PIPE_MAX_SAMPLERS + PIPE_MAX_VERTEX_SAMPLERS; case PIPE_CAP_NPOT_TEXTURES: @@ -230,6 +239,7 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen, assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D || target == PIPE_TEXTURE_2D || + target == PIPE_TEXTURE_RECT || target == PIPE_TEXTURE_3D || target == PIPE_TEXTURE_CUBE); @@ -314,6 +324,51 @@ llvmpipe_destroy_screen( struct pipe_screen *_screen ) + +/** + * Fence reference counting. + */ +static void +llvmpipe_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ + struct lp_fence **old = (struct lp_fence **) ptr; + struct lp_fence *f = (struct lp_fence *) fence; + + lp_fence_reference(old, f); +} + + +/** + * Has the fence been executed/finished? + */ +static int +llvmpipe_fence_signalled(struct pipe_screen *screen, + struct pipe_fence_handle *fence, + unsigned flag) +{ + struct lp_fence *f = (struct lp_fence *) fence; + return lp_fence_signalled(f); +} + + +/** + * Wait for the fence to finish. + */ +static int +llvmpipe_fence_finish(struct pipe_screen *screen, + struct pipe_fence_handle *fence_handle, + unsigned flag) +{ + struct lp_fence *f = (struct lp_fence *) fence_handle; + + lp_fence_wait(f); + return 0; +} + + + /** * Create a new pipe_screen object * Note: we're not presently subclassing pipe_screen (no llvmpipe_screen). 
@@ -351,9 +406,11 @@ llvmpipe_create_screen(struct sw_winsys *winsys) screen->base.context_create = llvmpipe_create_context; screen->base.flush_frontbuffer = llvmpipe_flush_frontbuffer; + screen->base.fence_reference = llvmpipe_fence_reference; + screen->base.fence_signalled = llvmpipe_fence_signalled; + screen->base.fence_finish = llvmpipe_fence_finish; llvmpipe_init_screen_resource_funcs(&screen->base); - llvmpipe_init_screen_fence_funcs(&screen->base); lp_jit_screen_init(screen); diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 556e571585d..3da9097154e 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -275,9 +275,10 @@ set_scene_state( struct lp_setup_context *setup, void lp_setup_flush( struct lp_setup_context *setup, unsigned flags, - struct pipe_fence_handle **fence) + struct pipe_fence_handle **fence, + const char *reason) { - LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + LP_DBG(DEBUG_SETUP, "%s %s\n", __FUNCTION__, reason); if (setup->scene) { if (fence) { @@ -287,6 +288,8 @@ lp_setup_flush( struct lp_setup_context *setup, *fence = lp_setup_fence( setup ); } + if (setup->scene->fence) + setup->scene->fence->issued = TRUE; } set_scene_state( setup, SETUP_FLUSHED ); @@ -312,6 +315,11 @@ lp_setup_bind_framebuffer( struct lp_setup_context *setup, * scene. */ util_copy_framebuffer_state(&setup->fb, fb); + setup->framebuffer.x0 = 0; + setup->framebuffer.y0 = 0; + setup->framebuffer.x1 = fb->width-1; + setup->framebuffer.y1 = fb->height-1; + setup->dirty |= LP_SETUP_NEW_SCISSOR; } @@ -469,11 +477,35 @@ lp_setup_set_triangle_state( struct lp_setup_context *setup, setup->ccw_is_frontface = ccw_is_frontface; setup->cullmode = cull_mode; setup->triangle = first_triangle; - setup->scissor_test = scissor; setup->pixel_offset = gl_rasterization_rules ? 
0.5f : 0.0f; + + if (setup->scissor_test != scissor) { + setup->dirty |= LP_SETUP_NEW_SCISSOR; + setup->scissor_test = scissor; + } } +void +lp_setup_set_line_state( struct lp_setup_context *setup, + float line_width) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + setup->line_width = line_width; +} + +void +lp_setup_set_point_state( struct lp_setup_context *setup, + float point_size, + boolean point_size_per_vertex, + uint sprite) +{ + LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__); + + setup->point_size = point_size; + setup->sprite = sprite; + setup->point_size_per_vertex = point_size_per_vertex; +} void lp_setup_set_fs_inputs( struct lp_setup_context *setup, @@ -559,10 +591,11 @@ lp_setup_set_scissor( struct lp_setup_context *setup, assert(scissor); - if (memcmp(&setup->scissor.current, scissor, sizeof(*scissor)) != 0) { - setup->scissor.current = *scissor; /* struct copy */ - setup->dirty |= LP_SETUP_NEW_SCISSOR; - } + setup->scissor.x0 = scissor->minx; + setup->scissor.x1 = scissor->maxx-1; + setup->scissor.y0 = scissor->miny; + setup->scissor.y1 = scissor->maxy-1; + setup->dirty |= LP_SETUP_NEW_SCISSOR; } @@ -713,6 +746,12 @@ lp_setup_update_state( struct lp_setup_context *setup ) */ { struct llvmpipe_context *lp = llvmpipe_context(scene->pipe); + + /* Will probably need to move this somewhere else, just need + * to know about vertex shader point size attribute. 
+ */ + setup->psize = lp->psize_slot; + if (lp->dirty) { llvmpipe_update_derived(lp); } @@ -806,6 +845,14 @@ lp_setup_update_state( struct lp_setup_context *setup ) } } + if (setup->dirty & LP_SETUP_NEW_SCISSOR) { + setup->draw_region = setup->framebuffer; + if (setup->scissor_test) { + u_rect_possible_intersection(&setup->scissor, + &setup->draw_region); + } + } + setup->dirty = 0; assert(setup->fs.stored); diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h index 73b1c85325a..821ebb1087d 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_setup.h @@ -85,7 +85,8 @@ lp_setup_fence( struct lp_setup_context *setup ); void lp_setup_flush( struct lp_setup_context *setup, unsigned flags, - struct pipe_fence_handle **fence); + struct pipe_fence_handle **fence, + const char *reason); void @@ -99,6 +100,16 @@ lp_setup_set_triangle_state( struct lp_setup_context *setup, boolean scissor, boolean gl_rasterization_rules ); +void +lp_setup_set_line_state( struct lp_setup_context *setup, + float line_width); + +void +lp_setup_set_point_state( struct lp_setup_context *setup, + float point_size, + boolean point_size_per_vertex, + uint sprite); + void lp_setup_set_fs_inputs( struct lp_setup_context *setup, const struct lp_shader_input *interp, diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c new file mode 100644 index 00000000000..95e3e8fffe8 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.c @@ -0,0 +1,258 @@ +/************************************************************************** + * + * Copyright 2010, VMware. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Binning code for triangles + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_perf.h" +#include "lp_setup_context.h" +#include "lp_setup_coef.h" +#include "lp_rast.h" +#include "lp_state_fs.h" + +#if !defined(PIPE_ARCH_SSE) + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). 
+ */ +static void constant_coef( struct lp_rast_shader_inputs *inputs, + unsigned slot, + const float value, + unsigned i ) +{ + inputs->a0[slot][i] = value; + inputs->dadx[slot][i] = 0.0f; + inputs->dady[slot][i] = 0.0f; +} + + + +static void linear_coef( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + float a0 = info->v0[vert_attr][i]; + float a1 = info->v1[vert_attr][i]; + float a2 = info->v2[vert_attr][i]; + + float da01 = a0 - a1; + float da20 = a2 - a0; + float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20); + float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01); + + inputs->dadx[slot][i] = dadx; + inputs->dady[slot][i] = dady; + + /* calculate a0 as the value which would be sampled for the + * fragment at (0,0), taking into account that we want to sample at + * pixel centers, in other words (0.5, 0.5). + * + * this is neat but unfortunately not a good way to do things for + * triangles with very large values of dadx or dady as it will + * result in the subtraction and re-addition from a0 of a very + * large number, which means we'll end up losing a lot of the + * fractional bits and precision from a0. the way to fix this is + * to define a0 as the sample at a pixel center somewhere near vmin + * instead - i'll switch to this later. + */ + inputs->a0[slot][i] = a0 - (dadx * info->x0_center + + dady * info->y0_center); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment.
+ */ +static void perspective_coef( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + float a0 = info->v0[vert_attr][i] * info->v0[0][3]; + float a1 = info->v1[vert_attr][i] * info->v1[0][3]; + float a2 = info->v2[vert_attr][i] * info->v2[0][3]; + float da01 = a0 - a1; + float da20 = a2 - a0; + float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20; + float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01; + + inputs->dadx[slot][i] = dadx; + inputs->dady[slot][i] = dady; + inputs->a0[slot][i] = a0 - (dadx * info->x0_center + + dady * info->y0_center); +} + + +/** + * Special coefficient setup for gl_FragCoord. + * X and Y are trivial + * Z and W are copied from position_coef which should have already been computed. + * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. + */ +static void +setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + unsigned usage_mask) +{ + /*X*/ + if (usage_mask & TGSI_WRITEMASK_X) { + inputs->a0[slot][0] = 0.0; + inputs->dadx[slot][0] = 1.0; + inputs->dady[slot][0] = 0.0; + } + + /*Y*/ + if (usage_mask & TGSI_WRITEMASK_Y) { + inputs->a0[slot][1] = 0.0; + inputs->dadx[slot][1] = 0.0; + inputs->dady[slot][1] = 1.0; + } + + /*Z*/ + if (usage_mask & TGSI_WRITEMASK_Z) { + linear_coef(inputs, info, slot, 0, 2); + } + + /*W*/ + if (usage_mask & TGSI_WRITEMASK_W) { + linear_coef(inputs, info, slot, 0, 3); + } +} + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? 
+ */ +static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, + unsigned slot, + boolean frontface, + unsigned usage_mask) +{ + /* convert TRUE to 1.0 and FALSE to -1.0 */ + if (usage_mask & TGSI_WRITEMASK_X) + constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 ); + + if (usage_mask & TGSI_WRITEMASK_Y) + constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */ + + if (usage_mask & TGSI_WRITEMASK_Z) + constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */ + + if (usage_mask & TGSI_WRITEMASK_W) + constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */ +} + + +/** + * Compute the tri->coef[] array dadx, dady, a0 values. + */ +void lp_setup_tri_coef( struct lp_setup_context *setup, + struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info) +{ + unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; + unsigned slot; + unsigned i; + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < setup->fs.nr_inputs; slot++) { + unsigned vert_attr = setup->fs.input[slot].src_index; + unsigned usage_mask = setup->fs.input[slot].usage_mask; + + switch (setup->fs.input[slot].interp) { + case LP_INTERP_CONSTANT: + if (setup->flatshade_first) { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(inputs, slot+1, info->v0[vert_attr][i], i); + } + else { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(inputs, slot+1, info->v2[vert_attr][i], i); + } + break; + + case LP_INTERP_LINEAR: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + linear_coef(inputs, info, slot+1, vert_attr, i); + break; + + case LP_INTERP_PERSPECTIVE: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + perspective_coef(inputs, info, slot+1, vert_attr, i); + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0, so we need to ensure that the usage mask covers
all + * usages. + */ + fragcoord_usage_mask |= usage_mask; + break; + + case LP_INTERP_FACING: + setup_facing_coef(inputs, slot+1, info->frontfacing, usage_mask); + break; + + default: + assert(0); + } + } + + /* The internal position input is in slot zero: + */ + setup_fragcoord_coef(inputs, info, 0, fragcoord_usage_mask); +} + +#else +extern void lp_setup_coef_dummy(void); +void lp_setup_coef_dummy(void) +{ +} + +#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h new file mode 100644 index 00000000000..d68b39c603f --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_setup_coef.h @@ -0,0 +1,61 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +/** + * The setup code is concerned with point/line/triangle setup and + * putting commands/data into the bins. + */ + + +#ifndef LP_SETUP_COEF_H +#define LP_SETUP_COEF_H + + +struct lp_tri_info { + + float x0_center; + float y0_center; + + /* turn these into an aligned float[4] */ + float dy01_ooa; + float dy20_ooa; + float dx01_ooa; + float dx20_ooa; + + const float (*v0)[4]; + const float (*v1)[4]; + const float (*v2)[4]; + + boolean frontfacing; /* remove eventually */ +}; + +void lp_setup_tri_coef( struct lp_setup_context *setup, + struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info); + +#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c new file mode 100644 index 00000000000..73fb70599c9 --- /dev/null +++ b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c @@ -0,0 +1,207 @@ +/************************************************************************** + * + * Copyright 2010 VMware. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Binning code for triangles + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_perf.h" +#include "lp_setup_context.h" +#include "lp_setup_coef.h" +#include "lp_rast.h" + +#if defined(PIPE_ARCH_SSE) +#include <emmintrin.h> + + +static void constant_coef4( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + const float *attr) +{ + *(__m128 *)inputs->a0[slot] = *(__m128 *)attr; + *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); + *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); +} + + + +/** + * Setup the fragment input attribute with the front-facing value. + * \param frontface is the triangle front facing? + */ +static void setup_facing_coef( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot ) +{ + /* XXX: just pass frontface directly to the shader, don't bother + * treating it as an input. + */ + __m128 a0 = _mm_setr_ps(info->frontfacing ? 
1.0 : -1.0, + 0, 0, 0); + + *(__m128 *)inputs->a0[slot] = a0; + *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0); + *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0); +} + + + +static void calc_coef4( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + __m128 a0, + __m128 a1, + __m128 a2) +{ + __m128 da01 = _mm_sub_ps(a0, a1); + __m128 da20 = _mm_sub_ps(a2, a0); + + __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa)); + __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa)); + __m128 dadx = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa); + + __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa)); + __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa)); + __m128 dady = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa); + + __m128 dadx_x0 = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center)); + __m128 dady_y0 = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center)); + __m128 attr_v0 = _mm_add_ps(dadx_x0, dady_y0); + __m128 attr_0 = _mm_sub_ps(a0, attr_v0); + + *(__m128 *)inputs->a0[slot] = attr_0; + *(__m128 *)inputs->dadx[slot] = dadx; + *(__m128 *)inputs->dady[slot] = dady; +} + + +static void linear_coef( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + unsigned vert_attr) +{ + __m128 a0 = *(const __m128 *)info->v0[vert_attr]; + __m128 a1 = *(const __m128 *)info->v1[vert_attr]; + __m128 a2 = *(const __m128 *)info->v2[vert_attr]; + + calc_coef4(inputs, info, slot, a0, a1, a2); +} + + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. 
+ */ +static void perspective_coef( struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info, + unsigned slot, + unsigned vert_attr) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + __m128 a0 = *(const __m128 *)info->v0[vert_attr]; + __m128 a1 = *(const __m128 *)info->v1[vert_attr]; + __m128 a2 = *(const __m128 *)info->v2[vert_attr]; + + __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3])); + __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3])); + __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3])); + + calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow); +} + + + + + +/** + * Compute the inputs-> dadx, dady, a0 values. + */ +void lp_setup_tri_coef( struct lp_setup_context *setup, + struct lp_rast_shader_inputs *inputs, + const struct lp_tri_info *info) +{ + unsigned slot; + + /* The internal position input is in slot zero: + */ + linear_coef(inputs, info, 0, 0); + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < setup->fs.nr_inputs; slot++) { + unsigned vert_attr = setup->fs.input[slot].src_index; + + switch (setup->fs.input[slot].interp) { + case LP_INTERP_CONSTANT: + if (setup->flatshade_first) { + constant_coef4(inputs, info, slot+1, info->v0[vert_attr]); + } + else { + constant_coef4(inputs, info, slot+1, info->v2[vert_attr]); + } + break; + + case LP_INTERP_LINEAR: + linear_coef(inputs, info, slot+1, vert_attr); + break; + + case LP_INTERP_PERSPECTIVE: + perspective_coef(inputs, info, slot+1, vert_attr); + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0. 
+ */ + break; + + case LP_INTERP_FACING: + setup_facing_coef(inputs, info, slot+1); + break; + + default: + assert(0); + } + } +} + +#else +extern void lp_setup_coef_dummy(void); +void lp_setup_coef_dummy(void) +{ +} +#endif diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index a0606f50340..877a492c6d8 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -41,6 +41,7 @@ #include "lp_scene.h" #include "draw/draw_vbuf.h" +#include "util/u_rect.h" #define LP_SETUP_NEW_FS 0x01 #define LP_SETUP_NEW_CONSTANTS 0x02 @@ -73,6 +74,7 @@ struct lp_setup_context uint prim; uint vertex_size; uint nr_vertices; + uint sprite; uint vertex_buffer_size; void *vertex_buffer; @@ -88,10 +90,17 @@ struct lp_setup_context boolean flatshade_first; boolean ccw_is_frontface; boolean scissor_test; + boolean point_size_per_vertex; unsigned cullmode; float pixel_offset; + float line_width; + float point_size; + float psize; struct pipe_framebuffer_state fb; + struct u_rect framebuffer; + struct u_rect scissor; + struct u_rect draw_region; /* intersection of fb & scissor */ struct { unsigned flags; @@ -127,9 +136,6 @@ struct lp_setup_context uint8_t *stored; } blend_color; - struct { - struct pipe_scissor_state current; - } scissor; unsigned dirty; /**< bitmask of LP_SETUP_NEW_x bits */ @@ -158,4 +164,29 @@ void lp_setup_update_state( struct lp_setup_context *setup ); void lp_setup_destroy( struct lp_setup_context *setup ); +void +lp_setup_print_triangle(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]); + +void +lp_setup_print_vertex(struct lp_setup_context *setup, + const char *name, + const float (*v)[4]); + + +struct lp_rast_triangle * +lp_setup_alloc_triangle(struct lp_scene *scene, + unsigned nr_inputs, + unsigned nr_planes, + unsigned *tri_size); + +void +lp_setup_bin_triangle( struct lp_setup_context *setup, + 
struct lp_rast_triangle *tri, + const struct u_rect *bbox, + int nr_planes ); + #endif + diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c index be41c44e6f5..ce2da55cf49 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_line.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c @@ -29,19 +29,671 @@ * Binning code for lines */ +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_perf.h" #include "lp_setup_context.h" +#include "lp_rast.h" +#include "lp_state_fs.h" -static void line_nop( struct lp_setup_context *setup, - const float (*v0)[4], - const float (*v1)[4] ) +#define NUM_CHANNELS 4 + +struct lp_line_info { + + float dx; + float dy; + float oneoverarea; + + const float (*v1)[4]; + const float (*v2)[4]; +}; + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + */ +static void constant_coef( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + unsigned slot, + const float value, + unsigned i ) +{ + tri->inputs.a0[slot][i] = value; + tri->inputs.dadx[slot][i] = 0.0f; + tri->inputs.dady[slot][i] = 0.0f; +} + + +/** + * Compute a0, dadx and dady for a linearly interpolated coefficient, + * for a triangle. + */ +static void linear_coef( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + struct lp_line_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + float a1 = info->v1[vert_attr][i]; + float a2 = info->v2[vert_attr][i]; + + float da21 = a1 - a2; + float dadx = da21 * info->dx * info->oneoverarea; + float dady = da21 * info->dy * info->oneoverarea; + + tri->inputs.dadx[slot][i] = dadx; + tri->inputs.dady[slot][i] = dady; + + tri->inputs.a0[slot][i] = (a1 - + (dadx * (info->v1[0][0] - setup->pixel_offset) + + dady * (info->v1[0][1] - setup->pixel_offset))); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. 
+ * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void perspective_coef( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + struct lp_line_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + /* premultiply by 1/w (v[0][3] is always 1/w): + */ + float a1 = info->v1[vert_attr][i] * info->v1[0][3]; + float a2 = info->v2[vert_attr][i] * info->v2[0][3]; + + float da21 = a1 - a2; + float dadx = da21 * info->dx * info->oneoverarea; + float dady = da21 * info->dy * info->oneoverarea; + + tri->inputs.dadx[slot][i] = dadx; + tri->inputs.dady[slot][i] = dady; + + tri->inputs.a0[slot][i] = (a1 - + (dadx * (info->v1[0][0] - setup->pixel_offset) + + dady * (info->v1[0][1] - setup->pixel_offset))); +} + +static void +setup_fragcoord_coef( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + struct lp_line_info *info, + unsigned slot, + unsigned usage_mask) +{ + /*X*/ + if (usage_mask & TGSI_WRITEMASK_X) { + tri->inputs.a0[slot][0] = 0.0; + tri->inputs.dadx[slot][0] = 1.0; + tri->inputs.dady[slot][0] = 0.0; + } + + /*Y*/ + if (usage_mask & TGSI_WRITEMASK_Y) { + tri->inputs.a0[slot][1] = 0.0; + tri->inputs.dadx[slot][1] = 0.0; + tri->inputs.dady[slot][1] = 1.0; + } + + /*Z*/ + if (usage_mask & TGSI_WRITEMASK_Z) { + linear_coef(setup, tri, info, slot, 0, 2); + } + + /*W*/ + if (usage_mask & TGSI_WRITEMASK_W) { + linear_coef(setup, tri, info, slot, 0, 3); + } +} + +/** + * Compute the tri->coef[] array dadx, dady, a0 values. 
+ */ +static void setup_line_coefficients( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + struct lp_line_info *info) +{ + unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; + unsigned slot; + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < setup->fs.nr_inputs; slot++) { + unsigned vert_attr = setup->fs.input[slot].src_index; + unsigned usage_mask = setup->fs.input[slot].usage_mask; + unsigned i; + + switch (setup->fs.input[slot].interp) { + case LP_INTERP_CONSTANT: + if (setup->flatshade_first) { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i); + } + else { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i); + } + break; + + case LP_INTERP_LINEAR: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + linear_coef(setup, tri, info, slot+1, vert_attr, i); + break; + + case LP_INTERP_PERSPECTIVE: + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + perspective_coef(setup, tri, info, slot+1, vert_attr, i); + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + break; + + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0, so all need to ensure that the usage mask is covers all + * usages. + */ + fragcoord_usage_mask |= usage_mask; + break; + + default: + assert(0); + } + } + + /* The internal position input is in slot zero: + */ + setup_fragcoord_coef(setup, tri, info, 0, + fragcoord_usage_mask); +} + + + +static INLINE int subpixel_snap( float a ) +{ + return util_iround(FIXED_ONE * a); +} + + +/** + * Print line vertex attribs (for debug). 
+ */ +static void +print_line(struct lp_setup_context *setup, + const float (*v1)[4], + const float (*v2)[4]) +{ + uint i; + + debug_printf("llvmpipe line\n"); + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + debug_printf(" v1[%d]: %f %f %f %f\n", i, + v1[i][0], v1[i][1], v1[i][2], v1[i][3]); + } + for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { + debug_printf(" v2[%d]: %f %f %f %f\n", i, + v2[i][0], v2[i][1], v2[i][2], v2[i][3]); + } +} + + +static INLINE boolean sign(float x){ + return x >= 0; +} + + +/* Used on positive floats only: + */ +static INLINE float fracf(float f) { + return f - floorf(f); } -void -lp_setup_choose_line( struct lp_setup_context *setup ) + +static void +lp_setup_line( struct lp_setup_context *setup, + const float (*v1)[4], + const float (*v2)[4]) { - setup->line = line_nop; + struct lp_scene *scene = lp_setup_get_current_scene(setup); + struct lp_rast_triangle *line; + struct lp_line_info info; + float width = MAX2(1.0, setup->line_width); + struct u_rect bbox; + unsigned tri_bytes; + int x[4]; + int y[4]; + int i; + int nr_planes = 4; + + /* linewidth should be interpreted as integer */ + int fixed_width = util_iround(width) * FIXED_ONE; + + float x_offset=0; + float y_offset=0; + float x_offset_end=0; + float y_offset_end=0; + + float x1diff; + float y1diff; + float x2diff; + float y2diff; + float dx, dy; + + boolean draw_start; + boolean draw_end; + boolean will_draw_start; + boolean will_draw_end; + + if (0) + print_line(setup, v1, v2); + + if (setup->scissor_test) { + nr_planes = 8; + } + else { + nr_planes = 4; + } + + + dx = v1[0][0] - v2[0][0]; + dy = v1[0][1] - v2[0][1]; + + /* X-MAJOR LINE */ + if (fabsf(dx) >= fabsf(dy)) { + float dydx = dy / dx; + + x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5; + y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5; + x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5; + y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5; + + if (y2diff==-0.5 && dy<0){ + y2diff = 0.5; + } + + /* + * Diamond 
exit rule test for starting point + */ + if (fabsf(x1diff) + fabsf(y1diff) < 0.5) { + draw_start = TRUE; + } + else if (sign(x1diff) == sign(-dx)) { + draw_start = FALSE; + } + else if (sign(-y1diff) != sign(dy)) { + draw_start = TRUE; + } + else { + /* do intersection test */ + float yintersect = fracf(v1[0][1]) + x1diff * dydx; + draw_start = (yintersect < 1.0 && yintersect > 0.0); + } + + + /* + * Diamond exit rule test for ending point + */ + if (fabsf(x2diff) + fabsf(y2diff) < 0.5) { + draw_end = FALSE; + } + else if (sign(x2diff) != sign(-dx)) { + draw_end = FALSE; + } + else if (sign(-y2diff) == sign(dy)) { + draw_end = TRUE; + } + else { + /* do intersection test */ + float yintersect = fracf(v2[0][1]) + x2diff * dydx; + draw_end = (yintersect < 1.0 && yintersect > 0.0); + } + + /* Are we already drawing start/end? + */ + will_draw_start = sign(-x1diff) != sign(dx); + will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0; + + if (dx < 0) { + /* if v2 is to the right of v1, swap pointers */ + const float (*temp)[4] = v1; + v1 = v2; + v2 = temp; + dx = -dx; + dy = -dy; + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + x_offset_end = - x1diff - 0.5; + y_offset_end = x_offset_end * dydx; + + } + if (will_draw_end != draw_end) { + x_offset = - x2diff - 0.5; + y_offset = x_offset * dydx; + } + + } + else{ + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + x_offset = - x1diff + 0.5; + y_offset = x_offset * dydx; + } + if (will_draw_end != draw_end) { + x_offset_end = - x2diff + 0.5; + y_offset_end = x_offset_end * dydx; + } + } + + /* x/y positions in fixed point */ + x[0] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset); + x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset); + x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset); + x[3] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset); + + y[0] = subpixel_snap(v1[0][1] + y_offset - 
setup->pixel_offset) - fixed_width/2; + y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) - fixed_width/2; + y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) + fixed_width/2; + y[3] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset) + fixed_width/2; + + } + else { + const float dxdy = dx / dy; + + /* Y-MAJOR LINE */ + x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5; + y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5; + x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5; + y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5; + + if (x2diff==-0.5 && dx<0) { + x2diff = 0.5; + } + + /* + * Diamond exit rule test for starting point + */ + if (fabsf(x1diff) + fabsf(y1diff) < 0.5) { + draw_start = TRUE; + } + else if (sign(-y1diff) == sign(dy)) { + draw_start = FALSE; + } + else if (sign(x1diff) != sign(-dx)) { + draw_start = TRUE; + } + else { + /* do intersection test */ + float xintersect = fracf(v1[0][0]) + y1diff * dxdy; + draw_start = (xintersect < 1.0 && xintersect > 0.0); + } + + /* + * Diamond exit rule test for ending point + */ + if (fabsf(x2diff) + fabsf(y2diff) < 0.5) { + draw_end = FALSE; + } + else if (sign(-y2diff) != sign(dy) ) { + draw_end = FALSE; + } + else if (sign(x2diff) == sign(-dx) ) { + draw_end = TRUE; + } + else { + /* do intersection test */ + float xintersect = fracf(v2[0][0]) + y2diff * dxdy; + draw_end = (xintersect < 1.0 && xintersect > 0.0); + } + + /* Are we already drawing start/end? 
+ */ + will_draw_start = sign(y1diff) == sign(dy); + will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0; + + if (dy > 0) { + /* if v2 is on top of v1, swap pointers */ + const float (*temp)[4] = v1; + v1 = v2; + v2 = temp; + dx = -dx; + dy = -dy; + + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + y_offset_end = - y1diff + 0.5; + x_offset_end = y_offset_end * dxdy; + } + if (will_draw_end != draw_end) { + y_offset = - y2diff + 0.5; + x_offset = y_offset * dxdy; + } + } + else { + /* Otherwise shift planes appropriately */ + if (will_draw_start != draw_start) { + y_offset = - y1diff - 0.5; + x_offset = y_offset * dxdy; + + } + if (will_draw_end != draw_end) { + y_offset_end = - y2diff - 0.5; + x_offset_end = y_offset_end * dxdy; + } + } + + /* x/y positions in fixed point */ + x[0] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset) - fixed_width/2; + x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2; + x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) + fixed_width/2; + x[3] = subpixel_snap(v1[0][0] + x_offset - setup->pixel_offset) + fixed_width/2; + + y[0] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset); + y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset); + y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset); + y[3] = subpixel_snap(v1[0][1] + y_offset - setup->pixel_offset); + } + + + + LP_COUNT(nr_tris); + + + /* Bounding rectangle (in pixels) */ + { + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->pixel_offset != 0) ? 
1 : 0; + + bbox.x0 = (MIN4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.x1 = (MAX4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.y0 = (MIN4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + bbox.y1 = (MAX4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + + /* Inclusive coordinates: + */ + bbox.x1--; + bbox.y1--; + } + + if (bbox.x1 < bbox.x0 || + bbox.y1 < bbox.y0) { + if (0) debug_printf("empty bounding box\n"); + LP_COUNT(nr_culled_tris); + return; + } + + if (!u_rect_test_intersection(&setup->draw_region, &bbox)) { + if (0) debug_printf("offscreen\n"); + LP_COUNT(nr_culled_tris); + return; + } + + u_rect_find_intersection(&setup->draw_region, &bbox); + + line = lp_setup_alloc_triangle(scene, + setup->fs.nr_inputs, + nr_planes, + &tri_bytes); + if (!line) + return; + +#ifdef DEBUG + line->v[0][0] = v1[0][0]; + line->v[1][0] = v2[0][0]; + line->v[0][1] = v1[0][1]; + line->v[1][1] = v2[0][1]; +#endif + + /* calculate the deltas */ + line->plane[0].dcdy = x[0] - x[1]; + line->plane[1].dcdy = x[1] - x[2]; + line->plane[2].dcdy = x[2] - x[3]; + line->plane[3].dcdy = x[3] - x[0]; + + line->plane[0].dcdx = y[0] - y[1]; + line->plane[1].dcdx = y[1] - y[2]; + line->plane[2].dcdx = y[2] - y[3]; + line->plane[3].dcdx = y[3] - y[0]; + + + info.oneoverarea = 1.0f / (dx * dx + dy * dy); + info.dx = dx; + info.dy = dy; + info.v1 = v1; + info.v2 = v2; + + /* Setup parameter interpolants: + */ + setup_line_coefficients( setup, line, &info); + + line->inputs.facing = 1.0F; + line->inputs.state = setup->fs.stored; + + for (i = 0; i < 4; i++) { + struct lp_rast_plane *plane = &line->plane[i]; + + /* half-edge constants, will be interated over the whole render + * target. + */ + plane->c = plane->dcdx * x[i] - plane->dcdy * y[i]; + + + /* correct for top-left vs. bottom-left fill convention. 
+ * + * note that we're overloading gl_rasterization_rules to mean + * both (0.5,0.5) pixel centers *and* bottom-left filling + * convention. + * + * GL actually has a top-left filling convention, but GL's + * notion of "top" differs from gallium's... + * + * Also, sometimes (in FBO cases) GL will render upside down + * to its usual method, in which case it will probably want + * to use the opposite, top-left convention. + */ + if (plane->dcdx < 0) { + /* both fill conventions want this - adjust for left edges */ + plane->c++; + } + else if (plane->dcdx == 0) { + if (setup->pixel_offset == 0) { + /* correct for top-left fill convention: + */ + if (plane->dcdy > 0) plane->c++; + } + else { + /* correct for bottom-left fill convention: + */ + if (plane->dcdy < 0) plane->c++; + } + } + + plane->dcdx *= FIXED_ONE; + plane->dcdy *= FIXED_ONE; + + /* find trivial reject offsets for each edge for a single-pixel + * sized block. These will be scaled up at each recursive level to + * match the active blocksize. Scaling in this way works best if + * the blocks are square. + */ + plane->eo = 0; + if (plane->dcdx < 0) plane->eo -= plane->dcdx; + if (plane->dcdy > 0) plane->eo += plane->dcdy; + + /* Calculate trivial accept offsets from the above. + */ + plane->ei = plane->dcdy - plane->dcdx - plane->eo; + } + + + /* + * When rasterizing scissored tris, use the intersection of the + * triangle bounding box and the scissor rect to generate the + * scissor planes. + * + * This permits us to cut off the triangle "tails" that are present + * in the intermediate recursive levels caused when two of the + * triangles edges don't diverge quickly enough to trivially reject + * exterior blocks from the triangle. + * + * It's not really clear if it's worth worrying about these tails, + * but since we generate the planes for each scissored tri, it's + * free to trim them in this case. 
+ * + * Note that otherwise, the scissor planes only vary in 'C' value, + * and even then only on state-changes. Could alternatively store + * these planes elsewhere. + */ + if (nr_planes == 8) { + line->plane[4].dcdx = -1; + line->plane[4].dcdy = 0; + line->plane[4].c = 1-bbox.x0; + line->plane[4].ei = 0; + line->plane[4].eo = 1; + + line->plane[5].dcdx = 1; + line->plane[5].dcdy = 0; + line->plane[5].c = bbox.x1+1; + line->plane[5].ei = -1; + line->plane[5].eo = 0; + + line->plane[6].dcdx = 0; + line->plane[6].dcdy = 1; + line->plane[6].c = 1-bbox.y0; + line->plane[6].ei = 0; + line->plane[6].eo = 1; + + line->plane[7].dcdx = 0; + line->plane[7].dcdy = -1; + line->plane[7].c = bbox.y1+1; + line->plane[7].ei = -1; + line->plane[7].eo = 0; + } + + lp_setup_bin_triangle(setup, line, &bbox, nr_planes); +} + + +void lp_setup_choose_line( struct lp_setup_context *setup ) +{ + setup->line = lp_setup_line; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index 9f69e6c5ce2..6ae318d328d 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -1,6 +1,6 @@ /************************************************************************** * - * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright 2010, VMware Inc. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a @@ -18,7 +18,7 @@ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. @@ -30,17 +30,299 @@ */ #include "lp_setup_context.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "lp_perf.h" +#include "lp_setup_context.h" +#include "lp_rast.h" +#include "lp_state_fs.h" +#include "tgsi/tgsi_scan.h" + +#define NUM_CHANNELS 4 + +struct point_info { + /* x,y deltas */ + int dy01, dy12; + int dx01, dx12; + + const float (*v0)[4]; +}; + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + */ +static void constant_coef( struct lp_setup_context *setup, + struct lp_rast_triangle *point, + unsigned slot, + const float value, + unsigned i ) +{ + point->inputs.a0[slot][i] = value; + point->inputs.dadx[slot][i] = 0.0f; + point->inputs.dady[slot][i] = 0.0f; +} + +static void perspective_coef( struct lp_setup_context *setup, + struct lp_rast_triangle *point, + const struct point_info *info, + unsigned slot, + unsigned vert_attr, + unsigned i) +{ + if (i == 0) { + float dadx = FIXED_ONE / (float)info->dx12; + float dady = 0.0f; + point->inputs.dadx[slot][i] = dadx; + point->inputs.dady[slot][i] = dady; + point->inputs.a0[slot][i] = (0.5 - + (dadx * ((float)info->v0[0][0] - setup->pixel_offset) + + dady * ((float)info->v0[0][1] - setup->pixel_offset))); + } + + else if (i == 1) { + float dadx = 0.0f; + float dady = FIXED_ONE / (float)info->dx12; + + point->inputs.dadx[slot][i] = dadx; + point->inputs.dady[slot][i] = dady; + point->inputs.a0[slot][i] = (0.5 - + (dadx * ((float)info->v0[0][0] - setup->pixel_offset) + + dady * ((float)info->v0[0][1] - setup->pixel_offset))); + } + + else if (i == 2) { + point->inputs.a0[slot][i] = 0.0f; + point->inputs.dadx[slot][i] = 0.0f; + 
point->inputs.dady[slot][i] = 0.0f; + } + + else if (i == 3) { + point->inputs.a0[slot][i] = 1.0f; + point->inputs.dadx[slot][i] = 0.0f; + point->inputs.dady[slot][i] = 0.0f; + } + +} + + +/** + * Special coefficient setup for gl_FragCoord. + * X and Y are trivial + * Z and W are copied from position_coef which should have already been computed. + * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. + */ +static void +setup_point_fragcoord_coef(struct lp_setup_context *setup, + struct lp_rast_triangle *point, + const struct point_info *info, + unsigned slot, + unsigned usage_mask) +{ + /*X*/ + if (usage_mask & TGSI_WRITEMASK_X) { + point->inputs.a0[slot][0] = 0.0; + point->inputs.dadx[slot][0] = 1.0; + point->inputs.dady[slot][0] = 0.0; + } + + /*Y*/ + if (usage_mask & TGSI_WRITEMASK_Y) { + point->inputs.a0[slot][1] = 0.0; + point->inputs.dadx[slot][1] = 0.0; + point->inputs.dady[slot][1] = 1.0; + } + + /*Z*/ + if (usage_mask & TGSI_WRITEMASK_Z) { + constant_coef(setup, point, slot, info->v0[0][2], 2); + } + + /*W*/ + if (usage_mask & TGSI_WRITEMASK_W) { + constant_coef(setup, point, slot, info->v0[0][3], 3); + } +} + +/** + * Compute the point->coef[] array dadx, dady, a0 values. + */ +static void +setup_point_coefficients( struct lp_setup_context *setup, + struct lp_rast_triangle *point, + const struct point_info *info) +{ + unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; + unsigned slot; + + /* setup interpolation for all the remaining attributes: + */ + for (slot = 0; slot < setup->fs.nr_inputs; slot++) { + unsigned vert_attr = setup->fs.input[slot].src_index; + unsigned usage_mask = setup->fs.input[slot].usage_mask; + unsigned i; + + switch (setup->fs.input[slot].interp) { + case LP_INTERP_POSITION: + /* + * The generated pixel interpolators will pick up the coeffs from + * slot 0, so all need to ensure that the usage mask is covers all + * usages. 
+ */ + fragcoord_usage_mask |= usage_mask; + break; + + case LP_INTERP_PERSPECTIVE: + /* For point sprite textures */ + if (setup->fs.current.variant->shader->info.input_semantic_name[slot] + == TGSI_SEMANTIC_GENERIC) + { + int index = setup->fs.current.variant->shader->info.input_semantic_index[slot]; + + if (setup->sprite & (1 << index)) { + for (i = 0; i < NUM_CHANNELS; i++) + if (usage_mask & (1 << i)) + perspective_coef(setup, point, info, slot+1, vert_attr, i); + fragcoord_usage_mask |= TGSI_WRITEMASK_W; + break; + } + } + + /* Otherwise fallthrough */ + default: + for (i = 0; i < NUM_CHANNELS; i++) { + if (usage_mask & (1 << i)) + constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i); + } + } + } -static void point_nop( struct lp_setup_context *setup, - const float (*v0)[4] ) + /* The internal position input is in slot zero: + */ + setup_point_fragcoord_coef(setup, point, info, 0, + fragcoord_usage_mask); +} + +static INLINE int +subpixel_snap(float a) { + return util_iround(FIXED_ONE * a); +} + + +static void lp_setup_point( struct lp_setup_context *setup, + const float (*v0)[4] ) +{ + /* x/y positions in fixed point */ + const int sizeAttr = setup->psize; + const float size + = (setup->point_size_per_vertex && sizeAttr > 0) ? 
v0[sizeAttr][0] + : setup->point_size; + + /* Point size as fixed point integer, remove rounding errors + * and gives minimum width for very small points + */ + int fixed_width = MAX2(FIXED_ONE, + (subpixel_snap(size) + FIXED_ONE/2 - 1) & ~(FIXED_ONE-1)); + + const int x0 = subpixel_snap(v0[0][0] - setup->pixel_offset) - fixed_width/2; + const int y0 = subpixel_snap(v0[0][1] - setup->pixel_offset) - fixed_width/2; + + struct lp_scene *scene = lp_setup_get_current_scene(setup); + struct lp_rast_triangle *point; + unsigned bytes; + struct u_rect bbox; + unsigned nr_planes = 4; + struct point_info info; + + + /* Bounding rectangle (in pixels) */ + { + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->pixel_offset != 0) ? 1 : 0; + + bbox.x0 = (x0 + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + bbox.x1 = (x0 + fixed_width + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + bbox.y0 = (y0 + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.y1 = (y0 + fixed_width + (FIXED_ONE-1)) >> FIXED_ORDER; + + /* Inclusive coordinates: + */ + bbox.x1--; + bbox.y1--; + } + + if (!u_rect_test_intersection(&setup->draw_region, &bbox)) { + if (0) debug_printf("offscreen\n"); + LP_COUNT(nr_culled_tris); + return; + } + + u_rect_find_intersection(&setup->draw_region, &bbox); + + point = lp_setup_alloc_triangle(scene, + setup->fs.nr_inputs, + nr_planes, + &bytes); + if (!point) + return; + +#ifdef DEBUG + point->v[0][0] = v0[0][0]; + point->v[0][1] = v0[0][1]; +#endif + + info.v0 = v0; + info.dx01 = 0; + info.dx12 = fixed_width; + info.dy01 = fixed_width; + info.dy12 = 0; + + /* Setup parameter interpolants: + */ + setup_point_coefficients(setup, point, &info); + + point->inputs.facing = 1.0F; + point->inputs.state = setup->fs.stored; + + { + point->plane[0].dcdx = -1; + point->plane[0].dcdy = 0; + point->plane[0].c = 
1-bbox.x0; + point->plane[0].ei = 0; + point->plane[0].eo = 1; + + point->plane[1].dcdx = 1; + point->plane[1].dcdy = 0; + point->plane[1].c = bbox.x1+1; + point->plane[1].ei = -1; + point->plane[1].eo = 0; + + point->plane[2].dcdx = 0; + point->plane[2].dcdy = 1; + point->plane[2].c = 1-bbox.y0; + point->plane[2].ei = 0; + point->plane[2].eo = 1; + + point->plane[3].dcdx = 0; + point->plane[3].dcdy = -1; + point->plane[3].c = bbox.y1+1; + point->plane[3].ei = -1; + point->plane[3].eo = 0; + } + + lp_setup_bin_triangle(setup, point, &bbox, nr_planes); } void lp_setup_choose_point( struct lp_setup_context *setup ) { - setup->point = point_nop; + setup->point = lp_setup_point; } diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 393533ebee4..0180d95090f 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -31,35 +31,15 @@ #include "util/u_math.h" #include "util/u_memory.h" +#include "util/u_rect.h" #include "lp_perf.h" #include "lp_setup_context.h" +#include "lp_setup_coef.h" #include "lp_rast.h" #include "lp_state_fs.h" #define NUM_CHANNELS 4 -struct tri_info { - - float pixel_offset; - - /* fixed point vertex coordinates */ - int x[3]; - int y[3]; - - /* float x,y deltas - all from the original coordinates - */ - float dy01, dy20; - float dx01, dx20; - float oneoverarea; - - const float (*v0)[4]; - const float (*v1)[4]; - const float (*v2)[4]; - - boolean frontfacing; -}; - - static INLINE int @@ -76,247 +56,6 @@ fixed_to_float(int a) -/** - * Compute a0 for a constant-valued coefficient (GL_FLAT shading). 
- */ -static void constant_coef( struct lp_rast_triangle *tri, - unsigned slot, - const float value, - unsigned i ) -{ - tri->inputs.a0[slot][i] = value; - tri->inputs.dadx[slot][i] = 0.0f; - tri->inputs.dady[slot][i] = 0.0f; -} - - - -static void linear_coef( struct lp_rast_triangle *tri, - const struct tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - float a0 = info->v0[vert_attr][i]; - float a1 = info->v1[vert_attr][i]; - float a2 = info->v2[vert_attr][i]; - - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; - float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; - - tri->inputs.dadx[slot][i] = dadx; - tri->inputs.dady[slot][i] = dady; - - /* calculate a0 as the value which would be sampled for the - * fragment at (0,0), taking into account that we want to sample at - * pixel centers, in other words (0.5, 0.5). - * - * this is neat but unfortunately not a good way to do things for - * triangles with very large values of dadx or dady as it will - * result in the subtraction and re-addition from a0 of a very - * large number, which means we'll end up loosing a lot of the - * fractional bits and precision from a0. the way to fix this is - * to define a0 as the sample at a pixel center somewhere near vmin - * instead - i'll switch to this later. - */ - tri->inputs.a0[slot][i] = (a0 - - (dadx * (info->v0[0][0] - info->pixel_offset) + - dady * (info->v0[0][1] - info->pixel_offset))); -} - - -/** - * Compute a0, dadx and dady for a perspective-corrected interpolant, - * for a triangle. - * We basically multiply the vertex value by 1/w before computing - * the plane coefficients (a0, dadx, dady). - * Later, when we compute the value at a particular fragment position we'll - * divide the interpolated value by the interpolated W at that fragment. 
- */ -static void perspective_coef( struct lp_rast_triangle *tri, - const struct tri_info *info, - unsigned slot, - unsigned vert_attr, - unsigned i) -{ - /* premultiply by 1/w (v[0][3] is always 1/w): - */ - float a0 = info->v0[vert_attr][i] * info->v0[0][3]; - float a1 = info->v1[vert_attr][i] * info->v1[0][3]; - float a2 = info->v2[vert_attr][i] * info->v2[0][3]; - float da01 = a0 - a1; - float da20 = a2 - a0; - float dadx = (da01 * info->dy20 - info->dy01 * da20) * info->oneoverarea; - float dady = (da20 * info->dx01 - info->dx20 * da01) * info->oneoverarea; - - tri->inputs.dadx[slot][i] = dadx; - tri->inputs.dady[slot][i] = dady; - tri->inputs.a0[slot][i] = (a0 - - (dadx * (info->v0[0][0] - info->pixel_offset) + - dady * (info->v0[0][1] - info->pixel_offset))); -} - - -/** - * Special coefficient setup for gl_FragCoord. - * X and Y are trivial - * Z and W are copied from position_coef which should have already been computed. - * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. - */ -static void -setup_fragcoord_coef(struct lp_rast_triangle *tri, - const struct tri_info *info, - unsigned slot, - unsigned usage_mask) -{ - /*X*/ - if (usage_mask & TGSI_WRITEMASK_X) { - tri->inputs.a0[slot][0] = 0.0; - tri->inputs.dadx[slot][0] = 1.0; - tri->inputs.dady[slot][0] = 0.0; - } - - /*Y*/ - if (usage_mask & TGSI_WRITEMASK_Y) { - tri->inputs.a0[slot][1] = 0.0; - tri->inputs.dadx[slot][1] = 0.0; - tri->inputs.dady[slot][1] = 1.0; - } - - /*Z*/ - if (usage_mask & TGSI_WRITEMASK_Z) { - linear_coef(tri, info, slot, 0, 2); - } - - /*W*/ - if (usage_mask & TGSI_WRITEMASK_W) { - linear_coef(tri, info, slot, 0, 3); - } -} - - -/** - * Setup the fragment input attribute with the front-facing value. - * \param frontface is the triangle front facing? 
- */ -static void setup_facing_coef( struct lp_rast_triangle *tri, - unsigned slot, - boolean frontface, - unsigned usage_mask) -{ - /* convert TRUE to 1.0 and FALSE to -1.0 */ - if (usage_mask & TGSI_WRITEMASK_X) - constant_coef( tri, slot, 2.0f * frontface - 1.0f, 0 ); - - if (usage_mask & TGSI_WRITEMASK_Y) - constant_coef( tri, slot, 0.0f, 1 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_Z) - constant_coef( tri, slot, 0.0f, 2 ); /* wasted */ - - if (usage_mask & TGSI_WRITEMASK_W) - constant_coef( tri, slot, 0.0f, 3 ); /* wasted */ -} - - -/** - * Compute the tri->coef[] array dadx, dady, a0 values. - */ -static void setup_tri_coefficients( struct lp_setup_context *setup, - struct lp_rast_triangle *tri, - const struct tri_info *info) -{ - unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ; - unsigned slot; - unsigned i; - - /* setup interpolation for all the remaining attributes: - */ - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned vert_attr = setup->fs.input[slot].src_index; - unsigned usage_mask = setup->fs.input[slot].usage_mask; - - switch (setup->fs.input[slot].interp) { - case LP_INTERP_CONSTANT: - if (setup->flatshade_first) { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(tri, slot+1, info->v0[vert_attr][i], i); - } - else { - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - constant_coef(tri, slot+1, info->v2[vert_attr][i], i); - } - break; - - case LP_INTERP_LINEAR: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - linear_coef(tri, info, slot+1, vert_attr, i); - break; - - case LP_INTERP_PERSPECTIVE: - for (i = 0; i < NUM_CHANNELS; i++) - if (usage_mask & (1 << i)) - perspective_coef(tri, info, slot+1, vert_attr, i); - fragcoord_usage_mask |= TGSI_WRITEMASK_W; - break; - - case LP_INTERP_POSITION: - /* - * The generated pixel interpolators will pick up the coeffs from - * slot 0, so all need to ensure that the usage mask is covers all - * usages. 
- */ - fragcoord_usage_mask |= usage_mask; - break; - - case LP_INTERP_FACING: - setup_facing_coef(tri, slot+1, info->frontfacing, usage_mask); - break; - - default: - assert(0); - } - } - - /* The internal position input is in slot zero: - */ - setup_fragcoord_coef(tri, info, 0, fragcoord_usage_mask); - - if (0) { - for (i = 0; i < NUM_CHANNELS; i++) { - float a0 = tri->inputs.a0 [0][i]; - float dadx = tri->inputs.dadx[0][i]; - float dady = tri->inputs.dady[0][i]; - - debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n", - "xyzw"[i], - a0, dadx, dady); - } - - for (slot = 0; slot < setup->fs.nr_inputs; slot++) { - unsigned usage_mask = setup->fs.input[slot].usage_mask; - for (i = 0; i < NUM_CHANNELS; i++) { - if (usage_mask & (1 << i)) { - float a0 = tri->inputs.a0 [1 + slot][i]; - float dadx = tri->inputs.dadx[1 + slot][i]; - float dady = tri->inputs.dady[1 + slot][i]; - - debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n", - slot, - "xyzw"[i], - a0, dadx, dady); - } - } - } - } -} - - @@ -329,11 +68,11 @@ static void setup_tri_coefficients( struct lp_setup_context *setup, * \param nr_inputs number of fragment shader inputs * \return pointer to triangle space */ -static INLINE struct lp_rast_triangle * -alloc_triangle(struct lp_scene *scene, - unsigned nr_inputs, - unsigned nr_planes, - unsigned *tri_size) +struct lp_rast_triangle * +lp_setup_alloc_triangle(struct lp_scene *scene, + unsigned nr_inputs, + unsigned nr_planes, + unsigned *tri_size) { unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float); struct lp_rast_triangle *tri; @@ -357,35 +96,71 @@ alloc_triangle(struct lp_scene *scene, return tri; } +void +lp_setup_print_vertex(struct lp_setup_context *setup, + const char *name, + const float (*v)[4]) +{ + int i, j; + + debug_printf(" wpos (%s[0]) xyzw %f %f %f %f\n", + name, + v[0][0], v[0][1], v[0][2], v[0][3]); + + for (i = 0; i < setup->fs.nr_inputs; i++) { + const float *in = v[setup->fs.input[i].src_index]; + + debug_printf(" 
in[%d] (%s[%d]) %s%s%s%s ", + i, + name, setup->fs.input[i].src_index, + (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ", + (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ", + (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ", + (setup->fs.input[i].usage_mask & 0x8) ? "w" : " "); + + for (j = 0; j < 4; j++) + if (setup->fs.input[i].usage_mask & (1<<j)) + debug_printf("%.5f ", in[j]); + + debug_printf("\n"); + } +} + /** * Print triangle vertex attribs (for debug). */ -static void -print_triangle(struct lp_setup_context *setup, - const float (*v1)[4], - const float (*v2)[4], - const float (*v3)[4]) +void +lp_setup_print_triangle(struct lp_setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4]) { - uint i; + debug_printf("triangle\n"); - debug_printf("llvmpipe triangle\n"); - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { - debug_printf(" v1[%d]: %f %f %f %f\n", i, - v1[i][0], v1[i][1], v1[i][2], v1[i][3]); - } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { - debug_printf(" v2[%d]: %f %f %f %f\n", i, - v2[i][0], v2[i][1], v2[i][2], v2[i][3]); - } - for (i = 0; i < 1 + setup->fs.nr_inputs; i++) { - debug_printf(" v3[%d]: %f %f %f %f\n", i, - v3[i][0], v3[i][1], v3[i][2], v3[i][3]); + { + const float ex = v0[0][0] - v2[0][0]; + const float ey = v0[0][1] - v2[0][1]; + const float fx = v1[0][0] - v2[0][0]; + const float fy = v1[0][1] - v2[0][1]; + + /* det = cross(e,f).z */ + const float det = ex * fy - ey * fx; + if (det < 0.0f) + debug_printf(" - ccw\n"); + else if (det > 0.0f) + debug_printf(" - cw\n"); + else + debug_printf(" - zero area\n"); } + + lp_setup_print_vertex(setup, "v0", v0); + lp_setup_print_vertex(setup, "v1", v1); + lp_setup_print_vertex(setup, "v2", v2); } -lp_rast_cmd lp_rast_tri_tab[8] = { +lp_rast_cmd lp_rast_tri_tab[9] = { NULL, /* should be impossible */ lp_rast_triangle_1, lp_rast_triangle_2, @@ -393,7 +168,8 @@ lp_rast_cmd lp_rast_tri_tab[8] = { lp_rast_triangle_4, lp_rast_triangle_5, 
lp_rast_triangle_6, - lp_rast_triangle_7 + lp_rast_triangle_7, + lp_rast_triangle_8 }; /** @@ -403,25 +179,27 @@ lp_rast_cmd lp_rast_tri_tab[8] = { */ static void do_triangle_ccw(struct lp_setup_context *setup, + const float (*v0)[4], const float (*v1)[4], const float (*v2)[4], - const float (*v3)[4], boolean frontfacing ) { - struct lp_scene *scene = lp_setup_get_current_scene(setup); - struct lp_fragment_shader_variant *variant = setup->fs.current.variant; struct lp_rast_triangle *tri; - struct tri_info info; + int x[3]; + int y[3]; + float dy01, dy20; + float dx01, dx20; + float oneoverarea; + struct lp_tri_info info; int area; - int minx, maxx, miny, maxy; - int ix0, ix1, iy0, iy1; + struct u_rect bbox; unsigned tri_bytes; int i; int nr_planes = 3; if (0) - print_triangle(setup, v1, v2, v3); + lp_setup_print_triangle(setup, v0, v1, v2); if (setup->scissor_test) { nr_planes = 7; @@ -430,38 +208,73 @@ do_triangle_ccw(struct lp_setup_context *setup, nr_planes = 3; } + /* x/y positions in fixed point */ + x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset); + x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset); + x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset); + y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset); + y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset); + y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset); + + + /* Bounding rectangle (in pixels) */ + { + /* Yes this is necessary to accurately calculate bounding boxes + * with the two fill-conventions we support. GL (normally) ends + * up needing a bottom-left fill convention, which requires + * slightly different rounding. + */ + int adj = (setup->pixel_offset != 0) ? 
1 : 0; + + bbox.x0 = (MIN3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.x1 = (MAX3(x[0], x[1], x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; + bbox.y0 = (MIN3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + bbox.y1 = (MAX3(y[0], y[1], y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; + + /* Inclusive coordinates: + */ + bbox.x1--; + bbox.y1--; + } + + if (bbox.x1 < bbox.x0 || + bbox.y1 < bbox.y0) { + if (0) debug_printf("empty bounding box\n"); + LP_COUNT(nr_culled_tris); + return; + } + + if (!u_rect_test_intersection(&setup->draw_region, &bbox)) { + if (0) debug_printf("offscreen\n"); + LP_COUNT(nr_culled_tris); + return; + } + + u_rect_find_intersection(&setup->draw_region, &bbox); - tri = alloc_triangle(scene, - setup->fs.nr_inputs, - nr_planes, - &tri_bytes); + tri = lp_setup_alloc_triangle(scene, + setup->fs.nr_inputs, + nr_planes, + &tri_bytes); if (!tri) return; #ifdef DEBUG - tri->v[0][0] = v1[0][0]; - tri->v[1][0] = v2[0][0]; - tri->v[2][0] = v3[0][0]; - tri->v[0][1] = v1[0][1]; - tri->v[1][1] = v2[0][1]; - tri->v[2][1] = v3[0][1]; + tri->v[0][0] = v0[0][0]; + tri->v[1][0] = v1[0][0]; + tri->v[2][0] = v2[0][0]; + tri->v[0][1] = v0[0][1]; + tri->v[1][1] = v1[0][1]; + tri->v[2][1] = v2[0][1]; #endif - /* x/y positions in fixed point */ - info.x[0] = subpixel_snap(v1[0][0] - setup->pixel_offset); - info.x[1] = subpixel_snap(v2[0][0] - setup->pixel_offset); - info.x[2] = subpixel_snap(v3[0][0] - setup->pixel_offset); - info.y[0] = subpixel_snap(v1[0][1] - setup->pixel_offset); - info.y[1] = subpixel_snap(v2[0][1] - setup->pixel_offset); - info.y[2] = subpixel_snap(v3[0][1] - setup->pixel_offset); - - tri->plane[0].dcdy = info.x[0] - info.x[1]; - tri->plane[1].dcdy = info.x[1] - info.x[2]; - tri->plane[2].dcdy = info.x[2] - info.x[0]; + tri->plane[0].dcdy = x[0] - x[1]; + tri->plane[1].dcdy = x[1] - x[2]; + tri->plane[2].dcdy = x[2] - x[0]; - tri->plane[0].dcdx = info.y[0] - info.y[1]; - tri->plane[1].dcdx = info.y[1] - info.y[2]; - 
tri->plane[2].dcdx = info.y[2] - info.y[0]; + tri->plane[0].dcdx = y[0] - y[1]; + tri->plane[1].dcdx = y[1] - y[2]; + tri->plane[2].dcdx = y[2] - y[0]; area = (tri->plane[0].dcdy * tri->plane[2].dcdx - tri->plane[2].dcdy * tri->plane[0].dcdx); @@ -478,57 +291,29 @@ do_triangle_ccw(struct lp_setup_context *setup, return; } - /* Bounding rectangle (in pixels) */ - { - /* Yes this is necessary to accurately calculate bounding boxes - * with the two fill-conventions we support. GL (normally) ends - * up needing a bottom-left fill convention, which requires - * slightly different rounding. - */ - int adj = (setup->pixel_offset != 0) ? 1 : 0; - - minx = (MIN3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; - maxx = (MAX3(info.x[0], info.x[1], info.x[2]) + (FIXED_ONE-1)) >> FIXED_ORDER; - miny = (MIN3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; - maxy = (MAX3(info.y[0], info.y[1], info.y[2]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER; - } - - if (setup->scissor_test) { - minx = MAX2(minx, setup->scissor.current.minx); - maxx = MIN2(maxx, setup->scissor.current.maxx); - miny = MAX2(miny, setup->scissor.current.miny); - maxy = MIN2(maxy, setup->scissor.current.maxy); - } - else { - minx = MAX2(minx, 0); - miny = MAX2(miny, 0); - maxx = MIN2(maxx, scene->fb.width); - maxy = MIN2(maxy, scene->fb.height); - } - - - if (miny >= maxy || minx >= maxx) { - lp_scene_putback_data( scene, tri_bytes ); - LP_COUNT(nr_culled_tris); - return; - } /* */ - info.pixel_offset = setup->pixel_offset; - info.v0 = v1; - info.v1 = v2; - info.v2 = v3; - info.dx01 = info.v0[0][0] - info.v1[0][0]; - info.dx20 = info.v2[0][0] - info.v0[0][0]; - info.dy01 = info.v0[0][1] - info.v1[0][1]; - info.dy20 = info.v2[0][1] - info.v0[0][1]; - info.oneoverarea = 1.0f / (info.dx01 * info.dy20 - info.dx20 * info.dy01); + dx01 = v0[0][0] - v1[0][0]; + dy01 = v0[0][1] - v1[0][1]; + dx20 = v2[0][0] - v0[0][0]; + dy20 = v2[0][1] - v0[0][1]; + oneoverarea = 1.0f / (dx01 * dy20 
- dx20 * dy01); + + info.v0 = v0; + info.v1 = v1; + info.v2 = v2; info.frontfacing = frontfacing; + info.x0_center = v0[0][0] - setup->pixel_offset; + info.y0_center = v0[0][1] - setup->pixel_offset; + info.dx01_ooa = dx01 * oneoverarea; + info.dx20_ooa = dx20 * oneoverarea; + info.dy01_ooa = dy01 * oneoverarea; + info.dy20_ooa = dy20 * oneoverarea; /* Setup parameter interpolants: */ - setup_tri_coefficients( setup, tri, &info ); + lp_setup_tri_coef( setup, &tri->inputs, &info ); tri->inputs.facing = frontfacing ? 1.0F : -1.0F; tri->inputs.state = setup->fs.stored; @@ -541,7 +326,7 @@ do_triangle_ccw(struct lp_setup_context *setup, /* half-edge constants, will be interated over the whole render * target. */ - plane->c = plane->dcdx * info.x[i] - plane->dcdy * info.y[i]; + plane->c = plane->dcdx * x[i] - plane->dcdy * y[i]; /* correct for top-left vs. bottom-left fill convention. * @@ -612,29 +397,43 @@ do_triangle_ccw(struct lp_setup_context *setup, if (nr_planes == 7) { tri->plane[3].dcdx = -1; tri->plane[3].dcdy = 0; - tri->plane[3].c = 1-minx; + tri->plane[3].c = 1-bbox.x0; tri->plane[3].ei = 0; tri->plane[3].eo = 1; tri->plane[4].dcdx = 1; tri->plane[4].dcdy = 0; - tri->plane[4].c = maxx; + tri->plane[4].c = bbox.x1+1; tri->plane[4].ei = -1; tri->plane[4].eo = 0; tri->plane[5].dcdx = 0; tri->plane[5].dcdy = 1; - tri->plane[5].c = 1-miny; + tri->plane[5].c = 1-bbox.y0; tri->plane[5].ei = 0; tri->plane[5].eo = 1; tri->plane[6].dcdx = 0; tri->plane[6].dcdy = -1; - tri->plane[6].c = maxy; + tri->plane[6].c = bbox.y1+1; tri->plane[6].ei = -1; tri->plane[6].eo = 0; } + lp_setup_bin_triangle( setup, tri, &bbox, nr_planes ); +} + + +void +lp_setup_bin_triangle( struct lp_setup_context *setup, + struct lp_rast_triangle *tri, + const struct u_rect *bbox, + int nr_planes ) +{ + struct lp_scene *scene = setup->scene; + struct lp_fragment_shader_variant *variant = setup->fs.current.variant; + int ix0, ix1, iy0, iy1; + int i; /* * All fields of 'tri' are now set. 
The remaining code here is @@ -643,10 +442,30 @@ do_triangle_ccw(struct lp_setup_context *setup, /* Convert to tile coordinates, and inclusive ranges: */ - ix0 = minx / TILE_SIZE; - iy0 = miny / TILE_SIZE; - ix1 = (maxx-1) / TILE_SIZE; - iy1 = (maxy-1) / TILE_SIZE; + if (nr_planes == 3) { + int ix0 = bbox->x0 / 16; + int iy0 = bbox->y0 / 16; + int ix1 = bbox->x1 / 16; + int iy1 = bbox->y1 / 16; + + if (iy0 == iy1 && ix0 == ix1) + { + + /* Triangle is contained in a single 16x16 block: + */ + int mask = (ix0 & 3) | ((iy0 & 3) << 4); + + lp_scene_bin_command( scene, ix0/4, iy0/4, + lp_rast_triangle_3_16, + lp_rast_arg_triangle(tri, mask) ); + return; + } + } + + ix0 = bbox->x0 / TILE_SIZE; + iy0 = bbox->y0 / TILE_SIZE; + ix1 = bbox->x1 / TILE_SIZE; + iy1 = bbox->y1 / TILE_SIZE; /* * Clamp to framebuffer size @@ -799,9 +618,10 @@ static void triangle_both( struct lp_setup_context *setup, const float fy = v1[0][1] - v2[0][1]; /* det = cross(e,f).z */ - if (ex * fy - ey * fx < 0.0f) + const float det = ex * fy - ey * fx; + if (det < 0.0f) triangle_ccw( setup, v0, v1, v2 ); - else + else if (det > 0.0f) triangle_cw( setup, v0, v1, v2 ); } diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index 77bec4640bb..edd723f65f2 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -74,6 +74,15 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) vs_index = draw_find_shader_output(llvmpipe->draw, lpfs->info.input_semantic_name[i], lpfs->info.input_semantic_index[i]); + if (vs_index < 0) { + /* + * This can happen with sprite coordinates - the vertex + * shader doesn't need to provide an output as we generate + * them internally. However, lets keep pretending that there + * is something there to not confuse other code. 
+ */ + vs_index = 0; + } /* This can be pre-computed, except for flatshade: */ @@ -125,6 +134,17 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) inputs[i].src_index = vinfo->num_attribs; draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); } + + /* Figure out if we need pointsize as well. + */ + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_PSIZE, 0); + + if (vs_index > 0) { + llvmpipe->psize_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } + llvmpipe->num_inputs = lpfs->info.num_inputs; draw_compute_vertex_size(vinfo); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index dbca49a2efa..33c1a49efec 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -808,7 +808,7 @@ generate_variant(struct llvmpipe_context *lp, variant->list_item_local.base = variant; variant->no = shader->variants_created++; - memcpy(&variant->key, key, sizeof *key); + memcpy(&variant->key, key, shader->variant_key_size); if (gallivm_debug & GALLIVM_DEBUG_IR) { debug_printf("llvmpipe: Creating fragment shader #%u variant #%u:\n", @@ -840,6 +840,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, const struct pipe_shader_state *templ) { struct lp_fragment_shader *shader; + int nr_samplers; shader = CALLOC_STRUCT(lp_fragment_shader); if (!shader) @@ -854,6 +855,11 @@ llvmpipe_create_fs_state(struct pipe_context *pipe, /* we need to keep a local copy of the tokens */ shader->base.tokens = tgsi_dup_tokens(templ->tokens); + nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + + shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key, + sampler[nr_samplers]); + if (LP_DEBUG & DEBUG_TGSI) { unsigned attrib; debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader); @@ -921,7 +927,6 @@ static void llvmpipe_delete_fs_state(struct pipe_context 
*pipe, void *fs) { struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe); - struct pipe_fence_handle *fence = NULL; struct lp_fragment_shader *shader = fs; struct lp_fs_variant_list_item *li; @@ -934,12 +939,7 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs) * Flushing alone might not sufficient we need to wait on it too. */ - llvmpipe_flush(pipe, 0, &fence); - - if (fence) { - pipe->screen->fence_finish(pipe->screen, fence, 0); - pipe->screen->fence_reference(pipe->screen, &fence, NULL); - } + llvmpipe_finish(pipe, __FUNCTION__); li = first_elem(&shader->variants); while(!at_end(&shader->variants, li)) { @@ -1027,7 +1027,7 @@ make_variant_key(struct llvmpipe_context *lp, { unsigned i; - memset(key, 0, sizeof *key); + memset(key, 0, shader->variant_key_size); if (lp->framebuffer.zsbuf) { if (lp->depth_stencil->depth.enabled) { @@ -1097,9 +1097,17 @@ make_variant_key(struct llvmpipe_context *lp, } } - for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) - if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) - lp_sampler_static_state(&key->sampler[i], lp->fragment_sampler_views[i], lp->sampler[i]); + /* This value will be the same for all the variants of a given shader: + */ + key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1; + + for(i = 0; i < key->nr_samplers; ++i) { + if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_state(&key->sampler[i], + lp->fragment_sampler_views[i], + lp->sampler[i]); + } + } } /** @@ -1118,7 +1126,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) li = first_elem(&shader->variants); while(!at_end(&shader->variants, li)) { - if(memcmp(&li->base->key, &key, sizeof key) == 0) { + if(memcmp(&li->base->key, &key, shader->variant_key_size) == 0) { variant = li->base; break; } @@ -1134,19 +1142,14 @@ llvmpipe_update_fs(struct llvmpipe_context *lp) unsigned i; if (lp->nr_fs_variants >= LP_MAX_SHADER_VARIANTS) { struct pipe_context *pipe = &lp->pipe; - struct pipe_fence_handle *fence = NULL; 
/* * XXX: we need to flush the context until we have some sort of reference * counting in fragment shaders as they may still be binned * Flushing alone might not be sufficient we need to wait on it too. */ - llvmpipe_flush(pipe, 0, &fence); + llvmpipe_finish(pipe, __FUNCTION__); - if (fence) { - pipe->screen->fence_finish(pipe->screen, fence, 0); - pipe->screen->fence_reference(pipe->screen, &fence, NULL); - } for (i = 0; i < LP_MAX_SHADER_VARIANTS / 4; i++) { struct lp_fs_variant_list_item *item = last_elem(&lp->fs_variants_list); remove_shader_variant(lp, item->base); diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h index 37900fc5443..33c480010dd 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.h +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h @@ -53,13 +53,10 @@ struct lp_fragment_shader_variant_key struct pipe_blend_state blend; enum pipe_format zsbuf_format; unsigned nr_cbufs:8; + unsigned nr_samplers:8; /* actually derivable from just the shader */ unsigned flatshade:1; unsigned occlusion_count:1; - struct { - ubyte colormask; - } cbuf_blend[PIPE_MAX_COLOR_BUFS]; - struct lp_sampler_static_state sampler[PIPE_MAX_SAMPLERS]; }; @@ -97,6 +94,7 @@ struct lp_fragment_shader struct lp_fs_variant_list_item variants; /* For debugging/profiling purposes */ + unsigned variant_key_size; unsigned no; unsigned variants_created; unsigned variants_cached; diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c index afd3e0b21c9..0bad7320f3e 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c +++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c @@ -73,7 +73,13 @@ llvmpipe_bind_rasterizer_state(struct pipe_context *pipe, void *handle) llvmpipe->rasterizer->gl_rasterization_rules); lp_setup_set_flatshade_first( llvmpipe->setup, llvmpipe->rasterizer->flatshade_first); - } + lp_setup_set_line_state( llvmpipe->setup, + 
llvmpipe->rasterizer->line_width); + lp_setup_set_point_state( llvmpipe->setup, + llvmpipe->rasterizer->point_size, + llvmpipe->rasterizer->point_size_per_vertex, + llvmpipe->rasterizer->sprite_coord_enable); + } llvmpipe->dirty |= LP_NEW_RASTERIZER; } diff --git a/src/gallium/drivers/llvmpipe/lp_state_vertex.c b/src/gallium/drivers/llvmpipe/lp_state_vertex.c index d86e66b4fb8..fb29423dd35 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_vertex.c +++ b/src/gallium/drivers/llvmpipe/lp_state_vertex.c @@ -100,7 +100,7 @@ llvmpipe_set_index_buffer(struct pipe_context *pipe, else memset(&llvmpipe->index_buffer, 0, sizeof(llvmpipe->index_buffer)); - /* TODO make this more like a state */ + draw_set_index_buffer(llvmpipe->draw, ib); } void diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c index f761e828500..63ddc669c2c 100644 --- a/src/gallium/drivers/llvmpipe/lp_surface.c +++ b/src/gallium/drivers/llvmpipe/lp_surface.c @@ -68,14 +68,16 @@ lp_resource_copy(struct pipe_context *pipe, 0, /* flush_flags */ FALSE, /* read_only */ TRUE, /* cpu_access */ - FALSE); /* do_not_block */ + FALSE, + "blit dst"); /* do_not_block */ llvmpipe_flush_resource(pipe, src, subsrc.face, subsrc.level, 0, /* flush_flags */ TRUE, /* read_only */ TRUE, /* cpu_access */ - FALSE); /* do_not_block */ + FALSE, + "blit src"); /* do_not_block */ /* printf("surface copy from %u to %u: %u,%u to %u,%u %u x %u\n", diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index 25112c10a66..5832ea27445 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -67,6 +67,7 @@ resource_is_texture(const struct pipe_resource *resource) return FALSE; case PIPE_TEXTURE_1D: case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_3D: case PIPE_TEXTURE_CUBE: return TRUE; @@ -583,7 +584,8 @@ llvmpipe_get_transfer(struct pipe_context *pipe, 0, /* flush_flags */ read_only, 
TRUE, /* cpu_access */ - do_not_block)) { + do_not_block, + "transfer dest")) { /* * It would have blocked, but state tracker requested no to. */ diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h index f44979e562c..d9f35b4c4b9 100644 --- a/src/gallium/drivers/nouveau/nouveau_class.h +++ b/src/gallium/drivers/nouveau/nouveau_class.h @@ -6189,6 +6189,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV34TCL_FP_REG_CONTROL_UNK1_MASK 0xffff0000 #define NV34TCL_FP_REG_CONTROL_UNK0_SHIFT 0 #define NV34TCL_FP_REG_CONTROL_UNK0_MASK 0x0000ffff +#define NV34TCL_FLATSHADE_FIRST 0x00001454 +#define NV34TCL_EDGEFLAG_ENABLE 0x0000145c #define NV34TCL_VP_CLIP_PLANES_ENABLE 0x00001478 #define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 (1 << 1) #define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 (1 << 5) @@ -6222,10 +6224,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NV34TCL_VTXFMT__SIZE 0x00000010 #define NV34TCL_VTXFMT_TYPE_SHIFT 0 #define NV34TCL_VTXFMT_TYPE_MASK 0x0000000f -#define NV34TCL_VTXFMT_TYPE_FLOAT 0x00000002 -#define NV34TCL_VTXFMT_TYPE_HALF 0x00000003 -#define NV34TCL_VTXFMT_TYPE_UBYTE 0x00000004 -#define NV34TCL_VTXFMT_TYPE_USHORT 0x00000005 +#define NV34TCL_VTXFMT_TYPE_16_SNORM 0x00000001 +#define NV34TCL_VTXFMT_TYPE_32_FLOAT 0x00000002 +#define NV34TCL_VTXFMT_TYPE_16_FLOAT 0x00000003 +#define NV34TCL_VTXFMT_TYPE_8_UNORM 0x00000004 +#define NV34TCL_VTXFMT_TYPE_16_SSCALED 0x00000005 +#define NV34TCL_VTXFMT_TYPE_11_11_10_SNORM 0x00000006 +#define NV34TCL_VTXFMT_TYPE_8_USCALED 0x00000007 #define NV34TCL_VTXFMT_SIZE_SHIFT 4 #define NV34TCL_VTXFMT_SIZE_MASK 0x000000f0 #define NV34TCL_VTXFMT_STRIDE_SHIFT 8 @@ -6368,6 +6373,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#define NV34TCL_TX_FORMAT_FORMAT_R8G8B8_RECT 0x00001e00 #define NV34TCL_TX_FORMAT_FORMAT_A8L8_RECT 0x00002000 #define NV34TCL_TX_FORMAT_FORMAT_DSDT8 0x00002800 +#define NV34TCL_TX_FORMAT_FORMAT_Z24 0x2a00 +#define NV34TCL_TX_FORMAT_FORMAT_Z24_RECT 0x2b00 /* XXX: guess! */ +#define NV34TCL_TX_FORMAT_FORMAT_Z16 0x2c00 +#define NV34TCL_TX_FORMAT_FORMAT_Z16_RECT 0x2d00 /* XXX: guess! */ #define NV34TCL_TX_FORMAT_FORMAT_HILO16 0x00003300 #define NV34TCL_TX_FORMAT_FORMAT_HILO16_RECT 0x00003600 #define NV34TCL_TX_FORMAT_FORMAT_HILO8 0x00004400 diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index 513e5e02bc0..ebb21a6e5a3 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -258,6 +258,7 @@ nouveau_screen_fini(struct nouveau_screen *screen) { struct pipe_winsys *ws = screen->base.winsys; nouveau_channel_free(&screen->channel); - ws->destroy(ws); + if (ws) + ws->destroy(ws); } diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h deleted file mode 100644 index b165f7a611a..00000000000 --- a/src/gallium/drivers/nouveau/nouveau_util.h +++ /dev/null @@ -1,91 +0,0 @@ -#ifndef __NOUVEAU_UTIL_H__ -#define __NOUVEAU_UTIL_H__ - -/* Determine how many vertices can be pushed into the command stream. - * Where the remaining space isn't large enough to represent all verices, - * split the buffer at primitive boundaries. - * - * Returns a count of vertices that can be rendered, and an index to - * restart drawing at after a flush. 
- */ -static INLINE unsigned -nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp, - unsigned mode, unsigned start, unsigned count, - unsigned *restart) -{ - int max, adj = 0; - - max = remaining - overhead; - if (max < 0) - return 0; - - max *= vpp; - if (max >= count) - return count; - - switch (mode) { - case PIPE_PRIM_POINTS: - break; - case PIPE_PRIM_LINES: - max = max & 1; - break; - case PIPE_PRIM_TRIANGLES: - max = max - (max % 3); - break; - case PIPE_PRIM_QUADS: - max = max & ~3; - break; - case PIPE_PRIM_LINE_LOOP: - case PIPE_PRIM_LINE_STRIP: - if (max < 2) - max = 0; - adj = 1; - break; - case PIPE_PRIM_POLYGON: - case PIPE_PRIM_TRIANGLE_STRIP: - case PIPE_PRIM_TRIANGLE_FAN: - if (max < 3) - max = 0; - adj = 2; - break; - case PIPE_PRIM_QUAD_STRIP: - if (max < 4) - max = 0; - adj = 3; - break; - default: - assert(0); - } - - *restart = start + max - adj; - return max; -} - -/* Integer base-2 logarithm, rounded towards zero. */ -static INLINE unsigned log2i(unsigned i) -{ - unsigned r = 0; - - if (i & 0xffff0000) { - i >>= 16; - r += 16; - } - if (i & 0x0000ff00) { - i >>= 8; - r += 8; - } - if (i & 0x000000f0) { - i >>= 4; - r += 4; - } - if (i & 0x0000000c) { - i >>= 2; - r += 2; - } - if (i & 0x00000002) { - r += 1; - } - return r; -} - -#endif diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c index 12b5ad106ca..dd0e8fd41b1 100644 --- a/src/gallium/drivers/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -238,7 +238,8 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen, unsigned stride; /* Only supports 2D, non-mipmapped textures for the moment */ - if (template->target != PIPE_TEXTURE_2D || + if ((template->target != PIPE_TEXTURE_2D && + template->target != PIPE_TEXTURE_RECT) || template->last_level != 0 || template->depth0 != 1) return NULL; diff --git a/src/gallium/drivers/nv50/nv50_push.c b/src/gallium/drivers/nv50/nv50_push.c index 0091927a982..380f69406a2 
100644 --- a/src/gallium/drivers/nv50/nv50_push.c +++ b/src/gallium/drivers/nv50/nv50_push.c @@ -108,8 +108,9 @@ emit_vertex(struct push_context *ctx, unsigned n) int i; if (ctx->edgeflag_attr < 16) { - float *edgeflag = (uint8_t *)ctx->attr[ctx->edgeflag_attr].map + - ctx->attr[ctx->edgeflag_attr].stride * n; + float *edgeflag = (float *) + ((uint8_t *)ctx->attr[ctx->edgeflag_attr].map + + ctx->attr[ctx->edgeflag_attr].stride * n); if (*edgeflag != ctx->edgeflag) { BEGIN_RING(chan, tesla, NV50TCL_EDGEFLAG_ENABLE, 1); diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c index 55358183703..658324ec5be 100644 --- a/src/gallium/drivers/nv50/nv50_tex.c +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -83,6 +83,9 @@ nv50_tex_construct(struct nv50_sampler_view *view) case PIPE_TEXTURE_2D: tic[2] |= NV50TIC_0_2_TARGET_2D; break; + case PIPE_TEXTURE_RECT: + tic[2] |= NV50TIC_0_2_TARGET_RECT; + break; case PIPE_TEXTURE_3D: tic[2] |= NV50TIC_0_2_TARGET_3D; break; diff --git a/src/gallium/drivers/nvfx/Makefile b/src/gallium/drivers/nvfx/Makefile index c1d57ca3969..6cbbad699eb 100644 --- a/src/gallium/drivers/nvfx/Makefile +++ b/src/gallium/drivers/nvfx/Makefile @@ -4,7 +4,7 @@ include $(TOP)/configs/current LIBNAME = nvfx C_SOURCES = \ - nv04_surface_2d.c \ + nv04_2d.c \ nvfx_buffer.c \ nvfx_context.c \ nvfx_clear.c \ @@ -14,6 +14,7 @@ C_SOURCES = \ nv30_fragtex.c \ nv40_fragtex.c \ nvfx_miptree.c \ + nvfx_push.c \ nvfx_query.c \ nvfx_resource.c \ nvfx_screen.c \ diff --git a/src/gallium/drivers/nvfx/SConscript b/src/gallium/drivers/nvfx/SConscript index 02d931b10e8..80e3ef2257f 100644 --- a/src/gallium/drivers/nvfx/SConscript +++ b/src/gallium/drivers/nvfx/SConscript @@ -9,7 +9,7 @@ env.PrependUnique(delete_existing=1, CPPPATH = [ nvfx = env.ConvenienceLibrary( target = 'nvfx', source = [ - 'nv04_surface_2d.c', + 'nv04_2d.c', 'nvfx_buffer.c', 'nvfx_context.c', 'nvfx_clear.c', @@ -19,6 +19,7 @@ nvfx = env.ConvenienceLibrary( 'nv30_fragtex.c', 
'nv40_fragtex.c', 'nvfx_miptree.c', + 'nvfx_push.c', 'nvfx_query.c', 'nvfx_resource.c', 'nvfx_screen.c', diff --git a/src/gallium/drivers/nvfx/nv04_2d.c b/src/gallium/drivers/nvfx/nv04_2d.c new file mode 100644 index 00000000000..c05312219b6 --- /dev/null +++ b/src/gallium/drivers/nvfx/nv04_2d.c @@ -0,0 +1,1341 @@ +/************************************************************************** + * + * Copyright 2009 Ben Skeggs + * Copyright 2009 Younes Manton + * Copyright 2010 Luca Barbieri + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS + * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + **************************************************************************/ + +/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <nouveau/nouveau_class.h> +#include <nouveau/nouveau_device.h> +#include <nouveau/nouveau_pushbuf.h> +#include <nouveau/nouveau_channel.h> +#include <nouveau/nouveau_bo.h> +#include <nouveau/nouveau_notifier.h> +#include <nouveau/nouveau_grobj.h> +#include "nv04_2d.h" + +/* avoid depending on Mesa/Gallium */ +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) !!(x) +#define unlikely(x) !!(x) +#endif + +#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) +#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) ) + +struct nv04_2d_context +{ + struct nouveau_notifier *ntfy; + struct nouveau_grobj *surf2d; + struct nouveau_grobj *swzsurf; + struct nouveau_grobj *m2mf; + struct nouveau_grobj *rect; + struct nouveau_grobj *sifm; + struct nouveau_grobj *blit; +}; + +static inline int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + +static inline int +util_is_pot(unsigned x) +{ + return (x & (x - 1)) == 0; +} + +/* Integer base-2 logarithm, rounded towards zero. 
*/ +static inline unsigned log2i(unsigned i) +{ + unsigned r = 0; + + if (i & 0xffff0000) { + i >>= 16; + r += 16; + } + if (i & 0x0000ff00) { + i >>= 8; + r += 8; + } + if (i & 0x000000f0) { + i >>= 4; + r += 4; + } + if (i & 0x0000000c) { + i >>= 2; + r += 2; + } + if (i & 0x00000002) { + r += 1; + } + return r; +} + +//#define NV04_REGION_DEBUG + +// Yes, we really want to inline everything, since all the functions are used only once +#if defined(__GNUC__) && defined(DEBUG) +#define inline __attribute__((always_inline)) inline +#endif + +static inline unsigned +nv04_swizzle_bits_square(unsigned x, unsigned y) +{ + unsigned u = (x & 0x001) << 0 | + (x & 0x002) << 1 | + (x & 0x004) << 2 | + (x & 0x008) << 3 | + (x & 0x010) << 4 | + (x & 0x020) << 5 | + (x & 0x040) << 6 | + (x & 0x080) << 7 | + (x & 0x100) << 8 | + (x & 0x200) << 9 | + (x & 0x400) << 10 | + (x & 0x800) << 11; + + unsigned v = (y & 0x001) << 1 | + (y & 0x002) << 2 | + (y & 0x004) << 3 | + (y & 0x008) << 4 | + (y & 0x010) << 5 | + (y & 0x020) << 6 | + (y & 0x040) << 7 | + (y & 0x080) << 8 | + (y & 0x100) << 9 | + (y & 0x200) << 10 | + (y & 0x400) << 11 | + (y & 0x800) << 12; + return v | u; +} + +/* rectangular swizzled textures are linear concatenations of swizzled square tiles */ +static inline unsigned +nv04_swizzle_bits_2d(unsigned x, unsigned y, unsigned w, unsigned h) +{ + if(h <= 1) + return x; + else + { + unsigned s = MIN2(w, h); + unsigned m = s - 1; + return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m); + } +} + +// general 3D texture case +static inline unsigned +nv04_swizzle_bits(unsigned x, unsigned y, unsigned z, unsigned w, unsigned h, unsigned d) +{ + if(d <= 1) + return nv04_swizzle_bits_2d(x, y, w, h); + else + { + // TODO: autogenerate code for all possible texture sizes (13 * 13 * 13 with dims <= 4096) and do a single indirect call + unsigned v = 0; + w >>= 1; + h >>= 1; + d >>= 1; + for(int i = 0;;) + { + int oldi = i; + if(likely(w)) + { + v |= (x & 1) << i; + 
x >>= 1; + w >>= 1; + ++i; + } + + if(likely(h)) + { + v |= (y & 1) << i; + y >>= 1; + h >>= 1; + ++i; + } + + if(likely(d)) + { + v |= (z & 1) << i; + z >>= 1; + d >>= 1; + ++i; + } + + if(i == oldi) + break; + } + return v; + } +} + +unsigned +nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h) +{ + if(rgn->pitch) + return rgn->pitch * rgn->y + (rgn->x << rgn->bpps); + else + return nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d) << rgn->bpps; +} + +unsigned +nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h) +{ + if(rgn->pitch) + return rgn->pitch * (rgn->y + h - 1) + ((rgn->x + w) << rgn->bpps); + else + return (nv04_swizzle_bits(rgn->x + w - 1, rgn->y + h - 1, rgn->z, rgn->w, rgn->h, rgn->d) + 1) << rgn->bpps; +} + +// *pitch = -1 -> use 3D swizzling for (x, y), *pitch = 0 -> use 2D swizzling, other *pitch -> use linear calculations +// returns 2 if pixel order is 3D-swizzled and 1 if subrect is 2D-swizzled +/* *pitch == -1 ret = 0 -> 3D swizzled subrect + * *pitch == 0 ret = 0 -> 2D swizzled subrect + * *pitch > 0 ret = 0 -> linear subrect + * *pitch > 0 ret = 1 -> linear subrect, but with swizzled 3D data inside + */ + +static inline void +nv04_region_print(struct nv04_region* rgn) +{ + fprintf(stderr, "<%i[%i]> ", rgn->bo->handle, rgn->offset); + if(rgn->pitch) + fprintf(stderr, "lin %i", rgn->pitch); + else + fprintf(stderr, "swz %ix%ix%i", rgn->w, rgn->h, rgn->d); + fprintf(stderr, " (%i, %i, %i)", rgn->x, rgn->y, rgn->z); +} + +static inline void +nv04_region_assert(struct nv04_region* rgn, unsigned w, unsigned h) +{ + unsigned end = rgn->offset + nv04_region_end(rgn, w, h); + + assert(rgn->offset <= (int)rgn->bo->size); + assert(end <= rgn->bo->size); + (void) end; + if(!rgn->pitch) { + assert(util_is_pot(rgn->w)); + assert(util_is_pot(rgn->h)); + } +} + +/* determine if region can be linearized or fake-linearized */ +static inline int +nv04_region_is_contiguous(struct nv04_region* rgn, int w, int h) +{ + 
int surf_min; + int rect_min; + + if(rgn->pitch) + return rgn->pitch == w << rgn->bpps; + + // redundant, but this is the fast path for the common case + if(w == rgn->w && h == rgn->h && rgn->d <= 1) + return 1; + + // must be POT + if((w & (w - 1)) || (h & (h - 1))) + return 0; + + // must be aligned + if((rgn->x & (w - 1)) || (rgn->y & (h - 1))) + return 0; + + if(rgn->d > 1) + return 0; + + surf_min = MIN2(rgn->w, rgn->h); + rect_min = MIN2(w, h); + + if((rect_min == surf_min) || (w == h) || (w == 2 * h)) + return 1; + + return 0; +} + +// double the pitch until it is larger than the alignment, or the height becomes odd or 1 +static inline void +nv04_region_contiguous_shape(struct nv04_region* rgn, int* w, int* h, int align) +{ + while(!(*h & 1) && (*w << rgn->bpps) < (1 << align)) + { + *w <<= 1; + *h >>= 1; + } + + while((*w << rgn->bpps) > 16384 && !(*w & 1)) + { + *w >>= 1; + *h <<= 1; + } + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tCONTIGUOUS %ix%i\n", *w, *h); +#endif +} + +static inline void +nv04_region_linearize_contiguous(struct nv04_region* rgn, unsigned w, unsigned h) +{ + int pos; + if(rgn->pitch) + { + rgn->offset += rgn->y * rgn->pitch + (rgn->x << rgn->bpps); + rgn->x = 0; + rgn->y = 0; + } + else + { + rgn->offset += (rgn->w * rgn->h * rgn->z) << rgn->bpps; + pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d); + rgn->x = pos & (w - 1); + rgn->y = pos / w; + } + rgn->pitch = w << rgn->bpps; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tLINEARIZE "); + nv04_region_print(rgn); + fprintf(stderr, "\n"); +#endif +} + + /* preserve the offset! 
*/ + /* + rgn->pitch = util_format_get_stride(rgn->format, w); + int pos = nv04_swizzle_bits(rgn->x, rgn->y, rgn->z, rgn->w, rgn->h, rgn->d); + rgn->x = pos & (w - 1); + rgn->y = pos & ~(w - 1); + */ + + /* + rgn->offset += + rgn->pitch = util_format_get_stride(rgn->format, w); + rgn->x = 0; + rgn->y = 0; + */ + +/* This code will get used for, and always succeed on: + * - 4x2 1bpp swizzled texture mipmap levels + * - linear regions created by linearization + * + * This code will get used for, and MAY work for: + * - misaligned texture blanket + * - linear surfaces created without wide_pitch (in this case, it will only work if we are lucky) + * + * The general case requires splitting the region in 2. + */ +static inline int +nv04_region_do_align_offset(struct nv04_region* rgn, unsigned w, unsigned h, int shift) +{ + if(rgn->pitch > 0) + { + int delta; + + assert(!(rgn->offset & ((1 << rgn->bpps) - 1))); // fatal! + delta = rgn->offset & ((1 << shift) - 1); + + if(h <= 1) + { + rgn->x += delta >> rgn->bpps; + rgn->offset -= delta; + rgn->pitch = align((rgn->x + w) << rgn->bpps, 1 << shift); + } + else + { + int newxo = (rgn->x << rgn->bpps) + delta; + int dy = newxo / rgn->pitch; + newxo -= dy * rgn->pitch; + if((newxo + (w << rgn->bpps)) > rgn->pitch) + { + // TODO: split the region into two rectangles (!) 
if *really* necessary, unless the hardware actually supports "wrapping" rectangles + // this does not happen if the surface is pitch-aligned, which it should always be + assert(0); + return -1; + } + rgn->x = newxo >> rgn->bpps; + rgn->y += dy; + } + } + else + { + int size; + int min; + int v; + + // we don't care about the alignment of 3D surfaces since the 2D engine can't use them + if(rgn->d < 0) + return -1; + + min = MIN2(rgn->w, rgn->h); + size = min * min << rgn->bpps; + + // this is unfixable, and should not be happening + if(rgn->offset & (size - 1)) + return -1; + + v = (rgn->offset & ((1 << shift) - 1)) / size; + rgn->offset -= v * size; + + if(rgn->h == min) + { + unsigned w; + rgn->x += rgn->h * v; + w = rgn->w + rgn->h * v; + + while(rgn->w < w) + rgn->w += rgn->w; + } + else + { + unsigned h; + rgn->y += rgn->w * v; + h = rgn->h + rgn->w * v; + + while(rgn->h < h) + rgn->h += rgn->h; + } + } + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tALIGNED "); + nv04_region_print(rgn); + fprintf(stderr, "\n"); +#endif + return 0; +} + +// both pitch and shift +// will leave the region unchanged if it fails +static inline int +nv04_region_align(struct nv04_region* rgn, unsigned w, unsigned h, int shift) +{ + if(rgn->pitch & ((1 << shift) - 1)) + { + if(h == 1) + goto do_align; /* this will fix pitch too in this case */ + else + return -1; + } + + if(rgn->offset & ((1 << shift) - 1)) + { + do_align: + if(nv04_region_do_align_offset(rgn, w, h, shift)) + return -1; + } + return 0; +} + +/* this contains 22 different copy loops after preprocessing. 
unfortunately, it's necessary */ +void +nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h) +{ + uint8_t* mdst; + uint8_t* msrc; + int size; + + if(dst->bo != src->bo) + { + nouveau_bo_map(dst->bo, NOUVEAU_BO_WR); + nouveau_bo_map(src->bo, NOUVEAU_BO_RD); + } + else + nouveau_bo_map(dst->bo, NOUVEAU_BO_WR | NOUVEAU_BO_RD); + + mdst = (uint8_t*)dst->bo->map + dst->offset; + msrc = (uint8_t*)src->bo->map + src->offset; + + size = w << dst->bpps; + + nv04_region_assert(dst, w, h); + nv04_region_assert(src, w, h); + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tRGN_COPY_CPU [%i, %i: %i] ", w, h, dst->bpps); + for(int i = 0; i < 2; ++i) + { + nv04_region_print(i ? src : dst); + fprintf(stderr, i ? "\n" : " <- "); + } + +// for(int i = 0; i < 16; ++i) +// fprintf(stderr, "%02x ", msrc[i]); +// fprintf(stderr, "\n"); +#endif + + // TODO: support overlapping copies! + if(src->pitch && dst->pitch) + { + mdst += dst->y * dst->pitch + (dst->x << dst->bpps); + msrc += src->y * src->pitch + (src->x << src->bpps); + if(dst->bo != src->bo) + goto simple; + else if(mdst < msrc) + { + if(mdst + size <= msrc) + { +simple: + for(int iy = 0; iy < h; ++iy) + { + assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size); + assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size); + memcpy(mdst, msrc, size); + msrc += src->pitch; mdst += dst->pitch; + } + } + else + { + for(int iy = 0; iy < h; ++iy) + { + assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size); + assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size); + memmove(mdst, msrc, size); + msrc += src->pitch; mdst += dst->pitch; + } + } + } + else + { + /* copy backwards so we don't destroy data we have to read yet */ + if(msrc + size <= mdst) + { + for(int iy = h - 1; iy >= 0; --iy) + { + assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size); + assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size); + memcpy(mdst, msrc, size); + msrc += src->pitch; mdst += 
dst->pitch; + } + } + else + { + for(int iy = h - 1; iy >= 0; --iy) + { + assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size); + assert(msrc + size <= (uint8_t*)src->bo->map + src->bo->size); + memmove(mdst, msrc, size); + msrc += src->pitch; mdst += dst->pitch; + } + } + } + } + else + { + int* dswx = NULL; + int* dswy = NULL; + int* sswx = NULL; + int* sswy = NULL; + int dir; + + if(!dst->pitch) + { + dswx = alloca(w * sizeof(int)); + for(int ix = 0; ix < w; ++ix) // we are adding, so z cannot be contributed by both + dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, 0, dst->w, dst->h, dst->d); + dswy = alloca(h * sizeof(int)); + for(int iy = 0; iy < h; ++iy) + dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d); + } + + if(!src->pitch) + { + sswx = alloca(w * sizeof(int)); + for(int ix = 0; ix < w; ++ix) + sswx[ix] = nv04_swizzle_bits(src->x + ix, 0, 0, src->w, src->h, src->d); + sswy = alloca(h * sizeof(int)); + for(int iy = 0; iy < h; ++iy) + sswy[iy] = nv04_swizzle_bits(0, src->y + iy, src->z, src->w, src->h, src->d); + } + + dir = 1; + /* do backwards copies for overlapping swizzled surfaces */ + if(dst->pitch == src->pitch && dst->offset == src->offset) + { + if(dst->y > src->y || (dst->y == src->y && dst->x > src->x)) + dir = -1; + } + +#define SWIZZLED_COPY_LOOPS + if(dir == 1) + { + int dir = 1; +#define LOOP_Y for(int iy = 0; iy < h; ++iy) +#define LOOP_X for(int ix = 0; ix < w; ++ix) +#include "nv04_2d_loops.h" +#undef LOOP_X +#undef LOOP_Y + } + else + { + int dir = -1; +#define LOOP_Y for(int iy = h - 1; iy >= 0; --iy) +#define LOOP_X for(int ix = w - 1; ix >= 0; --ix) +#include "nv04_2d_loops.h" +#undef LOOP_X +#undef LOOP_Y + } +#undef SWIZZLED_COPY_LOOP + } + + if(src->bo != dst->bo) + nouveau_bo_unmap(src->bo); + nouveau_bo_unmap(dst->bo); +} + +/* TODO: if the destination is swizzled, we are doing random writes, which causes write combining to fail + * the alternative is to read, modify and copy back, which may or 
may not be faster + * loading 3D textures is a common case that hits this and could probably benefit from the temporary + */ +void +nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value) +{ + uint8_t* mdst = (nouveau_bo_map(dst->bo, NOUVEAU_BO_WR), (uint8_t*)dst->bo->map + dst->offset); + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tRGN_FILL_CPU "); + nv04_region_print(dst); + fprintf(stderr, "\n"); +#endif + + nv04_region_assert(dst, w, h); + + if(dst->pitch) + { + unsigned size = w << dst->bpps; + +#define FILL(T) do { \ + for(int iy = 0; iy < h; ++iy) \ + { \ + assert((char*)((T*)mdst + w) <= (char*)dst->bo->map + dst->bo->size); \ + for(int ix = 0; ix < w; ++ix) \ + ((T*)mdst)[ix] = (T)value; \ + mdst += dst->pitch; \ + } \ + } while(0) + + mdst += dst->y * dst->pitch + (dst->x << dst->bpps); + + if(dst->bpps == 0) + { +ms: + assert(mdst + size * h <= (uint8_t*)dst->bo->map + dst->bo->size); + if(size == dst->pitch) + memset(mdst, (uint8_t)value, size * h); + else + { + for(int iy = 0; iy < h; ++iy) + { + assert(mdst + size <= (uint8_t*)dst->bo->map + dst->bo->size); + memset(mdst, (uint8_t)value, size); + mdst += dst->pitch; + } + } + } + else if(dst->bpps == 1) + { + if(!((uint8_t)value ^ (uint8_t)(value >> 8))) + goto ms; + + FILL(uint16_t); + } + else if(dst->bpps == 2) + { + if(value == (uint8_t)value * 0x1010101) + goto ms; + FILL(uint32_t); + } + else + assert(0); +#undef FILL + } + else + { + int* dswx; + int* dswy; + + dswx = alloca(w * sizeof(int)); + for(int ix = 0; ix < w; ++ix) + dswx[ix] = nv04_swizzle_bits(dst->x + ix, 0, dst->z, dst->w, dst->h, dst->d); + dswy = alloca(h * sizeof(int)); + for(int iy = 0; iy < h; ++iy) + dswy[iy] = nv04_swizzle_bits(0, dst->y + iy, dst->z, dst->w, dst->h, dst->d); + +#define FILL(T) do { \ + T tvalue = (T)value; \ + for(int iy = 0; iy < h; ++iy) \ + { \ + T* pdst = (T*)mdst + dswy[iy]; \ + for(int ix = 0; ix < w; ++ix) \ + { \ + assert((uint8_t*)&pdst[dswx[ix] + 1] <= (uint8_t*)dst->bo->map 
+ dst->bo->size); \ + pdst[dswx[ix]] = tvalue; \ + } \ + } \ + } while(0) + + if(dst->bpps == 0) + FILL(uint8_t); + else if(dst->bpps == 1) + FILL(uint16_t); + else if(dst->bpps == 2) + FILL(uint32_t); + else + assert(0 && "unhandled bpp"); +#undef FILL + } + + nouveau_bo_unmap(dst->bo); +} + +static void +nv04_region_copy_swizzle(struct nv04_2d_context *ctx, + struct nv04_region* dst, + struct nv04_region* src, + int w, int h, int cs2d_format, int sifm_format) +{ + struct nouveau_channel *chan = ctx->swzsurf->channel; + struct nouveau_grobj *swzsurf = ctx->swzsurf; + struct nouveau_grobj *sifm = ctx->sifm; + /* Max width & height may not be the same on all HW, but must be POT */ + unsigned max_shift = 10; + unsigned cw = 1 << max_shift; + unsigned ch = 1 << max_shift; + unsigned sx = dst->x >> max_shift; + unsigned sy = dst->y >> max_shift; + unsigned ex = (dst->x + w - 1) >> max_shift; + unsigned ey = (dst->y + h - 1) >> max_shift; + unsigned chunks = (ex - sx + 1) * (ey - sy + 1); + unsigned chunk_size; + if(dst->w < cw) + cw = dst->w; + if(dst->h < ch) + ch = dst->h; + chunk_size = cw * ch << dst->bpps; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tRGN_COPY_SWIZZLE [%i, %i: %i] ", w, h, dst->bpps); + for(int i = 0; i < 2; ++i) + { + nv04_region_print(i ? src : dst); + fprintf(stderr, i ? 
"\n" : " <- "); + } +#endif + + nv04_region_assert(dst, w, h); + nv04_region_assert(src, w, h); + + MARK_RING (chan, 8 + chunks * 17, 2 + chunks * 2); + + BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1); + OUT_RELOCo(chan, dst->bo, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1); + OUT_RING (chan, cs2d_format | + log2i(cw) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT | + log2i(ch) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT); + + BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1); + OUT_RELOCo(chan, src->bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1); + OUT_RING (chan, swzsurf->handle); + + assert(!(dst->offset & 63)); + + for (int cy = sy; cy <= ey; ++cy) { + int ry = MAX2(0, (int)(dst->y - ch * cy)); + int rh = MIN2((int)ch, (int)(dst->y - ch * cy + h)) - ry; + for (int cx = sx; cx <= ex; ++cx) { + int rx = MAX2(0, (int)(dst->x - cw * cx)); + int rw = MIN2((int)cw, (int)(dst->x - cw * cx + w)) - rx; + unsigned dst_offset; + unsigned src_offset; + + BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1); + + dst_offset = dst->offset + (nv04_swizzle_bits_2d(cx * cw, cy * ch, dst->w, dst->h) << dst->bpps); + assert(dst_offset <= dst->bo->size); + assert(dst_offset + chunk_size <= dst->bo->size); + OUT_RELOCl(chan, dst->bo, dst_offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9); + OUT_RING (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE); + OUT_RING (chan, sifm_format); + OUT_RING (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY); + OUT_RING (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT)); + OUT_RING (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | rw); + OUT_RING (chan, rx | (ry << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT)); + OUT_RING (chan, rh << 
NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | rw); + OUT_RING (chan, 1 << 20); + OUT_RING (chan, 1 << 20); + + BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4); + OUT_RING (chan, rh << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | align(rw, 8)); + OUT_RING (chan, src->pitch | + NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER | + NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE); + src_offset = src->offset + (cy * ch + ry + src->y - dst->y) * src->pitch + ((cx * cw + rx + src->x - dst->x) << src->bpps); + assert(src_offset <= src->bo->size); + assert(src_offset + (src->pitch * (rh - 1)) + (rw << src->bpps) <= src->bo->size); + OUT_RELOCl(chan, src->bo, src_offset, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RING (chan, 0); + } + } +} + +static inline void +nv04_copy_m2mf_begin(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, struct nouveau_bo* srcbo, unsigned commands) +{ + struct nouveau_channel *chan = ctx->m2mf->channel; + struct nouveau_grobj *m2mf = ctx->m2mf; + MARK_RING (chan, 3 + commands * 9, 2 + commands * 2); + BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2); + OUT_RELOCo(chan, srcbo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCo(chan, dstbo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); +} + +static inline void +nv04_copy_m2mf_body(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int* pdstoff, unsigned dstpitch, struct nouveau_bo* srcbo, int* psrcoff, unsigned srcpitch, unsigned size, unsigned lines) +{ + struct nouveau_channel *chan = ctx->m2mf->channel; + struct nouveau_grobj *m2mf = ctx->m2mf; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\t\t\tCOPY_M2MF_BODY [%i, %i] <%i[%u]> lin %u <- <%i[%u]> lin %u\n", size, lines, dstbo->handle, *pdstoff, dstpitch, srcbo->handle, *psrcoff, srcpitch); +#endif + + BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); + OUT_RELOCl(chan, srcbo, *psrcoff, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | 
NOUVEAU_BO_RD); + OUT_RELOCl(chan, dstbo, *pdstoff, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR); + OUT_RING (chan, srcpitch); + OUT_RING (chan, dstpitch); + OUT_RING (chan, size); + OUT_RING (chan, lines); + OUT_RING (chan, 0x0101); + OUT_RING (chan, 0); + + *psrcoff += srcpitch * lines; + *pdstoff += dstpitch * lines; +} + +static void +nv04_copy_m2mf(struct nv04_2d_context *ctx, + struct nouveau_bo* dstbo, int dstoff, unsigned dstpitch, + struct nouveau_bo* srcbo, int srcoff, unsigned srcpitch, + unsigned size, unsigned h) +{ + unsigned max_pitch = 32767; + unsigned max_lines = 2047; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\t\tCOPY_M2MF [%i, %i] <%i[%i]> lin %u <- <%i[%i]> lin %u\n", size, h, dstbo->handle, dstoff, dstpitch, srcbo->handle, srcoff, srcpitch); +#endif + + if(srcpitch <= max_pitch && dstpitch <= max_pitch) + { + unsigned full_pages = h / max_lines; + unsigned leftover_lines = h - full_pages * max_lines; + + nv04_copy_m2mf_begin(ctx, dstbo, srcbo, full_pages + !!leftover_lines); + + for(unsigned i = 0; i < full_pages; ++i) + nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, max_lines); + + if(leftover_lines) + nv04_copy_m2mf_body(ctx, dstbo, &dstoff, dstpitch, srcbo, &srcoff, srcpitch, size, leftover_lines); + } + else + { + unsigned lines = size / max_pitch; + unsigned leftover = size - lines * max_pitch; + unsigned full_pages = lines / max_lines; + unsigned leftover_lines = lines - full_pages * max_lines; + unsigned srcgap = srcpitch - size; + unsigned dstgap = dstpitch - size; + + nv04_copy_m2mf_begin(ctx, dstbo, srcbo, h * (full_pages + !!leftover_lines + !!leftover)); + + for(unsigned i = 0; i < h; ++i) + { + for(unsigned j = 0; j < full_pages; ++j) + nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, max_lines); + + if(leftover_lines) + nv04_copy_m2mf_body(ctx, dstbo, &dstoff, max_pitch, srcbo, &srcoff, max_pitch, max_pitch, leftover_lines); + + if(leftover) 
+ nv04_copy_m2mf_body(ctx, dstbo, &dstoff, leftover, srcbo, &srcoff, leftover, leftover, 1); + + srcoff += srcgap; + dstoff += dstgap; + } + } +} + +void +nv04_memcpy(struct nv04_2d_context *ctx, struct nouveau_bo* dstbo, int dstoff, struct nouveau_bo* srcbo, int srcoff, unsigned size) +{ +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tMEMCPY [%i] <%i[%i]> <- <%i[%i]>\n", size, dstbo->handle, dstoff, srcbo->handle, srcoff); +#endif + + nv04_copy_m2mf(ctx, dstbo, dstoff, size, srcbo, srcoff, size, size, 1); +} + +static void +nv04_region_copy_m2mf(struct nv04_2d_context *ctx, struct nv04_region *dst, struct nv04_region *src, int w, int h) +{ +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tRGN_COPY_M2MF [%i, %i: %i] ", w, h, dst->bpps); + for(int i = 0; i < 2; ++i) + { + nv04_region_print(i ? src : dst); + fprintf(stderr, i ? "\n" : " <- "); + } +#endif + + nv04_region_assert(dst, w, h); + nv04_region_assert(src, w, h); + assert(src->pitch); + assert(dst->pitch); + + nv04_copy_m2mf(ctx, + dst->bo, dst->offset + dst->y * dst->pitch + (dst->x << dst->bpps), dst->pitch, + src->bo, src->offset + src->y * src->pitch + (src->x << src->bpps), src->pitch, + w << src->bpps, h); +} + +static inline void +nv04_region_copy_blit(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, int w, int h, int format) +{ + struct nouveau_channel *chan = ctx->surf2d->channel; + struct nouveau_grobj *surf2d = ctx->surf2d; + struct nouveau_grobj *blit = ctx->blit; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tRGN_COPY_BLIT [%i, %i: %i] ", w, h, dst->bpps); + for(int i = 0; i < 2; ++i) + { + nv04_region_print(i ? src : dst); + fprintf(stderr, i ? 
"\n" : " <- "); + } +#endif + + assert(!(src->pitch & 63) && src->pitch); + assert(!(dst->pitch & 63) && dst->pitch); + nv04_region_assert(dst, w, h); + nv04_region_assert(src, w, h); + + MARK_RING (chan, 12, 4); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RELOCo(chan, src->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); + OUT_RING (chan, format); + OUT_RING (chan, (dst->pitch << 16) | src->pitch); + OUT_RELOCl(chan, src->bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, blit, 0x0300, 3); + OUT_RING (chan, (src->y << 16) | src->x); + OUT_RING (chan, (dst->y << 16) | dst->x); + OUT_RING (chan, ( h << 16) | w); +} + +/* THEOREM: a non-linearizable swizzled destination is always 64 byte aligned, except for 4x2 mipmap levels of swizzled 1bpp surfaces + * HYPOTESIS: + * 1. The first mipmap level is 64-byte-aligned + * PROOF: + * 1. Thus, all mipmaps level with a parent which is 64-byte or more in size are. + * 2. At 1bpp, the smallest levels with a <= 32-byte parent are either Nx1 or 1xN or size <=8, thus 4x2, 2x2 or 2x4 + * 3. Nx1, 1xN, 2x4, 2x2 have all subrects linearizable. 4x2 does not. + * 4. At 2/4bpp or more, the smallest levels with a 32-byte parent are 1xN, Nx1 or 2x2 + * + * However, nv04_region_align handles that. + */ + +// 0 -> done, 1 -> do with 3D engine or CPU, -1 -> do with CPU +// dst and src may be modified, and the possibly modified version should be passed to nv04_region_cpu if necessary +int +nv04_region_copy_2d(struct nv04_2d_context *ctx, struct nv04_region* dst, struct nv04_region* src, + int w, int h, int cs2d_format, int sifm_format, int dst_to_gpu, int src_on_gpu) +{ + assert(src->bpps == dst->bpps); + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "RGN_COPY%s [%i, %i: %i] ", (cs2d_format >= 0) ? 
"_2D" : "_NO2D", w, h, dst->bpps); + for(int i = 0; i < 2; ++i) + { + int gpu = i ? src_on_gpu : dst_to_gpu; + nv04_region_print(i ? src : dst); + fprintf(stderr, " %s", gpu ? "gpu" : "cpu"); + fprintf(stderr, i ? "\n" : " <- "); + } +#endif + + // if they are contiguous and either both swizzled or both linear, reshape + if(!dst->pitch == !src->pitch + && nv04_region_is_contiguous(dst, w, h) + && nv04_region_is_contiguous(src, w, h)) + { + nv04_region_contiguous_shape(dst, &w, &h, 6); + nv04_region_linearize_contiguous(dst, w, h); + nv04_region_linearize_contiguous(src, w, h); + } + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tOPT "); + for(int i = 0; i < 2; ++i) + { + nv04_region_print(i ? src : dst); + fprintf(stderr, i ? "\n" : " <- "); + } +#endif + + /* if the destination is not for GPU _and_ source is on CPU, use CPU */ + /* if the destination is not for GPU _or_ source is on CPU, use CPU only if we think it's faster than the GPU */ + /* TODO: benchmark to find out in which cases exactly we should prefer the CPU */ + if((!dst_to_gpu && !src_on_gpu) + || (!dst->pitch && dst->d > 1) + /* 3D swizzled destination are unwritable by the GPU, and 2D swizzled ones are readable only by the 3D engine */ + ) + return -1; + /* there is no known way to read 2D/3D-swizzled surfaces with the 2D engine + * ask the caller to use the 3D engine + * If a format cannot be sampled from the 3D engine there is no point in making it swizzled, so we must not do so + */ + else if(!src->pitch) + { +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tCOPY_ENG3D\n"); +#endif + return 1; + } + /* Setup transfer to swizzle the texture to vram if needed */ + else + { + if (!dst->pitch) + { + if(cs2d_format < 0 || sifm_format < 0 || !dst_to_gpu) + { +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tCOPY_ENG3D\n"); +#endif + return 1; + } + else + { + assert(!nv04_region_align(dst, w, h, 6)); + + nv04_region_copy_swizzle(ctx, dst, src, w, h, cs2d_format, sifm_format); + return 0; + } + } + else + { + 
/* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback + * to NV_MEMORY_TO_MEMORY_FORMAT in this case. + * TODO: is this also true for the source? possibly not + */ + + if ((cs2d_format < 0) + || !dst_to_gpu + || nv04_region_align(src, w, h, 6) + || nv04_region_align(dst, w, h, 6) + ) + nv04_region_copy_m2mf(ctx, dst, src, w, h); + else + nv04_region_copy_blit(ctx, dst, src, w, h, cs2d_format); + + return 0; + } + } +} + +static inline void +nv04_region_fill_gdirect(struct nv04_2d_context *ctx, struct nv04_region* dst, int w, int h, unsigned value) +{ + struct nouveau_channel *chan = ctx->surf2d->channel; + struct nouveau_grobj *surf2d = ctx->surf2d; + struct nouveau_grobj *rect = ctx->rect; + int cs2d_format, gdirect_format; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tFILL_GDIRECT\n"); +#endif + + assert(!(dst->pitch & 63) && dst->pitch); + nv04_region_assert(dst, w, h); + + if(dst->bpps == 0) + { + gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; + cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y8; + } + else if(dst->bpps == 1) + { + gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5; + cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y16; + } + else if(dst->bpps == 2) + { + gdirect_format = NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; + cs2d_format = NV04_CONTEXT_SURFACES_2D_FORMAT_Y32; + } + else + { + assert(0); + gdirect_format = 0; + cs2d_format = 0; + } + + MARK_RING (chan, 15, 4); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCo(chan, dst->bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); + OUT_RING (chan, cs2d_format); + OUT_RING (chan, (dst->pitch << 16) | dst->pitch); + OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, dst->bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, rect, 
NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1); + OUT_RING (chan, gdirect_format); + BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1); + OUT_RING (chan, value); + BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2); + OUT_RING (chan, (dst->x << 16) | dst->y); + OUT_RING (chan, ( w << 16) | h); +} + +int +nv04_region_fill_2d(struct nv04_2d_context *ctx, struct nv04_region *dst, + int w, int h, unsigned value) +{ + if(!w || !h) + return 0; + +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "FILL [%i, %i: %i] ", w, h, dst->bpps); + nv04_region_print(dst); + fprintf(stderr, " <- 0x%x\n", value); +#endif + + if(nv04_region_is_contiguous(dst, w, h)) + { + nv04_region_contiguous_shape(dst, &w, &h, 6); + nv04_region_linearize_contiguous(dst, w, h); + } + + // TODO: maybe do intermediate copies for some cases instead of using the 3D engine/CPU + /* GdiRect doesn't work together with swzsurf, so the 3D engine, or an intermediate copy, is the only option here */ + if(!dst->pitch) + { +#ifdef NV04_REGION_DEBUG + fprintf(stderr, "\tFILL_ENG3D\n"); +#endif + return 1; + } + else if(!nv04_region_align(dst, w, h, 6)) + { + nv04_region_fill_gdirect(ctx, dst, w, h, value); + return 0; + } + else + return -1; +} + + +void +nv04_2d_context_takedown(struct nv04_2d_context *ctx) +{ + nouveau_notifier_free(&ctx->ntfy); + nouveau_grobj_free(&ctx->m2mf); + nouveau_grobj_free(&ctx->surf2d); + nouveau_grobj_free(&ctx->swzsurf); + nouveau_grobj_free(&ctx->rect); + nouveau_grobj_free(&ctx->blit); + nouveau_grobj_free(&ctx->sifm); + + free(ctx); +} + +struct nv04_2d_context * +nv04_2d_context_init(struct nouveau_channel* chan) +{ + struct nv04_2d_context *ctx = calloc(1, sizeof(struct nv04_2d_context)); + unsigned handle = 0x88000000, class; + int ret; + + if (!ctx) + return NULL; + + ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + ret = nouveau_grobj_alloc(chan, handle++, 0x0039, 
&ctx->m2mf); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1); + OUT_RING (chan, ctx->ntfy->handle); + + if (chan->device->chipset < 0x10) + class = NV04_CONTEXT_SURFACES_2D; + else + class = NV10_CONTEXT_SURFACES_2D; + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->surf2d, + NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RING (chan, chan->vram->handle); + OUT_RING (chan, chan->vram->handle); + + if (chan->device->chipset < 0x10) + class = NV04_IMAGE_BLIT; + else + class = NV12_IMAGE_BLIT; + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1); + OUT_RING (chan, ctx->ntfy->handle); + BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1); + OUT_RING (chan, ctx->surf2d->handle); + BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1); + OUT_RING (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY); + + ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT, + &ctx->rect); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1); + OUT_RING (chan, ctx->ntfy->handle); + BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1); + OUT_RING (chan, ctx->surf2d->handle); + BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1); + OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY); + BEGIN_RING(chan, ctx->rect, + NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1); + OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE); + + switch (chan->device->chipset & 0xf0) { + case 0x00: + case 0x10: + class = NV04_SWIZZLED_SURFACE; + break; + case 0x20: + class = NV20_SWIZZLED_SURFACE; + break; + case 0x30: + class = 
NV30_SWIZZLED_SURFACE; + break; + case 0x40: + case 0x60: + class = NV40_SWIZZLED_SURFACE; + break; + default: + /* Famous last words: this really can't happen.. */ + assert(0); + break; + } + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + /* all the Gallium MARK_RING calculations assume no autobinding, so do that now */ + if(ctx->swzsurf->bound == NOUVEAU_GROBJ_UNBOUND) + nouveau_grobj_autobind(ctx->swzsurf); + + switch (chan->device->chipset & 0xf0) { + case 0x10: + case 0x20: + class = NV10_SCALED_IMAGE_FROM_MEMORY; + break; + case 0x30: + class = NV30_SCALED_IMAGE_FROM_MEMORY; + break; + case 0x40: + case 0x60: + class = NV40_SCALED_IMAGE_FROM_MEMORY; + break; + default: + class = NV04_SCALED_IMAGE_FROM_MEMORY; + break; + } + + ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm); + if (ret) { + nv04_2d_context_takedown(ctx); + return NULL; + } + + /* all the Gallium MARK_RING calculations assume no autobinding, so do that now */ + if(ctx->sifm->bound == NOUVEAU_GROBJ_UNBOUND) + nouveau_grobj_autobind(ctx->sifm); + + return ctx; +} diff --git a/src/gallium/drivers/nvfx/nv04_2d.h b/src/gallium/drivers/nvfx/nv04_2d.h new file mode 100644 index 00000000000..e638b8c8740 --- /dev/null +++ b/src/gallium/drivers/nvfx/nv04_2d.h @@ -0,0 +1,87 @@ +/************************************************************************** + * + * Copyright 2009 Ben Skeggs + * Copyright 2009 Younes Manton + * Copyright 2010 Luca Barbieri + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS + * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + **************************************************************************/ + +/* this code has no Mesa or Gallium dependency and can be reused in the classic Mesa driver or DDX */ + +#ifndef __NV04_2D_H__ +#define __NV04_2D_H__ + +struct nv04_2d_context; +struct nouveau_channel; +struct nouveau_bo; + +// NOTE: all functions taking this as a parameter will CLOBBER it (except for ->bo) +struct nv04_region { + struct nouveau_bo* bo; + int offset; + unsigned pitch; // 0 -> swizzled + unsigned bpps; // bpp shift (0, 1, 2; 3, 4 for fp/compressed) + unsigned x, y, z; + unsigned w, h, d; +}; + +void +nv04_memcpy(struct nv04_2d_context *ctx, + struct nouveau_bo* dstbo, int dstoff, + struct nouveau_bo* srcbo, int srcoff, + unsigned size); + +unsigned +nv04_region_begin(struct nv04_region* rgn, unsigned w, unsigned h); + +unsigned +nv04_region_end(struct nv04_region* rgn, unsigned w, unsigned h); + +void +nv04_2d_context_takedown(struct nv04_2d_context *pctx); + +struct nv04_2d_context * +nv04_2d_context_init(struct nouveau_channel* chan); + +void +nv04_region_copy_cpu(struct nv04_region* dst, struct nv04_region* src, int w, int h); + +void +nv04_region_fill_cpu(struct nv04_region* dst, int w, int h, unsigned value); + +int +nv04_region_copy_2d(struct nv04_2d_context *ctx, + struct nv04_region* dst, struct nv04_region* src, + int w, int h, + int cs2d_format, int sifm_format, + int dst_to_gpu, int src_on_gpu); + +int +nv04_region_fill_2d(struct nv04_2d_context *ctx, + struct nv04_region *dst, + int w, int h, + unsigned value); + +#endif diff --git a/src/gallium/drivers/nvfx/nv04_2d_loops.h b/src/gallium/drivers/nvfx/nv04_2d_loops.h new file mode 100644 index 00000000000..3a6787c0717 --- /dev/null +++ b/src/gallium/drivers/nvfx/nv04_2d_loops.h @@ -0,0 +1,70 @@ +#ifndef T +{ + if(dst->bpps == 0) +#define T uint8_t +#include "nv04_2d_loops.h" +#undef T + else if(dst->bpps == 1) +#define T uint16_t +#include "nv04_2d_loops.h" +#undef T + else if(dst->bpps == 2) +#define T 
uint32_t +#include "nv04_2d_loops.h" +#undef T + else + assert(0); +} +#else +#ifdef SWIZZLED_COPY_LOOPS +{ + if(!dst->pitch) + { + if(!src->pitch) + { + LOOP_Y + { + T* pdst = (T*)mdst + dswy[iy]; + T* psrc = (T*)msrc + sswy[iy]; + LOOP_X + { + assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size)); + assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size)); + pdst[dswx[ix]] = psrc[sswx[ix]]; + } + } + } + else + { + T* psrc = (T*)(msrc + ((dir > 0) ? src->y : (src->y + h - 1)) * src->pitch) + src->x; + LOOP_Y + { + T* pdst = (T*)mdst + dswy[iy]; + LOOP_X + { + assert((char*)&psrc[ix + 1] <= ((char*)src->bo->map + src->bo->size)); + assert((char*)&pdst[dswx[ix] + 1] <= ((char*)dst->bo->map + dst->bo->size)); + pdst[dswx[ix]] = psrc[ix]; + } + psrc = (T*)((char*)psrc + dir * src->pitch); + } + } + } + else + { + T* pdst = (T*)(mdst + ((dir > 0) ? dst->y : (dst->y + h - 1)) * dst->pitch) + dst->x; + LOOP_Y + { + T* psrc = (T*)msrc + sswy[iy]; + LOOP_X + { + assert((char*)&psrc[sswx[ix] + 1] <= ((char*)src->bo->map + src->bo->size)); + assert((char*)&pdst[ix + 1] <= ((char*)dst->bo->map + dst->bo->size)); + pdst[ix] = psrc[sswx[ix]]; + } + pdst = (T*)((char*)pdst + dir * dst->pitch); + } + } +} +#endif +#endif diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.c b/src/gallium/drivers/nvfx/nv04_surface_2d.c deleted file mode 100644 index 7acbb505df3..00000000000 --- a/src/gallium/drivers/nvfx/nv04_surface_2d.c +++ /dev/null @@ -1,532 +0,0 @@ -#include "pipe/p_context.h" -#include "pipe/p_format.h" -#include "util/u_format.h" -#include "util/u_math.h" -#include "util/u_memory.h" - -#include "nouveau/nouveau_winsys.h" -#include "nouveau/nouveau_util.h" -#include "nouveau/nouveau_screen.h" -#include "nv04_surface_2d.h" - -static INLINE int -nv04_surface_format(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_I8_UNORM: - return 
NV04_CONTEXT_SURFACES_2D_FORMAT_Y8; - case PIPE_FORMAT_R16_SNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5; - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_UNORM: - return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8; - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32; - default: - return -1; - } -} - -static INLINE int -nv04_rect_format(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_A8_UNORM: - return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_Z16_UNORM: - return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5; - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; - default: - return -1; - } -} - -static INLINE int -nv04_scaled_image_format(enum pipe_format format) -{ - switch (format) { - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_I8_UNORM: - return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8; - case PIPE_FORMAT_B5G5R5A1_UNORM: - return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5; - case PIPE_FORMAT_B8G8R8A8_UNORM: - return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8; - case PIPE_FORMAT_B8G8R8X8_UNORM: - return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8; - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_R16_SNORM: - case PIPE_FORMAT_L8A8_UNORM: - return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5; - default: - return -1; - } -} - -static INLINE unsigned -nv04_swizzle_bits_square(unsigned x, unsigned y) -{ - unsigned u = (x & 0x001) << 0 | - (x & 0x002) << 1 | - (x & 0x004) << 2 | - (x & 0x008) << 3 | - (x & 0x010) << 4 | - (x & 0x020) << 5 | - (x & 0x040) << 6 | - (x & 0x080) 
<< 7 | - (x & 0x100) << 8 | - (x & 0x200) << 9 | - (x & 0x400) << 10 | - (x & 0x800) << 11; - - unsigned v = (y & 0x001) << 1 | - (y & 0x002) << 2 | - (y & 0x004) << 3 | - (y & 0x008) << 4 | - (y & 0x010) << 5 | - (y & 0x020) << 6 | - (y & 0x040) << 7 | - (y & 0x080) << 8 | - (y & 0x100) << 9 | - (y & 0x200) << 10 | - (y & 0x400) << 11 | - (y & 0x800) << 12; - return v | u; -} - -/* rectangular swizzled textures are linear concatenations of swizzled square tiles */ -static INLINE unsigned -nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h) -{ - unsigned s = MIN2(w, h); - unsigned m = s - 1; - return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m); -} - -static int -nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx, - struct pipe_surface *dst, int dx, int dy, - struct pipe_surface *src, int sx, int sy, - int w, int h) -{ - struct nouveau_channel *chan = ctx->swzsurf->channel; - struct nouveau_grobj *swzsurf = ctx->swzsurf; - struct nouveau_grobj *sifm = ctx->sifm; - struct nouveau_bo *src_bo = ctx->buf(src); - struct nouveau_bo *dst_bo = ctx->buf(dst); - const unsigned src_pitch = ((struct nv04_surface *)src)->pitch; - /* Max width & height may not be the same on all HW, but must be POT */ - const unsigned max_w = 1024; - const unsigned max_h = 1024; - unsigned sub_w = w > max_w ? max_w : w; - unsigned sub_h = h > max_h ? 
max_h : h; - unsigned x; - unsigned y; - - /* Swizzled surfaces must be POT */ - assert(util_is_pot(dst->width) && util_is_pot(dst->height)); - - /* If area is too large to copy in one shot we must copy it in POT chunks to meet alignment requirements */ - assert(sub_w == w || util_is_pot(sub_w)); - assert(sub_h == h || util_is_pot(sub_h)); - - MARK_RING (chan, 8 + ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*17, 2 + - ((w+sub_w)/sub_w)*((h+sub_h)/sub_h)*2); - - BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1); - OUT_RELOCo(chan, dst_bo, - NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - - BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_FORMAT, 1); - OUT_RING (chan, nv04_surface_format(dst->format) | - log2i(dst->width) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT | - log2i(dst->height) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT); - - BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1); - OUT_RELOCo(chan, src_bo, - NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); - BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1); - OUT_RING (chan, swzsurf->handle); - - for (y = 0; y < h; y += sub_h) { - sub_h = MIN2(sub_h, h - y); - - for (x = 0; x < w; x += sub_w) { - sub_w = MIN2(sub_w, w - x); - - assert(!(dst->offset & 63)); - - BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1); - OUT_RELOCl(chan, dst_bo, dst->offset, - NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - - BEGIN_RING(chan, sifm, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9); - OUT_RING (chan, NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE); - OUT_RING (chan, nv04_scaled_image_format(src->format)); - OUT_RING (chan, NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY); - OUT_RING (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT)); - OUT_RING (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w); - OUT_RING (chan, (x + dx) | ((y + dy) << NV03_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT)); - OUT_RING (chan, sub_h << 
NV03_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w); - OUT_RING (chan, 1 << 20); - OUT_RING (chan, 1 << 20); - - BEGIN_RING(chan, sifm, NV03_SCALED_IMAGE_FROM_MEMORY_SIZE, 4); - OUT_RING (chan, sub_h << NV03_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT | sub_w); - OUT_RING (chan, src_pitch | - NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER | - NV03_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE); - OUT_RELOCl(chan, src_bo, src->offset + (sy+y) * src_pitch + (sx+x) * util_format_get_blocksize(src->texture->format), - NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); - OUT_RING (chan, 0); - } - } - - return 0; -} - -static int -nv04_surface_copy_m2mf(struct nv04_surface_2d *ctx, - struct pipe_surface *dst, int dx, int dy, - struct pipe_surface *src, int sx, int sy, int w, int h) -{ - struct nouveau_channel *chan = ctx->m2mf->channel; - struct nouveau_grobj *m2mf = ctx->m2mf; - struct nouveau_bo *src_bo = ctx->buf(src); - struct nouveau_bo *dst_bo = ctx->buf(dst); - unsigned src_pitch = ((struct nv04_surface *)src)->pitch; - unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch; - unsigned dst_offset = dst->offset + dy * dst_pitch + - dx * util_format_get_blocksize(dst->texture->format); - unsigned src_offset = src->offset + sy * src_pitch + - sx * util_format_get_blocksize(src->texture->format); - - MARK_RING (chan, 3 + ((h / 2047) + 1) * 9, 2 + ((h / 2047) + 1) * 2); - BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2); - OUT_RELOCo(chan, src_bo, - NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); - OUT_RELOCo(chan, dst_bo, - NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - - while (h) { - int count = (h > 2047) ? 
2047 : h; - - BEGIN_RING(chan, m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); - OUT_RELOCl(chan, src_bo, src_offset, - NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); - OUT_RELOCl(chan, dst_bo, dst_offset, - NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR); - OUT_RING (chan, src_pitch); - OUT_RING (chan, dst_pitch); - OUT_RING (chan, w * util_format_get_blocksize(src->texture->format)); - OUT_RING (chan, count); - OUT_RING (chan, 0x0101); - OUT_RING (chan, 0); - - h -= count; - src_offset += src_pitch * count; - dst_offset += dst_pitch * count; - } - - return 0; -} - -static int -nv04_surface_copy_blit(struct nv04_surface_2d *ctx, struct pipe_surface *dst, - int dx, int dy, struct pipe_surface *src, int sx, int sy, - int w, int h) -{ - struct nouveau_channel *chan = ctx->surf2d->channel; - struct nouveau_grobj *surf2d = ctx->surf2d; - struct nouveau_grobj *blit = ctx->blit; - struct nouveau_bo *src_bo = ctx->buf(src); - struct nouveau_bo *dst_bo = ctx->buf(dst); - unsigned src_pitch = ((struct nv04_surface *)src)->pitch; - unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch; - int format; - - format = nv04_surface_format(dst->format); - if (format < 0) - return 1; - - MARK_RING (chan, 12, 4); - BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); - OUT_RELOCo(chan, src_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); - OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); - OUT_RING (chan, format); - OUT_RING (chan, (dst_pitch << 16) | src_pitch); - OUT_RELOCl(chan, src_bo, src->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); - OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - - BEGIN_RING(chan, blit, 0x0300, 3); - OUT_RING (chan, (sy << 16) | sx); - OUT_RING (chan, (dy << 16) | dx); - OUT_RING (chan, ( h << 16) | w); - - return 0; -} - -static void -nv04_surface_copy(struct nv04_surface_2d *ctx, struct pipe_surface *dst, - int dx, int dy, struct 
pipe_surface *src, int sx, int sy, - int w, int h) -{ - int src_linear = src->texture->flags & NVFX_RESOURCE_FLAG_LINEAR; - int dst_linear = dst->texture->flags & NVFX_RESOURCE_FLAG_LINEAR; - - assert(src->format == dst->format); - - /* Setup transfer to swizzle the texture to vram if needed */ - if (src_linear && !dst_linear && w > 1 && h > 1) { - nv04_surface_copy_swizzle(ctx, dst, dx, dy, src, sx, sy, w, h); - return; - } - - /* Use M2MF instead of the blitter since it always works - * Any possible performance drop is likely to be not very significant - * and dwarfed anyway by the current buffer management problems - */ - nv04_surface_copy_m2mf(ctx, dst, dx, dy, src, sx, sy, w, h); -} - -static void -nv04_surface_fill(struct nv04_surface_2d *ctx, struct pipe_surface *dst, - int dx, int dy, int w, int h, unsigned value) -{ - struct nouveau_channel *chan = ctx->surf2d->channel; - struct nouveau_grobj *surf2d = ctx->surf2d; - struct nouveau_grobj *rect = ctx->rect; - struct nouveau_bo *dst_bo = ctx->buf(dst); - unsigned dst_pitch = ((struct nv04_surface *)dst)->pitch; - int cs2d_format, gdirect_format; - - cs2d_format = nv04_surface_format(dst->format); - assert(cs2d_format >= 0); - - gdirect_format = nv04_rect_format(dst->format); - assert(gdirect_format >= 0); - - MARK_RING (chan, 16, 4); - BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); - OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - OUT_RELOCo(chan, dst_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); - OUT_RING (chan, cs2d_format); - OUT_RING (chan, (dst_pitch << 16) | dst_pitch); - OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - OUT_RELOCl(chan, dst_bo, dst->offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); - - BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1); - OUT_RING (chan, gdirect_format); - BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1); - OUT_RING (chan, value); - 
BEGIN_RING(chan, rect, - NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2); - OUT_RING (chan, (dx << 16) | dy); - OUT_RING (chan, ( w << 16) | h); -} - -void -nv04_surface_2d_takedown(struct nv04_surface_2d **pctx) -{ - struct nv04_surface_2d *ctx; - - if (!pctx || !*pctx) - return; - ctx = *pctx; - *pctx = NULL; - - nouveau_notifier_free(&ctx->ntfy); - nouveau_grobj_free(&ctx->m2mf); - nouveau_grobj_free(&ctx->surf2d); - nouveau_grobj_free(&ctx->swzsurf); - nouveau_grobj_free(&ctx->rect); - nouveau_grobj_free(&ctx->blit); - nouveau_grobj_free(&ctx->sifm); - - FREE(ctx); -} - -struct nv04_surface_2d * -nv04_surface_2d_init(struct nouveau_screen *screen) -{ - struct nv04_surface_2d *ctx = CALLOC_STRUCT(nv04_surface_2d); - struct nouveau_channel *chan = screen->channel; - unsigned handle = 0x88000000, class; - int ret; - - if (!ctx) - return NULL; - - ret = nouveau_notifier_alloc(chan, handle++, 1, &ctx->ntfy); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - ret = nouveau_grobj_alloc(chan, handle++, 0x0039, &ctx->m2mf); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - BEGIN_RING(chan, ctx->m2mf, NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1); - OUT_RING (chan, ctx->ntfy->handle); - - if (chan->device->chipset < 0x10) - class = NV04_CONTEXT_SURFACES_2D; - else - class = NV10_CONTEXT_SURFACES_2D; - - ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->surf2d); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - BEGIN_RING(chan, ctx->surf2d, - NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); - OUT_RING (chan, chan->vram->handle); - OUT_RING (chan, chan->vram->handle); - - if (chan->device->chipset < 0x10) - class = NV04_IMAGE_BLIT; - else - class = NV12_IMAGE_BLIT; - - ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->blit); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_DMA_NOTIFY, 1); - OUT_RING (chan, ctx->ntfy->handle); - 
BEGIN_RING(chan, ctx->blit, NV04_IMAGE_BLIT_SURFACE, 1); - OUT_RING (chan, ctx->surf2d->handle); - BEGIN_RING(chan, ctx->blit, NV01_IMAGE_BLIT_OPERATION, 1); - OUT_RING (chan, NV01_IMAGE_BLIT_OPERATION_SRCCOPY); - - ret = nouveau_grobj_alloc(chan, handle++, NV04_GDI_RECTANGLE_TEXT, - &ctx->rect); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1); - OUT_RING (chan, ctx->ntfy->handle); - BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1); - OUT_RING (chan, ctx->surf2d->handle); - BEGIN_RING(chan, ctx->rect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1); - OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY); - BEGIN_RING(chan, ctx->rect, - NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1); - OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE); - - switch (chan->device->chipset & 0xf0) { - case 0x00: - case 0x10: - class = NV04_SWIZZLED_SURFACE; - break; - case 0x20: - class = NV20_SWIZZLED_SURFACE; - break; - case 0x30: - class = NV30_SWIZZLED_SURFACE; - break; - case 0x40: - case 0x60: - class = NV40_SWIZZLED_SURFACE; - break; - default: - /* Famous last words: this really can't happen.. 
*/ - assert(0); - break; - } - - ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->swzsurf); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - switch (chan->device->chipset & 0xf0) { - case 0x10: - case 0x20: - class = NV10_SCALED_IMAGE_FROM_MEMORY; - break; - case 0x30: - class = NV30_SCALED_IMAGE_FROM_MEMORY; - break; - case 0x40: - case 0x60: - class = NV40_SCALED_IMAGE_FROM_MEMORY; - break; - default: - class = NV04_SCALED_IMAGE_FROM_MEMORY; - break; - } - - ret = nouveau_grobj_alloc(chan, handle++, class, &ctx->sifm); - if (ret) { - nv04_surface_2d_takedown(&ctx); - return NULL; - } - - ctx->copy = nv04_surface_copy; - ctx->fill = nv04_surface_fill; - return ctx; -} - -struct nv04_surface* -nv04_surface_wrap_for_render(struct pipe_screen *pscreen, - struct nv04_surface_2d* eng2d, struct nv04_surface* ns) -{ - struct pipe_resource templ; - struct pipe_resource* temp_tex; - struct nv04_surface* temp_ns; - int temp_flags; - - temp_flags = ns->base.usage; - - ns->base.usage = 0; - - memset(&templ, 0, sizeof(templ)); - templ.format = ns->base.texture->format; - templ.target = PIPE_TEXTURE_2D; - templ.width0 = ns->base.width; - templ.height0 = ns->base.height; - templ.depth0 = 1; - templ.last_level = 0; - - // TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented - templ.nr_samples = ns->base.texture->nr_samples; - - templ.bind = ns->base.texture->bind | PIPE_BIND_RENDER_TARGET; - - temp_tex = pscreen->resource_create(pscreen, &templ); - temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags); - temp_ns->backing = ns; - - if(1) /* hmm */ - eng2d->copy(eng2d, &temp_ns->backing->base, - 0, 0, &ns->base, - 0, 0, ns->base.width, ns->base.height); - - return temp_ns; -} diff --git a/src/gallium/drivers/nvfx/nv04_surface_2d.h b/src/gallium/drivers/nvfx/nv04_surface_2d.h deleted file mode 100644 index 2123c3ed08b..00000000000 --- 
a/src/gallium/drivers/nvfx/nv04_surface_2d.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef __NV04_SURFACE_2D_H__ -#define __NV04_SURFACE_2D_H__ - -#include "pipe/p_state.h" - -struct nouveau_screen; - -struct nv04_surface { - struct pipe_surface base; - unsigned pitch; - struct nv04_surface* backing; -}; - -struct nv04_surface_2d { - struct nouveau_notifier *ntfy; - struct nouveau_grobj *surf2d; - struct nouveau_grobj *swzsurf; - struct nouveau_grobj *m2mf; - struct nouveau_grobj *rect; - struct nouveau_grobj *blit; - struct nouveau_grobj *sifm; - - struct nouveau_bo *(*buf)(struct pipe_surface *); - - void (*copy)(struct nv04_surface_2d *, struct pipe_surface *dst, - int dx, int dy, struct pipe_surface *src, int sx, int sy, - int w, int h); - void (*fill)(struct nv04_surface_2d *, struct pipe_surface *dst, - int dx, int dy, int w, int h, unsigned value); -}; - -struct nv04_surface_2d * -nv04_surface_2d_init(struct nouveau_screen *screen); - -void -nv04_surface_2d_takedown(struct nv04_surface_2d **); - -struct nv04_surface* -nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns); - -#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) - -#endif diff --git a/src/gallium/drivers/nvfx/nv30_fragtex.c b/src/gallium/drivers/nvfx/nv30_fragtex.c index dec073ac900..0c3d43fd573 100644 --- a/src/gallium/drivers/nvfx/nv30_fragtex.c +++ b/src/gallium/drivers/nvfx/nv30_fragtex.c @@ -1,7 +1,6 @@ #include "util/u_format.h" #include "nvfx_context.h" -#include "nouveau/nouveau_util.h" #include "nvfx_tex.h" #include "nvfx_resource.h" @@ -10,138 +9,109 @@ nv30_sampler_state_init(struct pipe_context *pipe, struct nvfx_sampler_state *ps, const struct pipe_sampler_state *cso) { - if (cso->max_anisotropy >= 8) { - ps->en |= NV34TCL_TX_ENABLE_ANISO_8X; - } else - if (cso->max_anisotropy >= 4) { - ps->en |= NV34TCL_TX_ENABLE_ANISO_4X; - } else - if (cso->max_anisotropy >= 2) { - ps->en |= NV34TCL_TX_ENABLE_ANISO_2X; - } + 
float limit; + if (cso->max_anisotropy >= 2) { - float limit; + if (cso->max_anisotropy >= 8) + ps->en |= NV34TCL_TX_ENABLE_ANISO_8X; + else if (cso->max_anisotropy >= 4) + ps->en |= NV34TCL_TX_ENABLE_ANISO_4X; + else if (cso->max_anisotropy >= 2) + ps->en |= NV34TCL_TX_ENABLE_ANISO_2X; + } - limit = CLAMP(cso->lod_bias, -16.0, 15.0); - ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; + limit = CLAMP(cso->lod_bias, -16.0, 15.0 + (255.0 / 256.0)); + ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; - limit = CLAMP(cso->max_lod, 0.0, 15.0); - ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/; + ps->max_lod = (int)CLAMP(cso->max_lod, 0.0, 15.0); + ps->min_lod = (int)CLAMP(cso->min_lod, 0.0, 15.0); - limit = CLAMP(cso->min_lod, 0.0, 15.0); - ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/; - } + ps->en |= NV34TCL_TX_ENABLE_ENABLE; } -#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w) \ -{ \ - TRUE, \ - PIPE_FORMAT_##m, \ - NV34TCL_TX_FORMAT_FORMAT_##tf, \ - (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y | \ - NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w | \ - NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y | \ - NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w) \ -} - -struct nv30_texture_format { - boolean defined; - uint pipe; - int format; - int swizzle; -}; - -static struct nv30_texture_format -nv30_texture_formats[] = { - _(B8G8R8X8_UNORM, A8R8G8B8, S1, S1, S1, ONE, X, Y, Z, W), - _(B8G8R8A8_UNORM, A8R8G8B8, S1, S1, S1, S1, X, Y, Z, W), - _(B5G5R5A1_UNORM, A1R5G5B5, S1, S1, S1, S1, X, Y, Z, W), - _(B4G4R4A4_UNORM, A4R4G4B4, S1, S1, S1, S1, X, Y, Z, W), - _(B5G6R5_UNORM , R5G6B5 , S1, S1, S1, ONE, X, Y, Z, W), - _(L8_UNORM , L8 , S1, S1, S1, ONE, X, X, X, X), - _(A8_UNORM , L8 , ZERO, ZERO, ZERO, S1, X, X, X, X), - _(I8_UNORM , L8 , S1, S1, S1, S1, X, X, X, X), - _(L8A8_UNORM , A8L8 , S1, S1, S1, S1, X, X, X, Y), - _(Z16_UNORM , R5G6B5 , S1, S1, 
S1, ONE, X, X, X, X), - _(S8_USCALED_Z24_UNORM , A8R8G8B8, S1, S1, S1, ONE, X, X, X, X), - _(DXT1_RGB , DXT1 , S1, S1, S1, ONE, X, Y, Z, W), - _(DXT1_RGBA , DXT1 , S1, S1, S1, S1, X, Y, Z, W), - _(DXT3_RGBA , DXT3 , S1, S1, S1, S1, X, Y, Z, W), - _(DXT5_RGBA , DXT5 , S1, S1, S1, S1, X, Y, Z, W), - {}, -}; - -static struct nv30_texture_format * -nv30_fragtex_format(uint pipe_format) +void +nv30_sampler_view_init(struct pipe_context *pipe, + struct nvfx_sampler_view *sv) { - struct nv30_texture_format *tf = nv30_texture_formats; - - while (tf->defined) { - if (tf->pipe == pipe_format) - return tf; - tf++; - } - - NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format)); - return NULL; + struct pipe_resource* pt = sv->base.texture; + struct nvfx_texture_format *tf = &nvfx_texture_formats[sv->base.format]; + unsigned txf; + unsigned level = pt->target == PIPE_TEXTURE_CUBE ? 0 : sv->base.first_level; + + assert(tf->fmt[0] >= 0); + + txf = sv->u.init_fmt; + txf |= (level != sv->base.last_level ? 
NV34TCL_TX_FORMAT_MIPMAP : 0); + txf |= util_logbase2(u_minify(pt->width0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; + txf |= util_logbase2(u_minify(pt->height0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT; + txf |= util_logbase2(u_minify(pt->depth0, level)) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT; + txf |= 0x10000; + + sv->u.nv30.fmt[0] = tf->fmt[0] | txf; + sv->u.nv30.fmt[1] = tf->fmt[1] | txf; + sv->u.nv30.fmt[2] = tf->fmt[2] | txf; + sv->u.nv30.fmt[3] = tf->fmt[3] | txf; + + sv->swizzle |= (nvfx_subresource_pitch(pt, 0) << NV34TCL_TX_SWIZZLE_RECT_PITCH_SHIFT); + + if(pt->height0 <= 1 || util_format_is_compressed(sv->base.format)) + sv->u.nv30.rect = -1; + else + sv->u.nv30.rect = !!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR); + + sv->lod_offset = sv->base.first_level - level; + sv->max_lod_limit = sv->base.last_level - level; } - void nv30_fragtex_set(struct nvfx_context *nvfx, int unit) { struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit]; - struct nvfx_miptree *nv30mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture; - struct pipe_resource *pt = &nv30mt->base.base; - struct nouveau_bo *bo = nv30mt->base.bo; - struct nv30_texture_format *tf; + struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit]; + struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo; struct nouveau_channel* chan = nvfx->screen->base.channel; - uint32_t txf, txs; + unsigned txf; unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + unsigned use_rect; + unsigned max_lod = MIN2(ps->max_lod + sv->lod_offset, sv->max_lod_limit); + unsigned min_lod = MIN2(ps->min_lod + sv->lod_offset, max_lod) ; - tf = nv30_fragtex_format(pt->format); - if (!tf) - return; - - txf = tf->format; - txf |= ((pt->last_level>0) ? 
NV34TCL_TX_FORMAT_MIPMAP : 0); - txf |= log2i(pt->width0) << NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT; - txf |= log2i(pt->height0) << NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT; - txf |= log2i(pt->depth0) << NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT; - txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000; - - switch (pt->target) { - case PIPE_TEXTURE_CUBE: - txf |= NV34TCL_TX_FORMAT_CUBIC; - /* fall-through */ - case PIPE_TEXTURE_2D: - txf |= NV34TCL_TX_FORMAT_DIMS_2D; - break; - case PIPE_TEXTURE_3D: - txf |= NV34TCL_TX_FORMAT_DIMS_3D; - break; - case PIPE_TEXTURE_1D: - txf |= NV34TCL_TX_FORMAT_DIMS_1D; - break; - default: - NOUVEAU_ERR("Unknown target %d\n", pt->target); - return; + if(sv->u.nv30.rect < 0) + { + /* in the case of compressed or 1D textures, we can get away with this, + * since the layout is the same + */ + use_rect = ps->fmt; + } + else + { + static boolean warned = FALSE; + if( !!ps->fmt != sv->u.nv30.rect && !warned) { + warned = TRUE; + fprintf(stderr, + "Unimplemented: coordinate normalization mismatch. Possible reasons:\n" + "1. ARB_texture_non_power_of_two is being used despite the fact it isn't supported\n" + "2. The state tracker is not using the appropriate coordinate normalization\n" + "3. The state tracker is not supported\n"); + } + + use_rect = sv->u.nv30.rect; } - txs = tf->swizzle; + txf = sv->u.nv30.fmt[ps->compare + (use_rect ? 
2 : 0)]; MARK_RING(chan, 9, 2); OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8)); - OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR, - NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); - OUT_RING(chan, ps->wrap); - OUT_RING(chan, NV34TCL_TX_ENABLE_ENABLE | ps->en); - OUT_RING(chan, txs); - OUT_RING(chan, ps->filt | 0x2000 /*voodoo*/); - OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | - pt->height0); + OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0); + OUT_RELOC(chan, bo, txf, + tex_flags | NOUVEAU_BO_OR, + NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); + OUT_RING(chan, (ps->wrap & sv->wrap_mask) | sv->wrap); + OUT_RING(chan, ps->en | (min_lod << NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT) | (max_lod << NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT)); + OUT_RING(chan, sv->swizzle); + OUT_RING(chan, ps->filt | sv->filt); + OUT_RING(chan, sv->npot_size); OUT_RING(chan, ps->bcol); nvfx->hw_txf[unit] = txf; diff --git a/src/gallium/drivers/nvfx/nv30_vertprog.h b/src/gallium/drivers/nvfx/nv30_vertprog.h index ec0444c07f8..9a68f5c1fb0 100644 --- a/src/gallium/drivers/nvfx/nv30_vertprog.h +++ b/src/gallium/drivers/nvfx/nv30_vertprog.h @@ -68,7 +68,7 @@ #define NV30_VP_INST_DEST_TEMP_ID_SHIFT 16 #define NV30_VP_INST_DEST_TEMP_ID_MASK (0x0F << 16) #define NV30_VP_INST_COND_UPDATE_ENABLE (1<<15) -#define NV30_VP_INST_VEC_DEST_TEMP_MASK (0xF << 16) +#define NV30_VP_INST_VEC_DEST_TEMP_MASK (0x1F << 16) #define NV30_VP_INST_COND_TEST_ENABLE (1<<14) #define NV30_VP_INST_COND_SHIFT 11 #define NV30_VP_INST_COND_MASK (0x07 << 11) @@ -111,7 +111,7 @@ #define NV30_VP_INST_SRC2H_SHIFT 0 /*NV20*/ #define NV30_VP_INST_SRC2H_MASK (0x7FF << 0) /* NV30_VP_SRC2_HIGH_MASK >> 4*/ #define NV30_VP_INST_IADDR_SHIFT 2 -#define NV30_VP_INST_IADDR_MASK (0xF << 28) /* NV30_VP_SRC2_LOW_MASK << 28 */ +#define NV30_VP_INST_IADDR_MASK (0x1FF << 2) /* NV30_VP_SRC2_LOW_MASK << 28 */ /* DWORD 3 */ #define 
NV30_VP_INST_SRC2L_SHIFT 28 /*NV20*/ @@ -125,7 +125,7 @@ #define NV30_VP_INST_VDEST_WRITEMASK_SHIFT 12 /*NV20*/ #define NV30_VP_INST_VDEST_WRITEMASK_MASK (0x0F << 12) /*NV20*/ #define NV30_VP_INST_DEST_SHIFT 2 -#define NV30_VP_INST_DEST_MASK (0x0F << 2) +#define NV30_VP_INST_DEST_MASK (0x1F << 2) # define NV30_VP_INST_DEST_POS 0 # define NV30_VP_INST_DEST_BFC0 1 # define NV30_VP_INST_DEST_BFC1 2 @@ -133,7 +133,8 @@ # define NV30_VP_INST_DEST_COL1 4 # define NV30_VP_INST_DEST_FOGC 5 # define NV30_VP_INST_DEST_PSZ 6 -# define NV30_VP_INST_DEST_TC(n) (8+n) +# define NV30_VP_INST_DEST_TC(n) (8+(n)) +# define NV30_VP_INST_DEST_CLP(n) (17 + (n)) /* Useful to split the source selection regs into their pieces */ #define NV30_VP_SRC0_HIGH_SHIFT 6 diff --git a/src/gallium/drivers/nvfx/nv40_fragtex.c b/src/gallium/drivers/nvfx/nv40_fragtex.c index 0068b1ba54a..106ce71a079 100644 --- a/src/gallium/drivers/nvfx/nv40_fragtex.c +++ b/src/gallium/drivers/nvfx/nv40_fragtex.c @@ -8,168 +8,97 @@ nv40_sampler_state_init(struct pipe_context *pipe, struct nvfx_sampler_state *ps, const struct pipe_sampler_state *cso) { + float limit; if (cso->max_anisotropy >= 2) { /* no idea, binary driver sets it, works without it.. meh.. 
*/ ps->wrap |= (1 << 5); - if (cso->max_anisotropy >= 16) { + if (cso->max_anisotropy >= 16) ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X; - } else - if (cso->max_anisotropy >= 12) { + else if (cso->max_anisotropy >= 12) ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X; - } else - if (cso->max_anisotropy >= 10) { + else if (cso->max_anisotropy >= 10) ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X; - } else - if (cso->max_anisotropy >= 8) { + else if (cso->max_anisotropy >= 8) ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X; - } else - if (cso->max_anisotropy >= 6) { + else if (cso->max_anisotropy >= 6) ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X; - } else - if (cso->max_anisotropy >= 4) { + else if (cso->max_anisotropy >= 4) ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X; - } else { + else ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X; - } } - { - float limit; + limit = CLAMP(cso->lod_bias, -16.0, 15.0 + (255.0 / 256.0)); + ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; - limit = CLAMP(cso->lod_bias, -16.0, 15.0); - ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; + ps->max_lod = (int)(CLAMP(cso->max_lod, 0.0, 15.0 + (255.0 / 256.0)) * 256.0); + ps->min_lod = (int)(CLAMP(cso->min_lod, 0.0, 15.0 + (255.0 / 256.0)) * 256.0); - limit = CLAMP(cso->max_lod, 0.0, 15.0); - ps->en |= (int)(limit * 256.0) << 7; - - limit = CLAMP(cso->min_lod, 0.0, 15.0); - ps->en |= (int)(limit * 256.0) << 19; - } -} - -#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw) \ -{ \ - TRUE, \ - PIPE_FORMAT_##m, \ - NV40TCL_TEX_FORMAT_FORMAT_##tf, \ - (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y | \ - NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w | \ - NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y | \ - NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w), \ - ((NV34TCL_TX_FILTER_SIGNED_RED*sx) | (NV34TCL_TX_FILTER_SIGNED_GREEN*sy) | \ - (NV34TCL_TX_FILTER_SIGNED_BLUE*sz) | (NV34TCL_TX_FILTER_SIGNED_ALPHA*sw)) \ + ps->en |= NV40TCL_TEX_ENABLE_ENABLE; } -struct 
nv40_texture_format { - boolean defined; - uint pipe; - int format; - int swizzle; - int sign; -}; - -static struct nv40_texture_format -nv40_texture_formats[] = { - _(B8G8R8X8_UNORM, A8R8G8B8, S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), - _(B8G8R8A8_UNORM, A8R8G8B8, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), - _(B5G5R5A1_UNORM, A1R5G5B5, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), - _(B4G4R4A4_UNORM, A4R4G4B4, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), - _(B5G6R5_UNORM , R5G6B5 , S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), - _(L8_UNORM , L8 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), - _(A8_UNORM , L8 , ZERO, ZERO, ZERO, S1, X, X, X, X, 0, 0, 0, 0), - _(R16_SNORM , A16 , ZERO, ZERO, S1, ONE, X, X, X, Y, 1, 1, 1, 1), - _(I8_UNORM , L8 , S1, S1, S1, S1, X, X, X, X, 0, 0, 0, 0), - _(L8A8_UNORM , A8L8 , S1, S1, S1, S1, X, X, X, Y, 0, 0, 0, 0), - _(Z16_UNORM , Z16 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), - _(S8_USCALED_Z24_UNORM , Z24 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), - _(DXT1_RGB , DXT1 , S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), - _(DXT1_RGBA , DXT1 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), - _(DXT3_RGBA , DXT3 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), - _(DXT5_RGBA , DXT5 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), - {}, -}; - -static struct nv40_texture_format * -nv40_fragtex_format(uint pipe_format) +void +nv40_sampler_view_init(struct pipe_context *pipe, + struct nvfx_sampler_view *sv) { - struct nv40_texture_format *tf = nv40_texture_formats; - - while (tf->defined) { - if (tf->pipe == pipe_format) - return tf; - tf++; + struct pipe_resource* pt = sv->base.texture; + struct nvfx_miptree* mt = (struct nvfx_miptree*)pt; + struct nvfx_texture_format *tf = &nvfx_texture_formats[sv->base.format]; + unsigned txf; + unsigned level = pt->target == PIPE_TEXTURE_CUBE ? 
0 : sv->base.first_level; + assert(tf->fmt[4] >= 0); + + txf = sv->u.init_fmt; + txf |= 0x8000; + if(pt->target == PIPE_TEXTURE_CUBE) + txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT); + else + txf |= (((sv->base.last_level - sv->base.first_level) + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT); + + if (!mt->linear_pitch) + sv->u.nv40.npot_size2 = 0; + else { + sv->u.nv40.npot_size2 = mt->linear_pitch; + txf |= NV40TCL_TEX_FORMAT_LINEAR; } - NOUVEAU_ERR("unknown texture format %s\n", util_format_name(pipe_format)); - return NULL; -} + sv->u.nv40.fmt[0] = tf->fmt[4] | txf; + sv->u.nv40.fmt[1] = tf->fmt[5] | txf; + sv->u.nv40.npot_size2 |= (u_minify(pt->depth0, level) << NV40TCL_TEX_SIZE1_DEPTH_SHIFT); + + sv->lod_offset = (sv->base.first_level - level) * 256; + sv->max_lod_limit = (sv->base.last_level - level) * 256; +} void nv40_fragtex_set(struct nvfx_context *nvfx, int unit) { struct nouveau_channel* chan = nvfx->screen->base.channel; struct nvfx_sampler_state *ps = nvfx->tex_sampler[unit]; - struct nvfx_miptree *nv40mt = (struct nvfx_miptree *)nvfx->fragment_sampler_views[unit]->texture; - struct nouveau_bo *bo = nv40mt->base.bo; - struct pipe_resource *pt = &nv40mt->base.base; - struct nv40_texture_format *tf; - - uint32_t txf, txs, txp; + struct nvfx_sampler_view* sv = (struct nvfx_sampler_view*)nvfx->fragment_sampler_views[unit]; + struct nouveau_bo *bo = ((struct nvfx_miptree *)sv->base.texture)->base.bo; unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + unsigned txf; + unsigned max_lod = MIN2(ps->max_lod + sv->lod_offset, sv->max_lod_limit); + unsigned min_lod = MIN2(ps->min_lod + sv->lod_offset, max_lod); - tf = nv40_fragtex_format(pt->format); - if (!tf) - assert(0); - - txf = ps->fmt; - txf |= tf->format | 0x8000; - txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT); - - if (1) /* XXX */ - txf |= NV34TCL_TX_FORMAT_NO_BORDER; - - switch (pt->target) { - case PIPE_TEXTURE_CUBE: - txf |= 
NV34TCL_TX_FORMAT_CUBIC; - /* fall-through */ - case PIPE_TEXTURE_2D: - txf |= NV34TCL_TX_FORMAT_DIMS_2D; - break; - case PIPE_TEXTURE_3D: - txf |= NV34TCL_TX_FORMAT_DIMS_3D; - break; - case PIPE_TEXTURE_1D: - txf |= NV34TCL_TX_FORMAT_DIMS_1D; - break; - default: - NOUVEAU_ERR("Unknown target %d\n", pt->target); - return; - } - - if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) { - txp = 0; - } else { - txp = nv40mt->level[0].pitch; - txf |= NV40TCL_TEX_FORMAT_LINEAR; - } - - txs = tf->swizzle; + txf = sv->u.nv40.fmt[ps->compare] | ps->fmt; - MARK_RING(chan, 11 + 2 * !unit, 2); + MARK_RING(chan, 11, 2); OUT_RING(chan, RING_3D(NV34TCL_TX_OFFSET(unit), 8)); - OUT_RELOC(chan, bo, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); + OUT_RELOC(chan, bo, sv->offset, tex_flags | NOUVEAU_BO_LOW, 0, 0); OUT_RELOC(chan, bo, txf, tex_flags | NOUVEAU_BO_OR, NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); - OUT_RING(chan, ps->wrap); - OUT_RING(chan, NV40TCL_TEX_ENABLE_ENABLE | ps->en); - OUT_RING(chan, txs); - OUT_RING(chan, ps->filt | tf->sign | 0x2000 /*voodoo*/); - OUT_RING(chan, (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0); + OUT_RING(chan, (ps->wrap & sv->wrap_mask) | sv->wrap); + OUT_RING(chan, ps->en | (min_lod << 19) | (max_lod << 7)); + OUT_RING(chan, sv->swizzle); + OUT_RING(chan, ps->filt | sv->filt); + OUT_RING(chan, sv->npot_size); OUT_RING(chan, ps->bcol); OUT_RING(chan, RING_3D(NV40TCL_TEX_SIZE1(unit), 1)); - OUT_RING(chan, (pt->depth0 << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp); + OUT_RING(chan, sv->u.nv40.npot_size2); nvfx->hw_txf[unit] = txf; nvfx->hw_samplers |= (1 << unit); diff --git a/src/gallium/drivers/nvfx/nv40_vertprog.h b/src/gallium/drivers/nvfx/nv40_vertprog.h index 7337293babc..3d0a1fe3d10 100644 --- a/src/gallium/drivers/nvfx/nv40_vertprog.h +++ b/src/gallium/drivers/nvfx/nv40_vertprog.h @@ -44,7 +44,7 @@ #define NV40_VP_INST_SRC1_ABS (1 << 22) #define NV40_VP_INST_SRC0_ABS (1 << 21) #define NV40_VP_INST_VEC_DEST_TEMP_SHIFT 15 -#define 
NV40_VP_INST_VEC_DEST_TEMP_MASK (0x1F << 15) +#define NV40_VP_INST_VEC_DEST_TEMP_MASK (0x3F << 15) #define NV40_VP_INST_COND_TEST_ENABLE (1 << 13) #define NV40_VP_INST_COND_SHIFT 10 #define NV40_VP_INST_COND_MASK (0x7 << 10) @@ -100,7 +100,7 @@ #define NV40_VP_INST_SRC2H_SHIFT 0 #define NV40_VP_INST_SRC2H_MASK (0x3F << 0) #define NV40_VP_INST_IADDRH_SHIFT 0 -#define NV40_VP_INST_IADDRH_MASK (0x1F << 0) +#define NV40_VP_INST_IADDRH_MASK (0x3F << 0) /* ---- OPCODE BITS 31:0 / data DWORD 3 --- */ #define NV40_VP_INST_IADDRL_SHIFT 29 diff --git a/src/gallium/drivers/nvfx/nvfx_buffer.c b/src/gallium/drivers/nvfx/nvfx_buffer.c index 05b824b8f74..041099e0e56 100644 --- a/src/gallium/drivers/nvfx/nvfx_buffer.c +++ b/src/gallium/drivers/nvfx/nvfx_buffer.c @@ -6,115 +6,39 @@ #include "nouveau/nouveau_screen.h" #include "nouveau/nouveau_winsys.h" #include "nvfx_resource.h" +#include "nvfx_screen.h" - -/* Currently using separate implementations for buffers and textures, - * even though gallium has a unified abstraction of these objects. - * Eventually these should be combined, and mechanisms like transfers - * be adapted to work for both buffer and texture uploads. - */ -static void nvfx_buffer_destroy(struct pipe_screen *pscreen, +void nvfx_buffer_destroy(struct pipe_screen *pscreen, struct pipe_resource *presource) { - struct nvfx_resource *buffer = nvfx_resource(presource); + struct nvfx_buffer *buffer = nvfx_buffer(presource); - nouveau_screen_bo_release(pscreen, buffer->bo); + if(!(buffer->base.base.flags & NVFX_RESOURCE_FLAG_USER)) + align_free(buffer->data); + nouveau_screen_bo_release(pscreen, buffer->base.bo); FREE(buffer); } - - - -/* Utility functions for transfer create/destroy are hooked in and - * just record the arguments to those functions. 
- */ -static void * -nvfx_buffer_transfer_map( struct pipe_context *pipe, - struct pipe_transfer *transfer ) -{ - struct nvfx_resource *buffer = nvfx_resource(transfer->resource); - uint8_t *map; - - map = nouveau_screen_bo_map_range( pipe->screen, - buffer->bo, - transfer->box.x, - transfer->box.width, - nouveau_screen_transfer_flags(transfer->usage) ); - if (map == NULL) - return NULL; - - return map + transfer->box.x; -} - - - -static void nvfx_buffer_transfer_flush_region( struct pipe_context *pipe, - struct pipe_transfer *transfer, - const struct pipe_box *box) -{ - struct nvfx_resource *buffer = nvfx_resource(transfer->resource); - - nouveau_screen_bo_map_flush_range(pipe->screen, - buffer->bo, - transfer->box.x + box->x, - box->width); -} - -static void nvfx_buffer_transfer_unmap( struct pipe_context *pipe, - struct pipe_transfer *transfer ) -{ - struct nvfx_resource *buffer = nvfx_resource(transfer->resource); - - nouveau_screen_bo_unmap(pipe->screen, buffer->bo); -} - - - - -struct u_resource_vtbl nvfx_buffer_vtbl = -{ - u_default_resource_get_handle, /* get_handle */ - nvfx_buffer_destroy, /* resource_destroy */ - NULL, /* is_resource_referenced */ - u_default_get_transfer, /* get_transfer */ - u_default_transfer_destroy, /* transfer_destroy */ - nvfx_buffer_transfer_map, /* transfer_map */ - nvfx_buffer_transfer_flush_region, /* transfer_flush_region */ - nvfx_buffer_transfer_unmap, /* transfer_unmap */ - u_default_transfer_inline_write /* transfer_inline_write */ -}; - - - struct pipe_resource * nvfx_buffer_create(struct pipe_screen *pscreen, const struct pipe_resource *template) { - struct nvfx_resource *buffer; + struct nvfx_screen* screen = nvfx_screen(pscreen); + struct nvfx_buffer* buffer; - buffer = CALLOC_STRUCT(nvfx_resource); + buffer = CALLOC_STRUCT(nvfx_buffer); if (!buffer) return NULL; - buffer->base = *template; - buffer->vtbl = &nvfx_buffer_vtbl; - pipe_reference_init(&buffer->base.reference, 1); - buffer->base.screen = pscreen; - - 
buffer->bo = nouveau_screen_bo_new(pscreen, - 16, - buffer->base.usage, - buffer->base.bind, - buffer->base.width0); - - if (buffer->bo == NULL) - goto fail; - - return &buffer->base; + buffer->base.base = *template; + buffer->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + pipe_reference_init(&buffer->base.base.reference, 1); + buffer->base.base.screen = pscreen; + buffer->size = util_format_get_stride(template->format, template->width0); + buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; + buffer->data = align_malloc(buffer->size, 16); -fail: - FREE(buffer); - return NULL; + return &buffer->base.base; } @@ -124,30 +48,49 @@ nvfx_user_buffer_create(struct pipe_screen *pscreen, unsigned bytes, unsigned usage) { - struct nvfx_resource *buffer; + struct nvfx_screen* screen = nvfx_screen(pscreen); + struct nvfx_buffer* buffer; - buffer = CALLOC_STRUCT(nvfx_resource); + buffer = CALLOC_STRUCT(nvfx_buffer); if (!buffer) return NULL; - pipe_reference_init(&buffer->base.reference, 1); - buffer->vtbl = &nvfx_buffer_vtbl; - buffer->base.screen = pscreen; - buffer->base.format = PIPE_FORMAT_R8_UNORM; - buffer->base.usage = PIPE_USAGE_IMMUTABLE; - buffer->base.bind = usage; - buffer->base.width0 = bytes; - buffer->base.height0 = 1; - buffer->base.depth0 = 1; - - buffer->bo = nouveau_screen_bo_user(pscreen, ptr, bytes); - if (!buffer->bo) - goto fail; - - return &buffer->base; - -fail: - FREE(buffer); - return NULL; + pipe_reference_init(&buffer->base.base.reference, 1); + buffer->base.base.flags = NVFX_RESOURCE_FLAG_LINEAR | NVFX_RESOURCE_FLAG_USER; + buffer->base.base.screen = pscreen; + buffer->base.base.format = PIPE_FORMAT_R8_UNORM; + buffer->base.base.usage = PIPE_USAGE_IMMUTABLE; + buffer->base.base.bind = usage; + buffer->base.base.width0 = bytes; + buffer->base.base.height0 = 1; + buffer->base.base.depth0 = 1; + buffer->data = ptr; + buffer->size = bytes; + buffer->bytes_to_draw_until_static = bytes * screen->static_reuse_threshold; 
+ buffer->dirty_end = bytes; + + return &buffer->base.base; } +void nvfx_buffer_upload(struct nvfx_buffer* buffer) +{ + unsigned dirty = buffer->dirty_end - buffer->dirty_begin; + if(!buffer->base.bo) + { + buffer->base.bo = nouveau_screen_bo_new(buffer->base.base.screen, + 16, + buffer->base.base.usage, + buffer->base.base.bind, + buffer->base.base.width0); + } + + if(dirty) + { + // TODO: may want to use a temporary in some cases + nouveau_bo_map(buffer->base.bo, NOUVEAU_BO_WR + | (buffer->dirty_unsynchronized ? NOUVEAU_BO_NOSYNC : 0)); + memcpy((uint8_t*)buffer->base.bo->map + buffer->dirty_begin, buffer->data + buffer->dirty_begin, dirty); + nouveau_bo_unmap(buffer->base.bo); + buffer->dirty_begin = buffer->dirty_end = 0; + } +} diff --git a/src/gallium/drivers/nvfx/nvfx_context.c b/src/gallium/drivers/nvfx/nvfx_context.c index 7218abff22d..5a2fa14c887 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.c +++ b/src/gallium/drivers/nvfx/nvfx_context.c @@ -1,5 +1,6 @@ #include "draw/draw_context.h" #include "pipe/p_defines.h" +#include "util/u_framebuffer.h" #include "nvfx_context.h" #include "nvfx_screen.h" @@ -14,6 +15,7 @@ nvfx_flush(struct pipe_context *pipe, unsigned flags, struct nouveau_channel *chan = screen->base.channel; struct nouveau_grobj *eng3d = screen->eng3d; + /* XXX: we need to actually be intelligent here */ if (flags & PIPE_FLUSH_TEXTURE_CACHE) { BEGIN_RING(chan, eng3d, 0x1fd8, 1); OUT_RING (chan, 2); @@ -31,8 +33,22 @@ nvfx_destroy(struct pipe_context *pipe) { struct nvfx_context *nvfx = nvfx_context(pipe); + if(nvfx->dummy_fs) + pipe->delete_fs_state(pipe, nvfx->dummy_fs); + + for(unsigned i = 0; i < nvfx->vtxbuf_nr; ++i) + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0); + pipe_resource_reference(&nvfx->idxbuf.buffer, 0); + util_unreference_framebuffer_state(&nvfx->framebuffer); + for(unsigned i = 0; i < PIPE_MAX_SAMPLERS; ++i) + pipe_sampler_view_reference(&nvfx->fragment_sampler_views[i], 0); + if (nvfx->draw) draw_destroy(nvfx->draw); 
+ + if(nvfx->screen->cur_ctx == nvfx) + nvfx->screen->cur_ctx = NULL; + FREE(nvfx); } @@ -59,14 +75,21 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) nvfx->pipe.clear = nvfx_clear; nvfx->pipe.flush = nvfx_flush; - screen->base.channel->user_private = nvfx; - nvfx->is_nv4x = screen->is_nv4x; + /* TODO: it seems that nv30 might have fixed function clipping usable with vertex programs + * However, my code for that doesn't work, so use vp clipping for all cards, which works. + */ + nvfx->use_vp_clipping = TRUE; nvfx_init_query_functions(nvfx); nvfx_init_surface_functions(nvfx); nvfx_init_state_functions(nvfx); + nvfx_init_sampling_functions(nvfx); + nvfx_init_vbo_functions(nvfx); + nvfx_init_fragprog_functions(nvfx); + nvfx_init_vertprog_functions(nvfx); nvfx_init_resource_functions(&nvfx->pipe); + nvfx_init_transfer_functions(&nvfx->pipe); /* Create, configure, and install fallback swtnl path */ nvfx->draw = draw_create(&nvfx->pipe); @@ -78,6 +101,12 @@ nvfx_create(struct pipe_screen *pscreen, void *priv) /* set these to that we init them on first validation */ nvfx->state.scissor_enabled = ~0; - nvfx->state.stipple_enabled = ~0; + nvfx->hw_pointsprite_control = -1; + nvfx->hw_vp_output = -1; + nvfx->use_vertex_buffers = -1; + nvfx->relocs_needed = NVFX_RELOCATE_ALL; + + LIST_INITHEAD(&nvfx->render_cache); + return &nvfx->pipe; } diff --git a/src/gallium/drivers/nvfx/nvfx_context.h b/src/gallium/drivers/nvfx/nvfx_context.h index 89f94c10bd1..4c654bfa8ba 100644 --- a/src/gallium/drivers/nvfx/nvfx_context.h +++ b/src/gallium/drivers/nvfx/nvfx_context.h @@ -11,8 +11,10 @@ #include "util/u_memory.h" #include "util/u_math.h" #include "util/u_inlines.h" +#include "util/u_double_list.h" #include "draw/draw_vertex.h" +#include "util/u_blitter.h" #include "nouveau/nouveau_winsys.h" #include "nouveau/nouveau_gldefs.h" @@ -42,17 +44,26 @@ #define NVFX_NEW_SR (1 << 13) #define NVFX_NEW_VERTCONST (1 << 14) #define NVFX_NEW_FRAGCONST (1 << 15) +#define NVFX_NEW_INDEX (1 << 
16) +#define NVFX_NEW_SPRITE (1 << 17) + +#define NVFX_RELOCATE_FRAMEBUFFER (1 << 0) +#define NVFX_RELOCATE_FRAGTEX (1 << 1) +#define NVFX_RELOCATE_FRAGPROG (1 << 2) +#define NVFX_RELOCATE_VTXBUF (1 << 3) +#define NVFX_RELOCATE_IDXBUF (1 << 4) +#define NVFX_RELOCATE_ALL 0x1f struct nvfx_rasterizer_state { struct pipe_rasterizer_state pipe; unsigned sb_len; - uint32_t sb[32]; + uint32_t sb[34]; }; struct nvfx_zsa_state { struct pipe_depth_stencil_alpha_state pipe; unsigned sb_len; - uint32_t sb[26]; + uint32_t sb[24]; }; struct nvfx_blend_state { @@ -64,13 +75,57 @@ struct nvfx_blend_state { struct nvfx_state { unsigned scissor_enabled; - unsigned stipple_enabled; unsigned fp_samplers; + unsigned render_temps; +}; + +struct nvfx_per_vertex_element { + unsigned idx; + unsigned vertex_buffer_index; + unsigned src_offset; +}; + +struct nvfx_low_frequency_element { + unsigned idx; + unsigned vertex_buffer_index; + unsigned src_offset; + void (*fetch_rgba_float)(float *dst, const uint8_t *src, unsigned i, unsigned j); + unsigned ncomp; +}; + +struct nvfx_per_instance_element { + struct nvfx_low_frequency_element base; + unsigned instance_divisor; +}; + +struct nvfx_per_vertex_buffer_info +{ + unsigned vertex_buffer_index; + unsigned per_vertex_size; }; struct nvfx_vtxelt_state { struct pipe_vertex_element pipe[16]; unsigned num_elements; + unsigned vtxfmt[16]; + + unsigned num_per_vertex_buffer_infos; + struct nvfx_per_vertex_buffer_info per_vertex_buffer_info[16]; + + unsigned num_per_vertex; + struct nvfx_per_vertex_element per_vertex[16]; + + unsigned num_per_instance; + struct nvfx_per_instance_element per_instance[16]; + + unsigned num_constant; + struct nvfx_low_frequency_element constant[16]; + + boolean needs_translate; + struct translate* translate; + + unsigned vertex_length; + unsigned max_vertices_per_packet; }; struct nvfx_render_target { @@ -86,8 +141,11 @@ struct nvfx_context { struct nvfx_screen *screen; unsigned is_nv4x; /* either 0 or ~0 */ + boolean 
use_vp_clipping; struct draw_context *draw; + struct blitter_context* blitter; + struct list_head render_cache; /* HW state derived from pipe states */ struct nvfx_state state; @@ -111,7 +169,7 @@ struct nvfx_context { unsigned stipple[32]; struct pipe_clip_state clip; struct nvfx_vertex_program *vertprog; - struct nvfx_fragment_program *fragprog; + struct nvfx_pipe_fragment_program *fragprog; struct pipe_resource *constbuf[PIPE_SHADER_TYPES]; unsigned constbuf_nr[PIPE_SHADER_TYPES]; struct nvfx_rasterizer_state *rasterizer; @@ -122,23 +180,34 @@ struct nvfx_context { struct pipe_viewport_state viewport; struct pipe_framebuffer_state framebuffer; struct pipe_index_buffer idxbuf; - struct pipe_resource *idxbuf_buffer; - unsigned idxbuf_format; struct nvfx_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; struct pipe_sampler_view *fragment_sampler_views[PIPE_MAX_SAMPLERS]; + struct nvfx_pipe_fragment_program* dummy_fs; + unsigned nr_samplers; unsigned nr_textures; unsigned dirty_samplers; struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; unsigned vtxbuf_nr; struct nvfx_vtxelt_state *vtxelt; + int base_vertex; + boolean use_index_buffer; + /* -1 = hardware input setup is outdated + * 0 = hardware input setup is for inline vertices + * 1 = hardware input setup is for hardware vertices + */ + int use_vertex_buffers; - unsigned vbo_bo; unsigned hw_vtxelt_nr; uint8_t hw_samplers; uint32_t hw_txf[8]; struct nvfx_render_target hw_rt[4]; struct nvfx_render_target hw_zeta; + int hw_pointsprite_control; + int hw_vp_output; + struct nvfx_fragment_program* hw_fragprog; + + unsigned relocs_needed; }; static INLINE struct nvfx_context * @@ -175,15 +244,12 @@ extern void nvfx_clear(struct pipe_context *pipe, unsigned buffers, /* nvfx_draw.c */ extern struct draw_stage *nvfx_draw_render_stage(struct nvfx_context *nvfx); -extern void nvfx_draw_elements_swtnl(struct pipe_context *pipe, - struct pipe_resource *idxbuf, - unsigned ib_size, int ib_bias, - unsigned mode, - unsigned start, 
unsigned count); +extern void nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info); extern void nvfx_vtxfmt_validate(struct nvfx_context *nvfx); /* nvfx_fb.c */ -extern void nvfx_state_framebuffer_validate(struct nvfx_context *nvfx); +extern int nvfx_framebuffer_prepare(struct nvfx_context *nvfx); +extern void nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result); void nvfx_framebuffer_relocate(struct nvfx_context *nvfx); @@ -191,19 +257,24 @@ nvfx_framebuffer_relocate(struct nvfx_context *nvfx); extern void nvfx_fragprog_destroy(struct nvfx_context *, struct nvfx_fragment_program *); extern void nvfx_fragprog_validate(struct nvfx_context *nvfx); -extern void -nvfx_fragprog_relocate(struct nvfx_context *nvfx); +extern void nvfx_fragprog_relocate(struct nvfx_context *nvfx); +extern void nvfx_init_fragprog_functions(struct nvfx_context *nvfx); /* nvfx_fragtex.c */ +extern void nvfx_init_sampling_functions(struct nvfx_context *nvfx); extern void nvfx_fragtex_validate(struct nvfx_context *nvfx); -extern void -nvfx_fragtex_relocate(struct nvfx_context *nvfx); +extern void nvfx_fragtex_relocate(struct nvfx_context *nvfx); + +struct nvfx_sampler_view; /* nv30_fragtex.c */ extern void nv30_sampler_state_init(struct pipe_context *pipe, struct nvfx_sampler_state *ps, const struct pipe_sampler_state *cso); +extern void +nv30_sampler_view_init(struct pipe_context *pipe, + struct nvfx_sampler_view *sv); extern void nv30_fragtex_set(struct nvfx_context *nvfx, int unit); /* nv40_fragtex.c */ @@ -211,6 +282,9 @@ extern void nv40_sampler_state_init(struct pipe_context *pipe, struct nvfx_sampler_state *ps, const struct pipe_sampler_state *cso); +extern void +nv40_sampler_view_init(struct pipe_context *pipe, + struct nvfx_sampler_view *sv); extern void nv40_fragtex_set(struct nvfx_context *nvfx, int unit); /* nvfx_state.c */ @@ -225,23 +299,75 @@ extern void nvfx_state_sr_validate(struct nvfx_context *nvfx); extern void 
nvfx_state_zsa_validate(struct nvfx_context *nvfx); /* nvfx_state_emit.c */ -extern void nvfx_state_relocate(struct nvfx_context *nvfx); +extern void nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs); extern boolean nvfx_state_validate(struct nvfx_context *nvfx); extern boolean nvfx_state_validate_swtnl(struct nvfx_context *nvfx); -extern void nvfx_state_emit(struct nvfx_context *nvfx); + +static inline void +nvfx_state_emit(struct nvfx_context *nvfx) +{ + unsigned relocs = NVFX_RELOCATE_FRAMEBUFFER | NVFX_RELOCATE_FRAGTEX | NVFX_RELOCATE_FRAGPROG; + if (nvfx->render_mode == HW) + { + relocs |= NVFX_RELOCATE_VTXBUF; + if(nvfx->use_index_buffer) + relocs |= NVFX_RELOCATE_IDXBUF; + } + + relocs &= nvfx->relocs_needed; + if(relocs) + nvfx_state_relocate(nvfx, relocs); +} /* nvfx_transfer.c */ -extern void nvfx_init_transfer_functions(struct nvfx_context *nvfx); +extern void nvfx_init_transfer_functions(struct pipe_context *pipe); /* nvfx_vbo.c */ extern boolean nvfx_vbo_validate(struct nvfx_context *nvfx); extern void nvfx_vbo_relocate(struct nvfx_context *nvfx); +extern void nvfx_idxbuf_validate(struct nvfx_context* nvfx); +extern void nvfx_idxbuf_relocate(struct nvfx_context* nvfx); extern void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); +extern void nvfx_init_vbo_functions(struct nvfx_context *nvfx); +extern unsigned nvfx_vertex_formats[]; /* nvfx_vertprog.c */ extern boolean nvfx_vertprog_validate(struct nvfx_context *nvfx); extern void nvfx_vertprog_destroy(struct nvfx_context *, struct nvfx_vertex_program *); +extern void nvfx_init_vertprog_functions(struct nvfx_context *nvfx); + +/* nvfx_push.c */ +extern void nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info); + +/* must WAIT_RING(chan, ncomp + 1) or equivalent beforehand! 
*/ +static inline void nvfx_emit_vtx_attr(struct nouveau_channel* chan, unsigned attrib, const float* v, unsigned ncomp) +{ + switch (ncomp) { + case 4: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + OUT_RING(chan, fui(v[2])); + OUT_RING(chan, fui(v[3])); + break; + case 3: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + OUT_RING(chan, fui(v[2])); + break; + case 2: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2)); + OUT_RING(chan, fui(v[0])); + OUT_RING(chan, fui(v[1])); + break; + case 1: + OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1)); + OUT_RING(chan, fui(v[0])); + break; + } +} #endif diff --git a/src/gallium/drivers/nvfx/nvfx_draw.c b/src/gallium/drivers/nvfx/nvfx_draw.c index 22cff370b77..2601d5b8e2e 100644 --- a/src/gallium/drivers/nvfx/nvfx_draw.c +++ b/src/gallium/drivers/nvfx/nvfx_draw.c @@ -9,6 +9,7 @@ #include "draw/draw_pipe.h" #include "nvfx_context.h" +#include "nvfx_resource.h" /* Simple, but crappy, swtnl path, hopefully we wont need to hit this very * often at all. 
Uses "quadro style" vertex submission + a fixed vertex @@ -39,30 +40,21 @@ nvfx_render_vertex(struct nvfx_context *nvfx, const struct vertex_header *v) unsigned idx = nvfx->swtnl.draw[i]; unsigned hw = nvfx->swtnl.hw[i]; + WAIT_RING(chan, 5); switch (nvfx->swtnl.emit[i]) { case EMIT_OMIT: break; case EMIT_1F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_1F(hw), 1); - OUT_RING (chan, fui(v->data[idx][0])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 1); break; case EMIT_2F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_2F_X(hw), 2); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 2); break; case EMIT_3F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_3F_X(hw), 3); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); - OUT_RING (chan, fui(v->data[idx][2])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 3); break; case EMIT_4F: - BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4); - OUT_RING (chan, fui(v->data[idx][0])); - OUT_RING (chan, fui(v->data[idx][1])); - OUT_RING (chan, fui(v->data[idx][2])); - OUT_RING (chan, fui(v->data[idx][3])); + nvfx_emit_vtx_attr(chan, hw, v->data[idx], 4); break; case 0xff: BEGIN_RING(chan, eng3d, NV34TCL_VTX_ATTR_4F_X(hw), 4); @@ -231,15 +223,9 @@ nvfx_draw_render_stage(struct nvfx_context *nvfx) } void -nvfx_draw_elements_swtnl(struct pipe_context *pipe, - struct pipe_resource *idxbuf, - unsigned idxbuf_size, int idxbuf_bias, - unsigned mode, unsigned start, unsigned count) +nvfx_draw_vbo_swtnl(struct pipe_context *pipe, const struct pipe_draw_info* info) { struct nvfx_context *nvfx = nvfx_context(pipe); - struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS]; - struct pipe_transfer *ib_transfer = NULL; - struct pipe_transfer *cb_transfer = NULL; unsigned i; void *map; @@ -247,47 +233,28 @@ nvfx_draw_elements_swtnl(struct pipe_context *pipe, return; nvfx_state_emit(nvfx); + /* these must be passed without adding the offsets */ for (i 
= 0; i < nvfx->vtxbuf_nr; i++) { - map = pipe_buffer_map(pipe, nvfx->vtxbuf[i].buffer, - PIPE_TRANSFER_READ, - &vb_transfer[i]); + map = nvfx_buffer(nvfx->vtxbuf[i].buffer)->data; draw_set_mapped_vertex_buffer(nvfx->draw, i, map); } - if (idxbuf) { - map = pipe_buffer_map(pipe, idxbuf, - PIPE_TRANSFER_READ, - &ib_transfer); - draw_set_mapped_element_buffer(nvfx->draw, idxbuf_size, idxbuf_bias, map); - } else { - draw_set_mapped_element_buffer(nvfx->draw, 0, 0, NULL); - } + map = NULL; + if (info->indexed && nvfx->idxbuf.buffer) + map = nvfx_buffer(nvfx->idxbuf.buffer)->data; + draw_set_mapped_index_buffer(nvfx->draw, map); if (nvfx->constbuf[PIPE_SHADER_VERTEX]) { const unsigned nr = nvfx->constbuf_nr[PIPE_SHADER_VERTEX]; - map = pipe_buffer_map(pipe, - nvfx->constbuf[PIPE_SHADER_VERTEX], - PIPE_TRANSFER_READ, - &cb_transfer); + map = nvfx_buffer(nvfx->constbuf[PIPE_SHADER_VERTEX])->data; draw_set_mapped_constant_buffer(nvfx->draw, PIPE_SHADER_VERTEX, 0, map, nr); } - draw_arrays(nvfx->draw, mode, start, count); - - for (i = 0; i < nvfx->vtxbuf_nr; i++) - pipe_buffer_unmap(pipe, nvfx->vtxbuf[i].buffer, vb_transfer[i]); - - if (idxbuf) - pipe_buffer_unmap(pipe, idxbuf, ib_transfer); - - if (nvfx->constbuf[PIPE_SHADER_VERTEX]) - pipe_buffer_unmap(pipe, nvfx->constbuf[PIPE_SHADER_VERTEX], - cb_transfer); + draw_vbo(nvfx->draw, info); draw_flush(nvfx->draw); - pipe->flush(pipe, 0, NULL); } static INLINE void @@ -305,19 +272,19 @@ emit_attrib(struct nvfx_context *nvfx, unsigned hw, unsigned emit, void nvfx_vtxfmt_validate(struct nvfx_context *nvfx) { - struct nvfx_fragment_program *fp = nvfx->fragprog; + struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog; unsigned colour = 0, texcoords = 0, fog = 0, i; /* Determine needed fragprog inputs */ - for (i = 0; i < fp->info.num_inputs; i++) { - switch (fp->info.input_semantic_name[i]) { + for (i = 0; i < pfp->info.num_inputs; i++) { + switch (pfp->info.input_semantic_name[i]) { case TGSI_SEMANTIC_POSITION: break; case 
TGSI_SEMANTIC_COLOR: - colour |= (1 << fp->info.input_semantic_index[i]); + colour |= (1 << pfp->info.input_semantic_index[i]); break; case TGSI_SEMANTIC_GENERIC: - texcoords |= (1 << fp->info.input_semantic_index[i]); + texcoords |= (1 << pfp->info.input_semantic_index[i]); break; case TGSI_SEMANTIC_FOG: fog = 1; diff --git a/src/gallium/drivers/nvfx/nvfx_fragprog.c b/src/gallium/drivers/nvfx/nvfx_fragprog.c index ee41f03b9b8..275672a31fa 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragprog.c +++ b/src/gallium/drivers/nvfx/nvfx_fragprog.c @@ -2,25 +2,31 @@ #include "pipe/p_defines.h" #include "pipe/p_state.h" #include "util/u_inlines.h" +#include "util/u_debug.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_ureg.h" #include "nvfx_context.h" #include "nvfx_shader.h" +#include "nvfx_resource.h" #define MAX_CONSTS 128 #define MAX_IMM 32 + struct nvfx_fpc { + struct nvfx_pipe_fragment_program* pfp; struct nvfx_fragment_program *fp; - uint attrib_map[PIPE_MAX_SHADER_INPUTS]; - - unsigned r_temps; - unsigned r_temps_discard; - struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; - struct nvfx_sreg *r_temp; + unsigned max_temps; + unsigned long long r_temps; + unsigned long long r_temps_discard; + struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS]; + struct nvfx_reg *r_temp; + unsigned sprite_coord_temp; int num_regs; @@ -33,34 +39,40 @@ struct nvfx_fpc { } consts[MAX_CONSTS]; int nr_consts; - struct nvfx_sreg imm[MAX_IMM]; + struct nvfx_reg imm[MAX_IMM]; unsigned nr_imm; + + unsigned char generic_to_slot[256]; /* semantic idx for each input semantic */ + + struct util_dynarray if_stack; + //struct util_dynarray loop_stack; + struct util_dynarray label_relocs; }; -static INLINE struct nvfx_sreg +static INLINE struct nvfx_reg temp(struct nvfx_fpc *fpc) { - int idx = ffs(~fpc->r_temps) - 1; + int idx = __builtin_ctzll(~fpc->r_temps); - if (idx < 0) { + if (idx >= fpc->max_temps) { 
NOUVEAU_ERR("out of temps!!\n"); assert(0); - return nvfx_sr(NVFXSR_TEMP, 0); + return nvfx_reg(NVFXSR_TEMP, 0); } - fpc->r_temps |= (1 << idx); - fpc->r_temps_discard |= (1 << idx); - return nvfx_sr(NVFXSR_TEMP, idx); + fpc->r_temps |= (1ULL << idx); + fpc->r_temps_discard |= (1ULL << idx); + return nvfx_reg(NVFXSR_TEMP, idx); } static INLINE void release_temps(struct nvfx_fpc *fpc) { fpc->r_temps &= ~fpc->r_temps_discard; - fpc->r_temps_discard = 0; + fpc->r_temps_discard = 0ULL; } -static INLINE struct nvfx_sreg +static INLINE struct nvfx_reg constant(struct nvfx_fpc *fpc, int pipe, float vals[4]) { int idx; @@ -72,16 +84,9 @@ constant(struct nvfx_fpc *fpc, int pipe, float vals[4]) fpc->consts[idx].pipe = pipe; if (pipe == -1) memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); - return nvfx_sr(NVFXSR_CONST, idx); + return nvfx_reg(NVFXSR_CONST, idx); } -#define arith(cc,s,o,d,m,s0,s1,s2) \ - nvfx_fp_arith((cc), (s), NVFX_FP_OP_OPCODE_##o, \ - (d), (m), (s0), (s1), (s2)) -#define tex(cc,s,o,u,d,m,s0,s1,s2) \ - nvfx_fp_tex((cc), (s), NVFX_FP_OP_OPCODE_##o, (u), \ - (d), (m), (s0), none, none) - static void grow_insns(struct nvfx_fpc *fpc, int size) { @@ -92,23 +97,29 @@ grow_insns(struct nvfx_fpc *fpc, int size) } static void -emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src) +emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_src src) { struct nvfx_fragment_program *fp = fpc->fp; uint32_t *hw = &fp->insn[fpc->inst_offset]; uint32_t sr = 0; - switch (src.type) { + switch (src.reg.type) { case NVFXSR_INPUT: sr |= (NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT); - hw[0] |= (src.index << NVFX_FP_OP_INPUT_SRC_SHIFT); + hw[0] |= (src.reg.index << NVFX_FP_OP_INPUT_SRC_SHIFT); break; case NVFXSR_OUTPUT: sr |= NVFX_FP_REG_SRC_HALF; /* fall-through */ case NVFXSR_TEMP: sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT); - sr |= (src.index << NVFX_FP_REG_SRC_SHIFT); + sr |= (src.reg.index << NVFX_FP_REG_SRC_SHIFT); + break; + case NVFXSR_RELOCATED: 
+ sr |= (NVFX_FP_REG_TYPE_TEMP << NVFX_FP_REG_TYPE_SHIFT); + sr |= (fpc->sprite_coord_temp << NVFX_FP_REG_SRC_SHIFT); + //printf("adding relocation at %x for %x\n", fpc->inst_offset, src.index); + util_dynarray_append(&fpc->fp->slot_relocations[src.reg.index], unsigned, fpc->inst_offset + pos + 1); break; case NVFXSR_CONST: if (!fpc->have_const) { @@ -117,18 +128,18 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src) } hw = &fp->insn[fpc->inst_offset]; - if (fpc->consts[src.index].pipe >= 0) { + if (fpc->consts[src.reg.index].pipe >= 0) { struct nvfx_fragment_program_data *fpd; fp->consts = realloc(fp->consts, ++fp->nr_consts * sizeof(*fpd)); fpd = &fp->consts[fp->nr_consts - 1]; fpd->offset = fpc->inst_offset + 4; - fpd->index = fpc->consts[src.index].pipe; + fpd->index = fpc->consts[src.reg.index].pipe; memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); } else { memcpy(&fp->insn[fpc->inst_offset + 4], - fpc->consts[src.index].vals, + fpc->consts[src.reg.index].vals, sizeof(uint32_t) * 4); } @@ -156,7 +167,7 @@ emit_src(struct nvfx_fpc *fpc, int pos, struct nvfx_sreg src) } static void -emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst) +emit_dst(struct nvfx_fpc *fpc, struct nvfx_reg dst) { struct nvfx_fragment_program *fp = fpc->fp; uint32_t *hw = &fp->insn[fpc->inst_offset]; @@ -184,9 +195,7 @@ emit_dst(struct nvfx_fpc *fpc, struct nvfx_sreg dst) } static void -nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op, - struct nvfx_sreg dst, int mask, - struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2) +nvfx_fp_emit(struct nvfx_fpc *fpc, struct nvfx_insn insn) { struct nvfx_fragment_program *fp = fpc->fp; uint32_t *hw; @@ -197,68 +206,225 @@ nvfx_fp_arith(struct nvfx_fpc *fpc, int sat, int op, hw = &fp->insn[fpc->inst_offset]; memset(hw, 0, sizeof(uint32_t) * 4); - if (op == NVFX_FP_OP_OPCODE_KIL) + if (insn.op == NVFX_FP_OP_OPCODE_KIL) fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL; - hw[0] |= (op << NVFX_FP_OP_OPCODE_SHIFT); - hw[0] 
|= (mask << NVFX_FP_OP_OUTMASK_SHIFT); - hw[2] |= (dst.dst_scale << NVFX_FP_OP_DST_SCALE_SHIFT); + hw[0] |= (insn.op << NVFX_FP_OP_OPCODE_SHIFT); + hw[0] |= (insn.mask << NVFX_FP_OP_OUTMASK_SHIFT); + hw[2] |= (insn.scale << NVFX_FP_OP_DST_SCALE_SHIFT); - if (sat) + if (insn.sat) hw[0] |= NVFX_FP_OP_OUT_SAT; - if (dst.cc_update) + if (insn.cc_update) hw[0] |= NVFX_FP_OP_COND_WRITE_ENABLE; - hw[1] |= (dst.cc_test << NVFX_FP_OP_COND_SHIFT); - hw[1] |= ((dst.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) | - (dst.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) | - (dst.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) | - (dst.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT)); - - emit_dst(fpc, dst); - emit_src(fpc, 0, s0); - emit_src(fpc, 1, s1); - emit_src(fpc, 2, s2); + hw[1] |= (insn.cc_test << NVFX_FP_OP_COND_SHIFT); + hw[1] |= ((insn.cc_swz[0] << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (insn.cc_swz[1] << NVFX_FP_OP_COND_SWZ_Y_SHIFT) | + (insn.cc_swz[2] << NVFX_FP_OP_COND_SWZ_Z_SHIFT) | + (insn.cc_swz[3] << NVFX_FP_OP_COND_SWZ_W_SHIFT)); + + if(insn.unit >= 0) + { + hw[0] |= (insn.unit << NVFX_FP_OP_TEX_UNIT_SHIFT); + fp->samplers |= (1 << insn.unit); + } + + emit_dst(fpc, insn.dst); + emit_src(fpc, 0, insn.src[0]); + emit_src(fpc, 1, insn.src[1]); + emit_src(fpc, 2, insn.src[2]); } +#define arith(s,o,d,m,s0,s1,s2) \ + nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, -1, \ + (d), (m), (s0), (s1), (s2)) + +#define tex(s,o,u,d,m,s0,s1,s2) \ + nvfx_insn((s), NVFX_FP_OP_OPCODE_##o, (u), \ + (d), (m), (s0), none, none) + +/* IF src.x != 0, as TGSI specifies */ static void -nvfx_fp_tex(struct nvfx_fpc *fpc, int sat, int op, int unit, - struct nvfx_sreg dst, int mask, - struct nvfx_sreg s0, struct nvfx_sreg s1, struct nvfx_sreg s2) +nv40_fp_if(struct nvfx_fpc *fpc, struct nvfx_src src) { - struct nvfx_fragment_program *fp = fpc->fp; + const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0)); + struct nvfx_insn insn = arith(0, MOV, none.reg, NVFX_FP_MASK_X, src, none, none); + uint32_t *hw; + insn.cc_update 
= 1; + nvfx_fp_emit(fpc, insn); - nvfx_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE | + (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (0 << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (0 << NVFX_FP_OP_COND_SWZ_Y_SHIFT) | + (0 << NVFX_FP_OP_COND_SWZ_Z_SHIFT) | + (0 << NVFX_FP_OP_COND_SWZ_W_SHIFT) | + (NVFX_FP_OP_COND_NE << NVFX_FP_OP_COND_SHIFT); + hw[2] = 0; /* | NV40_FP_OP_OPCODE_IS_BRANCH | else_offset */ + hw[3] = 0; /* | endif_offset */ + util_dynarray_append(&fpc->if_stack, unsigned, fpc->inst_offset); +} + +/* IF src.x != 0, as TGSI specifies */ +static void +nv40_fp_cal(struct nvfx_fpc *fpc, unsigned target) +{ + struct nvfx_relocation reloc; + uint32_t *hw; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_CAL << NVFX_FP_OP_OPCODE_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */ + hw[3] = 0; + reloc.target = target; + reloc.location = fpc->inst_offset + 2; + util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc); +} + +static void +nv40_fp_ret(struct nvfx_fpc *fpc) +{ + uint32_t *hw; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? 
*/ + hw[0] = (NV40_FP_OP_BRA_OPCODE_RET << NVFX_FP_OP_OPCODE_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | call_offset */ + hw[3] = 0; +} - fp->insn[fpc->inst_offset] |= (unit << NVFX_FP_OP_TEX_UNIT_SHIFT); - fp->samplers |= (1 << unit); +static void +nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target) +{ + struct nvfx_relocation reloc; + uint32_t *hw; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_REP << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE | + (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_ALL_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | + (count << NV40_FP_OP_REP_COUNT1_SHIFT) | + (count << NV40_FP_OP_REP_COUNT2_SHIFT) | + (count << NV40_FP_OP_REP_COUNT3_SHIFT); + hw[3] = 0; /* | end_offset */ + reloc.target = target; + reloc.location = fpc->inst_offset + 3; + util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc); + //util_dynarray_append(&fpc->loop_stack, unsigned, target); } -static INLINE struct nvfx_sreg +/* warning: this only works forward, and probably only if not inside any IF */ +static void +nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target) +{ + struct nvfx_relocation reloc; + uint32_t *hw; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? 
*/ + hw[0] = (NV40_FP_OP_BRA_OPCODE_IF << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE | + (NVFX_FP_PRECISION_FP16 << NVFX_FP_OP_PRECISION_SHIFT); + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (NVFX_FP_OP_COND_FL << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; /* | else_offset */ + hw[3] = 0; /* | endif_offset */ + reloc.target = target; + reloc.location = fpc->inst_offset + 2; + util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc); + reloc.target = target; + reloc.location = fpc->inst_offset + 3; + util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc); +} + +static void +nv40_fp_brk(struct nvfx_fpc *fpc) +{ + uint32_t *hw; + fpc->inst_offset = fpc->fp->insn_len; + grow_insns(fpc, 4); + hw = &fpc->fp->insn[fpc->inst_offset]; + /* I really wonder why fp16 precision is used. Presumably the hardware ignores it? */ + hw[0] = (NV40_FP_OP_BRA_OPCODE_BRK << NVFX_FP_OP_OPCODE_SHIFT) | + NV40_FP_OP_OUT_NONE; + /* Use .xxxx swizzle so that we check only src[0].x*/ + hw[1] = (NVFX_SWZ_IDENTITY << NVFX_FP_OP_COND_SWZ_X_SHIFT) | + (NVFX_FP_OP_COND_TR << NVFX_FP_OP_COND_SHIFT); + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH; + hw[3] = 0; +} + +static INLINE struct nvfx_src tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) { - struct nvfx_sreg src = { 0 }; + struct nvfx_src src; switch (fsrc->Register.File) { case TGSI_FILE_INPUT: - src = nvfx_sr(NVFXSR_INPUT, - fpc->attrib_map[fsrc->Register.Index]); + if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_POSITION) { + assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0); + src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_POSITION); + } else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_COLOR) { + if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0) + src.reg = nvfx_reg(NVFXSR_INPUT, 
NVFX_FP_OP_INPUT_SRC_COL0); + else if(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 1) + src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_COL1); + else + assert(0); + } else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FOG) { + assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0); + src.reg = nvfx_reg(NVFXSR_INPUT, NVFX_FP_OP_INPUT_SRC_FOGC); + } else if(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_FACE) { + /* TODO: check this has the correct values */ + /* XXX: what do we do for nv30 here (assuming it lacks facing)?! */ + assert(fpc->pfp->info.input_semantic_index[fsrc->Register.Index] == 0); + src.reg = nvfx_reg(NVFXSR_INPUT, NV40_FP_OP_INPUT_SRC_FACING); + } else { + assert(fpc->pfp->info.input_semantic_name[fsrc->Register.Index] == TGSI_SEMANTIC_GENERIC); + src.reg = nvfx_reg(NVFXSR_RELOCATED, fpc->generic_to_slot[fpc->pfp->info.input_semantic_index[fsrc->Register.Index]]); + } break; case TGSI_FILE_CONSTANT: - src = constant(fpc, fsrc->Register.Index, NULL); + src.reg = constant(fpc, fsrc->Register.Index, NULL); break; case TGSI_FILE_IMMEDIATE: assert(fsrc->Register.Index < fpc->nr_imm); - src = fpc->imm[fsrc->Register.Index]; + src.reg = fpc->imm[fsrc->Register.Index]; break; case TGSI_FILE_TEMPORARY: - src = fpc->r_temp[fsrc->Register.Index]; + src.reg = fpc->r_temp[fsrc->Register.Index]; break; /* NV40 fragprog result regs are just temps, so this is simple */ case TGSI_FILE_OUTPUT: - src = fpc->r_result[fsrc->Register.Index]; + src.reg = fpc->r_result[fsrc->Register.Index]; break; default: NOUVEAU_ERR("bad src file\n"); + src.reg.index = 0; + src.reg.type = 0; break; } @@ -271,7 +437,7 @@ tgsi_src(struct nvfx_fpc *fpc, const struct tgsi_full_src_register *fsrc) return src; } -static INLINE struct nvfx_sreg +static INLINE struct nvfx_reg tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) { switch (fdst->Register.File) { case 
TGSI_FILE_OUTPUT: @@ -279,10 +445,10 @@ tgsi_dst(struct nvfx_fpc *fpc, const struct tgsi_full_dst_register *fdst) { case TGSI_FILE_TEMPORARY: return fpc->r_temp[fdst->Register.Index]; case TGSI_FILE_NULL: - return nvfx_sr(NVFXSR_NONE, 0); + return nvfx_reg(NVFXSR_NONE, 0); default: NOUVEAU_ERR("bad dst file %d\n", fdst->Register.File); - return nvfx_sr(NVFXSR_NONE, 0); + return nvfx_reg(NVFXSR_NONE, 0); } } @@ -302,8 +468,10 @@ static boolean nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, const struct tgsi_full_instruction *finst) { - const struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0); - struct nvfx_sreg src[3], dst, tmp; + const struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0)); + struct nvfx_insn insn; + struct nvfx_src src[3], tmp, tmp2; + struct nvfx_reg dst; int mask, sat, unit = 0; int ai = -1, ci = -1, ii = -1; int i; @@ -331,9 +499,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, ai = fsrc->Register.Index; src[i] = tgsi_src(fpc, fsrc); } else { - src[i] = temp(fpc); - arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL, - tgsi_src(fpc, fsrc), none, none); + src[i] = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none)); } break; case TGSI_FILE_CONSTANT: @@ -342,9 +509,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, ci = fsrc->Register.Index; src[i] = tgsi_src(fpc, fsrc); } else { - src[i] = temp(fpc); - arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL, - tgsi_src(fpc, fsrc), none, none); + src[i] = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none)); } break; case TGSI_FILE_IMMEDIATE: @@ -353,9 +519,8 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, ii = fsrc->Register.Index; src[i] = tgsi_src(fpc, fsrc); } else { - src[i] = temp(fpc); - arith(fpc, 0, MOV, src[i], NVFX_FP_MASK_ALL, - 
tgsi_src(fpc, fsrc), none, none); + src[i] = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, MOV, src[i].reg, NVFX_FP_MASK_ALL, tgsi_src(fpc, fsrc), none, none)); } break; case TGSI_FILE_TEMPORARY: @@ -378,277 +543,345 @@ nvfx_fragprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, switch (finst->Instruction.Opcode) { case TGSI_OPCODE_ABS: - arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); + nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, abs(src[0]), none, none)); break; case TGSI_OPCODE_ADD: - arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_CMP: - tmp = nvfx_sr(NVFXSR_NONE, 0); - tmp.cc_update = 1; - arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); - dst.cc_test = NVFX_COND_GE; - arith(fpc, sat, MOV, dst, mask, src[2], none, none); - dst.cc_test = NVFX_COND_LT; - arith(fpc, sat, MOV, dst, mask, src[1], none, none); + insn = arith(0, MOV, none.reg, mask, src[0], none, none); + insn.cc_update = 1; + nvfx_fp_emit(fpc, insn); + + insn = arith(sat, MOV, dst, mask, src[2], none, none); + insn.cc_test = NVFX_COND_GE; + nvfx_fp_emit(fpc, insn); + + insn = arith(sat, MOV, dst, mask, src[1], none, none); + insn.cc_test = NVFX_COND_LT; + nvfx_fp_emit(fpc, insn); break; case TGSI_OPCODE_COS: - arith(fpc, sat, COS, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, COS, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_DDX: if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) { - tmp = temp(fpc); - arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, - swz(src[0], Z, W, Z, W), none, none); - arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, - swz(tmp, X, Y, X, Y), none, none); - arith(fpc, sat, DDX, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], - none, none); - arith(fpc, 0, MOV, dst, mask, tmp, none, none); + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, 
swz(src[0], Z, W, Z, W), none, none)); + nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none)); + nvfx_fp_emit(fpc, arith(sat, DDX, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none)); + nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none)); } else { - arith(fpc, sat, DDX, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, DDX, dst, mask, src[0], none, none)); } break; case TGSI_OPCODE_DDY: if (mask & (NVFX_FP_MASK_Z | NVFX_FP_MASK_W)) { - tmp = temp(fpc); - arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, - swz(src[0], Z, W, Z, W), none, none); - arith(fpc, 0, MOV, tmp, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, - swz(tmp, X, Y, X, Y), none, none); - arith(fpc, sat, DDY, tmp, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], - none, none); - arith(fpc, 0, MOV, dst, mask, tmp, none, none); + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, swz(src[0], Z, W, Z, W), none, none)); + nvfx_fp_emit(fpc, arith(0, MOV, tmp.reg, NVFX_FP_MASK_Z | NVFX_FP_MASK_W, swz(tmp, X, Y, X, Y), none, none)); + nvfx_fp_emit(fpc, arith(sat, DDY, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], none, none)); + nvfx_fp_emit(fpc, arith(0, MOV, dst, mask, tmp, none, none)); } else { - arith(fpc, sat, DDY, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, DDY, dst, mask, src[0], none, none)); } break; + case TGSI_OPCODE_DP2: + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X | NVFX_FP_MASK_Y, src[0], src[1], none)); + nvfx_fp_emit(fpc, arith(0, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none)); + break; case TGSI_OPCODE_DP3: - arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, DP3, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_DP4: - arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, DP4, dst, mask, src[0], src[1], 
none)); break; case TGSI_OPCODE_DPH: - tmp = temp(fpc); - arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[1], none); - arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), - swz(src[1], W, W, W, W), none); + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[1], none)); + nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, swz(tmp, X, X, X, X), swz(src[1], W, W, W, W), none)); break; case TGSI_OPCODE_DST: - arith(fpc, sat, DST, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, DST, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_EX2: - arith(fpc, sat, EX2, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_FLR: - arith(fpc, sat, FLR, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, FLR, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_FRC: - arith(fpc, sat, FRC, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, FRC, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_KILP: - arith(fpc, 0, KIL, none, 0, none, none, none); + nvfx_fp_emit(fpc, arith(0, KIL, none.reg, 0, none, none, none)); break; case TGSI_OPCODE_KIL: - dst = nvfx_sr(NVFXSR_NONE, 0); - dst.cc_update = 1; - arith(fpc, 0, MOV, dst, NVFX_FP_MASK_ALL, src[0], none, none); - dst.cc_update = 0; dst.cc_test = NVFX_COND_LT; - arith(fpc, 0, KIL, dst, 0, none, none, none); + insn = arith(0, MOV, none.reg, NVFX_FP_MASK_ALL, src[0], none, none); + insn.cc_update = 1; + nvfx_fp_emit(fpc, insn); + + insn = arith(0, KIL, none.reg, 0, none, none, none); + insn.cc_test = NVFX_COND_LT; + nvfx_fp_emit(fpc, insn); break; case TGSI_OPCODE_LG2: - arith(fpc, sat, LG2, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, LG2, dst, mask, src[0], none, none)); break; // case TGSI_OPCODE_LIT: case TGSI_OPCODE_LRP: if(!nvfx->is_nv4x) - arith(fpc, sat, LRP_NV30, dst, mask, src[0], src[1], src[2]); + nvfx_fp_emit(fpc, arith(sat, LRP_NV30, 
dst, mask, src[0], src[1], src[2])); else { - tmp = temp(fpc); - arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]); - arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp); + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2])); + nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], tmp)); } break; case TGSI_OPCODE_MAD: - arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); + nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, src[0], src[1], src[2])); break; case TGSI_OPCODE_MAX: - arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, MAX, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_MIN: - arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, MIN, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_MOV: - arith(fpc, sat, MOV, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_MUL: - arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, MUL, dst, mask, src[0], src[1], none)); + break; + case TGSI_OPCODE_NOP: break; case TGSI_OPCODE_POW: if(!nvfx->is_nv4x) - arith(fpc, sat, POW_NV30, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, POW_NV30, dst, mask, src[0], src[1], none)); else { - tmp = temp(fpc); - arith(fpc, 0, LG2, tmp, NVFX_FP_MASK_X, - swz(src[0], X, X, X, X), none, none); - arith(fpc, 0, MUL, tmp, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), - swz(src[1], X, X, X, X), none); - arith(fpc, sat, EX2, dst, mask, - swz(tmp, X, X, X, X), none, none); + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none)); + nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, NVFX_FP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none)); + nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, swz(tmp, X, X, X, X), none, none)); } break; case TGSI_OPCODE_RCP: - 
arith(fpc, sat, RCP, dst, mask, src[0], none, none); - break; - case TGSI_OPCODE_RET: - assert(0); + nvfx_fp_emit(fpc, arith(sat, RCP, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_RFL: if(!nvfx->is_nv4x) - arith(fpc, 0, RFL_NV30, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(0, RFL_NV30, dst, mask, src[0], src[1], none)); else { - tmp = temp(fpc); - arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_X, src[0], src[0], none); - arith(fpc, 0, DP3, tmp, NVFX_FP_MASK_Y, src[0], src[1], none); - arith(fpc, 0, DIV, scale(tmp, 2X), NVFX_FP_MASK_Z, - swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none); - arith(fpc, sat, MAD, dst, mask, - swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])); + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_X, src[0], src[0], none)); + nvfx_fp_emit(fpc, arith(0, DP3, tmp.reg, NVFX_FP_MASK_Y, src[0], src[1], none)); + insn = arith(0, DIV, tmp.reg, NVFX_FP_MASK_Z, swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none); + insn.scale = NVFX_FP_OP_DST_SCALE_2X; + nvfx_fp_emit(fpc, insn); + nvfx_fp_emit(fpc, arith(sat, MAD, dst, mask, swz(tmp, Z, Z, Z, Z), src[0], neg(src[1]))); } break; case TGSI_OPCODE_RSQ: if(!nvfx->is_nv4x) - arith(fpc, sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none); + nvfx_fp_emit(fpc, arith(sat, RSQ_NV30, dst, mask, abs(swz(src[0], X, X, X, X)), none, none)); else { - tmp = temp(fpc); - arith(fpc, 0, LG2, scale(tmp, INV_2X), NVFX_FP_MASK_X, - abs(swz(src[0], X, X, X, X)), none, none); - arith(fpc, sat, EX2, dst, mask, - neg(swz(tmp, X, X, X, X)), none, none); + tmp = nvfx_src(temp(fpc)); + insn = arith(0, LG2, tmp.reg, NVFX_FP_MASK_X, abs(swz(src[0], X, X, X, X)), none, none); + insn.scale = NVFX_FP_OP_DST_SCALE_INV_2X; + nvfx_fp_emit(fpc, insn); + nvfx_fp_emit(fpc, arith(sat, EX2, dst, mask, neg(swz(tmp, X, X, X, X)), none, none)); } break; case TGSI_OPCODE_SCS: /* avoid overwriting the source */ if(src[0].swz[NVFX_SWZ_X] != NVFX_SWZ_X) { - if (mask & NVFX_FP_MASK_X) { - 
arith(fpc, sat, COS, dst, NVFX_FP_MASK_X, - swz(src[0], X, X, X, X), none, none); - } - if (mask & NVFX_FP_MASK_Y) { - arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y, - swz(src[0], X, X, X, X), none, none); - } + if (mask & NVFX_FP_MASK_X) + nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none)); + if (mask & NVFX_FP_MASK_Y) + nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none)); } else { - if (mask & NVFX_FP_MASK_Y) { - arith(fpc, sat, SIN, dst, NVFX_FP_MASK_Y, - swz(src[0], X, X, X, X), none, none); - } - if (mask & NVFX_FP_MASK_X) { - arith(fpc, sat, COS, dst, NVFX_FP_MASK_X, - swz(src[0], X, X, X, X), none, none); - } + if (mask & NVFX_FP_MASK_Y) + nvfx_fp_emit(fpc, arith(sat, SIN, dst, NVFX_FP_MASK_Y, swz(src[0], X, X, X, X), none, none)); + if (mask & NVFX_FP_MASK_X) + nvfx_fp_emit(fpc, arith(sat, COS, dst, NVFX_FP_MASK_X, swz(src[0], X, X, X, X), none, none)); } break; case TGSI_OPCODE_SEQ: - arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, SEQ, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SFL: - arith(fpc, sat, SFL, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, SFL, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SGE: - arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, SGE, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SGT: - arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, SGT, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SIN: - arith(fpc, sat, SIN, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, arith(sat, SIN, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_SLE: - arith(fpc, sat, SLE, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, SLE, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SLT: - arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, 
arith(sat, SLT, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SNE: - arith(fpc, sat, SNE, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, SNE, dst, mask, src[0], src[1], none)); + break; + case TGSI_OPCODE_SSG: + tmp = nvfx_src(temp(fpc)); + tmp2 = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, SGT, tmp.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); + nvfx_fp_emit(fpc, arith(0, SLT, tmp.reg, mask, src[0], nvfx_src(nvfx_reg(NVFXSR_CONST, 0)), none)); + nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, tmp, neg(tmp2), none)); break; case TGSI_OPCODE_STR: - arith(fpc, sat, STR, dst, mask, src[0], src[1], none); + nvfx_fp_emit(fpc, arith(sat, STR, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SUB: - arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); + nvfx_fp_emit(fpc, arith(sat, ADD, dst, mask, src[0], neg(src[1]), none)); break; case TGSI_OPCODE_TEX: - tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); + nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none)); break; - case TGSI_OPCODE_TXB: - tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); + case TGSI_OPCODE_TRUNC: + tmp = nvfx_src(temp(fpc)); + insn = arith(0, MOV, none.reg, mask, src[0], none, none); + insn.cc_update = 1; + nvfx_fp_emit(fpc, insn); + + nvfx_fp_emit(fpc, arith(0, FLR, tmp.reg, mask, abs(src[0]), none, none)); + nvfx_fp_emit(fpc, arith(sat, MOV, dst, mask, tmp, none, none)); + + insn = arith(sat, MOV, dst, mask, neg(tmp), none, none); + insn.cc_test = NVFX_COND_LT; + nvfx_fp_emit(fpc, insn); + break; + case TGSI_OPCODE_TXB: + nvfx_fp_emit(fpc, tex(sat, TXB, unit, dst, mask, src[0], none, none)); + break; + case TGSI_OPCODE_TXL: + if(nvfx->is_nv4x) + nvfx_fp_emit(fpc, tex(sat, TXL_NV40, unit, dst, mask, src[0], none, none)); + else /* unsupported on nv30, use TEX and hope they like it */ + nvfx_fp_emit(fpc, tex(sat, TEX, unit, dst, mask, src[0], none, none)); + break; + case TGSI_OPCODE_TXP: + 
nvfx_fp_emit(fpc, tex(sat, TXP, unit, dst, mask, src[0], none, none)); + break; + case TGSI_OPCODE_XPD: + tmp = nvfx_src(temp(fpc)); + nvfx_fp_emit(fpc, arith(0, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none)); + nvfx_fp_emit(fpc, arith(sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp))); break; - case TGSI_OPCODE_TXP: - tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); + + case TGSI_OPCODE_IF: + // MOVRC0 R31 (TR0.xyzw), R<src>: + // IF (NE.xxxx) ELSE <else> END <end> + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_if(fpc, src[0]); break; - case TGSI_OPCODE_XPD: - tmp = temp(fpc); - arith(fpc, 0, MUL, tmp, mask, - swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); - arith(fpc, sat, MAD, dst, (mask & ~NVFX_FP_MASK_W), - swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), - neg(tmp)); + + case TGSI_OPCODE_ELSE: + { + uint32_t *hw; + if(!nvfx->is_nv4x) + goto nv3x_cflow; + assert(util_dynarray_contains(&fpc->if_stack, unsigned)); + hw = &fpc->fp->insn[util_dynarray_top(&fpc->if_stack, unsigned)]; + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len; break; - default: - NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); - return FALSE; } - release_temps(fpc); - return TRUE; -} + case TGSI_OPCODE_ENDIF: + { + uint32_t *hw; + if(!nvfx->is_nv4x) + goto nv3x_cflow; + assert(util_dynarray_contains(&fpc->if_stack, unsigned)); + hw = &fpc->fp->insn[util_dynarray_pop(&fpc->if_stack, unsigned)]; + if(!hw[2]) + hw[2] = NV40_FP_OP_OPCODE_IS_BRANCH | fpc->fp->insn_len; + hw[3] = fpc->fp->insn_len; + break; + } -static boolean -nvfx_fragprog_parse_decl_attrib(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, - const struct tgsi_full_declaration *fdec) -{ - int hw; + case TGSI_OPCODE_BRA: + /* This can in limited cases be implemented with an IF with the else and endif labels pointing to the target */ + /* no state tracker uses this, so don't implement this for now */ + 
assert(0); + nv40_fp_bra(fpc, finst->Label.Label); + break; - switch (fdec->Semantic.Name) { - case TGSI_SEMANTIC_POSITION: - hw = NVFX_FP_OP_INPUT_SRC_POSITION; + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + /* nothing to do here */ break; - case TGSI_SEMANTIC_COLOR: - if (fdec->Semantic.Index == 0) { - hw = NVFX_FP_OP_INPUT_SRC_COL0; - } else - if (fdec->Semantic.Index == 1) { - hw = NVFX_FP_OP_INPUT_SRC_COL1; - } else { - NOUVEAU_ERR("bad colour semantic index\n"); - return FALSE; - } + + case TGSI_OPCODE_CAL: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_cal(fpc, finst->Label.Label); break; - case TGSI_SEMANTIC_FOG: - hw = NVFX_FP_OP_INPUT_SRC_FOGC; + + case TGSI_OPCODE_RET: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_ret(fpc); break; - case TGSI_SEMANTIC_GENERIC: - if (fdec->Semantic.Index <= 7) { - hw = NVFX_FP_OP_INPUT_SRC_TC(fdec->Semantic. - Index); - } else { - NOUVEAU_ERR("bad generic semantic index\n"); - return FALSE; + + case TGSI_OPCODE_BGNLOOP: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + /* TODO: we should support using two nested REPs to allow a > 255 iteration count */ + nv40_fp_rep(fpc, 255, finst->Label.Label); + break; + + case TGSI_OPCODE_ENDLOOP: + break; + + case TGSI_OPCODE_BRK: + if(!nvfx->is_nv4x) + goto nv3x_cflow; + nv40_fp_brk(fpc); + break; + + case TGSI_OPCODE_CONT: + { + static int warned = 0; + if(!warned) { + NOUVEAU_ERR("Sorry, the continue keyword is not implemented: ignoring it.\n"); + warned = 1; } break; - default: - NOUVEAU_ERR("bad input semantic\n"); + } + + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); return FALSE; } - fpc->attrib_map[fdec->Range.First] = hw; +out: + release_temps(fpc); return TRUE; +nv3x_cflow: + { + static int warned = 0; + if(!warned) { + NOUVEAU_ERR( + "Sorry, control flow instructions are not supported in hardware on nv3x: ignoring them\n" + "If rendering is incorrect, try to disable GLSL support in the application.\n"); + warned = 1; + } + } + goto out; 
} static boolean @@ -680,8 +913,8 @@ nvfx_fragprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_fpc *fpc, return FALSE; } - fpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw); - fpc->r_temps |= (1 << hw); + fpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw); + fpc->r_temps |= (1ULL << hw); return TRUE; } @@ -690,8 +923,22 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) { struct tgsi_parse_context p; int high_temp = -1, i; + struct util_semantic_set set; + float const0v[4] = {0, 0, 0, 0}; + struct nvfx_reg const0; + + fpc->fp->num_slots = util_semantic_set_from_program_file(&set, fpc->pfp->pipe.tokens, TGSI_FILE_INPUT); + if(fpc->fp->num_slots > 8) + return FALSE; + util_semantic_layout_from_set(fpc->fp->slot_to_generic, &set, 0, 8); + util_semantic_table_from_layout(fpc->generic_to_slot, fpc->fp->slot_to_generic, 0, 8); - tgsi_parse_init(&p, fpc->fp->pipe.tokens); + memset(fpc->fp->slot_to_fp_input, 0xff, sizeof(fpc->fp->slot_to_fp_input)); + + const0 = constant(fpc, -1, const0v); + assert(const0.index == 0); + + tgsi_parse_init(&p, fpc->pfp->pipe.tokens); while (!tgsi_parse_end_of_tokens(&p)) { const union tgsi_full_token *tok = &p.FullToken; @@ -702,10 +949,6 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) const struct tgsi_full_declaration *fdec; fdec = &p.FullToken.FullDeclaration; switch (fdec->Declaration.File) { - case TGSI_FILE_INPUT: - if (!nvfx_fragprog_parse_decl_attrib(nvfx, fpc, fdec)) - goto out_err; - break; case TGSI_FILE_OUTPUT: if (!nvfx_fragprog_parse_decl_output(nvfx, fpc, fdec)) goto out_err; @@ -744,40 +987,66 @@ nvfx_fragprog_prepare(struct nvfx_context* nvfx, struct nvfx_fpc *fpc) tgsi_parse_free(&p); if (++high_temp) { - fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg)); + fpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg)); for (i = 0; i < high_temp; i++) fpc->r_temp[i] = temp(fpc); - fpc->r_temps_discard = 0; + fpc->r_temps_discard = 0ULL; } return TRUE; out_err: - if 
(fpc->r_temp) + if (fpc->r_temp) { FREE(fpc->r_temp); + fpc->r_temp = NULL; + } tgsi_parse_free(&p); return FALSE; } -static void +DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_fp, "NVFX_DUMP_FP", FALSE) + +static struct nvfx_fragment_program* nvfx_fragprog_translate(struct nvfx_context *nvfx, - struct nvfx_fragment_program *fp) + struct nvfx_pipe_fragment_program *pfp, + boolean emulate_sprite_flipping) { struct tgsi_parse_context parse; struct nvfx_fpc *fpc = NULL; + struct util_dynarray insns; + struct nvfx_fragment_program* fp = NULL; + const int min_size = 4096; - fpc = CALLOC(1, sizeof(struct nvfx_fpc)); + fp = CALLOC_STRUCT(nvfx_fragment_program); + if(!fp) + goto out_err; + + fpc = CALLOC_STRUCT(nvfx_fpc); if (!fpc) - return; + goto out_err; + + fpc->max_temps = nvfx->is_nv4x ? 48 : 32; + fpc->pfp = pfp; fpc->fp = fp; fpc->num_regs = 2; - if (!nvfx_fragprog_prepare(nvfx, fpc)) { - FREE(fpc); - return; - } + if (!nvfx_fragprog_prepare(nvfx, fpc)) + goto out_err; - tgsi_parse_init(&parse, fp->pipe.tokens); + tgsi_parse_init(&parse, pfp->pipe.tokens); + util_dynarray_init(&insns); + + if(emulate_sprite_flipping) + { + struct nvfx_reg reg = temp(fpc); + struct nvfx_src sprite_input = nvfx_src(nvfx_reg(NVFXSR_RELOCATED, fp->num_slots)); + float v[4] = {1, -1, 0, 0}; + struct nvfx_src imm = nvfx_src(constant(fpc, -1, v)); + + fpc->sprite_coord_temp = reg.index; + fpc->r_temps_discard = 0ULL; + nvfx_fp_emit(fpc, arith(0, MAD, reg, NVFX_FP_MASK_ALL, sprite_input, swz(imm, X, Y, X, X), swz(imm, Z, X, Z, Z))); + } while (!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); @@ -787,6 +1056,7 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, { const struct tgsi_full_instruction *finst; + util_dynarray_append(&insns, unsigned, fp->insn_len); finst = &parse.FullToken.FullInstruction; if (!nvfx_fragprog_parse_instruction(nvfx, fpc, finst)) goto out_err; @@ -796,6 +1066,14 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, break; } } + util_dynarray_append(&insns, 
unsigned, fp->insn_len); + + for(unsigned i = 0; i < fpc->label_relocs.size; i += sizeof(struct nvfx_relocation)) + { + struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)fpc->label_relocs.data + i); + fp->insn[label_reloc->location] |= ((unsigned*)insns.data)[label_reloc->target]; + } + util_dynarray_fini(&insns); if(!nvfx->is_nv4x) fp->fp_control |= (fpc->num_regs-1)/2; @@ -804,9 +1082,9 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, /* Terminate final instruction */ if(fp->insn) - fp->insn[fpc->inst_offset] |= 0x00000001; + fp->insn[fpc->inst_offset] |= 0x00000001; - /* Append NOP + END instruction, may or may not be necessary. */ + /* Append NOP + END instruction for branches to the end of the program */ fpc->inst_offset = fp->insn_len; grow_insns(fpc, 4); fp->insn[fpc->inst_offset + 0] = 0x00000001; @@ -814,12 +1092,48 @@ nvfx_fragprog_translate(struct nvfx_context *nvfx, fp->insn[fpc->inst_offset + 2] = 0x00000000; fp->insn[fpc->inst_offset + 3] = 0x00000000; - fp->translated = TRUE; -out_err: + if(debug_get_option_nvfx_dump_fp()) + { + debug_printf("\n"); + tgsi_dump(pfp->pipe.tokens, 0); + + debug_printf("\n%s fragment program:\n", nvfx->is_nv4x ? 
"nv4x" : "nv3x"); + for (unsigned i = 0; i < fp->insn_len; i += 4) + debug_printf("%3u: %08x %08x %08x %08x\n", i >> 2, fp->insn[i], fp->insn[i + 1], fp->insn[i + 2], fp->insn[i + 3]); + debug_printf("\n"); + } + + fp->prog_size = (fp->insn_len * 4 + 63) & ~63; + + if(fp->prog_size >= min_size) + fp->progs_per_bo = 1; + else + fp->progs_per_bo = min_size / fp->prog_size; + fp->bo_prog_idx = fp->progs_per_bo - 1; + +out: tgsi_parse_free(&parse); - if (fpc->r_temp) - FREE(fpc->r_temp); - FREE(fpc); + if(fpc) + { + if (fpc->r_temp) + FREE(fpc->r_temp); + util_dynarray_fini(&fpc->if_stack); + util_dynarray_fini(&fpc->label_relocs); + //util_dynarray_fini(&fpc->loop_stack); + FREE(fpc); + } + return fp; + +out_err: + _debug_printf("Error: failed to compile this fragment program:\n"); + tgsi_dump(pfp->pipe.tokens, 0); + + if(fp) + { + FREE(fp); + fp = NULL; + } + goto out; } static inline void @@ -836,53 +1150,189 @@ nvfx_fp_memcpy(void* dst, const void* src, size_t len) #endif } +/* The hardware only supports immediate constants inside the fragment program, + * and at least on nv30 doesn't support an indirect linkage table. + * + * Hence, we need to patch the fragment program itself both to update constants + * and update linkage. + * + * Using a single fragment program would entail unacceptable stalls if the GPU is + * already rendering with that fragment program. + * Thus, we instead use a "rotating queue" of buffer objects, each of which is + * packed with multiple versions of the same program. + * + * Whenever we need to patch something, we move to the next program and + * patch it. If all buffer objects are in use by the GPU, we allocate another one, + * expanding the queue. + * + * As an additional optimization, we record when all the programs have the + * current input slot configuration, and at that point we stop patching inputs. + * This happens, for instance, if a given fragment program is always used with + * the same vertex program (i.e. 
always with GLSL), or if the layouts match + * enough (non-GLSL). + * + * Note that instead of using multiple programs, we could push commands + * on the FIFO to patch a single program: it's not fully clear which option is + * faster, but my guess is that the current way is faster. + * + * We also track the previous slot assignments for each version and don't + * patch if they are the same (this could perhaps be removed). + */ + void nvfx_fragprog_validate(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; - struct nvfx_fragment_program *fp = nvfx->fragprog; - int update = 0; - - if (!fp->translated) + struct nvfx_pipe_fragment_program *pfp = nvfx->fragprog; + struct nvfx_vertex_program* vp; + /* Gallium always puts the point coord in GENERIC[0] + * TODO: this is wrong, Gallium needs to be fixed + */ + unsigned sprite_coord_enable = nvfx->rasterizer->pipe.point_quad_rasterization * (nvfx->rasterizer->pipe.sprite_coord_enable | 1); + + boolean emulate_sprite_flipping = sprite_coord_enable && nvfx->rasterizer->pipe.sprite_coord_mode; + unsigned key = emulate_sprite_flipping; + struct nvfx_fragment_program* fp; + + fp = pfp->fps[key]; + if (!fp) { - const int min_size = 4096; + fp = nvfx_fragprog_translate(nvfx, pfp, emulate_sprite_flipping); - nvfx_fragprog_translate(nvfx, fp); - if (!fp->translated) { - static unsigned dummy[8] = {1, 0, 0, 0, 1, 0, 0, 0}; - static int warned = 0; - if(!warned) + if(!fp) + { + if(!nvfx->dummy_fs) { - fprintf(stderr, "nvfx: failed to translate fragment program!\n"); - warned = 1; + struct ureg_program *ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT ); + if (ureg) + { + ureg_END( ureg ); + nvfx->dummy_fs = ureg_create_shader_and_destroy( ureg, &nvfx->pipe ); + } + + if(!nvfx->dummy_fs) + { + _debug_printf("Error: unable to create a dummy fragment shader: aborting."); + abort(); + } } - /* use dummy program: we cannot fail here */ - fp->translated = TRUE; - fp->insn = malloc(sizeof(dummy)); - 
memcpy(fp->insn, dummy, sizeof(dummy)); - fp->insn_len = sizeof(dummy) / sizeof(dummy[0]); + fp = nvfx_fragprog_translate(nvfx, nvfx->dummy_fs, FALSE); + emulate_sprite_flipping = FALSE; + + if(!fp) + { + _debug_printf("Error: unable to compile even a dummy fragment shader: aborting."); + abort(); + } } - update = TRUE; - fp->prog_size = (fp->insn_len * 4 + 63) & ~63; + pfp->fps[key] = fp; + } + + vp = nvfx->render_mode == HW ? nvfx->vertprog : nvfx->swtnl.vertprog; - if(fp->prog_size >= min_size) - fp->progs_per_bo = 1; + if (fp->last_vp_id != vp->id || fp->last_sprite_coord_enable != sprite_coord_enable) { + int sprite_real_input = -1; + int sprite_reloc_input; + unsigned i; + fp->last_vp_id = vp->id; + fp->last_sprite_coord_enable = sprite_coord_enable; + + if(sprite_coord_enable) + { + sprite_real_input = vp->sprite_fp_input; + if(sprite_real_input < 0) + { + unsigned used_texcoords = 0; + for(unsigned i = 0; i < fp->num_slots; ++i) { + unsigned generic = fp->slot_to_generic[i]; + if(!((1 << generic) & sprite_coord_enable)) + { + unsigned char slot_mask = vp->generic_to_fp_input[generic]; + if(slot_mask >= 0xf0) + used_texcoords |= 1 << ((slot_mask & 0xf) - NVFX_FP_OP_INPUT_SRC_TC0); + } + } + + sprite_real_input = NVFX_FP_OP_INPUT_SRC_TC(__builtin_ctz(~used_texcoords)); + } + + fp->point_sprite_control |= (1 << (sprite_real_input - NVFX_FP_OP_INPUT_SRC_TC0 + 8)); + } else - fp->progs_per_bo = min_size / fp->prog_size; - fp->bo_prog_idx = fp->progs_per_bo - 1; - } + fp->point_sprite_control = 0; - /* we must update constants even on "just" fragprog changes, because - we don't check whether the current constant buffer matches the latest - one bound to this fragment program */ - if (nvfx->dirty & (NVFX_NEW_FRAGCONST | NVFX_NEW_FRAGPROG)) - update = TRUE; + if(emulate_sprite_flipping) + sprite_reloc_input = 0; + else + sprite_reloc_input = sprite_real_input; - if(update) { + for(i = 0; i < fp->num_slots; ++i) { + unsigned generic = fp->slot_to_generic[i]; + if((1 
<< generic) & sprite_coord_enable) + { + if(fp->slot_to_fp_input[i] != sprite_reloc_input) + goto update_slots; + } + else + { + unsigned char slot_mask = vp->generic_to_fp_input[generic]; + if((slot_mask >> 4) & (slot_mask ^ fp->slot_to_fp_input[i])) + goto update_slots; + } + } + + if(emulate_sprite_flipping) + { + if(fp->slot_to_fp_input[fp->num_slots] != sprite_real_input) + goto update_slots; + } + + if(0) + { +update_slots: + /* optimization: we start updating from the slot we found the first difference in */ + for(; i < fp->num_slots; ++i) + { + unsigned generic = fp->slot_to_generic[i]; + if((1 << generic) & sprite_coord_enable) + fp->slot_to_fp_input[i] = sprite_reloc_input; + else + fp->slot_to_fp_input[i] = vp->generic_to_fp_input[generic] & 0xf; + } + + fp->slot_to_fp_input[fp->num_slots] = sprite_real_input; + + if(nvfx->is_nv4x) + { + fp->or = 0; + for(i = 0; i <= fp->num_slots; ++i) { + unsigned fp_input = fp->slot_to_fp_input[i]; + if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(8)) + fp->or |= (1 << 12); + else if(fp_input == NVFX_FP_OP_INPUT_SRC_TC(9)) + fp->or |= (1 << 13); + else if(fp_input >= NVFX_FP_OP_INPUT_SRC_TC(0) && fp_input <= NVFX_FP_OP_INPUT_SRC_TC(7)) + fp->or |= (1 << (fp_input - NVFX_FP_OP_INPUT_SRC_TC0 + 14)); + } + } + + fp->progs_left_with_obsolete_slot_assignments = fp->progs; + goto update; + } + } + + /* We must update constants even on "just" fragprog changes, because + * we don't check whether the current constant buffer matches the latest + * one bound to this fragment program. + * Doing such a check would likely be a pessimization. 
+ */ + if ((nvfx->hw_fragprog != fp) || (nvfx->dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST))) { int offset; + uint32_t* fpmap; +update: ++fp->bo_prog_idx; if(fp->bo_prog_idx >= fp->progs_per_bo) { @@ -892,10 +1342,12 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } else { - struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + fp->prog_size * fp->progs_per_bo, 16); - char *map, *buf; - int i; + struct nvfx_fragment_program_bo* fpbo = os_malloc_aligned(sizeof(struct nvfx_fragment_program) + (fp->prog_size + 8) * fp->progs_per_bo, 16); + uint8_t* map; + uint8_t* buf; + fpbo->slots = (unsigned char*)&fpbo->insn[(fp->prog_size) * fp->progs_per_bo]; + memset(fpbo->slots, 0, 8 * fp->progs_per_bo); if(fp->fpbo) { fpbo->next = fp->fpbo->next; @@ -905,12 +1357,14 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) fpbo->next = fpbo; fp->fpbo = fpbo; fpbo->bo = 0; + fp->progs += fp->progs_per_bo; + fp->progs_left_with_obsolete_slot_assignments += fp->progs_per_bo; nouveau_bo_new(nvfx->screen->base.device, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 64, fp->prog_size * fp->progs_per_bo, &fpbo->bo); nouveau_bo_map(fpbo->bo, NOUVEAU_BO_NOSYNC); map = fpbo->bo->map; - buf = fpbo->insn; - for(i = 0; i < fp->progs_per_bo; ++i) + buf = (uint8_t*)fpbo->insn; + for(unsigned i = 0; i < fp->progs_per_bo; ++i) { memcpy(buf, fp->insn, fp->insn_len * 4); nvfx_fp_memcpy(map, fp->insn, fp->insn_len * 4); @@ -922,13 +1376,11 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) } offset = fp->bo_prog_idx * fp->prog_size; + fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); if(nvfx->constbuf[PIPE_SHADER_FRAGMENT]) { struct pipe_resource* constbuf = nvfx->constbuf[PIPE_SHADER_FRAGMENT]; - // TODO: avoid using transfers, just directly the buffer - struct pipe_transfer* transfer; - // TODO: does this check make any sense, or should we do this unconditionally? 
- uint32_t* map = pipe_buffer_map(&nvfx->pipe, constbuf, PIPE_TRANSFER_READ, &transfer); + uint32_t* map = (uint32_t*)nvfx_buffer(constbuf)->data; uint32_t* fpmap = (uint32_t*)((char*)fp->fpbo->bo->map + offset); uint32_t* buf = (uint32_t*)((char*)fp->fpbo->insn + offset); int i; @@ -942,12 +1394,61 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) nvfx_fp_memcpy(&fpmap[off], &map[idx], 4 * sizeof(uint32_t)); } } - pipe_buffer_unmap(&nvfx->pipe, constbuf, transfer); } - } - if(update || (nvfx->dirty & NVFX_NEW_FRAGPROG)) { - int offset = fp->bo_prog_idx * fp->prog_size; + /* we only do this if we aren't sure that all program versions have the + * current slot assignments, otherwise we just update constants for speed + */ + if(fp->progs_left_with_obsolete_slot_assignments) { + unsigned char* fpbo_slots = &fp->fpbo->slots[fp->bo_prog_idx * 8]; + /* also relocate sprite coord slot, if any */ + for(unsigned i = 0; i <= fp->num_slots; ++i) { + unsigned value = fp->slot_to_fp_input[i];; + if(value != fpbo_slots[i]) { + unsigned* p; + unsigned* begin = (unsigned*)fp->slot_relocations[i].data; + unsigned* end = (unsigned*)((char*)fp->slot_relocations[i].data + fp->slot_relocations[i].size); + //printf("fp %p reloc slot %u/%u: %u -> %u\n", fp, i, fp->num_slots, fpbo_slots[i], value); + if(value == 0) + { + /* was relocated to an input, switch type to temporary */ + for(p = begin; p != end; ++p) { + unsigned off = *p; + unsigned dw = fp->insn[off]; + dw &=~ NVFX_FP_REG_TYPE_MASK; + //printf("reloc_tmp at %x\n", off); + nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw)); + } + } else { + if(!fpbo_slots[i]) + { + /* was relocated to a temporary, switch type to input */ + for(p= begin; p != end; ++p) { + unsigned off = *p; + unsigned dw = fp->insn[off]; + //printf("reloc_in at %x\n", off); + dw |= NVFX_FP_REG_TYPE_INPUT << NVFX_FP_REG_TYPE_SHIFT; + nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw)); + } + } + + /* set the correct input index */ + for(p = begin; p != end; ++p) { + 
unsigned off = *p & ~3; + unsigned dw = fp->insn[off]; + //printf("reloc&~3 at %x\n", off); + dw = (dw & ~NVFX_FP_OP_INPUT_SRC_MASK) | (value << NVFX_FP_OP_INPUT_SRC_SHIFT); + nvfx_fp_memcpy(&fpmap[off], &dw, sizeof(dw)); + } + } + fpbo_slots[i] = value; + } + } + --fp->progs_left_with_obsolete_slot_assignments; + } + + nvfx->hw_fragprog = fp; + MARK_RING(chan, 8, 1); OUT_RING(chan, RING_3D(NV34TCL_FP_ACTIVE_PROGRAM, 1)); OUT_RELOC(chan, fp->fpbo->bo, offset, NOUVEAU_BO_VRAM | @@ -963,13 +1464,26 @@ nvfx_fragprog_validate(struct nvfx_context *nvfx) OUT_RING(chan, fp->samplers); } } + + { + unsigned pointsprite_control = fp->point_sprite_control | nvfx->rasterizer->pipe.point_quad_rasterization; + if(pointsprite_control != nvfx->hw_pointsprite_control) + { + WAIT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV34TCL_POINT_SPRITE, 1)); + OUT_RING(chan, pointsprite_control); + nvfx->hw_pointsprite_control = pointsprite_control; + } + } + + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG; } void nvfx_fragprog_relocate(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; - struct nvfx_fragment_program *fp = nvfx->fragprog; + struct nvfx_fragment_program *fp = nvfx->hw_fragprog; struct nouveau_bo* bo = fp->fpbo->bo; int offset = fp->bo_prog_idx * fp->prog_size; unsigned fp_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; // TODO: GART? 
@@ -979,12 +1493,14 @@ nvfx_fragprog_relocate(struct nvfx_context *nvfx) OUT_RELOC(chan, bo, offset, fp_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1); + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGPROG; } void nvfx_fragprog_destroy(struct nvfx_context *nvfx, struct nvfx_fragment_program *fp) { + unsigned i; struct nvfx_fragment_program_bo* fpbo = fp->fpbo; if(fpbo) { @@ -999,7 +1515,60 @@ nvfx_fragprog_destroy(struct nvfx_context *nvfx, while(fpbo != fp->fpbo); } + for(i = 0; i < Elements(fp->slot_relocations); ++i) + util_dynarray_fini(&fp->slot_relocations[i]); + if (fp->insn_len) FREE(fp->insn); } +static void * +nvfx_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nvfx_pipe_fragment_program *pfp; + + pfp = CALLOC(1, sizeof(struct nvfx_pipe_fragment_program)); + pfp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(pfp->pipe.tokens, &pfp->info); + + return (void *)pfp; +} + +static void +nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + + nvfx->fragprog = hwcso; + nvfx->dirty |= NVFX_NEW_FRAGPROG; +} + +static void +nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nvfx_pipe_fragment_program *pfp = hwcso; + unsigned i; + + for(i = 0; i < Elements(pfp->fps); ++i) + { + if(pfp->fps[i]) + { + nvfx_fragprog_destroy(nvfx, pfp->fps[i]); + FREE(pfp->fps[i]); + } + } + + FREE((void*)pfp->pipe.tokens); + FREE(pfp); +} + +void +nvfx_init_fragprog_functions(struct nvfx_context *nvfx) +{ + nvfx->pipe.create_fs_state = nvfx_fp_state_create; + nvfx->pipe.bind_fs_state = nvfx_fp_state_bind; + nvfx->pipe.delete_fs_state = nvfx_fp_state_delete; +} diff --git a/src/gallium/drivers/nvfx/nvfx_fragtex.c b/src/gallium/drivers/nvfx/nvfx_fragtex.c index 0b4a434fecc..6503c7afcbf 100644 --- a/src/gallium/drivers/nvfx/nvfx_fragtex.c +++ 
b/src/gallium/drivers/nvfx/nvfx_fragtex.c @@ -1,5 +1,177 @@ #include "nvfx_context.h" #include "nvfx_resource.h" +#include "nvfx_tex.h" + +static void * +nvfx_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nvfx_sampler_state *ps; + + ps = MALLOC(sizeof(struct nvfx_sampler_state)); + + /* on nv30, we use this as an internal flag */ + ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT; + ps->en = 0; + ps->filt = nvfx_tex_filter(cso) | 0x2000; /*voodoo*/ + ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) | + (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) | + (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT); + ps->compare = FALSE; + + if(cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) + { + ps->wrap |= nvfx_tex_wrap_compare_mode(cso->compare_func); + ps->compare = TRUE; + } + ps->bcol = nvfx_tex_border_color(cso->border_color); + + if(nvfx->is_nv4x) + nv40_sampler_state_init(pipe, ps, cso); + else + nv30_sampler_state_init(pipe, ps, cso); + + return (void *)ps; +} + +static void +nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nvfx->tex_sampler[unit] = sampler[unit]; + nvfx->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nvfx->nr_samplers; unit++) { + nvfx->tex_sampler[unit] = NULL; + nvfx->dirty_samplers |= (1 << unit); + } + + nvfx->nr_samplers = nr; + nvfx->dirty |= NVFX_NEW_SAMPLER; +} + +static struct pipe_sampler_view * +nvfx_create_sampler_view(struct pipe_context *pipe, + struct pipe_resource *pt, + const struct pipe_sampler_view *templ) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nvfx_sampler_view *sv = 
CALLOC_STRUCT(nvfx_sampler_view); + struct nvfx_texture_format *tf = &nvfx_texture_formats[templ->format]; + unsigned txf; + + if (!sv) + return NULL; + + sv->base = *templ; + sv->base.reference.count = 1; + sv->base.texture = NULL; + pipe_resource_reference(&sv->base.texture, pt); + sv->base.context = pipe; + + txf = NV34TCL_TX_FORMAT_NO_BORDER; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV34TCL_TX_FORMAT_CUBIC; + /* fall-through */ + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: + txf |= NV34TCL_TX_FORMAT_DIMS_2D; + break; + case PIPE_TEXTURE_3D: + txf |= NV34TCL_TX_FORMAT_DIMS_3D; + break; + case PIPE_TEXTURE_1D: + txf |= NV34TCL_TX_FORMAT_DIMS_1D; + break; + default: + assert(0); + } + sv->u.init_fmt = txf; + + sv->swizzle = 0 + | (tf->src[sv->base.swizzle_r] << NV34TCL_TX_SWIZZLE_S0_Z_SHIFT) + | (tf->src[sv->base.swizzle_g] << NV34TCL_TX_SWIZZLE_S0_Y_SHIFT) + | (tf->src[sv->base.swizzle_b] << NV34TCL_TX_SWIZZLE_S0_X_SHIFT) + | (tf->src[sv->base.swizzle_a] << NV34TCL_TX_SWIZZLE_S0_W_SHIFT) + | (tf->comp[sv->base.swizzle_r] << NV34TCL_TX_SWIZZLE_S1_Z_SHIFT) + | (tf->comp[sv->base.swizzle_g] << NV34TCL_TX_SWIZZLE_S1_Y_SHIFT) + | (tf->comp[sv->base.swizzle_b] << NV34TCL_TX_SWIZZLE_S1_X_SHIFT) + | (tf->comp[sv->base.swizzle_a] << NV34TCL_TX_SWIZZLE_S1_W_SHIFT); + + sv->filt = tf->sign; + sv->wrap = tf->wrap; + sv->wrap_mask = ~0; + + if (pt->target == PIPE_TEXTURE_CUBE) + { + sv->offset = 0; + sv->npot_size = (pt->width0 << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | pt->height0; + } + else + { + sv->offset = nvfx_subresource_offset(pt, 0, sv->base.first_level, 0); + sv->npot_size = (u_minify(pt->width0, sv->base.first_level) << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | u_minify(pt->height0, sv->base.first_level); + + /* apparently, we need to ignore the t coordinate for 1D textures to fix piglit tex1d-2dborder */ + if(pt->target == PIPE_TEXTURE_1D) + { + sv->wrap_mask &=~ NV34TCL_TX_WRAP_T_MASK; + sv->wrap |= NV34TCL_TX_WRAP_T_REPEAT; + } + } + + if(nvfx->is_nv4x) + 
nv40_sampler_view_init(pipe, sv); + else + nv30_sampler_view_init(pipe, sv); + + return &sv->base; +} + +static void +nvfx_sampler_view_destroy(struct pipe_context *pipe, + struct pipe_sampler_view *view) +{ + pipe_resource_reference(&view->texture, NULL); + FREE(view); +} + +static void +nvfx_set_fragment_sampler_views(struct pipe_context *pipe, + unsigned nr, + struct pipe_sampler_view **views) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit], + views[unit]); + nvfx->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nvfx->nr_textures; unit++) { + pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit], + NULL); + nvfx->dirty_samplers |= (1 << unit); + } + + nvfx->nr_textures = nr; + nvfx->dirty |= NVFX_NEW_SAMPLER; +} void nvfx_fragtex_validate(struct nvfx_context *nvfx) @@ -16,6 +188,10 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx) samplers &= ~(1 << unit); if(nvfx->fragment_sampler_views[unit] && nvfx->tex_sampler[unit]) { + util_dirty_surfaces_use_for_sampling(&nvfx->pipe, + &((struct nvfx_miptree*)nvfx->fragment_sampler_views[unit]->texture)->dirty_surfaces, + nvfx_surface_flush); + if(!nvfx->is_nv4x) nv30_fragtex_set(nvfx, unit); else @@ -29,6 +205,7 @@ nvfx_fragtex_validate(struct nvfx_context *nvfx) } } nvfx->dirty_samplers = 0; + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGTEX; } void @@ -55,4 +232,128 @@ nvfx_fragtex_relocate(struct nvfx_context *nvfx) OUT_RELOC(chan, bo, nvfx->hw_txf[unit], tex_flags | NOUVEAU_BO_OR | NOUVEAU_BO_DUMMY, NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); } + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAGTEX; +} + +void +nvfx_init_sampling_functions(struct nvfx_context *nvfx) +{ + nvfx->pipe.create_sampler_state = nvfx_sampler_state_create; + nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind; + nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete; + 
nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views; + nvfx->pipe.create_sampler_view = nvfx_create_sampler_view; + nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy; +} + +#define NV34TCL_TX_FORMAT_FORMAT_DXT1_RECT NV34TCL_TX_FORMAT_FORMAT_DXT1 +#define NV34TCL_TX_FORMAT_FORMAT_DXT3_RECT NV34TCL_TX_FORMAT_FORMAT_DXT3 +#define NV34TCL_TX_FORMAT_FORMAT_DXT5_RECT NV34TCL_TX_FORMAT_FORMAT_DXT5 + +#define NV40TCL_TEX_FORMAT_FORMAT_HILO16 NV40TCL_TEX_FORMAT_FORMAT_A16L16 + +#define NV34TCL_TX_FORMAT_FORMAT_RGBA16F 0x00004a00 +#define NV34TCL_TX_FORMAT_FORMAT_RGBA16F_RECT NV34TCL_TX_FORMAT_FORMAT_RGBA16F +#define NV34TCL_TX_FORMAT_FORMAT_RGBA32F 0x00004b00 +#define NV34TCL_TX_FORMAT_FORMAT_RGBA32F_RECT NV34TCL_TX_FORMAT_FORMAT_RGBA32F +#define NV34TCL_TX_FORMAT_FORMAT_R32F 0x00004c00 +#define NV34TCL_TX_FORMAT_FORMAT_R32F_RECT NV34TCL_TX_FORMAT_FORMAT_R32F + +// TODO: guess! +#define NV40TCL_TEX_FORMAT_FORMAT_R32F 0x00001c00 + +#define SRGB 0x00700000 + +#define __(m,tf,tfc,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign,wrap) \ +[PIPE_FORMAT_##m] = { \ + {NV34TCL_TX_FORMAT_FORMAT_##tf, \ + NV34TCL_TX_FORMAT_FORMAT_##tfc, \ + NV34TCL_TX_FORMAT_FORMAT_##tf##_RECT, \ + NV34TCL_TX_FORMAT_FORMAT_##tfc##_RECT, \ + NV40TCL_TEX_FORMAT_FORMAT_##tf, \ + NV40TCL_TEX_FORMAT_FORMAT_##tfc}, \ + sign, wrap, \ + {ts0z, ts0y, ts0x, ts0w, 0, 1}, {ts1z, ts1y, ts1x, ts1w, 0, 0} \ } + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign, wrap) \ + __(m,tf,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sign, wrap) + +/* Depth formats works by reading the depth value most significant 8/16 bits. + * We are losing precision, but nVidia loses even more by using A8R8G8B8 instead of HILO16 + * There is no 32-bit integer texture support, so other things are infeasible. + * + * TODO: is it possible to read 16 bits for Z16? 
A16 doesn't seem to work, either due to normalization or endianness issues + */ + +#define T 2 + +#define X 3 +#define Y 2 +#define Z 1 +#define W 0 + +#define SNORM ((NV34TCL_TX_FILTER_SIGNED_RED) | (NV34TCL_TX_FILTER_SIGNED_GREEN) | (NV34TCL_TX_FILTER_SIGNED_BLUE) | (NV34TCL_TX_FILTER_SIGNED_ALPHA)) +#define UNORM 0 + +struct nvfx_texture_format +nvfx_texture_formats[PIPE_FORMAT_COUNT] = { + [0 ... PIPE_FORMAT_COUNT - 1] = {{-1, -1, -1, -1, -1, -1}}, + _(B8G8R8X8_UNORM, A8R8G8B8, T, T, T, 1, X, Y, Z, W, UNORM, 0), + _(B8G8R8X8_SRGB, A8R8G8B8, T, T, T, 1, X, Y, Z, W, UNORM, SRGB), + _(B8G8R8A8_UNORM, A8R8G8B8, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(B8G8R8A8_SRGB, A8R8G8B8, T, T, T, T, X, Y, Z, W, UNORM, SRGB), + + _(R8G8B8A8_UNORM, A8R8G8B8, T, T, T, T, Z, Y, X, W, UNORM, 0), + _(R8G8B8A8_SRGB, A8R8G8B8, T, T, T, T, Z, Y, X, W, UNORM, SRGB), + _(R8G8B8X8_UNORM, A8R8G8B8, T, T, T, 1, Z, Y, X, W, UNORM, 0), + + _(A8R8G8B8_UNORM, A8R8G8B8, T, T, T, T, W, Z, Y, X, UNORM, 0), + _(A8R8G8B8_SRGB, A8R8G8B8, T, T, T, T, W, Z, Y, X, UNORM, SRGB), + _(A8B8G8R8_UNORM, A8R8G8B8, T, T, T, T, W, X, Y, Z, UNORM, 0), + _(A8B8G8R8_SRGB, A8R8G8B8, T, T, T, T, W, X, Y, Z, UNORM, SRGB), + _(X8R8G8B8_UNORM, A8R8G8B8, T, T, T, 1, W, Z, Y, X, UNORM, 0), + _(X8R8G8B8_SRGB, A8R8G8B8, T, T, T, 1, W, Z, Y, X, UNORM, SRGB), + + _(B5G5R5A1_UNORM, A1R5G5B5, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(B5G5R5X1_UNORM, A1R5G5B5, T, T, T, 1, X, Y, Z, W, UNORM, 0), + + _(B4G4R4A4_UNORM, A4R4G4B4, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(B4G4R4X4_UNORM, A4R4G4B4, T, T, T, 1, X, Y, Z, W, UNORM, 0), + + _(B5G6R5_UNORM, R5G6B5, T, T, T, 1, X, Y, Z, W, UNORM, 0), + + _(R8_UNORM, L8, T, 0, 0, 1, X, X, X, X, UNORM, 0), + _(R8_SNORM, L8, T, 0, 0, 1, X, X, X, X, SNORM, 0), + _(L8_UNORM, L8, T, T, T, 1, X, X, X, X, UNORM, 0), + _(L8_SRGB, L8, T, T, T, 1, X, X, X, X, UNORM, SRGB), + _(A8_UNORM, L8, 0, 0, 0, T, X, X, X, X, UNORM, 0), + _(I8_UNORM, L8, T, T, T, T, X, X, X, X, UNORM, 0), + + _(R8G8_UNORM, A8L8, T, T, T, 
T, X, X, X, W, UNORM, 0), + _(R8G8_SNORM, A8L8, T, T, T, T, X, X, X, W, SNORM, 0), + _(L8A8_UNORM, A8L8, T, T, T, T, X, X, X, W, UNORM, 0), + _(L8A8_SRGB, A8L8, T, T, T, T, X, X, X, W, UNORM, SRGB), + + _(DXT1_RGB, DXT1, T, T, T, 1, X, Y, Z, W, UNORM, 0), + _(DXT1_SRGB, DXT1, T, T, T, 1, X, Y, Z, W, UNORM, SRGB), + _(DXT1_RGBA, DXT1, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(DXT1_SRGBA, DXT1, T, T, T, T, X, Y, Z, W, UNORM, SRGB), + _(DXT3_RGBA, DXT3, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(DXT3_SRGBA, DXT3, T, T, T, T, X, Y, Z, W, UNORM, SRGB), + _(DXT5_RGBA, DXT5, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(DXT5_SRGBA, DXT5, T, T, T, T, X, Y, Z, W, UNORM, SRGB), + + __(Z16_UNORM, A8L8, Z16, T, T, T, 1, W, W, W, W, UNORM, 0), + __(S8_USCALED_Z24_UNORM,HILO16,Z24, T, T, T, 1, W, W, W, W, UNORM, 0), + __(X8Z24_UNORM, HILO16,Z24, T, T, T, 1, W, W, W, W, UNORM, 0), + + _(R16_UNORM, A16, T, 0, 0, 1, X, X, X, X, UNORM, 0), + _(R16_SNORM, A16, T, 0, 0, 1, X, X, X, X, SNORM, 0), + _(R16G16_UNORM, HILO16, T, T, 0, 1, X, Y, X, X, UNORM, 0), + _(R16G16_SNORM, HILO16, T, T, 0, 1, X, Y, X, X, SNORM, 0), + + _(R16G16B16A16_FLOAT, RGBA16F, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(R32G32B32A32_FLOAT, RGBA32F, T, T, T, T, X, Y, Z, W, UNORM, 0), + _(R32_FLOAT, R32F, T, 0, 0, 1, X, X, X, X, UNORM, 0) +}; diff --git a/src/gallium/drivers/nvfx/nvfx_miptree.c b/src/gallium/drivers/nvfx/nvfx_miptree.c index b5639bb4645..0916aaa8289 100644 --- a/src/gallium/drivers/nvfx/nvfx_miptree.c +++ b/src/gallium/drivers/nvfx/nvfx_miptree.c @@ -2,309 +2,220 @@ #include "pipe/p_defines.h" #include "util/u_inlines.h" #include "util/u_format.h" +#include "util/u_memory.h" #include "util/u_math.h" - -#include "nvfx_context.h" +#include "util/u_staging.h" +#include "state_tracker/drm_driver.h" +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_screen.h" +#include "nvfx_screen.h" #include "nvfx_resource.h" -#include "nvfx_transfer.h" -#include "nv04_surface_2d.h" - -/* Currently using separate 
implementations for buffers and textures, - * even though gallium has a unified abstraction of these objects. - * Eventually these should be combined, and mechanisms like transfers - * be adapted to work for both buffer and texture uploads. - */ static void -nvfx_miptree_layout(struct nvfx_miptree *mt) +nvfx_miptree_choose_format(struct nvfx_miptree *mt) { struct pipe_resource *pt = &mt->base.base; - uint width = pt->width0; - uint offset = 0; - int nr_faces, l, f; - uint wide_pitch = pt->bind & (PIPE_BIND_SAMPLER_VIEW | - PIPE_BIND_DEPTH_STENCIL | - PIPE_BIND_RENDER_TARGET | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_SCANOUT); - - if (pt->target == PIPE_TEXTURE_CUBE) { - nr_faces = 6; - } else - if (pt->target == PIPE_TEXTURE_3D) { - nr_faces = pt->depth0; - } else { - nr_faces = 1; + unsigned uniform_pitch = 0; + static int no_swizzle = -1; + if(no_swizzle < 0) + no_swizzle = debug_get_bool_option("NV40_NO_SWIZZLE", FALSE); /* this will break things on nv30 */ + + if (!util_is_power_of_two(pt->width0) || + !util_is_power_of_two(pt->height0) || + !util_is_power_of_two(pt->depth0) || + (!nvfx_screen(pt->screen)->is_nv4x && pt->target == PIPE_TEXTURE_RECT) + ) + uniform_pitch = 1; + + if ( + (pt->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_DISPLAY_TARGET)) + || (pt->usage & PIPE_USAGE_DYNAMIC) || (pt->usage & PIPE_USAGE_STAGING) + || util_format_is_compressed(pt->format) + || no_swizzle + ) + mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + + /* non compressed formats with uniform pitch must be linear, and vice versa */ + if(!util_format_is_s3tc(pt->format) + && (uniform_pitch || mt->base.base.flags & NVFX_RESOURCE_FLAG_LINEAR)) + { + mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + uniform_pitch = 1; } - for (l = 0; l <= pt->last_level; l++) { - if (wide_pitch && (pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) - mt->level[l].pitch = align(util_format_get_stride(pt->format, pt->width0), 64); - else - mt->level[l].pitch = util_format_get_stride(pt->format, width); + 
if(uniform_pitch) + { + mt->linear_pitch = util_format_get_stride(pt->format, pt->width0); - mt->level[l].image_offset = - CALLOC(nr_faces, sizeof(unsigned)); + // TODO: this is only a constraint for rendering and not sampling, apparently + // we may also want this unconditionally + if(pt->bind & (PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_DEPTH_STENCIL | + PIPE_BIND_RENDER_TARGET | + PIPE_BIND_DISPLAY_TARGET | + PIPE_BIND_SCANOUT)) + mt->linear_pitch = align(mt->linear_pitch, 64); + } + else + mt->linear_pitch = 0; +} + +static unsigned +nvfx_miptree_layout(struct nvfx_miptree *mt) +{ + struct pipe_resource* pt = &mt->base.base; + uint offset = 0; - width = u_minify(width, 1); + if(!nvfx_screen(pt->screen)->is_nv4x) + { + assert(pt->target == PIPE_TEXTURE_RECT + || (util_is_power_of_two(pt->width0) && util_is_power_of_two(pt->height0))); } - for (f = 0; f < nr_faces; f++) { - for (l = 0; l < pt->last_level; l++) { - mt->level[l].image_offset[f] = offset; + for (unsigned l = 0; l <= pt->last_level; l++) + { + unsigned size; + mt->level_offset[l] = offset; - if (!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR) && - u_minify(pt->width0, l + 1) > 1 && u_minify(pt->height0, l + 1) > 1) - offset += align(mt->level[l].pitch * u_minify(pt->height0, l), 64); - else - offset += mt->level[l].pitch * u_minify(pt->height0, l); - } + if(mt->linear_pitch) + size = mt->linear_pitch; + else + size = util_format_get_stride(pt->format, u_minify(pt->width0, l)); + size = util_format_get_2d_size(pt->format, size, u_minify(pt->height0, l)); - mt->level[l].image_offset[f] = offset; - offset += mt->level[l].pitch * u_minify(pt->height0, l); + if(pt->target == PIPE_TEXTURE_3D) + size *= u_minify(pt->depth0, l); + + offset += size; } - mt->total_size = offset; + offset = align(offset, 128); + mt->face_size = offset; + if(mt->base.base.target == PIPE_TEXTURE_CUBE) + offset += 5 * mt->face_size; + return offset; } -static boolean -nvfx_miptree_get_handle(struct pipe_screen *pscreen, - struct pipe_resource 
*ptexture, - struct winsys_handle *whandle) +static void +nvfx_miptree_surface_final_destroy(struct pipe_surface* ps) { - struct nvfx_miptree* mt = (struct nvfx_miptree*)ptexture; - - if (!mt || !mt->base.bo) - return FALSE; - - return nouveau_screen_bo_get_handle(pscreen, - mt->base.bo, - mt->level[0].pitch, - whandle); + struct nvfx_surface* ns = (struct nvfx_surface*)ps; + pipe_resource_reference(&ps->texture, 0); + pipe_resource_reference((struct pipe_resource**)&ns->temp, 0); + FREE(ps); } - -static void +void nvfx_miptree_destroy(struct pipe_screen *screen, struct pipe_resource *pt) { struct nvfx_miptree *mt = (struct nvfx_miptree *)pt; - int l; - + util_surfaces_destroy(&mt->surfaces, pt, nvfx_miptree_surface_final_destroy); nouveau_screen_bo_release(screen, mt->base.bo); - - for (l = 0; l <= pt->last_level; l++) { - if (mt->level[l].image_offset) - FREE(mt->level[l].image_offset); - } - FREE(mt); } - - - -struct u_resource_vtbl nvfx_miptree_vtbl = +static struct nvfx_miptree* +nvfx_miptree_create_skeleton(struct pipe_screen *pscreen, const struct pipe_resource *pt) { - nvfx_miptree_get_handle, /* get_handle */ - nvfx_miptree_destroy, /* resource_destroy */ - NULL, /* is_resource_referenced */ - nvfx_miptree_transfer_new, /* get_transfer */ - nvfx_miptree_transfer_del, /* transfer_destroy */ - nvfx_miptree_transfer_map, /* transfer_map */ - u_default_transfer_flush_region, /* transfer_flush_region */ - nvfx_miptree_transfer_unmap, /* transfer_unmap */ - u_default_transfer_inline_write /* transfer_inline_write */ -}; + struct nvfx_miptree *mt; + if(pt->width0 > 4096 || pt->height0 > 4096) + return NULL; + mt = CALLOC_STRUCT(nvfx_miptree); + if (!mt) + return NULL; -struct pipe_resource * -nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt) -{ - struct nvfx_miptree *mt; - static int no_swizzle = -1; - if(no_swizzle < 0) - no_swizzle = debug_get_bool_option("NOUVEAU_NO_SWIZZLE", FALSE); - - mt = CALLOC_STRUCT(nvfx_miptree); - if 
(!mt) - return NULL; - - mt->base.base = *pt; - mt->base.vtbl = &nvfx_miptree_vtbl; - pipe_reference_init(&mt->base.base.reference, 1); - mt->base.base.screen = pscreen; + mt->base.base = *pt; + util_dirty_surfaces_init(&mt->dirty_surfaces); - /* Swizzled textures must be POT */ - if (pt->width0 & (pt->width0 - 1) || - pt->height0 & (pt->height0 - 1)) - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - else - if (pt->bind & (PIPE_BIND_SCANOUT | - PIPE_BIND_DISPLAY_TARGET | - PIPE_BIND_DEPTH_STENCIL)) - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - else - if (pt->usage == PIPE_USAGE_DYNAMIC) - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - else { - switch (pt->format) { - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_I8_UNORM: - /* TODO: we can actually swizzle these formats on nv40, we - are just preserving the pre-unification behavior. - The whole 2D code is going to be rewritten anyway. */ - if(nvfx_screen(pscreen)->is_nv4x) { - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - break; - } - /* TODO: Figure out which formats can be swizzled */ - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_R16_SNORM: - { - if (no_swizzle) - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - break; - } - default: - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - } - } + pipe_reference_init(&mt->base.base.reference, 1); + mt->base.base.screen = pscreen; - /* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear. - * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy. - * This also happens for small mipmaps of large textures. 
*/ - if (pt->bind & PIPE_BIND_RENDER_TARGET && - util_format_get_stride(pt->format, pt->width0) < 64) - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + // set this to the actual capabilities, we use it to decide whether to use the 3D engine for copies + // TODO: is this the correct way to use Gallium? + mt->base.base.bind = pt->bind | PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_DEPTH_STENCIL; - nvfx_miptree_layout(mt); + // on our current driver (and the driver too), format support does not depend on geometry, so don't bother computing it + // TODO: may want to revisit this + if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_RENDER_TARGET, 0)) + mt->base.base.bind &=~ PIPE_BIND_RENDER_TARGET; + if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_SAMPLER_VIEW, 0)) + mt->base.base.bind &=~ PIPE_BIND_SAMPLER_VIEW; + if(!pscreen->is_format_supported(pscreen, pt->format, pt->target, 0, PIPE_BIND_DEPTH_STENCIL, 0)) + mt->base.base.bind &=~ PIPE_BIND_DEPTH_STENCIL; - mt->base.bo = nouveau_screen_bo_new(pscreen, 256, - pt->usage, pt->bind, mt->total_size); - if (!mt->base.bo) { - FREE(mt); - return NULL; - } - return &mt->base.base; + return mt; } - - struct pipe_resource * -nvfx_miptree_from_handle(struct pipe_screen *pscreen, - const struct pipe_resource *template, - struct winsys_handle *whandle) +nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt) { - struct nvfx_miptree *mt; - unsigned stride; + struct nvfx_miptree* mt = nvfx_miptree_create_skeleton(pscreen, pt); + unsigned size; + nvfx_miptree_choose_format(mt); - /* Only supports 2D, non-mipmapped textures for the moment */ - if (template->target != PIPE_TEXTURE_2D || - template->last_level != 0 || - template->depth0 != 1) - return NULL; + size = nvfx_miptree_layout(mt); - mt = CALLOC_STRUCT(nvfx_miptree); - if (!mt) - return NULL; + mt->base.bo = nouveau_screen_bo_new(pscreen, 256, pt->usage, pt->bind, size); - 
mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride); - if (mt->base.bo == NULL) { + if (!mt->base.bo) { FREE(mt); return NULL; } - - mt->base.base = *template; - mt->base.vtbl = &nvfx_miptree_vtbl; - pipe_reference_init(&mt->base.base.reference, 1); - mt->base.base.screen = pscreen; - mt->level[0].pitch = stride; - mt->level[0].image_offset = CALLOC(1, sizeof(unsigned)); - - /* Assume whoever created this buffer expects it to be linear for now */ - mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; - - /* XXX: Need to adjust bo refcount?? - */ - /* nouveau_bo_ref(bo, &mt->base.bo); */ return &mt->base.base; } +// TODO: redo this, just calling miptree_layout +struct pipe_resource * +nvfx_miptree_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *template, struct winsys_handle *whandle) +{ + struct nvfx_miptree* mt = nvfx_miptree_create_skeleton(pscreen, template); + unsigned stride; + if(whandle->stride) { + mt->linear_pitch = whandle->stride; + mt->base.base.flags |= NVFX_RESOURCE_FLAG_LINEAR; + } else + nvfx_miptree_choose_format(mt); + nvfx_miptree_layout(mt); + mt->base.bo = nouveau_screen_bo_from_handle(pscreen, whandle, &stride); + if (mt->base.bo == NULL) { + FREE(mt); + return NULL; + } + return &mt->base.base; +} - -/* Surface helpers, not strictly required to implement the resource vtbl: - */ struct pipe_surface * nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt, unsigned face, unsigned level, unsigned zslice, unsigned flags) { - struct nvfx_miptree *mt = (struct nvfx_miptree *)pt; - struct nv04_surface *ns; - - ns = CALLOC_STRUCT(nv04_surface); - if (!ns) - return NULL; - pipe_resource_reference(&ns->base.texture, pt); - ns->base.format = pt->format; - ns->base.width = u_minify(pt->width0, level); - ns->base.height = u_minify(pt->height0, level); - ns->base.usage = flags; - pipe_reference_init(&ns->base.reference, 1); - ns->base.face = face; - ns->base.level = level; - ns->base.zslice = 
zslice; - ns->pitch = mt->level[level].pitch; - - if (pt->target == PIPE_TEXTURE_CUBE) { - ns->base.offset = mt->level[level].image_offset[face]; - } else - if (pt->target == PIPE_TEXTURE_3D) { - ns->base.offset = mt->level[level].image_offset[zslice]; - } else { - ns->base.offset = mt->level[level].image_offset[0]; - } - - /* create a linear temporary that we can render into if - * necessary. - * - * Note that ns->pitch is always a multiple of 64 for linear - * surfaces and swizzled surfaces are POT, so ns->pitch & 63 - * is equivalent to (ns->pitch < 64 && swizzled) - */ - - if ((ns->pitch & 63) && - (ns->base.usage & PIPE_BIND_RENDER_TARGET)) - { - struct nv04_surface_2d* eng2d = - ((struct nvfx_screen*)pscreen)->eng2d; - - ns = nv04_surface_wrap_for_render(pscreen, eng2d, ns); + struct nvfx_miptree* mt = (struct nvfx_miptree*)pt; + struct nvfx_surface *ns; + + ns = (struct nvfx_surface*)util_surfaces_get(&mt->surfaces, sizeof(struct nvfx_surface), pscreen, pt, face, level, zslice, flags); + if(ns->base.base.offset == ~0) { + util_dirty_surface_init(&ns->base); + ns->pitch = nvfx_subresource_pitch(pt, level); + ns->base.base.offset = nvfx_subresource_offset(pt, face, level, zslice); } - return &ns->base; + return &ns->base.base; } void nvfx_miptree_surface_del(struct pipe_surface *ps) { - struct nv04_surface* ns = (struct nv04_surface*)ps; - if(ns->backing) + struct nvfx_surface* ns = (struct nvfx_surface*)ps; + + if(!ns->temp) { - struct nvfx_screen* screen = (struct nvfx_screen*)ps->texture->screen; - if(1 /*ns->backing->base.usage & PIPE_BIND_BLIT_DESTINATION*/) - screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height); - nvfx_miptree_surface_del(&ns->backing->base); + util_surfaces_detach(&((struct nvfx_miptree*)ps->texture)->surfaces, ps); + pipe_resource_reference(&ps->texture, 0); + FREE(ps); } - - pipe_resource_reference(&ps->texture, NULL); - FREE(ps); } diff --git a/src/gallium/drivers/nvfx/nvfx_push.c 
b/src/gallium/drivers/nvfx/nvfx_push.c new file mode 100644 index 00000000000..ffe7e983578 --- /dev/null +++ b/src/gallium/drivers/nvfx/nvfx_push.c @@ -0,0 +1,414 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_inlines.h" +#include "util/u_format.h" +#include "util/u_split_prim.h" +#include "translate/translate.h" + +#include "nvfx_context.h" +#include "nvfx_resource.h" + +struct push_context { + struct nouveau_channel* chan; + + void *idxbuf; + int32_t idxbias; + + float edgeflag; + int edgeflag_attr; + + unsigned vertex_length; + unsigned max_vertices_per_packet; + + struct translate* translate; +}; + +static void +emit_edgeflag(void *priv, boolean enabled) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + + OUT_RING(chan, RING_3D(NV34TCL_EDGEFLAG_ENABLE, 1)); + OUT_RING(chan, enabled ? 1 : 0); +} + +static void +emit_vertices_lookup8(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint8_t* elts = (uint8_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts8(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices_lookup16(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + uint16_t* elts = (uint16_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts16(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices_lookup32(void *priv, unsigned start, unsigned count) +{ + struct 
push_context *ctx = priv; + uint32_t* elts = (uint32_t*)ctx->idxbuf + start; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run_elts(ctx->translate, elts, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + elts += push; + } +} + +static void +emit_vertices(void *priv, unsigned start, unsigned count) +{ + struct push_context *ctx = priv; + + while(count) + { + unsigned push = MIN2(count, ctx->max_vertices_per_packet); + unsigned length = push * ctx->vertex_length; + + OUT_RING(ctx->chan, RING_3D_NI(NV34TCL_VERTEX_DATA, length)); + ctx->translate->run(ctx->translate, start, push, 0, ctx->chan->cur); + ctx->chan->cur += length; + + count -= push; + start += push; + } +} + +static void +emit_ranges(void* priv, unsigned start, unsigned vc, unsigned reg) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + unsigned nr = (vc & 0xff); + if (nr) { + OUT_RING(chan, RING_3D(reg, 1)); + OUT_RING (chan, ((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
2047 : nr; + + nr -= push; + + OUT_RING(chan, RING_3D_NI(reg, push)); + while (push--) { + OUT_RING(chan, ((0x100 - 1) << 24) | start); + start += 0x100; + } + } +} + +static void +emit_ib_ranges(void* priv, unsigned start, unsigned vc) +{ + emit_ranges(priv, start, vc, NV34TCL_VB_INDEX_BATCH); +} + +static void +emit_vb_ranges(void* priv, unsigned start, unsigned vc) +{ + emit_ranges(priv, start, vc, NV34TCL_VB_VERTEX_BATCH); +} + +static INLINE void +emit_elt8(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint8_t *elts = (uint8_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + if (vc & 1) { + OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); + OUT_RING (chan, elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + unsigned push = MIN2(vc, 2047 * 2); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); + for (i = 0; i < push; i+=2) + OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias)); + + vc -= push; + elts += push; + } +} + +static INLINE void +emit_elt16(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint16_t *elts = (uint16_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + if (vc & 1) { + OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); + OUT_RING (chan, elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + unsigned push = MIN2(vc, 2047 * 2); + + OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); + for (i = 0; i < push; i+=2) + OUT_RING(chan, ((elts[i+1] + idxbias) << 16) | (elts[i] + idxbias)); + + vc -= push; + elts += push; + } +} + +static INLINE void +emit_elt32(void* priv, unsigned start, unsigned vc) +{ + struct push_context* ctx = priv; + struct nouveau_channel *chan = ctx->chan; + uint32_t *elts = (uint32_t *)ctx->idxbuf + start; + int idxbias = ctx->idxbias; + + while (vc) { + unsigned push = MIN2(vc, 2047); + + 
OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push)); + assert(AVAIL_RING(chan) >= push); + if(idxbias) + { + for(unsigned i = 0; i < push; ++i) + OUT_RING(chan, elts[i] + idxbias); + } + else + OUT_RINGp(chan, elts, push); + + vc -= push; + elts += push; + } +} + +void +nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nouveau_channel *chan = nvfx->screen->base.channel; + struct push_context ctx; + struct util_split_prim s; + unsigned instances_left = info->instance_count; + int vtx_value; + unsigned hw_mode = nvgl_primitive(info->mode); + int i; + struct + { + uint8_t* map; + unsigned step; + } per_instance[16]; + unsigned p_overhead = 64 /* magic fix */ + + 4 /* begin/end */ + + 4; /* potential edgeflag enable/disable */ + + ctx.chan = nvfx->screen->base.channel; + ctx.translate = nvfx->vtxelt->translate; + ctx.idxbuf = NULL; + ctx.vertex_length = nvfx->vtxelt->vertex_length; + ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet; + ctx.edgeflag = 0.5f; + // TODO: figure out if we really want to handle this, and do so in that case + ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in; + + if(!nvfx->use_vertex_buffers) + { + for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset; + if(info->indexed) + data += info->index_bias * vb->stride; + ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0); + } + + if(ctx.edgeflag_attr < 16) + vtx_value = -(ctx.vertex_length + 3); /* vertex data and edgeflag header and value */ + else + { + p_overhead += 1; /* initial vertex_data header */ + vtx_value = -ctx.vertex_length; /* vertex data and edgeflag header and value */ + } + + if (info->indexed) { + // XXX: this 
case and is broken and probably need a new VTX_ATTR push path + if (nvfx->idxbuf.index_size == 1) + s.emit = emit_vertices_lookup8; + else if (nvfx->idxbuf.index_size == 2) + s.emit = emit_vertices_lookup16; + else + s.emit = emit_vertices_lookup32; + } else + s.emit = emit_vertices; + } + else + { + if(!info->indexed || nvfx->use_index_buffer) + { + s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges; + p_overhead += 3; + vtx_value = 0; + } + else if (nvfx->idxbuf.index_size == 4) + { + s.emit = emit_elt32; + p_overhead += 1; + vtx_value = 8; + } + else + { + s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8; + p_overhead += 3; + vtx_value = 7; + } + } + + ctx.idxbias = info->index_bias; + if(nvfx->use_vertex_buffers) + ctx.idxbias -= nvfx->base_vertex; + + /* map index buffer, if present */ + if (info->indexed && !nvfx->use_index_buffer) + ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset; + + s.priv = &ctx; + s.edge = emit_edgeflag; + + for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) + { + struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index]; + float v[4]; + per_instance[i].step = info->start_instance % ve->instance_divisor; + per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset; + + nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0); + + WAIT_RING(chan, 5); + nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp); + } + + /* per-instance loop */ + while (instances_left--) { + int max_verts; + boolean done; + + util_split_prim_init(&s, info->mode, info->start, info->count); + nvfx_state_emit(nvfx); + for(;;) { + max_verts = AVAIL_RING(chan); + max_verts -= p_overhead; + + /* if vtx_value < 0, each vertex is -vtx_value words long + * otherwise, each vertex is 2^(vtx_value) / 255 words long (this 
is an approximation) + */ + if(vtx_value < 0) + { + max_verts /= -vtx_value; + max_verts -= (max_verts >> 10); /* vertex data headers */ + } + else + { + if(max_verts >= (1 << 23)) /* avoid overflow here */ + max_verts = (1 << 23); + max_verts = (max_verts * 255) >> vtx_value; + } + + //printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts); + + if(max_verts >= 16) + { + /* XXX: any command a lot of times seems to (mostly) fix corruption that would otherwise happen */ + /* this seems to cause issues on nv3x, and also be unneeded there */ + if(nvfx->is_nv4x) + { + int i; + for(i = 0; i < 32; ++i) + { + OUT_RING(chan, RING_3D(0x1dac, 1)); + OUT_RING(chan, 0); + } + } + + OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); + OUT_RING(chan, hw_mode); + done = util_split_prim_next(&s, max_verts); + OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); + OUT_RING(chan, 0); + + if(done) + break; + } + + FIRE_RING(chan); + nvfx_state_emit(nvfx); + } + + /* set data for the next instance, if any changed */ + for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i) + { + struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index]; + + if(++per_instance[i].step == ve->instance_divisor) + { + float v[4]; + per_instance[i].map += vb->stride; + per_instance[i].step = 0; + + nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0); + WAIT_RING(chan, 5); + nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp); + } + } + } +} diff --git a/src/gallium/drivers/nvfx/nvfx_resource.c b/src/gallium/drivers/nvfx/nvfx_resource.c index 10cdeed2a37..39ae893f1b3 100644 --- a/src/gallium/drivers/nvfx/nvfx_resource.c +++ b/src/gallium/drivers/nvfx/nvfx_resource.c @@ -1,23 +1,15 @@ #include "pipe/p_context.h" +#include "util/u_staging.h" #include "nvfx_resource.h" #include "nouveau/nouveau_screen.h" - -/* This doesn't look 
quite right - this query is supposed to ask - * whether the particular context has references to the resource in - * any unflushed rendering command buffer, and hence requires a - * pipe->flush() for serializing some modification to that resource. - * - * This seems to be answering the question of whether the resource is - * currently on hardware. - */ static unsigned int nvfx_resource_is_referenced(struct pipe_context *pipe, - struct pipe_resource *resource, + struct pipe_resource *pr, unsigned face, unsigned level) { - return nouveau_reference_flags(nvfx_resource(resource)->bo); + return !!nouveau_reference_flags(nvfx_resource(pr)->bo); } static struct pipe_resource * @@ -30,6 +22,15 @@ nvfx_resource_create(struct pipe_screen *screen, return nvfx_miptree_create(screen, template); } +static void +nvfx_resource_destroy(struct pipe_screen *screen, struct pipe_resource *pr) +{ + if (pr->target == PIPE_BUFFER) + return nvfx_buffer_destroy(screen, pr); + else + return nvfx_miptree_destroy(screen, pr); +} + static struct pipe_resource * nvfx_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *template, @@ -41,15 +42,22 @@ nvfx_resource_from_handle(struct pipe_screen * screen, return nvfx_miptree_from_handle(screen, template, whandle); } +static boolean +nvfx_resource_get_handle(struct pipe_screen *pscreen, + struct pipe_resource *pr, + struct winsys_handle *whandle) +{ + struct nvfx_resource* res = (struct nvfx_resource*)pr; + + if (!res || !res->bo) + return FALSE; + + return nouveau_screen_bo_get_handle(pscreen, res->bo, nvfx_subresource_pitch(pr, 0), whandle); +} + void nvfx_init_resource_functions(struct pipe_context *pipe) { - pipe->get_transfer = u_get_transfer_vtbl; - pipe->transfer_map = u_transfer_map_vtbl; - pipe->transfer_flush_region = u_transfer_flush_region_vtbl; - pipe->transfer_unmap = u_transfer_unmap_vtbl; - pipe->transfer_destroy = u_transfer_destroy_vtbl; - pipe->transfer_inline_write = u_transfer_inline_write_vtbl; 
pipe->is_resource_referenced = nvfx_resource_is_referenced; } @@ -58,10 +66,10 @@ nvfx_screen_init_resource_functions(struct pipe_screen *pscreen) { pscreen->resource_create = nvfx_resource_create; pscreen->resource_from_handle = nvfx_resource_from_handle; - pscreen->resource_get_handle = u_resource_get_handle_vtbl; - pscreen->resource_destroy = u_resource_destroy_vtbl; + pscreen->resource_get_handle = nvfx_resource_get_handle; + pscreen->resource_destroy = nvfx_resource_destroy; pscreen->user_buffer_create = nvfx_user_buffer_create; - + pscreen->get_tex_surface = nvfx_miptree_surface_new; pscreen->tex_surface_destroy = nvfx_miptree_surface_del; } diff --git a/src/gallium/drivers/nvfx/nvfx_resource.h b/src/gallium/drivers/nvfx/nvfx_resource.h index a68c14cf3fb..583be4de2ae 100644 --- a/src/gallium/drivers/nvfx/nvfx_resource.h +++ b/src/gallium/drivers/nvfx/nvfx_resource.h @@ -1,44 +1,82 @@ - #ifndef NVFX_RESOURCE_H #define NVFX_RESOURCE_H #include "util/u_transfer.h" +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_double_list.h" +#include "util/u_surfaces.h" +#include "util/u_dirty_surfaces.h" +#include <nouveau/nouveau_bo.h> struct pipe_resource; -struct nouveau_bo; - +struct nv04_region; -/* This gets further specialized into either buffer or texture - * structures. In the future we'll want to remove much of that - * distinction, but for now try to keep as close to the existing code - * as possible and use the vtbl struct to choose between the two - * underlying implementations. - */ struct nvfx_resource { struct pipe_resource base; - struct u_resource_vtbl *vtbl; struct nouveau_bo *bo; }; +static INLINE +struct nvfx_resource *nvfx_resource(struct pipe_resource *resource) +{ + return (struct nvfx_resource *)resource; +} + +#define NVFX_RESOURCE_FLAG_LINEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) +#define NVFX_RESOURCE_FLAG_USER (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) + +/* is resource mapped into the GPU's address space (i.e. VRAM or GART) ? 
*/ +static INLINE boolean +nvfx_resource_mapped_by_gpu(struct pipe_resource *resource) +{ + return nvfx_resource(resource)->bo->handle; +} + +/* is resource in VRAM? */ +static inline int +nvfx_resource_on_gpu(struct pipe_resource* pr) +{ +#if 0 + // a compiler error here means you need to apply libdrm-nouveau-add-domain.patch to libdrm + // TODO: return FALSE if not VRAM and on a PCI-E system + return ((struct nvfx_resource*)pr)->bo->domain & (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART); +#else + return TRUE; +#endif +} + #define NVFX_MAX_TEXTURE_LEVELS 16 +/* We have the following invariants for render temporaries + * + * 1. Render temporaries are always linear + * 2. Render temporaries are always up to date + * 3. Currently, render temporaries are destroyed when the resource is used for sampling, but kept for any other use + * + * Also, we do NOT flush temporaries on any pipe->flush(). + * This is fine, as long as scanout targets and shared resources never need temps. + * + * TODO: we may want to also support swizzled temporaries to improve performance in some cases. 
+ */ + struct nvfx_miptree { - struct nvfx_resource base; - uint total_size; + struct nvfx_resource base; - struct { - uint pitch; - uint *image_offset; - } level[NVFX_MAX_TEXTURE_LEVELS]; + unsigned linear_pitch; /* for linear textures, 0 for swizzled and compressed textures with level-dependent minimal pitch */ + unsigned face_size; /* 128-byte aligned face/total size */ + unsigned level_offset[NVFX_MAX_TEXTURE_LEVELS]; - unsigned image_nr; + struct util_surfaces surfaces; + struct util_dirty_surfaces dirty_surfaces; }; -static INLINE -struct nvfx_resource *nvfx_resource(struct pipe_resource *resource) -{ - return (struct nvfx_resource *)resource; -} +struct nvfx_surface { + struct util_dirty_surface base; + unsigned pitch; + + struct nvfx_miptree* temp; +}; static INLINE struct nouveau_bo * nvfx_surface_buffer(struct pipe_surface *surf) @@ -48,6 +86,12 @@ nvfx_surface_buffer(struct pipe_surface *surf) return mt->bo; } +static INLINE struct util_dirty_surfaces* +nvfx_surface_get_dirty_surfaces(struct pipe_surface* surf) +{ + struct nvfx_miptree *mt = (struct nvfx_miptree *)surf->texture; + return &mt->dirty_surfaces; +} void nvfx_init_resource_functions(struct pipe_context *pipe); @@ -62,30 +106,118 @@ nvfx_screen_init_resource_functions(struct pipe_screen *pscreen); struct pipe_resource * nvfx_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *pt); +void +nvfx_miptree_destroy(struct pipe_screen *pscreen, + struct pipe_resource *presource); + struct pipe_resource * nvfx_miptree_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *template, struct winsys_handle *whandle); +void +nvfx_miptree_surface_del(struct pipe_surface *ps); + +struct pipe_surface * +nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags); + +/* only for miptrees, don't use for buffers */ + +/* NOTE: for swizzled 3D textures, this just returns the offset of the 
mipmap level */ +static inline unsigned +nvfx_subresource_offset(struct pipe_resource* pt, unsigned face, unsigned level, unsigned zslice) +{ + if(pt->target == PIPE_BUFFER) + return 0; + else + { + struct nvfx_miptree *mt = (struct nvfx_miptree *)pt; + + unsigned offset = mt->level_offset[level]; + if (pt->target == PIPE_TEXTURE_CUBE) + offset += mt->face_size * face; + else if (pt->target == PIPE_TEXTURE_3D && mt->linear_pitch) + offset += zslice * util_format_get_2d_size(pt->format, (mt->linear_pitch ? mt->linear_pitch : util_format_get_stride(pt->format, u_minify(pt->width0, level))), u_minify(pt->height0, level)); + return offset; + } +} + +static inline unsigned +nvfx_subresource_pitch(struct pipe_resource* pt, unsigned level) +{ + if(pt->target == PIPE_BUFFER) + return ((struct nvfx_resource*)pt)->bo->size; + else + { + struct nvfx_miptree *mt = (struct nvfx_miptree *)pt; + + if(mt->linear_pitch) + return mt->linear_pitch; + else + return util_format_get_stride(pt->format, u_minify(pt->width0, level)); + } +} + +void +nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf); + +void +nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf); + +struct nvfx_buffer +{ + struct nvfx_resource base; + uint8_t* data; + unsigned size; + + /* the range of data not yet uploaded to the GPU bo */ + unsigned dirty_begin; + unsigned dirty_end; + + /* whether all transfers were unsynchronized */ + boolean dirty_unsynchronized; + + /* whether it would have been profitable to upload + * the latest updated data to the GPU immediately */ + boolean last_update_static; + + /* how many bytes we need to draw before we deem + * the buffer to be static + */ + long long bytes_to_draw_until_static; +}; + +static inline struct nvfx_buffer* nvfx_buffer(struct pipe_resource* pr) +{ + return (struct nvfx_buffer*)pr; +} + +/* this is an heuristic to determine whether we are better off uploading the + * buffer to the GPU, or just continuing pushing it 
on the FIFO + */ +static inline boolean nvfx_buffer_seems_static(struct nvfx_buffer* buffer) +{ + return buffer->last_update_static + || buffer->bytes_to_draw_until_static < 0; +} + struct pipe_resource * nvfx_buffer_create(struct pipe_screen *pscreen, const struct pipe_resource *template); +void +nvfx_buffer_destroy(struct pipe_screen *pscreen, + struct pipe_resource *presource); + struct pipe_resource * nvfx_user_buffer_create(struct pipe_screen *screen, void *ptr, unsigned bytes, unsigned usage); - - void -nvfx_miptree_surface_del(struct pipe_surface *ps); - -struct pipe_surface * -nvfx_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_resource *pt, - unsigned face, unsigned level, unsigned zslice, - unsigned flags); - +nvfx_buffer_upload(struct nvfx_buffer* buffer); #endif diff --git a/src/gallium/drivers/nvfx/nvfx_screen.c b/src/gallium/drivers/nvfx/nvfx_screen.c index f2525ccb38f..65ca265d45c 100644 --- a/src/gallium/drivers/nvfx/nvfx_screen.c +++ b/src/gallium/drivers/nvfx/nvfx_screen.c @@ -8,23 +8,12 @@ #include "nvfx_context.h" #include "nvfx_screen.h" #include "nvfx_resource.h" +#include "nvfx_tex.h" #define NV30TCL_CHIPSET_3X_MASK 0x00000003 #define NV34TCL_CHIPSET_3X_MASK 0x00000010 #define NV35TCL_CHIPSET_3X_MASK 0x000001e0 -/* FIXME: It seems I should not include directly ../../winsys/drm/nouveau/drm/nouveau_drm_api.h -* to get the pointer to the context front buffer, so I copied nouveau_winsys here. -* nv30_screen_surface_format_supported() can then use it to enforce creating fbo -* with same number of bits everywhere. 
-*/ -struct nouveau_winsys { - struct pipe_winsys base; - - struct pipe_screen *pscreen; - - struct pipe_surface *front; -}; #define NV4X_GRCLASS4097_CHIPSETS 0x00000baf #define NV4X_GRCLASS4497_CHIPSETS 0x00005450 #define NV6X_GRCLASS4497_CHIPSETS 0x00000088 @@ -43,7 +32,7 @@ nvfx_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TWO_SIDED_STENCIL: return 1; case PIPE_CAP_GLSL: - return 0; + return 1; case PIPE_CAP_ANISOTROPIC_FILTER: return 1; case PIPE_CAP_POINT_SPRITE: @@ -162,77 +151,74 @@ nvfx_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_cap param) } static boolean -nvfx_screen_surface_format_supported(struct pipe_screen *pscreen, +nvfx_screen_is_format_supported(struct pipe_screen *pscreen, enum pipe_format format, enum pipe_texture_target target, unsigned sample_count, - unsigned tex_usage, unsigned geom_flags) + unsigned bind, unsigned geom_flags) { struct nvfx_screen *screen = nvfx_screen(pscreen); - struct pipe_surface *front = ((struct nouveau_winsys *) pscreen->winsys)->front; if (sample_count > 1) return FALSE; - if (tex_usage & PIPE_BIND_RENDER_TARGET) { + if (bind & PIPE_BIND_RENDER_TARGET) { switch (format) { case PIPE_FORMAT_B8G8R8A8_UNORM: case PIPE_FORMAT_B8G8R8X8_UNORM: case PIPE_FORMAT_B5G6R5_UNORM: - return TRUE; - default: break; + default: + return FALSE; } - } else - if (tex_usage & PIPE_BIND_DEPTH_STENCIL) { + } + + if (bind & PIPE_BIND_DEPTH_STENCIL) { switch (format) { case PIPE_FORMAT_S8_USCALED_Z24_UNORM: case PIPE_FORMAT_X8Z24_UNORM: - return TRUE; case PIPE_FORMAT_Z16_UNORM: - /* TODO: this nv30 limitation probably does not exist */ - if (!screen->is_nv4x && front) - return (front->format == PIPE_FORMAT_B5G6R5_UNORM); - return TRUE; - default: break; + default: + return FALSE; } - } else { - switch (format) { - if (tex_usage & PIPE_BIND_SAMPLER_VIEW) { - switch (format) { - case PIPE_FORMAT_DXT1_RGB: - case PIPE_FORMAT_DXT1_RGBA: - case PIPE_FORMAT_DXT3_RGBA: - case PIPE_FORMAT_DXT5_RGBA: - 
return util_format_s3tc_enabled; - default: - break; - } + } + + if (bind & PIPE_BIND_SAMPLER_VIEW) { + struct nvfx_texture_format* tf = &nvfx_texture_formats[format]; + if(util_format_is_s3tc(format) && !util_format_s3tc_enabled) + return FALSE; + + if(screen->is_nv4x) + { + if(tf->fmt[4] < 0) + return FALSE; } - case PIPE_FORMAT_B8G8R8A8_UNORM: - case PIPE_FORMAT_B8G8R8X8_UNORM: - case PIPE_FORMAT_B5G5R5A1_UNORM: - case PIPE_FORMAT_B4G4R4A4_UNORM: - case PIPE_FORMAT_B5G6R5_UNORM: - case PIPE_FORMAT_L8_UNORM: - case PIPE_FORMAT_A8_UNORM: - case PIPE_FORMAT_I8_UNORM: - case PIPE_FORMAT_L8A8_UNORM: - case PIPE_FORMAT_Z16_UNORM: - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - return TRUE; - /* TODO: does nv30 support this? */ - case PIPE_FORMAT_R16_SNORM: - return !!screen->is_nv4x; - default: - break; + else + { + if(tf->fmt[0] < 0) + return FALSE; } } - return FALSE; -} + // note that we do actually support everything through translate + if (bind & PIPE_BIND_VERTEX_BUFFER) { + unsigned type = nvfx_vertex_formats[format]; + if(!type) + return FALSE; + } + + if (bind & PIPE_BIND_INDEX_BUFFER) { + // 8-bit indices supported, but not in hardware index buffer + if(format != PIPE_FORMAT_R16_USCALED && format != PIPE_FORMAT_R32_USCALED) + return FALSE; + } + + if(bind & PIPE_BIND_STREAM_OUTPUT) + return FALSE; + return TRUE; +} static void nvfx_screen_destroy(struct pipe_screen *pscreen) @@ -245,7 +231,7 @@ nvfx_screen_destroy(struct pipe_screen *pscreen) nouveau_notifier_free(&screen->query); nouveau_notifier_free(&screen->sync); nouveau_grobj_free(&screen->eng3d); - nv04_surface_2d_takedown(&screen->eng2d); + nvfx_screen_surface_takedown(pscreen); nouveau_screen_fini(&screen->base); @@ -374,6 +360,14 @@ nvfx_screen_get_vertex_buffer_flags(struct nvfx_screen* screen) return vram_hack ? 
NOUVEAU_BO_VRAM : NOUVEAU_BO_GART; } +static void nvfx_channel_flush_notify(struct nouveau_channel* chan) +{ + struct nvfx_screen* screen = chan->user_private; + struct nvfx_context* nvfx = screen->cur_ctx; + if(nvfx) + nvfx->relocs_needed = NVFX_RELOCATE_ALL; +} + struct pipe_screen * nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) { @@ -395,12 +389,15 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) return NULL; } chan = screen->base.channel; + screen->cur_ctx = NULL; + chan->user_private = screen; + chan->flush_notify = nvfx_channel_flush_notify; pscreen->winsys = ws; pscreen->destroy = nvfx_screen_destroy; pscreen->get_param = nvfx_screen_get_param; pscreen->get_paramf = nvfx_screen_get_paramf; - pscreen->is_format_supported = nvfx_screen_surface_format_supported; + pscreen->is_format_supported = nvfx_screen_is_format_supported; pscreen->context_create = nvfx_create; switch (dev->chipset & 0xf0) { @@ -432,6 +429,11 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } screen->force_swtnl = debug_get_bool_option("NOUVEAU_SWTNL", FALSE); + screen->trace_draw = debug_get_bool_option("NVFX_TRACE_DRAW", FALSE); + + screen->buffer_allocation_cost = debug_get_num_option("NVFX_BUFFER_ALLOCATION_COST", 16384); + screen->inline_cost_per_hardware_cost = atof(debug_get_option("NVFX_INLINE_COST_PER_HARDWARE_COST", "1.0")); + screen->static_reuse_threshold = atof(debug_get_option("NVFX_STATIC_REUSE_THRESHOLD", "2.0")); screen->vertex_buffer_reloc_flags = nvfx_screen_get_vertex_buffer_flags(screen); @@ -451,8 +453,7 @@ nvfx_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev) } /* 2D engine setup */ - screen->eng2d = nv04_surface_2d_init(&screen->base); - screen->eng2d->buf = nvfx_surface_buffer; + nvfx_screen_surface_init(pscreen); /* Notifier for sync purposes */ ret = nouveau_notifier_alloc(chan, 0xbeef0301, 1, &screen->sync); diff --git a/src/gallium/drivers/nvfx/nvfx_screen.h 
b/src/gallium/drivers/nvfx/nvfx_screen.h index 5e1c3945aef..1b79235ae0d 100644 --- a/src/gallium/drivers/nvfx/nvfx_screen.h +++ b/src/gallium/drivers/nvfx/nvfx_screen.h @@ -1,11 +1,11 @@ #ifndef __NVFX_SCREEN_H__ #define __NVFX_SCREEN_H__ +#include "pipe/p_compiler.h" #include "util/u_double_list.h" #include "nouveau/nouveau_screen.h" -#include "nv04_surface_2d.h" -struct nvfx_context; +struct pipe_screen; struct nvfx_screen { struct nouveau_screen base; @@ -16,11 +16,11 @@ struct nvfx_screen { unsigned is_nv4x; /* either 0 or ~0 */ boolean force_swtnl; + boolean trace_draw; unsigned vertex_buffer_reloc_flags; unsigned index_buffer_reloc_flags; /* HW graphics objects */ - struct nv04_surface_2d *eng2d; struct nouveau_grobj *eng3d; struct nouveau_notifier *sync; @@ -32,6 +32,20 @@ struct nvfx_screen { /* Vtxprog resources */ struct nouveau_resource *vp_exec_heap; struct nouveau_resource *vp_data_heap; + + struct nv04_2d_context* eng2d; + + /* Once the amount of bytes drawn from the buffer reaches the updated size times this value, + * we will assume that the buffer will be drawn an huge number of times before the + * next modification + */ + float static_reuse_threshold; + + /* Cost of allocating a buffer in terms of the cost of copying a byte to an hardware buffer */ + unsigned buffer_allocation_cost; + + /* inline_cost/hardware_cost conversion ration */ + float inline_cost_per_hardware_cost; }; static INLINE struct nvfx_screen * @@ -40,4 +54,7 @@ nvfx_screen(struct pipe_screen *screen) return (struct nvfx_screen *)screen; } +int nvfx_screen_surface_init(struct pipe_screen *pscreen); +void nvfx_screen_surface_takedown(struct pipe_screen *pscreen); + #endif diff --git a/src/gallium/drivers/nvfx/nvfx_shader.h b/src/gallium/drivers/nvfx/nvfx_shader.h index 50830b39164..35006eec3d4 100644 --- a/src/gallium/drivers/nvfx/nvfx_shader.h +++ b/src/gallium/drivers/nvfx/nvfx_shader.h @@ -1,6 +1,12 @@ #ifndef __NVFX_SHADER_H__ #define __NVFX_SHADER_H__ +#include <stdint.h> + 
+#include "pipe/p_compiler.h" + +#define NVFX_SWZ_IDENTITY ((3 << 6) | (2 << 4) | (1 << 2) | (0 << 0)) + /* this will resolve to either the NV30 or the NV40 version * depending on the current hardware */ /* unusual, but very fast and compact method */ @@ -71,11 +77,58 @@ /* * Each fragment program opcode appears to be comprised of 4 32-bit values. * - * 0 - Opcode, output reg/mask, ATTRIB source - * 1 - Source 0 - * 2 - Source 1 - * 3 - Source 2 + * 0: OPDEST + * 0: program end + * 1-6: destination register + * 7: destination register is fp16?? (use for outputs) + * 8: set condition code + * 9: writemask x + * 10: writemask y + * 11: writemask z + * 12: writemask w + * 13-16: source attribute register number (e.g. COL0) + * 17-20: texture unit number + * 21: expand value on texture operation (x -> 2x - 1) + * 22-23: precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = s0.8 fixed (nv40-only)) + * 24-29: opcode + * 30: no destination + * 31: saturate + * 1 - SRC0 + * 0-17: see common source fields + * 18: execute if condition code less + * 19: execute if condition code equal + * 20: execute if condition code greater + * 21-22: condition code swizzle x source component + * 23-24: condition code swizzle y source component + * 25-26: condition code swizzle z source component + * 27-28: condition code swizzle w source component + * 29: source 0 absolute + * 30: always 0 in renouveau tests + * 31: always 0 in renouveau tests + * 2 - SRC1 + * 0-17: see common source fields + * 18: source 1 absolute + * 19-20: input precision 0 = fp32, 1 = fp16, 2 = s1.10 fixed, 3 = ??? + * 21-27: always 0 in renouveau tests + * 28-30: scale (0 = 1x, 1 = 2x, 2 = 4x, 3 = 8x, 4 = ???, 5, = 1/2, 6 = 1/4, 7 = 1/8) + * 31: opcode is branch + * 3 - SRC2 + * 0-17: see common source fields + * 18: source 2 absolute + * 19-29: address register displacement + * 30: use index register + * 31: disable perspective-correct interpolation? 
* +* Common fields of 0, 1, 2 - SRC + * 0-1: source register type (0 = temp, 1 = input, 2 = immediate, 3 = ???) + * 2-7: source temp register index + * 8: source register is fp16?? + * 9-10: source swizzle x source component + * 11-12: source swizzle y source component + * 13-14: source swizzle z source component + * 15-16: source swizzle w source component + * 17: negate + * There appears to be no special difference between result regs and temp regs. * result.color == R0.xyzw * result.depth == R1.z @@ -210,6 +263,7 @@ /* NV40 only fragment program opcodes */ #define NVFX_FP_OP_OPCODE_TXL_NV40 0x2F + /* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/ #define NV40_FP_OP_BRA_OPCODE_BRK 0x0 #define NV40_FP_OP_BRA_OPCODE_CAL 0x1 @@ -218,10 +272,11 @@ #define NV40_FP_OP_BRA_OPCODE_REP 0x4 #define NV40_FP_OP_BRA_OPCODE_RET 0x5 +#define NV40_FP_OP_OUT_NONE (1 << 30) #define NVFX_FP_OP_OUT_SAT (1 << 31) /* high order bits of SRC0 */ -#define NVFX_FP_OP_OUT_ABS (1 << 29) +#define NVFX_FP_OP_SRC0_ABS (1 << 29) #define NVFX_FP_OP_COND_SWZ_W_SHIFT 27 #define NVFX_FP_OP_COND_SWZ_W_MASK (3 << 27) #define NVFX_FP_OP_COND_SWZ_Z_SHIFT 25 @@ -254,6 +309,7 @@ #define NVFX_FP_OP_DST_SCALE_INV_2X 5 #define NVFX_FP_OP_DST_SCALE_INV_4X 6 #define NVFX_FP_OP_DST_SCALE_INV_8X 7 +#define NVFX_FP_OP_SRC1_ABS (1 << 18) /* SRC1 LOOP */ #define NV40_FP_OP_LOOP_INCR_SHIFT 19 @@ -263,13 +319,13 @@ #define NV40_FP_OP_LOOP_COUNT_SHIFT 2 #define NV40_FP_OP_LOOP_COUNT_MASK (0xFF << 2) -/* SRC1 IF */ -#define NV40_FP_OP_ELSE_ID_SHIFT 2 -#define NV40_FP_OP_ELSE_ID_MASK (0xFF << 2) +/* SRC1 IF: absolute offset in dwords */ +#define NV40_FP_OP_ELSE_OFFSET_SHIFT 0 +#define NV40_FP_OP_ELSE_OFFSET_MASK (0x7FFFFFFF << 0) /* SRC1 CAL */ -#define NV40_FP_OP_IADDR_SHIFT 2 -#define NV40_FP_OP_IADDR_MASK (0xFF << 2) +#define NV40_FP_OP_SUB_OFFSET_SHIFT 0 +#define NV40_FP_OP_SUB_OFFSET_MASK (0x7FFFFFFF << 0) /* SRC1 REP * I have no idea why there are 3 count values here.. 
but they @@ -283,9 +339,9 @@ #define NV40_FP_OP_REP_COUNT3_SHIFT 19 #define NV40_FP_OP_REP_COUNT3_MASK (0xFF << 19) -/* SRC2 REP/IF */ -#define NV40_FP_OP_END_ID_SHIFT 2 -#define NV40_FP_OP_END_ID_MASK (0xFF << 2) +/* SRC2 REP/IF: absolute offset in dwords */ +#define NV40_FP_OP_END_OFFSET_SHIFT 0 +#define NV40_FP_OP_END_OFFSET_MASK (0x7FFFFFFF << 0) /* high order bits of SRC2 */ #define NVFX_FP_OP_INDEX_INPUT (1 << 30) @@ -323,6 +379,7 @@ #define NVFXSR_INPUT 2 #define NVFXSR_TEMP 3 #define NVFXSR_CONST 4 +#define NVFXSR_RELOCATED 5 #define NVFX_COND_FL 0 #define NVFX_COND_LT 1 @@ -352,51 +409,88 @@ #define NVFX_SWZ_Z 2 #define NVFX_SWZ_W 3 -#define swz(s,x,y,z,w) nvfx_sr_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w) -#define neg(s) nvfx_sr_neg((s)) -#define abs(s) nvfx_sr_abs((s)) -#define scale(s,v) nvfx_sr_scale((s), NVFX_FP_OP_DST_SCALE_##v) +#define swz(s,x,y,z,w) nvfx_src_swz((s), NVFX_SWZ_##x, NVFX_SWZ_##y, NVFX_SWZ_##z, NVFX_SWZ_##w) +#define neg(s) nvfx_src_neg((s)) +#define abs(s) nvfx_src_abs((s)) -struct nvfx_sreg { - int type; - int index; +struct nvfx_reg { + uint8_t type; + uint32_t index; +}; - int dst_scale; +struct nvfx_src { + struct nvfx_reg reg; - int negate; - int abs; - int swz[4]; + /* src only */ + uint8_t negate : 1; + uint8_t abs : 1; + uint8_t swz[4]; +}; - int cc_update; - int cc_update_reg; - int cc_test; - int cc_test_reg; - int cc_swz[4]; +struct nvfx_insn +{ + uint8_t op; + char scale; + int8_t unit; + uint8_t mask; + uint8_t cc_swz[4]; + + uint8_t sat : 1; + uint8_t cc_update : 1; + uint8_t cc_update_reg : 1; + uint8_t cc_test : 3; + uint8_t cc_test_reg : 1; + + struct nvfx_reg dst; + struct nvfx_src src[3]; }; -static INLINE struct nvfx_sreg -nvfx_sr(int type, int index) +static INLINE struct nvfx_insn +nvfx_insn(boolean sat, unsigned op, int unit, struct nvfx_reg dst, unsigned mask, struct nvfx_src s0, struct nvfx_src s1, struct nvfx_src s2) { - struct nvfx_sreg temp = { - .type = type, - .index = index, - 
.dst_scale = 0, - .abs = 0, - .negate = 0, - .swz = { 0, 1, 2, 3 }, + struct nvfx_insn insn = { + .op = op, + .scale = 0, + .unit = unit, + .sat = sat, + .mask = mask, .cc_update = 0, .cc_update_reg = 0, .cc_test = NVFX_COND_TR, .cc_test_reg = 0, .cc_swz = { 0, 1, 2, 3 }, + .dst = dst, + .src = {s0, s1, s2} + }; + return insn; +} + +static INLINE struct nvfx_reg +nvfx_reg(int type, int index) +{ + struct nvfx_reg temp = { + .type = type, + .index = index, }; return temp; } -static INLINE struct nvfx_sreg -nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w) +static INLINE struct nvfx_src +nvfx_src(struct nvfx_reg reg) { - struct nvfx_sreg dst = src; + struct nvfx_src temp = { + .reg = reg, + .abs = 0, + .negate = 0, + .swz = { 0, 1, 2, 3 }, + }; + return temp; +} + +static INLINE struct nvfx_src +nvfx_src_swz(struct nvfx_src src, int x, int y, int z, int w) +{ + struct nvfx_src dst = src; dst.swz[NVFX_SWZ_X] = src.swz[x]; dst.swz[NVFX_SWZ_Y] = src.swz[y]; @@ -405,25 +499,23 @@ nvfx_sr_swz(struct nvfx_sreg src, int x, int y, int z, int w) return dst; } -static INLINE struct nvfx_sreg -nvfx_sr_neg(struct nvfx_sreg src) +static INLINE struct nvfx_src +nvfx_src_neg(struct nvfx_src src) { src.negate = !src.negate; return src; } -static INLINE struct nvfx_sreg -nvfx_sr_abs(struct nvfx_sreg src) +static INLINE struct nvfx_src +nvfx_src_abs(struct nvfx_src src) { src.abs = 1; return src; } -static INLINE struct nvfx_sreg -nvfx_sr_scale(struct nvfx_sreg src, int scale) -{ - src.dst_scale = scale; - return src; -} +struct nvfx_relocation { + unsigned location; + unsigned target; +}; #endif diff --git a/src/gallium/drivers/nvfx/nvfx_state.c b/src/gallium/drivers/nvfx/nvfx_state.c index cd58e439d71..5bd7dc07f02 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.c +++ b/src/gallium/drivers/nvfx/nvfx_state.c @@ -1,6 +1,7 @@ #include "pipe/p_state.h" #include "pipe/p_defines.h" #include "util/u_inlines.h" +#include "util/u_framebuffer.h" #include "draw/draw_context.h" @@ 
-81,111 +82,6 @@ nvfx_blend_state_delete(struct pipe_context *pipe, void *hwcso) } static void * -nvfx_sampler_state_create(struct pipe_context *pipe, - const struct pipe_sampler_state *cso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_sampler_state *ps; - - ps = MALLOC(sizeof(struct nvfx_sampler_state)); - - /* on nv30, we use this as an internal flag */ - ps->fmt = cso->normalized_coords ? 0 : NV40TCL_TEX_FORMAT_RECT; - ps->en = 0; - ps->filt = nvfx_tex_filter(cso); - ps->wrap = (nvfx_tex_wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) | - (nvfx_tex_wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) | - (nvfx_tex_wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT) | - nvfx_tex_wrap_compare_mode(cso); - ps->bcol = nvfx_tex_border_color(cso->border_color); - - if(nvfx->is_nv4x) - nv40_sampler_state_init(pipe, ps, cso); - else - nv30_sampler_state_init(pipe, ps, cso); - - return (void *)ps; -} - -static void -nvfx_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - unsigned unit; - - for (unit = 0; unit < nr; unit++) { - nvfx->tex_sampler[unit] = sampler[unit]; - nvfx->dirty_samplers |= (1 << unit); - } - - for (unit = nr; unit < nvfx->nr_samplers; unit++) { - nvfx->tex_sampler[unit] = NULL; - nvfx->dirty_samplers |= (1 << unit); - } - - nvfx->nr_samplers = nr; - nvfx->dirty |= NVFX_NEW_SAMPLER; -} - -static void -nvfx_sampler_state_delete(struct pipe_context *pipe, void *hwcso) -{ - FREE(hwcso); -} - -static void -nvfx_set_fragment_sampler_views(struct pipe_context *pipe, - unsigned nr, - struct pipe_sampler_view **views) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - unsigned unit; - - for (unit = 0; unit < nr; unit++) { - pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit], - views[unit]); - nvfx->dirty_samplers |= (1 << unit); - } - - for (unit = nr; unit < nvfx->nr_textures; unit++) { - 
pipe_sampler_view_reference(&nvfx->fragment_sampler_views[unit], - NULL); - nvfx->dirty_samplers |= (1 << unit); - } - - nvfx->nr_textures = nr; - nvfx->dirty |= NVFX_NEW_SAMPLER; -} - - -static struct pipe_sampler_view * -nvfx_create_sampler_view(struct pipe_context *pipe, - struct pipe_resource *texture, - const struct pipe_sampler_view *templ) -{ - struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); - - if (view) { - *view = *templ; - view->reference.count = 1; - view->texture = NULL; - pipe_resource_reference(&view->texture, texture); - view->context = pipe; - } - - return view; -} - - -static void -nvfx_sampler_view_destroy(struct pipe_context *pipe, - struct pipe_sampler_view *view) -{ - pipe_resource_reference(&view->texture, NULL); - FREE(view); -} - -static void * nvfx_rasterizer_state_create(struct pipe_context *pipe, const struct pipe_rasterizer_state *cso) { @@ -195,6 +91,7 @@ nvfx_rasterizer_state_create(struct pipe_context *pipe, /*XXX: ignored: * point_smooth -nohw * multisample + * sprite_coord_origin */ sb_method(sb, NV34TCL_SHADE_MODEL, 1); @@ -254,19 +151,8 @@ nvfx_rasterizer_state_create(struct pipe_context *pipe, sb_data(sb, fui(cso->offset_units * 2)); } - sb_method(sb, NV34TCL_POINT_SPRITE, 1); - if (cso->point_quad_rasterization) { - unsigned psctl = (1 << 0), i; - - for (i = 0; i < 8; i++) { - if ((cso->sprite_coord_enable >> i) & 1) - psctl |= (1 << (8 + i)); - } - - sb_data(sb, psctl); - } else { - sb_data(sb, 0); - } + sb_method(sb, NV34TCL_FLATSHADE_FIRST, 1); + sb_data(sb, cso->flatshade_first); rsso->pipe = *cso; rsso->sb_len = sb_len(sb, rsso->sb); @@ -287,11 +173,11 @@ nvfx_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) nvfx->draw_dirty |= NVFX_NEW_SCISSOR; } - if(((struct nvfx_rasterizer_state*)hwcso)->pipe.poly_stipple_enable - != nvfx->rasterizer->pipe.poly_stipple_enable) + if(((struct nvfx_rasterizer_state*)hwcso)->pipe.point_quad_rasterization != nvfx->rasterizer->pipe.point_quad_rasterization + 
|| ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_enable != nvfx->rasterizer->pipe.sprite_coord_enable + || ((struct nvfx_rasterizer_state*)hwcso)->pipe.sprite_coord_mode != nvfx->rasterizer->pipe.sprite_coord_mode) { - nvfx->dirty |= NVFX_NEW_STIPPLE; - nvfx->draw_dirty |= NVFX_NEW_STIPPLE; + nvfx->dirty |= NVFX_NEW_SPRITE; } } @@ -315,10 +201,8 @@ nvfx_depth_stencil_alpha_state_create(struct pipe_context *pipe, struct nvfx_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); struct nouveau_statebuf_builder sb = sb_init(zsaso->sb); - sb_method(sb, NV34TCL_DEPTH_FUNC, 3); + sb_method(sb, NV34TCL_DEPTH_FUNC, 1); sb_data (sb, nvgl_comparison_op(cso->depth.func)); - sb_data (sb, cso->depth.writemask ? 1 : 0); - sb_data (sb, cso->depth.enabled ? 1 : 0); sb_method(sb, NV34TCL_ALPHA_FUNC_ENABLE, 3); sb_data (sb, cso->alpha.enabled ? 1 : 0); @@ -377,76 +261,6 @@ nvfx_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) FREE(zsaso); } -static void * -nvfx_vp_state_create(struct pipe_context *pipe, - const struct pipe_shader_state *cso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_vertex_program *vp; - - vp = CALLOC(1, sizeof(struct nvfx_vertex_program)); - vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); - vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe); - - return (void *)vp; -} - -static void -nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - nvfx->vertprog = hwcso; - nvfx->dirty |= NVFX_NEW_VERTPROG; - nvfx->draw_dirty |= NVFX_NEW_VERTPROG; -} - -static void -nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_vertex_program *vp = hwcso; - - draw_delete_vertex_shader(nvfx->draw, vp->draw); - nvfx_vertprog_destroy(nvfx, vp); - FREE((void*)vp->pipe.tokens); - FREE(vp); -} - -static void * -nvfx_fp_state_create(struct pipe_context *pipe, - const struct pipe_shader_state *cso) 
-{ - struct nvfx_fragment_program *fp; - - fp = CALLOC(1, sizeof(struct nvfx_fragment_program)); - fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); - - tgsi_scan_shader(fp->pipe.tokens, &fp->info); - - return (void *)fp; -} - -static void -nvfx_fp_state_bind(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - nvfx->fragprog = hwcso; - nvfx->dirty |= NVFX_NEW_FRAGPROG; -} - -static void -nvfx_fp_state_delete(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_fragment_program *fp = hwcso; - - nvfx_fragprog_destroy(nvfx, fp); - FREE((void*)fp->pipe.tokens); - FREE(fp); -} - static void nvfx_set_blend_color(struct pipe_context *pipe, const struct pipe_blend_color *bcol) @@ -507,7 +321,10 @@ nvfx_set_framebuffer_state(struct pipe_context *pipe, { struct nvfx_context *nvfx = nvfx_context(pipe); - nvfx->framebuffer = *fb; + if(fb) + util_copy_framebuffer_state(&nvfx->framebuffer, fb); + else + util_unreference_framebuffer_state(&nvfx->framebuffer); nvfx->dirty |= NVFX_NEW_FB; } @@ -542,65 +359,6 @@ nvfx_set_viewport_state(struct pipe_context *pipe, nvfx->draw_dirty |= NVFX_NEW_VIEWPORT; } -static void -nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count, - const struct pipe_vertex_buffer *vb) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - memcpy(nvfx->vtxbuf, vb, sizeof(*vb) * count); - nvfx->vtxbuf_nr = count; - - nvfx->dirty |= NVFX_NEW_ARRAYS; - nvfx->draw_dirty |= NVFX_NEW_ARRAYS; -} - -static void -nvfx_set_index_buffer(struct pipe_context *pipe, - const struct pipe_index_buffer *ib) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - if (ib) - memcpy(&nvfx->idxbuf, ib, sizeof(nvfx->idxbuf)); - else - memset(&nvfx->idxbuf, 0, sizeof(nvfx->idxbuf)); - - /* TODO make this more like a state */ -} - -static void * -nvfx_vtxelts_state_create(struct pipe_context *pipe, - unsigned num_elements, - const struct pipe_vertex_element *elements) -{ - 
struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state); - - assert(num_elements < 16); /* not doing fallbacks yet */ - cso->num_elements = num_elements; - memcpy(cso->pipe, elements, num_elements * sizeof(*elements)); - -/* nvfx_vtxelt_construct(cso);*/ - - return (void *)cso; -} - -static void -nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso) -{ - FREE(hwcso); -} - -static void -nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso) -{ - struct nvfx_context *nvfx = nvfx_context(pipe); - - nvfx->vtxelt = hwcso; - nvfx->dirty |= NVFX_NEW_ARRAYS; - /*nvfx->draw_dirty |= NVFX_NEW_ARRAYS;*/ -} - void nvfx_init_state_functions(struct nvfx_context *nvfx) { @@ -608,13 +366,6 @@ nvfx_init_state_functions(struct nvfx_context *nvfx) nvfx->pipe.bind_blend_state = nvfx_blend_state_bind; nvfx->pipe.delete_blend_state = nvfx_blend_state_delete; - nvfx->pipe.create_sampler_state = nvfx_sampler_state_create; - nvfx->pipe.bind_fragment_sampler_states = nvfx_sampler_state_bind; - nvfx->pipe.delete_sampler_state = nvfx_sampler_state_delete; - nvfx->pipe.set_fragment_sampler_views = nvfx_set_fragment_sampler_views; - nvfx->pipe.create_sampler_view = nvfx_create_sampler_view; - nvfx->pipe.sampler_view_destroy = nvfx_sampler_view_destroy; - nvfx->pipe.create_rasterizer_state = nvfx_rasterizer_state_create; nvfx->pipe.bind_rasterizer_state = nvfx_rasterizer_state_bind; nvfx->pipe.delete_rasterizer_state = nvfx_rasterizer_state_delete; @@ -626,14 +377,6 @@ nvfx_init_state_functions(struct nvfx_context *nvfx) nvfx->pipe.delete_depth_stencil_alpha_state = nvfx_depth_stencil_alpha_state_delete; - nvfx->pipe.create_vs_state = nvfx_vp_state_create; - nvfx->pipe.bind_vs_state = nvfx_vp_state_bind; - nvfx->pipe.delete_vs_state = nvfx_vp_state_delete; - - nvfx->pipe.create_fs_state = nvfx_fp_state_create; - nvfx->pipe.bind_fs_state = nvfx_fp_state_bind; - nvfx->pipe.delete_fs_state = nvfx_fp_state_delete; - nvfx->pipe.set_blend_color = nvfx_set_blend_color; 
nvfx->pipe.set_stencil_ref = nvfx_set_stencil_ref; nvfx->pipe.set_clip_state = nvfx_set_clip_state; @@ -643,11 +386,4 @@ nvfx_init_state_functions(struct nvfx_context *nvfx) nvfx->pipe.set_polygon_stipple = nvfx_set_polygon_stipple; nvfx->pipe.set_scissor_state = nvfx_set_scissor_state; nvfx->pipe.set_viewport_state = nvfx_set_viewport_state; - - nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create; - nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete; - nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind; - - nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers; - nvfx->pipe.set_index_buffer = nvfx_set_index_buffer; } diff --git a/src/gallium/drivers/nvfx/nvfx_state.h b/src/gallium/drivers/nvfx/nvfx_state.h index 9ceb2577ecc..e9c1f2c26d2 100644 --- a/src/gallium/drivers/nvfx/nvfx_state.h +++ b/src/gallium/drivers/nvfx/nvfx_state.h @@ -4,11 +4,11 @@ #include "pipe/p_state.h" #include "tgsi/tgsi_scan.h" #include "nouveau/nouveau_statebuf.h" +#include "util/u_dynarray.h" +#include "util/u_linkage.h" struct nvfx_vertex_program_exec { uint32_t data[4]; - boolean has_branch_offset; - int const_index; }; struct nvfx_vertex_program_data { @@ -18,18 +18,20 @@ struct nvfx_vertex_program_data { struct nvfx_vertex_program { struct pipe_shader_state pipe; + unsigned long long id; struct draw_vertex_shader *draw; boolean translated; - struct pipe_clip_state ucp; - struct nvfx_vertex_program_exec *insns; unsigned nr_insns; struct nvfx_vertex_program_data *consts; unsigned nr_consts; + char generic_to_fp_input[256]; + int sprite_fp_input; + struct nouveau_resource *exec; unsigned exec_start; struct nouveau_resource *data; @@ -38,7 +40,10 @@ struct nvfx_vertex_program { uint32_t ir; uint32_t or; - uint32_t clip_ctrl; + int clip_nr; + + struct util_dynarray branch_relocs; + struct util_dynarray const_relocs; }; struct nvfx_fragment_program_data { @@ -49,15 +54,14 @@ struct nvfx_fragment_program_data { struct nvfx_fragment_program_bo { 
struct nvfx_fragment_program_bo* next; struct nouveau_bo* bo; + unsigned char* slots; char insn[] __attribute__((aligned(16))); }; struct nvfx_fragment_program { - struct pipe_shader_state pipe; - struct tgsi_shader_info info; - - boolean translated; unsigned samplers; + unsigned point_sprite_control; + unsigned or; uint32_t *insn; int insn_len; @@ -65,13 +69,36 @@ struct nvfx_fragment_program { struct nvfx_fragment_program_data *consts; unsigned nr_consts; + /* the slot at num_slots is for the sprite coordinate, if any */ + unsigned num_slots; /* how many input semantics? */ + unsigned char slot_to_generic[10]; /* semantics */ + unsigned char slot_to_fp_input[11]; /* current assignment of slots for each used semantic */ + struct util_dynarray slot_relocations[11]; + + /* This is reset to progs on any relocation update, and decreases every time we + * move to a new prog due to a constant update + * When this is the same as progs, applying relocations is no longer necessary. + */ + unsigned progs_left_with_obsolete_slot_assignments; + + unsigned long long last_vp_id; + unsigned last_sprite_coord_enable; + uint32_t fp_control; unsigned bo_prog_idx; unsigned prog_size; unsigned progs_per_bo; + unsigned progs; + struct nvfx_fragment_program_bo* fpbo; }; +struct nvfx_pipe_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + struct nvfx_fragment_program* fps[2]; +}; #endif diff --git a/src/gallium/drivers/nvfx/nvfx_state_emit.c b/src/gallium/drivers/nvfx/nvfx_state_emit.c index f91ae19ecd3..390bca8cdb5 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_emit.c +++ b/src/gallium/drivers/nvfx/nvfx_state_emit.c @@ -1,15 +1,54 @@ #include "nvfx_context.h" #include "nvfx_state.h" +#include "nvfx_resource.h" #include "draw/draw_context.h" static boolean nvfx_state_validate_common(struct nvfx_context *nvfx) { struct nouveau_channel* chan = nvfx->screen->base.channel; - unsigned dirty = nvfx->dirty; + unsigned dirty; + unsigned still_dirty = 0; + 
int all_swizzled = -1; + boolean flush_tex_cache = FALSE; + unsigned render_temps; if(nvfx != nvfx->screen->cur_ctx) - dirty = ~0; + { + nvfx->dirty = ~0; + nvfx->hw_vtxelt_nr = 16; + nvfx->hw_pointsprite_control = -1; + nvfx->hw_vp_output = -1; + nvfx->screen->cur_ctx = nvfx; + nvfx->relocs_needed = NVFX_RELOCATE_ALL; + } + + /* These can trigger use the of 3D engine to copy temporaries. + * That will recurse here and thus dirty all 3D state, so we need to this before anything else, and in a loop.. + * This converges to having clean temps, then binding both fragtexes and framebuffers. + */ + while(nvfx->dirty & (NVFX_NEW_FB | NVFX_NEW_SAMPLER)) + { + if(nvfx->dirty & NVFX_NEW_SAMPLER) + { + nvfx->dirty &=~ NVFX_NEW_SAMPLER; + nvfx_fragtex_validate(nvfx); + + // TODO: only set this if really necessary + flush_tex_cache = TRUE; + } + + if(nvfx->dirty & NVFX_NEW_FB) + { + nvfx->dirty &=~ NVFX_NEW_FB; + all_swizzled = nvfx_framebuffer_prepare(nvfx); + + // TODO: make sure this doesn't happen, i.e. 
fbs have matching formats + assert(all_swizzled >= 0); + } + } + + dirty = nvfx->dirty; if(nvfx->render_mode == HW) { @@ -19,11 +58,19 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) return FALSE; } - if(dirty & (NVFX_NEW_ARRAYS)) + if(dirty & NVFX_NEW_ARRAYS) { if(!nvfx_vbo_validate(nvfx)) return FALSE; } + + if(dirty & NVFX_NEW_INDEX) + { + if(nvfx->use_index_buffer) + nvfx_idxbuf_validate(nvfx); + else + still_dirty = NVFX_NEW_INDEX; + } } else { @@ -31,13 +78,10 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) if(dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP)) nvfx_vertprog_validate(nvfx); - if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_FRAGPROG)) + if(dirty & (NVFX_NEW_ARRAYS | NVFX_NEW_INDEX | NVFX_NEW_FRAGPROG)) nvfx_vtxfmt_validate(nvfx); } - if(dirty & NVFX_NEW_FB) - nvfx_state_framebuffer_validate(nvfx); - if(dirty & NVFX_NEW_RAST) sb_emit(chan, nvfx->rasterizer->sb, nvfx->rasterizer->sb_len); @@ -47,11 +91,97 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) if(dirty & NVFX_NEW_STIPPLE) nvfx_state_stipple_validate(nvfx); - if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST)) + if(nvfx->dirty & NVFX_NEW_UCP) + { + unsigned enables[7] = + { + 0, + NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0, + NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1, + NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2, + NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3, + NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4, + NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 | NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4 | 
NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5, + }; + + if(!nvfx->use_vp_clipping) + { + WAIT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1)); + OUT_RING(chan, 0); + + WAIT_RING(chan, 6 * 4 + 1); + OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANE_A(0), nvfx->clip.nr * 4)); + OUT_RINGp(chan, &nvfx->clip.ucp[0][0], nvfx->clip.nr * 4); + } + + WAIT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1)); + OUT_RING(chan, enables[nvfx->clip.nr]); + } + + if(nvfx->use_vp_clipping && (nvfx->dirty & (NVFX_NEW_UCP | NVFX_NEW_VERTPROG))) + { + unsigned i; + struct nvfx_vertex_program* vp = nvfx->vertprog; + if(nvfx->clip.nr != vp->clip_nr) + { + unsigned idx; + WAIT_RING(chan, 14); + + /* remove last instruction bit */ + if(vp->clip_nr >= 0) + { + idx = vp->nr_insns - 7 + vp->clip_nr; + OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1)); + OUT_RING(chan, vp->exec->start + idx); + OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4)); + OUT_RINGp (chan, vp->insns[idx].data, 4); + } + + /* set last instruction bit */ + idx = vp->nr_insns - 7 + nvfx->clip.nr; + OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_FROM_ID, 1)); + OUT_RING(chan, vp->exec->start + idx); + OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_INST(0), 4)); + OUT_RINGp(chan, vp->insns[idx].data, 3); + OUT_RING(chan, vp->insns[idx].data[3] | 1); + vp->clip_nr = nvfx->clip.nr; + } + + // TODO: only do this for the ones changed + WAIT_RING(chan, 6 * 6); + for(i = 0; i < nvfx->clip.nr; ++i) + { + OUT_RING(chan, RING_3D(NV34TCL_VP_UPLOAD_CONST_ID, 5)); + OUT_RING(chan, vp->data->start + i); + OUT_RINGp (chan, nvfx->clip.ucp[i], 4); + } + } + + if(dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_FRAGCONST | NVFX_NEW_VERTPROG | NVFX_NEW_SPRITE)) + { nvfx_fragprog_validate(nvfx); + if(dirty & NVFX_NEW_FRAGPROG) + flush_tex_cache = TRUE; // TODO: do we need this? 
+ } - if(dirty & NVFX_NEW_SAMPLER) - nvfx_fragtex_validate(nvfx); + if(nvfx->is_nv4x) + { + unsigned vp_output = nvfx->vertprog->or | nvfx->hw_fragprog->or; + vp_output |= (1 << (nvfx->clip.nr + 6)) - (1 << 6); + + if(vp_output != nvfx->hw_vp_output) + { + WAIT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV40TCL_VP_RESULT_EN, 1)); + OUT_RING(chan, vp_output); + nvfx->hw_vp_output = vp_output; + } + } + + if(all_swizzled >= 0) + nvfx_framebuffer_validate(nvfx, all_swizzled); if(dirty & NVFX_NEW_BLEND) sb_emit(chan, nvfx->blend->sb, nvfx->blend->sb_len); @@ -65,31 +195,62 @@ nvfx_state_validate_common(struct nvfx_context *nvfx) if(dirty & NVFX_NEW_SR) nvfx_state_sr_validate(nvfx); -/* Having this depend on FB looks wrong, but it seems - necessary to make this work on nv3x +/* All these dependencies are wrong, but otherwise + etracer, neverball, foobillard, glest totally misrender TODO: find the right fix */ - if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_FB)) + if(dirty & (NVFX_NEW_VIEWPORT | NVFX_NEW_RAST | NVFX_NEW_ZSA) || (all_swizzled >= 0)) + { nvfx_state_viewport_validate(nvfx); + } + + if(dirty & NVFX_NEW_ZSA || (all_swizzled >= 0)) + { + WAIT_RING(chan, 3); + OUT_RING(chan, RING_3D(NV34TCL_DEPTH_WRITE_ENABLE, 2)); + OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.writemask); + OUT_RING(chan, nvfx->framebuffer.zsbuf && nvfx->zsa->pipe.depth.enabled); + } - /* TODO: could nv30 need this or something similar too? */ - if((dirty & (NVFX_NEW_FRAGPROG | NVFX_NEW_SAMPLER)) && nvfx->is_nv4x) { - WAIT_RING(chan, 4); - OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1)); - OUT_RING(chan, 2); - OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1)); - OUT_RING(chan, 1); + if(flush_tex_cache) + { + // TODO: what about nv30? 
+ if(nvfx->is_nv4x) + { + WAIT_RING(chan, 4); + OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1)); + OUT_RING(chan, 2); + OUT_RING(chan, RING_3D(NV40TCL_TEX_CACHE_CTL, 1)); + OUT_RING(chan, 1); + } } - nvfx->dirty = 0; + + nvfx->dirty = dirty & still_dirty; + + render_temps = nvfx->state.render_temps; + if(render_temps) + { + for(int i = 0; i < nvfx->framebuffer.nr_cbufs; ++i) + { + if(render_temps & (1 << i)) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.cbufs[i]), + (struct util_dirty_surface*)nvfx->framebuffer.cbufs[i]); + } + + if(render_temps & 0x80) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(nvfx->framebuffer.zsbuf), + (struct util_dirty_surface*)nvfx->framebuffer.zsbuf); + } + return TRUE; } -void -nvfx_state_emit(struct nvfx_context *nvfx) +inline void +nvfx_state_relocate(struct nvfx_context *nvfx, unsigned relocs) { struct nouveau_channel* chan = nvfx->screen->base.channel; /* we need to ensure there is enough space to output relocations in one go */ - unsigned max_relocs = 0 + const unsigned max_relocs = 0 + 16 /* vertex buffers, incl. 
dma flag */ + 2 /* index buffer plus format+dma flag */ + 2 * 5 /* 4 cbufs + zsbuf, plus dma objects */ @@ -97,18 +258,19 @@ nvfx_state_emit(struct nvfx_context *nvfx) + 2 * 4 /* vertex textures plus format+dma flag */ + 1 /* fragprog incl dma flag */ ; + MARK_RING(chan, max_relocs * 2, max_relocs * 2); - nvfx_state_relocate(nvfx); -} -void -nvfx_state_relocate(struct nvfx_context *nvfx) -{ - nvfx_framebuffer_relocate(nvfx); - nvfx_fragtex_relocate(nvfx); - nvfx_fragprog_relocate(nvfx); - if (nvfx->render_mode == HW) + if(relocs & NVFX_RELOCATE_FRAMEBUFFER) + nvfx_framebuffer_relocate(nvfx); + if(relocs & NVFX_RELOCATE_FRAGTEX) + nvfx_fragtex_relocate(nvfx); + if(relocs & NVFX_RELOCATE_FRAGPROG) + nvfx_fragprog_relocate(nvfx); + if(relocs & NVFX_RELOCATE_VTXBUF) nvfx_vbo_relocate(nvfx); + if(relocs & NVFX_RELOCATE_IDXBUF) + nvfx_idxbuf_relocate(nvfx); } boolean @@ -173,6 +335,9 @@ nvfx_state_validate_swtnl(struct nvfx_context *nvfx) draw_set_vertex_elements(draw, nvfx->vtxelt->num_elements, nvfx->vtxelt->pipe); } + if (nvfx->draw_dirty & NVFX_NEW_INDEX) + draw_set_index_buffer(draw, &nvfx->idxbuf); + nvfx_state_validate_common(nvfx); nvfx->draw_dirty = 0; diff --git a/src/gallium/drivers/nvfx/nvfx_state_fb.c b/src/gallium/drivers/nvfx/nvfx_state_fb.c index 360e569f77c..3b869d43a15 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_fb.c +++ b/src/gallium/drivers/nvfx/nvfx_state_fb.c @@ -1,21 +1,55 @@ #include "nvfx_context.h" #include "nvfx_resource.h" -#include "nouveau/nouveau_util.h" +#include "util/u_format.h" +static inline boolean +nvfx_surface_linear_renderable(struct pipe_surface* surf) +{ + return (surf->texture->flags & NVFX_RESOURCE_FLAG_LINEAR) + && !(surf->offset & 63) + && !(((struct nvfx_surface*)surf)->pitch & 63); +} +static inline boolean +nvfx_surface_swizzled_renderable(struct pipe_framebuffer_state* fb, struct pipe_surface* surf) +{ + /* TODO: return FALSE if we have a format not supporting swizzled rendering (e.g. 
r8); currently those are not supported at all */ + return !((struct nvfx_miptree*)surf->texture)->linear_pitch + && (surf->texture->target != PIPE_TEXTURE_3D || u_minify(surf->texture->depth0, surf->level) <= 1) + && !(surf->offset & 127) + && (surf->width == fb->width) + && (surf->height == fb->height) + && !((struct nvfx_surface*)surf)->temp; +} -void -nvfx_state_framebuffer_validate(struct nvfx_context *nvfx) +static boolean +nvfx_surface_get_render_target(struct pipe_surface* surf, int all_swizzled, struct nvfx_render_target* target) +{ + struct nvfx_surface* ns = (struct nvfx_surface*)surf; + if(!ns->temp) + { + target->bo = ((struct nvfx_miptree*)surf->texture)->base.bo; + target->offset = surf->offset; + target->pitch = align(ns->pitch, 64); + assert(target->pitch); + return FALSE; + } + else + { + target->offset = 0; + target->pitch = ns->temp->linear_pitch; + target->bo = ns->temp->base.bo; + assert(target->pitch); + return TRUE; + } +} + +int +nvfx_framebuffer_prepare(struct nvfx_context *nvfx) { struct pipe_framebuffer_state *fb = &nvfx->framebuffer; - struct nouveau_channel *chan = nvfx->screen->base.channel; - uint32_t rt_enable = 0, rt_format = 0; - int i, colour_format = 0, zeta_format = 0; - int depth_only = 0; - unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; - unsigned w = fb->width; - unsigned h = fb->height; - int colour_bits = 32, zeta_bits = 32; + int i, color_format = 0, zeta_format = 0; + int all_swizzled = 1; if(!nvfx->is_nv4x) assert(fb->nr_cbufs <= 2); @@ -23,113 +57,135 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx) assert(fb->nr_cbufs <= 4); for (i = 0; i < fb->nr_cbufs; i++) { - if (colour_format) - assert(colour_format == fb->cbufs[i]->format); - else - colour_format = fb->cbufs[i]->format; - - rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i); - nvfx->hw_rt[i].bo = nvfx_surface_buffer(fb->cbufs[i]); - nvfx->hw_rt[i].offset = fb->cbufs[i]->offset; - nvfx->hw_rt[i].pitch = ((struct nv04_surface *)fb->cbufs[i])->pitch; 
+ if (color_format) { + if(color_format != fb->cbufs[i]->format) + return -1; + } else + color_format = fb->cbufs[i]->format; + + if(!nvfx_surface_swizzled_renderable(fb, fb->cbufs[i])) + all_swizzled = 0; } - for(; i < 4; ++i) - nvfx->hw_rt[i].bo = 0; + if (fb->zsbuf) { + /* TODO: return FALSE if we have a format not supporting a depth buffer (e.g. r8); currently those are not supported at all */ + if(!nvfx_surface_swizzled_renderable(fb, fb->zsbuf)) + all_swizzled = 0; + + if(all_swizzled && util_format_get_blocksize(color_format) != util_format_get_blocksize(zeta_format)) + all_swizzled = 0; + } + + for (i = 0; i < fb->nr_cbufs; i++) { + if(!((struct nvfx_surface*)fb->cbufs[i])->temp && !all_swizzled && !nvfx_surface_linear_renderable(fb->cbufs[i])) + nvfx_surface_create_temp(&nvfx->pipe, fb->cbufs[i]); + } + + if(fb->zsbuf) { + if(!((struct nvfx_surface*)fb->zsbuf)->temp && !all_swizzled && !nvfx_surface_linear_renderable(fb->zsbuf)) + nvfx_surface_create_temp(&nvfx->pipe, fb->zsbuf); + } + + return all_swizzled; +} + +void +nvfx_framebuffer_validate(struct nvfx_context *nvfx, unsigned prepare_result) +{ + struct pipe_framebuffer_state *fb = &nvfx->framebuffer; + struct nouveau_channel *chan = nvfx->screen->base.channel; + uint32_t rt_enable, rt_format; + int i; + unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; + unsigned w = fb->width; + unsigned h = fb->height; + + rt_enable = (NV34TCL_RT_ENABLE_COLOR0 << fb->nr_cbufs) - 1; if (rt_enable & (NV34TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) rt_enable |= NV34TCL_RT_ENABLE_MRT; + nvfx->state.render_temps = 0; + + for (i = 0; i < fb->nr_cbufs; i++) + nvfx->state.render_temps |= nvfx_surface_get_render_target(fb->cbufs[i], prepare_result, &nvfx->hw_rt[i]) << i; + + for(; i < 4; ++i) + nvfx->hw_rt[i].bo = 0; + if (fb->zsbuf) { - zeta_format = fb->zsbuf->format; - nvfx->hw_zeta.bo = nvfx_surface_buffer(fb->zsbuf); - nvfx->hw_zeta.offset = fb->zsbuf->offset; - nvfx->hw_zeta.pitch 
= ((struct nv04_surface *)fb->zsbuf)->pitch; - } - else - nvfx->hw_zeta.bo = 0; - - if (rt_enable & (NV34TCL_RT_ENABLE_COLOR0 | NV34TCL_RT_ENABLE_COLOR1 | - NV40TCL_RT_ENABLE_COLOR2 | NV40TCL_RT_ENABLE_COLOR3)) { - /* Render to at least a colour buffer */ - if (!(fb->cbufs[0]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) { - assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); - for (i = 1; i < fb->nr_cbufs; i++) - assert(!(fb->cbufs[i]->texture->flags & NVFX_RESOURCE_FLAG_LINEAR)); - - rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | - (log2i(fb->cbufs[0]->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | - (log2i(fb->cbufs[0]->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); - } - else - rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; - } else if (fb->zsbuf) { - depth_only = 1; - - /* Render to depth buffer only */ - if (!(fb->zsbuf->texture->usage & NVFX_RESOURCE_FLAG_LINEAR)) { - assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); - - rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | - (log2i(fb->zsbuf->width) << NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | - (log2i(fb->zsbuf->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); - } - else - rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; - } else { - return; + nvfx->state.render_temps |= nvfx_surface_get_render_target(fb->zsbuf, prepare_result, &nvfx->hw_zeta) << 7; + + assert(util_format_get_stride(fb->zsbuf->format, fb->width) <= nvfx->hw_zeta.pitch); + assert(nvfx->hw_zeta.offset + nvfx->hw_zeta.pitch * fb->height <= nvfx->hw_zeta.bo->size); } - switch (colour_format) { - case PIPE_FORMAT_B8G8R8X8_UNORM: - rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8; - break; - case PIPE_FORMAT_B8G8R8A8_UNORM: - case 0: - rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8; - break; - case PIPE_FORMAT_B5G6R5_UNORM: + if (prepare_result) { + assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); + + rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | + (util_logbase2(fb->width) << 
NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT) | + (util_logbase2(fb->height) << NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT); + } else + rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; + + if(fb->nr_cbufs > 0) { + switch (fb->cbufs[0]->format) { + case PIPE_FORMAT_B8G8R8X8_UNORM: + rt_format |= NV34TCL_RT_FORMAT_COLOR_X8R8G8B8; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + case 0: + rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_B5G6R5_UNORM: + rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + } else if(fb->zsbuf && util_format_get_blocksize(fb->zsbuf->format) == 2) rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5; - colour_bits = 16; - break; - default: - assert(0); - } + else + rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8; - switch (zeta_format) { - case PIPE_FORMAT_Z16_UNORM: + if(fb->zsbuf) { + switch (fb->zsbuf->format) { + case PIPE_FORMAT_Z16_UNORM: + rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16; + break; + case PIPE_FORMAT_S8_USCALED_Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + case 0: + rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8; + break; + default: + assert(0); + } + } else if(fb->nr_cbufs && util_format_get_blocksize(fb->cbufs[0]->format) == 2) rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16; - zeta_bits = 16; - break; - case PIPE_FORMAT_S8_USCALED_Z24_UNORM: - case PIPE_FORMAT_X8Z24_UNORM: - case 0: + else rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8; - break; - default: - assert(0); - } - if ((!nvfx->is_nv4x) && colour_bits > zeta_bits) { - /* TODO: does this limitation really exist? - TODO: can it be worked around somehow? */ - assert(0); - } + if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0) || fb->zsbuf) { + struct nvfx_render_target *rt0 = &nvfx->hw_rt[0]; + uint32_t pitch; - if ((rt_enable & NV34TCL_RT_ENABLE_COLOR0) - || ((!nvfx->is_nv4x) && depth_only)) { - struct nvfx_render_target *rt0 = (depth_only ? 
&nvfx->hw_zeta : &nvfx->hw_rt[0]); - uint32_t pitch = rt0->pitch; + if(!(rt_enable & NV34TCL_RT_ENABLE_COLOR0)) + rt0 = &nvfx->hw_zeta; + + pitch = rt0->pitch; if(!nvfx->is_nv4x) { - if (nvfx->hw_zeta.bo) { + if (nvfx->hw_zeta.bo) pitch |= (nvfx->hw_zeta.pitch << 16); - } else { + else pitch |= (pitch << 16); - } } + //printf("rendering to bo %p [%i] at offset %i with pitch %i\n", rt0->bo, rt0->bo->handle, rt0->offset, pitch); + OUT_RING(chan, RING_3D(NV34TCL_DMA_COLOR0, 1)); OUT_RELOC(chan, rt0->bo, 0, rt_flags | NOUVEAU_BO_OR, @@ -182,7 +238,7 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx) } } - if (zeta_format) { + if (fb->zsbuf) { OUT_RING(chan, RING_3D(NV34TCL_DMA_ZETA, 1)); OUT_RELOC(chan, nvfx->hw_zeta.bo, 0, rt_flags | NOUVEAU_BO_OR, @@ -196,6 +252,10 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx) OUT_RING(chan, nvfx->hw_zeta.pitch); } } + else if(nvfx->is_nv4x) { + OUT_RING(chan, RING_3D(NV40TCL_ZETA_PITCH, 1)); + OUT_RING(chan, 64); + } OUT_RING(chan, RING_3D(NV34TCL_RT_ENABLE, 1)); OUT_RING(chan, rt_enable); @@ -218,6 +278,7 @@ nvfx_state_framebuffer_validate(struct nvfx_context *nvfx) OUT_RING(chan, RING_3D(NV34TCL_VIEWPORT_TX_ORIGIN, 1)); OUT_RING(chan, 0); } + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER; } void @@ -247,4 +308,5 @@ nvfx_framebuffer_relocate(struct nvfx_context *nvfx) DO(NV40, 3); DO_(nvfx->hw_zeta, NV34, ZETA); + nvfx->relocs_needed &=~ NVFX_RELOCATE_FRAMEBUFFER; } diff --git a/src/gallium/drivers/nvfx/nvfx_state_stipple.c b/src/gallium/drivers/nvfx/nvfx_state_stipple.c index 4da968f093f..b76e9dd3824 100644 --- a/src/gallium/drivers/nvfx/nvfx_state_stipple.c +++ b/src/gallium/drivers/nvfx/nvfx_state_stipple.c @@ -4,23 +4,8 @@ void nvfx_state_stipple_validate(struct nvfx_context *nvfx) { struct nouveau_channel *chan = nvfx->screen->base.channel; - struct pipe_rasterizer_state *rast = &nvfx->rasterizer->pipe; - if ((rast->poly_stipple_enable == 0 && nvfx->state.stipple_enabled == 0)) - return; - - if 
(rast->poly_stipple_enable) { - unsigned i; - - WAIT_RING(chan, 35); - OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1)); - OUT_RING(chan, 1); - OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32)); - for (i = 0; i < 32; i++) - OUT_RING(chan, nvfx->stipple[i]); - } else { - WAIT_RING(chan, 2); - OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_ENABLE, 1)); - OUT_RING(chan, 0); - } + WAIT_RING(chan, 33); + OUT_RING(chan, RING_3D(NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32)); + OUT_RINGp(chan, nvfx->stipple, 32); } diff --git a/src/gallium/drivers/nvfx/nvfx_surface.c b/src/gallium/drivers/nvfx/nvfx_surface.c index a605d2b7545..a5931b6e152 100644 --- a/src/gallium/drivers/nvfx/nvfx_surface.c +++ b/src/gallium/drivers/nvfx/nvfx_surface.c @@ -26,33 +26,421 @@ * **************************************************************************/ +#include "pipe/p_context.h" +#include "pipe/p_format.h" +#include "util/u_format.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" +#include "util/u_blitter.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_screen.h" #include "nvfx_context.h" +#include "nvfx_screen.h" #include "nvfx_resource.h" -#include "pipe/p_defines.h" -#include "util/u_inlines.h" -#include "util/u_pack_color.h" +#include "nv04_2d.h" + +#include <nouveau/nouveau_bo.h> + +static INLINE void +nvfx_region_set_format(struct nv04_region* rgn, enum pipe_format format) +{ + unsigned bits = util_format_get_blocksizebits(format); + switch(bits) + { + case 8: + rgn->bpps = 0; + break; + case 16: + rgn->bpps = 1; + break; + case 32: + rgn->bpps = 2; + break; + default: + { + int shift; + assert(util_is_power_of_two(bits)); + shift = util_logbase2(bits) - 3; + assert(shift >= 2); + rgn->bpps = 2; + shift -= 2; + + rgn->x = util_format_get_nblocksx(format, rgn->x) << shift; + rgn->y = util_format_get_nblocksy(format, rgn->y); + } + } +} + +static INLINE void +nvfx_region_fixup_swizzled(struct nv04_region* rgn, 
unsigned zslice, unsigned width, unsigned height, unsigned depth) +{ + // TODO: move this code to surface creation? + if((depth <= 1) && (height <= 1 || width <= 2)) + rgn->pitch = width << rgn->bpps; + else if(depth > 1 && height <= 2 && width <= 2) + { + rgn->pitch = width << rgn->bpps; + rgn->offset += (zslice * width * height) << rgn->bpps; + } + else + { + rgn->pitch = 0; + rgn->z = zslice; + rgn->w = width; + rgn->h = height; + rgn->d = depth; + } +} + +static INLINE void +nvfx_region_init_for_surface(struct nv04_region* rgn, struct nvfx_surface* surf, unsigned x, unsigned y, bool for_write) +{ + rgn->x = x; + rgn->y = y; + rgn->z = 0; + nvfx_region_set_format(rgn, surf->base.base.format); + + if(surf->temp) + { + rgn->bo = surf->temp->base.bo; + rgn->offset = 0; + rgn->pitch = surf->temp->linear_pitch; + + if(for_write) + util_dirty_surface_set_dirty(nvfx_surface_get_dirty_surfaces(&surf->base.base), &surf->base); + } else { + rgn->bo = ((struct nvfx_resource*)surf->base.base.texture)->bo; + rgn->offset = surf->base.base.offset; + rgn->pitch = surf->pitch; + + if(!(surf->base.base.texture->flags & NVFX_RESOURCE_FLAG_LINEAR)) + nvfx_region_fixup_swizzled(rgn, surf->base.base.zslice, surf->base.base.width, surf->base.base.height, u_minify(surf->base.base.texture->depth0, surf->base.base.level)); + } +} + +static INLINE void +nvfx_region_init_for_subresource(struct nv04_region* rgn, struct pipe_resource* pt, struct pipe_subresource sub, unsigned x, unsigned y, unsigned z, bool for_write) +{ + if(pt->target != PIPE_BUFFER) + { + struct nvfx_surface* ns = (struct nvfx_surface*)util_surfaces_peek(&((struct nvfx_miptree*)pt)->surfaces, pt, sub.face, sub.level, z); + if(ns && util_dirty_surface_is_dirty(&ns->base)) + { + nvfx_region_init_for_surface(rgn, ns, x, y, for_write); + return; + } + } + + rgn->bo = ((struct nvfx_resource*)pt)->bo; + rgn->offset = nvfx_subresource_offset(pt, sub.face, sub.level, z); + rgn->pitch = nvfx_subresource_pitch(pt, sub.level); + 
rgn->x = x; + rgn->y = y; + rgn->z = 0; + + nvfx_region_set_format(rgn, pt->format); + if(!(pt->flags & NVFX_RESOURCE_FLAG_LINEAR)) + nvfx_region_fixup_swizzled(rgn, z, u_minify(pt->width0, sub.level), u_minify(pt->height0, sub.level), u_minify(pt->depth0, sub.level)); +} + +// TODO: actually test this for all formats, it's probably wrong for some... + +static INLINE int +nvfx_surface_format(enum pipe_format format) +{ + switch(util_format_get_blocksize(format)) { + case 1: + return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8; + case 2: + //return NV04_CONTEXT_SURFACES_2D_FORMAT_Y16; + return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5; + case 4: + //if(format == PIPE_FORMAT_B8G8R8X8_UNORM || format == PIPE_FORMAT_B8G8R8A8_UNORM) + return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8; + //else + // return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32; + default: + return -1; + } +} + +static INLINE int +nv04_scaled_image_format(enum pipe_format format) +{ + switch(util_format_get_blocksize(format)) { + case 1: + return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8; + case 2: + //if(format == PIPE_FORMAT_B5G5R5A1_UNORM) + // return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5; + //else + return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5; + case 4: + if(format == PIPE_FORMAT_B8G8R8X8_UNORM) + return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8; + else + return NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8; + default: + return -1; + } +} + +// XXX: must save index buffer too! 
+static struct blitter_context* +nvfx_get_blitter(struct pipe_context* pipe, int copy) +{ + struct nvfx_context* nvfx = nvfx_context(pipe); + + struct blitter_context* blitter = nvfx->blitter; + if(!blitter) + nvfx->blitter = blitter = util_blitter_create(pipe); + + util_blitter_save_blend(blitter, nvfx->blend); + util_blitter_save_depth_stencil_alpha(blitter, nvfx->zsa); + util_blitter_save_stencil_ref(blitter, &nvfx->stencil_ref); + util_blitter_save_rasterizer(blitter, nvfx->rasterizer); + util_blitter_save_fragment_shader(blitter, nvfx->fragprog); + util_blitter_save_vertex_shader(blitter, nvfx->vertprog); + util_blitter_save_viewport(blitter, &nvfx->viewport); + util_blitter_save_framebuffer(blitter, &nvfx->framebuffer); + util_blitter_save_clip(blitter, &nvfx->clip); + util_blitter_save_vertex_elements(blitter, nvfx->vtxelt); + util_blitter_save_vertex_buffers(blitter, nvfx->vtxbuf_nr, nvfx->vtxbuf); + + if(copy) + { + util_blitter_save_fragment_sampler_states(blitter, nvfx->nr_samplers, (void**)nvfx->tex_sampler); + util_blitter_save_fragment_sampler_views(blitter, nvfx->nr_textures, nvfx->fragment_sampler_views); + } + + return blitter; +} + +static unsigned +nvfx_region_clone(struct nv04_2d_context* ctx, struct nv04_region* rgn, unsigned w, unsigned h, boolean for_read) +{ + unsigned begin = nv04_region_begin(rgn, w, h); + unsigned end = nv04_region_end(rgn, w, h); + unsigned size = end - begin; + struct nouveau_bo* bo = 0; + nouveau_bo_new(rgn->bo->device, NOUVEAU_BO_MAP | NOUVEAU_BO_GART, 256, size, &bo); + + if(for_read || (size > ((w * h) << rgn->bpps))) + nv04_memcpy(ctx, bo, 0, rgn->bo, rgn->offset + begin, size); + + rgn->bo = bo; + rgn->offset = -begin; + return begin; +} static void -nvfx_surface_copy(struct pipe_context *pipe, - struct pipe_resource *dest, struct pipe_subresource subdst, - unsigned destx, unsigned desty, unsigned destz, - struct pipe_resource *src, struct pipe_subresource subsrc, +nvfx_resource_copy_region(struct pipe_context 
*pipe, + struct pipe_resource *dstr, struct pipe_subresource subdst, + unsigned dstx, unsigned dsty, unsigned dstz, + struct pipe_resource *srcr, struct pipe_subresource subsrc, unsigned srcx, unsigned srcy, unsigned srcz, - unsigned width, unsigned height) + unsigned w, unsigned h) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nv04_surface_2d *eng2d = nvfx->screen->eng2d; - struct pipe_surface *ps_dst, *ps_src; + static int copy_threshold = -1; + struct nv04_2d_context *ctx = nvfx_screen(pipe->screen)->eng2d; + struct nv04_region dst, src; + int dst_to_gpu; + int src_on_gpu; + boolean small; + int ret; + + if(!w || !h) + return; + + if(copy_threshold < 0) + copy_threshold = debug_get_num_option("NOUVEAU_COPY_THRESHOLD", 4); + + dst_to_gpu = dstr->usage != PIPE_USAGE_DYNAMIC && dstr->usage != PIPE_USAGE_STAGING; + src_on_gpu = nvfx_resource_on_gpu(srcr); + + nvfx_region_init_for_subresource(&dst, dstr, subdst, dstx, dsty, dstz, TRUE); + nvfx_region_init_for_subresource(&src, srcr, subsrc, srcx, srcy, srcz, FALSE); + w = util_format_get_stride(dstr->format, w) >> dst.bpps; + h = util_format_get_nblocksy(dstr->format, h); - ps_src = nvfx_miptree_surface_new(pipe->screen, src, subsrc.face, - subsrc.level, srcz, 0 /* bind flags */); - ps_dst = nvfx_miptree_surface_new(pipe->screen, dest, subdst.face, - subdst.level, destz, 0 /* bindflags */); + small = (w * h <= copy_threshold); + if((!dst_to_gpu || !src_on_gpu) && small) + ret = -1; /* use the CPU */ + else + ret = nv04_region_copy_2d(ctx, &dst, &src, w, h, + dstr->target == PIPE_BUFFER ? -1 : nvfx_surface_format(dstr->format), + dstr->target == PIPE_BUFFER ? 
-1 : nv04_scaled_image_format(dstr->format), + dst_to_gpu, src_on_gpu); + if(!ret) + {} + else if(ret > 0 && dstr->bind & PIPE_BIND_RENDER_TARGET && srcr->bind & PIPE_BIND_SAMPLER_VIEW) + { + struct blitter_context* blitter = nvfx_get_blitter(pipe, 1); + util_blitter_copy_region(blitter, dstr, subdst, dstx, dsty, dstz, srcr, subsrc, srcx, srcy, srcz, w, h, TRUE); + } + else + { + struct nv04_region dstt = dst; + struct nv04_region srct = src; + unsigned dstbegin = 0; - eng2d->copy(eng2d, ps_dst, destx, desty, ps_src, srcx, srcy, width, height); + if(!small) + { + if(src_on_gpu) + nvfx_region_clone(ctx, &srct, w, h, TRUE); - nvfx_miptree_surface_del(ps_src); - nvfx_miptree_surface_del(ps_dst); + if(dst_to_gpu) + dstbegin = nvfx_region_clone(ctx, &dstt, w, h, FALSE); + } + + nv04_region_copy_cpu(&dstt, &srct, w, h); + + if(srct.bo != src.bo) + nouveau_screen_bo_release(pipe->screen, srct.bo); + + if(dstt.bo != dst.bo) + { + nv04_memcpy(ctx, dst.bo, dst.offset + dstbegin, dstt.bo, 0, dstt.bo->size); + nouveau_screen_bo_release(pipe->screen, dstt.bo); + } + } +} + +static int +nvfx_surface_fill(struct pipe_context* pipe, struct pipe_surface *dsts, + unsigned dx, unsigned dy, unsigned w, unsigned h, unsigned value) +{ + struct nv04_2d_context *ctx = nvfx_screen(pipe->screen)->eng2d; + struct nv04_region dst; + int ret; + /* Always try to use the GPU right now, if possible + * If the user wanted the surface data on the CPU, he would have cleared with memset (hopefully) */ + + // we don't care about interior pixel order since we set all them to the same value + nvfx_region_init_for_surface(&dst, (struct nvfx_surface*)dsts, dx, dy, TRUE); + + w = util_format_get_stride(dsts->format, w) >> dst.bpps; + h = util_format_get_nblocksy(dsts->format, h); + + ret = nv04_region_fill_2d(ctx, &dst, w, h, value); + if(ret > 0 && dsts->texture->bind & PIPE_BIND_RENDER_TARGET) + return 1; + else if(ret) + { + struct nv04_region dstt = dst; + unsigned dstbegin = 0; + + 
if(nvfx_resource_on_gpu(dsts->texture)) + dstbegin = nvfx_region_clone(ctx, &dstt, w, h, FALSE); + + nv04_region_fill_cpu(&dstt, w, h, value); + + if(dstt.bo != dst.bo) + { + nv04_memcpy(ctx, dst.bo, dst.offset + dstbegin, dstt.bo, 0, dstt.bo->size); + nouveau_screen_bo_release(pipe->screen, dstt.bo); + } + } + + return 0; +} + + +void +nvfx_screen_surface_takedown(struct pipe_screen *pscreen) +{ + nv04_2d_context_takedown(nvfx_screen(pscreen)->eng2d); + nvfx_screen(pscreen)->eng2d = 0; +} + +int +nvfx_screen_surface_init(struct pipe_screen *pscreen) +{ + struct nv04_2d_context* ctx = nv04_2d_context_init(nouveau_screen(pscreen)->channel); + if(!ctx) + return -1; + nvfx_screen(pscreen)->eng2d = ctx; + return 0; +} + +static void +nvfx_surface_copy_temp(struct pipe_context* pipe, struct pipe_surface* surf, int to_temp) +{ + struct nvfx_surface* ns = (struct nvfx_surface*)surf; + struct pipe_subresource tempsr, surfsr; + struct nvfx_context* nvfx = nvfx_context(pipe); + + // TODO: we really should do this validation before setting these variable in draw calls + unsigned use_vertex_buffers = nvfx->use_vertex_buffers; + boolean use_index_buffer = nvfx->use_index_buffer; + unsigned base_vertex = nvfx->base_vertex; + + tempsr.face = 0; + tempsr.level = 0; + surfsr.face = surf->face; + surfsr.level = surf->level; + + if(to_temp) + nvfx_resource_copy_region(pipe, &ns->temp->base.base, tempsr, 0, 0, 0, surf->texture, surfsr, 0, 0, surf->zslice, surf->width, surf->height); + else + nvfx_resource_copy_region(pipe, surf->texture, surfsr, 0, 0, surf->zslice, &ns->temp->base.base, tempsr, 0, 0, 0, surf->width, surf->height); + + nvfx->use_vertex_buffers = use_vertex_buffers; + nvfx->use_index_buffer = use_index_buffer; + nvfx->base_vertex = base_vertex; + + nvfx->dirty |= NVFX_NEW_ARRAYS; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; +} + +void +nvfx_surface_create_temp(struct pipe_context* pipe, struct pipe_surface* surf) +{ + struct nvfx_surface* ns = (struct nvfx_surface*)surf; + 
struct pipe_resource template; + memset(&template, 0, sizeof(struct pipe_resource)); + template.target = PIPE_TEXTURE_2D; + template.format = surf->format; + template.width0 = surf->width; + template.height0 = surf->height; + template.depth0 = 1; + template.nr_samples = surf->texture->nr_samples; + template.flags = NVFX_RESOURCE_FLAG_LINEAR; + + ns->temp = (struct nvfx_miptree*)nvfx_miptree_create(pipe->screen, &template); + nvfx_surface_copy_temp(pipe, surf, 1); +} + +void +nvfx_surface_flush(struct pipe_context* pipe, struct pipe_surface* surf) +{ + struct nvfx_context* nvfx = (struct nvfx_context*)pipe; + struct nvfx_surface* ns = (struct nvfx_surface*)surf; + boolean bound = FALSE; + + /* must be done before the copy, otherwise the copy will use the temp as destination */ + util_dirty_surface_set_clean(nvfx_surface_get_dirty_surfaces(surf), &ns->base); + + nvfx_surface_copy_temp(pipe, surf, 0); + + if(nvfx->framebuffer.zsbuf == surf) + bound = TRUE; + else + { + for(unsigned i = 0; i < nvfx->framebuffer.nr_cbufs; ++i) + { + if(nvfx->framebuffer.cbufs[i] == surf) + { + bound = TRUE; + break; + } + } + } + + if(!bound) + pipe_resource_reference((struct pipe_resource**)&ns->temp, 0); } static void @@ -62,12 +450,16 @@ nvfx_clear_render_target(struct pipe_context *pipe, unsigned dstx, unsigned dsty, unsigned width, unsigned height) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nv04_surface_2d *eng2d = nvfx->screen->eng2d; union util_color uc; util_pack_color(rgba, dst->format, &uc); - eng2d->fill(eng2d, dst, dstx, dsty, width, height, uc.ui); + if(util_format_get_blocksizebits(dst->format) > 32 + || nvfx_surface_fill(pipe, dst, dstx, dsty, width, height, uc.ui)) + { + // TODO: probably should use hardware clear here instead if possible + struct blitter_context* blitter = nvfx_get_blitter(pipe, 0); + util_blitter_clear_render_target(blitter, dst, rgba, dstx, dsty, width, height); + } } static void @@ -79,18 +471,20 @@ nvfx_clear_depth_stencil(struct 
pipe_context *pipe, unsigned dstx, unsigned dsty, unsigned width, unsigned height) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nv04_surface_2d *eng2d = nvfx->screen->eng2d; - - eng2d->fill(eng2d, dst, dstx, dsty, width, height, - util_pack_z_stencil(dst->format, depth, stencil)); + if(util_format_get_blocksizebits(dst->format) > 32 + || nvfx_surface_fill(pipe, dst, dstx, dsty, width, height, util_pack_z_stencil(dst->format, depth, stencil))) + { + // TODO: probably should use hardware clear here instead if possible + struct blitter_context* blitter = nvfx_get_blitter(pipe, 0); + util_blitter_clear_depth_stencil(blitter, dst, clear_flags, depth, stencil, dstx, dsty, width, height); + } } void nvfx_init_surface_functions(struct nvfx_context *nvfx) { - nvfx->pipe.resource_copy_region = nvfx_surface_copy; + nvfx->pipe.resource_copy_region = nvfx_resource_copy_region; nvfx->pipe.clear_render_target = nvfx_clear_render_target; nvfx->pipe.clear_depth_stencil = nvfx_clear_depth_stencil; } diff --git a/src/gallium/drivers/nvfx/nvfx_tex.h b/src/gallium/drivers/nvfx/nvfx_tex.h index 69187a79e79..34be936a891 100644 --- a/src/gallium/drivers/nvfx/nvfx_tex.h +++ b/src/gallium/drivers/nvfx/nvfx_tex.h @@ -1,6 +1,11 @@ #ifndef NVFX_TEX_H_ #define NVFX_TEX_H_ +#include "util/u_math.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include <nouveau/nouveau_class.h> + static inline unsigned nvfx_tex_wrap_mode(unsigned wrap) { unsigned ret; @@ -31,7 +36,7 @@ nvfx_tex_wrap_mode(unsigned wrap) { ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP; break; default: - NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + assert(0); ret = NV34TCL_TX_WRAP_S_REPEAT; break; } @@ -40,31 +45,29 @@ nvfx_tex_wrap_mode(unsigned wrap) { } static inline unsigned -nvfx_tex_wrap_compare_mode(const struct pipe_sampler_state* cso) +nvfx_tex_wrap_compare_mode(unsigned func) { - if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { - switch (cso->compare_func) { - case PIPE_FUNC_NEVER: - return 
NV34TCL_TX_WRAP_RCOMP_NEVER; - case PIPE_FUNC_GREATER: - return NV34TCL_TX_WRAP_RCOMP_GREATER; - case PIPE_FUNC_EQUAL: - return NV34TCL_TX_WRAP_RCOMP_EQUAL; - case PIPE_FUNC_GEQUAL: - return NV34TCL_TX_WRAP_RCOMP_GEQUAL; - case PIPE_FUNC_LESS: - return NV34TCL_TX_WRAP_RCOMP_LESS; - case PIPE_FUNC_NOTEQUAL: - return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL; - case PIPE_FUNC_LEQUAL: - return NV34TCL_TX_WRAP_RCOMP_LEQUAL; - case PIPE_FUNC_ALWAYS: - return NV34TCL_TX_WRAP_RCOMP_ALWAYS; - default: - break; - } + switch (func) { + case PIPE_FUNC_NEVER: + return NV34TCL_TX_WRAP_RCOMP_NEVER; + case PIPE_FUNC_GREATER: + return NV34TCL_TX_WRAP_RCOMP_GREATER; + case PIPE_FUNC_EQUAL: + return NV34TCL_TX_WRAP_RCOMP_EQUAL; + case PIPE_FUNC_GEQUAL: + return NV34TCL_TX_WRAP_RCOMP_GEQUAL; + case PIPE_FUNC_LESS: + return NV34TCL_TX_WRAP_RCOMP_LESS; + case PIPE_FUNC_NOTEQUAL: + return NV34TCL_TX_WRAP_RCOMP_NOTEQUAL; + case PIPE_FUNC_LEQUAL: + return NV34TCL_TX_WRAP_RCOMP_LEQUAL; + case PIPE_FUNC_ALWAYS: + return NV34TCL_TX_WRAP_RCOMP_ALWAYS; + default: + assert(0); + return 0; } - return 0; } static inline unsigned nvfx_tex_filter(const struct pipe_sampler_state* cso) @@ -128,6 +131,45 @@ struct nvfx_sampler_state { uint32_t en; uint32_t filt; uint32_t bcol; + uint32_t min_lod; + uint32_t max_lod; + boolean compare; +}; + +struct nvfx_sampler_view { + struct pipe_sampler_view base; + int offset; + uint32_t swizzle; + uint32_t npot_size; + uint32_t filt; + uint32_t wrap_mask; + uint32_t wrap; + uint32_t lod_offset; + uint32_t max_lod_limit; + union + { + struct + { + uint32_t fmt[4]; /* nv30 has 4 entries, nv40 one */ + int rect; + } nv30; + struct + { + uint32_t fmt[2]; /* nv30 has 4 entries, nv40 one */ + uint32_t npot_size2; /* nv40 only */ + } nv40; + uint32_t init_fmt; + } u; }; +struct nvfx_texture_format { + int fmt[6]; + unsigned sign; + unsigned wrap; + unsigned char src[6]; + unsigned char comp[6]; +}; + +extern struct nvfx_texture_format nvfx_texture_formats[PIPE_FORMAT_COUNT]; + 
#endif /* NVFX_TEX_H_ */ diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.c b/src/gallium/drivers/nvfx/nvfx_transfer.c index 9ff0a93d307..7cb47a20f64 100644 --- a/src/gallium/drivers/nvfx/nvfx_transfer.c +++ b/src/gallium/drivers/nvfx/nvfx_transfer.c @@ -4,204 +4,218 @@ #include "util/u_format.h" #include "util/u_memory.h" #include "util/u_math.h" -#include "nouveau/nouveau_winsys.h" +#include "util/u_staging.h" #include "nvfx_context.h" #include "nvfx_screen.h" #include "nvfx_state.h" #include "nvfx_resource.h" #include "nvfx_transfer.h" -struct nvfx_transfer { - struct pipe_transfer base; - struct pipe_surface *surface; - boolean direct; -}; - -static void -nvfx_compatible_transfer_tex(struct pipe_resource *pt, unsigned width, unsigned height, - unsigned bind, - struct pipe_resource *template) -{ - memset(template, 0, sizeof(struct pipe_resource)); - template->target = pt->target; - template->format = pt->format; - template->width0 = width; - template->height0 = height; - template->depth0 = 1; - template->last_level = 0; - template->nr_samples = pt->nr_samples; - template->bind = bind; - template->usage = PIPE_USAGE_DYNAMIC; - template->flags = NVFX_RESOURCE_FLAG_LINEAR; -} - - -static unsigned nvfx_transfer_bind_flags( unsigned transfer_usage ) +struct nvfx_staging_transfer { - unsigned bind = 0; + struct util_staging_transfer base; -#if 0 - if (transfer_usage & PIPE_TRANSFER_WRITE) - bind |= PIPE_BIND_BLIT_SOURCE; - - if (transfer_usage & PIPE_TRANSFER_READ) - bind |= PIPE_BIND_BLIT_DESTINATION; -#endif - - return bind; -} + unsigned offset; + unsigned map_count; +}; struct pipe_transfer * -nvfx_miptree_transfer_new(struct pipe_context *pipe, +nvfx_transfer_new(struct pipe_context *pipe, struct pipe_resource *pt, struct pipe_subresource sr, unsigned usage, const struct pipe_box *box) { - struct pipe_screen *pscreen = pipe->screen; - struct nvfx_miptree *mt = (struct nvfx_miptree *)pt; - struct nvfx_transfer *tx; - struct pipe_resource tx_tex_template, 
*tx_tex; - static int no_transfer = -1; - unsigned bind = nvfx_transfer_bind_flags(usage); - if(no_transfer < 0) - no_transfer = debug_get_bool_option("NOUVEAU_NO_TRANSFER", FALSE); - - - tx = CALLOC_STRUCT(nvfx_transfer); - if (!tx) - return NULL; - - /* Don't handle 3D transfers yet. - */ - assert(box->depth == 1); - - pipe_resource_reference(&tx->base.resource, pt); - tx->base.sr = sr; - tx->base.usage = usage; - tx->base.box = *box; - tx->base.stride = mt->level[sr.level].pitch; - - /* Direct access to texture */ - if ((pt->usage == PIPE_USAGE_DYNAMIC || - no_transfer) && - pt->flags & NVFX_RESOURCE_FLAG_LINEAR) + if((usage & (PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_DONTBLOCK)) == PIPE_TRANSFER_DONTBLOCK) + { + struct nouveau_bo* bo = ((struct nvfx_resource*)pt)->bo; + if(bo && nouveau_bo_busy(bo, NOUVEAU_BO_WR)) + return NULL; + } + + if(pt->target == PIPE_BUFFER) { - tx->direct = true; - - /* XXX: just call the internal nvfx function. - */ - tx->surface = pscreen->get_tex_surface(pscreen, pt, - sr.face, sr.level, - box->z, - bind); - return &tx->base; - } + // it would be nice if we could avoid all this ridiculous overhead... 
+ struct pipe_transfer* tx; + struct nvfx_buffer* buffer = nvfx_buffer(pt); + + tx = CALLOC_STRUCT(pipe_transfer); + if (!tx) + return NULL; - tx->direct = false; + pipe_resource_reference(&tx->resource, pt); + tx->sr = sr; + tx->usage = usage; + tx->box = *box; - nvfx_compatible_transfer_tex(pt, box->width, box->height, bind, &tx_tex_template); + tx->slice_stride = tx->stride = util_format_get_stride(pt->format, box->width); + tx->data = buffer->data + util_format_get_stride(pt->format, box->x); - tx_tex = pscreen->resource_create(pscreen, &tx_tex_template); - if (!tx_tex) + return tx; + } + else { - FREE(tx); - return NULL; + struct nvfx_staging_transfer* tx; + bool direct = !nvfx_resource_on_gpu(pt) && pt->flags & NVFX_RESOURCE_FLAG_LINEAR; + + tx = CALLOC_STRUCT(nvfx_staging_transfer); + if(!tx) + return NULL; + + util_staging_transfer_init(pipe, pt, sr, usage, box, direct, &tx->base); + + if(direct) + { + tx->base.base.stride = nvfx_subresource_pitch(pt, sr.level); + tx->base.base.slice_stride = tx->base.base.stride * u_minify(pt->height0, sr.level); + tx->offset = nvfx_subresource_offset(pt, sr.face, sr.level, box->z) + + util_format_get_2d_size(pt->format, tx->base.base.stride, box->y) + + util_format_get_stride(pt->format, box->x); + } + else + { + tx->base.base.stride = nvfx_subresource_pitch(tx->base.staging_resource, 0); + tx->base.base.slice_stride = tx->base.base.stride * tx->base.staging_resource->height0; + tx->offset = 0; + } + + assert(tx->base.base.stride); + + return &tx->base.base; } +} - tx->base.stride = ((struct nvfx_miptree*)tx_tex)->level[0].pitch; - - tx->surface = pscreen->get_tex_surface(pscreen, tx_tex, - 0, 0, 0, - bind); - - pipe_resource_reference(&tx_tex, NULL); - - if (!tx->surface) +static void nvfx_buffer_dirty_interval(struct nvfx_buffer* buffer, unsigned begin, unsigned size, boolean unsynchronized) +{ + struct nvfx_screen* screen = nvfx_screen(buffer->base.base.screen); + buffer->last_update_static = 
buffer->bytes_to_draw_until_static < 0; + if(buffer->dirty_begin == buffer->dirty_end) { - pipe_surface_reference(&tx->surface, NULL); - FREE(tx); - return NULL; + buffer->dirty_begin = begin; + buffer->dirty_end = begin + size; + buffer->dirty_unsynchronized = unsynchronized; + } + else + { + buffer->dirty_begin = MIN2(buffer->dirty_begin, begin); + buffer->dirty_end = MAX2(buffer->dirty_end, begin + size); + buffer->dirty_unsynchronized &= unsynchronized; } - if (usage & PIPE_TRANSFER_READ) { - struct nvfx_screen *nvscreen = nvfx_screen(pscreen); - struct pipe_surface *src; + if(unsynchronized) + { + // TODO: revisit this, it doesn't seem quite right + //printf("UNSYNC UPDATE %p %u %u\n", buffer, begin, size); + buffer->bytes_to_draw_until_static += size * screen->static_reuse_threshold; + } + else + buffer->bytes_to_draw_until_static = buffer->size * screen->static_reuse_threshold; +} - src = pscreen->get_tex_surface(pscreen, pt, - sr.face, sr.level, box->z, - 0 /*PIPE_BIND_BLIT_SOURCE*/); +static void nvfx_transfer_flush_region( struct pipe_context *pipe, + struct pipe_transfer *ptx, + const struct pipe_box *box) +{ + if(ptx->resource->target == PIPE_BUFFER && (ptx->usage & PIPE_TRANSFER_FLUSH_EXPLICIT)) + { + struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource); + nvfx_buffer_dirty_interval(buffer, + (uint8_t*)ptx->data - buffer->data + util_format_get_stride(buffer->base.base.format, box->x), + util_format_get_stride(buffer->base.base.format, box->width), + !!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); + } +} - /* TODO: Check if SIFM can deal with x,y,w,h when swizzling */ - /* TODO: Check if SIFM can un-swizzle */ - nvscreen->eng2d->copy(nvscreen->eng2d, - tx->surface, 0, 0, - src, - box->x, box->y, - box->width, box->height); +static void +nvfx_transfer_destroy(struct pipe_context *pipe, struct pipe_transfer *ptx) +{ + if(ptx->resource->target == PIPE_BUFFER) + { + struct nvfx_buffer* buffer = nvfx_buffer(ptx->resource); + if((ptx->usage & 
(PIPE_TRANSFER_WRITE | PIPE_TRANSFER_FLUSH_EXPLICIT)) == PIPE_TRANSFER_WRITE) + nvfx_buffer_dirty_interval(buffer, + (uint8_t*)ptx->data - buffer->data, + ptx->stride, + !!(ptx->usage & PIPE_TRANSFER_UNSYNCHRONIZED)); + pipe_resource_reference(&ptx->resource, 0); + FREE(ptx); + } + else + { + struct nouveau_channel* chan = nvfx_context(pipe)->screen->base.channel; + util_staging_transfer_destroy(pipe, ptx); - pipe_surface_reference(&src, NULL); + FIRE_RING(chan); } +} - return &tx->base; +void * +nvfx_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx) +{ + if(ptx->resource->target == PIPE_BUFFER) + return ptx->data; + else + { + struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; + if(!ptx->data) + { + struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->base.staging_resource; + uint8_t *map = nouveau_screen_bo_map(pipe->screen, mt->base.bo, nouveau_screen_transfer_flags(ptx->usage)); + ptx->data = map + tx->offset; + } + + ++tx->map_count; + return ptx->data; + } } void -nvfx_miptree_transfer_del(struct pipe_context *pipe, - struct pipe_transfer *ptx) +nvfx_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx) { - struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx; - - if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) { - struct pipe_screen *pscreen = pipe->screen; - struct nvfx_screen *nvscreen = nvfx_screen(pscreen); - struct pipe_surface *dst; - - dst = pscreen->get_tex_surface(pscreen, - ptx->resource, - ptx->sr.face, - ptx->sr.level, - ptx->box.z, - 0 /*PIPE_BIND_BLIT_DESTINATION*/); - - /* TODO: Check if SIFM can deal with x,y,w,h when swizzling */ - nvscreen->eng2d->copy(nvscreen->eng2d, - dst, ptx->box.x, ptx->box.y, - tx->surface, 0, 0, - ptx->box.width, ptx->box.height); - - pipe_surface_reference(&dst, NULL); + if(ptx->resource->target != PIPE_BUFFER) + { + struct nvfx_staging_transfer *tx = (struct nvfx_staging_transfer *)ptx; + struct nvfx_miptree *mt = (struct nvfx_miptree 
*)tx->base.staging_resource; + + if(!--tx->map_count) + { + nouveau_screen_bo_unmap(pipe->screen, mt->base.bo); + ptx->data = 0; + } } - - pipe_surface_reference(&tx->surface, NULL); - pipe_resource_reference(&ptx->resource, NULL); - FREE(ptx); } -void * -nvfx_miptree_transfer_map(struct pipe_context *pipe, struct pipe_transfer *ptx) +static void nvfx_transfer_inline_write( struct pipe_context *pipe, + struct pipe_resource *pr, + struct pipe_subresource sr, + unsigned usage, + const struct pipe_box *box, + const void *data, + unsigned stride, + unsigned slice_stride) { - struct pipe_screen *pscreen = pipe->screen; - struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx; - struct nv04_surface *ns = (struct nv04_surface *)tx->surface; - struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture; - uint8_t *map = nouveau_screen_bo_map(pscreen, mt->base.bo, - nouveau_screen_transfer_flags(ptx->usage)); - - if(!tx->direct) - return map + ns->base.offset; + if(pr->target != PIPE_BUFFER) + { + u_default_transfer_inline_write(pipe, pr, sr, usage, box, data, stride, slice_stride); + } else - return (map + ns->base.offset + - ptx->box.y * ns->pitch + - ptx->box.x * util_format_get_blocksize(ptx->resource->format)); + { + struct nvfx_buffer* buffer = nvfx_buffer(pr); + unsigned begin = util_format_get_stride(pr->format, box->x); + unsigned size = util_format_get_stride(pr->format, box->width); + memcpy(buffer->data + begin, data, size); + nvfx_buffer_dirty_interval(buffer, begin, size, + !!(pr->flags & PIPE_TRANSFER_UNSYNCHRONIZED)); + } } void -nvfx_miptree_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *ptx) +nvfx_init_transfer_functions(struct pipe_context *pipe) { - struct pipe_screen *pscreen = pipe->screen; - struct nvfx_transfer *tx = (struct nvfx_transfer *)ptx; - struct nvfx_miptree *mt = (struct nvfx_miptree *)tx->surface->texture; - - nouveau_screen_bo_unmap(pscreen, mt->base.bo); + pipe->get_transfer = nvfx_transfer_new; + 
pipe->transfer_map = nvfx_transfer_map; + pipe->transfer_flush_region = nvfx_transfer_flush_region; + pipe->transfer_unmap = nvfx_transfer_unmap; + pipe->transfer_destroy = nvfx_transfer_destroy; + pipe->transfer_inline_write = nvfx_transfer_inline_write; } diff --git a/src/gallium/drivers/nvfx/nvfx_transfer.h b/src/gallium/drivers/nvfx/nvfx_transfer.h index 3e3317b2c7b..20f20d5b0b8 100644 --- a/src/gallium/drivers/nvfx/nvfx_transfer.h +++ b/src/gallium/drivers/nvfx/nvfx_transfer.h @@ -7,19 +7,17 @@ struct pipe_transfer * -nvfx_miptree_transfer_new(struct pipe_context *pcontext, +nvfx_transfer_new(struct pipe_context *pcontext, struct pipe_resource *pt, struct pipe_subresource sr, unsigned usage, const struct pipe_box *box); -void -nvfx_miptree_transfer_del(struct pipe_context *pcontext, - struct pipe_transfer *ptx); + void * -nvfx_miptree_transfer_map(struct pipe_context *pcontext, +nvfx_transfer_map(struct pipe_context *pcontext, struct pipe_transfer *ptx); void -nvfx_miptree_transfer_unmap(struct pipe_context *pcontext, +nvfx_transfer_unmap(struct pipe_context *pcontext, struct pipe_transfer *ptx); diff --git a/src/gallium/drivers/nvfx/nvfx_vbo.c b/src/gallium/drivers/nvfx/nvfx_vbo.c index 4aa37938425..e6e9a8f2e40 100644 --- a/src/gallium/drivers/nvfx/nvfx_vbo.c +++ b/src/gallium/drivers/nvfx/nvfx_vbo.c @@ -2,6 +2,7 @@ #include "pipe/p_state.h" #include "util/u_inlines.h" #include "util/u_format.h" +#include "translate/translate.h" #include "nvfx_context.h" #include "nvfx_state.h" @@ -10,646 +11,595 @@ #include "nouveau/nouveau_channel.h" #include "nouveau/nouveau_class.h" #include "nouveau/nouveau_pushbuf.h" -#include "nouveau/nouveau_util.h" -static INLINE int -nvfx_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +static inline unsigned +util_guess_unique_indices_count(unsigned mode, unsigned indices) { - switch (pipe) { - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_R32G32B32_FLOAT: - case 
PIPE_FORMAT_R32G32B32A32_FLOAT: - *fmt = NV34TCL_VTXFMT_TYPE_FLOAT; - break; - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R16G16B16_FLOAT: - case PIPE_FORMAT_R16G16B16A16_FLOAT: - *fmt = NV34TCL_VTXFMT_TYPE_HALF; - break; - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R8G8B8A8_UNORM: - *fmt = NV34TCL_VTXFMT_TYPE_UBYTE; - break; - case PIPE_FORMAT_R16_SSCALED: - case PIPE_FORMAT_R16G16_SSCALED: - case PIPE_FORMAT_R16G16B16_SSCALED: - case PIPE_FORMAT_R16G16B16A16_SSCALED: - *fmt = NV34TCL_VTXFMT_TYPE_USHORT; - break; - default: - NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe)); - return 1; + /* Euler's formula gives V = + * = E - F + 2 = + * = F * (polygon_edges / 2 - 1) + 2 = + * = F * (polygon_edges - 2) / 2 + 2 = + * = indices * (polygon_edges - 2) / (2 * indices_per_face) + 2 + * = indices * (1 / 2 - 1 / polygon_edges) + 2 + */ + switch(mode) + { + case PIPE_PRIM_LINES: + return indices >> 1; + case PIPE_PRIM_TRIANGLES: + { + // avoid an expensive division by 3 using the multiplicative inverse mod 2^32 + unsigned q; + unsigned inv3 = 2863311531; + indices >>= 1; + q = indices * inv3; + if(unlikely(q >= indices)) + { + q += inv3; + if(q >= indices) + q += inv3; + } + return indices + 2; + //return indices / 6 + 2; } - - switch (pipe) { - case PIPE_FORMAT_R8_UNORM: - case PIPE_FORMAT_R32_FLOAT: - case PIPE_FORMAT_R16_FLOAT: - case PIPE_FORMAT_R16_SSCALED: - *ncomp = 1; - break; - case PIPE_FORMAT_R8G8_UNORM: - case PIPE_FORMAT_R32G32_FLOAT: - case PIPE_FORMAT_R16G16_FLOAT: - case PIPE_FORMAT_R16G16_SSCALED: - *ncomp = 2; - break; - case PIPE_FORMAT_R8G8B8_UNORM: - case PIPE_FORMAT_R32G32B32_FLOAT: - case PIPE_FORMAT_R16G16B16_FLOAT: - case PIPE_FORMAT_R16G16B16_SSCALED: - *ncomp = 3; - break; - case PIPE_FORMAT_R8G8B8A8_UNORM: - case PIPE_FORMAT_R32G32B32A32_FLOAT: - case PIPE_FORMAT_R16G16B16A16_FLOAT: - case PIPE_FORMAT_R16G16B16A16_SSCALED: - *ncomp 
= 4; - break; + // guess that indexed quads are created by successive connections, since a closed mesh seems unlikely + case PIPE_PRIM_QUADS: + return (indices >> 1) + 2; + // return (indices >> 2) + 2; // if it is a closed mesh default: - NOUVEAU_ERR("Unknown format %s\n", util_format_name(pipe)); - return 1; + return indices; } - - return 0; } -static boolean -nvfx_vbo_set_idxbuf(struct nvfx_context *nvfx, struct pipe_resource *ib, - unsigned ib_size) +static unsigned nvfx_decide_upload_mode(struct pipe_context *pipe, const struct pipe_draw_info *info) { - unsigned type; - - if (!ib) { - nvfx->idxbuf_buffer = NULL; - nvfx->idxbuf_format = 0xdeadbeef; - return FALSE; + struct nvfx_context* nvfx = nvfx_context(pipe); + unsigned hardware_cost = 0; + unsigned inline_cost = 0; + unsigned unique_vertices; + unsigned upload_mode; + float best_index_cost_for_hardware_vertices_as_inline_cost; + boolean prefer_hardware_indices; + unsigned index_inline_cost; + unsigned index_hardware_cost; + if (info->indexed) + unique_vertices = util_guess_unique_indices_count(info->mode, info->count); + else + unique_vertices = info->count; + + /* Here we try to figure out if we are better off writing vertex data directly on the FIFO, + * or create hardware buffer objects and pointing the hardware to them. + * + * This is done by computing the total memcpy cost of each option, ignoring uploads + * if we think that the buffer is static and thus the upload cost will be amortized over + * future draw calls. + * + * For instance, if everything looks static, we will always create buffer objects, while if + * everything is a user buffer and we are not doing indexed drawing, we never do. 
+ * + * Other interesting cases are where a small user vertex buffer, but a huge user index buffer, + * where we will upload the vertex buffer, so that we can use hardware index lookup, and + * the opposite case, where we instead do index lookup in software to avoid uploading + * a huge amount of vertex data that is not going to be used. + * + * Otherwise, we generally move to the GPU the after it has been pushed + * NVFX_STATIC_BUFFER_MIN_REUSE_TIMES times to the GPU without having + * been updated with a transfer (or just the buffer having been destroyed). + * + * There is no special handling for user buffers, since applications can use + * OpenGL VBOs in a one-shot fashion. OpenGL 3/4 core profile forces this + * by the way. + * + * Note that currently we don't support only putting some data on the FIFO, and + * some on vertex buffers (constant and instanced data is independent from this). + * + * nVidia doesn't seem to do this either, even though it should be at least + * doable with VTX_ATTR and possibly with VERTEX_DATA too if not indexed. 
+ */ + + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + buffer->bytes_to_draw_until_static -= vbi->per_vertex_size * unique_vertices; + if (!nvfx_buffer_seems_static(buffer)) + { + hardware_cost += buffer->dirty_end - buffer->dirty_begin; + if (!buffer->base.bo) + hardware_cost += nvfx->screen->buffer_allocation_cost; + } + inline_cost += vbi->per_vertex_size * info->count; } - if (!nvfx->screen->index_buffer_reloc_flags || ib_size == 1) - return FALSE; + best_index_cost_for_hardware_vertices_as_inline_cost = 0.0f; + prefer_hardware_indices = FALSE; + index_inline_cost = 0; + index_hardware_cost = 0; - switch (ib_size) { - case 2: - type = NV34TCL_IDXBUF_FORMAT_TYPE_U16; - break; - case 4: - type = NV34TCL_IDXBUF_FORMAT_TYPE_U32; - break; - default: - return FALSE; - } + if (info->indexed) + { + index_inline_cost = nvfx->idxbuf.index_size * info->count; + if (nvfx->screen->index_buffer_reloc_flags + && (nvfx->idxbuf.index_size == 2 || nvfx->idxbuf.index_size == 4) + && !(nvfx->idxbuf.offset & (nvfx->idxbuf.index_size - 1))) + { + struct nvfx_buffer* buffer = nvfx_buffer(nvfx->idxbuf.buffer); + buffer->bytes_to_draw_until_static -= index_inline_cost; - if (ib != nvfx->idxbuf_buffer || - type != nvfx->idxbuf_format) { - nvfx->dirty |= NVFX_NEW_ARRAYS; - nvfx->idxbuf_buffer = ib; - nvfx->idxbuf_format = type; - } + prefer_hardware_indices = TRUE; - return TRUE; -} + if (!nvfx_buffer_seems_static(buffer)) + { + index_hardware_cost = buffer->dirty_end - buffer->dirty_begin; + if (!buffer->base.bo) + index_hardware_cost += nvfx->screen->buffer_allocation_cost; + } -// type must be floating point -static inline void -nvfx_vbo_static_attrib(struct nvfx_context *nvfx, - int attrib, struct pipe_vertex_element *ve, - struct 
pipe_vertex_buffer *vb, unsigned ncomp) -{ - struct pipe_transfer *transfer; - struct nouveau_channel* chan = nvfx->screen->base.channel; - void *map; - float *v; - - map = pipe_buffer_map(&nvfx->pipe, vb->buffer, PIPE_TRANSFER_READ, &transfer); - map = (uint8_t *) map + vb->buffer_offset + ve->src_offset; - - v = map; - - switch (ncomp) { - case 4: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_4F_X(attrib), 4)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - OUT_RING(chan, fui(v[2])); - OUT_RING(chan, fui(v[3])); - break; - case 3: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_3F_X(attrib), 3)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - OUT_RING(chan, fui(v[2])); - break; - case 2: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_2F_X(attrib), 2)); - OUT_RING(chan, fui(v[0])); - OUT_RING(chan, fui(v[1])); - break; - case 1: - OUT_RING(chan, RING_3D(NV34TCL_VTX_ATTR_1F(attrib), 1)); - OUT_RING(chan, fui(v[0])); - break; + if ((float) index_inline_cost < (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost) + { + best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_inline_cost; + } + else + { + best_index_cost_for_hardware_vertices_as_inline_cost = (float) index_hardware_cost * nvfx->screen->inline_cost_per_hardware_cost; + prefer_hardware_indices = TRUE; + } + } } - pipe_buffer_unmap(&nvfx->pipe, vb->buffer, transfer); + /* let's finally figure out which of the 3 paths we want to take */ + if ((float) (inline_cost + index_inline_cost) > ((float) hardware_cost * nvfx->screen->inline_cost_per_hardware_cost + best_index_cost_for_hardware_vertices_as_inline_cost)) + upload_mode = 1 + prefer_hardware_indices; + else + upload_mode = 0; + +#ifdef DEBUG + if (unlikely(nvfx->screen->trace_draw)) + { + fprintf(stderr, "DRAW"); + if (info->indexed) + { + fprintf(stderr, "_IDX%u", nvfx->idxbuf.index_size); + if (info->index_bias) + fprintf(stderr, " biased %u", info->index_bias); + fprintf(stderr, " idxrange %u -> %u", 
info->min_index, info->max_index); + } + if (info->instance_count > 1) + fprintf(stderr, " %u instances from %u", info->instance_count, info->indexed); + fprintf(stderr, " start %u count %u prim %u", info->start, info->count, info->mode); + if (!upload_mode) + fprintf(stderr, " -> inline vertex data"); + else if (upload_mode == 2 || !info->indexed) + fprintf(stderr, " -> buffer range"); + else + fprintf(stderr, " -> inline indices"); + fprintf(stderr, " [ivtx %u hvtx %u iidx %u hidx %u bidx %f] <", inline_cost, hardware_cost, index_inline_cost, index_hardware_cost, best_index_cost_for_hardware_vertices_as_inline_cost); + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + if (i) + fprintf(stderr, ", "); + fprintf(stderr, "%p%s left %Li", buffer, buffer->last_update_static ? 
" static" : "", buffer->bytes_to_draw_until_static); + } + fprintf(stderr, ">\n"); + } +#endif + + return upload_mode; } -static void -nvfx_draw_arrays(struct pipe_context *pipe, - unsigned mode, unsigned start, unsigned count) +void nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - unsigned restart = 0; - - nvfx_vbo_set_idxbuf(nvfx, NULL, 0); - if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { - nvfx_draw_elements_swtnl(pipe, NULL, 0, 0, - mode, start, count); - return; - } + unsigned upload_mode = 0; - while (count) { - unsigned vc, nr, avail; + if (!nvfx->vtxelt->needs_translate) + upload_mode = nvfx_decide_upload_mode(pipe, info); - nvfx_state_emit(nvfx); + nvfx->use_index_buffer = upload_mode > 1; - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ + if ((upload_mode > 0) != nvfx->use_vertex_buffers) + { + nvfx->use_vertex_buffers = (upload_mode > 0); + nvfx->dirty |= NVFX_NEW_ARRAYS; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; + } - vc = nouveau_vbuf_split(avail, 6, 256, - mode, start, count, &restart); - if (!vc) { - FIRE_RING(chan); - continue; + if (upload_mode > 0) + { + for (unsigned i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; i++) + { + struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index]; + nvfx_buffer_upload(nvfx_buffer(vb->buffer)); } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + if (upload_mode > 1) + { + nvfx_buffer_upload(nvfx_buffer(nvfx->idxbuf.buffer)); - nr = (vc & 0xff); - if (nr) { - OUT_RING(chan, RING_3D(NV34TCL_VB_VERTEX_BATCH, 1)); - OUT_RING (chan, ((nr - 1) << 24) | start); - start += nr; + if 
(unlikely(info->index_bias != nvfx->base_vertex)) + { + nvfx->base_vertex = info->index_bias; + nvfx->dirty |= NVFX_NEW_ARRAYS; + } } - - nr = vc >> 8; - while (nr) { - unsigned push = nr > 2047 ? 2047 : nr; - - nr -= push; - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_VERTEX_BATCH, push)); - while (push--) { - OUT_RING(chan, ((0x100 - 1) << 24) | start); - start += 0x100; + else + { + if (unlikely(info->start < nvfx->base_vertex && nvfx->base_vertex)) + { + nvfx->base_vertex = 0; + nvfx->dirty |= NVFX_NEW_ARRAYS; } } - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); - - count -= vc; - start = restart; } - pipe->flush(pipe, 0, NULL); + if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) + nvfx_draw_vbo_swtnl(pipe, info); + else + nvfx_push_vbo(pipe, info); } -static INLINE void -nvfx_draw_elements_u08(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) +boolean +nvfx_vbo_validate(struct nvfx_context *nvfx) { - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; + struct nouveau_channel* chan = nvfx->screen->base.channel; + int i; + int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); + unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD; - while (count) { - uint8_t *elts = (uint8_t *)ib + start; - unsigned vc, push, restart = 0, avail; + if (!elements) + return TRUE; - nvfx_state_emit(nvfx); + MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2); + for(unsigned i = 0; i < nvfx->vtxelt->num_constant; ++i) + { + struct nvfx_low_frequency_element *ve = &nvfx->vtxelt->constant[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nvfx_buffer* buffer = nvfx_buffer(vb->buffer); + float v[4]; + ve->fetch_rgba_float(v, buffer->data + vb->buffer_offset + ve->src_offset, 0, 0); + nvfx_emit_vtx_attr(chan, ve->idx, v, ve->ncomp); + } - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for 
the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - vc = nouveau_vbuf_split(avail, 6, 2, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; + OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements)); + if(nvfx->use_vertex_buffers) + { + unsigned idx = 0; + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + if(idx != ve->idx) + { + assert(idx < ve->idx); + OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], ve->idx - idx); + idx = ve->idx; + } - if (vc & 1) { - OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); - OUT_RING (chan, elts[0]); - elts++; vc--; + OUT_RING(chan, nvfx->vtxelt->vtxfmt[idx] | (vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT)); + ++idx; } + if(idx != nvfx->vtxelt->num_elements) + OUT_RINGp(chan, &nvfx->vtxelt->vtxfmt[idx], nvfx->vtxelt->num_elements - idx); + } + else + OUT_RINGp(chan, nvfx->vtxelt->vtxfmt, nvfx->vtxelt->num_elements); - while (vc) { - unsigned i; - - push = MIN2(vc, 2047 * 2); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); - for (i = 0; i < push; i+=2) - OUT_RING(chan, (elts[i+1] << 16) | elts[i]); + for(i = nvfx->vtxelt->num_elements; i < elements; ++i) + OUT_RING(chan, NV34TCL_VTXFMT_TYPE_32_FLOAT); - vc -= push; - elts += push; + if(nvfx->is_nv4x) { + unsigned i; + /* seems to be some kind of cache flushing */ + for(i = 0; i < 3; ++i) { + OUT_RING(chan, RING_3D(0x1718, 1)); + OUT_RING(chan, 0); } - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); - - start = restart; } -} - -static INLINE void -nvfx_draw_elements_u16(struct nvfx_context *nvfx, void *ib, - unsigned mode, unsigned start, unsigned count) -{ - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel 
*chan = screen->base.channel; - - while (count) { - uint16_t *elts = (uint16_t *)ib + start; - unsigned vc, push, restart = 0, avail; - - nvfx_state_emit(nvfx); - - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - vc = nouveau_vbuf_split(avail, 6, 2, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; + OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements)); + if(nvfx->use_vertex_buffers) + { + unsigned idx = 0; + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + for(; idx < ve->idx; ++idx) + OUT_RING(chan, 0); - if (vc & 1) { - OUT_RING(chan, RING_3D(NV34TCL_VB_ELEMENT_U32, 1)); - OUT_RING (chan, elts[0]); - elts++; vc--; + OUT_RELOC(chan, bo, + vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); + ++idx; } - while (vc) { - unsigned i; - - push = MIN2(vc, 2047 * 2); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U16, push >> 1)); - for (i = 0; i < push; i+=2) - OUT_RING(chan, (elts[i+1] << 16) | elts[i]); - - vc -= push; - elts += push; - } + for(; idx < elements; ++idx) + OUT_RING(chan, 0); + } + else + { + for (i = 0; i < elements; i++) + OUT_RING(chan, 0); + } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + OUT_RING(chan, RING_3D(0x1710, 1)); + OUT_RING(chan, 0); - start = restart; - } + nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; + nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF; + return TRUE; } -static INLINE void -nvfx_draw_elements_u32(struct nvfx_context *nvfx, void *ib, - 
unsigned mode, unsigned start, unsigned count) +void +nvfx_vbo_relocate(struct nvfx_context *nvfx) { - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - - while (count) { - uint32_t *elts = (uint32_t *)ib + start; - unsigned vc, push, restart = 0, avail; - - nvfx_state_emit(nvfx); - - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ - - vc = nouveau_vbuf_split(avail, 5, 1, - mode, start, count, &restart); - if (vc == 0) { - FIRE_RING(chan); - continue; - } - count -= vc; - - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); - - while (vc) { - push = MIN2(vc, 2047); - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_ELEMENT_U32, push)); - OUT_RINGp (chan, elts, push); + struct nouveau_channel* chan; + unsigned vb_flags; + int i; - vc -= push; - elts += push; - } + if(!nvfx->use_vertex_buffers) + return; - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + chan = nvfx->screen->base.channel; + vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - start = restart; + MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3); + for (i = 0; i < nvfx->vtxelt->num_per_vertex; i++) { + struct nvfx_per_vertex_element *ve = &nvfx->vtxelt->per_vertex[i]; + struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; + struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; + + OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(ve->idx), 1), + vb_flags, 0, 0); + OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset + nvfx->base_vertex * vb->stride, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); } + nvfx->relocs_needed &=~ NVFX_RELOCATE_VTXBUF; } static void -nvfx_draw_elements_inline(struct pipe_context *pipe, - struct pipe_resource *ib, - unsigned ib_size, int ib_bias, - unsigned mode, unsigned start, unsigned 
count) +nvfx_idxbuf_emit(struct nvfx_context* nvfx, unsigned ib_flags) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct pipe_transfer *transfer; - void *map; - - map = pipe_buffer_map(pipe, ib, PIPE_TRANSFER_READ, &transfer); - if (!ib) { - NOUVEAU_ERR("failed mapping ib\n"); - return; - } + struct nouveau_channel* chan = nvfx->screen->base.channel; + unsigned ib_format = (nvfx->idxbuf.index_size == 2) ? NV34TCL_IDXBUF_FORMAT_TYPE_U16 : NV34TCL_IDXBUF_FORMAT_TYPE_U32; + struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf.buffer)->bo; + ib_flags |= nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD; - assert(ib_bias == 0); - - switch (ib_size) { - case 1: - nvfx_draw_elements_u08(nvfx, map, mode, start, count); - break; - case 2: - nvfx_draw_elements_u16(nvfx, map, mode, start, count); - break; - case 4: - nvfx_draw_elements_u32(nvfx, map, mode, start, count); - break; - default: - NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); - break; - } + assert(nvfx->screen->index_buffer_reloc_flags); - pipe_buffer_unmap(pipe, ib, transfer); + MARK_RING(chan, 3, 3); + if(ib_flags & NOUVEAU_BO_DUMMY) + OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), ib_flags, 0, 0); + else + OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2)); + OUT_RELOC(chan, bo, nvfx->idxbuf.offset + 1, ib_flags | NOUVEAU_BO_LOW, 0, 0); + OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR, + 0, NV34TCL_IDXBUF_FORMAT_DMA1); + nvfx->relocs_needed &=~ NVFX_RELOCATE_IDXBUF; } -static void -nvfx_draw_elements_vbo(struct pipe_context *pipe, - unsigned mode, unsigned start, unsigned count) +void +nvfx_idxbuf_validate(struct nvfx_context* nvfx) { - struct nvfx_context *nvfx = nvfx_context(pipe); - struct nvfx_screen *screen = nvfx->screen; - struct nouveau_channel *chan = screen->base.channel; - unsigned restart = 0; + nvfx_idxbuf_emit(nvfx, 0); +} - while (count) { - unsigned nr, vc, avail; +void +nvfx_idxbuf_relocate(struct nvfx_context* nvfx) +{ + nvfx_idxbuf_emit(nvfx, 
NOUVEAU_BO_DUMMY); +} - nvfx_state_emit(nvfx); +unsigned nvfx_vertex_formats[PIPE_FORMAT_COUNT] = +{ + [PIPE_FORMAT_R32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32B32A32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R32G32B32_FLOAT] = NV34TCL_VTXFMT_TYPE_32_FLOAT, + [PIPE_FORMAT_R16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16B16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R16G16B16A16_FLOAT] = NV34TCL_VTXFMT_TYPE_16_FLOAT, + [PIPE_FORMAT_R8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8A8_UNORM] = NV34TCL_VTXFMT_TYPE_8_UNORM, + [PIPE_FORMAT_R8G8B8A8_USCALED] = NV34TCL_VTXFMT_TYPE_8_USCALED, + [PIPE_FORMAT_R16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16B16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16G16B16A16_SNORM] = NV34TCL_VTXFMT_TYPE_16_SNORM, + [PIPE_FORMAT_R16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16B16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, + [PIPE_FORMAT_R16G16B16A16_SSCALED] = NV34TCL_VTXFMT_TYPE_16_SSCALED, +}; + +static void * +nvfx_vtxelts_state_create(struct pipe_context *pipe, + unsigned num_elements, + const struct pipe_vertex_element *elements) +{ + struct nvfx_vtxelt_state *cso = CALLOC_STRUCT(nvfx_vtxelt_state); + struct translate_key transkey; + unsigned per_vertex_size[16]; + unsigned vb_compacted_index[16]; - avail = AVAIL_RING(chan); - avail -= 16 + (avail >> 10); /* for the BEGIN_RING_NIs, conservatively assuming one every 1024, plus 16 for safety */ + if(num_elements > 16) + { + _debug_printf("Error: application attempted to use %u vertex elements, but only 16 
are supported: ignoring the rest\n", num_elements); + num_elements = 16; + } - vc = nouveau_vbuf_split(avail, 6, 256, - mode, start, count, &restart); - if (!vc) { - FIRE_RING(chan); - continue; - } + memset(per_vertex_size, 0, sizeof(per_vertex_size)); + memcpy(cso->pipe, elements, num_elements * sizeof(elements[0])); + cso->num_elements = num_elements; + cso->needs_translate = FALSE; + + transkey.nr_elements = 0; + transkey.output_stride = 0; + + for(unsigned i = 0; i < num_elements; ++i) + { + const struct pipe_vertex_element* ve = &elements[i]; + if(!ve->instance_divisor) + per_vertex_size[ve->vertex_buffer_index] += util_format_get_stride(ve->src_format, 1); + } + + for(unsigned i = 0; i < 16; ++i) + { + if(per_vertex_size[i]) + { + unsigned idx = cso->num_per_vertex_buffer_infos++; + cso->per_vertex_buffer_info[idx].vertex_buffer_index = i; + cso->per_vertex_buffer_info[idx].per_vertex_size = per_vertex_size[i]; + vb_compacted_index[i] = idx; + } + } + + for(unsigned i = 0; i < num_elements; ++i) + { + const struct pipe_vertex_element* ve = &elements[i]; + unsigned type = nvfx_vertex_formats[ve->src_format]; + unsigned ncomp = util_format_get_nr_components(ve->src_format); - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, nvgl_primitive(mode)); + //if(ve->frequency != PIPE_ELEMENT_FREQUENCY_PER_VERTEX) + if(ve->instance_divisor) + { + struct nvfx_low_frequency_element* lfve; + cso->vtxfmt[i] = NV34TCL_VTXFMT_TYPE_32_FLOAT; + + //if(ve->frequency == PIPE_ELEMENT_FREQUENCY_CONSTANT) + if(0) + lfve = &cso->constant[cso->num_constant++]; + else + { + lfve = &cso->per_instance[cso->num_per_instance++].base; + ((struct nvfx_per_instance_element*)lfve)->instance_divisor = ve->instance_divisor; + } - nr = (vc & 0xff); - if (nr) { - OUT_RING(chan, RING_3D(NV34TCL_VB_INDEX_BATCH, 1)); - OUT_RING (chan, ((nr - 1) << 24) | start); - start += nr; + lfve->idx = i; + lfve->vertex_buffer_index = ve->vertex_buffer_index; + lfve->src_offset = 
ve->src_offset; + lfve->fetch_rgba_float = util_format_description(ve->src_format)->fetch_rgba_float; + lfve->ncomp = ncomp; } - - nr = vc >> 8; - while (nr) { - unsigned push = nr > 2047 ? 2047 : nr; - - nr -= push; - - OUT_RING(chan, RING_3D_NI(NV34TCL_VB_INDEX_BATCH, push)); - while (push--) { - OUT_RING(chan, ((0x100 - 1) << 24) | start); - start += 0x100; + else + { + unsigned idx; + + idx = cso->num_per_vertex++; + cso->per_vertex[idx].idx = i; + cso->per_vertex[idx].vertex_buffer_index = ve->vertex_buffer_index; + cso->per_vertex[idx].src_offset = ve->src_offset; + + idx = transkey.nr_elements++; + transkey.element[idx].input_format = ve->src_format; + transkey.element[idx].input_buffer = vb_compacted_index[ve->vertex_buffer_index]; + transkey.element[idx].input_offset = ve->src_offset; + transkey.element[idx].instance_divisor = 0; + transkey.element[idx].type = TRANSLATE_ELEMENT_NORMAL; + if(type) + { + transkey.element[idx].output_format = ve->src_format; + cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type; + } + else + { + unsigned float32[4] = {PIPE_FORMAT_R32_FLOAT, PIPE_FORMAT_R32G32_FLOAT, PIPE_FORMAT_R32G32B32_FLOAT, PIPE_FORMAT_R32G32B32A32_FLOAT}; + transkey.element[idx].output_format = float32[ncomp - 1]; + cso->needs_translate = TRUE; + cso->vtxfmt[i] = (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | NV34TCL_VTXFMT_TYPE_32_FLOAT; } + transkey.element[idx].output_offset = transkey.output_stride; + transkey.output_stride += (util_format_get_stride(transkey.element[idx].output_format, 1) + 3) & ~3; } + } - OUT_RING(chan, RING_3D(NV34TCL_VERTEX_BEGIN_END, 1)); - OUT_RING (chan, 0); + cso->translate = translate_create(&transkey); + cso->vertex_length = transkey.output_stride >> 2; + cso->max_vertices_per_packet = 2047 / cso->vertex_length; - count -= vc; - start = restart; - } + return (void *)cso; } static void -nvfx_draw_elements(struct pipe_context *pipe, - struct pipe_resource *indexBuffer, - unsigned indexSize, int indexBias, - unsigned mode, 
unsigned start, unsigned count) +nvfx_vtxelts_state_delete(struct pipe_context *pipe, void *hwcso) { - struct nvfx_context *nvfx = nvfx_context(pipe); - boolean idxbuf; - - idxbuf = nvfx_vbo_set_idxbuf(nvfx, indexBuffer, indexSize); - if (nvfx->screen->force_swtnl || !nvfx_state_validate(nvfx)) { - nvfx_draw_elements_swtnl(pipe, - indexBuffer, indexSize, indexBias, - mode, start, count); - return; - } - - if (idxbuf) { - nvfx_draw_elements_vbo(pipe, mode, start, count); - } else { - nvfx_draw_elements_inline(pipe, - indexBuffer, indexSize, indexBias, - mode, start, count); - } - - pipe->flush(pipe, 0, NULL); + FREE(hwcso); } -void -nvfx_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +static void +nvfx_vtxelts_state_bind(struct pipe_context *pipe, void *hwcso) { struct nvfx_context *nvfx = nvfx_context(pipe); - if (info->indexed && nvfx->idxbuf.buffer) { - unsigned offset; - - assert(nvfx->idxbuf.offset % nvfx->idxbuf.index_size == 0); - offset = nvfx->idxbuf.offset / nvfx->idxbuf.index_size; - - nvfx_draw_elements(pipe, - nvfx->idxbuf.buffer, - nvfx->idxbuf.index_size, - info->index_bias, - info->mode, - info->start + offset, - info->count); - } - else { - nvfx_draw_arrays(pipe, - info->mode, - info->start, - info->count); - } + nvfx->vtxelt = hwcso; + nvfx->use_vertex_buffers = -1; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; } -boolean -nvfx_vbo_validate(struct nvfx_context *nvfx) +static void +nvfx_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) { - struct nouveau_channel* chan = nvfx->screen->base.channel; - struct pipe_resource *ib = nvfx->idxbuf_buffer; - unsigned ib_format = nvfx->idxbuf_format; - int i; - int elements = MAX2(nvfx->vtxelt->num_elements, nvfx->hw_vtxelt_nr); - uint32_t vtxfmt[16]; - unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD; - - if (!elements) - return TRUE; - - nvfx->vbo_bo = 0; - - MARK_RING(chan, (5 + 2) * 16 + 2 + 11, 16 + 2); - for (i 
= 0; i < nvfx->vtxelt->num_elements; i++) { - struct pipe_vertex_element *ve; - struct pipe_vertex_buffer *vb; - unsigned type, ncomp; - - ve = &nvfx->vtxelt->pipe[i]; - vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - - if (nvfx_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { - MARK_UNDO(chan); - nvfx->fallback_swtnl |= NVFX_NEW_ARRAYS; - return FALSE; - } + struct nvfx_context *nvfx = nvfx_context(pipe); - if (!vb->stride && type == NV34TCL_VTXFMT_TYPE_FLOAT) { - nvfx_vbo_static_attrib(nvfx, i, ve, vb, ncomp); - vtxfmt[i] = type; - } else { - vtxfmt[i] = ((vb->stride << NV34TCL_VTXFMT_STRIDE_SHIFT) | - (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type); - nvfx->vbo_bo |= (1 << i); - } + for(unsigned i = 0; i < count; ++i) + { + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, vb[i].buffer); + nvfx->vtxbuf[i].buffer_offset = vb[i].buffer_offset; + nvfx->vtxbuf[i].max_index = vb[i].max_index; + nvfx->vtxbuf[i].stride = vb[i].stride; } - for(; i < elements; ++i) - vtxfmt[i] = NV34TCL_VTXFMT_TYPE_FLOAT; + for(unsigned i = count; i < nvfx->vtxbuf_nr; ++i) + pipe_resource_reference(&nvfx->vtxbuf[i].buffer, 0); - OUT_RING(chan, RING_3D(NV34TCL_VTXFMT(0), elements)); - OUT_RINGp(chan, vtxfmt, elements); - - if(nvfx->is_nv4x) { - unsigned i; - /* seems to be some kind of cache flushing */ - for(i = 0; i < 3; ++i) { - OUT_RING(chan, RING_3D(0x1718, 1)); - OUT_RING(chan, 0); - } - } - - OUT_RING(chan, RING_3D(NV34TCL_VTXBUF_ADDRESS(0), elements)); - for (i = 0; i < nvfx->vtxelt->num_elements; i++) { - struct pipe_vertex_element *ve; - struct pipe_vertex_buffer *vb; + nvfx->vtxbuf_nr = count; + nvfx->use_vertex_buffers = -1; + nvfx->draw_dirty |= NVFX_NEW_ARRAYS; +} - ve = &nvfx->vtxelt->pipe[i]; - vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; +static void +nvfx_set_index_buffer(struct pipe_context *pipe, + const struct pipe_index_buffer *ib) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); - if (!(nvfx->vbo_bo & (1 << i))) - OUT_RING(chan, 0); - else - { - struct nouveau_bo* 
bo = nvfx_resource(vb->buffer)->bo; - OUT_RELOC(chan, bo, - vb->buffer_offset + ve->src_offset, - vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, - 0, NV34TCL_VTXBUF_ADDRESS_DMA1); - } + if(ib) + { + pipe_resource_reference(&nvfx->idxbuf.buffer, ib->buffer); + nvfx->idxbuf.index_size = ib->index_size; + nvfx->idxbuf.offset = ib->offset; } - - for (; i < elements; i++) - OUT_RING(chan, 0); - - OUT_RING(chan, RING_3D(0x1710, 1)); - OUT_RING(chan, 0); - - if (ib) { - unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD; - struct nouveau_bo* bo = nvfx_resource(ib)->bo; - - assert(nvfx->screen->index_buffer_reloc_flags); - - OUT_RING(chan, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2)); - OUT_RELOC(chan, bo, 0, ib_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, ib_format, ib_flags | NOUVEAU_BO_OR, - 0, NV34TCL_IDXBUF_FORMAT_DMA1); + else + { + pipe_resource_reference(&nvfx->idxbuf.buffer, 0); + nvfx->idxbuf.index_size = 0; + nvfx->idxbuf.offset = 0; } - nvfx->hw_vtxelt_nr = nvfx->vtxelt->num_elements; - return TRUE; + nvfx->dirty |= NVFX_NEW_INDEX; + nvfx->draw_dirty |= NVFX_NEW_INDEX; } void -nvfx_vbo_relocate(struct nvfx_context *nvfx) +nvfx_init_vbo_functions(struct nvfx_context *nvfx) { - struct nouveau_channel* chan = nvfx->screen->base.channel; - unsigned vb_flags = nvfx->screen->vertex_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - int i; + nvfx->pipe.set_vertex_buffers = nvfx_set_vertex_buffers; + nvfx->pipe.set_index_buffer = nvfx_set_index_buffer; - MARK_RING(chan, 2 * 16 + 3, 2 * 16 + 3); - for(i = 0; i < nvfx->vtxelt->num_elements; ++i) { - if(nvfx->vbo_bo & (1 << i)) { - struct pipe_vertex_element *ve = &nvfx->vtxelt->pipe[i]; - struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->vertex_buffer_index]; - struct nouveau_bo* bo = nvfx_resource(vb->buffer)->bo; - OUT_RELOC(chan, bo, RING_3D(NV34TCL_VTXBUF_ADDRESS(i), 1), - vb_flags, 0, 0); - OUT_RELOC(chan, bo, vb->buffer_offset + ve->src_offset, - vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, - 
0, NV34TCL_VTXBUF_ADDRESS_DMA1); - } - } - - if(nvfx->idxbuf_buffer) - { - unsigned ib_flags = nvfx->screen->index_buffer_reloc_flags | NOUVEAU_BO_RD | NOUVEAU_BO_DUMMY; - struct nouveau_bo* bo = nvfx_resource(nvfx->idxbuf_buffer)->bo; - - assert(nvfx->screen->index_buffer_reloc_flags); - - OUT_RELOC(chan, bo, RING_3D(NV34TCL_IDXBUF_ADDRESS, 2), - ib_flags, 0, 0); - OUT_RELOC(chan, bo, 0, - ib_flags | NOUVEAU_BO_LOW, 0, 0); - OUT_RELOC(chan, bo, nvfx->idxbuf_format, - ib_flags | NOUVEAU_BO_OR, - 0, NV34TCL_IDXBUF_FORMAT_DMA1); - } + nvfx->pipe.create_vertex_elements_state = nvfx_vtxelts_state_create; + nvfx->pipe.delete_vertex_elements_state = nvfx_vtxelts_state_delete; + nvfx->pipe.bind_vertex_elements_state = nvfx_vtxelts_state_bind; } diff --git a/src/gallium/drivers/nvfx/nvfx_vertprog.c b/src/gallium/drivers/nvfx/nvfx_vertprog.c index 24d9846310e..ea7e88c5613 100644 --- a/src/gallium/drivers/nvfx/nvfx_vertprog.c +++ b/src/gallium/drivers/nvfx/nvfx_vertprog.c @@ -1,15 +1,19 @@ #include "pipe/p_context.h" #include "pipe/p_defines.h" #include "pipe/p_state.h" -#include "util/u_inlines.h" +#include "util/u_linkage.h" +#include "util/u_debug.h" #include "pipe/p_shader_tokens.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_dump.h" #include "tgsi/tgsi_util.h" +#include "draw/draw_context.h" + #include "nvfx_context.h" #include "nvfx_state.h" +#include "nvfx_resource.h" /* TODO (at least...): * 1. 
Indexed consts + ARL @@ -25,26 +29,34 @@ #include "nv30_vertprog.h" #include "nv40_vertprog.h" -#define NVFX_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n)) +struct nvfx_loop_entry +{ + unsigned brk_target; + unsigned cont_target; +}; struct nvfx_vpc { + struct nvfx_context* nvfx; struct nvfx_vertex_program *vp; struct nvfx_vertex_program_exec *vpi; unsigned r_temps; unsigned r_temps_discard; - struct nvfx_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; - struct nvfx_sreg *r_address; - struct nvfx_sreg *r_temp; + struct nvfx_reg r_result[PIPE_MAX_SHADER_OUTPUTS]; + struct nvfx_reg *r_address; + struct nvfx_reg *r_temp; - struct nvfx_sreg *imm; + struct nvfx_reg *imm; unsigned nr_imm; unsigned hpos_idx; + + struct util_dynarray label_relocs; + struct util_dynarray loop_stack; }; -static struct nvfx_sreg +static struct nvfx_reg temp(struct nvfx_vpc *vpc) { int idx = ffs(~vpc->r_temps) - 1; @@ -52,22 +64,22 @@ temp(struct nvfx_vpc *vpc) if (idx < 0) { NOUVEAU_ERR("out of temps!!\n"); assert(0); - return nvfx_sr(NVFXSR_TEMP, 0); + return nvfx_reg(NVFXSR_TEMP, 0); } vpc->r_temps |= (1 << idx); vpc->r_temps_discard |= (1 << idx); - return nvfx_sr(NVFXSR_TEMP, idx); + return nvfx_reg(NVFXSR_TEMP, idx); } -static INLINE void +static inline void release_temps(struct nvfx_vpc *vpc) { vpc->r_temps &= ~vpc->r_temps_discard; vpc->r_temps_discard = 0; } -static struct nvfx_sreg +static struct nvfx_reg constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w) { struct nvfx_vertex_program *vp = vpc->vp; @@ -77,7 +89,7 @@ constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w) if (pipe >= 0) { for (idx = 0; idx < vp->nr_consts; idx++) { if (vp->consts[idx].index == pipe) - return nvfx_sr(NVFXSR_CONST, idx); + return nvfx_reg(NVFXSR_CONST, idx); } } @@ -90,35 +102,36 @@ constant(struct nvfx_vpc *vpc, int pipe, float x, float y, float z, float w) vpd->value[1] = y; vpd->value[2] = z; vpd->value[3] = w; - return nvfx_sr(NVFXSR_CONST, idx); + return 
nvfx_reg(NVFXSR_CONST, idx); } -#define arith(cc,s,o,d,m,s0,s1,s2) \ - nvfx_vp_arith(nvfx, (cc), NVFX_VP_INST_SLOT_##s, NVFX_VP_INST_##s##_OP_##o, (d), (m), (s0), (s1), (s2)) +#define arith(s,o,d,m,s0,s1,s2) \ + nvfx_insn(0, (NVFX_VP_INST_SLOT_##s << 7) | NVFX_VP_INST_##s##_OP_##o, -1, (d), (m), (s0), (s1), (s2)) static void -emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_sreg src) +emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, struct nvfx_src src) { struct nvfx_vertex_program *vp = vpc->vp; uint32_t sr = 0; + struct nvfx_relocation reloc; - switch (src.type) { + switch (src.reg.type) { case NVFXSR_TEMP: sr |= (NVFX_VP(SRC_REG_TYPE_TEMP) << NVFX_VP(SRC_REG_TYPE_SHIFT)); - sr |= (src.index << NVFX_VP(SRC_TEMP_SRC_SHIFT)); + sr |= (src.reg.index << NVFX_VP(SRC_TEMP_SRC_SHIFT)); break; case NVFXSR_INPUT: sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) << NVFX_VP(SRC_REG_TYPE_SHIFT)); - vp->ir |= (1 << src.index); - hw[1] |= (src.index << NVFX_VP(INST_INPUT_SRC_SHIFT)); + vp->ir |= (1 << src.reg.index); + hw[1] |= (src.reg.index << NVFX_VP(INST_INPUT_SRC_SHIFT)); break; case NVFXSR_CONST: sr |= (NVFX_VP(SRC_REG_TYPE_CONST) << NVFX_VP(SRC_REG_TYPE_SHIFT)); - assert(vpc->vpi->const_index == -1 || - vpc->vpi->const_index == src.index); - vpc->vpi->const_index = src.index; + reloc.location = vp->nr_insns - 1; + reloc.target = src.reg.index; + util_dynarray_append(&vp->const_relocs, struct nvfx_relocation, reloc); break; case NVFXSR_NONE: sr |= (NVFX_VP(SRC_REG_TYPE_INPUT) << @@ -161,100 +174,67 @@ emit_src(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int pos, } static void -emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_sreg dst) +emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot, struct nvfx_reg dst) { struct nvfx_vertex_program *vp = vpc->vp; switch (dst.type) { + case NVFXSR_NONE: + if(!nvfx->is_nv4x) + hw[0] 
|= NV30_VP_INST_DEST_TEMP_ID_MASK; + else { + hw[3] |= NV40_VP_INST_DEST_MASK; + if (slot == 0) + hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK; + else + hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; + } + break; case NVFXSR_TEMP: if(!nvfx->is_nv4x) hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT); else { hw[3] |= NV40_VP_INST_DEST_MASK; - if (slot == 0) { - hw[0] |= (dst.index << - NV40_VP_INST_VEC_DEST_TEMP_SHIFT); - } else { - hw[3] |= (dst.index << - NV40_VP_INST_SCA_DEST_TEMP_SHIFT); - } + if (slot == 0) + hw[0] |= (dst.index << NV40_VP_INST_VEC_DEST_TEMP_SHIFT); + else + hw[3] |= (dst.index << NV40_VP_INST_SCA_DEST_TEMP_SHIFT); } break; case NVFXSR_OUTPUT: /* TODO: this may be wrong because on nv30 COL0 and BFC0 are swapped */ - switch (dst.index) { - case NVFX_VP_INST_DEST_CLIP(0): - vp->or |= (1 << 6); - vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0; - dst.index = NVFX_VP(INST_DEST_FOGC); - break; - case NVFX_VP_INST_DEST_CLIP(1): - vp->or |= (1 << 7); - vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1; - dst.index = NVFX_VP(INST_DEST_FOGC); - break; - case NVFX_VP_INST_DEST_CLIP(2): - vp->or |= (1 << 8); - vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2; - dst.index = NVFX_VP(INST_DEST_FOGC); - break; - case NVFX_VP_INST_DEST_CLIP(3): - vp->or |= (1 << 9); - vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3; - dst.index = NVFX_VP(INST_DEST_PSZ); - break; - case NVFX_VP_INST_DEST_CLIP(4): - vp->or |= (1 << 10); - vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4; - dst.index = NVFX_VP(INST_DEST_PSZ); - break; - case NVFX_VP_INST_DEST_CLIP(5): - vp->or |= (1 << 11); - vp->clip_ctrl |= NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5; - dst.index = NVFX_VP(INST_DEST_PSZ); - break; - default: - if(!nvfx->is_nv4x) { - switch (dst.index) { - case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; - case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; - case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; - case NV30_VP_INST_DEST_BFC1 : 
vp->or |= (1 << 3); break; - case NV30_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break; - case NV30_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; - case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; - case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; - case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; - case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; - case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; - case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; - case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; - case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; - } - } else { - switch (dst.index) { - case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; - case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; - case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; - case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; - case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break; - case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; - case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; - case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; - case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; - case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; - case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; - case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; - case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; - case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; - } + if(nvfx->is_nv4x) { + switch (dst.index) { + case NV30_VP_INST_DEST_CLP(0): + dst.index = NVFX_VP(INST_DEST_FOGC); + break; + case NV30_VP_INST_DEST_CLP(1): + dst.index = NVFX_VP(INST_DEST_FOGC); + break; + case NV30_VP_INST_DEST_CLP(2): + dst.index = NVFX_VP(INST_DEST_FOGC); + break; + case NV30_VP_INST_DEST_CLP(3): + dst.index = NVFX_VP(INST_DEST_PSZ); + break; + case NV30_VP_INST_DEST_CLP(4): + dst.index = NVFX_VP(INST_DEST_PSZ); + break; + case NV30_VP_INST_DEST_CLP(5): + dst.index = 
NVFX_VP(INST_DEST_PSZ); + break; + case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV40_VP_INST_DEST_FOGC: vp->or |= (1 << 4); break; + case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; } - break; } if(!nvfx->is_nv4x) { hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); - hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK; /*XXX: no way this is entirely correct, someone needs to * figure out what exactly it is. @@ -264,7 +244,7 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT); if (slot == 0) { hw[0] |= NV40_VP_INST_VEC_RESULT; - hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK; } else { hw[3] |= NV40_VP_INST_SCA_RESULT; hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; @@ -277,26 +257,27 @@ emit_dst(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, uint32_t *hw, int slot } static void -nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op, - struct nvfx_sreg dst, int mask, - struct nvfx_sreg s0, struct nvfx_sreg s1, - struct nvfx_sreg s2) +nvfx_vp_emit(struct nvfx_vpc *vpc, struct nvfx_insn insn) { + struct nvfx_context* nvfx = vpc->nvfx; struct nvfx_vertex_program *vp = vpc->vp; + unsigned slot = insn.op >> 7; + unsigned op = insn.op & 0x7f; uint32_t *hw; vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); vpc->vpi = &vp->insns[vp->nr_insns - 1]; memset(vpc->vpi, 0, sizeof(*vpc->vpi)); - vpc->vpi->const_index = -1; hw = vpc->vpi->data; - hw[0] |= (NVFX_COND_TR << NVFX_VP(INST_COND_SHIFT)); - hw[0] |= ((0 << NVFX_VP(INST_COND_SWZ_X_SHIFT)) | - (1 << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) | - (2 << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) | - (3 << NVFX_VP(INST_COND_SWZ_W_SHIFT))); + 
hw[0] |= (insn.cc_test << NVFX_VP(INST_COND_SHIFT)); + hw[0] |= ((insn.cc_swz[0] << NVFX_VP(INST_COND_SWZ_X_SHIFT)) | + (insn.cc_swz[1] << NVFX_VP(INST_COND_SWZ_Y_SHIFT)) | + (insn.cc_swz[2] << NVFX_VP(INST_COND_SWZ_Z_SHIFT)) | + (insn.cc_swz[3] << NVFX_VP(INST_COND_SWZ_W_SHIFT))); + if(insn.cc_update) + hw[0] |= NVFX_VP(INST_COND_UPDATE_ENABLE); if(!nvfx->is_nv4x) { if(slot == 0) @@ -309,54 +290,56 @@ nvfx_vp_arith(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, int slot, int op, // hw[3] |= NVFX_VP(INST_SCA_DEST_TEMP_MASK); // hw[3] |= (mask << NVFX_VP(INST_VEC_WRITEMASK_SHIFT)); - if (dst.type == NVFXSR_OUTPUT) { + if (insn.dst.type == NVFXSR_OUTPUT) { if (slot) - hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); + hw[3] |= (insn.mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); else - hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); + hw[3] |= (insn.mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); } else { if (slot) - hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); + hw[3] |= (insn.mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); else - hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); + hw[3] |= (insn.mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); } } else { if (slot == 0) { hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT); hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; - hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT); + hw[3] |= (insn.mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT); } else { hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT); - hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20)); - hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT); + hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK ; + hw[3] |= (insn.mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT); } } - emit_dst(nvfx, vpc, hw, slot, dst); - emit_src(nvfx, vpc, hw, 0, s0); - emit_src(nvfx, vpc, hw, 1, s1); - emit_src(nvfx, vpc, hw, 2, s2); + emit_dst(nvfx, vpc, hw, slot, insn.dst); + emit_src(nvfx, vpc, hw, 0, insn.src[0]); + emit_src(nvfx, vpc, hw, 1, insn.src[1]); + emit_src(nvfx, vpc, hw, 2, 
insn.src[2]); } -static INLINE struct nvfx_sreg +static inline struct nvfx_src tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) { - struct nvfx_sreg src = { 0 }; + struct nvfx_src src; switch (fsrc->Register.File) { case TGSI_FILE_INPUT: - src = nvfx_sr(NVFXSR_INPUT, fsrc->Register.Index); + src.reg = nvfx_reg(NVFXSR_INPUT, fsrc->Register.Index); break; case TGSI_FILE_CONSTANT: - src = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0); + src.reg = constant(vpc, fsrc->Register.Index, 0, 0, 0, 0); break; case TGSI_FILE_IMMEDIATE: - src = vpc->imm[fsrc->Register.Index]; + src.reg = vpc->imm[fsrc->Register.Index]; break; case TGSI_FILE_TEMPORARY: - src = vpc->r_temp[fsrc->Register.Index]; + src.reg = vpc->r_temp[fsrc->Register.Index]; break; default: NOUVEAU_ERR("bad src file\n"); + src.reg.index = 0; + src.reg.type = 0; break; } @@ -369,11 +352,14 @@ tgsi_src(struct nvfx_vpc *vpc, const struct tgsi_full_src_register *fsrc) { return src; } -static INLINE struct nvfx_sreg +static INLINE struct nvfx_reg tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) { - struct nvfx_sreg dst = { 0 }; + struct nvfx_reg dst; switch (fdst->Register.File) { + case TGSI_FILE_NULL: + dst = nvfx_reg(NVFXSR_NONE, 0); + break; case TGSI_FILE_OUTPUT: dst = vpc->r_result[fdst->Register.Index]; break; @@ -384,14 +370,16 @@ tgsi_dst(struct nvfx_vpc *vpc, const struct tgsi_full_dst_register *fdst) { dst = vpc->r_address[fdst->Register.Index]; break; default: - NOUVEAU_ERR("bad dst file\n"); + NOUVEAU_ERR("bad dst file %i\n", fdst->Register.File); + dst.index = 0; + dst.type = 0; break; } return dst; } -static INLINE int +static inline int tgsi_mask(uint tgsi) { int mask = 0; @@ -405,10 +393,14 @@ tgsi_mask(uint tgsi) static boolean nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, - const struct tgsi_full_instruction *finst) + unsigned idx, const struct tgsi_full_instruction *finst) { - struct nvfx_sreg src[3], dst, tmp; - 
struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0); + struct nvfx_src src[3], tmp; + struct nvfx_reg dst; + struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0)); + struct nvfx_insn insn; + struct nvfx_relocation reloc; + struct nvfx_loop_entry loop; int mask; int ai = -1, ci = -1, ii = -1; int i; @@ -436,9 +428,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, ai = fsrc->Register.Index; src[i] = tgsi_src(vpc, fsrc); } else { - src[i] = temp(vpc); - arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL, - tgsi_src(vpc, fsrc), none, none); + src[i] = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none)); } break; case TGSI_FILE_CONSTANT: @@ -447,9 +438,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, ci = fsrc->Register.Index; src[i] = tgsi_src(vpc, fsrc); } else { - src[i] = temp(vpc); - arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL, - tgsi_src(vpc, fsrc), none, none); + src[i] = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none)); } break; case TGSI_FILE_IMMEDIATE: @@ -458,9 +448,8 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, ii = fsrc->Register.Index; src[i] = tgsi_src(vpc, fsrc); } else { - src[i] = temp(vpc); - arith(vpc, VEC, MOV, src[i], NVFX_VP_MASK_ALL, - tgsi_src(vpc, fsrc), none, none); + src[i] = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(VEC, MOV, src[i].reg, NVFX_VP_MASK_ALL, tgsi_src(vpc, fsrc), none, none)); } break; case TGSI_FILE_TEMPORARY: @@ -477,128 +466,231 @@ nvfx_vertprog_parse_instruction(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, switch (finst->Instruction.Opcode) { case TGSI_OPCODE_ABS: - arith(vpc, VEC, MOV, dst, mask, abs(src[0]), none, none); + nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, abs(src[0]), none, none)); break; case TGSI_OPCODE_ADD: - arith(vpc, VEC, ADD, dst, mask, src[0], none, 
src[1]); + nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, src[1])); break; case TGSI_OPCODE_ARL: - arith(vpc, VEC, ARL, dst, mask, src[0], none, none); + nvfx_vp_emit(vpc, arith(VEC, ARL, dst, mask, src[0], none, none)); + break; + case TGSI_OPCODE_CMP: + insn = arith(VEC, MOV, none.reg, mask, src[0], none, none); + insn.cc_update = 1; + nvfx_vp_emit(vpc, insn); + + insn = arith(VEC, MOV, dst, mask, src[2], none, none); + insn.cc_test = NVFX_COND_GE; + nvfx_vp_emit(vpc, insn); + + insn = arith(VEC, MOV, dst, mask, src[1], none, none); + insn.cc_test = NVFX_COND_LT; + nvfx_vp_emit(vpc, insn); break; case TGSI_OPCODE_COS: - arith(vpc, SCA, COS, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, COS, dst, mask, none, none, src[0])); break; + case TGSI_OPCODE_DP2: + tmp = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X | NVFX_VP_MASK_Y, src[0], src[1], none)); + nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, swz(tmp, X, X, X, X), swz(tmp, Y, Y, Y, Y), none)); + break; case TGSI_OPCODE_DP3: - arith(vpc, VEC, DP3, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, DP3, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_DP4: - arith(vpc, VEC, DP4, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, DP4, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_DPH: - arith(vpc, VEC, DPH, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, DPH, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_DST: - arith(vpc, VEC, DST, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, DST, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_EX2: - arith(vpc, SCA, EX2, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_EXP: - arith(vpc, SCA, EXP, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, EXP, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_FLR: - 
arith(vpc, VEC, FLR, dst, mask, src[0], none, none); + nvfx_vp_emit(vpc, arith(VEC, FLR, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_FRC: - arith(vpc, VEC, FRC, dst, mask, src[0], none, none); + nvfx_vp_emit(vpc, arith(VEC, FRC, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_LG2: - arith(vpc, SCA, LG2, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, LG2, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_LIT: - arith(vpc, SCA, LIT, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, LIT, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_LOG: - arith(vpc, SCA, LOG, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, LOG, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_LRP: - tmp = temp(vpc); - arith(vpc, VEC, MAD, tmp, mask, neg(src[0]), src[2], src[2]); - arith(vpc, VEC, MAD, dst, mask, src[0], src[1], tmp); + tmp = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(VEC, MAD, tmp.reg, mask, neg(src[0]), src[2], src[2])); + nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], tmp)); break; case TGSI_OPCODE_MAD: - arith(vpc, VEC, MAD, dst, mask, src[0], src[1], src[2]); + nvfx_vp_emit(vpc, arith(VEC, MAD, dst, mask, src[0], src[1], src[2])); break; case TGSI_OPCODE_MAX: - arith(vpc, VEC, MAX, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, MAX, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_MIN: - arith(vpc, VEC, MIN, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, MIN, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_MOV: - arith(vpc, VEC, MOV, dst, mask, src[0], none, none); + nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, src[0], none, none)); break; case TGSI_OPCODE_MUL: - arith(vpc, VEC, MUL, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, MUL, dst, mask, src[0], src[1], none)); + break; + case TGSI_OPCODE_NOP: break; case TGSI_OPCODE_POW: - tmp = temp(vpc); - arith(vpc, SCA, LG2, tmp, NVFX_VP_MASK_X, none, 
none, - swz(src[0], X, X, X, X)); - arith(vpc, VEC, MUL, tmp, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), - swz(src[1], X, X, X, X), none); - arith(vpc, SCA, EX2, dst, mask, none, none, - swz(tmp, X, X, X, X)); + tmp = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(SCA, LG2, tmp.reg, NVFX_VP_MASK_X, none, none, swz(src[0], X, X, X, X))); + nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, NVFX_VP_MASK_X, swz(tmp, X, X, X, X), swz(src[1], X, X, X, X), none)); + nvfx_vp_emit(vpc, arith(SCA, EX2, dst, mask, none, none, swz(tmp, X, X, X, X))); break; case TGSI_OPCODE_RCP: - arith(vpc, SCA, RCP, dst, mask, none, none, src[0]); - break; - case TGSI_OPCODE_RET: + nvfx_vp_emit(vpc, arith(SCA, RCP, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_RSQ: - arith(vpc, SCA, RSQ, dst, mask, none, none, abs(src[0])); + nvfx_vp_emit(vpc, arith(SCA, RSQ, dst, mask, none, none, abs(src[0]))); break; case TGSI_OPCODE_SEQ: - arith(vpc, VEC, SEQ, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SEQ, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SFL: - arith(vpc, VEC, SFL, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SFL, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SGE: - arith(vpc, VEC, SGE, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SGE, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SGT: - arith(vpc, VEC, SGT, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SGT, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SIN: - arith(vpc, SCA, SIN, dst, mask, none, none, src[0]); + nvfx_vp_emit(vpc, arith(SCA, SIN, dst, mask, none, none, src[0])); break; case TGSI_OPCODE_SLE: - arith(vpc, VEC, SLE, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SLE, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SLT: - arith(vpc, VEC, SLT, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SLT, dst, mask, src[0], src[1], none)); break; case 
TGSI_OPCODE_SNE: - arith(vpc, VEC, SNE, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SNE, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SSG: - arith(vpc, VEC, SSG, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, SSG, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_STR: - arith(vpc, VEC, STR, dst, mask, src[0], src[1], none); + nvfx_vp_emit(vpc, arith(VEC, STR, dst, mask, src[0], src[1], none)); break; case TGSI_OPCODE_SUB: - arith(vpc, VEC, ADD, dst, mask, src[0], none, neg(src[1])); + nvfx_vp_emit(vpc, arith(VEC, ADD, dst, mask, src[0], none, neg(src[1]))); break; + case TGSI_OPCODE_TRUNC: + tmp = nvfx_src(temp(vpc)); + insn = arith(VEC, MOV, none.reg, mask, src[0], none, none); + insn.cc_update = 1; + nvfx_vp_emit(vpc, insn); + + nvfx_vp_emit(vpc, arith(VEC, FLR, tmp.reg, mask, abs(src[0]), none, none)); + nvfx_vp_emit(vpc, arith(VEC, MOV, dst, mask, tmp, none, none)); + + insn = arith(VEC, MOV, dst, mask, neg(tmp), none, none); + insn.cc_test = NVFX_COND_LT; + nvfx_vp_emit(vpc, insn); + break; case TGSI_OPCODE_XPD: - tmp = temp(vpc); - arith(vpc, VEC, MUL, tmp, mask, - swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); - arith(vpc, VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), - swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), - neg(tmp)); + tmp = nvfx_src(temp(vpc)); + nvfx_vp_emit(vpc, arith(VEC, MUL, tmp.reg, mask, swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none)); + nvfx_vp_emit(vpc, arith(VEC, MAD, dst, (mask & ~NVFX_VP_MASK_W), swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), neg(tmp))); + break; + + case TGSI_OPCODE_IF: + insn = arith(VEC, MOV, none.reg, NVFX_VP_MASK_X, src[0], none, none); + insn.cc_update = 1; + nvfx_vp_emit(vpc, insn); + + reloc.location = vpc->vp->nr_insns; + reloc.target = finst->Label.Label + 1; + util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc); + + insn = arith(SCA, BRA, none.reg, 0, none, none, none); + insn.cc_test = NVFX_COND_EQ; + 
insn.cc_swz[0] = insn.cc_swz[1] = insn.cc_swz[2] = insn.cc_swz[3] = 0; + nvfx_vp_emit(vpc, insn); break; + + case TGSI_OPCODE_ELSE: + case TGSI_OPCODE_BRA: + case TGSI_OPCODE_CAL: + reloc.location = vpc->vp->nr_insns; + reloc.target = finst->Label.Label; + util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc); + + if(finst->Instruction.Opcode == TGSI_OPCODE_CAL) + insn = arith(SCA, CAL, none.reg, 0, none, none, none); + else + insn = arith(SCA, BRA, none.reg, 0, none, none, none); + nvfx_vp_emit(vpc, insn); + break; + + case TGSI_OPCODE_RET: + tmp = none; + tmp.swz[0] = tmp.swz[1] = tmp.swz[2] = tmp.swz[3] = 0; + nvfx_vp_emit(vpc, arith(SCA, RET, none.reg, 0, none, none, tmp)); + break; + + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + case TGSI_OPCODE_ENDIF: + /* nothing to do here */ + break; + + case TGSI_OPCODE_BGNLOOP: + loop.cont_target = idx; + loop.brk_target = finst->Label.Label + 1; + util_dynarray_append(&vpc->loop_stack, struct nvfx_loop_entry, loop); + break; + + case TGSI_OPCODE_ENDLOOP: + loop = util_dynarray_pop(&vpc->loop_stack, struct nvfx_loop_entry); + + reloc.location = vpc->vp->nr_insns; + reloc.target = loop.cont_target; + util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc); + + nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none)); + break; + + case TGSI_OPCODE_CONT: + loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry); + + reloc.location = vpc->vp->nr_insns; + reloc.target = loop.cont_target; + util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc); + + nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none)); + break; + + case TGSI_OPCODE_BRK: + loop = util_dynarray_top(&vpc->loop_stack, struct nvfx_loop_entry); + + reloc.location = vpc->vp->nr_insns; + reloc.target = loop.brk_target; + util_dynarray_append(&vpc->label_relocs, struct nvfx_relocation, reloc); + + nvfx_vp_emit(vpc, arith(SCA, BRA, none.reg, 0, none, none, none)); + break; + 
default: NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); return FALSE; @@ -649,12 +741,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, hw = NVFX_VP(INST_DEST_PSZ); break; case TGSI_SEMANTIC_GENERIC: - if (fdec->Semantic.Index <= 7) { - hw = NVFX_VP(INST_DEST_TC(fdec->Semantic.Index)); - } else { - NOUVEAU_ERR("bad generic semantic index\n"); - return FALSE; - } + hw = (vpc->vp->generic_to_fp_input[fdec->Semantic.Index] & 0xf) + + NVFX_VP(INST_DEST_TC(0)) - NVFX_FP_OP_INPUT_SRC_TC(0); break; case TGSI_SEMANTIC_EDGEFLAG: /* not really an error just a fallback */ @@ -665,7 +753,7 @@ nvfx_vertprog_parse_decl_output(struct nvfx_context* nvfx, struct nvfx_vpc *vpc, return FALSE; } - vpc->r_result[idx] = nvfx_sr(NVFXSR_OUTPUT, hw); + vpc->r_result[idx] = nvfx_reg(NVFXSR_OUTPUT, hw); return TRUE; } @@ -674,6 +762,36 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc) { struct tgsi_parse_context p; int high_temp = -1, high_addr = -1, nr_imm = 0, i; + struct util_semantic_set set; + unsigned char sem_layout[8]; + unsigned num_outputs; + + num_outputs = util_semantic_set_from_program_file(&set, vpc->vp->pipe.tokens, TGSI_FILE_OUTPUT); + + if(num_outputs > 8) { + NOUVEAU_ERR("too many vertex program outputs: %i\n", num_outputs); + return FALSE; + } + util_semantic_layout_from_set(sem_layout, &set, 8, 8); + + /* hope 0xf is (0, 0, 0, 1) initialized; otherwise, we are _probably_ not required to do this */ + memset(vpc->vp->generic_to_fp_input, 0x0f, sizeof(vpc->vp->generic_to_fp_input)); + for(int i = 0; i < 8; ++i) { + if(sem_layout[i] == 0xff) + continue; + //printf("vp: GENERIC[%i] to fpreg %i\n", sem_layout[i], NVFX_FP_OP_INPUT_SRC_TC(0) + i); + vpc->vp->generic_to_fp_input[sem_layout[i]] = 0xf0 | NVFX_FP_OP_INPUT_SRC_TC(i); + } + + vpc->vp->sprite_fp_input = -1; + for(int i = 0; i < 8; ++i) + { + if(sem_layout[i] == 0xff) + { + vpc->vp->sprite_fp_input = NVFX_FP_OP_INPUT_SRC_TC(i); + break; + } + } 
tgsi_parse_init(&p, vpc->vp->pipe.tokens); while (!tgsi_parse_end_of_tokens(&p)) { @@ -737,18 +855,18 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc) tgsi_parse_free(&p); if (nr_imm) { - vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_sreg)); + vpc->imm = CALLOC(nr_imm, sizeof(struct nvfx_reg)); assert(vpc->imm); } if (++high_temp) { - vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_sreg)); + vpc->r_temp = CALLOC(high_temp, sizeof(struct nvfx_reg)); for (i = 0; i < high_temp; i++) vpc->r_temp[i] = temp(vpc); } if (++high_addr) { - vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_sreg)); + vpc->r_address = CALLOC(high_addr, sizeof(struct nvfx_reg)); for (i = 0; i < high_addr; i++) vpc->r_address[i] = temp(vpc); } @@ -757,20 +875,31 @@ nvfx_vertprog_prepare(struct nvfx_context* nvfx, struct nvfx_vpc *vpc) return TRUE; } +DEBUG_GET_ONCE_BOOL_OPTION(nvfx_dump_vp, "NVFX_DUMP_VP", FALSE) + static void nvfx_vertprog_translate(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp) { struct tgsi_parse_context parse; struct nvfx_vpc *vpc = NULL; - struct nvfx_sreg none = nvfx_sr(NVFXSR_NONE, 0); + struct nvfx_src none = nvfx_src(nvfx_reg(NVFXSR_NONE, 0)); + struct util_dynarray insns; int i; vpc = CALLOC(1, sizeof(struct nvfx_vpc)); if (!vpc) return; + vpc->nvfx = nvfx; vpc->vp = vp; + /* reserve space for ucps */ + if(nvfx->use_vp_clipping) + { + for(i = 0; i < 6; ++i) + constant(vpc, -1, 0, 0, 0, 0); + } + if (!nvfx_vertprog_prepare(nvfx, vpc)) { FREE(vpc); return; @@ -780,13 +909,15 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx, * planes are enabled. We need to append code to the vtxprog * to handle clip planes later. 
*/ - if (vp->ucp.nr) { + /* TODO: maybe support patching this depending on whether there are ucps: not sure if it is really matters much */ + if (nvfx->use_vp_clipping) { vpc->r_result[vpc->hpos_idx] = temp(vpc); vpc->r_temps_discard = 0; } tgsi_parse_init(&parse, vp->pipe.tokens); + util_dynarray_init(&insns); while (!tgsi_parse_end_of_tokens(&parse)) { tgsi_parse_token(&parse); @@ -809,8 +940,10 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx, case TGSI_TOKEN_TYPE_INSTRUCTION: { const struct tgsi_full_instruction *finst; + unsigned idx = insns.size >> 2; + util_dynarray_append(&insns, unsigned, vp->nr_insns); finst = &parse.FullToken.FullInstruction; - if (!nvfx_vertprog_parse_instruction(nvfx, vpc, finst)) + if (!nvfx_vertprog_parse_instruction(nvfx, vpc, idx, finst)) goto out_err; } break; @@ -819,43 +952,87 @@ nvfx_vertprog_translate(struct nvfx_context *nvfx, } } + util_dynarray_append(&insns, unsigned, vp->nr_insns); + + for(unsigned i = 0; i < vpc->label_relocs.size; i += sizeof(struct nvfx_relocation)) + { + struct nvfx_relocation* label_reloc = (struct nvfx_relocation*)((char*)vpc->label_relocs.data + i); + struct nvfx_relocation hw_reloc; + + hw_reloc.location = label_reloc->location; + hw_reloc.target = ((unsigned*)insns.data)[label_reloc->target]; + + //debug_printf("hw %u -> tgsi %u = hw %u\n", hw_reloc.location, label_reloc->target, hw_reloc.target); + + util_dynarray_append(&vp->branch_relocs, struct nvfx_relocation, hw_reloc); + } + util_dynarray_fini(&insns); + util_dynarray_trim(&vp->branch_relocs); + + /* XXX: what if we add a RET before?! 
make sure we jump here...*/ + /* Write out HPOS if it was redirected to a temp earlier */ if (vpc->r_result[vpc->hpos_idx].type != NVFXSR_OUTPUT) { - struct nvfx_sreg hpos = nvfx_sr(NVFXSR_OUTPUT, + struct nvfx_reg hpos = nvfx_reg(NVFXSR_OUTPUT, NVFX_VP(INST_DEST_POS)); - struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx]; + struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]); - arith(vpc, VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none); + nvfx_vp_emit(vpc, arith(VEC, MOV, hpos, NVFX_VP_MASK_ALL, htmp, none, none)); } /* Insert code to handle user clip planes */ - for (i = 0; i < vp->ucp.nr; i++) { - struct nvfx_sreg cdst = nvfx_sr(NVFXSR_OUTPUT, - NVFX_VP_INST_DEST_CLIP(i)); - struct nvfx_sreg ceqn = constant(vpc, -1, - nvfx->clip.ucp[i][0], - nvfx->clip.ucp[i][1], - nvfx->clip.ucp[i][2], - nvfx->clip.ucp[i][3]); - struct nvfx_sreg htmp = vpc->r_result[vpc->hpos_idx]; - unsigned mask; - - switch (i) { - case 0: case 3: mask = NVFX_VP_MASK_Y; break; - case 1: case 4: mask = NVFX_VP_MASK_Z; break; - case 2: case 5: mask = NVFX_VP_MASK_W; break; - default: - NOUVEAU_ERR("invalid clip dist #%d\n", i); - goto out_err; + if(nvfx->use_vp_clipping) + { + for (i = 0; i < 6; i++) { + struct nvfx_reg cdst = nvfx_reg(NVFXSR_OUTPUT, NV30_VP_INST_DEST_CLP(i)); + struct nvfx_src ceqn = nvfx_src(nvfx_reg(NVFXSR_CONST, i)); + struct nvfx_src htmp = nvfx_src(vpc->r_result[vpc->hpos_idx]); + unsigned mask; + + if(nvfx->is_nv4x) + { + switch (i) { + case 0: case 3: mask = NVFX_VP_MASK_Y; break; + case 1: case 4: mask = NVFX_VP_MASK_Z; break; + case 2: case 5: mask = NVFX_VP_MASK_W; break; + default: + NOUVEAU_ERR("invalid clip dist #%d\n", i); + goto out_err; + } + } + else + mask = NVFX_VP_MASK_X; + + nvfx_vp_emit(vpc, arith(VEC, DP4, cdst, mask, htmp, ceqn, none)); } + } + else + { + if(vp->nr_insns) + vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST; - arith(vpc, VEC, DP4, cdst, mask, htmp, ceqn, none); + nvfx_vp_emit(vpc, arith(VEC, NOP, none.reg, 0, none, 
none, none)); + vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST; } - vp->insns[vp->nr_insns - 1].data[3] |= NVFX_VP_INST_LAST; + if(debug_get_option_nvfx_dump_vp()) + { + debug_printf("\n"); + tgsi_dump(vp->pipe.tokens, 0); + + debug_printf("\n%s vertex program:\n", nvfx->is_nv4x ? "nv4x" : "nv3x"); + for (i = 0; i < vp->nr_insns; i++) + debug_printf("%3u: %08x %08x %08x %08x\n", i, vp->insns[i].data[0], vp->insns[i].data[1], vp->insns[i].data[2], vp->insns[i].data[3]); + debug_printf("\n"); + } + + vp->clip_nr = -1; + vp->exec_start = -1; vp->translated = TRUE; out_err: tgsi_parse_free(&parse); + util_dynarray_fini(&vpc->label_relocs); + util_dynarray_fini(&vpc->loop_stack); if (vpc->r_temp) FREE(vpc->r_temp); if (vpc->r_address) @@ -868,26 +1045,17 @@ out_err: boolean nvfx_vertprog_validate(struct nvfx_context *nvfx) { - struct pipe_context *pipe = &nvfx->pipe; struct nvfx_screen *screen = nvfx->screen; struct nouveau_channel *chan = screen->base.channel; struct nouveau_grobj *eng3d = screen->eng3d; struct nvfx_vertex_program *vp; struct pipe_resource *constbuf; - struct pipe_transfer *transfer = NULL; boolean upload_code = FALSE, upload_data = FALSE; int i; if (nvfx->render_mode == HW) { vp = nvfx->vertprog; constbuf = nvfx->constbuf[PIPE_SHADER_VERTEX]; - - // TODO: ouch! can't we just use constant slots for these?! 
- if ((nvfx->dirty & NVFX_NEW_UCP) || - memcmp(&nvfx->clip, &vp->ucp, sizeof(vp->ucp))) { - nvfx_vertprog_destroy(nvfx, vp); - memcpy(&vp->ucp, &nvfx->clip, sizeof(vp->ucp)); - } } else { vp = nvfx->swtnl.vertprog; constbuf = NULL; @@ -918,7 +1086,11 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) } if (nouveau_resource_alloc(heap, vplen, vp, &vp->exec)) - assert(0); + { + debug_printf("Vertex shader too long: %u instructions\n", vplen); + nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG; + return FALSE; + } } upload_code = TRUE; @@ -937,7 +1109,11 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) } if (nouveau_resource_alloc(heap, vp->nr_consts, vp, &vp->data)) - assert(0); + { + debug_printf("Vertex shader uses too many constants: %u constants\n", vp->nr_consts); + nvfx->fallback_swtnl |= NVFX_NEW_VERTPROG; + return FALSE; + } } /*XXX: handle this some day */ @@ -952,44 +1128,57 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) * fixup offsets and register IDs. */ if (vp->exec_start != vp->exec->start) { - for (i = 0; i < vp->nr_insns; i++) { - struct nvfx_vertex_program_exec *vpi = &vp->insns[i]; + //printf("vp_relocs %u -> %u\n", vp->exec_start, vp->exec->start); + for(unsigned i = 0; i < vp->branch_relocs.size; i += sizeof(struct nvfx_relocation)) + { + struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->branch_relocs.data + i); + uint32_t* hw = vp->insns[reloc->location].data; + unsigned target = vp->exec->start + reloc->target; + + //debug_printf("vp_reloc hw %u -> hw %u\n", reloc->location, target); - if (vpi->has_branch_offset) { - assert(0); + if(!nvfx->is_nv4x) + { + hw[2] &=~ NV30_VP_INST_IADDR_MASK; + hw[2] |= (target & 0x1ff) << NV30_VP_INST_IADDR_SHIFT; + } + else + { + hw[3] &=~ NV40_VP_INST_IADDRL_MASK; + hw[3] |= (target & 7) << NV40_VP_INST_IADDRL_SHIFT; + + hw[2] &=~ NV40_VP_INST_IADDRH_MASK; + hw[2] |= ((target >> 3) & 0x3f) << NV40_VP_INST_IADDRH_SHIFT; } } vp->exec_start = vp->exec->start; } - if (vp->nr_consts && 
vp->data_start != vp->data->start) { - for (i = 0; i < vp->nr_insns; i++) { - struct nvfx_vertex_program_exec *vpi = &vp->insns[i]; + if (vp->data_start != vp->data->start) { + for(unsigned i = 0; i < vp->const_relocs.size; i += sizeof(struct nvfx_relocation)) + { + struct nvfx_relocation* reloc = (struct nvfx_relocation*)((char*)vp->const_relocs.data + i); + struct nvfx_vertex_program_exec *vpi = &vp->insns[reloc->location]; - if (vpi->const_index >= 0) { - vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK); - vpi->data[1] |= - (vpi->const_index + vp->data->start) << + vpi->data[1] &= ~NVFX_VP(INST_CONST_SRC_MASK); + vpi->data[1] |= + (reloc->target + vp->data->start) << NVFX_VP(INST_CONST_SRC_SHIFT); - - } } vp->data_start = vp->data->start; + upload_code = TRUE; } /* Update + Upload constant values */ if (vp->nr_consts) { float *map = NULL; - if (constbuf) { - map = pipe_buffer_map(pipe, constbuf, - PIPE_TRANSFER_READ, - &transfer); - } + if (constbuf) + map = (float*)nvfx_buffer(constbuf)->data; - for (i = 0; i < vp->nr_consts; i++) { + for (i = nvfx->use_vp_clipping ? 
6 : 0; i < vp->nr_consts; i++) { struct nvfx_vertex_program_data *vpd = &vp->consts[i]; if (vpd->index >= 0) { @@ -1005,41 +1194,28 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) OUT_RING (chan, i + vp->data->start); OUT_RINGp (chan, (uint32_t *)vpd->value, 4); } - - if (constbuf) - pipe_buffer_unmap(pipe, constbuf, transfer); } /* Upload vtxprog */ if (upload_code) { -#if 0 - for (i = 0; i < vp->nr_insns; i++) { - NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]); - NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]); - NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]); - NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]); - } -#endif BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_FROM_ID, 1); OUT_RING (chan, vp->exec->start); for (i = 0; i < vp->nr_insns; i++) { BEGIN_RING(chan, eng3d, NV34TCL_VP_UPLOAD_INST(0), 4); OUT_RINGp (chan, vp->insns[i].data, 4); } + vp->clip_nr = -1; } - if(nvfx->dirty & (NVFX_NEW_VERTPROG | NVFX_NEW_UCP)) + if(nvfx->dirty & (NVFX_NEW_VERTPROG)) { - WAIT_RING(chan, 7); + WAIT_RING(chan, 6); OUT_RING(chan, RING_3D(NV34TCL_VP_START_FROM_ID, 1)); OUT_RING(chan, vp->exec->start); if(nvfx->is_nv4x) { - OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 2)); + OUT_RING(chan, RING_3D(NV40TCL_VP_ATTRIB_EN, 1)); OUT_RING(chan, vp->ir); - OUT_RING(chan, vp->or); } - OUT_RING(chan, RING_3D(NV34TCL_VP_CLIP_PLANES_ENABLE, 1)); - OUT_RING(chan, vp->clip_ctrl); } return TRUE; @@ -1048,25 +1224,63 @@ nvfx_vertprog_validate(struct nvfx_context *nvfx) void nvfx_vertprog_destroy(struct nvfx_context *nvfx, struct nvfx_vertex_program *vp) { - vp->translated = FALSE; - - if (vp->nr_insns) { + if (vp->nr_insns) FREE(vp->insns); - vp->insns = NULL; - vp->nr_insns = 0; - } - if (vp->nr_consts) { + if (vp->nr_consts) FREE(vp->consts); - vp->consts = NULL; - vp->nr_consts = 0; - } nouveau_resource_free(&vp->exec); - vp->exec_start = 0; nouveau_resource_free(&vp->data); - vp->data_start = 0; - vp->data_start_min = 0; - vp->ir = vp->or = 
vp->clip_ctrl = 0; + util_dynarray_fini(&vp->branch_relocs); + util_dynarray_fini(&vp->const_relocs); +} + +static void * +nvfx_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nvfx_vertex_program *vp; + + // TODO: use a 64-bit atomic here! + static unsigned long long id = 0; + + vp = CALLOC(1, sizeof(struct nvfx_vertex_program)); + vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + vp->draw = draw_create_vertex_shader(nvfx->draw, &vp->pipe); + vp->id = ++id; + + return (void *)vp; +} + +static void +nvfx_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + + nvfx->vertprog = hwcso; + nvfx->dirty |= NVFX_NEW_VERTPROG; + nvfx->draw_dirty |= NVFX_NEW_VERTPROG; +} + +static void +nvfx_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nvfx_context *nvfx = nvfx_context(pipe); + struct nvfx_vertex_program *vp = hwcso; + + draw_delete_vertex_shader(nvfx->draw, vp->draw); + nvfx_vertprog_destroy(nvfx, vp); + FREE((void*)vp->pipe.tokens); + FREE(vp); +} + +void +nvfx_init_vertprog_functions(struct nvfx_context *nvfx) +{ + nvfx->pipe.create_vs_state = nvfx_vp_state_create; + nvfx->pipe.bind_vs_state = nvfx_vp_state_bind; + nvfx->pipe.delete_vs_state = nvfx_vp_state_delete; } diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c index e8b6c4f7af8..624dadd07d7 100644 --- a/src/gallium/drivers/r300/r300_context.c +++ b/src/gallium/drivers/r300/r300_context.c @@ -65,7 +65,7 @@ static void r300_release_referenced_objects(struct r300_context *r300) unsigned i; /* Framebuffer state. */ - util_assign_framebuffer_state(fb, NULL); + util_unreference_framebuffer_state(fb); /* Textures. 
*/ for (i = 0; i < textures->sampler_view_count; i++) @@ -99,8 +99,10 @@ static void r300_destroy_context(struct pipe_context* context) struct r300_context* r300 = r300_context(context); struct r300_atom *atom; - util_blitter_destroy(r300->blitter); - draw_destroy(r300->draw); + if (r300->blitter) + util_blitter_destroy(r300->blitter); + if (r300->draw) + draw_destroy(r300->draw); /* Print stats, if enabled. */ if (SCREEN_DBG_ON(r300->screen, DBG_STATS)) { @@ -112,40 +114,48 @@ static void r300_destroy_context(struct pipe_context* context) } } - u_upload_destroy(r300->upload_vb); - u_upload_destroy(r300->upload_ib); + if (r300->upload_vb) + u_upload_destroy(r300->upload_vb); + if (r300->upload_ib) + u_upload_destroy(r300->upload_ib); - /* setup hyper-z mm */ - if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ)) - r300_hyperz_destroy_mm(r300); - - translate_cache_destroy(r300->tran.translate_cache); + if (r300->tran.translate_cache) + translate_cache_destroy(r300->tran.translate_cache); + /* XXX: This function assumes r300->query_list was initialized */ r300_release_referenced_objects(r300); - r300->rws->cs_destroy(r300->cs); + if (r300->zmask_mm) + r300_hyperz_destroy_mm(r300); + + if (r300->cs) + r300->rws->cs_destroy(r300->cs); + /* XXX: No way to tell if this was initialized or not? 
*/ util_mempool_destroy(&r300->pool_transfers); r300_update_num_contexts(r300->screen, -1); - FREE(r300->aa_state.state); - FREE(r300->blend_color_state.state); - FREE(r300->clip_state.state); - FREE(r300->fb_state.state); - FREE(r300->gpu_flush.state); - FREE(r300->hyperz_state.state); - FREE(r300->invariant_state.state); - FREE(r300->rs_block_state.state); - FREE(r300->scissor_state.state); - FREE(r300->textures_state.state); - FREE(r300->vap_invariant_state.state); - FREE(r300->viewport_state.state); - FREE(r300->ztop_state.state); - FREE(r300->fs_constants.state); - FREE(r300->vs_constants.state); - if (!r300->screen->caps.has_tcl) { - FREE(r300->vertex_stream_state.state); + /* Free the structs allocated in r300_setup_atoms() */ + if (r300->aa_state.state) { + FREE(r300->aa_state.state); + FREE(r300->blend_color_state.state); + FREE(r300->clip_state.state); + FREE(r300->fb_state.state); + FREE(r300->gpu_flush.state); + FREE(r300->hyperz_state.state); + FREE(r300->invariant_state.state); + FREE(r300->rs_block_state.state); + FREE(r300->scissor_state.state); + FREE(r300->textures_state.state); + FREE(r300->vap_invariant_state.state); + FREE(r300->viewport_state.state); + FREE(r300->ztop_state.state); + FREE(r300->fs_constants.state); + FREE(r300->vs_constants.state); + if (!r300->screen->caps.has_tcl) { + FREE(r300->vertex_stream_state.state); + } } FREE(r300); } @@ -158,12 +168,14 @@ void r300_flush_cb(void *data) } #define R300_INIT_ATOM(atomname, atomsize) \ + do { \ r300->atomname.name = #atomname; \ r300->atomname.state = NULL; \ r300->atomname.size = atomsize; \ r300->atomname.emit = r300_emit_##atomname; \ r300->atomname.dirty = FALSE; \ - insert_at_tail(&r300->atom_list, &r300->atomname); + insert_at_tail(&r300->atom_list, &r300->atomname); \ + } while (0) static void r300_setup_atoms(struct r300_context* r300) { @@ -404,19 +416,21 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, r300->context.destroy = r300_destroy_context; - 
r300->cs = rws->cs_create(rws); + make_empty_list(&r300->query_list); util_mempool_create(&r300->pool_transfers, sizeof(struct pipe_transfer), 64, UTIL_MEMPOOL_SINGLETHREADED); + r300->cs = rws->cs_create(rws); + if (r300->cs == NULL) + goto fail; + if (!r300screen->caps.has_tcl) { /* Create a Draw. This is used for SW TCL. */ r300->draw = draw_create(&r300->context); /* Enable our renderer. */ draw_set_rasterize_stage(r300->draw, r300_draw_stage(r300)); - /* Enable Draw's clipping. */ - draw_set_driver_clipping(r300->draw, FALSE); /* Disable converting points/lines to triangles. */ draw_wide_line_threshold(r300->draw, 10000000.f); draw_wide_point_threshold(r300->draw, 10000000.f); @@ -424,8 +438,6 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, r300_setup_atoms(r300); - make_empty_list(&r300->query_list); - r300_init_blit_functions(r300); r300_init_flush_functions(r300); r300_init_query_functions(r300); @@ -433,6 +445,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, r300_init_resource_functions(r300); r300->blitter = util_blitter_create(&r300->context); + if (r300->blitter == NULL) + goto fail; /* Render functions must be initialized after blitter. 
*/ r300_init_render_functions(r300); @@ -441,22 +455,25 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, /* setup hyper-z mm */ if (r300->rws->get_value(r300->rws, R300_CAN_HYPERZ)) - r300_hyperz_init_mm(r300); + if (!r300_hyperz_init_mm(r300)) + goto fail; r300->upload_ib = u_upload_create(&r300->context, 32 * 1024, 16, PIPE_BIND_INDEX_BUFFER); if (r300->upload_ib == NULL) - goto no_upload_ib; + goto fail; r300->upload_vb = u_upload_create(&r300->context, 128 * 1024, 16, PIPE_BIND_VERTEX_BUFFER); if (r300->upload_vb == NULL) - goto no_upload_vb; + goto fail; r300->tran.translate_cache = translate_cache_create(); + if (r300->tran.translate_cache == NULL) + goto fail; r300_init_states(&r300->context); @@ -486,10 +503,8 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, return &r300->context; - no_upload_ib: - u_upload_destroy(r300->upload_ib); - no_upload_vb: - FREE(r300); + fail: + r300_destroy_context(&r300->context); return NULL; } diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h index 6fa7f470f98..8f0e86fd378 100644 --- a/src/gallium/drivers/r300/r300_context.h +++ b/src/gallium/drivers/r300/r300_context.h @@ -254,8 +254,8 @@ struct r300_ztop_state { struct r300_constant_buffer { /* Buffer of constants */ uint32_t *ptr; - /* Total number of vec4s */ - unsigned count; + /* Remapping table. */ + unsigned *remap_table; }; /* Query object. @@ -449,6 +449,7 @@ struct r300_context { struct r300_screen *screen; /* Draw module. Used mostly for SW TCL. */ struct draw_context* draw; + size_t draw_vbo_size; /* Accelerated blit support. */ struct blitter_context* blitter; /* Stencil two-sided reference value fallback. 
*/ @@ -649,6 +650,11 @@ void r300_translate_index_buffer(struct r300_context *r300, /* r300_render_stencilref.c */ void r300_plug_in_stencil_ref_fallback(struct r300_context *r300); +/* r300_render.c */ +void r300_draw_flush_vbuf(struct r300_context *r300); +boolean r500_index_bias_supported(struct r300_context *r300); +void r500_emit_index_bias(struct r300_context *r300, int index_bias); + /* r300_state.c */ enum r300_fb_state_change { R300_CHANGED_FB_STATE = 0, diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c index d0fd45349e3..232259e21d1 100644 --- a/src/gallium/drivers/r300/r300_emit.c +++ b/src/gallium/drivers/r300/r300_emit.c @@ -180,9 +180,18 @@ void r300_emit_fs_constants(struct r300_context* r300, unsigned size, void *stat BEGIN_CS(size); OUT_CS_REG_SEQ(R300_PFS_PARAM_0_X, count * 4); - for (i = 0; i < count; i++) - for (j = 0; j < 4; j++) - OUT_CS(pack_float24(*(float*)&buf->ptr[i*4+j])); + if (buf->remap_table){ + for (i = 0; i < count; i++) { + float *data = (float*)&buf->ptr[buf->remap_table[i]*4]; + for (j = 0; j < 4; j++) + OUT_CS(pack_float24(data[j])); + } + } else { + for (i = 0; i < count; i++) + for (j = 0; j < 4; j++) + OUT_CS(pack_float24(*(float*)&buf->ptr[i*4+j])); + } + END_CS; } @@ -226,7 +235,7 @@ void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *stat { struct r300_fragment_shader *fs = r300_fs(r300); struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state; - unsigned count = fs->shader->externals_count * 4; + unsigned count = fs->shader->externals_count; CS_LOCALS(r300); if (count == 0) @@ -234,8 +243,15 @@ void r500_emit_fs_constants(struct r300_context* r300, unsigned size, void *stat BEGIN_CS(size); OUT_CS_REG(R500_GA_US_VECTOR_INDEX, R500_GA_US_VECTOR_INDEX_TYPE_CONST); - OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count); - OUT_CS_TABLE(buf->ptr, count); + OUT_CS_ONE_REG(R500_GA_US_VECTOR_DATA, count * 4); + if (buf->remap_table){ + for (unsigned i = 0; i < 
count; i++) { + uint32_t *data = &buf->ptr[buf->remap_table[i]*4]; + OUT_CS_TABLE(data, 4); + } + } else { + OUT_CS_TABLE(buf->ptr, count * 4); + } END_CS; } @@ -893,7 +909,7 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state) unsigned pvs_num_slots = MIN3(vtx_mem_size / input_count, vtx_mem_size / output_count, 10); - unsigned pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 6); + unsigned pvs_num_controllers = MIN2(vtx_mem_size / temp_count, 5); unsigned imm_first = vs->externals_count; unsigned imm_end = vs->code.constants.Count; @@ -961,6 +977,7 @@ void r300_emit_vs_constants(struct r300_context* r300, unsigned count = ((struct r300_vertex_shader*)r300->vs_state.state)->externals_count; struct r300_constant_buffer *buf = (struct r300_constant_buffer*)state; + unsigned i; CS_LOCALS(r300); if (!count) @@ -971,7 +988,14 @@ void r300_emit_vs_constants(struct r300_context* r300, (r300->screen->caps.is_r500 ? R500_PVS_CONST_START : R300_PVS_CONST_START)); OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, count * 4); - OUT_CS_TABLE(buf->ptr, count * 4); + if (buf->remap_table){ + for (i = 0; i < count; i++) { + uint32_t *data = &buf->ptr[buf->remap_table[i]*4]; + OUT_CS_TABLE(data, 4); + } + } else { + OUT_CS_TABLE(buf->ptr, count * 4); + } END_CS; } @@ -1219,6 +1243,8 @@ unsigned r300_get_num_cs_end_dwords(struct r300_context *r300) /* Emitted in flush. 
*/ dwords += 26; /* emit_query_end */ dwords += r300->hyperz_state.size + 2; /* emit_hyperz_end + zcache flush */ + if (r500_index_bias_supported(r300)) + dwords += 2; return dwords; } diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c index fe182b6615b..2b5d2e42ba5 100644 --- a/src/gallium/drivers/r300/r300_flush.c +++ b/src/gallium/drivers/r300/r300_flush.c @@ -43,9 +43,14 @@ static void r300_flush(struct pipe_context* pipe, u_upload_flush(r300->upload_vb); u_upload_flush(r300->upload_ib); + if (r300->draw) + r300_draw_flush_vbuf(r300); + if (r300->dirty_hw) { r300_emit_hyperz_end(r300); r300_emit_query_end(r300); + if (r500_index_bias_supported(r300)) + r500_emit_index_bias(r300, 0); r300->flush_counter++; r300->rws->cs_flush(r300->cs); diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c index 2a0c30620ad..9845e546109 100644 --- a/src/gallium/drivers/r300/r300_fs.c +++ b/src/gallium/drivers/r300/r300_fs.c @@ -257,17 +257,17 @@ static void r300_emit_fs_code_to_buffer( shader->cb_code_size = 19 + ((code->inst_end + 1) * 6) + imm_count * 7 + - code->int_constant_count * 2; + code->int_constant_count * 2; NEW_CB(shader->cb_code, shader->cb_code_size); OUT_CB_REG(R500_US_CONFIG, R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO); OUT_CB_REG(R500_US_PIXSIZE, code->max_temp_idx); OUT_CB_REG(R500_US_FC_CTRL, code->us_fc_ctrl); for(i = 0; i < code->int_constant_count; i++){ - OUT_CB_REG(R500_US_FC_INT_CONST_0 + (i * 4), - code->int_constants[i]); - } - OUT_CB_REG(R500_US_CODE_RANGE, + OUT_CB_REG(R500_US_FC_INT_CONST_0 + (i * 4), + code->int_constants[i]); + } + OUT_CB_REG(R500_US_CODE_RANGE, R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(code->inst_end)); OUT_CB_REG(R500_US_CODE_OFFSET, 0); OUT_CB_REG(R500_US_CODE_ADDR, @@ -386,6 +386,7 @@ static void r300_translate_fragment_shader( compiler.state = shader->compare_state; compiler.Base.is_r500 = r300->screen->caps.is_r500; compiler.Base.max_temp_regs = 
compiler.Base.is_r500 ? 128 : 32; + compiler.Base.remove_unused_constants = TRUE; compiler.AllocateHwInputs = &allocate_hardware_inputs; compiler.UserData = &shader->inputs; @@ -431,9 +432,8 @@ static void r300_translate_fragment_shader( } if (compiler.Base.Error) { - DBG(r300, DBG_FP, "r300 FP: Compiler Error:\n%sUsing a dummy shader" - " instead.\nIf there's an 'unknown opcode' message, please" - " file a bug report and attach this log.\n", compiler.Base.ErrorMsg); + fprintf(stderr, "r300 FP: Compiler Error:\n%sUsing a dummy shader" + " instead.\n", compiler.Base.ErrorMsg); if (shader->dummy) { fprintf(stderr, "r300 FP: Cannot compile the dummy shader! " @@ -447,7 +447,12 @@ static void r300_translate_fragment_shader( } /* Initialize numbers of constants for each type. */ - shader->externals_count = ttr.immediate_offset; + shader->externals_count = 0; + for (i = 0; + i < shader->code.constants.Count && + shader->code.constants.Constants[i].Type == RC_CONSTANT_EXTERNAL; i++) { + shader->externals_count = i+1; + } shader->immediates_count = 0; shader->rc_state_count = 0; diff --git a/src/gallium/drivers/r300/r300_hyperz.c b/src/gallium/drivers/r300/r300_hyperz.c index 811b5646e16..eb5b0c36f8f 100644 --- a/src/gallium/drivers/r300/r300_hyperz.c +++ b/src/gallium/drivers/r300/r300_hyperz.c @@ -354,7 +354,12 @@ void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf /* We currently don't handle decompression for 3D textures and cubemaps * correctly. */ if (tex->desc.b.b.target != PIPE_TEXTURE_1D && - tex->desc.b.b.target != PIPE_TEXTURE_2D) + tex->desc.b.b.target != PIPE_TEXTURE_2D && + tex->desc.b.b.target != PIPE_TEXTURE_RECT) + return; + + /* Cannot flush zmask of 16-bit zbuffers. 
*/ + if (util_format_get_blocksizebits(tex->desc.b.b.format) == 16) return; if (tex->zmask_mem[level]) @@ -373,23 +378,36 @@ void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf return; } -void r300_hyperz_init_mm(struct r300_context *r300) +boolean r300_hyperz_init_mm(struct r300_context *r300) { struct r300_screen* r300screen = r300->screen; int frag_pipes = r300screen->caps.num_frag_pipes; - if (r300screen->caps.hiz_ram) + r300->zmask_mm = u_mmInit(0, r300screen->caps.zmask_ram * frag_pipes); + if (!r300->zmask_mm) + return FALSE; + + if (r300screen->caps.hiz_ram) { r300->hiz_mm = u_mmInit(0, r300screen->caps.hiz_ram * frag_pipes); + if (!r300->hiz_mm) { + u_mmDestroy(r300->zmask_mm); + r300->zmask_mm = NULL; + return FALSE; + } + } - r300->zmask_mm = u_mmInit(0, r300screen->caps.zmask_ram * frag_pipes); + return TRUE; } void r300_hyperz_destroy_mm(struct r300_context *r300) { struct r300_screen* r300screen = r300->screen; - if (r300screen->caps.hiz_ram) + if (r300screen->caps.hiz_ram) { u_mmDestroy(r300->hiz_mm); + r300->hiz_mm = NULL; + } u_mmDestroy(r300->zmask_mm); + r300->zmask_mm = NULL; } diff --git a/src/gallium/drivers/r300/r300_hyperz.h b/src/gallium/drivers/r300/r300_hyperz.h index 09e1ff6625c..30a23ec6493 100644 --- a/src/gallium/drivers/r300/r300_hyperz.h +++ b/src/gallium/drivers/r300/r300_hyperz.h @@ -30,6 +30,6 @@ void r300_update_hyperz_state(struct r300_context* r300); void r300_hiz_alloc_block(struct r300_context *r300, struct r300_surface *surf); void r300_zmask_alloc_block(struct r300_context *r300, struct r300_surface *surf, int compress); -void r300_hyperz_init_mm(struct r300_context *r300); +boolean r300_hyperz_init_mm(struct r300_context *r300); void r300_hyperz_destroy_mm(struct r300_context *r300); #endif diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h index 60d3b600cb7..6bea783f697 100644 --- a/src/gallium/drivers/r300/r300_reg.h +++ 
b/src/gallium/drivers/r300/r300_reg.h @@ -1607,6 +1607,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE. # define R300_TX_FORMAT_3D (1 << 25) # define R300_TX_FORMAT_CUBIC_MAP (2 << 25) +# define R300_TX_FORMAT_TEX_COORD_TYPE_MASK (0x3 << 25) /* alpha modes, convenience mostly */ /* if you have alpha, pick constant appropriate to the diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c index 86b11ca0458..20bad2c56f5 100644 --- a/src/gallium/drivers/r300/r300_render.c +++ b/src/gallium/drivers/r300/r300_render.c @@ -118,13 +118,13 @@ static uint32_t r300_provoking_vertex_fixes(struct r300_context *r300, return color_control; } -static boolean index_bias_supported(struct r300_context *r300) +boolean r500_index_bias_supported(struct r300_context *r300) { return r300->screen->caps.is_r500 && r300->rws->get_value(r300->rws, R300_VID_DRM_2_3_0); } -static void r500_emit_index_bias(struct r300_context *r300, int index_bias) +void r500_emit_index_bias(struct r300_context *r300, int index_bias) { CS_LOCALS(r300); @@ -199,7 +199,7 @@ static void r300_prepare_for_rendering(struct r300_context *r300, boolean emit_aos = flags & PREP_EMIT_AOS; boolean emit_aos_swtcl = flags & PREP_EMIT_AOS_SWTCL; boolean indexed = flags & PREP_INDEXED; - boolean hw_index_bias = index_bias_supported(r300); + boolean hw_index_bias = r500_index_bias_supported(r300); /* Add dirty state, index offset, and AOS. 
*/ if (first_draw) { @@ -506,7 +506,7 @@ static void r300_draw_range_elements(struct pipe_context* pipe, translate = TRUE; } - if (indexBias && !index_bias_supported(r300)) { + if (indexBias && !r500_index_bias_supported(r300)) { r300_split_index_bias(r300, indexBias, &buffer_offset, &index_offset); } @@ -680,18 +680,11 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe, if (info->indexed && r300->index_buffer.buffer) { indices = pipe_buffer_map(pipe, r300->index_buffer.buffer, PIPE_TRANSFER_READ, &ib_transfer); - if (indices) - indices = (void *) ((char *) indices + r300->index_buffer.offset); } - draw_set_mapped_element_buffer_range(r300->draw, (indices) ? - r300->index_buffer.index_size : 0, - info->index_bias, - info->min_index, - info->max_index, - indices); + draw_set_mapped_index_buffer(r300->draw, indices); - draw_arrays(r300->draw, info->mode, info->start, count); + draw_vbo(r300->draw, info); /* XXX Not sure whether this is the best fix. * It prevents CS from being rejected and weird assertion failures. 
*/ @@ -707,8 +700,7 @@ static void r300_swtcl_draw_vbo(struct pipe_context* pipe, if (ib_transfer) { pipe_buffer_unmap(pipe, r300->index_buffer.buffer, ib_transfer); - draw_set_mapped_element_buffer_range(r300->draw, 0, 0, info->start, - info->start + count - 1, NULL); + draw_set_mapped_index_buffer(r300->draw, NULL); } } @@ -726,8 +718,6 @@ struct r300_render { unsigned hwprim; /* VBO */ - struct pipe_resource* vbo; - size_t vbo_size; size_t vbo_offset; size_t vbo_max_used; void * vbo_ptr; @@ -759,31 +749,31 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render, struct pipe_screen* screen = r300->context.screen; size_t size = (size_t)vertex_size * (size_t)count; - if (size + r300render->vbo_offset > r300render->vbo_size) + if (size + r300render->vbo_offset > r300->draw_vbo_size) { - pipe_resource_reference(&r300->vbo, NULL); - r300render->vbo = pipe_buffer_create(screen, - PIPE_BIND_VERTEX_BUFFER, - R300_MAX_DRAW_VBO_SIZE); + pipe_resource_reference(&r300->vbo, NULL); + r300->vbo = pipe_buffer_create(screen, + PIPE_BIND_VERTEX_BUFFER, + R300_MAX_DRAW_VBO_SIZE); r300render->vbo_offset = 0; - r300render->vbo_size = R300_MAX_DRAW_VBO_SIZE; + r300->draw_vbo_size = R300_MAX_DRAW_VBO_SIZE; } r300render->vertex_size = vertex_size; - r300->vbo = r300render->vbo; r300->vbo_offset = r300render->vbo_offset; - return (r300render->vbo) ? TRUE : FALSE; + return (r300->vbo) ? 
TRUE : FALSE; } static void* r300_render_map_vertices(struct vbuf_render* render) { struct r300_render* r300render = r300_render(render); + struct r300_context* r300 = r300render->r300; assert(!r300render->vbo_transfer); r300render->vbo_ptr = pipe_buffer_map(&r300render->r300->context, - r300render->vbo, + r300->vbo, PIPE_TRANSFER_WRITE, &r300render->vbo_transfer); @@ -798,12 +788,13 @@ static void r300_render_unmap_vertices(struct vbuf_render* render, { struct r300_render* r300render = r300_render(render); struct pipe_context* context = &r300render->r300->context; + struct r300_context* r300 = r300render->r300; assert(r300render->vbo_transfer); r300render->vbo_max_used = MAX2(r300render->vbo_max_used, r300render->vertex_size * (max + 1)); - pipe_buffer_unmap(context, r300render->vbo, r300render->vbo_transfer); + pipe_buffer_unmap(context, r300->vbo, r300render->vbo_transfer); r300render->vbo_transfer = NULL; } @@ -880,7 +871,7 @@ static void r300_render_draw_elements(struct vbuf_render* render, struct r300_context* r300 = r300render->r300; int i; unsigned end_cs_dwords; - unsigned max_index = (r300render->vbo_size - r300render->vbo_offset) / + unsigned max_index = (r300->draw_vbo_size - r300render->vbo_offset) / (r300render->r300->vertex_info.size * 4) - 1; unsigned short_count; unsigned free_dwords; @@ -956,8 +947,6 @@ static struct vbuf_render* r300_render_create(struct r300_context* r300) r300render->base.release_vertices = r300_render_release_vertices; r300render->base.destroy = r300_render_destroy; - r300render->vbo = NULL; - r300render->vbo_size = 0; r300render->vbo_offset = 0; return &r300render->base; @@ -986,6 +975,12 @@ struct draw_stage* r300_draw_stage(struct r300_context* r300) return stage; } +void r300_draw_flush_vbuf(struct r300_context *r300) +{ + pipe_resource_reference(&r300->vbo, NULL); + r300->draw_vbo_size = 0; +} + /**************************************************************************** * End of SW TCL functions * 
***************************************************************************/ diff --git a/src/gallium/drivers/r300/r300_state.c b/src/gallium/drivers/r300/r300_state.c index 239edd98e32..8ccb63964e7 100644 --- a/src/gallium/drivers/r300/r300_state.c +++ b/src/gallium/drivers/r300/r300_state.c @@ -23,7 +23,7 @@ #include "draw/draw_context.h" -#include "util/u_blitter.h" +#include "util/u_framebuffer.h" #include "util/u_math.h" #include "util/u_mm.h" #include "util/u_memory.h" @@ -748,7 +748,7 @@ static void /* The tiling flags are dependent on the surface miplevel, unfortunately. */ r300_fb_set_tiling_flags(r300, state); - util_assign_framebuffer_state(r300->fb_state.state, state); + util_copy_framebuffer_state(r300->fb_state.state, state); r300_mark_fb_state_dirty(r300, R300_CHANGED_FB_STATE); @@ -865,6 +865,9 @@ void r300_mark_fs_code_dirty(struct r300_context *r300) r300->fs_rc_constant_state.size = fs->shader->rc_state_count * 5; r300->fs_constants.size = fs->shader->externals_count * 4 + 1; } + + ((struct r300_constant_buffer*)r300->fs_constants.state)->remap_table = + fs->shader->code.constants_remap_table; } /* Bind fragment shader state. */ @@ -937,9 +940,9 @@ static void* r300_create_rs_state(struct pipe_context* pipe, uint32_t stuffing_enable; /* R300_GB_ENABLE: 0x4008 */ /* Point sprites texture coordinates, 0: lower left, 1: upper right */ - float point_texcoord_left; /* R300_GA_POINT_S0: 0x4200 */ + float point_texcoord_left = 0; /* R300_GA_POINT_S0: 0x4200 */ float point_texcoord_bottom = 0;/* R300_GA_POINT_T0: 0x4204 */ - float point_texcoord_right; /* R300_GA_POINT_S1: 0x4208 */ + float point_texcoord_right = 1; /* R300_GA_POINT_S1: 0x4208 */ float point_texcoord_top = 0; /* R300_GA_POINT_T1: 0x420c */ CB_LOCALS; @@ -947,6 +950,11 @@ static void* r300_create_rs_state(struct pipe_context* pipe, rs->rs = *state; rs->rs_draw = *state; + /* Generate point sprite texture coordinates in GENERIC0 + * if point_quad_rasterization is TRUE. 
*/ + rs->rs.sprite_coord_enable = state->point_quad_rasterization * + (state->sprite_coord_enable | 1); + /* Override some states for Draw. */ rs->rs_draw.sprite_coord_enable = 0; /* We can do this in HW. */ @@ -1048,16 +1056,13 @@ static void* r300_create_rs_state(struct pipe_context* pipe, /* Point sprites */ stuffing_enable = 0; - if (state->sprite_coord_enable) { + if (rs->rs.sprite_coord_enable) { stuffing_enable = R300_GB_POINT_STUFF_ENABLE; - for (i = 0; i < 8; i++) { - if (state->sprite_coord_enable & (1 << i)) + for (i = 0; i < 8; i++) { + if (rs->rs.sprite_coord_enable & (1 << i)) stuffing_enable |= R300_GB_TEX_ST << (R300_GB_TEX0_SOURCE_SHIFT + (i*2)); - } - - point_texcoord_left = 0.0f; - point_texcoord_right = 1.0f; + } switch (state->sprite_coord_mode) { case PIPE_SPRITE_COORD_UPPER_LEFT: @@ -1208,8 +1213,8 @@ static void* /* Unfortunately, r300-r500 don't support floating-point mipmap lods. */ /* We must pass these to the merge function to clamp them properly. */ - sampler->min_lod = MAX2((unsigned)state->min_lod, 0); - sampler->max_lod = MAX2((unsigned)ceilf(state->max_lod), 0); + sampler->min_lod = (unsigned)MAX2(state->min_lod, 0); + sampler->max_lod = (unsigned)MAX2(ceilf(state->max_lod), 0); lod_bias = CLAMP((int)(state->lod_bias * 32 + 1), -(1 << 9), (1 << 9) - 1); @@ -1548,7 +1553,12 @@ static void r300_set_index_buffer(struct pipe_context* pipe, memset(&r300->index_buffer, 0, sizeof(r300->index_buffer)); } - /* TODO make this more like a state */ + if (r300->screen->caps.has_tcl) { + /* TODO make this more like a state */ + } + else { + draw_set_index_buffer(r300->draw, ib); + } } /* Initialize the PSC tables. 
*/ @@ -1765,6 +1775,9 @@ static void r300_bind_vs_state(struct pipe_context* pipe, void* shader) r300->vs_constants.size = 0; } + ((struct r300_constant_buffer*)r300->vs_constants.state)->remap_table = + vs->code.constants_remap_table; + r300->pvs_flush.dirty = TRUE; } else { draw_bind_vertex_shader(r300->draw, @@ -1779,6 +1792,8 @@ static void r300_delete_vs_state(struct pipe_context* pipe, void* shader) if (r300->screen->caps.has_tcl) { rc_constants_destroy(&vs->code.constants); + if (vs->code.constants_remap_table) + FREE(vs->code.constants_remap_table); } else { draw_delete_vertex_shader(r300->draw, (struct draw_vertex_shader*)vs->draw_vs); @@ -1795,47 +1810,28 @@ static void r300_set_constant_buffer(struct pipe_context *pipe, struct r300_context* r300 = r300_context(pipe); struct r300_constant_buffer *cbuf; uint32_t *mapped = r300_buffer(buf)->user_buffer; - int max_size = 0, max_size_bytes = 0, clamped_size = 0; switch (shader) { case PIPE_SHADER_VERTEX: cbuf = (struct r300_constant_buffer*)r300->vs_constants.state; - max_size = 256; break; case PIPE_SHADER_FRAGMENT: cbuf = (struct r300_constant_buffer*)r300->fs_constants.state; - if (r300->screen->caps.is_r500) { - max_size = 256; - } else { - max_size = 32; - } break; default: assert(0); return; } - max_size_bytes = max_size * 4 * sizeof(float); if (buf == NULL || buf->width0 == 0 || (mapped = r300_buffer(buf)->constant_buffer) == NULL) { - cbuf->count = 0; return; } if (shader == PIPE_SHADER_FRAGMENT || (shader == PIPE_SHADER_VERTEX && r300->screen->caps.has_tcl)) { assert((buf->width0 % (4 * sizeof(float))) == 0); - - /* Check the size of the constant buffer. */ - /* XXX Subtract immediates and RC_STATE_* variables. 
*/ - if (buf->width0 > max_size_bytes) { - fprintf(stderr, "r300: Max size of the constant buffer is " - "%i*4 floats.\n", max_size); - } - - clamped_size = MIN2(buf->width0, max_size_bytes); - cbuf->count = clamped_size / (4 * sizeof(float)); - cbuf->ptr = mapped; + cbuf->ptr = mapped + index*4; } if (shader == PIPE_SHADER_VERTEX) { diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c index 4a63ed7fc13..960dfdbaf03 100644 --- a/src/gallium/drivers/r300/r300_state_derived.c +++ b/src/gallium/drivers/r300/r300_state_derived.c @@ -211,7 +211,7 @@ static void r300_rs_col(struct r300_rs_block* rs, int id, int ptr, static void r300_rs_col_write(struct r300_rs_block* rs, int id, int fp_offset, enum r300_rs_col_write_type type) { - assert(type != WRITE_COLOR); + assert(type == WRITE_COLOR); rs->inst[id] |= R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_offset); } @@ -592,6 +592,25 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300) texstate->filter1 = sampler->filter1; texstate->border_color = sampler->border_color; + /* determine min/max levels */ + max_level = MIN3(sampler->max_lod + view->base.first_level, + tex->desc.b.b.last_level, view->base.last_level); + min_level = MIN2(sampler->min_lod + view->base.first_level, + max_level); + + if (tex->desc.is_npot && min_level > 0) { + /* Even though we do not implement mipmapping for NPOT + * textures, we should at least honor the minimum level + * which is allowed to be displayed. We do this by setting up + * an i-th mipmap level as the zero level. */ + r300_texture_setup_format_state(r300->screen, &tex->desc, + min_level, + &texstate->format); + texstate->format.tile_config |= + tex->desc.offset_in_bytes[min_level] & 0xffffffe0; + assert((tex->desc.offset_in_bytes[min_level] & 0x1f) == 0); + } + /* Assign a texture cache region. 
*/ texstate->format.format1 |= view->texcache_region; @@ -654,12 +673,7 @@ static void r300_merge_textures_and_samplers(struct r300_context* r300) texstate->filter0 |= R300_TX_WRAP_T(R300_TX_CLAMP_TO_EDGE); } } else { - /* determine min/max levels */ /* the MAX_MIP level is the largest (finest) one */ - max_level = MIN3(sampler->max_lod + view->base.first_level, - tex->desc.b.b.last_level, view->base.last_level); - min_level = MIN2(sampler->min_lod + view->base.first_level, - max_level); texstate->format.format0 |= R300_TX_NUM_LEVELS(max_level); texstate->filter0 |= R300_TX_MAX_MIP_LEVEL(min_level); } diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c index da8eadd3b53..66f6d80bd0c 100644 --- a/src/gallium/drivers/r300/r300_texture.c +++ b/src/gallium/drivers/r300/r300_texture.c @@ -541,48 +541,55 @@ boolean r300_is_sampler_format_supported(enum pipe_format format) return r300_translate_texformat(format, 0, TRUE) != ~0; } -static void r300_texture_setup_immutable_state(struct r300_screen* screen, - struct r300_texture* tex) +void r300_texture_setup_format_state(struct r300_screen *screen, + struct r300_texture_desc *desc, + unsigned level, + struct r300_texture_format_state *out) { - struct r300_texture_format_state* f = &tex->tx_format; - struct pipe_resource *pt = &tex->desc.b.b; + struct pipe_resource *pt = &desc->b.b; boolean is_r500 = screen->caps.is_r500; + /* Mask out all the fields we change. */ + out->format0 = 0; + out->format1 &= ~R300_TX_FORMAT_TEX_COORD_TYPE_MASK; + out->format2 &= R500_TXFORMAT_MSB; + out->tile_config = 0; + /* Set sampler state. 
*/ - f->format0 = R300_TX_WIDTH((pt->width0 - 1) & 0x7ff) | - R300_TX_HEIGHT((pt->height0 - 1) & 0x7ff); + out->format0 = R300_TX_WIDTH((u_minify(pt->width0, level) - 1) & 0x7ff) | + R300_TX_HEIGHT((u_minify(pt->height0, level) - 1) & 0x7ff); - if (tex->desc.uses_stride_addressing) { + if (desc->uses_stride_addressing) { /* rectangles love this */ - f->format0 |= R300_TX_PITCH_EN; - f->format2 = (tex->desc.stride_in_pixels[0] - 1) & 0x1fff; + out->format0 |= R300_TX_PITCH_EN; + out->format2 = (desc->stride_in_pixels[level] - 1) & 0x1fff; } else { /* Power of two textures (3D, mipmaps, and no pitch), * also NPOT textures with a width being POT. */ - f->format0 |= R300_TX_DEPTH(util_logbase2(pt->depth0) & 0xf); + out->format0 |= + R300_TX_DEPTH(util_logbase2(u_minify(pt->depth0, level)) & 0xf); } - f->format1 = 0; if (pt->target == PIPE_TEXTURE_CUBE) { - f->format1 |= R300_TX_FORMAT_CUBIC_MAP; + out->format1 |= R300_TX_FORMAT_CUBIC_MAP; } if (pt->target == PIPE_TEXTURE_3D) { - f->format1 |= R300_TX_FORMAT_3D; + out->format1 |= R300_TX_FORMAT_3D; } /* large textures on r500 */ if (is_r500) { if (pt->width0 > 2048) { - f->format2 |= R500_TXWIDTH_BIT11; + out->format2 |= R500_TXWIDTH_BIT11; } if (pt->height0 > 2048) { - f->format2 |= R500_TXHEIGHT_BIT11; + out->format2 |= R500_TXHEIGHT_BIT11; } } - f->tile_config = R300_TXO_MACRO_TILE(tex->desc.macrotile[0]) | - R300_TXO_MICRO_TILE(tex->desc.microtile); + out->tile_config = R300_TXO_MACRO_TILE(desc->macrotile[level]) | + R300_TXO_MICRO_TILE(desc->microtile); } static void r300_texture_setup_fb_state(struct r300_screen* screen, @@ -716,7 +723,7 @@ r300_texture_create_object(struct r300_screen *rscreen, return NULL; } /* Initialize the hardware state. 
*/ - r300_texture_setup_immutable_state(rscreen, tex); + r300_texture_setup_format_state(rscreen, &tex->desc, 0, &tex->tx_format); r300_texture_setup_fb_state(rscreen, tex); tex->desc.b.vtbl = &r300_texture_vtbl; @@ -754,7 +761,8 @@ struct pipe_resource *r300_texture_create(struct pipe_screen *screen, /* Refuse to create a texture with size 0. */ if (!base->width0 || (!base->height0 && (base->target == PIPE_TEXTURE_2D || - base->target == PIPE_TEXTURE_CUBE)) || + base->target == PIPE_TEXTURE_CUBE || + base->target == PIPE_TEXTURE_RECT)) || (!base->depth0 && base->target == PIPE_TEXTURE_3D)) { fprintf(stderr, "r300: texture_create: " "Got invalid texture dimensions: %ix%ix%i\n", @@ -787,7 +795,8 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, unsigned stride, size; /* Support only 2D textures without mipmaps */ - if (base->target != PIPE_TEXTURE_2D || + if ((base->target != PIPE_TEXTURE_2D && + base->target != PIPE_TEXTURE_RECT) || base->depth0 != 1 || base->last_level != 0) { return NULL; diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h index a4524320fda..c4588a0c90b 100644 --- a/src/gallium/drivers/r300/r300_texture.h +++ b/src/gallium/drivers/r300/r300_texture.h @@ -23,11 +23,14 @@ #ifndef R300_TEXTURE_H #define R300_TEXTURE_H +#include "pipe/p_compiler.h" #include "pipe/p_format.h" struct pipe_screen; struct pipe_resource; struct winsys_handle; +struct r300_texture_format_state; +struct r300_texture_desc; struct r300_texture; struct r300_screen; @@ -50,6 +53,10 @@ boolean r300_is_zs_format_supported(enum pipe_format format); boolean r300_is_sampler_format_supported(enum pipe_format format); +void r300_texture_setup_format_state(struct r300_screen *screen, + struct r300_texture_desc *desc, + unsigned level, + struct r300_texture_format_state *out); struct pipe_resource* r300_texture_from_handle(struct pipe_screen* screen, diff --git a/src/gallium/drivers/r300/r300_texture_desc.c 
b/src/gallium/drivers/r300/r300_texture_desc.c index 5d690e8c332..2fe5d721881 100644 --- a/src/gallium/drivers/r300/r300_texture_desc.c +++ b/src/gallium/drivers/r300/r300_texture_desc.c @@ -184,7 +184,8 @@ static unsigned r300_texture_get_nblocksy(struct r300_texture_desc *desc, /* This is needed for the kernel checker, unfortunately. */ if ((desc->b.b.target != PIPE_TEXTURE_1D && - desc->b.b.target != PIPE_TEXTURE_2D) || + desc->b.b.target != PIPE_TEXTURE_2D && + desc->b.b.target != PIPE_TEXTURE_RECT) || desc->b.b.last_level != 0) { height = util_next_power_of_two(height); } @@ -202,7 +203,8 @@ static unsigned r300_texture_get_nblocksy(struct r300_texture_desc *desc, * Do so for 3 or more macrotiles in the Y direction. */ if (level == 0 && desc->b.b.last_level == 0 && (desc->b.b.target == PIPE_TEXTURE_1D || - desc->b.b.target == PIPE_TEXTURE_2D) && + desc->b.b.target == PIPE_TEXTURE_2D || + desc->b.b.target == PIPE_TEXTURE_RECT) && height >= tile_height * 3) { height = align(height, tile_height * 2); } diff --git a/src/gallium/drivers/r300/r300_texture_desc.h b/src/gallium/drivers/r300/r300_texture_desc.h index 95de66f6549..3d7fe1fb473 100644 --- a/src/gallium/drivers/r300/r300_texture_desc.h +++ b/src/gallium/drivers/r300/r300_texture_desc.h @@ -24,6 +24,7 @@ #ifndef R300_TEXTURE_DESC_H #define R300_TEXTURE_DESC_H +#include "pipe/p_format.h" #include "r300_defines.h" struct pipe_resource; diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c index dd697b9c374..a4911b9a2a6 100644 --- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c +++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c @@ -97,13 +97,13 @@ static unsigned translate_opcode(unsigned opcode) /* case TGSI_OPCODE_BRA: return RC_OPCODE_BRA; */ /* case TGSI_OPCODE_CAL: return RC_OPCODE_CAL; */ /* case TGSI_OPCODE_RET: return RC_OPCODE_RET; */ - /* case TGSI_OPCODE_SSG: return RC_OPCODE_SSG; */ + case TGSI_OPCODE_SSG: return RC_OPCODE_SSG; case TGSI_OPCODE_CMP: return 
RC_OPCODE_CMP; case TGSI_OPCODE_SCS: return RC_OPCODE_SCS; case TGSI_OPCODE_TXB: return RC_OPCODE_TXB; /* case TGSI_OPCODE_NRM: return RC_OPCODE_NRM; */ /* case TGSI_OPCODE_DIV: return RC_OPCODE_DIV; */ - /* case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; */ + case TGSI_OPCODE_DP2: return RC_OPCODE_DP2; case TGSI_OPCODE_TXL: return RC_OPCODE_TXL; case TGSI_OPCODE_BRK: return RC_OPCODE_BRK; case TGSI_OPCODE_IF: return RC_OPCODE_IF; diff --git a/src/gallium/drivers/r300/r300_vs.c b/src/gallium/drivers/r300/r300_vs.c index 54c8de12419..5f8dbb28d0c 100644 --- a/src/gallium/drivers/r300/r300_vs.c +++ b/src/gallium/drivers/r300/r300_vs.c @@ -196,6 +196,7 @@ void r300_translate_vertex_shader(struct r300_context *r300, { struct r300_vertex_program_compiler compiler; struct tgsi_to_rc ttr; + unsigned i; /* Setup the compiler */ rc_init(&compiler.Base); @@ -205,6 +206,7 @@ void r300_translate_vertex_shader(struct r300_context *r300, compiler.UserData = vs; compiler.Base.is_r500 = r300->screen->caps.is_r500; compiler.Base.max_temp_regs = 32; + compiler.Base.remove_unused_constants = TRUE; if (compiler.Base.Debug) { DBG(r300, DBG_VP, "r300: Initial vertex program\n"); @@ -227,9 +229,8 @@ void r300_translate_vertex_shader(struct r300_context *r300, /* Invoke the compiler */ r3xx_compile_vertex_program(&compiler); if (compiler.Base.Error) { - DBG(r300, DBG_VP, "r300 VP: Compiler error:\n%sUsing a dummy shader" - " instead.\nIf there's an 'unknown opcode' message, please" - " file a bug report and attach this log.\n", compiler.Base.ErrorMsg); + fprintf(stderr, "r300 VP: Compiler error:\n%sUsing a dummy shader" + " instead.\n", compiler.Base.ErrorMsg); if (vs->dummy) { fprintf(stderr, "r300 VP: Cannot compile the dummy shader! " @@ -243,7 +244,15 @@ void r300_translate_vertex_shader(struct r300_context *r300, } /* Initialize numbers of constants for each type. 
*/ - vs->externals_count = ttr.immediate_offset; + vs->externals_count = 0; + for (i = 0; + i < vs->code.constants.Count && + vs->code.constants.Constants[i].Type == RC_CONSTANT_EXTERNAL; i++) { + vs->externals_count = i+1; + } + for (; i < vs->code.constants.Count; i++) { + assert(vs->code.constants.Constants[i].Type == RC_CONSTANT_IMMEDIATE); + } vs->immediates_count = vs->code.constants.Count - vs->externals_count; /* And, finally... */ diff --git a/src/gallium/drivers/r300/r300_winsys.h b/src/gallium/drivers/r300/r300_winsys.h index 187780750fa..4597332399a 100644 --- a/src/gallium/drivers/r300/r300_winsys.h +++ b/src/gallium/drivers/r300/r300_winsys.h @@ -33,6 +33,7 @@ #include "r300_defines.h" +struct winsys_handle; struct r300_winsys_screen; struct r300_winsys_buffer; diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c index 9ea9d4354d6..6483dac7039 100644 --- a/src/gallium/drivers/r600/r600_asm.c +++ b/src/gallium/drivers/r600/r600_asm.c @@ -76,6 +76,27 @@ int r600_bc_init(struct r600_bc *bc, enum radeon_family family) { LIST_INITHEAD(&bc->cf); bc->family = family; + switch (bc->family) { + case CHIP_R600: + case CHIP_RV610: + case CHIP_RV630: + case CHIP_RV670: + case CHIP_RV620: + case CHIP_RV635: + case CHIP_RS780: + case CHIP_RS880: + bc->chiprev = 0; + break; + case CHIP_RV770: + case CHIP_RV730: + case CHIP_RV710: + case CHIP_RV740: + bc->chiprev = 1; + break; + default: + R600_ERR("unknown family %d\n", bc->family); + return -EINVAL; + } return 0; } @@ -107,7 +128,7 @@ int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output) return 0; } -int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu) +int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type) { struct r600_bc_alu *nalu = r600_bc_alu(); struct r600_bc_alu *lalu; @@ -119,7 +140,7 @@ int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu) nalu->nliteral = 0; /* cf can contains only 
alu or only vtx or only tex */ - if (bc->cf_last == NULL || bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) || + if (bc->cf_last == NULL || bc->cf_last->inst != (type << 3) || bc->force_add_cf) { /* at most 128 slots, one add alu can add 4 slots + 4 constant worst case */ r = r600_bc_add_cf(bc); @@ -127,7 +148,7 @@ int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu) free(nalu); return r; } - bc->cf_last->inst = V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3; + bc->cf_last->inst = (type << 3); } if (alu->last && (bc->cf_last->ndw >> 1) >= 124) { bc->force_add_cf = 1; @@ -162,6 +183,11 @@ int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu) return 0; } +int r600_bc_add_alu(struct r600_bc *bc, const struct r600_bc_alu *alu) +{ + return r600_bc_add_alu_type(bc, alu, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU); +} + int r600_bc_add_literal(struct r600_bc *bc, const u32 *value) { struct r600_bc_alu *alu; @@ -172,7 +198,17 @@ int r600_bc_add_literal(struct r600_bc *bc, const u32 *value) if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_TEX) { return 0; } - if (bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3) || + if (bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_JUMP || + bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_ELSE || + bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL || + bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK || + bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE || + bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END || + bc->cf_last->inst == V_SQ_CF_WORD1_SQ_CF_INST_POP) { + return 0; + } + if (((bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3)) && + (bc->cf_last->inst != (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3))) || LIST_IS_EMPTY(&bc->cf_last->alu)) { R600_ERR("last CF is not ALU (%p)\n", bc->cf_last); return -EINVAL; @@ -241,6 +277,18 @@ int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex) return 0; } +int r600_bc_add_cfinst(struct 
r600_bc *bc, int inst) +{ + int r; + r = r600_bc_add_cf(bc); + if (r) + return r; + + bc->cf_last->cond = V_SQ_CF_COND_ACTIVE; + bc->cf_last->inst = inst; + return 0; +} + static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsigned id) { bc->bytecode[id++] = S_SQ_VTX_WORD0_BUFFER_ID(vtx->buffer_id) | @@ -292,38 +340,44 @@ static int r600_bc_alu_build(struct r600_bc *bc, struct r600_bc_alu *alu, unsign unsigned i; /* don't replace gpr by pv or ps for destination register */ + bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) | + S_SQ_ALU_WORD0_SRC0_REL(alu->src[0].rel) | + S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) | + S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) | + S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) | + S_SQ_ALU_WORD0_SRC1_REL(alu->src[1].rel) | + S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | + S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) | + S_SQ_ALU_WORD0_LAST(alu->last); + if (alu->is_op3) { - bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) | - S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) | - S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) | - S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | - S_SQ_ALU_WORD0_LAST(alu->last); bc->bytecode[id++] = S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | + S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | S_SQ_ALU_WORD1_OP3_SRC2_SEL(alu->src[2].sel) | + S_SQ_ALU_WORD1_OP3_SRC2_REL(alu->src[2].rel) | S_SQ_ALU_WORD1_OP3_SRC2_CHAN(alu->src[2].chan) | S_SQ_ALU_WORD1_OP3_SRC2_NEG(alu->src[2].neg) | S_SQ_ALU_WORD1_OP3_ALU_INST(alu->inst) | S_SQ_ALU_WORD1_BANK_SWIZZLE(0); } else { - bc->bytecode[id++] = S_SQ_ALU_WORD0_SRC0_SEL(alu->src[0].sel) | - S_SQ_ALU_WORD0_SRC0_CHAN(alu->src[0].chan) | - S_SQ_ALU_WORD0_SRC0_NEG(alu->src[0].neg) | - S_SQ_ALU_WORD0_SRC1_SEL(alu->src[1].sel) | - S_SQ_ALU_WORD0_SRC1_CHAN(alu->src[1].chan) | - S_SQ_ALU_WORD0_SRC1_NEG(alu->src[1].neg) | - S_SQ_ALU_WORD0_LAST(alu->last); bc->bytecode[id++] = 
S_SQ_ALU_WORD1_DST_GPR(alu->dst.sel) | S_SQ_ALU_WORD1_DST_CHAN(alu->dst.chan) | + S_SQ_ALU_WORD1_DST_REL(alu->dst.rel) | S_SQ_ALU_WORD1_CLAMP(alu->dst.clamp) | S_SQ_ALU_WORD1_OP2_SRC0_ABS(alu->src[0].abs) | S_SQ_ALU_WORD1_OP2_SRC1_ABS(alu->src[1].abs) | S_SQ_ALU_WORD1_OP2_WRITE_MASK(alu->dst.write) | S_SQ_ALU_WORD1_OP2_ALU_INST(alu->inst) | - S_SQ_ALU_WORD1_BANK_SWIZZLE(0); + S_SQ_ALU_WORD1_BANK_SWIZZLE(0) | + S_SQ_ALU_WORD1_OP2_UPDATE_EXECUTE_MASK(alu->predicate) | + S_SQ_ALU_WORD1_OP2_UPDATE_PRED(alu->predicate); } if (alu->last) { + if (alu->nliteral && !alu->literal_added) { + R600_ERR("Bug in ALU processing for instruction 0x%08x, literal not added correctly\n"); + } for (i = 0; i < alu->nliteral; i++) { bc->bytecode[id++] = alu->value[i]; } @@ -337,6 +391,7 @@ static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf) switch (cf->inst) { case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3): + case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3): bc->bytecode[id++] = S_SQ_CF_ALU_WORD0_ADDR(cf->addr >> 1); bc->bytecode[id++] = S_SQ_CF_ALU_WORD1_CF_INST(cf->inst >> 3) | S_SQ_CF_ALU_WORD1_BARRIER(1) | @@ -364,6 +419,20 @@ static int r600_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf) S_SQ_CF_ALLOC_EXPORT_WORD1_CF_INST(cf->output.inst) | S_SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM(cf->output.end_of_program); break; + case V_SQ_CF_WORD1_SQ_CF_INST_JUMP: + case V_SQ_CF_WORD1_SQ_CF_INST_ELSE: + case V_SQ_CF_WORD1_SQ_CF_INST_POP: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK: + bc->bytecode[id++] = S_SQ_CF_WORD0_ADDR(cf->cf_addr >> 1); + bc->bytecode[id++] = S_SQ_CF_WORD1_CF_INST(cf->inst) | + S_SQ_CF_WORD1_BARRIER(1) | + S_SQ_CF_WORD1_COND(cf->cond) | + S_SQ_CF_WORD1_POP_COUNT(cf->pop_count); + + break; default: R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst); return -EINVAL; @@ -380,6 +449,8 @@ int 
r600_bc_build(struct r600_bc *bc) unsigned addr; int r; + if (bc->callstack[0].max > 0) + bc->nstack = ((bc->callstack[0].max + 3) >> 2) + 2; /* first path compute addr of each CF block */ /* addr start after all the CF instructions */ @@ -387,6 +458,7 @@ int r600_bc_build(struct r600_bc *bc) LIST_FOR_EACH_ENTRY(cf, &bc->cf, list) { switch (cf->inst) { case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3): + case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3): break; case V_SQ_CF_WORD1_SQ_CF_INST_TEX: case V_SQ_CF_WORD1_SQ_CF_INST_VTX: @@ -398,6 +470,14 @@ int r600_bc_build(struct r600_bc *bc) case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT: case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE: break; + case V_SQ_CF_WORD1_SQ_CF_INST_JUMP: + case V_SQ_CF_WORD1_SQ_CF_INST_ELSE: + case V_SQ_CF_WORD1_SQ_CF_INST_POP: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK: + break; default: R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst); return -EINVAL; @@ -417,22 +497,13 @@ int r600_bc_build(struct r600_bc *bc) return r; switch (cf->inst) { case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU << 3): + case (V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE << 3): LIST_FOR_EACH_ENTRY(alu, &cf->alu, list) { - switch (bc->family) { - case CHIP_R600: - case CHIP_RV610: - case CHIP_RV630: - case CHIP_RV670: - case CHIP_RV620: - case CHIP_RV635: - case CHIP_RS780: - case CHIP_RS880: + switch(bc->chiprev) { + case 0: r = r600_bc_alu_build(bc, alu, addr); break; - case CHIP_RV770: - case CHIP_RV730: - case CHIP_RV710: - case CHIP_RV740: + case 1: r = r700_bc_alu_build(bc, alu, addr); break; default: @@ -466,6 +537,13 @@ int r600_bc_build(struct r600_bc *bc) break; case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT: case V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT_DONE: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END: + 
case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE: + case V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK: + case V_SQ_CF_WORD1_SQ_CF_INST_JUMP: + case V_SQ_CF_WORD1_SQ_CF_INST_ELSE: + case V_SQ_CF_WORD1_SQ_CF_INST_POP: break; default: R600_ERR("unsupported CF instruction (0x%X)\n", cf->inst); diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h index 10d98afaf00..9e65fcdd4fa 100644 --- a/src/gallium/drivers/r600/r600_asm.h +++ b/src/gallium/drivers/r600/r600_asm.h @@ -31,6 +31,7 @@ struct r600_bc_alu_src { unsigned chan; unsigned neg; unsigned abs; + unsigned rel; }; struct r600_bc_alu_dst { @@ -38,6 +39,7 @@ struct r600_bc_alu_dst { unsigned chan; unsigned clamp; unsigned write; + unsigned rel; }; struct r600_bc_alu { @@ -47,6 +49,7 @@ struct r600_bc_alu { unsigned inst; unsigned last; unsigned is_op3; + unsigned predicate; unsigned nliteral; unsigned literal_added; u32 value[4]; @@ -114,22 +117,55 @@ struct r600_bc_cf { unsigned addr; unsigned ndw; unsigned id; + unsigned cond; + unsigned pop_count; + unsigned cf_addr; /* control flow addr */ struct list_head alu; struct list_head tex; struct list_head vtx; struct r600_bc_output output; }; +#define FC_NONE 0 +#define FC_IF 1 +#define FC_LOOP 2 +#define FC_REP 3 +#define FC_PUSH_VPM 4 +#define FC_PUSH_WQM 5 + +struct r600_cf_stack_entry { + int type; + struct r600_bc_cf *start; + struct r600_bc_cf **mid; /* used to store the else point */ + int num_mid; +}; + +#define SQ_MAX_CALL_DEPTH 0x00000020 +struct r600_cf_callstack { + unsigned fc_sp_before_entry; + int sub_desc_index; + int current; + int max; +}; + struct r600_bc { enum radeon_family family; + int chiprev; /* 0 - r600, 1 - r700, 2 - evergreen */ struct list_head cf; struct r600_bc_cf *cf_last; unsigned ndw; unsigned ncf; unsigned ngpr; + unsigned nstack; unsigned nresource; unsigned force_add_cf; u32 *bytecode; + + u32 fc_sp; + struct r600_cf_stack_entry fc_stack[32]; + + unsigned call_sp; + struct r600_cf_callstack 
callstack[SQ_MAX_CALL_DEPTH]; }; int r600_bc_init(struct r600_bc *bc, enum radeon_family family); @@ -139,5 +175,6 @@ int r600_bc_add_vtx(struct r600_bc *bc, const struct r600_bc_vtx *vtx); int r600_bc_add_tex(struct r600_bc *bc, const struct r600_bc_tex *tex); int r600_bc_add_output(struct r600_bc *bc, const struct r600_bc_output *output); int r600_bc_build(struct r600_bc *bc); - +int r600_bc_add_cfinst(struct r600_bc *bc, int inst); +int r600_bc_add_alu_type(struct r600_bc *bc, const struct r600_bc_alu *alu, int type); #endif diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index f4eedfe4cb1..e6ded342e59 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -24,6 +24,7 @@ * Jerome Glisse * Marek Olšák */ +#include <errno.h> #include <pipe/p_screen.h> #include <util/u_blitter.h> #include <util/u_inlines.h> @@ -31,9 +32,12 @@ #include "util/u_surface.h" #include "r600_screen.h" #include "r600_context.h" +#include "r600d.h" -static void r600_blitter_save_states(struct r600_context *rctx) +static void r600_blitter_save_states(struct pipe_context *ctx) { + struct r600_context *rctx = r600_context(ctx); + util_blitter_save_blend(rctx->blitter, rctx->blend); util_blitter_save_depth_stencil_alpha(rctx->blitter, rctx->dsa); if (rctx->stencil_ref) { @@ -47,48 +51,58 @@ static void r600_blitter_save_states(struct r600_context *rctx) if (rctx->viewport) { util_blitter_save_viewport(rctx->blitter, &rctx->viewport->state.viewport); } - /* XXX util_blitter_save_clip(rctx->blitter, &rctx->clip); */ + if (rctx->clip) { + util_blitter_save_clip(rctx->blitter, &rctx->clip->state.clip); + } util_blitter_save_vertex_buffers(rctx->blitter, rctx->nvertex_buffer, rctx->vertex_buffer); /* remove ptr so they don't get deleted */ rctx->blend = NULL; + rctx->clip = NULL; rctx->vs_shader = NULL; rctx->ps_shader = NULL; rctx->rasterizer = NULL; rctx->dsa = NULL; rctx->vertex_elements = NULL; + + /* suspend 
queries */ + r600_queries_suspend(ctx); } static void r600_clear(struct pipe_context *ctx, unsigned buffers, - const float *rgba, double depth, unsigned stencil) + const float *rgba, double depth, unsigned stencil) { struct r600_context *rctx = r600_context(ctx); struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer; - r600_blitter_save_states(rctx); + r600_blitter_save_states(ctx); util_blitter_clear(rctx->blitter, fb->width, fb->height, fb->nr_cbufs, buffers, rgba, depth, stencil); + /* resume queries */ + r600_queries_resume(ctx); } -static void r600_clear_render_target(struct pipe_context *pipe, +static void r600_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst, const float *rgba, unsigned dstx, unsigned dsty, unsigned width, unsigned height) { - struct r600_context *rctx = r600_context(pipe); + struct r600_context *rctx = r600_context(ctx); struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer; - r600_blitter_save_states(rctx); + r600_blitter_save_states(ctx); util_blitter_save_framebuffer(rctx->blitter, fb); util_blitter_clear_render_target(rctx->blitter, dst, rgba, dstx, dsty, width, height); + /* resume queries */ + r600_queries_resume(ctx); } -static void r600_clear_depth_stencil(struct pipe_context *pipe, +static void r600_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst, unsigned clear_flags, double depth, @@ -96,17 +110,20 @@ static void r600_clear_depth_stencil(struct pipe_context *pipe, unsigned dstx, unsigned dsty, unsigned width, unsigned height) { - struct r600_context *rctx = r600_context(pipe); + struct r600_context *rctx = r600_context(ctx); struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer; - r600_blitter_save_states(rctx); + r600_blitter_save_states(ctx); util_blitter_save_framebuffer(rctx->blitter, fb); util_blitter_clear_depth_stencil(rctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty, width, height); + /* resume queries */ + 
r600_queries_resume(ctx); } -static void r600_resource_copy_region(struct pipe_context *pipe, + +static void r600_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, struct pipe_subresource subdst, unsigned dstx, unsigned dsty, unsigned dstz, @@ -115,7 +132,7 @@ static void r600_resource_copy_region(struct pipe_context *pipe, unsigned srcx, unsigned srcy, unsigned srcz, unsigned width, unsigned height) { - util_resource_copy_region(pipe, dst, subdst, dstx, dsty, dstz, + util_resource_copy_region(ctx, dst, subdst, dstx, dsty, dstz, src, subsrc, srcx, srcy, srcz, width, height); } @@ -126,3 +143,446 @@ void r600_init_blit_functions(struct r600_context *rctx) rctx->context.clear_depth_stencil = r600_clear_depth_stencil; rctx->context.resource_copy_region = r600_resource_copy_region; } + + +struct r600_blit_states { + struct radeon_state rasterizer; + struct radeon_state dsa; + struct radeon_state blend; + struct radeon_state cb_cntl; + struct radeon_state vgt; + struct radeon_state draw; + struct radeon_state vs_constant0; + struct radeon_state vs_constant1; + struct radeon_state vs_constant2; + struct radeon_state vs_constant3; + struct radeon_state ps_shader; + struct radeon_state vs_shader; + struct radeon_state vs_resource0; + struct radeon_state vs_resource1; +}; + +static int r600_blit_state_vs_resources(struct r600_screen *rscreen, struct r600_blit_states *bstates) +{ + struct radeon_state *rstate; + struct radeon_bo *bo; + u32 vbo[] = { + 0xBF800000, 0xBF800000, 0x3F800000, 0x3F800000, + 0x3F000000, 0x3F000000, 0x3F000000, 0x00000000, + 0x3F800000, 0xBF800000, 0x3F800000, 0x3F800000, + 0x3F000000, 0x3F000000, 0x3F000000, 0x00000000, + 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, + 0x3F000000, 0x3F000000, 0x3F000000, 0x00000000, + 0xBF800000, 0x3F800000, 0x3F800000, 0x3F800000, + 0x3F000000, 0x3F000000, 0x3F000000, 0x00000000 + }; + + /* simple shader */ + bo = radeon_bo(rscreen->rw, 0, 128, 4096, NULL); + if (bo == NULL) { + return 
-ENOMEM; + } + if (radeon_bo_map(rscreen->rw, bo)) { + radeon_bo_decref(rscreen->rw, bo); + return -ENOMEM; + } + memcpy(bo->data, vbo, 128); + radeon_bo_unmap(rscreen->rw, bo); + + rstate = &bstates->vs_resource0; + radeon_state_init(rstate, rscreen->rw, R600_STATE_RESOURCE, 0, R600_SHADER_VS); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD0] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD1] = 0x00000080; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD2] = 0x02302000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD3] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD4] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD5] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD6] = 0xC0000000; + rstate->bo[0] = bo; + rstate->nbo = 1; + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + if (radeon_state_pm4(rstate)) { + radeon_state_fini(rstate); + return -ENOMEM; + } + + rstate = &bstates->vs_resource1; + radeon_state_init(rstate, rscreen->rw, R600_STATE_RESOURCE, 1, R600_SHADER_VS); + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD0] = 0x00000010; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD1] = 0x00000070; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD2] = 0x02302000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD3] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD4] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD5] = 0x00000000; + rstate->states[R600_VS_RESOURCE__RESOURCE160_WORD6] = 0xC0000000; + rstate->bo[0] = radeon_bo_incref(rscreen->rw, bo); + rstate->nbo = 1; + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + if (radeon_state_pm4(rstate)) { + radeon_state_fini(rstate); + return -ENOMEM; + } + + return 0; +} + +static void r600_blit_state_vs_shader(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + struct 
radeon_bo *bo; + u32 shader_bc_r600[] = { + 0x00000004, 0x81000400, + 0x00000008, 0xA01C0000, + 0xC001A03C, 0x94000688, + 0xC0024000, 0x94200688, + 0x7C000000, 0x002D1001, + 0x00080000, 0x00000000, + 0x7C000100, 0x002D1002, + 0x00080000, 0x00000000, + 0x00000001, 0x00601910, + 0x00000401, 0x20601910, + 0x00000801, 0x40601910, + 0x80000C01, 0x60601910, + 0x00000002, 0x00801910, + 0x00000402, 0x20801910, + 0x00000802, 0x40801910, + 0x80000C02, 0x60801910 + }; + u32 shader_bc_r700[] = { + 0x00000004, 0x81000400, + 0x00000008, 0xA01C0000, + 0xC001A03C, 0x94000688, + 0xC0024000, 0x94200688, + 0x7C000000, 0x002D1001, + 0x00080000, 0x00000000, + 0x7C000100, 0x002D1002, + 0x00080000, 0x00000000, + 0x00000001, 0x00600C90, + 0x00000401, 0x20600C90, + 0x00000801, 0x40600C90, + 0x80000C01, 0x60600C90, + 0x00000002, 0x00800C90, + 0x00000402, 0x20800C90, + 0x00000802, 0x40800C90, + 0x80000C02, 0x60800C90 + }; + + /* simple shader */ + bo = radeon_bo(rscreen->rw, 0, 128, 4096, NULL); + if (bo == NULL) { + return; + } + if (radeon_bo_map(rscreen->rw, bo)) { + radeon_bo_decref(rscreen->rw, bo); + return; + } + switch (rscreen->chip_class) { + case R600: + memcpy(bo->data, shader_bc_r600, 128); + break; + case R700: + memcpy(bo->data, shader_bc_r700, 128); + break; + default: + R600_ERR("unsupported chip family\n"); + radeon_bo_unmap(rscreen->rw, bo); + radeon_bo_decref(rscreen->rw, bo); + return; + } + radeon_bo_unmap(rscreen->rw, bo); + + radeon_state_init(rstate, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_VS); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_VS_SHADER__SPI_VS_OUT_ID_0] = 0x03020100; + rstate->states[R600_VS_SHADER__SPI_VS_OUT_ID_1] = 0x07060504; + rstate->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = 0x00000005; + + rstate->bo[0] = bo; + rstate->bo[1] = radeon_bo_incref(rscreen->rw, bo); + rstate->nbo = 2; + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + 
rstate->placement[2] = RADEON_GEM_DOMAIN_GTT; + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_ps_shader(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + struct radeon_bo *bo; + u32 shader_bc_r600[] = { + 0x00000002, 0xA00C0000, + 0xC0008000, 0x94200688, + 0x00000000, 0x00201910, + 0x00000400, 0x20201910, + 0x00000800, 0x40201910, + 0x80000C00, 0x60201910 + }; + u32 shader_bc_r700[] = { + 0x00000002, 0xA00C0000, + 0xC0008000, 0x94200688, + 0x00000000, 0x00200C90, + 0x00000400, 0x20200C90, + 0x00000800, 0x40200C90, + 0x80000C00, 0x60200C90 + }; + + /* simple shader */ + bo = radeon_bo(rscreen->rw, 0, 128, 4096, NULL); + if (bo == NULL) { + radeon_bo_decref(rscreen->rw, bo); + return; + } + if (radeon_bo_map(rscreen->rw, bo)) { + return; + } + switch (rscreen->chip_class) { + case R600: + memcpy(bo->data, shader_bc_r600, 48); + break; + case R700: + memcpy(bo->data, shader_bc_r700, 48); + break; + default: + R600_ERR("unsupported chip family\n"); + radeon_bo_unmap(rscreen->rw, bo); + radeon_bo_decref(rscreen->rw, bo); + return; + } + radeon_bo_unmap(rscreen->rw, bo); + + radeon_state_init(rstate, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_PS); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_PS_SHADER__SPI_PS_INPUT_CNTL_0] = 0x00000C00; + rstate->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] = 0x10000001; + rstate->states[R600_PS_SHADER__SQ_PGM_EXPORTS_PS] = 0x00000002; + rstate->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = 0x00000002; + + rstate->bo[0] = bo; + rstate->nbo = 1; + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_vgt(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_VGT, 0, 0); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + 
rstate->states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001; + rstate->states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF; + rstate->states[R600_VGT__VGT_PRIMITIVE_TYPE] = 0x00000005; + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_draw(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_DRAW, 0, 0); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_DRAW__VGT_DRAW_INITIATOR] = 0x00000002; + rstate->states[R600_DRAW__VGT_NUM_INDICES] = 0x00000004; + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_vs_constant(struct r600_screen *rscreen, struct radeon_state *rstate, + unsigned id, float c0, float c1, float c2, float c3) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_CONSTANT, id, R600_SHADER_VS); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT0_256] = fui(c0); + rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT1_256] = fui(c1); + rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT2_256] = fui(c2); + rstate->states[R600_VS_CONSTANT__SQ_ALU_CONSTANT3_256] = fui(c3); + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_rasterizer(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_RASTERIZER, 0, 0); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_RASTERIZER__PA_CL_GB_HORZ_CLIP_ADJ] = 0x3F800000; + rstate->states[R600_RASTERIZER__PA_CL_GB_HORZ_DISC_ADJ] = 0x3F800000; + rstate->states[R600_RASTERIZER__PA_CL_GB_VERT_CLIP_ADJ] = 0x3F800000; + rstate->states[R600_RASTERIZER__PA_CL_GB_VERT_DISC_ADJ] = 0x3F800000; + rstate->states[R600_RASTERIZER__PA_SC_LINE_CNTL] = 0x00000400; + 
rstate->states[R600_RASTERIZER__PA_SC_LINE_STIPPLE] = 0x00000005; + rstate->states[R600_RASTERIZER__PA_SU_LINE_CNTL] = 0x00000008; + rstate->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x80000000; + rstate->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] = 0x00080004; + rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] = 0x00000001; + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_dsa(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_DSA, 0, 0); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_DSA__DB_ALPHA_TO_MASK] = 0x0000AA00; + rstate->states[R600_DSA__DB_DEPTH_CLEAR] = 0x3F800000; + rstate->states[R600_DSA__DB_RENDER_CONTROL] = 0x00000060; + rstate->states[R600_DSA__DB_RENDER_OVERRIDE] = 0x0000002A; + rstate->states[R600_DSA__DB_SHADER_CONTROL] = 0x00000210; + + radeon_state_pm4(rstate); +} + +static void r600_blit_state_blend(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_BLEND, 0, 0); + radeon_state_pm4(rstate); +} + +static void r600_blit_state_cb_cntl(struct r600_screen *rscreen, struct radeon_state *rstate) +{ + radeon_state_init(rstate, rscreen->rw, R600_STATE_CB_CNTL, 0, 0); + rstate->states[R600_CB_CNTL__CB_CLRCMP_CONTROL] = 0x01000000; + rstate->states[R600_CB_CNTL__CB_CLRCMP_DST] = 0x000000FF; + rstate->states[R600_CB_CNTL__CB_CLRCMP_MSK] = 0xFFFFFFFF; + rstate->states[R600_CB_CNTL__CB_COLOR_CONTROL] = 0x00CC0080; + rstate->states[R600_CB_CNTL__CB_SHADER_MASK] = 0x0000000F; + rstate->states[R600_CB_CNTL__CB_TARGET_MASK] = 0x0000000F; + rstate->states[R600_CB_CNTL__PA_SC_AA_MASK] = 0xFFFFFFFF; + radeon_state_pm4(rstate); +} + +static int r600_blit_states_init(struct pipe_context *ctx, struct r600_blit_states *bstates) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + + r600_blit_state_ps_shader(rscreen, 
&bstates->ps_shader); + r600_blit_state_vs_shader(rscreen, &bstates->vs_shader); + r600_blit_state_vgt(rscreen, &bstates->vgt); + r600_blit_state_draw(rscreen, &bstates->draw); + r600_blit_state_vs_constant(rscreen, &bstates->vs_constant0, 0, 1.0, 0.0, 0.0, 0.0); + r600_blit_state_vs_constant(rscreen, &bstates->vs_constant1, 1, 0.0, 1.0, 0.0, 0.0); + r600_blit_state_vs_constant(rscreen, &bstates->vs_constant2, 2, 0.0, 0.0, -0.00199900055, 0.0); + r600_blit_state_vs_constant(rscreen, &bstates->vs_constant3, 3, 0.0, 0.0, -0.99900049, 1.0); + r600_blit_state_rasterizer(rscreen, &bstates->rasterizer); + r600_blit_state_dsa(rscreen, &bstates->dsa); + r600_blit_state_blend(rscreen, &bstates->blend); + r600_blit_state_cb_cntl(rscreen, &bstates->cb_cntl); + r600_blit_state_vs_resources(rscreen, bstates); + return 0; +} + +static void r600_blit_states_destroy(struct pipe_context *ctx, struct r600_blit_states *bstates) +{ + radeon_state_fini(&bstates->ps_shader); + radeon_state_fini(&bstates->vs_shader); + radeon_state_fini(&bstates->vs_resource0); + radeon_state_fini(&bstates->vs_resource1); +} + +int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + struct r600_context *rctx = r600_context(ctx); + struct radeon_draw draw; + struct r600_blit_states bstates; + int r; + + r = r600_texture_scissor(ctx, rtexture, level); + if (r) { + return r; + } + r = r600_texture_cb(ctx, rtexture, 0, level); + if (r) { + return r; + } + r = r600_texture_db(ctx, rtexture, level); + if (r) { + return r; + } + r = r600_texture_viewport(ctx, rtexture, level); + if (r) { + return r; + } + + r = r600_blit_states_init(ctx, &bstates); + if (r) { + return r; + } + bstates.dsa.states[R600_DSA__DB_RENDER_CONTROL] = 0x0000008C; + bstates.cb_cntl.states[R600_CB_CNTL__CB_TARGET_MASK] = 0x00000001; + /* force rebuild */ + bstates.dsa.cpm4 = bstates.cb_cntl.cpm4 = 0; + if 
(radeon_state_pm4(&bstates.dsa)) { + goto out; + } + if (radeon_state_pm4(&bstates.cb_cntl)) { + goto out; + } + + r = radeon_draw_init(&draw, rscreen->rw); + if (r) { + R600_ERR("failed creating draw for uncompressing textures\n"); + goto out; + } + + radeon_draw_bind(&draw, &bstates.vs_shader); + radeon_draw_bind(&draw, &bstates.ps_shader); + radeon_draw_bind(&draw, &bstates.rasterizer); + radeon_draw_bind(&draw, &bstates.dsa); + radeon_draw_bind(&draw, &bstates.blend); + radeon_draw_bind(&draw, &bstates.cb_cntl); + radeon_draw_bind(&draw, &rctx->config); + radeon_draw_bind(&draw, &bstates.vgt); + radeon_draw_bind(&draw, &bstates.draw); + radeon_draw_bind(&draw, &bstates.vs_resource0); + radeon_draw_bind(&draw, &bstates.vs_resource1); + radeon_draw_bind(&draw, &bstates.vs_constant0); + radeon_draw_bind(&draw, &bstates.vs_constant1); + radeon_draw_bind(&draw, &bstates.vs_constant2); + radeon_draw_bind(&draw, &bstates.vs_constant3); + radeon_draw_bind(&draw, &rtexture->viewport[level]); + radeon_draw_bind(&draw, &rtexture->scissor[level]); + radeon_draw_bind(&draw, &rtexture->cb[0][level]); + radeon_draw_bind(&draw, &rtexture->db[level]); + + /* suspend queries */ + r600_queries_suspend(ctx); + + /* schedule draw*/ + r = radeon_ctx_set_draw(&rctx->ctx, &draw); + if (r == -EBUSY) { + r600_flush(ctx, 0, NULL); + r = radeon_ctx_set_draw(&rctx->ctx, &draw); + } + if (r) { + goto out; + } + + /* resume queries */ + r600_queries_resume(ctx); + +out: + r600_blit_states_destroy(ctx, &bstates); + return r; +} diff --git a/src/gallium/drivers/r600/r600_context.c b/src/gallium/drivers/r600/r600_context.c index edde80c660a..7a0e5b4049f 100644 --- a/src/gallium/drivers/r600/r600_context.c +++ b/src/gallium/drivers/r600/r600_context.c @@ -34,10 +34,26 @@ #include "r600_resource.h" #include "r600d.h" + static void r600_destroy_context(struct pipe_context *context) { struct r600_context *rctx = r600_context(context); + rctx->rasterizer = 
r600_context_state_decref(rctx->rasterizer); + rctx->poly_stipple = r600_context_state_decref(rctx->poly_stipple); + rctx->scissor = r600_context_state_decref(rctx->scissor); + rctx->clip = r600_context_state_decref(rctx->clip); + rctx->ps_shader = r600_context_state_decref(rctx->ps_shader); + rctx->vs_shader = r600_context_state_decref(rctx->vs_shader); + rctx->depth = r600_context_state_decref(rctx->depth); + rctx->stencil = r600_context_state_decref(rctx->stencil); + rctx->alpha = r600_context_state_decref(rctx->alpha); + rctx->dsa = r600_context_state_decref(rctx->dsa); + rctx->blend = r600_context_state_decref(rctx->blend); + rctx->stencil_ref = r600_context_state_decref(rctx->stencil_ref); + rctx->viewport = r600_context_state_decref(rctx->viewport); + rctx->framebuffer = r600_context_state_decref(rctx->framebuffer); + radeon_ctx_fini(&rctx->ctx); FREE(rctx); } @@ -45,27 +61,35 @@ void r600_flush(struct pipe_context *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct r600_context *rctx = r600_context(ctx); - struct r600_screen *rscreen = rctx->screen; + struct r600_query *rquery; static int dc = 0; char dname[256]; - if (radeon_ctx_pm4(rctx->ctx)) - return; + /* suspend queries */ + r600_queries_suspend(ctx); /* FIXME dumping should be removed once shader support instructions * without throwing bad code */ - if (!rctx->ctx->cpm4) + if (!rctx->ctx.cdwords) goto out; +#if 0 sprintf(dname, "gallium-%08d.bof", dc); - if (dc < 1) - radeon_ctx_dump_bof(rctx->ctx, dname); + if (dc < 2) { + radeon_ctx_dump_bof(&rctx->ctx, dname); + R600_ERR("dumped %s\n", dname); + } +#endif #if 1 - radeon_ctx_submit(rctx->ctx); + radeon_ctx_submit(&rctx->ctx); #endif + LIST_FOR_EACH_ENTRY(rquery, &rctx->query_list, list) { + rquery->flushed = true; + } dc++; out: - rctx->ctx = radeon_ctx_decref(rctx->ctx); - rctx->ctx = radeon_ctx(rscreen->rw); + radeon_ctx_clear(&rctx->ctx); + /* resume queries */ + r600_queries_resume(ctx); } static void r600_init_config(struct 
r600_context *rctx) @@ -207,9 +231,9 @@ static void r600_init_config(struct r600_context *rctx) num_es_stack_entries = 0; break; } - rctx->hw_states.config = radeon_state(rctx->rw, R600_CONFIG_TYPE, R600_CONFIG); + radeon_state_init(&rctx->config, rctx->rw, R600_STATE_CONFIG, 0, 0); - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_CONFIG] = 0x00000000; switch (family) { case CHIP_RV610: case CHIP_RV620: @@ -218,75 +242,85 @@ static void r600_init_config(struct r600_context *rctx) case CHIP_RV710: break; default: - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VC_ENABLE(1); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VC_ENABLE(1); break; } - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_DX9_CONSTS(1); - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ALU_INST_PREFER_VECTOR(1); - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_PS_PRIO(ps_prio); - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VS_PRIO(vs_prio); - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_GS_PRIO(gs_prio); - rctx->hw_states.config->states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ES_PRIO(es_prio); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_DX9_CONSTS(1); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ALU_INST_PREFER_VECTOR(1); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_PS_PRIO(ps_prio); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_VS_PRIO(vs_prio); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_GS_PRIO(gs_prio); + rctx->config.states[R600_CONFIG__SQ_CONFIG] |= S_008C00_ES_PRIO(es_prio); + + rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] = 0; + rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_PS_GPRS(num_ps_gprs); + rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_VS_GPRS(num_vs_gprs); + 
rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs); - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] = 0; - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_PS_GPRS(num_ps_gprs); - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_VS_GPRS(num_vs_gprs); - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1] |= S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs); + rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] = 0; + rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_gs_gprs); + rctx->config.states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_es_gprs); - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] = 0; - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_gs_gprs); - rctx->hw_states.config->states[R600_CONFIG__SQ_GPR_RESOURCE_MGMT_2] |= S_008C08_NUM_GS_GPRS(num_es_gprs); + rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] = 0; + rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_PS_THREADS(num_ps_threads); + rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_VS_THREADS(num_vs_threads); + rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_GS_THREADS(num_gs_threads); + rctx->config.states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_ES_THREADS(num_es_threads); - rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] = 0; - rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_PS_THREADS(num_ps_threads); - rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_VS_THREADS(num_vs_threads); - rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_GS_THREADS(num_gs_threads); - 
rctx->hw_states.config->states[R600_CONFIG__SQ_THREAD_RESOURCE_MGMT] |= S_008C0C_NUM_ES_THREADS(num_es_threads); + rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] = 0; + rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_PS_STACK_ENTRIES(num_ps_stack_entries); + rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_VS_STACK_ENTRIES(num_vs_stack_entries); - rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] = 0; - rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_PS_STACK_ENTRIES(num_ps_stack_entries); - rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_1] |= S_008C10_NUM_VS_STACK_ENTRIES(num_vs_stack_entries); + rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] = 0; + rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_GS_STACK_ENTRIES(num_gs_stack_entries); + rctx->config.states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_ES_STACK_ENTRIES(num_es_stack_entries); - rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] = 0; - rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_GS_STACK_ENTRIES(num_gs_stack_entries); - rctx->hw_states.config->states[R600_CONFIG__SQ_STACK_RESOURCE_MGMT_2] |= S_008C14_NUM_ES_STACK_ENTRIES(num_es_stack_entries); + rctx->config.states[R600_CONFIG__VC_ENHANCE] = 0x00000000; + rctx->config.states[R600_CONFIG__SX_MISC] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00004000; - rctx->hw_states.config->states[R600_CONFIG__TA_CNTL_AUX] = 0x07000002; - rctx->hw_states.config->states[R600_CONFIG__VC_ENHANCE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__DB_DEBUG] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__DB_WATERMARKS] = 0x00420204; - rctx->hw_states.config->states[R600_CONFIG__SX_MISC] = 0x00000000; - 
rctx->hw_states.config->states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000001; - rctx->hw_states.config->states[R600_CONFIG__CB_SHADER_CONTROL] = 0x00000003; - rctx->hw_states.config->states[R600_CONFIG__SQ_ESGS_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_GSVS_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_ESTMP_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_GSTMP_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_VSTMP_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_PSTMP_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_FBUF_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_REDUC_RING_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__SQ_GS_VERT_ITEMSIZE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_OUTPUT_PATH_CNTL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_CNTL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_MAX_TESS_LEVEL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_MIN_TESS_LEVEL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_HOS_REUSE_DEPTH] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_PRIM_TYPE] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_FIRST_DECR] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_DECR] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_0_CNTL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_1_CNTL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_0_FMT_CNTL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GROUP_VECT_1_FMT_CNTL] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_GS_MODE] = 0x00000000; - 
rctx->hw_states.config->states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00514000; - rctx->hw_states.config->states[R600_CONFIG__VGT_STRMOUT_EN] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_REUSE_OFF] = 0x00000001; - rctx->hw_states.config->states[R600_CONFIG__VGT_VTX_CNT_EN] = 0x00000000; - rctx->hw_states.config->states[R600_CONFIG__VGT_STRMOUT_BUFFER_EN] = 0x00000000; - radeon_state_pm4(rctx->hw_states.config); + if (family >= CHIP_RV770) { + rctx->config.states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00004000; + rctx->config.states[R600_CONFIG__TA_CNTL_AUX] = 0x07000002; + rctx->config.states[R600_CONFIG__DB_DEBUG] = 0x00000000; + rctx->config.states[R600_CONFIG__DB_WATERMARKS] = 0x00420204; + rctx->config.states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000000; + rctx->config.states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00514000; + } else { + rctx->config.states[R600_CONFIG__SQ_DYN_GPR_CNTL_PS_FLUSH_REQ] = 0x00000000; + rctx->config.states[R600_CONFIG__TA_CNTL_AUX] = 0x07000003; + rctx->config.states[R600_CONFIG__DB_DEBUG] = 0x82000000; + rctx->config.states[R600_CONFIG__DB_WATERMARKS] = 0x01020204; + rctx->config.states[R600_CONFIG__SPI_THREAD_GROUPING] = 0x00000001; + rctx->config.states[R600_CONFIG__PA_SC_MODE_CNTL] = 0x00004010; + } + rctx->config.states[R600_CONFIG__CB_SHADER_CONTROL] = 0x00000003; + rctx->config.states[R600_CONFIG__SQ_ESGS_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_GSVS_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_ESTMP_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_GSTMP_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_VSTMP_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_PSTMP_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_FBUF_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_REDUC_RING_ITEMSIZE] = 0x00000000; + rctx->config.states[R600_CONFIG__SQ_GS_VERT_ITEMSIZE] = 0x00000000; + 
rctx->config.states[R600_CONFIG__VGT_OUTPUT_PATH_CNTL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_HOS_CNTL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_HOS_MAX_TESS_LEVEL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_HOS_MIN_TESS_LEVEL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_HOS_REUSE_DEPTH] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_PRIM_TYPE] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_FIRST_DECR] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_DECR] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_0_CNTL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_1_CNTL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_0_FMT_CNTL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GROUP_VECT_1_FMT_CNTL] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_GS_MODE] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_STRMOUT_EN] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_REUSE_OFF] = 0x00000001; + rctx->config.states[R600_CONFIG__VGT_VTX_CNT_EN] = 0x00000000; + rctx->config.states[R600_CONFIG__VGT_STRMOUT_BUFFER_EN] = 0x00000000; + radeon_state_pm4(&rctx->config); } struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv) @@ -320,7 +354,7 @@ struct pipe_context *r600_create_context(struct pipe_screen *screen, void *priv) r600_init_config(rctx); - rctx->ctx = radeon_ctx(rscreen->rw); - rctx->draw = radeon_draw(rscreen->rw); + radeon_ctx_init(&rctx->ctx, rscreen->rw); + radeon_draw_init(&rctx->draw, rscreen->rw); return &rctx->context; } diff --git a/src/gallium/drivers/r600/r600_context.h b/src/gallium/drivers/r600/r600_context.h index 76d5de86532..cea08130545 100644 --- a/src/gallium/drivers/r600/r600_context.h +++ b/src/gallium/drivers/r600/r600_context.h @@ -30,9 +30,32 @@ #include <tgsi/tgsi_parse.h> #include <tgsi/tgsi_util.h> #include <util/u_blitter.h> +#include <util/u_double_list.h> #include "radeon.h" 
#include "r600_shader.h" +#define R600_QUERY_STATE_STARTED (1 << 0) +#define R600_QUERY_STATE_ENDED (1 << 1) +#define R600_QUERY_STATE_SUSPENDED (1 << 2) + +struct r600_query { + u64 result; + /* The kind of query. Currently only OQ is supported. */ + unsigned type; + /* How many results have been written, in dwords. It's incremented + * after end_query and flush. */ + unsigned num_results; + /* if we've flushed the query */ + boolean flushed; + unsigned state; + /* The buffer where query results are stored. */ + struct radeon_bo *buffer; + unsigned buffer_size; + /* linked list of queries */ + struct list_head list; + struct radeon_state rstate; +}; + /* XXX move this to a more appropriate place */ union pipe_states { struct pipe_rasterizer_state rasterizer; @@ -72,13 +95,16 @@ enum pipe_state_type { pipe_type_count }; +#define R600_MAX_RSTATE 16 + struct r600_context_state { union pipe_states state; unsigned refcount; unsigned type; - struct radeon_state *rstate; + struct radeon_state rstate[R600_MAX_RSTATE]; struct r600_shader shader; struct radeon_bo *bo; + unsigned nrstate; }; struct r600_vertex_element @@ -89,28 +115,25 @@ struct r600_vertex_element }; struct r600_context_hw_states { - struct radeon_state *rasterizer; - struct radeon_state *scissor; - struct radeon_state *dsa; - struct radeon_state *blend; - struct radeon_state *viewport; - struct radeon_state *cb[8]; - struct radeon_state *config; - struct radeon_state *cb_cntl; - struct radeon_state *db; - unsigned ps_nresource; - unsigned ps_nsampler; - struct radeon_state *ps_resource[160]; - struct radeon_state *ps_sampler[16]; + struct radeon_state rasterizer; + struct radeon_state scissor; + struct radeon_state dsa; + struct radeon_state cb_cntl; }; struct r600_context { struct pipe_context context; struct r600_screen *screen; struct radeon *rw; - struct radeon_ctx *ctx; + struct radeon_ctx ctx; struct blitter_context *blitter; - struct radeon_draw *draw; + struct radeon_draw draw; + struct 
radeon_state config; + /* FIXME get rid of those vs_resource,vs/ps_constant */ + struct radeon_state vs_resource[160]; + unsigned vs_nresource; + struct radeon_state vs_constant[256]; + struct radeon_state ps_constant[256]; /* hw states */ struct r600_context_hw_states hw_states; /* pipe states */ @@ -134,14 +157,15 @@ struct r600_context { struct r600_context_state *stencil_ref; struct r600_context_state *viewport; struct r600_context_state *framebuffer; - struct r600_context_state *ps_sampler[PIPE_MAX_ATTRIBS]; - struct r600_context_state *vs_sampler[PIPE_MAX_ATTRIBS]; - struct r600_context_state *ps_sampler_view[PIPE_MAX_ATTRIBS]; - struct r600_context_state *vs_sampler_view[PIPE_MAX_ATTRIBS]; + struct radeon_state *ps_sampler[PIPE_MAX_ATTRIBS]; + struct radeon_state *vs_sampler[PIPE_MAX_ATTRIBS]; + struct radeon_state *ps_sampler_view[PIPE_MAX_ATTRIBS]; + struct radeon_state *vs_sampler_view[PIPE_MAX_ATTRIBS]; struct r600_vertex_element *vertex_elements; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_index_buffer index_buffer; - struct pipe_blend_color blend_color; + struct pipe_blend_color blend_color; + struct list_head query_list; }; /* Convenience cast wrapper. 
*/ @@ -150,13 +174,18 @@ static INLINE struct r600_context *r600_context(struct pipe_context *pipe) return (struct r600_context*)pipe; } +static INLINE struct r600_query* r600_query(struct pipe_query* q) +{ + return (struct r600_query*)q; +} + struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigned type, const void *state); struct r600_context_state *r600_context_state_incref(struct r600_context_state *rstate); struct r600_context_state *r600_context_state_decref(struct r600_context_state *rstate); void r600_flush(struct pipe_context *ctx, unsigned flags, struct pipe_fence_handle **fence); -int r600_context_hw_states(struct r600_context *rctx); +int r600_context_hw_states(struct pipe_context *ctx); void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info); @@ -178,4 +207,10 @@ extern int r600_pipe_shader_update(struct pipe_context *ctx, uint32_t r600_translate_texformat(enum pipe_format format, const unsigned char *swizzle_view, uint32_t *word4_p, uint32_t *yuv_format_p); + +/* query */ +extern void r600_queries_resume(struct pipe_context *ctx); +extern void r600_queries_suspend(struct pipe_context *ctx); + + #endif diff --git a/src/gallium/drivers/r600/r600_draw.c b/src/gallium/drivers/r600/r600_draw.c index f0584551620..fabd337d239 100644 --- a/src/gallium/drivers/r600/r600_draw.c +++ b/src/gallium/drivers/r600/r600_draw.c @@ -31,6 +31,7 @@ #include <util/u_math.h> #include <util/u_inlines.h> #include <util/u_memory.h> +#include "radeon.h" #include "r600_screen.h" #include "r600_context.h" #include "r600_resource.h" @@ -38,8 +39,8 @@ struct r600_draw { struct pipe_context *ctx; - struct radeon_state *draw; - struct radeon_state *vgt; + struct radeon_state draw; + struct radeon_state vgt; unsigned mode; unsigned start; unsigned count; @@ -51,6 +52,7 @@ static int r600_draw_common(struct r600_draw *draw) { struct r600_context *rctx = r600_context(draw->ctx); struct r600_screen *rscreen = rctx->screen; + /* FIXME 
vs_resource */ struct radeon_state *vs_resource; struct r600_resource *rbuffer; unsigned i, j, offset, format, prim; @@ -58,7 +60,7 @@ static int r600_draw_common(struct r600_draw *draw) struct pipe_vertex_buffer *vertex_buffer; int r; - r = r600_context_hw_states(rctx); + r = r600_context_hw_states(draw->ctx); if (r) return r; switch (draw->index_size) { @@ -81,6 +83,7 @@ static int r600_draw_common(struct r600_draw *draw) r = r600_conv_pipe_prim(draw->mode, &prim); if (r) return r; + /* rebuild vertex shader if input format changed */ r = r600_pipe_shader_update(draw->ctx, rctx->vs_shader); if (r) @@ -88,26 +91,24 @@ static int r600_draw_common(struct r600_draw *draw) r = r600_pipe_shader_update(draw->ctx, rctx->ps_shader); if (r) return r; - r = radeon_draw_set(rctx->draw, rctx->vs_shader->rstate); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->ps_shader->rstate); - if (r) - return r; + radeon_draw_bind(&rctx->draw, &rctx->vs_shader->rstate[0]); + radeon_draw_bind(&rctx->draw, &rctx->ps_shader->rstate[0]); + for (i = 0 ; i < rctx->vs_nresource; i++) { + radeon_state_fini(&rctx->vs_resource[i]); + } for (i = 0 ; i < rctx->vertex_elements->count; i++) { + vs_resource = &rctx->vs_resource[i]; j = rctx->vertex_elements->elements[i].vertex_buffer_index; vertex_buffer = &rctx->vertex_buffer[j]; rbuffer = (struct r600_resource*)vertex_buffer->buffer; offset = rctx->vertex_elements->elements[i].src_offset + vertex_buffer->buffer_offset; format = r600_translate_colorformat(rctx->vertex_elements->elements[i].src_format); - vs_resource = radeon_state(rscreen->rw, R600_VS_RESOURCE_TYPE, R600_VS_RESOURCE + i); - if (vs_resource == NULL) - return -ENOMEM; + radeon_state_init(vs_resource, rscreen->rw, R600_STATE_RESOURCE, i, R600_SHADER_VS); vs_resource->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); vs_resource->nbo = 1; vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD0] = offset; - vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD1] = 
rbuffer->bo->size - offset; + vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD1] = rbuffer->bo->size - offset - 1; vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = S_038008_STRIDE(vertex_buffer->stride) | S_038008_DATA_FORMAT(format); vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD3] = 0x00000000; @@ -116,59 +117,61 @@ static int r600_draw_common(struct r600_draw *draw) vs_resource->states[R600_PS_RESOURCE__RESOURCE0_WORD6] = 0xC0000000; vs_resource->placement[0] = RADEON_GEM_DOMAIN_GTT; vs_resource->placement[1] = RADEON_GEM_DOMAIN_GTT; - r = radeon_draw_set_new(rctx->draw, vs_resource); - if (r) + r = radeon_state_pm4(vs_resource); + if (r) { return r; + } + radeon_draw_bind(&rctx->draw, vs_resource); } + rctx->vs_nresource = rctx->vertex_elements->count; /* FIXME start need to change winsys */ - draw->draw = radeon_state(rscreen->rw, R600_DRAW_TYPE, R600_DRAW); - if (draw->draw == NULL) - return -ENOMEM; - draw->draw->states[R600_DRAW__VGT_NUM_INDICES] = draw->count; - draw->draw->states[R600_DRAW__VGT_DRAW_INITIATOR] = vgt_draw_initiator; + radeon_state_init(&draw->draw, rscreen->rw, R600_STATE_DRAW, 0, 0); + draw->draw.states[R600_DRAW__VGT_NUM_INDICES] = draw->count; + draw->draw.states[R600_DRAW__VGT_DRAW_INITIATOR] = vgt_draw_initiator; if (draw->index_buffer) { rbuffer = (struct r600_resource*)draw->index_buffer; - draw->draw->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); - draw->draw->placement[0] = RADEON_GEM_DOMAIN_GTT; - draw->draw->placement[1] = RADEON_GEM_DOMAIN_GTT; - draw->draw->nbo = 1; + draw->draw.bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + draw->draw.placement[0] = RADEON_GEM_DOMAIN_GTT; + draw->draw.placement[1] = RADEON_GEM_DOMAIN_GTT; + draw->draw.nbo = 1; } - r = radeon_draw_set_new(rctx->draw, draw->draw); - if (r) + r = radeon_state_pm4(&draw->draw); + if (r) { return r; - draw->vgt = radeon_state(rscreen->rw, R600_VGT_TYPE, R600_VGT); - if (draw->vgt == NULL) - return -ENOMEM; - 
draw->vgt->states[R600_VGT__VGT_PRIMITIVE_TYPE] = prim; - draw->vgt->states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF; - draw->vgt->states[R600_VGT__VGT_MIN_VTX_INDX] = 0x00000000; - draw->vgt->states[R600_VGT__VGT_INDX_OFFSET] = draw->start; - draw->vgt->states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_INDX] = 0x00000000; - draw->vgt->states[R600_VGT__VGT_DMA_INDEX_TYPE] = vgt_dma_index_type; - draw->vgt->states[R600_VGT__VGT_PRIMITIVEID_EN] = 0x00000000; - draw->vgt->states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001; - draw->vgt->states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_EN] = 0x00000000; - draw->vgt->states[R600_VGT__VGT_INSTANCE_STEP_RATE_0] = 0x00000000; - draw->vgt->states[R600_VGT__VGT_INSTANCE_STEP_RATE_1] = 0x00000000; - r = radeon_draw_set_new(rctx->draw, draw->vgt); - if (r) + } + radeon_draw_bind(&rctx->draw, &draw->draw); + + radeon_state_init(&draw->vgt, rscreen->rw, R600_STATE_VGT, 0, 0); + draw->vgt.states[R600_VGT__VGT_PRIMITIVE_TYPE] = prim; + draw->vgt.states[R600_VGT__VGT_MAX_VTX_INDX] = 0x00FFFFFF; + draw->vgt.states[R600_VGT__VGT_MIN_VTX_INDX] = 0x00000000; + draw->vgt.states[R600_VGT__VGT_INDX_OFFSET] = draw->start; + draw->vgt.states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_INDX] = 0x00000000; + draw->vgt.states[R600_VGT__VGT_DMA_INDEX_TYPE] = vgt_dma_index_type; + draw->vgt.states[R600_VGT__VGT_PRIMITIVEID_EN] = 0x00000000; + draw->vgt.states[R600_VGT__VGT_DMA_NUM_INSTANCES] = 0x00000001; + draw->vgt.states[R600_VGT__VGT_MULTI_PRIM_IB_RESET_EN] = 0x00000000; + draw->vgt.states[R600_VGT__VGT_INSTANCE_STEP_RATE_0] = 0x00000000; + draw->vgt.states[R600_VGT__VGT_INSTANCE_STEP_RATE_1] = 0x00000000; + r = radeon_state_pm4(&draw->vgt); + if (r) { return r; - /* FIXME */ - r = radeon_ctx_set_draw_new(rctx->ctx, rctx->draw); + } + radeon_draw_bind(&rctx->draw, &draw->vgt); + + r = radeon_ctx_set_draw(&rctx->ctx, &rctx->draw); if (r == -EBUSY) { r600_flush(draw->ctx, 0, NULL); - r = radeon_ctx_set_draw_new(rctx->ctx, rctx->draw); + r = 
radeon_ctx_set_draw(&rctx->ctx, &rctx->draw); } - if (r) - return r; - rctx->draw = radeon_draw_duplicate(rctx->draw); - return 0; + return r; } void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct r600_context *rctx = r600_context(ctx); struct r600_draw draw; + int r; assert(info->index_bias == 0); @@ -189,5 +192,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) draw.index_size = 0; draw.index_buffer = NULL; } - r600_draw_common(&draw); + r = r600_draw_common(&draw); + if (r) + fprintf(stderr,"draw common failed %d\n", r); } diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index 9b02ae680e7..530940ed843 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -24,39 +24,225 @@ * Jerome Glisse * Corbin Simpson */ +#include <errno.h> #include <util/u_inlines.h> #include <util/u_format.h> #include <util/u_memory.h> #include "r600_screen.h" #include "r600_context.h" -static struct pipe_query *r600_create_query(struct pipe_context *pipe, unsigned query_type) +static void r600_query_begin(struct r600_context *rctx, struct r600_query *rquery) { - return NULL; + struct r600_screen *rscreen = rctx->screen; + struct radeon_state *rstate = &rquery->rstate; + + radeon_state_fini(rstate); + radeon_state_init(rstate, rscreen->rw, R600_STATE_QUERY_BEGIN, 0, 0); + rstate->states[R600_QUERY__OFFSET] = rquery->num_results; + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rquery->buffer); + rstate->nbo = 1; + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + if (radeon_state_pm4(rstate)) { + radeon_state_fini(rstate); + } +} + +static void r600_query_end(struct r600_context *rctx, struct r600_query *rquery) +{ + struct r600_screen *rscreen = rctx->screen; + struct radeon_state *rstate = &rquery->rstate; + + radeon_state_fini(rstate); + radeon_state_init(rstate, rscreen->rw, R600_STATE_QUERY_END, 0, 0); + 
rstate->states[R600_QUERY__OFFSET] = rquery->num_results + 8; + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rquery->buffer); + rstate->nbo = 1; + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + if (radeon_state_pm4(rstate)) { + radeon_state_fini(rstate); + } } -static void r600_destroy_query(struct pipe_context *pipe, struct pipe_query *query) +static struct pipe_query *r600_create_query(struct pipe_context *ctx, unsigned query_type) { + struct r600_screen *rscreen = r600_screen(ctx->screen); + struct r600_context *rctx = r600_context(ctx); + struct r600_query *q; + + if (query_type != PIPE_QUERY_OCCLUSION_COUNTER) + return NULL; + + q = CALLOC_STRUCT(r600_query); + if (!q) + return NULL; + + q->type = query_type; + LIST_ADDTAIL(&q->list, &rctx->query_list); + q->buffer_size = 4096; + + q->buffer = radeon_bo(rscreen->rw, 0, q->buffer_size, 1, NULL); + if (!q->buffer) { + FREE(q); + return NULL; + } + return (struct pipe_query *)q; +} + +static void r600_destroy_query(struct pipe_context *ctx, + struct pipe_query *query) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + struct r600_query *q = r600_query(query); + + radeon_bo_decref(rscreen->rw, q->buffer); + LIST_DEL(&q->list); FREE(query); } -static void r600_begin_query(struct pipe_context *pipe, struct pipe_query *query) +static void r600_query_result(struct pipe_context *ctx, struct r600_query *rquery) { + struct r600_screen *rscreen = r600_screen(ctx->screen); + u64 start, end; + u32 *results; + int i; + + radeon_bo_wait(rscreen->rw, rquery->buffer); + radeon_bo_map(rscreen->rw, rquery->buffer); + results = rquery->buffer->data; + for (i = 0; i < rquery->num_results; i += 4) { + start = (u64)results[i] | (u64)results[i + 1] << 32; + end = (u64)results[i + 2] | (u64)results[i + 3] << 32; + if ((start & 0x8000000000000000UL) && (end & 0x8000000000000000UL)) { + rquery->result += end - start; + } + } + radeon_bo_unmap(rscreen->rw, rquery->buffer); + rquery->num_results = 0; } -static void 
r600_end_query(struct pipe_context *pipe, struct pipe_query *query) +static void r600_query_resume(struct pipe_context *ctx, struct r600_query *rquery) { + struct r600_context *rctx = r600_context(ctx); + + if (rquery->num_results >= ((rquery->buffer_size >> 2) - 2)) { + /* running out of space */ + if (!rquery->flushed) { + ctx->flush(ctx, 0, NULL); + } + r600_query_result(ctx, rquery); + } + r600_query_begin(rctx, rquery); + rquery->flushed = false; +} + +static void r600_query_suspend(struct pipe_context *ctx, struct r600_query *rquery) +{ + struct r600_context *rctx = r600_context(ctx); + + r600_query_end(rctx, rquery); + rquery->num_results += 16; } -static boolean r600_get_query_result(struct pipe_context *pipe, +static void r600_begin_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_context *rctx = r600_context(ctx); + struct r600_query *rquery = r600_query(query); + int r; + + rquery->state = R600_QUERY_STATE_STARTED; + rquery->num_results = 0; + rquery->flushed = false; + r600_query_resume(ctx, rquery); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + if (r == -EBUSY) { + /* this shouldn't happen */ + R600_ERR("had to flush while emitting end query\n"); + ctx->flush(ctx, 0, NULL); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + } +} + +static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) +{ + struct r600_context *rctx = r600_context(ctx); + struct r600_query *rquery = r600_query(query); + int r; + + rquery->state &= ~R600_QUERY_STATE_STARTED; + rquery->state |= R600_QUERY_STATE_ENDED; + r600_query_suspend(ctx, rquery); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + if (r == -EBUSY) { + /* this shouldn't happen */ + R600_ERR("had to flush while emitting end query\n"); + ctx->flush(ctx, 0, NULL); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + } +} + +void r600_queries_suspend(struct pipe_context *ctx) +{ + struct r600_context *rctx = 
r600_context(ctx); + struct r600_query *rquery; + int r; + + LIST_FOR_EACH_ENTRY(rquery, &rctx->query_list, list) { + if (rquery->state & R600_QUERY_STATE_STARTED) { + r600_query_suspend(ctx, rquery); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + if (r == -EBUSY) { + /* this shouldn't happen */ + R600_ERR("had to flush while emitting end query\n"); + ctx->flush(ctx, 0, NULL); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + } + } + rquery->state |= R600_QUERY_STATE_SUSPENDED; + } +} + +void r600_queries_resume(struct pipe_context *ctx) +{ + struct r600_context *rctx = r600_context(ctx); + struct r600_query *rquery; + int r; + + LIST_FOR_EACH_ENTRY(rquery, &rctx->query_list, list) { + if (rquery->state & R600_QUERY_STATE_STARTED) { + r600_query_resume(ctx, rquery); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + if (r == -EBUSY) { + /* this shouldn't happen */ + R600_ERR("had to flush while emitting end query\n"); + ctx->flush(ctx, 0, NULL); + r = radeon_ctx_set_query_state(&rctx->ctx, &rquery->rstate); + } + } + rquery->state &= ~R600_QUERY_STATE_SUSPENDED; + } +} + +static boolean r600_get_query_result(struct pipe_context *ctx, struct pipe_query *query, - boolean wait, void *result) + boolean wait, void *vresult) { + struct r600_query *rquery = r600_query(query); + uint64_t *result = (uint64_t*)vresult; + + if (!rquery->flushed) { + ctx->flush(ctx, 0, NULL); + rquery->flushed = true; + } + r600_query_result(ctx, rquery); + *result = rquery->result; + rquery->result = 0; return TRUE; } void r600_init_query_functions(struct r600_context* rctx) { + LIST_INITHEAD(&rctx->query_list); + rctx->context.create_query = r600_create_query; rctx->context.destroy_query = r600_destroy_query; rctx->context.begin_query = r600_begin_query; diff --git a/src/gallium/drivers/r600/r600_resource.h b/src/gallium/drivers/r600/r600_resource.h index bb90e76fb78..129667ad20f 100644 --- a/src/gallium/drivers/r600/r600_resource.h +++ 
b/src/gallium/drivers/r600/r600_resource.h @@ -44,10 +44,22 @@ struct r600_resource_texture { struct r600_resource resource; unsigned long offset[PIPE_MAX_TEXTURE_LEVELS]; unsigned long pitch[PIPE_MAX_TEXTURE_LEVELS]; + unsigned long width[PIPE_MAX_TEXTURE_LEVELS]; + unsigned long height[PIPE_MAX_TEXTURE_LEVELS]; unsigned long layer_size[PIPE_MAX_TEXTURE_LEVELS]; unsigned long pitch_override; unsigned long bpt; unsigned long size; + unsigned tilled; + unsigned array_mode; + unsigned tile_type; + unsigned depth; + unsigned dirty; + struct radeon_bo *uncompressed; + struct radeon_state scissor[PIPE_MAX_TEXTURE_LEVELS]; + struct radeon_state cb[8][PIPE_MAX_TEXTURE_LEVELS]; + struct radeon_state db[PIPE_MAX_TEXTURE_LEVELS]; + struct radeon_state viewport[PIPE_MAX_TEXTURE_LEVELS]; }; void r600_init_context_resource_functions(struct r600_context *r600); diff --git a/src/gallium/drivers/r600/r600_screen.c b/src/gallium/drivers/r600/r600_screen.c index cdaca9ed7db..a047a49a6c5 100644 --- a/src/gallium/drivers/r600/r600_screen.c +++ b/src/gallium/drivers/r600/r600_screen.c @@ -69,6 +69,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_SWIZZLE: case PIPE_CAP_INDEP_BLEND_ENABLE: case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE: + case PIPE_CAP_DEPTH_CLAMP: return 1; /* Unsupported features (boolean caps). */ @@ -77,7 +78,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_STREAM_OUTPUT: case PIPE_CAP_INDEP_BLEND_FUNC: /* FIXME allow this */ case PIPE_CAP_GEOMETRY_SHADER4: - case PIPE_CAP_DEPTH_CLAMP: /* FIXME allow this */ return 0; /* Texturing. 
*/ @@ -234,11 +234,34 @@ static void r600_destroy_screen(struct pipe_screen* pscreen) struct pipe_screen *r600_screen_create(struct radeon *rw) { struct r600_screen* rscreen; + enum radeon_family family = radeon_get_family(rw); rscreen = CALLOC_STRUCT(r600_screen); if (rscreen == NULL) { return NULL; } + + switch (family) { + case CHIP_R600: + case CHIP_RV610: + case CHIP_RV630: + case CHIP_RV670: + case CHIP_RV620: + case CHIP_RV635: + case CHIP_RS780: + case CHIP_RS880: + rscreen->chip_class = R600; + break; + case CHIP_RV770: + case CHIP_RV730: + case CHIP_RV710: + case CHIP_RV740: + rscreen->chip_class = R700; + break; + default: + FREE(rscreen); + return NULL; + } rscreen->rw = rw; rscreen->screen.winsys = (struct pipe_winsys*)rw; rscreen->screen.destroy = r600_destroy_screen; diff --git a/src/gallium/drivers/r600/r600_screen.h b/src/gallium/drivers/r600/r600_screen.h index 53b560c617f..b9938f117a8 100644 --- a/src/gallium/drivers/r600/r600_screen.h +++ b/src/gallium/drivers/r600/r600_screen.h @@ -30,6 +30,7 @@ #include <radeon_drm.h> #include "radeon.h" #include "util/u_transfer.h" +#include "r600_resource.h" /* Texture transfer. */ struct r600_transfer { @@ -38,11 +39,19 @@ struct r600_transfer { /* Buffer transfer. */ struct pipe_transfer *buffer_transfer; unsigned offset; + struct pipe_resource *linear_texture; +}; + +enum chip_class { + R600, + R700, + EVERGREEN, }; struct r600_screen { struct pipe_screen screen; struct radeon *rw; + enum chip_class chip_class; }; static INLINE struct r600_screen *r600_screen(struct pipe_screen *screen) @@ -62,7 +71,7 @@ unsigned r600_buffer_is_referenced_by_cs(struct pipe_context *context, struct pipe_resource *r600_buffer_from_handle(struct pipe_screen *screen, struct winsys_handle *whandle); -/* Texture transfer functions. */ +/* r600_texture.c texture transfer functions. 
*/ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx, struct pipe_resource *texture, struct pipe_subresource sr, @@ -74,7 +83,14 @@ void* r600_texture_transfer_map(struct pipe_context *ctx, struct pipe_transfer* transfer); void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer); +int r600_texture_scissor(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level); +int r600_texture_cb(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned cb, unsigned level); +int r600_texture_db(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level); +int r600_texture_from_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level); +int r600_texture_viewport(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level); +/* r600_blit.c */ +int r600_blit_uncompress_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level); /* helpers */ int r600_conv_pipe_format(unsigned pformat, unsigned *format); diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 956c7e7930c..0ba26a23112 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -48,6 +48,9 @@ struct r600_shader_ctx { struct r600_bc *bc; struct r600_shader *shader; u32 value[4]; + u32 *literals; + u32 nliterals; + u32 max_driver_temp_used; }; struct r600_shader_tgsi_instruction { @@ -105,8 +108,8 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_screen *rscreen = r600_screen(ctx->screen); int r; -fprintf(stderr, "--------------------------------------------------------------\n"); -tgsi_dump(tokens, 0); +//fprintf(stderr, "--------------------------------------------------------------\n"); +//tgsi_dump(tokens, 0); if (rpshader == NULL) return -ENOMEM; rpshader->shader.family = radeon_get_family(rscreen->rw); @@ -120,7 
+123,7 @@ tgsi_dump(tokens, 0); R600_ERR("building bytecode failed !\n"); return r; } -fprintf(stderr, "______________________________________________________________\n"); +//fprintf(stderr, "______________________________________________________________\n"); return 0; } @@ -131,10 +134,9 @@ static int r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_context_sta struct radeon_state *state; unsigned i, tmp; - rpshader->rstate = radeon_state_decref(rpshader->rstate); - state = radeon_state(rscreen->rw, R600_VS_SHADER_TYPE, R600_VS_SHADER); - if (state == NULL) - return -ENOMEM; + state = &rpshader->rstate[0]; + radeon_state_fini(&rpshader->rstate[0]); + radeon_state_init(state, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_VS); for (i = 0; i < 10; i++) { state->states[R600_VS_SHADER__SPI_VS_OUT_ID_0 + i] = 0; } @@ -144,12 +146,13 @@ static int r600_pipe_shader_vs(struct pipe_context *ctx, struct r600_context_sta state->states[R600_VS_SHADER__SPI_VS_OUT_ID_0 + i / 4] |= tmp; } state->states[R600_VS_SHADER__SPI_VS_OUT_CONFIG] = S_0286C4_VS_EXPORT_COUNT(rshader->noutput - 2); - state->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = S_028868_NUM_GPRS(rshader->bc.ngpr); - rpshader->rstate = state; - rpshader->rstate->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo); - rpshader->rstate->bo[1] = radeon_bo_incref(rscreen->rw, rpshader->bo); - rpshader->rstate->nbo = 2; - rpshader->rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + state->states[R600_VS_SHADER__SQ_PGM_RESOURCES_VS] = S_028868_NUM_GPRS(rshader->bc.ngpr) | + S_028868_STACK_SIZE(rshader->bc.nstack); + state->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo); + state->bo[1] = radeon_bo_incref(rscreen->rw, rpshader->bo); + state->nbo = 2; + state->placement[0] = RADEON_GEM_DOMAIN_GTT; + state->placement[2] = RADEON_GEM_DOMAIN_GTT; return radeon_state_pm4(state); } @@ -161,17 +164,20 @@ static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_context_sta struct r600_context *rctx = 
r600_context(ctx); struct radeon_state *state; unsigned i, tmp, exports_ps, num_cout; + boolean have_pos = FALSE; + state = &rpshader->rstate[0]; rasterizer = &rctx->rasterizer->state.rasterizer; - rpshader->rstate = radeon_state_decref(rpshader->rstate); - state = radeon_state(rscreen->rw, R600_PS_SHADER_TYPE, R600_PS_SHADER); - if (state == NULL) - return -ENOMEM; + radeon_state_fini(state); + radeon_state_init(state, rscreen->rw, R600_STATE_SHADER, 0, R600_SHADER_PS); for (i = 0; i < rshader->ninput; i++) { tmp = S_028644_SEMANTIC(i); tmp |= S_028644_SEL_CENTROID(1); + if (rshader->input[i].name == TGSI_SEMANTIC_POSITION) + have_pos = TRUE; if (rshader->input[i].name == TGSI_SEMANTIC_COLOR || - rshader->input[i].name == TGSI_SEMANTIC_BCOLOR) { + rshader->input[i].name == TGSI_SEMANTIC_BCOLOR || + rshader->input[i].name == TGSI_SEMANTIC_POSITION) { tmp |= S_028644_FLAT_SHADE(rshader->flat_shade); } if (rasterizer->sprite_coord_enable & (1 << i)) { @@ -190,15 +196,24 @@ static int r600_pipe_shader_ps(struct pipe_context *ctx, struct r600_context_sta num_cout++; } } + if (!exports_ps) { + /* always at least export 1 component per pixel */ + exports_ps = 2; + } state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] = S_0286CC_NUM_INTERP(rshader->ninput) | S_0286CC_PERSP_GRADIENT_ENA(1); + if (have_pos) { + state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_0] |= S_0286CC_POSITION_ENA(1) | + S_0286CC_BARYC_SAMPLE_CNTL(1); + state->states[R600_PS_SHADER__SPI_INPUT_Z] |= 1; + } state->states[R600_PS_SHADER__SPI_PS_IN_CONTROL_1] = 0x00000000; - state->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = S_028868_NUM_GPRS(rshader->bc.ngpr); + state->states[R600_PS_SHADER__SQ_PGM_RESOURCES_PS] = S_028868_NUM_GPRS(rshader->bc.ngpr) | + S_028868_STACK_SIZE(rshader->bc.nstack); state->states[R600_PS_SHADER__SQ_PGM_EXPORTS_PS] = exports_ps; - rpshader->rstate = state; - rpshader->rstate->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo); - rpshader->rstate->nbo = 1; - 
rpshader->rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + state->bo[0] = radeon_bo_incref(rscreen->rw, rpshader->bo); + state->nbo = 1; + state->placement[0] = RADEON_GEM_DOMAIN_GTT; return radeon_state_pm4(state); } @@ -268,21 +283,24 @@ static int tgsi_is_supported(struct r600_shader_ctx *ctx) R600_ERR("predicate unsupported\n"); return -EINVAL; } +#if 0 if (i->Instruction.Label) { R600_ERR("label unsupported\n"); return -EINVAL; } +#endif for (j = 0; j < i->Instruction.NumSrcRegs; j++) { - if (i->Src[j].Register.Indirect || - i->Src[j].Register.Dimension || + if (i->Src[j].Register.Dimension || i->Src[j].Register.Absolute) { - R600_ERR("unsupported src (indirect|dimension|absolute)\n"); + R600_ERR("unsupported src %d (dimension %d|absolute %d)\n", j, + i->Src[j].Register.Dimension, + i->Src[j].Register.Absolute); return -EINVAL; } } for (j = 0; j < i->Instruction.NumDstRegs; j++) { - if (i->Dst[j].Register.Indirect || i->Dst[j].Register.Dimension) { - R600_ERR("unsupported dst (indirect|dimension)\n"); + if (i->Dst[j].Register.Dimension) { + R600_ERR("unsupported dst (dimension)\n"); return -EINVAL; } } @@ -333,6 +351,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) case TGSI_FILE_CONSTANT: case TGSI_FILE_TEMPORARY: case TGSI_FILE_SAMPLER: + case TGSI_FILE_ADDRESS: break; default: R600_ERR("unsupported file %d declaration\n", d->Declaration.File); @@ -341,6 +360,11 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) return 0; } +static int r600_get_temp(struct r600_shader_ctx *ctx) +{ + return ctx->temp_reg + ctx->max_driver_temp_used++; +} + int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *shader) { struct tgsi_full_immediate *immediate; @@ -362,9 +386,15 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s shader->processor_type = ctx.type; /* register allocations */ - /* Values [0,127] correspond to GPR[0..127]. - * Values [256,511] correspond to cfile constants c[0..255]. 
+ /* Values [0,127] correspond to GPR[0..127]. + * Values [128,159] correspond to constant buffer bank 0 + * Values [160,191] correspond to constant buffer bank 1 + * Values [256,511] correspond to cfile constants c[0..255]. * Other special values are shown in the list below. + * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+) + * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) + * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) + * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) * 248 SQ_ALU_SRC_0: special constant 0.0. * 249 SQ_ALU_SRC_1: special constant 1.0 float. * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. @@ -389,15 +419,24 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s ctx.temp_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] + ctx.info.file_count[TGSI_FILE_TEMPORARY]; + ctx.nliterals = 0; + ctx.literals = NULL; + while (!tgsi_parse_end_of_tokens(&ctx.parse)) { tgsi_parse_token(&ctx.parse); switch (ctx.parse.FullToken.Token.Type) { case TGSI_TOKEN_TYPE_IMMEDIATE: immediate = &ctx.parse.FullToken.FullImmediate; - ctx.value[0] = immediate->u[0].Uint; - ctx.value[1] = immediate->u[1].Uint; - ctx.value[2] = immediate->u[2].Uint; - ctx.value[3] = immediate->u[3].Uint; + ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16); + if(ctx.literals == NULL) { + r = -ENOMEM; + goto out_err; + } + ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint; + ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint; + ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint; + ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint; + ctx.nliterals++; break; case TGSI_TOKEN_TYPE_DECLARATION: r = tgsi_declaration(&ctx); @@ -408,6 +447,9 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s r = tgsi_is_supported(&ctx); if (r) goto out_err; + ctx.max_driver_temp_used = 0; + /* reserve first tmp for 
everyone */ + r600_get_temp(&ctx); opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; r = ctx.inst_info->process(&ctx); @@ -458,6 +500,8 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) { output[i].array_base = 61; + output[i].swizzle_x = 2; + output[i].swizzle_y = output[i].swizzle_z = output[i].swizzle_w = 7; output[i].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; } else { R600_ERR("unsupported fragment output name %d\n", shader->output[i].name); @@ -504,7 +548,7 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s output[0].swizzle_z = 7; output[0].swizzle_w = 7; output[0].barrier = 1; - output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM; + output[0].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL; output[0].array_base = 0; output[0].inst = V_SQ_CF_ALLOC_EXPORT_WORD1_SQ_CF_INST_EXPORT; noutput++; @@ -525,9 +569,11 @@ int r600_shader_from_tgsi(const struct tgsi_token *tokens, struct r600_shader *s if (r) goto out_err; } + free(ctx.literals); tgsi_parse_free(&ctx.parse); return 0; out_err: + free(ctx.literals); tgsi_parse_free(&ctx.parse); return r; } @@ -547,11 +593,19 @@ static int tgsi_src(struct r600_shader_ctx *ctx, const struct tgsi_full_src_register *tgsi_src, struct r600_bc_alu_src *r600_src) { + int index; memset(r600_src, 0, sizeof(struct r600_bc_alu_src)); r600_src->sel = tgsi_src->Register.Index; if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) { r600_src->sel = 0; + index = tgsi_src->Register.Index; + ctx->value[0] = ctx->literals[index * 4 + 0]; + ctx->value[1] = ctx->literals[index * 4 + 1]; + ctx->value[2] = ctx->literals[index * 4 + 2]; + ctx->value[3] = ctx->literals[index * 4 + 3]; } + if (tgsi_src->Register.Indirect) + r600_src->rel = V_SQ_REL_RELATIVE; r600_src->neg = 
tgsi_src->Register.Negate; r600_src->sel += ctx->file_offset[tgsi_src->Register.File]; return 0; @@ -568,6 +622,8 @@ static int tgsi_dst(struct r600_shader_ctx *ctx, r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File]; r600_dst->chan = swizzle; r600_dst->write = 1; + if (tgsi_dst->Register.Indirect) + r600_dst->rel = V_SQ_REL_RELATIVE; if (inst->Instruction.Saturate) { r600_dst->clamp = 1; } @@ -607,12 +663,13 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_s } for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) { if (inst->Src[j].Register.File == TGSI_FILE_CONSTANT && j > 0) { + int treg = r600_get_temp(ctx); for (k = 0; k < 4; k++) { memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; - alu.src[0].sel = r600_src[0].sel; + alu.src[0].sel = r600_src[j].sel; alu.src[0].chan = k; - alu.dst.sel = ctx->temp_reg + j; + alu.dst.sel = treg; alu.dst.chan = k; alu.dst.write = 1; if (k == 3) @@ -621,37 +678,90 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_s if (r) return r; } - r600_src[0].sel = ctx->temp_reg + j; + r600_src[j].sel = treg; j--; } } return 0; } -static int tgsi_op2(struct r600_shader_ctx *ctx) +/* need to move any immediate into a temp - for trig functions which use literal for PI stuff */ +static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx, struct r600_bc_alu_src r600_src[3]) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int i, j, k, nliteral, r; + + for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) { + if (inst->Src[i].Register.File == TGSI_FILE_IMMEDIATE) { + nliteral++; + } + } + for (i = 0, j = 0; i < inst->Instruction.NumSrcRegs; i++) { + if (inst->Src[j].Register.File == TGSI_FILE_IMMEDIATE) { + int treg = r600_get_temp(ctx); + for (k = 0; k < 4; k++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + alu.src[0].sel = r600_src[j].sel; + alu.src[0].chan = k; + alu.dst.sel = treg; + alu.dst.chan = k; + alu.dst.write = 1; + if (k == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + r600_src[j].sel = treg; + j++; + } + } + return 0; +} + +static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bc_alu_src r600_src[3]; struct r600_bc_alu alu; int i, j, r; + int lasti = 0; + + for (i = 0; i < 4; i++) { + if (inst->Dst[0].Register.WriteMask & (1 << i)) { + lasti = i; + } + } r = tgsi_split_constant(ctx, r600_src); if (r) return r; - for (i = 0; i < 4; i++) { + for (i = 0; i < lasti + 1; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bc_alu)); - if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP; - alu.dst.chan = i; - } else { - alu.inst = ctx->inst_info->r600_opcode; + r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + if (r) + return r; + + alu.inst = ctx->inst_info->r600_opcode; + if (!swap) { for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { alu.src[j] = r600_src[j]; alu.src[j].chan = tgsi_chan(&inst->Src[j], i); } - r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); - if (r) - return r; + } else { + alu.src[0] = r600_src[1]; + alu.src[0].chan = tgsi_chan(&inst->Src[1], i); + + alu.src[1] = r600_src[0]; + alu.src[1].chan = tgsi_chan(&inst->Src[0], i); } /* handle some special cases */ switch (ctx->inst_info->tgsi_opcode) { @@ -664,7 +774,7 @@ static int tgsi_op2(struct r600_shader_ctx *ctx) default: break; } - if (i == 3) { + if (i == lasti) { alu.last = 1; } r = r600_bc_add_alu(ctx->bc, &alu); @@ -674,24 +784,154 @@ static int tgsi_op2(struct r600_shader_ctx *ctx) return 0; } -static int tgsi_kill(struct r600_shader_ctx *ctx) 
+static int tgsi_op2(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_s(ctx, 0); +} + +static int tgsi_op2_swap(struct r600_shader_ctx *ctx) +{ + return tgsi_op2_s(ctx, 1); +} + +/* + * r600 - trunc to -PI..PI range + * r700 - normalize by dividing by 2PI + * see fdo bug 27901 + */ +static int tgsi_setup_trig(struct r600_shader_ctx *ctx, + struct r600_bc_alu_src r600_src[3]) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + int r; + uint32_t lit_vals[4]; + struct r600_bc_alu alu; + + memset(lit_vals, 0, 4*4); + r = tgsi_split_constant(ctx, r600_src); + if (r) + return r; + + r = tgsi_split_literal_constant(ctx, r600_src); + if (r) + return r; + + lit_vals[0] = fui(1.0 /(3.1415926535 * 2)); + lit_vals[1] = fui(0.5f); + + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD; + alu.is_op3 = 1; + + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + + alu.src[0] = r600_src[0]; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].chan = 0; + alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[2].chan = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc, lit_vals); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT; + + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + if (ctx->bc->chiprev == 0) { + lit_vals[0] = fui(3.1415926535897f * 2.0f); + lit_vals[1] = fui(-3.1415926535897f); + } else { + lit_vals[0] = fui(1.0f); + lit_vals[1] = fui(-0.5f); + } + + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD; + alu.is_op3 = 1; + + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + + 
alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + + alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[1].chan = 0; + alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[2].chan = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc, lit_vals); + if (r) + return r; + return 0; +} + +static int tgsi_trig(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu_src r600_src[3]; struct r600_bc_alu alu; int i, r; + int lasti = 0; + + r = tgsi_setup_trig(ctx, r600_src); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = ctx->inst_info->r600_opcode; + alu.dst.chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + /* replicate result */ for (i = 0; i < 4; i++) { + if (inst->Dst[0].Register.WriteMask & (1 << i)) + lasti = i; + } + for (i = 0; i < lasti + 1; i++) { + if (!(inst->Dst[0].Register.WriteMask & (1 << i))) + continue; + memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = ctx->inst_info->r600_opcode; - alu.dst.chan = i; - alu.src[0].sel = 248; - r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + + alu.src[0].sel = ctx->temp_reg; + r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); if (r) return r; - alu.src[1].chan = tgsi_chan(&inst->Src[0], i); - if (i == 3) { + if (i == lasti) alu.last = 1; - } r = r600_bc_add_alu(ctx->bc, &alu); if (r) return r; @@ -699,30 +939,70 @@ static int tgsi_kill(struct r600_shader_ctx *ctx) return 0; } -static int tgsi_slt(struct r600_shader_ctx *ctx) +static int tgsi_scs(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bc_alu_src r600_src[3]; struct r600_bc_alu alu; - int i, r; + int r; - r = tgsi_split_constant(ctx, 
r600_src); + r = tgsi_setup_trig(ctx, r600_src); + if (r) + return r; + + + /* dst.x = COS */ + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS; + r = tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); if (r) return r; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* dst.y = SIN */ + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN; + r = tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); + if (r) + return r; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + return 0; +} + +static int tgsi_kill(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int i, r; + for (i = 0; i < 4; i++) { memset(&alu, 0, sizeof(struct r600_bc_alu)); - if (!(inst->Dst[0].Register.WriteMask & (1 << i))) { - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP; - alu.dst.chan = i; + alu.inst = ctx->inst_info->r600_opcode; + + alu.dst.chan = i; + + alu.src[0].sel = V_SQ_ALU_SRC_0; + + if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILP) { + alu.src[1].sel = V_SQ_ALU_SRC_1; + alu.src[1].neg = 1; } else { - alu.inst = ctx->inst_info->r600_opcode; - alu.src[1] = r600_src[0]; - alu.src[1].chan = tgsi_chan(&inst->Src[0], i); - alu.src[0] = r600_src[1]; - alu.src[0].chan = tgsi_chan(&inst->Src[1], i); - r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]); if (r) return r; + alu.src[1].chan = tgsi_chan(&inst->Src[0], i); } if (i == 3) { alu.last = 1; @@ -731,6 +1011,13 @@ static int tgsi_slt(struct r600_shader_ctx *ctx) if (r) return r; } + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + + /* kill must be last in ALU */ + ctx->bc->force_add_cf = 1; + ctx->shader->uses_kill = TRUE; return 0; } @@ -738,12 +1025,20 
@@ static int tgsi_lit(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bc_alu alu; + struct r600_bc_alu_src r600_src[3]; int r; + r = tgsi_split_constant(ctx, r600_src); + if (r) + return r; + r = tgsi_split_literal_constant(ctx, r600_src); + if (r) + return r; + /* dst.x, <- 1.0 */ memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; - alu.src[0].sel = 249; /*1.0*/ + alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/ alu.src[0].chan = 0; r = tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst); if (r) @@ -756,11 +1051,9 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) /* dst.y = max(src.x, 0.0) */ memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX; - r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); - if (r) - return r; - alu.src[1].sel = 248; /*0.0*/ - alu.src[1].chan = tgsi_chan(&inst->Src[0], 0); + alu.src[0] = r600_src[0]; + alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/ + alu.src[1].chan = 0; r = tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst); if (r) return r; @@ -769,18 +1062,10 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) if (r) return r; - /* dst.z = NOP - fill Z slot */ - memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP; - alu.dst.chan = 2; - r = r600_bc_add_alu(ctx->bc, &alu); - if (r) - return r; - /* dst.w, <- 1.0 */ memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; - alu.src[0].sel = 249; + alu.src[0].sel = V_SQ_ALU_SRC_1; alu.src[0].chan = 0; r = tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst); if (r) @@ -791,6 +1076,10 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) if (r) return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + if (inst->Dst[0].Register.WriteMask & (1 << 2)) { int chan; @@ -799,9 +1088,7 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) /* dst.z = log(src.y) */ memset(&alu, 0, sizeof(struct 
r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_CLAMPED; - r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); - if (r) - return r; + alu.src[0] = r600_src[0]; alu.src[0].chan = tgsi_chan(&inst->Src[0], 1); r = tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst); if (r) @@ -811,21 +1098,22 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) if (r) return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + chan = alu.dst.chan; sel = alu.dst.sel; /* tmp.x = amd MUL_LIT(src.w, dst.z, src.x ) */ memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MUL_LIT; - r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); - if (r) - return r; + alu.src[0] = r600_src[0]; alu.src[0].chan = tgsi_chan(&inst->Src[0], 3); alu.src[1].sel = sel; alu.src[1].chan = chan; - r = tgsi_src(ctx, &inst->Src[0], &alu.src[2]); - if (r) - return r; + + alu.src[2] = r600_src[0]; alu.src[2].chan = tgsi_chan(&inst->Src[0], 0); alu.dst.sel = ctx->temp_reg; alu.dst.chan = 0; @@ -836,6 +1124,9 @@ static int tgsi_lit(struct r600_shader_ctx *ctx) if (r) return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; /* dst.z = exp(tmp.x) */ memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE; @@ -880,19 +1171,43 @@ static int tgsi_trans(struct r600_shader_ctx *ctx) return 0; } +static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int i, r; + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.src[0].sel = ctx->temp_reg; + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + alu.dst.chan = i; + r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + if (r) + return r; + alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; + if (i == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; +} + static int 
tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; struct r600_bc_alu alu; - int i, j, r; + int i, r; memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = ctx->inst_info->r600_opcode; - for (j = 0; j < inst->Instruction.NumSrcRegs; j++) { - r = tgsi_src(ctx, &inst->Src[j], &alu.src[j]); + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + r = tgsi_src(ctx, &inst->Src[i], &alu.src[i]); if (r) return r; - alu.src[j].chan = tgsi_chan(&inst->Src[j], 0); + alu.src[i].chan = tgsi_chan(&inst->Src[i], 0); } alu.dst.sel = ctx->temp_reg; alu.dst.write = 1; @@ -900,16 +1215,124 @@ static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx) r = r600_bc_add_alu(ctx->bc, &alu); if (r) return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; /* replicate result */ + return tgsi_helper_tempx_replicate(ctx); +} + +static int tgsi_pow(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int r; + + /* LOG2(a) */ + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE; + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc,ctx->value); + if (r) + return r; + /* b * LOG2(a) */ + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL_IEEE; + r = tgsi_src(ctx, &inst->Src[1], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[1], 0); + alu.src[1].sel = ctx->temp_reg; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc,ctx->value); + if (r) + return r; + /* 
POW(a,b) = EXP2(b * LOG2(a))*/ + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE; + alu.src[0].sel = ctx->temp_reg; + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc,ctx->value); + if (r) + return r; + return tgsi_helper_tempx_replicate(ctx); +} + +static int tgsi_ssg(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + struct r600_bc_alu_src r600_src[3]; + int i, r; + + r = tgsi_split_constant(ctx, r600_src); + if (r) + return r; + + /* tmp = (src > 0 ? 1 : src) */ for (i = 0; i < 4; i++) { memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.src[0].sel = ctx->temp_reg; - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT; + alu.is_op3 = 1; + + alu.dst.sel = ctx->temp_reg; alu.dst.chan = i; + + alu.src[0] = r600_src[0]; + alu.src[0].chan = tgsi_chan(&inst->Src[0], i); + + alu.src[1].sel = V_SQ_ALU_SRC_1; + + alu.src[2] = r600_src[0]; + alu.src[2].chan = tgsi_chan(&inst->Src[0], i); + if (i == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + + /* dst = (-tmp > 0 ? 
-1 : tmp) */ + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGT; + alu.is_op3 = 1; r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); if (r) return r; - alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = i; + alu.src[0].neg = 1; + + alu.src[1].sel = V_SQ_ALU_SRC_1; + alu.src[1].neg = 1; + + alu.src[2].sel = ctx->temp_reg; + alu.src[2].chan = i; + if (i == 3) alu.last = 1; r = r600_bc_add_alu(ctx->bc, &alu); @@ -1006,16 +1429,23 @@ static int tgsi_dp(struct r600_shader_ctx *ctx) switch (ctx->inst_info->tgsi_opcode) { case TGSI_OPCODE_DP2: if (i > 1) { - alu.src[0].sel = alu.src[1].sel = 248; + alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; alu.src[0].chan = alu.src[1].chan = 0; } break; case TGSI_OPCODE_DP3: if (i > 2) { - alu.src[0].sel = alu.src[1].sel = 248; + alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0; alu.src[0].chan = alu.src[1].chan = 0; } break; + case TGSI_OPCODE_DPH: + if (i == 3) { + alu.src[0].sel = V_SQ_ALU_SRC_1; + alu.src[0].chan = 0; + alu.src[0].neg = 0; + } + break; default: break; } @@ -1035,75 +1465,197 @@ static int tgsi_tex(struct r600_shader_ctx *ctx) struct r600_bc_tex tex; struct r600_bc_alu alu; unsigned src_gpr; - int r; + int r, i; + int opcode; + boolean src_not_temp = inst->Src[0].Register.File != TGSI_FILE_TEMPORARY; + uint32_t lit_vals[4]; src_gpr = ctx->file_offset[inst->Src[0].Register.File] + inst->Src[0].Register.Index; - /* Add perspective divide */ - memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE; - alu.src[0].sel = src_gpr; - alu.src[0].chan = tgsi_chan(&inst->Src[0], 3); - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 3; - alu.last = 1; - alu.dst.write = 1; - r = r600_bc_add_alu(ctx->bc, &alu); - if (r) - return r; + if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) { + /* Add perspective divide */ + memset(&alu, 0, sizeof(struct 
r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE; + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; - memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL; - alu.src[0].sel = ctx->temp_reg; - alu.src[0].chan = 3; - alu.src[1].sel = src_gpr; - alu.src[1].chan = tgsi_chan(&inst->Src[0], 0); - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 0; - alu.dst.write = 1; - r = r600_bc_add_alu(ctx->bc, &alu); - if (r) - return r; - memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL; - alu.src[0].sel = ctx->temp_reg; - alu.src[0].chan = 3; - alu.src[1].sel = src_gpr; - alu.src[1].chan = tgsi_chan(&inst->Src[0], 1); - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 1; - alu.dst.write = 1; - r = r600_bc_add_alu(ctx->bc, &alu); - if (r) - return r; - memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL; - alu.src[0].sel = ctx->temp_reg; - alu.src[0].chan = 3; - alu.src[1].sel = src_gpr; - alu.src[1].chan = tgsi_chan(&inst->Src[0], 2); - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 2; - alu.dst.write = 1; - r = r600_bc_add_alu(ctx->bc, &alu); - if (r) - return r; - memset(&alu, 0, sizeof(struct r600_bc_alu)); - alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; - alu.src[0].sel = 249; - alu.src[0].chan = 0; - alu.dst.sel = ctx->temp_reg; - alu.dst.chan = 3; - alu.last = 1; - alu.dst.write = 1; - r = r600_bc_add_alu(ctx->bc, &alu); - if (r) - return r; - src_gpr = ctx->temp_reg; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 3); + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 3; + alu.last = 1; + alu.dst.write = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + for (i = 0; i < 3; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL; + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 3; + r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]); + if (r) + return r; + 
alu.src[1].chan = tgsi_chan(&inst->Src[0], i); + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + alu.src[0].sel = V_SQ_ALU_SRC_1; + alu.src[0].chan = 0; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 3; + alu.last = 1; + alu.dst.write = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + src_not_temp = false; + src_gpr = ctx->temp_reg; + } + + if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) { + int src_chan, src2_chan; + + /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */ + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_CUBE; + switch (i) { + case 0: + src_chan = 2; + src2_chan = 1; + break; + case 1: + src_chan = 2; + src2_chan = 0; + break; + case 2: + src_chan = 0; + src2_chan = 2; + break; + case 3: + src_chan = 1; + src2_chan = 2; + break; + } + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], src_chan); + r = tgsi_src(ctx, &inst->Src[0], &alu.src[1]); + if (r) + return r; + alu.src[1].chan = tgsi_chan(&inst->Src[0], src2_chan); + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + if (i == 3) + alu.last = 1; + alu.dst.write = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + /* tmp1.z = RCP_e(|tmp1.z|) */ + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE; + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 2; + alu.src[0].abs = 1; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 2; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x + * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x + * muladd has no writemask, have to use another temp + */ + memset(&alu, 0, sizeof(struct 
r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD; + alu.is_op3 = 1; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + alu.src[1].sel = ctx->temp_reg; + alu.src[1].chan = 2; + + alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[2].chan = 0; + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 0; + alu.dst.write = 1; + + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD; + alu.is_op3 = 1; + + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 1; + alu.src[1].sel = ctx->temp_reg; + alu.src[1].chan = 2; + + alu.src[2].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[2].chan = 0; + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 1; + alu.dst.write = 1; + + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + lit_vals[0] = fui(1.5f); + + r = r600_bc_add_literal(ctx->bc, lit_vals); + if (r) + return r; + src_not_temp = false; + src_gpr = ctx->temp_reg; + } + + if (src_not_temp) { + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + alu.src[0].sel = src_gpr; + alu.src[0].chan = i; + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + if (i == 3) + alu.last = 1; + alu.dst.write = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + src_gpr = ctx->temp_reg; + } + + opcode = ctx->inst_info->r600_opcode; + if (opcode == SQ_TEX_INST_SAMPLE && + (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D)) + opcode = SQ_TEX_INST_SAMPLE_C; - /* TODO use temp if src_gpr is not a temporary reg (File != TEMPORARY) */ memset(&tex, 0, sizeof(struct r600_bc_tex)); - tex.inst = ctx->inst_info->r600_opcode; + tex.inst = opcode; tex.resource_id = ctx->file_offset[inst->Src[1].Register.File] + inst->Src[1].Register.Index; tex.sampler_id = tex.resource_id; tex.src_gpr = src_gpr; @@ -1117,13 +1669,30 @@ static int 
tgsi_tex(struct r600_shader_ctx *ctx) tex.src_sel_z = 2; tex.src_sel_w = 3; + if (inst->Texture.Texture == TGSI_TEXTURE_CUBE) { + tex.src_sel_x = 1; + tex.src_sel_y = 0; + tex.src_sel_z = 3; + tex.src_sel_w = 1; + } + if (inst->Texture.Texture != TGSI_TEXTURE_RECT) { tex.coord_type_x = 1; tex.coord_type_y = 1; tex.coord_type_z = 1; tex.coord_type_w = 1; } - return r600_bc_add_tex(ctx->bc, &tex); + + if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D || inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D) + tex.src_sel_w = 2; + + r = r600_bc_add_tex(ctx->bc, &tex); + if (r) + return r; + + /* add shadow ambient support - gallium doesn't do it yet */ + return 0; + } static int tgsi_lrp(struct r600_shader_ctx *ctx) @@ -1141,7 +1710,7 @@ static int tgsi_lrp(struct r600_shader_ctx *ctx) for (i = 0; i < 4; i++) { memset(&alu, 0, sizeof(struct r600_bc_alu)); alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD; - alu.src[0].sel = 249; + alu.src[0].sel = V_SQ_ALU_SRC_1; alu.src[0].chan = 0; alu.src[1] = r600_src[0]; alu.src[1].chan = tgsi_chan(&inst->Src[0], i); @@ -1205,23 +1774,654 @@ static int tgsi_lrp(struct r600_shader_ctx *ctx) return tgsi_helper_copy(ctx, inst); } +static int tgsi_cmp(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu_src r600_src[3]; + struct r600_bc_alu alu; + int use_temp = 0; + int i, r; + + r = tgsi_split_constant(ctx, r600_src); + if (r) + return r; + + if (inst->Dst[0].Register.WriteMask != 0xf) + use_temp = 1; + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_CNDGE; + alu.src[0] = r600_src[0]; + alu.src[0].chan = tgsi_chan(&inst->Src[0], i); + + alu.src[1] = r600_src[2]; + alu.src[1].chan = tgsi_chan(&inst->Src[2], i); + + alu.src[2] = r600_src[1]; + alu.src[2].chan = tgsi_chan(&inst->Src[1], i); + + if (use_temp) + alu.dst.sel = ctx->temp_reg; + else { + r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + 
if (r) + return r; + } + alu.dst.chan = i; + alu.dst.write = 1; + alu.is_op3 = 1; + if (i == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + if (use_temp) + return tgsi_helper_copy(ctx, inst); + return 0; +} + +static int tgsi_xpd(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu_src r600_src[3]; + struct r600_bc_alu alu; + uint32_t use_temp = 0; + int i, r; + + if (inst->Dst[0].Register.WriteMask != 0xf) + use_temp = 1; + + r = tgsi_split_constant(ctx, r600_src); + if (r) + return r; + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL; + + alu.src[0] = r600_src[0]; + switch (i) { + case 0: + alu.src[0].chan = tgsi_chan(&inst->Src[0], 2); + break; + case 1: + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + break; + case 2: + alu.src[0].chan = tgsi_chan(&inst->Src[0], 1); + break; + case 3: + alu.src[0].sel = V_SQ_ALU_SRC_0; + alu.src[0].chan = i; + } + + alu.src[1] = r600_src[1]; + switch (i) { + case 0: + alu.src[1].chan = tgsi_chan(&inst->Src[1], 1); + break; + case 1: + alu.src[1].chan = tgsi_chan(&inst->Src[1], 2); + break; + case 2: + alu.src[1].chan = tgsi_chan(&inst->Src[1], 0); + break; + case 3: + alu.src[1].sel = V_SQ_ALU_SRC_0; + alu.src[1].chan = i; + } + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = i; + alu.dst.write = 1; + + if (i == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD; + + alu.src[0] = r600_src[0]; + switch (i) { + case 0: + alu.src[0].chan = tgsi_chan(&inst->Src[0], 1); + break; + case 1: + alu.src[0].chan = tgsi_chan(&inst->Src[0], 2); + break; + case 2: + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + break; + case 3: + alu.src[0].sel = V_SQ_ALU_SRC_0; + alu.src[0].chan = i; + } + + 
alu.src[1] = r600_src[1]; + switch (i) { + case 0: + alu.src[1].chan = tgsi_chan(&inst->Src[1], 2); + break; + case 1: + alu.src[1].chan = tgsi_chan(&inst->Src[1], 0); + break; + case 2: + alu.src[1].chan = tgsi_chan(&inst->Src[1], 1); + break; + case 3: + alu.src[1].sel = V_SQ_ALU_SRC_0; + alu.src[1].chan = i; + } + + alu.src[2].sel = ctx->temp_reg; + alu.src[2].neg = 1; + alu.src[2].chan = i; + + if (use_temp) + alu.dst.sel = ctx->temp_reg; + else { + r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + if (r) + return r; + } + alu.dst.chan = i; + alu.dst.write = 1; + alu.is_op3 = 1; + if (i == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + if (use_temp) + return tgsi_helper_copy(ctx, inst); + return 0; +} + +static int tgsi_exp(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu_src r600_src[3]; + struct r600_bc_alu alu; + int r; + + /* result.x = 2^floor(src); */ + if (inst->Dst[0].Register.WriteMask & 1) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR; + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE; + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = 0; + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 0; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + } + + /* result.y = tmp - floor(tmp); */ + if ((inst->Dst[0].Register.WriteMask >> 1) & 1) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + + alu.inst = 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT; + alu.src[0] = r600_src[0]; + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + + alu.dst.sel = ctx->temp_reg; +// r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); +// if (r) +// return r; + alu.dst.write = 1; + alu.dst.chan = 1; + + alu.last = 1; + + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + } + + /* result.z = RoughApprox2ToX(tmp);*/ + if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE; + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + alu.dst.chan = 2; + + alu.last = 1; + + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + } + + /* result.w = 1.0;*/ + if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV; + alu.src[0].sel = V_SQ_ALU_SRC_1; + alu.src[0].chan = 0; + + alu.dst.sel = ctx->temp_reg; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + r = r600_bc_add_literal(ctx->bc, ctx->value); + if (r) + return r; + } + return tgsi_helper_copy(ctx, inst); +} + +static int tgsi_arl(struct r600_shader_ctx *ctx) +{ + /* TODO from r600c, ar values don't persist between clauses */ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int r; + memset(&alu, 0, sizeof(struct r600_bc_alu)); + + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOVA_FLOOR; + + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + + alu.last 
= 1; + + r = r600_bc_add_alu_type(ctx->bc, &alu, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU); + if (r) + return r; + return 0; +} + +static int tgsi_opdst(struct r600_shader_ctx *ctx) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int i, r = 0; + + for (i = 0; i < 4; i++) { + memset(&alu, 0, sizeof(struct r600_bc_alu)); + + alu.inst = V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL; + r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst); + if (r) + return r; + + if (i == 0 || i == 3) { + alu.src[0].sel = V_SQ_ALU_SRC_1; + } else { + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], i); + } + + if (i == 0 || i == 2) { + alu.src[1].sel = V_SQ_ALU_SRC_1; + } else { + r = tgsi_src(ctx, &inst->Src[1], &alu.src[1]); + if (r) + return r; + alu.src[1].chan = tgsi_chan(&inst->Src[1], i); + } + if (i == 3) + alu.last = 1; + r = r600_bc_add_alu(ctx->bc, &alu); + if (r) + return r; + } + return 0; +} + +static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode) +{ + struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; + struct r600_bc_alu alu; + int r; + + memset(&alu, 0, sizeof(struct r600_bc_alu)); + alu.inst = opcode; + alu.predicate = 1; + + alu.dst.sel = ctx->temp_reg; + alu.dst.write = 1; + alu.dst.chan = 0; + + r = tgsi_src(ctx, &inst->Src[0], &alu.src[0]); + if (r) + return r; + alu.src[0].chan = tgsi_chan(&inst->Src[0], 0); + alu.src[1].sel = V_SQ_ALU_SRC_0; + alu.src[1].chan = 0; + + alu.last = 1; + + r = r600_bc_add_alu_type(ctx->bc, &alu, V_SQ_CF_ALU_WORD1_SQ_CF_INST_ALU_PUSH_BEFORE); + if (r) + return r; + return 0; +} + +static int pops(struct r600_shader_ctx *ctx, int pops) +{ + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_POP); + ctx->bc->cf_last->pop_count = pops; + return 0; +} + +static inline void callstack_decrease_current(struct r600_shader_ctx *ctx, unsigned reason) +{ + switch(reason) { + case FC_PUSH_VPM: + 
ctx->bc->callstack[ctx->bc->call_sp].current--; + break; + case FC_PUSH_WQM: + case FC_LOOP: + ctx->bc->callstack[ctx->bc->call_sp].current -= 4; + break; + case FC_REP: + /* TOODO : for 16 vp asic should -= 2; */ + ctx->bc->callstack[ctx->bc->call_sp].current --; + break; + } +} + +static inline void callstack_check_depth(struct r600_shader_ctx *ctx, unsigned reason, unsigned check_max_only) +{ + if (check_max_only) { + int diff; + switch (reason) { + case FC_PUSH_VPM: + diff = 1; + break; + case FC_PUSH_WQM: + diff = 4; + break; + } + if ((ctx->bc->callstack[ctx->bc->call_sp].current + diff) > + ctx->bc->callstack[ctx->bc->call_sp].max) { + ctx->bc->callstack[ctx->bc->call_sp].max = + ctx->bc->callstack[ctx->bc->call_sp].current + diff; + } + return; + } + switch (reason) { + case FC_PUSH_VPM: + ctx->bc->callstack[ctx->bc->call_sp].current++; + break; + case FC_PUSH_WQM: + case FC_LOOP: + ctx->bc->callstack[ctx->bc->call_sp].current += 4; + break; + case FC_REP: + ctx->bc->callstack[ctx->bc->call_sp].current++; + break; + } + + if ((ctx->bc->callstack[ctx->bc->call_sp].current) > + ctx->bc->callstack[ctx->bc->call_sp].max) { + ctx->bc->callstack[ctx->bc->call_sp].max = + ctx->bc->callstack[ctx->bc->call_sp].current; + } +} + +static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp) +{ + struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp]; + + sp->mid = (struct r600_bc_cf **)realloc((void *)sp->mid, + sizeof(struct r600_bc_cf *) * (sp->num_mid + 1)); + sp->mid[sp->num_mid] = ctx->bc->cf_last; + sp->num_mid++; +} + +static void fc_pushlevel(struct r600_shader_ctx *ctx, int type) +{ + ctx->bc->fc_sp++; + ctx->bc->fc_stack[ctx->bc->fc_sp].type = type; + ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last; +} + +static void fc_poplevel(struct r600_shader_ctx *ctx) +{ + struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp]; + if (sp->mid) { + free(sp->mid); + sp->mid = NULL; + } + sp->num_mid = 0; + sp->start = NULL; + sp->type = 0; + 
ctx->bc->fc_sp--; +} + +#if 0 +static int emit_return(struct r600_shader_ctx *ctx) +{ + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN); + return 0; +} + +static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset) +{ + + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_JUMP); + ctx->bc->cf_last->pop_count = pops; + /* TODO work out offset */ + return 0; +} + +static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value) +{ + return 0; +} + +static void emit_testflag(struct r600_shader_ctx *ctx) +{ + +} + +static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx) +{ + emit_testflag(ctx); + emit_jump_to_offset(ctx, 1, 4); + emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0); + pops(ctx, ifidx + 1); + emit_return(ctx); +} + +static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp) +{ + emit_testflag(ctx); + + r600_bc_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode); + ctx->bc->cf_last->pop_count = 1; + + fc_set_mid(ctx, fc_sp); + + pops(ctx, 1); +} +#endif + +static int tgsi_if(struct r600_shader_ctx *ctx) +{ + emit_logic_pred(ctx, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_PRED_SETNE); + + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_JUMP); + + fc_pushlevel(ctx, FC_IF); + + callstack_check_depth(ctx, FC_PUSH_VPM, 0); + return 0; +} + +static int tgsi_else(struct r600_shader_ctx *ctx) +{ + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_ELSE); + ctx->bc->cf_last->pop_count = 1; + + fc_set_mid(ctx, ctx->bc->fc_sp); + ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id; + return 0; +} + +static int tgsi_endif(struct r600_shader_ctx *ctx) +{ + pops(ctx, 1); + if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) { + R600_ERR("if/endif unbalanced in shader\n"); + return -1; + } + + if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) { + ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2; + 
ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1; + } else { + ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2; + } + fc_poplevel(ctx); + + callstack_decrease_current(ctx, FC_PUSH_VPM); + return 0; +} + +static int tgsi_bgnloop(struct r600_shader_ctx *ctx) +{ + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_START_NO_AL); + + fc_pushlevel(ctx, FC_LOOP); + + /* check stack depth */ + callstack_check_depth(ctx, FC_LOOP, 0); + return 0; +} + +static int tgsi_endloop(struct r600_shader_ctx *ctx) +{ + int i; + + r600_bc_add_cfinst(ctx->bc, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_END); + + if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) { + R600_ERR("loop/endloop in shader code are not paired.\n"); + return -EINVAL; + } + + /* fixup loop pointers - from r600isa + LOOP END points to CF after LOOP START, + LOOP START point to CF after LOOP END + BRK/CONT point to LOOP END CF + */ + ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2; + + ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2; + + for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) { + ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id; + } + /* TODO add LOOPRET support */ + fc_poplevel(ctx); + callstack_decrease_current(ctx, FC_LOOP); + return 0; +} + +static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx) +{ + unsigned int fscp; + + for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--) + { + if (FC_LOOP == ctx->bc->fc_stack[fscp].type) + break; + } + + if (fscp == 0) { + R600_ERR("Break not inside loop/endloop pair\n"); + return -EINVAL; + } + + r600_bc_add_cfinst(ctx->bc, ctx->inst_info->r600_opcode); + ctx->bc->cf_last->pop_count = 1; + + fc_set_mid(ctx, fscp); + + pops(ctx, 1); + callstack_check_depth(ctx, FC_PUSH_VPM, 1); + return 0; +} + static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { - {TGSI_OPCODE_ARL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, 
tgsi_unsupported}, + {TGSI_OPCODE_ARL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_arl}, {TGSI_OPCODE_MOV, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2}, {TGSI_OPCODE_LIT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lit}, {TGSI_OPCODE_RCP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIP_IEEE, tgsi_trans_srcx_replicate}, {TGSI_OPCODE_RSQ, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE, tgsi_trans_srcx_replicate}, - {TGSI_OPCODE_EXP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_EXP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_exp}, {TGSI_OPCODE_LOG, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_MUL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL, tgsi_op2}, {TGSI_OPCODE_ADD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2}, {TGSI_OPCODE_DP3, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp}, {TGSI_OPCODE_DP4, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp}, - {TGSI_OPCODE_DST, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_DST, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_opdst}, {TGSI_OPCODE_MIN, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MIN, tgsi_op2}, {TGSI_OPCODE_MAX, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MAX, tgsi_op2}, - {TGSI_OPCODE_SLT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_slt}, - {TGSI_OPCODE_SGE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_SLT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2_swap}, + {TGSI_OPCODE_SGE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2}, {TGSI_OPCODE_MAD, 1, V_SQ_ALU_WORD1_OP3_SQ_OP3_INST_MULADD, tgsi_op3}, {TGSI_OPCODE_SUB, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_ADD, tgsi_op2}, {TGSI_OPCODE_LRP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_lrp}, @@ -1232,38 +2432,38 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { /* gap */ {22, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {23, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_FRC, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, 
tgsi_unsupported}, + {TGSI_OPCODE_FRC, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FRACT, tgsi_op2}, {TGSI_OPCODE_CLAMP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_FLR, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_FLR, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLOOR, tgsi_op2}, {TGSI_OPCODE_ROUND, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_EX2, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_EXP_IEEE, tgsi_trans_srcx_replicate}, - {TGSI_OPCODE_LG2, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_POW, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_XPD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_LG2, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_LOG_IEEE, tgsi_trans_srcx_replicate}, + {TGSI_OPCODE_POW, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_pow}, + {TGSI_OPCODE_XPD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_xpd}, /* gap */ {32, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_ABS, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MOV, tgsi_op2}, {TGSI_OPCODE_RCC, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_DPH, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_COS, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_DDX, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_DDY, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_KILP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, /* predicated kill */ + {TGSI_OPCODE_DPH, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp}, + {TGSI_OPCODE_COS, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS, tgsi_trig}, + {TGSI_OPCODE_DDX, 0, SQ_TEX_INST_GET_GRADIENTS_H, tgsi_tex}, + {TGSI_OPCODE_DDY, 0, SQ_TEX_INST_GET_GRADIENTS_V, tgsi_tex}, + {TGSI_OPCODE_KILP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_KILLGT, tgsi_kill}, /* predicated kill */ {TGSI_OPCODE_PK2H, 0, 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_PK2US, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_PK4B, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_PK4UB, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_RFL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SEQ, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_SEQ, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETE, tgsi_op2}, {TGSI_OPCODE_SFL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SGT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SIN, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SLE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SNE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_SGT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGT, tgsi_op2}, + {TGSI_OPCODE_SIN, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN, tgsi_trig}, + {TGSI_OPCODE_SLE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETGE, tgsi_op2_swap}, + {TGSI_OPCODE_SNE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SETNE, tgsi_op2}, {TGSI_OPCODE_STR, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_TEX, 0, 0x10, tgsi_tex}, + {TGSI_OPCODE_TEX, 0, SQ_TEX_INST_SAMPLE, tgsi_tex}, {TGSI_OPCODE_TXD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_TXP, 0, 0x10, tgsi_tex}, + {TGSI_OPCODE_TXP, 0, SQ_TEX_INST_SAMPLE, tgsi_tex}, {TGSI_OPCODE_UP2H, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_UP2US, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_UP4B, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, @@ -1274,21 +2474,21 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { {TGSI_OPCODE_BRA, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_CAL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, 
tgsi_unsupported}, {TGSI_OPCODE_RET, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SSG, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, /* SGN */ - {TGSI_OPCODE_CMP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_SCS, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_TXB, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_SSG, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_ssg}, + {TGSI_OPCODE_CMP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_cmp}, + {TGSI_OPCODE_SCS, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_scs}, + {TGSI_OPCODE_TXB, 0, SQ_TEX_INST_SAMPLE_L, tgsi_tex}, {TGSI_OPCODE_NRM, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_DIV, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_DP2, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_DOT4, tgsi_dp}, {TGSI_OPCODE_TXL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_BRK, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_IF, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_BRK, 0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_BREAK, tgsi_loop_brk_cont}, + {TGSI_OPCODE_IF, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_if}, /* gap */ {75, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {76, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_ELSE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_ENDIF, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_ELSE, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_else}, + {TGSI_OPCODE_ENDIF, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endif}, /* gap */ {79, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {80, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, @@ -1297,7 +2497,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { {TGSI_OPCODE_CEIL, 0, 
V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_I2F, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_NOT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_TRUNC, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_TRUNC, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_trans_srcx_replicate}, {TGSI_OPCODE_SHL, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, /* gap */ {88, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, @@ -1308,12 +2508,12 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = { {TGSI_OPCODE_SAD, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_TXF, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_TXQ, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_CONT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_CONT, 0, V_SQ_CF_WORD1_SQ_CF_INST_LOOP_CONTINUE, tgsi_loop_brk_cont}, {TGSI_OPCODE_EMIT, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, {TGSI_OPCODE_ENDPRIM, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_BGNLOOP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_BGNLOOP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_bgnloop}, {TGSI_OPCODE_BGNSUB, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, - {TGSI_OPCODE_ENDLOOP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, + {TGSI_OPCODE_ENDLOOP, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_endloop}, {TGSI_OPCODE_ENDSUB, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, /* gap */ {103, 0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported}, diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 2ee7780ead0..7c722c07cbe 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -42,6 +42,7 @@ struct r600_shader { struct 
r600_shader_io input[32]; struct r600_shader_io output[32]; enum radeon_family family; + boolean uses_kill; }; #endif diff --git a/src/gallium/drivers/r600/r600_sq.h b/src/gallium/drivers/r600/r600_sq.h index 002660c654a..fa7a31742af 100644 --- a/src/gallium/drivers/r600/r600_sq.h +++ b/src/gallium/drivers/r600/r600_sq.h @@ -206,6 +206,26 @@ #define S_SQ_ALU_WORD0_SRC0_SEL(x) (((x) & 0x1FF) << 0) #define G_SQ_ALU_WORD0_SRC0_SEL(x) (((x) >> 0) & 0x1FF) #define C_SQ_ALU_WORD0_SRC0_SEL 0xFFFFFE00 +/* + * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+) + * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+) + * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+) + * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+) + * 248 SQ_ALU_SRC_0: special constant 0.0. + * 249 SQ_ALU_SRC_1: special constant 1.0 float. + * 250 SQ_ALU_SRC_1_INT: special constant 1 integer. + * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer. + * 252 SQ_ALU_SRC_0_5: special constant 0.5 float. + * 253 SQ_ALU_SRC_LITERAL: literal constant. + * 254 SQ_ALU_SRC_PV: previous vector result. + * 255 SQ_ALU_SRC_PS: previous scalar result. 
+ */ +#define V_SQ_ALU_SRC_0 0x000000F8 +#define V_SQ_ALU_SRC_1 0x000000F9 +#define V_SQ_ALU_SRC_1_INT 0x000000FA +#define V_SQ_ALU_SRC_M_1_INT 0x000000FB +#define V_SQ_ALU_SRC_0_5 0x000000FC +#define V_SQ_ALU_SRC_LITERAL 0x000000FD #define S_SQ_ALU_WORD0_SRC0_REL(x) (((x) & 0x1) << 9) #define G_SQ_ALU_WORD0_SRC0_REL(x) (((x) >> 9) & 0x1) #define C_SQ_ALU_WORD0_SRC0_REL 0xFFFFFDFF @@ -583,4 +603,11 @@ #define G_SQ_TEX_WORD2_SRC_SEL_W(x) (((x) >> 29) & 0x7) #define C_SQ_TEX_WORD2_SRC_SEL_W 0x1FFFFFFF +#define V_SQ_CF_COND_ACTIVE 0x00 +#define V_SQ_CF_COND_FALSE 0x01 +#define V_SQ_CF_COND_BOOL 0x02 +#define V_SQ_CF_COND_NOT_BOOL 0x03 + +#define V_SQ_REL_ABSOLUTE 0 +#define V_SQ_REL_RELATIVE 1 #endif diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 3efd409ae0d..66cab7d7a6e 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -34,6 +34,17 @@ #include "r600d.h" #include "r600_state_inlines.h" +static void r600_blend(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_blend_state *state); +static void r600_viewport(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_viewport_state *state); +static void r600_ucp(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_clip_state *state); +static void r600_sampler(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_sampler_state *state, unsigned id); +static void r600_resource(struct pipe_context *ctx, struct radeon_state *rstate, const struct pipe_sampler_view *view, unsigned id); +static void r600_cb(struct r600_context *rctx, struct radeon_state *rstate, + const struct pipe_framebuffer_state *state, int cb); +static void r600_db(struct r600_context *rctx, struct radeon_state *rstate, + const struct pipe_framebuffer_state *state); + + static void *r600_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state) { @@ -81,11 
+92,12 @@ static struct pipe_sampler_view *r600_create_sampler_view(struct pipe_context *c struct r600_context *rctx = r600_context(ctx); struct r600_context_state *rstate; - rstate = r600_context_state(rctx, pipe_sampler_type, state); + rstate = r600_context_state(rctx, pipe_sampler_view_type, state); pipe_reference(NULL, &texture->reference); rstate->state.sampler_view.texture = texture; rstate->state.sampler_view.reference.count = 1; rstate->state.sampler_view.context = ctx; + r600_resource(ctx, &rstate->rstate[0], &rstate->state.sampler_view, 0); return &rstate->state.sampler_view; } @@ -223,12 +235,24 @@ static void r600_bind_ps_sampler(struct pipe_context *ctx, struct r600_context_state *rstate; unsigned i; - for (i = 0; i < rctx->ps_nsampler; i++) { - rctx->ps_sampler[i] = r600_context_state_decref(rctx->ps_sampler[i]); + for (i = 0; i < count; i++) { + rstate = (struct r600_context_state *)states[i]; + if (rstate) { + rstate->nrstate = 0; + } } for (i = 0; i < count; i++) { rstate = (struct r600_context_state *)states[i]; - rctx->ps_sampler[i] = r600_context_state_incref(rstate); + if (rstate) { + if (rstate->nrstate >= R600_MAX_RSTATE) + continue; + if (rstate->nrstate) { + memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state)); + } + radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_SAMPLER, i, R600_SHADER_PS); + rctx->ps_sampler[i] = &rstate->rstate[rstate->nrstate]; + rstate->nrstate++; + } } rctx->ps_nsampler = count; } @@ -240,12 +264,24 @@ static void r600_bind_vs_sampler(struct pipe_context *ctx, struct r600_context_state *rstate; unsigned i; - for (i = 0; i < rctx->vs_nsampler; i++) { - rctx->vs_sampler[i] = r600_context_state_decref(rctx->vs_sampler[i]); + for (i = 0; i < count; i++) { + rstate = (struct r600_context_state *)states[i]; + if (rstate) { + rstate->nrstate = 0; + } } for (i = 0; i < count; i++) { rstate = (struct r600_context_state *)states[i]; - rctx->vs_sampler[i] = 
r600_context_state_incref(rstate); + if (rstate) { + if (rstate->nrstate >= R600_MAX_RSTATE) + continue; + if (rstate->nrstate) { + memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state)); + } + radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_SAMPLER, i, R600_SHADER_VS); + rctx->vs_sampler[i] = &rstate->rstate[rstate->nrstate]; + rstate->nrstate++; + } } rctx->vs_nsampler = count; } @@ -268,6 +304,13 @@ static void r600_set_blend_color(struct pipe_context *ctx, static void r600_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state) { + struct r600_context *rctx = r600_context(ctx); + struct r600_context_state *rstate; + + rstate = r600_context_state(rctx, pipe_clip_type, state); + r600_bind_state(ctx, rstate); + /* refcount is taken care of this */ + r600_delete_state(ctx, rstate); } static void r600_set_constant_buffer(struct pipe_context *ctx, @@ -276,19 +319,21 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, { struct r600_screen *rscreen = r600_screen(ctx->screen); struct r600_context *rctx = r600_context(ctx); - unsigned nconstant = 0, i, type, id; - struct radeon_state *rstate; + unsigned nconstant = 0, i, type, shader_class; + struct radeon_state *rstate, *rstates; struct pipe_transfer *transfer; u32 *ptr; + type = R600_STATE_CONSTANT; + switch (shader) { case PIPE_SHADER_VERTEX: - id = R600_VS_CONSTANT; - type = R600_VS_CONSTANT_TYPE; + shader_class = R600_SHADER_VS; + rstates = rctx->vs_constant; break; case PIPE_SHADER_FRAGMENT: - id = R600_PS_CONSTANT; - type = R600_PS_CONSTANT_TYPE; + shader_class = R600_SHADER_PS; + rstates = rctx->ps_constant; break; default: R600_ERR("unsupported %d\n", shader); @@ -300,17 +345,15 @@ static void r600_set_constant_buffer(struct pipe_context *ctx, if (ptr == NULL) return; for (i = 0; i < nconstant; i++) { - rstate = radeon_state(rscreen->rw, type, id + i); - if (rstate == NULL) - return; + rstate = &rstates[i]; + 
radeon_state_init(rstate, rscreen->rw, type, i, shader_class); rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT0_0] = ptr[i * 4 + 0]; rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT1_0] = ptr[i * 4 + 1]; rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT2_0] = ptr[i * 4 + 2]; rstate->states[R600_PS_CONSTANT__SQ_ALU_CONSTANT3_0] = ptr[i * 4 + 3]; if (radeon_state_pm4(rstate)) return; - if (radeon_draw_set_new(rctx->draw, rstate)) - return; + radeon_draw_bind(&rctx->draw, rstate); } pipe_buffer_unmap(ctx, buffer, transfer); } @@ -324,12 +367,24 @@ static void r600_set_ps_sampler_view(struct pipe_context *ctx, struct r600_context_state *rstate; unsigned i; - for (i = 0; i < rctx->ps_nsampler_view; i++) { - rctx->ps_sampler_view[i] = r600_context_state_decref(rctx->ps_sampler_view[i]); + for (i = 0; i < count; i++) { + rstate = (struct r600_context_state *)views[i]; + if (rstate) { + rstate->nrstate = 0; + } } for (i = 0; i < count; i++) { rstate = (struct r600_context_state *)views[i]; - rctx->ps_sampler_view[i] = r600_context_state_incref(rstate); + if (rstate) { + if (rstate->nrstate >= R600_MAX_RSTATE) + continue; + if (rstate->nrstate) { + memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state)); + } + radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_RESOURCE, i, R600_SHADER_PS); + rctx->ps_sampler_view[i] = &rstate->rstate[rstate->nrstate]; + rstate->nrstate++; + } } rctx->ps_nsampler_view = count; } @@ -342,12 +397,24 @@ static void r600_set_vs_sampler_view(struct pipe_context *ctx, struct r600_context_state *rstate; unsigned i; - for (i = 0; i < rctx->vs_nsampler_view; i++) { - rctx->vs_sampler_view[i] = r600_context_state_decref(rctx->vs_sampler_view[i]); + for (i = 0; i < count; i++) { + rstate = (struct r600_context_state *)views[i]; + if (rstate) { + rstate->nrstate = 0; + } } for (i = 0; i < count; i++) { rstate = (struct r600_context_state *)views[i]; - rctx->vs_sampler_view[i] = 
r600_context_state_incref(rstate); + if (rstate) { + if (rstate->nrstate >= R600_MAX_RSTATE) + continue; + if (rstate->nrstate) { + memcpy(&rstate->rstate[rstate->nrstate], &rstate->rstate[0], sizeof(struct radeon_state)); + } + radeon_state_convert(&rstate->rstate[rstate->nrstate], R600_STATE_RESOURCE, i, R600_SHADER_VS); + rctx->vs_sampler_view[i] = &rstate->rstate[rstate->nrstate]; + rstate->nrstate++; + } } rctx->vs_nsampler_view = count; } @@ -360,6 +427,12 @@ static void r600_set_framebuffer_state(struct pipe_context *ctx, rstate = r600_context_state(rctx, pipe_framebuffer_type, state); r600_bind_state(ctx, rstate); + for (int i = 0; i < state->nr_cbufs; i++) { + r600_cb(rctx, &rstate->rstate[i+1], state, i); + } + if (state->zsbuf) { + r600_db(rctx, &rstate->rstate[0], state); + } } static void r600_set_polygon_stipple(struct pipe_context *ctx, @@ -525,7 +598,7 @@ struct r600_context_state *r600_context_state_decref(struct r600_context_state * R600_ERR("invalid type %d\n", rstate->type); return NULL; } - radeon_state_decref(rstate->rstate); + radeon_state_fini(&rstate->rstate[0]); FREE(rstate); return NULL; } @@ -558,6 +631,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne break; case pipe_viewport_type: rstate->state.viewport = (*states).viewport; + r600_viewport(rctx, &rstate->rstate[0], &rstate->state.viewport); break; case pipe_depth_type: rstate->state.depth = (*states).depth; @@ -573,6 +647,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne break; case pipe_clip_type: rstate->state.clip = (*states).clip; + r600_ucp(rctx, &rstate->rstate[0], &rstate->state.clip); break; case pipe_stencil_type: rstate->state.stencil = (*states).stencil; @@ -585,6 +660,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne break; case pipe_blend_type: rstate->state.blend = (*states).blend; + r600_blend(rctx, &rstate->rstate[0], &rstate->state.blend); break; case 
pipe_stencil_ref_type: rstate->state.stencil_ref = (*states).stencil_ref; @@ -599,6 +675,7 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne break; case pipe_sampler_type: rstate->state.sampler = (*states).sampler; + r600_sampler(rctx, &rstate->rstate[0], &rstate->state.sampler, 0); break; default: R600_ERR("invalid type %d\n", rstate->type); @@ -608,16 +685,12 @@ struct r600_context_state *r600_context_state(struct r600_context *rctx, unsigne return rstate; } -static struct radeon_state *r600_blend(struct r600_context *rctx) +static void r600_blend(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_blend_state *state) { struct r600_screen *rscreen = rctx->screen; - struct radeon_state *rstate; - const struct pipe_blend_state *state = &rctx->blend->state.blend; int i; - rstate = radeon_state(rscreen->rw, R600_BLEND_TYPE, R600_BLEND); - if (rstate == NULL) - return NULL; + radeon_state_init(rstate, rscreen->rw, R600_STATE_BLEND, 0, 0); rstate->states[R600_BLEND__CB_BLEND_RED] = fui(rctx->blend_color.color[0]); rstate->states[R600_BLEND__CB_BLEND_GREEN] = fui(rctx->blend_color.color[1]); rstate->states[R600_BLEND__CB_BLEND_BLUE] = fui(rctx->blend_color.color[2]); @@ -661,29 +734,38 @@ static struct radeon_state *r600_blend(struct r600_context *rctx) rstate->states[R600_BLEND__CB_BLEND_CONTROL] = bc; } - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; + radeon_state_pm4(rstate); +} + +static void r600_ucp(struct r600_context *rctx, struct radeon_state *rstate, + const struct pipe_clip_state *state) +{ + struct r600_screen *rscreen = rctx->screen; + + radeon_state_init(rstate, rscreen->rw, R600_STATE_UCP, 0, 0); + + for (int i = 0; i < state->nr; i++) { + rstate->states[i * 4 + 0] = fui(state->ucp[i][0]); + rstate->states[i * 4 + 1] = fui(state->ucp[i][1]); + rstate->states[i * 4 + 2] = fui(state->ucp[i][2]); + rstate->states[i * 4 + 3] = fui(state->ucp[i][3]); } - return rstate; + 
radeon_state_pm4(rstate); } -static struct radeon_state *r600_cb(struct r600_context *rctx, int cb) +static void r600_cb(struct r600_context *rctx, struct radeon_state *rstate, + const struct pipe_framebuffer_state *state, int cb) { struct r600_screen *rscreen = rctx->screen; struct r600_resource_texture *rtex; struct r600_resource *rbuffer; - struct radeon_state *rstate; - const struct pipe_framebuffer_state *state = &rctx->framebuffer->state.framebuffer; unsigned level = state->cbufs[cb]->level; unsigned pitch, slice; unsigned color_info; unsigned format, swap, ntype; const struct util_format_description *desc; - rstate = radeon_state(rscreen->rw, R600_CB0_TYPE + cb, R600_CB0 + cb); - if (rstate == NULL) - return NULL; + radeon_state_init(rstate, rscreen->rw, R600_STATE_CB0 + cb, 0, 0); rtex = (struct r600_resource_texture*)state->cbufs[cb]->texture; rbuffer = &rtex->resource; rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); @@ -710,7 +792,7 @@ static struct radeon_state *r600_cb(struct r600_context *rctx, int cb) S_0280A0_SOURCE_FORMAT(1) | S_0280A0_NUMBER_TYPE(ntype); - rstate->states[R600_CB0__CB_COLOR0_BASE] = 0x00000000; + rstate->states[R600_CB0__CB_COLOR0_BASE] = state->cbufs[cb]->offset >> 8; rstate->states[R600_CB0__CB_COLOR0_INFO] = color_info; rstate->states[R600_CB0__CB_COLOR0_SIZE] = S_028060_PITCH_TILE_MAX(pitch) | S_028060_SLICE_TILE_MAX(slice); @@ -718,32 +800,29 @@ static struct radeon_state *r600_cb(struct r600_context *rctx, int cb) rstate->states[R600_CB0__CB_COLOR0_FRAG] = 0x00000000; rstate->states[R600_CB0__CB_COLOR0_TILE] = 0x00000000; rstate->states[R600_CB0__CB_COLOR0_MASK] = 0x00000000; - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -static struct radeon_state *r600_db(struct r600_context *rctx) +static void r600_db(struct r600_context *rctx, struct radeon_state *rstate, + const struct pipe_framebuffer_state *state) { struct r600_screen 
*rscreen = rctx->screen; struct r600_resource_texture *rtex; struct r600_resource *rbuffer; - struct radeon_state *rstate; - const struct pipe_framebuffer_state *state = &rctx->framebuffer->state.framebuffer; unsigned level; unsigned pitch, slice, format; + radeon_state_init(rstate, rscreen->rw, R600_STATE_DB, 0, 0); if (state->zsbuf == NULL) - return NULL; - - rstate = radeon_state(rscreen->rw, R600_DB_TYPE, R600_DB); - if (rstate == NULL) - return NULL; + return; rtex = (struct r600_resource_texture*)state->zsbuf->texture; + rtex->tilled = 1; + rtex->array_mode = 2; + rtex->tile_type = 1; + rtex->depth = 1; rbuffer = &rtex->resource; + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); rstate->nbo = 1; rstate->placement[0] = RADEON_GEM_DOMAIN_VRAM; @@ -751,31 +830,30 @@ static struct radeon_state *r600_db(struct r600_context *rctx) pitch = (rtex->pitch[level] / rtex->bpt) / 8 - 1; slice = (rtex->pitch[level] / rtex->bpt) * state->zsbuf->height / 64 - 1; format = r600_translate_dbformat(state->zsbuf->texture->format); - rstate->states[R600_DB__DB_DEPTH_BASE] = 0x00000000; - rstate->states[R600_DB__DB_DEPTH_INFO] = 0x00010000 | + rstate->states[R600_DB__DB_DEPTH_BASE] = state->zsbuf->offset >> 8; + rstate->states[R600_DB__DB_DEPTH_INFO] = S_028010_ARRAY_MODE(rtex->array_mode) | S_028010_FORMAT(format); rstate->states[R600_DB__DB_DEPTH_VIEW] = 0x00000000; rstate->states[R600_DB__DB_PREFETCH_LIMIT] = (state->zsbuf->height / 8) -1; rstate->states[R600_DB__DB_DEPTH_SIZE] = S_028000_PITCH_TILE_MAX(pitch) | S_028000_SLICE_TILE_MAX(slice); - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -static struct radeon_state *r600_rasterizer(struct r600_context *rctx) +static void r600_rasterizer(struct r600_context *rctx, struct radeon_state *rstate) { const struct pipe_rasterizer_state *state = &rctx->rasterizer->state.rasterizer; const struct pipe_framebuffer_state *fb = 
&rctx->framebuffer->state.framebuffer; + const struct pipe_clip_state *clip = NULL; struct r600_screen *rscreen = rctx->screen; - struct radeon_state *rstate; float offset_units = 0, offset_scale = 0; char depth = 0; unsigned offset_db_fmt_cntl = 0; unsigned tmp; unsigned prov_vtx = 1; + + if (rctx->clip) + clip = &rctx->clip->state.clip; if (fb->zsbuf) { offset_units = state->offset_units; offset_scale = state->offset_scale * 12.0f; @@ -796,7 +874,7 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx) break; default: R600_ERR("unsupported %d\n", fb->zsbuf->texture->format); - return NULL; + return; } } offset_db_fmt_cntl |= S_028DF8_POLY_OFFSET_NEG_NUM_DB_BITS(depth); @@ -805,9 +883,7 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx) prov_vtx = 0; rctx->flat_shade = state->flatshade; - rstate = radeon_state(rscreen->rw, R600_RASTERIZER_TYPE, R600_RASTERIZER); - if (rstate == NULL) - return NULL; + radeon_state_init(rstate, rscreen->rw, R600_STATE_RASTERIZER, 0, 0); rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] = 0x00000001; if (state->sprite_coord_enable) { rstate->states[R600_RASTERIZER__SPI_INTERP_CONTROL_0] |= @@ -821,7 +897,12 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx) S_0286D4_PNT_SPRITE_TOP_1(1); } } - rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = 0x00000000; + rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = 0; + if (clip) { + rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] = S_028810_PS_UCP_MODE(3) | ((1 << clip->nr) - 1); + rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] |= S_028810_ZCLIP_NEAR_DISABLE(clip->depth_clamp); + rstate->states[R600_RASTERIZER__PA_CL_CLIP_CNTL] |= S_028810_ZCLIP_FAR_DISABLE(clip->depth_clamp); + } rstate->states[R600_RASTERIZER__PA_SU_SC_MODE_CNTL] = S_028814_PROVOKING_VTX_LAST(prov_vtx) | S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 
1 : 0) | @@ -835,7 +916,7 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx) S_02881C_VS_OUT_MISC_VEC_ENA(state->point_size_per_vertex); rstate->states[R600_RASTERIZER__PA_CL_NANINF_CNTL] = 0x00000000; /* point size 12.4 fixed point */ - tmp = (unsigned)(state->point_size * 8.0 / 2.0); + tmp = (unsigned)(state->point_size * 8.0); rstate->states[R600_RASTERIZER__PA_SU_POINT_SIZE] = S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp); rstate->states[R600_RASTERIZER__PA_SU_POINT_MINMAX] = 0x80000000; rstate->states[R600_RASTERIZER__PA_SU_LINE_CNTL] = 0x00000008; @@ -852,19 +933,14 @@ static struct radeon_state *r600_rasterizer(struct r600_context *rctx) rstate->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_FRONT_OFFSET] = fui(offset_units); rstate->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_SCALE] = fui(offset_scale); rstate->states[R600_RASTERIZER__PA_SU_POLY_OFFSET_BACK_OFFSET] = fui(offset_units); - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -static struct radeon_state *r600_scissor(struct r600_context *rctx) +static void r600_scissor(struct r600_context *rctx, struct radeon_state *rstate) { const struct pipe_scissor_state *state = &rctx->scissor->state.scissor; const struct pipe_framebuffer_state *fb = &rctx->framebuffer->state.framebuffer; struct r600_screen *rscreen = rctx->screen; - struct radeon_state *rstate; unsigned minx, maxx, miny, maxy; u32 tl, br; @@ -881,9 +957,7 @@ static struct radeon_state *r600_scissor(struct r600_context *rctx) } tl = S_028240_TL_X(minx) | S_028240_TL_Y(miny) | S_028240_WINDOW_OFFSET_DISABLE(1); br = S_028244_BR_X(maxx) | S_028244_BR_Y(maxy); - rstate = radeon_state(rscreen->rw, R600_SCISSOR_TYPE, R600_SCISSOR); - if (rstate == NULL) - return NULL; + radeon_state_init(rstate, rscreen->rw, R600_STATE_SCISSOR, 0, 0); rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_TL] = tl; rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_BR] = br; 
rstate->states[R600_SCISSOR__PA_SC_WINDOW_OFFSET] = 0x00000000; @@ -903,22 +977,14 @@ static struct radeon_state *r600_scissor(struct r600_context *rctx) rstate->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_BR] = br; rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_TL] = tl; rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_BR] = br; - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -static struct radeon_state *r600_viewport(struct r600_context *rctx) +static void r600_viewport(struct r600_context *rctx, struct radeon_state *rstate, const struct pipe_viewport_state *state) { - const struct pipe_viewport_state *state = &rctx->viewport->state.viewport; struct r600_screen *rscreen = rctx->screen; - struct radeon_state *rstate; - rstate = radeon_state(rscreen->rw, R600_VIEWPORT_TYPE, R600_VIEWPORT); - if (rstate == NULL) - return NULL; + radeon_state_init(rstate, rscreen->rw, R600_STATE_VIEWPORT, 0, 0); rstate->states[R600_VIEWPORT__PA_SC_VPORT_ZMIN_0] = 0x00000000; rstate->states[R600_VIEWPORT__PA_SC_VPORT_ZMAX_0] = 0x3F800000; rstate->states[R600_VIEWPORT__PA_CL_VPORT_XSCALE_0] = fui(state->scale[0]); @@ -928,29 +994,28 @@ static struct radeon_state *r600_viewport(struct r600_context *rctx) rstate->states[R600_VIEWPORT__PA_CL_VPORT_YOFFSET_0] = fui(state->translate[1]); rstate->states[R600_VIEWPORT__PA_CL_VPORT_ZOFFSET_0] = fui(state->translate[2]); rstate->states[R600_VIEWPORT__PA_CL_VTE_CNTL] = 0x0000043F; - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -static struct radeon_state *r600_dsa(struct r600_context *rctx) +static void r600_dsa(struct r600_context *rctx, struct radeon_state *rstate) { const struct pipe_depth_stencil_alpha_state *state = &rctx->dsa->state.dsa; const struct pipe_stencil_ref *stencil_ref = &rctx->stencil_ref->state.stencil_ref; struct r600_screen *rscreen = rctx->screen; 
unsigned db_depth_control, alpha_test_control, alpha_ref, db_shader_control; unsigned stencil_ref_mask, stencil_ref_mask_bf; - struct r600_shader *rshader = &rctx->ps_shader->shader; - struct radeon_state *rstate; + struct r600_shader *rshader; int i; - rstate = radeon_state(rscreen->rw, R600_DSA_TYPE, R600_DSA); - if (rstate == NULL) - return NULL; + if (rctx->ps_shader == NULL) { + return; + } + radeon_state_init(rstate, rscreen->rw, R600_STATE_DSA, 0, 0); db_shader_control = 0x210; + rshader = &rctx->ps_shader->shader; + if (rshader->uses_kill) + db_shader_control |= (1 << 6); for (i = 0; i < rshader->noutput; i++) { if (rshader->output[i].name == TGSI_SEMANTIC_POSITION) db_shader_control |= 1; @@ -1008,11 +1073,7 @@ static struct radeon_state *r600_dsa(struct r600_context *rctx) rstate->states[R600_DSA__DB_SRESULTS_COMPARE_STATE1] = 0x00000000; rstate->states[R600_DSA__DB_PRELOAD_CONTROL] = 0x00000000; rstate->states[R600_DSA__DB_ALPHA_TO_MASK] = 0x0000AA00; - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } static inline unsigned r600_tex_wrap(unsigned wrap) @@ -1090,16 +1151,12 @@ static INLINE u32 S_FIXED(float value, u32 frac_bits) return value * (1 << frac_bits); } -static struct radeon_state *r600_sampler(struct r600_context *rctx, - const struct pipe_sampler_state *state, - unsigned id) +static void r600_sampler(struct r600_context *rctx, struct radeon_state *rstate, + const struct pipe_sampler_state *state, unsigned id) { struct r600_screen *rscreen = rctx->screen; - struct radeon_state *rstate; - rstate = radeon_state(rscreen->rw, R600_PS_SAMPLER_TYPE, id); - if (rstate == NULL) - return NULL; + radeon_state_init(rstate, rscreen->rw, R600_STATE_SAMPLER, id, R600_SHADER_PS); rstate->states[R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD0_0] = S_03C000_CLAMP_X(r600_tex_wrap(state->wrap_s)) | S_03C000_CLAMP_Y(r600_tex_wrap(state->wrap_t)) | @@ -1114,11 +1171,7 @@ static struct 
radeon_state *r600_sampler(struct r600_context *rctx, S_03C004_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 6)) | S_03C004_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 6)); rstate->states[R600_PS_SAMPLER__SQ_TEX_SAMPLER_WORD2_0] = S_03C008_TYPE(1); - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } static inline unsigned r600_tex_swizzle(unsigned swizzle) @@ -1160,6 +1213,7 @@ static inline unsigned r600_tex_dim(unsigned dim) case PIPE_TEXTURE_1D: return V_038000_SQ_TEX_DIM_1D; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: return V_038000_SQ_TEX_DIM_2D; case PIPE_TEXTURE_3D: return V_038000_SQ_TEX_DIM_3D; @@ -1168,19 +1222,20 @@ static inline unsigned r600_tex_dim(unsigned dim) } } -static struct radeon_state *r600_resource(struct r600_context *rctx, - const struct pipe_sampler_view *view, - unsigned id) +static void r600_resource(struct pipe_context *ctx, struct radeon_state *rstate, + const struct pipe_sampler_view *view, unsigned id) { + struct r600_context *rctx = r600_context(ctx); struct r600_screen *rscreen = rctx->screen; const struct util_format_description *desc; struct r600_resource_texture *tmp; struct r600_resource *rbuffer; - struct radeon_state *rstate; unsigned format; - uint32_t word4 = 0, yuv_format = 0; - unsigned char swizzle[4]; + uint32_t word4 = 0, yuv_format = 0, pitch = 0; + unsigned char swizzle[4], array_mode = 0, tile_type = 0; + int r; + rstate->cpm4 = 0; swizzle[0] = view->swizzle_r; swizzle[1] = view->swizzle_g; swizzle[2] = view->swizzle_b; @@ -1188,37 +1243,49 @@ static struct radeon_state *r600_resource(struct r600_context *rctx, format = r600_translate_texformat(view->texture->format, swizzle, &word4, &yuv_format); - if (format == ~0) - return NULL; + if (format == ~0) { + return; + } desc = util_format_description(view->texture->format); if (desc == NULL) { R600_ERR("unknow format %d\n", view->texture->format); - return NULL; - } - rstate = 
radeon_state(rscreen->rw, R600_PS_RESOURCE_TYPE, id); - if (rstate == NULL) { - return NULL; + return; } + radeon_state_init(rstate, rscreen->rw, R600_STATE_RESOURCE, id, R600_SHADER_PS); tmp = (struct r600_resource_texture*)view->texture; rbuffer = &tmp->resource; - rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); - rstate->bo[1] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + if (tmp->depth) { + r = r600_texture_from_depth(ctx, tmp, view->first_level); + if (r) { + return; + } + rstate->bo[0] = radeon_bo_incref(rscreen->rw, tmp->uncompressed); + rstate->bo[1] = radeon_bo_incref(rscreen->rw, tmp->uncompressed); + } else { + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + rstate->bo[1] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + } rstate->nbo = 2; rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; rstate->placement[1] = RADEON_GEM_DOMAIN_GTT; rstate->placement[2] = RADEON_GEM_DOMAIN_GTT; rstate->placement[3] = RADEON_GEM_DOMAIN_GTT; + pitch = (tmp->pitch[0] / tmp->bpt); + pitch = (pitch + 0x7) & ~0x7; + /* FIXME properly handle first level != 0 */ rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD0] = S_038000_DIM(r600_tex_dim(view->texture->target)) | - S_038000_PITCH(((tmp->pitch[0] / tmp->bpt) / 8) - 1) | + S_038000_TILE_MODE(array_mode) | + S_038000_TILE_TYPE(tile_type) | + S_038000_PITCH((pitch / 8) - 1) | S_038000_TEX_WIDTH(view->texture->width0 - 1); rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD1] = S_038004_TEX_HEIGHT(view->texture->height0 - 1) | S_038004_TEX_DEPTH(view->texture->depth0 - 1) | S_038004_DATA_FORMAT(format); - rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = 0; + rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD2] = tmp->offset[0] >> 8; rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD3] = tmp->offset[1] >> 8; rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD4] = word4 | @@ -1232,17 +1299,12 @@ static struct radeon_state *r600_resource(struct r600_context *rctx, S_038014_LAST_ARRAY(0); 
rstate->states[R600_PS_RESOURCE__RESOURCE0_WORD6] = S_038018_TYPE(V_038010_SQ_TEX_VTX_VALID_TEXTURE); - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -static struct radeon_state *r600_cb_cntl(struct r600_context *rctx) +static void r600_cb_cntl(struct r600_context *rctx, struct radeon_state *rstate) { struct r600_screen *rscreen = rctx->screen; - struct radeon_state *rstate; const struct pipe_blend_state *pbs = &rctx->blend->state.blend; int nr_cbufs = rctx->framebuffer->state.framebuffer.nr_cbufs; uint32_t color_control, target_mask, shader_mask; @@ -1257,7 +1319,7 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx) } if (pbs->logicop_enable) { - color_control |= (pbs->logicop_func) << 16; + color_control |= (pbs->logicop_func << 16) | (pbs->logicop_func << 20); } else { color_control |= (0xcc << 16); } @@ -1277,7 +1339,7 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx) target_mask |= (pbs->rt[0].colormask << (4 * i)); } } - rstate = radeon_state(rscreen->rw, R600_CB_CNTL_TYPE, R600_CB_CNTL); + radeon_state_init(rstate, rscreen->rw, R600_STATE_CB_CNTL, 0, 0); rstate->states[R600_CB_CNTL__CB_SHADER_MASK] = shader_mask; rstate->states[R600_CB_CNTL__CB_TARGET_MASK] = target_mask; rstate->states[R600_CB_CNTL__CB_COLOR_CONTROL] = color_control; @@ -1289,115 +1351,51 @@ static struct radeon_state *r600_cb_cntl(struct r600_context *rctx) rstate->states[R600_CB_CNTL__CB_CLRCMP_DST] = 0x000000FF; rstate->states[R600_CB_CNTL__CB_CLRCMP_MSK] = 0xFFFFFFFF; rstate->states[R600_CB_CNTL__PA_SC_AA_MASK] = 0xFFFFFFFF; - if (radeon_state_pm4(rstate)) { - radeon_state_decref(rstate); - return NULL; - } - return rstate; + radeon_state_pm4(rstate); } -int r600_context_hw_states(struct r600_context *rctx) +int r600_context_hw_states(struct pipe_context *ctx) { + struct r600_context *rctx = r600_context(ctx); unsigned i; - int r; - int nr_cbufs = 
rctx->framebuffer->state.framebuffer.nr_cbufs; - /* free previous TODO determine what need to be updated, what - * doesn't - */ - //radeon_state_decref(rctx->hw_states.config); - rctx->hw_states.cb_cntl = radeon_state_decref(rctx->hw_states.cb_cntl); - rctx->hw_states.db = radeon_state_decref(rctx->hw_states.db); - rctx->hw_states.rasterizer = radeon_state_decref(rctx->hw_states.rasterizer); - rctx->hw_states.scissor = radeon_state_decref(rctx->hw_states.scissor); - rctx->hw_states.dsa = radeon_state_decref(rctx->hw_states.dsa); - rctx->hw_states.blend = radeon_state_decref(rctx->hw_states.blend); - rctx->hw_states.viewport = radeon_state_decref(rctx->hw_states.viewport); - for (i = 0; i < 8; i++) { - rctx->hw_states.cb[i] = radeon_state_decref(rctx->hw_states.cb[i]); + /* build new states */ + r600_rasterizer(rctx, &rctx->hw_states.rasterizer); + r600_scissor(rctx, &rctx->hw_states.scissor); + r600_dsa(rctx, &rctx->hw_states.dsa); + r600_cb_cntl(rctx, &rctx->hw_states.cb_cntl); + + /* bind states */ + radeon_draw_bind(&rctx->draw, &rctx->hw_states.rasterizer); + radeon_draw_bind(&rctx->draw, &rctx->hw_states.scissor); + radeon_draw_bind(&rctx->draw, &rctx->hw_states.dsa); + radeon_draw_bind(&rctx->draw, &rctx->hw_states.cb_cntl); + + radeon_draw_bind(&rctx->draw, &rctx->config); + + if (rctx->viewport) { + radeon_draw_bind(&rctx->draw, &rctx->viewport->rstate[0]); } - for (i = 0; i < rctx->hw_states.ps_nresource; i++) { - radeon_state_decref(rctx->hw_states.ps_resource[i]); - rctx->hw_states.ps_resource[i] = NULL; + if (rctx->blend) { + radeon_draw_bind(&rctx->draw, &rctx->blend->rstate[0]); } - rctx->hw_states.ps_nresource = 0; - for (i = 0; i < rctx->hw_states.ps_nsampler; i++) { - radeon_state_decref(rctx->hw_states.ps_sampler[i]); - rctx->hw_states.ps_sampler[i] = NULL; + if (rctx->clip) { + radeon_draw_bind(&rctx->draw, &rctx->clip->rstate[0]); } - rctx->hw_states.ps_nsampler = 0; - - /* build new states */ - rctx->hw_states.rasterizer = 
r600_rasterizer(rctx); - rctx->hw_states.scissor = r600_scissor(rctx); - rctx->hw_states.dsa = r600_dsa(rctx); - rctx->hw_states.blend = r600_blend(rctx); - rctx->hw_states.viewport = r600_viewport(rctx); - for (i = 0; i < nr_cbufs; i++) { - rctx->hw_states.cb[i] = r600_cb(rctx, i); + for (i = 0; i < rctx->framebuffer->state.framebuffer.nr_cbufs; i++) { + radeon_draw_bind(&rctx->draw, &rctx->framebuffer->rstate[i+1]); + } + if (rctx->framebuffer->state.framebuffer.zsbuf) { + radeon_draw_bind(&rctx->draw, &rctx->framebuffer->rstate[0]); } - rctx->hw_states.db = r600_db(rctx); - rctx->hw_states.cb_cntl = r600_cb_cntl(rctx); - for (i = 0; i < rctx->ps_nsampler; i++) { if (rctx->ps_sampler[i]) { - rctx->hw_states.ps_sampler[i] = r600_sampler(rctx, - &rctx->ps_sampler[i]->state.sampler, - R600_PS_SAMPLER + i); + radeon_draw_bind(&rctx->draw, rctx->ps_sampler[i]); } } - rctx->hw_states.ps_nsampler = rctx->ps_nsampler; for (i = 0; i < rctx->ps_nsampler_view; i++) { if (rctx->ps_sampler_view[i]) { - rctx->hw_states.ps_resource[i] = r600_resource(rctx, - &rctx->ps_sampler_view[i]->state.sampler_view, - R600_PS_RESOURCE + i); - } - } - rctx->hw_states.ps_nresource = rctx->ps_nsampler_view; - - /* bind states */ - r = radeon_draw_set(rctx->draw, rctx->hw_states.db); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->hw_states.rasterizer); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->hw_states.scissor); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->hw_states.dsa); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->hw_states.blend); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->hw_states.viewport); - if (r) - return r; - for (i = 0; i < nr_cbufs; i++) { - r = radeon_draw_set(rctx->draw, rctx->hw_states.cb[i]); - if (r) - return r; - } - r = radeon_draw_set(rctx->draw, rctx->hw_states.config); - if (r) - return r; - r = radeon_draw_set(rctx->draw, rctx->hw_states.cb_cntl); - if (r) - return r; - for (i = 0; i < 
rctx->hw_states.ps_nresource; i++) { - if (rctx->hw_states.ps_resource[i]) { - r = radeon_draw_set(rctx->draw, rctx->hw_states.ps_resource[i]); - if (r) - return r; - } - } - for (i = 0; i < rctx->hw_states.ps_nsampler; i++) { - if (rctx->hw_states.ps_sampler[i]) { - r = radeon_draw_set(rctx->draw, rctx->hw_states.ps_sampler[i]); - if (r) - return r; + radeon_draw_bind(&rctx->draw, rctx->ps_sampler_view[i]); } } return 0; diff --git a/src/gallium/drivers/r600/r600_state_inlines.h b/src/gallium/drivers/r600/r600_state_inlines.h index f93c20da35e..84866825aab 100644 --- a/src/gallium/drivers/r600/r600_state_inlines.h +++ b/src/gallium/drivers/r600/r600_state_inlines.h @@ -252,6 +252,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format) case PIPE_FORMAT_R8SG8SB8UX8U_NORM: case PIPE_FORMAT_X8B8G8R8_UNORM: case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: return V_0280A0_COLOR_8_8_8_8; case PIPE_FORMAT_R10G10B10A2_UNORM: @@ -262,7 +263,10 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format) case PIPE_FORMAT_Z24X8_UNORM: case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - return V_0280A0_COLOR_24_8; + return V_0280A0_COLOR_8_24; + + case PIPE_FORMAT_R32_FLOAT: + return V_0280A0_COLOR_32_FLOAT; /* 64-bit buffers. */ case PIPE_FORMAT_R16G16B16A16_UNORM: @@ -275,6 +279,7 @@ static INLINE uint32_t r600_translate_colorformat(enum pipe_format format) /* 128-bit buffers. 
*/ case PIPE_FORMAT_R32G32B32_FLOAT: + return V_0280A0_COLOR_32_32_32_FLOAT; case PIPE_FORMAT_R32G32B32A32_FLOAT: return V_0280A0_COLOR_32_32_32_32_FLOAT; diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c index 30d79ebdd6f..b6698e3885c 100644 --- a/src/gallium/drivers/r600/r600_texture.c +++ b/src/gallium/drivers/r600/r600_texture.c @@ -24,6 +24,7 @@ * Jerome Glisse * Corbin Simpson */ +#include <errno.h> #include <pipe/p_screen.h> #include <util/u_format.h> #include <util/u_math.h> @@ -33,10 +34,26 @@ #include "r600_screen.h" #include "r600_context.h" #include "r600_resource.h" +#include "r600_state_inlines.h" #include "r600d.h" extern struct u_resource_vtbl r600_texture_vtbl; +/* Copy from a tiled texture to a detiled one. */ +static void r600_copy_from_tiled_texture(struct pipe_context *ctx, struct r600_transfer *rtransfer) +{ + struct pipe_transfer *transfer = (struct pipe_transfer*)rtransfer; + struct pipe_resource *texture = transfer->resource; + struct pipe_subresource subdst; + + subdst.face = 0; + subdst.level = 0; + ctx->resource_copy_region(ctx, rtransfer->linear_texture, + subdst, 0, 0, 0, texture, transfer->sr, + transfer->box.x, transfer->box.y, transfer->box.z, + transfer->box.width, transfer->box.height); +} + static unsigned long r600_texture_get_offset(struct r600_resource_texture *rtex, unsigned level, unsigned zslice, unsigned face) @@ -65,7 +82,9 @@ static void r600_setup_miptree(struct r600_screen *rscreen, struct r600_resource for (i = 0, offset = 0; i <= ptex->last_level; i++) { w = u_minify(ptex->width0, i); h = u_minify(ptex->height0, i); + h = util_next_power_of_two(h); pitch = util_format_get_stride(ptex->format, align(w, 64)); + pitch = align(pitch, 256); layer_size = pitch * h; if (ptex->target == PIPE_TEXTURE_CUBE) size = layer_size * 6; @@ -74,6 +93,8 @@ static void r600_setup_miptree(struct r600_screen *rscreen, struct r600_resource rtex->offset[i] = offset; rtex->layer_size[i] = 
layer_size; rtex->pitch[i] = pitch; + rtex->width[i] = w; + rtex->height[i] = h; offset += size; } rtex->size = offset; @@ -104,10 +125,22 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, FREE(rtex); return NULL; } - return &resource->base.b; } +static void r600_texture_destroy_state(struct pipe_resource *ptexture) +{ + struct r600_resource_texture *rtexture = (struct r600_resource_texture*)ptexture; + + for (int i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) { + radeon_state_fini(&rtexture->scissor[i]); + radeon_state_fini(&rtexture->db[i]); + for (int j = 0; j < 8; j++) { + radeon_state_fini(&rtexture->cb[j][i]); + } + } +} + static void r600_texture_destroy(struct pipe_screen *screen, struct pipe_resource *ptex) { @@ -118,6 +151,10 @@ static void r600_texture_destroy(struct pipe_screen *screen, if (resource->bo) { radeon_bo_decref(rscreen->rw, resource->bo); } + if (rtex->uncompressed) { + radeon_bo_decref(rscreen->rw, rtex->uncompressed); + } + r600_texture_destroy_state(ptex); FREE(rtex); } @@ -168,7 +205,8 @@ struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, } /* Support only 2D textures without mipmaps */ - if (templ->target != PIPE_TEXTURE_2D || templ->depth0 != 1 || templ->last_level != 0) + if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || + templ->depth0 != 1 || templ->last_level != 0) return NULL; rtex = CALLOC_STRUCT(r600_resource_texture); @@ -181,9 +219,12 @@ struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, pipe_reference_init(&resource->base.b.reference, 1); resource->base.b.screen = screen; resource->bo = bo; + rtex->depth = 0; rtex->pitch_override = whandle->stride; rtex->bpt = util_format_get_blocksize(templ->format); rtex->pitch[0] = whandle->stride; + rtex->width[0] = templ->width0; + rtex->height[0] = templ->height0; rtex->offset[0] = 0; rtex->size = align(rtex->pitch[0] * templ->height0, 64); @@ -205,6 +246,7 @@ struct pipe_transfer* 
r600_texture_get_transfer(struct pipe_context *ctx, const struct pipe_box *box) { struct r600_resource_texture *rtex = (struct r600_resource_texture*)texture; + struct pipe_resource resource; struct r600_transfer *trans; trans = CALLOC_STRUCT(r600_transfer); @@ -216,48 +258,117 @@ struct pipe_transfer* r600_texture_get_transfer(struct pipe_context *ctx, trans->transfer.box = *box; trans->transfer.stride = rtex->pitch[sr.level]; trans->offset = r600_texture_get_offset(rtex, sr.level, box->z, sr.face); + if (rtex->tilled && !rtex->depth) { + resource.target = PIPE_TEXTURE_2D; + resource.format = texture->format; + resource.width0 = box->width; + resource.height0 = box->height; + resource.depth0 = 0; + resource.last_level = 0; + resource.nr_samples = 0; + resource.usage = PIPE_USAGE_DYNAMIC; + resource.bind = 0; + resource.flags = 0; + /* For texture reading, the temporary (detiled) texture is used as + * a render target when blitting from a tiled texture. */ + if (usage & PIPE_TRANSFER_READ) { + resource.bind |= PIPE_BIND_RENDER_TARGET; + } + /* For texture writing, the temporary texture is used as a sampler + * when blitting into a tiled texture. */ + if (usage & PIPE_TRANSFER_WRITE) { + resource.bind |= PIPE_BIND_SAMPLER_VIEW; + } + /* Create the temporary texture. */ + trans->linear_texture = ctx->screen->resource_create(ctx->screen, &resource); + if (trans->linear_texture == NULL) { + R600_ERR("failed to create temporary texture to hold untiled copy\n"); + pipe_resource_reference(&trans->transfer.resource, NULL); + FREE(trans); + return NULL; + } + if (usage & PIPE_TRANSFER_READ) { + /* We cannot map a tiled texture directly because the data is + * in a different order, therefore we do detiling using a blit. */ + r600_copy_from_tiled_texture(ctx, trans); + /* Always referenced in the blit. 
*/ + ctx->flush(ctx, 0, NULL); + } + } return &trans->transfer; } void r600_texture_transfer_destroy(struct pipe_context *ctx, - struct pipe_transfer *trans) + struct pipe_transfer *transfer) { - pipe_resource_reference(&trans->resource, NULL); - FREE(trans); + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; + + if (rtransfer->linear_texture) { + pipe_resource_reference(&rtransfer->linear_texture, NULL); + } + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); } void* r600_texture_transfer_map(struct pipe_context *ctx, struct pipe_transfer* transfer) { struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; - struct r600_resource *resource; + struct radeon_bo *bo; enum pipe_format format = transfer->resource->format; struct r600_screen *rscreen = r600_screen(ctx->screen); + struct r600_resource_texture *rtex; + unsigned long offset = 0; char *map; + int r; r600_flush(ctx, 0, NULL); - - resource = (struct r600_resource *)transfer->resource; - if (radeon_bo_map(rscreen->rw, resource->bo)) { + if (rtransfer->linear_texture) { + bo = ((struct r600_resource *)rtransfer->linear_texture)->bo; + } else { + rtex = (struct r600_resource_texture*)transfer->resource; + if (rtex->depth) { + r = r600_texture_from_depth(ctx, rtex, transfer->sr.level); + if (r) { + return NULL; + } + r600_flush(ctx, 0, NULL); + bo = rtex->uncompressed; + } else { + bo = ((struct r600_resource *)transfer->resource)->bo; + } + offset = rtransfer->offset + + transfer->box.y / util_format_get_blockheight(format) * transfer->stride + + transfer->box.x / util_format_get_blockwidth(format) * util_format_get_blocksize(format); + } + if (radeon_bo_map(rscreen->rw, bo)) { return NULL; } - radeon_bo_wait(rscreen->rw, resource->bo); - - map = resource->bo->data; + radeon_bo_wait(rscreen->rw, bo); - return map + rtransfer->offset + - transfer->box.y / util_format_get_blockheight(format) * transfer->stride + - transfer->box.x / util_format_get_blockwidth(format) 
* util_format_get_blocksize(format); + map = bo->data; + return map + offset; } void r600_texture_transfer_unmap(struct pipe_context *ctx, struct pipe_transfer* transfer) { + struct r600_transfer *rtransfer = (struct r600_transfer*)transfer; struct r600_screen *rscreen = r600_screen(ctx->screen); - struct r600_resource *resource; - - resource = (struct r600_resource *)transfer->resource; - radeon_bo_unmap(rscreen->rw, resource->bo); + struct r600_resource_texture *rtex; + struct radeon_bo *bo; + + if (rtransfer->linear_texture) { + bo = ((struct r600_resource *)rtransfer->linear_texture)->bo; + } else { + rtex = (struct r600_resource_texture*)transfer->resource; + if (rtex->depth) { + bo = rtex->uncompressed; + } else { + bo = ((struct r600_resource *)transfer->resource)->bo; + } + } + radeon_bo_unmap(rscreen->rw, bo); } struct u_resource_vtbl r600_texture_vtbl = @@ -280,51 +391,51 @@ void r600_init_screen_texture_functions(struct pipe_screen *screen) } static unsigned r600_get_swizzle_combined(const unsigned char *swizzle_format, - const unsigned char *swizzle_view) + const unsigned char *swizzle_view) { - unsigned i; - unsigned char swizzle[4]; - unsigned result = 0; - const uint32_t swizzle_shift[4] = { - 16, 19, 22, 25, - }; - const uint32_t swizzle_bit[4] = { - 0, 1, 2, 3, - }; - - if (swizzle_view) { - /* Combine two sets of swizzles. */ - for (i = 0; i < 4; i++) { - swizzle[i] = swizzle_view[i] <= UTIL_FORMAT_SWIZZLE_W ? - swizzle_format[swizzle_view[i]] : swizzle_view[i]; - } - } else { - memcpy(swizzle, swizzle_format, 4); - } - - /* Get swizzle. 
*/ - for (i = 0; i < 4; i++) { - switch (swizzle[i]) { - case UTIL_FORMAT_SWIZZLE_Y: - result |= swizzle_bit[1] << swizzle_shift[i]; - break; - case UTIL_FORMAT_SWIZZLE_Z: - result |= swizzle_bit[2] << swizzle_shift[i]; - break; - case UTIL_FORMAT_SWIZZLE_W: - result |= swizzle_bit[3] << swizzle_shift[i]; - break; - case UTIL_FORMAT_SWIZZLE_0: - result |= V_038010_SQ_SEL_0 << swizzle_shift[i]; - break; - case UTIL_FORMAT_SWIZZLE_1: - result |= V_038010_SQ_SEL_1 << swizzle_shift[i]; - break; - default: /* UTIL_FORMAT_SWIZZLE_X */ - result |= swizzle_bit[0] << swizzle_shift[i]; - } - } - return result; + unsigned i; + unsigned char swizzle[4]; + unsigned result = 0; + const uint32_t swizzle_shift[4] = { + 16, 19, 22, 25, + }; + const uint32_t swizzle_bit[4] = { + 0, 1, 2, 3, + }; + + if (swizzle_view) { + /* Combine two sets of swizzles. */ + for (i = 0; i < 4; i++) { + swizzle[i] = swizzle_view[i] <= UTIL_FORMAT_SWIZZLE_W ? + swizzle_format[swizzle_view[i]] : swizzle_view[i]; + } + } else { + memcpy(swizzle, swizzle_format, 4); + } + + /* Get swizzle. */ + for (i = 0; i < 4; i++) { + switch (swizzle[i]) { + case UTIL_FORMAT_SWIZZLE_Y: + result |= swizzle_bit[1] << swizzle_shift[i]; + break; + case UTIL_FORMAT_SWIZZLE_Z: + result |= swizzle_bit[2] << swizzle_shift[i]; + break; + case UTIL_FORMAT_SWIZZLE_W: + result |= swizzle_bit[3] << swizzle_shift[i]; + break; + case UTIL_FORMAT_SWIZZLE_0: + result |= V_038010_SQ_SEL_0 << swizzle_shift[i]; + break; + case UTIL_FORMAT_SWIZZLE_1: + result |= V_038010_SQ_SEL_1 << swizzle_shift[i]; + break; + default: /* UTIL_FORMAT_SWIZZLE_X */ + result |= swizzle_bit[0] << swizzle_shift[i]; + } + } + return result; } /* texture format translate */ @@ -344,19 +455,21 @@ uint32_t r600_translate_texformat(enum pipe_format format, }; desc = util_format_description(format); + word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view); + /* Colorspace (return non-RGB formats directly). 
*/ switch (desc->colorspace) { /* Depth stencil formats */ case UTIL_FORMAT_COLORSPACE_ZS: switch (format) { case PIPE_FORMAT_Z16_UNORM: - result = V_028010_DEPTH_16; + result = V_0280A0_COLOR_16; goto out_word4; case PIPE_FORMAT_Z24X8_UNORM: - result = V_028010_DEPTH_X8_24; + result = V_0280A0_COLOR_8_24; goto out_word4; case PIPE_FORMAT_Z24_UNORM_S8_USCALED: - result = V_028010_DEPTH_8_24; + result = V_0280A0_COLOR_8_24; goto out_word4; default: goto out_unknown; @@ -382,8 +495,6 @@ uint32_t r600_translate_texformat(enum pipe_format format, break; } - word4 |= r600_get_swizzle_combined(desc->swizzle, swizzle_view); - /* S3TC formats. TODO */ if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { goto out_unknown; @@ -519,9 +630,221 @@ out_word4: *word4_p = word4; if (yuv_format_p) *yuv_format_p = yuv_format; -// fprintf(stderr,"returning %08x %08x %08x\n", result, word4, yuv_format); return result; out_unknown: // R600_ERR("Unable to handle texformat %d %s\n", format, util_format_name(format)); return ~0; } + +int r600_texture_from_depth(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + int r; + + if (!rtexture->depth) { + /* This shouldn't happen maybe print a warning */ + return 0; + } + if (rtexture->uncompressed && !rtexture->dirty) { + /* Uncompressed bo already in good state */ + return 0; + } + + /* allocate uncompressed texture */ + if (rtexture->uncompressed == NULL) { + rtexture->uncompressed = radeon_bo(rscreen->rw, 0, rtexture->size, 4096, NULL); + if (rtexture->uncompressed == NULL) { + return -ENOMEM; + } + } + + /* render a rectangle covering whole buffer to uncompress depth */ + r = r600_blit_uncompress_depth(ctx, rtexture, level); + if (r) { + return r; + } + + rtexture->dirty = 0; + return 0; +} + +static void r600_texture_state_scissor(struct r600_screen *rscreen, + struct r600_resource_texture *rtexture, + unsigned level) +{ + struct radeon_state *rstate = 
&rtexture->scissor[level]; + + radeon_state_init(rstate, rscreen->rw, R600_STATE_SCISSOR, 0, 0); + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_0_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_0_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_1_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_1_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_2_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_2_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_3_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_3_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_CLIPRECT_RULE] = 0x0000FFFF; + rstate->states[R600_SCISSOR__PA_SC_EDGERULE] = 0xAAAAAAAA; + rstate->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_GENERIC_SCISSOR_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_SCREEN_SCISSOR_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_VPORT_SCISSOR_0_TL] = 0x80000000; + rstate->states[R600_SCISSOR__PA_SC_WINDOW_SCISSOR_BR] = S_028244_BR_X(rtexture->width[level]) | S_028244_BR_Y(rtexture->height[level]); + rstate->states[R600_SCISSOR__PA_SC_WINDOW_SCISSOR_TL] = 0x80000000; + + 
radeon_state_pm4(rstate); +} + +static void r600_texture_state_cb(struct r600_screen *rscreen, struct r600_resource_texture *rtexture, unsigned cb, unsigned level) +{ + struct radeon_state *rstate; + struct r600_resource *rbuffer; + unsigned pitch, slice; + unsigned color_info; + unsigned format, swap, ntype; + const struct util_format_description *desc; + + rstate = &rtexture->cb[cb][level]; + radeon_state_init(rstate, rscreen->rw, R600_STATE_CB0 + cb, 0, 0); + rbuffer = &rtexture->resource; + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + pitch = (rtexture->pitch[level] / rtexture->bpt) / 8 - 1; + slice = (rtexture->pitch[level] / rtexture->bpt) * rtexture->height[level] / 64 - 1; + ntype = 0; + desc = util_format_description(rbuffer->base.b.format); + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) + ntype = V_0280A0_NUMBER_SRGB; + format = r600_translate_colorformat(rtexture->resource.base.b.format); + swap = r600_translate_colorswap(rtexture->resource.base.b.format); + if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) { + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rtexture->uncompressed); + rstate->bo[1] = radeon_bo_incref(rscreen->rw, rtexture->uncompressed); + rstate->bo[2] = radeon_bo_incref(rscreen->rw, rtexture->uncompressed); + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + rstate->placement[2] = RADEON_GEM_DOMAIN_GTT; + rstate->placement[4] = RADEON_GEM_DOMAIN_GTT; + rstate->nbo = 3; + color_info = 0; + } else { + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + rstate->bo[1] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + rstate->bo[2] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + rstate->placement[2] = RADEON_GEM_DOMAIN_GTT; + rstate->placement[4] = RADEON_GEM_DOMAIN_GTT; + rstate->nbo = 3; + color_info = S_0280A0_SOURCE_FORMAT(1); + } + color_info |= S_0280A0_FORMAT(format) | + S_0280A0_COMP_SWAP(swap) | + 
S_0280A0_BLEND_CLAMP(1) | + S_0280A0_NUMBER_TYPE(ntype); + rstate->states[R600_CB0__CB_COLOR0_BASE] = rtexture->offset[level] >> 8; + rstate->states[R600_CB0__CB_COLOR0_INFO] = color_info; + rstate->states[R600_CB0__CB_COLOR0_SIZE] = S_028060_PITCH_TILE_MAX(pitch) | + S_028060_SLICE_TILE_MAX(slice); + + radeon_state_pm4(rstate); +} + +static void r600_texture_state_db(struct r600_screen *rscreen, struct r600_resource_texture *rtexture, unsigned level) +{ + struct radeon_state *rstate = &rtexture->db[level]; + struct r600_resource *rbuffer; + unsigned pitch, slice, format; + + radeon_state_init(rstate, rscreen->rw, R600_STATE_DB, 0, 0); + rbuffer = &rtexture->resource; + rtexture->tilled = 1; + rtexture->array_mode = 2; + rtexture->tile_type = 1; + rtexture->depth = 1; + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + pitch = (rtexture->pitch[level] / rtexture->bpt) / 8 - 1; + slice = (rtexture->pitch[level] / rtexture->bpt) * rtexture->height[level] / 64 - 1; + format = r600_translate_dbformat(rbuffer->base.b.format); + rstate->states[R600_DB__DB_DEPTH_BASE] = rtexture->offset[level] >> 8; + rstate->states[R600_DB__DB_DEPTH_INFO] = S_028010_ARRAY_MODE(rtexture->array_mode) | + S_028010_FORMAT(format); + rstate->states[R600_DB__DB_DEPTH_VIEW] = 0x00000000; + rstate->states[R600_DB__DB_PREFETCH_LIMIT] = (rtexture->height[level] / 8) -1; + rstate->states[R600_DB__DB_DEPTH_SIZE] = S_028000_PITCH_TILE_MAX(pitch) | + S_028000_SLICE_TILE_MAX(slice); + rstate->bo[0] = radeon_bo_incref(rscreen->rw, rbuffer->bo); + rstate->placement[0] = RADEON_GEM_DOMAIN_GTT; + rstate->nbo = 1; + + radeon_state_pm4(rstate); +} + +int r600_texture_scissor(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + + if (!rtexture->scissor[level].cpm4) { + r600_texture_state_scissor(rscreen, rtexture, level); + } + return 0; +} + +static 
void r600_texture_state_viewport(struct r600_screen *rscreen, struct r600_resource_texture *rtexture, unsigned level) +{ + struct radeon_state *rstate = &rtexture->viewport[level]; + + radeon_state_init(rstate, rscreen->rw, R600_STATE_VIEWPORT, 0, 0); + + /* set states (most default value are 0 and struct already + * initialized to 0, thus avoid resetting them) + */ + rstate->states[R600_VIEWPORT__PA_CL_VPORT_XOFFSET_0] = fui((float)rtexture->width[level]/2.0); + rstate->states[R600_VIEWPORT__PA_CL_VPORT_XSCALE_0] = fui((float)rtexture->width[level]/2.0); + rstate->states[R600_VIEWPORT__PA_CL_VPORT_YOFFSET_0] = fui((float)rtexture->height[level]/2.0); + rstate->states[R600_VIEWPORT__PA_CL_VPORT_YSCALE_0] = fui((float)-rtexture->height[level]/2.0); + rstate->states[R600_VIEWPORT__PA_CL_VPORT_ZOFFSET_0] = 0x3F000000; + rstate->states[R600_VIEWPORT__PA_CL_VPORT_ZSCALE_0] = 0x3F000000; + rstate->states[R600_VIEWPORT__PA_CL_VTE_CNTL] = 0x0000043F; + rstate->states[R600_VIEWPORT__PA_SC_VPORT_ZMAX_0] = 0x3F800000; + + radeon_state_pm4(rstate); +} + +int r600_texture_cb(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned cb, unsigned level) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + + if (!rtexture->cb[cb][level].cpm4) { + r600_texture_state_cb(rscreen, rtexture, cb, level); + } + return 0; +} + +int r600_texture_db(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + + if (!rtexture->db[level].cpm4) { + r600_texture_state_db(rscreen, rtexture, level); + } + return 0; +} + +int r600_texture_viewport(struct pipe_context *ctx, struct r600_resource_texture *rtexture, unsigned level) +{ + struct r600_screen *rscreen = r600_screen(ctx->screen); + + if (!rtexture->viewport[level].cpm4) { + r600_texture_state_viewport(rscreen, rtexture, level); + } + return 0; +} diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h 
index 53388f822ea..7b9a983d53b 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -199,6 +199,7 @@ #define V_0280A0_COLOR_16_16_16_16_FLOAT 0x00000020 #define V_0280A0_COLOR_32_32_32_32 0x00000022 #define V_0280A0_COLOR_32_32_32_32_FLOAT 0x00000023 +#define V_0280A0_COLOR_32_32_32_FLOAT 0x00000030 #define S_0280A0_ARRAY_MODE(x) (((x) & 0xF) << 8) #define G_0280A0_ARRAY_MODE(x) (((x) >> 8) & 0xF) #define C_0280A0_ARRAY_MODE 0xFFFFF0FF @@ -1316,4 +1317,11 @@ #define G_0286D4_PNT_SPRITE_TOP_1(x) (((x) >> 14) & 0x1) #define C_0286D4_PNT_SPRITE_TOP_1 0xFFFFBFFF +#define SQ_TEX_INST_LD 0x03 +#define SQ_TEX_INST_GET_GRADIENTS_H 0x7 +#define SQ_TEX_INST_GET_GRADIENTS_V 0x8 + +#define SQ_TEX_INST_SAMPLE 0x10 +#define SQ_TEX_INST_SAMPLE_L 0x11 +#define SQ_TEX_INST_SAMPLE_C 0x18 #endif diff --git a/src/gallium/drivers/r600/radeon.h b/src/gallium/drivers/r600/radeon.h index 8f00a4895a0..aaac8de5283 100644 --- a/src/gallium/drivers/r600/radeon.h +++ b/src/gallium/drivers/r600/radeon.h @@ -77,6 +77,14 @@ enum radeon_family { CHIP_LAST, }; +enum { + R600_SHADER_PS = 1, + R600_SHADER_VS, + R600_SHADER_GS, + R600_SHADER_FS, + R600_SHADER_MAX = R600_SHADER_FS, +}; + enum radeon_family radeon_get_family(struct radeon *rw); /* @@ -98,22 +106,23 @@ struct radeon_bo *radeon_bo_incref(struct radeon *radeon, struct radeon_bo *bo); struct radeon_bo *radeon_bo_decref(struct radeon *radeon, struct radeon_bo *bo); int radeon_bo_wait(struct radeon *radeon, struct radeon_bo *bo); +struct radeon_stype_info; /* * states functions */ struct radeon_state { struct radeon *radeon; unsigned refcount; - unsigned type; + struct radeon_stype_info *stype; + unsigned state_id; unsigned id; + unsigned shader_index; unsigned nstates; - u32 *states; + u32 states[64]; unsigned npm4; unsigned cpm4; u32 pm4_crc; - u32 *pm4; - u32 nimmd; - u32 *immd; + u32 pm4[128]; unsigned nbo; struct radeon_bo *bo[4]; unsigned nreloc; @@ -123,38 +132,22 @@ struct radeon_state { unsigned 
bo_dirty[4]; }; -struct radeon_state *radeon_state(struct radeon *radeon, u32 type, u32 id); -struct radeon_state *radeon_state_incref(struct radeon_state *state); -struct radeon_state *radeon_state_decref(struct radeon_state *state); +int radeon_state_init(struct radeon_state *rstate, struct radeon *radeon, u32 type, u32 id, u32 shader_class); +void radeon_state_fini(struct radeon_state *state); int radeon_state_pm4(struct radeon_state *state); +int radeon_state_convert(struct radeon_state *state, u32 stype, u32 id, u32 shader_type); /* * draw functions */ struct radeon_draw { - unsigned refcount; struct radeon *radeon; - unsigned nstate; struct radeon_state **state; - unsigned cpm4; }; -struct radeon_draw *radeon_draw(struct radeon *radeon); -struct radeon_draw *radeon_draw_duplicate(struct radeon_draw *draw); -struct radeon_draw *radeon_draw_incref(struct radeon_draw *draw); -struct radeon_draw *radeon_draw_decref(struct radeon_draw *draw); -int radeon_draw_set(struct radeon_draw *draw, struct radeon_state *state); -int radeon_draw_set_new(struct radeon_draw *draw, struct radeon_state *state); -int radeon_draw_check(struct radeon_draw *draw); - -struct radeon_ctx *radeon_ctx(struct radeon *radeon); -struct radeon_ctx *radeon_ctx_decref(struct radeon_ctx *ctx); -struct radeon_ctx *radeon_ctx_incref(struct radeon_ctx *ctx); -int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw); -int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw); -int radeon_ctx_pm4(struct radeon_ctx *ctx); -int radeon_ctx_submit(struct radeon_ctx *ctx); -void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file); +int radeon_draw_init(struct radeon_draw *draw, struct radeon *radeon); +void radeon_draw_bind(struct radeon_draw *draw, struct radeon_state *state); +void radeon_draw_unbind(struct radeon_draw *draw, struct radeon_state *state); /* * radeon context functions @@ -169,95 +162,57 @@ struct radeon_cs_reloc { #pragma pack() struct 
radeon_ctx { - int refcount; struct radeon *radeon; u32 *pm4; - u32 cpm4; - u32 draw_cpm4; - unsigned id; - unsigned next_id; + int cdwords; + int ndwords; unsigned nreloc; struct radeon_cs_reloc *reloc; unsigned nbo; struct radeon_bo **bo; - unsigned ndraw; - struct radeon_draw *cdraw; - struct radeon_draw **draw; - unsigned nstate; - struct radeon_state **state; }; +int radeon_ctx_init(struct radeon_ctx *ctx, struct radeon *radeon); +void radeon_ctx_fini(struct radeon_ctx *ctx); +void radeon_ctx_clear(struct radeon_ctx *ctx); +int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw); +int radeon_ctx_submit(struct radeon_ctx *ctx); +void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file); +int radeon_ctx_set_query_state(struct radeon_ctx *ctx, struct radeon_state *state); + /* * R600/R700 */ -#define R600_NSTATE 1280 -#define R600_NTYPE 32 +enum r600_stype { + R600_STATE_CONFIG, + R600_STATE_CB_CNTL, + R600_STATE_RASTERIZER, + R600_STATE_VIEWPORT, + R600_STATE_SCISSOR, + R600_STATE_BLEND, + R600_STATE_DSA, + R600_STATE_SHADER, /* has PS,VS,GS,FS variants */ + R600_STATE_CONSTANT, /* has PS,VS,GS,FS variants */ + R600_STATE_RESOURCE, /* has PS,VS,GS,FS variants */ + R600_STATE_SAMPLER, /* has PS,VS,GS,FS variants */ + R600_STATE_SAMPLER_BORDER, /* has PS,VS,GS,FS variants */ + R600_STATE_CB0, + R600_STATE_CB1, + R600_STATE_CB2, + R600_STATE_CB3, + R600_STATE_CB4, + R600_STATE_CB5, + R600_STATE_CB6, + R600_STATE_CB7, + R600_STATE_DB, + R600_STATE_QUERY_BEGIN, + R600_STATE_QUERY_END, + R600_STATE_UCP, + R600_STATE_VGT, + R600_STATE_DRAW, +}; -#define R600_CONFIG 0 -#define R600_CONFIG_TYPE 0 -#define R600_CB_CNTL 1 -#define R600_CB_CNTL_TYPE 1 -#define R600_RASTERIZER 2 -#define R600_RASTERIZER_TYPE 2 -#define R600_VIEWPORT 3 -#define R600_VIEWPORT_TYPE 3 -#define R600_SCISSOR 4 -#define R600_SCISSOR_TYPE 4 -#define R600_BLEND 5 -#define R600_BLEND_TYPE 5 -#define R600_DSA 6 -#define R600_DSA_TYPE 6 -#define R600_VS_SHADER 7 -#define 
R600_VS_SHADER_TYPE 7 -#define R600_PS_SHADER 8 -#define R600_PS_SHADER_TYPE 8 -#define R600_PS_CONSTANT 9 -#define R600_PS_CONSTANT_TYPE 9 -#define R600_VS_CONSTANT 265 -#define R600_VS_CONSTANT_TYPE 10 -#define R600_PS_RESOURCE 521 -#define R600_PS_RESOURCE_TYPE 11 -#define R600_VS_RESOURCE 681 -#define R600_VS_RESOURCE_TYPE 12 -#define R600_FS_RESOURCE 841 -#define R600_FS_RESOURCE_TYPE 13 -#define R600_GS_RESOURCE 1001 -#define R600_GS_RESOURCE_TYPE 14 -#define R600_PS_SAMPLER 1161 -#define R600_PS_SAMPLER_TYPE 15 -#define R600_VS_SAMPLER 1179 -#define R600_VS_SAMPLER_TYPE 16 -#define R600_GS_SAMPLER 1197 -#define R600_GS_SAMPLER_TYPE 17 -#define R600_PS_SAMPLER_BORDER 1215 -#define R600_PS_SAMPLER_BORDER_TYPE 18 -#define R600_VS_SAMPLER_BORDER 1233 -#define R600_VS_SAMPLER_BORDER_TYPE 19 -#define R600_GS_SAMPLER_BORDER 1251 -#define R600_GS_SAMPLER_BORDER_TYPE 20 -#define R600_CB0 1269 -#define R600_CB0_TYPE 21 -#define R600_CB1 1270 -#define R600_CB1_TYPE 22 -#define R600_CB2 1271 -#define R600_CB2_TYPE 23 -#define R600_CB3 1272 -#define R600_CB3_TYPE 24 -#define R600_CB4 1273 -#define R600_CB4_TYPE 25 -#define R600_CB5 1274 -#define R600_CB5_TYPE 26 -#define R600_CB6 1275 -#define R600_CB6_TYPE 27 -#define R600_CB7 1276 -#define R600_CB7_TYPE 28 -#define R600_DB 1277 -#define R600_DB_TYPE 29 -#define R600_VGT 1278 -#define R600_VGT_TYPE 30 -#define R600_DRAW 1279 -#define R600_DRAW_TYPE 31 /* R600_CONFIG */ #define R600_CONFIG__SQ_CONFIG 0 #define R600_CONFIG__SQ_GPR_RESOURCE_MGMT_1 1 @@ -639,9 +594,40 @@ struct radeon_ctx { /* R600_DRAW */ #define R600_DRAW__VGT_NUM_INDICES 0 #define R600_DRAW__VGT_DMA_BASE_HI 1 -#define R600_DRAW__VGT_DMA_BASE 2 +#define R600_DRAW__VGT_DMA_BASE 2 #define R600_DRAW__VGT_DRAW_INITIATOR 3 -#define R600_DRAW_SIZE 4 -#define R600_DRAW_PM4 128 +#define R600_DRAW_SIZE 4 +#define R600_DRAW_PM4 128 +/* R600_CLIP */ +#define R600_CLIP__PA_CL_UCP_X_0 0 +#define R600_CLIP__PA_CL_UCP_Y_0 1 +#define R600_CLIP__PA_CL_UCP_Z_0 2 +#define 
R600_CLIP__PA_CL_UCP_W_0 3 +#define R600_CLIP__PA_CL_UCP_X_1 4 +#define R600_CLIP__PA_CL_UCP_Y_1 5 +#define R600_CLIP__PA_CL_UCP_Z_1 6 +#define R600_CLIP__PA_CL_UCP_W_1 7 +#define R600_CLIP__PA_CL_UCP_X_2 8 +#define R600_CLIP__PA_CL_UCP_Y_2 9 +#define R600_CLIP__PA_CL_UCP_Z_2 10 +#define R600_CLIP__PA_CL_UCP_W_2 11 +#define R600_CLIP__PA_CL_UCP_X_3 12 +#define R600_CLIP__PA_CL_UCP_Y_3 13 +#define R600_CLIP__PA_CL_UCP_Z_3 14 +#define R600_CLIP__PA_CL_UCP_W_3 15 +#define R600_CLIP__PA_CL_UCP_X_4 16 +#define R600_CLIP__PA_CL_UCP_Y_4 17 +#define R600_CLIP__PA_CL_UCP_Z_4 18 +#define R600_CLIP__PA_CL_UCP_W_4 19 +#define R600_CLIP__PA_CL_UCP_X_5 20 +#define R600_CLIP__PA_CL_UCP_Y_5 21 +#define R600_CLIP__PA_CL_UCP_Z_5 22 +#define R600_CLIP__PA_CL_UCP_W_5 23 +#define R600_CLIP_SIZE 24 +#define R600_CLIP_PM4 128 +/* R600 QUERY BEGIN/END */ +#define R600_QUERY__OFFSET 0 +#define R600_QUERY_SIZE 1 +#define R600_QUERY_PM4 128 #endif diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c index 386c8acb8ce..01b4ca985d0 100644 --- a/src/gallium/drivers/softpipe/sp_draw_arrays.c +++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c @@ -75,14 +75,10 @@ softpipe_draw_stream_output(struct pipe_context *pipe, unsigned mode) buf = (void*)((int32_t*)buf + offset); draw_set_mapped_vertex_buffer(draw, 0, buf); - draw_set_mapped_element_buffer_range(draw, - 0, 0, - start, - start + count - 1, - NULL); + draw_set_mapped_index_buffer(draw, NULL); /* draw! 
*/ - draw_arrays_instanced(draw, mode, start, count, 0, 1); + draw_arrays(draw, mode, start, count); /* unmap vertex/index buffers - will cause draw module to flush */ draw_set_mapped_vertex_buffer(draw, 0, NULL); @@ -138,28 +134,20 @@ softpipe_draw_vbo(struct pipe_context *pipe, } /* Map index buffer, if present */ - if (info->indexed && sp->index_buffer.buffer) { - char *indices = (char *) softpipe_resource(sp->index_buffer.buffer)->data; - mapped_indices = (void *) (indices + sp->index_buffer.offset); - } + if (info->indexed && sp->index_buffer.buffer) + mapped_indices = softpipe_resource(sp->index_buffer.buffer)->data; - draw_set_mapped_element_buffer_range(draw, (mapped_indices) ? - sp->index_buffer.index_size : 0, - info->index_bias, - info->min_index, - info->max_index, - mapped_indices); + draw_set_mapped_index_buffer(draw, mapped_indices); /* draw! */ - draw_arrays_instanced(draw, info->mode, info->start, info->count, - info->start_instance, info->instance_count); + draw_vbo(draw, info); /* unmap vertex/index buffers - will cause draw module to flush */ for (i = 0; i < sp->num_vertex_buffers; i++) { draw_set_mapped_vertex_buffer(draw, i, NULL); } if (mapped_indices) { - draw_set_mapped_element_buffer(draw, 0, 0, NULL); + draw_set_mapped_index_buffer(draw, NULL); } /* diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c index 4a53ef048f3..1071011db0e 100644 --- a/src/gallium/drivers/softpipe/sp_flush.c +++ b/src/gallium/drivers/softpipe/sp_flush.c @@ -31,6 +31,7 @@ #include "pipe/p_defines.h" +#include "pipe/p_screen.h" #include "draw/draw_context.h" #include "sp_flush.h" #include "sp_context.h" diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 93af6ee5b02..73ae2dea561 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -199,6 +199,7 @@ softpipe_is_format_supported( struct pipe_screen *screen, assert(target == 
PIPE_BUFFER || target == PIPE_TEXTURE_1D || target == PIPE_TEXTURE_2D || + target == PIPE_TEXTURE_RECT || target == PIPE_TEXTURE_3D || target == PIPE_TEXTURE_CUBE); diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c index 880a7c7cd26..b650fcaea5c 100644 --- a/src/gallium/drivers/softpipe/sp_state_vertex.c +++ b/src/gallium/drivers/softpipe/sp_state_vertex.c @@ -100,5 +100,5 @@ softpipe_set_index_buffer(struct pipe_context *pipe, else memset(&softpipe->index_buffer, 0, sizeof(softpipe->index_buffer)); - /* TODO make this more like a state */ + draw_set_index_buffer(softpipe->draw, ib); } diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index cf7ab81405c..e654bb77c29 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -1785,6 +1785,7 @@ get_lambda_func(const union sp_sampler_key key) case PIPE_TEXTURE_1D: return compute_lambda_1d; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: case PIPE_TEXTURE_CUBE: return compute_lambda_2d; case PIPE_TEXTURE_3D: @@ -1809,6 +1810,7 @@ get_img_filter(const union sp_sampler_key key, return img_filter_1d_linear; break; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: /* Try for fast path: */ if (key.bits.is_pot && diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c index 7b2dfe25496..e975f3b02fa 100644 --- a/src/gallium/drivers/svga/svga_cmd.c +++ b/src/gallium/drivers/svga/svga_cmd.c @@ -67,7 +67,7 @@ void surface_to_surfaceid(struct svga_winsys_context *swc, // IN id->mipmap = s->real_level; } else { - id->sid = SVGA3D_INVALID_ID; + swc->surface_relocation(swc, &id->sid, NULL, flags); id->face = 0; id->mipmap = 0; } diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index 3b30b9e341e..cd3f6b89825 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ 
b/src/gallium/drivers/svga/svga_context.c @@ -214,6 +214,11 @@ void svga_context_flush( struct svga_context *svga, svga_screen_cache_flush(svgascreen, fence); + /* To force the reemission of rendertargets and texture bindings at + * the beginning of every command buffer. + */ + svga->dirty |= SVGA_NEW_COMMAND_BUFFER; + if (SVGA_DEBUG & DEBUG_SYNC) { if (fence) svga->pipe.screen->fence_finish( svga->pipe.screen, fence, 0); diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index 67a7614c8af..1fb5a04887f 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -382,6 +382,7 @@ struct svga_context #define SVGA_NEW_ZERO_STRIDE 0x2000000 #define SVGA_NEW_TEXTURE_FLAGS 0x4000000 #define SVGA_NEW_STENCIL_REF 0x8000000 +#define SVGA_NEW_COMMAND_BUFFER 0x10000000 diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c index de08bc5e562..001ec3616c4 100644 --- a/src/gallium/drivers/svga/svga_pipe_draw.c +++ b/src/gallium/drivers/svga/svga_pipe_draw.c @@ -146,23 +146,15 @@ retry: } - - - static void -svga_draw_range_elements( struct pipe_context *pipe, - struct pipe_resource *index_buffer, - unsigned index_size, - int index_bias, - unsigned min_index, - unsigned max_index, - unsigned prim, unsigned start, unsigned count) +svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct svga_context *svga = svga_context( pipe ); - unsigned reduced_prim = u_reduced_prim(prim); + unsigned reduced_prim = u_reduced_prim( info->mode ); + unsigned count = info->count; enum pipe_error ret = 0; - if (!u_trim_pipe_prim( prim, &count )) + if (!u_trim_pipe_prim( info->mode, &count )) return; /* @@ -187,34 +179,32 @@ svga_draw_range_elements( struct pipe_context *pipe, return; #endif - if (svga->state.sw.need_swtnl) - { - ret = svga_swtnl_draw_range_elements( svga, - index_buffer, - index_size, - index_bias, - min_index, max_index, - prim, 
- start, count ); + if (svga->state.sw.need_swtnl) { + ret = svga_swtnl_draw_vbo( svga, info ); } else { - if (index_buffer) { + if (info->indexed && svga->curr.ib.buffer) { + unsigned offset; + + assert(svga->curr.ib.offset % svga->curr.ib.index_size == 0); + offset = svga->curr.ib.offset / svga->curr.ib.index_size; + ret = retry_draw_range_elements( svga, - index_buffer, - index_size, - index_bias, - min_index, - max_index, - prim, - start, - count, + svga->curr.ib.buffer, + svga->curr.ib.index_size, + info->index_bias, + info->min_index, + info->max_index, + info->mode, + info->start + offset, + info->count, TRUE ); } else { - ret = retry_draw_arrays( svga, - prim, - start, - count, + ret = retry_draw_arrays( svga, + info->mode, + info->start, + info->count, TRUE ); } } @@ -226,30 +216,6 @@ svga_draw_range_elements( struct pipe_context *pipe, } -static void -svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) -{ - struct svga_context *svga = svga_context(pipe); - - if (info->indexed && svga->curr.ib.buffer) { - unsigned offset; - - assert(svga->curr.ib.offset % svga->curr.ib.index_size == 0); - offset = svga->curr.ib.offset / svga->curr.ib.index_size; - - svga_draw_range_elements(pipe, svga->curr.ib.buffer, - svga->curr.ib.index_size, info->index_bias, - info->min_index, info->max_index, - info->mode, info->start + offset, info->count); - } - else { - svga_draw_range_elements(pipe, NULL, 0, 0, - info->min_index, info->max_index, - info->mode, info->start, info->count); - } -} - - void svga_init_draw_functions( struct svga_context *svga ) { svga->pipe.draw_vbo = svga_draw_vbo; diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c index ff83c750aaf..26eb03a895a 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.c +++ b/src/gallium/drivers/svga/svga_resource_texture.c @@ -583,7 +583,8 @@ svga_texture_from_handle(struct pipe_screen *screen, assert(screen); /* Only supports 
one type */ - if (template->target != PIPE_TEXTURE_2D || + if ((template->target != PIPE_TEXTURE_2D && + template->target != PIPE_TEXTURE_RECT) || template->last_level != 0 || template->depth0 != 1) { return NULL; diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c index bd92f003432..fcbb35e7972 100644 --- a/src/gallium/drivers/svga/svga_state_framebuffer.c +++ b/src/gallium/drivers/svga/svga_state_framebuffer.c @@ -43,15 +43,18 @@ static int emit_framebuffer( struct svga_context *svga, { const struct pipe_framebuffer_state *curr = &svga->curr.framebuffer; struct pipe_framebuffer_state *hw = &svga->state.hw_clear.framebuffer; + boolean reemit = !!(dirty & SVGA_NEW_COMMAND_BUFFER); unsigned i; enum pipe_error ret; - /* XXX: Need shadow state in svga->hw to eliminate redundant - * uploads, especially of NULL buffers. + /* + * We need to reemit non-null surface bindings, even when they are not + * dirty, to ensure that the resources are paged in. 
*/ for(i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) { - if (curr->cbufs[i] != hw->cbufs[i]) { + if (curr->cbufs[i] != hw->cbufs[i] || + (reemit && hw->cbufs[i])) { if (svga->curr.nr_fbs++ > 8) return PIPE_ERROR_OUT_OF_MEMORY; @@ -64,7 +67,8 @@ static int emit_framebuffer( struct svga_context *svga, } - if (curr->zsbuf != hw->zsbuf) { + if (curr->zsbuf != hw->zsbuf || + (reemit && hw->zsbuf)) { ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_DEPTH, curr->zsbuf); if (ret != PIPE_OK) return ret; @@ -92,7 +96,8 @@ static int emit_framebuffer( struct svga_context *svga, struct svga_tracked_state svga_hw_framebuffer = { "hw framebuffer state", - SVGA_NEW_FRAME_BUFFER, + SVGA_NEW_FRAME_BUFFER | + SVGA_NEW_COMMAND_BUFFER, emit_framebuffer }; diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c index 76a2dae1435..4a50b19474c 100644 --- a/src/gallium/drivers/svga/svga_state_tss.c +++ b/src/gallium/drivers/svga/svga_state_tss.c @@ -56,6 +56,7 @@ static int update_tss_binding(struct svga_context *svga, unsigned dirty ) { + boolean reemit = !!(dirty & SVGA_NEW_COMMAND_BUFFER); unsigned i; unsigned count = MAX2( svga->curr.num_sampler_views, svga->state.hw_draw.num_views ); @@ -107,12 +108,18 @@ update_tss_binding(struct svga_context *svga, max_lod); } - if (view->dirty) { + /* + * We need to reemit non-null texture bindings, even when they are not + * dirty, to ensure that the resources are paged in. 
+ */ + + if (view->dirty || + (reemit && view->v)) { queue.bind[queue.bind_count].unit = i; queue.bind[queue.bind_count].view = view; queue.bind_count++; } - else if (view->v) { + if (!view->dirty && view->v) { svga_validate_sampler_view(svga, view->v); } } @@ -128,18 +135,21 @@ update_tss_binding(struct svga_context *svga, goto fail; for (i = 0; i < queue.bind_count; i++) { + struct svga_winsys_surface *handle; + ts[i].stage = queue.bind[i].unit; ts[i].name = SVGA3D_TS_BIND_TEXTURE; if (queue.bind[i].view->v) { - svga->swc->surface_relocation(svga->swc, - &ts[i].value, - queue.bind[i].view->v->handle, - SVGA_RELOC_READ); + handle = queue.bind[i].view->v->handle; } else { - ts[i].value = SVGA3D_INVALID_ID; + handle = NULL; } + svga->swc->surface_relocation(svga->swc, + &ts[i].value, + handle, + SVGA_RELOC_READ); queue.bind[i].view->dirty = FALSE; } @@ -157,7 +167,8 @@ fail: struct svga_tracked_state svga_hw_tss_binding = { "texture binding emit", SVGA_NEW_TEXTURE_BINDING | - SVGA_NEW_SAMPLER, + SVGA_NEW_SAMPLER | + SVGA_NEW_COMMAND_BUFFER, update_tss_binding }; diff --git a/src/gallium/drivers/svga/svga_swtnl.h b/src/gallium/drivers/svga/svga_swtnl.h index 65c675f99c9..fc094e51428 100644 --- a/src/gallium/drivers/svga/svga_swtnl.h +++ b/src/gallium/drivers/svga/svga_swtnl.h @@ -38,15 +38,8 @@ void svga_destroy_swtnl( struct svga_context *svga ); enum pipe_error -svga_swtnl_draw_range_elements(struct svga_context *svga, - struct pipe_resource *indexBuffer, - unsigned indexSize, - int indexBias, - unsigned min_index, - unsigned max_index, - unsigned prim, - unsigned start, - unsigned count); +svga_swtnl_draw_vbo(struct svga_context *svga, + const struct pipe_draw_info *info); #endif diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c index eb71c23195b..814e8edd70f 100644 --- a/src/gallium/drivers/svga/svga_swtnl_draw.c +++ b/src/gallium/drivers/svga/svga_swtnl_draw.c @@ -36,13 +36,8 @@ enum pipe_error 
-svga_swtnl_draw_range_elements(struct svga_context *svga, - struct pipe_resource *indexBuffer, - unsigned indexSize, - int indexBias, - unsigned min_index, - unsigned max_index, - unsigned prim, unsigned start, unsigned count) +svga_swtnl_draw_vbo(struct svga_context *svga, + const struct pipe_draw_info *info) { struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS]; struct pipe_transfer *ib_transfer = NULL; @@ -76,19 +71,18 @@ svga_swtnl_draw_range_elements(struct svga_context *svga, draw_set_mapped_vertex_buffer(draw, i, map); } + /* TODO move this to update_swtnl_draw */ + draw_set_index_buffer(draw, &svga->curr.ib); + /* Map index buffer, if present */ - if (indexBuffer) { - map = pipe_buffer_map(&svga->pipe, indexBuffer, + map = NULL; + if (info->indexed && svga->curr.ib.buffer) { + map = pipe_buffer_map(&svga->pipe, svga->curr.ib.buffer, PIPE_TRANSFER_READ, - &ib_transfer); - - draw_set_mapped_element_buffer_range(draw, - indexSize, indexBias, - min_index, - max_index, - map); + &ib_transfer); } - + draw_set_mapped_index_buffer(draw, map); + if (svga->curr.cb[PIPE_SHADER_VERTEX]) { map = pipe_buffer_map(&svga->pipe, svga->curr.cb[PIPE_SHADER_VERTEX], @@ -101,7 +95,7 @@ svga_swtnl_draw_range_elements(struct svga_context *svga, svga->curr.cb[PIPE_SHADER_VERTEX]->width0); } - draw_arrays(svga->swtnl.draw, prim, start, count); + draw_vbo(draw, info); draw_flush(svga->swtnl.draw); @@ -117,9 +111,9 @@ svga_swtnl_draw_range_elements(struct svga_context *svga, draw_set_mapped_vertex_buffer(draw, i, NULL); } - if (indexBuffer) { - pipe_buffer_unmap(&svga->pipe, indexBuffer, ib_transfer); - draw_set_mapped_element_buffer(draw, 0, 0, NULL); + if (ib_transfer) { + pipe_buffer_unmap(&svga->pipe, svga->curr.ib.buffer, ib_transfer); + draw_set_mapped_index_buffer(draw, NULL); } if (svga->curr.cb[PIPE_SHADER_VERTEX]) { @@ -157,7 +151,8 @@ boolean svga_init_swtnl( struct svga_context *svga ) draw_install_aapoint_stage(svga->swtnl.draw, &svga->pipe); 
draw_install_pstipple_stage(svga->swtnl.draw, &svga->pipe); - draw_set_driver_clipping(svga->swtnl.draw, debug_get_bool_option("SVGA_SWTNL_FSE", FALSE)); + if (debug_get_bool_option("SVGA_SWTNL_FSE", FALSE)) + draw_set_driver_clipping(svga->swtnl.draw, TRUE, TRUE); return TRUE; diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h index 48eced2ecea..b4e90a957d0 100644 --- a/src/gallium/drivers/svga/svga_tgsi_emit.h +++ b/src/gallium/drivers/svga/svga_tgsi_emit.h @@ -353,6 +353,7 @@ static INLINE ubyte svga_tgsi_sampler_type( struct svga_shader_emitter *emit, case PIPE_TEXTURE_1D: return SVGA3DSAMP_2D; case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_RECT: return SVGA3DSAMP_2D; case PIPE_TEXTURE_3D: return SVGA3DSAMP_VOLUME; diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c index 67e1f22a701..72dccdf1502 100644 --- a/src/gallium/drivers/svga/svga_tgsi_insn.c +++ b/src/gallium/drivers/svga/svga_tgsi_insn.c @@ -806,6 +806,20 @@ static boolean emit_cmp(struct svga_shader_emitter *emit, const struct src_register src2 = translate_src_register( emit, &insn->Src[2] ); + if (emit->unit == PIPE_SHADER_VERTEX) { + SVGA3dShaderDestToken temp = get_temp(emit); + struct src_register zero = scalar(get_zero_immediate(emit), TGSI_SWIZZLE_X); + + /* Since vertex shaders don't support the CMP instruction, + * simulate it with SLT and LRP instructions. 
+ * SLT TMP, SRC0, 0.0 + * LRP DST, TMP, SRC1, SRC2 + */ + if (!submit_op2(emit, inst_token(SVGA3DOP_SLT), temp, src0, zero)) + return FALSE; + return submit_op3(emit, inst_token(SVGA3DOP_LRP), dst, src(temp), src1, src2); + } + /* CMP DST, SRC0, SRC2, SRC1 */ return submit_op3( emit, inst_token( SVGA3DOP_CMP ), dst, src0, src2, src1); } @@ -2682,6 +2696,11 @@ needs_to_create_zero( struct svga_shader_emitter *emit ) return TRUE; } + if (emit->unit == PIPE_SHADER_VERTEX) { + if (emit->info.opcode_count[TGSI_OPCODE_CMP] >= 1) + return TRUE; + } + if (emit->info.opcode_count[TGSI_OPCODE_IF] >= 1 || emit->info.opcode_count[TGSI_OPCODE_BGNLOOP] >= 1 || emit->info.opcode_count[TGSI_OPCODE_DDX] >= 1 || diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c index 84e5a6a8242..271cd4aff5e 100644 --- a/src/gallium/drivers/trace/tr_context.c +++ b/src/gallium/drivers/trace/tr_context.c @@ -885,7 +885,7 @@ trace_sampler_view_destroy(struct pipe_context *_pipe, trace_dump_arg(ptr, pipe); trace_dump_arg(ptr, view); - pipe->sampler_view_destroy(pipe, view); + pipe_sampler_view_reference(&tr_view->sampler_view, NULL); trace_dump_call_end(); @@ -1002,7 +1002,7 @@ trace_context_set_index_buffer(struct pipe_context *_pipe, trace_dump_call_begin("pipe_context", "set_index_buffer"); trace_dump_arg(ptr, pipe); - trace_dump_arg(index_buffer, ib); + trace_dump_arg(index_buffer, _ib); pipe->set_index_buffer(pipe, ib); @@ -1063,7 +1063,10 @@ trace_context_clear(struct pipe_context *_pipe, trace_dump_arg(ptr, pipe); trace_dump_arg(uint, buffers); - trace_dump_arg_array(float, rgba, 4); + if (rgba) + trace_dump_arg_array(float, rgba, 4); + else + trace_dump_null(); trace_dump_arg(float, depth); trace_dump_arg(uint, stencil); diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h index 1fa3ec8300a..0a5be43f6bf 100644 --- a/src/gallium/include/pipe/p_compiler.h +++ b/src/gallium/include/pipe/p_compiler.h @@ -79,6 +79,14 
@@ typedef unsigned char boolean; #define FALSE false #endif +#ifndef va_copy +#ifdef __va_copy +#define va_copy(dest, src) __va_copy((dest), (src)) +#else +#define va_copy(dest, src) (dest) = (src) +#endif +#endif + /* Function inlining */ #ifndef INLINE # ifdef __cplusplus diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index 0579962ec69..0e53aef6d2e 100644 --- a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -28,19 +28,37 @@ #ifndef PIPE_CONTEXT_H #define PIPE_CONTEXT_H -#include "p_state.h" - +#include "p_compiler.h" #ifdef __cplusplus extern "C" { #endif - -struct pipe_screen; + +struct pipe_blend_color; +struct pipe_blend_state; +struct pipe_box; +struct pipe_clip_state; +struct pipe_depth_stencil_alpha_state; +struct pipe_draw_info; struct pipe_fence_handle; -struct pipe_state_cache; +struct pipe_framebuffer_state; +struct pipe_index_buffer; struct pipe_query; -struct pipe_winsys; +struct pipe_poly_stipple; +struct pipe_rasterizer_state; +struct pipe_resource; +struct pipe_sampler_state; +struct pipe_sampler_view; +struct pipe_scissor_state; +struct pipe_shader_state; +struct pipe_stencil_ref; +struct pipe_stream_output_state; +struct pipe_subresource; +struct pipe_surface; +struct pipe_vertex_buffer; +struct pipe_vertex_element; +struct pipe_viewport_state; /** * Gallium rendering context. Basically: diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 00aa2076ed5..627b5ae5380 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -28,7 +28,7 @@ #ifndef PIPE_DEFINES_H #define PIPE_DEFINES_H -#include "p_format.h" +#include "p_compiler.h" #ifdef __cplusplus extern "C" { @@ -135,13 +135,15 @@ enum pipe_error { #define PIPE_STENCIL_OP_DECR_WRAP 6 #define PIPE_STENCIL_OP_INVERT 7 -/** Texture types */ +/** Texture types. 
+ * See the documentation for info on PIPE_TEXTURE_RECT vs PIPE_TEXTURE_2D */ enum pipe_texture_target { PIPE_BUFFER = 0, PIPE_TEXTURE_1D = 1, PIPE_TEXTURE_2D = 2, PIPE_TEXTURE_3D = 3, PIPE_TEXTURE_CUBE = 4, + PIPE_TEXTURE_RECT = 5, PIPE_MAX_TEXTURE_TYPES }; diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h index 436c3f627a8..06412f4894c 100644 --- a/src/gallium/include/pipe/p_format.h +++ b/src/gallium/include/pipe/p_format.h @@ -29,8 +29,6 @@ #ifndef PIPE_FORMAT_H #define PIPE_FORMAT_H -#include "p_compiler.h" - #ifdef __cplusplus extern "C" { #endif diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h index 9df20ea8581..c4bd17e92bb 100644 --- a/src/gallium/include/pipe/p_shader_tokens.h +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -33,8 +33,6 @@ extern "C" { #endif -#include "p_compiler.h" - struct tgsi_header { diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index 0f1a44cde42..9a2b31da50d 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -43,7 +43,6 @@ #include "p_compiler.h" #include "p_defines.h" #include "p_format.h" -#include "p_screen.h" #ifdef __cplusplus diff --git a/src/gallium/include/state_tracker/graw.h b/src/gallium/include/state_tracker/graw.h index 59b0e337c92..6a99b234aa5 100644 --- a/src/gallium/include/state_tracker/graw.h +++ b/src/gallium/include/state_tracker/graw.h @@ -1,3 +1,30 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + #ifndef GALLIUM_RAW_H #define GALLIUM_RAW_H @@ -14,6 +41,7 @@ * those for parsing text representations of TGSI shaders. 
*/ +#include "pipe/p_compiler.h" #include "pipe/p_format.h" struct pipe_screen; diff --git a/src/gallium/state_trackers/dri/common/dri_context.h b/src/gallium/state_trackers/dri/common/dri_context.h index 692c49d7cd5..35b870a8a32 100644 --- a/src/gallium/state_trackers/dri/common/dri_context.h +++ b/src/gallium/state_trackers/dri/common/dri_context.h @@ -34,7 +34,6 @@ #include "pipe/p_compiler.h" #include "dri_wrapper.h" -#include "main/mtypes.h" struct pipe_context; struct pipe_fence; diff --git a/src/gallium/state_trackers/dri/common/dri_screen.c b/src/gallium/state_trackers/dri/common/dri_screen.c index 6ad2c7da4d6..0ab4dd18931 100644 --- a/src/gallium/state_trackers/dri/common/dri_screen.c +++ b/src/gallium/state_trackers/dri/common/dri_screen.c @@ -383,6 +383,11 @@ dri_init_screen_helper(struct dri_screen *screen, if (!screen->st_api) return NULL; + if(pscreen->get_param(pscreen, PIPE_CAP_NPOT_TEXTURES)) + screen->target = PIPE_TEXTURE_2D; + else + screen->target = PIPE_TEXTURE_RECT; + driParseOptionInfo(&screen->optionCache, __driConfigOptions, __driNConfigOptions); diff --git a/src/gallium/state_trackers/dri/common/dri_screen.h b/src/gallium/state_trackers/dri/common/dri_screen.h index 53ccce145ba..849f399b2f8 100644 --- a/src/gallium/state_trackers/dri/common/dri_screen.h +++ b/src/gallium/state_trackers/dri/common/dri_screen.h @@ -68,6 +68,7 @@ struct dri_screen boolean d_depth_bits_last; boolean sd_depth_bits_last; boolean auto_fake_front; + enum pipe_texture_target target; }; /** cast wrapper */ diff --git a/src/gallium/state_trackers/dri/drm/dri2.c b/src/gallium/state_trackers/dri/drm/dri2.c index 47005c17e2b..93f910a26d6 100644 --- a/src/gallium/state_trackers/dri/drm/dri2.c +++ b/src/gallium/state_trackers/dri/drm/dri2.c @@ -195,7 +195,7 @@ dri2_drawable_process_buffers(struct dri_drawable *drawable, pipe_resource_reference(&drawable->textures[i], NULL); memset(&templ, 0, sizeof(templ)); - templ.target = PIPE_TEXTURE_2D; + templ.target = 
screen->target; templ.last_level = 0; templ.width0 = dri_drawable->w; templ.height0 = dri_drawable->h; @@ -342,7 +342,7 @@ dri2_create_image_from_name(__DRIcontext *context, memset(&templ, 0, sizeof(templ)); templ.bind = tex_usage; templ.format = pf; - templ.target = PIPE_TEXTURE_2D; + templ.target = screen->target; templ.last_level = 0; templ.width0 = width; templ.height0 = height; diff --git a/src/gallium/state_trackers/dri/sw/drisw.c b/src/gallium/state_trackers/dri/sw/drisw.c index 249ccd7fcf6..04bba631aeb 100644 --- a/src/gallium/state_trackers/dri/sw/drisw.c +++ b/src/gallium/state_trackers/dri/sw/drisw.c @@ -216,7 +216,7 @@ drisw_allocate_textures(struct dri_drawable *drawable, } memset(&templ, 0, sizeof(templ)); - templ.target = PIPE_TEXTURE_2D; + templ.target = screen->target; templ.width0 = width; templ.height0 = height; templ.depth0 = 1; diff --git a/src/gallium/state_trackers/egl/Makefile b/src/gallium/state_trackers/egl/Makefile index 9e9e479e7e0..4199d7c6baa 100644 --- a/src/gallium/state_trackers/egl/Makefile +++ b/src/gallium/state_trackers/egl/Makefile @@ -24,7 +24,7 @@ x11_SOURCES = $(wildcard x11/*.c) \ x11_OBJECTS = $(x11_SOURCES:.c=.o) -kms_INCLUDES = $(shell pkg-config --cflags-only-I libdrm) +kms_INCLUDES = -I$(TOP)/src/gallium/winsys $(shell pkg-config --cflags-only-I libdrm) kms_SOURCES = $(wildcard kms/*.c) kms_OBJECTS = $(kms_SOURCES:.c=.o) diff --git a/src/gallium/state_trackers/egl/SConscript b/src/gallium/state_trackers/egl/SConscript index e71aec35b73..efcce25e317 100644 --- a/src/gallium/state_trackers/egl/SConscript +++ b/src/gallium/state_trackers/egl/SConscript @@ -21,6 +21,7 @@ if 'egl' in env['statetrackers']: 'common/egl_g3d_api.c', 'common/egl_g3d_image.c', 'common/egl_g3d_st.c', + 'common/egl_g3d_sync.c', 'common/native_helper.c', ] diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.c b/src/gallium/state_trackers/egl/common/egl_g3d.c index 56d575ffe08..4e653bdf3b2 100644 --- 
a/src/gallium/state_trackers/egl/common/egl_g3d.c +++ b/src/gallium/state_trackers/egl/common/egl_g3d.c @@ -530,6 +530,18 @@ egl_g3d_initialize(_EGLDriver *drv, _EGLDisplay *dpy, if (gdpy->native->get_param(gdpy->native, NATIVE_PARAM_USE_NATIVE_BUFFER)) dpy->Extensions.KHR_image_pixmap = EGL_TRUE; + dpy->Extensions.KHR_reusable_sync = EGL_TRUE; + dpy->Extensions.KHR_fence_sync = EGL_TRUE; + + dpy->Extensions.KHR_surfaceless_gles1 = EGL_TRUE; + dpy->Extensions.KHR_surfaceless_gles2 = EGL_TRUE; + dpy->Extensions.KHR_surfaceless_opengl = EGL_TRUE; + + if (dpy->Platform == _EGL_PLATFORM_DRM) { + dpy->Extensions.MESA_drm_display = EGL_TRUE; + dpy->Extensions.MESA_drm_image = EGL_TRUE; + } + if (egl_g3d_add_configs(drv, dpy, 1) == 1) { _eglError(EGL_NOT_INITIALIZED, "eglInitialize(unable to add configs)"); goto fail; diff --git a/src/gallium/state_trackers/egl/common/egl_g3d.h b/src/gallium/state_trackers/egl/common/egl_g3d.h index f33dc91cf90..be450bbede3 100644 --- a/src/gallium/state_trackers/egl/common/egl_g3d.h +++ b/src/gallium/state_trackers/egl/common/egl_g3d.h @@ -30,12 +30,14 @@ #include "pipe/p_screen.h" #include "pipe/p_context.h" #include "pipe/p_format.h" +#include "os/os_thread.h" #include "egldriver.h" #include "egldisplay.h" #include "eglcontext.h" #include "eglsurface.h" #include "eglconfig.h" #include "eglimage.h" +#include "eglsync.h" #include "eglscreen.h" #include "eglmode.h" @@ -99,6 +101,24 @@ struct egl_g3d_image { _EGL_DRIVER_STANDARD_TYPECASTS(egl_g3d) _EGL_DRIVER_TYPECAST(egl_g3d_image, _EGLImage, obj) +#ifdef EGL_KHR_reusable_sync + +struct egl_g3d_sync { + _EGLSync base; + + int refs; + + /* the mutex protects only the condvar, not the struct */ + pipe_mutex mutex; + pipe_condvar condvar; + + /* for fence sync */ + struct pipe_fence_handle *fence; +}; +_EGL_DRIVER_TYPECAST(egl_g3d_sync, _EGLSync, obj) + +#endif /* EGL_KHR_reusable_sync */ + #ifdef EGL_MESA_screen_surface struct egl_g3d_screen { diff --git 
a/src/gallium/state_trackers/egl/common/egl_g3d_api.c b/src/gallium/state_trackers/egl/common/egl_g3d_api.c index edac72a8223..3ec53653f44 100644 --- a/src/gallium/state_trackers/egl/common/egl_g3d_api.c +++ b/src/gallium/state_trackers/egl/common/egl_g3d_api.c @@ -34,6 +34,7 @@ #include "egl_g3d.h" #include "egl_g3d_api.h" #include "egl_g3d_image.h" +#include "egl_g3d_sync.h" #include "egl_g3d_st.h" #include "egl_g3d_loader.h" #include "native.h" @@ -103,7 +104,7 @@ egl_g3d_create_context(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *conf, } gctx->stctxi = gctx->stapi->create_context(gctx->stapi, gdpy->smapi, - &gconf->stvis, (gshare) ? gshare->stctxi : NULL); + (gconf) ? &gconf->stvis : NULL, (gshare) ? gshare->stctxi : NULL); if (!gctx->stctxi) { FREE(gctx); return NULL; @@ -437,16 +438,19 @@ egl_g3d_make_current(_EGLDriver *drv, _EGLDisplay *dpy, ok = gctx->stapi->make_current(gctx->stapi, gctx->stctxi, (gdraw) ? gdraw->stfbi : NULL, (gread) ? gread->stfbi : NULL); if (ok) { - gctx->stctxi->notify_invalid_framebuffer(gctx->stctxi, gdraw->stfbi); - if (gread != gdraw) { + if (gdraw) { gctx->stctxi->notify_invalid_framebuffer(gctx->stctxi, - gread->stfbi); - } + gdraw->stfbi); - if (gdraw->base.Type == EGL_WINDOW_BIT) { - gctx->base.WindowRenderBuffer = - (gdraw->stvis.render_buffer == ST_ATTACHMENT_FRONT_LEFT) ? - EGL_SINGLE_BUFFER : EGL_BACK_BUFFER; + if (gdraw->base.Type == EGL_WINDOW_BIT) { + gctx->base.WindowRenderBuffer = + (gdraw->stvis.render_buffer == ST_ATTACHMENT_FRONT_LEFT) ? 
+ EGL_SINGLE_BUFFER : EGL_BACK_BUFFER; + } + } + if (gread && gread != gdraw) { + gctx->stctxi->notify_invalid_framebuffer(gctx->stctxi, + gread->stfbi); } } } @@ -805,6 +809,17 @@ egl_g3d_init_driver_api(_EGLDriver *drv) drv->API.CreateImageKHR = egl_g3d_create_image; drv->API.DestroyImageKHR = egl_g3d_destroy_image; +#ifdef EGL_MESA_drm_image + drv->API.CreateDRMImageMESA = egl_g3d_create_drm_image; + drv->API.ExportDRMImageMESA = egl_g3d_export_drm_image; +#endif + +#ifdef EGL_KHR_reusable_sync + drv->API.CreateSyncKHR = egl_g3d_create_sync; + drv->API.DestroySyncKHR = egl_g3d_destroy_sync; + drv->API.ClientWaitSyncKHR = egl_g3d_client_wait_sync; + drv->API.SignalSyncKHR = egl_g3d_signal_sync; +#endif #ifdef EGL_MESA_screen_surface drv->API.CreateScreenSurfaceMESA = egl_g3d_create_screen_surface; diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_image.c b/src/gallium/state_trackers/egl/common/egl_g3d_image.c index 1e13cfcf7e9..558638e72f0 100644 --- a/src/gallium/state_trackers/egl/common/egl_g3d_image.c +++ b/src/gallium/state_trackers/egl/common/egl_g3d_image.c @@ -31,12 +31,16 @@ #include "util/u_rect.h" #include "util/u_inlines.h" #include "eglcurrent.h" +#include "egllog.h" #include "native.h" #include "egl_g3d.h" #include "egl_g3d_api.h" #include "egl_g3d_image.h" +/* move this to native display? */ +#include "state_tracker/drm_driver.h" + /** * Reference and return the front left buffer of the native pixmap. 
*/ @@ -67,6 +71,165 @@ egl_g3d_reference_native_pixmap(_EGLDisplay *dpy, EGLNativePixmapType pix) return textures[natt]; } +#ifdef EGL_MESA_drm_image + +static struct pipe_resource * +egl_g3d_create_drm_buffer(_EGLDisplay *dpy, const EGLint *attribs) +{ + struct egl_g3d_display *gdpy = egl_g3d_display(dpy); + struct pipe_screen *screen = gdpy->native->screen; + struct pipe_resource templ; + EGLint width = 0, height = 0, format = 0, use = 0; + EGLint valid_use; + EGLint i, err = EGL_SUCCESS; + + for (i = 0; attribs[i] != EGL_NONE; i++) { + EGLint attr = attribs[i++]; + EGLint val = attribs[i]; + + switch (attr) { + case EGL_WIDTH: + width = val; + break; + case EGL_HEIGHT: + height = val; + break; + case EGL_DRM_BUFFER_FORMAT_MESA: + format = val; + break; + case EGL_DRM_BUFFER_USE_MESA: + use = val; + break; + default: + err = EGL_BAD_ATTRIBUTE; + break; + } + + if (err != EGL_SUCCESS) { + _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr); + return NULL; + } + } + + if (width <= 0 || height <= 0) { + _eglLog(_EGL_DEBUG, "bad width or height (%dx%d)", width, height); + return NULL; + } + + switch (format) { + case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA: + format = PIPE_FORMAT_B8G8R8A8_UNORM; + break; + default: + _eglLog(_EGL_DEBUG, "bad image format value 0x%04x", format); + return NULL; + break; + } + + valid_use = EGL_DRM_BUFFER_USE_SCANOUT_MESA | + EGL_DRM_BUFFER_USE_SHARE_MESA; + if (use & ~valid_use) { + _eglLog(_EGL_DEBUG, "bad image use bit 0x%04x", use); + return NULL; + } + + memset(&templ, 0, sizeof(templ)); + templ.target = PIPE_TEXTURE_2D; + templ.format = format; + templ.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + templ.width0 = width; + templ.height0 = height; + templ.depth0 = 1; + + /* + * XXX fix apps (e.g. wayland) and pipe drivers (e.g. 
i915) and remove the + * size check + */ + if ((use & EGL_DRM_BUFFER_USE_SCANOUT_MESA) && + width >= 640 && height >= 480) + templ.bind |= PIPE_BIND_SCANOUT; + if (use & EGL_DRM_BUFFER_USE_SHARE_MESA) + templ.bind |= PIPE_BIND_SHARED; + + return screen->resource_create(screen, &templ); +} + +static struct pipe_resource * +egl_g3d_reference_drm_buffer(_EGLDisplay *dpy, EGLint name, + const EGLint *attribs) +{ + struct egl_g3d_display *gdpy = egl_g3d_display(dpy); + struct pipe_screen *screen = gdpy->native->screen; + struct pipe_resource templ; + struct winsys_handle wsh; + EGLint width = 0, height = 0, format = 0, stride = 0; + EGLint i, err = EGL_SUCCESS; + + /* winsys_handle is in theory platform-specific */ + if (dpy->Platform != _EGL_PLATFORM_DRM) + return NULL; + + for (i = 0; attribs[i] != EGL_NONE; i++) { + EGLint attr = attribs[i++]; + EGLint val = attribs[i]; + + switch (attr) { + case EGL_WIDTH: + width = val; + break; + case EGL_HEIGHT: + height = val; + break; + case EGL_DRM_BUFFER_FORMAT_MESA: + format = val; + break; + case EGL_DRM_BUFFER_STRIDE_MESA: + stride = val; + break; + default: + err = EGL_BAD_ATTRIBUTE; + break; + } + + if (err != EGL_SUCCESS) { + _eglLog(_EGL_DEBUG, "bad image attribute 0x%04x", attr); + return NULL; + } + } + + if (width <= 0 || height <= 0 || stride <= 0) { + _eglLog(_EGL_DEBUG, "bad width, height, or stride (%dx%dx%d)", + width, height, stride); + return NULL; + } + + switch (format) { + case EGL_DRM_BUFFER_FORMAT_ARGB32_MESA: + format = PIPE_FORMAT_B8G8R8A8_UNORM; + break; + default: + _eglLog(_EGL_DEBUG, "bad image format value 0x%04x", format); + return NULL; + break; + } + + memset(&templ, 0, sizeof(templ)); + templ.target = PIPE_TEXTURE_2D; + templ.format = format; + templ.bind = PIPE_BIND_RENDER_TARGET | PIPE_BIND_SAMPLER_VIEW; + templ.width0 = width; + templ.height0 = height; + templ.depth0 = 1; + + memset(&wsh, 0, sizeof(wsh)); + wsh.handle = (unsigned) name; + wsh.stride = stride; + + return 
screen->resource_from_handle(screen, &templ, &wsh); +} + +#endif /* EGL_MESA_drm_image */ + _EGLImage * egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, EGLenum target, EGLClientBuffer buffer, @@ -92,6 +255,11 @@ egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, ptex = egl_g3d_reference_native_pixmap(dpy, (EGLNativePixmapType) buffer); break; +#ifdef EGL_MESA_drm_image + case EGL_DRM_BUFFER_MESA: + ptex = egl_g3d_reference_drm_buffer(dpy, (EGLint) buffer, attribs); + break; +#endif default: ptex = NULL; break; @@ -134,3 +302,80 @@ egl_g3d_destroy_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img) return EGL_TRUE; } + +_EGLImage * +egl_g3d_create_drm_image(_EGLDriver *drv, _EGLDisplay *dpy, + const EGLint *attribs) +{ + struct egl_g3d_image *gimg; + struct pipe_resource *ptex; + + gimg = CALLOC_STRUCT(egl_g3d_image); + if (!gimg) { + _eglError(EGL_BAD_ALLOC, "eglCreateDRMImageKHR"); + return NULL; + } + + if (!_eglInitImage(&gimg->base, dpy, attribs)) { + FREE(gimg); + return NULL; + } + +#ifdef EGL_MESA_drm_image + ptex = egl_g3d_create_drm_buffer(dpy, attribs); +#else + ptex = NULL; +#endif + if (!ptex) { + FREE(gimg); + return NULL; + } + + /* transfer the ownership to the image */ + gimg->texture = ptex; + gimg->face = 0; + gimg->level = 0; + gimg->zslice = 0; + + return &gimg->base; +} + +EGLBoolean +egl_g3d_export_drm_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img, + EGLint *name, EGLint *handle, EGLint *stride) +{ + struct egl_g3d_display *gdpy = egl_g3d_display(dpy); + struct egl_g3d_image *gimg = egl_g3d_image(img); + struct pipe_screen *screen = gdpy->native->screen; + struct winsys_handle wsh; + + /* winsys_handle is in theory platform-specific */ + if (dpy->Platform != _EGL_PLATFORM_DRM) + return EGL_FALSE; + + /* get shared handle */ + if (name) { + memset(&wsh, 0, sizeof(wsh)); /* clear the local winsys handle, not the caller's 'handle' out-pointer */ + wsh.type = DRM_API_HANDLE_TYPE_SHARED; + if (!screen->resource_get_handle(screen, gimg->texture, &wsh)) { 
+ return EGL_FALSE; + } + + *name = wsh.handle; + } + + /* get KMS handle */ + if (handle || stride) { + memset(&wsh, 0, sizeof(wsh)); + wsh.type = DRM_API_HANDLE_TYPE_KMS; + if (!screen->resource_get_handle(screen, gimg->texture, &wsh)) + return EGL_FALSE; + + if (handle) + *handle = wsh.handle; + if (stride) + *stride = wsh.stride; + } + + return EGL_TRUE; +} diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_image.h b/src/gallium/state_trackers/egl/common/egl_g3d_image.h index adda9333715..f051da82837 100644 --- a/src/gallium/state_trackers/egl/common/egl_g3d_image.h +++ b/src/gallium/state_trackers/egl/common/egl_g3d_image.h @@ -39,4 +39,12 @@ egl_g3d_create_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, EGLBoolean egl_g3d_destroy_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *image); +_EGLImage * +egl_g3d_create_drm_image(_EGLDriver *drv, _EGLDisplay *dpy, + const EGLint *attribs); + +EGLBoolean +egl_g3d_export_drm_image(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *img, + EGLint *name, EGLint *handle, EGLint *stride); + #endif /* _EGL_G3D_IMAGE_H_ */ diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.c b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c new file mode 100644 index 00000000000..ec74e9eb94c --- /dev/null +++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.c @@ -0,0 +1,284 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright (C) 2010 LunarG Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#include "util/u_memory.h" +#include "util/u_atomic.h" +#include "os/os_thread.h" +#include "eglsync.h" +#include "eglcurrent.h" + +#include "egl_g3d.h" +#include "egl_g3d_sync.h" + +#ifdef EGL_KHR_reusable_sync + +/** + * Wait for the conditional variable. + */ +static EGLint +egl_g3d_wait_sync_condvar(struct egl_g3d_sync *gsync, EGLTimeKHR timeout) +{ + _EGLDisplay *dpy = gsync->base.Resource.Display; + + pipe_mutex_lock(gsync->mutex); + + /* unlock display lock just before waiting */ + _eglUnlockMutex(&dpy->Mutex); + + /* No timed wait. Always treat timeout as EGL_FOREVER_KHR */ + pipe_condvar_wait(gsync->condvar, gsync->mutex); + + _eglLockMutex(&dpy->Mutex); + + pipe_mutex_unlock(gsync->mutex); + + return EGL_CONDITION_SATISFIED_KHR; +} + +/** + * Signal the conditional variable. 
+ */ +static void +egl_g3d_signal_sync_condvar(struct egl_g3d_sync *gsync) +{ + pipe_mutex_lock(gsync->mutex); + pipe_condvar_broadcast(gsync->condvar); + pipe_mutex_unlock(gsync->mutex); +} + +/** + * Insert a fence command to the command stream of the current context. + */ +static EGLint +egl_g3d_insert_fence_sync(struct egl_g3d_sync *gsync) +{ + _EGLContext *ctx = _eglGetCurrentContext(); + struct egl_g3d_context *gctx = egl_g3d_context(ctx); + + /* already checked in egl_g3d_create_sync */ + assert(gctx); + + /* insert the fence command */ + gctx->stctxi->flush(gctx->stctxi, 0x0, &gsync->fence); + if (!gsync->fence) + gsync->base.SyncStatus = EGL_SIGNALED_KHR; + + return EGL_SUCCESS; +} + +/** + * Wait for the fence sync to be signaled. + */ +static EGLint +egl_g3d_wait_fence_sync(struct egl_g3d_sync *gsync, EGLTimeKHR timeout) +{ + EGLint ret; + + if (gsync->fence) { + _EGLDisplay *dpy = gsync->base.Resource.Display; + struct egl_g3d_display *gdpy = egl_g3d_display(dpy); + struct pipe_screen *screen = gdpy->native->screen; + struct pipe_fence_handle *fence = gsync->fence; + + gsync->fence = NULL; + + _eglUnlockMutex(&dpy->Mutex); + /* no timed finish? 
*/ + screen->fence_finish(screen, fence, 0x0); + ret = EGL_CONDITION_SATISFIED_KHR; + _eglLockMutex(&dpy->Mutex); + + gsync->base.SyncStatus = EGL_SIGNALED_KHR; + + screen->fence_reference(screen, &fence, NULL); + egl_g3d_signal_sync_condvar(gsync); + } + else { + ret = egl_g3d_wait_sync_condvar(gsync, timeout); + } + + return ret; +} + +static INLINE void +egl_g3d_ref_sync(struct egl_g3d_sync *gsync) +{ + p_atomic_inc(&gsync->refs); +} + +static INLINE void +egl_g3d_unref_sync(struct egl_g3d_sync *gsync) +{ + if (p_atomic_dec_zero(&gsync->refs)) { + pipe_condvar_destroy(gsync->condvar); + pipe_mutex_destroy(gsync->mutex); + + if (gsync->fence) { + struct egl_g3d_display *gdpy = + egl_g3d_display(gsync->base.Resource.Display); + struct pipe_screen *screen = gdpy->native->screen; + + screen->fence_reference(screen, &gsync->fence, NULL); + } + + FREE(gsync); + } +} + +_EGLSync * +egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy, + EGLenum type, const EGLint *attrib_list) +{ + _EGLContext *ctx = _eglGetCurrentContext(); + struct egl_g3d_sync *gsync; + EGLint err; + + if (!ctx || ctx->Resource.Display != dpy) { + _eglError(EGL_BAD_MATCH, "eglCreateSyncKHR"); + return NULL; + } + + gsync = CALLOC_STRUCT(egl_g3d_sync); + if (!gsync) { + _eglError(EGL_BAD_ALLOC, "eglCreateSyncKHR"); + return NULL; + } + + if (!_eglInitSync(&gsync->base, dpy, type, attrib_list)) { + FREE(gsync); + return NULL; + } + + switch (type) { + case EGL_SYNC_REUSABLE_KHR: + err = EGL_SUCCESS; + break; + case EGL_SYNC_FENCE_KHR: + err = egl_g3d_insert_fence_sync(gsync); + break; + default: + err = EGL_BAD_ATTRIBUTE; + break; + } + + if (err != EGL_SUCCESS) { + _eglError(err, "eglCreateSyncKHR"); + FREE(gsync); + return NULL; + } + + pipe_mutex_init(gsync->mutex); + pipe_condvar_init(gsync->condvar); + p_atomic_set(&gsync->refs, 1); + + return &gsync->base; +} + +EGLBoolean +egl_g3d_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync) +{ + struct egl_g3d_sync *gsync = 
egl_g3d_sync(sync); + + switch (gsync->base.Type) { + case EGL_SYNC_REUSABLE_KHR: + /* signal the waiters */ + if (gsync->base.SyncStatus != EGL_SIGNALED_KHR) { + gsync->base.SyncStatus = EGL_SIGNALED_KHR; + egl_g3d_signal_sync_condvar(gsync); + } + break; + default: + break; + } + + egl_g3d_unref_sync(gsync); + + return EGL_TRUE; +} + +EGLint +egl_g3d_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, + EGLint flags, EGLTimeKHR timeout) +{ + struct egl_g3d_sync *gsync = egl_g3d_sync(sync); + EGLint ret = EGL_CONDITION_SATISFIED_KHR; + + if (gsync->base.SyncStatus != EGL_SIGNALED_KHR) { + /* flush if there is a current context */ + if (flags & EGL_SYNC_FLUSH_COMMANDS_BIT_KHR) { + _EGLContext *ctx = _eglGetCurrentContext(); + struct egl_g3d_context *gctx = egl_g3d_context(ctx); + + if (gctx) + gctx->stctxi->flush(gctx->stctxi, PIPE_FLUSH_RENDER_CACHE , NULL); + } + + if (timeout) { + /* reference the sync object in case it is destroyed while waiting */ + egl_g3d_ref_sync(gsync); + + switch (gsync->base.Type) { + case EGL_SYNC_REUSABLE_KHR: + ret = egl_g3d_wait_sync_condvar(gsync, timeout); + break; + case EGL_SYNC_FENCE_KHR: + ret = egl_g3d_wait_fence_sync(gsync, timeout); + default: + break; + } + + egl_g3d_unref_sync(gsync); + } + else { + ret = EGL_TIMEOUT_EXPIRED_KHR; + } + } + + return ret; +} + +EGLBoolean +egl_g3d_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, + EGLenum mode) +{ + struct egl_g3d_sync *gsync = egl_g3d_sync(sync); + + /* only for reusable sync */ + if (sync->Type != EGL_SYNC_REUSABLE_KHR) + return _eglError(EGL_BAD_MATCH, "eglSignalSyncKHR"); + + if (gsync->base.SyncStatus != mode) { + gsync->base.SyncStatus = mode; + if (mode == EGL_SIGNALED_KHR) + egl_g3d_signal_sync_condvar(gsync); + } + + return EGL_TRUE; +} + +#endif /* EGL_KHR_reusable_sync */ diff --git a/src/gallium/state_trackers/egl/common/egl_g3d_sync.h b/src/gallium/state_trackers/egl/common/egl_g3d_sync.h new file mode 100644 index 
00000000000..3179ca04e1a --- /dev/null +++ b/src/gallium/state_trackers/egl/common/egl_g3d_sync.h @@ -0,0 +1,53 @@ +/* + * Mesa 3-D graphics library + * Version: 7.9 + * + * Copyright (C) 2010 LunarG Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Chia-I Wu <[email protected]> + */ + +#ifndef _EGL_G3D_SYNC_H_ +#define _EGL_G3D_SYNC_H_ + +#include "egl_g3d.h" + +#ifdef EGL_KHR_reusable_sync + +_EGLSync * +egl_g3d_create_sync(_EGLDriver *drv, _EGLDisplay *dpy, + EGLenum type, const EGLint *attrib_list); + +EGLBoolean +egl_g3d_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); + +EGLint +egl_g3d_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, + EGLint flags, EGLTimeKHR timeout); + +EGLBoolean +egl_g3d_signal_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, + EGLenum mode); + +#endif /* EGL_KHR_reusable_sync */ + +#endif /* _EGL_G3D_SYNC_H_ */ diff --git a/src/gallium/state_trackers/egl/kms/native_kms.c b/src/gallium/state_trackers/egl/kms/native_kms.c index d4e8fbc9131..208f73306cb 100644 --- a/src/gallium/state_trackers/egl/kms/native_kms.c +++ b/src/gallium/state_trackers/egl/kms/native_kms.c @@ -38,6 +38,10 @@ #include "native_kms.h" +/* see get_drm_screen_name */ +#include <radeon_drm.h> +#include "radeon/drm/radeon_drm.h" + static boolean kms_surface_validate(struct native_surface *nsurf, uint attachment_mask, unsigned int *seq_num, struct pipe_resource **textures, @@ -584,7 +588,9 @@ kms_display_get_configs(struct native_display *ndpy, int *num_configs) nconf->color_format = format; - nconf->scanout_bit = TRUE; + /* support KMS */ + if (kdpy->resources) + nconf->scanout_bit = TRUE; } configs = MALLOC(sizeof(*configs)); @@ -664,6 +670,27 @@ kms_display_destroy(struct native_display *ndpy) FREE(kdpy); } +static const char * +get_drm_screen_name(int fd, drmVersionPtr version) +{ + const char *name = version->name; + + if (name && !strcmp(name, "radeon")) { + int chip_id; + struct drm_radeon_info info; + + memset(&info, 0, sizeof(info)); + info.request = RADEON_INFO_DEVICE_ID; + info.value = pointer_to_intptr(&chip_id); + if (drmCommandWriteRead(fd, DRM_RADEON_INFO, &info, sizeof(info)) != 0) + return NULL; + + name = is_r3xx(chip_id) ? 
"r300" : "r600"; + } + + return name; +} + /** * Initialize KMS and pipe screen. */ @@ -672,6 +699,7 @@ kms_display_init_screen(struct native_display *ndpy) { struct kms_display *kdpy = kms_display(ndpy); drmVersionPtr version; + const char *name; version = drmGetVersion(kdpy->fd); if (!version) { @@ -679,8 +707,11 @@ kms_display_init_screen(struct native_display *ndpy) return FALSE; } - kdpy->base.screen = kdpy->event_handler->new_drm_screen(&kdpy->base, - version->name, kdpy->fd);; + name = get_drm_screen_name(kdpy->fd, version); + if (name) { + kdpy->base.screen = + kdpy->event_handler->new_drm_screen(&kdpy->base, name, kdpy->fd); + } drmFreeVersion(version); if (!kdpy->base.screen) { @@ -717,32 +748,32 @@ kms_create_display(int fd, struct native_event_handler *event_handler, return NULL; } + kdpy->base.destroy = kms_display_destroy; + kdpy->base.get_param = kms_display_get_param; + kdpy->base.get_configs = kms_display_get_configs; + /* resources are fixed, unlike crtc, connector, or encoder */ kdpy->resources = drmModeGetResources(kdpy->fd); - if (!kdpy->resources) { - kms_display_destroy(&kdpy->base); - return NULL; - } + if (kdpy->resources) { + kdpy->saved_crtcs = + CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->saved_crtcs)); + if (!kdpy->saved_crtcs) { + kms_display_destroy(&kdpy->base); + return NULL; + } - kdpy->saved_crtcs = - CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->saved_crtcs)); - if (!kdpy->saved_crtcs) { - kms_display_destroy(&kdpy->base); - return NULL; - } + kdpy->shown_surfaces = + CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->shown_surfaces)); + if (!kdpy->shown_surfaces) { + kms_display_destroy(&kdpy->base); + return NULL; + } - kdpy->shown_surfaces = - CALLOC(kdpy->resources->count_crtcs, sizeof(*kdpy->shown_surfaces)); - if (!kdpy->shown_surfaces) { - kms_display_destroy(&kdpy->base); - return NULL; + kdpy->base.modeset = &kms_display_modeset; + } + else { + _eglLog(_EGL_DEBUG, "Failed to get KMS resources. 
Disable modeset."); } - - kdpy->base.destroy = kms_display_destroy; - kdpy->base.get_param = kms_display_get_param; - kdpy->base.get_configs = kms_display_get_configs; - - kdpy->base.modeset = &kms_display_modeset; return &kdpy->base; } diff --git a/src/gallium/state_trackers/glx/xlib/glx_api.c b/src/gallium/state_trackers/glx/xlib/glx_api.c index eb8d6a19333..dcd50e19d73 100644 --- a/src/gallium/state_trackers/glx/xlib/glx_api.c +++ b/src/gallium/state_trackers/glx/xlib/glx_api.c @@ -34,10 +34,6 @@ #include "GL/glx.h" #include "xm_api.h" -#include "main/context.h" -#include "main/macros.h" -#include "main/imports.h" -#include "main/version.h" /* This indicates the client-side GLX API and GLX encoder version. */ @@ -603,8 +599,8 @@ destroy_visuals_on_display(Display *dpy) static int close_display_callback(Display *dpy, XExtCodes *codes) { - destroy_visuals_on_display(dpy); xmesa_destroy_buffers_on_display(dpy); + destroy_visuals_on_display(dpy); return 0; } @@ -1299,7 +1295,7 @@ glXCopyContext( Display *dpy, GLXContext src, GLXContext dst, XMesaContext xm_dst = dst->xmesaContext; (void) dpy; if (MakeCurrent_PrevContext == src) { - _mesa_Flush(); + glFlush(); } XMesaCopyContext(xm_src, xm_dst, mask); } diff --git a/src/gallium/state_trackers/glx/xlib/glx_usefont.c b/src/gallium/state_trackers/glx/xlib/glx_usefont.c index 8903b0e6cbd..0aa37e150b8 100644 --- a/src/gallium/state_trackers/glx/xlib/glx_usefont.c +++ b/src/gallium/state_trackers/glx/xlib/glx_usefont.c @@ -30,8 +30,7 @@ */ -#include "main/context.h" -#include "main/imports.h" +#include "main/core.h" #include <GL/glx.h> diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c index c0c418306fb..eb4ce742669 100644 --- a/src/gallium/state_trackers/glx/xlib/xm_api.c +++ b/src/gallium/state_trackers/glx/xlib/xm_api.c @@ -56,7 +56,6 @@ #include "xm_api.h" #include "xm_st.h" -#include "main/context.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include 
"pipe/p_context.h" @@ -72,10 +71,35 @@ static struct xm_driver driver; static struct st_api *stapi; +/* Default strict invalidate to false. This means we will not call + * XGetGeometry after every swapbuffers, which allows swapbuffers to + * remain asynchronous. For apps running at 100fps with synchronous + * swapping, a 10% boost is typical. For gears, I see closer to 20% + * speedup. + * + * Note that the work of copying data on swapbuffers doesn't disappear + * - this change just allows the X server to execute the PutImage + * asynchronously without us effectively blocked until its completion. + * + * This speeds up even llvmpipe's threaded rasterization as the + * swapbuffers operation was a large part of the serial component of + * an llvmpipe frame. + * + * The downside of this is correctness - applications which don't call + * glViewport on window resizes will get incorrect rendering. A + * better solution would be to have per-frame but asynchronous + * invalidation. Xcb almost looks as if it could provide this, but + * the API doesn't seem to quite be there. 
+ */ +boolean xmesa_strict_invalidate = FALSE; + void xmesa_set_driver( const struct xm_driver *templ ) { driver = *templ; stapi = driver.create_st_api(); + + xmesa_strict_invalidate = + debug_get_bool_option("XMESA_STRICT_INVALIDATE", FALSE); } @@ -91,7 +115,12 @@ static int xmesa_get_param(struct st_manager *smapi, enum st_manager_param param) { - return 0; + switch(param) { + case ST_MANAGER_BROKEN_INVALIDATE: + return !xmesa_strict_invalidate; + default: + return 0; + } } static XMesaDisplay @@ -263,7 +292,6 @@ xmesa_get_window_size(Display *dpy, XMesaBuffer b, Status stat; pipe_mutex_lock(xmdpy->mutex); - XSync(b->xm_visual->display, 0); /* added for Chromium */ stat = get_drawable_size(dpy, b->ws.drawable, width, height); pipe_mutex_unlock(xmdpy->mutex); @@ -726,15 +754,39 @@ XMesaVisual XMesaCreateVisual( Display *display, alpha_bits = v->mesa_visual.alphaBits; } - _mesa_initialize_visual( &v->mesa_visual, - db_flag, stereo_flag, - red_bits, green_bits, - blue_bits, alpha_bits, - depth_size, - stencil_size, - accum_red_size, accum_green_size, - accum_blue_size, accum_alpha_size, - 0 ); + /* initialize visual */ + { + __GLcontextModes *vis = &v->mesa_visual; + + vis->rgbMode = GL_TRUE; + vis->doubleBufferMode = db_flag; + vis->stereoMode = stereo_flag; + + vis->redBits = red_bits; + vis->greenBits = green_bits; + vis->blueBits = blue_bits; + vis->alphaBits = alpha_bits; + vis->rgbBits = red_bits + green_bits + blue_bits; + + vis->indexBits = 0; + vis->depthBits = depth_size; + vis->stencilBits = stencil_size; + + vis->accumRedBits = accum_red_size; + vis->accumGreenBits = accum_green_size; + vis->accumBlueBits = accum_blue_size; + vis->accumAlphaBits = accum_alpha_size; + + vis->haveAccumBuffer = accum_red_size > 0; + vis->haveDepthBuffer = depth_size > 0; + vis->haveStencilBuffer = stencil_size > 0; + + vis->numAuxBuffers = 0; + vis->level = 0; + vis->pixmapMode = 0; + vis->sampleBuffers = 0; + vis->samples = 0; + } v->stvis.buffer_mask = 
ST_ATTACHMENT_FRONT_LEFT_MASK; if (db_flag) @@ -1154,7 +1206,7 @@ void XMesaFlush( XMesaContext c ) xmdpy->screen->fence_finish(xmdpy->screen, fence, 0); xmdpy->screen->fence_reference(xmdpy->screen, &fence, NULL); } - XSync( c->xm_visual->display, False ); + XFlush( c->xm_visual->display ); } } diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h index 4f2c8a6e6a9..f209b14ea13 100644 --- a/src/gallium/state_trackers/glx/xlib/xm_api.h +++ b/src/gallium/state_trackers/glx/xlib/xm_api.h @@ -57,7 +57,7 @@ and create a window, you must do the following to use the X/Mesa interface: #define XMESA_H -#include "main/mtypes.h" +#include "main/core.h" /* for GLvisual and MESA_VERSION_STRING */ #include "state_tracker/st_api.h" #include "os/os_thread.h" @@ -378,6 +378,6 @@ xmesa_buffer_height(XMesaBuffer b) return b->height; } - +extern boolean xmesa_strict_invalidate; #endif diff --git a/src/gallium/state_trackers/glx/xlib/xm_st.c b/src/gallium/state_trackers/glx/xlib/xm_st.c index c62eb8bfbd1..0f74b3f7aa3 100644 --- a/src/gallium/state_trackers/glx/xlib/xm_st.c +++ b/src/gallium/state_trackers/glx/xlib/xm_st.c @@ -26,18 +26,18 @@ * Chia-I Wu <[email protected]> */ -#include "util/u_memory.h" -#include "util/u_inlines.h" - #include "xm_api.h" #include "xm_st.h" +#include "util/u_inlines.h" + struct xmesa_st_framebuffer { XMesaDisplay display; XMesaBuffer buffer; struct pipe_screen *screen; struct st_visual stvis; + enum pipe_texture_target target; unsigned texture_width, texture_height, texture_mask; struct pipe_resource *textures[ST_ATTACHMENT_COUNT]; @@ -139,7 +139,7 @@ xmesa_st_framebuffer_validate_textures(struct st_framebuffer_iface *stfbi, } memset(&templ, 0, sizeof(templ)); - templ.target = PIPE_TEXTURE_2D; + templ.target = xstfb->target; templ.width0 = width; templ.height0 = height; templ.depth0 = 1; @@ -210,6 +210,12 @@ xmesa_st_framebuffer_validate(struct st_framebuffer_iface *stfbi, /* record newly allocated 
textures */ new_mask = statt_mask & ~xstfb->texture_mask; + /* If xmesa_strict_invalidate is not set, we will not yet have + * called XGetGeometry(). Do so here: + */ + if (!xmesa_strict_invalidate) + xmesa_check_buffer_size(xstfb->buffer); + resized = (xstfb->buffer->width != xstfb->texture_width || xstfb->buffer->height != xstfb->texture_height); @@ -251,7 +257,8 @@ xmesa_st_framebuffer_flush_front(struct st_framebuffer_iface *stfbi, boolean ret; ret = xmesa_st_framebuffer_display(stfbi, statt); - if (ret) + + if (ret && xmesa_strict_invalidate) xmesa_check_buffer_size(xstfb->buffer); return ret; @@ -279,6 +286,10 @@ xmesa_create_st_framebuffer(XMesaDisplay xmdpy, XMesaBuffer b) xstfb->buffer = b; xstfb->screen = xmdpy->screen; xstfb->stvis = b->xm_visual->stvis; + if(xstfb->screen->get_param(xstfb->screen, PIPE_CAP_NPOT_TEXTURES)) + xstfb->target = PIPE_TEXTURE_2D; + else + xstfb->target = PIPE_TEXTURE_RECT; stfbi->visual = &xstfb->stvis; stfbi->flush_front = xmesa_st_framebuffer_flush_front; @@ -322,7 +333,8 @@ xmesa_swap_st_framebuffer(struct st_framebuffer_iface *stfbi) *back = tmp; } - xmesa_check_buffer_size(xstfb->buffer); + if (xmesa_strict_invalidate) + xmesa_check_buffer_size(xstfb->buffer); } } diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c index 0fb7cd83069..a0e14b96016 100644 --- a/src/gallium/state_trackers/wgl/stw_context.c +++ b/src/gallium/state_trackers/wgl/stw_context.c @@ -33,7 +33,7 @@ /* for _mesa_share_state */ #include "state_tracker/st_context.h" -#include "main/context.h" +#include "main/core.h" #include "stw_icd.h" #include "stw_device.h" diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c index a107c71bdab..37809d084ce 100644 --- a/src/gallium/state_trackers/wgl/stw_device.c +++ b/src/gallium/state_trackers/wgl/stw_device.c @@ -27,7 +27,7 @@ #include <windows.h> -#include "glapi/glthread.h" +#include "glapi/glapi.h" #include 
"util/u_debug.h" #include "util/u_math.h" #include "util/u_memory.h" diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c index e606477e975..18ac4823696 100644 --- a/src/gallium/state_trackers/wgl/stw_pixelformat.c +++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c @@ -25,15 +25,13 @@ * **************************************************************************/ -#include "main/mtypes.h" -#include "main/context.h" - #include "pipe/p_format.h" #include "pipe/p_defines.h" #include "pipe/p_screen.h" #include "util/u_format.h" #include "util/u_debug.h" +#include "util/u_memory.h" #include "stw_icd.h" #include "stw_device.h" diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.h b/src/gallium/state_trackers/wgl/stw_pixelformat.h index d405172773c..282c9f643c2 100644 --- a/src/gallium/state_trackers/wgl/stw_pixelformat.h +++ b/src/gallium/state_trackers/wgl/stw_pixelformat.h @@ -34,8 +34,6 @@ #define PFD_SUPPORT_COMPOSITION 0x00008000 #endif -#include "main/mtypes.h" - #include "pipe/p_compiler.h" #include "pipe/p_format.h" #include "state_tracker/st_api.h" diff --git a/src/gallium/targets/Makefile.dri b/src/gallium/targets/Makefile.dri index de05f96d231..59961e982aa 100644 --- a/src/gallium/targets/Makefile.dri +++ b/src/gallium/targets/Makefile.dri @@ -1,11 +1,12 @@ # -*-makefile-*- + ifeq ($(MESA_LLVM),1) PIPE_DRIVERS += $(TOP)/src/gallium/drivers/llvmpipe/libllvmpipe.a LDFLAGS += $(LLVM_LDFLAGS) -LD = g++ DRIVER_EXTRAS = $(LLVM_LIBS) -USE_CXX=1 +else +LDFLAGS += -lstdc++ endif MESA_MODULES = \ @@ -75,15 +76,11 @@ default: depend symlinks $(TOP)/$(LIB_DIR)/gallium $(LIBNAME) $(LIBNAME_STAGING) $(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) Makefile \ $(TOP)/src/mesa/drivers/dri/Makefile.template $(TOP)/src/mesa/drivers/dri/common/dri_test.o - $(MKLIB) -o [email protected] -noprefix -linker '$(CC)' -ldflags '$(LDFLAGS)' \ + $(MKLIB) -o [email protected] -noprefix -linker '$(CXX)' -ldflags 
'$(LDFLAGS)' \ $(OBJECTS) $(PIPE_DRIVERS) \ -Wl,--start-group $(MESA_MODULES) -Wl,--end-group \ $(DRI_LIB_DEPS) $(DRIVER_EXTRAS) - if [ "x${USE_CXX}" == "x" ]; then \ - $(CC) $(CFLAGS) -o [email protected] $(TOP)/src/mesa/drivers/dri/common/dri_test.o [email protected] $(DRI_LIB_DEPS); \ - else \ - $(CXX) $(CFLAGS) -o [email protected] $(TOP)/src/mesa/drivers/dri/common/dri_test.o [email protected] $(DRI_LIB_DEPS); \ - fi + $(CXX) $(CFLAGS) -o [email protected] $(TOP)/src/mesa/drivers/dri/common/dri_test.o [email protected] $(DRI_LIB_DEPS); @rm -f [email protected] mv -f [email protected] $@ diff --git a/src/gallium/targets/SConscript b/src/gallium/targets/SConscript index f8276b15558..e447d093610 100644 --- a/src/gallium/targets/SConscript +++ b/src/gallium/targets/SConscript @@ -1,18 +1,13 @@ import os Import('*') - + # Compatibility with old build scripts: # if 'mesa' in env['statetrackers']: - if 'xlib' in env['winsys']: - SConscript([ - 'libgl-xlib/SConscript', - ]) - - if 'gdi' in env['winsys']: - SConscript([ - 'libgl-gdi/SConscript', - ]) + if 'xlib' in env['winsys'] and 'libgl-xlib' not in env['targets']: + env['targets'].append('libgl-xlib') + if 'gdi' in env['winsys'] and 'libgl-gdi' not in env['targets']: + env['targets'].append('libgl-gdi') if not 'graw-xlib' in env['targets'] and not 'graw-null' in env['targets'] and not env['msvc']: # XXX: disable until MSVC can link correctly diff --git a/src/gallium/targets/dri-radeong/Makefile b/src/gallium/targets/dri-r300/Makefile index 3f9ec361664..9afbb13276d 100644 --- a/src/gallium/targets/dri-radeong/Makefile +++ b/src/gallium/targets/dri-r300/Makefile @@ -1,7 +1,7 @@ TOP = ../../../.. 
include $(TOP)/configs/current -LIBNAME = radeong_dri.so +LIBNAME = r300_dri.so PIPE_DRIVERS = \ $(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \ diff --git a/src/gallium/targets/dri-radeong/SConscript b/src/gallium/targets/dri-r300/SConscript index 1402c3bd120..33a458f2e68 100644 --- a/src/gallium/targets/dri-radeong/SConscript +++ b/src/gallium/targets/dri-r300/SConscript @@ -1,7 +1,7 @@ Import('*') if not 'r300' in env['drivers']: - print 'warning: r300 pipe driver not built skipping radeong_dri.so' + print 'warning: r300 pipe driver not built skipping r300_dri.so' Return() env = drienv.Clone() @@ -24,7 +24,7 @@ env.Prepend(LIBS = [ ]) env.SharedLibrary( - target ='radeon_dri.so', + target ='r300_dri.so', source = 'target.c', SHLIBPREFIX = '', ) diff --git a/src/gallium/targets/dri-radeong/target.c b/src/gallium/targets/dri-r300/target.c index 5a0a8dc5738..2ecf3457a76 100644 --- a/src/gallium/targets/dri-radeong/target.c +++ b/src/gallium/targets/dri-r300/target.c @@ -23,4 +23,4 @@ create_screen(int fd) return screen; } -DRM_DRIVER_DESCRIPTOR("radeon", "radeon", create_screen) +DRM_DRIVER_DESCRIPTOR("r300", "radeon", create_screen) diff --git a/src/gallium/targets/dri-r600/Makefile b/src/gallium/targets/dri-r600/Makefile index 932303d194e..661283de6a8 100644 --- a/src/gallium/targets/dri-r600/Makefile +++ b/src/gallium/targets/dri-r600/Makefile @@ -4,12 +4,12 @@ include $(TOP)/configs/current LIBNAME = r600_dri.so PIPE_DRIVERS = \ + $(TOP)/src/gallium/drivers/r600/libr600.a \ $(TOP)/src/gallium/state_trackers/dri/drm/libdridrm.a \ $(TOP)/src/gallium/winsys/r600/drm/libr600winsys.a \ $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \ $(TOP)/src/gallium/drivers/trace/libtrace.a \ - $(TOP)/src/gallium/drivers/rbug/librbug.a \ - $(TOP)/src/gallium/drivers/r600/libr600.a + $(TOP)/src/gallium/drivers/rbug/librbug.a C_SOURCES = \ target.c \ @@ -21,6 +21,6 @@ DRIVER_DEFINES = \ include ../Makefile.dri -DRI_LIB_DEPS += -ldrm_radeon +DRI_LIB_DEPS += symlinks: diff 
--git a/src/gallium/targets/egl-gdi/egl-static.c b/src/gallium/targets/egl-gdi/egl-static.c index ec2f865c317..4655d791170 100644 --- a/src/gallium/targets/egl-gdi/egl-static.c +++ b/src/gallium/targets/egl-gdi/egl-static.c @@ -33,6 +33,8 @@ #include "target-helpers/inline_debug_helper.h" #include "egldriver.h" +static struct st_api *stapis[ST_API_COUNT]; + static uint get_api_mask(void) { @@ -57,7 +59,11 @@ get_api_mask(void) static struct st_api * get_st_api(enum st_api_type api) { - struct st_api *stapi = NULL; + struct st_api *stapi; + + stapi = stapis[api]; + if (stapi) + return stapi; switch (api) { #if FEATURE_GL @@ -84,13 +90,33 @@ get_st_api(enum st_api_type api) break; } + stapis[api] = stapi; + return stapi; } static struct st_api * guess_gl_api(void) { - return NULL; + struct st_api *stapi = NULL; + +#if FEATURE_GL + stapi = get_st_api(ST_API_OPENGL); + if (stapi) + return stapi; +#endif +#if FEATURE_ES1 + stapi = get_st_api(ST_API_OPENGL_ES1); + if (stapi) + return stapi; +#endif +#if FEATURE_ES2 + stapi = get_st_api(ST_API_OPENGL_ES2); + if (stapi) + return stapi; +#endif + + return stapi; } static struct pipe_screen * @@ -127,7 +153,16 @@ init_loader(struct egl_g3d_loader *loader) static void egl_g3d_unload(_EGLDriver *drv) { + int i; + egl_g3d_destroy_driver(drv); + + for (i = 0; i < ST_API_COUNT; i++) { + if (stapis[i]) { + stapis[i]->destroy(stapis[i]); + stapis[i] = NULL; + } + } } static struct egl_g3d_loader loader; diff --git a/src/gallium/targets/egl/Makefile b/src/gallium/targets/egl/Makefile index 1e4bb4d94c2..2784fd0d100 100644 --- a/src/gallium/targets/egl/Makefile +++ b/src/gallium/targets/egl/Makefile @@ -90,13 +90,20 @@ nouveau_LIBS := \ $(TOP)/src/gallium/drivers/nv50/libnv50.a \ $(TOP)/src/gallium/drivers/nouveau/libnouveau.a -# radeon pipe driver -radeon_CPPFLAGS := -radeon_SYS := -ldrm -ldrm_radeon -radeon_LIBS := \ +# r300 pipe driver +r300_CPPFLAGS := +r300_SYS := -ldrm -ldrm_radeon +r300_LIBS := \ 
$(TOP)/src/gallium/winsys/radeon/drm/libradeonwinsys.a \ $(TOP)/src/gallium/drivers/r300/libr300.a +# r600 pipe driver +r600_CPPFLAGS := +r600_SYS := -ldrm -ldrm_radeon +r600_LIBS := \ + $(TOP)/src/gallium/winsys/r600/drm/libr600winsys.a \ + $(TOP)/src/gallium/drivers/r600/libr600.a + # vmwgfx pipe driver vmwgfx_CPPFLAGS := vmwgfx_SYS := @@ -119,17 +126,17 @@ endif # OpenGL state tracker GL_CPPFLAGS := -I$(TOP)/src/mesa $(API_DEFINES) -GL_SYS := -lpthread -lm -L$(TOP)/$(LIB_DIR) -l$(GL_LIB) +GL_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GL_LIB) GL_LIBS := $(TOP)/src/mesa/libmesagallium.a # OpenGL ES 1.x state tracker GLESv1_CM_CPPFLAGS := -I$(TOP)/src/mesa -GLESv1_CM_SYS := -lpthread -lm -L$(TOP)/$(LIB_DIR) -l$(GLESv1_CM_LIB) +GLESv1_CM_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GLESv1_CM_LIB) GLESv1_CM_LIBS := $(TOP)/src/mesa/libes1gallium.a # OpenGL ES 2.x state tracker GLESv2_CPPFLAGS := -I$(TOP)/src/mesa -GLESv2_SYS := -lpthread -lm -L$(TOP)/$(LIB_DIR) -l$(GLESv2_LIB) +GLESv2_SYS := $(DRI_LIB_DEPS) -L$(TOP)/$(LIB_DIR) -l$(GLESv2_LIB) GLESv2_LIBS := $(TOP)/src/mesa/libes2gallium.a # OpenVG state tracker @@ -151,7 +158,10 @@ ifneq ($(findstring nouveau/drm,$(GALLIUM_WINSYS_DIRS)),) OUTPUTS += nouveau endif ifneq ($(findstring radeon/drm,$(GALLIUM_WINSYS_DIRS)),) -OUTPUTS += radeon +OUTPUTS += r300 +endif +ifneq ($(findstring r600/drm,$(GALLIUM_WINSYS_DIRS)),) +OUTPUTS += r600 endif ifneq ($(findstring svga/drm,$(GALLIUM_WINSYS_DIRS)),) OUTPUTS += vmwgfx @@ -188,8 +198,11 @@ $(OUTPUT_PATH)/$(PIPE_PREFIX)i965.so: pipe_i965.o $(i965_LIBS) $(OUTPUT_PATH)/$(PIPE_PREFIX)nouveau.so: pipe_nouveau.o $(nouveau_LIBS) $(call mklib,nouveau) -$(OUTPUT_PATH)/$(PIPE_PREFIX)radeon.so: pipe_radeon.o $(radeon_LIBS) - $(call mklib,radeon) +$(OUTPUT_PATH)/$(PIPE_PREFIX)r300.so: pipe_r300.o $(r300_LIBS) + $(call mklib,r300) + +$(OUTPUT_PATH)/$(PIPE_PREFIX)r600.so: pipe_r600.o $(r600_LIBS) + $(call mklib,r600) $(OUTPUT_PATH)/$(PIPE_PREFIX)vmwgfx.so: pipe_vmwgfx.o $(vmwgfx_LIBS) 
$(call mklib,vmwgfx) diff --git a/src/gallium/targets/egl/egl.c b/src/gallium/targets/egl/egl.c index d9d89485c3c..a573b212179 100644 --- a/src/gallium/targets/egl/egl.c +++ b/src/gallium/targets/egl/egl.c @@ -155,24 +155,23 @@ load_pipe_module(struct pipe_module *pmod, const char *name) if (!pmod->name) return FALSE; + _eglLog(_EGL_DEBUG, "searching for pipe module %s", pmod->name); _eglSearchPathForEach(dlopen_pipe_module_cb, (void *) pmod); if (pmod->lib) { pmod->drmdd = (const struct drm_driver_descriptor *) util_dl_get_proc_address(pmod->lib, "driver_descriptor"); - if (pmod->drmdd) { - if (pmod->drmdd->driver_name) { - /* driver name mismatch */ - if (strcmp(pmod->drmdd->driver_name, pmod->name) != 0) - pmod->drmdd = NULL; - } - else { - /* swrast */ - pmod->swrast_create_screen = - (struct pipe_screen *(*)(struct sw_winsys *)) - util_dl_get_proc_address(pmod->lib, "swrast_create_screen"); - if (!pmod->swrast_create_screen) - pmod->drmdd = NULL; - } + + /* sanity check on the name */ + if (pmod->drmdd && strcmp(pmod->drmdd->name, pmod->name) != 0) + pmod->drmdd = NULL; + + /* swrast */ + if (pmod->drmdd && !pmod->drmdd->driver_name) { + pmod->swrast_create_screen = + (struct pipe_screen *(*)(struct sw_winsys *)) + util_dl_get_proc_address(pmod->lib, "swrast_create_screen"); + if (!pmod->swrast_create_screen) + pmod->drmdd = NULL; } if (!pmod->drmdd) { diff --git a/src/gallium/targets/egl/pipe_radeon.c b/src/gallium/targets/egl/pipe_r300.c index 35550bcb263..d84bb92539a 100644 --- a/src/gallium/targets/egl/pipe_radeon.c +++ b/src/gallium/targets/egl/pipe_r300.c @@ -24,4 +24,4 @@ create_screen(int fd) } PUBLIC -DRM_DRIVER_DESCRIPTOR("radeon", "radeon", create_screen) +DRM_DRIVER_DESCRIPTOR("r300", "radeon", create_screen) diff --git a/src/gallium/targets/egl/pipe_r600.c b/src/gallium/targets/egl/pipe_r600.c new file mode 100644 index 00000000000..486a6592585 --- /dev/null +++ b/src/gallium/targets/egl/pipe_r600.c @@ -0,0 +1,27 @@ + +#include 
"state_tracker/drm_driver.h" +#include "target-helpers/inline_debug_helper.h" +#include "r600/drm/r600_drm_public.h" +#include "r600/r600_public.h" + +static struct pipe_screen * +create_screen(int fd) +{ + struct radeon *rw; + struct pipe_screen *screen; + + rw = r600_drm_winsys_create(fd); + if (!rw) + return NULL; + + screen = r600_screen_create(rw); + if (!screen) + return NULL; + + screen = debug_screen_wrap(screen); + + return screen; +} + +PUBLIC +DRM_DRIVER_DESCRIPTOR("r600", "radeon", create_screen) diff --git a/src/gallium/targets/graw-xlib/graw_util.c b/src/gallium/targets/graw-xlib/graw_util.c index 47aca4464db..fc7c9ae6f92 100644 --- a/src/gallium/targets/graw-xlib/graw_util.c +++ b/src/gallium/targets/graw-xlib/graw_util.c @@ -1,6 +1,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" #include "tgsi/tgsi_text.h" #include "util/u_memory.h" #include "state_tracker/graw.h" diff --git a/src/gallium/targets/graw-xlib/graw_xlib.c b/src/gallium/targets/graw-xlib/graw_xlib.c index 41120ba3c70..8b64a0b819c 100644 --- a/src/gallium/targets/graw-xlib/graw_xlib.c +++ b/src/gallium/targets/graw-xlib/graw_xlib.c @@ -1,5 +1,6 @@ #include "pipe/p_compiler.h" #include "pipe/p_context.h" +#include "pipe/p_screen.h" #include "util/u_debug.h" #include "util/u_memory.h" #include "target-helpers/wrap_screen.h" diff --git a/src/gallium/targets/libgl-gdi/SConscript b/src/gallium/targets/libgl-gdi/SConscript index 144084f74f8..12fe403f62f 100644 --- a/src/gallium/targets/libgl-gdi/SConscript +++ b/src/gallium/targets/libgl-gdi/SConscript @@ -17,6 +17,7 @@ if env['platform'] == 'windows': 'user32', 'kernel32', 'ws2_32', + talloc, ]) sources = [] diff --git a/src/gallium/targets/libgl-xlib/Makefile b/src/gallium/targets/libgl-xlib/Makefile index e745023ba59..fe0541543ab 100644 --- a/src/gallium/targets/libgl-xlib/Makefile +++ b/src/gallium/targets/libgl-xlib/Makefile @@ -68,8 +68,9 @@ $(TOP)/$(LIB_DIR)/gallium: # Make the libGL.so library 
$(TOP)/$(LIB_DIR)/gallium/$(GL_LIB_NAME): $(XLIB_TARGET_OBJECTS) $(LIBS) Makefile $(TOP)/bin/mklib -o $(GL_LIB) \ - -linker "$(CC)" \ + -linker "$(CXX)" \ -major $(GL_MAJOR) -minor $(GL_MINOR) -patch $(GL_TINY) \ + -cplusplus \ -install $(TOP)/$(LIB_DIR)/gallium \ $(MKLIB_OPTIONS) $(XLIB_TARGET_OBJECTS) \ -Wl,--start-group $(LIBS) -Wl,--end-group $(GL_LIB_DEPS) diff --git a/src/gallium/targets/libgl-xlib/SConscript b/src/gallium/targets/libgl-xlib/SConscript index 78703fd096d..88e216a65be 100644 --- a/src/gallium/targets/libgl-xlib/SConscript +++ b/src/gallium/targets/libgl-xlib/SConscript @@ -35,6 +35,7 @@ env.Prepend(LIBS = [ mesa, glsl, gallium, + 'talloc' ]) sources = [ diff --git a/src/gallium/targets/libgl-xlib/xlib.c b/src/gallium/targets/libgl-xlib/xlib.c index 69b4ddd33f7..5a9c80c8566 100644 --- a/src/gallium/targets/libgl-xlib/xlib.c +++ b/src/gallium/targets/libgl-xlib/xlib.c @@ -36,6 +36,7 @@ #include "state_tracker/xlib_sw_winsys.h" #include "xm_public.h" +#include "state_tracker/st_api.h" #include "state_tracker/st_gl_api.h" /* piggy back on this libGL for OpenGL support in EGL */ diff --git a/src/gallium/tests/python/tests/texture_blit.py b/src/gallium/tests/python/tests/texture_blit.py index 58706dab93d..089d05c6237 100755 --- a/src/gallium/tests/python/tests/texture_blit.py +++ b/src/gallium/tests/python/tests/texture_blit.py @@ -55,7 +55,7 @@ def tex_coords(texture, face, level, zslice): [0.0, 1.0], ] - if texture.target == PIPE_TEXTURE_2D: + if texture.target == PIPE_TEXTURE_2D or texture.target == PIPE_TEXTURE_RECT: return [[s, t, 0.0] for s, t in st] elif texture.target == PIPE_TEXTURE_3D: depth = texture.get_depth(level) diff --git a/src/gallium/tools/addr2line.sh b/src/gallium/tools/addr2line.sh new file mode 100755 index 00000000000..34dec142716 --- /dev/null +++ b/src/gallium/tools/addr2line.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# This script processes symbols output by Gallium using glibc to human-readable function names + +lastbin= +i=-1 
+dir="$(mktemp -d)" +input="$1" + +# Gather all unique addresses for each binary +sed -nre 's|([^ ]*/[^ ]*)\(\+0x([^)]*).*|\1 \2|p' "$input"|sort|uniq|while read bin addr; do + if test "$lastbin" != "$bin"; then + ((++i)) + lastbin="$bin" + echo "$bin" > "$dir/$i.addrs.bin" + fi + echo "$addr" >> "$dir/$i.addrs" +done + +# Construct a sed script to convert hex address to human readable form, and apply it +for i in "$dir"/*.addrs; do + bin="$(<"$i.bin")" + addr2line -p -e "$bin" -a -f < "$i"|sed -nre 's@^0x0*([^:]*): ([^?]*)$@s|'"$bin"'(+0x\1)|\2|g@gp' + rm -f "$i" "$i.bin" +done|sed -f - "$input" + +rmdir "$dir" diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c index 660dbd0c332..d4bf124ce6f 100644 --- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c +++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c @@ -19,7 +19,8 @@ nouveau_drm_destroy_winsys(struct pipe_winsys *s) { struct nouveau_winsys *nv_winsys = nouveau_winsys(s); struct nouveau_screen *nv_screen= nouveau_screen(nv_winsys->pscreen); - nouveau_device_close(&nv_screen->device); + if (nv_screen) + nouveau_device_close(&nv_screen->device); FREE(nv_winsys); } diff --git a/src/gallium/winsys/r600/drm/r600_state.c b/src/gallium/winsys/r600/drm/r600_state.c index d17d6e7954f..71d65f0feab 100644 --- a/src/gallium/winsys/r600/drm/r600_state.c +++ b/src/gallium/winsys/r600/drm/r600_state.c @@ -30,6 +30,8 @@ #include "radeon_priv.h" #include "r600d.h" +#include "util/u_memory.h" + static int r600_state_pm4_resource(struct radeon_state *state); static int r600_state_pm4_cb0(struct radeon_state *state); static int r600_state_pm4_vgt(struct radeon_state *state); @@ -38,24 +40,69 @@ static int r600_state_pm4_shader(struct radeon_state *state); static int r600_state_pm4_draw(struct radeon_state *state); static int r600_state_pm4_config(struct radeon_state *state); static int r600_state_pm4_generic(struct radeon_state *state); +static int 
r600_state_pm4_query_begin(struct radeon_state *state); +static int r600_state_pm4_query_end(struct radeon_state *state); static int r700_state_pm4_config(struct radeon_state *state); static int r700_state_pm4_cb0(struct radeon_state *state); static int r700_state_pm4_db(struct radeon_state *state); #include "r600_states.h" + +#define SUB_NONE(param) { { 0, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) } } +#define SUB_PS(param) { R600_SHADER_PS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) } +#define SUB_VS(param) { R600_SHADER_VS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) } +#define SUB_GS(param) { R600_SHADER_GS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) } +#define SUB_FS(param) { R600_SHADER_FS, R600_names_##param, (sizeof(R600_names_##param)/sizeof(struct radeon_register)) } + +/* some of these are overriden at runtime for R700 */ +struct radeon_stype_info r600_stypes[] = { + { R600_STATE_CONFIG, 1, 0, r600_state_pm4_config, SUB_NONE(CONFIG), }, + { R600_STATE_CB_CNTL, 1, 0, r600_state_pm4_generic, SUB_NONE(CB_CNTL) }, + { R600_STATE_RASTERIZER, 1, 0, r600_state_pm4_generic, SUB_NONE(RASTERIZER) }, + { R600_STATE_VIEWPORT, 1, 0, r600_state_pm4_generic, SUB_NONE(VIEWPORT) }, + { R600_STATE_SCISSOR, 1, 0, r600_state_pm4_generic, SUB_NONE(SCISSOR) }, + { R600_STATE_BLEND, 1, 0, r600_state_pm4_generic, SUB_NONE(BLEND), }, + { R600_STATE_DSA, 1, 0, r600_state_pm4_generic, SUB_NONE(DSA), }, + { R600_STATE_SHADER, 1, 0, r600_state_pm4_shader, { SUB_PS(PS_SHADER), SUB_VS(VS_SHADER) } }, + { R600_STATE_CONSTANT, 256, 0x10, r600_state_pm4_generic, { SUB_PS(PS_CONSTANT), SUB_VS(VS_CONSTANT) } }, + { R600_STATE_RESOURCE, 160, 0x1c, r600_state_pm4_resource, { SUB_PS(PS_RESOURCE), SUB_VS(VS_RESOURCE), SUB_GS(GS_RESOURCE), SUB_FS(FS_RESOURCE)} }, + { R600_STATE_SAMPLER, 18, 0xc, r600_state_pm4_generic, { SUB_PS(PS_SAMPLER), 
SUB_VS(VS_SAMPLER), SUB_GS(GS_SAMPLER) } }, + { R600_STATE_SAMPLER_BORDER, 18, 0x10, r600_state_pm4_generic, { SUB_PS(PS_SAMPLER_BORDER), SUB_VS(VS_SAMPLER_BORDER), SUB_GS(GS_SAMPLER_BORDER) } }, + { R600_STATE_CB0, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB0) }, + { R600_STATE_CB1, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB1) }, + { R600_STATE_CB2, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB2) }, + { R600_STATE_CB3, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB3) }, + { R600_STATE_CB4, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB4) }, + { R600_STATE_CB5, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB5) }, + { R600_STATE_CB6, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB6) }, + { R600_STATE_CB7, 1, 0, r600_state_pm4_cb0, SUB_NONE(CB7) }, + { R600_STATE_QUERY_BEGIN, 1, 0, r600_state_pm4_query_begin, SUB_NONE(VGT_EVENT) }, + { R600_STATE_QUERY_END, 1, 0, r600_state_pm4_query_end, SUB_NONE(VGT_EVENT) }, + { R600_STATE_DB, 1, 0, r600_state_pm4_db, SUB_NONE(DB) }, + { R600_STATE_UCP, 1, 0, r600_state_pm4_generic, SUB_NONE(UCP) }, + { R600_STATE_VGT, 1, 0, r600_state_pm4_vgt, SUB_NONE(VGT) }, + { R600_STATE_DRAW, 1, 0, r600_state_pm4_draw, SUB_NONE(DRAW) }, +}; +#define STYPES_SIZE Elements(r600_stypes) + +static const struct radeon_register *get_regs(struct radeon_state *state) +{ + return state->stype->reginfo[state->shader_index].regs; +} + /* * r600/r700 state functions */ static int r600_state_pm4_bytecode(struct radeon_state *state, unsigned offset, unsigned id, unsigned nreg) { - const struct radeon_register *regs = state->radeon->type[state->type].regs; + const struct radeon_register *regs = get_regs(state); unsigned i; int r; if (!offset) { fprintf(stderr, "%s invalid register for state %d %d\n", - __func__, state->type, id); + __func__, state->stype->stype, id); return -EINVAL; } if (offset >= R600_CONFIG_REG_OFFSET && offset < R600_CONFIG_REG_END) { @@ -114,19 +161,18 @@ static int r600_state_pm4_bytecode(struct radeon_state *state, unsigned offset, static int r600_state_pm4_generic(struct radeon_state 
*state) { - struct radeon *radeon = state->radeon; - unsigned i, offset, nreg, type, coffset, loffset, soffset; + const struct radeon_register *regs = get_regs(state); + unsigned i, offset, nreg, coffset, loffset, soffset; unsigned start; int r; if (!state->nstates) return 0; - type = state->type; - soffset = (state->id - radeon->type[type].id) * radeon->type[type].stride; - offset = loffset = radeon->type[type].regs[0].offset + soffset; + soffset = state->id * state->stype->stride; + offset = loffset = regs[0].offset + soffset; start = 0; for (i = 1, nreg = 1; i < state->nstates; i++) { - coffset = radeon->type[type].regs[i].offset + soffset; + coffset = regs[i].offset + soffset; if (coffset == (loffset + 4)) { nreg++; loffset = coffset; @@ -233,20 +279,54 @@ static int r600_state_pm4_config(struct radeon_state *state) state->pm4[state->cpm4++] = 0x80000000; state->pm4[state->cpm4++] = 0x80000000; state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 0); - state->pm4[state->cpm4++] = 0x00000016; + state->pm4[state->cpm4++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT; state->pm4[state->cpm4++] = PKT3(PKT3_SET_CONFIG_REG, 1); state->pm4[state->cpm4++] = 0x00000010; state->pm4[state->cpm4++] = 0x00028000; return r600_state_pm4_generic(state); } +static int r600_state_pm4_query_begin(struct radeon_state *state) +{ + int r; + + state->cpm4 = 0; + state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 2); + state->pm4[state->cpm4++] = EVENT_TYPE_ZPASS_DONE; + state->pm4[state->cpm4++] = state->states[0]; + state->pm4[state->cpm4++] = 0x0; + state->pm4[state->cpm4++] = PKT3(PKT3_NOP, 0); + r = radeon_state_reloc(state, state->cpm4, 0); + if (r) + return r; + state->pm4[state->cpm4++] = state->bo[0]->handle; + return 0; +} + +static int r600_state_pm4_query_end(struct radeon_state *state) +{ + int r; + + state->cpm4 = 0; + state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 2); + state->pm4[state->cpm4++] = EVENT_TYPE_ZPASS_DONE; + state->pm4[state->cpm4++] = state->states[0]; + 
state->pm4[state->cpm4++] = 0x0; + state->pm4[state->cpm4++] = PKT3(PKT3_NOP, 0); + r = radeon_state_reloc(state, state->cpm4, 0); + if (r) + return r; + state->pm4[state->cpm4++] = state->bo[0]->handle; + return 0; +} + static int r700_state_pm4_config(struct radeon_state *state) { state->pm4[state->cpm4++] = PKT3(PKT3_CONTEXT_CONTROL, 1); state->pm4[state->cpm4++] = 0x80000000; state->pm4[state->cpm4++] = 0x80000000; state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 0); - state->pm4[state->cpm4++] = 0x00000016; + state->pm4[state->cpm4++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT; state->pm4[state->cpm4++] = PKT3(PKT3_SET_CONFIG_REG, 1); state->pm4[state->cpm4++] = 0x00000010; state->pm4[state->cpm4++] = 0x00028000; @@ -287,7 +367,6 @@ static int r600_state_pm4_vgt(struct radeon_state *state) static int r600_state_pm4_draw(struct radeon_state *state) { - unsigned i; int r; if (state->nbo) { @@ -301,20 +380,13 @@ static int r600_state_pm4_draw(struct radeon_state *state) if (r) return r; state->pm4[state->cpm4++] = state->bo[0]->handle; - } else if (state->nimmd) { - state->pm4[state->cpm4++] = PKT3(PKT3_DRAW_INDEX_IMMD, state->nimmd + 1); - state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_NUM_INDICES]; - state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_DRAW_INITIATOR]; - for (i = 0; i < state->nimmd; i++) { - state->pm4[state->cpm4++] = state->immd[i]; - } } else { state->pm4[state->cpm4++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1); state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_NUM_INDICES]; state->pm4[state->cpm4++] = state->states[R600_DRAW__VGT_DRAW_INITIATOR]; } state->pm4[state->cpm4++] = PKT3(PKT3_EVENT_WRITE, 0); - state->pm4[state->cpm4++] = 0x00000016; + state->pm4[state->cpm4++] = EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT; return 0; } @@ -322,8 +394,9 @@ static int r600_state_pm4_resource(struct radeon_state *state) { u32 flags, type, nbo, offset, soffset; int r; + const struct radeon_register *regs = get_regs(state); - soffset = (state->id - 
state->radeon->type[state->type].id) * state->radeon->type[state->type].stride; + soffset = state->id * state->stype->stride; type = G_038018_TYPE(state->states[6]); switch (type) { case 2: @@ -342,7 +415,7 @@ static int r600_state_pm4_resource(struct radeon_state *state) return -EINVAL; } r600_state_pm4_with_flush(state, flags); - offset = state->radeon->type[state->type].regs[0].offset + soffset; + offset = regs[0].offset + soffset; state->pm4[state->cpm4++] = PKT3(PKT3_SET_RESOURCE, 7); state->pm4[state->cpm4++] = (offset - R_038000_SQ_TEX_RESOURCE_WORD0_0) >> 2; state->pm4[state->cpm4++] = state->states[0]; @@ -367,33 +440,62 @@ static int r600_state_pm4_resource(struct radeon_state *state) return 0; } -int r600_init(struct radeon *radeon) + +static void r600_modify_type_array(struct radeon *radeon) { + int i; switch (radeon->family) { - case CHIP_R600: - case CHIP_RV610: - case CHIP_RV630: - case CHIP_RV670: - case CHIP_RV620: - case CHIP_RV635: - case CHIP_RS780: - case CHIP_RS880: - radeon->ntype = R600_NTYPE; - radeon->nstate = R600_NSTATE; - radeon->type = R600_types; - break; case CHIP_RV770: case CHIP_RV730: case CHIP_RV710: case CHIP_RV740: - radeon->ntype = R600_NTYPE; - radeon->nstate = R600_NSTATE; - radeon->type = R700_types; break; default: - fprintf(stderr, "%s unknown or unsupported chipset 0x%04X\n", - __func__, radeon->device); - return -EINVAL; + return; + } + + /* r700 needs some mods */ + for (i = 0; i < radeon->nstype; i++) { + struct radeon_stype_info *info = &radeon->stype[i]; + + switch(info->stype) { + case R600_STATE_CONFIG: + info->pm4 = r700_state_pm4_config; + break; + case R600_STATE_CB0: + info->pm4 = r700_state_pm4_cb0; + break; + case R600_STATE_DB: + info->pm4 = r700_state_pm4_db; + }; } +} + +static void r600_build_types_array(struct radeon *radeon) +{ + int i, j; + int id = 0; + + for (i = 0; i < STYPES_SIZE; i++) { + r600_stypes[i].base_id = id; + r600_stypes[i].npm4 = 128; + if (r600_stypes[i].reginfo[0].shader_type == 0) { 
+ id += r600_stypes[i].num; + } else { + for (j = 0; j < R600_SHADER_MAX; j++) { + if (r600_stypes[i].reginfo[j].shader_type) + id += r600_stypes[i].num; + } + } + } + radeon->stype = r600_stypes; + radeon->nstype = STYPES_SIZE; + + r600_modify_type_array(radeon); +} + +int r600_init(struct radeon *radeon) +{ + r600_build_types_array(radeon); return 0; } diff --git a/src/gallium/winsys/r600/drm/r600_states.h b/src/gallium/winsys/r600/drm/r600_states.h index e40c77d8f6c..09d79d498d8 100644 --- a/src/gallium/winsys/r600/drm/r600_states.h +++ b/src/gallium/winsys/r600/drm/r600_states.h @@ -17,7 +17,7 @@ #ifndef R600_STATES_H #define R600_STATES_H -static const struct radeon_register R600_CONFIG_names[] = { +static const struct radeon_register R600_names_CONFIG[] = { {0x00008C00, 0, 0, "SQ_CONFIG"}, {0x00008C04, 0, 0, "SQ_GPR_RESOURCE_MGMT_1"}, {0x00008C08, 0, 0, "SQ_GPR_RESOURCE_MGMT_2"}, @@ -61,7 +61,7 @@ static const struct radeon_register R600_CONFIG_names[] = { {0x00028B20, 0, 0, "VGT_STRMOUT_BUFFER_EN"}, }; -static const struct radeon_register R600_CB_CNTL_names[] = { +static const struct radeon_register R600_names_CB_CNTL[] = { {0x00028120, 0, 0, "CB_CLEAR_RED"}, {0x00028124, 0, 0, "CB_CLEAR_GREEN"}, {0x00028128, 0, 0, "CB_CLEAR_BLUE"}, @@ -82,7 +82,7 @@ static const struct radeon_register R600_CB_CNTL_names[] = { {0x00028C48, 0, 0, "PA_SC_AA_MASK"}, }; -static const struct radeon_register R600_RASTERIZER_names[] = { +static const struct radeon_register R600_names_RASTERIZER[] = { {0x000286D4, 0, 0, "SPI_INTERP_CONTROL_0"}, {0x00028810, 0, 0, "PA_CL_CLIP_CNTL"}, {0x00028814, 0, 0, "PA_SU_SC_MODE_CNTL"}, @@ -106,7 +106,7 @@ static const struct radeon_register R600_RASTERIZER_names[] = { {0x00028E0C, 0, 0, "PA_SU_POLY_OFFSET_BACK_OFFSET"}, }; -static const struct radeon_register R600_VIEWPORT_names[] = { +static const struct radeon_register R600_names_VIEWPORT[] = { {0x000282D0, 0, 0, "PA_SC_VPORT_ZMIN_0"}, {0x000282D4, 0, 0, "PA_SC_VPORT_ZMAX_0"}, {0x0002843C, 0, 
0, "PA_CL_VPORT_XSCALE_0"}, @@ -118,7 +118,7 @@ static const struct radeon_register R600_VIEWPORT_names[] = { {0x00028818, 0, 0, "PA_CL_VTE_CNTL"}, }; -static const struct radeon_register R600_SCISSOR_names[] = { +static const struct radeon_register R600_names_SCISSOR[] = { {0x00028030, 0, 0, "PA_SC_SCREEN_SCISSOR_TL"}, {0x00028034, 0, 0, "PA_SC_SCREEN_SCISSOR_BR"}, {0x00028200, 0, 0, "PA_SC_WINDOW_OFFSET"}, @@ -140,7 +140,7 @@ static const struct radeon_register R600_SCISSOR_names[] = { {0x00028254, 0, 0, "PA_SC_VPORT_SCISSOR_0_BR"}, }; -static const struct radeon_register R600_BLEND_names[] = { +static const struct radeon_register R600_names_BLEND[] = { {0x00028414, 0, 0, "CB_BLEND_RED"}, {0x00028418, 0, 0, "CB_BLEND_GREEN"}, {0x0002841C, 0, 0, "CB_BLEND_BLUE"}, @@ -156,7 +156,7 @@ static const struct radeon_register R600_BLEND_names[] = { {0x00028804, 0, 0, "CB_BLEND_CONTROL"}, }; -static const struct radeon_register R600_DSA_names[] = { +static const struct radeon_register R600_names_DSA[] = { {0x00028028, 0, 0, "DB_STENCIL_CLEAR"}, {0x0002802C, 0, 0, "DB_DEPTH_CLEAR"}, {0x00028410, 0, 0, "SX_ALPHA_TEST_CONTROL"}, @@ -175,7 +175,7 @@ static const struct radeon_register R600_DSA_names[] = { {0x00028D44, 0, 0, "DB_ALPHA_TO_MASK"}, }; -static const struct radeon_register R600_VS_SHADER_names[] = { +static const struct radeon_register R600_names_VS_SHADER[] = { {0x00028380, 0, 0, "SQ_VTX_SEMANTIC_0"}, {0x00028384, 0, 0, "SQ_VTX_SEMANTIC_1"}, {0x00028388, 0, 0, "SQ_VTX_SEMANTIC_2"}, @@ -227,7 +227,7 @@ static const struct radeon_register R600_VS_SHADER_names[] = { {0x000288DC, 0, 0, "SQ_PGM_CF_OFFSET_FS"}, }; -static const struct radeon_register R600_PS_SHADER_names[] = { +static const struct radeon_register R600_names_PS_SHADER[] = { {0x00028644, 0, 0, "SPI_PS_INPUT_CNTL_0"}, {0x00028648, 0, 0, "SPI_PS_INPUT_CNTL_1"}, {0x0002864C, 0, 0, "SPI_PS_INPUT_CNTL_2"}, @@ -269,21 +269,48 @@ static const struct radeon_register R600_PS_SHADER_names[] = { {0x000288CC, 0, 0, 
"SQ_PGM_CF_OFFSET_PS"}, }; -static const struct radeon_register R600_PS_CONSTANT_names[] = { +static const struct radeon_register R600_names_PS_CONSTANT[] = { {0x00030000, 0, 0, "SQ_ALU_CONSTANT0_0"}, {0x00030004, 0, 0, "SQ_ALU_CONSTANT1_0"}, {0x00030008, 0, 0, "SQ_ALU_CONSTANT2_0"}, {0x0003000C, 0, 0, "SQ_ALU_CONSTANT3_0"}, }; -static const struct radeon_register R600_VS_CONSTANT_names[] = { +static const struct radeon_register R600_names_VS_CONSTANT[] = { {0x00031000, 0, 0, "SQ_ALU_CONSTANT0_256"}, {0x00031004, 0, 0, "SQ_ALU_CONSTANT1_256"}, {0x00031008, 0, 0, "SQ_ALU_CONSTANT2_256"}, {0x0003100C, 0, 0, "SQ_ALU_CONSTANT3_256"}, }; -static const struct radeon_register R600_PS_RESOURCE_names[] = { +static const struct radeon_register R600_names_UCP[] = { + {0x00028E20, 0, 0, "PA_CL_UCP0_X"}, + {0x00028E24, 0, 0, "PA_CL_UCP0_Y"}, + {0x00028E28, 0, 0, "PA_CL_UCP0_Z"}, + {0x00028E2C, 0, 0, "PA_CL_UCP0_W"}, + {0x00028E30, 0, 0, "PA_CL_UCP1_X"}, + {0x00028E34, 0, 0, "PA_CL_UCP1_Y"}, + {0x00028E38, 0, 0, "PA_CL_UCP1_Z"}, + {0x00028E3C, 0, 0, "PA_CL_UCP1_W"}, + {0x00028E40, 0, 0, "PA_CL_UCP2_X"}, + {0x00028E44, 0, 0, "PA_CL_UCP2_Y"}, + {0x00028E48, 0, 0, "PA_CL_UCP2_Z"}, + {0x00028E4C, 0, 0, "PA_CL_UCP2_W"}, + {0x00028E50, 0, 0, "PA_CL_UCP3_X"}, + {0x00028E54, 0, 0, "PA_CL_UCP3_Y"}, + {0x00028E58, 0, 0, "PA_CL_UCP3_Z"}, + {0x00028E5C, 0, 0, "PA_CL_UCP3_W"}, + {0x00028E60, 0, 0, "PA_CL_UCP4_X"}, + {0x00028E64, 0, 0, "PA_CL_UCP4_Y"}, + {0x00028E68, 0, 0, "PA_CL_UCP4_Z"}, + {0x00028E6C, 0, 0, "PA_CL_UCP4_W"}, + {0x00028E70, 0, 0, "PA_CL_UCP5_X"}, + {0x00028E74, 0, 0, "PA_CL_UCP5_Y"}, + {0x00028E78, 0, 0, "PA_CL_UCP5_Z"}, + {0x00028E7C, 0, 0, "PA_CL_UCP5_W"}, +}; + +static const struct radeon_register R600_names_PS_RESOURCE[] = { {0x00038000, 0, 0, "RESOURCE0_WORD0"}, {0x00038004, 0, 0, "RESOURCE0_WORD1"}, {0x00038008, 0, 0, "RESOURCE0_WORD2"}, @@ -293,7 +320,7 @@ static const struct radeon_register R600_PS_RESOURCE_names[] = { {0x00038018, 0, 0, "RESOURCE0_WORD6"}, }; 
-static const struct radeon_register R600_VS_RESOURCE_names[] = { +static const struct radeon_register R600_names_VS_RESOURCE[] = { {0x00039180, 0, 0, "RESOURCE160_WORD0"}, {0x00039184, 0, 0, "RESOURCE160_WORD1"}, {0x00039188, 0, 0, "RESOURCE160_WORD2"}, @@ -303,7 +330,7 @@ static const struct radeon_register R600_VS_RESOURCE_names[] = { {0x00039198, 0, 0, "RESOURCE160_WORD6"}, }; -static const struct radeon_register R600_FS_RESOURCE_names[] = { +static const struct radeon_register R600_names_FS_RESOURCE[] = { {0x0003A300, 0, 0, "RESOURCE320_WORD0"}, {0x0003A304, 0, 0, "RESOURCE320_WORD1"}, {0x0003A308, 0, 0, "RESOURCE320_WORD2"}, @@ -313,7 +340,7 @@ static const struct radeon_register R600_FS_RESOURCE_names[] = { {0x0003A318, 0, 0, "RESOURCE320_WORD6"}, }; -static const struct radeon_register R600_GS_RESOURCE_names[] = { +static const struct radeon_register R600_names_GS_RESOURCE[] = { {0x0003A4C0, 0, 0, "RESOURCE336_WORD0"}, {0x0003A4C4, 0, 0, "RESOURCE336_WORD1"}, {0x0003A4C8, 0, 0, "RESOURCE336_WORD2"}, @@ -323,46 +350,46 @@ static const struct radeon_register R600_GS_RESOURCE_names[] = { {0x0003A4D8, 0, 0, "RESOURCE336_WORD6"}, }; -static const struct radeon_register R600_PS_SAMPLER_names[] = { +static const struct radeon_register R600_names_PS_SAMPLER[] = { {0x0003C000, 0, 0, "SQ_TEX_SAMPLER_WORD0_0"}, {0x0003C004, 0, 0, "SQ_TEX_SAMPLER_WORD1_0"}, {0x0003C008, 0, 0, "SQ_TEX_SAMPLER_WORD2_0"}, }; -static const struct radeon_register R600_VS_SAMPLER_names[] = { +static const struct radeon_register R600_names_VS_SAMPLER[] = { {0x0003C0D8, 0, 0, "SQ_TEX_SAMPLER_WORD0_18"}, {0x0003C0DC, 0, 0, "SQ_TEX_SAMPLER_WORD1_18"}, {0x0003C0E0, 0, 0, "SQ_TEX_SAMPLER_WORD2_18"}, }; -static const struct radeon_register R600_GS_SAMPLER_names[] = { +static const struct radeon_register R600_names_GS_SAMPLER[] = { {0x0003C1B0, 0, 0, "SQ_TEX_SAMPLER_WORD0_36"}, {0x0003C1B4, 0, 0, "SQ_TEX_SAMPLER_WORD1_36"}, {0x0003C1B8, 0, 0, "SQ_TEX_SAMPLER_WORD2_36"}, }; -static const struct 
radeon_register R600_PS_SAMPLER_BORDER_names[] = { +static const struct radeon_register R600_names_PS_SAMPLER_BORDER[] = { {0x0000A400, 0, 0, "TD_PS_SAMPLER0_BORDER_RED"}, {0x0000A404, 0, 0, "TD_PS_SAMPLER0_BORDER_GREEN"}, {0x0000A408, 0, 0, "TD_PS_SAMPLER0_BORDER_BLUE"}, {0x0000A40C, 0, 0, "TD_PS_SAMPLER0_BORDER_ALPHA"}, }; -static const struct radeon_register R600_VS_SAMPLER_BORDER_names[] = { +static const struct radeon_register R600_names_VS_SAMPLER_BORDER[] = { {0x0000A600, 0, 0, "TD_VS_SAMPLER0_BORDER_RED"}, {0x0000A604, 0, 0, "TD_VS_SAMPLER0_BORDER_GREEN"}, {0x0000A608, 0, 0, "TD_VS_SAMPLER0_BORDER_BLUE"}, {0x0000A60C, 0, 0, "TD_VS_SAMPLER0_BORDER_ALPHA"}, }; -static const struct radeon_register R600_GS_SAMPLER_BORDER_names[] = { +static const struct radeon_register R600_names_GS_SAMPLER_BORDER[] = { {0x0000A800, 0, 0, "TD_GS_SAMPLER0_BORDER_RED"}, {0x0000A804, 0, 0, "TD_GS_SAMPLER0_BORDER_GREEN"}, {0x0000A808, 0, 0, "TD_GS_SAMPLER0_BORDER_BLUE"}, {0x0000A80C, 0, 0, "TD_GS_SAMPLER0_BORDER_ALPHA"}, }; -static const struct radeon_register R600_CB0_names[] = { +static const struct radeon_register R600_names_CB0[] = { {0x00028040, 1, 0, "CB_COLOR0_BASE"}, {0x000280A0, 0, 0, "CB_COLOR0_INFO"}, {0x00028060, 0, 0, "CB_COLOR0_SIZE"}, @@ -372,7 +399,7 @@ static const struct radeon_register R600_CB0_names[] = { {0x00028100, 0, 0, "CB_COLOR0_MASK"}, }; -static const struct radeon_register R600_CB1_names[] = { +static const struct radeon_register R600_names_CB1[] = { {0x00028044, 1, 0, "CB_COLOR1_BASE"}, {0x000280A4, 0, 0, "CB_COLOR1_INFO"}, {0x00028064, 0, 0, "CB_COLOR1_SIZE"}, @@ -382,7 +409,7 @@ static const struct radeon_register R600_CB1_names[] = { {0x00028104, 0, 0, "CB_COLOR1_MASK"}, }; -static const struct radeon_register R600_CB2_names[] = { +static const struct radeon_register R600_names_CB2[] = { {0x00028048, 1, 0, "CB_COLOR2_BASE"}, {0x000280A8, 0, 0, "CB_COLOR2_INFO"}, {0x00028068, 0, 0, "CB_COLOR2_SIZE"}, @@ -392,7 +419,7 @@ static const struct 
radeon_register R600_CB2_names[] = { {0x00028108, 0, 0, "CB_COLOR2_MASK"}, }; -static const struct radeon_register R600_CB3_names[] = { +static const struct radeon_register R600_names_CB3[] = { {0x0002804C, 1, 0, "CB_COLOR3_BASE"}, {0x000280AC, 0, 0, "CB_COLOR3_INFO"}, {0x0002806C, 0, 0, "CB_COLOR3_SIZE"}, @@ -402,7 +429,7 @@ static const struct radeon_register R600_CB3_names[] = { {0x0002810C, 0, 0, "CB_COLOR3_MASK"}, }; -static const struct radeon_register R600_CB4_names[] = { +static const struct radeon_register R600_names_CB4[] = { {0x00028050, 1, 0, "CB_COLOR4_BASE"}, {0x000280B0, 0, 0, "CB_COLOR4_INFO"}, {0x00028070, 0, 0, "CB_COLOR4_SIZE"}, @@ -412,7 +439,7 @@ static const struct radeon_register R600_CB4_names[] = { {0x00028110, 0, 0, "CB_COLOR4_MASK"}, }; -static const struct radeon_register R600_CB5_names[] = { +static const struct radeon_register R600_names_CB5[] = { {0x00028054, 1, 0, "CB_COLOR5_BASE"}, {0x000280B4, 0, 0, "CB_COLOR5_INFO"}, {0x00028074, 0, 0, "CB_COLOR5_SIZE"}, @@ -422,7 +449,7 @@ static const struct radeon_register R600_CB5_names[] = { {0x00028114, 0, 0, "CB_COLOR5_MASK"}, }; -static const struct radeon_register R600_CB6_names[] = { +static const struct radeon_register R600_names_CB6[] = { {0x00028058, 1, 0, "CB_COLOR6_BASE"}, {0x000280B8, 0, 0, "CB_COLOR6_INFO"}, {0x00028078, 0, 0, "CB_COLOR6_SIZE"}, @@ -432,7 +459,7 @@ static const struct radeon_register R600_CB6_names[] = { {0x00028118, 0, 0, "CB_COLOR6_MASK"}, }; -static const struct radeon_register R600_CB7_names[] = { +static const struct radeon_register R600_names_CB7[] = { {0x0002805C, 1, 0, "CB_COLOR7_BASE"}, {0x000280BC, 0, 0, "CB_COLOR7_INFO"}, {0x0002807C, 0, 0, "CB_COLOR7_SIZE"}, @@ -442,7 +469,7 @@ static const struct radeon_register R600_CB7_names[] = { {0x0002811C, 0, 0, "CB_COLOR7_MASK"}, }; -static const struct radeon_register R600_DB_names[] = { +static const struct radeon_register R600_names_DB[] = { {0x0002800C, 1, 0, "DB_DEPTH_BASE"}, {0x00028000, 0, 0, 
"DB_DEPTH_SIZE"}, {0x00028004, 0, 0, "DB_DEPTH_VIEW"}, @@ -451,7 +478,7 @@ static const struct radeon_register R600_DB_names[] = { {0x00028D34, 0, 0, "DB_PREFETCH_LIMIT"}, }; -static const struct radeon_register R600_VGT_names[] = { +static const struct radeon_register R600_names_VGT[] = { {0x00008958, 0, 0, "VGT_PRIMITIVE_TYPE"}, {0x00028400, 0, 0, "VGT_MAX_VTX_INDX"}, {0x00028404, 0, 0, "VGT_MIN_VTX_INDX"}, @@ -465,81 +492,15 @@ static const struct radeon_register R600_VGT_names[] = { {0x00028AA4, 0, 0, "VGT_INSTANCE_STEP_RATE_1"}, }; -static const struct radeon_register R600_DRAW_names[] = { +static const struct radeon_register R600_names_DRAW[] = { {0x00008970, 0, 0, "VGT_NUM_INDICES"}, {0x000287E4, 0, 0, "VGT_DMA_BASE_HI"}, {0x000287E8, 1, 0, "VGT_DMA_BASE"}, {0x000287F0, 0, 0, "VGT_DRAW_INITIATOR"}, }; -static struct radeon_type R600_types[] = { - { 128, 0, 0x00000000, 0x00000000, 0x0000, 0, "R600_CONFIG", 41, r600_state_pm4_config, R600_CONFIG_names}, - { 128, 1, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB_CNTL", 18, r600_state_pm4_generic, R600_CB_CNTL_names}, - { 128, 2, 0x00000000, 0x00000000, 0x0000, 0, "R600_RASTERIZER", 21, r600_state_pm4_generic, R600_RASTERIZER_names}, - { 128, 3, 0x00000000, 0x00000000, 0x0000, 0, "R600_VIEWPORT", 9, r600_state_pm4_generic, R600_VIEWPORT_names}, - { 128, 4, 0x00000000, 0x00000000, 0x0000, 0, "R600_SCISSOR", 19, r600_state_pm4_generic, R600_SCISSOR_names}, - { 128, 5, 0x00000000, 0x00000000, 0x0000, 0, "R600_BLEND", 13, r600_state_pm4_generic, R600_BLEND_names}, - { 128, 6, 0x00000000, 0x00000000, 0x0000, 0, "R600_DSA", 16, r600_state_pm4_generic, R600_DSA_names}, - { 128, 7, 0x00000000, 0x00000000, 0x0000, 0, "R600_VS_SHADER", 49, r600_state_pm4_shader, R600_VS_SHADER_names}, - { 128, 8, 0x00000000, 0x00000000, 0x0000, 0, "R600_PS_SHADER", 39, r600_state_pm4_shader, R600_PS_SHADER_names}, - { 128, 9, 0x00030000, 0x00031000, 0x0010, 0, "R600_PS_CONSTANT", 4, r600_state_pm4_generic, R600_PS_CONSTANT_names}, - { 128, 
265, 0x00031000, 0x00032000, 0x0010, 0, "R600_VS_CONSTANT", 4, r600_state_pm4_generic, R600_VS_CONSTANT_names}, - { 128, 521, 0x00038000, 0x00039180, 0x001C, 0, "R600_PS_RESOURCE", 7, r600_state_pm4_resource, R600_PS_RESOURCE_names}, - { 128, 681, 0x00039180, 0x0003A300, 0x001C, 0, "R600_VS_RESOURCE", 7, r600_state_pm4_resource, R600_VS_RESOURCE_names}, - { 128, 841, 0x00039180, 0x0003A300, 0x001C, 0, "R600_FS_RESOURCE", 7, r600_state_pm4_resource, R600_FS_RESOURCE_names}, - { 128, 1001, 0x00039180, 0x0003A300, 0x001C, 0, "R600_GS_RESOURCE", 7, r600_state_pm4_resource, R600_GS_RESOURCE_names}, - { 128, 1161, 0x0003C000, 0x0003C0D8, 0x000C, 0, "R600_PS_SAMPLER", 3, r600_state_pm4_generic, R600_PS_SAMPLER_names}, - { 128, 1179, 0x0003C0D8, 0x0003C1B0, 0x000C, 0, "R600_VS_SAMPLER", 3, r600_state_pm4_generic, R600_VS_SAMPLER_names}, - { 128, 1197, 0x0003C1B0, 0x0003C288, 0x000C, 0, "R600_GS_SAMPLER", 3, r600_state_pm4_generic, R600_GS_SAMPLER_names}, - { 128, 1215, 0x0000A400, 0x0000A520, 0x0010, 0, "R600_PS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_PS_SAMPLER_BORDER_names}, - { 128, 1233, 0x0000A600, 0x0000A720, 0x0010, 0, "R600_VS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_VS_SAMPLER_BORDER_names}, - { 128, 1251, 0x0000A800, 0x0000A920, 0x0010, 0, "R600_GS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_GS_SAMPLER_BORDER_names}, - { 128, 1269, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB0", 7, r600_state_pm4_cb0, R600_CB0_names}, - { 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB1", 7, r600_state_pm4_cb0, R600_CB1_names}, - { 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB2", 7, r600_state_pm4_cb0, R600_CB2_names}, - { 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB3", 7, r600_state_pm4_cb0, R600_CB3_names}, - { 128, 1273, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB4", 7, r600_state_pm4_cb0, R600_CB4_names}, - { 128, 1274, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB5", 7, r600_state_pm4_cb0, R600_CB5_names}, - { 128, 1275, 
0x00000000, 0x00000000, 0x0000, 0, "R600_CB6", 7, r600_state_pm4_cb0, R600_CB6_names}, - { 128, 1276, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB7", 7, r600_state_pm4_cb0, R600_CB7_names}, - { 128, 1277, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r600_state_pm4_db, R600_DB_names}, - { 128, 1278, 0x00000000, 0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names}, - { 128, 1279, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names}, -}; - -static struct radeon_type R700_types[] = { - { 128, 0, 0x00000000, 0x00000000, 0x0000, 0, "R600_CONFIG", 41, r700_state_pm4_config, R600_CONFIG_names}, - { 128, 1, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB_CNTL", 18, r600_state_pm4_generic, R600_CB_CNTL_names}, - { 128, 2, 0x00000000, 0x00000000, 0x0000, 0, "R600_RASTERIZER", 21, r600_state_pm4_generic, R600_RASTERIZER_names}, - { 128, 3, 0x00000000, 0x00000000, 0x0000, 0, "R600_VIEWPORT", 9, r600_state_pm4_generic, R600_VIEWPORT_names}, - { 128, 4, 0x00000000, 0x00000000, 0x0000, 0, "R600_SCISSOR", 19, r600_state_pm4_generic, R600_SCISSOR_names}, - { 128, 5, 0x00000000, 0x00000000, 0x0000, 0, "R600_BLEND", 13, r600_state_pm4_generic, R600_BLEND_names}, - { 128, 6, 0x00000000, 0x00000000, 0x0000, 0, "R600_DSA", 16, r600_state_pm4_generic, R600_DSA_names}, - { 128, 7, 0x00000000, 0x00000000, 0x0000, 0, "R600_VS_SHADER", 49, r600_state_pm4_shader, R600_VS_SHADER_names}, - { 128, 8, 0x00000000, 0x00000000, 0x0000, 0, "R600_PS_SHADER", 39, r600_state_pm4_shader, R600_PS_SHADER_names}, - { 128, 9, 0x00030000, 0x00031000, 0x0010, 0, "R600_PS_CONSTANT", 4, r600_state_pm4_generic, R600_PS_CONSTANT_names}, - { 128, 265, 0x00031000, 0x00032000, 0x0010, 0, "R600_VS_CONSTANT", 4, r600_state_pm4_generic, R600_VS_CONSTANT_names}, - { 128, 521, 0x00038000, 0x00039180, 0x001C, 0, "R600_PS_RESOURCE", 7, r600_state_pm4_resource, R600_PS_RESOURCE_names}, - { 128, 681, 0x00039180, 0x0003A300, 0x001C, 0, "R600_VS_RESOURCE", 7, 
r600_state_pm4_resource, R600_VS_RESOURCE_names}, - { 128, 841, 0x00039180, 0x0003A300, 0x001C, 0, "R600_FS_RESOURCE", 7, r600_state_pm4_resource, R600_FS_RESOURCE_names}, - { 128, 1001, 0x00039180, 0x0003A300, 0x001C, 0, "R600_GS_RESOURCE", 7, r600_state_pm4_resource, R600_GS_RESOURCE_names}, - { 128, 1161, 0x0003C000, 0x0003C0D8, 0x000C, 0, "R600_PS_SAMPLER", 3, r600_state_pm4_generic, R600_PS_SAMPLER_names}, - { 128, 1179, 0x0003C0D8, 0x0003C1B0, 0x000C, 0, "R600_VS_SAMPLER", 3, r600_state_pm4_generic, R600_VS_SAMPLER_names}, - { 128, 1197, 0x0003C1B0, 0x0003C288, 0x000C, 0, "R600_GS_SAMPLER", 3, r600_state_pm4_generic, R600_GS_SAMPLER_names}, - { 128, 1215, 0x0000A400, 0x0000A520, 0x0010, 0, "R600_PS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_PS_SAMPLER_BORDER_names}, - { 128, 1233, 0x0000A600, 0x0000A720, 0x0010, 0, "R600_VS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_VS_SAMPLER_BORDER_names}, - { 128, 1251, 0x0000A800, 0x0000A920, 0x0010, 0, "R600_GS_SAMPLER_BORDER", 4, r600_state_pm4_generic, R600_GS_SAMPLER_BORDER_names}, - { 128, 1269, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB0", 7, r700_state_pm4_cb0, R600_CB0_names}, - { 128, 1270, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB1", 7, r600_state_pm4_cb0, R600_CB1_names}, - { 128, 1271, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB2", 7, r600_state_pm4_cb0, R600_CB2_names}, - { 128, 1272, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB3", 7, r600_state_pm4_cb0, R600_CB3_names}, - { 128, 1273, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB4", 7, r600_state_pm4_cb0, R600_CB4_names}, - { 128, 1274, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB5", 7, r600_state_pm4_cb0, R600_CB5_names}, - { 128, 1275, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB6", 7, r600_state_pm4_cb0, R600_CB6_names}, - { 128, 1276, 0x00000000, 0x00000000, 0x0000, 0, "R600_CB7", 7, r600_state_pm4_cb0, R600_CB7_names}, - { 128, 1277, 0x00000000, 0x00000000, 0x0000, 0, "R600_DB", 6, r700_state_pm4_db, R600_DB_names}, - { 128, 1278, 0x00000000, 
0x00000000, 0x0000, 0, "R600_VGT", 11, r600_state_pm4_vgt, R600_VGT_names}, - { 128, 1279, 0x00000000, 0x00000000, 0x0000, 0, "R600_DRAW", 4, r600_state_pm4_draw, R600_DRAW_names}, +static const struct radeon_register R600_names_VGT_EVENT[] = { + {0x00028A90, 1, 0, "VGT_EVENT_INITIATOR"}, }; #endif diff --git a/src/gallium/winsys/r600/drm/r600d.h b/src/gallium/winsys/r600/drm/r600d.h index 5d13378627e..e8c2dc0651c 100644 --- a/src/gallium/winsys/r600/drm/r600d.h +++ b/src/gallium/winsys/r600/drm/r600d.h @@ -82,6 +82,9 @@ #define PKT3_SET_CTL_CONST 0x6F #define PKT3_SURFACE_BASE_UPDATE 0x73 +#define EVENT_TYPE_ZPASS_DONE 0x15 +#define EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT 0x16 + #define PKT_TYPE_S(x) (((x) & 0x3) << 30) #define PKT_TYPE_G(x) (((x) >> 30) & 0x3) #define PKT_TYPE_C 0x3FFFFFFF diff --git a/src/gallium/winsys/r600/drm/radeon.c b/src/gallium/winsys/r600/drm/radeon.c index 80b0a1d3972..e2d813ebac7 100644 --- a/src/gallium/winsys/r600/drm/radeon.c +++ b/src/gallium/winsys/r600/drm/radeon.c @@ -42,24 +42,13 @@ static int radeon_get_device(struct radeon *radeon) return r; } -/* symbol missing drove me crazy hack to get symbol exported */ -static void fake(void) -{ - struct radeon_ctx *ctx; - struct radeon_draw *draw; - - ctx = radeon_ctx(NULL); - draw = radeon_draw(NULL); -} - struct radeon *radeon_new(int fd, unsigned device) { struct radeon *radeon; - int r; + int r, i, id; radeon = calloc(1, sizeof(*radeon)); if (radeon == NULL) { - fake(); return NULL; } radeon->fd = fd; @@ -131,6 +120,19 @@ struct radeon *radeon_new(int fd, unsigned device) __func__, radeon->device); break; } + radeon->state_type_id = calloc(radeon->nstype, sizeof(unsigned)); + if (radeon->state_type_id == NULL) { + return radeon_decref(radeon); + } + for (i = 0, id = 0; i < radeon->nstype; i++) { + radeon->state_type_id[i] = id; + for (int j = 0; j < radeon->nstype; j++) { + if (radeon->stype[j].stype != i) + continue; + id += radeon->stype[j].num; + } + } + radeon->nstate_per_shader = 
id; return radeon; } @@ -153,47 +155,3 @@ struct radeon *radeon_decref(struct radeon *radeon) free(radeon); return NULL; } - -int radeon_reg_id(struct radeon *radeon, unsigned offset, unsigned *typeid, unsigned *stateid, unsigned *id) -{ - unsigned i, j; - - for (i = 0; i < radeon->ntype; i++) { - if (radeon->type[i].range_start) { - if (offset >= radeon->type[i].range_start && offset < radeon->type[i].range_end) { - *typeid = i; - j = offset - radeon->type[i].range_start; - j /= radeon->type[i].stride; - *stateid = radeon->type[i].id + j; - *id = (offset - radeon->type[i].range_start - radeon->type[i].stride * j) / 4; - return 0; - } - } else { - for (j = 0; j < radeon->type[i].nstates; j++) { - if (radeon->type[i].regs[j].offset == offset) { - *typeid = i; - *stateid = radeon->type[i].id; - *id = j; - return 0; - } - } - } - } - fprintf(stderr, "%s unknown register 0x%08X\n", __func__, offset); - return -EINVAL; -} - -unsigned radeon_type_from_id(struct radeon *radeon, unsigned id) -{ - unsigned i; - - for (i = 0; i < radeon->ntype - 1; i++) { - if (radeon->type[i].id == id) - return i; - if (id > radeon->type[i].id && id < radeon->type[i + 1].id) - return i; - } - if (radeon->type[i].id == id) - return i; - return -1; -} diff --git a/src/gallium/winsys/r600/drm/radeon_bo.c b/src/gallium/winsys/r600/drm/radeon_bo.c index f259ae7fb57..a1306f6e9d2 100644 --- a/src/gallium/winsys/r600/drm/radeon_bo.c +++ b/src/gallium/winsys/r600/drm/radeon_bo.c @@ -145,7 +145,9 @@ struct radeon_bo *radeon_bo_decref(struct radeon *radeon, struct radeon_bo *bo) return NULL; } - munmap(bo->data, bo->size); + if (bo->map_count) { + munmap(bo->data, bo->size); + } memset(&args, 0, sizeof(args)); args.handle = bo->handle; drmIoctl(radeon->fd, DRM_IOCTL_GEM_CLOSE, &args); diff --git a/src/gallium/winsys/r600/drm/radeon_ctx.c b/src/gallium/winsys/r600/drm/radeon_ctx.c index 45b706bb0f9..47fca761368 100644 --- a/src/gallium/winsys/r600/drm/radeon_ctx.c +++ 
b/src/gallium/winsys/r600/drm/radeon_ctx.c @@ -30,21 +30,16 @@ #include "radeon_drm.h" #include "bof.h" -int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo) +static int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo) { - void *ptr; - - ptr = realloc(ctx->bo, sizeof(struct radeon_bo) * (ctx->nbo + 1)); - if (ptr == NULL) { - return -ENOMEM; - } - ctx->bo = ptr; + if (ctx->nbo >= RADEON_CTX_MAX_PM4) + return -EBUSY; ctx->bo[ctx->nbo] = bo; ctx->nbo++; return 0; } -struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc) +static struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc) { struct radeon_cs_reloc *greloc; unsigned i; @@ -59,7 +54,7 @@ struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc) return NULL; } -void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *placement) +static void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *placement) { struct radeon_cs_reloc *greloc; unsigned i; @@ -76,50 +71,57 @@ void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *place } } -struct radeon_ctx *radeon_ctx(struct radeon *radeon) +void radeon_ctx_clear(struct radeon_ctx *ctx) { - struct radeon_ctx *ctx; - - if (radeon == NULL) - return NULL; - ctx = calloc(1, sizeof(*ctx)); - if (ctx == NULL) - return NULL; - ctx->radeon = radeon_incref(radeon); - return ctx; + for (int i = 0; i < ctx->nbo; i++) { + ctx->bo[i] = radeon_bo_decref(ctx->radeon, ctx->bo[i]); + } + ctx->ndwords = RADEON_CTX_MAX_PM4; + ctx->cdwords = 0; + ctx->nreloc = 0; + ctx->nbo = 0; } -struct radeon_ctx *radeon_ctx_incref(struct radeon_ctx *ctx) +int radeon_ctx_init(struct radeon_ctx *ctx, struct radeon *radeon) { - ctx->refcount++; - return ctx; + if (radeon == NULL) + return -EINVAL; + memset(ctx, 0, sizeof(struct radeon_ctx)); + ctx->radeon = radeon_incref(radeon); + radeon_ctx_clear(ctx); + ctx->pm4 = malloc(RADEON_CTX_MAX_PM4 * 4); + 
if (ctx->pm4 == NULL) { + radeon_ctx_fini(ctx); + return -ENOMEM; + } + ctx->reloc = malloc(sizeof(struct radeon_cs_reloc) * RADEON_CTX_MAX_PM4); + if (ctx->reloc == NULL) { + radeon_ctx_fini(ctx); + return -ENOMEM; + } + ctx->bo = malloc(sizeof(void *) * RADEON_CTX_MAX_PM4); + if (ctx->bo == NULL) { + radeon_ctx_fini(ctx); + return -ENOMEM; + } + return 0; } -struct radeon_ctx *radeon_ctx_decref(struct radeon_ctx *ctx) +void radeon_ctx_fini(struct radeon_ctx *ctx) { unsigned i; if (ctx == NULL) - return NULL; - if (--ctx->refcount > 0) { - return NULL; - } + return; - for (i = 0; i < ctx->ndraw; i++) { - ctx->draw[i] = radeon_draw_decref(ctx->draw[i]); - } for (i = 0; i < ctx->nbo; i++) { ctx->bo[i] = radeon_bo_decref(ctx->radeon, ctx->bo[i]); } ctx->radeon = radeon_decref(ctx->radeon); - free(ctx->state); - free(ctx->draw); free(ctx->bo); free(ctx->pm4); free(ctx->reloc); - memset(ctx, 0, sizeof(*ctx)); - free(ctx); - return NULL; + memset(ctx, 0, sizeof(struct radeon_ctx)); } static int radeon_ctx_state_bo(struct radeon_ctx *ctx, struct radeon_state *state) @@ -152,17 +154,17 @@ int radeon_ctx_submit(struct radeon_ctx *ctx) uint64_t chunk_array[2]; int r = 0; - if (!ctx->cpm4) + if (!ctx->cdwords) return 0; #if 0 - for (r = 0; r < ctx->cpm4; r++) { + for (r = 0; r < ctx->cdwords; r++) { fprintf(stderr, "0x%08X\n", ctx->pm4[r]); } #endif drmib.num_chunks = 2; drmib.chunks = (uint64_t)(uintptr_t)chunk_array; chunks[0].chunk_id = RADEON_CHUNK_ID_IB; - chunks[0].length_dw = ctx->cpm4; + chunks[0].length_dw = ctx->cdwords; chunks[0].chunk_data = (uint64_t)(uintptr_t)ctx->pm4; chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS; chunks[1].length_dw = ctx->nreloc * sizeof(struct radeon_cs_reloc) / 4; @@ -180,7 +182,6 @@ static int radeon_ctx_reloc(struct radeon_ctx *ctx, struct radeon_bo *bo, unsigned id, unsigned *placement) { unsigned i; - struct radeon_cs_reloc *ptr; for (i = 0; i < ctx->nreloc; i++) { if (ctx->reloc[i].handle == bo->handle) { @@ -188,14 +189,13 @@ static 
int radeon_ctx_reloc(struct radeon_ctx *ctx, struct radeon_bo *bo, return 0; } } - ptr = realloc(ctx->reloc, sizeof(struct radeon_cs_reloc) * (ctx->nreloc + 1)); - if (ptr == NULL) - return -ENOMEM; - ctx->reloc = ptr; - ptr[ctx->nreloc].handle = bo->handle; - ptr[ctx->nreloc].read_domain = placement[0] | placement [1]; - ptr[ctx->nreloc].write_domain = placement[0] | placement [1]; - ptr[ctx->nreloc].flags = 0; + if (ctx->nreloc >= RADEON_CTX_MAX_PM4) { + return -EBUSY; + } + ctx->reloc[ctx->nreloc].handle = bo->handle; + ctx->reloc[ctx->nreloc].read_domain = placement[0] | placement [1]; + ctx->reloc[ctx->nreloc].write_domain = placement[0] | placement [1]; + ctx->reloc[ctx->nreloc].flags = 0; ctx->pm4[id] = ctx->nreloc * sizeof(struct radeon_cs_reloc) / 4; ctx->nreloc++; return 0; @@ -208,97 +208,80 @@ static int radeon_ctx_state_schedule(struct radeon_ctx *ctx, struct radeon_state if (state == NULL) return 0; - memcpy(&ctx->pm4[ctx->id], state->pm4, state->cpm4 * 4); + if (state->cpm4 > ctx->ndwords) { + return -EBUSY; + } + memcpy(&ctx->pm4[ctx->cdwords], state->pm4, state->cpm4 * 4); for (i = 0; i < state->nreloc; i++) { rid = state->reloc_pm4_id[i]; bid = state->reloc_bo_id[i]; - cid = ctx->id + rid; + cid = ctx->cdwords + rid; r = radeon_ctx_reloc(ctx, state->bo[bid], cid, &state->placement[bid * 2]); if (r) { - fprintf(stderr, "%s state %d failed to reloc\n", __func__, state->type); + fprintf(stderr, "%s state %d failed to reloc\n", __func__, state->stype->stype); return r; } } - ctx->id += state->cpm4; + ctx->cdwords += state->cpm4; + ctx->ndwords -= state->cpm4; return 0; } -int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw) +int radeon_ctx_set_query_state(struct radeon_ctx *ctx, struct radeon_state *state) { - struct radeon_draw *pdraw = NULL; - struct radeon_draw **ndraw; - struct radeon_state *nstate, *ostate; - unsigned cpm4, i, cstate; - void *tmp; int r = 0; - ndraw = realloc(ctx->draw, sizeof(void*) * (ctx->ndraw + 1)); 
- if (ndraw == NULL) - return -ENOMEM; - ctx->draw = ndraw; - for (i = 0; i < draw->nstate; i++) { - r = radeon_ctx_state_bo(ctx, draw->state[i]); - if (r) - return r; - } - r = radeon_draw_check(draw); + /* !!! ONLY ACCEPT QUERY STATE HERE !!! */ + r = radeon_state_pm4(state); if (r) return r; - if (draw->cpm4 >= RADEON_CTX_MAX_PM4) { - fprintf(stderr, "%s single draw too big %d, max %d\n", - __func__, draw->cpm4, RADEON_CTX_MAX_PM4); + /* BEGIN/END query are balanced in the same cs so account for END + * END query when scheduling BEGIN query + */ + switch (state->stype->stype) { + case R600_STATE_QUERY_BEGIN: + /* is there enough place for begin & end */ + if ((state->cpm4 * 2) > ctx->ndwords) + return -EBUSY; + ctx->ndwords -= state->cpm4; + break; + case R600_STATE_QUERY_END: + ctx->ndwords += state->cpm4; + break; + default: return -EINVAL; } - tmp = realloc(ctx->state, (ctx->nstate + draw->nstate) * sizeof(void*)); - if (tmp == NULL) - return -ENOMEM; - ctx->state = tmp; - pdraw = ctx->cdraw; - for (i = 0, cpm4 = 0, cstate = ctx->nstate; i < draw->nstate - 1; i++) { - nstate = draw->state[i]; - if (nstate) { - if (pdraw && pdraw->state[i]) { - ostate = pdraw->state[i]; - if (ostate->pm4_crc != nstate->pm4_crc) { - ctx->state[cstate++] = nstate; - cpm4 += nstate->cpm4; - } - } else { - ctx->state[cstate++] = nstate; - cpm4 += nstate->cpm4; - } - } - } - /* The last state is the draw state always add it */ - if (draw->state[i] == NULL) { - fprintf(stderr, "%s no draw command\n", __func__); - return -EINVAL; - } - ctx->state[cstate++] = draw->state[i]; - cpm4 += draw->state[i]->cpm4; - if ((ctx->draw_cpm4 + cpm4) > RADEON_CTX_MAX_PM4) { - /* need to flush */ - return -EBUSY; - } - ctx->draw_cpm4 += cpm4; - ctx->nstate = cstate; - ctx->draw[ctx->ndraw++] = draw; - ctx->cdraw = draw; - return 0; + return radeon_ctx_state_schedule(ctx, state); } int radeon_ctx_set_draw(struct radeon_ctx *ctx, struct radeon_draw *draw) { - int r; + unsigned previous_cdwords; + int r 
= 0; - radeon_draw_incref(draw); - r = radeon_ctx_set_draw_new(ctx, draw); - if (r) - radeon_draw_decref(draw); - return r; + for (int i = 0; i < (ctx->radeon->nstate_per_shader * R600_SHADER_MAX); i++) { + r = radeon_ctx_state_bo(ctx, draw->state[i]); + if (r) + return r; + } + previous_cdwords = ctx->cdwords; + for (int i = 0, id = 0; i < ctx->radeon->nstate_per_shader; i++) { + for (int j = 0; j < R600_SHADER_MAX; j++) { + id = j * ctx->radeon->nstate_per_shader + i; + if (draw->state[id]) { + r = radeon_ctx_state_schedule(ctx, draw->state[id]); + if (r) { + ctx->cdwords = previous_cdwords; + return r; + } + } + } + } + return 0; } +#if 0 int radeon_ctx_pm4(struct radeon_ctx *ctx) { unsigned i; @@ -310,9 +293,6 @@ int radeon_ctx_pm4(struct radeon_ctx *ctx) if (ctx->pm4 == NULL) return -EINVAL; for (i = 0, ctx->id = 0; i < ctx->nstate; i++) { - r = radeon_ctx_state_schedule(ctx, ctx->state[i]); - if (r) - return r; } if (ctx->id != ctx->draw_cpm4) { fprintf(stderr, "%s miss predicted pm4 size %d for %d\n", @@ -322,6 +302,7 @@ int radeon_ctx_pm4(struct radeon_ctx *ctx) ctx->cpm4 = ctx->draw_cpm4; return 0; } +#endif void radeon_ctx_dump_bof(struct radeon_ctx *ctx, const char *file) { @@ -349,8 +330,8 @@ printf("%d relocs\n", ctx->nreloc); bof_decref(blob); blob = NULL; /* dump cs */ -printf("%d pm4\n", ctx->cpm4); - blob = bof_blob(ctx->cpm4 * 4, ctx->pm4); +printf("%d pm4\n", ctx->cdwords); + blob = bof_blob(ctx->cdwords * 4, ctx->pm4); if (blob == NULL) goto out_err; if (bof_object_set(root, "pm4", blob)) @@ -366,7 +347,6 @@ printf("%d pm4\n", ctx->cpm4); if (bo == NULL) goto out_err; size = bof_int32(ctx->bo[i]->size); -printf("[%d] %d bo\n", i, size); if (size == NULL) goto out_err; if (bof_object_set(bo, "size", size)) diff --git a/src/gallium/winsys/r600/drm/radeon_draw.c b/src/gallium/winsys/r600/drm/radeon_draw.c index 4413ed79fbd..b992c4a55dc 100644 --- a/src/gallium/winsys/r600/drm/radeon_draw.c +++ b/src/gallium/winsys/r600/drm/radeon_draw.c @@ -31,111 
+31,27 @@ /* * draw functions */ -struct radeon_draw *radeon_draw(struct radeon *radeon) +int radeon_draw_init(struct radeon_draw *draw, struct radeon *radeon) { - struct radeon_draw *draw; - - draw = calloc(1, sizeof(*draw)); - if (draw == NULL) - return NULL; - draw->nstate = radeon->nstate; draw->radeon = radeon; - draw->refcount = 1; - draw->state = calloc(1, sizeof(void*) * draw->nstate); - if (draw->state == NULL) { - free(draw); - return NULL; - } - return draw; -} - -struct radeon_draw *radeon_draw_incref(struct radeon_draw *draw) -{ - draw->refcount++; - return draw; -} - -struct radeon_draw *radeon_draw_decref(struct radeon_draw *draw) -{ - unsigned i; - - if (draw == NULL) - return NULL; - if (--draw->refcount > 0) - return NULL; - for (i = 0; i < draw->nstate; i++) { - draw->state[i] = radeon_state_decref(draw->state[i]); - } - free(draw->state); - memset(draw, 0, sizeof(*draw)); - free(draw); - return NULL; -} - -int radeon_draw_set_new(struct radeon_draw *draw, struct radeon_state *state) -{ - if (state == NULL) - return 0; - if (state->type >= draw->radeon->ntype) - return -EINVAL; - draw->state[state->id] = radeon_state_decref(draw->state[state->id]); - draw->state[state->id] = state; + draw->state = calloc(radeon->nstate_per_shader * R600_SHADER_MAX, sizeof(void*)); + if (draw->state == NULL) + return -ENOMEM; return 0; } -int radeon_draw_set(struct radeon_draw *draw, struct radeon_state *state) +void radeon_draw_bind(struct radeon_draw *draw, struct radeon_state *state) { if (state == NULL) - return 0; - radeon_state_incref(state); - return radeon_draw_set_new(draw, state); + return; + draw->state[state->state_id] = state; } -int radeon_draw_check(struct radeon_draw *draw) +void radeon_draw_unbind(struct radeon_draw *draw, struct radeon_state *state) { - unsigned i; - int r; - - r = radeon_draw_pm4(draw); - if (r) - return r; - for (i = 0, draw->cpm4 = 0; i < draw->nstate; i++) { - if (draw->state[i]) { - draw->cpm4 += draw->state[i]->cpm4; - } - 
} - return 0; -} - -struct radeon_draw *radeon_draw_duplicate(struct radeon_draw *draw) -{ - struct radeon_draw *ndraw; - unsigned i; - - if (draw == NULL) - return NULL; - ndraw = radeon_draw(draw->radeon); - if (ndraw == NULL) { - return NULL; - } - for (i = 0; i < draw->nstate; i++) { - if (radeon_draw_set(ndraw, draw->state[i])) { - radeon_draw_decref(ndraw); - return NULL; - } - } - return ndraw; -} - -int radeon_draw_pm4(struct radeon_draw *draw) -{ - unsigned i; - int r; - - for (i = 0; i < draw->nstate; i++) { - r = radeon_state_pm4(draw->state[i]); - if (r) - return r; + if (state == NULL) + return; + if (draw->state[state->state_id] == state) { + draw->state[state->state_id] = NULL; } - return 0; } diff --git a/src/gallium/winsys/r600/drm/radeon_priv.h b/src/gallium/winsys/r600/drm/radeon_priv.h index 96c0d060f7e..84e552ba4d3 100644 --- a/src/gallium/winsys/r600/drm/radeon_priv.h +++ b/src/gallium/winsys/r600/drm/radeon_priv.h @@ -37,17 +37,20 @@ struct radeon_register { char name[64]; }; -struct radeon_type { - unsigned npm4; - unsigned id; - unsigned range_start; - unsigned range_end; - unsigned stride; - unsigned immediate; - char name[64]; +struct radeon_sub_type { + int shader_type; + const struct radeon_register *regs; unsigned nstates; +}; + +struct radeon_stype_info { + unsigned stype; + unsigned num; + unsigned stride; radeon_state_pm4_t pm4; - const struct radeon_register *regs; + struct radeon_sub_type reginfo[R600_SHADER_MAX]; + unsigned base_id; + unsigned npm4; }; struct radeon { @@ -55,9 +58,10 @@ struct radeon { int refcount; unsigned device; unsigned family; - unsigned nstate; - unsigned ntype; - const struct radeon_type *type; + unsigned nstype; + unsigned nstate_per_shader; + unsigned *state_type_id; + struct radeon_stype_info *stype; }; extern struct radeon *radeon_new(int fd, unsigned device); @@ -65,15 +69,6 @@ extern struct radeon *radeon_incref(struct radeon *radeon); extern struct radeon *radeon_decref(struct radeon *radeon); 
extern unsigned radeon_family_from_device(unsigned device); extern int radeon_is_family_compatible(unsigned family1, unsigned family2); -extern int radeon_reg_id(struct radeon *radeon, unsigned offset, unsigned *typeid, unsigned *stateid, unsigned *id); -extern unsigned radeon_type_from_id(struct radeon *radeon, unsigned id); - - -int radeon_ctx_set_bo_new(struct radeon_ctx *ctx, struct radeon_bo *bo); -struct radeon_bo *radeon_ctx_get_bo(struct radeon_ctx *ctx, unsigned reloc); -void radeon_ctx_get_placement(struct radeon_ctx *ctx, unsigned reloc, u32 *placement); -int radeon_ctx_set_draw_new(struct radeon_ctx *ctx, struct radeon_draw *draw); -int radeon_ctx_draw(struct radeon_ctx *ctx); /* * r600/r700 context functions diff --git a/src/gallium/winsys/r600/drm/radeon_state.c b/src/gallium/winsys/r600/drm/radeon_state.c index 308288557a4..ac60485b280 100644 --- a/src/gallium/winsys/r600/drm/radeon_state.c +++ b/src/gallium/winsys/r600/drm/radeon_state.c @@ -32,82 +32,116 @@ /* * state core functions */ -struct radeon_state *radeon_state(struct radeon *radeon, u32 type, u32 id) +int radeon_state_init(struct radeon_state *state, struct radeon *radeon, u32 stype, u32 id, u32 shader_type) { - struct radeon_state *state; + struct radeon_stype_info *found = NULL; + int i, j, shader_index = -1; - if (type > radeon->ntype) { - fprintf(stderr, "%s invalid type %d\n", __func__, type); - return NULL; + /* traverse the stype array */ + for (i = 0; i < radeon->nstype; i++) { + /* if the type doesn't match, if the shader doesn't match */ + if (stype != radeon->stype[i].stype) + continue; + if (shader_type) { + for (j = 0; j < 4; j++) { + if (radeon->stype[i].reginfo[j].shader_type == shader_type) { + shader_index = j; + break; + } + } + if (shader_index == -1) + continue; + } else { + if (radeon->stype[i].reginfo[0].shader_type) + continue; + else + shader_index = 0; + } + if (id > radeon->stype[i].num) + continue; + + found = &radeon->stype[i]; + break; } - if (id > 
radeon->nstate) { - fprintf(stderr, "%s invalid state id %d\n", __func__, id); - return NULL; + + if (!found) { + fprintf(stderr, "%s invalid type %d/id %d/shader class %d\n", __func__, stype, id, shader_type); + return -EINVAL; } - state = calloc(1, sizeof(*state)); - if (state == NULL) - return NULL; + + memset(state, 0, sizeof(struct radeon_state)); + state->state_id = radeon->nstate_per_shader * shader_index + radeon->state_type_id[stype] + id; + state->stype = found; state->radeon = radeon; - state->type = type; state->id = id; + state->shader_index = shader_index; state->refcount = 1; - state->npm4 = radeon->type[type].npm4; - state->nstates = radeon->type[type].nstates; - state->states = calloc(1, state->nstates * 4); - state->pm4 = calloc(1, radeon->type[type].npm4 * 4); - if (state->states == NULL || state->pm4 == NULL) { - radeon_state_decref(state); - return NULL; - } - return state; + state->npm4 = found->npm4; + state->nstates = found->reginfo[shader_index].nstates; + return 0; } -struct radeon_state *radeon_state_duplicate(struct radeon_state *state) +int radeon_state_convert(struct radeon_state *state, u32 stype, u32 id, u32 shader_type) { - struct radeon_state *nstate = radeon_state(state->radeon, state->type, state->id); - unsigned i; + struct radeon_stype_info *found = NULL; + int i, j, shader_index = -1; if (state == NULL) - return NULL; - nstate->cpm4 = state->cpm4; - nstate->nbo = state->nbo; - nstate->nreloc = state->nreloc; - memcpy(nstate->states, state->states, state->nstates * 4); - memcpy(nstate->pm4, state->pm4, state->npm4 * 4); - memcpy(nstate->placement, state->placement, 8 * 4); - memcpy(nstate->reloc_pm4_id, state->reloc_pm4_id, 8 * 4); - memcpy(nstate->reloc_bo_id, state->reloc_bo_id, 8 * 4); - memcpy(nstate->bo_dirty, state->bo_dirty, 4 * 4); - for (i = 0; i < state->nbo; i++) { - nstate->bo[i] = radeon_bo_incref(state->radeon, state->bo[i]); + return 0; + /* traverse the stype array */ + for (i = 0; i < state->radeon->nstype; 
i++) { + /* if the type doesn't match, if the shader doesn't match */ + if (stype != state->radeon->stype[i].stype) + continue; + if (shader_type) { + for (j = 0; j < 4; j++) { + if (state->radeon->stype[i].reginfo[j].shader_type == shader_type) { + shader_index = j; + break; + } + } + if (shader_index == -1) + continue; + } else { + if (state->radeon->stype[i].reginfo[0].shader_type) + continue; + else + shader_index = 0; + } + if (id > state->radeon->stype[i].num) + continue; + + found = &state->radeon->stype[i]; + break; } - return nstate; -} -struct radeon_state *radeon_state_incref(struct radeon_state *state) -{ - state->refcount++; - return state; + if (!found) { + fprintf(stderr, "%s invalid type %d/id %d/shader class %d\n", __func__, stype, id, shader_type); + return -EINVAL; + } + + if (found->reginfo[shader_index].nstates != state->nstates) { + fprintf(stderr, "invalid type change from (%d %d %d) to (%d %d %d)\n", + state->stype->stype, state->id, state->shader_index, stype, id, shader_index); + } + + state->stype = found; + state->id = id; + state->shader_index = shader_index; + state->state_id = state->radeon->nstate_per_shader * shader_index + state->radeon->state_type_id[stype] + id; + return radeon_state_pm4(state); } -struct radeon_state *radeon_state_decref(struct radeon_state *state) +void radeon_state_fini(struct radeon_state *state) { unsigned i; if (state == NULL) return NULL; - if (--state->refcount > 0) { - return NULL; - } for (i = 0; i < state->nbo; i++) { state->bo[i] = radeon_bo_decref(state->radeon, state->bo[i]); } - free(state->immd); - free(state->states); - free(state->pm4); - memset(state, 0, sizeof(*state)); - free(state); - return NULL; + memset(state, 0, sizeof(struct radeon_state)); } int radeon_state_replace_always(struct radeon_state *ostate, @@ -147,12 +181,13 @@ int radeon_state_pm4(struct radeon_state *state) { int r; - if (state == NULL || state->cpm4) + if (state == NULL) return 0; - r = 
state->radeon->type[state->type].pm4(state); + state->cpm4 = 0; + r = state->stype->pm4(state); if (r) { fprintf(stderr, "%s failed to build PM4 for state(%d %d)\n", - __func__, state->type, state->id); + __func__, state->stype->stype, state->id); return r; } state->pm4_crc = crc32(state->pm4, state->cpm4 * 4); diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c index b997abda9b0..3a76098b655 100644 --- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c +++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c @@ -52,6 +52,7 @@ struct wrapper_sw_winsys struct sw_winsys base; struct pipe_screen *screen; struct pipe_context *pipe; + enum pipe_texture_target target; }; struct wrapper_sw_displaytarget @@ -145,7 +146,7 @@ wsw_dt_create(struct sw_winsys *ws, * XXX Why don't we just get the template. */ memset(&templ, 0, sizeof(templ)); - templ.target = PIPE_TEXTURE_2D; + templ.target = wsw->target; templ.width0 = width; templ.height0 = height; templ.format = format; @@ -291,6 +292,11 @@ wrapper_sw_winsys_warp_pipe_screen(struct pipe_screen *screen) if (!wsw->pipe) goto err_free; + if(screen->get_param(screen, PIPE_CAP_NPOT_TEXTURES)) + wsw->target = PIPE_TEXTURE_2D; + else + wsw->target = PIPE_TEXTURE_RECT; + return &wsw->base; err_free: |