diff options
Diffstat (limited to 'src/mesa')
70 files changed, 2970 insertions, 419 deletions
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c index 5610e9ff80f..36bed77b481 100644 --- a/src/mesa/drivers/common/meta.c +++ b/src/mesa/drivers/common/meta.c @@ -1544,7 +1544,8 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) const char *vs_source = "#extension GL_AMD_vertex_shader_layer : enable\n" "#extension GL_ARB_draw_instanced : enable\n" - "attribute vec4 position;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "layout(location = 0) in vec4 position;\n" "void main()\n" "{\n" "#ifdef GL_AMD_vertex_shader_layer\n" @@ -1553,7 +1554,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) " gl_Position = position;\n" "}\n"; const char *fs_source = - "uniform vec4 color;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "#extension GL_ARB_explicit_uniform_location :enable\n" + "layout(location = 0) uniform vec4 color;\n" "void main()\n" "{\n" " gl_FragColor = color;\n" @@ -1580,12 +1583,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) _mesa_DeleteShader(fs); _mesa_AttachShader(clear->ShaderProg, vs); _mesa_DeleteShader(vs); - _mesa_BindAttribLocation(clear->ShaderProg, 0, "position"); _mesa_ObjectLabel(GL_PROGRAM, clear->ShaderProg, -1, "meta clear"); _mesa_LinkProgram(clear->ShaderProg); - clear->ColorLocation = _mesa_GetUniformLocation(clear->ShaderProg, "color"); - has_integer_textures = _mesa_is_gles3(ctx) || (_mesa_is_desktop_gl(ctx) && ctx->Const.GLSLVersion >= 130); @@ -1596,7 +1596,8 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) "#version 130\n" "#extension GL_AMD_vertex_shader_layer : enable\n" "#extension GL_ARB_draw_instanced : enable\n" - "in vec4 position;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "layout(location = 0) in vec4 position;\n" "void main()\n" "{\n" "#ifdef GL_AMD_vertex_shader_layer\n" @@ -1607,7 +1608,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) const char *fs_int_source = ralloc_asprintf(shader_source_mem_ctx, "#version 130\n" - "uniform ivec4 color;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "#extension GL_ARB_explicit_uniform_location :enable\n" + "layout(location = 0) uniform ivec4 color;\n" "out ivec4 out_color;\n" "\n" "void main()\n" @@ -1626,7 +1629,6 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) _mesa_DeleteShader(fs); _mesa_AttachShader(clear->IntegerShaderProg, vs); _mesa_DeleteShader(vs); - _mesa_BindAttribLocation(clear->IntegerShaderProg, 0, "position"); /* Note that user-defined out attributes get automatically assigned * locations starting from 0, so we don't need to explicitly @@ -1636,9 +1638,6 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) _mesa_ObjectLabel(GL_PROGRAM, clear->IntegerShaderProg, -1, "integer clear"); _mesa_meta_link_program_with_debug(ctx, clear->IntegerShaderProg); - - clear->IntegerColorLocation = - _mesa_GetUniformLocation(clear->IntegerShaderProg, "color"); } } @@ -1770,12 +1769,10 @@ meta_clear(struct gl_context *ctx, GLbitfield buffers, bool glsl) if (fb->_IntegerColor) { assert(glsl); _mesa_UseProgram(clear->IntegerShaderProg); - _mesa_Uniform4iv(clear->IntegerColorLocation, 1, - ctx->Color.ClearColor.i); + _mesa_Uniform4iv(0, 1, ctx->Color.ClearColor.i); } else if (glsl) { _mesa_UseProgram(clear->ShaderProg); - _mesa_Uniform4fv(clear->ColorLocation, 1, - ctx->Color.ClearColor.f); + _mesa_Uniform4fv(0, 1, ctx->Color.ClearColor.f); } /* GL_COLOR_BUFFER_BIT */ diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h index 21495eecd27..5b04755a63e 100644 --- a/src/mesa/drivers/common/meta.h +++ b/src/mesa/drivers/common/meta.h @@ -322,12 +322,7 @@ struct clear_state GLuint VAO; struct gl_buffer_object *buf_obj; GLuint ShaderProg; - GLint ColorLocation; - GLint LayerLocation; - GLuint IntegerShaderProg; - GLint IntegerColorLocation; - GLint IntegerLayerLocation; }; diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c index d38e6b88953..2b942d6fd71 100644 --- a/src/mesa/drivers/common/meta_generate_mipmap.c +++ b/src/mesa/drivers/common/meta_generate_mipmap.c @@ -62,6 +62,15 @@ fallback_required(struct gl_context *ctx, GLenum target, GLuint srcLevel; GLenum status; + /* GL_DRAW_FRAMEBUFFER does not exist in OpenGL ES 1.x, and since + * _mesa_meta_begin hasn't been called yet, we have to work-around API + * difficulties. The whole reason that GL_DRAW_FRAMEBUFFER is used instead + * of GL_FRAMEBUFFER is that the read framebuffer may be different. This + * is moot in OpenGL ES 1.x. + */ + const GLenum fbo_target = ctx->API == API_OPENGLES + ? GL_FRAMEBUFFER : GL_DRAW_FRAMEBUFFER; + /* check for fallbacks */ if (target == GL_TEXTURE_3D) { _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH, @@ -102,13 +111,13 @@ fallback_required(struct gl_context *ctx, GLenum target, */ if (!mipmap->FBO) _mesa_GenFramebuffers(1, &mipmap->FBO); - _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, mipmap->FBO); + _mesa_BindFramebuffer(fbo_target, mipmap->FBO); - _mesa_meta_bind_fbo_image(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, baseImage, 0); + _mesa_meta_bind_fbo_image(fbo_target, GL_COLOR_ATTACHMENT0, baseImage, 0); - status = _mesa_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER); + status = _mesa_CheckFramebufferStatus(fbo_target); - _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fboSave); + _mesa_BindFramebuffer(fbo_target, fboSave); if (status != GL_FRAMEBUFFER_COMPLETE_EXT) { _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH, diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index ac910525082..9ad04ee286b 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -76,6 +76,7 @@ i965_compiler_FILES = \ brw_vec4_reg_allocate.cpp \ brw_vec4_surface_builder.cpp \ brw_vec4_surface_builder.h \ + brw_vec4_tcs.cpp \ brw_vec4_visitor.cpp \ brw_vec4_vs_visitor.cpp \ brw_vue_map.c \ @@ -151,7 +152,9 @@ i965_FILES = \ brw_state.h \ brw_state_upload.c \ brw_structs.h \ + brw_tcs.c \ brw_tcs_surface_state.c \ + brw_tes.c \ brw_tes_surface_state.c \ brw_tex.c \ brw_tex_layout.c \ diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 17a95c6a5bb..7e33fa05632 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -194,6 +194,38 @@ struct brw_vs_prog_key { struct brw_sampler_prog_key_data tex; }; +/** The program key for Tessellation Control Shaders. */ +struct brw_tcs_prog_key +{ + unsigned program_string_id; + + GLenum tes_primitive_mode; + + unsigned input_vertices; + + /** A bitfield of per-patch outputs written. */ + uint32_t patch_outputs_written; + + /** A bitfield of per-vertex outputs written. */ + uint64_t outputs_written; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Tessellation Evaluation Shaders. */ +struct brw_tes_prog_key +{ + unsigned program_string_id; + + /** A bitfield of per-patch inputs read. */ + uint32_t patch_inputs_read; + + /** A bitfield of per-vertex inputs read. */ + uint64_t inputs_read; + + struct brw_sampler_prog_key_data tex; +}; + /** The program key for Geometry Shaders. */ struct brw_gs_prog_key { @@ -445,7 +477,7 @@ struct brw_vue_map { * additional processing is applied before storing them in the VUE), the * value is -1. */ - signed char varying_to_slot[BRW_VARYING_SLOT_COUNT]; + signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; /** * Map from VUE slot to gl_varying_slot value. For slots that do not @@ -454,12 +486,24 @@ struct brw_vue_map { * * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. */ - signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; + signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; /** * Total number of VUE slots in use */ int num_slots; + + /** + * Number of per-patch VUE slots. Only valid for tessellation control + * shader outputs and tessellation evaluation shader inputs. + */ + int num_per_patch_slots; + + /** + * Number of per-vertex VUE slots. Only valid for tessellation control + * shader outputs and tessellation evaluation shader inputs. + */ + int num_per_vertex_slots; }; void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map); @@ -487,6 +531,10 @@ void brw_compute_vue_map(const struct brw_device_info *devinfo, GLbitfield64 slots_valid, bool separate_shader); +void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map, + const GLbitfield64 slots_valid, + const GLbitfield is_patch); + enum shader_dispatch_mode { DISPATCH_MODE_4X1_SINGLE = 0, DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, @@ -656,6 +704,38 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, char **error_str); /** + * Compile a tessellation control shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const struct nir_shader *nir, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a tessellation evaluation shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_tes_prog_key *key, + struct brw_tes_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_shader_program *shader_prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** * Compile a vertex shader. * * Returns the final assembly and the program's size. diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 0abe60124f4..005c3236c88 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -159,8 +159,10 @@ intel_viewport(struct gl_context *ctx) __DRIcontext *driContext = brw->driContext; if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) { - dri2InvalidateDrawable(driContext->driDrawablePriv); - dri2InvalidateDrawable(driContext->driReadablePriv); + if (driContext->driDrawablePriv) + dri2InvalidateDrawable(driContext->driDrawablePriv); + if (driContext->driReadablePriv) + dri2InvalidateDrawable(driContext->driReadablePriv); } } @@ -377,7 +379,10 @@ brw_initialize_context_constants(struct brw_context *brw) [MESA_SHADER_GEOMETRY] = brw->gen >= 6, [MESA_SHADER_FRAGMENT] = true, [MESA_SHADER_COMPUTE] = - (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) || + (ctx->API == API_OPENGL_CORE && + ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) || + (ctx->API == API_OPENGLES2 && + ctx->Const.MaxComputeWorkGroupSize[0] >= 128) || _mesa_extension_override_enables.ARB_compute_shader, }; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 1cc4c7b1282..0239b6214ae 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -179,8 +179,7 @@ enum brw_state_id { BRW_STATE_URB_FENCE = BRW_MAX_CACHE, BRW_STATE_FRAGMENT_PROGRAM, BRW_STATE_GEOMETRY_PROGRAM, - BRW_STATE_TESS_CTRL_PROGRAM, - BRW_STATE_TESS_EVAL_PROGRAM, + BRW_STATE_TESS_PROGRAMS, BRW_STATE_VERTEX_PROGRAM, BRW_STATE_CURBE_OFFSETS, BRW_STATE_REDUCED_PRIMITIVE, @@ -192,6 +191,7 @@ enum brw_state_id { BRW_STATE_BINDING_TABLE_POINTERS, BRW_STATE_INDICES, BRW_STATE_VERTICES, + BRW_STATE_DEFAULT_TESS_LEVELS, BRW_STATE_BATCH, BRW_STATE_INDEX_BUFFER, BRW_STATE_VS_CONSTBUF, @@ -262,8 +262,7 @@ enum brw_state_id { #define BRW_NEW_URB_FENCE (1ull << BRW_STATE_URB_FENCE) #define BRW_NEW_FRAGMENT_PROGRAM (1ull << BRW_STATE_FRAGMENT_PROGRAM) #define BRW_NEW_GEOMETRY_PROGRAM (1ull << BRW_STATE_GEOMETRY_PROGRAM) -#define BRW_NEW_TESS_EVAL_PROGRAM (1ull << BRW_STATE_TESS_EVAL_PROGRAM) -#define BRW_NEW_TESS_CTRL_PROGRAM (1ull << BRW_STATE_TESS_CTRL_PROGRAM) +#define BRW_NEW_TESS_PROGRAMS (1ull << BRW_STATE_TESS_PROGRAMS) #define BRW_NEW_VERTEX_PROGRAM (1ull << BRW_STATE_VERTEX_PROGRAM) #define BRW_NEW_CURBE_OFFSETS (1ull << BRW_STATE_CURBE_OFFSETS) #define BRW_NEW_REDUCED_PRIMITIVE (1ull << BRW_STATE_REDUCED_PRIMITIVE) @@ -275,6 +274,7 @@ enum brw_state_id { #define BRW_NEW_BINDING_TABLE_POINTERS (1ull << BRW_STATE_BINDING_TABLE_POINTERS) #define BRW_NEW_INDICES (1ull << BRW_STATE_INDICES) #define BRW_NEW_VERTICES (1ull << BRW_STATE_VERTICES) +#define BRW_NEW_DEFAULT_TESS_LEVELS (1ull << BRW_STATE_DEFAULT_TESS_LEVELS) /** * Used for any batch entry with a relocated pointer that will be used * by any 3D rendering. @@ -1008,6 +1008,8 @@ struct brw_context struct { GLuint vsize; /* vertex size plus header in urb registers */ GLuint gsize; /* GS output size in urb registers */ + GLuint hsize; /* Tessellation control output size in urb registers */ + GLuint dsize; /* Tessellation evaluation output size in urb registers */ GLuint csize; /* constant buffer size in urb registers */ GLuint sfsize; /* setup data size in urb registers */ @@ -1020,12 +1022,16 @@ struct brw_context GLuint max_gs_entries; /* Maximum number of GS entries */ GLuint nr_vs_entries; + GLuint nr_hs_entries; + GLuint nr_ds_entries; GLuint nr_gs_entries; GLuint nr_clip_entries; GLuint nr_sf_entries; GLuint nr_cs_entries; GLuint vs_start; + GLuint hs_start; + GLuint ds_start; GLuint gs_start; GLuint clip_start; GLuint sf_start; @@ -1042,6 +1048,11 @@ struct brw_context * URB space for the GS. */ bool gs_present; + + /* True if the most recently sent _3DSTATE_URB message allocated + * URB space for the HS and DS. + */ + bool tess_present; } urb; @@ -1648,12 +1659,18 @@ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw); /* gen7_urb.c */ void gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, + unsigned hs_size, unsigned ds_size, unsigned gs_size, unsigned fs_size); void gen7_emit_urb_state(struct brw_context *brw, - unsigned nr_vs_entries, unsigned vs_size, - unsigned vs_start, unsigned nr_gs_entries, + unsigned nr_vs_entries, + unsigned vs_size, unsigned vs_start, + unsigned nr_hs_entries, + unsigned hs_size, unsigned hs_start, + unsigned nr_ds_entries, + unsigned ds_size, unsigned ds_start, + unsigned nr_gs_entries, unsigned gs_size, unsigned gs_start); @@ -1687,6 +1704,18 @@ brw_vertex_program_const(const struct gl_vertex_program *p) return (const struct brw_vertex_program *) p; } +static inline struct brw_tess_ctrl_program * +brw_tess_ctrl_program(struct gl_tess_ctrl_program *p) +{ + return (struct brw_tess_ctrl_program *) p; +} + +static inline struct brw_tess_eval_program * +brw_tess_eval_program(struct gl_tess_eval_program *p) +{ + return (struct brw_tess_eval_program *) p; +} + static inline struct brw_geometry_program * brw_geometry_program(struct gl_geometry_program *p) { diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 7d8723efd5e..5c18d671a0c 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1307,6 +1307,14 @@ enum opcode { * UD immediate). */ SHADER_OPCODE_MOV_INDIRECT, + + VEC4_OPCODE_URB_READ, + TCS_OPCODE_GET_INSTANCE_ID, + TCS_OPCODE_URB_WRITE, + TCS_OPCODE_SET_INPUT_URB_OFFSETS, + TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, + TCS_OPCODE_GET_PRIMITIVE_ID, + TCS_OPCODE_CREATE_BARRIER_HEADER, }; enum brw_urb_write_flags { diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index cbc2f2fbf4b..f9a72903ef1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1687,6 +1687,21 @@ fs_visitor::assign_vs_urb_setup() } void +fs_visitor::assign_tes_urb_setup() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; + + first_non_payload_grf += 8 * vue_prog_data->urb_read_length; + + /* Rewrite all ATTR file references to HW_REGs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + +void fs_visitor::assign_gs_urb_setup() { assert(stage == MESA_SHADER_GEOMETRY); @@ -5257,6 +5272,40 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) } bool +fs_visitor::run_tes() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */ + payload.num_regs = 5; + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + if (failed) + return false; + + emit_urb_writes(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tes_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(); + + return !failed; +} + +bool fs_visitor::run_gs() { assert(stage == MESA_SHADER_GEOMETRY); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index dff86a97a14..bdbfd0c4546 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -81,7 +81,8 @@ public: struct gl_program *prog, const nir_shader *shader, unsigned dispatch_width, - int shader_time_index); + int shader_time_index, + const struct brw_vue_map *input_vue_map = NULL); fs_visitor(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, struct brw_gs_compile *gs_compile, @@ -109,6 +110,7 @@ public: bool run_fs(bool do_rep_send); bool run_vs(gl_clip_plane *clip_planes); + bool run_tes(); bool run_gs(); bool run_cs(); void optimize(); @@ -124,6 +126,7 @@ public: void assign_urb_setup(); void convert_attr_sources_to_hw_regs(fs_inst *inst); void assign_vs_urb_setup(); + void assign_tes_urb_setup(); void assign_gs_urb_setup(); bool assign_regs(bool allow_spilling); void assign_regs_trivial(); @@ -251,6 +254,8 @@ public: nir_intrinsic_instr *instr); void nir_emit_intrinsic(const brw::fs_builder &bld, nir_intrinsic_instr *instr); + void nir_emit_tes_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); void nir_emit_ssbo_atomic(const brw::fs_builder &bld, int op, nir_intrinsic_instr *instr); void nir_emit_shared_atomic(const brw::fs_builder &bld, @@ -262,6 +267,7 @@ public: fs_reg get_nir_src(nir_src src); fs_reg get_nir_dest(nir_dest dest); fs_reg get_nir_image_deref(const nir_deref_var *deref); + fs_reg get_indirect_offset(nir_intrinsic_instr *instr); void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, unsigned wr_mask); @@ -315,6 +321,8 @@ public: struct brw_stage_prog_data *prog_data; struct gl_program *prog; + const struct brw_vue_map *input_vue_map; + int *param_size; int *virtual_grf_start; diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index b3fb0c6fd6e..78a82406ad7 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -288,23 +288,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) } break; - case ir_unop_any: { - ir_expression *temp; - temp = new(mem_ctx) ir_expression(ir_binop_logic_or, - element_type, - get_element(op_var[0], 0), - get_element(op_var[0], 1)); - - for (i = 2; i < vector_elements; i++) { - temp = new(mem_ctx) ir_expression(ir_binop_logic_or, - element_type, - get_element(op_var[0], i), - temp); - } - assign(ir, 0, temp); - break; - } - case ir_binop_dot: { ir_expression *last = NULL; for (i = 0; i < vector_elements; i++) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 4e0ff50ddcb..ba14ba54303 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -123,6 +123,7 @@ fs_visitor::nir_setup_outputs() switch (stage) { case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: case MESA_SHADER_GEOMETRY: { unsigned location = var->data.location; nir_setup_single_output_varying(®, var->type, &location); @@ -443,6 +444,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr) case MESA_SHADER_VERTEX: nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; + case MESA_SHADER_TESS_EVAL: + nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; case MESA_SHADER_GEOMETRY: nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; @@ -841,12 +845,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_fdot2: case nir_op_fdot3: case nir_op_fdot4: - case nir_op_bany2: - case nir_op_bany3: - case nir_op_bany4: - case nir_op_ball2: - case nir_op_ball3: - case nir_op_ball4: case nir_op_ball_fequal2: case nir_op_ball_iequal2: case nir_op_ball_fequal3: @@ -1715,6 +1713,24 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, } } +fs_reg +fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + nir_const_value *const_value = nir_src_as_const_value(*offset_src); + + if (const_value) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into instr->const_index[0]. + */ + assert(const_value->u[0] == 0); + return fs_reg(); + } + + return get_nir_src(*offset_src); +} + void fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -1747,6 +1763,106 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, } void +fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_TESS_EVAL); + struct brw_tes_prog_data *tes_prog_data = (struct brw_tes_prog_data *) prog_data; + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); + break; + case nir_intrinsic_load_tess_coord: + /* gl_TessCoord is part of the payload in g1-3 */ + for (unsigned i = 0; i < 3; i++) { + bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); + } + break; + + case nir_intrinsic_load_tess_level_outer: + /* When the TES reads gl_TessLevelOuter, we ensure that the patch header + * appears as a push-model input. So, we can simply use the ATTR file + * rather than issuing URB read messages. The data is stored in the + * high DWords in reverse order - DWord 7 contains .x, DWord 6 contains + * .y, and so on. + */ + switch (tes_prog_data->domain) { + case BRW_TESS_DOMAIN_QUAD: + for (unsigned i = 0; i < 4; i++) + bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i)); + break; + case BRW_TESS_DOMAIN_TRI: + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i)); + break; + case BRW_TESS_DOMAIN_ISOLINE: + for (unsigned i = 0; i < 2; i++) + bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i)); + break; + } + break; + + case nir_intrinsic_load_tess_level_inner: + /* When the TES reads gl_TessLevelInner, we ensure that the patch header + * appears as a push-model input. So, we can simply use the ATTR file + * rather than issuing URB read messages. + */ + switch (tes_prog_data->domain) { + case BRW_TESS_DOMAIN_QUAD: + bld.MOV(dest, component(fs_reg(ATTR, 0), 3)); + bld.MOV(offset(dest, bld, 1), component(fs_reg(ATTR, 0), 2)); + break; + case BRW_TESS_DOMAIN_TRI: + bld.MOV(dest, component(fs_reg(ATTR, 0), 4)); + break; + case BRW_TESS_DOMAIN_ISOLINE: + /* ignore - value is undefined */ + break; + } + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: { + fs_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + fs_inst *inst; + if (indirect_offset.file == BAD_FILE) { + /* Replicate the patch handle to all enabled channels */ + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.MOV(patch_handle, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle); + inst->mlen = 1; + } else { + /* Indirect indexing - use per-slot offsets as well. */ + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + indirect_offset + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload); + inst->mlen = 2; + } + inst->offset = imm_offset; + inst->base_mrf = -1; + inst->regs_written = instr->num_components; + break; + } + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 790f1009ca7..8f1fcbb233c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -702,7 +702,10 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) fs_reg sources[8]; fs_reg urb_handle; - urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + if (stage == MESA_SHADER_TESS_EVAL) + urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD)); + else + urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); /* If we don't have any valid slots to write, just do a minimal urb write * send to terminate the shader. This includes 1 slot of undefined data, @@ -936,9 +939,11 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, struct gl_program *prog, const nir_shader *shader, unsigned dispatch_width, - int shader_time_index) + int shader_time_index, + const struct brw_vue_map *input_vue_map) : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), key(key), gs_compile(NULL), prog_data(prog_data), prog(prog), + input_vue_map(input_vue_map), dispatch_width(dispatch_width), shader_time_index(shader_time_index), bld(fs_builder(this, dispatch_width).at_end()) @@ -974,6 +979,9 @@ fs_visitor::init() case MESA_SHADER_VERTEX: key_tex = &((const brw_vs_prog_key *) key)->tex; break; + case MESA_SHADER_TESS_EVAL: + key_tex = &((const brw_tes_prog_key *) key)->tex; + break; case MESA_SHADER_GEOMETRY: key_tex = &((const brw_gs_prog_key *) key)->tex; break; diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 31d29ec9045..7cdc830f6b8 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -42,6 +42,8 @@ brw_shader_precompile(struct gl_context *ctx, struct gl_shader_program *sh_prog) { struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX]; + struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]; + struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; struct gl_shader *cs = sh_prog->_LinkedShaders[MESA_SHADER_COMPUTE]; @@ -52,6 +54,12 @@ brw_shader_precompile(struct gl_context *ctx, if (gs && !brw_gs_precompile(ctx, sh_prog, gs->Program)) return false; + if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program)) + return false; + + if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program)) + return false; + if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program)) return false; diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index fdfc4f661d1..eebd2a386b6 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -27,15 +27,43 @@ #include "glsl/nir/nir_builder.h" #include "program/prog_to_nir.h" -struct remap_vs_attrs_state { +static bool +is_input(nir_intrinsic_instr *intrin) +{ + return intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input; +} + +static bool +is_output(nir_intrinsic_instr *intrin) +{ + return intrin->intrinsic == nir_intrinsic_load_output || + intrin->intrinsic == nir_intrinsic_load_per_vertex_output || + intrin->intrinsic == nir_intrinsic_store_output || + intrin->intrinsic == nir_intrinsic_store_per_vertex_output; +} + +/** + * In many cases, we just add the base and offset together, so there's no + * reason to keep them separate. Sometimes, combining them is essential: + * if a shader only accesses part of a compound variable (such as a matrix + * or array), the variable's base may not actually exist in the VUE map. + * + * This pass adds constant offsets to instr->const_index[0], and resets + * the offset source to 0. Non-constant offsets remain unchanged - since + * we don't know what part of a compound variable is accessed, we allocate + * storage for the entire thing. + */ +struct add_const_offset_to_base_params { nir_builder b; - uint64_t inputs_read; + nir_variable_mode mode; }; static bool -remap_vs_attrs(nir_block *block, void *void_state) +add_const_offset_to_base(nir_block *block, void *closure) { - struct remap_vs_attrs_state *state = void_state; + struct add_const_offset_to_base_params *params = closure; + nir_builder *b = ¶ms->b; nir_foreach_instr_safe(block, instr) { if (instr->type != nir_instr_type_intrinsic) @@ -43,30 +71,120 @@ remap_vs_attrs(nir_block *block, void *void_state) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if ((params->mode == nir_var_shader_in && is_input(intrin)) || + (params->mode == nir_var_shader_out && is_output(intrin))) { + nir_src *offset = nir_get_io_offset_src(intrin); + nir_const_value *const_offset = nir_src_as_const_value(*offset); + + if (const_offset) { + intrin->const_index[0] += const_offset->u[0]; + b->cursor = nir_before_instr(&intrin->instr); + nir_instr_rewrite_src(&intrin->instr, offset, + nir_src_for_ssa(nir_imm_int(b, 0))); + } + } + } + return true; + +} + +static bool +remap_vs_attrs(nir_block *block, void *closure) +{ + GLbitfield64 inputs_read = *((GLbitfield64 *) closure); + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_load_input) { /* Attributes come in a contiguous block, ordered by their * gl_vert_attrib value. That means we can compute the slot * number for an attribute by masking out the enabled attributes * before it and counting the bits. */ - nir_const_value *const_offset = nir_src_as_const_value(intrin->src[0]); + int attr = intrin->const_index[0]; + int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr)); + + intrin->const_index[0] = 4 * slot; + } + } + return true; +} - /* We set EmitNoIndirect for VS inputs, so there are no indirects. */ - assert(const_offset); +static bool +remap_inputs_with_vue_map(nir_block *block, void *closure) +{ + const struct brw_vue_map *vue_map = closure; - int attr = intrin->const_index[0] + const_offset->u[0]; - int slot = _mesa_bitcount_64(state->inputs_read & - BITFIELD64_MASK(attr)); + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; - /* The NIR -> FS pass will just add the base and offset together, so - * there's no reason to keep them separate. Just put it all in - * const_index[0] and set the offset src[0] to load_const(0). - */ - intrin->const_index[0] = 4 * slot; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input) { + int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; + assert(vue_slot != -1); + intrin->const_index[0] = vue_slot; + } + } + return true; +} - state->b.cursor = nir_before_instr(&intrin->instr); - nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], - nir_src_for_ssa(nir_imm_int(&state->b, 0))); +struct remap_patch_urb_offsets_state { + nir_builder b; + struct brw_vue_map vue_map; +}; + +static bool +remap_patch_urb_offsets(nir_block *block, void *closure) +{ + struct remap_patch_urb_offsets_state *state = closure; + + nir_foreach_instr_safe(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + gl_shader_stage stage = state->b.shader->stage; + + if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) || + (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) { + int vue_slot = state->vue_map.varying_to_slot[intrin->const_index[0]]; + assert(vue_slot != -1); + intrin->const_index[0] = vue_slot; + + nir_src *vertex = nir_get_io_vertex_index_src(intrin); + if (vertex) { + nir_const_value *const_vertex = nir_src_as_const_value(*vertex); + if (const_vertex) { + intrin->const_index[0] += const_vertex->u[0] * + state->vue_map.num_per_vertex_slots; + } else { + state->b.cursor = nir_before_instr(&intrin->instr); + + /* Multiply by the number of per-vertex slots. */ + nir_ssa_def *vertex_offset = + nir_imul(&state->b, + nir_ssa_for_src(&state->b, *vertex, 1), + nir_imm_int(&state->b, + state->vue_map.num_per_vertex_slots)); + + /* Add it to the existing offset */ + nir_src *offset = nir_get_io_offset_src(intrin); + nir_ssa_def *total_offset = + nir_iadd(&state->b, vertex_offset, + nir_ssa_for_src(&state->b, *offset, 1)); + + nir_instr_rewrite_src(&intrin->instr, offset, + nir_src_for_ssa(total_offset)); + } + } } } return true; @@ -77,6 +195,10 @@ brw_nir_lower_inputs(nir_shader *nir, const struct brw_device_info *devinfo, bool is_scalar) { + struct add_const_offset_to_base_params params = { + .mode = nir_var_shader_in + }; + switch (nir->stage) { case MESA_SHADER_VERTEX: /* Start with the location of the variable's base. */ @@ -97,23 +219,23 @@ brw_nir_lower_inputs(nir_shader *nir, * key->inputs_read since the two are identical aside from Gen4-5 * edge flag differences. */ - struct remap_vs_attrs_state remap_state = { - .inputs_read = nir->info.inputs_read, - }; + GLbitfield64 inputs_read = nir->info.inputs_read; /* This pass needs actual constants */ nir_opt_constant_folding(nir); nir_foreach_overload(nir, overload) { if (overload->impl) { - nir_builder_init(&remap_state.b, overload->impl); - nir_foreach_block(overload->impl, remap_vs_attrs, &remap_state); + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_foreach_block(overload->impl, remap_vs_attrs, &inputs_read); } } } break; + case MESA_SHADER_TESS_CTRL: case MESA_SHADER_GEOMETRY: { - if (!is_scalar) { + if (!is_scalar && nir->stage == MESA_SHADER_GEOMETRY) { foreach_list_typed(nir_variable, var, node, &nir->inputs) { var->data.driver_location = var->data.location; } @@ -135,17 +257,52 @@ brw_nir_lower_inputs(nir_shader *nir, GLbitfield64 inputs_read = nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID; brw_compute_vue_map(devinfo, &input_vue_map, inputs_read, - nir->info.separate_shader); + nir->info.separate_shader || + nir->stage == MESA_SHADER_TESS_CTRL); - /* Start with the slot for the variable's base. */ foreach_list_typed(nir_variable, var, node, &nir->inputs) { - assert(input_vue_map.varying_to_slot[var->data.location] != -1); - var->data.driver_location = - input_vue_map.varying_to_slot[var->data.location]; + var->data.driver_location = var->data.location; } /* Inputs are stored in vec4 slots, so use type_size_vec4(). */ nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_foreach_block(overload->impl, remap_inputs_with_vue_map, + &input_vue_map); + } + } + } + break; + } + case MESA_SHADER_TESS_EVAL: { + struct remap_patch_urb_offsets_state state; + brw_compute_tess_vue_map(&state.vue_map, + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, + nir->info.patch_inputs_read); + + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_builder_init(&state.b, overload->impl); + nir_foreach_block(overload->impl, remap_patch_urb_offsets, &state); + } } break; } @@ -164,10 +321,13 @@ brw_nir_lower_inputs(nir_shader *nir, } static void -brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) +brw_nir_lower_outputs(nir_shader *nir, + const struct brw_device_info *devinfo, + bool is_scalar) { switch (nir->stage) { case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: case MESA_SHADER_GEOMETRY: if (is_scalar) { nir_assign_var_locations(&nir->outputs, &nir->num_outputs, @@ -178,6 +338,34 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) var->data.driver_location = var->data.location; } break; + case MESA_SHADER_TESS_CTRL: { + struct add_const_offset_to_base_params params = { + .mode = nir_var_shader_out + }; + + struct remap_patch_urb_offsets_state state; + brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written, + nir->info.patch_outputs_written); + + nir_foreach_variable(var, &nir->outputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_out, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_builder_init(&state.b, overload->impl); + nir_foreach_block(overload->impl, remap_patch_urb_offsets, &state); + } + } + break; + } case MESA_SHADER_FRAGMENT: nir_assign_var_locations(&nir->outputs, &nir->num_outputs, type_size_scalar); @@ -328,37 +516,19 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar) return nir; } -/* Lowers inputs, outputs, uniforms, and samplers for i965 - * - * This function does all of the standard lowering prior to post-processing. - * The lowering done is highly gen, stage, and backend-specific. The - * shader_prog parameter is optional and is used only for lowering sampler - * derefs and atomics for GLSL shaders. - */ +/** Lower input and output loads and stores for i965. */ nir_shader * -brw_lower_nir(nir_shader *nir, - const struct brw_device_info *devinfo, - const struct gl_shader_program *shader_prog, - bool is_scalar) +brw_nir_lower_io(nir_shader *nir, + const struct brw_device_info *devinfo, + bool is_scalar) { bool progress; /* Written by OPT and OPT_V */ (void)progress; OPT_V(brw_nir_lower_inputs, devinfo, is_scalar); - OPT_V(brw_nir_lower_outputs, is_scalar); - //OPT_V(brw_nir_lower_uniforms, is_scalar); + OPT_V(brw_nir_lower_outputs, devinfo, is_scalar); OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4); - if (shader_prog) { - OPT_V(nir_lower_samplers, shader_prog); - } - - OPT(nir_lower_system_values); - - if (shader_prog) { - OPT_V(nir_lower_atomics, shader_prog); - } - return nir_optimize(nir, is_scalar); } @@ -457,7 +627,19 @@ brw_create_nir(struct brw_context *brw, (void)progress; nir = brw_preprocess_nir(nir, is_scalar); - nir = brw_lower_nir(nir, devinfo, shader_prog, is_scalar); + + OPT(nir_lower_system_values); + OPT_V(brw_nir_lower_uniforms, is_scalar); + + if (shader_prog) { + OPT_V(nir_lower_samplers, shader_prog); + OPT_V(nir_lower_atomics, shader_prog); + } + + if (nir->stage != MESA_SHADER_TESS_CTRL && + nir->stage != MESA_SHADER_TESS_EVAL) { + nir = brw_nir_lower_io(nir, devinfo, is_scalar); + } return nir; } diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h index 0a8a5a280b1..78b139b991d 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.h +++ b/src/mesa/drivers/dri/i965/brw_nir.h @@ -82,10 +82,9 @@ nir_shader *brw_create_nir(struct brw_context *brw, bool is_scalar); nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar); -nir_shader *brw_lower_nir(nir_shader *nir, - const struct brw_device_info *devinfo, - const struct gl_shader_program *shader_prog, - bool is_scalar); +nir_shader *brw_nir_lower_io(nir_shader *nir, + const struct brw_device_info *devinfo, + bool is_scalar); nir_shader *brw_postprocess_nir(nir_shader *nir, const struct brw_device_info *devinfo, bool is_scalar); diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c index c995d2b7e2d..f4d23d81260 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c +++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c @@ -109,9 +109,6 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state) uint8_t resolve_status; nir_alu_instr *alu = nir_instr_as_alu(instr); switch (alu->op) { - case nir_op_bany2: - case nir_op_bany3: - case nir_op_bany4: case nir_op_ball_fequal2: case nir_op_ball_iequal2: case nir_op_ball_fequal3: diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c index ae3d8188325..6c636d26139 100644 --- a/src/mesa/drivers/dri/i965/brw_pipe_control.c +++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c @@ -97,7 +97,8 @@ void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags) { if (brw->gen >= 8) { - gen8_add_cs_stall_workaround_bits(&flags); + if (brw->gen == 8) + gen8_add_cs_stall_workaround_bits(&flags); BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); @@ -141,7 +142,8 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, uint32_t imm_lower, uint32_t imm_upper) { if (brw->gen >= 8) { - gen8_add_cs_stall_workaround_bits(&flags); + if (brw->gen == 8) + gen8_add_cs_stall_workaround_bits(&flags); BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h index 339b8e19ec5..059ccf8bd39 100644 --- a/src/mesa/drivers/dri/i965/brw_program.h +++ b/src/mesa/drivers/dri/i965/brw_program.h @@ -56,6 +56,11 @@ void brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog, struct gl_shader *shader, struct gl_program *prog); +void brw_upload_tcs_prog(struct brw_context *brw, + uint64_t per_vertex_slots, uint32_t per_patch_slots); +void brw_upload_tes_prog(struct brw_context *brw, + uint64_t per_vertex_slots, uint32_t per_patch_slots); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index fa912c96c36..9f2ff9ae5ad 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -84,6 +84,7 @@ struct brw_device_info; #define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) #define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) #define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) +#define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0) static inline bool brw_is_single_value_swizzle(unsigned swiz) diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index 3f29e2fc105..d181468f5cb 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -654,7 +654,7 @@ const struct brw_tracked_state brw_gs_samplers = { static void brw_upload_tcs_samplers(struct brw_context *brw) { - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program; if (!tcs) return; @@ -667,7 +667,7 @@ const struct brw_tracked_state brw_tcs_samplers = { .dirty = { .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tcs_samplers, }; @@ -676,7 +676,7 @@ const struct brw_tracked_state brw_tcs_samplers = { static void brw_upload_tes_samplers(struct brw_context *brw) { - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_program *tes = (struct gl_program *) brw->tess_eval_program; if (!tes) return; @@ -689,7 +689,7 @@ const struct brw_tracked_state brw_tes_samplers = { .dirty = { .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | - BRW_NEW_TESS_EVAL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tes_samplers, }; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index d051e124584..836cf0a69e9 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -24,6 +24,7 @@ #include "brw_context.h" #include "brw_cfg.h" #include "brw_eu.h" +#include "brw_fs.h" #include "brw_nir.h" #include "glsl/glsl_parser_extras.h" #include "main/shaderobj.h" @@ -84,6 +85,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) compiler->scalar_stage[MESA_SHADER_VERTEX] = devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; + compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = true; compiler->scalar_stage[MESA_SHADER_GEOMETRY] = devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; @@ -137,6 +140,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; } + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; + if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; @@ -548,6 +554,21 @@ brw_instruction_name(enum opcode op) return "mulh"; case SHADER_OPCODE_MOV_INDIRECT: return "mov_indirect"; + + case VEC4_OPCODE_URB_READ: + return "urb_read"; + case TCS_OPCODE_GET_INSTANCE_ID: + return "tcs_get_instance_id"; + case TCS_OPCODE_URB_WRITE: + return "tcs_urb_write"; + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + return "tcs_set_input_urb_offsets"; + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + return "tcs_set_output_urb_offsets"; + case TCS_OPCODE_GET_PRIMITIVE_ID: + return "tcs_get_primitive_id"; + case TCS_OPCODE_CREATE_BARRIER_HEADER: + return "tcs_create_barrier_header"; } unreachable("not reached"); @@ -1292,3 +1313,96 @@ gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx) } } +extern "C" const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tes_prog_key *key, + struct brw_tes_prog_data *prog_data, + const nir_shader *src_shader, + struct gl_shader_program *shader_prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const struct brw_device_info *devinfo = compiler->devinfo; + struct gl_shader *shader = + shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL]; + + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); + nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar); + nir->info.inputs_read = key->inputs_read; + nir->info.patch_inputs_read = key->patch_inputs_read; + nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar); + nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar); + + brw_compute_vue_map(devinfo, &prog_data->base.vue_map, + nir->info.outputs_written, + nir->info.separate_shader); + + unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size"); + return NULL; + } + + /* URB entry sizes are stored as a multiple of 64 bytes. */ + prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + + struct brw_vue_map input_vue_map; + brw_compute_tess_vue_map(&input_vue_map, + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, + nir->info.patch_inputs_read); + + bool need_patch_header = nir->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_OUTER) | + BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_INNER)); + + /* The TES will pull most inputs using URB read messages. + * + * However, we push the patch header for TessLevel factors when required, + * as it's a tiny amount of extra data. + */ + prog_data->base.urb_read_length = need_patch_header ? 1 : 0; + + if (unlikely(INTEL_DEBUG & DEBUG_TES)) { + fprintf(stderr, "TES Input "); + brw_print_vue_map(stderr, &input_vue_map); + fprintf(stderr, "TES Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map); + } + + if (is_scalar) { + fs_visitor v(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, shader->Program, nir, 8, + shader_time_index, &input_vue_map); + if (!v.run_tes()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, false, + "TES"); + if (unlikely(INTEL_DEBUG & DEBUG_TES)) { + g.enable_debug(ralloc_asprintf(mem_ctx, + "%s tessellation evaluation shader %s", + nir->info.label ? nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(v.cfg, 8); + + return g.get_assembly(final_assembly_size); + } else { + unreachable("XXX: vec4 tessellation evalation shaders not merged yet."); + } +} diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 0b77dc24539..82374a46c18 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -270,6 +270,12 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage, bool brw_vs_precompile(struct gl_context *ctx, struct gl_shader_program *shader_prog, struct gl_program *prog); +bool brw_tcs_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog); +bool brw_tes_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog); bool brw_gs_precompile(struct gl_context *ctx, struct gl_shader_program *shader_prog, struct gl_program *prog); diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c index 3d3a6cf943a..46667884125 100644 --- a/src/mesa/drivers/dri/i965/brw_state_dump.c +++ b/src/mesa/drivers/dri/i965/brw_state_dump.c @@ -319,10 +319,13 @@ dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index) GET_FIELD(surf[4], GEN7_SURFACE_MIN_ARRAY_ELEMENT), GET_FIELD(surf[4], GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT) + 1, 1 << GET_BITS(surf[4], 5, 3)); - batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d\n", + batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d," + " tr_mode (gen9+): %d, mip tail (gen9+): %d\n", GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET), GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET), - GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD)); + GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD), + GET_FIELD(surf[5], GEN9_SURFACE_TRMODE), + GET_FIELD(surf[5], GEN9_SURFACE_MIP_TAIL_START_LOD)); batch_out(brw, name, offset, 6, "AUX pitch: %d qpitch: %d\n", GET_FIELD(surf[6], GEN8_SURFACE_AUX_QPITCH) << 2, GET_FIELD(surf[6], GEN8_SURFACE_AUX_PITCH) << 2); diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index cf3cf97daea..81a67d284e0 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -517,6 +517,7 @@ void brw_init_state( struct brw_context *brw ) ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER; ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER; ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS; + ctx->DriverFlags.NewDefaultTessLevels = BRW_NEW_DEFAULT_TESS_LEVELS; } @@ -607,8 +608,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_URB_FENCE), DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM), DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM), - DEFINE_BIT(BRW_NEW_TESS_EVAL_PROGRAM), - DEFINE_BIT(BRW_NEW_TESS_CTRL_PROGRAM), + DEFINE_BIT(BRW_NEW_TESS_PROGRAMS), DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM), DEFINE_BIT(BRW_NEW_CURBE_OFFSETS), DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE), @@ -620,6 +620,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_BINDING_TABLE_POINTERS), DEFINE_BIT(BRW_NEW_INDICES), DEFINE_BIT(BRW_NEW_VERTICES), + DEFINE_BIT(BRW_NEW_DEFAULT_TESS_LEVELS), DEFINE_BIT(BRW_NEW_BATCH), DEFINE_BIT(BRW_NEW_INDEX_BUFFER), DEFINE_BIT(BRW_NEW_VS_CONSTBUF), @@ -673,11 +674,40 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map) } static inline void +brw_upload_tess_programs(struct brw_context *brw) +{ + if (brw->tess_eval_program) { + uint64_t per_vertex_slots = brw->tess_eval_program->Base.InputsRead; + uint32_t per_patch_slots = + brw->tess_eval_program->Base.PatchInputsRead; + + /* The TCS may have additional outputs which aren't read by the + * TES (possibly for cross-thread communication). These need to + * be stored in the Patch URB Entry as well. + */ + if (brw->tess_ctrl_program) { + per_vertex_slots |= brw->tess_ctrl_program->Base.OutputsWritten; + per_patch_slots |= + brw->tess_ctrl_program->Base.PatchOutputsWritten; + } + + brw_upload_tcs_prog(brw, per_vertex_slots, per_patch_slots); + brw_upload_tes_prog(brw, per_vertex_slots, per_patch_slots); + } else { + brw->tcs.prog_data = NULL; + brw->tcs.base.prog_data = NULL; + brw->tes.prog_data = NULL; + brw->tes.base.prog_data = NULL; + } +} + +static inline void brw_upload_programs(struct brw_context *brw, enum brw_pipeline pipeline) { if (pipeline == BRW_RENDER_PIPELINE) { brw_upload_vs_prog(brw); + brw_upload_tess_programs(brw); if (brw->gen < 6) brw_upload_ff_gs_prog(brw); @@ -691,6 +721,8 @@ brw_upload_programs(struct brw_context *brw, bool old_separate = brw->vue_map_geom_out.separate; if (brw->geometry_program) brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map; + else if (brw->tess_eval_program) + brw->vue_map_geom_out = brw->tes.prog_data->base.vue_map; else brw->vue_map_geom_out = brw->vs.prog_data->base.vue_map; @@ -750,12 +782,12 @@ brw_upload_pipeline_state(struct brw_context *brw, if (brw->tess_eval_program != ctx->TessEvalProgram._Current) { brw->tess_eval_program = ctx->TessEvalProgram._Current; - brw->ctx.NewDriverState |= BRW_NEW_TESS_EVAL_PROGRAM; + brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS; } if (brw->tess_ctrl_program != ctx->TessCtrlProgram._Current) { brw->tess_ctrl_program = ctx->TessCtrlProgram._Current; - brw->ctx.NewDriverState |= BRW_NEW_TESS_CTRL_PROGRAM; + brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS; } if (brw->geometry_program != ctx->GeometryProgram._Current) { diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c index b7d9078d87d..9730078aeab 100644 --- a/src/mesa/drivers/dri/i965/brw_surface_formats.c +++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c @@ -395,6 +395,7 @@ brw_format_for_mesa_format(mesa_format mesa_format) [MESA_FORMAT_A8R8G8B8_SRGB] = 0, [MESA_FORMAT_R8G8B8A8_SRGB] = BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB, [MESA_FORMAT_X8R8G8B8_SRGB] = 0, + [MESA_FORMAT_B8G8R8X8_SRGB] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB, [MESA_FORMAT_L_SRGB8] = BRW_SURFACEFORMAT_L8_UNORM_SRGB, [MESA_FORMAT_L8A8_SRGB] = BRW_SURFACEFORMAT_L8A8_UNORM_SRGB, [MESA_FORMAT_A8L8_SRGB] = 0, @@ -660,6 +661,10 @@ brw_init_surface_formats(struct brw_context *brw) if (gen < tinfo->render_target) render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; break; + case BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB: + if (gen < tinfo->render_target) + render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB; + break; case BRW_SURFACEFORMAT_R8G8B8X8_UNORM: render = BRW_SURFACEFORMAT_R8G8B8A8_UNORM; break; diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c new file mode 100644 index 00000000000..2c925e7f572 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_tcs.c @@ -0,0 +1,321 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_tcs.c + * + * Tessellation control shader state upload code. + */ + +#include "brw_context.h" +#include "brw_nir.h" +#include "brw_program.h" +#include "brw_shader.h" +#include "brw_state.h" +#include "program/prog_parameter.h" + +static void +brw_tcs_debug_recompile(struct brw_context *brw, + struct gl_shader_program *shader_prog, + const struct brw_tcs_prog_key *key) +{ + struct brw_cache_item *c = NULL; + const struct brw_tcs_prog_key *old_key = NULL; + bool found = false; + + perf_debug("Recompiling tessellation control shader for program %d\n", + shader_prog->Name); + + for (unsigned int i = 0; i < brw->cache.size; i++) { + for (c = brw->cache.items[i]; c; c = c->next) { + if (c->cache_id == BRW_CACHE_TCS_PROG) { + old_key = c->key; + + if (old_key->program_string_id == key->program_string_id) + break; + } + } + if (c) + break; + } + + if (!c) { + perf_debug(" Didn't find previous compile in the shader cache for " + "debug\n"); + return; + } + + found |= key_debug(brw, "input vertices", old_key->input_vertices, + key->input_vertices); + found |= key_debug(brw, "outputs written", old_key->outputs_written, + key->outputs_written); + found |= key_debug(brw, "patch outputs written", old_key->patch_outputs_written, + key->patch_outputs_written); + found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode, + key->tes_primitive_mode); + found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex); + + if (!found) { + perf_debug(" Something else\n"); + } +} + +static bool +brw_codegen_tcs_prog(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct brw_tess_ctrl_program *tcp, + struct brw_tcs_prog_key *key) +{ + struct gl_context *ctx = &brw->ctx; + const struct brw_compiler *compiler = brw->intelScreen->compiler; + struct brw_stage_state *stage_state = &brw->tcs.base; + nir_shader *nir; + struct brw_tcs_prog_data prog_data; + bool start_busy = false; + double start_time = 0; + + if (tcp) { + nir = tcp->program.Base.nir; + } else { + /* Create a dummy nir_shader. We won't actually use NIR code to + * generate assembly (it's easier to generate assembly directly), + * but the whole compiler assumes one of these exists. + */ + const nir_shader_compiler_options *options = + ctx->Const.ShaderCompilerOptions[MESA_SHADER_TESS_CTRL].NirOptions; + nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, options); + nir->num_uniforms = 2; /* both halves of the patch header */ + nir->info.outputs_written = key->outputs_written; + nir->info.inputs_read = key->outputs_written; + nir->info.tcs.vertices_out = key->input_vertices; + nir->info.name = ralloc_strdup(nir, "passthrough"); + } + + memset(&prog_data, 0, sizeof(prog_data)); + + /* Allocate the references to the uniforms that will end up in the + * prog_data associated with the compiled program, and which will be freed + * by the state cache. + * + * Note: param_count needs to be num_uniform_components * 4, since we add + * padding around uniform values below vec4 size, so the worst case is that + * every uniform is a float which gets padded to the size of a vec4. + */ + struct gl_shader *tcs = shader_prog ? + shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL] : NULL; + int param_count = nir->num_uniforms; + if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL]) + param_count *= 4; + + prog_data.base.base.param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.pull_param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.nr_params = param_count; + + if (tcs) { + prog_data.base.base.image_param = + rzalloc_array(NULL, struct brw_image_param, tcs->NumImages); + prog_data.base.base.nr_image_params = tcs->NumImages; + + brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base, + &prog_data.base.base, false); + } else { + /* Upload the Patch URB Header as the first two uniforms. + * Do the annoying scrambling so the shader doesn't have to. + */ + const float **param = (const float **) prog_data.base.base.param; + static float zero = 0.0f; + for (int i = 0; i < 4; i++) { + param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i]; + } + + if (key->tes_primitive_mode == GL_QUADS) { + param[3] = &ctx->TessCtrlProgram.patch_default_inner_level[0]; + param[2] = &ctx->TessCtrlProgram.patch_default_inner_level[1]; + param[1] = &zero; + param[0] = &zero; + } else if (key->tes_primitive_mode == GL_TRIANGLES) { + param[4] = &ctx->TessCtrlProgram.patch_default_inner_level[0]; + for (int i = 0; i < 4; i++) + param[i] = &zero; + } + } + + if (unlikely(INTEL_DEBUG & DEBUG_TCS) && tcs) + brw_dump_ir("tessellation control", shader_prog, tcs, NULL); + + int st_index = -1; + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS); + + if (unlikely(brw->perf_debug)) { + start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo); + start_time = get_time(); + } + + void *mem_ctx = ralloc_context(NULL); + unsigned program_size; + char *error_str; + const unsigned *program = + brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index, + &program_size, &error_str); + if (program == NULL) { + if (shader_prog) { + shader_prog->LinkStatus = false; + ralloc_strcat(&shader_prog->InfoLog, error_str); + } else { + ralloc_free(nir); + } + + _mesa_problem(NULL, "Failed to compile tessellation control shader: " + "%s\n", error_str); + + ralloc_free(mem_ctx); + return false; + } + + if (unlikely(brw->perf_debug)) { + struct brw_shader *btcs = (struct brw_shader *) tcs; + if (btcs->compiled_once) { + brw_tcs_debug_recompile(brw, shader_prog, key); + } + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { + perf_debug("TCS compile took %.03f ms and stalled the GPU\n", + (get_time() - start_time) * 1000); + } + btcs->compiled_once = true; + } + + /* Scratch space is used for register spilling */ + if (prog_data.base.base.total_scratch) { + brw_get_scratch_bo(brw, &stage_state->scratch_bo, + prog_data.base.base.total_scratch * + brw->max_hs_threads); + } + + brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG, + key, sizeof(*key), + program, program_size, + &prog_data, sizeof(prog_data), + &stage_state->prog_offset, &brw->tcs.prog_data); + ralloc_free(mem_ctx); + if (!tcs) + ralloc_free(nir); + + return true; +} + + +void +brw_upload_tcs_prog(struct brw_context *brw, + uint64_t per_vertex_slots, + uint32_t per_patch_slots) +{ + struct gl_context *ctx = &brw->ctx; + struct gl_shader_program **current = ctx->_Shader->CurrentProgram; + struct brw_stage_state *stage_state = &brw->tcs.base; + struct brw_tcs_prog_key key; + /* BRW_NEW_TESS_PROGRAMS */ + struct brw_tess_ctrl_program *tcp = + (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; + struct brw_tess_eval_program *tep = + (struct brw_tess_eval_program *) brw->tess_eval_program; + assert(tep); + + if (!brw_state_dirty(brw, + _NEW_TEXTURE, + BRW_NEW_PATCH_PRIMITIVE | + BRW_NEW_TESS_PROGRAMS)) + return; + + struct gl_program *prog = &tcp->program.Base; + + memset(&key, 0, sizeof(key)); + + key.input_vertices = ctx->TessCtrlProgram.patch_vertices; + key.outputs_written = per_vertex_slots; + key.patch_outputs_written = per_patch_slots; + + /* We need to specialize our code generation for tessellation levels + * based on the domain the DS is expecting to tessellate. + */ + key.tes_primitive_mode = tep->program.PrimitiveMode; + + if (tcp) { + key.program_string_id = tcp->id; + + /* _NEW_TEXTURE */ + brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count, + &key.tex); + } else { + key.outputs_written = tep->program.Base.InputsRead; + } + + + if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG, + &key, sizeof(key), + &stage_state->prog_offset, &brw->tcs.prog_data)) { + bool success = brw_codegen_tcs_prog(brw, current[MESA_SHADER_TESS_CTRL], + tcp, &key); + assert(success); + (void)success; + } + brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base; +} + + +bool +brw_tcs_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_tcs_prog_key key; + uint32_t old_prog_offset = brw->tcs.base.prog_offset; + struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data; + bool success; + + struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog; + struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp); + + memset(&key, 0, sizeof(key)); + + key.program_string_id = btcp->id; + brw_setup_tex_for_precompile(brw, &key.tex, prog); + + /* Guess that the input and output patches have the same dimensionality. */ + key.input_vertices = shader_prog->TessCtrl.VerticesOut; + + key.tes_primitive_mode = GL_TRIANGLES; + + key.outputs_written = prog->OutputsWritten; + key.patch_outputs_written = prog->PatchOutputsWritten; + + success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key); + + brw->tcs.base.prog_offset = old_prog_offset; + brw->tcs.prog_data = old_prog_data; + + return success; +} diff --git a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c index 115c5abd391..28cef3ca589 100644 --- a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c @@ -39,7 +39,7 @@ brw_upload_tcs_pull_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tcs.base; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct brw_tess_ctrl_program *tcp = (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; @@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tcs_pull_constants = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | BRW_NEW_TCS_PROG_DATA | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tcs_pull_constants, }; @@ -122,7 +122,7 @@ static void brw_upload_tcs_image_surfaces(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_shader_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]; @@ -138,7 +138,7 @@ const struct brw_tracked_state brw_tcs_image_surfaces = { .brw = BRW_NEW_BATCH | BRW_NEW_TCS_PROG_DATA | BRW_NEW_IMAGE_UNITS | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tcs_image_surfaces, }; diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c new file mode 100644 index 00000000000..27dc7e59f5d --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_tes.c @@ -0,0 +1,319 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_tes.c + * + * Tessellation evaluation shader state upload code. + */ + +#include "brw_context.h" +#include "brw_nir.h" +#include "brw_program.h" +#include "brw_shader.h" +#include "brw_state.h" +#include "program/prog_parameter.h" + +static void +brw_tes_debug_recompile(struct brw_context *brw, + struct gl_shader_program *shader_prog, + const struct brw_tes_prog_key *key) +{ + struct brw_cache_item *c = NULL; + const struct brw_tes_prog_key *old_key = NULL; + bool found = false; + + perf_debug("Recompiling tessellation evaluation shader for program %d\n", + shader_prog->Name); + + for (unsigned int i = 0; i < brw->cache.size; i++) { + for (c = brw->cache.items[i]; c; c = c->next) { + if (c->cache_id == BRW_CACHE_TES_PROG) { + old_key = c->key; + + if (old_key->program_string_id == key->program_string_id) + break; + } + } + if (c) + break; + } + + if (!c) { + perf_debug(" Didn't find previous compile in the shader cache for " + "debug\n"); + return; + } + + found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex); + found |= key_debug(brw, "inputs read", old_key->inputs_read, + key->inputs_read); + found |= key_debug(brw, "patch inputs read", old_key->patch_inputs_read, + key->patch_inputs_read); + + if (!found) { + perf_debug(" Something else\n"); + } +} + +static bool +brw_codegen_tes_prog(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct brw_tess_eval_program *tep, + struct brw_tes_prog_key *key) +{ + const struct brw_compiler *compiler = brw->intelScreen->compiler; + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; + struct brw_stage_state *stage_state = &brw->tes.base; + nir_shader *nir = tep->program.Base.nir; + struct brw_tes_prog_data prog_data; + bool start_busy = false; + double start_time = 0; + + memset(&prog_data, 0, sizeof(prog_data)); + + brw_assign_common_binding_table_offsets(MESA_SHADER_TESS_EVAL, devinfo, + shader_prog, &tep->program.Base, + &prog_data.base.base, 0); + + switch (tep->program.Spacing) { + case GL_EQUAL: + prog_data.partitioning = BRW_TESS_PARTITIONING_INTEGER; + break; + case GL_FRACTIONAL_ODD: + prog_data.partitioning = BRW_TESS_PARTITIONING_ODD_FRACTIONAL; + break; + case GL_FRACTIONAL_EVEN: + prog_data.partitioning = BRW_TESS_PARTITIONING_EVEN_FRACTIONAL; + break; + default: + unreachable("invalid domain shader spacing"); + } + + switch (tep->program.PrimitiveMode) { + case GL_QUADS: + prog_data.domain = BRW_TESS_DOMAIN_QUAD; + break; + case GL_TRIANGLES: + prog_data.domain = BRW_TESS_DOMAIN_TRI; + break; + case GL_ISOLINES: + prog_data.domain = BRW_TESS_DOMAIN_ISOLINE; + break; + default: + unreachable("invalid domain shader primitive mode"); + } + + if (tep->program.PointMode) { + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } else if (tep->program.PrimitiveMode == GL_ISOLINES) { + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE; + } else { + /* Hardware winding order is backwards from OpenGL */ + switch (tep->program.VertexOrder) { + case GL_CCW: + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW; + break; + case GL_CW: + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW; + break; + default: + unreachable("invalid domain shader vertex order"); + } + } + + /* Allocate the references to the uniforms that will end up in the + * prog_data associated with the compiled program, and which will be freed + * by the state cache. + * + * Note: param_count needs to be num_uniform_components * 4, since we add + * padding around uniform values below vec4 size, so the worst case is that + * every uniform is a float which gets padded to the size of a vec4. + */ + struct gl_shader *tes = shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; + int param_count = nir->num_uniforms; + if (!compiler->scalar_stage[MESA_SHADER_TESS_EVAL]) + param_count *= 4; + + prog_data.base.base.param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.pull_param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.image_param = + rzalloc_array(NULL, struct brw_image_param, tes->NumImages); + prog_data.base.base.nr_params = param_count; + prog_data.base.base.nr_image_params = tes->NumImages; + + brw_nir_setup_glsl_uniforms(nir, shader_prog, &tep->program.Base, + &prog_data.base.base, + compiler->scalar_stage[MESA_SHADER_TESS_EVAL]); + + if (unlikely(INTEL_DEBUG & DEBUG_TES)) + brw_dump_ir("tessellation evaluation", shader_prog, tes, NULL); + + int st_index = -1; + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TES); + + if (unlikely(brw->perf_debug)) { + start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo); + start_time = get_time(); + } + + void *mem_ctx = ralloc_context(NULL); + unsigned program_size; + char *error_str; + const unsigned *program = + brw_compile_tes(compiler, brw, mem_ctx, key, &prog_data, nir, + shader_prog, st_index, &program_size, &error_str); + if (program == NULL) { + if (shader_prog) { + shader_prog->LinkStatus = false; + ralloc_strcat(&shader_prog->InfoLog, error_str); + } + + _mesa_problem(NULL, "Failed to compile tessellation evaluation shader: " + "%s\n", error_str); + + ralloc_free(mem_ctx); + return false; + } + + if (unlikely(brw->perf_debug)) { + struct brw_shader *btes = (struct brw_shader *) tes; + if (btes->compiled_once) { + brw_tes_debug_recompile(brw, shader_prog, key); + } + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { + perf_debug("TES compile took %.03f ms and stalled the GPU\n", + (get_time() - start_time) * 1000); + } + btes->compiled_once = true; + } + + /* Scratch space is used for register spilling */ + if (prog_data.base.base.total_scratch) { + brw_get_scratch_bo(brw, &stage_state->scratch_bo, + prog_data.base.base.total_scratch * + brw->max_ds_threads); + } + + brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG, + key, sizeof(*key), + program, program_size, + &prog_data, sizeof(prog_data), + &stage_state->prog_offset, &brw->tes.prog_data); + ralloc_free(mem_ctx); + + return true; +} + + +void +brw_upload_tes_prog(struct brw_context *brw, + uint64_t per_vertex_slots, + uint32_t per_patch_slots) +{ + struct gl_context *ctx = &brw->ctx; + struct gl_shader_program **current = ctx->_Shader->CurrentProgram; + struct brw_stage_state *stage_state = &brw->tes.base; + struct brw_tes_prog_key key; + /* BRW_NEW_TESS_PROGRAMS */ + struct brw_tess_eval_program *tep = + (struct brw_tess_eval_program *) brw->tess_eval_program; + + if (!brw_state_dirty(brw, + _NEW_TEXTURE, + BRW_NEW_TESS_PROGRAMS)) + return; + + struct gl_program *prog = &tep->program.Base; + + memset(&key, 0, sizeof(key)); + + key.program_string_id = tep->id; + + /* Ignore gl_TessLevelInner/Outer - we treat them as system values, + * not inputs, and they're always present in the URB entry regardless + * of whether or not we read them. + */ + key.inputs_read = per_vertex_slots & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + key.patch_inputs_read = per_patch_slots; + + /* _NEW_TEXTURE */ + brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count, + &key.tex); + + if (!brw_search_cache(&brw->cache, BRW_CACHE_TES_PROG, + &key, sizeof(key), + &stage_state->prog_offset, &brw->tes.prog_data)) { + bool success = brw_codegen_tes_prog(brw, current[MESA_SHADER_TESS_EVAL], + tep, &key); + assert(success); + (void)success; + } + brw->tes.base.prog_data = &brw->tes.prog_data->base.base; +} + + +bool +brw_tes_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_tes_prog_key key; + uint32_t old_prog_offset = brw->tes.base.prog_offset; + struct brw_tes_prog_data *old_prog_data = brw->tes.prog_data; + bool success; + + struct gl_tess_eval_program *tep = (struct gl_tess_eval_program *)prog; + struct brw_tess_eval_program *btep = brw_tess_eval_program(tep); + + memset(&key, 0, sizeof(key)); + + key.program_string_id = btep->id; + key.inputs_read = prog->InputsRead; + key.patch_inputs_read = prog->PatchInputsRead; + + if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) { + struct gl_program *tcp = + shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program; + key.inputs_read |= tcp->OutputsWritten; + key.patch_inputs_read |= tcp->PatchOutputsWritten; + } + + /* Ignore gl_TessLevelInner/Outer - they're system values. */ + key.inputs_read &= ~(VARYING_BIT_TESS_LEVEL_INNER | + VARYING_BIT_TESS_LEVEL_OUTER); + + brw_setup_tex_for_precompile(brw, &key.tex, prog); + + success = brw_codegen_tes_prog(brw, shader_prog, btep, &key); + + brw->tes.base.prog_offset = old_prog_offset; + brw->tes.prog_data = old_prog_data; + + return success; +} diff --git a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c index 142bd5a3109..eff1740c12f 100644 --- a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c @@ -39,7 +39,7 @@ brw_upload_tes_pull_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tes.base; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct brw_tess_eval_program *dp = (struct brw_tess_eval_program *) brw->tess_eval_program; @@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tes_pull_constants = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | BRW_NEW_TES_PROG_DATA | - BRW_NEW_TESS_EVAL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tes_pull_constants, }; @@ -122,7 +122,7 @@ static void brw_upload_tes_image_surfaces(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_shader_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; @@ -137,7 +137,7 @@ const struct brw_tracked_state brw_tes_image_surfaces = { .dirty = { .brw = BRW_NEW_BATCH | BRW_NEW_IMAGE_UNITS | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TES_PROG_DATA, }, .emit = brw_upload_tes_image_surfaces, diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index a697bdf84a0..0cded0c87c6 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -155,6 +155,9 @@ vec4_instruction::is_send_from_grf() case SHADER_OPCODE_TYPED_ATOMIC: case SHADER_OPCODE_TYPED_SURFACE_READ: case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_URB_READ: + case TCS_OPCODE_URB_WRITE: + case SHADER_OPCODE_BARRIER: return true; default: return false; @@ -184,7 +187,9 @@ bool vec4_instruction::has_source_and_destination_hazard() const { switch (opcode) { - /* Most opcodes in the vec4 world use MRFs. */ + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + return true; default: return false; } @@ -204,6 +209,7 @@ vec4_instruction::regs_read(unsigned arg) const case SHADER_OPCODE_TYPED_ATOMIC: case SHADER_OPCODE_TYPED_SURFACE_READ: case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case TCS_OPCODE_URB_WRITE: return arg == 0 ? mlen : 1; case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: @@ -281,6 +287,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) return 0; case GS_OPCODE_FF_SYNC: return 1; + case TCS_OPCODE_URB_WRITE: + return 0; case SHADER_OPCODE_SHADER_TIME_ADD: return 0; case SHADER_OPCODE_TEX: diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 27c72766f2d..531eb170419 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -314,6 +314,8 @@ public: bool is_high_sampler(src_reg sampler); + bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate); + virtual void emit_nir_code(); virtual void nir_setup_uniforms(); virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); @@ -341,6 +343,7 @@ public: unsigned num_components = 4); src_reg get_nir_src(nir_src src, unsigned num_components = 4); + src_reg get_indirect_offset(nir_intrinsic_instr *instr); virtual dst_reg *make_reg_for_system_value(int location, const glsl_type *type) = 0; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp index 85cbf24092e..0c1f0c31b0d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp @@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst) case VEC4_OPCODE_UNPACK_UNIFORM: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: return true; case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp index 2d0722aa1eb..c31e72def67 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp @@ -45,6 +45,9 @@ can_do_writemask(const struct brw_device_info *devinfo, case VS_OPCODE_PULL_CONSTANT_LOAD: case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case VEC4_OPCODE_URB_READ: return false; default: /* The MATH instruction on Gen6 only executes in align1 mode, which does diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 3299843bfa9..86ae9289df1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -722,6 +722,220 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) } static void +generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst) +{ + /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. + * + * Since we operate in SIMD4x2 mode, we need run half as many threads + * as necessary. So we assign (2i + 1, 2i) as the thread counts. We + * shift right by one less to accomplish the multiplication by two. + */ + dst = retype(dst, BRW_REGISTER_TYPE_UD); + struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + const int mask = INTEL_MASK(23, 17); + const int shift = 17; + + brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask)); + brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), + brw_imm_ud(shift - 1)); + brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); + + brw_pop_insn_state(p); +} + +static void +generate_tcs_urb_write(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg urb_header) +{ + const struct brw_device_info *devinfo = p->devinfo; + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, brw_null_reg()); + brw_set_src0(p, send, urb_header); + + brw_set_message_descriptor(p, send, BRW_SFID_URB, + inst->mlen /* mlen */, 0 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); + brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); + + /* what happens to swizzles? */ +} + + +static void +generate_tcs_input_urb_offsets(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg vertex, + struct brw_reg offset) +{ + /* Generates an URB read/write message header for HS/DS operation. + * Inputs are a vertex index, and a byte offset from the beginning of + * the vertex. */ + + /* If `vertex` is not an immediate, we clobber a0.0 */ + + assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE); + assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D); + + assert(dst.file == BRW_GENERAL_REGISTER_FILE); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, brw_imm_ud(0)); + + /* m0.5 bits 8-15 are channel enables */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); + + /* m0.0-0.1: URB handles */ + if (vertex.file == BRW_IMMEDIATE_VALUE) { + uint32_t vertex_index = vertex.ud; + struct brw_reg index_reg = brw_vec1_grf( + 1 + (vertex_index >> 3), vertex_index & 7); + + brw_MOV(p, vec2(get_element_ud(dst, 0)), + retype(index_reg, BRW_REGISTER_TYPE_UD)); + } else { + /* Use indirect addressing. ICP Handles are DWords (single channels + * of a register) and start at g1.0. + * + * In order to start our region at g1.0, we add 8 to the vertex index, + * effectively skipping over the 8 channels in g0.0. This gives us a + * DWord offset to the ICP Handle. + * + * Indirect addressing works in terms of bytes, so we then multiply + * the DWord offset by 4 (by shifting left by 2). + */ + struct brw_reg addr = brw_address_reg(0); + + /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ + brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8)); + brw_SHL(p, addr, addr, brw_imm_ud(2)); + brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); + + /* top half: m0.1 = g[1.0 + vertex.4]UD */ + brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8)); + brw_SHL(p, addr, addr, brw_imm_ud(2)); + brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0)); + } + + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ + if (offset.file != ARF) + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + + +static void +generate_tcs_output_urb_offsets(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg write_mask, + struct brw_reg offset) +{ + /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */ + assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE); + + assert(write_mask.file == BRW_IMMEDIATE_VALUE); + assert(write_mask.type == BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, brw_imm_ud(0)); + + unsigned mask = write_mask.ud; + + /* m0.5 bits 15:12 and 11:8 are channel enables */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12))); + + /* HS patch URB handle is delivered in r0.0 */ + struct brw_reg urb_handle = brw_vec1_grf(0, 0); + + /* m0.0-0.1: URB handles */ + brw_MOV(p, vec2(get_element_ud(dst, 0)), + retype(urb_handle, BRW_REGISTER_TYPE_UD)); + + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ + if (offset.file != ARF) + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + +static void +generate_vec4_urb_read(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg header) +{ + const struct brw_device_info *devinfo = p->devinfo; + + assert(header.file == BRW_GENERAL_REGISTER_FILE); + assert(header.type == BRW_REGISTER_TYPE_UD); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + + brw_set_message_descriptor(p, send, BRW_SFID_URB, + 1 /* mlen */, 1 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); + brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); + + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); +} + +static void +generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); +} + +static void +generate_tcs_create_barrier_header(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + struct brw_reg dst) +{ + struct brw_reg m0_2 = get_element_ud(dst, 2); + unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + /* Zero the message header */ + brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); + + /* Copy "Barrier ID" from DW0 bits 16:13 */ + brw_AND(p, m0_2, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x1e000)); + + /* Shift it into place */ + brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(11)); + + /* Set the Barrier Count and the enable bit */ + brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15))); + + brw_pop_insn_state(p); +} + +static void generate_oword_dual_block_offsets(struct brw_codegen *p, struct brw_reg m1, struct brw_reg index) @@ -1546,6 +1760,39 @@ generate_code(struct brw_codegen *p, break; } + case TCS_OPCODE_URB_WRITE: + generate_tcs_urb_write(p, inst, src[0]); + break; + + case VEC4_OPCODE_URB_READ: + generate_vec4_urb_read(p, inst, dst, src[0]); + break; + + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + generate_tcs_input_urb_offsets(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + generate_tcs_output_urb_offsets(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_GET_INSTANCE_ID: + generate_tcs_get_instance_id(p, dst); + break; + + case TCS_OPCODE_GET_PRIMITIVE_ID: + generate_tcs_get_primitive_id(p, dst); + break; + + case TCS_OPCODE_CREATE_BARRIER_HEADER: + generate_tcs_create_barrier_header(p, prog_data, dst); + break; + + case SHADER_OPCODE_BARRIER: + brw_barrier(p, src[0]); + brw_WAIT(p); + break; + default: unreachable("Unsupported opcode"); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index dcecd772ff6..7781d3c01ef 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -327,6 +327,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components) return get_nir_src(src, nir_type_int, num_components); } +src_reg +vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + nir_const_value *const_value = nir_src_as_const_value(*offset_src); + + if (const_value) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into instr->const_index[0]. + */ + assert(const_value->u[0] == 0); + return src_reg(); + } + + return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1); +} + void vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) { @@ -650,7 +668,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_load_vertex_id_zero_base: case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_instance_id: { + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_tess_level_inner: + case nir_intrinsic_load_tess_level_outer: { gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); src_reg val = src_reg(nir_system_values[sv]); assert(val.file != BAD_FILE); @@ -888,6 +909,59 @@ brw_conditional_for_nir_comparison(nir_op op) } } +bool +vec4_visitor::optimize_predicate(nir_alu_instr *instr, + enum brw_predicate *predicate) +{ + if (!instr->src[0].src.is_ssa || + instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *cmp_instr = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + switch (cmp_instr->op) { + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + *predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + *predicate = BRW_PREDICATE_ALIGN16_ALL4H; + break; + default: + return false; + } + + unsigned size_swizzle = + brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]); + + src_reg op[2]; + assert(nir_op_infos[cmp_instr->op].num_inputs == 2); + for (unsigned i = 0; i < 2; i++) { + op[i] = get_nir_src(cmp_instr->src[i].src, + nir_op_infos[cmp_instr->op].input_types[i], 4); + unsigned base_swizzle = + brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle); + op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle); + op[i].abs = cmp_instr->src[i].abs; + op[i].negate = cmp_instr->src[i].negate; + } + + emit(CMP(dst_null_d(), op[0], op[1], + brw_conditional_for_nir_comparison(cmp_instr->op))); + + return true; +} + void vec4_visitor::nir_emit_alu(nir_alu_instr *instr) { @@ -1378,25 +1452,29 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; case nir_op_bcsel: - emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); - inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); - switch (dst.writemask) { - case WRITEMASK_X: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; - break; - case WRITEMASK_Y: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; - break; - case WRITEMASK_Z: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; - break; - case WRITEMASK_W: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; - break; - default: - inst->predicate = BRW_PREDICATE_NORMAL; - break; + enum brw_predicate predicate; + if (!optimize_predicate(instr, &predicate)) { + emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); + switch (dst.writemask) { + case WRITEMASK_X: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; + break; + case WRITEMASK_Y: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; + break; + case WRITEMASK_Z: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; + break; + case WRITEMASK_W: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; + break; + default: + predicate = BRW_PREDICATE_NORMAL; + break; + } } + inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); + inst->predicate = predicate; break; case nir_op_fdot_replicated2: @@ -1419,20 +1497,6 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; - case nir_op_bany2: - case nir_op_bany3: - case nir_op_bany4: { - unsigned swiz = - brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); - - emit(CMP(dst_null_d(), swizzle(op[0], swiz), brw_imm_d(0), - BRW_CONDITIONAL_NZ)); - emit(MOV(dst, brw_imm_d(0))); - inst = emit(MOV(dst, brw_imm_d(~0))); - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - } - case nir_op_fabs: case nir_op_iabs: case nir_op_fneg: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp new file mode 100644 index 00000000000..507db749e63 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -0,0 +1,551 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tcs.cpp + * + * Tessellaton control shader specific code derived from the vec4_visitor class. + */ + +#include "brw_nir.h" +#include "brw_vec4_tcs.h" + +namespace brw { + +vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index, + const struct brw_vue_map *input_vue_map) + : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base, + nir, mem_ctx, false, shader_time_index), + input_vue_map(input_vue_map), key(key) +{ +} + + +void +vec4_tcs_visitor::emit_nir_code() +{ + if (key->program_string_id != 0) { + /* We have a real application-supplied TCS, emit real code. */ + vec4_visitor::emit_nir_code(); + } else { + /* There is no TCS; automatically generate a passthrough shader + * that writes the API-specified default tessellation levels and + * copies VS outputs to TES inputs. + */ + uniforms = 2; + uniform_size[0] = 1; + uniform_size[1] = 1; + + uint64_t varyings = key->outputs_written; + + src_reg vertex_offset(this, glsl_type::uint_type); + emit(MUL(dst_reg(vertex_offset), invocation_id, + brw_imm_ud(prog_data->vue_map.num_per_vertex_slots))); + + while (varyings != 0) { + const int varying = ffsll(varyings) - 1; + + unsigned in_offset = input_vue_map->varying_to_slot[varying]; + unsigned out_offset = prog_data->vue_map.varying_to_slot[varying]; + assert(out_offset >= 2); + + dst_reg val(this, glsl_type::vec4_type); + emit_input_urb_read(val, invocation_id, in_offset, src_reg()); + emit_urb_write(src_reg(val), WRITEMASK_XYZW, out_offset, + vertex_offset); + + varyings &= ~BITFIELD64_BIT(varying); + } + + /* Only write the tessellation factors from invocation 0. + * There's no point in making other threads do redundant work. + */ + emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(0), + BRW_CONDITIONAL_EQ)); + emit(IF(BRW_PREDICATE_NORMAL)); + emit_urb_write(src_reg(UNIFORM, 0, glsl_type::vec4_type), + WRITEMASK_XYZW, 0, src_reg()); + emit_urb_write(src_reg(UNIFORM, 1, glsl_type::vec4_type), + WRITEMASK_XYZW, 1, src_reg()); + emit(BRW_OPCODE_ENDIF); + } +} + +void +vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ +} + +dst_reg * +vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type) +{ + return NULL; +} + + +void +vec4_tcs_visitor::setup_payload() +{ + int reg = 0; + + /* The payload always contains important data in r0, which contains + * the URB handles that are passed on to the URB write at the end + * of the thread. + */ + reg++; + + /* r1.0 - r4.7 may contain the input control point URB handles, + * which we use to pull vertex data. + */ + reg += 4; + + /* Push constants may start at r5.0 */ + reg = setup_uniforms(reg); + + this->first_non_payload_grf = reg; +} + + +void +vec4_tcs_visitor::emit_prolog() +{ + invocation_id = src_reg(this, glsl_type::uint_type); + emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id)); + + /* HS threads are dispatched with the dispatch mask set to 0xFF. + * If there are an odd number of output vertices, then the final + * HS instance dispatched will only have its bottom half doing real + * work, and so we need to disable the upper half: + */ + if (nir->info.tcs.vertices_out % 2) { + emit(CMP(dst_null_d(), invocation_id, + brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L)); + + /* Matching ENDIF is in emit_thread_end() */ + emit(IF(BRW_PREDICATE_NORMAL)); + } +} + + +void +vec4_tcs_visitor::emit_thread_end() +{ + current_annotation = "thread end"; + + if (nir->info.tcs.vertices_out % 2) { + emit(BRW_OPCODE_ENDIF); + } + + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + emit_shader_time_end(); + + vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + inst->mlen = 1; /* just the header, no data. */ + inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE; +} + + +void +vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst, + const src_reg &vertex_index, + unsigned base_offset, + const src_reg &indirect_offset) +{ + vec4_instruction *inst; + dst_reg temp(this, glsl_type::ivec4_type); + temp.type = dst.type; + + /* Set up the message header to reference the proper parts of the URB */ + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index, + indirect_offset); + inst->force_writemask_all = true; + + /* Read into a temporary, ignoring writemasking. */ + inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); + inst->offset = base_offset; + inst->mlen = 1; + inst->base_mrf = -1; + + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. + */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); + } else { + emit(MOV(dst, src_reg(temp))); + } +} + +void +vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, + unsigned base_offset, + const src_reg &indirect_offset) +{ + vec4_instruction *inst; + + /* Set up the message header to reference the proper parts of the URB */ + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, + brw_imm_ud(dst.writemask), indirect_offset); + inst->force_writemask_all = true; + + /* Read into a temporary, ignoring writemasking. */ + vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); + read->offset = base_offset; + read->mlen = 1; + read->base_mrf = -1; +} + +void +vec4_tcs_visitor::emit_urb_write(const src_reg &value, + unsigned writemask, + unsigned base_offset, + const src_reg &indirect_offset) +{ + if (writemask == 0) + return; + + src_reg message(this, glsl_type::uvec4_type, 2); + vec4_instruction *inst; + + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), + brw_imm_ud(writemask), indirect_offset); + inst->force_writemask_all = true; + inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value)); + inst->force_writemask_all = true; + + inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message); + inst->offset = base_offset; + inst->mlen = 2; + inst->base_mrf = -1; +} + +static unsigned +tesslevel_outer_components(GLenum tes_primitive_mode) +{ + switch (tes_primitive_mode) { + case GL_QUADS: + return 4; + case GL_TRIANGLES: + return 3; + case GL_ISOLINES: + return 2; + default: + unreachable("Bogus tessellation domain"); + } + return 0; +} + +static unsigned +tesslevel_inner_components(GLenum tes_primitive_mode) +{ + switch (tes_primitive_mode) { + case GL_QUADS: + return 2; + case GL_TRIANGLES: + return 1; + case GL_ISOLINES: + return 0; + default: + unreachable("Bogus tessellation domain"); + } + return 0; +} + +/** + * Given a normal .xyzw writemask, convert it to a writemask for a vector + * that's stored backwards, i.e. .wzyx. + */ +static unsigned +writemask_for_backwards_vector(unsigned mask) +{ + unsigned new_mask = 0; + + for (int i = 0; i < 4; i++) + new_mask |= ((mask >> i) & 1) << (3 - i); + + return new_mask; +} + +void +vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_invocation_id: + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD), + invocation_id)); + break; + case nir_intrinsic_load_primitive_id: + emit(TCS_OPCODE_GET_PRIMITIVE_ID, + get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD)); + break; + case nir_intrinsic_load_patch_vertices_in: + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D), + brw_imm_d(key->input_vertices))); + break; + case nir_intrinsic_load_per_vertex_input: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]); + src_reg vertex_index = + vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0])) + : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + + emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset); + break; + } + case nir_intrinsic_load_input: + unreachable("nir_lower_io should use load_per_vertex_input intrinsics"); + break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0];; + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + + if (imm_offset == 0 && indirect_offset.file == BAD_FILE) { + dst.type = BRW_REGISTER_TYPE_F; + + /* This is a read of gl_TessLevelInner[], which lives in the + * Patch URB header. The layout depends on the domain. + */ + switch (key->tes_primitive_mode) { + case GL_QUADS: { + /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */ + dst_reg tmp(this, glsl_type::vec4_type); + emit_output_urb_read(tmp, 0, src_reg()); + emit(MOV(writemask(dst, WRITEMASK_XY), + swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); + break; + } + case GL_TRIANGLES: + /* DWord 4; use offset 1 but normal swizzle/writemask. */ + emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg()); + break; + case GL_ISOLINES: + /* All channels are undefined. */ + return; + default: + unreachable("Bogus tessellation domain"); + } + } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { + dst.type = BRW_REGISTER_TYPE_F; + + /* This is a read of gl_TessLevelOuter[], which lives in the + * high 4 DWords of the Patch URB header, in reverse order. + */ + switch (key->tes_primitive_mode) { + case GL_QUADS: + dst.writemask = WRITEMASK_XYZW; + break; + case GL_TRIANGLES: + dst.writemask = WRITEMASK_XYZ; + break; + case GL_ISOLINES: + dst.writemask = WRITEMASK_XY; + return; + default: + unreachable("Bogus tessellation domain"); + } + + dst_reg tmp(this, glsl_type::vec4_type); + emit_output_urb_read(tmp, 1, src_reg()); + emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); + } else { + emit_output_urb_read(dst, imm_offset, indirect_offset); + } + break; + } + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: { + src_reg value = get_nir_src(instr->src[0]); + unsigned mask = instr->const_index[1]; + unsigned swiz = BRW_SWIZZLE_XYZW; + + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + if (imm_offset == 0 && indirect_offset.file == BAD_FILE) { + value.type = BRW_REGISTER_TYPE_F; + + mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1; + + /* This is a write to gl_TessLevelInner[], which lives in the + * Patch URB header. The layout depends on the domain. + */ + switch (key->tes_primitive_mode) { + case GL_QUADS: + /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed). + * We use an XXYX swizzle to reverse put .xy in the .wz + * channels, and use a .zw writemask. + */ + swiz = BRW_SWIZZLE4(0, 0, 1, 0); + mask = writemask_for_backwards_vector(mask); + break; + case GL_TRIANGLES: + /* gl_TessLevelInner[].x lives at DWord 4, so we set the + * writemask to X and bump the URB offset by 1. + */ + imm_offset = 1; + break; + case GL_ISOLINES: + /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */ + return; + default: + unreachable("Bogus tessellation domain"); + } + } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { + value.type = BRW_REGISTER_TYPE_F; + + mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1; + + /* This is a write to gl_TessLevelOuter[] which lives in the + * Patch URB Header at DWords 4-7. However, it's reversed, so + * instead of .xyzw we have .wzyx. + */ + swiz = BRW_SWIZZLE_WZYX; + mask = writemask_for_backwards_vector(mask); + } + + emit_urb_write(swizzle(value, swiz), mask, + imm_offset, indirect_offset); + break; + } + + case nir_intrinsic_barrier: { + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); + break; + } + + default: + vec4_visitor::nir_emit_intrinsic(instr); + } +} + + +extern "C" const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *src_shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const struct brw_device_info *devinfo = compiler->devinfo; + struct brw_vue_prog_data *vue_prog_data = &prog_data->base; + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL]; + + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); + nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar); + nir->info.outputs_written = key->outputs_written; + nir->info.patch_outputs_written = key->patch_outputs_written; + nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar); + nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar); + + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2); + + brw_compute_tess_vue_map(&vue_prog_data->vue_map, + nir->info.outputs_written, + nir->info.patch_outputs_written); + + /* Compute URB entry size. The maximum allowed URB entry size is 32k. + * That divides up as follows: + * + * 32 bytes for the patch header (tessellation factors) + * 480 bytes for per-patch varyings (a varying component is 4 bytes and + * gl_MaxTessPatchComponents = 120) + * 16384 bytes for per-vertex varyings (a varying component is 4 bytes, + * gl_MaxPatchVertices = 32 and + * gl_MaxTessControlOutputComponents = 128) + * + * 15808 bytes left for varying packing overhead + */ + const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots; + const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots; + unsigned output_size_bytes = 0; + /* Note that the patch header is counted in num_per_patch_slots. */ + output_size_bytes += num_per_patch_slots * 16; + output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * 16; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) + return false; + + /* URB entry sizes are stored as a multiple of 64 bytes. */ + vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + + struct brw_vue_map input_vue_map; + brw_compute_vue_map(devinfo, &input_vue_map, + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, + true); + + /* HS does not use the usual payload pushing from URB to GRFs, + * because we don't have enough registers for a full-size payload, and + * the hardware is broken on Haswell anyway. + */ + vue_prog_data->urb_read_length = 0; + + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { + fprintf(stderr, "TCS Input "); + brw_print_vue_map(stderr, &input_vue_map); + fprintf(stderr, "TCS Output "); + brw_print_vue_map(stderr, &vue_prog_data->vue_map); + } + + vec4_tcs_visitor v(compiler, log_data, key, prog_data, + nir, mem_ctx, shader_time_index, &input_vue_map); + if (!v.run()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) + v.dump_instructions(); + + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, + &prog_data->base, v.cfg, + final_assembly_size); +} + + +} /* namespace brw */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h new file mode 100644 index 00000000000..2c6801b2ae3 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h @@ -0,0 +1,88 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tcs.h + * + * The vec4-mode tessellation control shader compiler backend. + */ + +#ifndef BRW_VEC4_TCS_H +#define BRW_VEC4_TCS_H + +#include "brw_compiler.h" +#include "brw_vec4.h" + +#ifdef __cplusplus +namespace brw { + +class vec4_tcs_visitor : public vec4_visitor +{ +public: + vec4_tcs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index, + const struct brw_vue_map *input_vue_map); + +protected: + virtual void emit_nir_code(); + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type); + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + virtual void setup_payload(); + virtual void emit_prolog(); + virtual void emit_thread_end(); + + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); + + void emit_input_urb_read(const dst_reg &dst, + const src_reg &vertex_index, + unsigned base_offset, + const src_reg &indirect_offset); + void emit_output_urb_read(const dst_reg &dst, + unsigned base_offset, + const src_reg &indirect_offset); + + void emit_urb_write(const src_reg &value, unsigned writemask, + unsigned base_offset, const src_reg &indirect_offset); + + /* we do not use the normal end-of-shader URB write mechanism -- but every vec4 stage + * must provide implementations of these: + */ + virtual void emit_urb_write_header(int mrf) {} + virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; } + + const struct brw_vue_map *input_vue_map; + + const struct brw_tcs_prog_key *key; + src_reg invocation_id; +}; + +} /* namespace brw */ +#endif /* __cplusplus */ + +#endif /* BRW_VEC4_TCS_H */ diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 59b748f2055..3095d82d91e 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -148,7 +148,9 @@ brw_codegen_vs_prog(struct brw_context *brw, brw_compute_vue_map(brw->intelScreen->devinfo, &prog_data.base.vue_map, outputs_written, - prog ? prog->SeparateShader : false); + prog ? prog->SeparateShader || + prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] + : false); if (0) { _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c index 6cb3da46995..09eadbcb54f 100644 --- a/src/mesa/drivers/dri/i965/brw_vue_map.c +++ b/src/mesa/drivers/dri/i965/brw_vue_map.c @@ -176,6 +176,73 @@ brw_compute_vue_map(const struct brw_device_info *devinfo, } vue_map->num_slots = separate ? slot + 1 : slot; + vue_map->num_per_vertex_slots = 0; + vue_map->num_per_patch_slots = 0; +} + +/** + * Compute the VUE map for tessellation control shader outputs and + * tessellation evaluation shader inputs. + */ +void +brw_compute_tess_vue_map(struct brw_vue_map *vue_map, + GLbitfield64 vertex_slots, + GLbitfield patch_slots) +{ + /* I don't think anything actually uses this... */ + vue_map->slots_valid = vertex_slots; + + vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER | + VARYING_BIT_TESS_LEVEL_INNER); + + /* Make sure that the values we store in vue_map->varying_to_slot and + * vue_map->slot_to_varying won't overflow the signed chars that are used + * to store them. Note that since vue_map->slot_to_varying sometimes holds + * values equal to VARYING_SLOT_TESS_MAX , we need to ensure that + * VARYING_SLOT_TESS_MAX is <= 127, not 128. + */ + STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127); + + for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) { + vue_map->varying_to_slot[i] = -1; + vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD; + } + + int slot = 0; + + /* The first 8 DWords are reserved for the "Patch Header". + * + * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout + * depends on the domain type. They might not be in slots 0 and 1 as + * described here, but pretending they're separate allows us to uniquely + * identify them by distinct slot locations. + */ + assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++); + assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++); + + /* first assign per-patch varyings */ + while (patch_slots != 0) { + const int varying = ffsll(patch_slots) - 1; + if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) { + assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++); + } + patch_slots &= ~BITFIELD64_BIT(varying); + } + + /* apparently, including the patch header... */ + vue_map->num_per_patch_slots = slot; + + /* then assign per-vertex varyings for each vertex in our patch */ + while (vertex_slots != 0) { + const int varying = ffsll(vertex_slots) - 1; + if (vue_map->varying_to_slot[varying] == -1) { + assign_vue_slot(vue_map, varying, slot++); + } + vertex_slots &= ~BITFIELD64_BIT(varying); + } + + vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots; + vue_map->num_slots = slot; } static const char * @@ -196,11 +263,28 @@ varying_name(brw_varying_slot slot) void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map) { - fprintf(fp, "VUE map (%d slots, %s)\n", - vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); - for (int i = 0; i < vue_map->num_slots; i++) { - fprintf(fp, " [%d] %s\n", i, - varying_name(vue_map->slot_to_varying[i])); + if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) { + fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n", + vue_map->num_slots, + vue_map->num_per_patch_slots, + vue_map->num_per_vertex_slots, + vue_map->separate ? "SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) { + fprintf(fp, " [%d] VARYING_SLOT_PATCH%d\n", i, + vue_map->slot_to_varying[i] - VARYING_SLOT_PATCH0); + } else { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } + } + } else { + fprintf(fp, "VUE map (%d slots, %s)\n", + vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } } fprintf(fp, "\n"); } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index c4ebbf3b48c..76dc5775121 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -862,10 +862,8 @@ brw_update_texture_surfaces(struct brw_context *brw) /* BRW_NEW_VERTEX_PROGRAM */ struct gl_program *vs = (struct gl_program *) brw->vertex_program; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program; - - /* BRW_NEW_TESS_EVAL_PROGRAM */ struct gl_program *tes = (struct gl_program *) brw->tess_eval_program; /* BRW_NEW_GEOMETRY_PROGRAM */ @@ -915,8 +913,7 @@ const struct brw_tracked_state brw_texture_surfaces = { BRW_NEW_FS_PROG_DATA | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_GS_PROG_DATA | - BRW_NEW_TESS_CTRL_PROGRAM | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TCS_PROG_DATA | BRW_NEW_TES_PROG_DATA | BRW_NEW_TEXTURE_BUFFER | diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 6653a6d759b..da3b4cd90e8 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -65,7 +65,8 @@ gen6_upload_push_constants(struct brw_context *brw, * basic type of PROGRAM_STATE_VAR. */ /* XXX: Should this happen somewhere before to get our state flag set? */ - _mesa_load_state_parameters(ctx, prog->Parameters); + if (prog) + _mesa_load_state_parameters(ctx, prog->Parameters); gl_constant_value *param; unsigned i; diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp index e87b9d1657f..89b73ca7519 100644 --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp @@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16; gen7_emit_push_constant_state(brw, urb_size / 2 /* vs_size */, + 0 /* hs_size */, + 0 /* ds_size */, 0 /* gs_size */, urb_size / 2 /* fs_size */); @@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) 32 /* num_vs_entries */, 2 /* vs_size */, 2 /* vs_start */, + 0 /* num_hs_entries */, + 1 /* hs_size */, + 2 /* hs_start */, + 0 /* num_ds_entries */, + 1 /* ds_size */, + 2 /* ds_start */, 0 /* num_gs_entries */, 1 /* gs_size */, 2 /* gs_start */); diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index 1fde69cf78e..a025bb9dd66 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -141,7 +141,7 @@ brw_upload_cs_state(struct brw_context *brw) BEGIN_BATCH(4); OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2)); OUT_BATCH(0); - OUT_BATCH(reg_aligned_constant_size * threads); + OUT_BATCH(ALIGN(reg_aligned_constant_size * threads, 64)); OUT_BATCH(stage_state->push_const_offset); ADVANCE_BATCH(); } @@ -249,8 +249,8 @@ brw_upload_cs_push_constants(struct brw_context *brw, param = (gl_constant_value*) brw_state_batch(brw, type, - reg_aligned_constant_size * threads, - 32, &stage_state->push_const_offset); + ALIGN(reg_aligned_constant_size * threads, 64), + 64, &stage_state->push_const_offset); assert(param); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); diff --git a/src/mesa/drivers/dri/i965/gen7_ds_state.c b/src/mesa/drivers/dri/i965/gen7_ds_state.c index 4d3d94f68a6..9a697140386 100644 --- a/src/mesa/drivers/dri/i965/gen7_ds_state.c +++ b/src/mesa/drivers/dri/i965/gen7_ds_state.c @@ -30,7 +30,7 @@ static void gen7_upload_tes_push_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tes.base; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ const struct brw_tess_eval_program *tep = (struct brw_tess_eval_program *) brw->tess_eval_program; @@ -49,7 +49,7 @@ const struct brw_tracked_state gen7_tes_push_constants = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | BRW_NEW_PUSH_CONSTANT_ALLOCATION | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TES_PROG_DATA, }, .emit = gen7_upload_tes_push_constants, diff --git a/src/mesa/drivers/dri/i965/gen7_hs_state.c b/src/mesa/drivers/dri/i965/gen7_hs_state.c index fcaa9197857..6793617b9e2 100644 --- a/src/mesa/drivers/dri/i965/gen7_hs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_hs_state.c @@ -30,26 +30,28 @@ static void gen7_upload_tcs_push_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tcs.base; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ const struct brw_tess_ctrl_program *tcp = (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; + bool active = brw->tess_eval_program; - if (tcp) { + if (active) { /* BRW_NEW_TCS_PROG_DATA */ const struct brw_stage_prog_data *prog_data = &brw->tcs.prog_data->base.base; gen6_upload_push_constants(brw, &tcp->program.Base, prog_data, stage_state, AUB_TRACE_VS_CONSTANTS); } - gen7_upload_constant_state(brw, stage_state, tcp, _3DSTATE_CONSTANT_HS); + gen7_upload_constant_state(brw, stage_state, active, _3DSTATE_CONSTANT_HS); } const struct brw_tracked_state gen7_tcs_push_constants = { .dirty = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | + BRW_NEW_DEFAULT_TESS_LEVELS | BRW_NEW_PUSH_CONSTANT_ALLOCATION | - BRW_NEW_TESS_CTRL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TCS_PROG_DATA, }, .emit = gen7_upload_tcs_push_constants, diff --git a/src/mesa/drivers/dri/i965/gen7_te_state.c b/src/mesa/drivers/dri/i965/gen7_te_state.c index 2650fa562ec..f221307bc43 100644 --- a/src/mesa/drivers/dri/i965/gen7_te_state.c +++ b/src/mesa/drivers/dri/i965/gen7_te_state.c @@ -29,10 +29,8 @@ static void upload_te_state(struct brw_context *brw) { - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; - if (active) - assert(brw->tess_ctrl_program); const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data; @@ -61,7 +59,7 @@ const struct brw_tracked_state gen7_te_state = { .mesa = 0, .brw = BRW_NEW_CONTEXT | BRW_NEW_TES_PROG_DATA | - BRW_NEW_TESS_EVAL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = upload_te_state, }; diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c index 99a9d3c6500..00edbcca662 100644 --- a/src/mesa/drivers/dri/i965/gen7_urb.c +++ b/src/mesa/drivers/dri/i965/gen7_urb.c @@ -34,7 +34,7 @@ * __________-__________ _________________-_________________ * / \ / \ * +-------------------------------------------------------------+ - * | VS/FS/GS Push | VS/GS URB | + * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB | * | Constants | Entries | * +-------------------------------------------------------------+ * @@ -60,27 +60,32 @@ static void gen7_allocate_push_constants(struct brw_context *brw) { + /* BRW_NEW_GEOMETRY_PROGRAM */ + bool gs_present = brw->geometry_program; + + /* BRW_NEW_TESS_PROGRAMS */ + bool tess_present = brw->tess_eval_program; + unsigned avail_size = 16; unsigned multiplier = (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1; - /* BRW_NEW_GEOMETRY_PROGRAM */ - bool gs_present = brw->geometry_program; + int stages = 2 + gs_present + 2 * tess_present; - unsigned vs_size, gs_size; - if (gs_present) { - vs_size = avail_size / 3; - avail_size -= vs_size; - gs_size = avail_size / 2; - avail_size -= gs_size; - } else { - vs_size = avail_size / 2; - avail_size -= vs_size; - gs_size = 0; - } - unsigned fs_size = avail_size; + /* Divide up the available space equally between stages. Because we + * round down (using floor division), there may be some left over + * space. We allocate that to the pixel shader stage. + */ + unsigned size_per_stage = avail_size / stages; + + unsigned vs_size = size_per_stage; + unsigned hs_size = tess_present ? size_per_stage : 0; + unsigned ds_size = tess_present ? size_per_stage : 0; + unsigned gs_size = gs_present ? size_per_stage : 0; + unsigned fs_size = avail_size - size_per_stage * (stages - 1); gen7_emit_push_constant_state(brw, multiplier * vs_size, + multiplier * hs_size, multiplier * ds_size, multiplier * gs_size, multiplier * fs_size); /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS): @@ -99,15 +104,24 @@ gen7_allocate_push_constants(struct brw_context *brw) void gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, + unsigned hs_size, unsigned ds_size, unsigned gs_size, unsigned fs_size) { unsigned offset = 0; - BEGIN_BATCH(6); + BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2)); OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); offset += vs_size; + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2)); + OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); + offset += hs_size; + + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2)); + OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); + offset += ds_size; + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2)); OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); offset += gs_size; @@ -130,7 +144,9 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, const struct brw_tracked_state gen7_push_constant_space = { .dirty = { .mesa = 0, - .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, + .brw = BRW_NEW_CONTEXT | + BRW_NEW_GEOMETRY_PROGRAM | + BRW_NEW_TESS_PROGRAMS, }, .emit = gen7_allocate_push_constants, }; @@ -138,6 +154,7 @@ const struct brw_tracked_state gen7_push_constant_space = { static void gen7_upload_urb(struct brw_context *brw) { + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; const int push_size_kB = (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16; @@ -149,6 +166,15 @@ gen7_upload_urb(struct brw_context *brw) unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1; unsigned gs_entry_size_bytes = gs_size * 64; + /* BRW_NEW_TESS_PROGRAMS */ + const bool tess_present = brw->tess_eval_program; + /* BRW_NEW_TCS_PROG_DATA */ + unsigned hs_size = tess_present ? brw->tcs.prog_data->base.urb_entry_size : 1; + unsigned hs_entry_size_bytes = hs_size * 64; + /* BRW_NEW_TES_PROG_DATA */ + unsigned ds_size = tess_present ? brw->tes.prog_data->base.urb_entry_size : 1; + unsigned ds_entry_size_bytes = ds_size * 64; + /* If we're just switching between programs with the same URB requirements, * skip the rest of the logic. */ @@ -156,21 +182,29 @@ gen7_upload_urb(struct brw_context *brw) !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) && brw->urb.vsize == vs_size && brw->urb.gs_present == gs_present && - brw->urb.gsize == gs_size) { + brw->urb.gsize == gs_size && + brw->urb.tess_present == tess_present && + brw->urb.hsize == hs_size && + brw->urb.dsize == ds_size) { return; } brw->urb.vsize = vs_size; brw->urb.gs_present = gs_present; brw->urb.gsize = gs_size; + brw->urb.tess_present = tess_present; + brw->urb.hsize = hs_size; + brw->urb.dsize = ds_size; /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): * * VS Number of URB Entries must be divisible by 8 if the VS URB Entry * Allocation Size is less than 9 512-bit URB entries. * - * Similar text exists for GS. + * Similar text exists for HS, DS and GS. */ unsigned vs_granularity = (vs_size < 9) ? 8 : 1; + unsigned hs_granularity = (hs_size < 9) ? 8 : 1; + unsigned ds_granularity = (ds_size < 9) ? 8 : 1; unsigned gs_granularity = (gs_size < 9) ? 8 : 1; /* URB allocations must be done in 8k chunks. */ @@ -191,13 +225,20 @@ gen7_upload_urb(struct brw_context *brw) * additional space it could actually make use of). */ - /* VS has a lower limit on the number of URB entries */ + /* VS has a lower limit on the number of URB entries. + * + * From the Broadwell PRM, 3DSTATE_URB_VS instruction: + * "When tessellation is enabled, the VS Number of URB Entries must be + * greater than or equal to 192." + */ + unsigned vs_min_entries = + tess_present && brw->gen == 8 ? 192 : brw->urb.min_vs_entries; + unsigned vs_chunks = - ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) / - chunk_size_bytes; + DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes); unsigned vs_wants = - ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes, - chunk_size_bytes) / chunk_size_bytes - vs_chunks; + DIV_ROUND_UP(brw->urb.max_vs_entries * vs_entry_size_bytes, + chunk_size_bytes) - vs_chunks; unsigned gs_chunks = 0; unsigned gs_wants = 0; @@ -210,21 +251,42 @@ gen7_upload_urb(struct brw_context *brw) * * (2) We can't allocate less than nr_gs_entries_granularity. */ - gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes, - chunk_size_bytes) / chunk_size_bytes; - gs_wants = - ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes, - chunk_size_bytes) / chunk_size_bytes - gs_chunks; + gs_chunks = DIV_ROUND_UP(MAX2(gs_granularity, 2) * gs_entry_size_bytes, + chunk_size_bytes); + gs_wants = DIV_ROUND_UP(brw->urb.max_gs_entries * gs_entry_size_bytes, + chunk_size_bytes) - gs_chunks; + } + + unsigned hs_chunks = 0; + unsigned hs_wants = 0; + unsigned ds_chunks = 0; + unsigned ds_wants = 0; + + if (tess_present) { + hs_chunks = + DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes, + chunk_size_bytes); + hs_wants = + DIV_ROUND_UP(devinfo->urb.max_hs_entries * hs_entry_size_bytes, + chunk_size_bytes) - hs_chunks; + + ds_chunks = + DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes, + chunk_size_bytes); + ds_wants = + DIV_ROUND_UP(brw->urb.max_ds_entries * ds_entry_size_bytes, + chunk_size_bytes) - ds_chunks; } /* There should always be enough URB space to satisfy the minimum * requirements of each stage. */ - unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks; + unsigned total_needs = push_constant_chunks + + vs_chunks + hs_chunks + ds_chunks + gs_chunks; assert(total_needs <= urb_chunks); /* Mete out remaining space (if any) in proportion to "wants". */ - unsigned total_wants = vs_wants + gs_wants; + unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants; unsigned remaining_space = urb_chunks - total_needs; if (remaining_space > total_wants) remaining_space = total_wants; @@ -233,61 +295,100 @@ gen7_upload_urb(struct brw_context *brw) roundf(vs_wants * (((float) remaining_space) / total_wants)); vs_chunks += vs_additional; remaining_space -= vs_additional; + total_wants -= vs_wants; + + unsigned hs_additional = (unsigned) + round(hs_wants * (((double) remaining_space) / total_wants)); + hs_chunks += hs_additional; + remaining_space -= hs_additional; + total_wants -= hs_wants; + + unsigned ds_additional = (unsigned) + round(ds_wants * (((double) remaining_space) / total_wants)); + ds_chunks += ds_additional; + remaining_space -= ds_additional; + total_wants -= ds_wants; + gs_chunks += remaining_space; } /* Sanity check that we haven't over-allocated. */ - assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); + assert(push_constant_chunks + + vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks); /* Finally, compute the number of entries that can fit in the space * allocated to each stage. */ unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes; + unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / hs_entry_size_bytes; + unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / ds_entry_size_bytes; unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes; /* Since we rounded up when computing *_wants, this may be slightly more * than the maximum allowed amount, so correct for that. */ nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); + nr_hs_entries = MIN2(nr_hs_entries, brw->urb.max_hs_entries); + nr_ds_entries = MIN2(nr_ds_entries, brw->urb.max_ds_entries); nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); /* Ensure that we program a multiple of the granularity. */ nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); + nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity); + nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity); nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); /* Finally, sanity check to make sure we have at least the minimum number * of entries needed for each stage. */ - assert(nr_vs_entries >= brw->urb.min_vs_entries); + assert(nr_vs_entries >= vs_min_entries); if (gs_present) assert(nr_gs_entries >= 2); + if (tess_present) { + assert(nr_hs_entries >= 1); + assert(nr_ds_entries >= devinfo->urb.min_ds_entries); + } /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems * better to put reasonable data in there rather than leave them * uninitialized. */ brw->urb.nr_vs_entries = nr_vs_entries; + brw->urb.nr_hs_entries = nr_hs_entries; + brw->urb.nr_ds_entries = nr_ds_entries; brw->urb.nr_gs_entries = nr_gs_entries; /* Lay out the URB in the following order: * - push constants * - VS + * - HS + * - DS * - GS */ brw->urb.vs_start = push_constant_chunks; - brw->urb.gs_start = push_constant_chunks + vs_chunks; + brw->urb.hs_start = push_constant_chunks + vs_chunks; + brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks; + brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks + + ds_chunks; if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail) gen7_emit_vs_workaround_flush(brw); gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, + brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start, + brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start, brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); } void gen7_emit_urb_state(struct brw_context *brw, - unsigned nr_vs_entries, unsigned vs_size, - unsigned vs_start, unsigned nr_gs_entries, + unsigned nr_vs_entries, + unsigned vs_size, unsigned vs_start, + unsigned nr_hs_entries, + unsigned hs_size, unsigned hs_start, + unsigned nr_ds_entries, + unsigned ds_size, unsigned ds_start, + unsigned nr_gs_entries, unsigned gs_size, unsigned gs_start) { BEGIN_BATCH(8); @@ -301,14 +402,15 @@ gen7_emit_urb_state(struct brw_context *brw, ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - /* Allocate the HS and DS zero space - we don't use them. */ OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + OUT_BATCH(nr_hs_entries | + ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + OUT_BATCH(nr_ds_entries | + ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); ADVANCE_BATCH(); } @@ -318,7 +420,10 @@ const struct brw_tracked_state gen7_urb = { .brw = BRW_NEW_CONTEXT | BRW_NEW_URB_SIZE | BRW_NEW_GEOMETRY_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_GS_PROG_DATA | + BRW_NEW_TCS_PROG_DATA | + BRW_NEW_TES_PROG_DATA | BRW_NEW_VS_PROG_DATA, }, .emit = gen7_upload_urb, diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 06d5e65786b..7def5f5ad3c 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -77,14 +77,11 @@ upload_wm_state(struct brw_context *brw) dw1 |= GEN7_WM_KILL_ENABLE; } - if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) { - dw1 |= GEN7_WM_DISPATCH_ENABLE; - } - /* _NEW_BUFFERS | _NEW_COLOR */ + const bool active_fs_has_side_effects = + _mesa_active_fragment_shader_has_side_effects(&brw->ctx); if (brw_color_buffer_write_enabled(brw) || writes_depth || - prog_data->base.nr_image_params || - dw1 & GEN7_WM_KILL_ENABLE) { + active_fs_has_side_effects || dw1 & GEN7_WM_KILL_ENABLE) { dw1 |= GEN7_WM_DISPATCH_ENABLE; } if (multisampled_fbo) { @@ -110,7 +107,7 @@ upload_wm_state(struct brw_context *brw) /* BRW_NEW_FS_PROG_DATA */ if (prog_data->early_fragment_tests) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS; - else if (prog_data->base.nr_image_params) + else if (active_fs_has_side_effects) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; /* The "UAV access enable" bits are unnecessary on HSW because they only @@ -123,7 +120,7 @@ upload_wm_state(struct brw_context *brw) */ if (brw->is_haswell && !(brw_color_buffer_write_enabled(brw) || writes_depth) && - prog_data->base.nr_image_params) + active_fs_has_side_effects) dw2 |= HSW_WM_UAV_ONLY; BEGIN_BATCH(3); diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c index a79e8aa020e..d91eb77e631 100644 --- a/src/mesa/drivers/dri/i965/gen8_ds_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c @@ -31,9 +31,8 @@ gen8_upload_ds_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->tes.base; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; - assert(!active || brw->tess_ctrl_program); /* BRW_NEW_TES_PROG_DATA */ const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data; @@ -92,7 +91,7 @@ const struct brw_tracked_state gen8_ds_state = { .dirty = { .mesa = 0, .brw = BRW_NEW_BATCH | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TES_PROG_DATA, }, .emit = gen8_upload_ds_state, diff --git a/src/mesa/drivers/dri/i965/gen8_hs_state.c b/src/mesa/drivers/dri/i965/gen8_hs_state.c index 38e22359ffb..21f3d469553 100644 --- a/src/mesa/drivers/dri/i965/gen8_hs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_hs_state.c @@ -30,9 +30,8 @@ static void gen8_upload_hs_state(struct brw_context *brw) { const struct brw_stage_state *stage_state = &brw->tcs.base; - /* BRW_NEW_TESS_CTRL_PROGRAM */ - bool active = brw->tess_ctrl_program; - assert(!active || brw->tess_eval_program); + /* BRW_NEW_TESS_PROGRAMS */ + bool active = brw->tess_eval_program; /* BRW_NEW_HS_PROG_DATA */ const struct brw_vue_prog_data *prog_data = &brw->tcs.prog_data->base; @@ -84,7 +83,7 @@ const struct brw_tracked_state gen8_hs_state = { .mesa = 0, .brw = BRW_NEW_BATCH | BRW_NEW_TCS_PROG_DATA | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = gen8_upload_hs_state, }; diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 945f7106e3d..74cdcef015d 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -90,8 +90,7 @@ gen8_upload_ps_extra(struct brw_context *brw, * * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */ - if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || - prog_data->base.nr_image_params) && + if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) && !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; @@ -157,7 +156,7 @@ upload_wm_state(struct brw_context *brw) /* BRW_NEW_FS_PROG_DATA */ if (brw->wm.prog_data->early_fragment_tests) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS; - else if (brw->wm.prog_data->base.nr_image_params) + else if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx)) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; BEGIN_BATCH(2); diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 24761a70638..06672c1b4db 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -362,6 +362,7 @@ intelInitExtensions(struct gl_context *ctx) if (brw->gen >= 8) { ctx->Extensions.ARB_stencil_texturing = true; + ctx->Extensions.ARB_tessellation_shader = true; } if (brw->gen >= 9) { diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index cc90efe4886..a9f58b09422 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -1339,6 +1339,11 @@ set_max_gl_versions(struct intel_screen *screen) switch (screen->devinfo->gen) { case 9: case 8: + psp->max_gl_core_version = 33; + psp->max_gl_compat_version = 30; + psp->max_gl_es1_version = 11; + psp->max_gl_es2_version = 31; + break; case 7: case 6: psp->max_gl_core_version = 33; @@ -1492,6 +1497,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp) intelScreen->compiler = brw_compiler_create(intelScreen, intelScreen->devinfo); + intelScreen->program_id = 1; if (intelScreen->devinfo->has_resource_streamer) { int val = -1; diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.c b/src/mesa/drivers/dri/nouveau/nouveau_screen.c index 153f18e4a34..6f61f66f3b0 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_screen.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.c @@ -40,6 +40,9 @@ #include "main/renderbuffer.h" #include "swrast/s_renderbuffer.h" +#include <nvif/class.h> +#include <nvif/cl0080.h> + static const __DRIextension *nouveau_screen_extensions[]; static void @@ -99,12 +102,22 @@ nouveau_init_screen2(__DRIscreen *dri_screen) dri_screen->driverPrivate = screen; /* Open the DRM device. */ - ret = nouveau_device_wrap(dri_screen->fd, 0, &screen->device); + ret = nouveau_drm_new(dri_screen->fd, &screen->drm); if (ret) { nouveau_error("Error opening the DRM device.\n"); goto fail; } + ret = nouveau_device_new(&screen->drm->client, NV_DEVICE, + &(struct nv_device_v0) { + .device = ~0ULL, + }, sizeof(struct nv_device_v0), + &screen->device); + if (ret) { + nouveau_error("Error creating device object.\n"); + goto fail; + } + /* Choose the card specific function pointers. */ switch (screen->device->chipset & 0xf0) { case 0x00: @@ -213,6 +226,7 @@ nouveau_destroy_screen(__DRIscreen *dri_screen) return; nouveau_device_del(&screen->device); + nouveau_drm_del(&screen->drm); free(screen); dri_screen->driverPrivate = NULL; diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.h b/src/mesa/drivers/dri/nouveau/nouveau_screen.h index 45b1ee928d6..e3c192802d4 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_screen.h +++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.h @@ -33,6 +33,7 @@ struct nouveau_context; struct nouveau_screen { __DRIscreen *dri_screen; + struct nouveau_drm *drm; struct nouveau_device *device; const struct nouveau_driver *driver; }; diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c index 5c7dcac3841..8462ab6627d 100644 --- a/src/mesa/drivers/osmesa/osmesa.c +++ b/src/mesa/drivers/osmesa/osmesa.c @@ -645,10 +645,100 @@ GLAPI OSMesaContext GLAPIENTRY OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits, GLint accumBits, OSMesaContext sharelist ) { + int attribs[100], n = 0; + + attribs[n++] = OSMESA_FORMAT; + attribs[n++] = format; + attribs[n++] = OSMESA_DEPTH_BITS; + attribs[n++] = depthBits; + attribs[n++] = OSMESA_STENCIL_BITS; + attribs[n++] = stencilBits; + attribs[n++] = OSMESA_ACCUM_BITS; + attribs[n++] = accumBits; + attribs[n++] = 0; + + return OSMesaCreateContextAttribs(attribs, sharelist); +} + + +/** + * New in Mesa 11.2 + * + * Create context with attribute list. + */ +GLAPI OSMesaContext GLAPIENTRY +OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist) +{ OSMesaContext osmesa; struct dd_function_table functions; GLint rind, gind, bind, aind; GLint redBits = 0, greenBits = 0, blueBits = 0, alphaBits =0; + GLenum format = OSMESA_RGBA; + GLint depthBits = 0, stencilBits = 0, accumBits = 0; + int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0; + gl_api api_profile = API_OPENGL_COMPAT; + int i; + + for (i = 0; attribList[i]; i += 2) { + switch (attribList[i]) { + case OSMESA_FORMAT: + format = attribList[i+1]; + switch (format) { + case OSMESA_COLOR_INDEX: + case OSMESA_RGBA: + case OSMESA_BGRA: + case OSMESA_ARGB: + case OSMESA_RGB: + case OSMESA_BGR: + case OSMESA_RGB_565: + /* legal */ + break; + default: + return NULL; + } + break; + case OSMESA_DEPTH_BITS: + depthBits = attribList[i+1]; + if (depthBits < 0) + return NULL; + break; + case OSMESA_STENCIL_BITS: + stencilBits = attribList[i+1]; + if (stencilBits < 0) + return NULL; + break; + case OSMESA_ACCUM_BITS: + accumBits = attribList[i+1]; + if (accumBits < 0) + return NULL; + break; + case OSMESA_PROFILE: + profile = attribList[i+1]; + if (profile == OSMESA_COMPAT_PROFILE) + api_profile = API_OPENGL_COMPAT; + else if (profile == OSMESA_CORE_PROFILE) + api_profile = API_OPENGL_CORE; + else + return NULL; + break; + case OSMESA_CONTEXT_MAJOR_VERSION: + version_major = attribList[i+1]; + if (version_major < 1) + return NULL; + break; + case OSMESA_CONTEXT_MINOR_VERSION: + version_minor = attribList[i+1]; + if (version_minor < 0) + return NULL; + break; + case 0: + /* end of list */ + break; + default: + fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n"); + return NULL; + } + } rind = gind = bind = aind = 0; if (format==OSMESA_RGBA) { @@ -742,7 +832,7 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits, functions.UpdateState = osmesa_update_state; if (!_mesa_initialize_context(&osmesa->mesa, - API_OPENGL_COMPAT, + api_profile, osmesa->gl_visual, sharelist ? &sharelist->mesa : (struct gl_context *) NULL, @@ -819,6 +909,13 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits, _mesa_compute_version(ctx); + if (ctx->Version < version_major * 10 + version_minor) { + _mesa_destroy_visual(osmesa->gl_visual); + _mesa_free_context_data(ctx); + free(osmesa); + return NULL; + } + /* Exec table initialization requires the version to be computed */ _mesa_initialize_dispatch_tables(ctx); _mesa_initialize_vbo_vtxfmt(ctx); @@ -1121,6 +1218,7 @@ struct name_function static struct name_function functions[] = { { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext }, { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt }, + { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs }, { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext }, { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent }, { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext }, diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c index 935ba05b7cc..8fcbff6a7a4 100644 --- a/src/mesa/main/atifragshader.c +++ b/src/mesa/main/atifragshader.c @@ -293,7 +293,7 @@ _mesa_DeleteFragmentShaderATI(GLuint id) prog->RefCount--; if (prog->RefCount <= 0) { assert(prog != &DummyShader); - free(prog); + _mesa_delete_ati_fragment_shader(ctx, prog); } } } @@ -345,6 +345,9 @@ _mesa_BeginFragmentShaderATI(void) ctx->ATIFragmentShader.Current->isValid = GL_FALSE; ctx->ATIFragmentShader.Current->swizzlerq = 0; ctx->ATIFragmentShader.Compiling = 1; +#if MESA_DEBUG_ATI_FS + _mesa_debug(ctx, "%s %u\n", __func__, ctx->ATIFragmentShader.Current->Id); +#endif } void GLAPIENTRY diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c index abc553966e9..5729e601c12 100644 --- a/src/mesa/main/blit.c +++ b/src/mesa/main/blit.c @@ -286,8 +286,17 @@ _mesa_blit_framebuffer(struct gl_context *ctx, } /* extra checks for multisample copies... */ if (readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) { - /* color formats must match */ - if (!compatible_resolve_formats(colorReadRb, colorDrawRb)) { + /* color formats must match on GLES. This isn't checked on + * desktop GL because the GL 4.4 spec was changed to allow it. + * In the section entitled “Changes in the released + * Specification of July 22, 2013” it says: + * + * “Relax BlitFramebuffer in section 18.3.1 so that format + * conversion can take place during multisample blits, since + * drivers already allow this and some apps depend on it.” + */ + if (_mesa_is_gles(ctx) && + !compatible_resolve_formats(colorReadRb, colorDrawRb)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(bad src/dst multisample pixel formats)", func); return; diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index fabe9913c11..26dfac1ab90 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2155,8 +2155,6 @@ struct gl_compute_program_state /** * ATI_fragment_shader runtime state */ -#define ATI_FS_INPUT_PRIMARY 0 -#define ATI_FS_INPUT_SECONDARY 1 struct atifs_instruction; struct atifs_setupinst; @@ -4542,11 +4540,22 @@ enum _debug DEBUG_INCOMPLETE_FBO = (1 << 3) }; +/** + * Checks if the active fragment shader program can have side effects due + * to use of things like atomic buffers or images + */ static inline bool -_mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx) +_mesa_active_fragment_shader_has_side_effects(const struct gl_context *ctx) { - return ctx->Shader._CurrentFragmentProgram != NULL && - ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]->NumAtomicBuffers > 0; + const struct gl_shader *sh; + + if (!ctx->_Shader->_CurrentFragmentProgram) + return false; + + sh = ctx->_Shader->_CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]; + return sh->NumAtomicBuffers > 0 || + sh->NumImages > 0 || + sh->NumShaderStorageBlocks > 0; } #ifdef __cplusplus diff --git a/src/mesa/main/performance_monitor.c b/src/mesa/main/performance_monitor.c index 98dfbea083c..43529b2b35a 100644 --- a/src/mesa/main/performance_monitor.c +++ b/src/mesa/main/performance_monitor.c @@ -591,11 +591,10 @@ perf_monitor_result_size(const struct gl_context *ctx, for (group = 0; group < ctx->PerfMonitor.NumGroups; group++) { const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[group]; - for (counter = 0; counter < g->NumCounters; counter++) { - const struct gl_perf_monitor_counter *c = &g->Counters[counter]; + BITSET_WORD tmp; - if (!BITSET_TEST(m->ActiveCounters[group], counter)) - continue; + BITSET_FOREACH_SET(counter, tmp, m->ActiveCounters[group], g->NumCounters) { + const struct gl_perf_monitor_counter *c = &g->Counters[counter]; size += sizeof(uint32_t); /* Group ID */ size += sizeof(uint32_t); /* Counter ID */ diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index ced10a93b1d..e526119db19 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -1373,46 +1373,107 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg, } static bool -validate_io(const struct gl_shader *input_stage, - const struct gl_shader *output_stage, bool isES) +validate_io(const struct gl_shader *producer, + const struct gl_shader *consumer, bool isES) { - assert(input_stage && output_stage); + assert(producer && consumer); + unsigned inputs = 0, outputs = 0; + + /* From OpenGL ES 3.1 spec (Interface matching): + * + * "An output variable is considered to match an input variable in the + * subsequent shader if: + * + * - the two variables match in name, type, and qualification; or + * - the two variables are declared with the same location qualifier and + * match in type and qualification. + * + * ... + * + * At an interface between program objects, the set of inputs and outputs + * are considered to match exactly if and only if: + * + * - Every declared input variable has a matching output, as described + * above. + * + * - There are no user-defined output variables declared without a + * matching input variable declaration. + * + * - All matched input and output variables have identical precision + * qualification. + * + * When the set of inputs and outputs on an interface between programs + * matches exactly, all inputs are well-defined except when the + * corresponding outputs were not written in the previous shader. However, + * any mismatch between inputs and outputs will result in a validation + * failure." + * + * OpenGL Core 4.5 spec includes same paragraph as above but without check + * for precision and the last 'validation failure' clause. Therefore + * behaviour is more relaxed, input and output amount is not required by the + * spec to be validated. + * + * FIXME: Update once Khronos spec bug #15331 is resolved. + * FIXME: Add validation by type, currently information loss during varying + * packing makes this challenging. + */ + + /* Currently no matching done for desktop. */ + if (!isES) + return true; /* For each output in a, find input in b and do any required checks. */ - foreach_in_list(ir_instruction, out, input_stage->ir) { + foreach_in_list(ir_instruction, out, producer->ir) { ir_variable *out_var = out->as_variable(); - if (!out_var || out_var->data.mode != ir_var_shader_out) + if (!out_var || out_var->data.mode != ir_var_shader_out || + is_gl_identifier(out_var->name)) continue; - foreach_in_list(ir_instruction, in, output_stage->ir) { + outputs++; + + inputs = 0; + foreach_in_list(ir_instruction, in, consumer->ir) { ir_variable *in_var = in->as_variable(); - if (!in_var || in_var->data.mode != ir_var_shader_in) + if (!in_var || in_var->data.mode != ir_var_shader_in || + is_gl_identifier(in_var->name)) + continue; + + inputs++; + + /* Match by location qualifier and precision. + * + * FIXME: Add explicit location matching validation here. Be careful + * not to match varyings with explicit locations to varyings without + * explicit locations. + */ + if ((in_var->data.explicit_location && + out_var->data.explicit_location) && + in_var->data.location == out_var->data.location && + in_var->data.precision == out_var->data.precision) continue; - if (strcmp(in_var->name, out_var->name) == 0) { - /* Since we now only validate precision, we can skip this step for - * desktop GLSL shaders, there precision qualifier is ignored. - * - * From OpenGL 4.50 Shading Language spec, section 4.7: - * "For the purposes of determining if an output from one - * shader stage matches an input of the next stage, the - * precision qualifier need not match." + unsigned len = strlen(in_var->name); + + /* Handle input swizzle in variable name. */ + const char *dot = strchr(in_var->name, '.'); + if (dot) + len = dot - in_var->name; + + /* Match by name and precision. */ + if (strncmp(in_var->name, out_var->name, len) == 0) { + /* From OpenGL ES 3.1 spec: + * "When both shaders are in separate programs, mismatched + * precision qualifiers will result in a program interface + * mismatch that will result in program pipeline validation + * failures, as described in section 7.4.1 (“Shader Interface + * Matching”) of the OpenGL ES 3.1 Specification." */ - if (isES) { - /* From OpenGL ES 3.1 spec: - * "When both shaders are in separate programs, mismatched - * precision qualifiers will result in a program interface - * mismatch that will result in program pipeline validation - * failures, as described in section 7.4.1 (“Shader Interface - * Matching”) of the OpenGL ES 3.1 Specification." - */ - if (in_var->data.precision != out_var->data.precision) - return false; - } + if (in_var->data.precision != out_var->data.precision) + return false; } } } - return true; + return inputs == outputs; } /** diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index ac40891f435..e258ad9d1db 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -208,7 +208,7 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type) case GL_TESS_EVALUATION_SHADER: return ctx == NULL || _mesa_has_tessellation(ctx); case GL_COMPUTE_SHADER: - return ctx == NULL || ctx->Extensions.ARB_compute_shader; + return ctx == NULL || _mesa_has_compute_shaders(ctx); default: return false; } @@ -1514,6 +1514,8 @@ void GLAPIENTRY _mesa_LinkProgram(GLhandleARB programObj) { GET_CURRENT_CONTEXT(ctx); + if (MESA_VERBOSE & VERBOSE_API) + _mesa_debug(ctx, "glLinkProgram %u\n", programObj); link_program(ctx, programObj); } @@ -1731,6 +1733,9 @@ _mesa_UseProgram(GLhandleARB program) GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *shProg; + if (MESA_VERBOSE & VERBOSE_API) + _mesa_debug(ctx, "glUseProgram %u\n", program); + if (_mesa_is_xfb_active_and_unpaused(ctx)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glUseProgram(transform feedback active)"); diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index e92bb111cc4..112a73dc0e7 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -433,7 +433,8 @@ compute_version_es1(const struct gl_extensions *extensions) } static GLuint -compute_version_es2(const struct gl_extensions *extensions) +compute_version_es2(const struct gl_extensions *extensions, + const struct gl_constants *consts) { /* OpenGL ES 2.0 is derived from OpenGL 2.0 */ const bool ver_2_0 = (extensions->ARB_texture_cube_map && @@ -464,9 +465,11 @@ compute_version_es2(const struct gl_extensions *extensions) extensions->EXT_texture_snorm && extensions->NV_primitive_restart && extensions->OES_depth_texture_cube_map); + const bool es31_compute_shader = + consts->MaxComputeWorkGroupInvocations >= 128; const bool ver_3_1 = (ver_3_0 && extensions->ARB_arrays_of_arrays && - extensions->ARB_compute_shader && + es31_compute_shader && extensions->ARB_draw_indirect && extensions->ARB_explicit_uniform_location && extensions->ARB_framebuffer_no_attachments && @@ -508,7 +511,7 @@ _mesa_get_version(const struct gl_extensions *extensions, case API_OPENGLES: return compute_version_es1(extensions); case API_OPENGLES2: - return compute_version_es2(extensions); + return compute_version_es2(extensions, consts); } return 0; } diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index c5d8c483429..c6f9ef68418 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -1114,7 +1114,13 @@ ir_to_mesa_visitor::visit(ir_expression *ir) if (ir->operands[0]->type->is_vector() || ir->operands[1]->type->is_vector()) { src_reg temp = get_temp(glsl_type::vec4_type); - emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]); + if (ir->operands[0]->type->is_boolean() && + ir->operands[1]->as_constant() && + ir->operands[1]->as_constant()->is_zero()) { + temp = op[0]; + } else { + emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]); + } /* After the dot-product, the value will be an integer on the * range [0,4]. Zero stays zero, and positive values become 1.0. @@ -1140,32 +1146,6 @@ ir_to_mesa_visitor::visit(ir_expression *ir) } break; - case ir_unop_any: { - assert(ir->operands[0]->type->is_vector()); - - /* After the dot-product, the value will be an integer on the - * range [0,4]. Zero stays zero, and positive values become 1.0. - */ - ir_to_mesa_instruction *const dp = - emit_dp(ir, result_dst, op[0], op[0], - ir->operands[0]->type->vector_elements); - if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) { - /* The clamping to [0,1] can be done for free in the fragment - * shader with a saturate. - */ - dp->saturate = true; - } else { - /* Negating the result of the dot-product gives values on the range - * [-4, 0]. Zero stays zero, and negative values become 1.0. This - * is achieved using SLT. - */ - src_reg slt_src = result_src; - slt_src.negate = ~slt_src.negate; - emit(ir, OPCODE_SLT, result_dst, slt_src, src_reg_for_float(0.0)); - } - break; - } - case ir_binop_logic_xor: emit(ir, OPCODE_SNE, result_dst, op[0], op[1]); break; diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c index bdb335e4ba3..12490d0c380 100644 --- a/src/mesa/program/prog_statevars.c +++ b/src/mesa/program/prog_statevars.c @@ -474,7 +474,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[], * single MAD. * linear: fogcoord * -1/(end-start) + end/(end-start) * exp: 2^-(density/ln(2) * fogcoord) - * exp2: 2^-((density/(ln(2)^2) * fogcoord)^2) + * exp2: 2^-((density/(sqrt(ln(2))) * fogcoord)^2) */ value[0] = (ctx->Fog.End == ctx->Fog.Start) ? 1.0f : (GLfloat)(-1.0F / (ctx->Fog.End - ctx->Fog.Start)); diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index d5386ee70e8..8ca830a0b14 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -554,8 +554,8 @@ static void ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src) { nir_ssa_def *cmp = b->shader->options->native_integers ? - nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))) : - nir_fany4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0))); + nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) : + nir_fany_nequal4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_float(b, 0.0)); nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); @@ -928,6 +928,7 @@ ptn_add_output_stores(struct ptn_compile *c) nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var); store->num_components = glsl_get_vector_elements(var->type); + store->const_index[0] = (1 << store->num_components) - 1; store->variables[0] = nir_deref_var_create(store, c->output_vars[var->data.location]); @@ -998,6 +999,7 @@ setup_registers_and_variables(struct ptn_compile *c) nir_intrinsic_instr *store = nir_intrinsic_instr_create(shader, nir_intrinsic_store_var); store->num_components = 4; + store->const_index[0] = WRITEMASK_XYZW; store->variables[0] = nir_deref_var_create(store, fullvar); store->src[0] = nir_src_for_ssa(f001); nir_builder_instr_insert(b, &store->instr); diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c index 4252c27962e..94231cf1946 100644 --- a/src/mesa/state_tracker/st_atom_sampler.c +++ b/src/mesa/state_tracker/st_atom_sampler.c @@ -250,7 +250,7 @@ update_shader_samplers(struct st_context *st, samplers_used = prog->SamplersUsed; if (*num_samplers == 0 && samplers_used == 0x0) - return; + return; *num_samplers = 0; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index b6a0e6b4f4f..89ad6cd8c28 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -1776,89 +1776,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) } break; - case ir_unop_any: { - assert(ir->operands[0]->type->is_vector()); - - if (native_integers) { - int dst_swizzle = 0, op0_swizzle, i; - st_src_reg accum = op[0]; - - op0_swizzle = op[0].swizzle; - accum.swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 0), - GET_SWZ(op0_swizzle, 0), - GET_SWZ(op0_swizzle, 0), - GET_SWZ(op0_swizzle, 0)); - for (i = 0; i < 4; i++) { - if (result_dst.writemask & (1 << i)) { - dst_swizzle = MAKE_SWIZZLE4(i, i, i, i); - break; - } - } - assert(i != 4); - assert(ir->operands[0]->type->is_boolean()); - - /* OR all the components together, since they should be either 0 or ~0 - */ - switch (ir->operands[0]->type->vector_elements) { - case 4: - op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 3), - GET_SWZ(op0_swizzle, 3), - GET_SWZ(op0_swizzle, 3), - GET_SWZ(op0_swizzle, 3)); - emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]); - accum = st_src_reg(result_dst); - accum.swizzle = dst_swizzle; - /* fallthrough */ - case 3: - op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 2), - GET_SWZ(op0_swizzle, 2), - GET_SWZ(op0_swizzle, 2), - GET_SWZ(op0_swizzle, 2)); - emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]); - accum = st_src_reg(result_dst); - accum.swizzle = dst_swizzle; - /* fallthrough */ - case 2: - op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 1), - GET_SWZ(op0_swizzle, 1), - GET_SWZ(op0_swizzle, 1), - GET_SWZ(op0_swizzle, 1)); - emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]); - break; - default: - assert(!"Unexpected vector size"); - break; - } - } else { - /* After the dot-product, the value will be an integer on the - * range [0,4]. Zero stays zero, and positive values become 1.0. - */ - glsl_to_tgsi_instruction *const dp = - emit_dp(ir, result_dst, op[0], op[0], - ir->operands[0]->type->vector_elements); - if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB && - result_dst.type == GLSL_TYPE_FLOAT) { - /* The clamping to [0,1] can be done for free in the fragment - * shader with a saturate. - */ - dp->saturate = true; - } else if (result_dst.type == GLSL_TYPE_FLOAT) { - /* Negating the result of the dot-product gives values on the range - * [-4, 0]. Zero stays zero, and negative values become 1.0. This - * is achieved using SLT. - */ - st_src_reg slt_src = result_src; - slt_src.negate = ~slt_src.negate; - emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0)); - } - else { - /* Use SNE 0 if integers are being used as boolean values. */ - emit_asm(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0)); - } - } - break; - } - case ir_binop_logic_xor: if (native_integers) emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]); diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c index 2974deed41b..414a4144e25 100644 --- a/src/mesa/swrast/s_atifragshader.c +++ b/src/mesa/swrast/s_atifragshader.c @@ -26,6 +26,8 @@ #include "swrast/s_atifragshader.h" #include "swrast/s_context.h" +#define ATI_FS_INPUT_PRIMARY 0 +#define ATI_FS_INPUT_SECONDARY 1 /** * State for executing ATI fragment shader. |