diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/intel/vulkan/anv_pipeline.c | 41 | ||||
-rw-r--r-- | src/intel/vulkan/anv_private.h | 5 | ||||
-rw-r--r-- | src/intel/vulkan/gen7_pipeline.c | 12 | ||||
-rw-r--r-- | src/intel/vulkan/gen8_pipeline.c | 12 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_compiler.h | 12 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 52 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm_state.c | 31 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen6_wm_state.c | 63 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen7_wm_state.c | 35 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen8_ps_state.c | 37 |
11 files changed, 110 insertions, 192 deletions
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index f55069ee747..a8e31b13cf1 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -585,17 +585,17 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline, const struct brw_stage_prog_data *stage_prog_data; struct anv_pipeline_bind_map map; struct brw_wm_prog_key key; - uint32_t kernel = NO_KERNEL; unsigned char sha1[20]; populate_wm_prog_key(&pipeline->device->info, info, extra, &key); if (module->size > 0) { anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info); - kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map); + pipeline->ps_ksp0 = + anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map); } - if (kernel == NO_KERNEL) { + if (pipeline->ps_ksp0 == NO_KERNEL) { struct brw_wm_prog_data prog_data = { 0, }; struct anv_pipeline_binding surface_to_descriptor[256]; struct anv_pipeline_binding sampler_to_descriptor[256]; @@ -682,43 +682,16 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline, } stage_prog_data = &prog_data.base; - kernel = anv_pipeline_cache_upload_kernel(cache, - module->size > 0 ? sha1 : NULL, - shader_code, code_size, + pipeline->ps_ksp0 = + anv_pipeline_cache_upload_kernel(cache, + module->size > 0 ? 
sha1 : NULL, + shader_code, code_size, &stage_prog_data, sizeof(prog_data), &map); ralloc_free(mem_ctx); } - const struct brw_wm_prog_data *wm_prog_data = - (const struct brw_wm_prog_data *) stage_prog_data; - - if (wm_prog_data->no_8) - pipeline->ps_simd8 = NO_KERNEL; - else - pipeline->ps_simd8 = kernel; - - if (wm_prog_data->no_8 || wm_prog_data->prog_offset_16) { - pipeline->ps_simd16 = kernel + wm_prog_data->prog_offset_16; - } else { - pipeline->ps_simd16 = NO_KERNEL; - } - - pipeline->ps_ksp2 = 0; - pipeline->ps_grf_start2 = 0; - if (pipeline->ps_simd8 != NO_KERNEL) { - pipeline->ps_ksp0 = pipeline->ps_simd8; - pipeline->ps_grf_start0 = wm_prog_data->base.dispatch_grf_start_reg; - if (pipeline->ps_simd16 != NO_KERNEL) { - pipeline->ps_ksp2 = pipeline->ps_simd16; - pipeline->ps_grf_start2 = wm_prog_data->dispatch_grf_start_reg_16; - } - } else if (pipeline->ps_simd16 != NO_KERNEL) { - pipeline->ps_ksp0 = pipeline->ps_simd16; - pipeline->ps_grf_start0 = wm_prog_data->dispatch_grf_start_reg_16; - } - anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT, stage_prog_data, &map); diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index d8a21942d83..c55f1db5180 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1418,12 +1418,7 @@ struct anv_pipeline { struct anv_state blend_state; uint32_t vs_simd8; uint32_t vs_vec4; - uint32_t ps_simd8; - uint32_t ps_simd16; uint32_t ps_ksp0; - uint32_t ps_ksp2; - uint32_t ps_grf_start0; - uint32_t ps_grf_start2; uint32_t gs_kernel; uint32_t cs_simd; diff --git a/src/intel/vulkan/gen7_pipeline.c b/src/intel/vulkan/gen7_pipeline.c index d4797c59977..285b191352c 100644 --- a/src/intel/vulkan/gen7_pipeline.c +++ b/src/intel/vulkan/gen7_pipeline.c @@ -375,19 +375,21 @@ genX(graphics_pipeline_create)( POSOFFSET_SAMPLE : POSOFFSET_NONE; ps._32PixelDispatchEnable = false; - ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL; - ps._8PixelDispatchEnable = 
pipeline->ps_simd8 != NO_KERNEL; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; - ps.DispatchGRFStartRegisterforConstantSetupData0 = pipeline->ps_grf_start0, + ps.DispatchGRFStartRegisterforConstantSetupData0 = + wm_prog_data->base.dispatch_grf_start_reg, ps.DispatchGRFStartRegisterforConstantSetupData1 = 0, - ps.DispatchGRFStartRegisterforConstantSetupData2 = pipeline->ps_grf_start2, + ps.DispatchGRFStartRegisterforConstantSetupData2 = + wm_prog_data->dispatch_grf_start_reg_2, /* Haswell requires the sample mask to be set in this packet as well as * in 3DSTATE_SAMPLE_MASK; the values should match. */ /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ ps.KernelStartPointer1 = 0; - ps.KernelStartPointer2 = pipeline->ps_ksp2; + ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2; } /* FIXME-GEN7: This needs a lot more work, cf gen7 upload_wm_state(). */ diff --git a/src/intel/vulkan/gen8_pipeline.c b/src/intel/vulkan/gen8_pipeline.c index 857f9798111..d96669494a2 100644 --- a/src/intel/vulkan/gen8_pipeline.c +++ b/src/intel/vulkan/gen8_pipeline.c @@ -502,9 +502,9 @@ genX(graphics_pipeline_create)( anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) { ps.KernelStartPointer0 = pipeline->ps_ksp0; ps.KernelStartPointer1 = 0; - ps.KernelStartPointer2 = pipeline->ps_ksp2; - ps._8PixelDispatchEnable = pipeline->ps_simd8 != NO_KERNEL; - ps._16PixelDispatchEnable = pipeline->ps_simd16 != NO_KERNEL; + ps.KernelStartPointer2 = pipeline->ps_ksp0 + wm_prog_data->prog_offset_2; + ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; + ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; ps._32PixelDispatchEnable = false; ps.SingleProgramFlow = false; ps.VectorMaskEnable = true; @@ -518,9 +518,11 @@ genX(graphics_pipeline_create)( ps.ScratchSpaceBasePointer = pipeline->scratch_start[MESA_SHADER_FRAGMENT]; ps.PerThreadScratchSpace = scratch_space(&wm_prog_data->base); - 
ps.DispatchGRFStartRegisterForConstantSetupData0 = pipeline->ps_grf_start0; + ps.DispatchGRFStartRegisterForConstantSetupData0 = + wm_prog_data->base.dispatch_grf_start_reg; ps.DispatchGRFStartRegisterForConstantSetupData1 = 0; - ps.DispatchGRFStartRegisterForConstantSetupData2 = pipeline->ps_grf_start2; + ps.DispatchGRFStartRegisterForConstantSetupData2 = + wm_prog_data->dispatch_grf_start_reg_2; } bool per_sample_ps = pCreateInfo->pMultisampleState && diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 3fcd7e87c4e..a2148ae2656 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -367,9 +367,11 @@ struct brw_wm_prog_data { GLuint num_varying_inputs; - GLuint dispatch_grf_start_reg_16; - GLuint reg_blocks; - GLuint reg_blocks_16; + uint8_t reg_blocks_0; + uint8_t reg_blocks_2; + + uint8_t dispatch_grf_start_reg_2; + uint32_t prog_offset_2; struct { /** @{ @@ -383,7 +385,8 @@ struct brw_wm_prog_data { bool computed_stencil; bool early_fragment_tests; - bool no_8; + bool dispatch_8; + bool dispatch_16; bool dual_src_blend; bool persample_dispatch; bool uses_pos_offset; @@ -393,7 +396,6 @@ struct brw_wm_prog_data { bool uses_src_w; bool uses_sample_mask; bool pulls_bary; - uint32_t prog_offset_16; /** * Mask of which interpolation modes are required by the fragment shader. 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f66ba473411..1e84b101a8e 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5800,11 +5800,6 @@ fs_visitor::run_fs(bool do_rep_send) return false; } - if (dispatch_width == 8) - wm_prog_data->reg_blocks = brw_register_blocks(grf_used); - else - wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used); - return !failed; } @@ -6004,6 +5999,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader); cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL; + uint8_t simd8_grf_start, simd16_grf_start; + unsigned simd8_grf_used, simd16_grf_used; fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 8, @@ -6015,7 +6012,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, return NULL; } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { simd8_cfg = v8.cfg; - prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + simd8_grf_start = v8.payload.num_regs; + simd8_grf_used = v8.grf_used; } if (!v8.simd16_unsupported && @@ -6031,7 +6029,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, v16.fail_msg); } else { simd16_cfg = v16.cfg; - prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + simd16_grf_start = v16.payload.num_regs; + simd16_grf_used = v16.grf_used; } } @@ -6047,6 +6046,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (compiler->devinfo->gen < 5 && simd16_cfg) simd8_cfg = NULL; + if (prog_data->persample_dispatch) { + /* Starting with SandyBridge (where we first get MSAA), the different + * pixel dispatch combinations are grouped into classifications A + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * generations, the only configurations supporting persample dispatch + * are those in which only one dispatch width is enabled. 
+ * + * If computed depth is enabled, SNB only allows SIMD8 while IVB+ + * allow SIMD8 or SIMD16 so we choose SIMD16 if available. + */ + if (compiler->devinfo->gen == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) { + simd16_cfg = NULL; + } else if (simd16_cfg) { + simd8_cfg = NULL; + } + } + /* We have to compute the flat inputs after the visitor is finished running * because it relies on prog_data->urb_setup which is computed in * fs_visitor::calculate_urb_setup(). @@ -6065,15 +6082,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd8_cfg) { + prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8); - prog_data->no_8 = false; - } else { - prog_data->no_8 = true; + prog_data->base.dispatch_grf_start_reg = simd8_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used); + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16); + prog_data->dispatch_grf_start_reg_2 = simd16_grf_start; + prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used); + } + } else if (simd16_cfg) { + prog_data->dispatch_16 = true; + g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd16_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used); } - if (simd16_cfg) - prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); - return g.get_assembly(final_assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 58faf2f4694..012492c0e0d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -169,7 +169,7 @@ fs_visitor::emit_dummy_fs() stage_prog_data->nr_pull_params = 0; stage_prog_data->curb_read_length = 0; stage_prog_data->dispatch_grf_start_reg = 2; - wm_prog_data->dispatch_grf_start_reg_16 = 2; + wm_prog_data->dispatch_grf_start_reg_2 = 2; grf_used = 1; /* Gen4-5 don't allow zero GRF 
blocks */ calculate_cfg(); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 91b35cd681a..bf1bdc9948f 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -86,48 +86,37 @@ brw_upload_wm_unit(struct brw_context *brw) sizeof(*wm), 32, &brw->wm.base.state_offset); memset(wm, 0, sizeof(*wm)); - if (prog_data->prog_offset_16) { + if (prog_data->dispatch_8 && prog_data->dispatch_16) { /* These two fields should be the same pre-gen6, which is why we * only have one hardware field to program for both dispatch * widths. */ assert(prog_data->base.dispatch_grf_start_reg == - prog_data->dispatch_grf_start_reg_16); + prog_data->dispatch_grf_start_reg_2); } /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */ - if (prog_data->no_8) { - wm->wm5.enable_16_pix = 1; - wm->thread0.grf_reg_count = prog_data->reg_blocks_16; - wm->thread0.kernel_start_pointer = - brw_program_reloc(brw, - brw->wm.base.state_offset + - offsetof(struct brw_wm_unit_state, thread0), - brw->wm.base.prog_offset + - prog_data->prog_offset_16 + - (prog_data->reg_blocks_16 << 1)) >> 6; - - } else { - wm->thread0.grf_reg_count = prog_data->reg_blocks; - wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16; - - wm->wm5.enable_8_pix = 1; - if (prog_data->prog_offset_16) - wm->wm5.enable_16_pix = 1; + wm->wm5.enable_8_pix = prog_data->dispatch_8; + wm->wm5.enable_16_pix = prog_data->dispatch_16; + if (prog_data->dispatch_8 || prog_data->dispatch_16) { + wm->thread0.grf_reg_count = prog_data->reg_blocks_0; wm->thread0.kernel_start_pointer = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, thread0), brw->wm.base.prog_offset + (wm->thread0.grf_reg_count << 1)) >> 6; + } + if (prog_data->prog_offset_2) { + wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2; wm->wm9.kernel_start_pointer_2 = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, wm9), 
brw->wm.base.prog_offset + - prog_data->prog_offset_16 + + prog_data->prog_offset_2 + (wm->wm9.grf_reg_count_2 << 1)) >> 6; } diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 4a5aa129d41..3e872af4894 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -129,29 +129,19 @@ gen6_upload_wm_state(struct brw_context *brw, dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw5 |= GEN6_WM_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw5 |= GEN6_WM_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw4 |= prog_data->base.dispatch_grf_start_reg << + GEN6_WM_DISPATCH_START_GRF_SHIFT_0; + dw4 |= prog_data->dispatch_grf_start_reg_2 << + GEN6_WM_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; if (dual_source_blend_enable) dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE; @@ -200,37 +190,6 @@ gen6_upload_wm_state(struct brw_context *brw, dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE; else { dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL; - - /* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping - * (Dispatch Size) Control"), p.334: - 
* - * Note: in the table below, the Valid column indicates which - * products that combination is supported on. Combinations of - * dispatch enables not listed in the table are not available on - * any product. - * - * A: Valid on all products - * - * B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * D: Valid on all products, except when in non-1x PERSAMPLE mode - * (applies to [DevSNB+] only). Not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * F: Valid on all products, except not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * In the table that follows, the only entry with "A" in the Valid - * column is the entry where only 8 pixel dispatch is enabled. - * Therefore, when we are in PERPIXEL mode with pixel shader computed - * depth, we need to disable SIMD16 dispatch. - */ - if (dw5 & GEN6_WM_COMPUTED_DEPTH) - dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE; } } else { dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 8d2e2c32bb4..a618c3ed87b 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -216,34 +216,19 @@ gen7_upload_ps_state(struct brw_context *brw, dw4 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_16) dw4 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. 
- */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { + if (prog_data->dispatch_8) dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + + dw5 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw5 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2)); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index b677a8e1793..c475a52afe0 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -234,34 +234,19 @@ gen8_upload_ps_state(struct brw_context *brw, dw6 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. 
- */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } else { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw7 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw7 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); |