diff options
Diffstat (limited to 'src/gallium/frontends/clover/core/kernel.cpp')
-rw-r--r-- | src/gallium/frontends/clover/core/kernel.cpp | 610 |
1 file changed, 610 insertions, 0 deletions
diff --git a/src/gallium/frontends/clover/core/kernel.cpp b/src/gallium/frontends/clover/core/kernel.cpp new file mode 100644 index 00000000000..7d839767aa0 --- /dev/null +++ b/src/gallium/frontends/clover/core/kernel.cpp @@ -0,0 +1,610 @@ +// +// Copyright 2012 Francisco Jerez +// +// Permission is hereby granted, free of charge, to any person obtaining a +// copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
//

#include "core/kernel.hpp"
#include "core/resource.hpp"
#include "util/factor.hpp"
#include "util/u_math.h"
#include "pipe/p_context.h"

using namespace clover;

// Construct a kernel for \a prog.  Only arguments with "general" semantics
// become user-settable clover arguments; implicit arguments (grid offset,
// image metadata, etc.) are materialized later in exec_context::bind().
kernel::kernel(clover::program &prog, const std::string &name,
               const std::vector<module::argument> &margs) :
   program(prog), _name(name), exec(*this),
   program_ref(prog._kernel_ref_counter) {
   for (auto &marg : margs) {
      if (marg.semantic == module::argument::general)
         _args.emplace_back(argument::create(marg));
   }
}

// Widen \a v to the device's maximum grid dimensionality
// (max_block_size().size()), filling any missing components with \a x.
template<typename V>
static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   std::vector<uint> w { v.begin(), v.end() };
   w.resize(q.device().max_block_size().size(), x);
   return w;
}

// Execute this kernel on queue \a q: bind the argument/compute state,
// program the pipe context, fire launch_grid(), then unbind everything and
// emit a global-buffer memory barrier.  \a grid_size is in work-items and
// is divided by \a block_size to obtain the block count passed to the pipe
// driver.
void
kernel::launch(command_queue &q,
               const std::vector<size_t> &grid_offset,
               const std::vector<size_t> &grid_size,
               const std::vector<size_t> &block_size) {
   const auto m = program().build(q.device()).binary;
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.  Each handle
   // is a pointer into exec.input where the driver writes the final buffer
   // address during set_global_binding().
   std::vector<uint32_t *> g_handles = map([&](size_t h) {
         return (uint32_t *)&exec.input[h];
      }, exec.g_handles);

   q.pipe->bind_compute_state(q.pipe, st);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
                               0, exec.samplers.size(),
                               exec.samplers.data());

   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), exec.sviews.data());
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
                                 exec.resources.data());
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
                              exec.g_buffers.data(), g_handles.data());

   // Fill information for the launch_grid() call.
   info.work_dim = grid_size.size();
   copy(pad_vector(q, block_size, 1), info.block);
   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
   info.pc = find(name_equals(_name), m.syms).offset;
   info.input = exec.input.data();

   q.pipe->launch_grid(q.pipe, &info);

   // Undo the state setup above, in reverse order.
   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
                             exec.sviews.size(), NULL);
   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
                               exec.samplers.size(), NULL);

   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
   exec.unbind();
}

// Total __local storage (in bytes) requested via local arguments.
size_t
kernel::mem_local() const {
   size_t sz = 0;

   for (auto &arg : args()) {
      if (dynamic_cast<local_argument *>(&arg))
         sz += arg.storage();
   }

   return sz;
}

// Private memory usage is not tracked here; always reports zero.
size_t
kernel::mem_private() const {
   return 0;
}

const std::string &
kernel::name() const {
   return _name;
}

// Pick a block size for \a grid_size by factoring the grid under the
// device's per-block limits.
std::vector<size_t>
kernel::optimal_block_size(const command_queue &q,
                           const std::vector<size_t> &grid_size) const {
   return factor::find_grid_optimal_factor<size_t>(
      q.device().max_threads_per_block(), q.device().max_block_size(),
      grid_size);
}

// No compile-time required block size is reported ({0,0,0} means "none").
std::vector<size_t>
kernel::required_block_size() const {
   return { 0, 0, 0 };
}

kernel::argument_range
kernel::args() {
   return map(derefs(), _args);
}

kernel::const_argument_range
kernel::args() const {
   return map(derefs(), _args);
}

// Binary module for this kernel as built for \a q's device.
const module &
kernel::module(const command_queue &q) const {
   return program().build(q.device()).binary;
}

kernel::exec_context::exec_context(kernel &kern) :
   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
}

kernel::exec_context::~exec_context() {
   if (st)
      q->pipe->delete_compute_state(q->pipe, st);
}

// Serialize all kernel arguments into the input buffer and (re)build the
// compute state if required.  Returns the pipe compute state handle.
// Note the swap below: on return from std::swap, this->q is the new queue
// and the local _q holds the previously bound queue (possibly NULL), which
// is what the "did the queue change?" test and the delete_compute_state
// call further down rely on.
void *
kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
                           const std::vector<size_t> &grid_offset) {
   std::swap(q, _q);

   // Bind kernel arguments.
   auto &m = kern.program().build(q->device()).binary;
   auto msym = find(name_equals(kern.name()), m.syms);
   auto margs = msym.args;
   auto msec = find(id_equals(msym.section), m.secs);
   auto explicit_arg = kern._args.begin();

   for (auto &marg : margs) {
      switch (marg.semantic) {
      case module::argument::general:
         // User-provided argument: consume the next explicit argument slot.
         (*(explicit_arg++))->bind(*this, marg);
         break;

      case module::argument::grid_dimension: {
         // Implicit argument: number of grid dimensions.
         const cl_uint dimension = grid_offset.size();
         auto arg = argument::create(marg);

         arg->set(sizeof(dimension), &dimension);
         arg->bind(*this, marg);
         break;
      }
      case module::argument::grid_offset: {
         // Implicit argument: one scalar per padded grid-offset component.
         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_size: {
         // Implicit width/height/depth of the image argument bound just
         // before this one (explicit_arg - 1) — relies on the module
         // emitting the metadata argument immediately after the image.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         std::vector<cl_uint> image_size{
               static_cast<cl_uint>(img->width()),
               static_cast<cl_uint>(img->height()),
               static_cast<cl_uint>(img->depth())};
         for (auto x : image_size) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      case module::argument::image_format: {
         // Implicit channel data type and order of the preceding image
         // argument, same ordering assumption as image_size above.
         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
         cl_image_format fmt = img->format();
         std::vector<cl_uint> image_format{
               static_cast<cl_uint>(fmt.image_channel_data_type),
               static_cast<cl_uint>(fmt.image_channel_order)};
         for (auto x : image_format) {
            auto arg = argument::create(marg);

            arg->set(sizeof(x), &x);
            arg->bind(*this, marg);
         }
         break;
      }
      }
   }

   // Create a new compute state if anything changed: no cached state yet,
   // the queue changed (q vs. the pre-swap queue in _q), or the local/input
   // memory requirements differ from the cached pipe_compute_state.
   if (!st || q != _q ||
       cs.req_local_mem != mem_local ||
       cs.req_input_mem != input.size()) {
      if (st)
         _q->pipe->delete_compute_state(_q->pipe, st);

      cs.ir_type = q->device().ir_format();
      cs.prog = &(msec.data[0]);
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
      st = q->pipe->create_compute_state(q->pipe, &cs);
      if (!st) {
         unbind(); // Cleanup
         throw error(CL_OUT_OF_RESOURCES);
      }
   }

   return st;
}

// Release per-launch state: let each argument undo its binding, then reset
// the input buffer and all pipe object lists.
void
kernel::exec_context::unbind() {
   for (auto &arg : kern.args())
      arg.unbind(*this);

   input.clear();
   samplers.clear();
   sviews.clear();
   resources.clear();
   g_buffers.clear();
   g_handles.clear();
   mem_local = 0;
}

namespace {
   ///
   /// Return the raw bytes of \a x in native byte order.
   ///
   template<typename T>
   std::vector<uint8_t>
   bytes(const T& x) {
      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
   }

   ///
   /// Transform buffer \a v from the native byte order into the byte
   /// order specified by \a e.
   ///
   template<typename T>
   void
   byteswap(T &v, pipe_endian e) {
      if (PIPE_ENDIAN_NATIVE != e)
         std::reverse(v.begin(), v.end());
   }

   ///
   /// Pad buffer \a v to the next multiple of \a n.
   ///
   template<typename T>
   void
   align(T &v, size_t n) {
      v.resize(util_align_npot(v.size(), n));
   }

   /// Most significant bit of the native-endian integer stored in \a s,
   /// i.e. its sign bit when interpreted as signed.
   bool
   msb(const std::vector<uint8_t> &s) {
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         return s.back() & 0x80;
      else
         return s.front() & 0x80;
   }

   ///
   /// Resize buffer \a v to size \a n using sign or zero extension
   /// according to \a ext.
   ///
   template<typename T>
   void
   extend(T &v, enum module::argument::ext_type ext, size_t n) {
      const size_t m = std::min(v.size(), n);
      const bool sign_ext = (ext == module::argument::sign_ext);
      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
      T w(n, fill);

      // Keep the value-carrying bytes at the correct end for the native
      // byte order.
      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
         std::copy_n(v.begin(), m, w.begin());
      else
         std::copy_n(v.end() - m, m, w.end() - m);

      std::swap(v, w);
   }

   ///
   /// Append buffer \a w to \a v.
   ///
   template<typename T>
   void
   insert(T &v, const T &w) {
      v.insert(v.end(), w.begin(), w.end());
   }

   ///
   /// Append \a n elements to the end of buffer \a v.  Returns the offset
   /// of the newly appended region.
   ///
   template<typename T>
   size_t
   allocate(T &v, size_t n) {
      size_t pos = v.size();
      v.resize(pos + n);
      return pos;
   }
}

// Factory: map a module argument type to the matching clover argument
// implementation.  Throws for argument types this frontend doesn't handle.
std::unique_ptr<kernel::argument>
kernel::argument::create(const module::argument &marg) {
   switch (marg.type) {
   case module::argument::scalar:
      return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));

   case module::argument::global:
      return std::unique_ptr<kernel::argument>(new global_argument);

   case module::argument::local:
      return std::unique_ptr<kernel::argument>(new local_argument);

   case module::argument::constant:
      return std::unique_ptr<kernel::argument>(new constant_argument);

   case module::argument::image2d_rd:
   case module::argument::image3d_rd:
      return std::unique_ptr<kernel::argument>(new image_rd_argument);

   case module::argument::image2d_wr:
   case module::argument::image3d_wr:
      return std::unique_ptr<kernel::argument>(new image_wr_argument);

   case module::argument::sampler:
      return std::unique_ptr<kernel::argument>(new sampler_argument);

   }
   throw error(CL_INVALID_KERNEL_DEFINITION);
}

kernel::argument::argument() : _set(false) {
}

// Whether a value has been provided for this argument yet.
bool
kernel::argument::set() const {
   return _set;
}

// Local-memory storage needed by this argument; zero for everything except
// local_argument, which overrides this.
size_t
kernel::argument::storage() const {
   return 0;
}

kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
}

// Store a scalar value.  Error codes follow clSetKernelArg(): NULL value or
// a size mismatch against the declared argument size is rejected.
void
kernel::scalar_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != this->size)
      throw error(CL_INVALID_ARG_SIZE);

   v = { (uint8_t *)value, (uint8_t *)value + size };
   _set = true;
}

// Append the scalar to the input buffer, extended to the target size,
// converted to the device byte order and aligned as required.
void
kernel::scalar_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   auto w = v;

   extend(w, marg.ext_type, marg.target_size);
   byteswap(w, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, w);
}

void
kernel::scalar_argument::unbind(exec_context &ctx) {
}

// Store a cl_mem buffer (NULL is allowed and means a null pointer
// argument).  Setting a buffer clears any previously set SVM pointer.
void
kernel::global_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   svm = nullptr;
   _set = true;
}

// Store an SVM pointer instead of a buffer object.
void
kernel::global_argument::set_svm(const void *value) {
   svm = value;
   buf = nullptr;
   _set = true;
}

// Append the buffer address placeholder to the input buffer.  For buffer
// objects the input offset is recorded in g_handles so launch() can hand
// the location to set_global_binding() for the driver to patch.
void
kernel::global_argument::bind(exec_context &ctx,
                              const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      const resource &r = buf->resource(*ctx.q);
      ctx.g_handles.push_back(ctx.input.size());
      ctx.g_buffers.push_back(r.pipe);

      // How to handle multi-dimensional offsets?
      // We don't need to.  Buffer offsets are always
      // one-dimensional.
      auto v = bytes(r.offset[0]);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else if (svm) {
      // SVM pointers are passed through verbatim.
      auto v = bytes(svm);
      extend(v, marg.ext_type, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::global_argument::unbind(exec_context &ctx) {
}

size_t
kernel::local_argument::storage() const {
   return _storage;
}

// Local arguments carry only a size: per clSetKernelArg() the value must be
// NULL and the size non-zero.
void
kernel::local_argument::set(size_t size, const void *value) {
   if (value)
      throw error(CL_INVALID_ARG_VALUE);

   if (!size)
      throw error(CL_INVALID_ARG_SIZE);

   _storage = size;
   _set = true;
}

// Append the current local-memory offset as the argument value, then
// advance the running local-memory allocation by this argument's size.
void
kernel::local_argument::bind(exec_context &ctx,
                             const module::argument &marg) {
   auto v = bytes(ctx.mem_local);

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   ctx.mem_local += _storage;
}

void
kernel::local_argument::unbind(exec_context &ctx) {
}

// Store a cl_mem buffer for a __constant argument (NULL allowed).
void
kernel::constant_argument::set(size_t size, const void *value) {
   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
   _set = true;
}

// Bind the buffer as a compute resource and encode a handle into the input
// buffer: the resource index goes in the top byte (<< 24), the buffer
// offset in the low bits.
void
kernel::constant_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   align(ctx.input, marg.target_align);

   if (buf) {
      resource &r = buf->resource(*ctx.q);
      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);

      extend(v, module::argument::zero_ext, marg.target_size);
      byteswap(v, ctx.q->device().endianness());
      insert(ctx.input, v);

      st = r.bind_surface(*ctx.q, false);
      ctx.resources.push_back(st);
   } else {
      // Null pointer.
      allocate(ctx.input, marg.target_size);
   }
}

void
kernel::constant_argument::unbind(exec_context &ctx) {
   if (buf)
      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

// Store a cl_mem image for a read-only image argument.  Unlike buffers,
// NULL is rejected here.
void
kernel::image_rd_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

// Read-only images are bound as sampler views; the argument value written
// to the input buffer is the view's index in ctx.sviews.
void
kernel::image_rd_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   auto v = bytes(ctx.sviews.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
   ctx.sviews.push_back(st);
}

void
kernel::image_rd_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
}

// Store a cl_mem image for a write-only image argument (NULL rejected).
void
kernel::image_wr_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_ARG_VALUE);

   if (size != sizeof(cl_mem))
      throw error(CL_INVALID_ARG_SIZE);

   img = &obj<image>(*(cl_mem *)value);
   _set = true;
}

// Write-only images are bound as writable surfaces in ctx.resources; the
// argument value is the surface's index in that list.
void
kernel::image_wr_argument::bind(exec_context &ctx,
                                const module::argument &marg) {
   auto v = bytes(ctx.resources.size());

   extend(v, module::argument::zero_ext, marg.target_size);
   byteswap(v, ctx.q->device().endianness());
   align(ctx.input, marg.target_align);
   insert(ctx.input, v);

   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
   ctx.resources.push_back(st);
}

void
kernel::image_wr_argument::unbind(exec_context &ctx) {
   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
}

// Store a cl_sampler (NULL rejected with CL_INVALID_SAMPLER).
void
kernel::sampler_argument::set(size_t size, const void *value) {
   if (!value)
      throw error(CL_INVALID_SAMPLER);

   if (size != sizeof(cl_sampler))
      throw error(CL_INVALID_ARG_SIZE);

   s = &obj(*(cl_sampler *)value);
   _set = true;
}

// Samplers contribute no bytes to the input buffer; they are only appended
// to the pipe sampler-state list (marg is unused here).
void
kernel::sampler_argument::bind(exec_context &ctx,
                               const module::argument &marg) {
   st = s->bind(*ctx.q);
   ctx.samplers.push_back(st);
}

void
kernel::sampler_argument::unbind(exec_context &ctx) {
   s->unbind(*ctx.q, st);
}