aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/frontends/clover/core/kernel.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/frontends/clover/core/kernel.cpp')
-rw-r--r--  src/gallium/frontends/clover/core/kernel.cpp  610
1 files changed, 610 insertions, 0 deletions
diff --git a/src/gallium/frontends/clover/core/kernel.cpp b/src/gallium/frontends/clover/core/kernel.cpp
new file mode 100644
index 00000000000..7d839767aa0
--- /dev/null
+++ b/src/gallium/frontends/clover/core/kernel.cpp
@@ -0,0 +1,610 @@
+//
+// Copyright 2012 Francisco Jerez
+//
+// Permission is hereby granted, free of charge, to any person obtaining a
+// copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include "core/kernel.hpp"
+#include "core/resource.hpp"
+#include "util/factor.hpp"
+#include "util/u_math.h"
+#include "pipe/p_context.h"
+
+using namespace clover;
+
+// Construct a kernel belonging to program \a prog.  Only arguments with
+// module::argument::general semantics (the ones the user sets explicitly)
+// get an argument object here; arguments with other semantics are
+// synthesized at bind time in exec_context::bind().
+kernel::kernel(clover::program &prog, const std::string &name,
+               const std::vector<module::argument> &margs) :
+   program(prog), _name(name), exec(*this),
+   program_ref(prog._kernel_ref_counter) {
+   for (auto &marg : margs) {
+      if (marg.semantic == module::argument::general)
+         _args.emplace_back(argument::create(marg));
+   }
+}
+
+///
+/// Widen \a v to the device's number of grid dimensions, filling any
+/// missing trailing components with \a x.
+///
+template<typename V>
+static inline std::vector<uint>
+pad_vector(command_queue &q, const V &v, uint x) {
+   std::vector<uint> padded(v.begin(), v.end());
+   const size_t dims = q.device().max_block_size().size();
+   padded.resize(dims, x);
+   return padded;
+}
+
+//
+// Launch this kernel on queue \a q.  \a grid_size is given in threads and
+// is divided element-wise by \a block_size (integer division), so callers
+// are expected to pass compatible sizes.
+//
+void
+kernel::launch(command_queue &q,
+               const std::vector<size_t> &grid_offset,
+               const std::vector<size_t> &grid_size,
+               const std::vector<size_t> &block_size) {
+   const auto m = program().build(q.device()).binary;
+   // Grid size in blocks rather than threads, as launch_grid() wants it.
+   const auto reduced_grid_size =
+      map(divides(), grid_size, block_size);
+   void *st = exec.bind(&q, grid_offset);
+   struct pipe_grid_info info = {};
+
+   // The handles are created during exec_context::bind(), so we need make
+   // sure to call exec_context::bind() before retrieving them.
+   std::vector<uint32_t *> g_handles = map([&](size_t h) {
+      return (uint32_t *)&exec.input[h];
+   }, exec.g_handles);
+
+   // Bind the compute state and all objects collected during exec.bind().
+   q.pipe->bind_compute_state(q.pipe, st);
+   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE,
+                               0, exec.samplers.size(),
+                               exec.samplers.data());
+
+   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
+                             exec.sviews.size(), exec.sviews.data());
+   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(),
+                                 exec.resources.data());
+   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(),
+                              exec.g_buffers.data(), g_handles.data());
+
+   // Fill information for the launch_grid() call.
+   info.work_dim = grid_size.size();
+   copy(pad_vector(q, block_size, 1), info.block);
+   copy(pad_vector(q, reduced_grid_size, 1), info.grid);
+   info.pc = find(name_equals(_name), m.syms).offset;
+   info.input = exec.input.data();
+
+   q.pipe->launch_grid(q.pipe, &info);
+
+   // Unbind everything again, in roughly the reverse order.
+   q.pipe->set_global_binding(q.pipe, 0, exec.g_buffers.size(), NULL, NULL);
+   q.pipe->set_compute_resources(q.pipe, 0, exec.resources.size(), NULL);
+   q.pipe->set_sampler_views(q.pipe, PIPE_SHADER_COMPUTE, 0,
+                             exec.sviews.size(), NULL);
+   q.pipe->bind_sampler_states(q.pipe, PIPE_SHADER_COMPUTE, 0,
+                               exec.samplers.size(), NULL);
+
+   // Make writes to the global buffers visible to subsequent commands.
+   q.pipe->memory_barrier(q.pipe, PIPE_BARRIER_GLOBAL_BUFFER);
+   exec.unbind();
+}
+
+// Total amount of local memory committed by the kernel's __local pointer
+// arguments (implicit local usage by the program itself is not included).
+size_t
+kernel::mem_local() const {
+   size_t total = 0;
+
+   for (auto &arg : args()) {
+      if (!dynamic_cast<local_argument *>(&arg))
+         continue;
+
+      total += arg.storage();
+   }
+
+   return total;
+}
+
+// Private memory usage is not tracked here — always reports zero.
+size_t
+kernel::mem_private() const {
+   return 0;
+}
+
+// Kernel function name as it appears in the module's symbol table.
+const std::string &
+kernel::name() const {
+   return _name;
+}
+
+// Pick a block size for \a grid_size that respects the device's per-block
+// thread limit and per-dimension maxima.
+std::vector<size_t>
+kernel::optimal_block_size(const command_queue &q,
+                           const std::vector<size_t> &grid_size) const {
+   return factor::find_grid_optimal_factor<size_t>(
+      q.device().max_threads_per_block(), q.device().max_block_size(),
+      grid_size);
+}
+
+// Always {0, 0, 0}: no required block size is reported — presumably the
+// convention for "no reqd_work_group_size specified"; confirm at callers.
+std::vector<size_t>
+kernel::required_block_size() const {
+   return { 0, 0, 0 };
+}
+
+// View of the explicit (user-settable) arguments, dereferencing the
+// stored unique_ptrs.
+kernel::argument_range
+kernel::args() {
+   return map(derefs(), _args);
+}
+
+kernel::const_argument_range
+kernel::args() const {
+   return map(derefs(), _args);
+}
+
+// Binary module this kernel would execute on \a q's device (triggers a
+// build for that device if not already done).
+const module &
+kernel::module(const command_queue &q) const {
+   return program().build(q.device()).binary;
+}
+
+// Start out unbound: no queue, no compute state, no local memory used.
+kernel::exec_context::exec_context(kernel &kern) :
+   kern(kern), q(NULL), mem_local(0), st(NULL), cs() {
+}
+
+kernel::exec_context::~exec_context() {
+   // Destroy the cached compute state against the queue it was last
+   // created on (st is only ever non-null once q has been set in bind()).
+   if (st)
+      q->pipe->delete_compute_state(q->pipe, st);
+}
+
+//
+// Bind the kernel to queue \a _q: serialize every argument value into the
+// input buffer, collect the driver objects (surfaces, sampler views,
+// samplers, global buffers) they reference, and (re)create the driver
+// compute state if anything relevant changed.  Returns the compute state
+// handle to pass to bind_compute_state().
+//
+void *
+kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
+                           const std::vector<size_t> &grid_offset) {
+   // After the swap, q refers to the queue we are binding to and _q holds
+   // the previously bound queue (if any), used below to detect a queue
+   // change and to destroy the stale compute state.
+   std::swap(q, _q);
+
+   // Bind kernel arguments.
+   auto &m = kern.program().build(q->device()).binary;
+   auto msym = find(name_equals(kern.name()), m.syms);
+   auto margs = msym.args;
+   auto msec = find(id_equals(msym.section), m.secs);
+   auto explicit_arg = kern._args.begin();
+
+   for (auto &marg : margs) {
+      switch (marg.semantic) {
+      case module::argument::general:
+         // User-provided argument: consume the next explicitly set value.
+         (*(explicit_arg++))->bind(*this, marg);
+         break;
+
+      case module::argument::grid_dimension: {
+         // Implicit argument: number of grid dimensions.
+         const cl_uint dimension = grid_offset.size();
+         auto arg = argument::create(marg);
+
+         arg->set(sizeof(dimension), &dimension);
+         arg->bind(*this, marg);
+         break;
+      }
+      case module::argument::grid_offset: {
+         // Implicit argument: one component per device dimension, with
+         // missing components padded with zero.
+         for (cl_uint x : pad_vector(*q, grid_offset, 0)) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
+      case module::argument::image_size: {
+         // Implicit argument: dimensions of the preceding explicit image
+         // argument (explicit_arg has already been advanced past it).
+         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
+         std::vector<cl_uint> image_size{
+            static_cast<cl_uint>(img->width()),
+            static_cast<cl_uint>(img->height()),
+            static_cast<cl_uint>(img->depth())};
+         for (auto x : image_size) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
+      case module::argument::image_format: {
+         // Implicit argument: channel data type and order of the
+         // preceding explicit image argument.
+         auto img = dynamic_cast<image_argument &>(**(explicit_arg - 1)).get();
+         cl_image_format fmt = img->format();
+         std::vector<cl_uint> image_format{
+            static_cast<cl_uint>(fmt.image_channel_data_type),
+            static_cast<cl_uint>(fmt.image_channel_order)};
+         for (auto x : image_format) {
+            auto arg = argument::create(marg);
+
+            arg->set(sizeof(x), &x);
+            arg->bind(*this, marg);
+         }
+         break;
+      }
+      }
+   }
+
+   // Create a new compute state if anything changed.
+   if (!st || q != _q ||
+       cs.req_local_mem != mem_local ||
+       cs.req_input_mem != input.size()) {
+      if (st)
+         // The stale state belongs to the previously bound queue.
+         _q->pipe->delete_compute_state(_q->pipe, st);
+
+      cs.ir_type = q->device().ir_format();
+      cs.prog = &(msec.data[0]);
+      cs.req_local_mem = mem_local;
+      cs.req_input_mem = input.size();
+      st = q->pipe->create_compute_state(q->pipe, &cs);
+      if (!st) {
+         unbind(); // Cleanup
+         throw error(CL_OUT_OF_RESOURCES);
+      }
+   }
+
+   return st;
+}
+
+//
+// Undo a previous bind(): give each argument a chance to release the
+// driver objects it pinned, then drop the serialized input buffer and the
+// collected object lists.  The cached compute state is intentionally kept
+// for reuse by the next bind().
+//
+void
+kernel::exec_context::unbind() {
+   for (auto &arg : kern.args())
+      arg.unbind(*this);
+
+   input.clear();
+   samplers.clear();
+   sviews.clear();
+   resources.clear();
+   g_buffers.clear();
+   g_handles.clear();
+   mem_local = 0;
+}
+
+namespace {
+   ///
+   /// Return the raw byte representation of \a x as a buffer.
+   ///
+   template<typename T>
+   std::vector<uint8_t>
+   bytes(const T& x) {
+      return { (uint8_t *)&x, (uint8_t *)&x + sizeof(x) };
+   }
+
+   ///
+   /// Transform buffer \a v from the native byte order into the byte
+   /// order specified by \a e.
+   ///
+   template<typename T>
+   void
+   byteswap(T &v, pipe_endian e) {
+      if (PIPE_ENDIAN_NATIVE != e)
+         std::reverse(v.begin(), v.end());
+   }
+
+   ///
+   /// Pad buffer \a v to the next multiple of \a n.
+   ///
+   template<typename T>
+   void
+   align(T &v, size_t n) {
+      v.resize(util_align_npot(v.size(), n));
+   }
+
+   ///
+   /// Most significant bit of buffer \a s interpreted as a native-endian
+   /// integer.  Used to pick the fill byte for sign extension.
+   ///
+   bool
+   msb(const std::vector<uint8_t> &s) {
+      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
+         return s.back() & 0x80;
+      else
+         return s.front() & 0x80;
+   }
+
+   ///
+   /// Resize buffer \a v to size \a n using sign or zero extension
+   /// according to \a ext.  If \a v is longer than \a n, only the \a n
+   /// least significant bytes are kept.
+   ///
+   template<typename T>
+   void
+   extend(T &v, enum module::argument::ext_type ext, size_t n) {
+      const size_t m = std::min(v.size(), n);
+      const bool sign_ext = (ext == module::argument::sign_ext);
+      const uint8_t fill = (sign_ext && msb(v) ? ~0 : 0);
+      T w(n, fill);
+
+      // Value bytes sit at the front on little-endian, at the back on
+      // big-endian targets.
+      if (PIPE_ENDIAN_NATIVE == PIPE_ENDIAN_LITTLE)
+         std::copy_n(v.begin(), m, w.begin());
+      else
+         std::copy_n(v.end() - m, m, w.end() - m);
+
+      std::swap(v, w);
+   }
+
+   ///
+   /// Append buffer \a w to \a v.
+   ///
+   template<typename T>
+   void
+   insert(T &v, const T &w) {
+      v.insert(v.end(), w.begin(), w.end());
+   }
+
+   ///
+   /// Append \a n elements to the end of buffer \a v and return the
+   /// offset at which the new region starts.
+   ///
+   template<typename T>
+   size_t
+   allocate(T &v, size_t n) {
+      size_t pos = v.size();
+      v.resize(pos + n);
+      return pos;
+   }
+}
+
+//
+// Argument factory: instantiate the argument subclass that matches the
+// module argument type of \a marg.  Throws CL_INVALID_KERNEL_DEFINITION
+// for unrecognized types.
+//
+std::unique_ptr<kernel::argument>
+kernel::argument::create(const module::argument &marg) {
+   switch (marg.type) {
+   case module::argument::scalar:
+      return std::unique_ptr<kernel::argument>(new scalar_argument(marg.size));
+
+   case module::argument::global:
+      return std::unique_ptr<kernel::argument>(new global_argument);
+
+   case module::argument::local:
+      return std::unique_ptr<kernel::argument>(new local_argument);
+
+   case module::argument::constant:
+      return std::unique_ptr<kernel::argument>(new constant_argument);
+
+   case module::argument::image2d_rd:
+   case module::argument::image3d_rd:
+      return std::unique_ptr<kernel::argument>(new image_rd_argument);
+
+   case module::argument::image2d_wr:
+   case module::argument::image3d_wr:
+      return std::unique_ptr<kernel::argument>(new image_wr_argument);
+
+   case module::argument::sampler:
+      return std::unique_ptr<kernel::argument>(new sampler_argument);
+
+   }
+   throw error(CL_INVALID_KERNEL_DEFINITION);
+}
+
+// Arguments start out with no value; set() flips _set once the user
+// provides one.
+kernel::argument::argument() : _set(false) {
+}
+
+// Whether a value has been provided for this argument.
+bool
+kernel::argument::set() const {
+   return _set;
+}
+
+// Local memory committed by this argument.  Only local_argument overrides
+// this with a non-zero value.
+size_t
+kernel::argument::storage() const {
+   return 0;
+}
+
+// \a size is the exact byte size the kernel declares for this scalar.
+kernel::scalar_argument::scalar_argument(size_t size) : size(size) {
+}
+
+void
+kernel::scalar_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
+   // Scalars must be passed with exactly the declared size.
+   if (size != this->size)
+      throw error(CL_INVALID_ARG_SIZE);
+
+   // Keep our own copy of the raw value bytes.
+   v = { (uint8_t *)value, (uint8_t *)value + size };
+   _set = true;
+}
+
+void
+kernel::scalar_argument::bind(exec_context &ctx,
+                              const module::argument &marg) {
+   // Work on a copy so the stored value can be bound again later.
+   auto w = v;
+
+   // Convert to the target's size and byte order, then append to the
+   // input buffer at the required alignment.
+   extend(w, marg.ext_type, marg.target_size);
+   byteswap(w, ctx.q->device().endianness());
+   align(ctx.input, marg.target_align);
+   insert(ctx.input, w);
+}
+
+// Nothing to release: scalars pin no driver objects.
+void
+kernel::scalar_argument::unbind(exec_context &ctx) {
+}
+
+void
+kernel::global_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   // A null value is allowed and means a null pointer argument.  Setting
+   // a buffer clears any previously set SVM pointer and vice versa.
+   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
+   svm = nullptr;
+   _set = true;
+}
+
+void
+kernel::global_argument::set_svm(const void *value) {
+   svm = value;
+   buf = nullptr;
+   _set = true;
+}
+
+void
+kernel::global_argument::bind(exec_context &ctx,
+                              const module::argument &marg) {
+   align(ctx.input, marg.target_align);
+
+   if (buf) {
+      // Record where in the input buffer the pointer lives so launch()
+      // can let the driver relocate it, and pin the pipe resource.
+      const resource &r = buf->resource(*ctx.q);
+      ctx.g_handles.push_back(ctx.input.size());
+      ctx.g_buffers.push_back(r.pipe);
+
+      // How to handle multi-dimensional offsets?
+      // We don't need to. Buffer offsets are always
+      // one-dimensional.
+      auto v = bytes(r.offset[0]);
+      extend(v, marg.ext_type, marg.target_size);
+      byteswap(v, ctx.q->device().endianness());
+      insert(ctx.input, v);
+   } else if (svm) {
+      // SVM pointers are passed through verbatim.
+      auto v = bytes(svm);
+      extend(v, marg.ext_type, marg.target_size);
+      byteswap(v, ctx.q->device().endianness());
+      insert(ctx.input, v);
+   } else {
+      // Null pointer.
+      allocate(ctx.input, marg.target_size);
+   }
+}
+
+// Nothing to undo here; the global buffer bindings are dropped wholesale
+// in exec_context::unbind().
+void
+kernel::global_argument::unbind(exec_context &ctx) {
+}
+
+// Bytes of local memory this argument reserves.
+size_t
+kernel::local_argument::storage() const {
+   return _storage;
+}
+
+void
+kernel::local_argument::set(size_t size, const void *value) {
+   // __local arguments carry only a size; a value pointer is invalid.
+   if (value)
+      throw error(CL_INVALID_ARG_VALUE);
+
+   if (!size)
+      throw error(CL_INVALID_ARG_SIZE);
+
+   _storage = size;
+   _set = true;
+}
+
+void
+kernel::local_argument::bind(exec_context &ctx,
+                             const module::argument &marg) {
+   // The argument value is the current local-memory watermark, i.e. the
+   // offset of the region reserved for this argument.
+   auto v = bytes(ctx.mem_local);
+
+   extend(v, module::argument::zero_ext, marg.target_size);
+   byteswap(v, ctx.q->device().endianness());
+   align(ctx.input, marg.target_align);
+   insert(ctx.input, v);
+
+   // Advance the watermark past this argument's storage.
+   ctx.mem_local += _storage;
+}
+
+// Nothing to release; ctx.mem_local is reset by exec_context::unbind().
+void
+kernel::local_argument::unbind(exec_context &ctx) {
+}
+
+void
+kernel::constant_argument::set(size_t size, const void *value) {
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   // Null is allowed and results in a null pointer argument at bind time.
+   buf = pobj<buffer>(value ? *(cl_mem *)value : NULL);
+   _set = true;
+}
+
+void
+kernel::constant_argument::bind(exec_context &ctx,
+                                const module::argument &marg) {
+   align(ctx.input, marg.target_align);
+
+   if (buf) {
+      resource &r = buf->resource(*ctx.q);
+      // Pack the resource index into the top byte and the buffer offset
+      // into the low bits — NOTE(review): presumably matches the
+      // driver-side constant-buffer addressing convention; verify against
+      // the consuming drivers.
+      auto v = bytes(ctx.resources.size() << 24 | r.offset[0]);
+
+      extend(v, module::argument::zero_ext, marg.target_size);
+      byteswap(v, ctx.q->device().endianness());
+      insert(ctx.input, v);
+
+      // Bind the buffer read-only as a compute resource.
+      st = r.bind_surface(*ctx.q, false);
+      ctx.resources.push_back(st);
+   } else {
+      // Null pointer.
+      allocate(ctx.input, marg.target_size);
+   }
+}
+
+void
+kernel::constant_argument::unbind(exec_context &ctx) {
+   if (buf)
+      buf->resource(*ctx.q).unbind_surface(*ctx.q, st);
+}
+
+void
+kernel::image_rd_argument::set(size_t size, const void *value) {
+   // Unlike buffers, a null image is not accepted.
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   img = &obj<image>(*(cl_mem *)value);
+   _set = true;
+}
+
+void
+kernel::image_rd_argument::bind(exec_context &ctx,
+                                const module::argument &marg) {
+   // The argument value is the index of the sampler view pushed below.
+   auto v = bytes(ctx.sviews.size());
+
+   extend(v, module::argument::zero_ext, marg.target_size);
+   byteswap(v, ctx.q->device().endianness());
+   align(ctx.input, marg.target_align);
+   insert(ctx.input, v);
+
+   // Read-only images are bound as sampler views.
+   st = img->resource(*ctx.q).bind_sampler_view(*ctx.q);
+   ctx.sviews.push_back(st);
+}
+
+void
+kernel::image_rd_argument::unbind(exec_context &ctx) {
+   img->resource(*ctx.q).unbind_sampler_view(*ctx.q, st);
+}
+
+void
+kernel::image_wr_argument::set(size_t size, const void *value) {
+   // Unlike buffers, a null image is not accepted.
+   if (!value)
+      throw error(CL_INVALID_ARG_VALUE);
+
+   if (size != sizeof(cl_mem))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   img = &obj<image>(*(cl_mem *)value);
+   _set = true;
+}
+
+void
+kernel::image_wr_argument::bind(exec_context &ctx,
+                                const module::argument &marg) {
+   // The argument value is the index of the surface pushed below.
+   auto v = bytes(ctx.resources.size());
+
+   extend(v, module::argument::zero_ext, marg.target_size);
+   byteswap(v, ctx.q->device().endianness());
+   align(ctx.input, marg.target_align);
+   insert(ctx.input, v);
+
+   // Writable images are bound as read-write compute resources.
+   st = img->resource(*ctx.q).bind_surface(*ctx.q, true);
+   ctx.resources.push_back(st);
+}
+
+void
+kernel::image_wr_argument::unbind(exec_context &ctx) {
+   img->resource(*ctx.q).unbind_surface(*ctx.q, st);
+}
+
+void
+kernel::sampler_argument::set(size_t size, const void *value) {
+   if (!value)
+      throw error(CL_INVALID_SAMPLER);
+
+   if (size != sizeof(cl_sampler))
+      throw error(CL_INVALID_ARG_SIZE);
+
+   s = &obj(*(cl_sampler *)value);
+   _set = true;
+}
+
+// Samplers occupy no space in the input buffer; they are only appended to
+// the context's sampler state list.
+void
+kernel::sampler_argument::bind(exec_context &ctx,
+                               const module::argument &marg) {
+   st = s->bind(*ctx.q);
+   ctx.samplers.push_back(st);
+}
+
+void
+kernel::sampler_argument::unbind(exec_context &ctx) {
+   s->unbind(*ctx.q, st);
+}