-rw-r--r-- | src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 401 |
1 files changed, 400 insertions, 1 deletions
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 122f47c3cbb..517bf532618 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -575,6 +575,395 @@ void amdgpu_bo_slab_free(void *priv, struct pb_slab *pslab)
    FREE(slab);
 }
 
+/*
+ * Attempt to allocate the given number of backing pages. Fewer pages may be
+ * allocated (depending on the fragmentation of existing backing buffers),
+ * which will be reflected by a change to *pnum_pages.
+ */
+static struct amdgpu_sparse_backing *
+sparse_backing_alloc(struct amdgpu_winsys_bo *bo, uint32_t *pstart_page, uint32_t *pnum_pages)
+{
+   struct amdgpu_sparse_backing *best_backing;
+   unsigned best_idx;
+   uint32_t best_num_pages;
+
+   best_backing = NULL;
+   best_idx = 0;
+   best_num_pages = 0;
+
+   /* This is a very simple and inefficient best-fit algorithm. */
+   list_for_each_entry(struct amdgpu_sparse_backing, backing, &bo->u.sparse.backing, list) {
+      for (unsigned idx = 0; idx < backing->num_chunks; ++idx) {
+         uint32_t cur_num_pages = backing->chunks[idx].end - backing->chunks[idx].begin;
+         if ((best_num_pages < *pnum_pages && cur_num_pages > best_num_pages) ||
+             (best_num_pages > *pnum_pages && cur_num_pages < best_num_pages)) {
+            best_backing = backing;
+            best_idx = idx;
+            best_num_pages = cur_num_pages;
+         }
+      }
+   }
+
+   /* Allocate a new backing buffer if necessary. */
+   if (!best_backing) {
+      struct pb_buffer *buf;
+      uint64_t size;
+      uint32_t pages;
+
+      best_backing = CALLOC_STRUCT(amdgpu_sparse_backing);
+      if (!best_backing)
+         return NULL;
+
+      best_backing->max_chunks = 4;
+      best_backing->chunks = CALLOC(best_backing->max_chunks,
+                                    sizeof(*best_backing->chunks));
+      if (!best_backing->chunks) {
+         FREE(best_backing);
+         return NULL;
+      }
+
+      assert(bo->u.sparse.num_backing_pages < DIV_ROUND_UP(bo->base.size, RADEON_SPARSE_PAGE_SIZE));
+
+      size = MIN3(bo->base.size / 16,
+                  8 * 1024 * 1024,
+                  bo->base.size - (uint64_t)bo->u.sparse.num_backing_pages * RADEON_SPARSE_PAGE_SIZE);
+      size = MAX2(size, RADEON_SPARSE_PAGE_SIZE);
+
+      buf = amdgpu_bo_create(&bo->ws->base, size, RADEON_SPARSE_PAGE_SIZE,
+                             bo->initial_domain,
+                             bo->u.sparse.flags | RADEON_FLAG_HANDLE);
+      if (!buf) {
+         FREE(best_backing->chunks);
+         FREE(best_backing);
+         return NULL;
+      }
+
+      /* We might have gotten a bigger buffer than requested via caching. */
+      pages = buf->size / RADEON_SPARSE_PAGE_SIZE;
+
+      best_backing->bo = amdgpu_winsys_bo(buf);
+      best_backing->num_chunks = 1;
+      best_backing->chunks[0].begin = 0;
+      best_backing->chunks[0].end = pages;
+
+      list_add(&best_backing->list, &bo->u.sparse.backing);
+      bo->u.sparse.num_backing_pages += pages;
+
+      best_idx = 0;
+      best_num_pages = pages;
+   }
+
+   *pnum_pages = MIN2(*pnum_pages, best_num_pages);
+   *pstart_page = best_backing->chunks[best_idx].begin;
+   best_backing->chunks[best_idx].begin += *pnum_pages;
+
+   if (best_backing->chunks[best_idx].begin >= best_backing->chunks[best_idx].end) {
+      memmove(&best_backing->chunks[best_idx], &best_backing->chunks[best_idx + 1],
+              sizeof(*best_backing->chunks) * (best_backing->num_chunks - best_idx - 1));
+      best_backing->num_chunks--;
+   }
+
+   return best_backing;
+}
+
+static void
+sparse_free_backing_buffer(struct amdgpu_winsys_bo *bo, struct amdgpu_sparse_backing *backing)
+{
+   bo->u.sparse.num_backing_pages -= backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE;
+
+   list_del(&backing->list);
+   amdgpu_winsys_bo_reference(&backing->bo, NULL);
+   FREE(backing->chunks);
+   FREE(backing);
+}
+
+/*
+ * Return a range of pages from the given backing buffer back into the
+ * free structure.
+ */
+static bool
+sparse_backing_free(struct amdgpu_winsys_bo *bo,
+                    struct amdgpu_sparse_backing *backing,
+                    uint32_t start_page, uint32_t num_pages)
+{
+   uint32_t end_page = start_page + num_pages;
+   unsigned low = 0;
+   unsigned high = backing->num_chunks;
+
+   /* Find the first chunk with begin >= start_page. */
+   while (low < high) {
+      unsigned mid = low + (high - low) / 2;
+
+      if (backing->chunks[mid].begin >= start_page)
+         high = mid;
+      else
+         low = mid + 1;
+   }
+
+   assert(low >= backing->num_chunks || end_page <= backing->chunks[low].begin);
+   assert(low == 0 || backing->chunks[low - 1].end <= start_page);
+
+   if (low > 0 && backing->chunks[low - 1].end == start_page) {
+      backing->chunks[low - 1].end = end_page;
+
+      if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
+         backing->chunks[low - 1].end = backing->chunks[low].end;
+         memmove(&backing->chunks[low], &backing->chunks[low + 1],
+                 sizeof(*backing->chunks) * (backing->num_chunks - low - 1));
+         backing->num_chunks--;
+      }
+   } else if (low < backing->num_chunks && end_page == backing->chunks[low].begin) {
+      backing->chunks[low].begin = start_page;
+   } else {
+      if (backing->num_chunks >= backing->max_chunks) {
+         unsigned new_max_chunks = 2 * backing->max_chunks;
+         struct amdgpu_sparse_backing_chunk *new_chunks =
+            REALLOC(backing->chunks,
+                    sizeof(*backing->chunks) * backing->max_chunks,
+                    sizeof(*backing->chunks) * new_max_chunks);
+         if (!new_chunks)
+            return false;
+
+         backing->max_chunks = new_max_chunks;
+         backing->chunks = new_chunks;
+      }
+
+      memmove(&backing->chunks[low + 1], &backing->chunks[low],
+              sizeof(*backing->chunks) * (backing->num_chunks - low));
+      backing->chunks[low].begin = start_page;
+      backing->chunks[low].end = end_page;
+      backing->num_chunks++;
+   }
+
+   if (backing->num_chunks == 1 && backing->chunks[0].begin == 0 &&
+       backing->chunks[0].end == backing->bo->base.size / RADEON_SPARSE_PAGE_SIZE)
+      sparse_free_backing_buffer(bo, backing);
+
+   return true;
+}
+
+static void amdgpu_bo_sparse_destroy(struct pb_buffer *_buf)
+{
+   struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf);
+   int r;
+
+   assert(!bo->bo && bo->sparse);
+
+   r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
+                           (uint64_t)bo->u.sparse.num_va_pages * RADEON_SPARSE_PAGE_SIZE,
+                           bo->va, 0, AMDGPU_VA_OP_CLEAR);
+   if (r) {
"amdgpu: clearing PRT VA region on destroy failed (%d)\n", r); + } + + while (!list_empty(&bo->u.sparse.backing)) { + struct amdgpu_sparse_backing *dummy = NULL; + sparse_free_backing_buffer(container_of(bo->u.sparse.backing.next, + dummy, list)); + } + + amdgpu_va_range_free(bo->u.sparse.va_handle); + mtx_destroy(&bo->u.sparse.commit_lock); + FREE(bo->u.sparse.commitments); + FREE(bo); +} + +static const struct pb_vtbl amdgpu_winsys_bo_sparse_vtbl = { + amdgpu_bo_sparse_destroy + /* other functions are never called */ +}; + +static struct pb_buffer * +amdgpu_bo_sparse_create(struct amdgpu_winsys *ws, uint64_t size, + enum radeon_bo_domain domain, + enum radeon_bo_flag flags) +{ + struct amdgpu_winsys_bo *bo; + uint64_t map_size; + uint64_t va_gap_size; + int r; + + /* We use 32-bit page numbers; refuse to attempt allocating sparse buffers + * that exceed this limit. This is not really a restriction: we don't have + * that much virtual address space anyway. + */ + if (size > (uint64_t)INT32_MAX * RADEON_SPARSE_PAGE_SIZE) + return NULL; + + bo = CALLOC_STRUCT(amdgpu_winsys_bo); + if (!bo) + return NULL; + + pipe_reference_init(&bo->base.reference, 1); + bo->base.alignment = RADEON_SPARSE_PAGE_SIZE; + bo->base.size = size; + bo->base.vtbl = &amdgpu_winsys_bo_sparse_vtbl; + bo->ws = ws; + bo->initial_domain = domain; + bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); + bo->sparse = true; + bo->u.sparse.flags = flags & ~RADEON_FLAG_SPARSE; + + bo->u.sparse.num_va_pages = DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); + bo->u.sparse.commitments = CALLOC(bo->u.sparse.num_va_pages, + sizeof(*bo->u.sparse.commitments)); + if (!bo->u.sparse.commitments) + goto error_alloc_commitments; + + mtx_init(&bo->u.sparse.commit_lock, mtx_plain); + LIST_INITHEAD(&bo->u.sparse.backing); + + /* For simplicity, we always map a multiple of the page size. */ + map_size = align64(size, RADEON_SPARSE_PAGE_SIZE); + va_gap_size = ws->check_vm ? 4 * RADEON_SPARSE_PAGE_SIZE : 0; + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + map_size + va_gap_size, RADEON_SPARSE_PAGE_SIZE, + 0, &bo->va, &bo->u.sparse.va_handle, 0); + if (r) + goto error_va_alloc; + + r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0, size, bo->va, + AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_MAP); + if (r) + goto error_va_map; + + return &bo->base; + +error_va_map: + amdgpu_va_range_free(bo->u.sparse.va_handle); +error_va_alloc: + mtx_destroy(&bo->u.sparse.commit_lock); + FREE(bo->u.sparse.commitments); +error_alloc_commitments: + FREE(bo); + return NULL; +} + +static bool +amdgpu_bo_sparse_commit(struct pb_buffer *buf, uint64_t offset, uint64_t size, + bool commit) +{ + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buf); + struct amdgpu_sparse_commitment *comm; + uint32_t va_page, end_va_page; + bool ok = true; + int r; + + assert(bo->sparse); + assert(offset % RADEON_SPARSE_PAGE_SIZE == 0); + assert(offset <= bo->base.size); + assert(size <= bo->base.size - offset); + assert(size % RADEON_SPARSE_PAGE_SIZE == 0 || offset + size == bo->base.size); + + comm = bo->u.sparse.commitments; + va_page = offset / RADEON_SPARSE_PAGE_SIZE; + end_va_page = va_page + DIV_ROUND_UP(size, RADEON_SPARSE_PAGE_SIZE); + + mtx_lock(&bo->u.sparse.commit_lock); + + if (commit) { + while (va_page < end_va_page) { + uint32_t span_va_page; + + /* Skip pages that are already committed. */ + if (comm[va_page].backing) { + va_page++; + continue; + } + + /* Determine length of uncommitted span. 
+         span_va_page = va_page;
+         while (va_page < end_va_page && !comm[va_page].backing)
+            va_page++;
+
+         /* Fill the uncommitted span with chunks of backing memory. */
+         while (span_va_page < va_page) {
+            struct amdgpu_sparse_backing *backing;
+            uint32_t backing_start, backing_size;
+
+            backing_size = va_page - span_va_page;
+            backing = sparse_backing_alloc(bo, &backing_start, &backing_size);
+            if (!backing) {
+               ok = false;
+               goto out;
+            }
+
+            r = amdgpu_bo_va_op_raw(bo->ws->dev, backing->bo->bo,
+                                    (uint64_t)backing_start * RADEON_SPARSE_PAGE_SIZE,
+                                    (uint64_t)backing_size * RADEON_SPARSE_PAGE_SIZE,
+                                    bo->va + (uint64_t)span_va_page * RADEON_SPARSE_PAGE_SIZE,
+                                    AMDGPU_VM_PAGE_READABLE |
+                                    AMDGPU_VM_PAGE_WRITEABLE |
+                                    AMDGPU_VM_PAGE_EXECUTABLE,
+                                    AMDGPU_VA_OP_REPLACE);
+            if (r) {
+               ok = sparse_backing_free(bo, backing, backing_start, backing_size);
+               assert(ok && "sufficient memory should already be allocated");
+
+               ok = false;
+               goto out;
+            }
+
+            while (backing_size) {
+               comm[span_va_page].backing = backing;
+               comm[span_va_page].page = backing_start;
+               span_va_page++;
+               backing_start++;
+               backing_size--;
+            }
+         }
+      }
+   } else {
+      r = amdgpu_bo_va_op_raw(bo->ws->dev, NULL, 0,
+                              (uint64_t)(end_va_page - va_page) * RADEON_SPARSE_PAGE_SIZE,
+                              bo->va + (uint64_t)va_page * RADEON_SPARSE_PAGE_SIZE,
+                              AMDGPU_VM_PAGE_PRT, AMDGPU_VA_OP_REPLACE);
+      if (r) {
+         ok = false;
+         goto out;
+      }
+
+      while (va_page < end_va_page) {
+         struct amdgpu_sparse_backing *backing;
+         uint32_t backing_start;
+         uint32_t span_pages;
+
+         /* Skip pages that are already uncommitted. */
+         if (!comm[va_page].backing) {
+            va_page++;
+            continue;
+         }
+
+         /* Group contiguous spans of pages. */
+         backing = comm[va_page].backing;
+         backing_start = comm[va_page].page;
+         comm[va_page].backing = NULL;
+
+         span_pages = 1;
+         va_page++;
+
+         while (va_page < end_va_page &&
+                comm[va_page].backing == backing &&
+                comm[va_page].page == backing_start + span_pages) {
+            comm[va_page].backing = NULL;
+            va_page++;
+            span_pages++;
+         }
+
+         if (!sparse_backing_free(bo, backing, backing_start, span_pages)) {
+            /* Couldn't allocate tracking data structures, so we have to leak */
+            fprintf(stderr, "amdgpu: leaking PRT backing memory\n");
+            ok = false;
+         }
+      }
+   }
+out:
+
+   mtx_unlock(&bo->u.sparse.commit_lock);
+
+   return ok;
+}
+
 static unsigned eg_tile_split(unsigned tile_split)
 {
    switch (tile_split) {
@@ -696,7 +1085,7 @@ amdgpu_bo_create(struct radeon_winsys *rws,
    unsigned usage = 0, pb_cache_bucket;
 
    /* Sub-allocate small buffers from slabs. */
-   if (!(flags & RADEON_FLAG_HANDLE) &&
+   if (!(flags & (RADEON_FLAG_HANDLE | RADEON_FLAG_SPARSE)) &&
        size <= (1 << AMDGPU_SLAB_MAX_SIZE_LOG2) &&
        alignment <= MAX2(1 << AMDGPU_SLAB_MIN_SIZE_LOG2, util_next_power_of_two(size))) {
       struct pb_slab_entry *entry;
@@ -742,6 +1131,15 @@ amdgpu_bo_create(struct radeon_winsys *rws,
    }
 
 no_slab:
+   if (flags & RADEON_FLAG_SPARSE) {
+      assert(RADEON_SPARSE_PAGE_SIZE % alignment == 0);
+      assert(!(flags & RADEON_FLAG_CPU_ACCESS));
+
+      flags |= RADEON_FLAG_NO_CPU_ACCESS;
+
+      return amdgpu_bo_sparse_create(ws, size, domain, flags);
+   }
+
    /* This flag is irrelevant for the cache. */
    flags &= ~RADEON_FLAG_HANDLE;
 
@@ -1003,6 +1401,7 @@ void amdgpu_bo_init_functions(struct amdgpu_winsys *ws)
    ws->base.buffer_from_ptr = amdgpu_bo_from_ptr;
    ws->base.buffer_is_user_ptr = amdgpu_bo_is_user_ptr;
    ws->base.buffer_get_handle = amdgpu_bo_get_handle;
+   ws->base.buffer_commit = amdgpu_bo_sparse_commit;
    ws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
    ws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
 }
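
Editor's note: the free-space bookkeeping in sparse_backing_free() keeps each backing buffer's unused pages as a sorted array of non-overlapping half-open ranges [begin, end) and merges a freed range with its neighbours. The following standalone sketch models only that data-structure logic with a fixed-size array (it is not driver code; the struct, array bound, and main() are illustrative only) and can be compiled and run on its own:

/* chunk_model.c: simplified model of the free-chunk bookkeeping used by
 * sparse_backing_free(). Compile with: cc chunk_model.c -o chunk_model
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct chunk { uint32_t begin, end; };   /* half-open page range [begin, end) */

static struct chunk chunks[16];          /* kept sorted and non-overlapping */
static unsigned num_chunks;

static void free_range(uint32_t start, uint32_t num)
{
   uint32_t end = start + num;
   unsigned low = 0, high = num_chunks;

   /* Binary search for the first chunk with begin >= start. */
   while (low < high) {
      unsigned mid = low + (high - low) / 2;
      if (chunks[mid].begin >= start)
         high = mid;
      else
         low = mid + 1;
   }

   if (low > 0 && chunks[low - 1].end == start) {
      /* Extend the preceding chunk; fuse with the following one if they now touch. */
      chunks[low - 1].end = end;
      if (low < num_chunks && end == chunks[low].begin) {
         chunks[low - 1].end = chunks[low].end;
         memmove(&chunks[low], &chunks[low + 1],
                 sizeof(*chunks) * (num_chunks - low - 1));
         num_chunks--;
      }
   } else if (low < num_chunks && end == chunks[low].begin) {
      /* Extend the following chunk downwards. */
      chunks[low].begin = start;
   } else {
      /* Insert a new chunk at its sorted position (fixed capacity in this model). */
      assert(num_chunks < 16);
      memmove(&chunks[low + 1], &chunks[low],
              sizeof(*chunks) * (num_chunks - low));
      chunks[low].begin = start;
      chunks[low].end = end;
      num_chunks++;
   }
}

int main(void)
{
   free_range(0, 4);   /* [0,4)        */
   free_range(8, 4);   /* [0,4) [8,12) */
   free_range(4, 4);   /* merges into [0,12) */

   for (unsigned i = 0; i < num_chunks; i++)
      printf("[%u, %u)\n", chunks[i].begin, chunks[i].end);
   return 0;
}

When the single remaining chunk covers the whole backing buffer, the real code additionally frees the backing buffer itself via sparse_free_backing_buffer().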
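Editor's note: a minimal usage sketch of the new path from a driver's point of view, assuming the radeon_winsys::buffer_commit hook added elsewhere in this series has the same signature as amdgpu_bo_sparse_commit() above; example_sparse_usage and the sizes chosen are hypothetical and only illustrate the calling convention:

/* Hypothetical usage sketch, not part of the patch. */
static struct pb_buffer *
example_sparse_usage(struct radeon_winsys *ws)
{
   uint64_t size = 64 * 1024 * 1024;   /* arbitrary example size */
   struct pb_buffer *buf;

   /* The sparse path asserts that RADEON_SPARSE_PAGE_SIZE is a multiple of
    * the requested alignment and forbids RADEON_FLAG_CPU_ACCESS. */
   buf = ws->buffer_create(ws, size, RADEON_SPARSE_PAGE_SIZE,
                           RADEON_DOMAIN_VRAM, RADEON_FLAG_SPARSE);
   if (!buf)
      return NULL;

   /* Back the first 16 pages with physical memory. Offset and size must be
    * page-aligned, except that the last page of the buffer may be partial. */
   if (!ws->buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, true))
      return buf;   /* commitment failed; the buffer itself is still valid */

   /* Release the backing pages again; the VA range reverts to PRT mappings. */
   ws->buffer_commit(buf, 0, 16 * RADEON_SPARSE_PAGE_SIZE, false);

   return buf;
}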